From f5f039c918b931d812aa836217392cc77771cd9d Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Sun, 15 Mar 2026 21:27:33 +0300
Subject: [PATCH 1/8] demo

---
 src/vowel/__init__.py        |  25 +
 src/vowel/codemode.py        | 742 +++++++++++++++++++++++++++
 src/vowel/eval_types.py      |   6 +-
 src/vowel/evals.py           |   3 -
 src/vowel/executor.py        | 962 +++++++++++++++++++++++++++++++++++
 src/vowel/runner.py          |  51 ++
 src/vowel/spec_validation.py | 342 +++++++++++++
 src/vowel/tdd.py             | 178 +++++--
 src/vowel/utils.py           |  24 +-
 9 files changed, 2265 insertions(+), 68 deletions(-)
 create mode 100644 src/vowel/codemode.py
 create mode 100644 src/vowel/executor.py
 create mode 100644 src/vowel/spec_validation.py

diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
index 8c065a7..c01915e 100644
--- a/src/vowel/__init__.py
+++ b/src/vowel/__init__.py
@@ -32,9 +32,20 @@
 __version__ = importlib.metadata.version("vowel")
 
 from .ai import EvalGenerator, GenerationResult, UnsupportedParameterTypeError
+from .codemode import CodeModeGenerator, CodeModeResult, ExplorationPlan, SnippetResult
 from .context import EVAL_SPEC_CONTEXT
 from .errors import FixturePathError, SignatureError
 from .eval_types import EvalsFile
+from .executor import (
+    DefaultExecutor,
+    DefaultSession,
+    ExecutionResult,
+    ExecutionSession,
+    Executor,
+    MontyExecutor,
+    MontyReplSession,
+    get_executor,
+)
 from .runner import Function, RunEvals
 from .utils import (
     EvalResult,
@@ -73,4 +84,18 @@
     "check_compatibility",
     "get_unsupported_params",
     "is_yaml_serializable_type",
+    # CodeMode executor
+    "Executor",
+    "ExecutionResult",
+    "ExecutionSession",
+    "MontyExecutor",
+    "MontyReplSession",
+    "DefaultExecutor",
+    "DefaultSession",
+    "get_executor",
+    # CodeMode pipeline
+    "CodeModeGenerator",
+    "CodeModeResult",
+    "ExplorationPlan",
+    "SnippetResult",
 ]
diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py
new file mode 100644
index 0000000..e488827
--- /dev/null
+++ b/src/vowel/codemode.py
@@ -0,0 +1,742 @@
+"""CodeMode eval generation pipeline.
+
+This module provides ``CodeModeGenerator`` — a two-phase pipeline that uses
+a sandboxed code executor to produce ground-truth expected values before
+generating YAML eval specs.
+
+Phase 1 — **Exploration**
+    The LLM writes small Python snippets that call the target function with various
+    inputs.  Each snippet is executed via ``Executor`` (Monty sandbox by default)
+    and the real outputs are collected.  This replaces guesswork with empirical
+    observation.
+
+Phase 2 — **Spec Generation**
+    The exploration results (inputs → outputs, edge cases, exceptions) are fed
+    back to the LLM together with the eval spec context.  The LLM produces the
+    final YAML spec with verified expected values.
+
+All steps are instrumented with ``logfire`` for full observability.
+"""
+
+from __future__ import annotations
+
+import os
+import time
+from typing import Any
+
+import logfire
+import yaml
+from pydantic import BaseModel, Field
+from pydantic_ai import Agent
+
+from vowel.context import EVAL_SPEC_CONTEXT
+from vowel.eval_types import EvalsSource
+from vowel.executor import ExecutionResult, Executor, get_executor
+from vowel.monitoring import enable_monitoring
+from vowel.runner import Function, RunEvals
+from vowel.spec_validation import (
+    build_call_code,
+    build_failure_context,
+    inject_durations,
+    inject_missing_error_cases,
+    validate_expected_values,
+)
+from vowel.utils import EvalSummary
+from vowel.validation import validate_and_fix_spec
+
+enable_monitoring(service_name="vowel-codemode")
+
+
+# ---------------------------------------------------------------------------
+# Exploration output model — what the LLM returns in Phase 1
+# ---------------------------------------------------------------------------
+
+
+class ExplorationSnippet(BaseModel):
+    """One LLM-written snippet exercising normal (non-raising) behaviour of the function."""
+
+    description: str = Field(
+        description="One-line description of what this snippet tests "
+        "(e.g. 'empty list edge case', 'negative numbers').",
+    )
+    code: str = Field(
+        description="Python code to execute.  Call the function under test by "
+        "its real name — the source is prepended at runtime.  The value "
+        "of the last expression is captured as output.",
+    )
+
+
+class ErrorSnippet(BaseModel):
+    """A snippet that should trigger an exception from the function."""
+
+    description: str = Field(
+        description="What error scenario this tests "
+        "(e.g. 'None input', 'division by zero', 'wrong type').",
+    )
+    code: str = Field(
+        description="Python code that should trigger an exception.  "
+        "Use the function's real name — the source is prepended at runtime.",
+    )
+
+
+class ExplorationPlan(BaseModel):
+    """Structured Phase 1 LLM output: succeeding snippets plus exception-triggering snippets."""
+
+    snippets: list[ExplorationSnippet] = Field(
+        description="Snippets that test NORMAL (succeeding) behaviour: "
+        "happy-path, boundary values, return type exploration, "
+        "equivalence partitioning, invariants, composition.",
+        min_length=10,  # schema-enforced floor, independent of CodeModeGenerator.min_snippets
+    )
+    error_snippets: list[ErrorSnippet] = Field(
+        description="Snippets that should TRIGGER EXCEPTIONS: wrong types, "
+        "invalid values, None inputs, out-of-range arguments.  "
+        "Every guard clause and raise statement in the function "
+        "must be exercised by at least one error snippet.",
+        min_length=3,  # matches the "AT LEAST 3 error snippets" rule in the explorer prompt
+    )
+
+
+# ---------------------------------------------------------------------------
+# Exploration result — what we feed back to Phase 2
+# ---------------------------------------------------------------------------
+
+
+class SnippetResult(BaseModel):
+    """Outcome of running one exploration snippet, merged with the snippet's own text."""
+
+    description: str  # copied from the originating snippet
+    code: str  # snippet source exactly as it was fed to the session
+    success: bool  # False results are rendered as "RAISED ..." blocks in the prompt
+    output: Any = None
+    stdout: str = ""
+    error: str | None = None  # message shown after "RAISED <error_type>:"
+    error_type: str | None = None
+    duration_ms: float = 0.0  # rendered with two decimals for successful snippets
+
+    model_config = {"arbitrary_types_allowed": True}
+
+    @classmethod
+    def from_execution(
+        cls,
+        snippet: ExplorationSnippet | ErrorSnippet,
+        result: ExecutionResult,
+    ) -> SnippetResult:  # pairs the snippet's text with the executor's result fields
+        return cls(
+            description=snippet.description,
+            code=snippet.code,
+            success=result.success,
+            output=result.output,
+            stdout=result.stdout,
+            error=result.error,
+            error_type=result.error_type,
+            duration_ms=result.duration_ms,
+        )
+
+    def to_context_block(self) -> str:
+        """Render the result as a doctest-like text block for the spec-generation prompt."""
+        if self.success:  # successes show repr(output) and timing; failures show the exception
+            out = repr(self.output)
+            return (
+                f"# {self.description}\n"
+                f">>> {self.code.strip()}\n"
+                f"Output: {out}  ({self.duration_ms:.2f} ms)"
+            )
+        return (
+            f"# {self.description}\n>>> {self.code.strip()}\nRAISED {self.error_type}: {self.error}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Pipeline result
+# ---------------------------------------------------------------------------
+
+
+class CodeModeResult(BaseModel):
+    """Full result of the CodeMode generation pipeline."""
+
+    exploration_results: list[SnippetResult] = Field(
+        description="Results from Phase 1 exploration.",
+    )
+    yaml_spec: str = Field(description="Final YAML eval specification.")
+    summary: EvalSummary | None = Field(
+        default=None,
+        description="Eval run summary (if run_evals=True).",
+    )
+    refinement_rounds: int = Field(
+        default=0,
+        description="Number of refinement iterations needed (0 = first-pass success).",
+    )
+
+    model_config = {"arbitrary_types_allowed": True}
+
+
+# ---------------------------------------------------------------------------
+# CodeModeGenerator
+# ---------------------------------------------------------------------------
+
+
+class CodeModeGenerator:
+    """Two-phase eval generator: explore with executor, then generate spec.
+
+    Parameters
+    ----------
+    model:
+        LLM model identifier (e.g. ``"openai:gpt-4o"``).
+    executor:
+        Code execution backend.  Defaults to ``get_executor("auto")``
+        which prefers MontyExecutor when available.
+    additional_context:
+        Extra instructions appended to the spec-generation system prompt only.
+    """
+
+    def __init__(
+        self,
+        model: str | None = None,  # falls back to the MODEL_NAME environment variable
+        executor: Executor | None = None,  # falls back to get_executor("auto")
+        additional_context: str = "",  # only read by _spec_system_prompt()
+        min_snippets: int = 15,  # minimum normal-snippet count requested in the explorer prompt
+        **opts: Any,  # forwarded unchanged to both pydantic_ai Agent constructors
+    ) -> None:
+        self.model = model or os.getenv("MODEL_NAME", "")
+        if not self.model:
+            logfire.warn("No model specified; set MODEL_NAME env var or pass model=")
+        self.executor = executor or get_executor("auto")
+        self.additional_context = additional_context
+        self.min_snippets = min_snippets
+        self._opts = opts
+
+        # Agents are created lazily, on first access of the matching property
+        self._explorer_agent: Agent[None, ExplorationPlan] | None = None
+        self._spec_agent: Agent[None, EvalsSource] | None = None
+
+        logfire.info(
+            "CodeModeGenerator initialized",
+            model=self.model,
+            executor=type(self.executor).__name__,
+        )
+
+    # -- Agent properties --------------------------------------------------
+
+    @property
+    def explorer_agent(self) -> Agent[None, ExplorationPlan]:
+        if self._explorer_agent is None:
+            self._explorer_agent = Agent(
+                self.model,
+                output_type=ExplorationPlan,
+                system_prompt=self._explorer_system_prompt(),
+                **self._opts,
+            )
+        return self._explorer_agent
+
+    @property
+    def spec_agent(self) -> Agent[None, EvalsSource]:
+        if self._spec_agent is None:
+            self._spec_agent = Agent(
+                self.model,
+                output_type=EvalsSource,
+                system_prompt=self._spec_system_prompt(),
+                **self._opts,
+            )
+        return self._spec_agent
+
+    # -- System prompts ----------------------------------------------------
+
+    def _explorer_system_prompt(self) -> str:
+        return f"""You are a Python testing expert.  Your job is to write small
+code snippets that explore a function's behaviour empirically.
+
+You will receive:
+- The function's source code (with its real name)
+- The function's description
+
+You produce TWO separate lists of snippets:
+
+## `snippets` — Normal / succeeding behaviour
+These snippets call the function with VALID inputs and capture the return
+value.  They MUST cover:
+1. Normal / happy-path behaviour (typical valid inputs)
+2. Boundary values (empty collections, zero, negative, very large, min/max)
+3. Return type and structure exploration
+4. Equivalence partitioning (representative from each input class)
+5. Invariant verification (e.g. idempotency, commutativity, sort stability)
+6. Composition / interaction (combining parameters, dependent arguments)
+
+Produce AT LEAST {self.min_snippets} normal snippets.
+
+## `error_snippets` — Exception-triggering inputs
+These snippets call the function with inputs that SHOULD RAISE exceptions.
+They MUST cover:
+1. Wrong types (None, int instead of list, str instead of int, etc.)
+2. Invalid values (out-of-range, malformed strings, empty when not allowed)
+3. Every `raise` statement and guard clause in the function source code
+
+Produce AT LEAST 3 error snippets.  If the function has more raise
+statements or guard clauses, produce MORE — one per distinct error path.
+
+STRICT RULES:
+- Each snippet MUST end with an expression whose value will be captured.
+- Use the function's REAL NAME — the function source code will be prepended
+  automatically at runtime.  Do NOT define the function yourself.
+- Keep each snippet focused on ONE scenario.
+- Do NOT guess outputs — the snippets will be executed and the real
+  outputs collected automatically.
+- NEVER use try/except in your snippets.  Let exceptions propagate
+  naturally — the execution environment captures raised errors
+  automatically.  For example, write `flatten(None)` NOT
+  `try: flatten(None) except Exception as e: type(e)`.
+- `snippets` must contain ONLY inputs expected to SUCCEED.
+- `error_snippets` must contain ONLY inputs expected to RAISE exceptions.
+  Do NOT mix them."""
+
+    def _spec_system_prompt(self) -> str:
+        ctx = ""
+        if self.additional_context:
+            ctx = f"\n\n<AdditionalContext>\n{self.additional_context}\n</AdditionalContext>"
+        return f"""You are an expert vowel YAML SPEC generator.
+
+<EvalsInstructions>{EVAL_SPEC_CONTEXT}</EvalsInstructions>{ctx}
+
+CRITICAL: You have access to VERIFIED execution results below.  Use the
+EXACT outputs shown — do NOT guess or calculate expected values yourself.
+The execution results are ground-truth from running the real function."""
+
+    # -- Phase 1: Exploration ----------------------------------------------
+
+    async def explore(
+        self,
+        func: Function,
+    ) -> list[SnippetResult]:
+        """Phase 1: Generate and execute exploration snippets.
+
+        Uses ``create_session()`` to compile the function source **once**,
+        then feeds each snippet against the preserved runtime state —
+        zero re-parse overhead per snippet.
+
+        Returns a list of ``SnippetResult`` with real outputs from the
+        executor.
+        """
+        with logfire.span(
+            "codemode.explore",
+            func_name=func.name,
+            executor=type(self.executor).__name__,
+        ):
+            # 1. Ask the LLM for exploration snippets
+            plan = await self._get_exploration_plan(func)
+
+            # 2. Compile function source once, feed each snippet
+            all_snippets = [
+                *((s, "normal") for s in plan.snippets),
+                *((s, "error") for s in plan.error_snippets),
+            ]
+            total = len(all_snippets)
+            results: list[SnippetResult] = []
+            with self.executor.create_session(func.code) as session:
+                for i, (snippet, kind) in enumerate(all_snippets):
+                    with logfire.span(
+                        "codemode.execute_snippet",
+                        index=i,
+                        kind=kind,
+                        description=snippet.description,
+                    ):
+                        logfire.info(
+                            "Executing snippet {index}/{total} [{kind}]: {description}",
+                            index=i + 1,
+                            total=total,
+                            kind=kind,
+                            description=snippet.description,
+                            code=snippet.code,
+                        )
+
+                        exec_result = session.feed(snippet.code)
+
+                        sr = SnippetResult.from_execution(snippet, exec_result)
+                        results.append(sr)
+
+                        logfire.info(
+                            "Snippet result: success={success}, output={output}, "
+                            "duration={duration_ms:.2f}ms",
+                            success=sr.success,
+                            output=repr(sr.output)[:200],
+                            duration_ms=sr.duration_ms,
+                            error=sr.error,
+                            error_type=sr.error_type,
+                        )
+
+            # Summary log
+            successes = sum(1 for r in results if r.success)
+            failures = len(results) - successes
+            logfire.info(
+                "Exploration complete: {successes} succeeded, {failures} raised errors",
+                successes=successes,
+                failures=failures,
+            )
+
+            return results
+
+    async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
+        """Ask the LLM for exploration snippets."""
+        with logfire.span("codemode.llm_explore", func_name=func.name):
+            prompt = f"""Explore the following function by writing test snippets:
+
+<FunctionName>{func.name}</FunctionName>
+<FunctionCode>
+{func.code}
+</FunctionCode>
+<Description>{func.description}</Description>
+
+Write diverse snippets that call {func.name}(...) to discover the function's
+behaviour across all important scenarios.  Use the real function name
+`{func.name}` — the implementation will be prepended automatically."""
+
+            result = await self.explorer_agent.run(prompt)
+            plan = result.output
+
+            logfire.info(
+                "LLM produced {normal} normal + {error} error snippets",
+                normal=len(plan.snippets),
+                error=len(plan.error_snippets),
+                snippets=[s.description for s in plan.snippets],
+                error_snippets=[s.description for s in plan.error_snippets],
+            )
+            return plan
+
+    # -- Phase 2: Spec Generation ------------------------------------------
+
+    async def generate_spec(
+        self,
+        func: Function,
+        exploration_results: list[SnippetResult],
+        failure_context: str | None = None,
+    ) -> str:
+        """Phase 2: Generate YAML spec using verified exploration results.
+
+        Parameters
+        ----------
+        failure_context:
+            When provided (on refinement rounds), appended to the prompt so
+            the LLM can fix specific failures from the previous attempt.
+        """
+        with logfire.span(
+            "codemode.generate_spec",
+            func_name=func.name,
+            n_results=len(exploration_results),
+            is_refinement=failure_context is not None,
+        ):
+            # Build exploration context for the prompt
+            success_results = [r for r in exploration_results if r.success]
+            error_results = [r for r in exploration_results if not r.success]
+
+            success_context = (
+                "\n\n".join(r.to_context_block() for r in success_results)
+                if success_results
+                else "(none)"
+            )
+            error_context = (
+                "\n\n".join(r.to_context_block() for r in error_results)
+                if error_results
+                else "(none)"
+            )
+
+            refinement_block = ""
+            if failure_context:
+                refinement_block = f"""
+
+⚠️ PREVIOUS ATTEMPT FAILED — fix these issues:
+{failure_context}
+
+Regenerate the YAML spec addressing every failure above.  Keep all
+passing cases intact — only fix the broken ones."""
+
+            prompt = f"""Generate vowel evals YAML spec for `{func.name}`:
+
+<PythonImpl>
+{func.code}
+</PythonImpl>
+
+<Docstring>{func.description}</Docstring>
+
+<VerifiedExecutionResults>
+The following results are from ACTUALLY RUNNING the function — use these
+exact outputs as expected values:
+
+{success_context}
+</VerifiedExecutionResults>
+
+<ErrorResults count="{len(error_results)}">
+These inputs RAISED exceptions when run against the real function.
+Each one MUST become a `raises:` case in the spec — no exceptions.
+
+{error_context}
+</ErrorResults>
+
+REQUIREMENTS:
+- Use {func.name} as eval_id.
+- Generate at least {max(len(exploration_results), 5)} diverse test cases.
+- Use the EXACT outputs from the execution results above.
+- You MUST generate exactly {len(error_results)} raises cases — one for
+  each RAISED result above.  The spec is invalid without them.
+- Cover normal, edge, and error cases.
+- In assertions, use `input` (NOT `inputs`) for accessing input values.
+{refinement_block}"""
+
+            logfire.info(
+                "Sending spec generation prompt",
+                func_name=func.name,
+                success_results=len(success_results),
+                error_results=len(error_results),
+            )
+
+            result = await self.spec_agent.run(prompt)
+            yaml_spec = result.output.yaml_spec
+
+            # Sanitize: strip YAML tags that safe_load rejects
+            import re
+
+            yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec)
+            yaml_spec = re.sub(r"!!binary\b", "", yaml_spec)
+
+            # Validate YAML syntax
+            yaml.safe_load(yaml_spec)
+
+            # Validate and auto-fix
+            validation = validate_and_fix_spec(
+                yaml_spec,
+                function_code=func.code,
+            )
+            if validation.has_warnings:
+                logfire.info(
+                    "Spec validation applied fixes",
+                    summary=validation.summary(),
+                )
+            final_spec = validation.fixed_yaml if validation.was_modified else yaml_spec
+
+            # Executor-based validation: fix expected values against real execution
+            final_spec = validate_expected_values(final_spec, func, self.executor)
+
+            # Inject missing error cases from exploration
+            error_snippet_dicts = [
+                {
+                    "code": r.code,
+                    "error_type": r.error_type,
+                    "error": r.error,
+                    "description": r.description,
+                }
+                for r in exploration_results
+                if not r.success and r.error_type
+            ]
+            final_spec = inject_missing_error_cases(final_spec, func.name, error_snippet_dicts)
+
+            logfire.info(
+                "YAML spec generated",
+                func_name=func.name,
+                spec_length=len(final_spec),
+                spec_preview=final_spec[:500],
+            )
+
+            return final_spec
+
+    # -- Helpers -----------------------------------------------------------
+
+    @staticmethod
+    def _build_failure_context(summary: EvalSummary) -> str:
+        """Build a concise failure report to inject into the retry prompt."""
+        return build_failure_context(summary)
+
+    def _inject_durations(
+        self,
+        yaml_spec: str,
+        func: Function,
+        *,
+        buffer_pct: float = 0.5,
+        floor_ms: float = 10.0,
+    ) -> str:
+        """Add per-case ``duration`` fields based on actual execution times."""
+        return inject_durations(
+            yaml_spec,
+            func,
+            self.executor,
+            buffer_pct=buffer_pct,
+            floor_ms=floor_ms,
+        )
+
+    @staticmethod
+    def _build_call_code(func_name: str, case: dict) -> str | None:
+        """Build a ``func(args...)`` call string from a case dict."""
+        return build_call_code(func_name, case)
+
+    # -- Full pipeline -----------------------------------------------------
+
+    async def generate(
+        self,
+        func: Function,
+        *,
+        run_evals: bool = True,
+        save_to_file: bool = False,
+        max_refinement_rounds: int = 2,
+        min_coverage: float = 1.0,
+        inject_durations: bool = True,
+    ) -> CodeModeResult:
+        """Run the full CodeMode pipeline with post-generation validation.
+
+        Pipeline::
+
+            Phase 1: explore()                        (once)
+            Phase 2: generate_spec()                  (may loop)
+            Phase 3: validate via RunEvals            (per attempt)
+            Phase 4: refine on failure                (up to N rounds)
+            Phase 5: inject_durations()               (once, at end)
+
+        Exploration (Phase 1) runs once — the ground-truth snippet results
+        don't change.  Only spec generation (Phase 2) is re-run on failure,
+        with a failure report injected into the prompt.
+
+        Parameters
+        ----------
+        func:
+            The function to generate evals for.
+        run_evals:
+            Whether to run the generated evals and include the summary.
+        save_to_file:
+            Whether to save the YAML spec (UTF-8) to ``{func.name}_evals.yml``.
+        max_refinement_rounds:
+            Maximum number of spec-regeneration attempts after the initial
+            generation (0 = single attempt, no refinement).
+        min_coverage:
+            Target pass-rate in 0.0–1.0 (default 1.0 = 100 %).  The loop
+            exits early when coverage meets or exceeds this threshold.
+        inject_durations:
+            Whether to measure and inject per-case ``duration`` fields
+            into the final YAML spec.
+
+        Returns
+        -------
+        CodeModeResult
+            Contains exploration results, YAML spec, summary, and
+            the number of refinement rounds used.
+        """
+        with logfire.span(
+            "codemode.pipeline",
+            func_name=func.name,
+            model=self.model,
+            executor=type(self.executor).__name__,
+        ):
+            t0 = time.perf_counter()
+
+            # Phase 1 — explore (once)
+            exploration_results = await self.explore(func)
+
+            # Phase 2–4 — generate spec + validate + refine
+            yaml_spec = ""
+            summary: EvalSummary | None = None
+            refinement_rounds = 0
+            failure_context: str | None = None
+            total_attempts = (max_refinement_rounds + 1) if run_evals else 1
+
+            for attempt in range(total_attempts):
+                with logfire.span(
+                    "codemode.spec_attempt",
+                    attempt=attempt + 1,
+                    is_refinement=attempt > 0,
+                ):
+                    try:
+                        yaml_spec = await self.generate_spec(
+                            func,
+                            exploration_results,
+                            failure_context,
+                        )
+                    except Exception as gen_exc:
+                        logfire.warn(
+                            "Spec generation failed on attempt {attempt}, retrying",
+                            attempt=attempt + 1,
+                            error=str(gen_exc),
+                        )
+                        failure_context = f"Generation error: {gen_exc}"
+                        refinement_rounds = attempt + 1
+                        continue
+
+                    if not run_evals:
+                        break
+
+                    # Validate: run evals with ignore_duration=True
+                    try:
+                        runner = (
+                            RunEvals.from_source(yaml_spec)
+                            .with_functions({func.name: func.impl})
+                            .ignore_duration()
+                        )
+                        summary = runner.run()
+
+                        logfire.info(
+                            "Attempt {attempt}: {passed}/{total} passed, coverage={coverage:.1f}%",
+                            attempt=attempt + 1,
+                            passed=summary.success_count,
+                            total=summary.total_count,
+                            failed=summary.failed_count,
+                            errors=summary.error_count,
+                            coverage=summary.coverage * 100,
+                        )
+
+                        if summary.coverage >= min_coverage:
+                            break
+
+                        # Build failure context for next attempt
+                        failure_context = self._build_failure_context(summary)
+                        refinement_rounds = attempt + 1
+                        logfire.warn(
+                            "Coverage {coverage:.0f}% below target {target:.0f}%, refining",
+                            coverage=summary.coverage * 100,
+                            target=min_coverage * 100,
+                            attempt=attempt + 1,
+                        )
+
+                    except Exception as exc:
+                        logfire.warn(
+                            "Failed to run evals on attempt {attempt}, retrying",
+                            attempt=attempt + 1,
+                            func_name=func.name,
+                            error=str(exc),
+                        )
+                        failure_context = f"Eval run error: {exc}"
+                        refinement_rounds = attempt + 1
+                        continue
+
+            # Phase 5 — inject per-case durations (skipped when no spec was ever produced)
+            if inject_durations and yaml_spec:
+                with logfire.span("codemode.inject_durations", func_name=func.name):
+                    yaml_spec = self._inject_durations(yaml_spec, func)
+
+            # Final summary run (with durations now present, but still ignored)
+            if run_evals and summary is not None:
+                try:
+                    final_runner = (
+                        RunEvals.from_source(yaml_spec)
+                        .with_functions({func.name: func.impl})
+                        .ignore_duration()
+                    )
+                    summary = final_runner.run()
+                except Exception as exc:  # noqa: BLE001
+                    logfire.warn("Final eval re-run failed; keeping last summary", error=str(exc))
+
+            if save_to_file:
+                path = f"{func.name}_evals.yml"
+                with open(path, "w", encoding="utf-8") as f:
+                    f.write(yaml_spec)
+                logfire.info("Saved spec to {path}", path=path)
+
+            elapsed = (time.perf_counter() - t0) * 1000
+            logfire.info(
+                "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})",
+                elapsed=elapsed,
+                func_name=func.name,
+                exploration_count=len(exploration_results),
+                refinement_rounds=refinement_rounds,
+                has_summary=summary is not None,
+            )
+
+            return CodeModeResult(
+                exploration_results=exploration_results,
+                yaml_spec=yaml_spec,
+                summary=summary,
+                refinement_rounds=refinement_rounds,
+            )
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
index a19c2e7..241a71d 100644
--- a/src/vowel/eval_types.py
+++ b/src/vowel/eval_types.py
@@ -20,15 +20,13 @@
     EvalsFile: Root model for YAML file parsing
 """
 
-import logging
+import logfire
 import os
 from typing import Any, Literal
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic.experimental.missing_sentinel import MISSING
 
-logger = logging.getLogger(__name__)
-
 
 # =============================================================================
 # LLM Output Models
@@ -784,6 +782,6 @@ def get_evals(self) -> dict[str, Evals]:
                 try:
                     result[key] = Evals(id=key, **value)
                 except Exception as e:
-                    logger.warning(f"Failed to process eval '{key}': {e}")
+                    logfire.warn("Failed to process eval '{key}': {error}", key=key, error=str(e))
 
         return result
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index 6065042..f9fc79b 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -17,7 +17,6 @@
 """
 
 import importlib.util
-import logging
 import os
 import re
 import typing
@@ -29,8 +28,6 @@
 from pydantic_ai.settings import ModelSettings
 from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext, LLMJudge
 
-logger = logging.getLogger(__name__)
-
 MONTY_AVAILABLE = bool(importlib.util.find_spec("pydantic-monty"))
 
 
diff --git a/src/vowel/executor.py b/src/vowel/executor.py
new file mode 100644
index 0000000..9de81e1
--- /dev/null
+++ b/src/vowel/executor.py
@@ -0,0 +1,962 @@
+"""Code execution backends for CodeMode eval generation.
+
+CodeMode allows the eval generation agent to *run* code inside a sandbox
+rather than guessing expected values.  This produces ground-truth outputs
+and lets the agent empirically explore function behaviour (edge cases,
+exception boundaries, return types) before writing test cases.
+
+Architecture
+------------
+``Executor`` is a runtime Protocol — any object that implements ``execute()``,
+``execute_sync()`` and ``create_session()`` qualifies.  Two are provided:
+
+* ``MontyExecutor``   — uses ``pydantic-monty`` (Rust-based sandbox, <0.1 ms
+                        startup, no filesystem/network access).  **Recommended
+                        for production and the optimization loop.**
+* ``DefaultExecutor`` — uses Python's built-in ``exec()`` with stdout capture.
+                        No sandboxing.  Safe only for trusted, local code;
+                        useful during development when Monty is not installed.
+
+The ``execute()`` method accepts two orthogonal injection mechanisms that
+mirror Monty's native API:
+
+* ``inputs``             — ``dict[str, Any]`` of *values* injected as
+                           top-level variables visible to the snippet.
+* ``external_functions`` — ``dict[str, Callable]`` of *host-side callbacks*
+                           the snippet can call by name.  In the Monty
+                           backend each call exits the sandbox, runs on
+                           the host, and returns the result.
+
+Session API
+-----------
+For batch exploration (e.g. CodeMode), use ``create_session()`` to compile
+the function source **once**, then ``feed()`` each snippet against the
+preserved runtime state.
+
+* ``MontyReplSession``   — backed by ``MontyRepl``: zero re-parse overhead
+                           per snippet, heap/globals preserved across feeds.
+* ``DefaultSession``     — backed by a persistent ``exec()`` namespace.
+
+Usage examples
+--------------
+**External functions** — inject one or more real functions::
+
+    await executor.execute(
+        '''
+        results = []
+        results.append(target_func([1, 3, 5, 7, 9], 5))
+        results.append(target_func([], 1))
+        results
+        ''',
+        external_functions={"target_func": binary_search},
+    )
+
+**Inputs** — inject plain values::
+
+    await executor.execute(
+        "x + y",
+        inputs={"x": 10, "y": 20},
+    )
+
+**Session** — compile once, feed many snippets::
+
+    with executor.create_session(func_code) as session:
+        r1 = session.feed("binary_search([1,3,5], 3)")
+        r2 = session.feed("binary_search([], 1)")
+
+The value of the last expression becomes ``ExecutionResult.output``.
+"""
+
+from __future__ import annotations
+
+import ast
+import asyncio
+import contextlib
+import importlib.util
+import io
+import time
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any, Literal, Protocol, runtime_checkable
+
+NEST_AVAILABLE = importlib.util.find_spec("nest_asyncio") is not None
+MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def run_sync(coro: Any) -> Any:
+    """Run a coroutine synchronously, even inside a running event loop.
+
+    Uses ``asyncio.run()`` when no loop is running (clean, no patching).
+    If a loop is already running in this thread (e.g. Jupyter, async
+    framework), falls back to ``nest_asyncio`` + ``run_until_complete()``.
+    """
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:
+        # No loop running here.  Probing up front (instead of matching the
+        # message of a RuntimeError caught around asyncio.run) lets errors
+        # raised by *coro* itself propagate untouched.
+        return asyncio.run(coro)
+
+    # Already inside an event loop — it must be patched to be re-entrant
+    if not NEST_AVAILABLE:
+        raise RuntimeError(
+            "execute_sync() was called from inside a running event loop. "
+            "Install nest-asyncio to support this: pip install nest-asyncio"
+        )
+
+    import nest_asyncio
+
+    nest_asyncio.apply(loop)
+    return loop.run_until_complete(coro)
+
+
+# ---------------------------------------------------------------------------
+# Result type
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ExecutionResult:
+    """Result of running a code snippet through an executor.
+
+    Attributes
+    ----------
+    output:
+        The value of the last expression evaluated in the snippet, or the
+        value assigned to ``__result__`` in the namespace.  ``None`` when
+        execution fails or the snippet yields no value.
+    stdout:
+        Everything written to stdout during execution (via ``print()``).
+    success:
+        ``True`` when the snippet completed without raising an exception.
+    error:
+        Human-readable error message when ``success is False``.
+    error_type:
+        The Python exception class name (e.g. ``"ValueError"``) when
+        ``success is False``.
+    duration_ms:
+        Wall-clock time spent executing the snippet, in milliseconds.
+    """
+
+    output: Any
+    stdout: str
+    success: bool
+    error: str | None = None
+    error_type: str | None = None
+    duration_ms: float = 0.0
+
+
+# ---------------------------------------------------------------------------
+# Protocol
+# ---------------------------------------------------------------------------
+
+
+@runtime_checkable
+class Executor(Protocol):
+    """Protocol for code execution backends used by CodeMode.
+
+    Any object providing ``execute``, ``execute_sync`` and ``create_session``
+    qualifies — concrete classes do *not* need to inherit from ``Executor``.
+
+    Parameters
+    ----------
+    code:
+        Python source code to execute.
+    inputs:
+        ``dict[str, Any]`` of values injected as top-level variables
+        visible to the snippet.  For example ``{"x": 42}`` makes ``x``
+        available inside the code.
+    external_functions:
+        ``dict[str, Callable]`` of host-side callbacks the snippet can
+        call by name.  In the Monty backend each call exits the sandbox,
+        runs on the host, and returns the result — so the real function
+        can use any library.
+    timeout:
+        Maximum wall-clock seconds allowed for the snippet.  Forwarded to
+        Monty as a resource limit; **not enforced** by ``DefaultExecutor``.
+    max_memory:
+        Maximum heap memory in bytes available to the sandbox.  Ignored by
+        ``DefaultExecutor`` which has no memory isolation.
+
+    Returns
+    -------
+    ExecutionResult
+    """
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        raise NotImplementedError
+
+    def execute_sync(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        raise NotImplementedError
+
+    def create_session(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionSession:
+        raise NotImplementedError
+
+
+# ---------------------------------------------------------------------------
+# ExecutionSession — compile once, feed many snippets
+# ---------------------------------------------------------------------------
+
+
+@runtime_checkable
+class ExecutionSession(Protocol):
+    """A reusable execution context with pre-compiled setup code.
+
+    The session compiles the *setup_code* (typically a function definition)
+    once, then each ``feed()`` call runs a snippet against the preserved
+    runtime state without re-parsing the setup code.
+
+    This is the key optimisation for CodeMode exploration: when testing
+    N edge-case snippets against the same function, the function is parsed
+    and compiled only once instead of N times.
+
+    The session is a synchronous context manager — use a plain ``with``
+    block (``async with`` is not implemented) to ensure proper cleanup.
+    """
+
+    def feed(self, code: str) -> ExecutionResult:
+        """Execute *code* against the session's pre-compiled state.
+
+        Returns an ``ExecutionResult`` with the last expression value,
+        stdout, and error info (if any).
+        """
+        raise NotImplementedError
+
+    def close(self) -> None:
+        """Release resources held by the session."""
+        raise NotImplementedError
+
+    def __enter__(self) -> ExecutionSession:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# MontyReplSession — sandboxed session using MontyRepl
+# ---------------------------------------------------------------------------
+
+
+class MontyReplSession:
+    """Session backed by ``MontyRepl`` — compile once, feed many snippets.
+
+    On construction the *setup_code* is parsed, compiled and executed once
+    via ``MontyRepl.create()``.  Each subsequent ``feed()`` call runs a
+    snippet against the preserved heap/globals without re-parsing the setup
+    code.
+
+    This is the recommended path for CodeMode exploration with Monty.  For
+    a function with N edge-case snippets, the function source is compiled
+    only once.
+    """
+
+    def __init__(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> None:
+        import pydantic_monty
+
+        self._pydantic_monty = pydantic_monty
+        self._limits = pydantic_monty.ResourceLimits(
+            max_duration_secs=timeout,
+            max_memory=max_memory,
+        )
+
+        stdout_lines: list[str] = []
+
+        def _print_callback(_stream: str, text: str) -> None:
+            stdout_lines.append(text)
+
+        # Compile + execute setup code (function definitions, imports, etc.)
+        self._repl, _init_output = pydantic_monty.MontyRepl.create(
+            setup_code,
+            limits=self._limits,
+            print_callback=_print_callback,
+        )
+        self._setup_stdout = "\n".join(stdout_lines)
+
+    def feed(self, code: str) -> ExecutionResult:
+        """Feed *code* to the REPL; errors come back as failed results."""
+        stdout_lines: list[str] = []
+
+        def _print_callback(_stream: str, text: str) -> None:
+            stdout_lines.append(text)
+
+        t0 = time.perf_counter()
+        try:
+            if not self._repl:
+                # TODO: wrap with custom exception and detailed message
+                raise ValueError("Repl not found.")
+            else:
+                output = self._repl.feed(code, print_callback=_print_callback)
+                duration_ms = (time.perf_counter() - t0) * 1000
+                return ExecutionResult(
+                    output=output,
+                    stdout="\n".join(stdout_lines),
+                    success=True,
+                    duration_ms=duration_ms,
+                )
+
+        except self._pydantic_monty.MontyRuntimeError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            inner = exc.exception()
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=exc.display(format="type-msg"),
+                error_type=type(inner).__name__,
+                duration_ms=duration_ms,
+            )
+
+        except self._pydantic_monty.MontySyntaxError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(exc),
+                error_type="SyntaxError",
+                duration_ms=duration_ms,
+            )
+
+        except Exception as exc:  # noqa: BLE001
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=str(exc),
+                error_type=type(exc).__name__,
+                duration_ms=duration_ms,
+            )
+
+    def close(self) -> None:
+        """Drop the REPL reference; later feeds return a failed result."""
+        self._repl = None  # type: ignore[assignment]
+
+    def __enter__(self) -> MontyReplSession:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# FallbackSession — Monty with auto-fallback to DefaultSession
+# ---------------------------------------------------------------------------
+
+import logfire as _logfire
+
+
+class FallbackSession:
+    """Session that tries MontyReplSession first, falls back to DefaultSession.
+
+    Two fallback modes:
+
+    1. **Session-level**: If ``MontyReplSession.__init__`` raises (e.g.
+       ``MontySyntaxError`` for unsupported syntax like f-string ``!r``),
+       the entire session transparently switches to ``DefaultSession``.
+
+    2. **Snippet-level**: If a ``feed()`` call returns a
+       ``ModuleNotFoundError`` (Monty doesn't have the module), that single
+       snippet is re-run **unsandboxed** in a ``DefaultSession`` built from
+       the setup code only (no Monty state); later feeds stay on Monty.
+    """
+
+    def __init__(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> None:
+        self._setup_code = setup_code
+        self._timeout = timeout
+        self._max_memory = max_memory
+        self._monty_session: MontyReplSession | None = None
+        self._default_session: DefaultSession | None = None
+        self._monty_failed_permanently = False
+
+        try:
+            self._monty_session = MontyReplSession(
+                setup_code,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        except Exception as exc:
+            _logfire.info(
+                "Monty session creation failed ({exc_type}: {exc_msg}), falling back to DefaultSession",
+                exc_type=type(exc).__name__,
+                exc_msg=str(exc),
+            )
+            self._monty_failed_permanently = True
+            self._default_session = DefaultSession(
+                setup_code,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+
+    def _get_default_session(self) -> DefaultSession:
+        """Lazily create the DefaultSession (only when first needed)."""
+        if self._default_session is None:
+            self._default_session = DefaultSession(
+                self._setup_code,
+                timeout=self._timeout,
+                max_memory=self._max_memory,
+            )
+        return self._default_session
+
+    def feed(self, code: str) -> ExecutionResult:
+        """Execute *code*, falling back to DefaultSession on Monty gaps."""
+        # Session-level fallback — Monty never worked
+        if self._monty_failed_permanently:
+            return self._get_default_session().feed(code)
+
+        assert self._monty_session is not None
+        result = self._monty_session.feed(code)
+
+        # Snippet-level fallback — ModuleNotFoundError means Monty
+        # doesn't have that stdlib module; retry with DefaultSession.
+        if not result.success and result.error_type == "ModuleNotFoundError":
+            _logfire.info(
+                "Monty ModuleNotFoundError, retrying snippet with DefaultSession: {error}",
+                error=result.error,
+            )
+            return self._get_default_session().feed(code)
+
+        return result
+
+    def close(self) -> None:
+        if self._monty_session is not None:
+            self._monty_session.close()
+        if self._default_session is not None:
+            self._default_session.close()
+
+    def __enter__(self) -> FallbackSession:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# MontyExecutor — sandboxed, production-grade
+# ---------------------------------------------------------------------------
+
+
+class MontyExecutor:
+    """Sandboxed executor backed by ``pydantic-monty`` (Rust interpreter).
+
+    Monty provides strict isolation: no filesystem access, no network, no
+    environment variables.  External functions are injected as host-side
+    callbacks — they run on the *host* Python process with full access to
+    stdlib and third-party libraries.
+
+    Uses ``pydantic_monty.run_monty_async`` which implements Monty's step
+    protocol (``start()`` → ``MontySnapshot`` → ``resume()``) with proper
+    async support.  External functions can be sync or async — Monty handles
+    both transparently.  The GIL is released during execution and Monty
+    steps are offloaded to a thread pool.
+
+    Requires the ``pydantic-monty`` package::
+
+        pip install "vowel[monty]"   # or: pip install pydantic-monty
+
+    Raises
+    ------
+    ImportError
+        If ``pydantic-monty`` is not installed.
+    """
+
+    def __init__(self) -> None:
+        if not MONTY_AVAILABLE:
+            raise ImportError(
+                'MontyExecutor requires pydantic-monty. Install it with: pip install "vowel[monty]"'
+            )
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Execute *code* inside the Monty sandbox.
+
+        Delegates to ``pydantic_monty.run_monty_async`` which handles the
+        full step protocol (``start()`` → snapshot → ``resume()``).
+
+        ``NameLookupSnapshot`` (undefined variables) is not handled by
+        ``run_monty_async`` — it raises ``AssertionError``.  We catch that
+        and parse the assertion message to report it as a ``NameError``.
+
+        Parameters
+        ----------
+        code:
+            Python source to run.
+        inputs:
+            Values injected as top-level variables (Monty ``inputs``).
+        external_functions:
+            Host-side callbacks the snippet can call by name.
+        timeout / max_memory:
+            Resource limits forwarded to Monty.
+        """
+        import pydantic_monty
+
+        stdout_lines: list[str] = []
+
+        def _print_callback(_stream: str, text: str) -> None:
+            stdout_lines.append(text)
+
+        input_names = list(inputs) if inputs else None
+
+        limits = pydantic_monty.ResourceLimits(
+            max_duration_secs=timeout,
+            max_memory=max_memory,
+        )
+
+        t0 = time.perf_counter()
+        try:
+            m = pydantic_monty.Monty(
+                code,
+                inputs=input_names,
+            )
+            output = await pydantic_monty.run_monty_async(
+                m,
+                inputs=inputs,
+                limits=limits,
+                external_functions=external_functions,
+                print_callback=_print_callback,
+            )
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=output,
+                stdout="\n".join(stdout_lines),
+                success=True,
+                duration_ms=duration_ms,
+            )
+
+        except pydantic_monty.MontyRuntimeError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            inner = exc.exception()
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=exc.display(format="type-msg"),
+                error_type=type(inner).__name__,
+                duration_ms=duration_ms,
+            )
+
+        except pydantic_monty.MontySyntaxError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(exc),
+                error_type="SyntaxError",
+                duration_ms=duration_ms,
+            )
+
+        except AssertionError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            # run_monty_async doesn't handle NameLookupSnapshot — it hits
+            # `assert isinstance(progress, FutureSnapshot)` and the repr
+            # of the snapshot is embedded in the assertion message.
+            exc_msg = str(exc)
+            if "NameLookupSnapshot" in exc_msg:
+                marker = 'variable_name="'
+                start = exc_msg.find(marker)
+                if start != -1:
+                    start += len(marker)
+                    end = exc_msg.find('"', start)
+                    var = exc_msg[start:end]
+                    error = f"name '{var}' is not defined"
+                else:
+                    error = "name is not defined"
+                return ExecutionResult(
+                    output=None,
+                    stdout="\n".join(stdout_lines),
+                    success=False,
+                    error=error,
+                    error_type="NameError",
+                    duration_ms=duration_ms,
+                )
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=exc_msg,
+                error_type=type(exc).__name__,
+                duration_ms=duration_ms,
+            )
+
+        except Exception as exc:  # noqa: BLE001 — catch-all for unexpected errors
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=str(exc),
+                error_type=type(exc).__name__,
+                duration_ms=duration_ms,
+            )
+
+    def execute_sync(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Synchronous wrapper around :meth:`execute`."""
+        return run_sync(
+            self.execute(
+                code,
+                inputs=inputs,
+                external_functions=external_functions,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        )
+
+    def create_session(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> FallbackSession:
+        """Create a session that uses Monty with auto-fallback to DefaultSession.
+
+        The *setup_code* (typically a function definition) is compiled and
+        executed **once**.  If Monty cannot handle the code (e.g. unsupported
+        syntax), the session transparently falls back to ``DefaultSession``.
+        Individual ``feed()`` calls also fall back on ``ModuleNotFoundError``.
+        """
+        return FallbackSession(
+            setup_code,
+            timeout=timeout,
+            max_memory=max_memory,
+        )
+
+
+# ---------------------------------------------------------------------------
+# DefaultSession — unsandboxed session using persistent namespace
+# ---------------------------------------------------------------------------
+
+
+def _rewrite_last_expr(code: str) -> tuple[Any, bool]:
+    """Compile *code*, capturing a trailing expression in ``__result__``.
+
+    Returns ``(compiled_code, has_result)``; *has_result* is True when the
+    last statement was a bare expression and was rewritten into an
+    assignment to ``__result__``.  Raises ``SyntaxError`` for invalid code.
+    """
+    tree = ast.parse(code, "<vowel-session>", "exec")
+    has_result = False
+    if tree.body and isinstance(tree.body[-1], ast.Expr):
+        last_expr = tree.body.pop()
+        assign = ast.Assign(
+            targets=[ast.Name(id="__result__", ctx=ast.Store())],
+            value=last_expr.value,  # type: ignore[attr-defined]
+        )
+        ast.copy_location(assign, last_expr)  # (new_node, old_node): keep line info
+        tree.body.append(assign)
+        ast.fix_missing_locations(tree)
+        has_result = True
+    return compile(tree, "<vowel-session>", "exec"), has_result
+
+
+class DefaultSession:
+    """Session backed by a persistent ``exec()`` namespace.
+
+    The *setup_code* is executed once into a namespace dict on construction.
+    Each ``feed()`` call executes a snippet in the **same** namespace, so
+    functions and variables defined in the setup remain available.
+
+    This mirrors ``MontyReplSession`` semantics where Monty is unavailable;
+    ``timeout`` / ``max_memory`` are accepted but **not enforced** here.
+    """
+
+    def __init__(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> None:
+        self._namespace: dict[str, Any] = {}
+        self._timeout = timeout
+        # Execute setup code to define functions/variables
+        exec(compile(setup_code, "<vowel-session-setup>", "exec"), self._namespace)  # noqa: S102
+
+    def feed(self, code: str) -> ExecutionResult:
+        """Execute *code* against the session's persistent namespace."""
+        # Remove any leftover __result__ from previous feed
+        self._namespace.pop("__result__", None)
+
+        try:
+            compiled, _has_result = _rewrite_last_expr(code)
+        except SyntaxError as exc:
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(exc),
+                error_type="SyntaxError",
+                duration_ms=0.0,
+            )
+
+        stdout_buf = io.StringIO()
+        t0 = time.perf_counter()
+        try:
+            with contextlib.redirect_stdout(stdout_buf):
+                exec(compiled, self._namespace)  # noqa: S102
+            duration_ms = (time.perf_counter() - t0) * 1000
+            output = self._namespace.get("__result__")
+            return ExecutionResult(
+                output=output,
+                stdout=stdout_buf.getvalue(),
+                success=True,
+                duration_ms=duration_ms,
+            )
+
+        except Exception as exc:  # noqa: BLE001
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout=stdout_buf.getvalue(),
+                success=False,
+                error=str(exc),
+                error_type=type(exc).__name__,
+                duration_ms=duration_ms,
+            )
+
+    def close(self) -> None:
+        """Clear the namespace."""
+        self._namespace.clear()
+
+    def __enter__(self) -> DefaultSession:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# DefaultExecutor — exec()-based, no sandbox
+# ---------------------------------------------------------------------------
+
+
+class DefaultExecutor:
+    """Unsandboxed executor backed by Python's built-in ``exec()``.
+
+    **WARNING: runs code with full host privileges.**  Only suitable for
+    development, local testing, or environments where the code being executed
+    is fully trusted.
+
+    Both ``inputs`` and ``external_functions`` are merged into the execution
+    namespace so the snippet can reference them as plain names.  The value
+    of the snippet's trailing expression (or, failing that, whatever the
+    snippet assigned to ``__result__``) is returned as ``output``.
+
+    No additional dependencies required — works with plain Python.
+
+    Notes
+    -----
+    * ``timeout`` and ``max_memory`` parameters are accepted for API
+      compatibility but are **not enforced**.
+    * Stdout is captured via ``contextlib.redirect_stdout``.
+    """
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Execute *code* using ``exec()`` — no sandbox, no isolation.
+
+        To match Monty's behaviour, the value of the *last expression* in
+        the snippet is captured automatically using ``ast`` rewriting.  An
+        explicit ``__result__`` is only used when no such expression exists.
+        """
+        namespace: dict[str, Any] = {}
+        if inputs:
+            namespace.update(inputs)
+        if external_functions:
+            namespace.update(external_functions)
+
+        # Rewrite the last expression statement to capture its value.
+        try:
+            tree = ast.parse(code, "<vowel-codemode>", "exec")
+            if tree.body and isinstance(tree.body[-1], ast.Expr):
+                last_expr = tree.body.pop()
+                assign = ast.Assign(
+                    targets=[ast.Name(id="__result__", ctx=ast.Store())],
+                    value=last_expr.value,  # type: ignore[attr-defined]
+                )
+                ast.copy_location(assign, last_expr)  # (new_node, old_node)
+                tree.body.append(assign)
+                ast.fix_missing_locations(tree)
+            compiled = compile(tree, "<vowel-codemode>", "exec")
+        except SyntaxError as exc:
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(exc),
+                error_type="SyntaxError",
+                duration_ms=0.0,
+            )
+
+        stdout_buf = io.StringIO()
+        t0 = time.perf_counter()
+        try:
+            with contextlib.redirect_stdout(stdout_buf):
+                exec(compiled, namespace)  # noqa: S102
+            duration_ms = (time.perf_counter() - t0) * 1000
+
+            output = namespace.get("__result__")
+
+            return ExecutionResult(
+                output=output,
+                stdout=stdout_buf.getvalue(),
+                success=True,
+                duration_ms=duration_ms,
+            )
+
+        except Exception as exc:  # noqa: BLE001
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout=stdout_buf.getvalue(),
+                success=False,
+                error=str(exc),
+                error_type=type(exc).__name__,
+                duration_ms=duration_ms,
+            )
+
+    def execute_sync(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Synchronous wrapper around :meth:`execute`."""
+        return run_sync(
+            self.execute(
+                code,
+                inputs=inputs,
+                external_functions=external_functions,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        )
+
+    def create_session(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> DefaultSession:
+        """Create an unsandboxed session with a persistent namespace.
+
+        The *setup_code* is executed once into a shared namespace dict.
+        Each ``session.feed(snippet)`` call runs in the same namespace,
+        preserving functions and variables across calls.
+        """
+        return DefaultSession(
+            setup_code,
+            timeout=timeout,
+            max_memory=max_memory,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Factory
+# ---------------------------------------------------------------------------
+
+
+def get_executor(backend: Literal["monty", "auto", "default"] = "auto") -> Executor:
+    """Return a configured executor instance.
+
+    Parameters
+    ----------
+    backend:
+        ``"monty"``   — always use ``MontyExecutor`` (``ImportError`` if missing).
+        ``"default"`` — always use ``DefaultExecutor``.
+        ``"auto"``    — use ``MontyExecutor`` when available, fall back to
+                        ``DefaultExecutor`` with a warning.
+
+    Returns
+    -------
+    Executor
+        A ready-to-use executor; unknown *backend* raises ``ValueError``.
+    """
+    if backend == "monty":
+        return MontyExecutor()
+
+    if backend == "default":
+        return DefaultExecutor()
+
+    if backend == "auto":
+        if MONTY_AVAILABLE:
+            return MontyExecutor()
+        import warnings
+
+        warnings.warn(
+            "pydantic-monty not installed; falling back to DefaultExecutor "
+            '(no sandboxing). Install with: pip install "vowel[monty]"',
+            stacklevel=2,
+        )
+        return DefaultExecutor()
+
+    raise ValueError(
+        f"Unknown executor backend: {backend!r}. Choose 'monty', 'default', or 'auto'."
+    )
diff --git a/src/vowel/runner.py b/src/vowel/runner.py
index a69c3e5..4018fcd 100644
--- a/src/vowel/runner.py
+++ b/src/vowel/runner.py
@@ -97,6 +97,7 @@ def execute(self) -> None:
         local_scope: dict[str, object] = {}
         try:
             code = self.code
+            code = self._sanitize_code(code)
             try:
                 exec(code, local_scope, local_scope)
             except Exception:
@@ -105,12 +106,62 @@ def execute(self) -> None:
                     exec(code, local_scope, local_scope)
                 else:
                     raise
+            self.code = code  # persist cleaned code for downstream use
 
         except Exception as e:
             raise RuntimeError(f"Error executing code for function '{self.name}'.") from e
 
         self.func = local_scope[self.name]
 
+    @staticmethod
+    def _sanitize_code(code: str) -> str:
+        """Fix common LLM code-generation artefacts before exec.
+
+        1. Strip escaped quotes (``\\\"``) that break docstrings.
+        2. Remove builtin-generic names from single-line ``from typing import``
+           statements.  Lowercase names (dict, list, tuple, set, frozenset, type)
+           raise ImportError; Dict/List/… are removed too (NOTE(review): confirm safe).
+        """
+        import re as _re
+
+        # 1. Un-escape literal backslash-quote sequences
+        if '\\"' in code or "\\'" in code:
+            code = code.replace('\\"', '"').replace("\\'", "'")
+
+        # 2. Remove typing imports of builtin generics
+        _BUILTIN_GENERICS = {
+            "Dict",
+            "List",
+            "Tuple",
+            "Set",
+            "FrozenSet",
+            "Type",
+            "dict",
+            "list",
+            "tuple",
+            "set",
+            "frozenset",
+            "type",
+        }
+
+        def _clean_typing_import(m: _re.Match) -> str:
+            names = [n.strip() for n in m.group(1).split(",")]
+            remaining = [n for n in names if n not in _BUILTIN_GENERICS]
+            if not remaining:
+                return ""  # remove the entire import line
+            return f"from typing import {', '.join(remaining)}"
+
+        code = _re.sub(
+            r"^from\s+typing\s+import\s+(.+)$",
+            _clean_typing_import,
+            code,
+            flags=_re.MULTILINE,
+        )
+        # Remove any blank lines left behind
+        code = _re.sub(r"\n{3,}", "\n\n", code)
+
+        return code
+
     def __call__(self, *args, **kwargs) -> _RT:
         """
         Call the function implementation with the provided arguments.
diff --git a/src/vowel/spec_validation.py b/src/vowel/spec_validation.py
new file mode 100644
index 0000000..d559dda
--- /dev/null
+++ b/src/vowel/spec_validation.py
@@ -0,0 +1,342 @@
+"""Shared spec validation utilities for eval generation pipelines.
+
+Functions in this module are used by both ``CodeModeGenerator`` and
+``TDDGenerator`` to validate generated YAML specs against real execution
+and to inject measured durations.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import logfire
+import yaml
+
+from vowel.executor import Executor, get_executor
+from vowel.runner import Function
+from vowel.utils import EvalSummary
+
+
+def build_failure_context(summary: EvalSummary) -> str:
+    """Build a concise failure report to inject into a retry prompt.
+
+    Iterates over :class:`EvalSummary` results and formats each failed
+    case/assertion as a single line.  Returns a multi-line string suitable
+    for LLM prompts.
+    """
+    lines: list[str] = []
+    for result in summary.results:
+        if result.report:
+            for case in result.report.cases:
+                failed_assertions = {k: v for k, v in case.assertions.items() if not v.value}
+                if failed_assertions:
+                    parts = []
+                    for k, v in failed_assertions.items():
+                        if v.reason:
+                            parts.append(f"{k}: {v.reason}")
+                        else:
+                            parts.append(f"{k}: FAILED")
+                    lines.append(f"- Case '{case.name}' FAILED [{', '.join(parts)}]")
+        if result.error:
+            lines.append(f"- Error: {result.error}")
+    return "\n".join(lines) if lines else "Unknown failures"
+
+
+def build_call_code(
+    func_name: str, case: dict
+) -> (
+    str | None
+):  # TODO: instead of building call code, consider passing arguments through executor inputs
+    """Build a ``func(args...)`` call string from a YAML case dict.
+
+    Returns ``None`` when no input is present (e.g. raises-only case
+    without input).
+    """
+    if "inputs" in case and case["inputs"] is not None:
+        args = case["inputs"]
+        if isinstance(args, list):
+            arg_strs = ", ".join(repr(a) for a in args)
+            return f"{func_name}({arg_strs})"
+        if isinstance(args, dict):
+            kwarg_strs = ", ".join(f"{k}={v!r}" for k, v in args.items())
+            return f"{func_name}({kwarg_strs})"
+    elif "input" in case and case["input"] is not None:
+        return f"{func_name}({case['input']!r})"
+    return None
+
+
+def inject_durations(
+    yaml_spec: str,
+    func: Function,
+    executor: Executor,
+    *,
+    buffer_pct: float = 0.5,
+    floor_ms: float = 10.0,
+) -> str:
+    """Add per-case ``duration`` fields based on actual execution times.
+
+    Each non-raises case is executed once via the executor session.
+    The measured ``duration_ms`` is inflated by *buffer_pct* (default 50%)
+    with a minimum of *floor_ms* (default 10 ms) to absorb noise.
+
+    Parameters
+    ----------
+    yaml_spec:
+        YAML string to augment.
+    func:
+        Function to execute cases against.
+    executor:
+        Executor backend to use for timing.
+    buffer_pct:
+        Fractional buffer added on top of measured time (0.5 = +50%).
+    floor_ms:
+        Absolute minimum duration in ms — protects sub-ms cases from
+        flaky failures due to measurement noise.
+    """
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict):
+        return yaml_spec
+
+    try:
+        session = executor.create_session(func.code)
+    except Exception:
+        logfire.warn("Could not create session for duration injection")
+        return yaml_spec
+
+    with session:
+        for eval_id, eval_def in spec.items():
+            if not isinstance(eval_def, dict):
+                continue
+            for case_entry in eval_def.get("dataset", []):
+                case = case_entry.get("case", {})
+                if not isinstance(case, dict):
+                    continue
+                # Skip cases that expect exceptions
+                if case.get("raises"):
+                    continue
+
+                call_code = build_call_code(eval_id, case)
+                if call_code is None:
+                    continue
+
+                result = session.feed(call_code)
+                if result.success:
+                    dur = max(
+                        result.duration_ms * (1 + buffer_pct),
+                        floor_ms,
+                    )
+                    case["duration"] = round(dur, 1)
+
+    return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+
+def validate_expected_values(
+    yaml_spec: str,
+    func: Function,
+    executor: Executor | None = None,
+) -> str:
+    """Validate and fix expected values in a YAML spec by executing cases.
+
+    For each case that has ``expected`` and no ``raises``, executes the
+    function call and compares the result.  If the actual output differs
+    from the YAML expected value, the YAML is updated to the real value.
+
+    Also validates ``raises`` cases: if the case expects an exception but
+    the function doesn't raise (or raises a different type), the case is
+    corrected.
+
+    Parameters
+    ----------
+    yaml_spec:
+        YAML spec string to validate.
+    func:
+        Function to execute.
+    executor:
+        Executor backend.  Defaults to ``get_executor("auto")``.
+
+    Returns
+    -------
+    str
+        Fixed YAML spec with corrected expected values.
+    """
+    executor = executor or get_executor("auto")
+
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict):
+        return yaml_spec
+
+    try:
+        session = executor.create_session(func.code)
+    except Exception:
+        logfire.warn("Could not create session for expected value validation")
+        return yaml_spec
+
+    fixes_applied = 0
+
+    with session:
+        for eval_id, eval_def in spec.items():
+            if not isinstance(eval_def, dict):
+                continue
+            for case_entry in eval_def.get("dataset", []):
+                case = case_entry.get("case", {})
+                if not isinstance(case, dict):
+                    continue
+
+                call_code = build_call_code(eval_id, case)
+                if call_code is None:
+                    continue
+
+                result = session.feed(call_code)
+
+                # --- Fix expected values ---
+                if "expected" in case and not case.get("raises"):
+                    if result.success and result.output != case["expected"]:
+                        logfire.info(
+                            "Fixing expected value for case: {expected} → {actual}",
+                            expected=repr(case["expected"]),
+                            actual=repr(result.output),
+                        )
+                        case["expected"] = result.output
+                        fixes_applied += 1
+
+                # --- Fix raises cases ---
+                if case.get("raises"):
+                    expected_exc = case["raises"]
+                    if result.success:
+                        # Function didn't raise — remove raises, set expected
+                        logfire.info(
+                            "Case expected {exc} but function returned {output}, fixing",
+                            exc=expected_exc,
+                            output=repr(result.output),
+                        )
+                        del case["raises"]
+                        if "match" in case:
+                            del case["match"]
+                        case["expected"] = result.output
+                        fixes_applied += 1
+                    elif result.error_type and result.error_type != expected_exc:
+                        # Wrong exception type
+                        logfire.info(
+                            "Case expected {expected} but got {actual}, fixing",
+                            expected=expected_exc,
+                            actual=result.error_type,
+                        )
+                        case["raises"] = result.error_type
+                        fixes_applied += 1
+
+    if fixes_applied > 0:
+        logfire.info("Validated spec: {count} fixes applied", count=fixes_applied)
+        return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+    return yaml_spec
+
+
+def inject_missing_error_cases(
+    yaml_spec: str,
+    func_name: str,
+    error_snippets: list[dict],
+) -> str:
+    """Inject error cases from exploration into the spec if the LLM missed them.
+
+    Each item in *error_snippets* should have keys:
+
+    - ``code``: Python snippet that triggered the error (e.g. ``"flatten(None)"``)
+    - ``error_type``: Exception class name (e.g. ``"TypeError"``)
+    - ``error``: Full error message
+    - ``description``: One-line description
+
+    Uses :mod:`ast` to extract function call arguments from the snippet
+    code.  If parsing fails (multi-line code, complex expressions), the
+    snippet is silently skipped.
+
+    Returns the (possibly modified) YAML spec string.
+    """
+    import ast
+
+    if not error_snippets:
+        return yaml_spec
+
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict) or func_name not in spec:
+        return yaml_spec
+
+    eval_def = spec[func_name]
+    dataset = eval_def.setdefault("dataset", [])
+
+    # Collect existing raises case inputs to avoid duplicates
+    existing_raises_inputs: set[str] = set()
+    for entry in dataset:
+        case = entry.get("case", {})
+        if isinstance(case, dict) and case.get("raises"):
+            # Normalise existing input for comparison
+            inp = case.get("input")
+            inps = case.get("inputs")
+            existing_raises_inputs.add(repr((inp, inps)))
+
+    injected = 0
+
+    for snippet in error_snippets:
+        code = snippet["code"].strip()
+        error_type = snippet["error_type"]
+        description = snippet.get("description", "")
+
+        # Try to extract arguments from a simple function call
+        try:
+            tree = ast.parse(code, mode="eval")
+        except SyntaxError:
+            continue
+
+        if not isinstance(tree.body, ast.Call):
+            continue
+
+        try:
+            args = [ast.literal_eval(a) for a in tree.body.args]
+            kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in tree.body.keywords}
+        except (ValueError, TypeError):
+            # Complex expression that can't be literal-evaluated — skip
+            continue
+
+        # Determine input/inputs format
+        if kwargs:
+            input_repr = repr((None, kwargs))
+            if input_repr in existing_raises_inputs:
+                continue
+            case_dict: dict[str, Any] = {
+                "id": f"error_{error_type.lower()}_{injected}",
+                "inputs": kwargs,
+                "raises": error_type,
+            }
+        elif len(args) == 1:
+            input_repr = repr((args[0], None))
+            if input_repr in existing_raises_inputs:
+                continue
+            case_dict = {
+                "id": f"error_{error_type.lower()}_{injected}",
+                "input": args[0],
+                "raises": error_type,
+            }
+        elif len(args) > 1:
+            input_repr = repr((None, args))
+            if input_repr in existing_raises_inputs:
+                continue
+            case_dict = {
+                "id": f"error_{error_type.lower()}_{injected}",
+                "inputs": args,
+                "raises": error_type,
+            }
+        else:
+            continue
+
+        dataset.append({"case": case_dict})
+        injected += 1
+        logfire.info(
+            "Injected error case: {desc} → raises {exc}",
+            desc=description,
+            exc=error_type,
+        )
+
+    if injected > 0:
+        logfire.info("Injected {count} missing error cases into spec", count=injected)
+        return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+    return yaml_spec
diff --git a/src/vowel/tdd.py b/src/vowel/tdd.py
index e123a2c..2f637f1 100644
--- a/src/vowel/tdd.py
+++ b/src/vowel/tdd.py
@@ -20,6 +20,7 @@
     print(result.func.code)
 """
 
+import inspect
 import os
 import re
 import time
@@ -31,11 +32,17 @@
 import yaml
 from pydantic import BaseModel, Field
 from pydantic_ai import Agent
+from pydantic_ai.models import Model
 
 from vowel.context import EVAL_SPEC_CONTEXT
 from vowel.eval_types import EvalsSource
+from vowel.executor import Executor, get_executor
 from vowel.monitoring import enable_monitoring
 from vowel.runner import Function, RunEvals
+from vowel.spec_validation import (
+    build_failure_context,
+    validate_expected_values,
+)
 from vowel.utils import EvalSummary
 from vowel.validation import validate_and_fix_spec
 
@@ -218,9 +225,11 @@ class TDDGenerator:
 
     def __init__(
         self,
-        model: str | None = None,
+        model: str | Model | None = None,
         additional_context: str | list[str] | None = None,
         load_env: bool = False,
+        executor: Executor | None = None,
+        **opts,
     ):
         if load_env:
             import dotenv
@@ -245,6 +254,11 @@ def __init__(
         self._impl_agent: Any = None
         self._signature_agent: Any = None
 
+        # Optional executor for expected-value validation
+        self._executor = executor
+
+        self._opts = opts
+
         logfire.info("TDDGenerator initialized", model=self.model)
 
     @property
@@ -264,6 +278,7 @@ def signature_agent(self) -> Agent[None, FunctionSignature]:
 - Specify return type accurately
 - Write a clear, complete description
 """,
+                **self._opts,
             )
         return cast(Agent[None, FunctionSignature], self._signature_agent)
 
@@ -463,6 +478,7 @@ def eval_agent(self) -> Agent[None, EvalsSource]:
 
 For complex validations, use case-specific assertions instead.
 """,
+                **self._opts,
             )
         return cast(Agent[None, EvalsSource], self._eval_agent)
 
@@ -631,6 +647,7 @@ def my_func(data: list, target: int) -> int:
 - [ ] For path parsing: handle both `key` and `[index]` formats
 - [ ] For nested access: check type at EACH level before accessing
 """,
+                **self._opts,
             )
         return cast(Agent[None, Function], self._impl_agent)
 
@@ -737,34 +754,76 @@ def generate_evals_from_signature(
 IMPORTANT: In assertions, use `input[0]`, `input[1]` to access positional args.
 {extra_context}
 {f"<UserContext>{additional_context}</UserContext>" if additional_context else ""}"""
-                result = self.eval_agent.run_sync(prompt)
-                yaml_spec = result.output.yaml_spec  # type: ignore[attr-defined]
-
-                # Sanitize: strip YAML tags that safe_load rejects
-                yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec)
-                yaml_spec = re.sub(r"!!binary\b", "", yaml_spec)
-
-                # Validate YAML syntax
-                yaml.safe_load(yaml_spec)
-
-                # Static validation: fix common LLM generation mistakes
-                validation = validate_and_fix_spec(yaml_spec)
-                if validation.has_warnings:
-                    logfire.info("Spec validation results", summary=validation.summary())
-                if validation.was_modified:
-                    yaml_spec = validation.fixed_yaml
-
-                runner = RunEvals.from_source(yaml_spec)
-                logfire.info(
-                    "Evals generated", cases=len(yaml_spec.split("- case:")), attempt=attempt + 1
-                )
+                try:
+                    result = self.eval_agent.run_sync(prompt)
+                    yaml_spec = result.output.yaml_spec  # type: ignore[attr-defined]
+
+                    # Sanitize: strip YAML tags that safe_load rejects
+                    yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec)
+                    yaml_spec = re.sub(r"!!binary\b", "", yaml_spec)
+
+                    # Validate YAML syntax
+                    yaml.safe_load(yaml_spec)
+
+                    # Static validation: fix common LLM generation mistakes
+                    validation = validate_and_fix_spec(yaml_spec)
+                    if validation.has_warnings:
+                        logfire.info("Spec validation results", summary=validation.summary())
+                    if validation.was_modified:
+                        yaml_spec = validation.fixed_yaml
+
+                    # Executor-based validation: fix expected values by executing
+                    # each case through the sandbox and correcting mismatches.
+                    if func is not None:
+                        # Resolve source code for validation
+                        if isinstance(func, Function):
+                            real_code = func.code
+                        elif callable(func):
+                            try:
+                                real_code = inspect.getsource(func)
+                            except OSError:
+                                real_code = None
+                        else:
+                            real_code = None
+
+                        if real_code is not None:
+                            val_func = Function(
+                                name=signature.name,
+                                code=real_code,
+                                description=signature.description,
+                            )
+                            executor = getattr(self, "_executor", None) or get_executor("auto")
+                            yaml_spec = validate_expected_values(
+                                yaml_spec,
+                                val_func,
+                                executor,
+                            )
+
+                    runner = RunEvals.from_source(yaml_spec)
+                    logfire.info(
+                        "Evals generated",
+                        cases=len(yaml_spec.split("- case:")),
+                        attempt=attempt + 1,
+                    )
+
+                except Exception as gen_exc:
+                    logfire.warn(
+                        "Eval spec generation failed on attempt {attempt}, retrying",
+                        attempt=attempt + 1,
+                        error=str(gen_exc),
+                    )
+                    last_failure_context = f"Generation error: {gen_exc}"
+                    if attempt < max_retries:
+                        time.sleep(retry_delay)
+                    continue
 
                 # If no func provided, return without validation
                 if func is None:
                     return runner, yaml_spec
 
                 # Run spec against the provided function
-                test_runner = runner.with_functions({signature.name: func})
+                func_callable = func.impl if isinstance(func, Function) else func
+                test_runner = runner.with_functions({signature.name: func_callable})
                 if ignore_duration:
                     test_runner = test_runner.ignore_duration()
                 summary = test_runner.run()
@@ -778,7 +837,7 @@ def generate_evals_from_signature(
                     return runner, yaml_spec
 
                 # Build failure context for next attempt
-                last_failure_context = self._build_eval_failure_context(summary)
+                last_failure_context = build_failure_context(summary)
                 logfire.warn(
                     "Eval spec below coverage, retrying",
                     coverage=f"{summary.coverage * 100:.0f}%",
@@ -789,31 +848,24 @@ def generate_evals_from_signature(
                 if attempt < max_retries:
                     time.sleep(retry_delay)
 
-        # Exhausted retries — return last generated spec
-        # (summary/runner/yaml_spec are always set when func is not None and loop ran at least once)
-        assert summary is not None and runner is not None  # noqa: S101
-        logfire.warn(
-            "Eval generation exhausted retries",
-            final_coverage=f"{summary.coverage * 100:.0f}%",
-            target=f"{min_coverage * 100:.0f}%",
-        )
-        return runner, yaml_spec
+        # Exhausted retries — return last generated spec if we have one
+        if runner is not None and summary is not None:
+            logfire.warn(
+                "Eval generation exhausted retries",
+                final_coverage=f"{summary.coverage * 100:.0f}%",
+                target=f"{min_coverage * 100:.0f}%",
+            )
+            return runner, yaml_spec
 
-    def _build_eval_failure_context(self, summary: EvalSummary) -> str:
-        """Build a concise failure report to inject into the retry prompt."""
-        lines: list[str] = []
-        for result in summary.results:
-            if result.report:
-                for case in result.report.cases:
-                    failed_assertions = {k: v for k, v in case.assertions.items() if not v.value}
-                    if failed_assertions:
-                        reasons = ", ".join(
-                            f"{k}: {v.reason}" for k, v in failed_assertions.items() if v.reason
-                        )
-                        lines.append(f"- Case '{case.name}' FAILED [{reasons}]")
-            if result.error:
-                lines.append(f"- Error: {result.error}")
-        return "\n".join(lines) if lines else "Unknown failures"
+        # All attempts failed with generation errors — return whatever we have
+        if yaml_spec:
+            runner = RunEvals.from_source(yaml_spec)
+            return runner, yaml_spec
+
+        raise RuntimeError(
+            f"Failed to generate valid eval spec for '{signature.name}' "
+            f"after {max_retries + 1} attempts"
+        )
 
     def generate_implementation(
         self,
@@ -875,6 +927,15 @@ def generate_all(
     ) -> TDDResult:
         """Run complete TDD flow: Signature -> Evals -> Implementation.
 
+        1. Generate function signature from description
+        2. Generate eval spec from signature (tests first)
+        3. Generate implementation that passes the evals (code last)
+        4. Run evals & retry implementation on failure
+
+        When evals are re-validated against an implementation, expected values
+        are checked against actual execution and auto-corrected before the coverage
+        check, using the init-time ``executor`` or ``get_executor("auto")``.
+
         Args:
             description: What the function should do
             name: Function name
@@ -898,7 +959,7 @@ def generate_all(
 
         for flow_attempt in range(max_flow_retries + 1):
             with logfire.span("TDD generation flow", name=name, flow_attempt=flow_attempt + 1):
-                # Step 2: Generate evals
+                # Step 2: Generate evals from signature
                 logfire.info("Step 2: Generating evals", flow_attempt=flow_attempt + 1)
                 runner, yaml_spec = self.generate_evals_from_signature(
                     signature,
@@ -912,16 +973,27 @@ def generate_all(
 
                 summary: EvalSummary | None = None
                 for impl_attempt in range(max_impl_retries + 1):
-                    func = self.generate_implementation(
-                        signature, yaml_spec, additional_context, description
-                    )
+                    try:
+                        func = self.generate_implementation(
+                            signature, yaml_spec, additional_context, description
+                        )
+                    except RuntimeError as exc:
+                        logfire.warn(
+                            "Implementation failed to compile, retrying",
+                            impl_attempt=impl_attempt + 1,
+                            error=str(exc),
+                        )
+                        if impl_attempt < max_impl_retries:
+                            time.sleep(retry_delay)
+                            continue
+                        raise
 
                     # If max_eval_retries > 0, re-validate evals against this impl
                     if max_eval_retries > 0 and impl_attempt == 0:
                         runner, yaml_spec = self.generate_evals_from_signature(
                             signature,
                             min_cases,
-                            func=func.impl,
+                            func=func,
                             max_retries=max_eval_retries,
                             min_coverage=min_coverage,
                             retry_delay=retry_delay,
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index 1b8be82..c6c4f67 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -26,7 +26,7 @@
 import importlib
 import importlib.util
 import inspect
-import logging
+import logfire
 import os
 import sys
 import types
@@ -56,8 +56,6 @@
     create_llm_judge,
 )
 
-logger = logging.getLogger(__name__)
-
 _SYS_PATH_MODIFIED = False
 
 
@@ -825,7 +823,11 @@ def import_function(module_path: str) -> Any:
         try:
             module = importlib.import_module(module_name)
         except ImportError as e:
-            logger.debug(f"Standard import failed for '{module_name}': {e}")
+            logfire.debug(
+                "Standard import failed for '{module_name}': {error}",
+                module_name=module_name,
+                error=str(e),
+            )
             relative_path = module_name.replace(".", os.sep) + ".py"
             file_path = os.path.join(os.getcwd(), relative_path)
 
@@ -835,9 +837,15 @@ def import_function(module_path: str) -> Any:
                     if spec and spec.loader:
                         module = importlib.util.module_from_spec(spec)
                         spec.loader.exec_module(module)
-                        logger.debug(f"File-based import succeeded for '{file_path}'")
+                        logfire.debug(
+                            "File-based import succeeded for '{file_path}'", file_path=file_path
+                        )
                 except Exception as e:
-                    logger.debug(f"File-based import failed for '{file_path}': {e}")
+                    logfire.debug(
+                        "File-based import failed for '{file_path}': {error}",
+                        file_path=file_path,
+                        error=str(e),
+                    )
 
         if module:
             try:
@@ -846,7 +854,7 @@ def import_function(module_path: str) -> Any:
                     obj = getattr(obj, part)
                 return obj
             except AttributeError as e:
-                logger.debug(f"Attribute lookup failed: {e}")
+                logfire.debug("Attribute lookup failed: {error}", error=str(e))
                 continue
 
     try:
@@ -1127,7 +1135,7 @@ def to_dataset(
             input_value = {"input": match_case.input}
 
         if any(case for case in dataset_cases if input_value == case.inputs):
-            logger.warning("Already exists in dataset, skipping duplicate case.")
+            logfire.warn("Already exists in dataset, skipping duplicate case.")
             continue
 
         dataset_cases.append(

From 83b84c2686ef17a66731ee974607402ed176c363 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Sun, 15 Mar 2026 22:30:29 +0300
Subject: [PATCH 2/8] demo:save

---
 .gitignore                          |  12 +
 AGENTS.md                           |   9 +
 CLAUDE.md                           |   9 +
 docs/FEEDBACK_GUIDED_EXPLORATION.md | 250 +++++++
 docs/MONTY_RESEARCH.md              | 984 ++++++++++++++++++++++++++++
 pyproject.toml                      |  13 +-
 pyrightconfig.json                  |   1 +
 pytest.ini                          |   5 -
 src/vowel/__init__.py               |   2 +
 src/vowel/cli.py                    |  27 +-
 src/vowel/codemode.py               | 331 ++++++++--
 src/vowel/eval_types.py             |  74 ++-
 src/vowel/evals.py                  |  85 ++-
 src/vowel/executor.py               | 194 +++++-
 src/vowel/runner.py                 |  42 +-
 src/vowel/spec_validation.py        |  43 +-
 src/vowel/tdd.py                    |   9 +-
 src/vowel/utils.py                  | 183 +++---
 src/vowel/validation.py             |   2 +-
 tests/test_cli.py                   |  35 +
 tests/test_evaluators.py            |  15 +
 tests/test_executor.py              | 457 +++++++++++++
 tests/test_fixtures.py              | 105 +++
 tests/test_import_function.py       |  19 +
 tests/test_run_evals.py             |  23 +
 tests/test_session.py               | 232 +++++++
 tests/test_tdd_eval_retries.py      |  18 +-
 27 files changed, 2945 insertions(+), 234 deletions(-)
 create mode 100644 docs/FEEDBACK_GUIDED_EXPLORATION.md
 create mode 100644 docs/MONTY_RESEARCH.md
 delete mode 100644 pytest.ini
 create mode 100644 tests/test_cli.py
 create mode 100644 tests/test_executor.py
 create mode 100644 tests/test_session.py

diff --git a/.gitignore b/.gitignore
index eb9b4d5..59c3bc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,15 @@ evaluations/
 # !!
 TODO
 docs/FIXTURE_GENERATION_RFC.md
+
+# CodeMode
+monty.py
+monty/
+
+# Benchmarks
+benchmark*
+parse_cron_evals.yml
+PLAN.md
+codegen.py
+bundle_*.py
+*test.py
diff --git a/AGENTS.md b/AGENTS.md
index 9de1c04..5b79f13 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -30,5 +30,14 @@ This document contains concise rules for how agents should inspect and use this
 - If you have questions or uncertainty, consult `README.md` and the relevant docs pages.
 - Check `TODO` for pending tasks or known issues.
 
+## Critical Thinking & Intellectual Honesty
+
+- **Never defer to the user's idea just because they said it.** Evaluate every proposal — yours or the user's — on its own merits: trade-offs, costs, complexity, correctness.
+- **If the user's idea has flaws, say so.** Explain why with concrete reasoning (performance, token cost, latency, maintainability, correctness risk). Do not soften criticism to be agreeable.
+- **If your own idea has flaws, admit it first.** Don't wait for the user to find the holes. Present disadvantages upfront.
+- **When comparing approaches, use structured analysis:** list pros/cons for each, identify the real trade-offs, and state which you'd pick and why — before asking for input.
+- **"You're right" must be earned.** If you catch yourself agreeing immediately, stop and ask: "Did I actually evaluate this, or am I just being agreeable?" If the latter, go back and do the analysis.
+- **The user is a collaborator, not an authority.** Good ideas win regardless of who proposed them. Bad ideas lose regardless of who proposed them.
+
 These rules help agents use the project consistently and safely.
 
diff --git a/CLAUDE.md b/CLAUDE.md
index 28360e4..78ebc3b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -30,5 +30,14 @@ Claude-type agents working with this repository should follow these steps:
 - If you have questions or uncertainty, consult `README.md` and the relevant docs pages.
 - Check `TODO` for pending tasks or known issues.
 
+## Critical Thinking & Intellectual Honesty
+
+- **Never defer to the user's idea just because they said it.** Evaluate every proposal — yours or the user's — on its own merits: trade-offs, costs, complexity, correctness.
+- **If the user's idea has flaws, say so.** Explain why with concrete reasoning (performance, token cost, latency, maintainability, correctness risk). Do not soften criticism to be agreeable.
+- **If your own idea has flaws, admit it first.** Don't wait for the user to find the holes. Present disadvantages upfront.
+- **When comparing approaches, use structured analysis:** list pros/cons for each, identify the real trade-offs, and state which you'd pick and why — before asking for input.
+- **"You're right" must be earned.** If you catch yourself agreeing immediately, stop and ask: "Did I actually evaluate this, or am I just being agreeable?" If the latter, go back and do the analysis.
+- **The user is a collaborator, not an authority.** Good ideas win regardless of who proposed them. Bad ideas lose regardless of who proposed them.
+
 These guidelines are intended to help Claude agents use the repository consistently.
 
diff --git a/docs/FEEDBACK_GUIDED_EXPLORATION.md b/docs/FEEDBACK_GUIDED_EXPLORATION.md
new file mode 100644
index 0000000..7aff3af
--- /dev/null
+++ b/docs/FEEDBACK_GUIDED_EXPLORATION.md
@@ -0,0 +1,250 @@
+# Feedback-Guided Exploration
+
+## The Problem: Single-Shot Exploration is Blind
+
+Prior to this change, the CodeMode pipeline ran exploration in a single LLM call:
+
+```
+Function source code → LLM (one call) → N snippets → Execute all → Done
+```
+
+The LLM never saw execution results during exploration. It generated all snippets based purely on **static reasoning** — reading the source code and inferring what inputs would be interesting. This is "speculation-based exploration."
+
+This works surprisingly well with strong models. In our benchmark, Claude Opus 4.6 produced 44 snippets for `parse_cron` in a single call and achieved 100% coverage with zero refinements. But the approach has structural limitations that no amount of model intelligence can overcome:
+
+### What single-shot exploration misses
+
+**1. Exact error messages**
+
+The LLM reads a `raise ValueError(...)` statement and guesses the error message. But the actual message depends on runtime state — string interpolation, variable values, branch ordering. Example:
+
+```text
+# LLM expects:
+parse_cron('-1 0 1 1 0')  →  ValueError("minute: -1 out of range 0-59")
+
+# Reality:
+parse_cron('-1 0 1 1 0')  →  ValueError("invalid literal for int() with base 10: ''")
+```
+
+The minus sign is consumed by the range parser (`-` is the range delimiter), leaving an empty string that fails `int()` conversion. This is a parsing precedence issue that can only be discovered by execution.
+
+**2. Input combination explosions**
+
+For grammar-heavy functions (parsers, validators, DSLs), each syntax element works in isolation, but **combinations** of elements may trigger different code paths. Example from cron parsing:
+
+- `*/15` works (step with wildcard)
+- `1-10` works (range)
+- `1,5,10` works (comma-separated)
+- `1,5-7,*/20` — comma + range + step in one field — was never tried
+
+The LLM tests each primitive but rarely discovers multi-primitive combinations without seeing prior execution results.
+
+**3. Error path ordering**
+
+When a function has multiple validation layers, the order matters:
+
+```python
+# Does step validation happen before or after range validation?
+parse_cron('0-60/0 * * * *')
+# Could be: "Step must be positive" or "invalid range 0-60"
+```
+
+Only execution reveals which guard fires first.
+
+## The Solution: Two-Round Evidence-Based Exploration
+
+The new pipeline adds a second exploration round that receives **actual execution results** from Round 1:
+
+```
+Round 1: LLM (static reasoning) → 15-30 snippets → Execute
+              ↓
+         Deterministic cluster summary
+              ↓
+Round 2: LLM (evidence-based)   →  8-12 snippets → Execute
+              ↓
+         Combined results → Spec Generation
+```
+
+Round 2 sees:
+- Every snippet that was tried and its exact output
+- A programmatic cluster summary grouping results by behavior class
+- An explicit "do not repeat" list
+
+This transforms exploration from **speculation** into **hypothesis refinement under feedback** — the LLM reasons about what it _hasn't_ seen, informed by what it _has_ seen.
+
+## Design Decisions
+
+### Why programmatic clustering (not LLM-based)?
+
+We considered two approaches for building the cluster summary between rounds:
+
+| | Programmatic (chosen) | LLM-based |
+|---|---|---|
+| Cost | Zero — no LLM call | 1 additional call |
+| Determinism | Always produces same output for same input | Non-deterministic |
+| Speed | Microseconds | Seconds |
+| Depth | Surface-level (type + message prefix) | Semantic understanding |
+
+We chose programmatic clustering because the goal is not "perfect semantic grouping" — it's "sufficient signal to guide Round 2." The Round 2 LLM is intelligent enough to infer gaps from a simple type+message summary. Adding a clustering LLM call would introduce cost and non-determinism without proportional benefit.
+
+### Why exactly 2 rounds (not 3+)?
+
+Three considerations:
+
+1. **Diminishing returns**: Round 1 covers ~80-90% of behavior space through static reasoning. Round 2 targets the remaining gaps. A Round 3 would operate on an already-dense behavior map with very few remaining gaps — the ROI drops sharply.
+
+2. **Reasoning fragmentation**: Strong models like Opus do their best reasoning in large, focused context windows. Splitting reasoning across many small rounds can actually degrade quality. Two rounds is the sweet spot: one large reasoning pass, one targeted refinement.
+
+3. **Cost predictability**: Fixed 2-round means exactly 2 exploration LLM calls. This is predictable and benchmarkable. Variable rounds (3-5) make cost unpredictable and harder to compare across models.
+
+The `exploration_rounds` parameter allows override (`=1` restores old behavior, `=3` for complex domains if needed), but the default of 2 is intentional.
+
+### Why early exit conditions?
+
+Two conditions can end exploration early instead of continuing to further rounds:
+
+1. **No snippets produced**: If the Round 2 LLM returns an empty plan, it believes Round 1 was already comprehensive. Forcing it to produce snippets would yield duplicates.
+
+2. **No new behavior classes discovered**: After executing Round 2 snippets, we compare behavior keys (`ok:dict`, `err:ValueError:minute: 60 out of range`) between prior and new results. If every new snippet produced a behavior we already had, the exploration space is saturated.
+
+## Implementation Details
+
+### Cluster Summary Format
+
+The `_build_cluster_summary()` method produces a structured text summary:
+
+```markdown
+## Observed Behaviour Clusters
+
+### Success clusters
+- output type `dict`: 18 cases
+- output type `bool`: 3 cases
+- output type `list`: 1 case
+
+### Error clusters
+- `ValueError` (8 distinct messages):
+  - "Expected 5 fields, got 3"
+  - "minute: 60 out of range 0-59"
+  - "Step must be positive, got -1"
+  - ...
+- `AttributeError` (2 distinct messages):
+  - "'NoneType' object has no attribute 'strip'"
+  - "'int' object has no attribute 'strip'"
+
+### Already tried (25 snippets — do NOT repeat these)
+- `parse_cron('* * * * *')`
+- `parse_cron('5 14 1 6 3')`
+- ...
+```
+
+This is deterministic, costs zero LLM tokens to produce, and provides exactly the signal Round 2 needs:
+- What **output shapes** have been seen (so the LLM can target new ones)
+- What **error types and messages** were discovered (so the LLM can find adjacent error paths)
+- What **exact code** was already tried (so the LLM won't duplicate)
+
+### Round 2 Prompt Structure
+
+The Round 2 prompt includes:
+
+```
+<FunctionCode>   — same source code as Round 1
+<Round1Results>  — full execution results (code + output/error for each snippet)
+<ClusterSummary> — the programmatic summary above
+
+RULES:
+- Do NOT repeat any snippet from the "Already tried" list.
+- Produce 8–12 NEW normal snippets targeting uncovered behaviour.
+- Produce 3–5 NEW error snippets targeting untried error paths.
+```
+
+The snippet count targets (8-12 normal, 3-5 error) are intentionally smaller than Round 1 (15+ normal, 3+ error). Round 2 is surgical, not broad.
+
+### Behavior Key Format
+
+For early exit detection, each result is hashed into a behavior key:
+
+```
+Success: "ok:{output_type}"         → "ok:dict", "ok:bool", "ok:list"
+Error:   "err:{error_type}:{msg40}" → "err:ValueError:minute: 60 out of range 0-59"
+```
+
+The message prefix is truncated to 40 characters — enough to distinguish error paths without being sensitive to minor wording variations.
+
+## Code Changes
+
+All changes are in `src/vowel/codemode.py`. No new files, no new dependencies.
+
+### Modified methods
+
+| Method | Change |
+|---|---|
+| `explore()` | 2-round loop with early exit; delegates execution to `_execute_plan()` |
+| `_get_exploration_plan()` | Unchanged logic, updated docstring and logfire tags |
+
+### New methods
+
+| Method | Purpose |
+|---|---|
+| `_execute_plan()` | Extracted snippet execution loop (reused by both rounds) |
+| `_get_targeted_exploration_plan()` | Round 2 prompt with prior results + cluster summary |
+| `_build_cluster_summary()` | Programmatic clustering of results into text summary |
+| `_count_new_behaviors()` | Compares behavior keys between prior and new results |
+
+### Backward compatibility
+
+- `explore(func, exploration_rounds=1)` restores exact single-shot behavior
+- Default is `exploration_rounds=2` — existing callers get the improvement automatically
+- `generate()` calls `explore()` without arguments, so it automatically benefits
+- All 478 existing unit tests pass without modification
+
+## Expected Impact
+
+### On strong models (Opus-class)
+
+- Round 1 already produces excellent coverage
+- Round 2 adds **combination discovery** and **exact error message alignment**
+- Net: ~10-15% more snippets, potentially fewer spec refinement rounds (error messages will match exactly)
+
+### On weaker models (Flash/Lite-class)
+
+- Round 1 produces decent but shallow coverage — misses edge cases
+- Round 2 **compensates for weaker static reasoning** by showing actual execution results
+- Net: significant quality improvement, likely converting some FAIL scenarios to PASS
+
+### On benchmark discriminability
+
+With Layer 3 (behavioral discovery) added, benchmarks now measure a higher-order capability: **adaptive reasoning under feedback**. This separates models that can merely read code from models that can learn from execution traces — a much more meaningful distinction for agentic coding systems.
+
+## Relationship to the Full Pipeline
+
+The evidence flow through the pipeline is now:
+
+```
+Round 1 (speculation)     → snippets → execute → results
+                                                    ↓
+Round 2 (evidence-based)  → snippets → execute → results
+                                                    ↓
+                                            all exploration results
+                                                    ↓
+Spec Generation ← VerifiedExecutionResults + ErrorResults
+                                                    ↓
+                                              YAML eval spec
+                                                    ↓
+Validation → RunEvals → coverage check
+                                                    ↓
+Refinement (if needed) ← failure context
+```
+
+Evidence-based reasoning now starts at the **exploration phase** rather than only at spec generation. Since exploration results feed directly into spec generation, any improvement in exploration quality cascades through the entire downstream pipeline.
+
+## Origin
+
+This feature was designed through a three-way analysis among the developer, the implementation agent (GitHub Copilot / Claude Opus 4.6), and ChatGPT. ChatGPT identified the core insight: the pipeline was doing "speculation-based exploration" when it could be doing "evidence-based exploration." The implementation agent confirmed this against the actual codebase, proposed the programmatic clustering approach (Option A) over LLM-based clustering, and implemented the 2-round design.
+
+The key framing that guided the design:
+
+```
+Layer 1: Domain awareness    (from function description)     ✅ already strong
+Layer 2: Grammar inference   (from source code)              ✅ already strong
+Layer 3: Behavioral discovery (from runtime feedback)        ✅ now added
+```
diff --git a/docs/MONTY_RESEARCH.md b/docs/MONTY_RESEARCH.md
new file mode 100644
index 0000000..3742755
--- /dev/null
+++ b/docs/MONTY_RESEARCH.md
@@ -0,0 +1,984 @@
+# Monty Research Notes
+
+> Bu doküman, `pydantic-monty` projesinin evalspec ekosistemi (vowel eval generation pipeline'ları ve vowel-optimization) ile entegrasyonu için yapılan araştırmanın özetidir. CodeMode, tüm eval generation pipeline'larında kullanılabilecek genel bir mekanizmadır — optimizasyon bunlardan sadece biridir.
+
+## 1. Genel Bakış
+
+**Monty**, Pydantic ekibi tarafından Rust ile yazılmış, minimal ve güvenli bir Python yorumlayıcısıdır. Temel amacı: **AI tarafından üretilen kodu güvenli bir sandbox ortamında çalıştırmak.**
+
+- **Repo:** `pydantic/monty` (GitHub)
+- **PyPI paketi:** `pydantic-monty`
+- **NPM paketi:** `@pydantic/monty`
+- **Lisans:** MIT
+- **Dil:** Rust (PyO3 ile Python bindings, napi-rs ile JS bindings)
+- **Hedef Python sürümü:** 3.14
+
+### Temel Özellikler
+
+| Özellik | Detay |
+|---------|-------|
+| Güvenlik | Filesystem, network, env vars tamamen bloklu — sadece kontrollü external function callbacks |
+| Başlatma süresi | <0.06ms (~60 mikrosaniye) |
+| Performans | CPython'a benzer çalışma hızı |
+| Boyut | ~4.5MB download |
+| Serileştirme | `dump()`/`load()` ile parsed code ve execution state kaydedilebilir |
+| Kaynak limitleri | Süre, bellek, allocation sayısı, recursion derinliği sınırlandırılabilir |
+| Tip kontrolü | Opsiyonel statik tip analizi (Monty'nin kendi type checker'ı) |
+
+## 2. Güvenlik Modeli
+
+Monty, **untrusted/potentially malicious** kodun çalıştırılması için tasarlanmıştır. Güvenlik garantileri:
+
+- **Filesystem erişimi YOK** — Sadece `OSAccess` ile kontrollü sanal dosya sistemi
+- **Network erişimi YOK** — Socket, HTTP vb. hiçbir ağ işlemi yapılamaz
+- **Ortam değişkenleri YOK** — `os.environ`, `os.getenv` yalnızca host callback ile
+- **Subprocess/shell YOK** — `os.system`, `subprocess` vb. yok
+- **Import sistemi kısıtlı** — Sadece izin verilen modüller (sys, typing, asyncio, os, pathlib)
+- **C FFI yok** — Tamamen Rust ile implement edilmiş, unsafe yok
+
+Tüm dış dünya erişimi **external functions** mekanizması üzerinden olur — host tarafı bu fonksiyonları sağlar, sandbox kodu bunları çağırır, host gerçek işlemi yapar ve sonucu sandbox'a döndürür.
+
+## 3. Python API
+
+### 3.1. Kurulum
+
+```bash
+pip install pydantic-monty
+```
+
+### 3.2. Temel Kullanım
+
+```python
+import pydantic_monty
+
+# Basit ifade çalıştırma
+m = pydantic_monty.Monty('1 + 2 * 3')
+result = m.run()  # -> 7
+
+# Input değişkenleri ile
+m = pydantic_monty.Monty('x + y', inputs=['x', 'y'])
+result = m.run(inputs={"x": 10, "y": 20})  # -> 30
+
+# Aynı parsed code farklı girdilerle tekrar çalıştırılabilir
+result2 = m.run(inputs={"x": 100, "y": 200})  # -> 300
+```
+
+### 3.3. `Monty` Sınıfı — Constructor
+
+```python
+pydantic_monty.Monty(
+    code: str,                                    # Çalıştırılacak Python kodu
+    *,
+    script_name: str = 'main.py',                 # Traceback'lerde görünecek isim
+    inputs: list[str] | None = None,              # Kod içinde kullanılabilecek input değişken isimleri
+    external_functions: list[str] | None = None,   # Kod içinden çağrılabilecek harici fonksiyon isimleri
+    type_check: bool = False,                      # Statik tip kontrolü yapılsın mı
+    type_check_stubs: str | None = None,           # Tip kontrolü için ek stub tanımları
+    dataclass_registry: list[type] | None = None,  # Dataclass tip kayıtları
+)
+```
+
+**Raises:**
+- `MontySyntaxError` — Kod parse edilemezse
+- `MontyTypingError` — `type_check=True` ise ve tip hataları varsa
+
+### 3.4. `Monty.run()` — Senkron Çalıştırma
+
+```python
+m.run(
+    *,
+    inputs: dict[str, Any] | None = None,                          # Input değerleri
+    limits: ResourceLimits | None = None,                           # Kaynak limitleri
+    external_functions: dict[str, Callable[..., Any]] | None = None, # Harici fonksiyon implementasyonları
+    print_callback: Callable[[Literal['stdout'], str], None] | None = None,  # print() çıktısı callback
+    os: Callable[[OsFunction, tuple[Any, ...]], Any] | None = None,          # OS erişimi callback
+) -> Any
+```
+
+**Önemli:** GIL serbest bırakılır — paralel çalıştırma mümkün.
+
+### 3.5. External Functions (Harici Fonksiyonlar)
+
+Bu, Monty'nin en güçlü mekanizmasıdır. Sandbox kodu bir fonksiyon çağırdığında, çalışma durur, host taraftaki gerçek Python fonksiyonu çalışır ve sonuç sandbox'a döndürülür.
+
+```python
+# Sandbox kodunda "fetch" fonksiyonu çağrılabilir
+m = pydantic_monty.Monty(
+    'fetch("https://example.com")',
+    external_functions=['fetch']
+)
+
+# Host tarafında gerçek implementasyon
+def fetch(url: str) -> str:
+    return f'Fetched: {url}'
+
+result = m.run(external_functions={"fetch": fetch})
+# -> "Fetched: https://example.com"
+```
+
+**Kritik nokta:** External fonksiyonlar host ortamında çalışır — yani hedef fonksiyonun stdlib, third-party lib, dosya sistemi vb. kullanması sorun olmaz. Monty sadece orkestrasyonu yapar.
+
+### 3.6. İteratif Çalıştırma (start/resume)
+
+External fonksiyon çağrılarında adım adım kontrol sağlar:
+
+```python
+m = pydantic_monty.Monty(
+    'result = fetch(url)',
+    inputs=['url'],
+    external_functions=['fetch']
+)
+
+# Çalıştırmayı başlat
+progress = m.start(inputs={"url": "https://example.com"})
+
+if isinstance(progress, pydantic_monty.MontySnapshot):
+    # Bir external function çağrısında durdu
+    print(progress.function_name)   # -> "fetch"
+    print(progress.args)            # -> ("https://example.com",)
+    print(progress.kwargs)          # -> {}
+    
+    # Sonucu döndürerek devam et
+    progress = progress.resume(return_value="response data")
+
+if isinstance(progress, pydantic_monty.MontyComplete):
+    print(progress.output)  # -> Son ifadenin değeri
+```
+
+**İlerleme tipleri:**
+- `MontySnapshot` — External function çağrısı bekliyor
+- `MontyFutureSnapshot` — Birden fazla async future bekliyor
+- `MontyComplete` — Çalışma tamamlandı, `.output` ile sonuç alınır
+
+### 3.7. Asenkron Çalıştırma
+
+```python
+async def main():
+    m = pydantic_monty.Monty(
+        'await fetch(url)',
+        inputs=['url'],
+        external_functions=['fetch']
+    )
+    
+    async def real_fetch(url: str) -> str:
+        async with httpx.AsyncClient() as client:
+            r = await client.get(url)
+            return r.text
+    
+    result = await pydantic_monty.run_monty_async(
+        m,
+        inputs={"url": "https://example.com"},
+        external_functions={"fetch": real_fetch},
+    )
+```
+
+### 3.8. REPL Modu
+
+Durum korunarak ardışık kod parçaları çalıştırılabilir:
+
+```python
+repl, output = pydantic_monty.MontyRepl.create('x = 10', inputs=['x'])
+# output = 10 (veya None — son ifadenin değeri)
+
+result1 = repl.feed('x + 5')    # -> 15
+result2 = repl.feed('x * 2')    # -> 20
+# x hâlâ 10, önceki state korunur
+```
+
+### 3.9. Kaynak Limitleri (ResourceLimits)
+
+```python
+limits = pydantic_monty.ResourceLimits(
+    max_duration_secs=5.0,        # Maksimum çalışma süresi (saniye)
+    max_memory=1024 * 1024,        # Maksimum heap bellek (byte) — 1MB
+    max_allocations=10000,         # Maksimum heap allocation sayısı
+    max_recursion_depth=1000,      # Maksimum recursion derinliği (default: 1000)
+    gc_interval=100,               # Her N allocation'da GC çalıştır
+)
+
+m = pydantic_monty.Monty('fib(30)', external_functions=['fib'])
+result = m.run(
+    external_functions={"fib": my_fib},
+    limits=limits,
+)
+```
+
+### 3.10. Serileştirme (dump/load)
+
+Parsed code veya çalışma durumu (snapshot) kaydedilebilir:
+
+```python
+# Parsed code'u kaydet
+m = pydantic_monty.Monty('x + 1', inputs=['x'])
+data = m.dump()  # -> bytes
+
+# Daha sonra geri yükle (parse maliyeti sıfır)
+m2 = pydantic_monty.Monty.load(data)
+result = m2.run(inputs={"x": 41})  # -> 42
+
+# Snapshot'ı da kaydedebilirsin
+progress = m.start(inputs={"x": 10})
+if isinstance(progress, pydantic_monty.MontySnapshot):
+    snapshot_data = progress.dump()  # -> bytes
+    # Farklı process'te bile geri yüklenebilir
+    restored = pydantic_monty.MontySnapshot.load(snapshot_data)
+```
+
+### 3.11. Sanal Dosya Sistemi (OSAccess)
+
+```python
+from pydantic_monty import OSAccess, MemoryFile, CallbackFile
+
+# Bellekte sanal dosyalar oluştur
+fs = OSAccess([
+    MemoryFile('/data/input.csv', content='col1,col2\n1,2\n3,4'),
+    MemoryFile('/data/config.json', content='{"key": "value"}'),
+])
+
+# Sandbox kodunda Path.read_text() vb. kullanılabilir
+m = pydantic_monty.Monty("""
+from pathlib import Path
+data = Path('/data/input.csv').read_text()
+data.split('\\n')
+""")
+
+result = await pydantic_monty.run_monty_async(m, os=fs)
+```
+
+### 3.12. Tip Kontrolü
+
+```python
+# Opsiyonel statik analiz
+m = pydantic_monty.Monty(
+    'x + "hello"',
+    inputs=['x'],
+    type_check=True,
+    type_check_stubs='x: int',  # Input tiplerini belirt
+)
+# MontyTypingError fırlatabilir
+
+# Hata formatları
+try:
+    m.type_check(prefix_code='x: int')
+except pydantic_monty.MontyTypingError as e:
+    print(e.display(format='full', color=True))
+    # format seçenekleri: 'full', 'concise', 'azure', 'json', 'jsonlines',
+    #                      'rdjson', 'pylint', 'gitlab', 'github'
+```
+
+## 4. Hata Tipleri
+
+```
+MontyError (base)
+├── MontySyntaxError    — Parse hataları
+├── MontyRuntimeError   — Çalışma zamanı hataları (ZeroDivisionError, ValueError vb.)
+└── MontyTypingError    — Statik tip analizi hataları
+```
+
+### MontyRuntimeError Detayları
+
+```python
+try:
+    m = pydantic_monty.Monty('1 / 0')
+    m.run()
+except pydantic_monty.MontyRuntimeError as e:
+    # İç exception'a eriş
+    inner = e.exception()  # -> ZeroDivisionError instance
+    
+    # Traceback al
+    frames = e.traceback()  # -> list[Frame]
+    for frame in frames:
+        print(f"  {frame.filename}:{frame.line}:{frame.column} in {frame.function_name}")
+        print(f"    {frame.source_line}")
+    
+    # Formatlanmış çıktı
+    print(e.display(format='traceback'))  # Full traceback
+    print(e.display(format='type-msg'))   # "ZeroDivisionError: division by zero"
+    print(e.display(format='msg'))        # "division by zero"
+```
+
+**ÖNEMLİ:** Monty, Python exception'larını birebir eşleştirir. `ZeroDivisionError`, `ValueError`, `TypeError` vb. host tarafında doğru exception tipleri olarak yakalanabilir.
+
+## 5. Dil Destekleri ve Kısıtlamalar
+
+### 5.1. Desteklenen Python Deyimleri (Statements)
+
+Kaynak: `crates/monty/src/expressions.rs` — `Node` ve `Expr` enum'ları
+
+| Deyim | Notlar |
+|-------|--------|
+| `x = expr` | Basit atama |
+| `x, y = expr` | Tuple unpacking (iç içe dahil: `(a, b), c = ...`) |
+| `first, *rest = expr` | Starred unpacking |
+| `x += expr` (augmented assigns) | `+=`, `-=`, `*=`, `/=`, `//=`, `%=`, `**=`, `&=`, `\|=`, `^=`, `<<=`, `>>=` |
+| `obj[i] = val` | Subscript assignment |
+| `obj.attr = val` | Attribute assignment (dataclass alanları) |
+| `if / elif / else` | Tam destekli |
+| `for target in iter` | `else` bloğu dahil |
+| `while test` | `else` bloğu dahil |
+| `break` | ✅ |
+| `continue` | ✅ |
+| `return` / `return expr` | ✅ |
+| `raise` / `raise Exception(...)` | ✅ |
+| `try / except / else / finally` | Tam hiyerarşi destekli, çoklu `except` |
+| `assert test, msg` | ✅ |
+| `pass` | ✅ |
+| `def func(...)` | `async def` dahil |
+| `global x` | ✅ |
+| `nonlocal x` | ✅ |
+| `import sys` | Sadece whitelist'teki modüller |
+| `from typing import X` | Sadece whitelist'teki modüller |
+| `del` | ❌ Henüz yok |
+| `class MyClass:` | ❌ Henüz yok |
+| `match x:` | ❌ Desteklenmiyor |
+| `with ... as ...:` | ❌ Henüz yok |
+
+### 5.2. Desteklenen İfadeler (Expressions)
+
+| İfade | Notlar |
+|-------|--------|
+| Literaller | `int`, `float`, `str`, `bytes`, `bool`, `None`, `...` |
+| Büyük int'ler | `2**200` gibi i64 aşan değerler (arbitrary precision) |
+| f-string | `f"hello {name!r}"` — format spec dahil |
+| Aritmetik | `+`, `-`, `*`, `/`, `//`, `%`, `**` |
+| Bitwise | `&`, `\|`, `^`, `~`, `<<`, `>>` |
+| Karşılaştırma | `==`, `!=`, `<`, `<=`, `>`, `>=`, `is`, `is not`, `in`, `not in` |
+| Zincirleme karşılaştırma | `a < b < c` — kısa devre değerlendirmeli |
+| Boolean | `and`, `or`, `not` |
+| Unary | `-x`, `+x`, `~x` |
+| Ternary | `x if cond else y` |
+| Walrus | `(x := expr)` |
+| `await expr` | Modül seviyesinde de kullanılabilir (Jupyter tarzı) |
+| List/dict/set literali | `[1,2]`, `{k:v}`, `{1,2}` |
+| List/set/dict comprehension | `[x for x in iter if cond]` |
+| Generator expression | `(x for x in iter)` |
+| Lambda | `lambda x, y: x + y` |
+| Subscript | `obj[i]`, `obj[a:b:c]` |
+| Slice | `obj[::2]` |
+| Attribute erişimi | `obj.attr` (zincirli dahil) |
+| Fonksiyon çağrısı | `f(a, b, *args, key=val, **kwargs)` |
+| Method çağrısı | `obj.method(args)` |
+| `isinstance(obj, Type)` | ✅ |
+
+### 5.3. Desteklenen Yerleşik Tipler (Built-in Types)
+
+```
+bool  int  float  str  bytes
+list  tuple  dict  set  frozenset
+range  slice  iter
+type  property
+```
+
+Ayrıca:
+- `None`, `True`, `False`, `...` (Ellipsis)
+- `LongInt` — arbitrarily large integers
+- `NamedTuple` — `collections.namedtuple` benzeri (built-in desteği var)
+- `Dataclass` — `@dataclass` decorator'ı ile (host'tan registry ile)
+- `pathlib.Path` — `from pathlib import Path` ile
+
+### 5.4. Desteklenen Builtin Fonksiyonlar
+
+Kaynak: `crates/monty/src/builtins/mod.rs` — `BuiltinsFunctions` enum'u
+
+**Mevcut (✅):**
+```
+abs()       all()       any()       bin()       chr()
+divmod()    enumerate() filter()    getattr()   hash()
+hex()       id()        isinstance() len()      map()
+max()       min()       next()      oct()       ord()
+pow()       print()     repr()      reversed()  round()
+sorted()    sum()       type()      zip()
+```
+
+**Henüz yok / yorum satırı (❌):**
+```
+aiter()     anext()     ascii()     breakpoint()
+callable()  compile()   dir()       eval()
+exec()      format()    globals()   hasattr()
+help()      input()     issubclass() iter() [kısmen]
+locals()    open()      setattr()   staticmethod()
+classmethod() super()  vars()      __import__()
+```
+
+**Type constructor olarak kullanılabilenler:**
+```
+bool()  int()  float()  str()  bytes()
+list()  tuple()  dict()  set()  frozenset()
+range()  slice()  iter()  type()  property()
+```
+
+**Exception constructor'ları:**
+```
+Exception        BaseException    SystemExit       KeyboardInterrupt
+ArithmeticError  OverflowError    ZeroDivisionError
+LookupError      IndexError       KeyError
+RuntimeError     NotImplementedError  RecursionError
+AttributeError   FrozenInstanceError
+NameError        UnboundLocalError
+ValueError       UnicodeDecodeError
+ImportError      ModuleNotFoundError
+OSError          FileNotFoundError  FileExistsError
+IsADirectoryError  NotADirectoryError
+AssertionError   MemoryError      StopIteration
+SyntaxError      TimeoutError     TypeError
+```
+
+### 5.5. Desteklenen Stdlib Modülleri
+
+#### `sys`
+```python
+import sys
+sys.version        # "3.14.0 (Monty)"
+sys.version_info   # named tuple: (major=3, minor=14, micro=0, ...)
+sys.platform       # "monty"
+sys.stdout         # marker (gerçek I/O yok)
+sys.stderr         # marker (gerçek I/O yok)
+```
+
+#### `typing`
+```python
+from typing import (
+    TYPE_CHECKING,  # her zaman False
+    Any, Optional, Union, List, Dict, Tuple, Set,
+    FrozenSet, Callable, Type, Sequence, Mapping,
+    Iterable, Iterator, Generator, ClassVar,
+    Final, Literal, TypeVar, Generic, Protocol,
+    Annotated, Self, Never, NoReturn
+)
+```
+Bunlar runtime'da `Marker` değerleri olarak işlenir — tip anotasyonlarda kullanılabilirler.
+
+#### `asyncio`
+```python
+import asyncio
+asyncio.run(coro)        # await coro ile eşdeğer
+asyncio.gather(*coros)   # Eşzamanlı birden fazla coroutine çalıştırma
+# create_task, sleep, wait vb. → YOK
+```
+
+#### `os`
+```python
+import os
+os.getenv("KEY", default=None)  # host callback üzerinden
+os.environ                       # host callback üzerinden dict döner
+# os.path, os.listdir, os.system vb. → YOK
+```
+
+#### `pathlib`
+```python
+from pathlib import Path
+p = Path("/data/file.txt")
+
+# Pure methods (I/O gerektirmez — doğrudan çalışır):
+p.name         # "file.txt"
+p.stem         # "file"
+p.suffix       # ".txt"
+p.suffixes     # [".txt"]
+p.parent       # Path("/data")
+p.parts        # ["/", "data", "file.txt"]
+p / "subdir"   # Path birleştirme (/ operatörü)
+str(p)         # "/data/file.txt"
+
+# Filesystem methods (OSAccess host callback gerektirir):
+p.exists()     read_text()   read_bytes()
+p.is_file()    write_text()  write_bytes()
+p.is_dir()     mkdir()       unlink()
+p.is_symlink() rmdir()       iterdir()
+p.stat()       rename()      resolve()
+p.absolute()
+```
+
+### 5.6. Tip Metodları — Detay
+
+#### `str` metodları
+```
+capitalize  casefold    center      count       encode
+endswith    find        index       isalnum     isalpha
+isascii     isdecimal   isdigit     isidentifier islower
+isnumeric   isspace     istitle     isupper     join
+ljust       lower       lstrip      partition   removeprefix
+removesuffix replace    rfind       rindex      rjust
+rpartition  rsplit      rstrip      split       splitlines
+startswith  strip       swapcase    title       upper      zfill
+```
+Ayrıca: `+` (concat), `*` (repeat), `in` (contains), `[]` (index/slice), `len()`, `str()` constructor
+
+#### `list` metodları
+```
+append  clear  copy  count  extend  index  insert  pop  remove  reverse  sort
+```
+Ayrıca: `+`, `*`, `in`, `[]`, `len()`, comprehension, unpacking
+
+#### `dict` metodları
+```
+clear  copy  fromkeys  get  items  keys  pop  popitem  setdefault  update  values
+```
+Ayrıca: `in`, `[]`, `len()`, comprehension
+
+#### `set` / `frozenset` metodları
+```
+add  clear  copy  difference  discard  intersection  isdisjoint
+issubset  issuperset  pop  remove  symmetric_difference  union  update
+```
+Ayrıca: `|`, `&`, `-`, `^` operatörleri
+
+#### `tuple` metodları
+```
+count  index
+```
+Ayrıca: `+`, `*`, `in`, `[]`, `len()`, unpacking
+
+#### `bytes` metodları
+```
+capitalize  center       count      decode      endswith
+find        fromhex      hex        index       isalnum
+isalpha     isascii      isdigit    islower     isspace
+istitle     isupper      join       ljust       lower
+lstrip      partition    removeprefix removesuffix replace
+rfind       rindex       rjust      rpartition  rsplit
+rstrip      split        splitlines startswith  strip
+swapcase    title        upper      zfill
+```
+
+#### `int` metodları
+```
+bit_length  bit_count  to_bytes  from_bytes
+```
+Ayrıca: tüm aritmetik ve bitwise operatörler
+
+#### `range`
+```
+range(stop)
+range(start, stop)
+range(start, stop, step)
+```
+Iteration, `in`, `len()`, `list(range(...))` desteklenir.
+
+### 5.7. Desteklenmeyen Özellikler
+
+| Özellik | Durum |
+|---------|-------|
+| **`class` tanımı** | ❌ Henüz yok (geliyor) |
+| **`match` / `case`** | ❌ Planlanmamış |
+| **`with` / bağlam yöneticisi** | ❌ Henüz yok |
+| **`del` deyimi** | ❌ Henüz yok |
+| **`yield from`** | ❌ Henüz yok |
+| **`*args` spread in comprehension** | ⚠️ Kısıtlı |
+| **`eval()`, `exec()`** | ❌ Hiçbir zaman olmayacak |
+| **`__import__`** | ❌ Hiçbir zaman olmayacak |
+| **Third-party kütüphaneler** | ❌ Sandbox içinde kullanılamaz |
+| **`json` modülü** | ❌ Henüz yok (geliyor) |
+| **`dataclasses` modülü (import)** | ❌ Henüz yok; dataclass desteği var ama host'tan |
+| **`collections`, `itertools`, `math`** | ❌ Yok |
+| **`re` (regex)** | ❌ Yok |
+| **`datetime`** | ❌ Yok |
+| **`functools`** | ❌ Yok |
+| **`enum`** | ❌ Yok |
+| **Decorator'lar** | ⚠️ Sadece basit fonksiyon decorator'ları |
+| **`super()`** | ❌ Yok |
+| **`classmethod`, `staticmethod`** | ❌ Yok |
+
+## 6. Mimari (Dahili)
+
+- **Parser:** Ruff'un `ruff_python_parser`'ı kullanılır → AST üretilir
+- **Prepare phase:** AST'den Scope analizi yapılır, isimler namespace index'lerine çözümlenir
+- **Bytecode:** Hazırlanan AST doğrudan bytecode VM'e beslenir (CPython benzeri register VM)
+- **Bellek:** Manuel reference counting (`drop_with_heap`, `clone_with_heap`); GC yapılandırılabilir aralıklarla çalışır
+- **Serileştirme:** `serde` ile binary format (parsed code + snapshot)
+
+### Crate yapısı
+
+| Crate | İçerik |
+|-------|--------|
+| `crates/monty/` | Çekirdek interpreter (VM, types, builtins, modules) |
+| `crates/monty-python/` | PyO3 Python bindings |
+| `crates/monty-js/` | napi-rs JavaScript bindings |
+| `crates/monty-cli/` | CLI aracı |
+| `crates/monty-type-checking/` | Statik tip analizi |
+| `crates/monty-typeshed/` | Tip stub dosyaları (vendor + custom) |
+| `crates/fuzz/` | Fuzzing testleri |
+
+### Modül whitelist
+
+`import` ifadesi sadece şu modülleri yükleyebilir (kaynak: `modules/mod.rs`):
+
+```
+sys      typing      asyncio      pathlib      os
+```
+
+Başka herhangi bir `import X` → `ModuleNotFoundError`.
+
+## 7. PydanticAI Entegrasyonu
+
+Monty, PydanticAI'de **CodeMode** özelliğini güçlendirecek şekilde tasarlanmıştır. LLM sıralı tool çağrıları yapmak yerine, tool'ları fonksiyon olarak çağıran Python kodu yazar ve Monty bunu güvenli şekilde çalıştırır.
+
+```python
+from pydantic_ai import Agent
+from pydantic_ai.toolsets.code_mode import CodeModeToolset
+from pydantic_ai.toolsets.function import FunctionToolset
+
+# Araçları tanımla
+tools = FunctionToolset()
+
+@tools.tool
+async def get_weather(location: str) -> dict:
+    ...
+
+# Agent'ı CodeMode ile oluştur
+agent = Agent(
+    'anthropic:claude-sonnet-4-5',
+    toolsets=[CodeModeToolset(tools)],  # Monty-powered code execution
+)
+
+# Agent Python kodu yazarak tool'ları çağırır
+result = await agent.run("Compare weather in London and Paris")
+```
+
+## 8. Alternatiflere Karşı Pozisyon
+
+| Tech | Dil Tamamlığı | Güvenlik | Başlatma | Maliyet |
+|------|---------------|----------|----------|---------|
+| **Monty** | Kısmi | Katı | 0.06ms | Ücretsiz/OSS |
+| Docker | Tam | İyi | 195ms | Ücretsiz/OSS |
+| Pyodide | Tam | Zayıf | 2800ms | Ücretsiz/OSS |
+| starlark-rust | Çok kısıtlı | İyi | 1.7ms | Ücretsiz/OSS |
+| WASI/Wasmer | Neredeyse tam | Katı | 66ms | Ücretsiz* |
+| Sandboxing servisi (E2B, Modal) | Tam | Katı | 1033ms | Ücretli |
+| YOLO Python (exec) | Tam | Yok | 0.1ms | Ücretsiz/OSS |
+
+**Monty'nin avantajları:** En düşük başlatma süresi + katı güvenlik + kolay kurulum + serileştirme desteği.
+
+## 9. Eval Generation İçin Kullanım Senaryosu
+
+### Problem
+
+Eval generation pipeline'larında (hem tek seferlik generation hem de optimization döngüsünde) LLM agent expected değerleri **tahmin ediyor** — bu özellikle algoritmik fonksiyonlarda halüsinasyona yol açar (ör. `binary_search([1,3,5,7], 5)` için yanlış index döndürme).
+
+### Çözüm: CodeMode Eval Generation
+
+CodeMode, **tüm eval generation pipeline'larında** kullanılabilecek genel bir mekanizmadır. Agent expected değerleri tahmin etmek yerine, Monty sandbox'ında **gerçek fonksiyonu çalıştırarak** ground-truth değerleri elde eder.
+
+**Kullanım alanları:**
+- **Tek seferlik eval generation** — `vowel` CLI veya API ile bir fonksiyon için eval dosyası üretirken
+- **Optimization döngüsü** — GEPA ile prompt optimize ederken her iterasyonda (burada özellikle etkili çünkü yüzlerce eval üretiliyor)
+- **CI/CD pipeline'ları** — Otomatik test üretimi akışlarında
+- **Herhangi bir eval generation çağrısı** — CodeMode, pipeline'dan bağımsız bir altyapı katmanıdır
+
+### Temel Mimari
+
+```
+┌─────────────────────────────────────────────────────────┐
+│  LLM Agent                                              │
+│  "Bu fonksiyon için ilginç test girdileri tasarla"      │
+│                                                         │
+│  Agent üretir:                                          │
+│    inputs = [                                           │
+│        {"x": [1,3,5,7,9], "target": 5},                │
+│        {"x": [], "target": 1},                          │
+│        {"x": [1], "target": 1},                         │
+│    ]                                                    │
+└──────────────────────┬──────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────┐
+│  Monty Sandbox                                          │
+│                                                         │
+│  # Agent tarafından üretilen test harness               │
+│  results = []                                           │
+│  results.append(target_func([1,3,5,7,9], 5))           │
+│  results.append(target_func([], 1))                     │
+│  results.append(target_func([1], 1))                    │
+│  results                                                │
+│                                                         │
+│  external_functions = {"target_func": real_function}    │
+│  limits = ResourceLimits(max_duration_secs=5.0)         │
+└──────────────────────┬──────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────┐
+│  Ground-Truth Sonuçlar                                  │
+│                                                         │
+│  results = [2, -1, 0]  ← gerçek fonksiyon çıktıları    │
+│                                                         │
+│  Bu değerler YAML eval dosyasındaki expected alanına    │
+│  yazılır — halüsinasyon riski sıfır.                    │
+└─────────────────────────────────────────────────────────┘
+```
+
+### Neden External Function Mekanizması Kritik?
+
+Hedef fonksiyon (ör. `binary_search`) şunları kullanabilir:
+- Stdlib modülleri (`collections`, `itertools`, `math` vb.)
+- Third-party kütüphaneler (`numpy`, `pandas` vb.)
+- Dosya sistemi, network vb.
+
+Monty sandbox'ı bunların hiçbirini desteklemez. **AMA** external function olarak inject edildiğinde, `target_func(...)` çağrısı host tarafındaki gerçek Python fonksiyonunu çalıştırır — yani tüm bağımlılıklar sorunsuz çalışır.
+
+### ExecutorAdapter Protokolü (Taslak)
+
+```python
+from dataclasses import dataclass
+from typing import Any, Protocol
+
+@dataclass
+class ExecutionResult:
+    """Sandbox çalıştırma sonucu."""
+    output: Any                    # Kodun döndürdüğü değer
+    stdout: str                    # print() çıktısı
+    success: bool                  # Hatasız tamamlandı mı
+    error: str | None = None       # Hata mesajı (varsa)
+    error_type: str | None = None  # Hata tipi (ör. "ValueError")
+    duration_ms: float = 0.0       # Çalışma süresi
+
+class ExecutorAdapter(Protocol):
+    """Kod çalıştırma adaptör protokolü."""
+    async def execute(
+        self,
+        code: str,
+        *,
+        target_function: callable | None = None,
+        inputs: dict[str, Any] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,  # 10MB
+    ) -> ExecutionResult: ...
+
+class MontyExecutor:
+    """Monty tabanlı güvenli kod çalıştırıcı."""
+    
+    def __init__(self):
+        import pydantic_monty
+        self._monty = pydantic_monty
+    
+    async def execute(
+        self,
+        code: str,
+        *,
+        target_function: callable | None = None,
+        inputs: dict[str, Any] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        import time
+        
+        stdout_lines: list[str] = []
+        
+        def print_callback(stream: str, text: str):
+            stdout_lines.append(text)
+        
+        # External function listesi oluştur
+        ext_names = ["target_func"] if target_function else []
+        ext_impls = {"target_func": target_function} if target_function else {}
+        
+        # Input isimleri
+        input_names = list(inputs.keys()) if inputs else []
+        
+        try:
+            m = self._monty.Monty(
+                code,
+                inputs=input_names or None,
+                external_functions=ext_names or None,
+            )
+            
+            limits = self._monty.ResourceLimits(
+                max_duration_secs=timeout,
+                max_memory=max_memory,
+            )
+            
+            start = time.perf_counter()
+            result = m.run(
+                inputs=inputs,
+                limits=limits,
+                external_functions=ext_impls,
+                print_callback=print_callback,
+            )
+            duration = (time.perf_counter() - start) * 1000
+            
+            return ExecutionResult(
+                output=result,
+                stdout="\n".join(stdout_lines),
+                success=True,
+                duration_ms=duration,
+            )
+            
+        except self._monty.MontyRuntimeError as e:
+            inner = e.exception()
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=str(e),
+                error_type=type(inner).__name__,
+                duration_ms=0.0,
+            )
+        except self._monty.MontySyntaxError as e:
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(e),
+                error_type="SyntaxError",
+                duration_ms=0.0,
+            )
+```
+
+### BuiltinExecutor (Geliştirme/Fallback)
+
+```python
+class BuiltinExecutor:
+    """exec() tabanlı çalıştırıcı — sadece güvenilir kodlar için."""
+    
+    async def execute(
+        self,
+        code: str,
+        *,
+        target_function: callable | None = None,
+        inputs: dict[str, Any] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        import io, contextlib, time
+        
+        namespace = dict(inputs or {})
+        if target_function:
+            namespace["target_func"] = target_function
+        
+        stdout = io.StringIO()
+        start = time.perf_counter()
+        
+        try:
+            with contextlib.redirect_stdout(stdout):
+                exec(code, namespace)
+            duration = (time.perf_counter() - start) * 1000
+            
+            # Son ifadenin değerini al (eğer varsa)
+            result = namespace.get("__result__", namespace.get("results"))
+            
+            return ExecutionResult(
+                output=result,
+                stdout=stdout.getvalue(),
+                success=True,
+                duration_ms=duration,
+            )
+        except Exception as e:
+            return ExecutionResult(
+                output=None,
+                stdout=stdout.getvalue(),
+                success=False,
+                error=str(e),
+                error_type=type(e).__name__,
+                duration_ms=0.0,
+            )
+```
+
+## 10. Entegrasyon Tasarım Kararları
+
+### Açık Sorular
+
+1. **Agent kodu nasıl üretecek?**
+   - Seçenek A: Agent sadece input listesi üretir, harness kodu otomatik oluşturulur
+   - Seçenek B: Agent tam test harness kodunu yazar (daha esnek ama hata riski daha yüksek)
+   - Seçenek C: Hibrit — Agent input + beklenen davranış tanımlar, edge case'ler için raises testi de yazabilir
+
+2. **Exception test etme nasıl olacak?**
+   - `raises` assertion'ları için agent'ın exception beklediğini belirtmesi gerekir
+   - Monty'de try/except destekleniyor, agent try/except yazarak exception tipini yakalayabilir
+
+3. **Mevcut pipeline ile entegrasyon noktası neresi?**
+   - `task.py`'daki `generate_and_score()` akışında, agent YAML ürettikten sonra expected değerleri doğrulamak için Monty kullanılabilir
+   - Veya: Agent doğrudan Monty ile çalışan bir "CodeMode" prompt ile yönlendirilir
+
+4. **Performans etkisi?**
+   - Monty başlatma: ~0.06ms
+   - Her test case çalıştırma: fonksiyonun karmaşıklığına bağlı (host'ta çalışır)
+   - 25 fonksiyon × 20 test case = 500 çalıştırma → toplam <1 saniye ek maliyet
+
+5. **Hangi fonksiyonlar CodeMode'a uygun?**
+   - Deterministik fonksiyonlar (aynı input → aynı output): ✅ İdeal
+   - Yan etkili fonksiyonlar (dosya yazma, API çağrısı): ⚠️ Dikkatli olunmalı
+   - Rastgele çıktılı fonksiyonlar: ❌ Uygun değil (expected value sabitlenmeli)
+
+### Kısıtlamalar ve Çözümler
+
+| Kısıtlama | Etki | Çözüm |
+|-----------|------|-------|
+| Class tanımı yok | Agent class kullanamaz | Fonksiyon + dict / NamedTuple kullan |
+| `json` modülü yok | String serialization zor | Host'a external function olarak delege et |
+| `match` statement yok | Pattern matching yok | if/elif zincirleri kullan |
+| `with` statement yok | Context manager yok | İstisnai durum; hedef fonksiyon host'ta çalışır |
+| `math`, `collections`, `itertools` yok | Sandbox içi hesaplama kısıtlı | Tüm asıl hesaplama host fonksiyonunda yapılır |
+| Sadece 5 modül import edilebilir | `sys`, `typing`, `asyncio`, `pathlib`, `os` | Yeterli — sandbox kodu sadece orkestrasyon yapıyor |
+
+**En kritik çözüm:** Sandbox kodunun amacı karmaşık hesaplama yapmak değil — **sadece test girdilerini organize edip hedef fonksiyonu çağırmak**. Asıl hesaplama external function (hedef fonksiyon) içinde, host tarafında yapılır.
+
+## 11. Örnek: Tam Çalışma Akışı
+
+```python
+# 1. Hedef fonksiyon (test edilecek)
+def binary_search(arr: list[int], target: int) -> int:
+    lo, hi = 0, len(arr) - 1
+    while lo <= hi:
+        mid = (lo + hi) // 2
+        if arr[mid] == target:
+            return mid
+        elif arr[mid] < target:
+            lo = mid + 1
+        else:
+            hi = mid - 1
+    return -1
+
+# 2. Agent'ın ürettiği Monty kodu
+agent_code = """
+results = []
+
+# Normal cases
+results.append({"input": {"arr": [1,3,5,7,9], "target": 5}, "expected": target_func([1,3,5,7,9], 5)})
+results.append({"input": {"arr": [1,3,5,7,9], "target": 1}, "expected": target_func([1,3,5,7,9], 1)})
+results.append({"input": {"arr": [1,3,5,7,9], "target": 9}, "expected": target_func([1,3,5,7,9], 9)})
+
+# Not found
+results.append({"input": {"arr": [1,3,5,7,9], "target": 4}, "expected": target_func([1,3,5,7,9], 4)})
+
+# Edge cases
+results.append({"input": {"arr": [], "target": 1}, "expected": target_func([], 1)})
+results.append({"input": {"arr": [1], "target": 1}, "expected": target_func([1], 1)})
+results.append({"input": {"arr": [1], "target": 2}, "expected": target_func([1], 2)})
+
+results
+"""
+
+# 3. Monty'de çalıştır
+import pydantic_monty
+
+m = pydantic_monty.Monty(
+    agent_code,
+    external_functions=["target_func"],
+)
+
+results = m.run(
+    external_functions={"target_func": binary_search},
+    limits=pydantic_monty.ResourceLimits(max_duration_secs=5.0),
+)
+
+# 4. Sonuç: Ground-truth expected değerlerle test case'ler
+# results = [
+#     {"input": {"arr": [1,3,5,7,9], "target": 5}, "expected": 2},
+#     {"input": {"arr": [1,3,5,7,9], "target": 1}, "expected": 0},
+#     {"input": {"arr": [1,3,5,7,9], "target": 9}, "expected": 4},
+#     {"input": {"arr": [1,3,5,7,9], "target": 4}, "expected": -1},
+#     {"input": {"arr": [], "target": 1}, "expected": -1},
+#     {"input": {"arr": [1], "target": 1}, "expected": 0},
+#     {"input": {"arr": [1], "target": 2}, "expected": -1},
+# ]
+```
+
+**Hiçbir expected değer halüsine edilmedi — hepsi gerçek fonksiyon çıktısı.**
+
+## 12. Sonraki Adımlar
+
+1. ~~Monty API'yi tam anla~~ ✅
+2. `ExecutorAdapter` protokolünü finalize et
+3. `MontyExecutor` implementasyonunu yaz
+4. `task.py`'ye CodeMode akışını entegre et
+5. Agent prompt'unu CodeMode için güncelle
+6. 25 referans fonksiyon üzerinde test et
+7. Mevcut "tahmin" modu ile CodeMode'u karşılaştır (A/B)
diff --git a/pyproject.toml b/pyproject.toml
index 3e5db9e..bc13a87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,7 +55,9 @@ logfire = [
 ]
 
 monty = [
-    "pydantic-monty>=0.0.7"
+    "pydantic-monty>=0.0.8"
+    # Require at least 0.0.8 (a lower bound, not an exact pin)
+    # because of the changes to MontyRepl introduced in version 0.0.8.
 ]
 optimize = [
     "vowel-optimization"
@@ -84,7 +86,7 @@ target-version = ["py311"]
 [tool.ruff]
 line-length = 100
 target-version = "py311"
-exclude = ["vowel-optimization"]
+exclude = ["vowel-optimization", "benchmark_v1"]
 
 [tool.ruff.lint]
 select = [
@@ -118,7 +120,7 @@ ignore_missing_imports = true
 python-version = "3.11"
 
 [tool.ty.src]
-exclude = ["vowel-optimization"]
+exclude = ["vowel-optimization", "benchmark_v1"]
 
 [tool.ty.rules]
 unresolved-import = "ignore"
@@ -142,3 +144,8 @@ markers = [
     "integration: integration tests",
     "llm: tests that require LLM API calls",
 ]
+
+[tool.uv.workspace]
+members = [
+    "pydantic-acp",
+]
diff --git a/pyrightconfig.json b/pyrightconfig.json
index e93018b..19456c7 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -11,6 +11,7 @@
   "reportUnknownVariableType": "none",
   "reportUnknownMemberType": "none",
   "reportUnknownParameterType": "none",
+  "reportAttributeAccessIssue": "none",
   "reportAny": "none",
   "reportExplicitAny": "none",
   "reportMissingParameterType": "none",
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 4ecb1ad..0000000
--- a/pytest.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-[pytest]
-testpaths = tests
-python_files = test_*.py
-python_classes = Test*
-python_functions = test_*
diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
index c01915e..f890ed4 100644
--- a/src/vowel/__init__.py
+++ b/src/vowel/__init__.py
@@ -45,6 +45,7 @@
     MontyExecutor,
     MontyReplSession,
     get_executor,
+    resolve_executors,
 )
 from .runner import Function, RunEvals
 from .utils import (
@@ -93,6 +94,7 @@
     "DefaultExecutor",
     "DefaultSession",
     "get_executor",
+    "resolve_executors",
     # CodeMode pipeline
     "CodeModeGenerator",
     "CodeModeResult",
diff --git a/src/vowel/cli.py b/src/vowel/cli.py
index 114f08b..868b574 100644
--- a/src/vowel/cli.py
+++ b/src/vowel/cli.py
@@ -740,7 +740,7 @@ def on_modified(self, event):
     if export_json:
         import json
 
-        json_data = summary.json()
+        json_data = summary.to_json()
         with open(export_json, "w") as f:
             json.dump(json_data, f, indent=2)
         if not quiet:
@@ -749,6 +749,10 @@ def on_modified(self, event):
     # Failed assertions detail
     if summary.failed_results:
         console.print()
+
+        all_failures_are_duration = True
+        has_any_failures = False
+
         for result in summary.failed_results:
             console.print(Panel(result.eval_id, title="Failed Assertions", border_style="yellow"))
 
@@ -758,6 +762,7 @@ def on_modified(self, event):
                 ]
 
                 if failed_assertions:
+                    has_any_failures = True
                     total_assertions = len(case.assertions)
                     failed_count = len(failed_assertions)
 
@@ -767,6 +772,12 @@ def on_modified(self, event):
                     )
 
                     for assertion_name, res in failed_assertions:
+                        if (
+                            "duration" not in assertion_name.lower()
+                            and "maxduration" not in assertion_name.lower()
+                        ):
+                            all_failures_are_duration = False
+
                         console.print(f"\n    [red]x {assertion_name}[/red]")
                         if res.reason:
                             reason_lines = str(res.reason).split("\n")
@@ -774,6 +785,20 @@ def on_modified(self, event):
                                 if line.strip():
                                     console.print(f"       [dim]{line.strip()}[/dim]")
 
+        # Inform user if all errors are just duration errors
+        if has_any_failures and all_failures_are_duration:
+            console.print()
+            console.print(
+                Panel(
+                    "All failing evaluators are related to duration (MaxDuration). "
+                    "You can run the command with `--ignore-duration` to skip performance constraints "
+                    "and get a more accurate evaluation of functional correctness.",
+                    title="Insight",
+                    border_style="cyan",
+                    style="cyan",
+                )
+            )
+
     console.print()
 
     # CI mode
diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py
index e488827..6ef8256 100644
--- a/src/vowel/codemode.py
+++ b/src/vowel/codemode.py
@@ -31,7 +31,7 @@
 
 from vowel.context import EVAL_SPEC_CONTEXT
 from vowel.eval_types import EvalsSource
-from vowel.executor import ExecutionResult, Executor, get_executor
+from vowel.executor import ExecutionResult, Executor, resolve_executors
 from vowel.monitoring import enable_monitoring
 from vowel.runner import Function, RunEvals
 from vowel.spec_validation import (
@@ -192,16 +192,29 @@ class CodeModeGenerator:
 
     def __init__(
         self,
-        model: str | None = None,
-        executor: Executor | None = None,
+        spec_model: str | None = None,
+        exploration_model: str | None = None,
+        default_executor: Executor | None = None,
+        fallback_executor: Executor | None = None,
         additional_context: str = "",
         min_snippets: int = 15,
         **opts,
     ) -> None:
-        self.model = model or os.getenv("MODEL_NAME", "")
-        if not self.model:
-            logfire.warn("No model specified; set MODEL_NAME env var or pass model=")
-        self.executor = executor or get_executor("auto")
+        # Default fallback from kwargs (for backwards compatibility) or environment
+        base_fallback = opts.pop("model", None) or os.getenv("MODEL_NAME", "")
+
+        self.spec_model = spec_model or os.getenv("SPEC_MODEL") or base_fallback
+        self.exploration_model = (
+            exploration_model or os.getenv("EXPLORATION_MODEL") or base_fallback
+        )
+
+        if not self.spec_model or not self.exploration_model:
+            raise ValueError(
+                "Both spec_model and exploration_model must be specified. "
+                "Provide them via constructor/kwargs, or set SPEC_MODEL, EXPLORATION_MODEL, or MODEL_NAME environment variables."
+            )
+
+        self.executor = resolve_executors(default_executor, fallback_executor)
         self.additional_context = additional_context
         self.min_snippets = min_snippets
         self._opts = opts
@@ -212,7 +225,8 @@ def __init__(
 
         logfire.info(
             "CodeModeGenerator initialized",
-            model=self.model,
+            spec_model=self.spec_model,
+            exploration_model=self.exploration_model,
             executor=type(self.executor).__name__,
         )
 
@@ -222,7 +236,7 @@ def __init__(
     def explorer_agent(self) -> Agent[None, ExplorationPlan]:
         if self._explorer_agent is None:
             self._explorer_agent = Agent(
-                self.model,
+                self.exploration_model,
                 output_type=ExplorationPlan,
                 system_prompt=self._explorer_system_prompt(),
                 **self._opts,
@@ -233,7 +247,7 @@ def explorer_agent(self) -> Agent[None, ExplorationPlan]:
     def spec_agent(self) -> Agent[None, EvalsSource]:
         if self._spec_agent is None:
             self._spec_agent = Agent(
-                self.model,
+                self.spec_model,
                 output_type=EvalsSource,
                 system_prompt=self._spec_system_prompt(),
                 **self._opts,
@@ -306,12 +320,21 @@ def _spec_system_prompt(self) -> str:
     async def explore(
         self,
         func: Function,
+        *,
+        exploration_rounds: int = 2,
     ) -> list[SnippetResult]:
         """Phase 1: Generate and execute exploration snippets.
 
-        Uses ``create_session()`` to compile the function source **once**,
-        then feeds each snippet against the preserved runtime state —
-        zero re-parse overhead per snippet.
+        Supports multi-round feedback-guided exploration.  Round 1 uses
+        static reasoning (speculation-based).  Round 2+ receives a
+        programmatic cluster summary of prior results so the LLM can
+        target unexplored behaviour classes (evidence-based).
+
+        Parameters
+        ----------
+        exploration_rounds:
+            Number of exploration rounds (default 2).  Set to 1 to
+            restore single-shot behaviour.
 
         Returns a list of ``SnippetResult`` with real outputs from the
         executor.
@@ -320,63 +343,178 @@ async def explore(
             "codemode.explore",
             func_name=func.name,
             executor=type(self.executor).__name__,
+            exploration_rounds=exploration_rounds,
         ):
-            # 1. Ask the LLM for exploration snippets
-            plan = await self._get_exploration_plan(func)
+            all_results: list[SnippetResult] = []
 
-            # 2. Compile function source once, feed each snippet
-            all_snippets = [
-                *((s, "normal") for s in plan.snippets),
-                *((s, "error") for s in plan.error_snippets),
-            ]
-            total = len(all_snippets)
-            results: list[SnippetResult] = []
-            with self.executor.create_session(func.code) as session:
-                for i, (snippet, kind) in enumerate(all_snippets):
-                    with logfire.span(
-                        "codemode.execute_snippet",
-                        index=i,
-                        kind=kind,
-                        description=snippet.description,
-                    ):
-                        logfire.info(
-                            "Executing snippet {index}/{total} [{kind}]: {description}",
-                            index=i + 1,
-                            total=total,
-                            kind=kind,
-                            description=snippet.description,
-                            code=snippet.code,
+            for round_num in range(1, exploration_rounds + 1):
+                with logfire.span(
+                    "codemode.explore_round",
+                    round=round_num,
+                    prior_results=len(all_results),
+                ):
+                    # Get exploration plan (round 2+ includes prior context)
+                    if round_num == 1:
+                        plan = await self._get_exploration_plan(func)
+                    else:
+                        cluster_summary = self._build_cluster_summary(all_results)
+                        plan = await self._get_targeted_exploration_plan(
+                            func,
+                            all_results,
+                            cluster_summary,
                         )
+                        # Early exit: if no new snippets were produced
+                        if not plan.snippets and not plan.error_snippets:
+                            logfire.info(
+                                "Round {round} produced no new snippets, stopping",
+                                round=round_num,
+                            )
+                            break
 
-                        exec_result = session.feed(snippet.code)
-
-                        sr = SnippetResult.from_execution(snippet, exec_result)
-                        results.append(sr)
+                    # Execute snippets
+                    new_results = self._execute_plan(func, plan, round_num)
+                    all_results.extend(new_results)
 
+                    # Early exit: round 2+ found no new behaviour
+                    if round_num > 1:
+                        new_behaviors = self._count_new_behaviors(
+                            all_results[: -len(new_results)],
+                            new_results,
+                        )
                         logfire.info(
-                            "Snippet result: success={success}, output={output}, "
-                            "duration={duration_ms:.2f}ms",
-                            success=sr.success,
-                            output=repr(sr.output)[:200],
-                            duration_ms=sr.duration_ms,
-                            error=sr.error,
-                            error_type=sr.error_type,
+                            "Round {round}: {new} new behaviour classes discovered",
+                            round=round_num,
+                            new=new_behaviors,
                         )
 
             # Summary log
-            successes = sum(1 for r in results if r.success)
-            failures = len(results) - successes
+            successes = sum(1 for r in all_results if r.success)
+            failures = len(all_results) - successes
             logfire.info(
                 "Exploration complete: {successes} succeeded, {failures} raised errors",
                 successes=successes,
                 failures=failures,
             )
 
-            return results
+            return all_results
+
+    def _execute_plan(
+        self,
+        func: Function,
+        plan: ExplorationPlan,
+        round_num: int = 1,
+    ) -> list[SnippetResult]:
+        """Execute all snippets in a plan and return results."""
+        all_snippets = [
+            *((s, "normal") for s in plan.snippets),
+            *((s, "error") for s in plan.error_snippets),
+        ]
+        total = len(all_snippets)
+        results: list[SnippetResult] = []
+        with self.executor.create_session(func.code) as session:
+            for i, (snippet, kind) in enumerate(all_snippets):
+                with logfire.span(
+                    "codemode.execute_snippet",
+                    index=i,
+                    kind=kind,
+                    round=round_num,
+                    description=snippet.description,
+                ):
+                    logfire.info(
+                        "Executing snippet {index}/{total} R{round} [{kind}]: {description}",
+                        index=i + 1,
+                        total=total,
+                        round=round_num,
+                        kind=kind,
+                        description=snippet.description,
+                        code=snippet.code,
+                    )
+
+                    exec_result = session.feed(snippet.code)
+
+                    sr = SnippetResult.from_execution(snippet, exec_result)
+                    results.append(sr)
+
+                    logfire.info(
+                        "Snippet result: success={success}, output={output}, "
+                        "duration={duration_ms:.2f}ms",
+                        success=sr.success,
+                        output=repr(sr.output)[:200],
+                        duration_ms=sr.duration_ms,
+                        error=sr.error,
+                        error_type=sr.error_type,
+                    )
+        return results
+
+    @staticmethod
+    def _build_cluster_summary(results: list[SnippetResult]) -> str:
+        """Build a deterministic cluster summary from exploration results.
+
+        Groups results by output type / error type and formats a concise
+        summary for the Round 2+ (targeted) exploration prompt.
+        """
+        # -- Success clusters --
+        success_types: dict[str, int] = {}
+        for r in results:
+            if r.success:
+                t = type(r.output).__name__
+                success_types[t] = success_types.get(t, 0) + 1
+
+        # -- Error clusters --
+        error_clusters: dict[str, list[str]] = {}
+        for r in results:
+            if not r.success and r.error_type:
+                msgs = error_clusters.setdefault(r.error_type, [])
+                prefix = (r.error or "")[:60]
+                if prefix not in msgs:
+                    msgs.append(prefix)
+
+        # -- Already-tried snippets (to avoid repeats) --
+        tried_codes = [r.code.strip() for r in results]
+
+        lines = ["## Observed Behaviour Clusters\n"]
+
+        lines.append("### Success clusters")
+        if success_types:
+            for t, count in sorted(success_types.items()):
+                lines.append(f"- output type `{t}`: {count} cases")
+        else:
+            lines.append("- (none)")
+
+        lines.append("\n### Error clusters")
+        if error_clusters:
+            for etype, msgs in sorted(error_clusters.items()):
+                lines.append(f"- `{etype}` ({len(msgs)} distinct messages):")
+                for m in msgs:
+                    lines.append(f'  - "{m}"')
+        else:
+            lines.append("- (none)")
+
+        lines.append(f"\n### Already tried ({len(tried_codes)} snippets — do NOT repeat these)")
+        for code in tried_codes:
+            lines.append(f"- `{code}`")
+
+        return "\n".join(lines)
+
+    @staticmethod
+    def _count_new_behaviors(
+        prior: list[SnippetResult],
+        new: list[SnippetResult],
+    ) -> int:
+        """Count how many new behaviour classes the new results introduced."""
+
+        def _behavior_key(r: SnippetResult) -> str:
+            if r.success:
+                return f"ok:{type(r.output).__name__}"
+            return f"err:{r.error_type}:{(r.error or '')[:40]}"
+
+        prior_keys = {_behavior_key(r) for r in prior}
+        new_keys = {_behavior_key(r) for r in new}
+        return len(new_keys - prior_keys)
 
     async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
-        """Ask the LLM for exploration snippets."""
-        with logfire.span("codemode.llm_explore", func_name=func.name):
+        """Ask the LLM for exploration snippets (Round 1 — static reasoning)."""
+        with logfire.span("codemode.llm_explore", func_name=func.name, round=1):
             prompt = f"""Explore the following function by writing test snippets:
 
 <FunctionName>{func.name}</FunctionName>
@@ -393,7 +531,58 @@ async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
             plan = result.output
 
             logfire.info(
-                "LLM produced {normal} normal + {error} error snippets",
+                "Round 1: LLM produced {normal} normal + {error} error snippets",
+                normal=len(plan.snippets),
+                error=len(plan.error_snippets),
+                snippets=[s.description for s in plan.snippets],
+                error_snippets=[s.description for s in plan.error_snippets],
+            )
+            return plan
+
+    async def _get_targeted_exploration_plan(
+        self,
+        func: Function,
+        prior_results: list[SnippetResult],
+        cluster_summary: str,
+    ) -> ExplorationPlan:
+        """Ask the LLM for targeted snippets (Round 2+ — evidence-based)."""
+        with logfire.span("codemode.llm_explore", func_name=func.name, round=2):
+            prompt = f"""You previously explored `{func.name}` and the snippets were
+executed.  Below are the ACTUAL results and a cluster summary.
+
+Your job now is to find **new behaviour classes** that were NOT covered
+in Round 1.  Focus on:
+- Syntax / input combinations not yet tried
+- Edge cases at boundaries between observed clusters
+- Error paths whose exact error type or message differs from expectation
+- Interactions between parameters / sub-expressions
+
+<FunctionName>{func.name}</FunctionName>
+<FunctionCode>
+{func.code}
+</FunctionCode>
+<Description>{func.description}</Description>
+
+<Round1Results>
+{chr(10).join(r.to_context_block() for r in prior_results)}
+</Round1Results>
+
+<ClusterSummary>
+{cluster_summary}
+</ClusterSummary>
+
+RULES:
+- Do NOT repeat any snippet from the "Already tried" list.
+- Produce 8–12 NEW normal snippets targeting uncovered behaviour.
+- Produce 3–5 NEW error snippets targeting untried error paths.
+- Same strict rules as before: no try/except, real function name,
+  one scenario per snippet, last expression captured."""
+
+            result = await self.explorer_agent.run(prompt)
+            plan = result.output
+
+            logfire.info(
+                "Round 2: LLM produced {normal} normal + {error} error snippets",
                 normal=len(plan.snippets),
                 error=len(plan.error_snippets),
                 snippets=[s.description for s in plan.snippets],
@@ -471,13 +660,21 @@ async def generate_spec(
 </ErrorResults>
 
 REQUIREMENTS:
-- Use {func.name} as eval_id.
+- The top-level YAML key MUST be `{func.name}` (the function name).
 - Generate at least {max(len(exploration_results), 5)} diverse test cases.
 - Use the EXACT outputs from the execution results above.
 - You MUST generate exactly {len(error_results)} raises cases — one for
   each RAISED result above.  The spec is invalid without them.
 - Cover normal, edge, and error cases.
 - In assertions, use `input` (NOT `inputs`) for accessing input values.
+
+YAML FORMAT — STRICT RULES (violations cause parse failure):
+- NEVER use YAML tags: `!!python/tuple`, `!!python/object`, `!!binary`,
+  `!!omap`, `!!str`, `!!int`, `!!float`, or ANY `!!` tag whatsoever.
+  Plain YAML scalars and sequences only.  `yaml.safe_load()` will be used
+  to parse the output — it rejects all `!!` tags and will hard-fail.
+- Represent tuples as YAML sequences (lists).
+- NEVER emit `!!python/...` or any non-standard YAML type annotation.
 {refinement_block}"""
 
             logfire.info(
@@ -490,11 +687,13 @@ async def generate_spec(
             result = await self.spec_agent.run(prompt)
             yaml_spec = result.output.yaml_spec
 
-            # Sanitize: strip YAML tags that safe_load rejects
+            # Sanitize: strip ALL !!<tag> annotations.  safe_load rejects the
+            # python-specific ones (!!python/tuple etc.), and the prompt above
+            # asks for plain scalars and sequences only, so no tag is needed.
+            # After stripping, each value is parsed as its plain YAML type.
             import re
 
-            yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec)
-            yaml_spec = re.sub(r"!!binary\b", "", yaml_spec)
+            yaml_spec = re.sub(r"!![^\s\[\]{},]+", "", yaml_spec)
 
             # Validate YAML syntax
             yaml.safe_load(yaml_spec)
@@ -581,15 +780,18 @@ async def generate(
 
         Pipeline::
 
-            Phase 1: explore()                        (once)
+            Phase 1: explore()                        (2 rounds by default)
+              Round 1 — static reasoning (speculation-based)
+              Round 2 — targeted exploration (evidence-based)
             Phase 2: generate_spec()                  (may loop)
             Phase 3: validate via RunEvals            (per attempt)
             Phase 4: refine on failure                (up to N rounds)
             Phase 5: inject_durations()               (once, at end)
 
-        Exploration (Phase 1) runs once — the ground-truth snippet results
-        don't change.  Only spec generation (Phase 2) is re-run on failure,
-        with a failure report injected into the prompt.
+        Exploration (Phase 1) runs in two rounds.  Round 1 uses static
+        reasoning; Round 2 receives a cluster summary of Round 1 results
+        and targets uncovered behaviour classes.  Only spec generation
+        (Phase 2) is re-run on validation failure.
 
         Parameters
         ----------
@@ -618,7 +820,8 @@ async def generate(
         with logfire.span(
             "codemode.pipeline",
             func_name=func.name,
-            model=self.model,
+            spec_model=self.spec_model,
+            exploration_model=self.exploration_model,
             executor=type(self.executor).__name__,
         ):
             t0 = time.perf_counter()
@@ -726,7 +929,7 @@ async def generate(
 
             elapsed = (time.perf_counter() - t0) * 1000
             logfire.info(
-                "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={rounds})",
+                "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})",
                 elapsed=elapsed,
                 func_name=func.name,
                 exploration_count=len(exploration_results),
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
index 241a71d..e9cc14d 100644
--- a/src/vowel/eval_types.py
+++ b/src/vowel/eval_types.py
@@ -20,14 +20,14 @@
     EvalsFile: Root model for YAML file parsing
 """
 
-import logfire
 import os
+import typing
 from typing import Any, Literal
 
+import logfire
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic.experimental.missing_sentinel import MISSING
 
-
 # =============================================================================
 # LLM Output Models
 # =============================================================================
@@ -106,6 +106,63 @@
         assertion: "len(output) == 2"
 """
 
+SAFE_ASSERTION_BUILTINS = {
+    "abs": abs,
+    "all": all,
+    "any": any,
+    "bool": bool,
+    "dict": dict,
+    "enumerate": enumerate,
+    "float": float,
+    "int": int,
+    "isinstance": isinstance,
+    "len": len,
+    "list": list,
+    "max": max,
+    "min": min,
+    "range": range,
+    "round": round,
+    "set": set,
+    "sorted": sorted,
+    "str": str,
+    "sum": sum,
+    "tuple": tuple,
+    "type": type,
+    "zip": zip,
+}
+
+SAFE_TYPE_NAMES: dict[str, Any] = {
+    "Any": Any,
+    "None": None,
+    "bool": bool,
+    "bytes": bytes,
+    "dict": dict,
+    "float": float,
+    "frozenset": frozenset,
+    "int": int,
+    "list": list,
+    "object": object,
+    "set": set,
+    "str": str,
+    "tuple": tuple,
+    "typing": typing,
+}
+SAFE_TYPE_NAMES.update(
+    {name: getattr(typing, name) for name in dir(typing) if not name.startswith("_")}
+)
+
+
+def _eval_assertion_restricted(assertion: str, env: dict[str, Any]) -> bool:
+    namespace = {"__builtins__": SAFE_ASSERTION_BUILTINS}
+    namespace.update(env)
+    return bool(eval(assertion, namespace, namespace))
+
+
+def _eval_type_restricted(type_expr: str) -> Any:
+    namespace = {"__builtins__": {}}
+    namespace.update(SAFE_TYPE_NAMES)
+    return eval(type_expr, namespace, namespace)
+
 
 class EvalsSource(BaseModel):
     """LLM output model for YAML eval specification."""
@@ -203,7 +260,11 @@ class IsInstanceCase(BaseModel):
     )
 
     def evaluate(self, output: Any) -> bool:
-        return isinstance(output, eval(self.type))
+        try:
+            expected = _eval_type_restricted(self.type)
+        except Exception:
+            expected = eval(self.type)
+        return isinstance(output, expected)
 
 
 class AssertionCase(BaseModel):
@@ -258,7 +319,10 @@ class AssertionCase(BaseModel):
 
     def evaluate(self, input: Any, output: Any) -> bool:
         env = {"input": input, "output": output}
-        return eval(self.assertion, env, env)
+        try:
+            return _eval_assertion_restricted(self.assertion, env)
+        except Exception:
+            return bool(eval(self.assertion, env, env))
 
 
 class DurationCase(BaseModel):
@@ -774,7 +838,7 @@ def model_validate(cls, obj, **kwargs):
 
     def get_evals(self) -> dict[str, Evals]:
         result = {}
-        extras = getattr(self, "__pydantic_extra__", None) or {}
+        extras = getattr(self, "__pydantic_extra__", {})
         for key, value in extras.items():
             if key == "fixtures":
                 continue
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index f9fc79b..5e78ecd 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -23,6 +23,7 @@
 from contextlib import suppress
 from dataclasses import dataclass
 
+import logfire
 from pydantic import ValidationError
 from pydantic.type_adapter import TypeAdapter
 from pydantic_ai.settings import ModelSettings
@@ -30,6 +31,63 @@
 
 MONTY_AVAILABLE = bool(importlib.util.find_spec("pydantic-monty"))
 
+SAFE_ASSERTION_BUILTINS = {
+    "abs": abs,
+    "all": all,
+    "any": any,
+    "bool": bool,
+    "dict": dict,
+    "enumerate": enumerate,
+    "float": float,
+    "int": int,
+    "isinstance": isinstance,
+    "len": len,
+    "list": list,
+    "max": max,
+    "min": min,
+    "range": range,
+    "round": round,
+    "set": set,
+    "sorted": sorted,
+    "str": str,
+    "sum": sum,
+    "tuple": tuple,
+    "type": type,
+    "zip": zip,
+}
+
+SAFE_TYPE_NAMES = {
+    "Any": typing.Any,
+    "None": None,
+    "bool": bool,
+    "bytes": bytes,
+    "dict": dict,
+    "float": float,
+    "frozenset": frozenset,
+    "int": int,
+    "list": list,
+    "object": object,
+    "set": set,
+    "str": str,
+    "tuple": tuple,
+    "typing": typing,
+}
+SAFE_TYPE_NAMES.update(
+    {name: getattr(typing, name) for name in dir(typing) if not name.startswith("_")}
+)
+
+
+def _eval_assertion_restricted(condition: str, inputs: dict[str, typing.Any]) -> bool:
+    env = {"__builtins__": SAFE_ASSERTION_BUILTINS}
+    env.update(inputs)
+    return bool(eval(condition, env, env))
+
+
+def _eval_type_restricted(type_expr: str) -> typing.Any:
+    env = {"__builtins__": {}}
+    env.update(SAFE_TYPE_NAMES)
+    return eval(type_expr, env, env)
+
 
 def prepare_env_and_condition(ctx: EvaluatorContext, condition: str) -> tuple[dict, str]:
     """
@@ -123,6 +181,20 @@ def eval_python(self, condition: str, inputs: dict) -> EvaluationReason:
                 )
 
         except Exception:
+            pass
+
+        try:
+            if _eval_assertion_restricted(self.condition, inputs):
+                return EvaluationReason(
+                    value=True, reason=f"Assertion passed for condition: {condition}"
+                )
+        except Exception as exc:
+            logfire.info(
+                "Restricted assertion eval failed; falling back to raw eval",
+                condition=self.condition,
+                error_type=type(exc).__name__,
+                error=str(exc),
+            )
             with suppress(Exception):
                 if eval(self.condition, inputs, inputs):
                     return EvaluationReason(
@@ -147,15 +219,8 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
         """Validate that output matches the expected type."""
         if isinstance(ctx.output, dict) and "_exception" in ctx.output:
             return EvaluationReason(value=True, reason="Skipped (exception case)")
-        type_env = {
-            "typing": typing,
-            "__import__": None,
-            "eval": None,
-            "exec": None,
-            "compile": None,
-        }
         try:
-            expected_type = eval(self.type, type_env, type_env)
+            expected_type = _eval_type_restricted(self.type)
             ta = TypeAdapter(expected_type)
         except Exception:
             return EvaluationReason(
@@ -325,7 +390,9 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
             )
         if self.expected_exception_match:
             exception_message = str(actual_exception)
-            if not re.search(self.expected_exception_match, exception_message, re.I):
+            if self.expected_exception_match != exception_message and not re.search(
+                self.expected_exception_match, exception_message, re.I
+            ):
                 return EvaluationReason(
                     value=False,
                     reason=(
diff --git a/src/vowel/executor.py b/src/vowel/executor.py
index 9de81e1..fe14dbd 100644
--- a/src/vowel/executor.py
+++ b/src/vowel/executor.py
@@ -79,6 +79,8 @@
 from dataclasses import dataclass
 from typing import Any, Literal, Protocol, runtime_checkable
 
+import logfire as _logfire
+
 NEST_AVAILABLE = importlib.util.find_spec("nest_asyncio") is not None
 MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None
 
@@ -298,12 +300,9 @@ def __init__(
         def _print_callback(_stream: str, text: str) -> None:
             stdout_lines.append(text)
 
-        # Compile + execute setup code (function definitions, imports, etc.)
-        self._repl, _init_output = pydantic_monty.MontyRepl.create(
-            setup_code,
-            limits=self._limits,
-            print_callback=_print_callback,
-        )
+        # Create empty REPL and initialize it with setup code
+        self._repl = pydantic_monty.MontyRepl(limits=self._limits)
+        self._repl.feed_run(setup_code, print_callback=_print_callback)
         self._setup_stdout = "\n".join(stdout_lines)
 
     def feed(self, code: str) -> ExecutionResult:
@@ -315,11 +314,11 @@ def _print_callback(_stream: str, text: str) -> None:
 
         t0 = time.perf_counter()
         try:
-            if not self._repl:
+            if not getattr(self, "_repl", None):
                 # TODO: wrap with custom exception and detailed message
                 raise ValueError("Repl not found.")
             else:
-                output = self._repl.feed(code, print_callback=_print_callback)
+                output = self._repl.feed_run(code, print_callback=_print_callback)
                 duration_ms = (time.perf_counter() - t0) * 1000
                 return ExecutionResult(
                     output=output,
@@ -364,7 +363,8 @@ def _print_callback(_stream: str, text: str) -> None:
 
     def close(self) -> None:
         """Release the REPL instance."""
-        self._repl = None  # type: ignore[assignment]
+        # TODO: not sure whether releasing the REPL instance is needed
+        # self._repl = None  # type: ignore
 
     def __enter__(self) -> MontyReplSession:
         return self
@@ -377,8 +377,6 @@ def __exit__(self, *_: Any) -> None:
 # FallbackSession — Monty with auto-fallback to DefaultSession
 # ---------------------------------------------------------------------------
 
-import logfire as _logfire
-
 
 class FallbackSession:
     """Session that tries MontyReplSession first, falls back to DefaultSession.
@@ -401,12 +399,14 @@ def __init__(
         *,
         timeout: float = 5.0,
         max_memory: int = 10 * 1024 * 1024,
+        fallback_executor: Executor | None = None,
     ) -> None:
         self._setup_code = setup_code
         self._timeout = timeout
         self._max_memory = max_memory
+        self._fallback_executor = fallback_executor or DefaultExecutor()
         self._monty_session: MontyReplSession | None = None
-        self._default_session: DefaultSession | None = None
+        self._fallback_session: ExecutionSession | None = None
         self._monty_failed_permanently = False
 
         try:
@@ -417,52 +417,54 @@ def __init__(
             )
         except Exception as exc:
             _logfire.info(
-                "Monty session creation failed ({exc_type}: {exc_msg}), falling back to DefaultSession",
+                "Monty session creation failed ({exc_type}: {exc_msg}), falling back to {fallback}",
                 exc_type=type(exc).__name__,
                 exc_msg=str(exc),
+                fallback=type(self._fallback_executor).__name__,
             )
             self._monty_failed_permanently = True
-            self._default_session = DefaultSession(
+            self._fallback_session = self._fallback_executor.create_session(
                 setup_code,
                 timeout=timeout,
                 max_memory=max_memory,
             )
 
-    def _get_default_session(self) -> DefaultSession:
-        """Lazily create the DefaultSession (only when first needed)."""
-        if self._default_session is None:
-            self._default_session = DefaultSession(
+    def _get_fallback_session(self) -> ExecutionSession:
+        """Lazily create the fallback session (only when first needed)."""
+        if self._fallback_session is None:
+            self._fallback_session = self._fallback_executor.create_session(
                 self._setup_code,
                 timeout=self._timeout,
                 max_memory=self._max_memory,
             )
-        return self._default_session
+        return self._fallback_session
 
     def feed(self, code: str) -> ExecutionResult:
-        """Execute *code*, falling back to DefaultSession on Monty gaps."""
+        """Execute *code*, falling back to the configured session on Monty gaps."""
         # Session-level fallback — Monty never worked
         if self._monty_failed_permanently:
-            return self._get_default_session().feed(code)
+            return self._get_fallback_session().feed(code)
 
         assert self._monty_session is not None
         result = self._monty_session.feed(code)
 
         # Snippet-level fallback — ModuleNotFoundError means Monty
-        # doesn't have that stdlib module; retry with DefaultSession.
+        # doesn't have that stdlib module; retry with fallback session.
         if not result.success and result.error_type == "ModuleNotFoundError":
             _logfire.info(
-                "Monty ModuleNotFoundError, retrying snippet with DefaultSession: {error}",
+                "Monty ModuleNotFoundError, retrying snippet with {fallback}: {error}",
+                fallback=type(self._fallback_executor).__name__,
                 error=result.error,
             )
-            return self._get_default_session().feed(code)
+            return self._get_fallback_session().feed(code)
 
         return result
 
     def close(self) -> None:
         if self._monty_session is not None:
             self._monty_session.close()
-        if self._default_session is not None:
-            self._default_session.close()
+        if self._fallback_session is not None:
+            self._fallback_session.close()
 
     def __enter__(self) -> FallbackSession:
         return self
@@ -500,11 +502,12 @@ class MontyExecutor:
         If ``pydantic-monty`` is not installed.
     """
 
-    def __init__(self) -> None:
+    def __init__(self, fallback_executor: Executor | None = None) -> None:
         if not MONTY_AVAILABLE:
             raise ImportError(
                 'MontyExecutor requires pydantic-monty. Install it with: pip install "vowel[monty]"'
             )
+        self._fallback_executor = fallback_executor or DefaultExecutor()
 
     async def execute(
         self,
@@ -675,6 +678,7 @@ def create_session(
             setup_code,
             timeout=timeout,
             max_memory=max_memory,
+            fallback_executor=self._fallback_executor,
         )
 
 
@@ -918,6 +922,142 @@ def create_session(
         )
 
 
+class ResolvedExecutor:
+    """Executor wrapper that retries on ``fallback`` when ``primary`` raises.
+
+    Only an exception escaping the primary call triggers the fallback; any
+    result the primary returns is passed through unchanged.  Exceptions
+    raised by the fallback itself propagate to the caller.  Every fallback
+    is logged together with the primary's error message so the cause of the
+    switch can be diagnosed.
+    """
+
+    def __init__(self, primary: Executor, fallback: Executor) -> None:
+        self.primary = primary
+        self.fallback = fallback
+
+    def _log_fallback(self, operation: str, exc: Exception) -> None:
+        """Log that ``operation`` failed on the primary and is retried on the fallback."""
+        _logfire.info(
+            "Primary executor {primary} {operation} raised {exc_type}; "
+            "falling back to {fallback}: {error}",
+            primary=type(self.primary).__name__,
+            operation=operation,
+            exc_type=type(exc).__name__,
+            fallback=type(self.fallback).__name__,
+            error=str(exc),
+        )
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Run ``code`` on the primary executor, retrying on the fallback if it raises.
+
+        All keyword options are forwarded unchanged to whichever executor runs.
+        Returns the ``ExecutionResult`` produced by that executor.
+        """
+        options: dict[str, Any] = {
+            "inputs": inputs,
+            "external_functions": external_functions,
+            "timeout": timeout,
+            "max_memory": max_memory,
+        }
+        try:
+            return await self.primary.execute(code, **options)
+        except Exception as exc:  # noqa: BLE001
+            # Deliberately broad: any failure of the primary backend is a reason to fall back.
+            self._log_fallback("execute", exc)
+            return await self.fallback.execute(code, **options)
+
+    def execute_sync(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Synchronous counterpart of ``execute`` with the same fallback rule.
+
+        All keyword options are forwarded unchanged to whichever executor runs.
+        """
+        options: dict[str, Any] = {
+            "inputs": inputs,
+            "external_functions": external_functions,
+            "timeout": timeout,
+            "max_memory": max_memory,
+        }
+        try:
+            return self.primary.execute_sync(code, **options)
+        except Exception as exc:  # noqa: BLE001
+            # Deliberately broad: any failure of the primary backend is a reason to fall back.
+            self._log_fallback("execute_sync", exc)
+            return self.fallback.execute_sync(code, **options)
+
+    def create_session(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionSession:
+        """Open a session on the primary executor, or on the fallback if that raises.
+
+        Only session *creation* is guarded here; the session is returned as
+        created, without any additional wrapping by this class.
+        """
+        try:
+            return self.primary.create_session(setup_code, timeout=timeout, max_memory=max_memory)
+        except Exception as exc:  # noqa: BLE001
+            # Deliberately broad: any failure of the primary backend is a reason to fall back.
+            self._log_fallback("session creation", exc)
+            return self.fallback.create_session(setup_code, timeout=timeout, max_memory=max_memory)
+
+
+def resolve_executors(
+    executor: Executor | None = None,
+    fallback_executor: Executor | None = None,
+) -> Executor:
+    """Pair ``executor`` with a fallback while preserving Monty-first defaults.
+
+    An explicit ``fallback_executor`` always wins.  When it is omitted, a
+    ``ResolvedExecutor`` or ``MontyExecutor`` keeps the fallback it already
+    carries instead of having it replaced by a fresh ``DefaultExecutor``.
+    """
+    fallback = fallback_executor or DefaultExecutor()
+    if isinstance(executor, ResolvedExecutor):
+        if fallback_executor is None:
+            return executor
+        return ResolvedExecutor(executor.primary, fallback)
+    if executor is None:
+        if MONTY_AVAILABLE:
+            return MontyExecutor(fallback_executor=fallback)
+        import warnings
+
+        warnings.warn(
+            "pydantic-monty not installed; using fallback executor "
+            f'{type(fallback).__name__} (no sandboxing). Install with: pip install "vowel[monty]"',
+            stacklevel=2,
+        )
+        return fallback
+    if isinstance(executor, DefaultExecutor) and fallback_executor is None:
+        return executor
+    if isinstance(executor, MontyExecutor):
+        if fallback_executor is not None:  # keep the caller-configured fallback otherwise
+            executor._fallback_executor = fallback_executor  # type: ignore[attr-defined]
+        return executor
+    if executor is fallback:
+        return executor
+    return ResolvedExecutor(executor, fallback)
+
+
 # ---------------------------------------------------------------------------
 # Factory
 # ---------------------------------------------------------------------------
diff --git a/src/vowel/runner.py b/src/vowel/runner.py
index 4018fcd..9906018 100644
--- a/src/vowel/runner.py
+++ b/src/vowel/runner.py
@@ -36,7 +36,8 @@ def my_func(x):
 from pydantic import BaseModel, Field
 
 from .eval_types import Evals, EvalsFile, FixtureDefinition
-from .utils import EvalSummary
+from .executor import Executor
+from .utils import EvalSummary, EvalsBundle
 from .utils import run_evals as _run_evals
 
 _T = TypeVar("_T", bound=Any)
@@ -129,7 +130,7 @@ def _sanitize_code(code: str) -> str:
             code = code.replace('\\"', '"').replace("\\'", "'")
 
         # 2. Remove typing imports of builtin generics
-        _BUILTIN_GENERICS = {
+        _builtin_generics = {
             "Dict",
             "List",
             "Tuple",
@@ -146,7 +147,7 @@ def _sanitize_code(code: str) -> str:
 
         def _clean_typing_import(m: _re.Match) -> str:
             names = [n.strip() for n in m.group(1).split(",")]
-            remaining = [n for n in names if n not in _BUILTIN_GENERICS]
+            remaining = [n for n in names if n not in _builtin_generics]
             if not remaining:
                 return ""  # remove the entire import line
             return f"from typing import {', '.join(remaining)}"
@@ -281,7 +282,7 @@ class RunEvals:
 
     def __init__(
         self,
-        source: str | Path | dict | EvalsFile | Evals | Sequence[Evals],
+        source: str | Path | dict | EvalsFile | EvalsBundle | Evals | Sequence[Evals],
         *,
         functions: dict[str, Callable] | None = None,
         filter_funcs: list[str] | None = None,
@@ -292,6 +293,8 @@ def __init__(
             dict[str, Callable | tuple[Callable, Callable | None] | FixtureDefinition] | None
         ) = None,
         ignore_duration: bool = False,
+        executor: Executor | None = None,
+        fallback_executor: Executor | None = None,
     ):
         self._source = source
         self._functions = functions or {}
@@ -301,6 +304,8 @@ def __init__(
         self._serial_fn = serial_fn or {}
         self._fixtures = fixtures or {}
         self._ignore_duration = ignore_duration
+        self._executor = executor
+        self._fallback_executor = fallback_executor
 
     @classmethod
     def from_file(cls, path: str | Path) -> "RunEvals":
@@ -318,6 +323,22 @@ def from_file(cls, path: str | Path) -> "RunEvals":
         """
         return cls(str(path))
 
+    @classmethod
+    def from_bundle(cls, bundle: EvalsBundle) -> "RunEvals":
+        """
+        Create a runner from an EvalsBundle object.
+
+        Args:
+            bundle: EvalsBundle passed unchanged as the runner's ``source``
+
+        Returns:
+            RunEvals instance wrapping ``bundle``
+
+        Example:
+            RunEvals.from_bundle(bundle).run()
+        """
+        return cls(bundle)
+
     @classmethod
     def from_source(cls, source: str | dict | EvalsFile) -> "RunEvals":
         """
@@ -565,6 +586,8 @@ def run(self) -> EvalSummary:
             serial_fn=self._serial_fn,
             fixtures=self._fixtures,
             ignore_duration=self._ignore_duration,
+            executor=self._executor,
+            fallback_executor=self._fallback_executor,
         )
 
     def ignore_duration(self) -> "RunEvals":
@@ -579,3 +602,14 @@ def ignore_duration(self) -> "RunEvals":
         """
         self._ignore_duration = True
         return self
+
+    def with_executor(
+        self,
+        executor: Executor | None = None,
+        *,
+        fallback_executor: Executor | None = None,
+    ) -> "RunEvals":
+        """Record executor preferences for later execution-aware flows; returns ``self``."""
+        # Tuple assignment overwrites both preferences; omitted arguments reset to None.
+        self._executor, self._fallback_executor = executor, fallback_executor
+        return self
diff --git a/src/vowel/spec_validation.py b/src/vowel/spec_validation.py
index d559dda..293762c 100644
--- a/src/vowel/spec_validation.py
+++ b/src/vowel/spec_validation.py
@@ -12,7 +12,7 @@
 import logfire
 import yaml
 
-from vowel.executor import Executor, get_executor
+from vowel.executor import Executor, resolve_executors
 from vowel.runner import Function
 from vowel.utils import EvalSummary
 
@@ -70,6 +70,7 @@ def inject_durations(
     func: Function,
     executor: Executor,
     *,
+    fallback_executor: Executor | None = None,
     buffer_pct: float = 0.5,
     floor_ms: float = 10.0,
 ) -> str:
@@ -97,6 +98,8 @@ def inject_durations(
     if not isinstance(spec, dict):
         return yaml_spec
 
+    executor = resolve_executors(executor, fallback_executor)
+
     try:
         session = executor.create_session(func.code)
     except Exception:
@@ -127,13 +130,14 @@ def inject_durations(
                     )
                     case["duration"] = round(dur, 1)
 
-    return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+    return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
 
 
 def validate_expected_values(
     yaml_spec: str,
     func: Function,
     executor: Executor | None = None,
+    fallback_executor: Executor | None = None,
 ) -> str:
     """Validate and fix expected values in a YAML spec by executing cases.
 
@@ -152,14 +156,14 @@ def validate_expected_values(
     func:
         Function to execute.
     executor:
-        Executor backend.  Defaults to ``get_executor("auto")``.
+        Executor backend.  Defaults to Monty-first with Default fallback.
 
     Returns
     -------
     str
         Fixed YAML spec with corrected expected values.
     """
-    executor = executor or get_executor("auto")
+    executor = resolve_executors(executor, fallback_executor)
 
     spec = yaml.safe_load(yaml_spec)
     if not isinstance(spec, dict):
@@ -189,15 +193,19 @@ def validate_expected_values(
                 result = session.feed(call_code)
 
                 # --- Fix expected values ---
-                if "expected" in case and not case.get("raises"):
-                    if result.success and result.output != case["expected"]:
-                        logfire.info(
-                            "Fixing expected value for case: {expected} → {actual}",
-                            expected=repr(case["expected"]),
-                            actual=repr(result.output),
-                        )
-                        case["expected"] = result.output
-                        fixes_applied += 1
+                if (
+                    "expected" in case
+                    and not case.get("raises")
+                    and result.success
+                    and result.output != case["expected"]
+                ):
+                    logfire.info(
+                        "Fixing expected value for case: {expected} → {actual}",
+                        expected=repr(case["expected"]),
+                        actual=repr(result.output),
+                    )
+                    case["expected"] = result.output
+                    fixes_applied += 1
 
                 # --- Fix raises cases ---
                 if case.get("raises"):
@@ -226,7 +234,7 @@ def validate_expected_values(
 
     if fixes_applied > 0:
         logfire.info("Validated spec: {count} fixes applied", count=fixes_applied)
-        return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+        return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
 
     return yaml_spec
 
@@ -307,6 +315,11 @@ def inject_missing_error_cases(
                 "raises": error_type,
             }
         elif len(args) == 1:
+            # Tuples cannot be represented in yaml.safe_load()-compatible YAML.
+            # Other non-list inputs (None, int, str, dict) already cover the
+            # same TypeError path, so skip rather than convert and break semantics.
+            if isinstance(args[0], tuple):
+                continue
             input_repr = repr((args[0], None))
             if input_repr in existing_raises_inputs:
                 continue
@@ -337,6 +350,6 @@ def inject_missing_error_cases(
 
     if injected > 0:
         logfire.info("Injected {count} missing error cases into spec", count=injected)
-        return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+        return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
 
     return yaml_spec
diff --git a/src/vowel/tdd.py b/src/vowel/tdd.py
index 2f637f1..005fc27 100644
--- a/src/vowel/tdd.py
+++ b/src/vowel/tdd.py
@@ -36,7 +36,7 @@
 
 from vowel.context import EVAL_SPEC_CONTEXT
 from vowel.eval_types import EvalsSource
-from vowel.executor import Executor, get_executor
+from vowel.executor import Executor, resolve_executors
 from vowel.monitoring import enable_monitoring
 from vowel.runner import Function, RunEvals
 from vowel.spec_validation import (
@@ -229,6 +229,7 @@ def __init__(
         additional_context: str | list[str] | None = None,
         load_env: bool = False,
         executor: Executor | None = None,
+        fallback_executor: Executor | None = None,
         **opts,
     ):
         if load_env:
@@ -256,6 +257,7 @@ def __init__(
 
         # Optional executor for expected-value validation
         self._executor = executor
+        self._fallback_executor = fallback_executor
 
         self._opts = opts
 
@@ -792,7 +794,10 @@ def generate_evals_from_signature(
                                 code=real_code,
                                 description=signature.description,
                             )
-                            executor = getattr(self, "_executor", None) or get_executor("auto")
+                            executor = resolve_executors(
+                                getattr(self, "_executor", None),
+                                getattr(self, "_fallback_executor", None),
+                            )
                             yaml_spec = validate_expected_values(
                                 yaml_spec,
                                 val_func,
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index c6c4f67..b092d8a 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -26,9 +26,9 @@
 import importlib
 import importlib.util
 import inspect
-import logfire
 import os
 import sys
+import threading
 import types
 from collections.abc import Callable, Mapping, Sequence
 from datetime import date, datetime, time, timedelta
@@ -38,6 +38,7 @@
 from typing import Any, Literal, Optional, Union, get_args, get_origin
 
 import click
+import logfire
 import yaml
 from pydantic import BaseModel, ConfigDict, Field
 from pydantic_ai import format_as_xml
@@ -55,9 +56,7 @@
     TypeAdapterEvaluator,
     create_llm_judge,
 )
-
-_SYS_PATH_MODIFIED = False
-
+from .executor import Executor
 
 # =============================================================================
 # Evals Bundle - Container for evals and fixtures
@@ -334,14 +333,24 @@ def check_compatibility(func: Callable) -> tuple[bool, list[str]]:
     return False, issues
 
 
-def _ensure_cwd_in_path() -> None:
-    """Ensure current working directory is in sys.path (run once)."""
-    global _SYS_PATH_MODIFIED
-    if not _SYS_PATH_MODIFIED:
-        cwd = os.getcwd()
-        if cwd not in sys.path:
-            sys.path.insert(0, cwd)
-        _SYS_PATH_MODIFIED = True
+@contextlib.contextmanager
+def _cwd_on_syspath() -> Any:
+    """Make the working directory importable inside a ``with`` block, undoing only what was added."""
+    workdir = os.getcwd()
+    if workdir in sys.path:
+        yield
+        return
+    sys.path.insert(0, workdir)
+    try:
+        yield
+    finally:
+        with contextlib.suppress(ValueError):
+            sys.path.remove(workdir)
+
+
+def _is_yaml_source_string(source_str: str) -> bool:
+    """Heuristically decide whether ``source_str`` is inline YAML rather than a file path."""
+    return any(mark in source_str for mark in ("\n", ":")) or source_str.lstrip().startswith("{")
 
 
 def _apply_serializer(
@@ -592,9 +601,9 @@ def __init__(
     ):
         self.definitions = fixtures
         self._fixture_funcs = fixture_funcs or {}
-        self._instances: dict[str, Any] = {}  # Cached fixture instances (all scopes)
-        self._scope_counts: dict[str, int] = {}  # Reference counts for scoped fixtures
+        self._instances: dict[str, Any] = {}  # Cached fixture instances
         self._generators: dict[str, Any] = {}  # Active generator fixtures for cleanup
+        self._lock = threading.RLock()
 
     def setup(self, fixture_name: str) -> Any:
         """
@@ -613,19 +622,19 @@ def setup(self, fixture_name: str) -> Any:
                 f"Available fixtures: {available if available else '(none defined)'}"
             )
 
-        defn = self.definitions[fixture_name]
+        with self._lock:
+            defn = self.definitions[fixture_name]
 
-        # For module/session scope, return cached instance if exists
-        if defn.scope in ("module", "session") and fixture_name in self._instances:
-            self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1
-            return self._instances[fixture_name]
+            # For module/session scope, return cached instance if exists.
+            if defn.scope in ("module", "session") and fixture_name in self._instances:
+                return self._instances[fixture_name]
 
-        # Class-based fixture
-        if defn.cls:
-            return self._setup_class_fixture(fixture_name, defn)
+            # Class-based fixture
+            if defn.cls:
+                return self._setup_class_fixture(fixture_name, defn)
 
-        # Function-based fixture
-        return self._setup_function_fixture(fixture_name, defn)
+            # Function-based fixture
+            return self._setup_function_fixture(fixture_name, defn)
 
     def _setup_class_fixture(self, fixture_name: str, defn: FixtureDefinition) -> Any:
         """Setup a class-based fixture by instantiating the class."""
@@ -644,9 +653,7 @@ def _setup_class_fixture(self, fixture_name: str, defn: FixtureDefinition) -> An
         except Exception as e:
             raise RuntimeError(f"Failed to instantiate {defn.cls}: {e}") from e
 
-        # Cache instance
         self._instances[fixture_name] = instance
-        self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1
 
         return instance
 
@@ -682,9 +689,7 @@ def _setup_function_fixture(self, fixture_name: str, defn: FixtureDefinition) ->
         except Exception as e:
             raise RuntimeError(f"Failed to setup fixture '{fixture_name}': {e}") from e
 
-        # Cache instance (all scopes - function scope will be cleared on teardown)
         self._instances[fixture_name] = instance
-        self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1
 
         return instance
 
@@ -699,22 +704,17 @@ def teardown(self, fixture_name: str, scope_trigger: str = "function") -> None:
         if fixture_name not in self.definitions:
             return
 
-        defn = self.definitions[fixture_name]
+        with self._lock:
+            defn = self.definitions[fixture_name]
 
-        # Only teardown if scope matches
-        if defn.scope != scope_trigger:
-            return
+            # Only teardown if scope matches
+            if defn.scope != scope_trigger:
+                return
 
-        # Decrement reference count
-        if fixture_name in self._scope_counts:
-            self._scope_counts[fixture_name] -= 1
-            # For module/session scope, only teardown when count reaches 0
-            if defn.scope in ("module", "session") and self._scope_counts[fixture_name] > 0:
-                return  # Still in use
+            instance = self._instances.pop(fixture_name, None)
+            if instance is None:
+                return
 
-        # Perform teardown
-        instance = self._instances.pop(fixture_name, None)
-        if instance is not None:
             # Check if this is a generator fixture (pytest-style yield)
             gen = self._generators.pop(fixture_name, None)
             if gen is not None:
@@ -729,7 +729,7 @@ def teardown(self, fixture_name: str, scope_trigger: str = "function") -> None:
                 _, teardown_func = self._fixture_funcs[fixture_name]
             elif defn.teardown:
                 # Check if teardown is a class method (e.g., 'Connection.close')
-                if "." in defn.teardown and defn.cls:
+                if "." in defn.teardown and defn.cls and instance is not None:
                     parts = defn.teardown.split(".")
                     if len(parts) == 2:
                         class_name, method_name = parts
@@ -799,7 +799,6 @@ def import_function(module_path: str) -> Any:
         ImportError: If the module cannot be imported
         AttributeError: If the function is not found in the module
     """
-    _ensure_cwd_in_path()
     tried_combinations = []
 
     if "." not in module_path:
@@ -813,49 +812,50 @@ def import_function(module_path: str) -> Any:
 
     parts = module_path.split(".")
 
-    for i in range(len(parts) - 1, 0, -1):
-        module_name = ".".join(parts[:i])
-        remaining_parts = parts[i:]
-        tried_combinations.append(f"module='{module_name}', attr='{'.'.join(remaining_parts)}'")
+    with _cwd_on_syspath():
+        for i in range(len(parts) - 1, 0, -1):
+            module_name = ".".join(parts[:i])
+            remaining_parts = parts[i:]
+            tried_combinations.append(f"module='{module_name}', attr='{'.'.join(remaining_parts)}'")
 
-        module = None
+            module = None
 
-        try:
-            module = importlib.import_module(module_name)
-        except ImportError as e:
-            logfire.debug(
-                "Standard import failed for '{module_name}': {error}",
-                module_name=module_name,
-                error=str(e),
-            )
-            relative_path = module_name.replace(".", os.sep) + ".py"
-            file_path = os.path.join(os.getcwd(), relative_path)
+            try:
+                module = importlib.import_module(module_name)
+            except ImportError as e:
+                logfire.debug(
+                    "Standard import failed for '{module_name}': {error}",
+                    module_name=module_name,
+                    error=str(e),
+                )
+                relative_path = module_name.replace(".", os.sep) + ".py"
+                file_path = os.path.join(os.getcwd(), relative_path)
 
-            if os.path.exists(file_path):
-                try:
-                    spec = importlib.util.spec_from_file_location(module_name, file_path)
-                    if spec and spec.loader:
-                        module = importlib.util.module_from_spec(spec)
-                        spec.loader.exec_module(module)
+                if os.path.exists(file_path):
+                    try:
+                        spec = importlib.util.spec_from_file_location(module_name, file_path)
+                        if spec and spec.loader:
+                            module = importlib.util.module_from_spec(spec)
+                            spec.loader.exec_module(module)
+                            logfire.debug(
+                                "File-based import succeeded for '{file_path}'", file_path=file_path
+                            )
+                    except Exception as e:
                         logfire.debug(
-                            "File-based import succeeded for '{file_path}'", file_path=file_path
+                            "File-based import failed for '{file_path}': {error}",
+                            file_path=file_path,
+                            error=str(e),
                         )
-                except Exception as e:
-                    logfire.debug(
-                        "File-based import failed for '{file_path}': {error}",
-                        file_path=file_path,
-                        error=str(e),
-                    )
 
-        if module:
-            try:
-                obj: Any = module
-                for part in remaining_parts:
-                    obj = getattr(obj, part)
-                return obj
-            except AttributeError as e:
-                logfire.debug("Attribute lookup failed: {error}", error=str(e))
-                continue
+            if module:
+                try:
+                    obj: Any = module
+                    for part in remaining_parts:
+                        obj = getattr(obj, part)
+                    return obj
+                except AttributeError as e:
+                    logfire.debug("Attribute lookup failed: {error}", error=str(e))
+                    continue
 
     try:
         obj = getattr(builtins, parts[0])
@@ -885,8 +885,6 @@ def import_class(class_path: str) -> type:
         ImportError: If the module cannot be imported
         AttributeError: If the class is not found in the module
     """
-    _ensure_cwd_in_path()
-
     parts = class_path.split(".")
     if len(parts) < 2 or any(not p for p in parts):
         raise ImportError(f"Invalid class path '{class_path}'. Expected format 'module.ClassName'.")
@@ -895,7 +893,8 @@ def import_class(class_path: str) -> type:
     class_name = parts[-1]
 
     try:
-        module = importlib.import_module(module_name)
+        with _cwd_on_syspath():
+            module = importlib.import_module(module_name)
     except ImportError as e:
         raise ImportError(f"Cannot import module '{module_name}': {e}") from e
 
@@ -943,7 +942,7 @@ def load_evals(source: str | Path | dict | EvalsFile) -> dict[str, Evals]:
         # Check if it's an existing file path first, before YAML heuristics
         if os.path.exists(source_str):
             return load_evals_file(source_str)
-        if "\n" in source_str or source_str.strip().startswith("{") or ":" in source_str:
+        if _is_yaml_source_string(source_str):
             return load_evals_from_yaml_string(source_str)
         else:
             return load_evals_file(source_str)
@@ -1003,7 +1002,9 @@ def load_bundle(source: str | Path | dict | EvalsFile) -> EvalsBundle:
         return load_bundle_from_dict(source)
     elif isinstance(source, (str, Path)):
         source_str = str(source)
-        if "\n" in source_str or source_str.strip().startswith("{") or ":" in source_str:
+        if os.path.exists(source_str):
+            return load_bundle_file(source_str)
+        if _is_yaml_source_string(source_str):
             return load_bundle_from_yaml_string(source_str)
         else:
             return load_bundle_file(source_str)
@@ -1983,7 +1984,7 @@ def xml(self) -> str:
 
 
 def run_evals(
-    source: str | Path | dict | EvalsFile,
+    source: str | Path | dict | EvalsFile | EvalsBundle,
     *,
     filter_funcs: list[str] | None = None,
     functions: dict[str, Callable] | None = None,
@@ -1994,6 +1995,8 @@ def run_evals(
         dict[str, Callable | tuple[Callable, Callable | None] | FixtureDefinition] | None
     ) = None,
     ignore_duration: bool = False,
+    executor: Executor | None = None,
+    fallback_executor: Executor | None = None,
 ) -> EvalSummary:
     """
     Run evaluations from various sources.
@@ -2007,12 +2010,18 @@ def run_evals(
         serial_fn: Optional dict of serializer functions (receive full input dict)
         fixtures: Optional dict of fixture functions {name: setup_fn} or {name: (setup_fn, teardown_fn)}
         ignore_duration: If True, skip duration constraints
+        executor: Optional primary executor configuration for execution-aware subflows
+        fallback_executor: Optional fallback executor paired with ``executor``
 
     Returns:
         EvalSummary with aggregated results
     """
     # Load both evals and fixtures from YAML
-    bundle = load_bundle(source)
+    _ = (executor, fallback_executor)
+    if isinstance(source, EvalsBundle):
+        bundle = source
+    else:
+        bundle = load_bundle(source)
     all_evals = bundle.evals
     yaml_fixtures = bundle.fixtures
 
diff --git a/src/vowel/validation.py b/src/vowel/validation.py
index 73d942b..b989f9e 100644
--- a/src/vowel/validation.py
+++ b/src/vowel/validation.py
@@ -388,7 +388,7 @@ def validate_and_fix_spec(
             modified = True
 
     if modified:
-        result.fixed_yaml = yaml.dump(
+        result.fixed_yaml = yaml.safe_dump(
             data, default_flow_style=False, allow_unicode=True, sort_keys=False
         )
         logfire.info(
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..cfd2e2d
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,35 @@
+"""Tests for CLI behavior outside watch mode."""
+
+import json
+
+from click.testing import CliRunner
+
+from vowel.cli import main
+
+
+class TestCliExportJson:
+    """Exercise the ``--export-json`` option of the CLI entry point."""
+
+    def test_export_json_writes_object_payload(self, tmp_path):
+        """The exported file must hold a JSON object, not a JSON-encoded string."""
+        spec_file = tmp_path / "evals.yml"
+        report_file = tmp_path / "results.json"
+        spec_file.write_text(
+            """
+len:
+  dataset:
+    - case:
+        input: [1, 2, 3]
+        expected: 3
+"""
+        )
+
+        cli_args = [str(spec_file), "--export-json", str(report_file), "--quiet"]
+        outcome = CliRunner().invoke(main, cli_args)
+
+        assert outcome.exit_code == 0
+
+        exported = json.loads(report_file.read_text())
+        assert isinstance(exported, dict)
+        assert "summary" in exported
+        assert "results" in exported
diff --git a/tests/test_evaluators.py b/tests/test_evaluators.py
index 272f053..931916c 100644
--- a/tests/test_evaluators.py
+++ b/tests/test_evaluators.py
@@ -144,6 +144,21 @@ def test_case_level_assertion(self):
 
         assert summary.all_passed
 
+    def test_assertion_raw_fallback_preserves_compatibility(self):
+        """Assertions outside the restricted builtins set must keep working via fallback."""
+        evaluators = {"Assertion": {"assertion": "pow(output, 2) == 16"}}
+        cases = [{"case": {"input": 4}}]
+        spec = {"identity": {"evals": evaluators, "dataset": cases}}
+
+        # The function under evaluation simply echoes its input back.
+        functions = {"identity": lambda x: x}
+
+        runner = RunEvals.from_dict(spec)
+        runner = runner.with_functions(functions)
+        summary = runner.run()
+
+        assert summary.all_passed
+
 
 class TestTypeEvaluator:
     """Tests for type checking evaluator."""
diff --git a/tests/test_executor.py b/tests/test_executor.py
new file mode 100644
index 0000000..a23252c
--- /dev/null
+++ b/tests/test_executor.py
@@ -0,0 +1,457 @@
+"""Tests for vowel.executor — CodeMode execution backends.
+
+Covers MontyExecutor, DefaultExecutor, and get_executor factory across
+all injection modes: external_functions, inputs, both, and pure code.
+
+Tests:
+     1. External functions only
+     2. Inputs only
+     3. Inputs + external functions combined
+     4. Pure code (no injection)
+     5. Stdout capture
+     6. Error handling
+     7. ExecutionResult structure
+     8. Protocol conformance
+     9. get_executor factory
+    10. Parity — both executors produce the same output
+"""
+
+from __future__ import annotations
+
+import asyncio
+import importlib.util
+from typing import TYPE_CHECKING
+
+import pytest
+
+from vowel.executor import (
+    DefaultExecutor,
+    Executor,
+    get_executor,
+    resolve_executors,
+)
+
+if TYPE_CHECKING:
+    from vowel.executor import MontyExecutor
+
+# MontyExecutor requires pydantic-monty; skip gracefully if unavailable.
+_MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None
+
+if _MONTY_AVAILABLE:
+    from vowel.executor import MontyExecutor  # noqa: F811
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+def _binary_search(arr: list[int], target: int) -> int:
+    """Binary-search ``arr`` for ``target``; return its index, or -1 if absent."""
+    left = 0
+    right = len(arr) - 1
+    while left <= right:
+        middle = (left + right) // 2
+        if arr[middle] == target:
+            return middle
+        # Discard the half that cannot contain the target.
+        going_right = arr[middle] < target
+        left, right = (middle + 1, right) if going_right else (left, middle - 1)
+    return -1
+
+
+def _add(a, b):  # Minimal two-argument callable injected as an external function in tests.
+    return a + b
+
+
+def _build_executors() -> tuple[list[Executor], list[str]]:
+    """Return executor instances and matching pytest ids, Monty first when installed."""
+    backends: list[Executor] = [DefaultExecutor()]
+    labels = ["default"]
+    if _MONTY_AVAILABLE:
+        backends, labels = [MontyExecutor(), *backends], ["monty", *labels]
+    return backends, labels
+
+
+EXECUTOR_INSTANCES, EXECUTOR_IDS = _build_executors()
+
+
+@pytest.fixture(params=EXECUTOR_INSTANCES, ids=EXECUTOR_IDS)
+def executor(request) -> Executor:
+    """Return one backend per parametrised run; instances come from EXECUTOR_INSTANCES."""
+    return request.param
+
+
+# ---------------------------------------------------------------------------
+# 1. External functions only
+# ---------------------------------------------------------------------------
+
+
+class TestExternalFunctions:
+    """Snippets reach host-side callables through ``external_functions``."""
+
+    def test_single_function(self, executor: Executor):
+        callbacks = {"_binary_search": _binary_search}
+        snippet = "_binary_search([1, 3, 5, 7, 9], 5)"
+        outcome = asyncio.run(executor.execute(snippet, external_functions=callbacks))
+        # 5 sits at index 2 of the literal list.
+        assert outcome.success is True
+        assert outcome.output == 2
+
+    def test_multiple_calls(self, executor: Executor):
+        callbacks = {"search": _binary_search}
+        # The trailing bare ``results`` expression is what the assertions read back.
+        snippet = (
+            "results = []\n"
+            "results.append(search([1, 3, 5, 7, 9], 5))\n"
+            "results.append(search([1, 3, 5, 7, 9], 4))\n"
+            "results.append(search([1], 1))\n"
+            "results.append(search([], 1))\n"
+            "results\n"
+        )
+        outcome = asyncio.run(executor.execute(snippet, external_functions=callbacks))
+        # Hit, miss, single-element hit, empty-list miss.
+        assert outcome.success is True
+        assert outcome.output == [2, -1, 0, -1]
+
+    def test_multiple_functions(self, executor: Executor):
+        callbacks = {"search": _binary_search, "add": _add}
+        snippet = (
+            "results = []\n"
+            "results.append(search([10, 20, 30], 20))\n"
+            "results.append(add(3, 4))\n"
+            "results\n"
+        )
+        outcome = asyncio.run(executor.execute(snippet, external_functions=callbacks))
+        # search finds 20 at index 1; add(3, 4) is 7.
+        assert outcome.success is True
+        assert outcome.output == [1, 7]
+
+
+# ---------------------------------------------------------------------------
+# 2. Inputs only
+# ---------------------------------------------------------------------------
+
+
+class TestInputs:
+    """Snippets read values that were injected through ``inputs``."""
+
+    def test_arithmetic(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("x * y + z", inputs={"x": 10, "y": 3, "z": 5}))
+        assert outcome.success is True
+        assert outcome.output == 35
+
+    def test_list_input(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("sorted(data)", inputs={"data": [3, 1, 2]}))
+        assert outcome.success is True
+        assert outcome.output == [1, 2, 3]
+
+    def test_string_input(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("name.upper()", inputs={"name": "hello"}))
+        assert outcome.success is True
+        assert outcome.output == "HELLO"
+
+    def test_dict_input(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("len(d)", inputs={"d": {"a": 1, "b": 2}}))
+        assert outcome.success is True
+        assert outcome.output == 2
+
+
+# ---------------------------------------------------------------------------
+# 3. Inputs + external functions combined
+# ---------------------------------------------------------------------------
+
+
+class TestCombined:
+    """Snippets that use ``inputs`` and ``external_functions`` together."""
+
+    def test_search_with_data(self, executor: Executor):
+        values = {"data": [2, 4, 6, 8, 10], "query": 6}
+        callbacks = {"search": _binary_search}
+        pending = executor.execute(
+            "search(data, query)", inputs=values, external_functions=callbacks
+        )
+        outcome = asyncio.run(pending)
+        # 6 is the third element, i.e. index 2.
+        assert outcome.success is True
+        assert outcome.output == 2
+
+    def test_function_with_multiple_inputs(self, executor: Executor):
+        values = {"items": [1, 2, 3], "factor": 10}
+        callbacks = {"transform": lambda x, f: x * f}
+        snippet = (
+            "results = []\n"
+            "for item in items:\n"
+            "    results.append(transform(item, factor))\n"
+            "results\n"
+        )
+        pending = executor.execute(
+            snippet, inputs=values, external_functions=callbacks
+        )
+        outcome = asyncio.run(pending)
+        # Every item is scaled by the injected factor.
+        assert outcome.success is True
+        assert outcome.output == [10, 20, 30]
+
+
+# ---------------------------------------------------------------------------
+# 4. Pure code (no injection)
+# ---------------------------------------------------------------------------
+
+
+class TestPureCode:
+    """Snippets that run without any injected inputs or functions."""
+
+    def test_comprehension(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("[i**2 for i in range(5)]"))
+        assert outcome.success is True
+        assert outcome.output == [0, 1, 4, 9, 16]
+
+    def test_arithmetic_expression(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("2 ** 10"))
+        assert outcome.success is True
+        assert outcome.output == 1024
+
+    def test_multiline_with_last_expr(self, executor: Executor):
+        snippet = "x = [1, 2, 3]\ny = [i * 2 for i in x]\nsum(y)\n"
+        outcome = asyncio.run(executor.execute(snippet))
+        assert outcome.success is True
+        assert outcome.output == 12
+
+    def test_no_trailing_expression(self, executor: Executor):
+        """A snippet ending in a statement (not an expression) reports ``None`` as output."""
+        outcome = asyncio.run(executor.execute("x = 42"))
+        assert outcome.success is True
+        assert outcome.output is None
+
+
+# ---------------------------------------------------------------------------
+# 5. Stdout capture
+# ---------------------------------------------------------------------------
+
+
+class TestStdout:
+    """Text printed by a snippet shows up in ``ExecutionResult.stdout``."""
+
+    def test_print_captured(self, executor: Executor):
+        outcome = asyncio.run(executor.execute('print("hello")'))
+        assert outcome.success is True
+        assert "hello" in outcome.stdout
+
+
+# ---------------------------------------------------------------------------
+# 6. Error handling
+# ---------------------------------------------------------------------------
+
+
+class TestErrors:
+    """Failures come back as structured results instead of propagating."""
+
+    def test_runtime_error(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("1 / 0"))
+        assert outcome.success is False
+        assert outcome.error_type == "ZeroDivisionError"
+        assert outcome.output is None
+
+    def test_type_error_in_external(self, executor: Executor):
+        callbacks = {"search": _binary_search}
+        snippet = 'search("not_a_list", 5)'
+        pending = executor.execute(snippet, external_functions=callbacks)
+        outcome = asyncio.run(pending)
+        # _binary_search ends up comparing a str element with the int target,
+        # so the TypeError originates inside the host-side callback.
+        assert outcome.success is False
+        assert outcome.error_type == "TypeError"
+
+    def test_name_error(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("undefined_var + 1"))
+        assert outcome.success is False
+        assert outcome.error_type == "NameError"
+
+    def test_syntax_error(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("def foo(:"))
+        assert outcome.success is False
+        assert outcome.error_type == "SyntaxError"
+
+    def test_error_has_message(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("1 / 0"))
+        assert outcome.error is not None
+        assert len(outcome.error) > 0
+
+
+# ---------------------------------------------------------------------------
+# 7. ExecutionResult structure
+# ---------------------------------------------------------------------------
+
+
+class TestExecutionResult:
+    """Field-level checks on the result object returned by ``execute``."""
+
+    def test_duration_is_positive(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("42"))
+        assert outcome.duration_ms > 0
+
+    def test_success_fields(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("42"))
+        assert outcome.success is True
+        assert outcome.error is None
+        assert outcome.error_type is None
+
+    def test_failure_fields(self, executor: Executor):
+        outcome = asyncio.run(executor.execute("1/0"))
+        assert outcome.success is False
+        assert outcome.error is not None
+        assert outcome.error_type is not None
+        assert outcome.output is None
+
+
+# ---------------------------------------------------------------------------
+# 8. Protocol conformance
+# ---------------------------------------------------------------------------
+
+
+class TestProtocol:
+    """Each backend instance passes an ``isinstance`` check against ``Executor``."""
+
+    @pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed")
+    def test_monty_is_executor(self):
+        assert isinstance(MontyExecutor(), Executor)
+
+    def test_default_is_executor(self):
+        assert isinstance(DefaultExecutor(), Executor)
+
+
+# ---------------------------------------------------------------------------
+# 9. get_executor factory
+# ---------------------------------------------------------------------------
+
+
+class TestFactory:
+    """``get_executor`` maps each backend name to the matching executor type."""
+
+    def test_auto(self):
+        # Only protocol conformance is asserted; the concrete backend is not pinned here.
+        assert isinstance(get_executor("auto"), Executor)
+
+    @pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed")
+    def test_monty(self):
+        backend = get_executor("monty")
+        assert isinstance(backend, MontyExecutor)
+
+    def test_default(self):
+        backend = get_executor("default")
+        assert isinstance(backend, DefaultExecutor)
+
+    def test_invalid_backend(self):
+        with pytest.raises(ValueError, match="Unknown executor backend"):
+            get_executor("invalid")  # type: ignore
+
+
+class _StaticSession:  # Session stub: every feed() reports the stored value as its output.
+    def __init__(self, value):
+        self.value = value
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc_info):
+        self.close()
+
+    def close(self):
+        return None
+
+    def feed(self, code):
+        from vowel.executor import ExecutionResult
+
+        return ExecutionResult(output=self.value, stdout="", success=True)
+
+
+class _RaisingExecutor:  # Executor stub whose every entry point raises RuntimeError("boom").
+    async def execute(self, code, **kwargs):
+        raise RuntimeError("boom")
+
+    def execute_sync(self, code, **kwargs):
+        raise RuntimeError("boom")
+
+    def create_session(self, setup_code, **kwargs):
+        raise RuntimeError("boom")
+
+
+class _StaticExecutor:
+    """Executor stub that reports a fixed value as the output of any code."""
+
+    def __init__(self, value):
+        self.value = value
+
+    def execute_sync(self, code, **kwargs):
+        from vowel.executor import ExecutionResult
+
+        return ExecutionResult(output=self.value, stdout="", success=True)
+
+    async def execute(self, code, **kwargs):
+        return self.execute_sync(code, **kwargs)
+
+    def create_session(self, setup_code, **kwargs):
+        return _StaticSession(self.value)
+
+
+class TestResolveExecutors:
+    def test_custom_executor_uses_default_fallback_on_session_failure(self):
+        """A primary whose create_session raises still yields a working session."""
+        resolved = resolve_executors(_RaisingExecutor())
+        with resolved.create_session("x = 1") as session:
+            outcome = session.feed("x + 1")
+        # The setup code bound x = 1, so a real evaluation of x + 1 gives 2.
+        assert outcome.success is True
+        assert outcome.output == 2
+
+    def test_custom_fallback_executor_is_used(self):
+        """An explicitly supplied fallback serves the session when the primary raises."""
+        primary, fallback = _RaisingExecutor(), _StaticExecutor("fallback")
+        resolved = resolve_executors(primary, fallback)
+        with resolved.create_session("ignored") as session:
+            outcome = session.feed("ignored")
+        assert outcome.success is True
+        assert outcome.output == "fallback"
+
+
+# ---------------------------------------------------------------------------
+# 10. Parity — both executors produce the same output
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed")
+class TestParity:
+    """MontyExecutor and DefaultExecutor must report equal outputs for the same snippet."""
+
+    CASES = [
+        ("pure_arithmetic", "2 + 3", {}, {}),
+        ("list_ops", "[1,2,3] + [4,5]", {}, {}),
+        ("string_method", '"hello world".split()', {}, {}),
+        ("with_inputs", "a + b", {"a": 10, "b": 20}, {}),
+        ("with_ext_func", "f(3, 4)", {}, {"f": _add}),
+        ("combined", "f(x, y)", {"x": 5, "y": 6}, {"f": _add}),
+    ]
+
+    @pytest.mark.parametrize(
+        "label,code,inputs,ext_fns",
+        CASES,
+        ids=[case[0] for case in CASES],
+    )
+    def test_output_matches(self, label, code, inputs, ext_fns):
+        """Run one case on both backends and compare the outputs."""
+        monty = MontyExecutor()
+        default = DefaultExecutor()
+        # Forward only the injection kwargs that are non-empty for this case.
+        candidates = {"inputs": inputs, "external_functions": ext_fns}
+        kwargs: dict = {name: value for name, value in candidates.items() if value}
+
+        r_monty = asyncio.run(monty.execute(code, **kwargs))
+        r_default = asyncio.run(default.execute(code, **kwargs))
+
+        # Both backends have to succeed before their outputs are compared.
+        assert r_monty.success is True, f"Monty failed: {r_monty.error}"
+        assert r_default.success is True, f"Default failed: {r_default.error}"
+        assert r_monty.output == r_default.output, (
+            f"Parity mismatch for '{label}': "
+            f"monty={r_monty.output!r} vs default={r_default.output!r}"
+        )
diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py
index 2fe4b2a..da7ed08 100644
--- a/tests/test_fixtures.py
+++ b/tests/test_fixtures.py
@@ -107,6 +107,7 @@ def test_missing_fixture_param(self):
 
 _db_instances = []
 _cache_instances = []
+_session_fixture_events: list[str] = []
 
 
 def setup_db(host: str = "localhost", port: int = 5432):
@@ -134,11 +135,23 @@ def teardown_cache(instance):
     _cache_instances.remove(instance)
 
 
+def setup_session_counter():
+    """Record a ``setup`` event and return the fixture value ``{"bonus": 10}``."""
+    _session_fixture_events.append("setup")
+    return {"bonus": 10}
+
+
+def teardown_session_counter(instance):
+    """Record a ``teardown:<bonus>`` event built from ``instance["bonus"]``."""
+    _session_fixture_events.append(f"teardown:{instance['bonus']}")
+
+
 class TestFixtureManager:
     def setup_method(self):
         """Clear instances before each test."""
         _db_instances.clear()
         _cache_instances.clear()
+        _session_fixture_events.clear()
 
     def test_setup_function_scope(self):
         """Should setup function-scoped fixture."""
@@ -340,6 +353,18 @@ def test_eval_fixture_field(self):
 
         assert evals.fixture == ["db"]
 
+    def test_load_bundle_prefers_existing_file_before_yaml_heuristic(self, monkeypatch):
+        """A path that exists must be routed to the file loader, not parsed as YAML."""
+        import vowel.utils as utils
+
+        spec_path = r"C:\tmp\spec.yml"
+        # Pretend every path exists and tag each loader so the chosen branch is visible.
+        monkeypatch.setattr(utils.os.path, "exists", lambda path: True)
+        monkeypatch.setattr(utils, "load_bundle_file", lambda path: ("file", path))
+        monkeypatch.setattr(utils, "load_bundle_from_yaml_string", lambda src: ("yaml", src))
+
+        assert utils.load_bundle(spec_path) == ("file", spec_path)
+
 
 def function_with_db(a: int, b: int, *, db: dict) -> int:
     """Test function that uses a db fixture."""
@@ -352,6 +377,7 @@ class TestIntegration:
     def setup_method(self):
         _db_instances.clear()
         _cache_instances.clear()
+        _session_fixture_events.clear()
 
     def test_fixture_injection_valid_signature(self):
         """Should validate and use fixtures correctly."""
@@ -386,6 +412,7 @@ class TestProgrammaticFixtures:
     def setup_method(self):
         _db_instances.clear()
         _cache_instances.clear()
+        _session_fixture_events.clear()
 
     def test_with_fixtures_setup_only(self):
         """Should work with setup-only fixtures via with_fixtures."""
@@ -520,6 +547,84 @@ def test_fixture_missing_error(self):
         assert not summary.all_passed
         assert summary.error_count == 1
 
+    def test_session_scope_fixture_runs_setup_and_teardown_once_per_eval_run(self):
+        """A session-scoped fixture is set up once and torn down once for a two-case run."""
+        yaml_content = """
+add_with_db:
+  fixture:
+    - db
+  dataset:
+    - case:
+        inputs: {a: 1, b: 2}
+        expected: 13
+    - case:
+        inputs: {a: 3, b: 4}
+        expected: 17
+"""
+
+        # Both cases of the spec request this single session-scoped fixture.
+        session_db = FixtureDefinition(
+            setup="test_fixtures.setup_session_counter",
+            teardown="test_fixtures.teardown_session_counter",
+            scope="session",
+        )
+
+        summary = (
+            RunEvals.from_source(yaml_content)
+            .with_functions({"add_with_db": add_with_db})
+            .with_fixtures({"db": session_db})
+            .run()
+        )
+
+        # Exactly one setup and one teardown event, despite two cases.
+        assert summary.all_passed
+        assert _session_fixture_events == ["setup", "teardown:10"]
+
+    def test_session_scope_fixture_is_shared_across_multiple_functions(self):
+        """One session-scoped fixture serves two functions and is torn down once."""
+        yaml_content = """
+add_with_db:
+  fixture:
+    - db
+  dataset:
+    - case:
+        inputs: {a: 1, b: 2}
+        expected: 13
+subtract_with_db:
+  fixture:
+    - db
+  dataset:
+    - case:
+        inputs: {a: 10, b: 3}
+        expected: 17
+"""
+
+        def subtract_with_db(a: int, b: int, *, db: dict) -> int:
+            return a - b + db["bonus"]
+
+        # Each top-level key of the spec maps to its implementation.
+        functions = {
+            "add_with_db": add_with_db,
+            "subtract_with_db": subtract_with_db,
+        }
+        # The same fixture name is requested by both spec entries.
+        session_db = FixtureDefinition(
+            setup="test_fixtures.setup_session_counter",
+            teardown="test_fixtures.teardown_session_counter",
+            scope="session",
+        )
+
+        summary = (
+            RunEvals.from_source(yaml_content)
+            .with_functions(functions)
+            .with_fixtures({"db": session_db})
+            .run()
+        )
+
+        # Only one setup event and one teardown event were recorded for both functions.
+        assert summary.all_passed
+        assert _session_fixture_events == ["setup", "teardown:10"]
+
 
 def setup_db_with_args(host: str, port: int):
     """Setup that requires positional args."""
diff --git a/tests/test_import_function.py b/tests/test_import_function.py
index f0b945b..dabbc30 100644
--- a/tests/test_import_function.py
+++ b/tests/test_import_function.py
@@ -206,6 +206,25 @@ def helper(x):
             sys.path = original_path
             os.chdir(original_cwd)
 
+    def test_import_local_module_does_not_mutate_sys_path(self, tmp_path, monkeypatch):
+        """Importing from the working directory must leave ``sys.path`` as it was."""
+        source = """
+def my_function(x):
+    return x * 2
+"""
+        (tmp_path / "my_module.py").write_text(source)
+
+        monkeypatch.chdir(tmp_path)
+        # Start from a sys.path that does not already list the working directory.
+        pruned = [p for p in sys.path if p != str(tmp_path)]
+        monkeypatch.setattr(sys, "path", pruned)
+        before = sys.path.copy()
+
+        func = import_function("my_module.my_function")
+
+        assert func(5) == 10
+        assert sys.path == before
+
 
 class TestImportErrors:
     """Tests for import error handling."""
diff --git a/tests/test_run_evals.py b/tests/test_run_evals.py
index 49fac6d..0659958 100644
--- a/tests/test_run_evals.py
+++ b/tests/test_run_evals.py
@@ -5,6 +5,7 @@
 import pytest
 
 from vowel import EvalSummary, RunEvals, run_evals
+from vowel.executor import DefaultExecutor
 
 
 class TestRunEvalsFromFile:
@@ -146,6 +147,28 @@ def test_with_functions_chained(self, simple_yaml_spec: str):
 
         assert summary.all_passed
 
+    def test_with_executor_preserves_existing_run_behavior(self, simple_yaml_spec: str):
+        """Configuring executors must not change the outcome of a normal eval run."""
+        runner = RunEvals.from_source(simple_yaml_spec)
+        runner = runner.with_functions({"add": lambda a, b: a + b})
+        # Separate DefaultExecutor instances act as primary and fallback.
+        primary, fallback = DefaultExecutor(), DefaultExecutor()
+        runner = runner.with_executor(primary, fallback_executor=fallback)
+        summary = runner.run()
+
+        assert summary.all_passed
+
+    def test_run_evals_accepts_executor_preferences(self, simple_yaml_spec: str):
+        """The top-level ``run_evals`` helper accepts executor keyword arguments."""
+        options = {
+            "functions": {"add": lambda a, b: a + b},
+            "executor": DefaultExecutor(),
+            "fallback_executor": DefaultExecutor(),
+        }
+        summary = run_evals(simple_yaml_spec, **options)
+
+        assert summary.all_passed
+
 
 class TestRunEvalsFilter:
     """Tests for filter() method."""
diff --git a/tests/test_session.py b/tests/test_session.py
new file mode 100644
index 0000000..b4b9d10
--- /dev/null
+++ b/tests/test_session.py
@@ -0,0 +1,232 @@
+"""Tests for ExecutionSession API — DefaultSession and MontyReplSession.
+
+Covers:
+    - Basic feed() results (binary search)
+    - Error handling (ZeroDivisionError)
+    - Syntax error reporting
+    - State preservation across feed() calls
+    - Stdout capture through sessions
+    - Context-manager lifecycle
+    - Session isolation (fresh state per session)
+"""
+
+from __future__ import annotations
+
+import importlib.util
+from typing import TYPE_CHECKING
+
+import pytest
+
+from vowel.executor import (
+    DefaultExecutor,
+    DefaultSession,
+    ExecutionSession,
+)
+
+if TYPE_CHECKING:
+    from vowel.executor import FallbackSession, MontyExecutor
+
+# MontyExecutor requires pydantic-monty; skip gracefully if unavailable.
+_MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None
+
+if _MONTY_AVAILABLE:
+    from vowel.executor import FallbackSession, MontyExecutor  # noqa: F811
+
+# ---------------------------------------------------------------------------
+# Shared test data
+# ---------------------------------------------------------------------------
+
+FUNC_CODE = """\
+def binary_search(arr, target):
+    lo, hi = 0, len(arr) - 1
+    while lo <= hi:
+        mid = (lo + hi) // 2
+        if arr[mid] == target:
+            return mid
+        elif arr[mid] < target:
+            lo = mid + 1
+        else:
+            hi = mid - 1
+    return -1
+"""
+
+SEARCH_CASES = [
+    ("binary_search([1, 3, 5, 7, 9], 5)", 2),
+    ("binary_search([], 1)", -1),
+    ("binary_search([1, 2, 3], 4)", -1),
+    ("binary_search([10, 20, 30], 10)", 0),
+]
+
+
+def _build_executor_params() -> tuple[list, list[str]]:
+    """Return (executor class, expected session class) pairs plus their pytest ids."""
+    pairs: list = [(DefaultExecutor, DefaultSession)]
+    labels = ["default"]
+    if _MONTY_AVAILABLE:
+        pairs, labels = [(MontyExecutor, FallbackSession), *pairs], ["monty", *labels]
+    return pairs, labels
+
+
+EXECUTOR_CLASSES, EXECUTOR_IDS = _build_executor_params()
+
+
+@pytest.fixture(params=EXECUTOR_CLASSES, ids=EXECUTOR_IDS)
+def executor_and_session(request):
+    """Return a freshly built executor together with its expected session class."""
+    executor_cls, expected_session_cls = request.param
+    return executor_cls(), expected_session_cls
+
+
+# ---------------------------------------------------------------------------
+# Basic session correctness
+# ---------------------------------------------------------------------------
+
+
+class TestSessionBasic:
+    """``feed()`` evaluates snippets against the code given to ``create_session``."""
+
+    def test_binary_search_cases(self, executor_and_session):
+        with executor_and_session[0].create_session(FUNC_CODE) as session:
+            for snippet, expected in SEARCH_CASES:
+                res = session.feed(snippet)
+                assert res.success, f"Failed: {snippet} => {res.error}"
+                got = res.output
+                assert got == expected, f"{snippet}: got {got!r}, expected {expected!r}"
+
+    def test_session_type(self, executor_and_session):
+        """create_session() returns the session class paired with the executor."""
+        executor, expected_cls = executor_and_session
+        with executor.create_session("x = 1") as session:
+            assert isinstance(session, expected_cls)
+
+
+# ---------------------------------------------------------------------------
+# Error handling
+# ---------------------------------------------------------------------------
+
+
+class TestSessionErrors:
+    """Failures inside ``feed()`` come back as structured results."""
+
+    def test_zero_division(self, executor_and_session):
+        executor = executor_and_session[0]
+        with executor.create_session("def foo(x): return 1/x") as session:
+            outcome = session.feed("foo(0)")
+            assert not outcome.success
+            assert outcome.error_type == "ZeroDivisionError"
+            assert outcome.error is not None
+
+    def test_name_error(self, executor_and_session):
+        executor = executor_and_session[0]
+        with executor.create_session("x = 1") as session:
+            outcome = session.feed("undefined_var + 1")
+            assert not outcome.success
+            assert outcome.error_type == "NameError"
+
+    def test_syntax_error(self, executor_and_session):
+        executor = executor_and_session[0]
+        with executor.create_session("def foo(): return 42") as session:
+            outcome = session.feed("foo(")
+            assert not outcome.success
+            assert outcome.error_type == "SyntaxError"
+
+    def test_error_does_not_break_session(self, executor_and_session):
+        """A failing feed() must leave the session usable for later feeds."""
+        executor = executor_and_session[0]
+        with executor.create_session("def foo(x): return 1/x") as session:
+            failed = session.feed("foo(0)")
+            assert not failed.success
+            # The same session must keep answering after the error above.
+            recovered = session.feed("foo(2)")
+            assert recovered.success
+            assert recovered.output == 0.5
+
+
+# ---------------------------------------------------------------------------
+# State preservation
+# ---------------------------------------------------------------------------
+
+
+class TestStatePreservation:
+    """Names bound in one ``feed()`` stay visible to later feeds of the same session."""
+
+    def test_mutation_persists(self, executor_and_session):
+        executor = executor_and_session[0]
+        with executor.create_session("x = 10") as session:
+            before = session.feed("x + 5")
+            assert before.output == 15
+
+            session.feed("x = x * 2")
+
+            after = session.feed("x")
+            assert after.output == 20
+
+    def test_function_defined_in_session(self, executor_and_session):
+        """A function defined by one feed() can be called from a later feed()."""
+        executor = executor_and_session[0]
+        with executor.create_session("y = 100") as session:
+            session.feed("def double(n): return n * 2")
+            outcome = session.feed("double(y)")
+            assert outcome.success
+            assert outcome.output == 200
+
+    def test_list_accumulation(self, executor_and_session):
+        executor = executor_and_session[0]
+        with executor.create_session("items = []") as session:
+            # Three separate feeds append to the list created by the setup code.
+            for number in (1, 2, 3):
+                session.feed(f"items.append({number})")
+            outcome = session.feed("items")
+            assert outcome.output == [1, 2, 3]
+
+
+# ---------------------------------------------------------------------------
+# Stdout capture
+# ---------------------------------------------------------------------------
+
+
+class TestSessionStdout:
+    """Output printed during ``feed()`` is exposed on the result's ``stdout``."""
+
+    def test_stdout_captured(self, executor_and_session):
+        executor = executor_and_session[0]
+        with executor.create_session("def greet(name): print(f'Hello {name}')") as session:
+            outcome = session.feed("greet('World')")
+            assert "Hello World" in outcome.stdout
+
+
+# ---------------------------------------------------------------------------
+# Session isolation
+# ---------------------------------------------------------------------------
+
+
+class TestSessionIsolation:
+    """Names defined in one session are not visible in a later one."""
+
+    def test_separate_sessions_isolated(self, executor_and_session):
+        executor = executor_and_session[0]
+
+        with executor.create_session("x = 42") as first:
+            seen = first.feed("x")
+            assert seen.output == 42
+
+        # The second session defines only y; x must be unknown there.
+        with executor.create_session("y = 99") as second:
+            own = second.feed("y")
+            assert own.output == 99
+            leaked = second.feed("x")
+            assert not leaked.success  # x should not exist
+
+
+# ---------------------------------------------------------------------------
+# Protocol conformance
+# ---------------------------------------------------------------------------
+
+
+class TestSessionProtocol:
+    """Created sessions pass an ``isinstance`` check against ``ExecutionSession``."""
+
+    def test_protocol(self, executor_and_session):
+        executor = executor_and_session[0]
+        with executor.create_session("x = 1") as session:
+            assert isinstance(session, ExecutionSession)
diff --git a/tests/test_tdd_eval_retries.py b/tests/test_tdd_eval_retries.py
index efa5fdf..b3c0fdc 100644
--- a/tests/test_tdd_eval_retries.py
+++ b/tests/test_tdd_eval_retries.py
@@ -4,6 +4,7 @@
 from unittest.mock import MagicMock, PropertyMock, patch
 
 from vowel.eval_types import EvalsSource
+from vowel.spec_validation import build_failure_context
 from vowel.tdd import FunctionSignature, Param, TDDGenerator
 
 
@@ -35,7 +36,8 @@ def _make_signature() -> FunctionSignature:
   dataset:
     - case:
         inputs: [1, 2]
-        expected: 999
+        expected: 3
+        assertion: "output > 100"
     - case:
         inputs: [0, 0]
         expected: 0
@@ -166,27 +168,21 @@ def test_partial_coverage_accepted(self, mock_agent_prop):
         mock_agent.run_sync.assert_called_once()
 
 
-class TestBuildEvalFailureContext(unittest.TestCase):
-    """Test the failure context builder."""
+class TestBuildFailureContext(unittest.TestCase):
+    """Test the shared failure context builder."""
 
     def test_builds_context_from_failures(self):
-        gen = TDDGenerator.__new__(TDDGenerator)
-        gen.model = "test"
-
         # Run actual evals with a bad spec to get real summary
         from vowel.runner import RunEvals
 
         summary = RunEvals.from_source(BAD_YAML).with_functions({"add": add}).run()
 
-        context = gen._build_eval_failure_context(summary)
+        context = build_failure_context(summary)
         assert "FAILED" in context
 
     def test_unknown_failures_fallback(self):
-        gen = TDDGenerator.__new__(TDDGenerator)
-        gen.model = "test"
-
         # Mock summary with no useful info
         mock_summary = MagicMock()
         mock_summary.results = []
-        context = gen._build_eval_failure_context(mock_summary)
+        context = build_failure_context(mock_summary)
         assert context == "Unknown failures"

From 60b602f308752f689f17b9800a600f1bfe259e93 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Tue, 17 Mar 2026 20:19:01 +0300
Subject: [PATCH 3/8] final_commit

---
 db_fixture.yml                 |   2 +-
 pyrightconfig.json             |   1 +
 src/vowel/__init__.py          |  55 +--
 src/vowel/ai.py                |  36 +-
 src/vowel/cli.py               |  99 +++++-
 src/vowel/codemode.py          | 211 +++++------
 src/vowel/context.py           |  11 +-
 src/vowel/eval_types.py        |  65 ++--
 src/vowel/evals.py             |  18 +-
 src/vowel/executor.py          |  76 +---
 src/vowel/mcp_server.py        |  68 +---
 src/vowel/runner.py            |  35 +-
 src/vowel/schema.py            | 115 ++++++
 src/vowel/spec_validation.py   | 355 -------------------
 src/vowel/tdd.py               |  28 +-
 src/vowel/utils.py             | 103 ++----
 src/vowel/validation.py        | 279 ++++++++++++++-
 tests/test_executor.py         |  18 +-
 tests/test_llm_integration.py  |   6 +-
 tests/test_llm_judge.py        |   5 +-
 tests/test_session.py          |  12 +-
 tests/test_tdd_eval_retries.py |   2 +-
 tests/test_yaml_loading.py     | 110 +++---
 vowel-schema.json              | 623 +++++++++++++++++++++++++++++----
 24 files changed, 1277 insertions(+), 1056 deletions(-)
 create mode 100644 src/vowel/schema.py
 delete mode 100644 src/vowel/spec_validation.py

diff --git a/db_fixture.yml b/db_fixture.yml
index 32fa6a1..e8be58c 100644
--- a/db_fixture.yml
+++ b/db_fixture.yml
@@ -29,4 +29,4 @@ db.Connection.execute:
     - case:
         input: "SELECT * FROM developers" # (buggy query - invalid table)
         raises: NoTableError
-        match: "no such table" ## must match the exception message (case ignored)
\ No newline at end of file
+        match: "no such table" ## must match the exception message (case ignored)
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 19456c7..98a899b 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -4,6 +4,7 @@
   "exclude": [
     "vowel-optimization",
     "tmp",
+    "benchmark*",
     "**/.*"
   ],
   "reportMissingModuleSource": "none",
diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
index f890ed4..b3d98d2 100644
--- a/src/vowel/__init__.py
+++ b/src/vowel/__init__.py
@@ -1,33 +1,7 @@
-"""
-vowel - A modular evaluation framework for testing functions with YAML-based specifications.
-
-This package provides a comprehensive evaluation framework for testing Python functions
-using YAML-based specifications. It supports various evaluation types including:
-
-- Type checking (isinstance validation)
-- Custom assertions (Python expressions)
-- Performance constraints (duration limits)
-- Input containment checks
-- Regex pattern matching
-- Exception validation
-- LLM-based semantic evaluation
-
-Quick Start:
-    # Run evaluations from a YAML file
-    from vowel import run_evals
-    summary = run_evals("evals.yml")
-
-    # Generate evals for a function using LLM
-    from vowel import EvalGenerator, Function
-    gen = EvalGenerator(model="openai:gpt-4o")
-    func = Function(name="add", code="def add(a, b): return a + b", description="Add two numbers")
-    summary = gen.generate_and_run(func, auto_retry=True)
-
-For more information, see the documentation at:
-https://github.com/fswair/vowel
-"""
+"""Public package exports for the vowel evaluation framework."""
 
 import importlib.metadata
+from contextlib import suppress
 
 __version__ = importlib.metadata.version("vowel")
 
@@ -48,27 +22,28 @@
     resolve_executors,
 )
 from .runner import Function, RunEvals
+from .schema import ensure_cached_schema
 from .utils import (
     EvalResult,
     EvalSummary,
     check_compatibility,
     get_unsupported_params,
     is_yaml_serializable_type,
-    load_evals,
-    load_evals_file,
-    load_evals_from_dict,
-    load_evals_from_object,
-    load_evals_from_yaml_string,
+    load_bundle,
+    load_bundle_file,
+    load_bundle_from_dict,
+    load_bundle_from_object,
+    load_bundle_from_yaml_string,
     run_evals,
     to_dataset,
 )
 
 __all__ = [
-    "load_evals_file",
-    "load_evals_from_yaml_string",
-    "load_evals_from_dict",
-    "load_evals_from_object",
-    "load_evals",
+    "load_bundle_file",
+    "load_bundle_from_yaml_string",
+    "load_bundle_from_dict",
+    "load_bundle_from_object",
+    "load_bundle",
     "to_dataset",
     "run_evals",
     "RunEvals",
@@ -101,3 +76,7 @@
     "ExplorationPlan",
     "SnippetResult",
 ]
+
+
+with suppress(Exception):
+    ensure_cached_schema(__version__)
diff --git a/src/vowel/ai.py b/src/vowel/ai.py
index 6abcc0e..50937ac 100644
--- a/src/vowel/ai.py
+++ b/src/vowel/ai.py
@@ -1,34 +1,4 @@
-"""LLM-powered evaluation specification generator and function healer.
-
-This module provides:
-- EvalGenerator: Generate eval specs and heal buggy functions using LLMs
-- generate_eval_spec: Generate YAML eval specs from function definitions
-- prepare_agent: Create a pydantic_ai Agent for eval generation
-
-Key Features:
-- Auto-generate YAML eval specs from function code and description
-- Heal buggy functions based on failing test inputs
-- Retry logic with configurable coverage thresholds
-- Support for async and sync function generation
-
-Example:
-    from vowel import EvalGenerator, Function
-
-    generator = EvalGenerator(model="openai:gpt-4o")
-
-    func = Function(
-        name="factorial",
-        description="Calculate factorial of n",
-        code="def factorial(n): return 1 if n <= 1 else n * factorial(n - 1)"
-    )
-
-    summary = generator.generate_and_run(
-        func,
-        auto_retry=True,
-        heal_function=True,
-        min_coverage=0.9
-    )
-"""
+"""LLM-backed eval generation and function healing utilities."""
 
 import os
 import time
@@ -43,6 +13,7 @@
 from vowel.eval_types import EvalsSource
 from vowel.monitoring import enable_monitoring
 from vowel.runner import Function, RunEvals
+from vowel.schema import materialize_yaml_with_schema_header
 from vowel.utils import EvalSummary, check_compatibility, import_function
 from vowel.validation import validate_and_fix_spec
 
@@ -563,8 +534,9 @@ def generate_eval_spec(
             )
 
             if save_to_file:
+                spec_to_write = materialize_yaml_with_schema_header(spec_to_use)
                 with open(f"{func.name}_evals.yml", "w") as f:
-                    f.write(spec_to_use)
+                    f.write(spec_to_write)
 
             runner = RunEvals.from_source(spec_to_use)
             if func.func:
diff --git a/src/vowel/cli.py b/src/vowel/cli.py
index 868b574..a5bf430 100644
--- a/src/vowel/cli.py
+++ b/src/vowel/cli.py
@@ -1,18 +1,13 @@
-"""Command-line interface for the vowel evaluation framework.
-
-Usage:
-    vowel <yaml_file>                 Run evaluations from a YAML spec
-    vowel -d <directory>              Run all YAML files in a directory
-    vowel <yaml_file> -v              Detailed summary with spec semantics
-    vowel <yaml_file> --hide-report   Hide pydantic_evals report output
-"""
+"""Command-line entry points for running and managing vowel eval specs."""
 
+import json
 import sys
 import time
 from pathlib import Path
 
 import click
 import dotenv
+import yaml
 from rich import box
 from rich.console import Console
 from rich.panel import Panel
@@ -27,6 +22,7 @@
     LLMJudgeCase,
     PatternMatchCase,
 )
+from .schema import build_yaml_schema_from_bundle, materialize_yaml_with_schema_header
 from .utils import EvalsBundle, EvalSummary, load_bundle, run_evals
 
 dotenv.load_dotenv()
@@ -249,7 +245,9 @@ def validate_coverage(ctx, param, value):
 
 
 @click.command()
-@click.argument("yaml_file", type=click.Path(exists=True, path_type=Path), required=False)
+@click.argument("arg1", type=click.Path(path_type=Path), required=False)
+@click.argument("arg2", type=click.Path(path_type=Path), required=False)
+@click.argument("arg3", type=click.Path(path_type=Path), required=False)
 @click.option("--ci", is_flag=True, help="Enable CI mode")
 @click.option(
     "--coverage",
@@ -278,8 +276,16 @@ def validate_coverage(ctx, param, value):
 @click.option("--watch", "-w", is_flag=True, help="Watch mode: re-run on file changes")
 @click.option("--verbose", "-v", is_flag=True, help="Show detailed evaluation summary")
 @click.option("--hide-report", is_flag=True, help="Hide pydantic_evals report output")
+@click.option(
+    "--create",
+    "schema_create",
+    is_flag=True,
+    help="With 'vowel schema': write vowel-schema.json to the current directory or [OUTPUT_PATH]",
+)
 def main(
-    yaml_file: Path | None,
+    arg1: Path | None,
+    arg2: Path | None,
+    arg3: Path | None,
     debug: bool,
     coverage: float,
     filter_func: str | None,
@@ -295,10 +301,86 @@ def main(
     watch: bool,
     verbose: bool,
     hide_report: bool,
+    schema_create: bool,
 ):
     """vowel — YAML-based evaluation framework for Python functions."""
     console = Console(force_terminal=False, no_color=True) if no_color else Console()
 
+    # Command mode: 'vowel schema <file_path>' or 'vowel schema --create [output_path]'.
+    if arg1 is not None and str(arg1) == "schema":
+        # 'vowel schema' takes at most one path (arg2). Reject a third positional instead of
+        # silently ignoring it, matching click's handling of surplus arguments.
+        if arg3 is not None:
+            raise click.UsageError(f"Got unexpected extra argument ({arg3})")
+
+        # --create: write the JSON schema to output_path (default: ./vowel-schema.json).
+        if schema_create:
+            output_path = arg2 if arg2 is not None else Path.cwd() / "vowel-schema.json"
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            schema = build_yaml_schema_from_bundle()
+            output_path.write_text(
+                json.dumps(schema, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
+            )
+            console.print(f"[green]✓[/green] Generated schema: [cyan]{output_path}[/cyan]")
+            return
+
+        if arg2 is None:
+            click.secho("ERROR: vowel schema requires <file_path> or --create", fg="red", err=True)
+            raise click.Abort()
+
+        target_path = arg2
+        if not target_path.exists():
+            click.secho(f"ERROR: File not found: {target_path}", fg="red", err=True)
+            raise SystemExit(1)
+
+        if target_path.suffix.lower() == ".json":
+            click.secho(
+                "ERROR: JSON files are not supported by 'vowel schema <file_path>'. "
+                "Use a YAML file (.yml/.yaml).",
+                fg="red",
+                err=True,
+            )
+            raise SystemExit(1)
+
+        existing = target_path.read_text(encoding="utf-8")
+
+        # Do not inject schema header into invalid YAML files.
+        try:
+            yaml.safe_load(existing)
+        except Exception as e:
+            click.secho(
+                f"ERROR: File is not valid YAML, schema header not added: {e}",
+                fg="red",
+                err=True,
+            )
+            raise SystemExit(1) from None
+
+        # Do not inject schema header if content is not a valid vowel spec.
+        try:
+            load_bundle(existing)
+        except Exception as e:
+            click.secho(
+                f"ERROR: Pydantic validation failed, schema header not added: {e}",
+                fg="red",
+                err=True,
+            )
+            raise SystemExit(1) from None
+
+        updated = materialize_yaml_with_schema_header(existing)
+        target_path.write_text(updated, encoding="utf-8")
+        console.print(f"[green]✓[/green] Updated schema header: [cyan]{target_path}[/cyan]")
+
+        console.print("[green]✓[/green] Pydantic validation passed")
+        return
+
+    yaml_file = arg1
+
+    # Run mode accepts exactly one YAML file; arg2/arg3 exist only for 'vowel schema'.
+    # Fail loudly rather than dropping e.g. the second file in 'vowel a.yml b.yml'.
+    extra_args = [str(arg) for arg in (arg2, arg3) if arg is not None]
+    if extra_args:
+        raise click.UsageError(f"Got unexpected extra argument(s) ({' '.join(extra_args)})")
+
     # Validate incompatible options
     if directory and filter_func:
         click.secho("ERROR: --filter cannot be used with --dir", fg="red", err=True)
@@ -486,6 +557,9 @@ def main(
         if not yaml_file:
             click.secho("ERROR: --watch requires a YAML file", fg="red", err=True)
             raise click.Abort()
+        if not yaml_file.exists():
+            click.secho(f"ERROR: YAML file not found: {yaml_file}", fg="red", err=True)
+            raise click.Abort()
 
         try:
             from watchdog.events import FileSystemEventHandler
@@ -565,6 +639,9 @@ def on_modified(self, event):
         if not quiet:
             console.print(f"Found [cyan]{len(yaml_files)}[/cyan] YAML file(s)")
     elif yaml_file:
+        if not yaml_file.exists():
+            click.secho(f"ERROR: YAML file not found: {yaml_file}", fg="red", err=True)
+            raise click.Abort()
         yaml_files = [yaml_file]
     else:
         click.secho("ERROR: Either YAML_FILE or --dir is required", fg="red", err=True)
@@ -738,8 +815,6 @@ def on_modified(self, event):
 
     # Export JSON
     if export_json:
-        import json
-
         json_data = summary.to_json()
         with open(export_json, "w") as f:
             json.dump(json_data, f, indent=2)
diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py
index 6ef8256..e02f421 100644
--- a/src/vowel/codemode.py
+++ b/src/vowel/codemode.py
@@ -1,21 +1,11 @@
-"""CodeMode eval generation pipeline.
+"""CodeMode pipeline for execution-aware eval spec generation.
 
-This module provides ``CodeModeGenerator`` — a two-phase pipeline that uses
-a sandboxed code executor to produce ground-truth expected values before
-generating YAML eval specs.
+CodeMode uses real execution feedback to generate robust vowel eval specs:
+1. Explore behavior by running LLM-generated snippets against the target code.
+2. Generate and refine a spec from verified outputs/errors.
 
-Phase 1 — **Exploration**
-    The LLM writes small Python snippets that call ``target_func`` with various
-    inputs.  Each snippet is executed via ``Executor`` (Monty sandbox by default)
-    and the real outputs are collected.  This replaces guesswork with empirical
-    observation.
-
-Phase 2 — **Spec Generation**
-    The exploration results (inputs → outputs, edge cases, exceptions) are fed
-    back to the LLM together with the eval spec context.  The LLM produces the
-    final YAML spec with verified expected values.
-
-All steps are instrumented with ``logfire`` for full observability.
+The pipeline supports both YAML output and structured bundle output, and keeps
+traceability via logfire spans.
 """
 
 from __future__ import annotations
@@ -34,15 +24,16 @@
 from vowel.executor import ExecutionResult, Executor, resolve_executors
 from vowel.monitoring import enable_monitoring
 from vowel.runner import Function, RunEvals
-from vowel.spec_validation import (
+from vowel.schema import materialize_yaml_with_schema_header
+from vowel.utils import EvalsBundle, EvalSummary
+from vowel.validation import (
     build_call_code,
     build_failure_context,
     inject_durations,
     inject_missing_error_cases,
+    validate_and_fix_spec,
     validate_expected_values,
 )
-from vowel.utils import EvalSummary
-from vowel.validation import validate_and_fix_spec
 
 enable_monitoring(service_name="vowel-codemode")
 
@@ -177,17 +168,10 @@ class CodeModeResult(BaseModel):
 
 
 class CodeModeGenerator:
-    """Two-phase eval generator: explore with executor, then generate spec.
-
-    Parameters
-    ----------
-    model:
-        LLM model identifier (e.g. ``"openai:gpt-4o"``).
-    executor:
-        Code execution backend.  Defaults to ``get_executor("auto")``
-        which prefers MontyExecutor when available.
-    additional_context:
-        Extra instructions appended to the system prompt.
+    """Execution-guided eval generator.
+
+    The generator first discovers behavior by running snippets, then produces
+    a validated eval spec (YAML or bundle) from those verified results.
     """
 
     def __init__(
@@ -198,6 +182,7 @@ def __init__(
         fallback_executor: Executor | None = None,
         additional_context: str = "",
         min_snippets: int = 15,
+        use_model_spec: bool = False,
         **opts,
     ) -> None:
         # Default fallback from kwargs (for backwards compatibility) or environment
@@ -217,11 +202,12 @@ def __init__(
         self.executor = resolve_executors(default_executor, fallback_executor)
         self.additional_context = additional_context
         self.min_snippets = min_snippets
+        self.use_model_spec = use_model_spec
         self._opts = opts
 
         # Lazy agents
         self._explorer_agent: Agent[None, ExplorationPlan] | None = None
-        self._spec_agent: Agent[None, EvalsSource] | None = None
+        self._spec_agent: Agent[None, EvalsSource | EvalsBundle] | None = None
 
         logfire.info(
             "CodeModeGenerator initialized",
@@ -244,11 +230,12 @@ def explorer_agent(self) -> Agent[None, ExplorationPlan]:
         return self._explorer_agent
 
     @property
-    def spec_agent(self) -> Agent[None, EvalsSource]:
+    def spec_agent(self) -> Agent[None, EvalsSource | EvalsBundle]:
         if self._spec_agent is None:
+            output_type = EvalsBundle if self.use_model_spec else EvalsSource
             self._spec_agent = Agent(
                 self.spec_model,
-                output_type=EvalsSource,
+                output_type=output_type,
                 system_prompt=self._spec_system_prompt(),
                 **self._opts,
             )
@@ -323,21 +310,10 @@ async def explore(
         *,
         exploration_rounds: int = 2,
     ) -> list[SnippetResult]:
-        """Phase 1: Generate and execute exploration snippets.
-
-        Supports multi-round feedback-guided exploration.  Round 1 uses
-        static reasoning (speculation-based).  Round 2+ receives a
-        programmatic cluster summary of prior results so the LLM can
-        target unexplored behaviour classes (evidence-based).
+        """Generate and execute exploration snippets.
 
-        Parameters
-        ----------
-        exploration_rounds:
-            Number of exploration rounds (default 2).  Set to 1 to
-            restore single-shot behaviour.
-
-        Returns a list of ``SnippetResult`` with real outputs from the
-        executor.
+        Round 1 discovers baseline behavior. Subsequent rounds receive prior
+        execution evidence and target uncovered behavior classes.
         """
         with logfire.span(
             "codemode.explore",
@@ -404,7 +380,7 @@ def _execute_plan(
         plan: ExplorationPlan,
         round_num: int = 1,
     ) -> list[SnippetResult]:
-        """Execute all snippets in a plan and return results."""
+        """Execute all snippets in an exploration plan and collect results."""
         all_snippets = [
             *((s, "normal") for s in plan.snippets),
             *((s, "error") for s in plan.error_snippets),
@@ -448,11 +424,7 @@ def _execute_plan(
 
     @staticmethod
     def _build_cluster_summary(results: list[SnippetResult]) -> str:
-        """Build a deterministic cluster summary from exploration results.
-
-        Groups results by output type / error type and formats a concise
-        summary for the Round 2 exploration prompt.
-        """
+        """Summarize observed output/error clusters for targeted exploration."""
         # -- Success clusters --
         success_types: dict[str, int] = {}
         for r in results:
@@ -501,7 +473,7 @@ def _count_new_behaviors(
         prior: list[SnippetResult],
         new: list[SnippetResult],
     ) -> int:
-        """Count how many new behaviour classes the new results introduced."""
+        """Count new behavior signatures introduced by a round."""
 
         def _behavior_key(r: SnippetResult) -> str:
             if r.success:
@@ -513,7 +485,7 @@ def _behavior_key(r: SnippetResult) -> str:
         return len(new_keys - prior_keys)
 
     async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
-        """Ask the LLM for exploration snippets (Round 1 — static reasoning)."""
+        """Request initial exploration snippets from the model."""
         with logfire.span("codemode.llm_explore", func_name=func.name, round=1):
             prompt = f"""Explore the following function by writing test snippets:
 
@@ -545,7 +517,7 @@ async def _get_targeted_exploration_plan(
         prior_results: list[SnippetResult],
         cluster_summary: str,
     ) -> ExplorationPlan:
-        """Ask the LLM for targeted snippets (Round 2 — evidence-based)."""
+        """Request targeted snippets using prior execution evidence."""
         with logfire.span("codemode.llm_explore", func_name=func.name, round=2):
             prompt = f"""You previously explored `{func.name}` and the snippets were
 executed.  Below are the ACTUAL results and a cluster summary.
@@ -597,14 +569,11 @@ async def generate_spec(
         func: Function,
         exploration_results: list[SnippetResult],
         failure_context: str | None = None,
-    ) -> str:
-        """Phase 2: Generate YAML spec using verified exploration results.
+    ) -> str | EvalsBundle:
+        """Generate a spec from verified exploration results.
 
-        Parameters
-        ----------
-        failure_context:
-            When provided (on refinement rounds), appended to the prompt so
-            the LLM can fix specific failures from the previous attempt.
+        Returns YAML text in default mode, or ``EvalsBundle`` when
+        ``use_model_spec=True``.
         """
         with logfire.span(
             "codemode.generate_spec",
@@ -685,6 +654,18 @@ async def generate_spec(
             )
 
             result = await self.spec_agent.run(prompt)
+
+            if self.use_model_spec:
+                bundle = result.output
+                assert isinstance(bundle, EvalsBundle)
+                logfire.info(
+                    "Model spec bundle generated",
+                    func_name=func.name,
+                    eval_count=len(bundle.evals),
+                    fixture_count=len(bundle.fixtures),
+                )
+                return bundle
+
             yaml_spec = result.output.yaml_spec
 
             # Sanitize: strip ALL !!<tag> annotations — safe_load only accepts
@@ -739,7 +720,7 @@ async def generate_spec(
 
     @staticmethod
     def _build_failure_context(summary: EvalSummary) -> str:
-        """Build a concise failure report to inject into the retry prompt."""
+        """Build retry context from failed assertions/errors."""
         return build_failure_context(summary)
 
     def _inject_durations(
@@ -750,7 +731,7 @@ def _inject_durations(
         buffer_pct: float = 0.5,
         floor_ms: float = 10.0,
     ) -> str:
-        """Add per-case ``duration`` fields based on actual execution times."""
+        """Inject measured duration thresholds into cases."""
         return inject_durations(
             yaml_spec,
             func,
@@ -761,7 +742,7 @@ def _inject_durations(
 
     @staticmethod
     def _build_call_code(func_name: str, case: dict) -> str | None:
-        """Build a ``func(args...)`` call string from a case dict."""
+        """Build a callable expression from a dataset case."""
         return build_call_code(func_name, case)
 
     # -- Full pipeline -----------------------------------------------------
@@ -776,46 +757,11 @@ async def generate(
         min_coverage: float = 1.0,
         inject_durations: bool = True,
     ) -> CodeModeResult:
-        """Run the full CodeMode pipeline with post-generation validation.
-
-        Pipeline::
-
-            Phase 1: explore()                        (2 rounds by default)
-              Round 1 — static reasoning (speculation-based)
-              Round 2 — targeted exploration (evidence-based)
-            Phase 2: generate_spec()                  (may loop)
-            Phase 3: validate via RunEvals            (per attempt)
-            Phase 4: refine on failure                (up to N rounds)
-            Phase 5: inject_durations()               (once, at end)
-
-        Exploration (Phase 1) runs in two rounds.  Round 1 uses static
-        reasoning; Round 2 receives a cluster summary of Round 1 results
-        and targets uncovered behaviour classes.  Only spec generation
-        (Phase 2) is re-run on validation failure.
-
-        Parameters
-        ----------
-        func:
-            The function to generate evals for.
-        run_evals:
-            Whether to run the generated evals and include the summary.
-        save_to_file:
-            Whether to save the YAML spec to ``{func.name}_evals.yml``.
-        max_refinement_rounds:
-            Maximum number of spec-regeneration attempts after the initial
-            generation (0 = single attempt, no refinement).
-        min_coverage:
-            Target pass-rate in 0.0–1.0 (default 1.0 = 100 %).  The loop
-            exits early when coverage meets or exceeds this threshold.
-        inject_durations:
-            Whether to measure and inject per-case ``duration`` fields
-            into the final YAML spec.
-
-        Returns
-        -------
-        CodeModeResult
-            Contains exploration results, YAML spec, summary, and
-            the number of refinement rounds used.
+        """Run full CodeMode generation, validation, and optional refinement.
+
+        Flow: explore -> generate spec -> validate -> refine (optional) ->
+        inject durations (optional). Returns exploration artifacts, final spec,
+        and evaluation summary when ``run_evals`` is enabled.
         """
         with logfire.span(
             "codemode.pipeline",
@@ -831,6 +777,7 @@ async def generate(
 
             # Phase 2–4 — generate spec + validate + refine
             yaml_spec = ""
+            generated_bundle: EvalsBundle | None = None
             summary: EvalSummary | None = None
             refinement_rounds = 0
             failure_context: str | None = None
@@ -843,18 +790,25 @@ async def generate(
                     is_refinement=attempt > 0,
                 ):
                     try:
-                        yaml_spec = await self.generate_spec(
+                        bundle = await self.generate_spec(
                             func,
                             exploration_results,
                             failure_context,
                         )
-                    except Exception as gen_exc:
+
+                        if isinstance(bundle, EvalsBundle):
+                            generated_bundle = bundle
+                            yaml_spec = bundle.to_yaml()
+                        else:
+                            generated_bundle = None
+                            yaml_spec = bundle
+                    except Exception as exc:
                         logfire.warn(
                             "Spec generation failed on attempt {attempt}, retrying",
                             attempt=attempt + 1,
-                            error=str(gen_exc),
+                            error=str(exc),
                         )
-                        failure_context = f"Generation error: {gen_exc}"
+                        failure_context = f"Generation error: {exc}"
                         refinement_rounds = attempt + 1
                         continue
 
@@ -863,11 +817,18 @@ async def generate(
 
                     # Validate: run evals with ignore_duration=True
                     try:
-                        runner = (
-                            RunEvals.from_source(yaml_spec)
-                            .with_functions({func.name: func.impl})
-                            .ignore_duration()
-                        )
+                        if generated_bundle is not None:
+                            runner = (
+                                RunEvals.from_bundle(generated_bundle)
+                                .with_functions({func.name: func.impl})
+                                .ignore_duration()
+                            )
+                        else:
+                            runner = (
+                                RunEvals.from_source(yaml_spec)
+                                .with_functions({func.name: func.impl})
+                                .ignore_duration()
+                            )
                         summary = runner.run()
 
                         logfire.info(
@@ -912,19 +873,27 @@ async def generate(
             # Final summary run (with durations now present, but still ignored)
             if run_evals and summary is not None:
                 try:
-                    final_runner = (
-                        RunEvals.from_source(yaml_spec)
-                        .with_functions({func.name: func.impl})
-                        .ignore_duration()
-                    )
+                    if generated_bundle is not None:
+                        final_runner = (
+                            RunEvals.from_bundle(generated_bundle)
+                            .with_functions({func.name: func.impl})
+                            .ignore_duration()
+                        )
+                    else:
+                        final_runner = (
+                            RunEvals.from_source(yaml_spec)
+                            .with_functions({func.name: func.impl})
+                            .ignore_duration()
+                        )
                     summary = final_runner.run()
                 except Exception:  # noqa: BLE001
                     pass  # keep last good summary
 
             if save_to_file:
                 path = f"{func.name}_evals.yml"
+                spec_to_write = materialize_yaml_with_schema_header(yaml_spec)
                 with open(path, "w") as f:
-                    f.write(yaml_spec)
+                    f.write(spec_to_write)
                 logfire.info("Saved spec to {path}", path=path)
 
             elapsed = (time.perf_counter() - t0) * 1000
diff --git a/src/vowel/context.py b/src/vowel/context.py
index 4e584b0..36fa283 100644
--- a/src/vowel/context.py
+++ b/src/vowel/context.py
@@ -1,13 +1,4 @@
-"""
-Context definitions for vowel eval specification generation.
-
-This module contains the EVAL_SPEC_CONTEXT which provides comprehensive
-documentation about vowel's YAML-based evaluation specification format.
-This context is used by EvalGenerator to guide LLM-based eval generation.
-
-Set VOWEL_CONTEXT_VERSION=legacy to use the pre-optimization prompt.
-Default is "v3" (GEPA-optimized with Sonnet proposer).
-"""
+"""Prompt context strings used for LLM-driven eval specification generation."""
 
 import os
 
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
index e9cc14d..17f8495 100644
--- a/src/vowel/eval_types.py
+++ b/src/vowel/eval_types.py
@@ -1,24 +1,4 @@
-"""Pydantic models for vowel evaluation specifications.
-
-This module defines the data models used to parse and validate
-YAML evaluation specifications. These models ensure type safety
-and provide clear schemas for evaluation definitions.
-
-Main evaluation types:
-    IsInstanceCase: Type checking validation
-    AssertionCase: Custom Python assertion evaluation
-    DurationCase: Performance/timing validation
-    ContainsInputCase: Input containment check
-    PatternMatchCase: Regex pattern matching
-    RaisesCase: Exception validation
-    LLMJudgeCase: LLM-based semantic evaluation
-
-Container models:
-    MatchCase: Individual test case with input/expected output
-    DatasetCase: Wrapper for test cases in dataset
-    Evals: Complete evaluation specification for a function
-    EvalsFile: Root model for YAML file parsing
-"""
+"""Pydantic models for parsing and validating vowel YAML specifications."""
 
 import os
 import typing
@@ -181,15 +161,33 @@ class EvalsSource(BaseModel):
 # Fixture Models
 # =============================================================================
 
-FixtureScope = Literal["function", "module", "session"]
-"""Scope for fixture lifecycle.
+FixtureScope = Literal["case", "eval", "file", "function", "module", "session"]
+"""Supported fixture scope names.
+
+Canonical user-facing names:
+- case: per dataset case
+- eval: per function eval block
+- file: per YAML file / run invocation
+
+Compatibility aliases:
+- function -> case
+- module -> eval
+- session -> file
 
-- function: Setup/teardown for each test case (default)
-- module: Setup once per eval file, teardown after all cases
-- session: Setup once per run_evals call, teardown at end
+Note:
+Runtime lifecycle currently uses legacy internal values
+(`function`/`module`/`session`). New names are normalized to these
+internal values for behavior-preserving migration.
 """
 
 
+_FIXTURE_SCOPE_ALIASES: dict[str, str] = {
+    "case": "function",
+    "eval": "module",
+    "file": "session",
+}
+
+
 class FixtureDefinition(BaseModel):
     """Definition of a single fixture with setup/teardown lifecycle."""
 
@@ -218,9 +216,22 @@ class FixtureDefinition(BaseModel):
     )
     scope: FixtureScope = Field(
         default="function",
-        description="Lifecycle scope: 'function' (per case), 'module' (per eval), or 'session' (per run)",
+        description=(
+            "Fixture lifecycle scope. Preferred names: 'case', 'eval', 'file'. "
+            "Compatibility aliases are accepted: 'function', 'module', 'session'. "
+            "Current runtime normalization maps case->function, eval->module, file->session."
+        ),
     )
 
+    @field_validator("scope", mode="before")
+    @classmethod
+    def normalize_scope_aliases(cls, value: Any) -> Any:
+        """Map preferred scope names (case/eval/file) to the legacy internal values."""
+        if not isinstance(value, str):  # includes None; returned untouched for validation
+            return value
+        key = value.strip().lower()
+        return _FIXTURE_SCOPE_ALIASES.get(key, key)
+
     @model_validator(mode="after")
     def validate_setup_or_cls(self):
         if not self.setup and not self.cls:
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index 5e78ecd..722dc55 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -1,20 +1,4 @@
-"""Evaluator implementations for the vowel framework.
-
-This module contains the concrete evaluator classes that implement
-the evaluation logic defined in eval_types.py. Each evaluator
-integrates with pydantic-evals to provide result reporting.
-
-Evaluators:
-    AssertionEvaluator: Runs Python assertion expressions
-    TypeAdapterEvaluator: Validates output types using Pydantic
-    ContainsInputEvaluator: Checks if output contains input value
-    PatternMatchingEvaluator: Validates output against regex patterns
-    RaisesEvaluator: Validates expected exception raising
-
-Factory functions:
-    create_llm_judge: Creates an LLM-based judge evaluator
-    prepare_env_and_condition: Prepares evaluation context
-"""
+"""Concrete evaluator implementations used by the vowel runtime."""
 
 import importlib.util
 import os
diff --git a/src/vowel/executor.py b/src/vowel/executor.py
index fe14dbd..1046744 100644
--- a/src/vowel/executor.py
+++ b/src/vowel/executor.py
@@ -1,71 +1,4 @@
-"""Code execution backends for CodeMode eval generation.
-
-CodeMode allows the eval generation agent to *run* code inside a sandbox
-rather than guessing expected values.  This produces ground-truth outputs
-and lets the agent empirically explore function behaviour (edge cases,
-exception boundaries, return types) before writing test cases.
-
-Architecture
-------------
-``Executor`` is a runtime Protocol — any object that implements ``execute()``
-qualifies.  Two concrete implementations are provided:
-
-* ``MontyExecutor``   — uses ``pydantic-monty`` (Rust-based sandbox, <0.1 ms
-                        startup, no filesystem/network access).  **Recommended
-                        for production and the optimization loop.**
-* ``DefaultExecutor`` — uses Python's built-in ``exec()`` with stdout capture.
-                        No sandboxing.  Safe only for trusted, local code;
-                        useful during development when Monty is not installed.
-
-The ``execute()`` method accepts two orthogonal injection mechanisms that
-mirror Monty's native API:
-
-* ``inputs``             — ``dict[str, Any]`` of *values* injected as
-                           top-level variables visible to the snippet.
-* ``external_functions`` — ``dict[str, Callable]`` of *host-side callbacks*
-                           the snippet can call by name.  In the Monty
-                           backend each call exits the sandbox, runs on
-                           the host, and returns the result.
-
-Session API
------------
-For batch exploration (e.g. CodeMode), use ``create_session()`` to compile
-the function source **once**, then ``feed()`` each snippet against the
-preserved runtime state.
-
-* ``MontyReplSession``   — backed by ``MontyRepl``: zero re-parse overhead
-                           per snippet, heap/globals preserved across feeds.
-* ``DefaultSession``     — backed by a persistent ``exec()`` namespace.
-
-Usage examples
---------------
-**External functions** — inject one or more real functions::
-
-    await executor.execute(
-        '''
-        results = []
-        results.append(target_func([1, 3, 5, 7, 9], 5))
-        results.append(target_func([], 1))
-        results
-        ''',
-        external_functions={"target_func": binary_search},
-    )
-
-**Inputs** — inject plain values::
-
-    await executor.execute(
-        "x + y",
-        inputs={"x": 10, "y": 20},
-    )
-
-**Session** — compile once, feed many snippets::
-
-    async with executor.create_session(func_code) as session:
-        r1 = session.feed("binary_search([1,3,5], 3)")
-        r2 = session.feed("binary_search([], 1)")
-
-The value of the last expression becomes ``ExecutionResult.output``.
-"""
+"""Execution backends used by CodeMode for sandboxed and local code runs."""
 
 from __future__ import annotations
 
@@ -90,12 +23,7 @@
 
 
 def run_sync(coro: Any) -> Any:
-    """Run a coroutine synchronously, even inside a running event loop.
-
-    Tries ``asyncio.run()`` first (clean, no patching).  If there is
-    already a running loop (e.g. Jupyter, async framework), falls back
-    to ``nest_asyncio`` + ``loop.run_until_complete()``.
-    """
+    """Run a coroutine from sync code, including running-loop environments."""
     try:
         return asyncio.run(coro)
     except RuntimeError as exc:
diff --git a/src/vowel/mcp_server.py b/src/vowel/mcp_server.py
index 729a44e..fc84660 100644
--- a/src/vowel/mcp_server.py
+++ b/src/vowel/mcp_server.py
@@ -1,63 +1,4 @@
-"""Vowel MCP Server - Model Context Protocol server for eval generation.
-
-This module exposes vowel's full evaluation, generation, and TDD capabilities via
-MCP (Model Context Protocol), enabling AI assistants to run evaluations, generate
-functions, create test specs, and perform TDD workflows.
-
-Configuration is set via the ``env`` field in your MCP client JSON config.
-The env field should contain API keys and model names only. All other parameters
-(auto_retry, min_coverage, etc.) are tool parameters with sensible defaults.
-
-Usage:
-    # Add to MCP client config (e.g., Claude Desktop, VS Code Copilot)
-    {
-        "mcpServers": {
-            "vowel": {
-                "command": "python",
-                "args": ["-m", "vowel.mcp_server"],
-                "env": {
-                    "MODEL_NAME": "openai:gpt-4o",
-                    "OPENAI_API_KEY": "sk-..."
-                }
-            }
-        }
-    }
-
-    # Or run directly (reads env vars from shell)
-    python -m vowel.mcp_server
-
-Supported env vars:
-    MODEL_NAME          — Default LLM model (e.g. "openai:gpt-4o", "gemini-3-flash-preview")
-    JUDGE_MODEL         — Model for LLM Judge evaluator
-    OPENAI_API_KEY      — OpenAI API key
-    ANTHROPIC_API_KEY   — Anthropic API key
-    GOOGLE_API_KEY      — Google AI API key
-
-Available Tools (14):
-    Eval Runner:
-        - run_evals_from_file: Run evaluations from a YAML file
-        - run_evals_from_yaml: Run evaluations from YAML content string
-        - run_evals_with_fixtures: Run evaluations with fixture injection
-        - validate_yaml_spec: Validate a YAML eval specification
-        - check_function_compatibility: Check function compatibility with eval generation
-        - list_yaml_files: List YAML files in a directory
-
-    EvalGenerator:
-        - generate_function: Generate a Python function from description
-        - generate_eval_spec: Generate eval spec for a function
-        - generate_and_run_evals: Generate spec + run + auto-retry + heal
-
-    TDDGenerator:
-        - tdd_generate_signature: Generate function signature from description
-        - tdd_generate_evals: Generate eval spec from a signature
-        - tdd_generate_implementation: Generate implementation from signature + spec
-        - tdd_generate_all: Full TDD flow: description → signature → evals → implementation
-        - tdd_generate_and_validate: TDD with eval validation against implementation
-
-Available Resources:
-    - vowel://context: Eval specification documentation
-    - vowel://example: Example YAML eval specification
-"""
+"""MCP server exposing vowel evaluation, generation, and TDD tools."""
 
 from __future__ import annotations
 
@@ -67,11 +8,12 @@
 import nest_asyncio
 from mcp.server.fastmcp import FastMCP
 
-from vowel import check_compatibility, load_evals_from_yaml_string, run_evals
+from vowel import check_compatibility, run_evals
 from vowel.ai import EVAL_SPEC_CONTEXT, EvalGenerator
 from vowel.monitoring import enable_monitoring
 from vowel.runner import Function, RunEvals
 from vowel.tdd import TDDGenerator
+from vowel.utils import load_bundle_from_yaml_string
 
 enable_monitoring(service_name="vowel-mcp")
 
@@ -204,8 +146,8 @@ def validate_yaml_spec(yaml_content: str) -> dict[str, Any]:
         yaml_content: YAML eval specification to validate
     """
     try:
-        evals = load_evals_from_yaml_string(yaml_content)
-        function_names = list(evals.keys())
+        bundle = load_bundle_from_yaml_string(yaml_content)
+        function_names = list(bundle.evals.keys())
         return {
             "valid": True,
             "functions": function_names,
diff --git a/src/vowel/runner.py b/src/vowel/runner.py
index 9906018..66f39c2 100644
--- a/src/vowel/runner.py
+++ b/src/vowel/runner.py
@@ -1,28 +1,4 @@
-"""RunEvals - A fluent API for running evaluations.
-
-This module provides:
-- Function: Pydantic model representing a function with code and metadata
-- RunEvals: Fluent API for loading and running evaluations
-
-Example:
-    # Run from YAML file
-    from vowel import RunEvals
-
-    summary = RunEvals.from_file("evals.yml").run()
-    print(f"All passed: {summary.all_passed}")
-
-    # Run with custom functions
-    def my_func(x):
-        return x * 2
-
-    summary = (
-        RunEvals.from_file("evals.yml")
-        .with_functions({"my_func": my_func})
-        .filter(["my_func"])
-        .debug()
-        .run()
-    )
-"""
+"""Fluent APIs and models for loading and running evals."""
 
 import ast
 import codecs
@@ -37,7 +13,7 @@ def my_func(x):
 
 from .eval_types import Evals, EvalsFile, FixtureDefinition
 from .executor import Executor
-from .utils import EvalSummary, EvalsBundle
+from .utils import EvalsBundle, EvalSummary
 from .utils import run_evals as _run_evals
 
 _T = TypeVar("_T", bound=Any)
@@ -76,12 +52,7 @@ def __name__(self) -> str:  # pyright: ignore[reportIncompatibleVariableOverride
 
     @property
     def impl(self) -> Callable[..., _RT]:
-        """
-        Get the function implementation as a callable.
-
-        Returns:
-            Callable: The function implementation.
-        """
+        """Return the executable function object for this definition."""
         if not self.func:
             self.execute()
         return cast(Callable, self.func)
diff --git a/src/vowel/schema.py b/src/vowel/schema.py
new file mode 100644
index 0000000..80e4647
--- /dev/null
+++ b/src/vowel/schema.py
@@ -0,0 +1,115 @@
+"""Versioned JSON Schema cache and YAML header helpers."""
+
+from __future__ import annotations
+
+import importlib.metadata
+import json
+import re
+from copy import deepcopy
+from pathlib import Path
+from typing import Any
+
+from .utils import EvalsBundle
+
+SCHEMA_CACHE_DIR = Path.home() / ".vowel"
+
+
+def _schema_version_token(version: str | None = None) -> str:
+    """Return the digit runs of *version* concatenated, or "000" if it has none."""
+    if version is None:
+        try:
+            version = importlib.metadata.version("vowel")
+        except importlib.metadata.PackageNotFoundError:
+            version = "0.0.0"  # package metadata unavailable
+
+    # NOTE(review): digit runs are joined without a separator, so distinct
+    # versions such as "1.10.0" and "11.0.0" both yield the token "1100".
+    digit_runs = re.findall(r"\d+", version)
+    return "".join(digit_runs) if digit_runs else "000"
+
+
+def build_yaml_schema_from_bundle() -> dict[str, Any]:
+    """Build the JSON Schema for a vowel YAML file from the runtime models.
+
+    The schema comes from ``EvalsBundle.model_json_schema``; no reference file
+    is read. The root is reshaped to match the YAML layout: an optional
+    top-level `fixtures` key, and every other top-level key validated via
+    `additionalProperties` as a per-function `Evals` entry.
+    """
+    bundle_schema = EvalsBundle.model_json_schema(ref_template="#/$defs/{model}")
+    defs = bundle_schema.get("$defs", {})
+    properties = bundle_schema.get("properties", {})
+    fixtures_schema = properties.get(
+        "fixtures",
+        {
+            "type": "object",
+            "title": "Fixtures",
+        },
+    )
+
+    additional_properties: dict[str, Any]
+    if "Evals" in defs:
+        # Top-level YAML uses function name as key, so `id` should not be
+        # required in each map value even though runtime Evals model has it.
+        evals_map_value = deepcopy(defs["Evals"])
+        required = evals_map_value.get("required")
+        if isinstance(required, list):
+            evals_map_value["required"] = [k for k in required if k != "id"]
+        evals_map_value["title"] = "Function"
+        evals_map_value["description"] = (
+            "Function evaluation specification keyed by function import path/name. "
+            "Contains fixture dependencies, global evaluators (`evals`), and dataset cases."
+        )
+        defs["EvalsMapValue"] = evals_map_value
+        additional_properties = {"$ref": "#/$defs/EvalsMapValue"}
+    else:  # no "Evals" definition present: reuse the bundle's `evals` value schema
+        evals_schema = properties.get("evals", {"type": "object"})
+        additional_properties = evals_schema.get("additionalProperties", {"type": "object"})
+
+    schema: dict[str, Any] = {
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "type": "object",
+        "properties": {
+            "fixtures": fixtures_schema,
+        },
+        "additionalProperties": additional_properties,
+        "$defs": defs,
+    }
+
+    return schema
+
+
+def ensure_cached_schema(version: str | None = None) -> Path:
+    """Write the versioned schema file if it is missing or stale; return its path."""
+    cache_file = SCHEMA_CACHE_DIR / f"vowel-schema_{_schema_version_token(version)}.json"
+    cache_file.parent.mkdir(parents=True, exist_ok=True)
+
+    # The schema is rebuilt on every call so a stale cached copy can be detected.
+    payload = json.dumps(build_yaml_schema_from_bundle(), indent=2, ensure_ascii=False) + "\n"
+
+    up_to_date = cache_file.exists() and cache_file.read_text(encoding="utf-8") == payload
+    if not up_to_date:
+        cache_file.write_text(payload, encoding="utf-8")
+
+    return cache_file
+
+
+def add_schema_header(yaml_spec: str, schema_path: Path | str) -> str:
+    """Return *yaml_spec* with a yaml-language-server ``$schema`` header line on top.
+
+    An existing header on the first line (and one blank line after it) is replaced.
+    """
+    marker = "# yaml-language-server: $schema="
+    content = yaml_spec.splitlines()
+    if content and content[0].startswith(marker):
+        # Drop the old header plus a single separating blank line, if present.
+        content = content[2:] if content[1:2] == [""] else content[1:]
+
+    body = "\n".join(content).rstrip("\n")
+    return f"{marker}{schema_path!s}\n\n{body}\n"
+
+
+def materialize_yaml_with_schema_header(yaml_spec: str, version: str | None = None) -> str:
+    """Refresh the cached schema for *version* and return *yaml_spec* with its header."""
+    cached_schema = ensure_cached_schema(version)
+    return add_schema_header(yaml_spec, cached_schema)
diff --git a/src/vowel/spec_validation.py b/src/vowel/spec_validation.py
deleted file mode 100644
index 293762c..0000000
--- a/src/vowel/spec_validation.py
+++ /dev/null
@@ -1,355 +0,0 @@
-"""Shared spec validation utilities for eval generation pipelines.
-
-Functions in this module are used by both ``CodeModeGenerator`` and
-``TDDGenerator`` to validate generated YAML specs against real execution
-and to inject measured durations.
-"""
-
-from __future__ import annotations
-
-from typing import Any
-
-import logfire
-import yaml
-
-from vowel.executor import Executor, resolve_executors
-from vowel.runner import Function
-from vowel.utils import EvalSummary
-
-
-def build_failure_context(summary: EvalSummary) -> str:
-    """Build a concise failure report to inject into a retry prompt.
-
-    Iterates over :class:`EvalSummary` results and formats each failed
-    case/assertion as a single line.  Returns a multi-line string suitable
-    for LLM prompts.
-    """
-    lines: list[str] = []
-    for result in summary.results:
-        if result.report:
-            for case in result.report.cases:
-                failed_assertions = {k: v for k, v in case.assertions.items() if not v.value}
-                if failed_assertions:
-                    parts = []
-                    for k, v in failed_assertions.items():
-                        if v.reason:
-                            parts.append(f"{k}: {v.reason}")
-                        else:
-                            parts.append(f"{k}: FAILED")
-                    lines.append(f"- Case '{case.name}' FAILED [{', '.join(parts)}]")
-        if result.error:
-            lines.append(f"- Error: {result.error}")
-    return "\n".join(lines) if lines else "Unknown failures"
-
-
-def build_call_code(
-    func_name: str, case: dict
-) -> (
-    str | None
-):  # TODO: intead of building call code, consider passing arguments through executor inputs
-    """Build a ``func(args...)`` call string from a YAML case dict.
-
-    Returns ``None`` when no input is present (e.g. raises-only case
-    without input).
-    """
-    if "inputs" in case and case["inputs"] is not None:
-        args = case["inputs"]
-        if isinstance(args, list):
-            arg_strs = ", ".join(repr(a) for a in args)
-            return f"{func_name}({arg_strs})"
-        if isinstance(args, dict):
-            kwarg_strs = ", ".join(f"{k}={v!r}" for k, v in args.items())
-            return f"{func_name}({kwarg_strs})"
-    elif "input" in case and case["input"] is not None:
-        return f"{func_name}({case['input']!r})"
-    return None
-
-
-def inject_durations(
-    yaml_spec: str,
-    func: Function,
-    executor: Executor,
-    *,
-    fallback_executor: Executor | None = None,
-    buffer_pct: float = 0.5,
-    floor_ms: float = 10.0,
-) -> str:
-    """Add per-case ``duration`` fields based on actual execution times.
-
-    Each non-raises case is executed once via the executor session.
-    The measured ``duration_ms`` is inflated by *buffer_pct* (default 50%)
-    with a minimum of *floor_ms* (default 10 ms) to absorb noise.
-
-    Parameters
-    ----------
-    yaml_spec:
-        YAML string to augment.
-    func:
-        Function to execute cases against.
-    executor:
-        Executor backend to use for timing.
-    buffer_pct:
-        Fractional buffer added on top of measured time (0.5 = +50%).
-    floor_ms:
-        Absolute minimum duration in ms — protects sub-ms cases from
-        flaky failures due to measurement noise.
-    """
-    spec = yaml.safe_load(yaml_spec)
-    if not isinstance(spec, dict):
-        return yaml_spec
-
-    executor = resolve_executors(executor, fallback_executor)
-
-    try:
-        session = executor.create_session(func.code)
-    except Exception:
-        logfire.warn("Could not create session for duration injection")
-        return yaml_spec
-
-    with session:
-        for eval_id, eval_def in spec.items():
-            if not isinstance(eval_def, dict):
-                continue
-            for case_entry in eval_def.get("dataset", []):
-                case = case_entry.get("case", {})
-                if not isinstance(case, dict):
-                    continue
-                # Skip cases that expect exceptions
-                if case.get("raises"):
-                    continue
-
-                call_code = build_call_code(eval_id, case)
-                if call_code is None:
-                    continue
-
-                result = session.feed(call_code)
-                if result.success:
-                    dur = max(
-                        result.duration_ms * (1 + buffer_pct),
-                        floor_ms,
-                    )
-                    case["duration"] = round(dur, 1)
-
-    return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
-
-
-def validate_expected_values(
-    yaml_spec: str,
-    func: Function,
-    executor: Executor | None = None,
-    fallback_executor: Executor | None = None,
-) -> str:
-    """Validate and fix expected values in a YAML spec by executing cases.
-
-    For each case that has ``expected`` and no ``raises``, executes the
-    function call and compares the result.  If the actual output differs
-    from the YAML expected value, the YAML is updated to the real value.
-
-    Also validates ``raises`` cases: if the case expects an exception but
-    the function doesn't raise (or raises a different type), the case is
-    corrected.
-
-    Parameters
-    ----------
-    yaml_spec:
-        YAML spec string to validate.
-    func:
-        Function to execute.
-    executor:
-        Executor backend.  Defaults to Monty-first with Default fallback.
-
-    Returns
-    -------
-    str
-        Fixed YAML spec with corrected expected values.
-    """
-    executor = resolve_executors(executor, fallback_executor)
-
-    spec = yaml.safe_load(yaml_spec)
-    if not isinstance(spec, dict):
-        return yaml_spec
-
-    try:
-        session = executor.create_session(func.code)
-    except Exception:
-        logfire.warn("Could not create session for expected value validation")
-        return yaml_spec
-
-    fixes_applied = 0
-
-    with session:
-        for eval_id, eval_def in spec.items():
-            if not isinstance(eval_def, dict):
-                continue
-            for case_entry in eval_def.get("dataset", []):
-                case = case_entry.get("case", {})
-                if not isinstance(case, dict):
-                    continue
-
-                call_code = build_call_code(eval_id, case)
-                if call_code is None:
-                    continue
-
-                result = session.feed(call_code)
-
-                # --- Fix expected values ---
-                if (
-                    "expected" in case
-                    and not case.get("raises")
-                    and result.success
-                    and result.output != case["expected"]
-                ):
-                    logfire.info(
-                        "Fixing expected value for case: {expected} → {actual}",
-                        expected=repr(case["expected"]),
-                        actual=repr(result.output),
-                    )
-                    case["expected"] = result.output
-                    fixes_applied += 1
-
-                # --- Fix raises cases ---
-                if case.get("raises"):
-                    expected_exc = case["raises"]
-                    if result.success:
-                        # Function didn't raise — remove raises, set expected
-                        logfire.info(
-                            "Case expected {exc} but function returned {output}, fixing",
-                            exc=expected_exc,
-                            output=repr(result.output),
-                        )
-                        del case["raises"]
-                        if "match" in case:
-                            del case["match"]
-                        case["expected"] = result.output
-                        fixes_applied += 1
-                    elif result.error_type and result.error_type != expected_exc:
-                        # Wrong exception type
-                        logfire.info(
-                            "Case expected {expected} but got {actual}, fixing",
-                            expected=expected_exc,
-                            actual=result.error_type,
-                        )
-                        case["raises"] = result.error_type
-                        fixes_applied += 1
-
-    if fixes_applied > 0:
-        logfire.info("Validated spec: {count} fixes applied", count=fixes_applied)
-        return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
-
-    return yaml_spec
-
-
-def inject_missing_error_cases(
-    yaml_spec: str,
-    func_name: str,
-    error_snippets: list[dict],
-) -> str:
-    """Inject error cases from exploration into the spec if the LLM missed them.
-
-    Each item in *error_snippets* should have keys:
-
-    - ``code``: Python snippet that triggered the error (e.g. ``"flatten(None)"``)
-    - ``error_type``: Exception class name (e.g. ``"TypeError"``)
-    - ``error``: Full error message
-    - ``description``: One-line description
-
-    Uses :mod:`ast` to extract function call arguments from the snippet
-    code.  If parsing fails (multi-line code, complex expressions), the
-    snippet is silently skipped.
-
-    Returns the (possibly modified) YAML spec string.
-    """
-    import ast
-
-    if not error_snippets:
-        return yaml_spec
-
-    spec = yaml.safe_load(yaml_spec)
-    if not isinstance(spec, dict) or func_name not in spec:
-        return yaml_spec
-
-    eval_def = spec[func_name]
-    dataset = eval_def.setdefault("dataset", [])
-
-    # Collect existing raises case inputs to avoid duplicates
-    existing_raises_inputs: set[str] = set()
-    for entry in dataset:
-        case = entry.get("case", {})
-        if isinstance(case, dict) and case.get("raises"):
-            # Normalise existing input for comparison
-            inp = case.get("input")
-            inps = case.get("inputs")
-            existing_raises_inputs.add(repr((inp, inps)))
-
-    injected = 0
-
-    for snippet in error_snippets:
-        code = snippet["code"].strip()
-        error_type = snippet["error_type"]
-        description = snippet.get("description", "")
-
-        # Try to extract arguments from a simple function call
-        try:
-            tree = ast.parse(code, mode="eval")
-        except SyntaxError:
-            continue
-
-        if not isinstance(tree.body, ast.Call):
-            continue
-
-        try:
-            args = [ast.literal_eval(a) for a in tree.body.args]
-            kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in tree.body.keywords}
-        except (ValueError, TypeError):
-            # Complex expression that can't be literal-evaluted — skip
-            continue
-
-        # Determine input/inputs format
-        if kwargs:
-            input_repr = repr((None, kwargs))
-            if input_repr in existing_raises_inputs:
-                continue
-            case_dict: dict[str, Any] = {
-                "id": f"error_{error_type.lower()}_{injected}",
-                "inputs": kwargs,
-                "raises": error_type,
-            }
-        elif len(args) == 1:
-            # Tuples cannot be represented in yaml.safe_load()-compatible YAML.
-            # Other non-list inputs (None, int, str, dict) already cover the
-            # same TypeError path, so skip rather than convert and break semantics.
-            if isinstance(args[0], tuple):
-                continue
-            input_repr = repr((args[0], None))
-            if input_repr in existing_raises_inputs:
-                continue
-            case_dict = {
-                "id": f"error_{error_type.lower()}_{injected}",
-                "input": args[0],
-                "raises": error_type,
-            }
-        elif len(args) > 1:
-            input_repr = repr((None, args))
-            if input_repr in existing_raises_inputs:
-                continue
-            case_dict = {
-                "id": f"error_{error_type.lower()}_{injected}",
-                "inputs": args,
-                "raises": error_type,
-            }
-        else:
-            continue
-
-        dataset.append({"case": case_dict})
-        injected += 1
-        logfire.info(
-            "Injected error case: {desc} → raises {exc}",
-            desc=description,
-            exc=error_type,
-        )
-
-    if injected > 0:
-        logfire.info("Injected {count} missing error cases into spec", count=injected)
-        return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
-
-    return yaml_spec
diff --git a/src/vowel/tdd.py b/src/vowel/tdd.py
index 005fc27..5a69bdf 100644
--- a/src/vowel/tdd.py
+++ b/src/vowel/tdd.py
@@ -1,24 +1,4 @@
-"""TDD-based eval generation: Intent -> Signature -> Evals -> Implementation.
-
-This module provides a true TDD approach where:
-1. LLM generates function signature from description (intent)
-2. LLM generates eval spec from signature (tests first)
-3. LLM generates implementation that passes the evals (code last)
-
-Example:
-    from vowel.tdd import TDDGenerator
-
-    generator = TDDGenerator(model="openai:gpt-4o")
-
-    result = generator.generate_all(
-        description="Binary search for target in sorted list. Returns index or -1.",
-        name="binary_search"
-    )
-
-    print(result.signature.to_signature_str())
-    print(result.yaml_spec)
-    print(result.func.code)
-"""
+"""TDD pipeline for generating signatures, evals, and implementations."""
 
 import inspect
 import os
@@ -39,12 +19,12 @@
 from vowel.executor import Executor, resolve_executors
 from vowel.monitoring import enable_monitoring
 from vowel.runner import Function, RunEvals
-from vowel.spec_validation import (
+from vowel.utils import EvalSummary
+from vowel.validation import (
     build_failure_context,
+    validate_and_fix_spec,
     validate_expected_values,
 )
-from vowel.utils import EvalSummary
-from vowel.validation import validate_and_fix_spec
 
 # Configure logfire for tracing
 dotenv.load_dotenv()
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index b092d8a..1c710c8 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -1,23 +1,4 @@
-"""Utility functions for the vowel evaluation framework.
-
-This module provides core utilities for:
-- Loading and parsing YAML evaluation specifications
-- Type compatibility checking for YAML serialization
-- Function import and execution helpers
-- Dataset creation and evaluation running
-- Result aggregation and reporting
-
-Key classes:
-    EvalResult: Result of a single function evaluation
-    EvalSummary: Aggregated results from multiple evaluations
-
-Key functions:
-    run_evals: Main entry point for running evaluations
-    load_evals: Load evaluations from various sources
-    to_dataset: Convert Evals to pydantic-evals Dataset
-    is_yaml_serializable_type: Check if a type can be serialized to YAML
-    check_compatibility: Validate function parameters for YAML compatibility
-"""
+"""Shared utilities for loading specs, building datasets, and running evals."""
 
 import asyncio
 import builtins
@@ -68,9 +49,40 @@ class EvalsBundle(BaseModel):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    evals: dict[str, Evals] = Field(default_factory=dict)
+    evals: dict[str, Evals] = Field(min_length=1)
     fixtures: dict[str, FixtureDefinition] = Field(default_factory=dict)
 
+    def to_yaml(self) -> str:
+        """Serialize bundle to current vowel YAML spec format.
+
+        Every function id becomes a top-level key holding its dumped ``Evals``
+        (minus the redundant ``id``); fixtures, if any, go under ``fixtures``.
+        """
+        dump_kwargs: dict[str, Any] = {
+            "mode": "python",
+            "exclude_none": True,
+            "exclude_defaults": True,
+        }
+        document: dict[str, Any] = {}
+
+        for func_id, evals in self.evals.items():
+            entry = evals.model_dump(**dump_kwargs)
+            entry.pop("id", None)
+            document[func_id] = entry
+
+        if self.fixtures:
+            document["fixtures"] = {
+                name: definition.model_dump(**dump_kwargs)
+                for name, definition in self.fixtures.items()
+            }
+
+        return yaml.safe_dump(
+            document,
+            sort_keys=False,
+            allow_unicode=True,
+            default_flow_style=False,
+        )
+
 
 # =============================================================================
 # YAML Serializable Types
@@ -909,50 +921,6 @@ def import_class(class_path: str) -> type:
     return cls
 
 
-def load_evals_file(yaml_path: str) -> dict[str, Evals]:
-    with open(yaml_path) as f:
-        loaded = yaml.safe_load(f)
-
-    evals_file = EvalsFile.model_validate(loaded)
-    return evals_file.get_evals()
-
-
-def load_evals_from_yaml_string(yaml_content: str) -> dict[str, Evals]:
-    loaded = yaml.safe_load(yaml_content)
-    evals_file = EvalsFile.model_validate(loaded)
-    return evals_file.get_evals()
-
-
-def load_evals_from_dict(data: dict) -> dict[str, Evals]:
-    evals_file = EvalsFile.model_validate(data)
-    return evals_file.get_evals()
-
-
-def load_evals_from_object(evals_obj: EvalsFile) -> dict[str, Evals]:
-    return evals_obj.get_evals()
-
-
-def load_evals(source: str | Path | dict | EvalsFile) -> dict[str, Evals]:
-    if isinstance(source, EvalsFile):
-        return load_evals_from_object(source)
-    elif isinstance(source, dict):
-        return load_evals_from_dict(source)
-    elif isinstance(source, (str, Path)):
-        source_str = str(source)
-        # Check if it's an existing file path first, before YAML heuristics
-        if os.path.exists(source_str):
-            return load_evals_file(source_str)
-        if _is_yaml_source_string(source_str):
-            return load_evals_from_yaml_string(source_str)
-        else:
-            return load_evals_file(source_str)
-    else:
-        raise TypeError(
-            f"source must be a file path (str/Path), YAML string (str), dict, "
-            f"or EvalsFile object, got {type(source)}"
-        )
-
-
 # =============================================================================
 # Bundle Loading Functions (with fixtures)
 # =============================================================================
@@ -2018,10 +1986,7 @@ def run_evals(
     """
     # Load both evals and fixtures from YAML
     _ = (executor, fallback_executor)
-    if isinstance(source, EvalsBundle):
-        bundle = source
-    else:
-        bundle = load_bundle(source)
+    bundle = source if isinstance(source, EvalsBundle) else load_bundle(source)
     all_evals = bundle.evals
     yaml_fixtures = bundle.fixtures
 
diff --git a/src/vowel/validation.py b/src/vowel/validation.py
index b989f9e..20636dd 100644
--- a/src/vowel/validation.py
+++ b/src/vowel/validation.py
@@ -1,24 +1,17 @@
-"""Static validator for LLM-generated eval specifications.
-
-Catches common LLM generation mistakes BEFORE the spec is used:
-1. Extra fields in cases (comment, note, description, etc.)
-2. YAML-unparseable type remnants (set literals, tuple strings, float('inf'), etc.)
-3. Invented exception types not in function code
-4. Removes or fixes problematic cases, returns clean YAML
-
-Usage:
-    from vowel.validation import validate_and_fix_spec
-
-    fixed_yaml, warnings = validate_and_fix_spec(yaml_spec, function_code="def foo(x): ...")
-"""
+"""Validation and normalization helpers for generated eval specs."""
 
+import ast
 import re
 from dataclasses import dataclass, field
-from typing import Literal
+from typing import Any, Literal
 
 import logfire
 import yaml
 
+from vowel.executor import Executor, resolve_executors
+from vowel.runner import Function
+from vowel.utils import EvalSummary
+
 # Fields allowed in a case block (from MatchCase model)
 ALLOWED_CASE_FIELDS = frozenset(
     {
@@ -399,3 +392,261 @@ def validate_and_fix_spec(
         )
 
     return result
+
+
+def build_failure_context(summary: EvalSummary) -> str:
+    """Build a concise failure report to inject into a retry prompt.
+
+    Emits one bullet per case with failed assertions plus one per result-level
+    error, and falls back to ``"Unknown failures"`` when there is nothing to list.
+    """
+    report_lines: list[str] = []
+    for result in summary.results:
+        if result.report:
+            for case in result.report.cases:
+                # Keep only assertions whose recorded value is falsy.
+                failed = {name: a for name, a in case.assertions.items() if not a.value}
+                if failed:
+                    parts = [f"{name}: {a.reason or 'FAILED'}" for name, a in failed.items()]
+                    report_lines.append(f"- Case '{case.name}' FAILED [{', '.join(parts)}]")
+        if result.error:
+            report_lines.append(f"- Error: {result.error}")
+    return "\n".join(report_lines) if report_lines else "Unknown failures"
+
+
+def build_call_code(
+    func_name: str, case: dict
+) -> (
+    str | None
+):  # TODO: instead of building call code, consider passing arguments through executor inputs
+    """Build a ``func(args...)`` call string from a case's ``inputs``/``input``, else None."""
+    inputs = case.get("inputs")
+    if inputs is None:
+        single = case.get("input")
+        return None if single is None else f"{func_name}({single!r})"
+    if isinstance(inputs, list):
+        rendered = ", ".join(repr(a) for a in inputs)
+    elif isinstance(inputs, dict):
+        rendered = ", ".join(f"{k}={v!r}" for k, v in inputs.items())
+    else:
+        return None
+    return f"{func_name}({rendered})"
+
+
+def inject_durations(
+    yaml_spec: str,
+    func: Function,
+    executor: Executor,
+    *,
+    fallback_executor: Executor | None = None,
+    buffer_pct: float = 0.5,
+    floor_ms: float = 10.0,
+) -> str:
+    """Add per-case ``duration`` fields based on actual execution times.
+
+    Successful non-``raises`` cases get ``duration_ms * (1 + buffer_pct)``, floored at
+    ``floor_ms``; a non-mapping spec or a failed session returns ``yaml_spec`` as-is.
+    """
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict):
+        return yaml_spec
+
+    executor = resolve_executors(executor, fallback_executor)
+
+    try:
+        session = executor.create_session(func.code)
+    except Exception:
+        logfire.warn("Could not create session for duration injection")
+        return yaml_spec
+
+    with session:
+        for eval_id, eval_def in spec.items():
+            if not isinstance(eval_def, dict):
+                continue
+            # A bare ``dataset:`` loads as None and entries may not be mappings.
+            for case_entry in eval_def.get("dataset") or []:
+                case = case_entry.get("case") if isinstance(case_entry, dict) else None
+                if not isinstance(case, dict) or case.get("raises"):
+                    continue
+
+                call_code = build_call_code(eval_id, case)
+                if call_code is None:
+                    continue
+
+                result = session.feed(call_code)
+                if result.success:
+                    dur = max(result.duration_ms * (1 + buffer_pct), floor_ms)
+                    case["duration"] = round(dur, 1)
+
+    return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+
+def validate_expected_values(
+    yaml_spec: str,
+    func: Function,
+    executor: Executor | None = None,
+    fallback_executor: Executor | None = None,
+) -> str:
+    """Validate and fix expected values in a YAML spec by executing cases.
+
+    ``expected``/``raises`` are rewritten to match what ``func`` actually returned or
+    raised; the original ``yaml_spec`` is returned when nothing needed fixing.
+    """
+    executor = resolve_executors(executor, fallback_executor)
+
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict):
+        return yaml_spec
+
+    try:
+        session = executor.create_session(func.code)
+    except Exception:
+        logfire.warn("Could not create session for expected value validation")
+        return yaml_spec
+
+    fixes_applied = 0
+    with session:
+        for eval_id, eval_def in spec.items():
+            if not isinstance(eval_def, dict):
+                continue
+            for case_entry in eval_def.get("dataset") or []:  # ``dataset:`` may be None
+                case = case_entry.get("case") if isinstance(case_entry, dict) else None
+                if not isinstance(case, dict):
+                    continue
+
+                call_code = build_call_code(eval_id, case)
+                if call_code is None:
+                    continue
+
+                result = session.feed(call_code)
+                if (
+                    "expected" in case
+                    and not case.get("raises")
+                    and result.success
+                    and result.output != case["expected"]
+                ):
+                    logfire.info(
+                        "Fixing expected value for case: {expected} → {actual}",
+                        expected=repr(case["expected"]),
+                        actual=repr(result.output),
+                    )
+                    case["expected"] = result.output
+                    fixes_applied += 1
+
+                if expected_exc := case.get("raises"):
+                    if result.success:
+                        logfire.info(
+                            "Case expected {exc} but function returned {output}, fixing",
+                            exc=expected_exc,
+                            output=repr(result.output),
+                        )
+                        del case["raises"]
+                        case.pop("match", None)  # removed together with ``raises``
+                        case["expected"] = result.output
+                        fixes_applied += 1
+                    elif result.error_type and result.error_type != expected_exc:
+                        logfire.info(
+                            "Case expected {expected} but got {actual}, fixing",
+                            expected=expected_exc,
+                            actual=result.error_type,
+                        )
+                        case["raises"] = result.error_type
+                        fixes_applied += 1
+
+    if fixes_applied > 0:
+        logfire.info("Validated spec: {count} fixes applied", count=fixes_applied)
+        return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+    return yaml_spec
+
+
+def inject_missing_error_cases(
+    yaml_spec: str,
+    func_name: str,
+    error_snippets: list[dict],
+) -> str:
+    """Inject error cases from exploration into the spec if the LLM missed them.
+
+    Each snippet needs ``code`` (one call with literal arguments) and ``error_type``;
+    snippets that cannot be mapped to ``input``/``inputs`` are skipped. Returns
+    ``yaml_spec`` unchanged when nothing is injected.
+    """
+    if not error_snippets:
+        return yaml_spec
+
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict) or not isinstance(spec.get(func_name), dict):
+        return yaml_spec
+
+    eval_def = spec[func_name]
+    if eval_def.get("dataset") is None:  # key missing, or a bare ``dataset:``
+        eval_def["dataset"] = []
+    dataset = eval_def["dataset"]
+
+    # Inputs that already have a ``raises`` case, keyed by repr((input, inputs)).
+    existing_raises_inputs: set[str] = set()
+    for entry in dataset:
+        case = entry.get("case") if isinstance(entry, dict) else None
+        if isinstance(case, dict) and case.get("raises"):
+            existing_raises_inputs.add(repr((case.get("input"), case.get("inputs"))))
+
+    injected = 0
+
+    for snippet in error_snippets:
+        code = snippet["code"].strip()
+        error_type = snippet["error_type"]
+        description = snippet.get("description", "")
+
+        try:
+            tree = ast.parse(code, mode="eval")
+        except SyntaxError:
+            continue
+
+        if not isinstance(tree.body, ast.Call):
+            continue
+
+        try:
+            args = [ast.literal_eval(a) for a in tree.body.args]
+            kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in tree.body.keywords}
+        except (ValueError, TypeError):
+            continue
+
+        # Mixed positional/keyword calls and ``**mapping`` unpacking cannot be
+        # expressed as a single ``input``/``inputs`` value, so skip them.
+        if (args and kwargs) or None in kwargs:
+            continue
+
+        if kwargs:
+            field_name, value, input_repr = "inputs", kwargs, repr((None, kwargs))
+        elif len(args) == 1:
+            if isinstance(args[0], tuple):
+                continue
+            field_name, value, input_repr = "input", args[0], repr((args[0], None))
+        elif len(args) > 1:
+            field_name, value, input_repr = "inputs", args, repr((None, args))
+        else:
+            continue
+
+        if input_repr in existing_raises_inputs:
+            continue
+        # Remember it so a repeated snippet is not injected a second time.
+        existing_raises_inputs.add(input_repr)
+
+        case_dict: dict[str, Any] = {
+            "id": f"error_{error_type.lower()}_{injected}",
+            field_name: value,
+            "raises": error_type,
+        }
+        dataset.append({"case": case_dict})
+        injected += 1
+        logfire.info(
+            "Injected error case: {desc} → raises {exc}",
+            desc=description,
+            exc=error_type,
+        )
+
+    if injected > 0:
+        logfire.info("Injected {count} missing error cases into spec", count=injected)
+        return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+    return yaml_spec
diff --git a/tests/test_executor.py b/tests/test_executor.py
index a23252c..1f99246 100644
--- a/tests/test_executor.py
+++ b/tests/test_executor.py
@@ -1,20 +1,4 @@
-"""Tests for vowel.executor — CodeMode execution backends.
-
-Covers MontyExecutor, DefaultExecutor, and get_executor factory across
-all injection modes: external_functions, inputs, both, and pure code.
-
-Tests:
-     1. External functions only
-     2. Inputs only
-     3. Inputs + external functions combined
-     4. Pure code (no injection)
-     5. Stdout capture
-     6. Error handling
-     7. ExecutionResult structure
-     8. Protocol conformance
-     9. get_executor factory
-    10. Parity — both executors produce the same output
-"""
+"""Tests for executor backends, factory selection, and output parity."""
 
 from __future__ import annotations
 
diff --git a/tests/test_llm_integration.py b/tests/test_llm_integration.py
index e20b2e8..bcce51b 100644
--- a/tests/test_llm_integration.py
+++ b/tests/test_llm_integration.py
@@ -1,8 +1,4 @@
-"""LLM-based integration tests with cassette caching.
-
-These tests use real LLM calls but cache responses for reproducibility.
-Run with --update-cassettes to refresh cached responses.
-"""
+"""LLM integration tests with cassette-backed response caching."""
 
 import hashlib
 import json
diff --git a/tests/test_llm_judge.py b/tests/test_llm_judge.py
index 6a2996f..be4e30f 100644
--- a/tests/test_llm_judge.py
+++ b/tests/test_llm_judge.py
@@ -1,7 +1,4 @@
-"""LLM Judge evaluator tests with cassette caching.
-
-These tests specifically test the LLMJudge evaluator functionality.
-"""
+"""Tests for LLMJudge evaluator behavior using cassette caching."""
 
 import hashlib
 import json
diff --git a/tests/test_session.py b/tests/test_session.py
index b4b9d10..400749b 100644
--- a/tests/test_session.py
+++ b/tests/test_session.py
@@ -1,14 +1,4 @@
-"""Tests for ExecutionSession API — DefaultSession and MontyReplSession.
-
-Covers:
-    - Basic feed() results (binary search)
-    - Error handling (ZeroDivisionError)
-    - Syntax error reporting
-    - State preservation across feed() calls
-    - Stdout capture through sessions
-    - Context-manager lifecycle
-    - Session isolation (fresh state per session)
-"""
+"""Tests for ExecutionSession behavior across default and Monty-backed sessions."""
 
 from __future__ import annotations
 
diff --git a/tests/test_tdd_eval_retries.py b/tests/test_tdd_eval_retries.py
index b3c0fdc..9ab9b72 100644
--- a/tests/test_tdd_eval_retries.py
+++ b/tests/test_tdd_eval_retries.py
@@ -4,8 +4,8 @@
 from unittest.mock import MagicMock, PropertyMock, patch
 
 from vowel.eval_types import EvalsSource
-from vowel.spec_validation import build_failure_context
 from vowel.tdd import FunctionSignature, Param, TDDGenerator
+from vowel.validation import build_failure_context
 
 
 def _make_signature() -> FunctionSignature:
diff --git a/tests/test_yaml_loading.py b/tests/test_yaml_loading.py
index 2e56e3b..b2fb12b 100644
--- a/tests/test_yaml_loading.py
+++ b/tests/test_yaml_loading.py
@@ -6,59 +6,59 @@
 
 from vowel import (
     EvalsFile,
-    load_evals,
-    load_evals_file,
-    load_evals_from_dict,
-    load_evals_from_object,
-    load_evals_from_yaml_string,
+    load_bundle,
+    load_bundle_file,
+    load_bundle_from_dict,
+    load_bundle_from_object,
+    load_bundle_from_yaml_string,
 )
 
 
-class TestLoadEvalsFromYamlString:
-    """Tests for load_evals_from_yaml_string function."""
+class TestLoadBundleFromYamlString:
+    """Tests for load_bundle_from_yaml_string function."""
 
     def test_simple_yaml_loading(self, simple_yaml_spec: str):
         """Test loading a simple YAML spec."""
-        evals = load_evals_from_yaml_string(simple_yaml_spec)
+        bundle = load_bundle_from_yaml_string(simple_yaml_spec)
 
-        assert "add" in evals
-        assert len(evals["add"].dataset) == 2
+        assert "add" in bundle.evals
+        assert len(bundle.evals["add"].dataset) == 2
 
     def test_yaml_with_evaluators(self, yaml_with_evaluators: str):
         """Test loading YAML with evaluators."""
-        evals = load_evals_from_yaml_string(yaml_with_evaluators)
+        bundle = load_bundle_from_yaml_string(yaml_with_evaluators)
 
-        assert "is_even" in evals
-        assert evals["is_even"].evals is not None
+        assert "is_even" in bundle.evals
+        assert bundle.evals["is_even"].evals is not None
 
     def test_yaml_with_type_check(self, yaml_with_type_check: str):
         """Test loading YAML with type checking."""
-        evals = load_evals_from_yaml_string(yaml_with_type_check)
+        bundle = load_bundle_from_yaml_string(yaml_with_type_check)
 
-        assert "divide" in evals
-        assert len(evals["divide"].dataset) == 2
+        assert "divide" in bundle.evals
+        assert len(bundle.evals["divide"].dataset) == 2
 
     def test_yaml_with_raises(self, yaml_with_raises: str):
         """Test loading YAML with exception testing."""
-        evals = load_evals_from_yaml_string(yaml_with_raises)
+        bundle = load_bundle_from_yaml_string(yaml_with_raises)
 
-        assert "divide" in evals
-        raises_cases = [c for c in evals["divide"].dataset if c.case.raises]
+        assert "divide" in bundle.evals
+        raises_cases = [c for c in bundle.evals["divide"].dataset if c.case.raises]
         assert len(raises_cases) == 1
 
     def test_empty_yaml_raises_error(self):
         """Test that empty YAML raises an error."""
         with pytest.raises(Exception):  # noqa: B017
-            load_evals_from_yaml_string("")
+            load_bundle_from_yaml_string("")
 
     def test_invalid_yaml_raises_error(self):
         """Test that invalid YAML raises an error."""
         with pytest.raises(Exception):  # noqa: B017
-            load_evals_from_yaml_string("invalid: [unclosed")
+            load_bundle_from_yaml_string("invalid: [unclosed")
 
 
-class TestLoadEvalsFromDict:
-    """Tests for load_evals_from_dict function."""
+class TestLoadBundleFromDict:
+    """Tests for load_bundle_from_dict function."""
 
     def test_dict_loading(self):
         """Test loading from a dictionary."""
@@ -71,10 +71,10 @@ def test_dict_loading(self):
             }
         }
 
-        evals = load_evals_from_dict(spec_dict)
+        bundle = load_bundle_from_dict(spec_dict)
 
-        assert "multiply" in evals
-        assert len(evals["multiply"].dataset) == 2
+        assert "multiply" in bundle.evals
+        assert len(bundle.evals["multiply"].dataset) == 2
 
     def test_dict_with_evaluators(self):
         """Test loading dict with evaluators."""
@@ -88,60 +88,60 @@ def test_dict_with_evaluators(self):
             }
         }
 
-        evals = load_evals_from_dict(spec_dict)
+        bundle = load_bundle_from_dict(spec_dict)
 
-        assert "square" in evals
-        assert evals["square"].evals is not None
+        assert "square" in bundle.evals
+        assert bundle.evals["square"].evals is not None
 
 
-class TestLoadEvalsFile:
-    """Tests for load_evals_file function."""
+class TestLoadBundleFile:
+    """Tests for load_bundle_file function."""
 
     def test_load_from_file(self, temp_yaml_file: Path):
         """Test loading from a YAML file."""
-        evals = load_evals_file(str(temp_yaml_file))
+        bundle = load_bundle_file(str(temp_yaml_file))
 
-        assert "add" in evals
+        assert "add" in bundle.evals
 
     def test_nonexistent_file_raises_error(self):
         """Test that loading non-existent file raises error."""
         with pytest.raises(FileNotFoundError):
-            load_evals_file("nonexistent_file.yml")
+            load_bundle_file("nonexistent_file.yml")
 
 
-class TestLoadEvals:
-    """Tests for the unified load_evals function."""
+class TestLoadBundle:
+    """Tests for the unified load_bundle function."""
 
     def test_load_from_string(self, simple_yaml_spec: str):
-        """Test load_evals with YAML string."""
-        evals = load_evals(simple_yaml_spec)
-        assert "add" in evals
+        """Test load_bundle with YAML string."""
+        bundle = load_bundle(simple_yaml_spec)
+        assert "add" in bundle.evals
 
     def test_load_from_dict(self):
-        """Test load_evals with dict."""
+        """Test load_bundle with dict."""
         spec_dict = {"test": {"dataset": [{"case": {"input": 1, "expected": 1}}]}}
-        evals = load_evals(spec_dict)
-        assert "test" in evals
+        bundle = load_bundle(spec_dict)
+        assert "test" in bundle.evals
 
     def test_load_from_path(self, temp_yaml_file: Path):
-        """Test load_evals with Path object."""
-        evals = load_evals(temp_yaml_file)
-        assert "add" in evals
+        """Test load_bundle with Path object."""
+        bundle = load_bundle(temp_yaml_file)
+        assert "add" in bundle.evals
 
     def test_load_from_evals_file_object(self, simple_yaml_spec: str):
-        """Test load_evals with EvalsFile object."""
+        """Test load_bundle with EvalsFile object."""
         import yaml
 
         data = yaml.safe_load(simple_yaml_spec)
         evals_file = EvalsFile.model_validate(data)
 
-        evals = load_evals_from_object(evals_file)
-        assert "add" in evals
+        bundle = load_bundle_from_object(evals_file)
+        assert "add" in bundle.evals
 
     def test_invalid_source_type_raises_error(self):
         """Test that invalid source type raises TypeError."""
         with pytest.raises(TypeError):
-            load_evals(12345)  # type: ignore[arg-type]
+            load_bundle(12345)  # type: ignore[arg-type]
 
 
 class TestInputFormats:
@@ -156,8 +156,8 @@ def test_single_input(self):
         input: 5
         expected: 10
 """
-        evals = load_evals_from_yaml_string(yaml_spec)
-        case = evals["double"].dataset[0].case
+        bundle = load_bundle_from_yaml_string(yaml_spec)
+        case = bundle.evals["double"].dataset[0].case
         assert case.input == 5
 
     def test_inputs_dict(self):
@@ -169,8 +169,8 @@ def test_inputs_dict(self):
         inputs: { x: 1, y: 2 }
         expected: 3
 """
-        evals = load_evals_from_yaml_string(yaml_spec)
-        case = evals["add"].dataset[0].case
+        bundle = load_bundle_from_yaml_string(yaml_spec)
+        case = bundle.evals["add"].dataset[0].case
         assert case.inputs == {"x": 1, "y": 2}
 
     def test_inputs_list(self):
@@ -182,6 +182,6 @@ def test_inputs_list(self):
         inputs: [1, 2, 3]
         expected: 6
 """
-        evals = load_evals_from_yaml_string(yaml_spec)
-        case = evals["add"].dataset[0].case
+        bundle = load_bundle_from_yaml_string(yaml_spec)
+        case = bundle.evals["add"].dataset[0].case
         assert case.inputs == [1, 2, 3]
diff --git a/vowel-schema.json b/vowel-schema.json
index eded93b..241bbeb 100644
--- a/vowel-schema.json
+++ b/vowel-schema.json
@@ -3,16 +3,15 @@
   "type": "object",
   "properties": {
     "fixtures": {
-      "type": "object",
       "additionalProperties": {
         "$ref": "#/$defs/FixtureDefinition"
       },
       "title": "Fixtures",
-      "description": "Dictionary of fixture definitions. Each key is the fixture name."
+      "type": "object"
     }
   },
   "additionalProperties": {
-    "$ref": "#/$defs/Evals"
+    "$ref": "#/$defs/EvalsMapValue"
   },
   "$defs": {
     "AssertionCase": {
@@ -43,7 +42,9 @@
           "type": "string"
         }
       },
-      "required": ["assertion"],
+      "required": [
+        "assertion"
+      ],
       "title": "AssertionCase",
       "type": "object"
     },
@@ -80,7 +81,9 @@
           "description": "The test case containing input, expected output, and constraints."
         }
       },
-      "required": ["case"],
+      "required": [
+        "case"
+      ],
       "title": "DatasetCase",
       "type": "object"
     },
@@ -89,13 +92,20 @@
       "properties": {
         "duration": {
           "description": "Maximum allowed duration in seconds. Test fails if execution takes longer.",
-          "examples": [0.1, 1.0, 5.0, 0.001],
+          "examples": [
+            0.1,
+            1.0,
+            5.0,
+            0.001
+          ],
           "exclusiveMinimum": 0,
           "title": "Duration",
           "type": "number"
         }
       },
-      "required": ["duration"],
+      "required": [
+        "duration"
+      ],
       "title": "DurationCase",
       "type": "object"
     },
@@ -105,13 +115,29 @@
       "properties": {
         "id": {
           "description": "Function name to evaluate. Must match the actual function name.",
-          "examples": ["is_prime", "calculate_sum", "process_data", "validate_email"],
+          "examples": [
+            "is_prime",
+            "calculate_sum",
+            "process_data",
+            "validate_email"
+          ],
           "title": "Id",
           "type": "string"
         },
         "fixture": {
           "description": "List of fixture names this function depends on. Fixtures must be defined in the top-level 'fixtures' section. They will be injected as keyword-only arguments to the function.",
-          "examples": [["db"], ["db", "cache"], ["redis"]],
+          "examples": [
+            [
+              "db"
+            ],
+            [
+              "db",
+              "cache"
+            ],
+            [
+              "redis"
+            ]
+          ],
           "items": {
             "type": "string"
           },
@@ -121,28 +147,54 @@
         "evals": {
           "additionalProperties": {
             "anyOf": [
-              {"$ref": "#/$defs/IsInstanceCase"},
-              {"$ref": "#/$defs/AssertionCase"},
-              {"$ref": "#/$defs/DurationCase"},
-              {"$ref": "#/$defs/ContainsInputCase"},
-              {"$ref": "#/$defs/PatternMatchCase"},
-              {"$ref": "#/$defs/LLMJudgeCase"}
+              {
+                "$ref": "#/$defs/IsInstanceCase"
+              },
+              {
+                "$ref": "#/$defs/AssertionCase"
+              },
+              {
+                "$ref": "#/$defs/DurationCase"
+              },
+              {
+                "$ref": "#/$defs/ContainsInputCase"
+              },
+              {
+                "$ref": "#/$defs/PatternMatchCase"
+              },
+              {
+                "$ref": "#/$defs/LLMJudgeCase"
+              }
             ]
           },
           "description": "Dictionary of evaluation rules that apply to ALL test cases. Each key is a descriptive name, value is the evaluation case. Use IsInstanceCase for type checks, AssertionCase for custom logic, DurationCase for performance constraints, ContainsInputCase for input containment, PatternMatchCase for regex pattern matching.",
           "examples": [
             {
-              "IsInteger": {"type": "int"},
-              "IsPositive": {"assertion": "output > 0"}
+              "IsInteger": {
+                "type": "int"
+              },
+              "IsPositive": {
+                "assertion": "output > 0"
+              }
             },
             {
-              "IsUppercase": {"assertion": "output.isupper()"},
-              "NotEmpty": {"assertion": "len(output) > 0"},
-              "TypeCheck": {"type": "str"}
+              "IsUppercase": {
+                "assertion": "output.isupper()"
+              },
+              "NotEmpty": {
+                "assertion": "len(output) > 0"
+              },
+              "TypeCheck": {
+                "type": "str"
+              }
             },
             {
-              "CorrectLogic": {"assertion": "(output and input > 0) or (not output and input <= 0)"},
-              "IsBoolean": {"type": "bool"}
+              "CorrectLogic": {
+                "assertion": "(output and input > 0) or (not output and input <= 0)"
+              },
+              "IsBoolean": {
+                "type": "bool"
+              }
             }
           ],
           "title": "Evals",
@@ -152,17 +204,58 @@
           "description": "List of test cases. Each case has input, expected output, and optional constraints. Should cover normal cases, edge cases, and corner cases.",
           "examples": [
             [
-              {"case": {"expected": 4, "input": 2}},
-              {"case": {"expected": 0, "input": 0}},
-              {"case": {"expected": 9, "input": -3}}
+              {
+                "case": {
+                  "expected": 4,
+                  "input": 2
+                }
+              },
+              {
+                "case": {
+                  "expected": 0,
+                  "input": 0
+                }
+              },
+              {
+                "case": {
+                  "expected": 9,
+                  "input": -3
+                }
+              }
             ],
             [
-              {"case": {"expected": "HELLO", "input": "hello"}},
-              {"case": {"expected": "WORLD", "input": "world"}}
+              {
+                "case": {
+                  "expected": "HELLO",
+                  "input": "hello"
+                }
+              },
+              {
+                "case": {
+                  "expected": "WORLD",
+                  "input": "world"
+                }
+              }
             ],
             [
-              {"case": {"expected": 5, "input": {"x": 2, "y": 3}}},
-              {"case": {"expected": 30, "input": {"x": 10, "y": 20}}}
+              {
+                "case": {
+                  "expected": 5,
+                  "input": {
+                    "x": 2,
+                    "y": 3
+                  }
+                }
+              },
+              {
+                "case": {
+                  "expected": 30,
+                  "input": {
+                    "x": 10,
+                    "y": 20
+                  }
+                }
+              }
             ]
           ],
           "items": {
@@ -173,7 +266,10 @@
           "type": "array"
         }
       },
-      "required": ["dataset"],
+      "required": [
+        "id",
+        "dataset"
+      ],
       "title": "Evals",
       "type": "object"
     },
@@ -181,47 +277,69 @@
       "description": "Definition of a single fixture with setup/teardown lifecycle.",
       "properties": {
         "setup": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Import path to setup function (e.g., 'fixtures.create_db'). Required if 'cls' is not specified.",
           "title": "Setup"
         },
         "cls": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Import path to class (e.g., 'myapp.Database'). Class will be instantiated with args/kwargs.",
           "title": "Cls"
         },
         "args": {
-          "description": "Positional arguments to pass to class constructor (used with 'cls')",
+          "description": "Positional arguments unpacked into the callable: setup_func(*args) or MyClass(*args)",
           "items": {},
           "title": "Args",
           "type": "array"
         },
         "kwargs": {
           "additionalProperties": true,
-          "description": "Keyword arguments to pass to class constructor (used with 'cls')",
+          "description": "Keyword arguments unpacked into the callable: setup_func(**kwargs) or MyClass(**kwargs)",
           "title": "Kwargs",
           "type": "object"
         },
         "teardown": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Import path to teardown function (e.g., 'fixtures.drop_db'). Can also be a class method (e.g., 'Connection.close') which will be called on the instance.",
           "title": "Teardown"
         },
         "scope": {
           "default": "function",
-          "description": "Lifecycle scope: 'function' (per case), 'module' (per eval), or 'session' (per run)",
-          "enum": ["function", "module", "session"],
+          "description": "Fixture lifecycle scope. Preferred names: 'case', 'eval', 'file'. Compatibility aliases are accepted: 'function', 'module', 'session'. Current runtime normalization maps case->function, eval->module, file->session.",
+          "enum": [
+            "case",
+            "eval",
+            "file",
+            "function",
+            "module",
+            "session"
+          ],
           "title": "Scope",
           "type": "string"
-        },
-        "params": {
-          "additionalProperties": true,
-          "description": "Parameters to pass to the setup function",
-          "title": "Params",
-          "type": "object"
         }
       },
       "title": "FixtureDefinition",
@@ -232,18 +350,35 @@
       "properties": {
         "type": {
           "description": "Python type as string to check against. Can use union types with '|'.",
-          "examples": ["int", "str", "bool", "list", "dict", "int | float", "str | None"],
+          "examples": [
+            "int",
+            "str",
+            "bool",
+            "list",
+            "dict",
+            "int | float",
+            "str | None"
+          ],
           "title": "Type",
           "type": "string"
         },
         "strict": {
-          "anyOf": [{"type": "boolean"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "boolean"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Whether to use strict mode for type validation. When True, performs stricter type checking.",
           "title": "Strict"
         }
       },
-      "required": ["type"],
+      "required": [
+        "type"
+      ],
       "title": "IsInstanceCase",
       "type": "object"
     },
@@ -262,7 +397,18 @@
         },
         "include": {
           "description": "List of context variables to include in the evaluation. Valid options: 'input', 'expected_output'.",
-          "examples": [["input"], ["expected_output"], ["input", "expected_output"]],
+          "examples": [
+            [
+              "input"
+            ],
+            [
+              "expected_output"
+            ],
+            [
+              "input",
+              "expected_output"
+            ]
+          ],
           "items": {
             "type": "string"
           },
@@ -276,7 +422,9 @@
           "type": "object"
         }
       },
-      "required": ["rubric"],
+      "required": [
+        "rubric"
+      ],
       "title": "LLMJudgeCase",
       "type": "object"
     },
@@ -285,61 +433,181 @@
       "description": "Test case with input, expected output, and optional constraints.",
       "properties": {
         "id": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Optional unique identifier for this test case.",
-          "examples": ["test_positive_numbers", "edge_case_empty_list", "error_invalid_input"],
+          "examples": [
+            "test_positive_numbers",
+            "edge_case_empty_list",
+            "error_invalid_input"
+          ],
           "title": "Id"
         },
         "input": {
-          "anyOf": [{}, {"type": "null"}],
+          "anyOf": [
+            {},
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Single input value to pass to the function as the only argument. Use this when the function takes a single argument. Cannot be used together with 'inputs'.",
-          "examples": [5, "hello", [1, 2, 3], {"x": 10, "y": 20}, {"name": "test", "value": 42}],
+          "examples": [
+            5,
+            "hello",
+            [
+              1,
+              2,
+              3
+            ],
+            {
+              "x": 10,
+              "y": 20
+            },
+            {
+              "name": "test",
+              "value": 42
+            }
+          ],
           "title": "Input"
         },
         "inputs": {
           "anyOf": [
-            {"items": {}, "type": "array"},
-            {"additionalProperties": true, "type": "object"},
-            {"type": "null"}
+            {
+              "items": {},
+              "type": "array"
+            },
+            {
+              "additionalProperties": true,
+              "type": "object"
+            },
+            {
+              "type": "null"
+            }
           ],
           "default": null,
           "description": "Multiple input values to pass to the function as separate arguments (*args). Use this when the function takes multiple arguments. Cannot be used together with 'input'.",
-          "examples": [[1, 2], [10, 20, 30], ["hello", "world"], [{"x": 1}, {"y": 2}]],
+          "examples": [
+            [
+              1,
+              2
+            ],
+            [
+              10,
+              20,
+              30
+            ],
+            [
+              "hello",
+              "world"
+            ],
+            [
+              {
+                "x": 1
+              },
+              {
+                "y": 2
+              }
+            ]
+          ],
           "title": "Inputs"
         },
         "expected": {
           "description": "Expected output value. If provided, output will be compared for equality. Use `null` to expect None.",
-          "examples": [25, "HELLO", [1, 3, 5], true, {"result": 30}, null],
+          "examples": [
+            25,
+            "HELLO",
+            [
+              1,
+              3,
+              5
+            ],
+            true,
+            {
+              "result": 30
+            },
+            null
+          ],
           "title": "Expected"
         },
         "duration": {
-          "anyOf": [{"exclusiveMinimum": 0, "type": "number"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "exclusiveMinimum": 0,
+              "type": "number"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Maximum allowed execution time in milliseconds for this specific case.",
-          "examples": [100, 500, 1000, 50],
+          "examples": [
+            100,
+            500,
+            1000,
+            50
+          ],
           "title": "Duration"
         },
         "contains": {
-          "anyOf": [{}, {"type": "null"}],
+          "anyOf": [
+            {},
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Value that should be contained in the output.",
-          "examples": ["substring", 42, "expected_key"],
+          "examples": [
+            "substring",
+            42,
+            "expected_key"
+          ],
           "title": "Contains"
         },
         "assertion": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Optional case-specific Python assertion expression. Same as global assertions but only for this case.\nAvailable variables: input, output, expected, duration, metadata.\nExamples: 'output > 0', 'len(output) == 3', 'output == input * 2'",
-          "examples": ["output > 0", "len(output) == 3", "output % 2 == 0", "output in input"],
+          "examples": [
+            "output > 0",
+            "len(output) == 3",
+            "output % 2 == 0",
+            "output in input"
+          ],
           "title": "Assertion"
         },
         "pattern": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Optional regex pattern to match against the output (converted to string) for this specific case.",
-          "examples": ["^\\d+$", "^[A-Z]+$", ".*@.*\\.com$"],
+          "examples": [
+            "^\\d+$",
+            "^[A-Z]+$",
+            ".*@.*\\.com$"
+          ],
           "title": "Pattern"
         },
         "case_sensitive": {
@@ -349,17 +617,43 @@
           "type": "boolean"
         },
         "raises": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Expected exception type for this case. If specified, the test expects the function to raise this exception. Append '?' for optional raises (e.g., 'TypeError?') — passes if the exception is raised OR if the function returns normally.",
-          "examples": ["ValueError", "TypeError", "KeyError", "ZeroDivisionError", "TypeError?"],
+          "examples": [
+            "ValueError",
+            "TypeError",
+            "KeyError",
+            "ZeroDivisionError",
+            "TypeError?"
+          ],
           "title": "Raises"
         },
         "type": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Expected output type for this specific case. Can be a simple type name or a complex type annotation.",
-          "examples": ["int", "str", "list[int]", "dict[str, Any]", "Optional[int]"],
+          "examples": [
+            "int",
+            "str",
+            "list[int]",
+            "dict[str, Any]",
+            "Optional[int]"
+          ],
           "title": "Type"
         },
         "strict_type": {
@@ -369,10 +663,21 @@
           "type": "boolean"
         },
         "match": {
-          "anyOf": [{"type": "string"}, {"type": "null"}],
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
           "default": null,
           "description": "Optional regex pattern to match against the exception message (only used if raises is specified).",
-          "examples": ["invalid input", "must be positive", "not found"],
+          "examples": [
+            "invalid input",
+            "must be positive",
+            "not found"
+          ],
           "title": "Match"
         }
       },
@@ -384,7 +689,12 @@
       "properties": {
         "pattern": {
           "description": "Regular expression pattern to match against the output (converted to string).",
-          "examples": ["^\\d+$", "^[A-Z]+$", ".*@.*\\.com$", "id: \\d+"],
+          "examples": [
+            "^\\d+$",
+            "^[A-Z]+$",
+            ".*@.*\\.com$",
+            "id: \\d+"
+          ],
           "title": "Pattern",
           "type": "string"
         },
@@ -395,9 +705,174 @@
           "type": "boolean"
         }
       },
-      "required": ["pattern"],
+      "required": [
+        "pattern"
+      ],
       "title": "PatternMatchCase",
       "type": "object"
+    },
+    "EvalsMapValue": {
+      "additionalProperties": false,
+      "description": "Function evaluation specification keyed by function import path/name. Contains fixture dependencies, global evaluators (`evals`), and dataset cases.",
+      "properties": {
+        "id": {
+          "description": "Function name to evaluate. Must match the actual function name.",
+          "examples": [
+            "is_prime",
+            "calculate_sum",
+            "process_data",
+            "validate_email"
+          ],
+          "title": "Id",
+          "type": "string"
+        },
+        "fixture": {
+          "description": "List of fixture names this function depends on. Fixtures must be defined in the top-level 'fixtures' section. They will be injected as keyword-only arguments to the function.",
+          "examples": [
+            [
+              "db"
+            ],
+            [
+              "db",
+              "cache"
+            ],
+            [
+              "redis"
+            ]
+          ],
+          "items": {
+            "type": "string"
+          },
+          "title": "Fixture",
+          "type": "array"
+        },
+        "evals": {
+          "additionalProperties": {
+            "anyOf": [
+              {
+                "$ref": "#/$defs/IsInstanceCase"
+              },
+              {
+                "$ref": "#/$defs/AssertionCase"
+              },
+              {
+                "$ref": "#/$defs/DurationCase"
+              },
+              {
+                "$ref": "#/$defs/ContainsInputCase"
+              },
+              {
+                "$ref": "#/$defs/PatternMatchCase"
+              },
+              {
+                "$ref": "#/$defs/LLMJudgeCase"
+              }
+            ]
+          },
+          "description": "Dictionary of evaluation rules that apply to ALL test cases. Each key is a descriptive name, value is the evaluation case. Use IsInstanceCase for type checks, AssertionCase for custom logic, DurationCase for performance constraints, ContainsInputCase for input containment, PatternMatchCase for regex pattern matching.",
+          "examples": [
+            {
+              "IsInteger": {
+                "type": "int"
+              },
+              "IsPositive": {
+                "assertion": "output > 0"
+              }
+            },
+            {
+              "IsUppercase": {
+                "assertion": "output.isupper()"
+              },
+              "NotEmpty": {
+                "assertion": "len(output) > 0"
+              },
+              "TypeCheck": {
+                "type": "str"
+              }
+            },
+            {
+              "CorrectLogic": {
+                "assertion": "(output and input > 0) or (not output and input <= 0)"
+              },
+              "IsBoolean": {
+                "type": "bool"
+              }
+            }
+          ],
+          "title": "Evals",
+          "type": "object"
+        },
+        "dataset": {
+          "description": "List of test cases. Each case has input, expected output, and optional constraints. Should cover normal cases, edge cases, and corner cases.",
+          "examples": [
+            [
+              {
+                "case": {
+                  "expected": 4,
+                  "input": 2
+                }
+              },
+              {
+                "case": {
+                  "expected": 0,
+                  "input": 0
+                }
+              },
+              {
+                "case": {
+                  "expected": 9,
+                  "input": -3
+                }
+              }
+            ],
+            [
+              {
+                "case": {
+                  "expected": "HELLO",
+                  "input": "hello"
+                }
+              },
+              {
+                "case": {
+                  "expected": "WORLD",
+                  "input": "world"
+                }
+              }
+            ],
+            [
+              {
+                "case": {
+                  "expected": 5,
+                  "input": {
+                    "x": 2,
+                    "y": 3
+                  }
+                }
+              },
+              {
+                "case": {
+                  "expected": 30,
+                  "input": {
+                    "x": 10,
+                    "y": 20
+                  }
+                }
+              }
+            ]
+          ],
+          "items": {
+            "$ref": "#/$defs/DatasetCase"
+          },
+          "minItems": 1,
+          "title": "Dataset",
+          "type": "array"
+        }
+      },
+      "required": [
+        "dataset"
+      ],
+      "title": "Function",
+      "type": "object"
     }
   }
-}
\ No newline at end of file
+}

From 596754e070e4281a927f49a331642f7897087572 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 00:13:59 +0300
Subject: [PATCH 4/8] commit before native serializers

---
 .gitignore                                    |  12 +-
 README.md                                     |   4 +
 docs/SERIALIZERS.md                           |   2 +
 src/vowel/__init__.py                         |   2 +
 src/vowel/cli.py                              | 264 +++++++++++++
 src/vowel/codemode.py                         | 345 ++++++++++-------
 src/vowel/costs.py                            | 358 ++++++++++++++++++
 src/vowel/evals.py                            |  21 +-
 src/vowel/utils.py                            |  65 +++-
 tests/cassettes/llm_judge_custom_model.json   |   2 +-
 tests/cassettes/test_generate_and_run.json    |   4 +-
 tests/cassettes/test_generate_factorial.json  |   6 +-
 tests/cassettes/test_generate_palindrome.json |   4 +-
 .../cassettes/test_generate_spec_simple.json  |   2 +-
 .../cassettes/test_generate_spec_string.json  |   2 +-
 tests/test_generation.py                      |  36 --
 tests/test_llm_integration.py                 |   4 +-
 tests/test_llm_judge_env_refs.py              |  33 ++
 tests/test_run_evals.py                       |  18 +
 tests/test_serializer.py                      |  22 ++
 20 files changed, 1010 insertions(+), 196 deletions(-)
 create mode 100644 src/vowel/costs.py
 delete mode 100644 tests/test_generation.py
 create mode 100644 tests/test_llm_judge_env_refs.py

diff --git a/.gitignore b/.gitignore
index 59c3bc4..dc1809e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,14 +70,10 @@ evaluations/
 TODO
 docs/FIXTURE_GENERATION_RFC.md
 
-# CodeMode
-monty.py
-monty/
-
 # Benchmarks
 benchmark*
 parse_cron_evals.yml
-PLAN.md
-codegen.py
-bundle_*.py
-*test.py
+
+# Known Models with Costs
+costs.yml
+db_fixture_serializers.yml
diff --git a/README.md b/README.md
index 0ce4681..f3ec800 100644
--- a/README.md
+++ b/README.md
@@ -122,6 +122,8 @@ summary = (
 summary.print()
 ```
 
+> **Name matching note:** If your YAML uses `module.function`, programmatic mappings can use either the exact key (`module.function`) or the short function name (`function`) in `.with_functions(...)`.
+
 ---
 
 ## Features
@@ -196,6 +198,8 @@ summary = (
 )
 ```
 
+> **Serializer key matching:** Serializer mappings follow the same rule as `.with_functions(...)` — both `module.function` and short `function` keys are accepted.
+
 > **Full reference:** [docs/SERIALIZERS.md](https://github.com/fswair/vowel/blob/main/docs/SERIALIZERS.md)
 
 ### AI-Powered Generation
diff --git a/docs/SERIALIZERS.md b/docs/SERIALIZERS.md
index de92302..deb621e 100644
--- a/docs/SERIALIZERS.md
+++ b/docs/SERIALIZERS.md
@@ -58,6 +58,8 @@ summary = (
 )
 ```
 
+> Key matching note: If YAML eval ids use `module.function`, the programmatic maps passed to `.with_functions(...)`, `.with_serializer(...)`, and `serial_fn={...}` accept either the exact id (`module.function`) or the short name (`function`) as keys.
+
 ---
 
 ## Advanced Examples
diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
index b3d98d2..d60bdbf 100644
--- a/src/vowel/__init__.py
+++ b/src/vowel/__init__.py
@@ -8,6 +8,7 @@
 from .ai import EvalGenerator, GenerationResult, UnsupportedParameterTypeError
 from .codemode import CodeModeGenerator, CodeModeResult, ExplorationPlan, SnippetResult
 from .context import EVAL_SPEC_CONTEXT
+from .costs import CostManager
 from .errors import FixturePathError, SignatureError
 from .eval_types import EvalsFile
 from .executor import (
@@ -75,6 +76,7 @@
     "CodeModeResult",
     "ExplorationPlan",
     "SnippetResult",
+    "CostManager",
 ]
 
 
diff --git a/src/vowel/cli.py b/src/vowel/cli.py
index a5bf430..6b2ed4c 100644
--- a/src/vowel/cli.py
+++ b/src/vowel/cli.py
@@ -27,6 +27,239 @@
 
 dotenv.load_dotenv()
 console = Console()
+COSTS_FILE = Path.home() / ".vowel" / "codemode" / "generation_costs.json"
+
+
+def _load_cost_store() -> dict:
+    """Read the cost store at COSTS_FILE, falling back to an empty store."""
+    store = None
+    if COSTS_FILE.exists():
+        try:
+            store = json.loads(COSTS_FILE.read_text(encoding="utf-8"))
+        except Exception:  # unreadable or malformed file: treat it as empty
+            store = None
+    if not isinstance(store, dict):
+        return {"schema_version": 1, "generations": {}}
+    if not isinstance(store.get("generations"), dict):
+        store["generations"] = {}
+    return store
+
+
+def _flatten_runs(store: dict) -> list[tuple[str, dict, dict]]:
+    """Return (generation_id, generation, run) rows; each run dict gains a "run_id" key."""
+    rows: list[tuple[str, dict, dict]] = []
+    for gid, gen in store.get("generations", {}).items():
+        runs = gen.get("runs") if isinstance(gen, dict) else None
+        for run_id, run in (runs.items() if isinstance(runs, dict) else ()):
+            if isinstance(run, dict):  # a malformed entry would break the ** unpacking
+                rows.append((gid, gen, {"run_id": run_id, **run}))
+    return rows
+
+
+def _print_generation_table(store: dict) -> list[str]:
+    """Print all generations in a table and return their ids in displayed row order."""
+    table = Table(title="Generations", box=box.ROUNDED)
+    table.add_column("#", style="cyan", no_wrap=True)
+    table.add_column("Generation ID", style="white")
+    table.add_column("Created", style="dim")
+    table.add_column("Runs", justify="right")
+    table.add_column("USD", justify="right", style="green")
+
+    # "created_at" is compared as text and sorted in descending order.
+    ordered = sorted(
+        store.get("generations", {}).items(),
+        key=lambda item: str(item[1].get("created_at", "")),
+        reverse=True,
+    )
+    ids: list[str] = []
+    for gid, gen in ordered:
+        ids.append(gid)
+        runs = gen.get("runs", {})
+        usd = float(gen.get("totals", {}).get("usd", 0.0) or 0.0)
+        table.add_row(
+            str(len(ids)),  # 1-based row number
+            gid,
+            str(gen.get("created_at", "-")),
+            str(len(runs) if isinstance(runs, dict) else 0),
+            f"{usd:.6f}",
+        )
+    console.print(table)
+    return ids
+
+
+def _print_generation_detail(generation_id: str, generation: dict) -> None:
+    """Print a summary panel for one generation, followed by a table of its runs."""
+    gen_totals = generation.get("totals", {})
+    summary = Table.grid(padding=(0, 2))
+    summary.add_column(style="bold")
+    summary.add_column()
+    summary.add_row("Generation", generation_id)
+    summary.add_row("Created", str(generation.get("created_at", "-")))
+    summary.add_row("Spec model", str(generation.get("spec_model", "-")))
+    summary.add_row("Exploration model", str(generation.get("exploration_model", "-")))
+    summary.add_row("USD", f"{float(gen_totals.get('usd', 0.0) or 0.0):.6f}")
+    # Counters: a missing, None or zero value is rendered as "0".
+    for label, key in (("Input tokens", "input_tokens"), ("Output tokens", "output_tokens")):
+        summary.add_row(label, str(int(gen_totals.get(key, 0) or 0)))
+    summary.add_row("Requests", str(int(gen_totals.get("requests", 0) or 0)))
+    console.print(Panel(summary, title="Generation Summary", border_style="bright_cyan"))
+
+    run_table = Table(title="Runs", box=box.ROUNDED)
+    run_table.add_column("Run ID", style="white")
+    run_table.add_column("Function", style="cyan")
+    run_table.add_column("Status")
+    run_table.add_column("USD", justify="right", style="green")
+    for header in ("Input", "Output", "Requests"):
+        run_table.add_column(header, justify="right")
+    run_table.add_column("Created", style="dim")
+    runs = generation.get("runs", {})
+    for run_id, run in (runs if isinstance(runs, dict) else {}).items():
+        run_totals = run.get("totals", {})
+        run_table.add_row(
+            run_id,
+            str(run.get("func_name", "-")),
+            str(run.get("status", "-")),
+            f"{float(run_totals.get('usd', 0.0) or 0.0):.6f}",
+            str(int(run_totals.get("input_tokens", 0) or 0)),
+            str(int(run_totals.get("output_tokens", 0) or 0)),
+            str(int(run_totals.get("requests", 0) or 0)),
+            str(run.get("created_at", "-")),
+        )
+    console.print(run_table)
+
+
+def _print_run_detail(generation_id: str, run: dict) -> None:
+    """Print a summary panel for one run, then a table aggregating usage per step.
+
+    Usage entries that are not dicts are counted in "Calls" but add nothing to the sums.
+    """
+    totals = run.get("totals", {})
+    info = Table.grid(padding=(0, 2))
+    info.add_column(style="bold")
+    info.add_column()
+    info.add_row("Generation", generation_id)
+    for label, key in (("Run", "run_id"), ("Function", "func_name"), ("Status", "status")):
+        info.add_row(label, str(run.get(key, "-")))
+    info.add_row("USD", f"{float(totals.get('usd', 0.0) or 0.0):.6f}")
+    for label, key in (("Input tokens", "input_tokens"), ("Output tokens", "output_tokens")):
+        info.add_row(label, str(int(totals.get(key, 0) or 0)))
+    info.add_row("Requests", str(int(totals.get("requests", 0) or 0)))
+    console.print(Panel(info, title="Run Summary", border_style="bright_cyan"))
+
+    step_table = Table(title="Steps", box=box.ROUNDED)
+    step_table.add_column("Step", style="white")
+    step_table.add_column("Calls", justify="right")
+    step_table.add_column("USD", justify="right", style="green")
+    for header in ("Input", "Output", "Requests"):
+        step_table.add_column(header, justify="right")
+
+    steps = run.get("steps", {})
+    for step_name, step_data in (steps if isinstance(steps, dict) else {}).items():
+        usages = step_data.get("usages", []) if isinstance(step_data, dict) else []
+        if not isinstance(usages, list):  # e.g. an explicit null in the JSON
+            usages = []
+        usd = 0.0
+        counts = {"input_tokens": 0, "output_tokens": 0, "requests": 0}
+        for entry in usages:
+            if not isinstance(entry, dict):
+                continue  # malformed entry: nothing to add
+            usd += float(entry.get("usd", 0.0) or 0.0)
+            usage = entry.get("usage", {})
+            if isinstance(usage, dict):
+                for key in counts:
+                    counts[key] += int(usage.get(key, 0) or 0)
+        step_table.add_row(
+            step_name,
+            str(len(usages)),
+            f"{usd:.6f}",
+            *(str(counts[key]) for key in ("input_tokens", "output_tokens", "requests")),
+        )
+    console.print(step_table)
+
+
+def _handle_costs_command(
+    *,
+    list_costs: bool,
+    by_generation: bool,
+    by_run: bool,
+    generation_id: str | None,
+    run_id: str | None,
+) -> None:
+    """Handle ``vowel costs``: print the overview, one record, or an interactive picker.
+
+    Precedence: ``generation_id``, then ``run_id``, then (only with ``list_costs``)
+    ``by_generation`` before ``by_run``; otherwise the generation overview is printed.
+    Unknown ids and out-of-range selections print an error and raise ``SystemExit(1)``.
+    """
+    store = _load_cost_store()
+    generations = store.get("generations", {})
+    if not generations:
+        console.print("[yellow]No cost records found yet.[/yellow]")
+        return
+
+    if generation_id:
+        generation = generations.get(generation_id)
+        if not isinstance(generation, dict):
+            click.secho(f"ERROR: Generation not found: {generation_id}", fg="red", err=True)
+            raise SystemExit(1)
+        _print_generation_detail(generation_id, generation)
+        return
+
+    if run_id:
+        for gid, gen in generations.items():
+            runs = gen.get("runs", {}) if isinstance(gen, dict) else {}
+            if isinstance(runs, dict) and run_id in runs:
+                _print_run_detail(gid, {"run_id": run_id, **runs[run_id]})
+                return
+        click.secho(f"ERROR: Run not found: {run_id}", fg="red", err=True)
+        raise SystemExit(1)
+
+    if not (list_costs and (by_generation or by_run)):
+        _print_generation_table(store)
+        return
+
+    if by_generation:
+        ids = _print_generation_table(store)
+        if not ids:
+            return
+        choice = click.prompt("Select generation number", type=int)
+        if not 1 <= choice <= len(ids):
+            click.secho("ERROR: Invalid selection", fg="red", err=True)
+            raise SystemExit(1)
+        selected = ids[choice - 1]
+        _print_generation_detail(selected, generations[selected])
+        return
+
+    rows = _flatten_runs(store)
+    if not rows:
+        console.print("[yellow]No runs found.[/yellow]")
+        return
+
+    table = Table(title="Runs", box=box.ROUNDED)
+    table.add_column("#", style="cyan", no_wrap=True)
+    table.add_column("Run ID", style="white")
+    table.add_column("Generation", style="dim")
+    table.add_column("Function", style="cyan")
+    table.add_column("Status")
+    table.add_column("USD", justify="right", style="green")
+    for number, (gid, _, run) in enumerate(rows, start=1):
+        usd = float(run.get("totals", {}).get("usd", 0.0) or 0.0)
+        table.add_row(
+            str(number),
+            str(run.get("run_id", "-")),
+            gid,
+            str(run.get("func_name", "-")),
+            str(run.get("status", "-")),
+            f"{usd:.6f}",
+        )
+    console.print(table)
+
+    choice = click.prompt("Select run number", type=int)
+    if not 1 <= choice <= len(rows):
+        click.secho("ERROR: Invalid selection", fg="red", err=True)
+        raise SystemExit(1)
+    gid, _, run = rows[choice - 1]
+    _print_run_detail(gid, run)
 
 
 def _eval_type_label(case) -> str:
@@ -282,6 +515,21 @@ def validate_coverage(ctx, param, value):
     is_flag=True,
     help="With 'vowel schema': generate vowel-schema.json in current directory",
 )
+@click.option("--list", "list_costs", is_flag=True, help="With 'vowel costs': list records")
+@click.option(
+    "-g",
+    "--by-generation",
+    is_flag=True,
+    help="With 'vowel costs --list': browse generations interactively",
+)
+@click.option(
+    "-r",
+    "--by-run",
+    is_flag=True,
+    help="With 'vowel costs --list': browse runs interactively",
+)
+@click.option("--generation", "generation_id", help="With 'vowel costs': show generation id")
+@click.option("--run", "run_id_option", help="With 'vowel costs': show run id")
 def main(
     arg1: Path | None,
     arg2: Path | None,
@@ -302,6 +550,11 @@ def main(
     verbose: bool,
     hide_report: bool,
     schema_create: bool,
+    list_costs: bool,
+    by_generation: bool,
+    by_run: bool,
+    generation_id: str | None,
+    run_id_option: str | None,
 ):
     """vowel — YAML-based evaluation framework for Python functions."""
     console = Console(force_terminal=False, no_color=True) if no_color else Console()
@@ -368,6 +621,17 @@ def main(
         console.print("[green]✓[/green] Pydantic validation passed")
         return
 
+    # Command mode: vowel costs [--list -g|-r] [--generation <id>] [--run <id>]
+    if arg1 is not None and str(arg1) == "costs":
+        _handle_costs_command(
+            list_costs=list_costs,
+            by_generation=by_generation,
+            by_run=by_run,
+            generation_id=generation_id,
+            run_id=run_id_option,
+        )
+        return
+
     yaml_file = arg1
 
     # Validate incompatible options
diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py
index e02f421..33269c5 100644
--- a/src/vowel/codemode.py
+++ b/src/vowel/codemode.py
@@ -20,6 +20,7 @@
 from pydantic_ai import Agent
 
 from vowel.context import EVAL_SPEC_CONTEXT
+from vowel.costs import CostManager
 from vowel.eval_types import EvalsSource
 from vowel.executor import ExecutionResult, Executor, resolve_executors
 from vowel.monitoring import enable_monitoring
@@ -178,6 +179,7 @@ def __init__(
         self,
         spec_model: str | None = None,
         exploration_model: str | None = None,
+        generation_id: str | None = None,
         default_executor: Executor | None = None,
         fallback_executor: Executor | None = None,
         additional_context: str = "",
@@ -203,6 +205,13 @@ def __init__(
         self.additional_context = additional_context
         self.min_snippets = min_snippets
         self.use_model_spec = use_model_spec
+        self.cost_manager = CostManager(
+            spec_model=self.spec_model,
+            exploration_model=self.exploration_model,
+            generation_id=generation_id,
+        )
+        self.generation_id = self.cost_manager.generation_id
+        self._active_run_id: str | None = None
         self._opts = opts
 
         # Lazy agents
@@ -213,9 +222,13 @@ def __init__(
             "CodeModeGenerator initialized",
             spec_model=self.spec_model,
             exploration_model=self.exploration_model,
+            generation_id=self.generation_id,
             executor=type(self.executor).__name__,
         )
 
+    def print_total_cost(self, run_id: str | None = None) -> None:
+        self.cost_manager.print_total_cost(run_id=run_id)  # thin delegate to the cost manager
+
     # -- Agent properties --------------------------------------------------
 
     @property
@@ -280,6 +293,13 @@ def _explorer_system_prompt(self) -> str:
 - Use the function's REAL NAME — the function source code will be prepended
   automatically at runtime.  Do NOT define the function yourself.
 - Keep each snippet focused on ONE scenario.
+- Do NOT produce duplicate snippets.  Two snippets are duplicates if they test
+    the same input shape and same behavior class.
+- For `error_snippets`, each snippet must map to a DISTINCT error mode
+    (different guard/branch, exception type, or message pattern).
+- If the function signature has no positional-only (`/`) or keyword-only (`*`)
+    limiters, prefer positional call style for multi-argument calls and avoid
+    equivalent keyword-style duplicates.
 - Do NOT guess outputs — the snippets will be executed and the real
   outputs collected automatically.
 - NEVER use try/except in your snippets.  Let exceptions propagate
@@ -309,6 +329,7 @@ async def explore(
         func: Function,
         *,
         exploration_rounds: int = 2,
+        run_id: str | None = None,
     ) -> list[SnippetResult]:
         """Generate and execute exploration snippets.
 
@@ -331,13 +352,15 @@ async def explore(
                 ):
                     # Get exploration plan (round 2+ includes prior context)
                     if round_num == 1:
-                        plan = await self._get_exploration_plan(func)
+                        plan = await self._get_exploration_plan(func, run_id=run_id)
                     else:
                         cluster_summary = self._build_cluster_summary(all_results)
                         plan = await self._get_targeted_exploration_plan(
                             func,
                             all_results,
                             cluster_summary,
+                            run_id=run_id,
+                            round_num=round_num,
                         )
                         # Early exit: if no new snippets were produced
                         if not plan.snippets and not plan.error_snippets:
@@ -484,7 +507,12 @@ def _behavior_key(r: SnippetResult) -> str:
         new_keys = {_behavior_key(r) for r in new}
         return len(new_keys - prior_keys)
 
-    async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
+    async def _get_exploration_plan(
+        self,
+        func: Function,
+        *,
+        run_id: str | None = None,
+    ) -> ExplorationPlan:
         """Request initial exploration snippets from the model."""
         with logfire.span("codemode.llm_explore", func_name=func.name, round=1):
             prompt = f"""Explore the following function by writing test snippets:
@@ -500,6 +528,13 @@ async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
 `{func.name}` — the implementation will be prepended automatically."""
 
             result = await self.explorer_agent.run(prompt)
+            if run_id:
+                self.cost_manager.record_agent_usage(
+                    run_id=run_id,
+                    step_key="exploration_round_1",
+                    result=result,
+                    model_name=self.exploration_model,
+                )
             plan = result.output
 
             logfire.info(
@@ -516,6 +551,9 @@ async def _get_targeted_exploration_plan(
         func: Function,
         prior_results: list[SnippetResult],
         cluster_summary: str,
+        *,
+        run_id: str | None = None,
+        round_num: int = 2,
     ) -> ExplorationPlan:
         """Request targeted snippets using prior execution evidence."""
         with logfire.span("codemode.llm_explore", func_name=func.name, round=2):
@@ -547,10 +585,21 @@ async def _get_targeted_exploration_plan(
 - Do NOT repeat any snippet from the "Already tried" list.
 - Produce 8–12 NEW normal snippets targeting uncovered behaviour.
 - Produce 3–5 NEW error snippets targeting untried error paths.
+- Prefer diversity over volume: no semantically duplicate cases.
+- Each new error snippet should cover a unique failure mode.
+- If signature has no `/` or `*` limiters, use positional calling style for
+    multi-argument calls and avoid keyword/positional duplicates of same scenario.
 - Same strict rules as before: no try/except, real function name,
   one scenario per snippet, last expression captured."""
 
             result = await self.explorer_agent.run(prompt)
+            if run_id:
+                self.cost_manager.record_agent_usage(
+                    run_id=run_id,
+                    step_key=f"targeted_exploration_round_{round_num}",
+                    result=result,
+                    model_name=self.exploration_model,
+                )
             plan = result.output
 
             logfire.info(
@@ -569,6 +618,9 @@ async def generate_spec(
         func: Function,
         exploration_results: list[SnippetResult],
         failure_context: str | None = None,
+        *,
+        run_id: str | None = None,
+        attempt: int = 1,
     ) -> str | EvalsBundle:
         """Generate a spec from verified exploration results.
 
@@ -632,10 +684,28 @@ async def generate_spec(
 - The top-level YAML key MUST be `{func.name}` (the function name).
 - Generate at least {max(len(exploration_results), 5)} diverse test cases.
 - Use the EXACT outputs from the execution results above.
-- You MUST generate exactly {len(error_results)} raises cases — one for
-  each RAISED result above.  The spec is invalid without them.
+- Error coverage rule: include AT LEAST one raises case for each UNIQUE
+    observed error mode (exception type + meaningfully distinct call pattern).
+- Do NOT duplicate semantically equivalent error cases. If two cases represent
+    the same failing input semantics, keep only one (prefer the one with `match`
+    when message is deterministic from observed execution).
 - Cover normal, edge, and error cases.
 - In assertions, use `input` (NOT `inputs`) for accessing input values.
+- Prefer `expected` over `assertion` whenever the exact output is known from
+    verified execution results.
+- Use `assertion` only for true invariants/properties that are not redundant
+    with exact `expected` values.
+- Do NOT use broad/trivial assertions (e.g. `output >= 0`, `output <= len(input)`)
+    when a precise expected value can be asserted.
+- Keep the dataset compact and non-redundant: no duplicate cases with the same
+    effective behavior.
+- If function signature has no positional-only (`/`) or keyword-only (`*`)
+    limiters, prefer positional style for multi-argument calls and do not include
+    both positional and keyword variants of the same scenario.
+- Stay aligned with function contract/type hints: do not add contract-irrelevant
+    cases that only test incidental duck-typing unless explicitly motivated.
+- For `raises` cases, only claim exception type/message patterns that are present
+    in observed execution results; do not invent unsupported error expectations.
 
 YAML FORMAT — STRICT RULES (violations cause parse failure):
 - NEVER use YAML tags: `!!python/tuple`, `!!python/object`, `!!binary`,
@@ -654,6 +724,13 @@ async def generate_spec(
             )
 
             result = await self.spec_agent.run(prompt)
+            if run_id:
+                self.cost_manager.record_agent_usage(
+                    run_id=run_id,
+                    step_key=f"spec_generation_attempt_{attempt}",
+                    result=result,
+                    model_name=self.spec_model,
+                )
 
             if self.use_model_spec:
                 bundle = result.output
@@ -751,6 +828,7 @@ async def generate(
         self,
         func: Function,
         *,
+        run_id: str | None = None,
         run_evals: bool = True,
         save_to_file: bool = False,
         max_refinement_rounds: int = 2,
@@ -766,149 +844,162 @@ async def generate(
         with logfire.span(
             "codemode.pipeline",
             func_name=func.name,
+            generation_id=self.generation_id,
             spec_model=self.spec_model,
             exploration_model=self.exploration_model,
             executor=type(self.executor).__name__,
         ):
+            run_id = self.cost_manager.start_run(run_id=run_id, func_name=func.name)
+            self._active_run_id = run_id
+
             t0 = time.perf_counter()
 
-            # Phase 1 — explore (once)
-            exploration_results = await self.explore(func)
+            try:
+                # Phase 1 — explore (once)
+                exploration_results = await self.explore(func, run_id=run_id)
+
+                # Phase 2–4 — generate spec + validate + refine
+                yaml_spec = ""
+                generated_bundle: EvalsBundle | None = None
+                summary: EvalSummary | None = None
+                refinement_rounds = 0
+                failure_context: str | None = None
+                total_attempts = max_refinement_rounds + 1 if run_evals else 1
+
+                for attempt in range(total_attempts):
+                    with logfire.span(
+                        "codemode.spec_attempt",
+                        attempt=attempt + 1,
+                        is_refinement=attempt > 0,
+                    ):
+                        try:
+                            bundle = await self.generate_spec(
+                                func,
+                                exploration_results,
+                                failure_context,
+                                run_id=run_id,
+                                attempt=attempt + 1,
+                            )
 
-            # Phase 2–4 — generate spec + validate + refine
-            yaml_spec = ""
-            generated_bundle: EvalsBundle | None = None
-            summary: EvalSummary | None = None
-            refinement_rounds = 0
-            failure_context: str | None = None
-            total_attempts = max_refinement_rounds + 1 if run_evals else 1
+                            if isinstance(bundle, EvalsBundle):
+                                generated_bundle = bundle
+                                yaml_spec = bundle.to_yaml()
+                            else:
+                                generated_bundle = None
+                                yaml_spec = bundle
+                        except Exception as exc:
+                            logfire.warn(
+                                "Spec generation failed on attempt {attempt}, retrying",
+                                attempt=attempt + 1,
+                                error=str(exc),
+                            )
+                            failure_context = f"Generation error: {exc}"
+                            refinement_rounds = attempt + 1
+                            continue
 
-            for attempt in range(total_attempts):
-                with logfire.span(
-                    "codemode.spec_attempt",
-                    attempt=attempt + 1,
-                    is_refinement=attempt > 0,
-                ):
-                    try:
-                        bundle = await self.generate_spec(
-                            func,
-                            exploration_results,
-                            failure_context,
-                        )
+                        if not run_evals:
+                            break
 
-                        if isinstance(bundle, EvalsBundle):
-                            generated_bundle = bundle
-                            yaml_spec = bundle.to_yaml()
-                        else:
-                            generated_bundle = None
-                            yaml_spec = bundle
-                    except Exception as exc:
-                        logfire.warn(
-                            "Spec generation failed on attempt {attempt}, retrying",
-                            attempt=attempt + 1,
-                            error=str(exc),
-                        )
-                        failure_context = f"Generation error: {exc}"
-                        refinement_rounds = attempt + 1
-                        continue
+                        # Validate: run evals with ignore_duration=True
+                        try:
+                            if generated_bundle is not None:
+                                runner = (
+                                    RunEvals.from_bundle(generated_bundle)
+                                    .with_functions({func.name: func.impl})
+                                    .ignore_duration()
+                                )
+                            else:
+                                runner = (
+                                    RunEvals.from_source(yaml_spec)
+                                    .with_functions({func.name: func.impl})
+                                    .ignore_duration()
+                                )
+                            summary = runner.run()
 
-                    if not run_evals:
-                        break
+                            logfire.info(
+                                "Attempt {attempt}: {passed}/{total} passed, coverage={coverage:.1f}%",
+                                attempt=attempt + 1,
+                                passed=summary.success_count,
+                                total=summary.total_count,
+                                failed=summary.failed_count,
+                                errors=summary.error_count,
+                                coverage=summary.coverage * 100,
+                            )
 
-                    # Validate: run evals with ignore_duration=True
+                            if summary.coverage >= min_coverage:
+                                break
+
+                            # Build failure context for next attempt
+                            failure_context = self._build_failure_context(summary)
+                            refinement_rounds = attempt + 1
+                            logfire.warn(
+                                "Coverage {coverage:.0f}% below target {target:.0f}%, refining",
+                                coverage=summary.coverage * 100,
+                                target=min_coverage * 100,
+                                attempt=attempt + 1,
+                            )
+
+                        except Exception as exc:
+                            logfire.warn(
+                                "Failed to run evals on attempt {attempt}, retrying",
+                                attempt=attempt + 1,
+                                func_name=func.name,
+                                error=str(exc),
+                            )
+                            failure_context = f"Eval run error: {exc}"
+                            refinement_rounds = attempt + 1
+                            continue
+
+                # Phase 5 — inject per-case durations
+                if inject_durations:
+                    with logfire.span("codemode.inject_durations", func_name=func.name):
+                        yaml_spec = self._inject_durations(yaml_spec, func)
+
+                # Final summary run (with durations now present, but still ignored)
+                if run_evals and summary is not None:
                     try:
                         if generated_bundle is not None:
-                            runner = (
+                            final_runner = (
                                 RunEvals.from_bundle(generated_bundle)
                                 .with_functions({func.name: func.impl})
                                 .ignore_duration()
                             )
                         else:
-                            runner = (
+                            final_runner = (
                                 RunEvals.from_source(yaml_spec)
                                 .with_functions({func.name: func.impl})
                                 .ignore_duration()
                             )
-                        summary = runner.run()
-
-                        logfire.info(
-                            "Attempt {attempt}: {passed}/{total} passed, coverage={coverage:.1f}%",
-                            attempt=attempt + 1,
-                            passed=summary.success_count,
-                            total=summary.total_count,
-                            failed=summary.failed_count,
-                            errors=summary.error_count,
-                            coverage=summary.coverage * 100,
-                        )
-
-                        if summary.coverage >= min_coverage:
-                            break
-
-                        # Build failure context for next attempt
-                        failure_context = self._build_failure_context(summary)
-                        refinement_rounds = attempt + 1
-                        logfire.warn(
-                            "Coverage {coverage:.0f}% below target {target:.0f}%, refining",
-                            coverage=summary.coverage * 100,
-                            target=min_coverage * 100,
-                            attempt=attempt + 1,
-                        )
-
-                    except Exception as exc:
-                        logfire.warn(
-                            "Failed to run evals on attempt {attempt}, retrying",
-                            attempt=attempt + 1,
-                            func_name=func.name,
-                            error=str(exc),
-                        )
-                        failure_context = f"Eval run error: {exc}"
-                        refinement_rounds = attempt + 1
-                        continue
-
-            # Phase 5 — inject per-case durations
-            if inject_durations:
-                with logfire.span("codemode.inject_durations", func_name=func.name):
-                    yaml_spec = self._inject_durations(yaml_spec, func)
-
-            # Final summary run (with durations now present, but still ignored)
-            if run_evals and summary is not None:
-                try:
-                    if generated_bundle is not None:
-                        final_runner = (
-                            RunEvals.from_bundle(generated_bundle)
-                            .with_functions({func.name: func.impl})
-                            .ignore_duration()
-                        )
-                    else:
-                        final_runner = (
-                            RunEvals.from_source(yaml_spec)
-                            .with_functions({func.name: func.impl})
-                            .ignore_duration()
-                        )
-                    summary = final_runner.run()
-                except Exception:  # noqa: BLE001
-                    pass  # keep last good summary
-
-            if save_to_file:
-                path = f"{func.name}_evals.yml"
-                spec_to_write = materialize_yaml_with_schema_header(yaml_spec)
-                with open(path, "w") as f:
-                    f.write(spec_to_write)
-                logfire.info("Saved spec to {path}", path=path)
-
-            elapsed = (time.perf_counter() - t0) * 1000
-            logfire.info(
-                "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})",
-                elapsed=elapsed,
-                func_name=func.name,
-                exploration_count=len(exploration_results),
-                refinement_rounds=refinement_rounds,
-                has_summary=summary is not None,
-            )
+                        summary = final_runner.run()
+                    except Exception:  # noqa: BLE001
+                        pass  # keep last good summary
+
+                if save_to_file:
+                    path = f"{func.name}_evals.yml"
+                    spec_to_write = materialize_yaml_with_schema_header(yaml_spec)
+                    with open(path, "w") as f:
+                        f.write(spec_to_write)
+                    logfire.info("Saved spec to {path}", path=path)
+
+                elapsed = (time.perf_counter() - t0) * 1000
+                self.cost_manager.mark_run_completed(run_id)
+                logfire.info(
+                    "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})",
+                    elapsed=elapsed,
+                    func_name=func.name,
+                    generation_id=self.generation_id,
+                    run_id=run_id,
+                    exploration_count=len(exploration_results),
+                    refinement_rounds=refinement_rounds,
+                    has_summary=summary is not None,
+                )
 
-            return CodeModeResult(
-                exploration_results=exploration_results,
-                yaml_spec=yaml_spec,
-                summary=summary,
-                refinement_rounds=refinement_rounds,
-            )
+                return CodeModeResult(
+                    exploration_results=exploration_results,
+                    yaml_spec=yaml_spec,
+                    summary=summary,
+                    refinement_rounds=refinement_rounds,
+                )
+            except Exception as exc:
+                self.cost_manager.mark_run_failed(run_id, str(exc))
+                raise
diff --git a/src/vowel/costs.py b/src/vowel/costs.py
new file mode 100644
index 0000000..72d90f7
--- /dev/null
+++ b/src/vowel/costs.py
@@ -0,0 +1,358 @@
+"""Cost tracking and persistence utilities for CodeMode runs."""
+
+from __future__ import annotations
+
+import fcntl
+import json
+import os
+import tempfile
+import uuid
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+import logfire
+import yaml
+
+
+class CostManager:
+    """Manage generation/run cost records, pricing, and persistence."""
+
+    def __init__(
+        self,
+        *,
+        spec_model: str,
+        exploration_model: str,
+        generation_id: str | None = None,  # falsy value -> a fresh id is generated
+        costs_file: Path | None = None,  # falsy value -> JSON store under ~/.vowel/codemode
+    ) -> None:
+        self.spec_model = spec_model
+        self.exploration_model = exploration_model
+        self.generation_id = generation_id or self._new_generation_id()
+        self._costs_file = (
+            costs_file or Path.home() / ".vowel" / "codemode" / "generation_costs.json"
+        )
+        self._price_table = self._load_costs_yml()  # {"models": {...}} or {} if none usable
+        self._cost_records: dict[str, Any] = self._load_cost_records()
+        self._ensure_generation_record()  # in-memory only; this method does not persist
+
+    @staticmethod
+    def _new_generation_id() -> str:
+        ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")  # UTC timestamp, second resolution
+        return f"gen_{ts}_{uuid.uuid4().hex[:8]}"  # plus 8 random hex chars from a UUID4
+
+    @staticmethod
+    def _new_run_id() -> str:
+        return f"run_{uuid.uuid4().hex}"  # "run_" followed by the 32 hex chars of a UUID4
+
+    @staticmethod
+    def _default_cost_store() -> dict[str, Any]:
+        return {"schema_version": 1, "generations": {}}  # empty store: no generations yet
+
+    def _load_cost_records(self) -> dict[str, Any]:  # read the JSON store or start empty
+        if not self._costs_file.exists():
+            return self._default_cost_store()
+        try:
+            data = json.loads(self._costs_file.read_text(encoding="utf-8"))
+        except Exception:  # best-effort: an unreadable or corrupt file starts a fresh store
+            logfire.warn("Failed to parse cost records, resetting store")
+            return self._default_cost_store()
+        # "generations" must be a mapping: records are inserted into it by key.
+        if not isinstance(data, dict) or not isinstance(data.get("generations"), dict):
+            return self._default_cost_store()
+        return data
+
+    def _ensure_generation_record(self) -> None:  # register this generation if missing
+        generations = self._cost_records.setdefault("generations", {})
+        if self.generation_id in generations:
+            return  # already present: its existing totals and runs are left untouched
+        generations[self.generation_id] = {
+            "generation_id": self.generation_id,
+            "created_at": datetime.now(UTC).isoformat(),  # timezone-aware UTC, ISO 8601
+            "spec_model": self.spec_model,
+            "exploration_model": self.exploration_model,
+            "totals": {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0},
+            "runs": {},  # filled by _ensure_run_record, keyed by run_id
+        }
+
+    @staticmethod
+    def _normalize_models(data: Any) -> dict[str, dict[str, float]]:
+        if not isinstance(data, dict):
+            return {}  # top-level value is not a mapping
+
+        models_obj = data.get("models")
+        normalized: dict[str, dict[str, float]] = {}
+
+        if isinstance(models_obj, dict):
+            items = models_obj.items()
+        elif isinstance(models_obj, list):  # list of {name: prices} mappings, flattened below
+            items = []
+            for item in models_obj:
+                if isinstance(item, dict):
+                    items.extend(item.items())
+        else:
+            items = []  # "models" is missing or of an unsupported type
+        # NOTE: float() below raises ValueError/TypeError on non-numeric price values.
+        for model_name, model_data in items:
+            if not isinstance(model_name, str) or not isinstance(model_data, dict):
+                continue  # skip entries that are not name -> mapping
+            normalized[model_name] = {  # missing or falsy prices become 0.0
+                "input_per_million": float(model_data.get("input_per_million", 0.0) or 0.0),
+                "output_per_million": float(model_data.get("output_per_million", 0.0) or 0.0),
+                "cached_input_per_million": float(
+                    model_data.get("cached_input_per_million", 0.0) or 0.0
+                ),
+            }
+
+        return normalized
+
+    def _load_costs_yml(self) -> dict[str, Any]:  # first usable costs.yml wins, else {}
+        candidates = [Path.cwd() / "costs.yml", Path(__file__).resolve().parents[2] / "costs.yml"]
+        for path in candidates:
+            if not path.exists():
+                continue
+            try:
+                data = yaml.safe_load(path.read_text(encoding="utf-8"))
+                models = self._normalize_models(data)  # float() may raise on bad prices
+            except Exception:  # unreadable file or malformed price: try the next candidate
+                continue
+            if models:
+                return {"models": models}
+        return {}
+
+    def _persist_costs_atomic(self) -> None:  # write the store via temp file + os.replace
+        self._costs_file.parent.mkdir(parents=True, exist_ok=True)
+        payload = json.dumps(self._cost_records, ensure_ascii=False, indent=2) + "\n"
+        lock_path = self._costs_file.parent / ".generation_costs.lock"
+        with open(lock_path, "a+", encoding="utf-8") as lock_file:
+            fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)  # released when lock_file closes
+            with tempfile.NamedTemporaryFile(
+                mode="w", encoding="utf-8", dir=self._costs_file.parent, delete=False
+            ) as tmp:
+                tmp_path = Path(tmp.name)
+                try:
+                    tmp.write(payload)
+                    tmp.flush()
+                    os.fsync(tmp.fileno())  # flush to disk before the rename
+                    os.replace(tmp_path, self._costs_file)
+                except BaseException:
+                    tmp_path.unlink(missing_ok=True)  # never leave a stray temp file
+                    raise
+
+    def _ensure_run_record(self, run_id: str, func_name: str) -> None:
+        generation = self._cost_records["generations"][self.generation_id]
+        runs = generation.setdefault("runs", {})
+        if run_id in runs:
+            return  # existing record is kept as-is, including its func_name
+        runs[run_id] = {
+            "run_id": run_id,
+            "func_name": func_name,
+            "created_at": datetime.now(UTC).isoformat(),  # timezone-aware UTC, ISO 8601
+            "status": "running",  # status given to every newly created record
+            "error": None,
+            "steps": {},  # presumably keyed by step_key; confirm in record_agent_usage
+            "totals": {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0},
+        }
+
+    def _get_run_record(self, run_id: str) -> dict[str, Any]:  # KeyError if run is unknown
+        return self._cost_records["generations"][self.generation_id]["runs"][run_id]
+
+    @staticmethod
+    def _run_usage_dict(usage: Any) -> dict[str, int]:
+        return {  # absent or None counters read as 0
+            "requests": int(getattr(usage, "requests", 0) or 0),
+            "input_tokens": int(getattr(usage, "input_tokens", 0) or 0),
+            "output_tokens": int(getattr(usage, "output_tokens", 0) or 0),
+            "cached_input_tokens": int(getattr(usage, "cached_input_tokens", 0) or 0),
+        }  # NOTE(review): confirm the usage object exposes "cached_input_tokens" by that name
+
+    @staticmethod
+    def _normalize_model_name(model_name: str) -> str:
+        normalized = model_name.strip()
+        if ":" in normalized:
+            normalized = normalized.split(":", 1)[1]  # drop text up to the first ":"
+        if "/" in normalized:
+            normalized = normalized.rsplit("/", 1)[1]  # keep only what follows the last "/"
+        return normalized
+
+    def _resolve_price_from_costs_yml(self, model_name: str) -> dict[str, float] | None:
+        models = self._price_table.get("models") if isinstance(self._price_table, dict) else None
+        if not isinstance(models, dict):
+            return None  # no price table was loaded
+
+        normalized = self._normalize_model_name(model_name)
+        for key in (model_name, normalized):  # exact name first, then the stripped form
+            data = models.get(key)
+            if not isinstance(data, dict):
+                continue
+            return {  # missing or falsy prices become 0.0
+                "input_per_million": float(data.get("input_per_million", 0.0) or 0.0),
+                "output_per_million": float(data.get("output_per_million", 0.0) or 0.0),
+                "cached_input_per_million": float(data.get("cached_input_per_million", 0.0) or 0.0),
+            }
+        return None  # no entry under either spelling
+
+    def _resolve_price(self, model_name: str) -> tuple[dict[str, float] | None, str, bool]:
+        normalized = self._normalize_model_name(model_name)
+
+        try:
+            import genai_prices  # type: ignore
+
+            for attr in ("get_price", "lookup_price", "resolve_price"):
+                fn = getattr(genai_prices, attr, None)
+                if callable(fn):
+                    for name in (model_name, normalized):
+                        out = fn(name)
+                        if isinstance(out, dict):
+                            return (
+                                {
+                                    "input_per_million": float(
+                                        out.get("input_per_million")
+                                        or out.get("input")
+                                        or out.get("prompt")
+                                        or 0.0
+                                    ),
+                                    "output_per_million": float(
+                                        out.get("output_per_million")
+                                        or out.get("output")
+                                        or out.get("completion")
+                                        or 0.0
+                                    ),
+                                    "cached_input_per_million": float(
+                                        out.get("cached_input_per_million")
+                                        or out.get("cached_input")
+                                        or 0.0
+                                    ),
+                                },
+                                "genai-prices",
+                                False,
+                            )
+        except Exception:
+            pass
+
+        yml_price = self._resolve_price_from_costs_yml(model_name)
+        if yml_price is not None:
+            return yml_price, "costs.yml", False
+
+        return None, "missing", True
+
+    def _estimate_step_usd(self, model_name: str, usage: dict[str, int]) -> tuple[float, str, bool]:
+        price, source, missing = self._resolve_price(model_name)
+        if price is None:
+            return 0.0, source, True
+
+        in_cost = usage["input_tokens"] / 1_000_000 * price["input_per_million"]
+        out_cost = usage["output_tokens"] / 1_000_000 * price["output_per_million"]
+        cached_cost = usage["cached_input_tokens"] / 1_000_000 * price["cached_input_per_million"]
+        return in_cost + out_cost + cached_cost, source, missing
+
+    def _recompute_totals(self) -> None:
+        generation = self._cost_records["generations"][self.generation_id]
+        g_totals = {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0}
+
+        for run in generation.get("runs", {}).values():
+            r_totals = {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0}
+            for step in run.get("steps", {}).values():
+                usages = step.get("usages", [])
+                for item in usages:
+                    usage = item.get("usage", {})
+                    r_totals["usd"] += float(item.get("usd", 0.0) or 0.0)
+                    r_totals["input_tokens"] += int(usage.get("input_tokens", 0) or 0)
+                    r_totals["output_tokens"] += int(usage.get("output_tokens", 0) or 0)
+                    r_totals["requests"] += int(usage.get("requests", 0) or 0)
+            run["totals"] = r_totals
+
+            g_totals["usd"] += r_totals["usd"]
+            g_totals["input_tokens"] += r_totals["input_tokens"]
+            g_totals["output_tokens"] += r_totals["output_tokens"]
+            g_totals["requests"] += r_totals["requests"]
+
+        generation["totals"] = g_totals
+
+    def start_run(self, *, run_id: str | None, func_name: str) -> str:
+        final_run_id = run_id or self._new_run_id()
+        self._ensure_generation_record()
+        self._ensure_run_record(final_run_id, func_name)
+        self._persist_costs_atomic()
+        return final_run_id
+
+    def record_agent_usage(
+        self, *, run_id: str, step_key: str, result: Any, model_name: str
+    ) -> None:
+        run = self._get_run_record(run_id)
+        step = run.setdefault("steps", {}).setdefault(step_key, {"usages": []})
+
+        usage_obj = result.usage() if callable(getattr(result, "usage", None)) else None
+        usage = (
+            self._run_usage_dict(usage_obj) if usage_obj is not None else self._run_usage_dict(None)
+        )
+        usd, price_source, price_missing = self._estimate_step_usd(model_name, usage)
+
+        step_item = {
+            "timestamp": datetime.now(UTC).isoformat(),
+            "agent_run_id": getattr(result, "run_id", None),
+            "model_name": model_name,
+            "usage": usage,
+            "usd": usd,
+            "price_source": price_source,
+            "price_missing": price_missing,
+        }
+        step["usages"].append(step_item)
+
+        self._recompute_totals()
+        self._persist_costs_atomic()
+
+        logfire.info(
+            "CodeMode step cost recorded",
+            generation_id=self.generation_id,
+            run_id=run_id,
+            step=step_key,
+            model_name=model_name,
+            usd=usd,
+            usage=usage,
+            price_source=price_source,
+            price_missing=price_missing,
+        )
+
+    def mark_run_completed(self, run_id: str) -> None:
+        run_rec = self._get_run_record(run_id)
+        run_rec["status"] = "completed"
+        run_rec["completed_at"] = datetime.now(UTC).isoformat()
+        self._recompute_totals()
+        self._persist_costs_atomic()
+
+    def mark_run_failed(self, run_id: str, error: str) -> None:
+        run_rec = self._get_run_record(run_id)
+        run_rec["status"] = "failed"
+        run_rec["error"] = error
+        run_rec["completed_at"] = datetime.now(UTC).isoformat()
+        self._recompute_totals()
+        self._persist_costs_atomic()
+
+    def print_total_cost(self, run_id: str | None = None) -> None:
+        generation = self._cost_records["generations"].get(self.generation_id, {})
+        if run_id is not None:
+            run = generation.get("runs", {}).get(run_id)
+            if not run:
+                print(f"run not found: {run_id}")
+                return
+            totals = run.get("totals", {})
+            print(
+                "run_cost",
+                run_id,
+                f"usd={totals.get('usd', 0.0):.6f}",
+                f"input_tokens={totals.get('input_tokens', 0)}",
+                f"output_tokens={totals.get('output_tokens', 0)}",
+                f"requests={totals.get('requests', 0)}",
+            )
+            return
+
+        totals = generation.get("totals", {})
+        print(
+            "generation_cost",
+            self.generation_id,
+            f"usd={totals.get('usd', 0.0):.6f}",
+            f"input_tokens={totals.get('input_tokens', 0)}",
+            f"output_tokens={totals.get('output_tokens', 0)}",
+            f"requests={totals.get('requests', 0)}",
+        )
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index 722dc55..f4d1b2f 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -11,7 +11,13 @@
 from pydantic import ValidationError
 from pydantic.type_adapter import TypeAdapter
 from pydantic_ai.settings import ModelSettings
-from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext, LLMJudge
+from pydantic_evals.evaluators import (
+    EvaluationReason,
+    Evaluator,
+    EvaluatorContext,
+    LLMJudge,
+    OutputConfig,  # noqa: F401
+)
 
 MONTY_AVAILABLE = bool(importlib.util.find_spec("pydantic-monty"))
 
@@ -396,6 +402,9 @@ def create_llm_judge(
     include: list[str] | None = None,
     config: dict | None = None,
 ) -> LLMJudge:
+    # Imported lazily to avoid circular import at module import time.
+    from .utils import _resolve_env_ref
+
     if config is None:
         config = {}
 
@@ -405,14 +414,8 @@ def create_llm_judge(
             "'model' must be specified in config or set JUDGE_MODEL environment variable"
         )
 
-    if model.strip().startswith("$"):
-        env_var = model.strip().lstrip("$")
-        model = os.getenv(env_var)
-        if not model:
-            raise ValueError(
-                f"Environment variable {env_var} is not set for judge model, "
-                f"set {env_var} to a valid model name."
-            )
+    model = _resolve_env_ref(model, field_name="model")
+    rubric = _resolve_env_ref(rubric, field_name="rubric")
 
     include_input = False
     include_expected_output = False
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index 1c710c8..60bab83 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -130,6 +130,24 @@ def to_yaml(self) -> str:
 }
 
 
+def _resolve_env_ref(
+    value: str, *, field_name: str, scope: Literal["judge", "model"] | str = "judge"
+) -> str:
+    """Resolve $ENV_VAR references used in YAML evaluator settings."""
+    value = value.strip()
+    if not value.startswith("$"):
+        return value
+
+    env_var = value.lstrip("$")
+    resolved = os.getenv(env_var)
+    if not resolved:
+        raise ValueError(
+            f"Environment variable {env_var} is not set for {scope} {field_name}, "
+            f"set {env_var} to a valid value."
+        )
+    return resolved
+
+
 def is_yaml_serializable_type(type_hint: Any) -> bool:
     """
     Check if a type hint represents a YAML-serializable type.
@@ -1225,6 +1243,42 @@ def _merge_programmatic_fixtures(
     return merged_fixtures, fixture_funcs
 
 
+def _resolve_eval_id_mapping(
+    mapping: Mapping[str, Any] | None,
+    eval_id: str,
+    *,
+    mapping_name: str,
+) -> Any | None:
+    """Resolve mapping entries by exact id first, then by short function name.
+
+    Supports using programmatic keys like ``{"func": fn}`` for specs that use
+    ``module.func`` eval ids.
+    """
+    if not mapping:
+        return None
+
+    if eval_id in mapping:
+        return mapping[eval_id]
+
+    short_name = eval_id.rsplit(".", 1)[-1]
+    if short_name in mapping:
+        return mapping[short_name]
+
+    # Reverse direction: when eval id is bare and mapping uses module.function.
+    if "." not in eval_id:
+        matches = [value for key, value in mapping.items() if key.rsplit(".", 1)[-1] == eval_id]
+        if len(matches) == 1:
+            return matches[0]
+        if len(matches) > 1:
+            candidates = sorted({key for key in mapping if key.rsplit(".", 1)[-1] == eval_id})
+            raise ValueError(
+                f"Ambiguous {mapping_name} mapping for '{eval_id}'. "
+                f"Provide an exact key. Candidates: {candidates}"
+            )
+
+    return None
+
+
 def _import_and_detect_class_method(
     eval_id: str,
     functions: dict[str, Callable] | None,
@@ -1238,8 +1292,9 @@ def _import_and_detect_class_method(
         - class_path: Full module.ClassName path for class methods, None otherwise
         - class_name: Class name for class methods, None otherwise
     """
-    if functions and eval_id in functions:
-        func = functions[eval_id]
+    resolved_func = _resolve_eval_id_mapping(functions, eval_id, mapping_name="function")
+    if resolved_func is not None:
+        func = resolved_func
         # Check if bound method (exclude builtin functions where __self__ is the module)
         self_obj = getattr(func, "__self__", None)
         if self_obj is not None and not isinstance(self_obj, types.ModuleType):
@@ -1491,8 +1546,10 @@ def _evaluate_single_function(
         )
 
         # Get serializers for this function if defined
-        func_schema = schema.get(eval_id)
-        func_serial_fn = serial_fn.get(eval_id)
+        func_schema = _resolve_eval_id_mapping(schema, eval_id, mapping_name="serializer schema")
+        func_serial_fn = _resolve_eval_id_mapping(
+            serial_fn, eval_id, mapping_name="serializer function"
+        )
 
         # Setup module-scoped fixtures for this eval
         module_fixtures = {}
diff --git a/tests/cassettes/llm_judge_custom_model.json b/tests/cassettes/llm_judge_custom_model.json
index 412c044..4d6c526 100644
--- a/tests/cassettes/llm_judge_custom_model.json
+++ b/tests/cassettes/llm_judge_custom_model.json
@@ -4,7 +4,7 @@
     "input_preview": "john doe",
     "result": {
       "passed": false,
-      "model": "openrouter:google/gemini-3-flash-preview"
+      "model": "openrouter:anthropic/claude-opus-4.6"
     }
   }
 }
\ No newline at end of file
diff --git a/tests/cassettes/test_generate_and_run.json b/tests/cassettes/test_generate_and_run.json
index 4ace11b..35414fa 100644
--- a/tests/cassettes/test_generate_and_run.json
+++ b/tests/cassettes/test_generate_and_run.json
@@ -3,9 +3,9 @@
     "prompt_preview": "generate_and_run",
     "model": "openrouter:google/gemini-3-flash-preview",
     "response": {
-      "yaml_spec": "double:\n  evals:\n    CorrectType:\n      type: int\n    DoubleLogic:\n      assertion: output == input * 2\n    NonNegativeIfInputNonNegative:\n      assertion: input < 0 or output >= input\n  dataset:\n  - case:\n      id: positive_integer\n      input: 5\n      expected: 10\n  - case:\n      id: zero\n      input: 0\n      expected: 0\n  - case:\n      id: negative_integer\n      input: -4\n      expected: -8\n  - case:\n      id: large_integer\n      input: 1000000\n      expected: 2000000\n  - case:\n      id: sequence_multiplication_check\n      input: 1\n      expected: 2\n  - case:\n      id: invalid_type_string\n      input: '10'\n      assertion: output == '1010'\n",
+      "yaml_spec": "double:\n  evals:\n    IsInteger:\n      type: int\n    CorrectCalculation:\n      assertion: output == input * 2\n    FastEnough:\n      duration: 0.001\n  dataset:\n  - case:\n      id: positive_integer\n      input: 10\n      expected: 20\n  - case:\n      id: negative_integer\n      input: -5\n      expected: -10\n  - case:\n      id: zero_input\n      input: 0\n      expected: 0\n  - case:\n      id: large_integer\n      input: 1000000\n      expected: 2000000\n  - case:\n      id: string_input_error\n      input: '5'\n      assertion: output == '55'\n      type: str\n",
       "was_healed": false,
-      "coverage": 0.8333333333333334
+      "coverage": 0.0
     }
   }
 }
\ No newline at end of file
diff --git a/tests/cassettes/test_generate_factorial.json b/tests/cassettes/test_generate_factorial.json
index 758b00c..094b8db 100644
--- a/tests/cassettes/test_generate_factorial.json
+++ b/tests/cassettes/test_generate_factorial.json
@@ -3,9 +3,9 @@
     "prompt_preview": "generate_function",
     "model": "openrouter:google/gemini-3-flash-preview",
     "response": {
-      "name": "calculate_factorial",
-      "description": "Calculates the factorial of a non-negative integer n using an iterative approach. Includes input validation for non-integers and negative values.",
-      "code": "def calculate_factorial(n: int) -> int:\n    \\\"\\\"\\\"\n    Calculates the factorial of a non-negative integer n.\n    \n    Args:\n        n (int): A non-negative integer.\n        \n    Returns:\n        int: The factorial of n.\n        \n    Raises:\n        ValueError: If n is negative.\n        TypeError: If n is not an integer.\n    \\\"\\\"\\\"\n    if not isinstance(n, int):\n        raise TypeError(\\\"Input must be an integer.\\\")\n    if n < 0:\n        raise ValueError(\\\"Input must be a non-negative integer.\\\")\n    \n    result = 1\n    for i in range(2, n + 1):\n        result *= i\n    return result\n"
+      "name": "factorial",
+      "description": "Calculates the factorial of a non-negative integer using an iterative approach to avoid recursion depth issues.",
+      "code": "def factorial(n: int) -> int:\n    \\\"\\\"\\\"\n    Calculates the factorial of a non-negative integer n.\n    \n    Args:\n        n: A non-negative integer.\n        \n    Returns:\n        The factorial of n.\n        \n    Raises:\n        ValueError: If n is negative.\n        TypeError: If n is not an integer.\n    \\\"\\\"\\\"\n    if not isinstance(n, int):\n        raise TypeError(\\\"Input must be an integer.\\\")\n    if n < 0:\n        raise ValueError(\\\"Factorial is not defined for negative numbers.\\\")\n    \n    result = 1\n    for i in range(2, n + 1):\n        result *= i\n    return result\n"
     }
   }
 }
\ No newline at end of file
diff --git a/tests/cassettes/test_generate_palindrome.json b/tests/cassettes/test_generate_palindrome.json
index 452654a..437686c 100644
--- a/tests/cassettes/test_generate_palindrome.json
+++ b/tests/cassettes/test_generate_palindrome.json
@@ -4,8 +4,8 @@
     "model": "openrouter:google/gemini-3-flash-preview",
     "response": {
       "name": "is_palindrome",
-      "description": "Checks if a string is a palindrome while ignoring case and spaces. Only spaces are ignored, other punctuation is preserved.",
-      "code": "def is_palindrome(text: str) -> bool:\n    \"\"\"\n    Checks if a string is a palindrome, ignoring case and spaces.\n    \n    Args:\n        text: The string to check.\n        \n    Returns:\n        True if the string is a palindrome, False otherwise.\n    \"\"\"\n    if text is None:\n        raise TypeError(\"Input must be a string\")\n        \n    # Remove spaces and convert to lowercase\n    normalized = text.replace(\" \", \"\").lower()\n    \n    # Check if string matches its reverse\n    return normalized == normalized[::-1]\n"
+      "description": "Checks if a string is a palindrome while ignoring case and whitespace.",
+      "code": "def is_palindrome(text: str) -> bool:\n    \\\"\\\"\\\"\n    Checks if a string is a palindrome, ignoring case and spaces.\n    \n    Args:\n        text (str): The string to check.\n        \n    Returns:\n        bool: True if it is a palindrome, False otherwise.\n    \\\"\\\"\\\"\n    if not isinstance(text, str):\n        return False\n        \n    # Remove spaces and convert to lowercase\n    cleaned = \\\"\\\".join(text.split()).lower()\n    \n    # Check if string matches its reverse\n    return cleaned == cleaned[::-1]\n"
     }
   }
 }
\ No newline at end of file
diff --git a/tests/cassettes/test_generate_spec_simple.json b/tests/cassettes/test_generate_spec_simple.json
index def07eb..6bbf8b9 100644
--- a/tests/cassettes/test_generate_spec_simple.json
+++ b/tests/cassettes/test_generate_spec_simple.json
@@ -3,7 +3,7 @@
     "prompt_preview": "generate_spec",
     "model": "openrouter:google/gemini-3-flash-preview",
     "response": {
-      "yaml_spec": "add_numbers:\n  evals:\n    IsInt:\n      type: int\n    IdentityProperty:\n      assertion: (input[0] == 0 and output == input[1]) or (input[1] == 0 and output\n        == input[0]) or True\n    CommutativeProperty:\n      assertion: output == input[1] + input[0]\n  dataset:\n  - case:\n      id: typical_positive\n      inputs:\n      - 10\n      - 25\n      expected: 35\n  - case:\n      id: negative_numbers\n      inputs:\n      - -5\n      - -15\n      expected: -20\n  - case:\n      id: mixed_signs\n      inputs:\n      - 100\n      - -40\n      expected: 60\n  - case:\n      id: zero_identity\n      inputs:\n      - 0\n      - 42\n      expected: 42\n  - case:\n      id: large_integers\n      inputs:\n      - 1000000\n      - 2000000\n      expected: 3000000\n  - case:\n      id: boundary_zero_sum\n      inputs:\n      - 50\n      - -50\n      expected: 0\n",
+      "yaml_spec": "add_numbers:\n  evals:\n    IsInteger:\n      type: int\n    CorrectSum:\n      assertion: output == input[0] + input[1]\n    FastExecution:\n      duration: 0.001\n  dataset:\n  - case:\n      id: positive_integers\n      inputs:\n      - 10\n      - 20\n      expected: 30\n  - case:\n      id: negative_integers\n      inputs:\n      - -5\n      - -15\n      expected: -20\n  - case:\n      id: mixed_signs\n      inputs:\n      - -10\n      - 25\n      expected: 15\n  - case:\n      id: zero_addition\n      inputs:\n      - 0\n      - 100\n      expected: 100\n  - case:\n      id: large_integers\n      inputs:\n      - 1000000\n      - 2000000\n      expected: 3000000\n  - case:\n      id: identity_property\n      inputs:\n      - 42\n      - 0\n      expected: 42\n",
       "func_name": "add_numbers"
     }
   }
diff --git a/tests/cassettes/test_generate_spec_string.json b/tests/cassettes/test_generate_spec_string.json
index 2968bb9..0a365b1 100644
--- a/tests/cassettes/test_generate_spec_string.json
+++ b/tests/cassettes/test_generate_spec_string.json
@@ -3,7 +3,7 @@
     "prompt_preview": "generate_spec",
     "model": "openrouter:google/gemini-3-flash-preview",
     "response": {
-      "yaml_spec": "reverse_string:\n  evals:\n    IsString:\n      type: str\n    ReverseProperty:\n      assertion: output[::-1] == input\n    LengthInvariant:\n      assertion: len(output) == len(input)\n  dataset:\n  - case:\n      id: typical_word\n      input: hello\n      expected: olleh\n  - case:\n      id: empty_string\n      input: ''\n      expected: ''\n  - case:\n      id: single_character\n      input: z\n      expected: z\n  - case:\n      id: palindrome\n      input: racecar\n      expected: racecar\n  - case:\n      id: strings_with_spaces\n      input: abc def\n      expected: fed cba\n  - case:\n      id: numeric_string\n      input: '12345'\n      expected: '54321'\n  - case:\n      id: special_characters\n      input: '!@#$%^&*()'\n      expected: )(*&^%$#@!\n",
+      "yaml_spec": "reverse_string:\n  evals:\n    IsString:\n      type: str\n    CorrectLength:\n      assertion: len(output) == len(input)\n    IdentityProperty:\n      assertion: output[::-1] == input\n  dataset:\n  - case:\n      id: normal_word\n      input: hello\n      expected: olleh\n  - case:\n      id: empty_string\n      input: ''\n      expected: ''\n  - case:\n      id: single_character\n      input: A\n      expected: A\n  - case:\n      id: with_spaces\n      input: nurses run\n      expected: nur sesrun\n  - case:\n      id: palindrome\n      input: racecar\n      expected: racecar\n  - case:\n      id: numeric_string\n      input: '123456789'\n      expected: '987654321'\n  - case:\n      id: special_characters\n      input: '!@#$%^&*'\n      expected: '*&^%$#@!'\n",
       "func_name": "reverse_string"
     }
   }
diff --git a/tests/test_generation.py b/tests/test_generation.py
deleted file mode 100644
index 011546c..0000000
--- a/tests/test_generation.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""Test script for EvalGenerator and GenerationResult."""
-
-from vowel import EvalGenerator, GenerationResult
-
-
-def main():
-    generator = EvalGenerator(load_env=True)
-
-    print(f"\nUsing model: {generator.model}")
-    print("\n🚀 Step 1: Generate a function from prompt\n")
-
-    func = generator.generate_function(
-        prompt="Create a function called 'is_prime' that checks if a number is prime. Return True if prime, False otherwise.",
-        async_func=False,
-    )
-
-    print(f"Generated: {func.name}")
-    func.print()
-
-    print("\n🧪 Step 2: Generate spec and run evals\n")
-
-    result: GenerationResult = generator.generate_and_run(
-        func,
-        auto_retry=True,
-        max_retries=2,
-        min_coverage=0.9,
-        heal_function=True,
-    )
-
-    result.print()
-
-    print("✅ Test completed!\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tests/test_llm_integration.py b/tests/test_llm_integration.py
index bcce51b..7e55a20 100644
--- a/tests/test_llm_integration.py
+++ b/tests/test_llm_integration.py
@@ -11,10 +11,10 @@
 
 dotenv.load_dotenv()
 
-DEFAULT_MODEL = os.getenv("MODEL_NAME", "openrouter:google/gemini-3-flash-preview")
+DEFAULT_MODEL = "openrouter:google/gemini-3-flash-preview"
 
 pytestmark = pytest.mark.skipif(
-    not os.getenv("OPENROUTER_API_KEY") and not os.getenv("OPENAI_API_KEY"),
+    not os.getenv("OPENROUTER_API_KEY"),
     reason="No API key available for LLM tests (need OPENROUTER_API_KEY or OPENAI_API_KEY)",
 )
 
diff --git a/tests/test_llm_judge_env_refs.py b/tests/test_llm_judge_env_refs.py
new file mode 100644
index 0000000..7407426
--- /dev/null
+++ b/tests/test_llm_judge_env_refs.py
@@ -0,0 +1,33 @@
+"""Tests for environment variable references in LLM Judge configuration."""
+
+import pytest
+
+from vowel.evals import create_llm_judge
+
+
+def test_create_llm_judge_resolves_rubric_and_model_env_refs(monkeypatch):
+    """Rubric and model support $ENV_VAR style references."""
+    monkeypatch.setenv("TEST_JUDGE_MODEL", "openrouter:google/gemini-2.5-flash")
+    monkeypatch.setenv("_TEST_JUDGE_RUBRIC", "Output should be concise and accurate")
+
+    judge = create_llm_judge(
+        rubric="$_TEST_JUDGE_RUBRIC",
+        include=["input"],
+        config={"model": "$TEST_JUDGE_MODEL", "temperature": 0.0},
+    )
+
+    assert judge.model == "openrouter:google/gemini-2.5-flash"
+    assert judge.rubric == "Output should be concise and accurate"
+
+
+def test_create_llm_judge_raises_when_rubric_env_ref_missing(monkeypatch):
+    """Missing rubric env var should raise a clear error."""
+    monkeypatch.setenv("TEST_JUDGE_MODEL", "openrouter:google/gemini-2.5-flash")
+    monkeypatch.delenv("_MISSING_RUBRIC", raising=False)
+
+    with pytest.raises(ValueError, match="_MISSING_RUBRIC"):
+        create_llm_judge(
+            rubric="$_MISSING_RUBRIC",
+            include=["input"],
+            config={"model": "$TEST_JUDGE_MODEL"},
+        )
diff --git a/tests/test_run_evals.py b/tests/test_run_evals.py
index 0659958..4ff09e5 100644
--- a/tests/test_run_evals.py
+++ b/tests/test_run_evals.py
@@ -147,6 +147,24 @@ def test_with_functions_chained(self, simple_yaml_spec: str):
 
         assert summary.all_passed
 
+    def test_with_functions_short_name_matches_module_function_spec(self):
+        """module.function eval ids should match short-name keys from with_functions."""
+
+        def add(a, b):
+            return a + b
+
+        spec = {
+            "pkg.add": {
+                "dataset": [
+                    {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}},
+                ]
+            }
+        }
+
+        summary = RunEvals.from_dict(spec).with_functions({"add": add}).run()
+
+        assert summary.all_passed
+
     def test_with_executor_preserves_existing_run_behavior(self, simple_yaml_spec: str):
         """Executor preferences should be accepted without changing normal eval behavior."""
         summary = (
diff --git a/tests/test_serializer.py b/tests/test_serializer.py
index b38bc56..1ed83ec 100644
--- a/tests/test_serializer.py
+++ b/tests/test_serializer.py
@@ -198,6 +198,28 @@ def test_multiple_cases(self):
         assert summary.all_passed
         assert summary.total_count == 1
 
+    def test_serializer_short_name_matches_module_function_spec(self):
+        """Serializer mapping by short name should work for module.function eval ids."""
+        spec = {
+            "pkg.get_user_info": {
+                "dataset": [
+                    {
+                        "case": {
+                            "input": {"id": 1, "name": "Alice", "email": "a@a.com"},
+                            "expected": "User Alice has email a@a.com",
+                        }
+                    },
+                ]
+            }
+        }
+        summary = (
+            RunEvals.from_dict(spec)
+            .with_functions({"get_user_info": get_user_info})
+            .with_serializer({"get_user_info": User})
+            .run()
+        )
+        assert summary.all_passed
+
 
 class TestSerialFn:
     """Tests for serial_fn-based serialization."""

From bd73afdc9f2981570954271c7850ea5dec8bd3b4 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 19:42:00 +0300
Subject: [PATCH 5/8] bump version 0.3.5 -> 0.4.0

---
 .env.sample                                 |  14 +-
 .gitignore                                  |   6 +-
 .gitmodules                                 |   3 +
 CHANGELOG.md                                | 198 +++++
 README.md                                   |  46 +-
 VERSION                                     |   2 +-
 costs.yml                                   | 409 +++++++++++
 db_fixture.yml                              |  32 -
 docs/CLI.md                                 |  41 ++
 docs/CODEMODE.md                            | 164 +++++
 docs/FIXTURES.md                            |  17 +-
 docs/README.md                              |   2 +
 docs/SERIALIZERS.md                         |  47 ++
 docs/YAML_SPEC.md                           |  20 +-
 examples/basic_usage/evals.yml              |  20 +
 db.py => examples/db_fixtures/db.py         |   4 +-
 examples/evals/builtins.yml                 |  42 +-
 examples/evals/math.yml                     |  24 +-
 examples/evals/strings.yml                  |  33 +-
 examples/evals/validation.yml               |  32 +-
 examples/evaluators/evals.yml               |  28 +-
 examples/fixtures/evals.yml                 |   5 +
 examples/fluent_api/evals.yml               |  26 +-
 examples/serializers/__init__.py            |   1 +
 examples/serializers/app.py                 |  18 +
 examples/serializers/db_query_evals.yml     |  53 ++
 examples/serializers/defn.py                |  41 ++
 examples/serializers/fixtures.py            |  39 +
 examples/serializers/util.py                |   0
 pyproject.toml                              |   2 +-
 quality-judge/evals.py                      |  33 +
 quality-judge/largestPathValue_evals.yml    | 762 ++++++++++++++++++++
 quality-judge/largest_color_value_judge.yml | 330 +++++++++
 quality-judge/runner.py                     |  22 +
 src/vowel/eval_types.py                     |  57 +-
 src/vowel/evals.py                          | 103 ++-
 src/vowel/schema.py                         |  19 +-
 src/vowel/utils.py                          | 178 ++++-
 tests/test_run_evals.py                     |  75 ++
 tests/test_schema.py                        |  38 +
 tests/test_serializer.py                    | 177 +++++
 tests/test_yaml_loading.py                  |  37 +
 vowel-schema.json                           |  73 ++
 43 files changed, 3105 insertions(+), 168 deletions(-)
 create mode 100644 CHANGELOG.md
 create mode 100644 costs.yml
 delete mode 100644 db_fixture.yml
 create mode 100644 docs/CODEMODE.md
 rename db.py => examples/db_fixtures/db.py (95%)
 create mode 100644 examples/serializers/__init__.py
 create mode 100644 examples/serializers/app.py
 create mode 100644 examples/serializers/db_query_evals.yml
 create mode 100644 examples/serializers/defn.py
 create mode 100644 examples/serializers/fixtures.py
 create mode 100644 examples/serializers/util.py
 create mode 100644 quality-judge/evals.py
 create mode 100644 quality-judge/largestPathValue_evals.yml
 create mode 100644 quality-judge/largest_color_value_judge.yml
 create mode 100644 quality-judge/runner.py
 create mode 100644 tests/test_schema.py

diff --git a/.env.sample b/.env.sample
index 8f8c9d7..2d59281 100644
--- a/.env.sample
+++ b/.env.sample
@@ -11,4 +11,16 @@ LOGFIRE_ENABLED=false
 JUDGE_MODEL=openrouter:google/gemini-3-flash-preview
 
 # Default model used by Agents
-MODEL_NAME=openrouter:google/gemini-3-flash-preview
\ No newline at end of file
+MODEL_NAME=openrouter:google/gemini-3-flash-preview
+
+# Default spec & exploration models used by CodeMode pipeline
+# Spec agent generates tests
+# Exploration agent generates snippets to discover behaviors (code-execution)
+SPEC_MODEL=openrouter:anthropic/claude-opus-4.6
+EXPLORATION_MODEL=openrouter:anthropic/claude-sonnet-4.6
+
+# Default spec & exploration models used by CodeMode benchmark pipeline
+# NOTE: Models should be comma-separated; the number of spec models must equal the number of exploration models
+# spec[i] will be mapped to exploration[i] (Case N)
+BENCHMARK_SPEC_MODELS=openrouter:anthropic/claude-opus-4.6
+BENCHMARK_EXPLORATION_MODELS=openrouter:anthropic/claude-sonnet-4.6
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index dc1809e..8b40711 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,8 +72,4 @@ docs/FIXTURE_GENERATION_RFC.md
 
 # Benchmarks
 benchmark*
-parse_cron_evals.yml
-
-# Known Models with Costs
-costs.yml
-db_fixture_serializers.yml
+important-links.md
diff --git a/.gitmodules b/.gitmodules
index e0064d4..1b9806f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "skills/vowel-core"]
 	path = skills/vowel-core
 	url = https://github.com/fswair/vowel-core.git
+[submodule "codemode-benchmark"]
+	path = codemode-benchmark
+	url = https://github.com/fswair/codemode-benchmark
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..1c21190
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,198 @@
+# CHANGELOG
+
+## codemode_driven_generation
+
+This document summarizes the main features added or improved on this branch.
+
+## 1) Executor and ExecutionSession protocols
+
+- The code execution interface was formalized using Protocols.
+- The Executor async/sync API was standardized:
+  - execute(...)
+  - execute_sync(...)
+  - create_session(...)
+- ExecutionSession now compiles/executes setup code once and supports multi-snippet feed execution.
+- This reduces repeated parse/compile overhead while exploring the same function.
+- The run_sync helper was hardened for running-loop environments via nest-asyncio.
+
+## 2) MontyExecutor, DefaultExecutor, MontySession, FallbackSession structures
+
+- MontyExecutor was added:
+  - sandboxed execution via pydantic-monty,
+  - ResourceLimits support (timeout/memory),
+  - stdout capture and normalized error typing/messages,
+- DefaultExecutor was added/improved:
+  - pure Python exec-based fallback execution,
+  - last-expression capture (__result__) and stdout capture.
+- MontyReplSession (MontySession role) was added:
+  - one-time setup load, reusable feed-run model.
+- FallbackSession was added:
+  - Session-level fallback: if Monty session initialization fails, switch entirely to DefaultSession.
+  - Snippet-level fallback: if Monty returns ModuleNotFoundError for a snippet, rerun that snippet via fallback executor.
+- Executor/fallback wiring was simplified through resolve_executors.
+
+## 3) Main implementation: CodeModeGenerator
+
+- Two-phase exploration-guided generation flow:
+  - Phase 1: behavior exploration (exploration snippets + error snippets)
+  - Phase 2: spec generation from verified observations
+- Lazy Agent architecture:
+  - explorer_agent (ExplorationPlan)
+  - spec_agent (EvalsSource or EvalsBundle)
+- Prompt layers were clearly separated:
+  - exploration prompt: coverage, diversity, duplicate prevention
+  - spec prompt: expected values from verified outputs only
+- A refinement loop was added:
+  - generate -> run -> failure_context -> regenerate
+- Optional duration injection and a final summary run were added at the end.
+
+## 4) Runtime hierarchy and utility usage
+
+CodeMode hierarchy:
+
+1. explore()
+2. generate_spec()
+3. validate_and_fix_spec()
+4. validate_expected_values()
+5. inject_missing_error_cases()
+6. inject_durations() (optional)
+7. validation/refinement with RunEvals
+
+Utilities used:
+
+- build_call_code
+- build_failure_context
+- validate_and_fix_spec
+- validate_expected_values
+- inject_missing_error_cases
+- inject_durations
+
+## 5) Cost Manager
+
+- Generation/run cost tracking was added for CodeMode.
+- Features:
+  - generation_id and run_id lifecycle management,
+  - step-level usage/cost recording,
+  - model price resolution (genai-prices or costs.yml),
+  - atomic/locked JSON persistence,
+  - generation-level and run-level totals,
+  - status tracking: running/completed/failed.
+- The CLI costs command now supports list/by-generation/by-run views.
+
+## 6) Serializer syntax and YAML-native serializer registry
+
+- Top-level serializers registry support was added at EvalsFile level.
+- Per-eval serializer references are now supported via serializer:.
+- SerializerSpec was clarified with one-of behavior:
+  - schema (string or dict)
+  - serializer (callable import path)
+  - not both at the same time.
+- Runtime resolver additions:
+  - import-path resolution,
+  - cached imports (_import_path_cached),
+  - per-eval resolution (_resolve_yaml_serializer_entry).
+- Precedence between programmatic serializer maps and YAML serializer registry was defined.
+
+## 7) Spec model / Exploration model separation
+
+- Model separation in CodeModeGenerator constructor was formalized:
+  - spec_model
+  - exploration_model
+- use_model_spec output mode was clarified:
+  - use_model_spec=True: structured output mode (schema/model output via EvalsBundle)
+  - use_model_spec=False: YAML string output mode (via EvalsSource.yaml_spec)
+- HIGHLY RECOMMENDED TO KEEP use_model_spec=False.
+- Model resolution order and env fallback logic were added.
+- Cost tracking now supports separate model usage across separate steps.
+
+## 8) Adding executor/fallback executor to utilities
+
+- Utility flows were updated to accept executor and fallback executor parameters.
+- Monty -> Default fallback behavior was generalized in execution-aware paths.
+- Executor behavior was centralized across run_evals and validation stages.
+
+## 9) YAML schema generator
+
+- Runtime-model-driven schema generation was improved:
+  - supports top-level fixtures + serializers,
+  - preserves function-level EvalsMapValue behavior.
+- Schema cache strategy was updated:
+  - content-hash-based filename (reduces stale editor cache issues).
+- File header updates are handled safely via materialize_yaml_with_schema_header.
+
+## 10) CLI commands: schema, costs
+
+- vowel schema <file>:
+  - update schema header after YAML + pydantic validation
+- vowel schema --create [path]:
+  - direct schema JSON generation
+- vowel costs:
+  - --list
+  - --by-generation
+  - --by-run
+  - --generation <id>
+  - --run <id>
+
+## 11) module.function -> function alias support
+
+- Alias support was added for programmatic mapping resolution:
+  - function map
+  - serializer schema map
+  - serializer function map
+- Behavior:
+  - exact match first,
+  - short-name fallback,
+  - explicit error for ambiguous reverse short-name mapping.
+
+## 12) Feedback-guided exploration
+
+- A targeted Round-2 exploration flow was added:
+  - build cluster summaries from Round-1 results,
+  - generate snippets focused on uncovered behavior classes.
+- Duplicate/semantic repetition minimization was reinforced at prompt level.
+- Distinct failure-mode coverage was improved for error snippets.
+- Additional rounds now measure value via new-behavior counting.
+
+## 13) Assertion + serializer integration
+
+- AssertionEvaluator input context is now serializer-aware.
+- Assertions now see serialized input for schema, serial_fn, and nested/dict schema modes.
+- This behavior is covered by regression tests.
+
+## 14) LLM Judge env-ref improvements
+
+- create_llm_judge now supports $ENV_VAR resolution for rubric/model fields.
+- Missing env refs now produce clearer errors.
+
+## 15) Examples, documentation, and test coverage
+
+- A runnable native serializer + fixture example was added.
+- README and serializer docs were updated with serializer/assertion context notes.
+- Meaningful id fields were added to eval cases under examples.
+- New/updated tests include:
+  - test_schema
+  - test_llm_judge_env_refs
+  - serializer assertion regressions
+  - YAML/native serializer parsing tests
+
+## 16) Fixture scope alias support
+
+- Fixture scopes now support clearer canonical names:
+  - case
+  - eval
+  - file
+- Backward-compatible aliases are still accepted:
+  - function (alias of case)
+  - module (alias of eval)
+  - session (alias of file)
+- At parse time, canonical names are normalized to legacy internal runtime values:
+  - case -> function
+  - eval -> module
+  - file -> session
+- This keeps existing runtime lifecycle behavior unchanged while allowing more descriptive scope names in YAML.
+
+Note: The old names will be deprecated after v1.0.0.
+
+## Note
+
+This changelog is based on features observed and validated in code on this branch, without using git history.
diff --git a/README.md b/README.md
index f3ec800..4c2d792 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ pip install -e ".[all]"
 ## Quick Start
 
 > **Note:**  
-> For a deeper understanding of how vowel handles fixtures, see the examples in [`db_fixture.yml`](./db_fixture.yml) and [`db.py`](./db.py). These files demonstrate the underlying mechanics of fixture setup and usage.
+> For a deeper understanding of how vowel handles fixtures, see the examples in [`examples/db_fixtures`](./examples/db_fixtures/). These examples demonstrate the underlying mechanics of fixture setup and usage.
 
 > **Tip:**  
 > To enable YAML schema validation in your editor, place `vowel-schema.json` in your project directory.  
@@ -183,6 +183,29 @@ def query_user(user_id: int, *, db: dict) -> dict | None:
     return db["users"].get(user_id)
 ```
 
+Fixture scope aliases:
+- Preferred scope names: `case`, `eval`, `file`
+- Backward-compatible aliases: `function`, `module`, `session`
+- Normalization mapping: `case -> function`, `eval -> module`, `file -> session`
+
+Example:
+
+```yaml
+fixtures:
+  temp_data:
+    setup: myapp.make_temp_data
+    scope: case
+
+  db:
+    setup: myapp.setup_db
+    teardown: myapp.close_db
+    scope: eval
+
+  cache:
+    setup: myapp.setup_cache
+    scope: file
+```
+
 > **Full reference:** [docs/FIXTURES.md](https://github.com/fswair/vowel/blob/main/docs/FIXTURES.md)
 
 ### Input Serializers
@@ -200,6 +223,24 @@ summary = (
 
 > **Serializer key matching:** Serializer mappings follow the same rule as `.with_functions(...)` — both `module.function` and short `function` keys are accepted.
 
+> **Assertion context and serializers:** When a serializer is configured, assertion evaluators use the serialized value for `input` (not raw YAML). This applies to schema mode, `serial_fn`, and nested/dict schemas.
+
+Runnable example (YAML-native serializers + fixtures):
+
+```bash
+vowel examples/serializers/db_query_evals.yml
+```
+
+This example demonstrates:
+- top-level `serializers:` registry with both `schema` and `serializer` entries,
+- per-eval `serializer:` references,
+- fixture class lifecycle wiring with `cls` + `teardown`,
+- assertion checks that read serialized `input` values.
+
+See:
+- `examples/serializers/db_query_evals.yml`
+- `examples/serializers/util.py`
+
 > **Full reference:** [docs/SERIALIZERS.md](https://github.com/fswair/vowel/blob/main/docs/SERIALIZERS.md)
 
 ### AI-Powered Generation
@@ -263,6 +304,9 @@ vowel evals.yml --dry-run                # Show plan without running
 vowel evals.yml --export-json out.json   # Export results
 vowel evals.yml -v                       # Verbose summary
 vowel evals.yml -v --hide-report         # Verbose, hide pydantic_evals report
+vowel schema examples/serializers/db_query_evals.yml   # Validate + update schema header
+vowel schema --create                                   # Generate vowel-schema.json
+vowel costs --list                                      # List tracked generation/run costs
 ```
 
 > **Full reference:** [docs/CLI.md](https://github.com/fswair/vowel/blob/main/docs/CLI.md)
diff --git a/VERSION b/VERSION
index 09e9157..60a2d3e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.3.5
\ No newline at end of file
+0.4.0
\ No newline at end of file
diff --git a/costs.yml b/costs.yml
new file mode 100644
index 0000000..6b02e8a
--- /dev/null
+++ b/costs.yml
@@ -0,0 +1,409 @@
+models:
+- amazon-nova-micro:
+    cached_input_per_million: null
+    input_per_million: 0.035
+    output_per_million: 0.14
+- amazon-nova-lite:
+    cached_input_per_million: null
+    input_per_million: 0.06
+    output_per_million: 0.24
+- amazon-nova-pro:
+    cached_input_per_million: null
+    input_per_million: 0.8
+    output_per_million: 3.2
+- amazon-nova-premier:
+    cached_input_per_million: null
+    input_per_million: 2.5
+    output_per_million: 12.5
+- claude-3.7-sonnet:
+    cached_input_per_million: null
+    input_per_million: 3
+    output_per_million: 15
+- claude-3.5-sonnet:
+    cached_input_per_million: null
+    input_per_million: 3
+    output_per_million: 15
+- claude-3-opus:
+    cached_input_per_million: null
+    input_per_million: 15
+    output_per_million: 75
+- claude-3-haiku:
+    cached_input_per_million: null
+    input_per_million: 0.25
+    output_per_million: 1.25
+- claude-3.5-haiku:
+    cached_input_per_million: null
+    input_per_million: 0.8
+    output_per_million: 4
+- claude-4.5-haiku:
+    cached_input_per_million: null
+    input_per_million: 1
+    output_per_million: 5
+- claude-sonnet-4.5:
+    cached_input_per_million: null
+    input_per_million: 3
+    output_per_million: 15
+- claude-sonnet-4.5-200k:
+    cached_input_per_million: null
+    input_per_million: 6
+    output_per_million: 22.5
+- claude-opus-4:
+    cached_input_per_million: null
+    input_per_million: 15
+    output_per_million: 75
+- claude-opus-4-1:
+    cached_input_per_million: null
+    input_per_million: 15
+    output_per_million: 75
+- claude-opus-4-5:
+    cached_input_per_million: null
+    input_per_million: 5
+    output_per_million: 25
+- claude-opus-4.6:
+    cached_input_per_million: null
+    input_per_million: 5
+    output_per_million: 25
+- deepseek-chat:
+    cached_input_per_million: null
+    input_per_million: 0.27
+    output_per_million: 1.1
+- deepseek-reasoner:
+    cached_input_per_million: null
+    input_per_million: 0.55
+    output_per_million: 2.19
+- gemini-2.5-pro-preview-03-25:
+    cached_input_per_million: null
+    input_per_million: 1.25
+    output_per_million: 10
+- gemini-2.5-pro-preview-03-25-200k:
+    cached_input_per_million: null
+    input_per_million: 2.5
+    output_per_million: 15
+- gemini-2.0-flash-lite:
+    cached_input_per_million: null
+    input_per_million: 0.075
+    output_per_million: 0.3
+- gemini-2.0-flash:
+    cached_input_per_million: null
+    input_per_million: 0.1
+    output_per_million: 0.4
+- gemini-1.5-flash:
+    cached_input_per_million: null
+    input_per_million: 0.075
+    output_per_million: 0.3
+- gemini-1.5-flash-128k:
+    cached_input_per_million: null
+    input_per_million: 0.15
+    output_per_million: 0.6
+- gemini-1.5-flash-8b:
+    cached_input_per_million: null
+    input_per_million: 0.0375
+    output_per_million: 0.15
+- gemini-1.5-flash-8b-128k:
+    cached_input_per_million: null
+    input_per_million: 0.075
+    output_per_million: 0.3
+- gemini-1.5-pro:
+    cached_input_per_million: null
+    input_per_million: 1.25
+    output_per_million: 5
+- gemini-1.5-pro-128k:
+    cached_input_per_million: null
+    input_per_million: 2.5
+    output_per_million: 10
+- gemini-2.5-flash:
+    cached_input_per_million: 0.03
+    input_per_million: 0.3
+    output_per_million: 2.5
+- gemini-2.5-flash-lite:
+    cached_input_per_million: 0.01
+    input_per_million: 0.1
+    output_per_million: 0.4
+- gemini-2.5-flash-preview-09-2025:
+    cached_input_per_million: 0.03
+    input_per_million: 0.3
+    output_per_million: 2.5
+- gemini-2.5-pro:
+    cached_input_per_million: 0.125
+    input_per_million: 1.25
+    output_per_million: 10
+- gemini-2.5-pro-200k:
+    cached_input_per_million: 0.25
+    input_per_million: 2.5
+    output_per_million: 15
+- gemini-3-pro-preview:
+    cached_input_per_million: null
+    input_per_million: 2
+    output_per_million: 12
+- gemini-3-pro-preview-200k:
+    cached_input_per_million: null
+    input_per_million: 4
+    output_per_million: 18
+- gemini-3-flash-preview:
+    cached_input_per_million: null
+    input_per_million: 0.5
+    output_per_million: 3
+- gemini-3-1-pro-preview:
+    cached_input_per_million: null
+    input_per_million: 2
+    output_per_million: 12
+- gemini-3-1-pro-preview-200k:
+    cached_input_per_million: null
+    input_per_million: 4
+    output_per_million: 18
+- gemini-3.1-flash-lite-preview:
+    cached_input_per_million: 0.025
+    input_per_million: 0.25
+    output_per_million: 1.5
+- minimax-m2:
+    cached_input_per_million: null
+    input_per_million: 0.3
+    output_per_million: 1.2
+- pixtral-12b:
+    cached_input_per_million: null
+    input_per_million: 0.15
+    output_per_million: 0.15
+- mistral-small-latest:
+    cached_input_per_million: null
+    input_per_million: 0.1
+    output_per_million: 0.3
+- mistral-medium-2505:
+    cached_input_per_million: null
+    input_per_million: 0.4
+    output_per_million: 2
+- mistral-nemo:
+    cached_input_per_million: null
+    input_per_million: 0.15
+    output_per_million: 0.15
+- open-mistral-7b:
+    cached_input_per_million: null
+    input_per_million: 0.25
+    output_per_million: 0.25
+- open-mixtral-8x7b:
+    cached_input_per_million: null
+    input_per_million: 0.7
+    output_per_million: 0.7
+- open-mixtral-8x22b:
+    cached_input_per_million: null
+    input_per_million: 2
+    output_per_million: 6
+- mistral-large-latest:
+    cached_input_per_million: null
+    input_per_million: 2
+    output_per_million: 6
+- pixtral-large-latest:
+    cached_input_per_million: null
+    input_per_million: 2
+    output_per_million: 6
+- mistral-saba-latest:
+    cached_input_per_million: null
+    input_per_million: 0.2
+    output_per_million: 0.6
+- codestral-latest:
+    cached_input_per_million: null
+    input_per_million: 0.3
+    output_per_million: 0.9
+- ministral-8b-latest:
+    cached_input_per_million: null
+    input_per_million: 0.1
+    output_per_million: 0.1
+- ministral-3b-latest:
+    cached_input_per_million: null
+    input_per_million: 0.04
+    output_per_million: 0.04
+- magistral-medium-latest:
+    cached_input_per_million: null
+    input_per_million: 2
+    output_per_million: 5
+- kimi-k2-0905-preview:
+    cached_input_per_million: 0.15
+    input_per_million: 0.6
+    output_per_million: 2.5
+- kimi-k2-0711-preview:
+    cached_input_per_million: 0.15
+    input_per_million: 0.6
+    output_per_million: 2.5
+- kimi-k2-turbo-preview:
+    cached_input_per_million: 0.15
+    input_per_million: 1.15
+    output_per_million: 8.0
+- kimi-k2-thinking:
+    cached_input_per_million: 0.15
+    input_per_million: 0.6
+    output_per_million: 2.5
+- kimi-k2-thinking-turbo:
+    cached_input_per_million: 0.15
+    input_per_million: 1.15
+    output_per_million: 8.0
+- text-davinci-003:
+    cached_input_per_million: null
+    input_per_million: 20
+    output_per_million: 20
+- gpt-4.5:
+    cached_input_per_million: 37.5
+    input_per_million: 75
+    output_per_million: 150
+- gpt-4o:
+    cached_input_per_million: 1.25
+    input_per_million: 2.5
+    output_per_million: 10
+- gpt-4o-mini:
+    cached_input_per_million: 0.075
+    input_per_million: 0.15
+    output_per_million: 0.6
+- chatgpt-4o-latest:
+    cached_input_per_million: null
+    input_per_million: 5
+    output_per_million: 15
+- o1-preview:
+    cached_input_per_million: 7.5
+    input_per_million: 15
+    output_per_million: 60
+- o1-pro:
+    cached_input_per_million: null
+    input_per_million: 150
+    output_per_million: 600
+- o1-mini:
+    cached_input_per_million: 0.55
+    input_per_million: 1.1
+    output_per_million: 4.4
+- o3-mini:
+    cached_input_per_million: 0.55
+    input_per_million: 1.1
+    output_per_million: 4.4
+- gpt-4.1:
+    cached_input_per_million: 0.5
+    input_per_million: 2
+    output_per_million: 8
+- gpt-4.1-mini:
+    cached_input_per_million: 0.1
+    input_per_million: 0.4
+    output_per_million: 1.6
+- gpt-4.1-nano:
+    cached_input_per_million: 0.025
+    input_per_million: 0.1
+    output_per_million: 0.4
+- o3:
+    cached_input_per_million: 0.5
+    input_per_million: 10
+    output_per_million: 40
+- o4-mini:
+    cached_input_per_million: 0.275
+    input_per_million: 1.1
+    output_per_million: 4.4
+- gpt-5-nano:
+    cached_input_per_million: 0.005
+    input_per_million: 0.05
+    output_per_million: 0.4
+- gpt-5-mini:
+    cached_input_per_million: 0.025
+    input_per_million: 0.25
+    output_per_million: 2
+- gpt-5:
+    cached_input_per_million: 0.125
+    input_per_million: 1.25
+    output_per_million: 10
+- gpt-image-1:
+    cached_input_per_million: 1.25
+    input_per_million: 10
+    output_per_million: 40
+- gpt-image-1-mini:
+    cached_input_per_million: 0.2
+    input_per_million: 2
+    output_per_million: 8
+- gpt-5-pro:
+    cached_input_per_million: null
+    input_per_million: 15
+    output_per_million: 120
+- o3-pro:
+    cached_input_per_million: null
+    input_per_million: 20
+    output_per_million: 80
+- o4-mini-deep-research:
+    cached_input_per_million: 0.5
+    input_per_million: 2
+    output_per_million: 8
+- o3-deep-research:
+    cached_input_per_million: 2.5
+    input_per_million: 10
+    output_per_million: 40
+- gpt-5.1-codex-mini:
+    cached_input_per_million: 0.025
+    input_per_million: 0.25
+    output_per_million: 2.0
+- gpt-5.1-codex:
+    cached_input_per_million: 0.125
+    input_per_million: 1.25
+    output_per_million: 10.0
+- gpt-5.1:
+    cached_input_per_million: 0.125
+    input_per_million: 1.25
+    output_per_million: 10.0
+- gpt-5.2:
+    cached_input_per_million: 0.175
+    input_per_million: 1.75
+    output_per_million: 14.0
+- gpt-5.2-pro:
+    cached_input_per_million: null
+    input_per_million: 21.0
+    output_per_million: 168.0
+- gpt-5.4:
+    cached_input_per_million: 0.25
+    input_per_million: 2.5
+    output_per_million: 15.0
+- gpt-5.4-272k:
+    cached_input_per_million: 0.5
+    input_per_million: 5.0
+    output_per_million: 22.5
+- gpt-5.4-pro:
+    cached_input_per_million: null
+    input_per_million: 30.0
+    output_per_million: 180.0
+- gpt-5.4-pro-272k:
+    cached_input_per_million: null
+    input_per_million: 60.0
+    output_per_million: 270.0
+- grok-3:
+    cached_input_per_million: 0.75
+    input_per_million: 3
+    output_per_million: 15
+- grok-3-mini:
+    cached_input_per_million: 0.075
+    input_per_million: 0.3
+    output_per_million: 0.5
+- grok-4-fast:
+    cached_input_per_million: 0.05
+    input_per_million: 0.2
+    output_per_million: 0.5
+- grok-4:
+    cached_input_per_million: 0.75
+    input_per_million: 3
+    output_per_million: 15
+- grok-4-128k:
+    cached_input_per_million: 0.75
+    input_per_million: 6
+    output_per_million: 30
+- grok-4-fast:
+    cached_input_per_million: 0.05
+    input_per_million: 0.2
+    output_per_million: 0.5
+- grok-4-fast-128k:
+    cached_input_per_million: 0.05
+    input_per_million: 0.4
+    output_per_million: 1.0
+- grok-4-fast-reasoning:
+    cached_input_per_million: 0.05
+    input_per_million: 0.2
+    output_per_million: 0.5
+- grok-4-fast-reasoning-128k:
+    cached_input_per_million: 0.05
+    input_per_million: 0.4
+    output_per_million: 1.0
+- grok-code-fast-1:
+    cached_input_per_million: 0.02
+    input_per_million: 0.2
+    output_per_million: 1.5
+- claude-sonnet-4.6:
+    cached_input_per_million: null
+    input_per_million: 3
+    output_per_million: 15
\ No newline at end of file
diff --git a/db_fixture.yml b/db_fixture.yml
deleted file mode 100644
index e8be58c..0000000
--- a/db_fixture.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-# yaml-language-server: $schema=vowel-schema.json
-
-fixtures:
-  db:
-    cls: db.Connection # setup cls
-    kwargs:
-      db_path: users.db ## db.Connection(db_path="users.db")
-    teardown: db.Connection.close # teardown method (db.Connection.close())
-    scope: module # scope 
-
-db.Connection.execute:
-  fixture:
-    - db # db.execute(query) -> self is the db fixture instance
-  evals:
-    IsDict:
-      type: list[dict[str, typing.Union[str, int]]] # type assertion for output
-  dataset:
-    - case:
-        input: "SELECT * FROM users WHERE id=1" 
-        assertion: "output and isinstance(output[0], dict)"
-    - case:
-        input: "SELECT * FROM notes" # (buggy query - invalid table)
-        raises: any # if any exception is raised, the case will pass.
-        match: "no such table" ## no effect, because any exception is accepted
-    - case:
-        input: "SELECT * FROM players" # (buggy query - invalid table)
-        raises: any? # if any exception is raised/returned normally, the case will pass.
-        match: "no such table" ## no effect, because any exception or normal return is enough
-    - case:
-        input: "SELECT * FROM developers" # (buggy query - invalid table)
-        raises: NoTableError
-        match: "no such table" ## must match the exception message (case ignored)
diff --git a/docs/CLI.md b/docs/CLI.md
index 0cdd808..0be46f1 100644
--- a/docs/CLI.md
+++ b/docs/CLI.md
@@ -74,4 +74,45 @@ vowel evals.yml -v --hide-report
 
 # Hide report without verbose (still shows Overall Summary panel)
 vowel evals.yml --hide-report
+
+# Validate YAML + refresh schema header
+vowel schema evals.yml
+
+# Generate schema JSON file (default: vowel-schema.json)
+vowel schema --create
+
+# Generate schema JSON at a custom path
+vowel schema --create ./schemas/vowel-schema.json
+
+# Show tracked model costs
+vowel costs --list
+vowel costs --by-generation
+vowel costs --by-run
+vowel costs --generation <generation_id>
+vowel costs --run <run_id>
 ```
+
+---
+
+## Schema Commands
+
+Use schema commands to validate specs and keep YAML schema headers in sync.
+
+| Command | Description |
+|--------|-------------|
+| `vowel schema <file>` | Validates YAML and updates the file's schema header safely |
+| `vowel schema --create [path]` | Generates `vowel-schema.json` (or writes to custom path) |
+
+---
+
+## Cost Commands
+
+Use cost commands to inspect generation and run cost history.
+
+| Command | Description |
+|--------|-------------|
+| `vowel costs --list` | List all tracked generations and runs |
+| `vowel costs --by-generation` | Aggregate totals by generation id |
+| `vowel costs --by-run` | Aggregate totals by run id |
+| `vowel costs --generation <id>` | Show detailed rows for one generation |
+| `vowel costs --run <id>` | Show detailed rows for one run |
diff --git a/docs/CODEMODE.md b/docs/CODEMODE.md
new file mode 100644
index 0000000..91517bd
--- /dev/null
+++ b/docs/CODEMODE.md
@@ -0,0 +1,164 @@
+# CodeMode
+
+CodeMode is Vowel's exploration-guided evaluation spec generator.
+
+Instead of generating test specs from description only, CodeMode runs exploration snippets against real function code first, then generates and refines eval specs using verified outputs and observed errors.
+
+## Pipeline Overview
+
+CodeMode runs in phases:
+
+1. Explore behavior
+- Generates normal snippets and error snippets.
+- Executes snippets against the target function.
+- Collects real outputs, exceptions, and timings.
+
+2. Generate spec
+- Builds a spec prompt from verified execution results.
+- Produces either YAML text or structured bundle output.
+
+3. Validate and refine
+- Runs generated evals against the function.
+- If coverage is below target, builds failure context and retries.
+- Repeats up to max refinement rounds.
+
+4. Optional duration injection
+- Measures runtime and injects duration thresholds into cases.
+
+5. Final summary
+- Returns a CodeModeResult with exploration artifacts, final YAML spec, and optional EvalSummary.
+
+## Core API
+
+CodeMode class:
+- `vowel.codemode.CodeModeGenerator`
+
+Result type:
+- `vowel.codemode.CodeModeResult`
+
+Main entrypoint:
+- `await CodeModeGenerator.generate(...)`
+
+## Model Configuration
+
+Constructor model resolution order:
+
+- `spec_model` argument, else `SPEC_MODEL`, else fallback `model`/`MODEL_NAME`
+- `exploration_model` argument, else `EXPLORATION_MODEL`, else fallback `model`/`MODEL_NAME`
+
+Both models must resolve to non-empty values.
+
+## Output Modes (`use_model_spec`)
+
+- `use_model_spec=False` (default)
+  - Spec agent output type: `EvalsSource`
+  - Generates YAML string via `yaml_spec`
+
+- `use_model_spec=True`
+  - Spec agent output type: `EvalsBundle`
+  - Generates structured model output first, then can be converted to YAML
+
+Recommendation used in this repository's benchmark flow:
+- HIGHLY RECOMMENDED TO KEEP `use_model_spec=False`.
+
+## Minimal Example
+
+```python
+import asyncio
+
+from vowel.codemode import CodeModeGenerator
+from vowel.runner import Function
+
+func = Function(
+    name="flatten",
+    description="Recursively flatten an arbitrarily nested list.",
+    code="""
+def flatten(lst: list) -> list:
+    if not isinstance(lst, list):
+        raise TypeError(f'Expected list, got {type(lst).__name__}')
+    out = []
+    for item in lst:
+        if isinstance(item, list):
+            out.extend(flatten(item))
+        else:
+            out.append(item)
+    return out
+""",
+)
+
+async def main() -> None:
+    gen = CodeModeGenerator(
+        spec_model="openrouter:google/gemini-3-flash-preview",
+        exploration_model="openrouter:google/gemini-3.1-flash-lite-preview",
+        use_model_spec=False,
+    )
+
+    result = await gen.generate(
+        func,
+        run_evals=True,
+        max_refinement_rounds=2,
+        min_coverage=1.0,
+        inject_durations=False,
+        save_to_file=True,
+    )
+
+    print(result.yaml_spec)
+    if result.summary:
+        result.summary.print()
+
+asyncio.run(main())
+```
+
+## `generate(...)` Parameters
+
+Important flags in `CodeModeGenerator.generate`:
+
+- `run_id`: optional run identifier for cost tracking
+- `run_evals`: run generated spec after generation
+- `save_to_file`: write `<function_name>_evals.yml`
+- `max_refinement_rounds`: retry/refinement budget
+- `min_coverage`: stop threshold (default 1.0)
+- `inject_durations`: inject measured duration checks
+
+## What `CodeModeResult` Contains
+
+- `exploration_results`: snippet execution results
+- `yaml_spec`: final YAML eval spec
+- `summary`: EvalSummary when `run_evals=True`
+- `refinement_rounds`: number of refinement retries used
+
+## Benchmark Integration (`python -m codemode_benchmark`)
+
+Benchmark runner path:
+- `codemode_benchmark/run_benchmark.py`
+
+Typical usage:
+
+```bash
+python -m codemode_benchmark
+python -m codemode_benchmark --only flatten group_by
+python -m codemode_benchmark --show-config
+python -m codemode_benchmark --replay codemode_benchmark/run_20260312_181510
+```
+
+If you use the Python launcher (`py`) on your machine:
+
+```bash
+py -m codemode_benchmark
+```
+
+Benchmark runner compares model pairs (`spec_model`, `exploration_model`) across built-in scenarios and stores artifacts under `codemode_benchmark/run_<timestamp>/`.
+
+## Troubleshooting
+
+- Error: spec/exploration model not set
+  - Set constructor args or env vars (`SPEC_MODEL`, `EXPLORATION_MODEL`, `MODEL_NAME`).
+
+- Low coverage after generation
+  - Increase `max_refinement_rounds`.
+  - Provide clearer function descriptions.
+  - Check whether the function has non-deterministic behavior.
+
+- YAML parse/validation failures
+  - Keep `use_model_spec=False` for YAML-first flow in this repo.
+  - Let refinement run (`run_evals=True`) so failure context can repair issues.
diff --git a/docs/FIXTURES.md b/docs/FIXTURES.md
index da93794..5bf91ff 100644
--- a/docs/FIXTURES.md
+++ b/docs/FIXTURES.md
@@ -82,24 +82,29 @@ Fixtures support three lifecycle scopes (defined in YAML):
 
 | Scope | Behavior |
 |-------|----------|
-| `function` (default) | Setup/teardown for **each** test case |
-| `module` | Setup once per eval spec, teardown after all cases |
-| `session` | Setup once per `run_evals()` call, teardown at end |
+| `case` (preferred) / `function` (alias, default) | Setup/teardown for **each** test case |
+| `eval` (preferred) / `module` (alias) | Setup once per eval spec, teardown after all cases |
+| `file` (preferred) / `session` (alias) | Setup once per `run_evals()` call, teardown at end |
+
+Alias normalization:
+- `case -> function`
+- `eval -> module`
+- `file -> session`
 
 ```yaml
 fixtures:
   temp_file:
     setup: my_fixtures.temp_file
-    scope: function
+    scope: case
 
   db:
     setup: my_fixtures.setup_db
     teardown: my_fixtures.teardown_db
-    scope: module
+    scope: eval
 
   cache:
     setup: my_fixtures.setup_cache
-    scope: session
+    scope: file
 ```
 
 ---
diff --git a/docs/README.md b/docs/README.md
index ff567e3..c328c08 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -8,6 +8,7 @@ Welcome to the Vowel framework documentation.
 |----------|-------------|
 | [User Guide](./USERGUIDE.md) | Complete guide to using Vowel |
 | [API Reference](./API.md) | Detailed API documentation |
+| [CodeMode](./CODEMODE.md) | Exploration-guided spec generation pipeline and benchmark usage |
 
 ## Quick Links
 
@@ -16,3 +17,4 @@ Welcome to the Vowel framework documentation.
 - **Evaluators**: See [User Guide - Evaluators](./USERGUIDE.md#evaluators)
 - **RunEvals API**: See [API - RunEvals](./API.md#runevals)
 - **EvalGenerator**: See [API - EvalGenerator](./API.md#evalgenerator)
+- **CodeMode**: See [CodeMode Guide](./CODEMODE.md)
diff --git a/docs/SERIALIZERS.md b/docs/SERIALIZERS.md
index deb621e..7def43c 100644
--- a/docs/SERIALIZERS.md
+++ b/docs/SERIALIZERS.md
@@ -60,6 +60,50 @@ summary = (
 
 > Key matching note: If YAML eval ids use `module.function`, both programmatic maps accept either the exact id (`module.function`) or short name (`function`) keys in `.with_functions(...)`, `.with_serializer(...)`, and `serial_fn={...}`.
 
+> Assertion context note: When a serializer is active, assertion evaluators see the serialized `input` value (not the raw YAML payload).
+>
+> - Schema mode: `input` is the model/callable output.
+> - Serial fn mode: `input` is whatever `serial_fn` returns (single value, tuple, or dict).
+> - Dict/nested schema mode: `input` contains per-parameter serialized values.
+
+---
+
+## YAML-Native Serializer Registry
+
+You can define serializers directly in YAML and reference them per eval.
+
+```yaml
+serializers:
+  query_schema:
+    schema: examples.serializers.util.Query
+  query_serial_fn:
+    serializer: examples.serializers.util.query_from_payload
+
+examples.serializers.util.query_users:
+  serializer: query_schema
+  dataset:
+    - case:
+        input:
+          sql: "SELECT name FROM users WHERE age > ?"
+          params: [30]
+
+examples.serializers.util.query_users_custom:
+  serializer: query_serial_fn
+  dataset:
+    - case:
+        input: "SELECT COUNT(*) AS total FROM users"
+```
+
+One-of rule for each serializer registry entry:
+- use `schema` or `serializer`
+- do not define both in the same entry
+
+Runnable example:
+
+```bash
+vowel examples/serializers/db_query_evals.yml
+```
+
 ---
 
 ## Advanced Examples
@@ -91,6 +135,9 @@ summary = (
     .with_serializer({"process": {"user": User, "config": Config}})
     .run()
 )
+
+# Assertions can access serialized nested values
+# assertion: "input['user'].email.endswith('@a.com') and input['config'].timeout == 30"
 ```
 
 ### Custom Parsing Logic
diff --git a/docs/YAML_SPEC.md b/docs/YAML_SPEC.md
index f467ba6..b7ded00 100644
--- a/docs/YAML_SPEC.md
+++ b/docs/YAML_SPEC.md
@@ -10,7 +10,8 @@ fixtures:
   fixture_name:
     setup: module.setup_func      # Import path to setup function
     teardown: module.teardown_func # Import path to teardown (optional)
-    scope: function                # function | module | session
+    scope: case                    # preferred: case | eval | file
+                                   # aliases: function | module | session
     kwargs:                        # Keyword arguments for setup function (optional)
       key: value
 
@@ -116,18 +117,18 @@ fixtures:
   db:
     setup: myapp.fixtures.setup_db
     teardown: myapp.fixtures.close_db
-    scope: module          # Created once, shared across all cases
+    scope: eval            # Created once, shared across all cases
     params:
       db_name: test_db
 
   cache:
     setup: myapp.fixtures.setup_cache
-    scope: session         # Created once per run_evals call
+    scope: file            # Created once per run_evals call
 
   temp_dir:
     setup: myapp.fixtures.create_temp_dir
     teardown: myapp.fixtures.remove_temp_dir
-    scope: function        # Created fresh for each case (default)
+    scope: case            # Created fresh for each case (default)
 
 # Function depends on 'db' fixture
 query_user:
@@ -185,9 +186,14 @@ summary = (
 ```
 
 **Fixture scopes:**
-- `function` (default): Setup/teardown for **each** test case
-- `module`: Setup once per eval spec, teardown after all cases
-- `session`: Setup once per `run_evals()` call, teardown at end
+- Preferred names:
+  - `case` (default): Setup/teardown for **each** test case
+  - `eval`: Setup once per eval spec, teardown after all cases
+  - `file`: Setup once per `run_evals()` call, teardown at end
+- Backward-compatible aliases:
+  - `function` = `case`
+  - `module` = `eval`
+  - `session` = `file`
 
 > See [FIXTURES.md](./FIXTURES.md) for the complete fixture guide including Python API patterns.
 
diff --git a/examples/basic_usage/evals.yml b/examples/basic_usage/evals.yml
index a7a9d94..c184afc 100644
--- a/examples/basic_usage/evals.yml
+++ b/examples/basic_usage/evals.yml
@@ -6,9 +6,11 @@
 greet:
   dataset:
     - case:
+        id: greet_world
         input: "World"
         expected: "Hello, World!"
     - case:
+        id: greet_alice
         input: "Alice"
         expected: "Hello, Alice!"
 
@@ -16,12 +18,15 @@ greet:
 add:
   dataset:
     - case:
+        id: add_positive_pair
         inputs: { x: 1, y: 2 }
         expected: 3
     - case:
+        id: add_zero_sum
         inputs: { x: -5, y: 5 }
         expected: 0
     - case:
+        id: add_large_values
         inputs: { x: 100, y: 200 }
         expected: 300
 
@@ -29,12 +34,15 @@ add:
 multiply:
   dataset:
     - case:
+        id: multiply_basic_product
         inputs: [3, 4]
         expected: 12
     - case:
+        id: multiply_zero_factor
         inputs: [0, 999]
         expected: 0
     - case:
+        id: multiply_negative_product
         inputs: [-2, 5]
         expected: -10
 
@@ -42,15 +50,19 @@ multiply:
 factorial:
   dataset:
     - case:
+        id: factorial_zero_base
         input: 0
         expected: 1
     - case:
+        id: factorial_five
         input: 5
         expected: 120
     - case:
+        id: factorial_ten
         input: 10
         expected: 3628800
     - case:
+        id: factorial_negative_raises
         input: -1
         raises: ValueError
         match: "non-negative"
@@ -59,12 +71,15 @@ factorial:
 is_even:
   dataset:
     - case:
+        id: is_even_four
         input: 4
         expected: true
     - case:
+        id: is_even_seven
         input: 7
         expected: false
     - case:
+        id: is_even_zero
         input: 0
         expected: true
 
@@ -72,9 +87,11 @@ is_even:
 len:
   dataset:
     - case:
+        id: len_list_three
         input: [1, 2, 3]
         expected: 3
     - case:
+        id: len_string_hello
         input: "hello"
         expected: 5
 
@@ -82,14 +99,17 @@ len:
 math.sqrt:
   dataset:
     - case:
+        id: sqrt_16
         input: 16
         expected: 4.0
     - case:
+        id: sqrt_9
         input: 9
         expected: 3.0
 
 os.path.join:
   dataset:
     - case:
+        id: join_home_user
         inputs: ["/home", "user"]
         expected: "/home/user"
diff --git a/db.py b/examples/db_fixtures/db.py
similarity index 95%
rename from db.py
rename to examples/db_fixtures/db.py
index 078f78b..332aa8d 100644
--- a/db.py
+++ b/examples/db_fixtures/db.py
@@ -15,8 +15,10 @@
 
 import logfire
 
+from vowel.monitoring import enable_monitoring
+
 # enable observability (optional)
-# logfire.configure(service_name="db-fixture")
+enable_monitoring(service_name="db-fixture")
 
 
 class NoTableError(Exception):
diff --git a/examples/evals/builtins.yml b/examples/evals/builtins.yml
index 46e1146..fbf388c 100644
--- a/examples/evals/builtins.yml
+++ b/examples/evals/builtins.yml
@@ -3,53 +3,55 @@
 
 len:
   dataset:
-    - case: { input: [1, 2, 3], expected: 3 }
-    - case: { input: "hello", expected: 5 }
-    - case: { input: [], expected: 0 }
+    - case: { id: len_list_three, input: [1, 2, 3], expected: 3 }
+    - case: { id: len_string_hello, input: "hello", expected: 5 }
+    - case: { id: len_empty_list, input: [], expected: 0 }
 
 abs:
   dataset:
-    - case: { input: -7, expected: 7 }
-    - case: { input: 0, expected: 0 }
-    - case: { input: 42, expected: 42 }
+    - case: { id: abs_negative, input: -7, expected: 7 }
+    - case: { id: abs_zero, input: 0, expected: 0 }
+    - case: { id: abs_positive, input: 42, expected: 42 }
 
 sorted:
   dataset:
-    - case: { input: [3, 1, 2], expected: [1, 2, 3] }
-    - case: { input: [5, 5, 5], expected: [5, 5, 5] }
-    - case: { input: [], expected: [] }
+    - case: { id: sorted_unsorted_numbers, input: [3, 1, 2], expected: [1, 2, 3] }
+    - case: { id: sorted_all_equal, input: [5, 5, 5], expected: [5, 5, 5] }
+    - case: { id: sorted_empty_list, input: [], expected: [] }
 
 sum:
   dataset:
-    - case: { input: [1, 2, 3], expected: 6 }
-    - case: { input: [], expected: 0 }
+    - case: { id: sum_simple_list, input: [1, 2, 3], expected: 6 }
+    - case: { id: sum_empty_list, input: [], expected: 0 }
 
 min:
   dataset:
-    - case: { input: [3, 1, 2], expected: 1 }
-    - case: { input: [99], expected: 99 }
+    - case: { id: min_list_values, input: [3, 1, 2], expected: 1 }
+    - case: { id: min_singleton, input: [99], expected: 99 }
 
 max:
   dataset:
-    - case: { input: [3, 1, 2], expected: 3 }
+    - case: { id: max_list_values, input: [3, 1, 2], expected: 3 }
 
 math.sqrt:
   dataset:
-    - case: { input: 16, expected: 4.0 }
-    - case: { input: 9, expected: 3.0 }
-    - case: { input: 0, expected: 0.0 }
+    - case: { id: sqrt_16, input: 16, expected: 4.0 }
+    - case: { id: sqrt_9, input: 9, expected: 3.0 }
+    - case: { id: sqrt_0, input: 0, expected: 0.0 }
 
 math.factorial:
   dataset:
-    - case: { input: 0, expected: 1 }
-    - case: { input: 5, expected: 120 }
-    - case: { input: 10, expected: 3628800 }
+    - case: { id: factorial_0, input: 0, expected: 1 }
+    - case: { id: factorial_5, input: 5, expected: 120 }
+    - case: { id: factorial_10, input: 10, expected: 3628800 }
 
 os.path.join:
   dataset:
     - case:
+        id: join_two_parts
         inputs: ["/home", "user"]
         expected: "/home/user"
     - case:
+        id: join_three_parts
         inputs: ["/var", "log", "app.log"]
         expected: "/var/log/app.log"
diff --git a/examples/evals/math.yml b/examples/evals/math.yml
index 05c8447..d0730b7 100644
--- a/examples/evals/math.yml
+++ b/examples/evals/math.yml
@@ -6,11 +6,12 @@ examples.evals.functions.fibonacci:
     IsInt:
       type: int
   dataset:
-    - case: { input: 0, expected: 0 }
-    - case: { input: 1, expected: 1 }
-    - case: { input: 10, expected: 55 }
-    - case: { input: 20, expected: 6765 }
+    - case: { id: fib_0, input: 0, expected: 0 }
+    - case: { id: fib_1, input: 1, expected: 1 }
+    - case: { id: fib_10, input: 10, expected: 55 }
+    - case: { id: fib_20, input: 20, expected: 6765 }
     - case:
+        id: fib_negative_raises
         input: -1
         raises: ValueError
         match: "non-negative"
@@ -24,10 +25,11 @@ examples.evals.functions.calculate_bmi:
     CorrectFormula:
       assertion: "abs(output - input[0] / (input[1] ** 2)) < 0.1"
   dataset:
-    - case: { inputs: [70.0, 1.75], expected: 22.86 }
-    - case: { inputs: [85.0, 1.80], expected: 26.23 }
-    - case: { inputs: [60.0, 1.65], expected: 22.04 }
+    - case: { id: bmi_normal_weight, inputs: [70.0, 1.75], expected: 22.86 }
+    - case: { id: bmi_overweight_range, inputs: [85.0, 1.80], expected: 26.23 }
+    - case: { id: bmi_light_weight, inputs: [60.0, 1.65], expected: 22.04 }
     - case:
+        id: bmi_zero_weight_raises
         inputs: [0.0, 1.70]
         raises: ValueError
         match: "positive"
@@ -39,7 +41,7 @@ examples.evals.functions.clamp:
     WithinBounds:
       assertion: "input[1] <= output <= input[2]"
   dataset:
-    - case: { inputs: [5, 0, 10], expected: 5 }
-    - case: { inputs: [-5, 0, 10], expected: 0 }
-    - case: { inputs: [99, 0, 10], expected: 10 }
-    - case: { inputs: [0, 0, 0], expected: 0 }
+    - case: { id: clamp_within_bounds, inputs: [5, 0, 10], expected: 5 }
+    - case: { id: clamp_below_min, inputs: [-5, 0, 10], expected: 0 }
+    - case: { id: clamp_above_max, inputs: [99, 0, 10], expected: 10 }
+    - case: { id: clamp_equal_bounds, inputs: [0, 0, 0], expected: 0 }
diff --git a/examples/evals/strings.yml b/examples/evals/strings.yml
index 07fb578..8866040 100644
--- a/examples/evals/strings.yml
+++ b/examples/evals/strings.yml
@@ -6,12 +6,12 @@ examples.evals.functions.is_palindrome:
     IsBool:
       type: bool
   dataset:
-    - case: { input: "racecar", expected: true }
-    - case: { input: "hello", expected: false }
-    - case: { input: "A man a plan a canal Panama", expected: true }
-    - case: { input: "Was it a rat I saw", expected: true }
-    - case: { input: "12321", expected: true }
-    - case: { input: "", expected: true }
+    - case: { id: palindrome_racecar, input: "racecar", expected: true }
+    - case: { id: palindrome_hello_false, input: "hello", expected: false }
+    - case: { id: palindrome_phrase_panama, input: "A man a plan a canal Panama", expected: true }
+    - case: { id: palindrome_phrase_rat, input: "Was it a rat I saw", expected: true }
+    - case: { id: palindrome_numeric, input: "12321", expected: true }
+    - case: { id: palindrome_empty_string, input: "", expected: true }
 
 examples.evals.functions.count_words:
   evals:
@@ -20,10 +20,10 @@ examples.evals.functions.count_words:
     NonNegative:
       assertion: "output >= 0"
   dataset:
-    - case: { input: "Hello world from Python", expected: 4 }
-    - case: { input: "Single", expected: 1 }
-    - case: { input: "", expected: 0 }
-    - case: { input: "  spaces  ", expected: 1 }
+    - case: { id: count_words_sentence, input: "Hello world from Python", expected: 4 }
+    - case: { id: count_words_single, input: "Single", expected: 1 }
+    - case: { id: count_words_empty, input: "", expected: 0 }
+    - case: { id: count_words_trim_spaces, input: "  spaces  ", expected: 1 }
 
 examples.evals.functions.get_file_extension:
   evals:
@@ -32,11 +32,11 @@ examples.evals.functions.get_file_extension:
     LowercaseOnly:
       pattern: "^[a-z0-9]*$"
   dataset:
-    - case: { input: "document.txt", expected: "txt" }
-    - case: { input: "image.PNG", expected: "png" }
-    - case: { input: "archive.tar.gz", expected: "gz" }
-    - case: { input: "noextension", expected: "" }
-    - case: { input: "script.py", expected: "py" }
+    - case: { id: ext_txt, input: "document.txt", expected: "txt" }
+    - case: { id: ext_png_uppercase, input: "image.PNG", expected: "png" }
+    - case: { id: ext_multi_dot_gz, input: "archive.tar.gz", expected: "gz" }
+    - case: { id: ext_no_extension, input: "noextension", expected: "" }
+    - case: { id: ext_py, input: "script.py", expected: "py" }
 
 examples.evals.functions.extract_hashtags:
   evals:
@@ -46,11 +46,14 @@ examples.evals.functions.extract_hashtags:
       assertion: "all(tag.startswith('#') for tag in output) if output else True"
   dataset:
     - case:
+        id: hashtags_two_tags
         input: "Learning #python and #coding today!"
         expected: ["#python", "#coding"]
     - case:
+        id: hashtags_none
         input: "No hashtags here"
         expected: []
     - case:
+        id: hashtags_three_tags
         input: "#AI #ML #DL"
         expected: ["#AI", "#ML", "#DL"]
diff --git a/examples/evals/validation.yml b/examples/evals/validation.yml
index 6cf9dd3..6a19217 100644
--- a/examples/evals/validation.yml
+++ b/examples/evals/validation.yml
@@ -7,11 +7,11 @@ examples.evals.functions.validate_email:
       type: bool
       strict: true
   dataset:
-    - case: { input: "user@example.com", expected: true }
-    - case: { input: "invalid.email", expected: false }
-    - case: { input: "test@domain.co.uk", expected: true }
-    - case: { input: "@nodomain.com", expected: false }
-    - case: { input: "spaces @mail.com", expected: false }
+    - case: { id: email_valid_basic, input: "user@example.com", expected: true }
+    - case: { id: email_invalid_missing_at, input: "invalid.email", expected: false }
+    - case: { id: email_valid_subdomain, input: "test@domain.co.uk", expected: true }
+    - case: { id: email_invalid_missing_user, input: "@nodomain.com", expected: false }
+    - case: { id: email_invalid_with_space, input: "spaces @mail.com", expected: false }
 
 examples.evals.functions.classify_age_group:
   evals:
@@ -26,22 +26,23 @@ examples.evals.functions.classify_age_group:
         (18 <= input < 65 and output == 'adult') or\
         (input >= 65 and output == 'senior')
   dataset:
-    - case: { input: 5, expected: "child" }
-    - case: { input: 15, expected: "teenager" }
-    - case: { input: 30, expected: "adult" }
-    - case: { input: 70, expected: "senior" }
-    - case: { input: 12, expected: "child" }
-    - case: { input: 18, expected: "adult" }
-    - case: { input: 65, expected: "senior" }
+    - case: { id: age_5_child, input: 5, expected: "child" }
+    - case: { id: age_15_teenager, input: 15, expected: "teenager" }
+    - case: { id: age_30_adult, input: 30, expected: "adult" }
+    - case: { id: age_70_senior, input: 70, expected: "senior" }
+    - case: { id: age_12_child_boundary, input: 12, expected: "child" }
+    - case: { id: age_18_adult_boundary, input: 18, expected: "adult" }
+    - case: { id: age_65_senior_boundary, input: 65, expected: "senior" }
 
 examples.evals.functions.format_phone:
   evals:
     PhoneFormat:
       pattern: "^\\(\\d{3}\\) \\d{3}-\\d{4}$"
   dataset:
-    - case: { input: "5551234567", expected: "(555) 123-4567" }
-    - case: { input: "2129876543", expected: "(212) 987-6543" }
+    - case: { id: phone_valid_555, input: "5551234567", expected: "(555) 123-4567" }
+    - case: { id: phone_valid_212, input: "2129876543", expected: "(212) 987-6543" }
     - case:
+        id: phone_short_raises
         input: "123"
         raises: ValueError
         match: "10 digits"
@@ -52,11 +53,14 @@ examples.evals.functions.parse_json:
       type: dict
   dataset:
     - case:
+        id: json_valid_object
         input: '{"key": "value", "n": 42}'
         expected: { key: "value", n: 42 }
     - case:
+        id: json_invalid_returns_empty
         input: "invalid json"
         expected: {}
     - case:
+        id: json_nested_object
         input: '{"nested": {"ok": true}}'
         expected: { nested: { ok: true } }
diff --git a/examples/evaluators/evals.yml b/examples/evaluators/evals.yml
index 3b90ddc..7e85c64 100644
--- a/examples/evaluators/evals.yml
+++ b/examples/evaluators/evals.yml
@@ -11,12 +11,15 @@ validate_email:
       strict: true
   dataset:
     - case:
+        id: email_valid_user_example
         input: "user@example.com"
         expected: true
     - case:
+        id: email_invalid_missing_at
         input: "invalid.email"
         expected: false
     - case:
+        id: email_invalid_missing_user
         input: "@nodomain.com"
         expected: false
 
@@ -31,12 +34,15 @@ calculate_discount:
       assertion: "output <= input[0]"
   dataset:
     - case:
+        id: discount_20_percent
         inputs: [100.0, 20.0]
         expected: 80.0
     - case:
+        id: discount_half_price
         inputs: [50.0, 50.0]
         expected: 25.0
     - case:
+        id: discount_zero_percent
         inputs: [200.0, 0.0]
         expected: 200.0
 
@@ -48,9 +54,11 @@ format_phone:
       pattern: "^\\(\\d{3}\\) \\d{3}-\\d{4}$"
   dataset:
     - case:
+        id: phone_555_format
         input: "5551234567"
         expected: "(555) 123-4567"
     - case:
+        id: phone_212_format
         input: "2129876543"
         expected: "(212) 987-6543"
 
@@ -64,12 +72,15 @@ fibonacci:
       duration: 0.01
   dataset:
     - case:
+        id: fib_0
         input: 0
         expected: 0
     - case:
+        id: fib_10
         input: 10
         expected: 55
     - case:
+        id: fib_20
         input: 20
         expected: 6765
 
@@ -83,11 +94,13 @@ extract_hashtags:
       assertion: "all(tag.startswith('#') for tag in output) if output else True"
   dataset:
     - case:
+        id: hashtags_two
         input: "Learning #python and #coding today!"
         expected:
           - "#python"
           - "#coding"
     - case:
+        id: hashtags_none
         input: "No hashtags here"
         expected: []
 
@@ -106,12 +119,12 @@ classify_age_group:
         (18 <= input < 65 and output == 'adult') or\
         (input >= 65 and output == 'senior')
   dataset:
-    - case: { input: 5, expected: "child" }
-    - case: { input: 15, expected: "teenager" }
-    - case: { input: 30, expected: "adult" }
-    - case: { input: 70, expected: "senior" }
-    - case: { input: 12, expected: "child" }
-    - case: { input: 18, expected: "adult" }
+    - case: { id: age_5_child, input: 5, expected: "child" }
+    - case: { id: age_15_teenager, input: 15, expected: "teenager" }
+    - case: { id: age_30_adult, input: 30, expected: "adult" }
+    - case: { id: age_70_senior, input: 70, expected: "senior" }
+    - case: { id: age_12_child_boundary, input: 12, expected: "child" }
+    - case: { id: age_18_adult_boundary, input: 18, expected: "adult" }
 
 # ─── Raises (Exception Testing) ──────────────────────────────
 # Verify that specific exceptions are raised with optional message matching.
@@ -125,12 +138,15 @@ calculate_bmi:
       assertion: "abs(output - input[0] / (input[1] ** 2)) < 0.1"
   dataset:
     - case:
+        id: bmi_normal_weight
         inputs: [70.0, 1.75]
         expected: 22.86
     - case:
+        id: bmi_overweight_range
         inputs: [85.0, 1.80]
         expected: 26.23
     - case:
+        id: bmi_zero_weight_raises
         inputs: [0.0, 1.70]
         raises: ValueError
         match: "positive"
diff --git a/examples/fixtures/evals.yml b/examples/fixtures/evals.yml
index 683f9d9..952aa0a 100644
--- a/examples/fixtures/evals.yml
+++ b/examples/fixtures/evals.yml
@@ -12,9 +12,11 @@ write_and_count:
     - tmp
   dataset:
     - case:
+        id: write_count_hello_world
         input: "Hello World"
         expected: 11
     - case:
+        id: write_count_test
         input: "Test"
         expected: 4
 
@@ -23,9 +25,11 @@ count_users:
     - db
   dataset:
     - case:
+        id: count_users_alice
         input: "Alice"
         expected: 2
     - case:
+        id: count_users_bob
         input: "Bob"
         expected: 2
 
@@ -34,5 +38,6 @@ add_with_bonus:
     - config
   dataset:
     - case:
+        id: add_with_bonus_basic
         inputs: { a: 1, b: 2 }
         expected: 13
diff --git a/examples/fluent_api/evals.yml b/examples/fluent_api/evals.yml
index 64448cf..94618c6 100644
--- a/examples/fluent_api/evals.yml
+++ b/examples/fluent_api/evals.yml
@@ -5,23 +5,23 @@ double:
     IsInt:
       type: int
   dataset:
-    - case: { input: 5, expected: 10 }
-    - case: { input: 0, expected: 0 }
-    - case: { input: -4, expected: -8 }
+    - case: { id: double_positive, input: 5, expected: 10 }
+    - case: { id: double_zero, input: 0, expected: 0 }
+    - case: { id: double_negative, input: -4, expected: -8 }
 
 triple:
   dataset:
-    - case: { input: 3, expected: 9 }
-    - case: { input: -1, expected: -3 }
+    - case: { id: triple_positive, input: 3, expected: 9 }
+    - case: { id: triple_negative, input: -1, expected: -3 }
 
 reverse:
   evals:
     IsString:
       type: str
   dataset:
-    - case: { input: "hello", expected: "olleh" }
-    - case: { input: "abcba", expected: "abcba" }
-    - case: { input: "", expected: "" }
+    - case: { id: reverse_hello, input: "hello", expected: "olleh" }
+    - case: { id: reverse_palindrome, input: "abcba", expected: "abcba" }
+    - case: { id: reverse_empty, input: "", expected: "" }
 
 fizzbuzz:
   evals:
@@ -30,8 +30,8 @@ fizzbuzz:
     ValidOutput:
       pattern: "^(Fizz|Buzz|FizzBuzz|\\d+)$"
   dataset:
-    - case: { input: 1, expected: "1" }
-    - case: { input: 3, expected: "Fizz" }
-    - case: { input: 5, expected: "Buzz" }
-    - case: { input: 15, expected: "FizzBuzz" }
-    - case: { input: 7, expected: "7" }
+    - case: { id: fizzbuzz_1, input: 1, expected: "1" }
+    - case: { id: fizzbuzz_3, input: 3, expected: "Fizz" }
+    - case: { id: fizzbuzz_5, input: 5, expected: "Buzz" }
+    - case: { id: fizzbuzz_15, input: 15, expected: "FizzBuzz" }
+    - case: { id: fizzbuzz_7, input: 7, expected: "7" }
diff --git a/examples/serializers/__init__.py b/examples/serializers/__init__.py
new file mode 100644
index 0000000..8fbeb8d
--- /dev/null
+++ b/examples/serializers/__init__.py
@@ -0,0 +1 @@
+"""Native YAML serializer + fixture example package."""
diff --git a/examples/serializers/app.py b/examples/serializers/app.py
new file mode 100644
index 0000000..6b05f15
--- /dev/null
+++ b/examples/serializers/app.py
@@ -0,0 +1,18 @@
+"""Functions under test for native serializer + fixture example."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .defn import Query
+from .fixtures import DbConnection
+
+
+def query_users(query: Query, *, db: DbConnection) -> list[dict[str, Any]]:
+    """Schema mode example: input dict -> Query model via serializer schema."""
+    return db.execute_query(query)
+
+
+def query_users_custom(query: Query, *, db: DbConnection) -> list[dict[str, Any]]:
+    """serial_fn mode example: raw payload -> Query via custom serializer function."""
+    return db.execute_query(query)
diff --git a/examples/serializers/db_query_evals.yml b/examples/serializers/db_query_evals.yml
new file mode 100644
index 0000000..9cafce2
--- /dev/null
+++ b/examples/serializers/db_query_evals.yml
@@ -0,0 +1,53 @@
+# yaml-language-server: $schema=../../vowel-schema.json
+
+
+serializers:
+  query_schema:
+    schema: examples.serializers.util.Query
+  query_serial_fn:
+    serializer: examples.serializers.util.query_from_payload
+
+fixtures:
+  db:
+    cls: examples.serializers.util.DbConnection
+    kwargs:
+      db_path: ":memory:"
+    teardown: examples.serializers.util.DbConnection.close
+    scope: module
+
+examples.serializers.util.query_users:
+  fixture:
+    - db
+  serializer: query_schema
+  evals:
+    ReturnsRows:
+      type: list[dict[str, typing.Any]]
+    CheckSqlIsNotEmpty:
+      assertion: "input.sql is not None"
+  dataset:
+    - case:
+        id: by_age_threshold
+        input:
+          sql: "SELECT name FROM users WHERE age > ? ORDER BY age"
+          params: [30]
+        assertion: "output == [{'name': 'Bob'}, {'name': 'Cara'}]"
+    - case:
+        id: invalid_table_raises
+        input:
+          sql: "SELECT * FROM ghost_table"
+          params: []
+        raises: any
+
+examples.serializers.util.query_users_custom:
+  fixture:
+    - db
+  serializer: query_serial_fn
+  evals:
+    ReturnsRows:
+      type: list[dict[str, typing.Any]]
+  dataset:
+    - case:
+        id: count_users_from_text
+        input: "SELECT COUNT(*) AS total FROM users"
+        expected:
+          - {total: 3}
diff --git a/examples/serializers/defn.py b/examples/serializers/defn.py
new file mode 100644
index 0000000..b0013c3
--- /dev/null
+++ b/examples/serializers/defn.py
@@ -0,0 +1,41 @@
+"""Serializer models and helpers for the native YAML serializer example."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class Query(BaseModel):
+    """Simple SQL query payload used by example evals."""
+
+    sql: str
+    params: list[Any] = Field(default_factory=list)
+
+
+def query_from_payload(payload: dict[str, Any]) -> Query:
+    """serial_fn mode example for YAML-native serializer registry.
+
+    Accepts both:
+    - {"input": "SELECT ..."}
+    - {"input": {"sql": "SELECT ...", "params": [...]}}
+    """
+
+    value = payload.get("input")
+    if value is None:
+        value = payload.get("inputs")
+
+    if isinstance(value, str):
+        return Query(sql=value)
+
+    if isinstance(value, dict):
+        sql = value.get("sql")
+        params = value.get("params", [])
+        if not isinstance(sql, str):
+            raise ValueError("Expected 'sql' to be a string in query payload")
+        if not isinstance(params, list):
+            raise ValueError("Expected 'params' to be a list in query payload")
+        return Query(sql=sql, params=params)
+
+    raise ValueError("Unsupported query payload format")
diff --git a/examples/serializers/fixtures.py b/examples/serializers/fixtures.py
new file mode 100644
index 0000000..d917f25
--- /dev/null
+++ b/examples/serializers/fixtures.py
@@ -0,0 +1,39 @@
+"""Fixture utilities for the native YAML serializer example."""
+
+from __future__ import annotations
+
+import sqlite3
+from typing import Any
+
+from .defn import Query
+
+
+class DbConnection:
+    """Tiny sqlite fixture class used by vowel fixture injection."""
+
+    def __init__(self, db_path: str = ":memory:"):
+        # Vowel can execute cases in worker threads; allow sqlite usage across them.
+        self.conn = sqlite3.connect(db_path, check_same_thread=False)
+        self.conn.row_factory = sqlite3.Row
+        self._seed()
+
+    def _seed(self) -> None:
+        cur = self.conn.cursor()
+        cur.execute(
+            "CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY, name TEXT, age INTEGER)"
+        )
+        cur.execute("DELETE FROM users")
+        cur.executemany(
+            "INSERT INTO users (name, age) VALUES (?, ?)",
+            [("Alice", 28), ("Bob", 34), ("Cara", 41)],
+        )
+        self.conn.commit()
+
+    def execute_query(self, query: Query) -> list[dict[str, Any]]:
+        cur = self.conn.cursor()
+        cur.execute(query.sql, query.params)
+        rows = cur.fetchall()
+        return [dict(row) for row in rows]
+
+    def close(self) -> None:
+        self.conn.close()
diff --git a/examples/serializers/util.py b/examples/serializers/util.py
new file mode 100644
index 0000000..e69de29
diff --git a/pyproject.toml b/pyproject.toml
index bc13a87..09a7072 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "vowel"
-version = "0.3.5"
+version = "0.4.0"
 description = "A modular evaluation framework for testing functions with YAML-based specifications"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/quality-judge/evals.py b/quality-judge/evals.py
new file mode 100644
index 0000000..c9dd454
--- /dev/null
+++ b/quality-judge/evals.py
@@ -0,0 +1,33 @@
+import os
+import pathlib
+
+import dotenv
+
+from vowel.codemode import CodeModeGenerator
+from vowel.runner import Function
+
+dotenv.load_dotenv()
+
+SPEC_MODEL = os.getenv("SPEC_MODEL")  # None when the variable is unset
+EXPLORATION_MODEL = os.getenv("EXPLORATION_MODEL")  # None when the variable is unset
+
+generator = CodeModeGenerator(  # module-level instance shared by generate_spec()
+    spec_model=SPEC_MODEL,
+    exploration_model=EXPLORATION_MODEL,
+    generation_id="largest_color_value_judge_spec_quality",
+)
+
+
+async def generate_spec(fn: Function):
+    # Touch fn.impl first as a compile check (the code is executed in monty anyway).
+    _ = fn.impl
+    result = await generator.generate(fn, save_to_file=True)
+    print(result)
+    generator.print_total_cost()
+    return result.yaml_spec  # the generated spec is what the judge evaluates
+
+
+async def generate_spec_mock(fn: Function):
+    """Return the checked-in spec file instead of generating one; `fn` is unused."""
+    spec_path = pathlib.Path(__file__).resolve().parent / "largestPathValue_evals.yml"
+    return spec_path.read_text()
diff --git a/quality-judge/largestPathValue_evals.yml b/quality-judge/largestPathValue_evals.yml
new file mode 100644
index 0000000..b21778e
--- /dev/null
+++ b/quality-judge/largestPathValue_evals.yml
@@ -0,0 +1,762 @@
+# yaml-language-server: $schema=/Users/mert/.vowel/vowel-schema_035.json
+
+largestPathValue:
+  evals:
+    ReturnType:
+      type: int
+    ResultRange:
+      assertion: output >= -1
+  dataset:
+  - case:
+      id: example_abaca
+      inputs:
+        colors: abaca
+        edges:
+        - - 0
+          - 1
+        - - 0
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+      expected: 3
+      duration: 10.0
+  - case:
+      id: example_cycle_self_loop
+      inputs:
+        colors: a
+        edges:
+        - - 0
+          - 0
+      expected: -1
+      duration: 10.0
+  - case:
+      id: single_node_no_edges_a
+      inputs:
+        colors: a
+        edges: []
+      expected: 1
+      duration: 10.0
+  - case:
+      id: single_node_no_edges_z
+      inputs:
+        colors: z
+        edges: []
+      expected: 1
+      duration: 10.0
+  - case:
+      id: two_nodes_same_color
+      inputs:
+        colors: aa
+        edges:
+        - - 0
+          - 1
+      expected: 2
+      duration: 10.0
+  - case:
+      id: two_nodes_diff_color
+      inputs:
+        colors: ab
+        edges:
+        - - 0
+          - 1
+      expected: 1
+      duration: 10.0
+  - case:
+      id: linear_all_same_color
+      inputs:
+        colors: aaaa
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+      expected: 4
+      duration: 10.0
+  - case:
+      id: linear_alternating_colors
+      inputs:
+        colors: abab
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+      expected: 2
+      duration: 10.0
+  - case:
+      id: linear_five_same_color
+      inputs:
+        colors: aaaaa
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+      expected: 5
+      duration: 10.0
+  - case:
+      id: linear_abcba
+      inputs:
+        colors: abcba
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+      expected: 2
+      duration: 10.0
+  - case:
+      id: linear_a_then_bbbbb
+      inputs:
+        colors: abbbbb
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+        - - 4
+          - 5
+      expected: 5
+      duration: 10.0
+  - case:
+      id: linear_aba
+      inputs:
+        colors: aba
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+      expected: 2
+      duration: 10.0
+  - case:
+      id: dag_fork_aab
+      inputs:
+        colors: aab
+        edges:
+        - - 0
+          - 1
+        - - 0
+          - 2
+        - - 1
+          - 2
+      expected: 2
+      duration: 10.0
+  - case:
+      id: diamond_abba
+      inputs:
+        colors: abba
+        edges:
+        - - 0
+          - 1
+        - - 0
+          - 2
+        - - 1
+          - 3
+        - - 2
+          - 3
+      expected: 2
+      duration: 10.0
+  - case:
+      id: diamond_all_same
+      inputs:
+        colors: aaaa
+        edges:
+        - - 0
+          - 1
+        - - 0
+          - 2
+        - - 1
+          - 3
+        - - 2
+          - 3
+      expected: 3
+      duration: 10.0
+  - case:
+      id: diamond_all_distinct
+      inputs:
+        colors: hecb
+        edges:
+        - - 0
+          - 1
+        - - 0
+          - 2
+        - - 1
+          - 3
+        - - 2
+          - 3
+      expected: 1
+      duration: 10.0
+  - case:
+      id: no_edges_abc
+      inputs:
+        colors: abc
+        edges: []
+      expected: 1
+      duration: 10.0
+  - case:
+      id: no_edges_abcde
+      inputs:
+        colors: abcde
+        edges: []
+      expected: 1
+      duration: 10.0
+  - case:
+      id: two_components_aabba
+      inputs:
+        colors: aabba
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 3
+          - 4
+      expected: 2
+      duration: 10.0
+  - case:
+      id: branching_aabba
+      inputs:
+        colors: aabba
+        edges:
+        - - 0
+          - 2
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 2
+          - 4
+      expected: 2
+      duration: 10.0
+  - case:
+      id: branching_single_color_five
+      inputs:
+        colors: aaaaa
+        edges:
+        - - 0
+          - 1
+        - - 0
+          - 2
+        - - 1
+          - 3
+        - - 2
+          - 4
+      expected: 3
+      duration: 10.0
+  - case:
+      id: fan_in_sink_aaab
+      inputs:
+        colors: aaab
+        edges:
+        - - 0
+          - 3
+        - - 1
+          - 3
+        - - 2
+          - 3
+      expected: 1
+      duration: 10.0
+  - case:
+      id: fan_in_sink_aaaab
+      inputs:
+        colors: aaaab
+        edges:
+        - - 0
+          - 4
+        - - 1
+          - 4
+        - - 2
+          - 4
+        - - 3
+          - 4
+      expected: 1
+      duration: 10.0
+  - case:
+      id: cycle_three_nodes
+      inputs:
+        colors: abc
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 0
+      expected: -1
+      duration: 10.0
+  - case:
+      id: cycle_with_extra_nodes
+      inputs:
+        colors: abcd
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 1
+        - - 1
+          - 3
+      expected: -1
+      duration: 10.0
+  - case:
+      id: two_node_cycle
+      inputs:
+        colors: ab
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 0
+      expected: -1
+      duration: 10.0
+  - case:
+      id: back_edge_cycle
+      inputs:
+        colors: abcde
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+        - - 4
+          - 1
+      expected: -1
+      duration: 10.0
+  - case:
+      id: self_loop_non_first_node
+      inputs:
+        colors: abc
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 2
+      expected: -1
+      duration: 10.0
+  - case:
+      id: empty_string_no_edges
+      inputs:
+        colors: ''
+        edges: []
+      expected: 0
+      duration: 10.0
+  - case:
+      id: multi_edge_same_pair
+      inputs:
+        colors: ab
+        edges:
+        - - 0
+          - 1
+        - - 0
+          - 1
+      expected: 1
+      duration: 10.0
+  - case:
+      id: all_26_colors_chain
+      inputs:
+        colors: abcdefghijklmnopqrstuvwxyz
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+        - - 4
+          - 5
+        - - 5
+          - 6
+        - - 6
+          - 7
+        - - 7
+          - 8
+        - - 8
+          - 9
+        - - 9
+          - 10
+        - - 10
+          - 11
+        - - 11
+          - 12
+        - - 12
+          - 13
+        - - 13
+          - 14
+        - - 14
+          - 15
+        - - 15
+          - 16
+        - - 16
+          - 17
+        - - 17
+          - 18
+        - - 18
+          - 19
+        - - 19
+          - 20
+        - - 20
+          - 21
+        - - 21
+          - 22
+        - - 22
+          - 23
+        - - 23
+          - 24
+        - - 24
+          - 25
+      expected: 1
+      duration: 10.0
+  - case:
+      id: alternating_ab_chain_10
+      inputs:
+        colors: ababababab
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+        - - 4
+          - 5
+        - - 5
+          - 6
+        - - 6
+          - 7
+        - - 7
+          - 8
+        - - 8
+          - 9
+      expected: 5
+      duration: 10.0
+  - case:
+      id: all_same_color_linear_equals_n
+      inputs:
+        colors: aaaaaa
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+        - - 4
+          - 5
+      expected: 6
+      duration: 10.0
+  - case:
+      id: list_of_chars_input
+      inputs:
+        colors:
+        - a
+        - b
+        - c
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+      expected: 1
+      duration: 10.0
+  - case:
+      id: cycle_returns_minus_one
+      inputs:
+        colors: abcde
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+        - - 4
+          - 2
+      assertion: output == -1
+      duration: 10.0
+  - case:
+      id: dag_result_at_least_one
+      inputs:
+        colors: abcdef
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 3
+          - 4
+        - - 4
+          - 5
+      assertion: output >= 1
+      duration: 10.0
+  - case:
+      id: single_path_bounded_by_length
+      inputs:
+        colors: abcabc
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+        - - 4
+          - 5
+      assertion: output >= 1 and output <= 6
+      duration: 10.0
+  - case:
+      id: w_shaped_dag
+      inputs:
+        colors: aabaa
+        edges:
+        - - 0
+          - 2
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 2
+          - 4
+      assertion: output >= 1 and output <= 5
+      duration: 10.0
+  - case:
+      id: two_sources_one_sink_same_color
+      inputs:
+        colors: aaa
+        edges:
+        - - 0
+          - 2
+        - - 1
+          - 2
+      assertion: output == 2
+      duration: 10.0
+  - case:
+      id: long_path_single_color_at_ends
+      inputs:
+        colors: abcda
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+      assertion: output == 2
+      duration: 10.0
+  - case:
+      id: star_topology_center_unique
+      inputs:
+        colors: baaaa
+        edges:
+        - - 0
+          - 1
+        - - 0
+          - 2
+        - - 0
+          - 3
+        - - 0
+          - 4
+      assertion: output == 1
+      duration: 10.0
+  - case:
+      id: chain_mostly_b_with_one_a
+      inputs:
+        colors: bbbba
+        edges:
+        - - 0
+          - 1
+        - - 1
+          - 2
+        - - 2
+          - 3
+        - - 3
+          - 4
+      expected: 4
+      duration: 10.0
+  - case:
+      id: error_none_colors
+      inputs:
+        colors: null
+        edges:
+        - - 0
+          - 1
+      raises: TypeError
+      match: has no len
+  - case:
+      id: error_none_edges
+      inputs:
+        colors: abc
+        edges: null
+      raises: TypeError
+      match: not iterable
+  - case:
+      id: error_int_colors
+      inputs:
+        colors: 123
+        edges:
+        - - 0
+          - 1
+      raises: TypeError
+      match: has no len
+  - case:
+      id: error_out_of_range_edge
+      inputs:
+        colors: ab
+        edges:
+        - - 0
+          - 5
+      raises: IndexError
+      match: list index out of range
+  - case:
+      id: error_empty_colors_with_edges
+      inputs:
+        colors: ''
+        edges:
+        - - 0
+          - 1
+      raises: IndexError
+      match: list index out of range
+  - case:
+      id: error_none_in_edge_list
+      inputs:
+        colors: abc
+        edges:
+        - null
+        - - 0
+          - 1
+      raises: TypeError
+      match: cannot unpack non-iterable NoneType
+  - case:
+      id: error_int_in_edge_list
+      inputs:
+        colors: abc
+        edges:
+        - 1
+        - - 0
+          - 1
+      raises: TypeError
+      match: cannot unpack non-iterable int
+  - case:
+      id: error_uppercase_color
+      inputs:
+        colors: A
+        edges: []
+      raises: IndexError
+      match: list assignment index out of range
+  - case:
+      id: error_string_in_edge_list
+      inputs:
+        colors: abc
+        edges:
+        - ab
+        - - 0
+          - 1
+      raises: TypeError
+      match: list indices must be integers or slices
+  - case:
+      id: error_typeerror_0
+      inputs:
+      - null
+      - - - 0
+          - 1
+      raises: TypeError
+  - case:
+      id: error_typeerror_1
+      inputs:
+      - abc
+      - null
+      raises: TypeError
+  - case:
+      id: error_typeerror_2
+      inputs:
+      - 123
+      - - - 0
+          - 1
+      raises: TypeError
+  - case:
+      id: error_indexerror_3
+      inputs:
+      - ab
+      - - - 0
+          - 5
+      raises: IndexError
+  - case:
+      id: error_indexerror_4
+      inputs:
+      - ''
+      - - - 0
+          - 1
+      raises: IndexError
+  - case:
+      id: error_typeerror_5
+      inputs:
+      - abc
+      - - null
+        - - 0
+          - 1
+      raises: TypeError
+  - case:
+      id: error_typeerror_6
+      inputs:
+      - abc
+      - - 1
+        - - 0
+          - 1
+      raises: TypeError
+  - case:
+      id: error_valueerror_7
+      inputs:
+      - ab
+      - - - 0
+      raises: ValueError
+  - case:
+      id: error_indexerror_8
+      inputs:
+      - A
+      - []
+      raises: IndexError
+  - case:
+      id: error_typeerror_9
+      inputs:
+      - abc
+      - - ab
+        - - 0
+          - 1
+      raises: TypeError
\ No newline at end of file
diff --git a/quality-judge/largest_color_value_judge.yml b/quality-judge/largest_color_value_judge.yml
new file mode 100644
index 0000000..9e48340
--- /dev/null
+++ b/quality-judge/largest_color_value_judge.yml
@@ -0,0 +1,330 @@
+# yaml-language-server: $schema=/Users/mert/.vowel/vowel-schema_035.json
+
+# costs: https://gist.github.com/fswair/7631878d75d6ed18a4fe3cb9b579600f#file-terminal-txt-L1309
+
+evals.generate_spec:
+  evals:
+    EvalSpecCoversFunction:
+      rubric: |
+        You are grading the quality of a generated evaluation spec against the given function source.
+
+        Your task: assess how well the generated spec covers the function's real behavior, edge cases,
+        error paths, and output contracts. Do not reward verbosity; reward correctness and meaningful coverage.
+
+        Calibration rule (critical):
+        - If trusted run results indicate the generated spec achieved 100% pass rate, do NOT confidently claim
+          that specific passing cases are "wrong" just from static suspicion.
+        - In that situation, treat your uncertainty as potential judge hallucination and shift focus to quality:
+          uniqueness, diversity, coverage depth, evaluator precision, and contract alignment.
+        - You may still flag a case as problematic only when there is strong internal evidence (clear contradiction,
+          invalid expectation syntax, impossible assertion, or direct mismatch with the given function contract).
+
+        Score using this weighted rubric (0-100 total):
+
+        1) Functional Coverage (0-35)
+        - Core happy paths are tested.
+        - Boundary/edge conditions are tested (empty/min/max/null-ish/degenerate where relevant).
+        - Important behavior branches in source are represented by cases.
+        - Missing major branch or core behavior: deduct heavily.
+
+        2) Error and Guard Coverage (0-20)
+        - All meaningful raise/guard paths are represented.
+        - Exception type expectations are accurate.
+        - Error-message match checks are used when meaningful.
+        - Missing critical error path: major deduction.
+
+        3) Case Quality and Dataset Design (0-20)
+        - Cases are concrete, non-redundant, and behavior-focused.
+        - Inputs are realistic and varied (not trivial permutations only).
+        - Dataset is unique, diverse, and comprehensive (not AI slop or cosmetic rewrites of same scenario).
+        - Expected values/assertions are specific and verifiable.
+        - No vague, tautological, or self-fulfilling checks.
+
+        4) Evaluator Quality (0-15)
+        - Evaluators are appropriate for the function contract (expected/raises/type/assertion/pattern/etc).
+        - Assertions are precise and meaningful (not always-true).
+        - Type and semantic checks are correctly used where needed.
+        - Evaluators reflect what was actually observed from execution evidence when such evidence is provided.
+
+        5) Spec Correctness and Maintainability (0-10)
+        - YAML is structurally valid and unambiguous.
+        - Case naming/readability is good.
+        - Spec is concise but complete.
+
+        Hard-fail conditions (cap score at 40 max):
+        - Core function behavior is not tested.
+        - Critical error/guard behavior is absent.
+        - Evaluators are mostly weak/tautological/misaligned.
+        - Spec appears invalid or internally inconsistent.
+        - Do NOT trigger hard-fail based only on speculative "this case should fail" claims that conflict with
+          trusted 100% pass execution evidence.
+
+        Return format (mandatory):
+        1) Criterion scores with reasons:
+           - Functional Coverage: <score>/35
+             Reason: <why this score, with concrete evidence from function/spec>
+           - Error and Guard Coverage: <score>/20
+             Reason: <why this score, with concrete evidence>
+           - Case Quality and Dataset Design: <score>/20
+             Reason: <why this score, with concrete evidence>
+           - Evaluator Quality: <score>/15
+             Reason: <why this score, with concrete evidence>
+           - Spec Correctness and Maintainability: <score>/10
+             Reason: <why this score, with concrete evidence>
+
+        2) Final numeric score: <sum>/100
+           - Must equal the sum of criterion scores.
+
+        3) 3-6 key findings (strengths/weaknesses), each tied to specific evidence.
+
+        4) Top 3 actionable improvements, prioritized by impact.
+
+        Important:
+        - Do NOT return only a final score.
+        - Every criterion MUST include both score and explicit reason.
+        - If pass-rate evidence is 100%, prioritize quality diagnostics over speculative invalid-case accusations.
+        - When you suspect a case is wrong despite pass evidence, mark it as "low-confidence concern" unless
+          you can cite direct, concrete contradiction from the source/contract.
+      include:
+        - input
+      config:
+        model: $JUDGE_MODEL
+        max_tokens: 4096
+  dataset:
+    - case:
+        input:
+          name: largestPathValue
+          description: |
+            Largest color value in a directed graph
+
+            There is a directed graph of n colored nodes and m edges. The nodes are numbered from 0 to n - 1.
+
+            You are given a string colors where colors[i] is a lowercase English letter representing the color of the ith node in this graph (0-indexed). You are also given a 2D array edges where edges[j] = [aj, bj] indicates that there is a directed edge from node aj to node bj.
+
+            A valid path in the graph is a sequence of nodes x1 -> x2 -> x3 -> ... -> xk such that there is a directed edge from xi to xi+1 for every 1 <= i < k. The color value of the path is the number of nodes that are colored the most frequently occurring color along that path.
+
+            Return the largest color value of any valid path in the given graph, or -1 if the graph contains a cycle.
+
+
+            Input: colors = "abaca", edges = [[0,1],[0,2],[2,3],[3,4]]
+            Output: 3
+            Explanation: The path 0 -> 2 -> 3 -> 4 contains 3 nodes that are colored "a" (red in the above image).
+
+            Input: colors = "a", edges = [[0,0]]
+            Output: -1
+            Explanation: There is a cycle from 0 to 0.
+
+
+            Constraints:
+                n == colors.length
+                m == edges.length
+                1 <= n <= 10^5
+                0 <= m <= 10^5
+                colors consists of lowercase English letters.
+                0 <= aj, bj < n
+          code: |
+            from collections import deque
+
+
+            def largestPathValue(colors: str, edges: list[list[int]]) -> int:
+                n = len(colors)
+                graph = [[] for _ in range(n)]
+                indegree = [0] * n
+
+                for u, v in edges:
+                  graph[u].append(v)
+                  indegree[v] += 1
+
+                dp = [[0] * 26 for _ in range(n)]
+                queue = deque()
+
+                for i in range(n):
+                  if indegree[i] == 0:
+                    queue.append(i)
+                    dp[i][ord(colors[i]) - ord("a")] = 1
+
+                visited = 0
+                answer = 0
+
+                while queue:
+                  node = queue.popleft()
+                  visited += 1
+                  answer = max(answer, max(dp[node]))
+
+                  for nei in graph[node]:
+                    for c in range(26):
+                      dp[nei][c] = max(
+                        dp[nei][c],
+                        dp[node][c] + (1 if c == ord(colors[nei]) - ord("a") else 0),
+                      )
+
+                    indegree[nei] -= 1
+                    if indegree[nei] == 0:
+                      queue.append(nei)
+
+                return answer if visited == n else -1
+        
+
+
+evals.generate_spec_mock:
+  evals:
+    EvalSpecCoversFunction:
+      rubric: |
+        You are grading the quality of a generated evaluation spec against the given function source.
+
+        Your task: assess how well the generated spec covers the function's real behavior, edge cases,
+        error paths, and output contracts. Do not reward verbosity; reward correctness and meaningful coverage.
+
+        Calibration rule (critical):
+        - If trusted run results indicate the generated spec achieved 100% pass rate, do NOT confidently claim
+          that specific passing cases are "wrong" just from static suspicion.
+        - In that situation, treat your uncertainty as potential judge hallucination and shift focus to quality:
+          uniqueness, diversity, coverage depth, evaluator precision, and contract alignment.
+        - You may still flag a case as problematic only when there is strong internal evidence (clear contradiction,
+          invalid expectation syntax, impossible assertion, or direct mismatch with the given function contract).
+
+        Score using this weighted rubric (0-100 total):
+
+        1) Functional Coverage (0-35)
+        - Core happy paths are tested.
+        - Boundary/edge conditions are tested (empty/min/max/null-ish/degenerate where relevant).
+        - Important behavior branches in source are represented by cases.
+        - Missing major branch or core behavior: deduct heavily.
+
+        2) Error and Guard Coverage (0-20)
+        - All meaningful raise/guard paths are represented.
+        - Exception type expectations are accurate.
+        - Error-message match checks are used when meaningful.
+        - Missing critical error path: major deduction.
+
+        3) Case Quality and Dataset Design (0-20)
+        - Cases are concrete, non-redundant, and behavior-focused.
+        - Inputs are realistic and varied (not trivial permutations only).
+        - Dataset is unique, diverse, and comprehensive (not AI slop or cosmetic rewrites of same scenario).
+        - Expected values/assertions are specific and verifiable.
+        - No vague, tautological, or self-fulfilling checks.
+
+        4) Evaluator Quality (0-15)
+        - Evaluators are appropriate for the function contract (expected/raises/type/assertion/pattern/etc).
+        - Assertions are precise and meaningful (not always-true).
+        - Type and semantic checks are correctly used where needed.
+        - Evaluators reflect what was actually observed from execution evidence when such evidence is provided.
+
+        5) Spec Correctness and Maintainability (0-10)
+        - YAML is structurally valid and unambiguous.
+        - Case naming/readability is good.
+        - Spec is concise but complete.
+
+        Hard-fail conditions (cap score at 40 max):
+        - Core function behavior is not tested.
+        - Critical error/guard behavior is absent.
+        - Evaluators are mostly weak/tautological/misaligned.
+        - Spec appears invalid or internally inconsistent.
+        - Do NOT trigger hard-fail based only on speculative "this case should fail" claims that conflict with
+          trusted 100% pass execution evidence.
+
+        Return format (mandatory):
+        1) Criterion scores with reasons:
+           - Functional Coverage: <score>/35
+             Reason: <why this score, with concrete evidence from function/spec>
+           - Error and Guard Coverage: <score>/20
+             Reason: <why this score, with concrete evidence>
+           - Case Quality and Dataset Design: <score>/20
+             Reason: <why this score, with concrete evidence>
+           - Evaluator Quality: <score>/15
+             Reason: <why this score, with concrete evidence>
+           - Spec Correctness and Maintainability: <score>/10
+             Reason: <why this score, with concrete evidence>
+
+        2) Final numeric score: <sum>/100
+           - Must equal the sum of criterion scores.
+
+        3) 3-6 key findings (strengths/weaknesses), each tied to specific evidence.
+
+        4) Top 3 actionable improvements, prioritized by impact.
+
+        Important:
+        - Do NOT return only a final score.
+        - Every criterion MUST include both score and explicit reason.
+        - If pass-rate evidence is 100%, prioritize quality diagnostics over speculative invalid-case accusations.
+        - When you suspect a case is wrong despite pass evidence, mark it as "low-confidence concern" unless
+          you can cite direct, concrete contradiction from the source/contract.
+      include:
+        - input
+      config:
+        model: $JUDGE_MODEL
+        max_tokens: 4096
+  dataset:
+    - case:
+        input:
+          name: largestPathValue
+          description: |
+            Largest color value in a directed graph
+
+            There is a directed graph of n colored nodes and m edges. The nodes are numbered from 0 to n - 1.
+
+            You are given a string colors where colors[i] is a lowercase English letter representing the color of the ith node in this graph (0-indexed). You are also given a 2D array edges where edges[j] = [aj, bj] indicates that there is a directed edge from node aj to node bj.
+
+            A valid path in the graph is a sequence of nodes x1 -> x2 -> x3 -> ... -> xk such that there is a directed edge from xi to xi+1 for every 1 <= i < k. The color value of the path is the number of nodes that are colored the most frequently occurring color along that path.
+
+            Return the largest color value of any valid path in the given graph, or -1 if the graph contains a cycle.
+
+
+            Input: colors = "abaca", edges = [[0,1],[0,2],[2,3],[3,4]]
+            Output: 3
+            Explanation: The path 0 -> 2 -> 3 -> 4 contains 3 nodes that are colored "a" (red in the above image).
+
+            Input: colors = "a", edges = [[0,0]]
+            Output: -1
+            Explanation: There is a cycle from 0 to 0.
+
+
+            Constraints:
+                n == colors.length
+                m == edges.length
+                1 <= n <= 10^5
+                0 <= m <= 10^5
+                colors consists of lowercase English letters.
+                0 <= aj, bj < n
+          code: |
+            from collections import deque
+
+
+            def largestPathValue(colors: str, edges: list[list[int]]) -> int:
+                n = len(colors)
+                graph = [[] for _ in range(n)]
+                indegree = [0] * n
+
+                for u, v in edges:
+                  graph[u].append(v)
+                  indegree[v] += 1
+
+                dp = [[0] * 26 for _ in range(n)]
+                queue = deque()
+
+                for i in range(n):
+                  if indegree[i] == 0:
+                    queue.append(i)
+                    dp[i][ord(colors[i]) - ord("a")] = 1
+
+                visited = 0
+                answer = 0
+
+                while queue:
+                  node = queue.popleft()
+                  visited += 1
+                  answer = max(answer, max(dp[node]))
+
+                  for nei in graph[node]:
+                    for c in range(26):
+                      dp[nei][c] = max(
+                        dp[nei][c],
+                        dp[node][c] + (1 if c == ord(colors[nei]) - ord("a") else 0),
+                      )
+
+                    indegree[nei] -= 1
+                    if indegree[nei] == 0:
+                      queue.append(nei)
+
+                return answer if visited == n else -1
+        
diff --git a/quality-judge/runner.py b/quality-judge/runner.py
new file mode 100644
index 0000000..587b79c
--- /dev/null
+++ b/quality-judge/runner.py
@@ -0,0 +1,22 @@
+from vowel.monitoring import enable_monitoring
+from vowel.runner import Function, RunEvals
+
+enable_monitoring(
+    logfire_enabled=True,
+    service_name="quality-judge",
+)
+
+runner = RunEvals.from_file("largest_color_value_judge.yml")  # presumably cwd-relative; verify
+
+main_runner = runner.with_serializer({"evals.generate_spec": Function}).filter(
+    "evals.generate_spec"  # run only the real-generation target from the YAML file
+)
+
+# mock_runner = runner.with_serializer({"evals.generate_spec_mock": Function}).filter(
+#     "evals.generate_spec_mock"
+# )
+
+
+summary = main_runner.run()
+
+summary.print()
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
index 17f8495..8ad7509 100644
--- a/src/vowel/eval_types.py
+++ b/src/vowel/eval_types.py
@@ -249,6 +249,36 @@ class FixturesConfig(BaseModel):
     )
 
 
+class SerializerSpec(BaseModel):
+    """Serializer registry entry: exactly one of `schema` or `serializer` must be set."""
+
+    model_config = ConfigDict(extra="forbid", populate_by_name=True)  # unknown keys are rejected
+
+    serializer_schema: str | dict[str, str] | None = Field(
+        default=None,
+        alias="schema",  # the key written in YAML; the attribute itself is serializer_schema
+        serialization_alias="schema",
+        description=(
+            "Schema converter path(s). Use a single import path string for direct mode, "
+            "or a mapping of parameter name to import path for nested mode."
+        ),
+    )
+    serializer: str | None = Field(
+        default=None,
+        description="Import path to custom serializer function (serial_fn mode).",
+    )
+
+    @model_validator(mode="after")
+    def validate_one_of(self):  # rejects both-set and neither-set with ValueError
+        has_schema = self.serializer_schema is not None
+        has_serializer = self.serializer is not None
+        if has_schema and has_serializer:
+            raise ValueError("Serializer spec cannot define both 'schema' and 'serializer'")
+        if not has_schema and not has_serializer:
+            raise ValueError("Serializer spec must define one of: 'schema' or 'serializer'")
+        return self
+
+
 # =============================================================================
 # Evaluation Case Models
 # =============================================================================
@@ -737,6 +767,14 @@ class Evals(BaseModel):
         examples=[["db"], ["db", "cache"], ["redis"]],
     )
 
+    serializer: str | None = Field(
+        default=None,
+        description=(
+            "Optional serializer registry key from top-level 'serializers'. "
+            "When set, this eval uses that serializer definition."
+        ),
+    )
+
     evals: dict[
         str,
         IsInstanceCase
@@ -807,20 +845,32 @@ class EvalsFile(BaseModel):
         default_factory=dict,
         description="Global fixture definitions available to all evals in this file",
     )
+    serializers: dict[str, SerializerSpec] = Field(
+        default_factory=dict,
+        description="Global serializer definitions available to evals in this file",
+    )
 
     @classmethod
     def model_validate(cls, obj, **kwargs):
         # Parse fixtures if present (don't mutate caller's dict)
         fixtures_data = obj.get("fixtures", {})
-        obj = {k: v for k, v in obj.items() if k != "fixtures"}
+        serializers_data = obj.get("serializers") or {}  # tolerate an explicit null value
+        obj = {k: v for k, v in obj.items() if k not in {"fixtures", "serializers"}}
         fixtures = {}
+        serializers = {}
         for name, defn in fixtures_data.items():
             if isinstance(defn, dict):
                 fixtures[name] = FixtureDefinition(**defn)
             elif isinstance(defn, FixtureDefinition):
                 fixtures[name] = defn
 
-        instance = cls.model_construct(fixtures=fixtures, **obj)
+        for name, defn in serializers_data.items():  # entries of any other type are skipped
+            if isinstance(defn, dict):
+                serializers[name] = SerializerSpec(**defn)  # validates the raw mapping
+            elif isinstance(defn, SerializerSpec):
+                serializers[name] = defn
+
+        instance = cls.model_construct(fixtures=fixtures, serializers=serializers, **obj)
         return instance
 
     # Pydantic internal attributes to skip when iterating
@@ -843,6 +893,7 @@ def model_validate(cls, obj, **kwargs):
             "model_dump",
             "model_dump_json",
             "fixtures",
+            "serializers",
             # Skip fixtures when iterating evals
         }
     )
@@ -851,7 +902,7 @@ def get_evals(self) -> dict[str, Evals]:
         result = {}
         extras = getattr(self, "__pydantic_extra__", {})
         for key, value in extras.items():
-            if key == "fixtures":
+            if key in {"fixtures", "serializers"}:
                 continue
             if isinstance(value, dict) and "dataset" in value:
                 try:
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index f4d1b2f..4d7cab4 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -79,7 +79,83 @@ def _eval_type_restricted(type_expr: str) -> typing.Any:
     return eval(type_expr, env, env)
 
 
-def prepare_env_and_condition(ctx: EvaluatorContext, condition: str) -> tuple[dict, str]:
+def _apply_serializer_for_assertion(
+    value: typing.Any,
+    serializer: type | typing.Callable | dict[str, type | typing.Callable] | None,
+    *,
+    param_name: str | None = None,
+) -> typing.Any:
+    """Apply ``serializer`` to ``value`` for assertions, mirroring call-time conversion."""
+    if serializer is None:  # nothing configured: pass the raw value through
+        return value
+
+    if isinstance(serializer, dict):  # mapping of key -> converter
+        if param_name and param_name in serializer:  # converter registered for this parameter
+            return _apply_serializer_for_assertion(value, serializer[param_name])
+        if isinstance(value, dict):  # otherwise convert the entries whose key has a converter
+            converted: dict[str, typing.Any] = {}
+            for key, item in value.items():
+                if key in serializer:
+                    converted[key] = _apply_serializer_for_assertion(item, serializer[key])
+                else:
+                    converted[key] = item
+            return converted
+        return value  # non-dict value without a matching converter stays unchanged
+
+    if isinstance(value, dict):  # single converter: try keyword expansion first
+        try:
+            return serializer(**value)
+        except TypeError:  # NOTE: also triggers on a TypeError raised inside the converter
+            return serializer(value)
+
+    return serializer(value)
+
+
+def _normalize_input_for_assertion(
+    raw_inputs: typing.Any,
+    serializer: type | typing.Callable | dict[str, type | typing.Callable] | None,
+    serializer_fn: typing.Callable[[dict], typing.Any] | None,
+) -> typing.Any:
+    """Compute assertion `input` value from raw case inputs using active serializer config."""
+    if not isinstance(raw_inputs, dict):  # bare value: convert it directly
+        return _apply_serializer_for_assertion(raw_inputs, serializer)
+
+    if serializer_fn is not None:  # for dict inputs this is checked before ``serializer``
+        serialized = serializer_fn(raw_inputs)
+        if isinstance(serialized, tuple):  # unwrap 1-tuples; other tuples are returned as-is
+            return serialized[0] if len(serialized) == 1 else serialized
+        return serialized
+
+    if "input" in raw_inputs:  # single-value form; takes priority over "inputs"
+        return _apply_serializer_for_assertion(raw_inputs["input"], serializer)
+
+    if "inputs" in raw_inputs:
+        values = raw_inputs["inputs"]
+        if values is None:
+            return None
+        if isinstance(values, dict):  # keyword-style inputs
+            if serializer is not None and not isinstance(serializer, dict):  # one converter
+                return _apply_serializer_for_assertion(values, serializer)
+            if isinstance(serializer, dict):  # per-parameter converters
+                return {
+                    key: _apply_serializer_for_assertion(item, serializer, param_name=key)
+                    for key, item in values.items()
+                }
+            return values
+        if serializer is None:
+            return values
+        return [_apply_serializer_for_assertion(item, serializer) for item in values]
+
+    return raw_inputs  # dict with neither "input" nor "inputs": returned unchanged
+
+
+def prepare_env_and_condition(
+    ctx: EvaluatorContext,
+    condition: str,
+    *,
+    serializer: type | typing.Callable | dict[str, type | typing.Callable] | None = None,
+    serializer_fn: typing.Callable[[dict], typing.Any] | None = None,
+) -> tuple[dict, str]:
     """
     Prepare environment variables and format condition string for evaluation.
 
@@ -90,12 +166,7 @@ def prepare_env_and_condition(ctx: EvaluatorContext, condition: str) -> tuple[di
     Returns:
         Tuple of (environment dict, formatted condition string)
     """
-    actual_input = ctx.inputs
-    if isinstance(ctx.inputs, dict):
-        if "input" in ctx.inputs:
-            actual_input = ctx.inputs["input"]
-        elif "inputs" in ctx.inputs:
-            actual_input = ctx.inputs["inputs"]
+    actual_input = _normalize_input_for_assertion(ctx.inputs, serializer, serializer_fn)
 
     env = {
         "input": actual_input,
@@ -122,9 +193,18 @@ class AssertionEvaluator(Evaluator):
     metrics, metadata, and duration variables.
     """
 
-    def __init__(self, condition: str, *, evaluation_name: str = "Assertion"):
+    def __init__(
+        self,
+        condition: str,
+        *,
+        evaluation_name: str = "Assertion",
+        serializer: type | typing.Callable | dict[str, type | typing.Callable] | None = None,
+        serializer_fn: typing.Callable[[dict], typing.Any] | None = None,
+    ):
         self.condition = condition
         self.evaluation_name = evaluation_name
+        self.serializer = serializer
+        self.serializer_fn = serializer_fn
         self.interpreter = None
         if MONTY_AVAILABLE:
             import pydantic_monty
@@ -141,7 +221,12 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
             return EvaluationReason(value=True, reason="Skipped (exception case)")
         if "__import__" in self.condition:
             raise ValueError(f"__import__ is not allowed in assertions: {self.condition}")
-        env, condition = prepare_env_and_condition(ctx, self.condition)
+        env, condition = prepare_env_and_condition(
+            ctx,
+            self.condition,
+            serializer=self.serializer,
+            serializer_fn=self.serializer_fn,
+        )
 
         # TL;DR
         # BETA API
diff --git a/src/vowel/schema.py b/src/vowel/schema.py
index 80e4647..15e04be 100644
--- a/src/vowel/schema.py
+++ b/src/vowel/schema.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import hashlib
 import importlib.metadata
 import json
 import re
@@ -34,6 +35,7 @@ def build_yaml_schema_from_bundle() -> dict[str, Any]:
     No repository reference file is used. The root shape is forced to match
     vowel's YAML file format:
     - top-level optional `fixtures`
+    - top-level optional `serializers`
     - top-level additionalProperties => per-function `Evals`
     """
     bundle_schema = EvalsBundle.model_json_schema(ref_template="#/$defs/{model}")
@@ -46,6 +48,13 @@ def build_yaml_schema_from_bundle() -> dict[str, Any]:
             "title": "Fixtures",
         },
     )
+    serializers_schema = properties.get(
+        "serializers",
+        {
+            "type": "object",
+            "title": "Serializers",
+        },
+    )
 
     additional_properties: dict[str, Any]
     if "Evals" in defs:
@@ -71,6 +80,7 @@ def build_yaml_schema_from_bundle() -> dict[str, Any]:
         "type": "object",
         "properties": {
             "fixtures": fixtures_schema,
+            "serializers": serializers_schema,
         },
         "additionalProperties": additional_properties,
         "$defs": defs,
@@ -81,13 +91,14 @@ def build_yaml_schema_from_bundle() -> dict[str, Any]:
 
 def ensure_cached_schema(version: str | None = None) -> Path:
     """Ensure the versioned schema file exists and is up to date."""
-    token = _schema_version_token(version)
-    schema_path = SCHEMA_CACHE_DIR / f"vowel-schema_{token}.json"
-    schema_path.parent.mkdir(parents=True, exist_ok=True)
-
     schema_data = build_yaml_schema_from_bundle()
     rendered = json.dumps(schema_data, indent=2, ensure_ascii=False) + "\n"
 
+    token = _schema_version_token(version)
+    digest = hashlib.sha1(rendered.encode("utf-8")).hexdigest()[:8]
+    schema_path = SCHEMA_CACHE_DIR / f"vowel-schema_{token}_{digest}.json"
+    schema_path.parent.mkdir(parents=True, exist_ok=True)
+
     if not schema_path.exists() or schema_path.read_text(encoding="utf-8") != rendered:
         schema_path.write_text(rendered, encoding="utf-8")
 
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index 60bab83..ac54c1b 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -14,7 +14,7 @@
 from collections.abc import Callable, Mapping, Sequence
 from datetime import date, datetime, time, timedelta
 from decimal import Decimal
-from functools import wraps
+from functools import lru_cache, wraps
 from pathlib import Path, PurePath
 from typing import Any, Literal, Optional, Union, get_args, get_origin
 
@@ -28,7 +28,7 @@
 from pydantic_evals.reporting import EvaluationReport
 
 from .errors import FixturePathError, SignatureError
-from .eval_types import Evals, EvalsFile, FixtureDefinition
+from .eval_types import Evals, EvalsFile, FixtureDefinition, SerializerSpec
 from .evals import (
     AssertionEvaluator,
     ContainsInputEvaluator,
@@ -39,6 +39,8 @@
 )
 from .executor import Executor
 
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+
 # =============================================================================
 # Evals Bundle - Container for evals and fixtures
 # =============================================================================
@@ -51,6 +53,7 @@ class EvalsBundle(BaseModel):
 
     evals: dict[str, Evals] = Field(min_length=1)
     fixtures: dict[str, FixtureDefinition] = Field(default_factory=dict)
+    serializers: dict[str, SerializerSpec] = Field(default_factory=dict)
 
     def to_yaml(self) -> str:
         """Serialize bundle to current vowel YAML spec format."""
@@ -76,6 +79,17 @@ def to_yaml(self) -> str:
                 for name, definition in self.fixtures.items()
             }
 
+        if self.serializers:
+            data["serializers"] = {
+                name: serializer.model_dump(
+                    mode="python",
+                    by_alias=True,
+                    exclude_none=True,
+                    exclude_defaults=True,
+                )
+                for name, serializer in self.serializers.items()
+            }
+
         return yaml.safe_dump(
             data,
             sort_keys=False,
@@ -365,17 +379,20 @@ def check_compatibility(func: Callable) -> tuple[bool, list[str]]:
 
 @contextlib.contextmanager
 def _cwd_on_syspath() -> Any:
-    """Temporarily prepend the current working directory to ``sys.path``."""
+    """Temporarily add cwd and project root to the front of ``sys.path``, undoing it on exit."""
     cwd = os.getcwd()
-    inserted = cwd not in sys.path
-    if inserted:
-        sys.path.insert(0, cwd)
+    candidates = [cwd, str(PROJECT_ROOT)]
+    inserted: list[str] = []
+    for candidate in reversed(candidates):  # insert cwd last so it precedes PROJECT_ROOT
+        if candidate not in sys.path:
+            sys.path.insert(0, candidate)
+            inserted.append(candidate)  # remember only the entries added here
     try:
         yield
     finally:
-        if inserted:
+        for candidate in inserted:  # pre-existing sys.path entries are left alone
             with contextlib.suppress(ValueError):
-                sys.path.remove(cwd)
+                sys.path.remove(candidate)
 
 
 def _is_yaml_source_string(source_str: str) -> bool:
@@ -859,9 +876,12 @@ def import_function(module_path: str) -> Any:
                     error=str(e),
                 )
                 relative_path = module_name.replace(".", os.sep) + ".py"
-                file_path = os.path.join(os.getcwd(), relative_path)
+                candidate_roots = [os.getcwd(), str(PROJECT_ROOT)]
 
-                if os.path.exists(file_path):
+                for root in candidate_roots:
+                    file_path = os.path.join(root, relative_path)
+                    if not os.path.exists(file_path):
+                        continue
                     try:
                         spec = importlib.util.spec_from_file_location(module_name, file_path)
                         if spec and spec.loader:
@@ -870,6 +890,7 @@ def import_function(module_path: str) -> Any:
                             logfire.debug(
                                 "File-based import succeeded for '{file_path}'", file_path=file_path
                             )
+                            break
                     except Exception as e:
                         logfire.debug(
                             "File-based import failed for '{file_path}': {error}",
@@ -901,6 +922,46 @@ def import_function(module_path: str) -> Any:
     )
 
 
+@lru_cache(maxsize=512)
+def _import_path_cached(path: str) -> Any:
+    """Return ``import_function(path)``, memoized per path string (LRU cache, 512 entries)."""
+    return import_function(path)
+
+
+def _resolve_yaml_serializer_entry(
+    serializers: Mapping[str, SerializerSpec],
+    serializer_name: str,
+) -> tuple[type | Callable | dict[str, type | Callable] | None, Callable[[dict], Any] | None]:
+    """Resolve a registry entry to ``(schema, None)`` or ``(None, serializer_fn)``."""
+    if serializer_name not in serializers:
+        available = sorted(serializers.keys())
+        raise ValueError(
+            f"Unknown serializer '{serializer_name}'. Available serializers: {available}"
+        )
+
+    spec = serializers[serializer_name]
+
+    if spec.serializer is not None:  # serial_fn mode: a single callable
+        loaded = _import_path_cached(spec.serializer)
+        if not callable(loaded):
+            raise TypeError(f"Serializer '{spec.serializer}' must resolve to a callable")
+        return None, loaded
+
+    schema = spec.serializer_schema
+    if isinstance(schema, str):  # direct mode: one import path (not checked for callability)
+        return _import_path_cached(schema), None
+
+    if isinstance(schema, dict):  # nested mode: key -> import path
+        resolved: dict[str, type | Callable] = {}
+        for key, path in schema.items():
+            resolved[key] = _import_path_cached(path)
+        return resolved, None
+
+    raise ValueError(  # schema is neither str nor dict, e.g. both spec fields are unset
+        f"Serializer '{serializer_name}' must define one of: schema (str|dict) or serializer"
+    )
+
+
 def import_class(class_path: str) -> type:
     """
     Import a class from a module path.
@@ -950,26 +1011,42 @@ def load_bundle_file(yaml_path: str) -> EvalsBundle:
         loaded = yaml.safe_load(f)
 
     evals_file = EvalsFile.model_validate(loaded)
-    return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures)
+    return EvalsBundle(
+        evals=evals_file.get_evals(),
+        fixtures=evals_file.fixtures,
+        serializers=evals_file.serializers,
+    )
 
 
 def load_bundle_from_yaml_string(yaml_content: str) -> EvalsBundle:
     """Load evals and fixtures from a YAML string."""
     loaded = yaml.safe_load(yaml_content)
     evals_file = EvalsFile.model_validate(loaded)
-    return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures)
+    return EvalsBundle(
+        evals=evals_file.get_evals(),
+        fixtures=evals_file.fixtures,
+        serializers=evals_file.serializers,
+    )
 
 
 def load_bundle_from_dict(data: dict) -> EvalsBundle:
     """Load evals and fixtures from a dictionary."""
     evals_file = EvalsFile.model_validate(data)
-    return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures)
+    return EvalsBundle(
+        evals=evals_file.get_evals(),
+        fixtures=evals_file.fixtures,
+        serializers=evals_file.serializers,
+    )
 
 
 def load_bundle_from_object(evals_obj: EvalsFile) -> EvalsBundle:
     """Load evals and fixtures from an EvalsFile object."""
     assert isinstance(evals_obj, EvalsFile)
-    return EvalsBundle(evals=evals_obj.get_evals(), fixtures=evals_obj.fixtures)
+    return EvalsBundle(
+        evals=evals_obj.get_evals(),
+        fixtures=evals_obj.fixtures,
+        serializers=evals_obj.serializers,
+    )
 
 
 def load_bundle(source: str | Path | dict | EvalsFile) -> EvalsBundle:
@@ -1118,7 +1195,7 @@ def to_dataset(
             display_input = f"inputs: {match_case.inputs}"
             input_value = {"inputs": match_case.inputs}
         else:
-            display_input = f"input: {match_case.input}"
+            display_input = f"input: {str(match_case.input)[:300]}"
             input_value = {"input": match_case.input}
 
         if any(case for case in dataset_cases if input_value == case.inputs):
@@ -1551,6 +1628,17 @@ def _evaluate_single_function(
             serial_fn, eval_id, mapping_name="serializer function"
         )
 
+        for evaluator in dataset.evaluators:
+            if isinstance(evaluator, AssertionEvaluator):
+                evaluator.serializer = func_schema
+                evaluator.serializer_fn = func_serial_fn
+
+        for case in dataset.cases:
+            for evaluator in case.evaluators:
+                if isinstance(evaluator, AssertionEvaluator):
+                    evaluator.serializer = func_schema
+                    evaluator.serializer_fn = func_serial_fn
+
         # Setup module-scoped fixtures for this eval
         module_fixtures = {}
         if fixture_manager and fixture_names:
@@ -2046,6 +2134,7 @@ def run_evals(
     bundle = source if isinstance(source, EvalsBundle) else load_bundle(source)
     all_evals = bundle.evals
     yaml_fixtures = bundle.fixtures
+    yaml_serializers = bundle.serializers
 
     # Merge programmatic fixtures with YAML fixtures
     merged_fixtures, fixture_funcs = _merge_programmatic_fixtures(yaml_fixtures, fixtures)
@@ -2057,13 +2146,37 @@ def run_evals(
     serial_fn = serial_fn or {}
 
     if filter_funcs:
-        filtered_evals = {k: v for k, v in all_evals.items() if k in filter_funcs}
+        resolved_filter_ids: list[str] = []
+
+        for raw_filter in filter_funcs:
+            if raw_filter in all_evals:
+                resolved_filter_ids.append(raw_filter)
+                continue
+
+            short_name = raw_filter.rsplit(".", 1)[-1]
+            matches = [
+                eval_id for eval_id in all_evals if eval_id.rsplit(".", 1)[-1] == short_name
+            ]
+
+            if len(matches) == 1:
+                resolved_filter_ids.append(matches[0])
+            elif len(matches) > 1:
+                candidates = sorted(matches)
+                raise ValueError(
+                    f"Ambiguous filter '{raw_filter}'. Provide an exact eval id. "
+                    f"Candidates: {candidates}"
+                )
+        # De-duplicate resolved ids (result order follows all_evals, not the filter list).
+        ordered_unique_filter_ids = list(dict.fromkeys(resolved_filter_ids))
+        filtered_evals = {k: v for k, v in all_evals.items() if k in ordered_unique_filter_ids}
+
         if not filtered_evals:
             available = list(all_evals.keys())
             raise ValueError(
                 f"No functions found matching filters: {', '.join(filter_funcs)}. "
                 f"Available: {', '.join(available)}"
             )
+
         all_evals = filtered_evals
 
     # Create fixture manager if fixtures are defined
@@ -2074,14 +2187,43 @@ def run_evals(
     try:
         for eval_id, evals in all_evals.items():
             try:
+                effective_schema = dict(schema)
+                effective_serial_fn = dict(serial_fn)
+
+                # YAML-native serializer registry (per-eval reference).
+                if evals.serializer:
+                    yaml_schema, yaml_serial = _resolve_yaml_serializer_entry(
+                        yaml_serializers,
+                        evals.serializer,
+                    )
+
+                    # Programmatic mappings have precedence.
+                    has_programmatic_schema = (
+                        _resolve_eval_id_mapping(schema, eval_id, mapping_name="serializer schema")
+                        is not None
+                    )
+                    has_programmatic_serial = (
+                        _resolve_eval_id_mapping(
+                            serial_fn,
+                            eval_id,
+                            mapping_name="serializer function",
+                        )
+                        is not None
+                    )
+
+                    if yaml_schema is not None and not has_programmatic_schema:
+                        effective_schema[eval_id] = yaml_schema
+                    if yaml_serial is not None and not has_programmatic_serial:
+                        effective_serial_fn[eval_id] = yaml_serial
+
                 result = _evaluate_single_function(
                     eval_id,
                     evals,
                     functions,
                     merged_fixtures,
                     fixture_manager,
-                    schema,
-                    serial_fn,
+                    effective_schema,
+                    effective_serial_fn,
                     ignore_duration,
                 )
                 results.append(result)
diff --git a/tests/test_run_evals.py b/tests/test_run_evals.py
index 4ff09e5..e91e35a 100644
--- a/tests/test_run_evals.py
+++ b/tests/test_run_evals.py
@@ -238,6 +238,81 @@ def test_filter_multiple_functions(self):
 
         assert summary.total_count == 2
 
+    def test_filter_module_name_matches_short_eval_id(self):
+        """module.function filter should match bare function eval ids."""
+        spec = {
+            "add": {"dataset": [{"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}}]},
+            "sub": {"dataset": [{"case": {"inputs": {"a": 5, "b": 3}, "expected": 2}}]},
+        }
+
+        summary = (
+            RunEvals.from_dict(spec)
+            .with_functions(
+                {
+                    "add": lambda a, b: a + b,
+                    "sub": lambda a, b: a - b,
+                }
+            )
+            .filter(["math.add"])
+            .run()
+        )
+
+        assert summary.total_count == 1
+        assert summary.results[0].eval_id == "add"
+
+    def test_filter_short_name_matches_module_eval_id(self):
+        """bare function filter should match module.function eval ids."""
+        spec = {
+            "pkg.add": {
+                "dataset": [
+                    {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}},
+                ]
+            },
+            "pkg.sub": {
+                "dataset": [
+                    {"case": {"inputs": {"a": 5, "b": 3}, "expected": 2}},
+                ]
+            },
+        }
+
+        summary = (
+            RunEvals.from_dict(spec)
+            .with_functions(
+                {
+                    "add": lambda a, b: a + b,
+                    "sub": lambda a, b: a - b,
+                }
+            )
+            .filter(["add"])
+            .run()
+        )
+
+        assert summary.total_count == 1
+        assert summary.results[0].eval_id == "pkg.add"
+
+    def test_filter_short_name_raises_on_ambiguous_matches(self):
+        """Short-name filters should fail fast when multiple eval ids share a suffix."""
+        spec = {
+            "pkg.add": {
+                "dataset": [
+                    {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}},
+                ]
+            },
+            "other.add": {
+                "dataset": [
+                    {"case": {"inputs": {"a": 2, "b": 3}, "expected": 5}},
+                ]
+            },
+        }
+
+        with pytest.raises(ValueError, match="Ambiguous filter 'add'"):
+            (
+                RunEvals.from_dict(spec)
+                .with_functions({"add": lambda a, b: a + b})
+                .filter(["add"])
+                .run()
+            )
+
 
 class TestRunEvalsDebug:
     """Tests for debug() method."""
diff --git a/tests/test_schema.py b/tests/test_schema.py
new file mode 100644
index 0000000..c90b62d
--- /dev/null
+++ b/tests/test_schema.py
@@ -0,0 +1,38 @@
+"""Tests for generated YAML schema support."""
+
+import json
+from pathlib import Path
+
+from vowel.schema import build_yaml_schema_from_bundle, materialize_yaml_with_schema_header
+
+
+def test_generated_schema_includes_top_level_serializers_property():
+    """Top-level `serializers` should be explicitly supported in generated schema."""
+    schema = build_yaml_schema_from_bundle()
+    properties = schema.get("properties", {})
+
+    assert "fixtures" in properties
+    assert "serializers" in properties
+
+
+def test_generated_schema_keeps_function_additional_properties():
+    """Unknown top-level keys must still map to per-function Evals definitions."""
+    schema = build_yaml_schema_from_bundle()
+
+    additional = schema.get("additionalProperties", {})
+    assert additional == {"$ref": "#/$defs/EvalsMapValue"}
+
+
+def test_materialized_header_uses_hashed_cache_with_serializers():
+    """Schema header should reference a content-addressed cache file that supports serializers."""
+    yaml_text = "len:\n  dataset:\n    - case:\n        id: len_basic\n        input: [1]\n        expected: 1\n"
+    materialized = materialize_yaml_with_schema_header(yaml_text)
+    first_line = materialized.splitlines()[0]
+
+    assert first_line.startswith("# yaml-language-server: $schema=")
+    schema_path = Path(first_line.split("$schema=", 1)[1])
+    assert schema_path.name.startswith("vowel-schema_")
+    assert schema_path.exists()
+
+    schema_obj = json.loads(schema_path.read_text(encoding="utf-8"))
+    assert "serializers" in schema_obj.get("properties", {})
diff --git a/tests/test_serializer.py b/tests/test_serializer.py
index 1ed83ec..f6516d1 100644
--- a/tests/test_serializer.py
+++ b/tests/test_serializer.py
@@ -28,6 +28,16 @@ def process_with_config(user: User, config: Config) -> str:
     return f"{user.name} (timeout={config.timeout})"
 
 
+def yaml_serialize_user(data: dict) -> User:
+    """Serializer function used by YAML-native serializer registry tests."""
+    raw = data.get("input") or data.get("inputs")  # a falsy "input" falls back to "inputs"
+    if isinstance(raw, list):
+        raw = raw[0]  # list payload: only the first element is used
+    if not isinstance(raw, dict):
+        raise ValueError("Expected serializer input payload to be a dict")
+    return User(**raw)
+
+
 class TestSchemaSerializer:
     """Tests for schema-based serialization."""
 
@@ -147,6 +157,36 @@ def test_inputs_named_params_different_types(self):
         )
         assert summary.all_passed
 
+    def test_assertion_uses_serialized_input_with_dict_schema(self):
+        """Assertion `input` should contain per-param serialized objects for dict schema."""
+        spec = {
+            "process_with_config": {
+                "evals": {
+                    "CheckSerializedInput": {
+                        "assertion": "input['user'].email.endswith('@a.com') and input['config'].timeout == 30"
+                    }
+                },
+                "dataset": [
+                    {
+                        "case": {
+                            "inputs": {
+                                "user": {"id": 1, "name": "Alice", "email": "a@a.com"},
+                                "config": {"timeout": 30, "verbose": True},
+                            },
+                            "expected": "Alice (timeout=30)",
+                        }
+                    },
+                ],
+            }
+        }
+        summary = (
+            RunEvals.from_dict(spec)
+            .with_functions({"process_with_config": process_with_config})
+            .with_serializer({"process_with_config": {"user": User, "config": Config}})
+            .run()
+        )
+        assert summary.all_passed
+
     def test_no_serializer_passthrough(self):
         """Without serializer, dict is passed as-is."""
 
@@ -220,6 +260,29 @@ def test_serializer_short_name_matches_module_function_spec(self):
         )
         assert summary.all_passed
 
+    def test_assertion_uses_serialized_input_with_schema(self):
+        """Assertion `input` should be the schema-serialized object, not raw YAML dict."""
+        spec = {
+            "get_user_info": {
+                "evals": {"CheckSerializedInput": {"assertion": "input.email.endswith('@a.com')"}},
+                "dataset": [
+                    {
+                        "case": {
+                            "input": {"id": 1, "name": "Alice", "email": "a@a.com"},
+                            "expected": "User Alice has email a@a.com",
+                        }
+                    },
+                ],
+            }
+        }
+        summary = (
+            RunEvals.from_dict(spec)
+            .with_functions({"get_user_info": get_user_info})
+            .with_serializer({"get_user_info": User})
+            .run()
+        )
+        assert summary.all_passed
+
 
 class TestSerialFn:
     """Tests for serial_fn-based serialization."""
@@ -355,6 +418,35 @@ def get_full_name(user: User) -> str:
         )
         assert summary.all_passed
 
+    def test_assertion_uses_serialized_input_with_serial_fn(self):
+        """Assertion `input` should reflect serial_fn output type."""
+
+        def serialize_user(d: dict) -> User:
+            data = d.get("input") or d.get("inputs")
+            assert data is not None
+            return User(**data)
+
+        spec = {
+            "get_user_info": {
+                "evals": {"CheckSerializedInput": {"assertion": "input.id == 7"}},
+                "dataset": [
+                    {
+                        "case": {
+                            "input": {"id": 7, "name": "Ada", "email": "ada@a.com"},
+                            "expected": "User Ada has email ada@a.com",
+                        }
+                    },
+                ],
+            }
+        }
+        summary = (
+            RunEvals.from_dict(spec)
+            .with_functions({"get_user_info": get_user_info})
+            .with_serializer(serial_fn={"get_user_info": serialize_user})
+            .run()
+        )
+        assert summary.all_passed
+
 
 class TestSerializerChaining:
     """Tests for serializer method chaining."""
@@ -467,3 +559,88 @@ def test_serializer_validation_error(self):
         )
         assert not summary.all_passed
         assert summary.failed_count == 1
+
+
+class TestYamlNativeSerializerRegistry:
+    """Exercise serializers declared in a YAML spec's top-level `serializers` map."""
+
+    def test_yaml_registry_schema_mode(self):
+        """Run a spec whose eval references a `schema` entry of the registry."""
+        yaml_spec = """
+serializers:
+    user_schema:
+        schema: tests.test_serializer.User
+
+get_user_info:
+    serializer: user_schema
+    dataset:
+        - case:
+                input: {id: 1, name: Alice, email: a@a.com}
+                expected: "User Alice has email a@a.com"
+"""
+        runner = RunEvals.from_source(yaml_spec).with_functions({"get_user_info": get_user_info})
+        summary = runner.run()
+        assert summary.all_passed
+
+    def test_yaml_registry_serial_fn_mode(self):
+        """Run a spec whose eval references a `serializer` (serial_fn) registry entry."""
+        yaml_spec = """
+serializers:
+    user_custom:
+        serializer: tests.test_serializer.yaml_serialize_user
+
+get_user_info:
+    serializer: user_custom
+    dataset:
+        - case:
+                inputs: {id: 2, name: Bob, email: b@b.com}
+                expected: "User Bob has email b@b.com"
+"""
+        runner = RunEvals.from_source(yaml_spec).with_functions({"get_user_info": get_user_info})
+        summary = runner.run()
+        assert summary.all_passed
+
+    def test_yaml_registry_imports_are_cached(self, monkeypatch):
+        """A serializer path shared by several evals must be imported only once."""
+        from vowel import utils as utils_module
+
+        def get_user_name(user: User) -> str:
+            return user.name
+
+        seen_paths: list[str] = []
+        real_import = utils_module.import_function
+
+        def counting_import(path: str):
+            seen_paths.append(path)
+            return real_import(path)
+
+        # Clear the import cache first, then route imports through the counter.
+        utils_module._import_path_cached.cache_clear()
+        monkeypatch.setattr(utils_module, "import_function", counting_import)
+
+        yaml_spec = """
+serializers:
+    user_schema:
+        schema: tests.test_serializer.User
+
+get_user_info:
+    serializer: user_schema
+    dataset:
+        - case:
+                input: {id: 1, name: Alice, email: a@a.com}
+                expected: "User Alice has email a@a.com"
+
+get_user_name:
+    serializer: user_schema
+    dataset:
+        - case:
+                input: {id: 2, name: Bob, email: b@b.com}
+                expected: "Bob"
+"""
+
+        functions = {"get_user_info": get_user_info, "get_user_name": get_user_name}
+        summary = RunEvals.from_source(yaml_spec).with_functions(functions).run()
+        user_imports = [path for path in seen_paths if path == "tests.test_serializer.User"]
+
+        assert summary.all_passed
+        assert len(user_imports) == 1
diff --git a/tests/test_yaml_loading.py b/tests/test_yaml_loading.py
index b2fb12b..031911f 100644
--- a/tests/test_yaml_loading.py
+++ b/tests/test_yaml_loading.py
@@ -56,6 +56,43 @@ def test_invalid_yaml_raises_error(self):
         with pytest.raises(Exception):  # noqa: B017
             load_bundle_from_yaml_string("invalid: [unclosed")
 
+    def test_yaml_with_top_level_serializers(self):
+        """Top-level `serializers` and per-eval `serializer` keys are loaded."""
+        yaml_spec = """
+serializers:
+    user_schema:
+        schema: tests.test_serializer.User
+
+get_user_info:
+    serializer: user_schema
+    dataset:
+        - case:
+                input: {id: 1, name: Alice, email: a@a.com}
+                expected: "User Alice has email a@a.com"
+"""
+        bundle = load_bundle_from_yaml_string(yaml_spec)
+
+        assert "user_schema" in bundle.serializers
+        assert bundle.evals["get_user_info"].serializer == "user_schema"
+
+    def test_yaml_invalid_serializer_spec_raises_error(self):
+        """A serializer entry that sets both `schema` and `serializer` must be rejected."""
+        yaml_spec = """
+serializers:
+    invalid:
+        schema: tests.test_serializer.User
+        serializer: tests.test_serializer.yaml_serialize_user
+
+get_user_info:
+    serializer: invalid
+    dataset:
+        - case:
+                input: {id: 1, name: Alice, email: a@a.com}
+                expected: "User Alice has email a@a.com"
+"""
+        with pytest.raises(Exception):  # noqa: B017
+            load_bundle_from_yaml_string(yaml_spec)
+
 
 class TestLoadBundleFromDict:
     """Tests for load_bundle_from_dict function."""
diff --git a/vowel-schema.json b/vowel-schema.json
index 241bbeb..15ee5e6 100644
--- a/vowel-schema.json
+++ b/vowel-schema.json
@@ -8,6 +8,13 @@
       },
       "title": "Fixtures",
       "type": "object"
+    },
+    "serializers": {
+      "additionalProperties": {
+        "$ref": "#/$defs/SerializerSpec"
+      },
+      "title": "Serializers",
+      "type": "object"
     }
   },
   "additionalProperties": {
@@ -144,6 +151,19 @@
           "title": "Fixture",
           "type": "array"
         },
+        "serializer": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Optional serializer registry key from top-level 'serializers'. When set, this eval uses that serializer definition.",
+          "title": "Serializer"
+        },
         "evals": {
           "additionalProperties": {
             "anyOf": [
@@ -711,6 +731,46 @@
       "title": "PatternMatchCase",
       "type": "object"
     },
+    "SerializerSpec": {
+      "additionalProperties": false,
+      "description": "Serializer registry entry for YAML-native serializer configuration.",
+      "properties": {
+        "schema": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "additionalProperties": {
+                "type": "string"
+              },
+              "type": "object"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Schema converter path(s). Use a single import path string for direct mode, or a mapping of parameter name to import path for nested mode.",
+          "title": "Schema"
+        },
+        "serializer": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Import path to custom serializer function (serial_fn mode).",
+          "title": "Serializer"
+        }
+      },
+      "title": "SerializerSpec",
+      "type": "object"
+    },
     "EvalsMapValue": {
       "additionalProperties": false,
       "description": "Function evaluation specification keyed by function import path/name. Contains fixture dependencies, global evaluators (`evals`), and dataset cases.",
@@ -746,6 +806,19 @@
           "title": "Fixture",
           "type": "array"
         },
+        "serializer": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Optional serializer registry key from top-level 'serializers'. When set, this eval uses that serializer definition.",
+          "title": "Serializer"
+        },
         "evals": {
           "additionalProperties": {
             "anyOf": [

From 49491a5a02d470d5ef8cfa8edc579845c903c899 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 19:47:12 +0300
Subject: [PATCH 6/8] Update Python version matrix in tests.yml

Removed Python 3.10 from the test matrix.
---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 01835e8..2e922ac 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+        python-version: ["3.11", "3.12", "3.13", "3.14"]
 
     steps:
       - uses: actions/checkout@v4

From 5d4c6d2b795ca8f43bf55785b6dcc8ac6326e4c9 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 19:49:01 +0300
Subject: [PATCH 7/8] Change pip install target from 'dev' to 'all'

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2e922ac..975ef0f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Install dependencies
         run: |
           source venv/bin/activate
-          uv pip install -e ".[dev]"
+          uv pip install -e ".[all]"
 
       - name: Run tests
         run: |

From 567743287194656d576a618b87092f5f9d81880b Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 20:05:14 +0300
Subject: [PATCH 8/8] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .env.sample | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.env.sample b/.env.sample
index 2d59281..a925a7f 100644
--- a/.env.sample
+++ b/.env.sample
@@ -20,7 +20,7 @@ SPEC_MODEL=openrouter:anthropic/claude-opus-4.6
 EXPLORATION_MODEL=openrouter:anthropic/claude-sonnet-4.6
 
 # Default spec & exploration models used by CodeMode benchmark pipeline
-# NOTE: Models should be comma seperated, length of spec models must equals to exploration models
+# NOTE: Models should be comma-separated; the number of spec models must equal the number of exploration models
 # spec[i] will be mapped to exploration[i] (Case N)
 BENCHMARK_SPEC_MODELS=openrouter:anthropic/claude-opus-4.6
 BENCHMARK_EXPLORATION_MODELS=openrouter:anthropic/claude-sonnet-4.6
\ No newline at end of file