From f5f039c918b931d812aa836217392cc77771cd9d Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Sun, 15 Mar 2026 21:27:33 +0300
Subject: [PATCH 1/8] demo
---
src/vowel/__init__.py | 25 +
src/vowel/codemode.py | 742 +++++++++++++++++++++++++++
src/vowel/eval_types.py | 6 +-
src/vowel/evals.py | 3 -
src/vowel/executor.py | 962 +++++++++++++++++++++++++++++++++++
src/vowel/runner.py | 51 ++
src/vowel/spec_validation.py | 342 +++++++++++++
src/vowel/tdd.py | 178 +++++--
src/vowel/utils.py | 24 +-
9 files changed, 2265 insertions(+), 68 deletions(-)
create mode 100644 src/vowel/codemode.py
create mode 100644 src/vowel/executor.py
create mode 100644 src/vowel/spec_validation.py
diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
index 8c065a7..c01915e 100644
--- a/src/vowel/__init__.py
+++ b/src/vowel/__init__.py
@@ -32,9 +32,20 @@
__version__ = importlib.metadata.version("vowel")
from .ai import EvalGenerator, GenerationResult, UnsupportedParameterTypeError
+from .codemode import CodeModeGenerator, CodeModeResult, ExplorationPlan, SnippetResult
from .context import EVAL_SPEC_CONTEXT
from .errors import FixturePathError, SignatureError
from .eval_types import EvalsFile
+from .executor import (
+ DefaultExecutor,
+ DefaultSession,
+ ExecutionResult,
+ ExecutionSession,
+ Executor,
+ MontyExecutor,
+ MontyReplSession,
+ get_executor,
+)
from .runner import Function, RunEvals
from .utils import (
EvalResult,
@@ -73,4 +84,18 @@
"check_compatibility",
"get_unsupported_params",
"is_yaml_serializable_type",
+ # CodeMode executor
+ "Executor",
+ "ExecutionResult",
+ "ExecutionSession",
+ "MontyExecutor",
+ "MontyReplSession",
+ "DefaultExecutor",
+ "DefaultSession",
+ "get_executor",
+ # CodeMode pipeline
+ "CodeModeGenerator",
+ "CodeModeResult",
+ "ExplorationPlan",
+ "SnippetResult",
]
diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py
new file mode 100644
index 0000000..e488827
--- /dev/null
+++ b/src/vowel/codemode.py
@@ -0,0 +1,742 @@
+"""CodeMode eval generation pipeline.
+
+This module provides ``CodeModeGenerator`` — a two-phase pipeline that uses
+a sandboxed code executor to produce ground-truth expected values before
+generating YAML eval specs.
+
+Phase 1 — **Exploration**
+ The LLM writes small Python snippets that call ``target_func`` with various
+ inputs. Each snippet is executed via ``Executor`` (Monty sandbox by default)
+ and the real outputs are collected. This replaces guesswork with empirical
+ observation.
+
+Phase 2 — **Spec Generation**
+ The exploration results (inputs → outputs, edge cases, exceptions) are fed
+ back to the LLM together with the eval spec context. The LLM produces the
+ final YAML spec with verified expected values.
+
+All steps are instrumented with ``logfire`` for full observability.
+"""
+
+from __future__ import annotations
+
+import os
+import time
+from typing import Any
+
+import logfire
+import yaml
+from pydantic import BaseModel, Field
+from pydantic_ai import Agent
+
+from vowel.context import EVAL_SPEC_CONTEXT
+from vowel.eval_types import EvalsSource
+from vowel.executor import ExecutionResult, Executor, get_executor
+from vowel.monitoring import enable_monitoring
+from vowel.runner import Function, RunEvals
+from vowel.spec_validation import (
+ build_call_code,
+ build_failure_context,
+ inject_durations,
+ inject_missing_error_cases,
+ validate_expected_values,
+)
+from vowel.utils import EvalSummary
+from vowel.validation import validate_and_fix_spec
+
+enable_monitoring(service_name="vowel-codemode")
+
+
+# ---------------------------------------------------------------------------
+# Exploration output model — what the LLM returns in Phase 1
+# ---------------------------------------------------------------------------
+
+
+class ExplorationSnippet(BaseModel):
+ """A single exploration snippet that tests normal (non-error) behaviour."""
+
+ description: str = Field(
+ description="One-line description of what this snippet tests "
+ "(e.g. 'empty list edge case', 'negative numbers').",
+ )
+ code: str = Field(
+ description="Python code to execute. May call ``target_func(...)`` "
+ "which is the function under test. The value of the last "
+ "expression is captured as output.",
+ )
+
+
+class ErrorSnippet(BaseModel):
+ """A snippet that should trigger an exception from the function."""
+
+ description: str = Field(
+ description="What error scenario this tests "
+ "(e.g. 'None input', 'division by zero', 'wrong type').",
+ )
+ code: str = Field(
+ description="Python code that should trigger an exception. "
+ "Use the function's real name — the source is prepended at runtime.",
+ )
+
+
+class ExplorationPlan(BaseModel):
+ """LLM output for Phase 1: normal snippets + error-triggering snippets."""
+
+ snippets: list[ExplorationSnippet] = Field(
+ description="Snippets that test NORMAL (succeeding) behaviour: "
+ "happy-path, boundary values, return type exploration, "
+ "equivalence partitioning, invariants, composition.",
+ min_length=10,
+ )
+ error_snippets: list[ErrorSnippet] = Field(
+ description="Snippets that should TRIGGER EXCEPTIONS: wrong types, "
+ "invalid values, None inputs, out-of-range arguments. "
+ "Every guard clause and raise statement in the function "
+ "must be exercised by at least one error snippet.",
+ min_length=3,
+ )
+
+
+# ---------------------------------------------------------------------------
+# Exploration result — what we feed back to Phase 2
+# ---------------------------------------------------------------------------
+
+
+class SnippetResult(BaseModel):
+    """Outcome of running one exploration snippet through the executor."""
+
+    description: str
+    code: str
+    success: bool
+    output: Any = None
+    stdout: str = ""
+    error: str | None = None
+    error_type: str | None = None
+    duration_ms: float = 0.0
+
+    model_config = {"arbitrary_types_allowed": True}
+
+    @classmethod
+    def from_execution(
+        cls,
+        snippet: ExplorationSnippet | ErrorSnippet,
+        result: ExecutionResult,
+    ) -> SnippetResult:
+        """Pair a snippet's description/code with the result of executing it.
+
+        Parameters
+        ----------
+        snippet:
+            The snippet that was run; supplies ``description`` and ``code``.
+        result:
+            The executor's result; supplies every remaining field.
+        """
+        copied = ("success", "output", "stdout", "error", "error_type", "duration_ms")
+        outcome = {name: getattr(result, name) for name in copied}
+        return cls(description=snippet.description, code=snippet.code, **outcome)
+
+    def to_context_block(self) -> str:
+        """Render this result as a text block for the spec-generation prompt."""
+        header = f"# {self.description}\n>>> {self.code.strip()}\n"
+        if not self.success:
+            # Failed snippets report the exception type and message.
+            return f"{header}RAISED {self.error_type}: {self.error}"
+        # Successful snippets report the repr of the captured value plus timing.
+        rendered = repr(self.output)
+        timing = f"({self.duration_ms:.2f} ms)"
+        return f"{header}Output: {rendered} {timing}"
+
+
+# ---------------------------------------------------------------------------
+# Pipeline result
+# ---------------------------------------------------------------------------
+
+
+class CodeModeResult(BaseModel):
+ """Full result of the CodeMode generation pipeline."""
+
+ exploration_results: list[SnippetResult] = Field(
+ description="Results from Phase 1 exploration.",
+ )
+ yaml_spec: str = Field(description="Final YAML eval specification.")
+ summary: EvalSummary | None = Field(
+ default=None,
+ description="Eval run summary (if run_evals=True).",
+ )
+ refinement_rounds: int = Field(
+ default=0,
+ description="Number of refinement iterations needed (0 = first-pass success).",
+ )
+
+ model_config = {"arbitrary_types_allowed": True}
+
+
+# ---------------------------------------------------------------------------
+# CodeModeGenerator
+# ---------------------------------------------------------------------------
+
+
+class CodeModeGenerator:
+ """Two-phase eval generator: explore with executor, then generate spec.
+
+ Parameters
+ ----------
+ model:
+ LLM model identifier (e.g. ``"openai:gpt-4o"``).
+ executor:
+ Code execution backend. Defaults to ``get_executor("auto")``
+ which prefers MontyExecutor when available.
+ additional_context:
+ Extra instructions appended to the system prompt.
+ """
+
+ def __init__(
+ self,
+ model: str | None = None,
+ executor: Executor | None = None,
+ additional_context: str = "",
+ min_snippets: int = 15,
+ **opts,
+ ) -> None:
+ self.model = model or os.getenv("MODEL_NAME", "")
+ if not self.model:
+ logfire.warn("No model specified; set MODEL_NAME env var or pass model=")
+ self.executor = executor or get_executor("auto")
+ self.additional_context = additional_context
+ self.min_snippets = min_snippets
+ self._opts = opts
+
+ # Lazy agents
+ self._explorer_agent: Agent[None, ExplorationPlan] | None = None
+ self._spec_agent: Agent[None, EvalsSource] | None = None
+
+ logfire.info(
+ "CodeModeGenerator initialized",
+ model=self.model,
+ executor=type(self.executor).__name__,
+ )
+
+ # -- Agent properties --------------------------------------------------
+
+ @property
+ def explorer_agent(self) -> Agent[None, ExplorationPlan]:
+ if self._explorer_agent is None:
+ self._explorer_agent = Agent(
+ self.model,
+ output_type=ExplorationPlan,
+ system_prompt=self._explorer_system_prompt(),
+ **self._opts,
+ )
+ return self._explorer_agent
+
+ @property
+ def spec_agent(self) -> Agent[None, EvalsSource]:
+ if self._spec_agent is None:
+ self._spec_agent = Agent(
+ self.model,
+ output_type=EvalsSource,
+ system_prompt=self._spec_system_prompt(),
+ **self._opts,
+ )
+ return self._spec_agent
+
+ # -- System prompts ----------------------------------------------------
+
+ def _explorer_system_prompt(self) -> str:
+ return f"""You are a Python testing expert. Your job is to write small
+code snippets that explore a function's behaviour empirically.
+
+You will receive:
+- The function's source code (with its real name)
+- The function's description
+
+You produce TWO separate lists of snippets:
+
+## `snippets` — Normal / succeeding behaviour
+These snippets call the function with VALID inputs and capture the return
+value. They MUST cover:
+1. Normal / happy-path behaviour (typical valid inputs)
+2. Boundary values (empty collections, zero, negative, very large, min/max)
+3. Return type and structure exploration
+4. Equivalence partitioning (representative from each input class)
+5. Invariant verification (e.g. idempotency, commutativity, sort stability)
+6. Composition / interaction (combining parameters, dependent arguments)
+
+Produce AT LEAST {self.min_snippets} normal snippets.
+
+## `error_snippets` — Exception-triggering inputs
+These snippets call the function with inputs that SHOULD RAISE exceptions.
+They MUST cover:
+1. Wrong types (None, int instead of list, str instead of int, etc.)
+2. Invalid values (out-of-range, malformed strings, empty when not allowed)
+3. Every `raise` statement and guard clause in the function source code
+
+Produce AT LEAST 3 error snippets. If the function has more raise
+statements or guard clauses, produce MORE — one per distinct error path.
+
+STRICT RULES:
+- Each snippet MUST end with an expression whose value will be captured.
+- Use the function's REAL NAME — the function source code will be prepended
+ automatically at runtime. Do NOT define the function yourself.
+- Keep each snippet focused on ONE scenario.
+- Do NOT guess outputs — the snippets will be executed and the real
+ outputs collected automatically.
+- NEVER use try/except in your snippets. Let exceptions propagate
+ naturally — the execution environment captures raised errors
+ automatically. For example, write `flatten(None)` NOT
+ `try: flatten(None) except Exception as e: type(e)`.
+- `snippets` must contain ONLY inputs expected to SUCCEED.
+- `error_snippets` must contain ONLY inputs expected to RAISE exceptions.
+ Do NOT mix them."""
+
+ def _spec_system_prompt(self) -> str:
+ ctx = ""
+ if self.additional_context:
+ ctx = f"\n\n\n{self.additional_context}\n"
+ return f"""You are an expert vowel YAML SPEC generator.
+
+{EVAL_SPEC_CONTEXT}{ctx}
+
+CRITICAL: You have access to VERIFIED execution results below. Use the
+EXACT outputs shown — do NOT guess or calculate expected values yourself.
+The execution results are ground-truth from running the real function."""
+
+ # -- Phase 1: Exploration ----------------------------------------------
+
+ async def explore(
+ self,
+ func: Function,
+ ) -> list[SnippetResult]:
+ """Phase 1: Generate and execute exploration snippets.
+
+ Uses ``create_session()`` to compile the function source **once**,
+ then feeds each snippet against the preserved runtime state —
+ zero re-parse overhead per snippet.
+
+ Returns a list of ``SnippetResult`` with real outputs from the
+ executor.
+ """
+ with logfire.span(
+ "codemode.explore",
+ func_name=func.name,
+ executor=type(self.executor).__name__,
+ ):
+ # 1. Ask the LLM for exploration snippets
+ plan = await self._get_exploration_plan(func)
+
+ # 2. Compile function source once, feed each snippet
+ all_snippets = [
+ *((s, "normal") for s in plan.snippets),
+ *((s, "error") for s in plan.error_snippets),
+ ]
+ total = len(all_snippets)
+ results: list[SnippetResult] = []
+ with self.executor.create_session(func.code) as session:
+ for i, (snippet, kind) in enumerate(all_snippets):
+ with logfire.span(
+ "codemode.execute_snippet",
+ index=i,
+ kind=kind,
+ description=snippet.description,
+ ):
+ logfire.info(
+ "Executing snippet {index}/{total} [{kind}]: {description}",
+ index=i + 1,
+ total=total,
+ kind=kind,
+ description=snippet.description,
+ code=snippet.code,
+ )
+
+ exec_result = session.feed(snippet.code)
+
+ sr = SnippetResult.from_execution(snippet, exec_result)
+ results.append(sr)
+
+ logfire.info(
+ "Snippet result: success={success}, output={output}, "
+ "duration={duration_ms:.2f}ms",
+ success=sr.success,
+ output=repr(sr.output)[:200],
+ duration_ms=sr.duration_ms,
+ error=sr.error,
+ error_type=sr.error_type,
+ )
+
+ # Summary log
+ successes = sum(1 for r in results if r.success)
+ failures = len(results) - successes
+ logfire.info(
+ "Exploration complete: {successes} succeeded, {failures} raised errors",
+ successes=successes,
+ failures=failures,
+ )
+
+ return results
+
+ async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
+ """Ask the LLM for exploration snippets."""
+ with logfire.span("codemode.llm_explore", func_name=func.name):
+ prompt = f"""Explore the following function by writing test snippets:
+
+{func.name}
+
+{func.code}
+
+{func.description}
+
+Write diverse snippets that call {func.name}(...) to discover the function's
+behaviour across all important scenarios. Use the real function name
+`{func.name}` — the implementation will be prepended automatically."""
+
+ result = await self.explorer_agent.run(prompt)
+ plan = result.output
+
+ logfire.info(
+ "LLM produced {normal} normal + {error} error snippets",
+ normal=len(plan.snippets),
+ error=len(plan.error_snippets),
+ snippets=[s.description for s in plan.snippets],
+ error_snippets=[s.description for s in plan.error_snippets],
+ )
+ return plan
+
+ # -- Phase 2: Spec Generation ------------------------------------------
+
+ async def generate_spec(
+ self,
+ func: Function,
+ exploration_results: list[SnippetResult],
+ failure_context: str | None = None,
+ ) -> str:
+ """Phase 2: Generate YAML spec using verified exploration results.
+
+ Parameters
+ ----------
+ failure_context:
+ When provided (on refinement rounds), appended to the prompt so
+ the LLM can fix specific failures from the previous attempt.
+ """
+ with logfire.span(
+ "codemode.generate_spec",
+ func_name=func.name,
+ n_results=len(exploration_results),
+ is_refinement=failure_context is not None,
+ ):
+ # Build exploration context for the prompt
+ success_results = [r for r in exploration_results if r.success]
+ error_results = [r for r in exploration_results if not r.success]
+
+ success_context = (
+ "\n\n".join(r.to_context_block() for r in success_results)
+ if success_results
+ else "(none)"
+ )
+ error_context = (
+ "\n\n".join(r.to_context_block() for r in error_results)
+ if error_results
+ else "(none)"
+ )
+
+ refinement_block = ""
+ if failure_context:
+ refinement_block = f"""
+
+⚠️ PREVIOUS ATTEMPT FAILED — fix these issues:
+{failure_context}
+
+Regenerate the YAML spec addressing every failure above. Keep all
+passing cases intact — only fix the broken ones."""
+
+ prompt = f"""Generate vowel evals YAML spec for `{func.name}`:
+
+
+{func.code}
+
+
+{func.description}
+
+
+The following results are from ACTUALLY RUNNING the function — use these
+exact outputs as expected values:
+
+{success_context}
+
+
+
+These inputs RAISED exceptions when run against the real function.
+Each one MUST become a `raises:` case in the spec — no exceptions.
+
+{error_context}
+
+
+REQUIREMENTS:
+- Use {func.name} as eval_id.
+- Generate at least {max(len(exploration_results), 5)} diverse test cases.
+- Use the EXACT outputs from the execution results above.
+- You MUST generate exactly {len(error_results)} raises cases — one for
+ each RAISED result above. The spec is invalid without them.
+- Cover normal, edge, and error cases.
+- In assertions, use `input` (NOT `inputs`) for accessing input values.
+{refinement_block}"""
+
+ logfire.info(
+ "Sending spec generation prompt",
+ func_name=func.name,
+ success_results=len(success_results),
+ error_results=len(error_results),
+ )
+
+ result = await self.spec_agent.run(prompt)
+ yaml_spec = result.output.yaml_spec
+
+ # Sanitize: strip YAML tags that safe_load rejects
+ import re
+
+ yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec)
+ yaml_spec = re.sub(r"!!binary\b", "", yaml_spec)
+
+ # Validate YAML syntax
+ yaml.safe_load(yaml_spec)
+
+ # Validate and auto-fix
+ validation = validate_and_fix_spec(
+ yaml_spec,
+ function_code=func.code,
+ )
+ if validation.has_warnings:
+ logfire.info(
+ "Spec validation applied fixes",
+ summary=validation.summary(),
+ )
+ final_spec = validation.fixed_yaml if validation.was_modified else yaml_spec
+
+ # Executor-based validation: fix expected values against real execution
+ final_spec = validate_expected_values(final_spec, func, self.executor)
+
+ # Inject missing error cases from exploration
+ error_snippet_dicts = [
+ {
+ "code": r.code,
+ "error_type": r.error_type,
+ "error": r.error,
+ "description": r.description,
+ }
+ for r in exploration_results
+ if not r.success and r.error_type
+ ]
+ final_spec = inject_missing_error_cases(final_spec, func.name, error_snippet_dicts)
+
+ logfire.info(
+ "YAML spec generated",
+ func_name=func.name,
+ spec_length=len(final_spec),
+ spec_preview=final_spec[:500],
+ )
+
+ return final_spec
+
+ # -- Helpers -----------------------------------------------------------
+
+ @staticmethod
+ def _build_failure_context(summary: EvalSummary) -> str:
+ """Build a concise failure report to inject into the retry prompt."""
+ return build_failure_context(summary)
+
+ def _inject_durations(
+ self,
+ yaml_spec: str,
+ func: Function,
+ *,
+ buffer_pct: float = 0.5,
+ floor_ms: float = 10.0,
+ ) -> str:
+ """Add per-case ``duration`` fields based on actual execution times."""
+ return inject_durations(
+ yaml_spec,
+ func,
+ self.executor,
+ buffer_pct=buffer_pct,
+ floor_ms=floor_ms,
+ )
+
+ @staticmethod
+ def _build_call_code(func_name: str, case: dict) -> str | None:
+ """Build a ``func(args...)`` call string from a case dict."""
+ return build_call_code(func_name, case)
+
+ # -- Full pipeline -----------------------------------------------------
+
+    async def generate(
+        self,
+        func: Function,
+        *,
+        run_evals: bool = True,
+        save_to_file: bool = False,
+        max_refinement_rounds: int = 2,
+        min_coverage: float = 1.0,
+        inject_durations: bool = True,
+    ) -> CodeModeResult:
+        """Run the full CodeMode pipeline with post-generation validation.
+
+        Pipeline::
+
+            Phase 1: explore()            (once)
+            Phase 2: generate_spec()      (may loop)
+            Phase 3: validate via RunEvals (per attempt)
+            Phase 4: refine on failure    (up to N rounds)
+            Phase 5: inject_durations()   (once, at end)
+
+        Exploration (Phase 1) runs once — the ground-truth snippet results
+        don't change. Only spec generation (Phase 2) is re-run on failure,
+        with a failure report injected into the prompt.
+
+        Parameters
+        ----------
+        func:
+            The function to generate evals for.
+        run_evals:
+            Whether to run the generated evals and include the summary.
+        save_to_file:
+            Whether to save the YAML spec to ``{func.name}_evals.yml`` (UTF-8).
+        max_refinement_rounds:
+            Maximum number of spec-regeneration attempts after the initial
+            generation (0 = single attempt; negative values are treated as 0).
+        min_coverage:
+            Target pass-rate in 0.0–1.0 (default 1.0 = 100 %). The loop
+            exits early when coverage meets or exceeds this threshold.
+        inject_durations:
+            Whether to measure and inject per-case ``duration`` fields
+            into the final YAML spec (skipped when no spec was produced).
+
+        Returns
+        -------
+        CodeModeResult
+            Contains exploration results, YAML spec, summary, and
+            the number of refinement rounds actually performed.
+        """
+        with logfire.span(
+            "codemode.pipeline",
+            func_name=func.name,
+            model=self.model,
+            executor=type(self.executor).__name__,
+        ):
+            t0 = time.perf_counter()
+
+            # Phase 1 — explore (once)
+            exploration_results = await self.explore(func)
+
+            # Phase 2–4 — generate spec + validate + refine
+            yaml_spec = ""
+            summary: EvalSummary | None = None
+            refinement_rounds = 0
+            failure_context: str | None = None
+            total_attempts = (max(max_refinement_rounds, 0) + 1) if run_evals else 1
+            max_rounds = total_attempts - 1  # refinements that can actually happen
+            for attempt in range(total_attempts):
+                with logfire.span(
+                    "codemode.spec_attempt",
+                    attempt=attempt + 1,
+                    is_refinement=attempt > 0,
+                ):
+                    try:
+                        yaml_spec = await self.generate_spec(
+                            func,
+                            exploration_results,
+                            failure_context,
+                        )
+                    except Exception as gen_exc:
+                        logfire.warn(
+                            "Spec generation failed on attempt {attempt}, retrying",
+                            attempt=attempt + 1,
+                            error=str(gen_exc),
+                        )
+                        failure_context = f"Generation error: {gen_exc}"
+                        refinement_rounds = min(attempt + 1, max_rounds)
+                        continue
+
+                    if not run_evals:
+                        break
+
+                    # Validate: run evals with ignore_duration=True
+                    try:
+                        runner = (
+                            RunEvals.from_source(yaml_spec)
+                            .with_functions({func.name: func.impl})
+                            .ignore_duration()
+                        )
+                        summary = runner.run()
+
+                        logfire.info(
+                            "Attempt {attempt}: {passed}/{total} passed, coverage={coverage:.1f}%",
+                            attempt=attempt + 1,
+                            passed=summary.success_count,
+                            total=summary.total_count,
+                            failed=summary.failed_count,
+                            errors=summary.error_count,
+                            coverage=summary.coverage * 100,
+                        )
+
+                        if summary.coverage >= min_coverage:
+                            break
+
+                        # Failure report for the next attempt; the last attempt has no follow-up
+                        failure_context = self._build_failure_context(summary)
+                        refinement_rounds = min(attempt + 1, max_rounds)
+                        logfire.warn(
+                            "Coverage {coverage:.0f}% below target {target:.0f}%, refining",
+                            coverage=summary.coverage * 100,
+                            target=min_coverage * 100,
+                            attempt=attempt + 1,
+                        )
+
+                    except Exception as exc:
+                        logfire.warn(
+                            "Failed to run evals on attempt {attempt}, retrying",
+                            attempt=attempt + 1,
+                            func_name=func.name,
+                            error=str(exc),
+                        )
+                        failure_context = f"Eval run error: {exc}"
+                        refinement_rounds = min(attempt + 1, max_rounds)
+                        continue
+
+            # Phase 5 — inject per-case durations
+            if inject_durations and yaml_spec:  # nothing to time if every attempt failed
+                with logfire.span("codemode.inject_durations", func_name=func.name):
+                    yaml_spec = self._inject_durations(yaml_spec, func)
+
+            # Final summary run (with durations now present, but still ignored)
+            if run_evals and summary is not None:
+                try:
+                    final_runner = (
+                        RunEvals.from_source(yaml_spec)
+                        .with_functions({func.name: func.impl})
+                        .ignore_duration()
+                    )
+                    summary = final_runner.run()
+                except Exception as exc:  # noqa: BLE001 - best effort, but leave a trace
+                    logfire.warn("Final summary run failed; keeping last summary", error=str(exc))
+
+            if save_to_file:
+                path = f"{func.name}_evals.yml"
+                with open(path, "w", encoding="utf-8") as f:
+                    f.write(yaml_spec)
+                logfire.info("Saved spec to {path}", path=path)
+
+            elapsed = (time.perf_counter() - t0) * 1000
+            logfire.info(
+                "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})",
+                elapsed=elapsed,
+                func_name=func.name,
+                exploration_count=len(exploration_results),
+                refinement_rounds=refinement_rounds,
+                has_summary=summary is not None,
+            )
+
+            return CodeModeResult(
+                exploration_results=exploration_results,
+                yaml_spec=yaml_spec,
+                summary=summary,
+                refinement_rounds=refinement_rounds,
+            )
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
index a19c2e7..241a71d 100644
--- a/src/vowel/eval_types.py
+++ b/src/vowel/eval_types.py
@@ -20,15 +20,13 @@
EvalsFile: Root model for YAML file parsing
"""
-import logging
+import logfire
import os
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic.experimental.missing_sentinel import MISSING
-logger = logging.getLogger(__name__)
-
# =============================================================================
# LLM Output Models
@@ -784,6 +782,6 @@ def get_evals(self) -> dict[str, Evals]:
try:
result[key] = Evals(id=key, **value)
except Exception as e:
- logger.warning(f"Failed to process eval '{key}': {e}")
+ logfire.warn("Failed to process eval '{key}': {error}", key=key, error=str(e))
return result
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index 6065042..f9fc79b 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -17,7 +17,6 @@
"""
import importlib.util
-import logging
import os
import re
import typing
@@ -29,8 +28,6 @@
from pydantic_ai.settings import ModelSettings
from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext, LLMJudge
-logger = logging.getLogger(__name__)
-
MONTY_AVAILABLE = bool(importlib.util.find_spec("pydantic-monty"))
diff --git a/src/vowel/executor.py b/src/vowel/executor.py
new file mode 100644
index 0000000..9de81e1
--- /dev/null
+++ b/src/vowel/executor.py
@@ -0,0 +1,962 @@
+"""Code execution backends for CodeMode eval generation.
+
+CodeMode allows the eval generation agent to *run* code inside a sandbox
+rather than guessing expected values. This produces ground-truth outputs
+and lets the agent empirically explore function behaviour (edge cases,
+exception boundaries, return types) before writing test cases.
+
+Architecture
+------------
+``Executor`` is a runtime Protocol — any object that implements ``execute()``
+qualifies. Two concrete implementations are provided:
+
+* ``MontyExecutor`` — uses ``pydantic-monty`` (Rust-based sandbox, <0.1 ms
+ startup, no filesystem/network access). **Recommended
+ for production and the optimization loop.**
+* ``DefaultExecutor`` — uses Python's built-in ``exec()`` with stdout capture.
+ No sandboxing. Safe only for trusted, local code;
+ useful during development when Monty is not installed.
+
+The ``execute()`` method accepts two orthogonal injection mechanisms that
+mirror Monty's native API:
+
+* ``inputs`` — ``dict[str, Any]`` of *values* injected as
+ top-level variables visible to the snippet.
+* ``external_functions`` — ``dict[str, Callable]`` of *host-side callbacks*
+ the snippet can call by name. In the Monty
+ backend each call exits the sandbox, runs on
+ the host, and returns the result.
+
+Session API
+-----------
+For batch exploration (e.g. CodeMode), use ``create_session()`` to compile
+the function source **once**, then ``feed()`` each snippet against the
+preserved runtime state.
+
+* ``MontyReplSession`` — backed by ``MontyRepl``: zero re-parse overhead
+ per snippet, heap/globals preserved across feeds.
+* ``DefaultSession`` — backed by a persistent ``exec()`` namespace.
+
+Usage examples
+--------------
+**External functions** — inject one or more real functions::
+
+ await executor.execute(
+ '''
+ results = []
+ results.append(target_func([1, 3, 5, 7, 9], 5))
+ results.append(target_func([], 1))
+ results
+ ''',
+ external_functions={"target_func": binary_search},
+ )
+
+**Inputs** — inject plain values::
+
+ await executor.execute(
+ "x + y",
+ inputs={"x": 10, "y": 20},
+ )
+
+**Session** — compile once, feed many snippets::
+
+    with executor.create_session(func_code) as session:
+        r1 = session.feed("binary_search([1,3,5], 3)")
+        r2 = session.feed("binary_search([], 1)")
+
+The value of the last expression becomes ``ExecutionResult.output``.
+"""
+
+from __future__ import annotations
+
+import ast
+import asyncio
+import contextlib
+import importlib.util
+import io
+import time
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any, Literal, Protocol, runtime_checkable
+
+NEST_AVAILABLE = importlib.util.find_spec("nest_asyncio") is not None
+MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def run_sync(coro: Any) -> Any:
+    """Run a coroutine synchronously, even inside a running event loop.
+
+    With no loop running in this thread, ``asyncio.run()`` is used (clean,
+    no patching).  Inside a running loop (e.g. Jupyter, async framework)
+    it falls back to ``nest_asyncio`` + ``loop.run_until_complete()``.
+
+    The loop is detected up front via ``asyncio.get_running_loop()`` rather
+    than by matching the text of a ``RuntimeError`` from ``asyncio.run()``,
+    so a ``RuntimeError`` raised by *coro* itself always propagates as-is.
+
+    Raises ``RuntimeError`` if a loop is running and nest-asyncio is missing.
+    """
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:  # no running loop in this thread
+        return asyncio.run(coro)
+    if not NEST_AVAILABLE:
+        coro.close()  # avoid a "coroutine ... was never awaited" warning
+        raise RuntimeError(
+            "execute_sync() was called from inside a running event loop. "
+            "Install nest-asyncio to support this: pip install nest-asyncio"
+        )
+    import nest_asyncio
+    nest_asyncio.apply()
+    return loop.run_until_complete(coro)
+
+
+# ---------------------------------------------------------------------------
+# Result type
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ExecutionResult:
+    """Outcome of one snippet run, as reported by an executor or session.
+
+    Attributes
+    ----------
+    output:
+        The value of the last expression evaluated in the snippet (the
+        ``exec()`` backends capture it by assigning it to ``__result__``).
+        ``None`` when execution fails.
+    stdout:
+        Text written to stdout via ``print()``; empty for syntax errors.
+    success:
+        ``True`` when the snippet completed without raising an exception.
+    error:
+        Human-readable error message when ``success is False``.
+    error_type:
+        The Python exception class name (e.g. ``"ValueError"``) when
+        ``success is False``.
+    duration_ms:
+        Wall-clock time spent executing the snippet, in milliseconds.
+    """
+
+    output: Any
+    stdout: str
+    success: bool
+    error: str | None = None
+    error_type: str | None = None
+    duration_ms: float = 0.0
+
+
+# ---------------------------------------------------------------------------
+# Protocol
+# ---------------------------------------------------------------------------
+
+
+@runtime_checkable
+class Executor(Protocol):
+    """Protocol for code execution backends used by CodeMode.
+
+    Any object that provides these three methods qualifies — concrete
+    classes do *not* need to inherit from ``Executor``.
+
+    Parameters
+    ----------
+    code:
+        Python source code to execute.
+    inputs:
+        ``dict[str, Any]`` of values injected as top-level variables
+        visible to the snippet.  For example ``{"x": 42}`` makes ``x``
+        available inside the code.
+    external_functions:
+        ``dict[str, Callable]`` of host-side callbacks the snippet can
+        call by name.  In the Monty backend each call exits the sandbox,
+        runs on the host, and returns the result — so the real function
+        can use any library.
+    timeout:
+        Maximum wall-clock seconds allowed for the snippet.  Forwarded to
+        Monty's resource limits; not enforced by ``DefaultExecutor``.
+    max_memory:
+        Maximum heap memory in bytes available to the sandbox.  Ignored by
+        ``DefaultExecutor`` which has no memory isolation.
+
+    Returns
+    -------
+    ExecutionResult
+    """
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        raise NotImplementedError
+
+    def execute_sync(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        raise NotImplementedError
+
+    def create_session(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionSession:
+        raise NotImplementedError
+
+
+# ---------------------------------------------------------------------------
+# ExecutionSession — compile once, feed many snippets
+# ---------------------------------------------------------------------------
+
+
+@runtime_checkable
+class ExecutionSession(Protocol):
+    """A reusable execution context with pre-compiled setup code.
+
+    The session compiles the *setup_code* (typically a function definition)
+    once, then each ``feed()`` call runs a snippet against the preserved
+    runtime state without re-parsing the setup code.
+
+    This is the key optimisation for CodeMode exploration: when testing
+    N edge-case snippets against the same function, the function is parsed
+    and compiled only once instead of N times.
+
+    The session is a synchronous context manager — use ``with`` to ensure
+    cleanup (only ``__enter__``/``__exit__`` are defined, not ``async with``).
+    """
+
+    def feed(self, code: str) -> ExecutionResult:
+        """Execute *code* against the session's pre-compiled state.
+
+        Returns an ``ExecutionResult`` with the last expression value,
+        stdout, and error info (if any).
+        """
+        raise NotImplementedError
+
+    def close(self) -> None:
+        """Release resources held by the session."""
+        raise NotImplementedError
+
+    def __enter__(self) -> ExecutionSession:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# MontyReplSession — sandboxed session using MontyRepl
+# ---------------------------------------------------------------------------
+
+
+class MontyReplSession:
+    """Session backed by ``MontyRepl`` — compile once, feed many snippets.
+
+    On construction the *setup_code* is parsed, compiled and executed once
+    via ``MontyRepl.create()``.  Each subsequent ``feed()`` call runs a
+    snippet against the preserved heap/globals without re-parsing the setup
+    code.
+
+    This is the recommended path for CodeMode exploration with Monty.  For
+    a function with N edge-case snippets, the function source is compiled
+    only once.
+    """
+
+    def __init__(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> None:
+        import pydantic_monty
+
+        self._pydantic_monty = pydantic_monty
+        self._limits = pydantic_monty.ResourceLimits(
+            max_duration_secs=timeout,
+            max_memory=max_memory,
+        )
+
+        stdout_lines: list[str] = []
+
+        def _print_callback(_stream: str, text: str) -> None:
+            stdout_lines.append(text)
+
+        # Compile + run the setup code once; its prints land in _setup_stdout.
+        self._repl, _init_output = pydantic_monty.MontyRepl.create(
+            setup_code,
+            limits=self._limits,
+            print_callback=_print_callback,
+        )
+        self._setup_stdout = "\n".join(stdout_lines)
+
+    def feed(self, code: str) -> ExecutionResult:
+        """Run *code* in the REPL; any ``Exception`` is returned, not raised."""
+        stdout_lines: list[str] = []
+
+        def _print_callback(_stream: str, text: str) -> None:
+            stdout_lines.append(text)
+
+        t0 = time.perf_counter()
+        try:
+            if not self._repl:
+                # TODO: wrap with custom exception and detailed message
+                raise ValueError("Repl not found.")
+            else:
+                output = self._repl.feed(code, print_callback=_print_callback)
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=output,
+                stdout="\n".join(stdout_lines),
+                success=True,
+                duration_ms=duration_ms,
+            )
+
+        except self._pydantic_monty.MontyRuntimeError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            inner = exc.exception()
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=exc.display(format="type-msg"),
+                error_type=type(inner).__name__,
+                duration_ms=duration_ms,
+            )
+
+        except self._pydantic_monty.MontySyntaxError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(exc),
+                error_type="SyntaxError",
+                duration_ms=duration_ms,
+            )
+
+        except Exception as exc:  # noqa: BLE001
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=str(exc),
+                error_type=type(exc).__name__,
+                duration_ms=duration_ms,
+            )
+
+    def close(self) -> None:
+        """Drop the REPL reference; a later ``feed()`` reports a ValueError."""
+        self._repl = None  # type: ignore[assignment]
+
+    def __enter__(self) -> MontyReplSession:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# FallbackSession — Monty with auto-fallback to DefaultSession
+# ---------------------------------------------------------------------------
+
+import logfire as _logfire
+
+
+class FallbackSession:
+    """Monty-first session that degrades to ``DefaultSession`` when needed.
+
+    Fallback happens at two levels:
+
+    * **Whole session** — when constructing ``MontyReplSession`` raises
+      (for instance ``MontySyntaxError`` on syntax Monty does not support,
+      such as f-string ``!r``), every ``feed()`` is served by a
+      ``DefaultSession`` instead.
+    * **Single snippet** — when a Monty ``feed()`` reports
+      ``ModuleNotFoundError``, only that snippet is re-run through a
+      ``DefaultSession`` built lazily from the same *setup_code*; later
+      snippets go to Monty again.
+    """
+
+    def __init__(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> None:
+        # Kept so the DefaultSession can be built on demand.
+        self._setup_code = setup_code
+        self._timeout = timeout
+        self._max_memory = max_memory
+        self._default_session: DefaultSession | None = None
+        self._monty_session: MontyReplSession | None = None
+        self._monty_failed_permanently = False
+
+        # Prefer the sandbox; any construction failure (unsupported syntax,
+        # missing package, ...) switches the whole session over.
+        try:
+            self._monty_session = MontyReplSession(
+                setup_code,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        except Exception as exc:
+            _logfire.info(
+                "Monty session creation failed ({exc_type}: {exc_msg}), falling back to DefaultSession",
+                exc_type=type(exc).__name__,
+                exc_msg=str(exc),
+            )
+            self._monty_failed_permanently = True
+            # Build the replacement eagerly so setup errors surface here.
+            self._get_default_session()
+
+    def _get_default_session(self) -> DefaultSession:
+        """Return the ``DefaultSession``, creating it on first use."""
+        if self._default_session is not None:
+            return self._default_session
+        # First use: run the setup code in a fresh exec() namespace.
+        self._default_session = DefaultSession(
+            self._setup_code, timeout=self._timeout, max_memory=self._max_memory
+        )
+        return self._default_session
+
+    def feed(self, code: str) -> ExecutionResult:
+        """Run *code* on Monty, or on ``DefaultSession`` where Monty cannot."""
+        if self._monty_failed_permanently:
+            # Monty never came up, so everything goes to the exec() backend.
+            return self._get_default_session().feed(code)
+
+        assert self._monty_session is not None
+        monty_result = self._monty_session.feed(code)
+        module_missing = (
+            not monty_result.success and monty_result.error_type == "ModuleNotFoundError"
+        )
+        if not module_missing:
+            return monty_result
+        # Monty lacks the imported module: retry just this snippet unsandboxed.
+        _logfire.info(
+            "Monty ModuleNotFoundError, retrying snippet with DefaultSession: {error}",
+            error=monty_result.error,
+        )
+        return self._get_default_session().feed(code)
+
+    def close(self) -> None:
+        """Close whichever underlying sessions were created."""
+        for session in (self._monty_session, self._default_session):
+            if session is not None:
+                session.close()
+
+    def __enter__(self) -> FallbackSession:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# MontyExecutor — sandboxed, production-grade
+# ---------------------------------------------------------------------------
+
+
+class MontyExecutor:
+    """Sandboxed executor backed by ``pydantic-monty`` (Rust interpreter).
+
+    Monty provides strict isolation: no filesystem access, no network, no
+    environment variables.  External functions are injected as host-side
+    callbacks — they run on the *host* Python process with full access to
+    stdlib and third-party libraries.
+
+    Uses ``pydantic_monty.run_monty_async`` which implements Monty's step
+    protocol (``start()`` → ``MontySnapshot`` → ``resume()``) with proper
+    async support.  External functions can be sync or async — Monty handles
+    both transparently.  The GIL is released during execution and Monty
+    steps are offloaded to a thread pool.
+
+    Requires the ``pydantic-monty`` package::
+
+        pip install "vowel[monty]"  # or: pip install pydantic-monty
+
+    Raises
+    ------
+    ImportError
+        If ``pydantic-monty`` is not installed.
+    """
+
+    def __init__(self) -> None:
+        if not MONTY_AVAILABLE:
+            raise ImportError(
+                'MontyExecutor requires pydantic-monty. Install it with: pip install "vowel[monty]"'
+            )
+
+    @staticmethod
+    def _name_error_from_assertion(exc_msg: str) -> str | None:
+        """Translate a ``NameLookupSnapshot`` assertion into a NameError text.
+
+        Returns ``None`` when *exc_msg* does not mention the snapshot.  The
+        name is read from the ``variable_name="..."`` fragment; ``partition``
+        keeps the full name even when the closing quote is missing.
+        """
+        if "NameLookupSnapshot" not in exc_msg:
+            return None
+        marker = 'variable_name="'
+        start = exc_msg.find(marker)
+        if start == -1:
+            return "name is not defined"
+        name = exc_msg[start + len(marker) :].partition('"')[0]
+        return f"name '{name}' is not defined"
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Execute *code* inside the Monty sandbox.
+
+        Delegates to ``pydantic_monty.run_monty_async`` which handles the
+        full step protocol (``start()`` → snapshot → ``resume()``).
+
+        ``NameLookupSnapshot`` (undefined variables) is not handled by
+        ``run_monty_async`` — it raises ``AssertionError``.  We catch that
+        and report a ``NameError`` parsed from the assertion message.
+
+        Parameters
+        ----------
+        code:
+            Python source to run.
+        inputs:
+            Values injected as top-level variables (Monty ``inputs``).
+        external_functions:
+            Host-side callbacks the snippet can call by name.
+        timeout / max_memory:
+            Resource limits forwarded to Monty.
+        """
+        import pydantic_monty
+
+        stdout_lines: list[str] = []
+
+        def _print_callback(_stream: str, text: str) -> None:
+            stdout_lines.append(text)
+
+        input_names = list(inputs) if inputs else None
+
+        limits = pydantic_monty.ResourceLimits(
+            max_duration_secs=timeout,
+            max_memory=max_memory,
+        )
+
+        t0 = time.perf_counter()
+        try:
+            m = pydantic_monty.Monty(
+                code,
+                inputs=input_names,
+            )
+            output = await pydantic_monty.run_monty_async(
+                m,
+                inputs=inputs,
+                limits=limits,
+                external_functions=external_functions,
+                print_callback=_print_callback,
+            )
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=output,
+                stdout="\n".join(stdout_lines),
+                success=True,
+                duration_ms=duration_ms,
+            )
+
+        except pydantic_monty.MontyRuntimeError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            inner = exc.exception()
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=exc.display(format="type-msg"),
+                error_type=type(inner).__name__,
+                duration_ms=duration_ms,
+            )
+
+        except pydantic_monty.MontySyntaxError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(exc),
+                error_type="SyntaxError",
+                duration_ms=duration_ms,
+            )
+
+        except AssertionError as exc:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            # run_monty_async doesn't handle NameLookupSnapshot — it hits
+            # `assert isinstance(progress, FutureSnapshot)` and the repr
+            # of the snapshot is embedded in the assertion message.
+            exc_msg = str(exc)
+            name_error = self._name_error_from_assertion(exc_msg)
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=exc_msg if name_error is None else name_error,
+                error_type=type(exc).__name__ if name_error is None else "NameError",
+                duration_ms=duration_ms,
+            )
+
+        except Exception as exc:  # noqa: BLE001 — catch-all for unexpected errors
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout="\n".join(stdout_lines),
+                success=False,
+                error=str(exc),
+                error_type=type(exc).__name__,
+                duration_ms=duration_ms,
+            )
+
+    def execute_sync(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Synchronous wrapper around :meth:`execute`."""
+        return run_sync(
+            self.execute(
+                code,
+                inputs=inputs,
+                external_functions=external_functions,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        )
+
+    def create_session(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> FallbackSession:
+        """Create a session that uses Monty with auto-fallback to DefaultSession.
+
+        The *setup_code* (typically a function definition) is compiled and
+        executed **once**.  If Monty cannot handle the code (e.g. unsupported
+        syntax), the session transparently falls back to ``DefaultSession``.
+        Individual ``feed()`` calls also fall back on ``ModuleNotFoundError``.
+        """
+        return FallbackSession(
+            setup_code,
+            timeout=timeout,
+            max_memory=max_memory,
+        )
+
+
+# ---------------------------------------------------------------------------
+# DefaultSession — unsandboxed session using persistent namespace
+# ---------------------------------------------------------------------------
+
+
+def _rewrite_last_expr(code: str) -> tuple[Any, bool]:
+    """Compile *code*, capturing the value of a trailing expression.
+
+    Returns ``(code_object, has_result)``; *has_result* is True when the
+    last statement was an expression, rewritten to ``__result__ = <expr>``
+    at the expression's own location.  ``SyntaxError`` propagates.
+    """
+    tree = ast.parse(code, "", "exec")
+    has_result = False
+    if tree.body and isinstance(tree.body[-1], ast.Expr):
+        last_expr = tree.body.pop()
+        assign = ast.Assign(
+            targets=[ast.Name(id="__result__", ctx=ast.Store())],
+            value=last_expr.value,  # type: ignore[attr-defined]
+        )
+        ast.copy_location(assign, last_expr)  # (new_node, old_node)
+        tree.body.append(assign)
+        ast.fix_missing_locations(tree)
+        has_result = True
+    return compile(tree, "", "exec"), has_result
+
+
+class DefaultSession:
+    """Unsandboxed session: one ``exec()`` namespace shared by all snippets.
+
+    Construction runs *setup_code* once inside a fresh namespace dict.
+    Every ``feed()`` then executes its snippet in that **same** dict, so
+    names defined by the setup code (or by earlier snippets) stay visible.
+
+    Semantics mirror ``MontyReplSession`` for environments where Monty is
+    not installed.
+    """
+
+    def __init__(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> None:
+        self._timeout = timeout
+        self._namespace: dict[str, Any] = {}
+        # Populate the namespace with whatever the setup code defines.
+        exec(compile(setup_code, "", "exec"), self._namespace)  # noqa: S102
+
+    def feed(self, code: str) -> ExecutionResult:
+        """Run *code* in the shared namespace and report the outcome."""
+        namespace = self._namespace
+        # A stale __result__ from the previous snippet must not leak through.
+        namespace.pop("__result__", None)
+
+        try:
+            program, _ = _rewrite_last_expr(code)
+        except SyntaxError as syntax_error:
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(syntax_error),
+                error_type="SyntaxError",
+                duration_ms=0.0,
+            )
+
+        captured = io.StringIO()
+        started = time.perf_counter()
+        try:
+            with contextlib.redirect_stdout(captured):
+                exec(program, namespace)  # noqa: S102
+        except Exception as failure:  # noqa: BLE001
+            elapsed_ms = (time.perf_counter() - started) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout=captured.getvalue(),
+                success=False,
+                error=str(failure),
+                error_type=type(failure).__name__,
+                duration_ms=elapsed_ms,
+            )
+
+        elapsed_ms = (time.perf_counter() - started) * 1000
+        return ExecutionResult(
+            output=namespace.get("__result__"),
+            stdout=captured.getvalue(),
+            success=True,
+            duration_ms=elapsed_ms,
+        )
+
+    def close(self) -> None:
+        """Empty the shared namespace."""
+        self._namespace.clear()
+
+    def __enter__(self) -> DefaultSession:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# DefaultExecutor — exec()-based, no sandbox
+# ---------------------------------------------------------------------------
+
+
+class DefaultExecutor:
+    """Unsandboxed executor backed by Python's built-in ``exec()``.
+
+    **WARNING: runs code with full host privileges.**  Only suitable for
+    development, local testing, or environments where the code being executed
+    is fully trusted.
+
+    Both ``inputs`` and ``external_functions`` are merged into the execution
+    namespace so the snippet can reference them as plain names.  The value
+    of the snippet's last expression (captured as ``__result__``) is
+    returned as ``output``.
+
+    No additional dependencies required — works with plain Python.
+
+    Notes
+    -----
+    * ``timeout`` and ``max_memory`` parameters are accepted for API
+      compatibility but are **not enforced**.
+    * Stdout is captured via ``contextlib.redirect_stdout``.
+    """
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Execute *code* using ``exec()`` — no sandbox, no isolation.
+
+        To match Monty's behaviour, the value of the *last expression* in
+        the snippet is captured automatically using ``ast`` rewriting (the
+        same ``_rewrite_last_expr`` helper ``DefaultSession`` uses).  A
+        ``__result__`` assigned by the snippet itself is returned only when
+        the snippet does not end in an expression.
+
+        *inputs* are merged into the namespace first, then
+        *external_functions*, so a callback wins on a name clash.
+        ``timeout`` and ``max_memory`` are accepted but not enforced.
+
+        Never raises for snippet errors: syntax and runtime failures (any
+        ``Exception``) come back as ``ExecutionResult(success=False)``.
+        """
+        namespace: dict[str, Any] = {}
+        if inputs:
+            namespace.update(inputs)
+        if external_functions:
+            namespace.update(external_functions)
+
+        # Rewrite the last expression statement to capture its value
+        # (shared helper, also used by DefaultSession.feed).
+        try:
+            compiled, _has_result = _rewrite_last_expr(code)
+        except SyntaxError as exc:
+            return ExecutionResult(
+                output=None,
+                stdout="",
+                success=False,
+                error=str(exc),
+                error_type="SyntaxError",
+                duration_ms=0.0,
+            )
+
+        stdout_buf = io.StringIO()
+        t0 = time.perf_counter()
+        try:
+            with contextlib.redirect_stdout(stdout_buf):
+                exec(compiled, namespace)  # noqa: S102
+            duration_ms = (time.perf_counter() - t0) * 1000
+
+            output = namespace.get("__result__")
+
+            return ExecutionResult(
+                output=output,
+                stdout=stdout_buf.getvalue(),
+                success=True,
+                duration_ms=duration_ms,
+            )
+
+        except Exception as exc:  # noqa: BLE001
+            duration_ms = (time.perf_counter() - t0) * 1000
+            return ExecutionResult(
+                output=None,
+                stdout=stdout_buf.getvalue(),
+                success=False,
+                error=str(exc),
+                error_type=type(exc).__name__,
+                duration_ms=duration_ms,
+            )
+
+    def execute_sync(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Synchronous wrapper around :meth:`execute`."""
+        return run_sync(
+            self.execute(
+                code,
+                inputs=inputs,
+                external_functions=external_functions,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        )
+
+    def create_session(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> DefaultSession:
+        """Create an unsandboxed session with a persistent namespace.
+
+        The *setup_code* is executed once into a shared namespace dict.
+        Each ``session.feed(snippet)`` call runs in the same namespace,
+        preserving functions and variables across calls.
+        """
+        return DefaultSession(
+            setup_code,
+            timeout=timeout,
+            max_memory=max_memory,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Factory
+# ---------------------------------------------------------------------------
+
+
+def get_executor(backend: Literal["monty", "auto", "default"] = "auto") -> Executor:
+    """Build the executor selected by *backend*.
+
+    Parameters
+    ----------
+    backend:
+        ``"default"`` — ``DefaultExecutor`` unconditionally.
+        ``"monty"``   — ``MontyExecutor`` unconditionally (its constructor
+        raises ``ImportError`` when pydantic-monty is missing).
+        ``"auto"``    — ``MontyExecutor`` if pydantic-monty is importable,
+        otherwise ``DefaultExecutor`` after emitting a warning.
+        Any other value raises ``ValueError``.
+
+    Returns
+    -------
+    Executor
+        A ready-to-use executor instance.
+    """
+    if backend not in ("monty", "default", "auto"):
+        raise ValueError(
+            f"Unknown executor backend: {backend!r}. Choose 'monty', 'default', or 'auto'."
+        )
+
+    if backend == "default":
+        return DefaultExecutor()
+    if backend == "monty" or MONTY_AVAILABLE:
+        return MontyExecutor()
+
+    # "auto" without Monty: degrade loudly rather than silently.
+    import warnings
+
+    warnings.warn(
+        "pydantic-monty not installed; falling back to DefaultExecutor "
+        '(no sandboxing). Install with: pip install "vowel[monty]"',
+        stacklevel=2,
+    )
+    return DefaultExecutor()
diff --git a/src/vowel/runner.py b/src/vowel/runner.py
index a69c3e5..4018fcd 100644
--- a/src/vowel/runner.py
+++ b/src/vowel/runner.py
@@ -97,6 +97,7 @@ def execute(self) -> None:
local_scope: dict[str, object] = {}
try:
code = self.code
+ code = self._sanitize_code(code)
try:
exec(code, local_scope, local_scope)
except Exception:
@@ -105,12 +106,62 @@ def execute(self) -> None:
exec(code, local_scope, local_scope)
else:
raise
+ self.code = code # persist cleaned code for downstream use
except Exception as e:
raise RuntimeError(f"Error executing code for function '{self.name}'.") from e
self.func = local_scope[self.name]
+    @staticmethod
+    def _sanitize_code(code: str) -> str:
+        """Fix common LLM code-generation artefacts before exec.
+
+        1. Un-escape stray ``\\"`` / ``\\'`` sequences, but only when the
+           code does not compile as-is, so valid escapes inside string
+           literals of already-valid code are never rewritten.
+        2. Drop lowercase builtin names (dict, list, tuple, set, frozenset,
+           type) from ``from typing import ...`` lines: ``typing`` does not
+           export them, so importing them raises ImportError.  Capitalised
+           aliases (``List``, ``Dict``, ...) are importable and stay, since
+           removing them would break annotations such as ``List[int]``.
+        """
+        import re as _re
+
+        def _compiles(src: str) -> bool:
+            try:
+                compile(src, "<sanitize>", "exec")
+            except (SyntaxError, ValueError):
+                return False
+            return True
+
+        # 1. Un-escape literal backslash-quote sequences.  Skipped when the
+        #    code already compiles: the escapes are then genuine and
+        #    stripping them would corrupt string literals.
+        if ('\\"' in code or "\\'" in code) and not _compiles(code):
+            code = code.replace('\\"', '"').replace("\\'", "'")
+
+        # 2. Remove typing imports of names typing does not provide
+        _NOT_IN_TYPING = {"dict", "list", "tuple", "set", "frozenset", "type"}
+
+        def _clean_typing_import(m: _re.Match) -> str:
+            names = [n.strip() for n in m.group(1).split(",")]
+            remaining = [n for n in names if n not in _NOT_IN_TYPING]
+            if not remaining:
+                return ""  # remove the entire import line
+            return f"from typing import {', '.join(remaining)}"
+
+        code = _re.sub(
+            r"^from\s+typing\s+import\s+(.+)$",
+            _clean_typing_import,
+            code,
+            flags=_re.MULTILINE,
+        )
+        # Remove any blank lines left behind
+        code = _re.sub(r"\n{3,}", "\n\n", code)
+
+        return code
+
def __call__(self, *args, **kwargs) -> _RT:
"""
Call the function implementation with the provided arguments.
diff --git a/src/vowel/spec_validation.py b/src/vowel/spec_validation.py
new file mode 100644
index 0000000..d559dda
--- /dev/null
+++ b/src/vowel/spec_validation.py
@@ -0,0 +1,342 @@
+"""Shared spec validation utilities for eval generation pipelines.
+
+Functions in this module are used by both ``CodeModeGenerator`` and
+``TDDGenerator`` to validate generated YAML specs against real execution
+and to inject measured durations.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import logfire
+import yaml
+
+from vowel.executor import Executor, get_executor
+from vowel.runner import Function
+from vowel.utils import EvalSummary
+
+
+def build_failure_context(summary: EvalSummary) -> str:
+ """Build a concise failure report to inject into a retry prompt.
+
+ Iterates over :class:`EvalSummary` results and formats each failed
+ case/assertion as a single line. Returns a multi-line string suitable
+ for LLM prompts.
+ """
+ lines: list[str] = []
+ for result in summary.results:
+ if result.report:
+ for case in result.report.cases:
+ failed_assertions = {k: v for k, v in case.assertions.items() if not v.value}
+ if failed_assertions:
+ parts = []
+ for k, v in failed_assertions.items():
+ if v.reason:
+ parts.append(f"{k}: {v.reason}")
+ else:
+ parts.append(f"{k}: FAILED")
+ lines.append(f"- Case '{case.name}' FAILED [{', '.join(parts)}]")
+ if result.error:
+ lines.append(f"- Error: {result.error}")
+ return "\n".join(lines) if lines else "Unknown failures"
+
+
+def build_call_code(
+ func_name: str, case: dict
+) -> (
+ str | None
+): # TODO: instead of building call code, consider passing arguments through executor inputs
+ """Build a ``func(args...)`` call string from a YAML case dict.
+
+ Returns ``None`` when no input is present (e.g. raises-only case
+ without input).
+ """
+ if "inputs" in case and case["inputs"] is not None:
+ args = case["inputs"]
+ if isinstance(args, list):
+ arg_strs = ", ".join(repr(a) for a in args)
+ return f"{func_name}({arg_strs})"
+ if isinstance(args, dict):
+ kwarg_strs = ", ".join(f"{k}={v!r}" for k, v in args.items())
+ return f"{func_name}({kwarg_strs})"
+ elif "input" in case and case["input"] is not None:
+ return f"{func_name}({case['input']!r})"
+ return None
+
+
+def inject_durations(
+ yaml_spec: str,
+ func: Function,
+ executor: Executor,
+ *,
+ buffer_pct: float = 0.5,
+ floor_ms: float = 10.0,
+) -> str:
+ """Add per-case ``duration`` fields based on actual execution times.
+
+ Each non-raises case is executed once via the executor session.
+ The measured ``duration_ms`` is inflated by *buffer_pct* (default 50%)
+ with a minimum of *floor_ms* (default 10 ms) to absorb noise.
+
+ Parameters
+ ----------
+ yaml_spec:
+ YAML string to augment.
+ func:
+ Function to execute cases against.
+ executor:
+ Executor backend to use for timing.
+ buffer_pct:
+ Fractional buffer added on top of measured time (0.5 = +50%).
+ floor_ms:
+ Absolute minimum duration in ms — protects sub-ms cases from
+ flaky failures due to measurement noise.
+ """
+ spec = yaml.safe_load(yaml_spec)
+ if not isinstance(spec, dict):
+ return yaml_spec
+
+ try:
+ session = executor.create_session(func.code)
+ except Exception:
+ logfire.warn("Could not create session for duration injection")
+ return yaml_spec
+
+ with session:
+ for eval_id, eval_def in spec.items():
+ if not isinstance(eval_def, dict):
+ continue
+ for case_entry in eval_def.get("dataset", []):
+ case = case_entry.get("case", {})
+ if not isinstance(case, dict):
+ continue
+ # Skip cases that expect exceptions
+ if case.get("raises"):
+ continue
+
+ call_code = build_call_code(eval_id, case)
+ if call_code is None:
+ continue
+
+ result = session.feed(call_code)
+ if result.success:
+ dur = max(
+ result.duration_ms * (1 + buffer_pct),
+ floor_ms,
+ )
+ case["duration"] = round(dur, 1)
+
+ return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+
+def validate_expected_values(
+ yaml_spec: str,
+ func: Function,
+ executor: Executor | None = None,
+) -> str:
+ """Validate and fix expected values in a YAML spec by executing cases.
+
+ For each case that has ``expected`` and no ``raises``, executes the
+ function call and compares the result. If the actual output differs
+ from the YAML expected value, the YAML is updated to the real value.
+
+ Also validates ``raises`` cases: if the case expects an exception but
+ the function doesn't raise (or raises a different type), the case is
+ corrected.
+
+ Parameters
+ ----------
+ yaml_spec:
+ YAML spec string to validate.
+ func:
+ Function to execute.
+ executor:
+ Executor backend. Defaults to ``get_executor("auto")``.
+
+ Returns
+ -------
+ str
+ Fixed YAML spec with corrected expected values.
+ """
+ executor = executor or get_executor("auto")
+
+ spec = yaml.safe_load(yaml_spec)
+ if not isinstance(spec, dict):
+ return yaml_spec
+
+ try:
+ session = executor.create_session(func.code)
+ except Exception:
+ logfire.warn("Could not create session for expected value validation")
+ return yaml_spec
+
+ fixes_applied = 0
+
+ with session:
+ for eval_id, eval_def in spec.items():
+ if not isinstance(eval_def, dict):
+ continue
+ for case_entry in eval_def.get("dataset", []):
+ case = case_entry.get("case", {})
+ if not isinstance(case, dict):
+ continue
+
+ call_code = build_call_code(eval_id, case)
+ if call_code is None:
+ continue
+
+ result = session.feed(call_code)
+
+ # --- Fix expected values ---
+ if "expected" in case and not case.get("raises"):
+ if result.success and result.output != case["expected"]:
+ logfire.info(
+ "Fixing expected value for case: {expected} → {actual}",
+ expected=repr(case["expected"]),
+ actual=repr(result.output),
+ )
+ case["expected"] = result.output
+ fixes_applied += 1
+
+ # --- Fix raises cases ---
+ if case.get("raises"):
+ expected_exc = case["raises"]
+ if result.success:
+ # Function didn't raise — remove raises, set expected
+ logfire.info(
+ "Case expected {exc} but function returned {output}, fixing",
+ exc=expected_exc,
+ output=repr(result.output),
+ )
+ del case["raises"]
+ if "match" in case:
+ del case["match"]
+ case["expected"] = result.output
+ fixes_applied += 1
+ elif result.error_type and result.error_type != expected_exc:
+ # Wrong exception type
+ logfire.info(
+ "Case expected {expected} but got {actual}, fixing",
+ expected=expected_exc,
+ actual=result.error_type,
+ )
+ case["raises"] = result.error_type
+ fixes_applied += 1
+
+ if fixes_applied > 0:
+ logfire.info("Validated spec: {count} fixes applied", count=fixes_applied)
+ return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+ return yaml_spec
+
+
+def inject_missing_error_cases(
+ yaml_spec: str,
+ func_name: str,
+ error_snippets: list[dict],
+) -> str:
+ """Inject error cases from exploration into the spec if the LLM missed them.
+
+ Each item in *error_snippets* should have keys:
+
+ - ``code``: Python snippet that triggered the error (e.g. ``"flatten(None)"``)
+ - ``error_type``: Exception class name (e.g. ``"TypeError"``)
+ - ``error``: Full error message
+ - ``description``: One-line description
+
+ Uses :mod:`ast` to extract function call arguments from the snippet
+ code. If parsing fails (multi-line code, complex expressions), the
+ snippet is silently skipped.
+
+ Returns the (possibly modified) YAML spec string.
+ """
+ import ast
+
+ if not error_snippets:
+ return yaml_spec
+
+ spec = yaml.safe_load(yaml_spec)
+ if not isinstance(spec, dict) or func_name not in spec:
+ return yaml_spec
+
+ eval_def = spec[func_name]
+ dataset = eval_def.setdefault("dataset", [])
+
+ # Collect existing raises case inputs to avoid duplicates
+ existing_raises_inputs: set[str] = set()
+ for entry in dataset:
+ case = entry.get("case", {})
+ if isinstance(case, dict) and case.get("raises"):
+ # Normalise existing input for comparison
+ inp = case.get("input")
+ inps = case.get("inputs")
+ existing_raises_inputs.add(repr((inp, inps)))
+
+ injected = 0
+
+ for snippet in error_snippets:
+ code = snippet["code"].strip()
+ error_type = snippet["error_type"]
+ description = snippet.get("description", "")
+
+ # Try to extract arguments from a simple function call
+ try:
+ tree = ast.parse(code, mode="eval")
+ except SyntaxError:
+ continue
+
+ if not isinstance(tree.body, ast.Call):
+ continue
+
+ try:
+ args = [ast.literal_eval(a) for a in tree.body.args]
+ kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in tree.body.keywords}
+ except (ValueError, TypeError):
+ # Complex expression that can't be literal-evaluated — skip
+ continue
+
+ # Determine input/inputs format
+ if kwargs:
+ input_repr = repr((None, kwargs))
+ if input_repr in existing_raises_inputs:
+ continue
+ case_dict: dict[str, Any] = {
+ "id": f"error_{error_type.lower()}_{injected}",
+ "inputs": kwargs,
+ "raises": error_type,
+ }
+ elif len(args) == 1:
+ input_repr = repr((args[0], None))
+ if input_repr in existing_raises_inputs:
+ continue
+ case_dict = {
+ "id": f"error_{error_type.lower()}_{injected}",
+ "input": args[0],
+ "raises": error_type,
+ }
+ elif len(args) > 1:
+ input_repr = repr((None, args))
+ if input_repr in existing_raises_inputs:
+ continue
+ case_dict = {
+ "id": f"error_{error_type.lower()}_{injected}",
+ "inputs": args,
+ "raises": error_type,
+ }
+ else:
+ continue
+
+ dataset.append({"case": case_dict})
+ injected += 1
+ logfire.info(
+ "Injected error case: {desc} → raises {exc}",
+ desc=description,
+ exc=error_type,
+ )
+
+ if injected > 0:
+ logfire.info("Injected {count} missing error cases into spec", count=injected)
+ return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+ return yaml_spec
diff --git a/src/vowel/tdd.py b/src/vowel/tdd.py
index e123a2c..2f637f1 100644
--- a/src/vowel/tdd.py
+++ b/src/vowel/tdd.py
@@ -20,6 +20,7 @@
print(result.func.code)
"""
+import inspect
import os
import re
import time
@@ -31,11 +32,17 @@
import yaml
from pydantic import BaseModel, Field
from pydantic_ai import Agent
+from pydantic_ai.models import Model
from vowel.context import EVAL_SPEC_CONTEXT
from vowel.eval_types import EvalsSource
+from vowel.executor import Executor, get_executor
from vowel.monitoring import enable_monitoring
from vowel.runner import Function, RunEvals
+from vowel.spec_validation import (
+ build_failure_context,
+ validate_expected_values,
+)
from vowel.utils import EvalSummary
from vowel.validation import validate_and_fix_spec
@@ -218,9 +225,11 @@ class TDDGenerator:
def __init__(
self,
- model: str | None = None,
+ model: str | Model | None = None,
additional_context: str | list[str] | None = None,
load_env: bool = False,
+ executor: Executor | None = None,
+ **opts,
):
if load_env:
import dotenv
@@ -245,6 +254,11 @@ def __init__(
self._impl_agent: Any = None
self._signature_agent: Any = None
+ # Optional executor for expected-value validation
+ self._executor = executor
+
+ self._opts = opts
+
logfire.info("TDDGenerator initialized", model=self.model)
@property
@@ -264,6 +278,7 @@ def signature_agent(self) -> Agent[None, FunctionSignature]:
- Specify return type accurately
- Write a clear, complete description
""",
+ **self._opts,
)
return cast(Agent[None, FunctionSignature], self._signature_agent)
@@ -463,6 +478,7 @@ def eval_agent(self) -> Agent[None, EvalsSource]:
For complex validations, use case-specific assertions instead.
""",
+ **self._opts,
)
return cast(Agent[None, EvalsSource], self._eval_agent)
@@ -631,6 +647,7 @@ def my_func(data: list, target: int) -> int:
- [ ] For path parsing: handle both `key` and `[index]` formats
- [ ] For nested access: check type at EACH level before accessing
""",
+ **self._opts,
)
return cast(Agent[None, Function], self._impl_agent)
@@ -737,34 +754,76 @@ def generate_evals_from_signature(
IMPORTANT: In assertions, use `input[0]`, `input[1]` to access positional args.
{extra_context}
{f"{additional_context}" if additional_context else ""}"""
- result = self.eval_agent.run_sync(prompt)
- yaml_spec = result.output.yaml_spec # type: ignore[attr-defined]
-
- # Sanitize: strip YAML tags that safe_load rejects
- yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec)
- yaml_spec = re.sub(r"!!binary\b", "", yaml_spec)
-
- # Validate YAML syntax
- yaml.safe_load(yaml_spec)
-
- # Static validation: fix common LLM generation mistakes
- validation = validate_and_fix_spec(yaml_spec)
- if validation.has_warnings:
- logfire.info("Spec validation results", summary=validation.summary())
- if validation.was_modified:
- yaml_spec = validation.fixed_yaml
-
- runner = RunEvals.from_source(yaml_spec)
- logfire.info(
- "Evals generated", cases=len(yaml_spec.split("- case:")), attempt=attempt + 1
- )
+ try:
+ result = self.eval_agent.run_sync(prompt)
+ yaml_spec = result.output.yaml_spec # type: ignore[attr-defined]
+
+ # Sanitize: strip YAML tags that safe_load rejects
+ yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec)
+ yaml_spec = re.sub(r"!!binary\b", "", yaml_spec)
+
+ # Validate YAML syntax
+ yaml.safe_load(yaml_spec)
+
+ # Static validation: fix common LLM generation mistakes
+ validation = validate_and_fix_spec(yaml_spec)
+ if validation.has_warnings:
+ logfire.info("Spec validation results", summary=validation.summary())
+ if validation.was_modified:
+ yaml_spec = validation.fixed_yaml
+
+ # Executor-based validation: fix expected values by executing
+ # each case through the sandbox and correcting mismatches.
+ if func is not None:
+ # Resolve source code for validation
+ if isinstance(func, Function):
+ real_code = func.code
+ elif callable(func):
+ try:
+ real_code = inspect.getsource(func)
+ except OSError:
+ real_code = None
+ else:
+ real_code = None
+
+ if real_code is not None:
+ val_func = Function(
+ name=signature.name,
+ code=real_code,
+ description=signature.description,
+ )
+ executor = getattr(self, "_executor", None) or get_executor("auto")
+ yaml_spec = validate_expected_values(
+ yaml_spec,
+ val_func,
+ executor,
+ )
+
+ runner = RunEvals.from_source(yaml_spec)
+ logfire.info(
+ "Evals generated",
+ cases=len(yaml_spec.split("- case:")),
+ attempt=attempt + 1,
+ )
+
+ except Exception as gen_exc:
+ logfire.warn(
+ "Eval spec generation failed on attempt {attempt}, retrying",
+ attempt=attempt + 1,
+ error=str(gen_exc),
+ )
+ last_failure_context = f"Generation error: {gen_exc}"
+ if attempt < max_retries:
+ time.sleep(retry_delay)
+ continue
# If no func provided, return without validation
if func is None:
return runner, yaml_spec
# Run spec against the provided function
- test_runner = runner.with_functions({signature.name: func})
+ func_callable = func.impl if isinstance(func, Function) else func
+ test_runner = runner.with_functions({signature.name: func_callable})
if ignore_duration:
test_runner = test_runner.ignore_duration()
summary = test_runner.run()
@@ -778,7 +837,7 @@ def generate_evals_from_signature(
return runner, yaml_spec
# Build failure context for next attempt
- last_failure_context = self._build_eval_failure_context(summary)
+ last_failure_context = build_failure_context(summary)
logfire.warn(
"Eval spec below coverage, retrying",
coverage=f"{summary.coverage * 100:.0f}%",
@@ -789,31 +848,24 @@ def generate_evals_from_signature(
if attempt < max_retries:
time.sleep(retry_delay)
- # Exhausted retries — return last generated spec
- # (summary/runner/yaml_spec are always set when func is not None and loop ran at least once)
- assert summary is not None and runner is not None # noqa: S101
- logfire.warn(
- "Eval generation exhausted retries",
- final_coverage=f"{summary.coverage * 100:.0f}%",
- target=f"{min_coverage * 100:.0f}%",
- )
- return runner, yaml_spec
+ # Exhausted retries — return last generated spec if we have one
+ if runner is not None and summary is not None:
+ logfire.warn(
+ "Eval generation exhausted retries",
+ final_coverage=f"{summary.coverage * 100:.0f}%",
+ target=f"{min_coverage * 100:.0f}%",
+ )
+ return runner, yaml_spec
- def _build_eval_failure_context(self, summary: EvalSummary) -> str:
- """Build a concise failure report to inject into the retry prompt."""
- lines: list[str] = []
- for result in summary.results:
- if result.report:
- for case in result.report.cases:
- failed_assertions = {k: v for k, v in case.assertions.items() if not v.value}
- if failed_assertions:
- reasons = ", ".join(
- f"{k}: {v.reason}" for k, v in failed_assertions.items() if v.reason
- )
- lines.append(f"- Case '{case.name}' FAILED [{reasons}]")
- if result.error:
- lines.append(f"- Error: {result.error}")
- return "\n".join(lines) if lines else "Unknown failures"
+ # All attempts failed with generation errors — return whatever we have
+ if yaml_spec:
+ runner = RunEvals.from_source(yaml_spec)
+ return runner, yaml_spec
+
+ raise RuntimeError(
+ f"Failed to generate valid eval spec for '{signature.name}' "
+ f"after {max_retries + 1} attempts"
+ )
def generate_implementation(
self,
@@ -875,6 +927,15 @@ def generate_all(
) -> TDDResult:
"""Run complete TDD flow: Signature -> Evals -> Implementation.
+ 1. Generate function signature from description
+ 2. Generate eval spec from signature (tests first)
+ 3. Generate implementation that passes the evals (code last)
+ 4. Run evals & retry implementation on failure
+
+ When ``executor`` is set (at init), generated expected values are
+ validated against actual execution and auto-corrected before the
+ coverage check.
+
Args:
description: What the function should do
name: Function name
@@ -898,7 +959,7 @@ def generate_all(
for flow_attempt in range(max_flow_retries + 1):
with logfire.span("TDD generation flow", name=name, flow_attempt=flow_attempt + 1):
- # Step 2: Generate evals
+ # Step 2: Generate evals from signature
logfire.info("Step 2: Generating evals", flow_attempt=flow_attempt + 1)
runner, yaml_spec = self.generate_evals_from_signature(
signature,
@@ -912,16 +973,27 @@ def generate_all(
summary: EvalSummary | None = None
for impl_attempt in range(max_impl_retries + 1):
- func = self.generate_implementation(
- signature, yaml_spec, additional_context, description
- )
+ try:
+ func = self.generate_implementation(
+ signature, yaml_spec, additional_context, description
+ )
+ except RuntimeError as exc:
+ logfire.warn(
+ "Implementation failed to compile, retrying",
+ impl_attempt=impl_attempt + 1,
+ error=str(exc),
+ )
+ if impl_attempt < max_impl_retries:
+ time.sleep(retry_delay)
+ continue
+ raise
# If max_eval_retries > 0, re-validate evals against this impl
if max_eval_retries > 0 and impl_attempt == 0:
runner, yaml_spec = self.generate_evals_from_signature(
signature,
min_cases,
- func=func.impl,
+ func=func,
max_retries=max_eval_retries,
min_coverage=min_coverage,
retry_delay=retry_delay,
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index 1b8be82..c6c4f67 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -26,7 +26,7 @@
import importlib
import importlib.util
import inspect
-import logging
+import logfire
import os
import sys
import types
@@ -56,8 +56,6 @@
create_llm_judge,
)
-logger = logging.getLogger(__name__)
-
_SYS_PATH_MODIFIED = False
@@ -825,7 +823,11 @@ def import_function(module_path: str) -> Any:
try:
module = importlib.import_module(module_name)
except ImportError as e:
- logger.debug(f"Standard import failed for '{module_name}': {e}")
+ logfire.debug(
+ "Standard import failed for '{module_name}': {error}",
+ module_name=module_name,
+ error=str(e),
+ )
relative_path = module_name.replace(".", os.sep) + ".py"
file_path = os.path.join(os.getcwd(), relative_path)
@@ -835,9 +837,15 @@ def import_function(module_path: str) -> Any:
if spec and spec.loader:
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
- logger.debug(f"File-based import succeeded for '{file_path}'")
+ logfire.debug(
+ "File-based import succeeded for '{file_path}'", file_path=file_path
+ )
except Exception as e:
- logger.debug(f"File-based import failed for '{file_path}': {e}")
+ logfire.debug(
+ "File-based import failed for '{file_path}': {error}",
+ file_path=file_path,
+ error=str(e),
+ )
if module:
try:
@@ -846,7 +854,7 @@ def import_function(module_path: str) -> Any:
obj = getattr(obj, part)
return obj
except AttributeError as e:
- logger.debug(f"Attribute lookup failed: {e}")
+ logfire.debug("Attribute lookup failed: {error}", error=str(e))
continue
try:
@@ -1127,7 +1135,7 @@ def to_dataset(
input_value = {"input": match_case.input}
if any(case for case in dataset_cases if input_value == case.inputs):
- logger.warning("Already exists in dataset, skipping duplicate case.")
+ logfire.warn("Already exists in dataset, skipping duplicate case.")
continue
dataset_cases.append(
From 83b84c2686ef17a66731ee974607402ed176c363 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Sun, 15 Mar 2026 22:30:29 +0300
Subject: [PATCH 2/8] demo:save
---
.gitignore | 12 +
AGENTS.md | 9 +
CLAUDE.md | 9 +
docs/FEEDBACK_GUIDED_EXPLORATION.md | 250 +++++++
docs/MONTY_RESEARCH.md | 984 ++++++++++++++++++++++++++++
pyproject.toml | 13 +-
pyrightconfig.json | 1 +
pytest.ini | 5 -
src/vowel/__init__.py | 2 +
src/vowel/cli.py | 27 +-
src/vowel/codemode.py | 331 ++++++++--
src/vowel/eval_types.py | 74 ++-
src/vowel/evals.py | 85 ++-
src/vowel/executor.py | 194 +++++-
src/vowel/runner.py | 42 +-
src/vowel/spec_validation.py | 43 +-
src/vowel/tdd.py | 9 +-
src/vowel/utils.py | 183 +++---
src/vowel/validation.py | 2 +-
tests/test_cli.py | 35 +
tests/test_evaluators.py | 15 +
tests/test_executor.py | 457 +++++++++++++
tests/test_fixtures.py | 105 +++
tests/test_import_function.py | 19 +
tests/test_run_evals.py | 23 +
tests/test_session.py | 232 +++++++
tests/test_tdd_eval_retries.py | 18 +-
27 files changed, 2945 insertions(+), 234 deletions(-)
create mode 100644 docs/FEEDBACK_GUIDED_EXPLORATION.md
create mode 100644 docs/MONTY_RESEARCH.md
delete mode 100644 pytest.ini
create mode 100644 tests/test_cli.py
create mode 100644 tests/test_executor.py
create mode 100644 tests/test_session.py
diff --git a/.gitignore b/.gitignore
index eb9b4d5..59c3bc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,15 @@ evaluations/
# !!
TODO
docs/FIXTURE_GENERATION_RFC.md
+
+# CodeMode
+monty.py
+monty/
+
+# Benchmarks
+benchmark*
+parse_cron_evals.yml
+PLAN.md
+codegen.py
+bundle_*.py
+*test.py
diff --git a/AGENTS.md b/AGENTS.md
index 9de1c04..5b79f13 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -30,5 +30,14 @@ This document contains concise rules for how agents should inspect and use this
- If you have questions or uncertainty, consult `README.md` and the relevant docs pages.
- Check `TODO` for pending tasks or known issues.
+## Critical Thinking & Intellectual Honesty
+
+- **Never defer to the user's idea just because they said it.** Evaluate every proposal — yours or the user's — on its own merits: trade-offs, costs, complexity, correctness.
+- **If the user's idea has flaws, say so.** Explain why with concrete reasoning (performance, token cost, latency, maintainability, correctness risk). Do not soften criticism to be agreeable.
+- **If your own idea has flaws, admit it first.** Don't wait for the user to find the holes. Present disadvantages upfront.
+- **When comparing approaches, use structured analysis:** list pros/cons for each, identify the real trade-offs, and state which you'd pick and why — before asking for input.
+- **"You're right" must be earned.** If you catch yourself agreeing immediately, stop and ask: "Did I actually evaluate this, or am I just being agreeable?" If the latter, go back and do the analysis.
+- **The user is a collaborator, not an authority.** Good ideas win regardless of who proposed them. Bad ideas lose regardless of who proposed them.
+
These rules help agents use the project consistently and safely.
diff --git a/CLAUDE.md b/CLAUDE.md
index 28360e4..78ebc3b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -30,5 +30,14 @@ Claude-type agents working with this repository should follow these steps:
- If you have questions or uncertainty, consult `README.md` and the relevant docs pages.
- Check `TODO` for pending tasks or known issues.
+## Critical Thinking & Intellectual Honesty
+
+- **Never defer to the user's idea just because they said it.** Evaluate every proposal — yours or the user's — on its own merits: trade-offs, costs, complexity, correctness.
+- **If the user's idea has flaws, say so.** Explain why with concrete reasoning (performance, token cost, latency, maintainability, correctness risk). Do not soften criticism to be agreeable.
+- **If your own idea has flaws, admit it first.** Don't wait for the user to find the holes. Present disadvantages upfront.
+- **When comparing approaches, use structured analysis:** list pros/cons for each, identify the real trade-offs, and state which you'd pick and why — before asking for input.
+- **"You're right" must be earned.** If you catch yourself agreeing immediately, stop and ask: "Did I actually evaluate this, or am I just being agreeable?" If the latter, go back and do the analysis.
+- **The user is a collaborator, not an authority.** Good ideas win regardless of who proposed them. Bad ideas lose regardless of who proposed them.
+
These guidelines are intended to help Claude agents use the repository consistently.
diff --git a/docs/FEEDBACK_GUIDED_EXPLORATION.md b/docs/FEEDBACK_GUIDED_EXPLORATION.md
new file mode 100644
index 0000000..7aff3af
--- /dev/null
+++ b/docs/FEEDBACK_GUIDED_EXPLORATION.md
@@ -0,0 +1,250 @@
+# Feedback-Guided Exploration
+
+## The Problem: Single-Shot Exploration is Blind
+
+Prior to this change, the CodeMode pipeline ran exploration in a single LLM call:
+
+```
+Function source code → LLM (one call) → N snippets → Execute all → Done
+```
+
+The LLM never saw execution results during exploration. It generated all snippets based purely on **static reasoning** — reading the source code and inferring what inputs would be interesting. This is "speculation-based exploration."
+
+This works surprisingly well with strong models. In our benchmark, Claude Opus 4.6 produced 44 snippets for `parse_cron` in a single call and achieved 100% coverage with zero refinements. But the approach has structural limitations that no amount of model intelligence can overcome:
+
+### What single-shot exploration misses
+
+**1. Exact error messages**
+
+The LLM reads a `raise ValueError(...)` statement and guesses the error message. But the actual message depends on runtime state — string interpolation, variable values, branch ordering. Example:
+
+```python
+# LLM expects:
+parse_cron('-1 0 1 1 0') → ValueError("minute: -1 out of range 0-59")
+
+# Reality:
+parse_cron('-1 0 1 1 0') → ValueError("invalid literal for int() with base 10: ''")
+```
+
+The minus sign is consumed by the range parser (`-` is the range delimiter), leaving an empty string that fails `int()` conversion. This is a parsing precedence issue that can only be discovered by execution.
+
+**2. Input combination explosions**
+
+For grammar-heavy functions (parsers, validators, DSLs), each syntax element works in isolation, but **combinations** of elements may trigger different code paths. Example from cron parsing:
+
+- `*/15` works (step with wildcard)
+- `1-10` works (range)
+- `1,5,10` works (comma-separated)
+- `1,5-7,*/20` — comma + range + step in one field — was never tried
+
+The LLM tests each primitive but rarely discovers multi-primitive combinations without seeing prior execution results.
+
+**3. Error path ordering**
+
+When a function has multiple validation layers, the order matters:
+
+```python
+# Does step validation happen before or after range validation?
+parse_cron('0-60/0 * * * *')
+# Could be: "Step must be positive" or "invalid range 0-60"
+```
+
+Only execution reveals which guard fires first.
+
+## The Solution: Two-Round Evidence-Based Exploration
+
+The new pipeline adds a second exploration round that receives **actual execution results** from Round 1:
+
+```
+Round 1: LLM (static reasoning) → 15-30 snippets → Execute
+ ↓
+ Deterministic cluster summary
+ ↓
+Round 2: LLM (evidence-based) → 8-12 snippets → Execute
+ ↓
+ Combined results → Spec Generation
+```
+
+Round 2 sees:
+- Every snippet that was tried and its exact output
+- A programmatic cluster summary grouping results by behavior class
+- An explicit "do not repeat" list
+
+This transforms exploration from **speculation** into **hypothesis refinement under feedback** — the LLM reasons about what it _hasn't_ seen, informed by what it _has_ seen.
+
+## Design Decisions
+
+### Why programmatic clustering (not LLM-based)?
+
+We considered two approaches for building the cluster summary between rounds:
+
+| | Programmatic (chosen) | LLM-based |
+|---|---|---|
+| Cost | Zero — no LLM call | 1 additional call |
+| Determinism | Always produces same output for same input | Non-deterministic |
+| Speed | Microseconds | Seconds |
+| Depth | Surface-level (type + message prefix) | Semantic understanding |
+
+We chose programmatic clustering because the goal is not "perfect semantic grouping" — it's "sufficient signal to guide Round 2." The Round 2 LLM is intelligent enough to infer gaps from a simple type+message summary. Adding a clustering LLM call would introduce cost and non-determinism without proportional benefit.
+
+### Why exactly 2 rounds (not 3+)?
+
+Three considerations:
+
+1. **Diminishing returns**: Round 1 covers ~80-90% of behavior space through static reasoning. Round 2 targets the remaining gaps. A Round 3 would operate on an already-dense behavior map with very few remaining gaps — the ROI drops sharply.
+
+2. **Reasoning fragmentation**: Strong models like Opus do their best reasoning in large, focused context windows. Splitting reasoning across many small rounds can actually degrade quality. Two rounds is the sweet spot: one large reasoning pass, one targeted refinement.
+
+3. **Cost predictability**: A fixed two-round design means exactly 2 exploration LLM calls, which is predictable and benchmarkable. A variable number of rounds (3-5) makes cost unpredictable and harder to compare across models.
+
+The `exploration_rounds` parameter allows override (`=1` restores old behavior, `=3` for complex domains if needed), but the default of 2 is intentional.
+
+### Why early exit conditions?
+
+Two conditions can terminate exploration before Round 2 completes:
+
+1. **No snippets produced**: If the Round 2 LLM returns an empty plan, it believes Round 1 was already comprehensive. Forcing it to produce snippets would yield duplicates.
+
+2. **No new behavior classes discovered**: After executing Round 2 snippets, we compare behavior keys (`ok:dict`, `err:ValueError:minute: 60 out of range`) between prior and new results. If every new snippet produced a behavior we already had, the exploration space is saturated.
+
+## Implementation Details
+
+### Cluster Summary Format
+
+The `_build_cluster_summary()` method produces a structured text summary:
+
+```markdown
+## Observed Behaviour Clusters
+
+### Success clusters
+- output type `dict`: 18 cases
+- output type `bool`: 3 cases
+- output type `list`: 1 case
+
+### Error clusters
+- `ValueError` (8 distinct messages):
+ - "Expected 5 fields, got 3"
+ - "minute: 60 out of range 0-59"
+ - "Step must be positive, got -1"
+ - ...
+- `AttributeError` (2 distinct messages):
+ - "'NoneType' object has no attribute 'strip'"
+ - "'int' object has no attribute 'strip'"
+
+### Already tried (25 snippets — do NOT repeat these)
+- `parse_cron('* * * * *')`
+- `parse_cron('5 14 1 6 3')`
+- ...
+```
+
+This is deterministic, costs zero LLM tokens to produce, and provides exactly the signal Round 2 needs:
+- What **output shapes** have been seen (so the LLM can target new ones)
+- What **error types and messages** were discovered (so the LLM can find adjacent error paths)
+- What **exact code** was already tried (so the LLM won't duplicate)
+
+### Round 2 Prompt Structure
+
+The Round 2 prompt includes:
+
+```
+[function source] — same source code as Round 1
+[prior results] — full execution results (code + output/error for each snippet)
+[cluster summary] — the programmatic summary above
+
+RULES:
+- Do NOT repeat any snippet from the "Already tried" list.
+- Produce 8–12 NEW normal snippets targeting uncovered behaviour.
+- Produce 3–5 NEW error snippets targeting untried error paths.
+```
+
+The snippet count targets (8-12 normal, 3-5 error) are intentionally smaller than Round 1 (15+ normal, 3+ error). Round 2 is surgical, not broad.
+
+### Behavior Key Format
+
+For early exit detection, each result is hashed into a behavior key:
+
+```
+Success: "ok:{output_type}" → "ok:dict", "ok:bool", "ok:list"
+Error: "err:{error_type}:{msg40}" → "err:ValueError:minute: 60 out of range 0-59"
+```
+
+Only the first 40 characters of the error message are kept — enough to distinguish error paths without being sensitive to wording variations later in the message.
+
+## Code Changes
+
+All changes are in `src/vowel/codemode.py`. No new files, no new dependencies.
+
+### Modified methods
+
+| Method | Change |
+|---|---|
+| `explore()` | 2-round loop with early exit; delegates execution to `_execute_plan()` |
+| `_get_exploration_plan()` | Unchanged logic, updated docstring and logfire tags |
+
+### New methods
+
+| Method | Purpose |
+|---|---|
+| `_execute_plan()` | Extracted snippet execution loop (reused by both rounds) |
+| `_get_targeted_exploration_plan()` | Round 2 prompt with prior results + cluster summary |
+| `_build_cluster_summary()` | Programmatic clustering of results into text summary |
+| `_count_new_behaviors()` | Compares behavior keys between prior and new results |
+
+### Backward compatibility
+
+- `explore(func, exploration_rounds=1)` restores exact single-shot behavior
+- Default is `exploration_rounds=2` — existing callers get the improvement automatically
+- `generate()` calls `explore()` without arguments, so it automatically benefits
+- All 478 existing unit tests pass without modification
+
+## Expected Impact
+
+### On strong models (Opus-class)
+
+- Round 1 already produces excellent coverage
+- Round 2 adds **combination discovery** and **exact error message alignment**
+- Net: ~10-15% more snippets, potentially fewer spec refinement rounds (error messages will match exactly)
+
+### On weaker models (Flash/Lite-class)
+
+- Round 1 produces decent but shallow coverage — misses edge cases
+- Round 2 **compensates for weaker static reasoning** by showing actual execution results
+- Net: significant quality improvement, likely converting some FAIL scenarios to PASS
+
+### On benchmark discriminability
+
+With Layer 3 (behavioral discovery) added, benchmarks now measure a higher-order capability: **adaptive reasoning under feedback**. This separates models that can merely read code from models that can learn from execution traces — a much more meaningful distinction for agentic coding systems.
+
+## Relationship to the Full Pipeline
+
+The evidence flow through the pipeline is now:
+
+```
+Round 1 (speculation) → snippets → execute → results
+ ↓
+Round 2 (evidence-based) → snippets → execute → results
+ ↓
+ all exploration results
+ ↓
+Spec Generation ← VerifiedExecutionResults + ErrorResults
+ ↓
+ YAML eval spec
+ ↓
+Validation → RunEvals → coverage check
+ ↓
+Refinement (if needed) ← failure context
+```
+
+Evidence-based reasoning now starts at the **exploration phase** rather than only at spec generation. Since exploration results feed directly into spec generation, any improvement in exploration quality cascades through the entire downstream pipeline.
+
+## Origin
+
+This feature was designed through a three-way analysis between the developer, the implementation agent (GitHub Copilot / Claude Opus 4.6), and ChatGPT. ChatGPT identified the core insight: the pipeline was doing "speculation-based exploration" when it could be doing "evidence-based exploration." The implementation agent confirmed this against the actual codebase, proposed the programmatic clustering approach (Option A) over LLM-based clustering, and implemented the 2-round design.
+
+The key framing that guided the design:
+
+```
+Layer 1: Domain awareness (from function description) ✅ already strong
+Layer 2: Grammar inference (from source code) ✅ already strong
+Layer 3: Behavioral discovery (from runtime feedback) ✅ now added
+```
diff --git a/docs/MONTY_RESEARCH.md b/docs/MONTY_RESEARCH.md
new file mode 100644
index 0000000..3742755
--- /dev/null
+++ b/docs/MONTY_RESEARCH.md
@@ -0,0 +1,984 @@
+# Monty Research Notes
+
+> Bu doküman, `pydantic-monty` projesinin evalspec ekosistemi (vowel eval generation pipeline'ları ve vowel-optimization) ile entegrasyonu için yapılan araştırmanın özetidir. CodeMode, tüm eval generation pipeline'larında kullanılabilecek genel bir mekanizmadır — optimizasyon bunlardan sadece biridir.
+
+## 1. Genel Bakış
+
+**Monty**, Pydantic ekibi tarafından Rust ile yazılmış, minimal ve güvenli bir Python yorumlayıcısıdır. Temel amacı: **AI tarafından üretilen kodu güvenli bir sandbox ortamında çalıştırmak.**
+
+- **Repo:** `pydantic/monty` (GitHub)
+- **PyPI paketi:** `pydantic-monty`
+- **NPM paketi:** `@pydantic/monty`
+- **Lisans:** MIT
+- **Dil:** Rust (PyO3 ile Python bindings, napi-rs ile JS bindings)
+- **Hedef Python sürümü:** 3.14
+
+### Temel Özellikler
+
+| Özellik | Detay |
+|---------|-------|
+| Güvenlik | Filesystem, network, env vars tamamen bloklu — sadece kontrollü external function callbacks |
+| Başlatma süresi | <0.06ms (~60 mikrosaniye) |
+| Performans | CPython'a benzer çalışma hızı |
+| Boyut | ~4.5MB download |
+| Serileştirme | `dump()`/`load()` ile parsed code ve execution state kaydedilebilir |
+| Kaynak limitleri | Süre, bellek, allocation sayısı, recursion derinliği sınırlandırılabilir |
+| Tip kontrolü | Opsiyonel statik tip analizi (Monty'nin kendi type checker'ı) |
+
+## 2. Güvenlik Modeli
+
+Monty, **untrusted/potentially malicious** kodun çalıştırılması için tasarlanmıştır. Güvenlik garantileri:
+
+- **Filesystem erişimi YOK** — Sadece `OSAccess` ile kontrollü sanal dosya sistemi
+- **Network erişimi YOK** — Socket, HTTP vb. hiçbir ağ işlemi yapılamaz
+- **Ortam değişkenleri YOK** — `os.environ`, `os.getenv` yalnızca host callback ile
+- **Subprocess/shell YOK** — `os.system`, `subprocess` vb. yok
+- **Import sistemi kısıtlı** — Sadece izin verilen modüller (sys, typing, asyncio, pathlib, os)
+- **C FFI yok** — Tamamen Rust ile implement edilmiş, unsafe yok
+
+Tüm dış dünya erişimi **external functions** mekanizması üzerinden olur — host tarafı bu fonksiyonları sağlar, sandbox kodu bunları çağırır, host gerçek işlemi yapar ve sonucu sandbox'a döndürür.
+
+## 3. Python API
+
+### 3.1. Kurulum
+
+```bash
+pip install pydantic-monty
+```
+
+### 3.2. Temel Kullanım
+
+```python
+import pydantic_monty
+
+# Basit ifade çalıştırma
+m = pydantic_monty.Monty('1 + 2 * 3')
+result = m.run() # -> 7
+
+# Input değişkenleri ile
+m = pydantic_monty.Monty('x + y', inputs=['x', 'y'])
+result = m.run(inputs={"x": 10, "y": 20}) # -> 30
+
+# Aynı parsed code farklı girdilerle tekrar çalıştırılabilir
+result2 = m.run(inputs={"x": 100, "y": 200}) # -> 300
+```
+
+### 3.3. `Monty` Sınıfı — Constructor
+
+```python
+pydantic_monty.Monty(
+ code: str, # Çalıştırılacak Python kodu
+ *,
+ script_name: str = 'main.py', # Traceback'lerde görünecek isim
+ inputs: list[str] | None = None, # Kod içinde kullanılabilecek input değişken isimleri
+ external_functions: list[str] | None = None, # Kod içinden çağrılabilecek harici fonksiyon isimleri
+ type_check: bool = False, # Statik tip kontrolü yapılsın mı
+ type_check_stubs: str | None = None, # Tip kontrolü için ek stub tanımları
+ dataclass_registry: list[type] | None = None, # Dataclass tip kayıtları
+)
+```
+
+**Raises:**
+- `MontySyntaxError` — Kod parse edilemezse
+- `MontyTypingError` — `type_check=True` ise ve tip hataları varsa
+
+### 3.4. `Monty.run()` — Senkron Çalıştırma
+
+```python
+m.run(
+ *,
+ inputs: dict[str, Any] | None = None, # Input değerleri
+ limits: ResourceLimits | None = None, # Kaynak limitleri
+ external_functions: dict[str, Callable[..., Any]] | None = None, # Harici fonksiyon implementasyonları
+ print_callback: Callable[[Literal['stdout'], str], None] | None = None, # print() çıktısı callback
+ os: Callable[[OsFunction, tuple[Any, ...]], Any] | None = None, # OS erişimi callback
+) -> Any
+```
+
+**Önemli:** GIL serbest bırakılır — paralel çalıştırma mümkün.
+
+### 3.5. External Functions (Harici Fonksiyonlar)
+
+Bu, Monty'nin en güçlü mekanizmasıdır. Sandbox kodu harici bir fonksiyon çağırdığında, çalışma durur, host taraftaki gerçek Python fonksiyonu çalışır ve sonuç sandbox'a döndürülür.
+
+```python
+# Sandbox kodunda "fetch" fonksiyonu çağrılabilir
+m = pydantic_monty.Monty(
+ 'fetch("https://example.com")',
+ external_functions=['fetch']
+)
+
+# Host tarafında gerçek implementasyon
+def fetch(url: str) -> str:
+ return f'Fetched: {url}'
+
+result = m.run(external_functions={"fetch": fetch})
+# -> "Fetched: https://example.com"
+```
+
+**Kritik nokta:** External fonksiyonlar host ortamında çalışır — yani hedef fonksiyonun stdlib, third-party lib, dosya sistemi vb. kullanması sorun olmaz. Monty sadece orkestrasyonu yapar.
+
+### 3.6. İteratif Çalıştırma (start/resume)
+
+External fonksiyon çağrılarında adım adım kontrol sağlar:
+
+```python
+m = pydantic_monty.Monty(
+ 'result = fetch(url)',
+ inputs=['url'],
+ external_functions=['fetch']
+)
+
+# Çalıştırmayı başlat
+progress = m.start(inputs={"url": "https://example.com"})
+
+if isinstance(progress, pydantic_monty.MontySnapshot):
+ # Bir external function çağrısında durdu
+ print(progress.function_name) # -> "fetch"
+ print(progress.args) # -> ("https://example.com",)
+ print(progress.kwargs) # -> {}
+
+ # Sonucu döndürerek devam et
+ progress = progress.resume(return_value="response data")
+
+if isinstance(progress, pydantic_monty.MontyComplete):
+ print(progress.output) # -> Son ifadenin değeri
+```
+
+**İlerleme tipleri:**
+- `MontySnapshot` — External function çağrısı bekliyor
+- `MontyFutureSnapshot` — Birden fazla async future bekliyor
+- `MontyComplete` — Çalışma tamamlandı, `.output` ile sonuç alınır
+
+### 3.7. Asenkron Çalıştırma
+
+```python
+async def main():
+ m = pydantic_monty.Monty(
+ 'await fetch(url)',
+ inputs=['url'],
+ external_functions=['fetch']
+ )
+
+ async def real_fetch(url: str) -> str:
+ async with httpx.AsyncClient() as client:
+ r = await client.get(url)
+ return r.text
+
+ result = await pydantic_monty.run_monty_async(
+ m,
+ inputs={"url": "https://example.com"},
+ external_functions={"fetch": real_fetch},
+ )
+```
+
+### 3.8. REPL Modu
+
+Durum korunarak ardışık kod parçaları çalıştırılabilir:
+
+```python
+repl, output = pydantic_monty.MontyRepl.create('x = 10', inputs=['x'])
+# output = 10 (veya None — son ifadenin değeri)
+
+result1 = repl.feed('x + 5') # -> 15
+result2 = repl.feed('x * 2') # -> 20
+# x hâlâ 10, önceki state korunur
+```
+
+### 3.9. Kaynak Limitleri (ResourceLimits)
+
+```python
+limits = pydantic_monty.ResourceLimits(
+ max_duration_secs=5.0, # Maksimum çalışma süresi (saniye)
+ max_memory=1024 * 1024, # Maksimum heap bellek (byte) — 1MB
+ max_allocations=10000, # Maksimum heap allocation sayısı
+ max_recursion_depth=1000, # Maksimum recursion derinliği (default: 1000)
+ gc_interval=100, # Her N allocation'da GC çalıştır
+)
+
+m = pydantic_monty.Monty('fib(30)', external_functions=['fib'])
+result = m.run(
+ external_functions={"fib": my_fib},
+ limits=limits,
+)
+```
+
+### 3.10. Serileştirme (dump/load)
+
+Parsed code veya çalışma durumu (snapshot) kaydedilebilir:
+
+```python
+# Parsed code'u kaydet
+m = pydantic_monty.Monty('x + 1', inputs=['x'])
+data = m.dump() # -> bytes
+
+# Daha sonra geri yükle (parse maliyeti sıfır)
+m2 = pydantic_monty.Monty.load(data)
+result = m2.run(inputs={"x": 41}) # -> 42
+
+# Snapshot'ü da kaydedebilirsin
+progress = m.start(inputs={"x": 10})
+if isinstance(progress, pydantic_monty.MontySnapshot):
+ snapshot_data = progress.dump() # -> bytes
+ # Farklı process'te bile geri yüklenebilir
+ restored = pydantic_monty.MontySnapshot.load(snapshot_data)
+```
+
+### 3.11. Sanal Dosya Sistemi (OSAccess)
+
+```python
+from pydantic_monty import OSAccess, MemoryFile, CallbackFile
+
+# Bellekte sanal dosyalar oluştur
+fs = OSAccess([
+ MemoryFile('/data/input.csv', content='col1,col2\n1,2\n3,4'),
+ MemoryFile('/data/config.json', content='{"key": "value"}'),
+])
+
+# Sandbox kodunda Path.read_text() vb. kullanılabilir
+m = pydantic_monty.Monty("""
+from pathlib import Path
+data = Path('/data/input.csv').read_text()
+data.split('\\n')
+""")
+
+result = await pydantic_monty.run_monty_async(m, os=fs)
+```
+
+### 3.12. Tip Kontrolü
+
+```python
+# Opsiyonel statik analiz
+m = pydantic_monty.Monty(
+ 'x + "hello"',
+ inputs=['x'],
+ type_check=True,
+ type_check_stubs='x: int', # Input tiplerini belirt
+)
+# MontyTypingError fırlatabilir
+
+# Hata formatları
+try:
+ m.type_check(prefix_code='x: int')
+except pydantic_monty.MontyTypingError as e:
+ print(e.display(format='full', color=True))
+ # format seçenekleri: 'full', 'concise', 'azure', 'json', 'jsonlines',
+ # 'rdjson', 'pylint', 'gitlab', 'github'
+```
+
+## 4. Hata Tipleri
+
+```
+MontyError (base)
+├── MontySyntaxError — Parse hataları
+├── MontyRuntimeError — Çalışma zamanı hataları (ZeroDivisionError, ValueError vb.)
+└── MontyTypingError — Statik tip analizi hataları
+```
+
+### MontyRuntimeError Detayları
+
+```python
+try:
+ m = pydantic_monty.Monty('1 / 0')
+ m.run()
+except pydantic_monty.MontyRuntimeError as e:
+ # İç exception'a eriş
+ inner = e.exception() # -> ZeroDivisionError instance
+
+ # Traceback al
+ frames = e.traceback() # -> list[Frame]
+ for frame in frames:
+ print(f" {frame.filename}:{frame.line}:{frame.column} in {frame.function_name}")
+ print(f" {frame.source_line}")
+
+ # Formatlanmış çıktı
+ print(e.display(format='traceback')) # Full traceback
+ print(e.display(format='type-msg')) # "ZeroDivisionError: division by zero"
+ print(e.display(format='msg')) # "division by zero"
+```
+
+**ÖNEMLİ:** Monty, Python exception'larını birebir eşleştirir. Sandbox içinde oluşan `ZeroDivisionError`, `ValueError`, `TypeError` vb. host tarafına `MontyRuntimeError` içine sarılı olarak gelir; `e.exception()` ile doğru exception tipindeki nesneye erişilebilir.
+
+## 5. Dil Destekleri ve Kısıtlamalar
+
+### 5.1. Desteklenen Python Deyimleri (Statements)
+
+Kaynak: `crates/monty/src/expressions.rs` — `Node` ve `Expr` enum'ları
+
+| Deyim | Notlar |
+|-------|--------|
+| `x = expr` | Basit atama |
+| `x, y = expr` | Tuple unpacking (iç içe dahil: `(a, b), c = ...`) |
+| `first, *rest = expr` | Starred unpacking |
+| `x += expr` (augmented assigns) | `+=`, `-=`, `*=`, `/=`, `//=`, `%=`, `**=`, `&=`, `\|=`, `^=`, `<<=`, `>>=` |
+| `obj[i] = val` | Subscript assignment |
+| `obj.attr = val` | Attribute assignment (dataclass alanları) |
+| `if / elif / else` | Tam destekli |
+| `for target in iter` | `else` bloğu dahil |
+| `while test` | `else` bloğu dahil |
+| `break` | ✅ |
+| `continue` | ✅ |
+| `return` / `return expr` | ✅ |
+| `raise` / `raise Exception(...)` | ✅ |
+| `try / except / else / finally` | Tam hiyerarşi destekli, çoklu `except` |
+| `assert test, msg` | ✅ |
+| `pass` | ✅ |
+| `def func(...)` | `async def` dahil |
+| `global x` | ✅ |
+| `nonlocal x` | ✅ |
+| `import sys` | Sadece whitelist'teki modüller |
+| `from typing import X` | Sadece whitelist'teki modüller |
+| `del` | ❌ Henüz yok |
+| `class MyClass:` | ❌ Henüz yok |
+| `match x:` | ❌ Desteklenmiyor |
+| `with ... as ...:` | ❌ Henüz yok |
+
+### 5.2. Desteklenen İfadeler (Expressions)
+
+| İfade | Notlar |
+|-------|--------|
+| Literaller | `int`, `float`, `str`, `bytes`, `bool`, `None`, `...` |
+| Büyük int'ler | `2**200` gibi i64 aşan değerler (arbitrary precision) |
+| f-string | `f"hello {name!r}"` — format spec dahil |
+| Aritmetik | `+`, `-`, `*`, `/`, `//`, `%`, `**` |
+| Bitwise | `&`, `\|`, `^`, `~`, `<<`, `>>` |
+| Karşılaştırma | `==`, `!=`, `<`, `<=`, `>`, `>=`, `is`, `is not`, `in`, `not in` |
+| Zincirleme karşılaştırma | `a < b < c` — kısa devre değerlendirmeli |
+| Boolean | `and`, `or`, `not` |
+| Unary | `-x`, `+x`, `~x` |
+| Ternary | `x if cond else y` |
+| Walrus | `(x := expr)` |
+| `await expr` | Modül seviyesinde de kullanılabilir (Jupyter tarzı) |
+| List/dict/set literali | `[1,2]`, `{k:v}`, `{1,2}` |
+| List/set/dict comprehension | `[x for x in iter if cond]` |
+| Generator expression | `(x for x in iter)` |
+| Lambda | `lambda x, y: x + y` |
+| Subscript | `obj[i]`, `obj[a:b:c]` |
+| Slice | `obj[::2]` |
+| Attribute erişimi | `obj.attr` (zincirli dahil) |
+| Fonksiyon çağrısı | `f(a, b, *args, key=val, **kwargs)` |
+| Method çağrısı | `obj.method(args)` |
+| `isinstance(obj, Type)` | ✅ |
+
+### 5.3. Desteklenen Yerleşik Tipler (Built-in Types)
+
+```
+bool int float str bytes
+list tuple dict set frozenset
+range slice iter
+type property
+```
+
+Ayrıca:
+- `None`, `True`, `False`, `...` (Ellipsis)
+- `LongInt` — arbitrarily large integers
+- `NamedTuple` — `collections.namedtuple` benzeri (built-in desteği var)
+- `Dataclass` — `@dataclass` decorator'ı ile (host'tan registry ile)
+- `pathlib.Path` — `from pathlib import Path` ile
+
+### 5.4. Desteklenen Builtin Fonksiyonlar
+
+Kaynak: `crates/monty/src/builtins/mod.rs` — `BuiltinsFunctions` enum'u
+
+**Mevcut (✅):**
+```
+abs() all() any() bin() chr()
+divmod() enumerate() filter() getattr() hash()
+hex() id() isinstance() len() map()
+max() min() next() oct() ord()
+pow() print() repr() reversed() round()
+sorted() sum() type() zip()
+```
+
+**Henüz yok / yorum satırı (❌):**
+```
+aiter() anext() ascii() breakpoint()
+callable() compile() dir() eval()
+exec() format() globals() hasattr()
+help() input() issubclass() iter() [kısmen]
+locals() open() setattr() staticmethod()
+classmethod() super() vars() __import__()
+```
+
+**Type constructor olarak kullanılabilenler:**
+```
+bool() int() float() str() bytes()
+list() tuple() dict() set() frozenset()
+range() slice() iter() type() property()
+```
+
+**Exception constructor'ları:**
+```
+Exception BaseException SystemExit KeyboardInterrupt
+ArithmeticError OverflowError ZeroDivisionError
+LookupError IndexError KeyError
+RuntimeError NotImplementedError RecursionError
+AttributeError FrozenInstanceError
+NameError UnboundLocalError
+ValueError UnicodeDecodeError
+ImportError ModuleNotFoundError
+OSError FileNotFoundError FileExistsError
+IsADirectoryError NotADirectoryError
+AssertionError MemoryError StopIteration
+SyntaxError TimeoutError TypeError
+```
+
+### 5.5. Desteklenen Stdlib Modülleri
+
+#### `sys`
+```python
+import sys
+sys.version # "3.14.0 (Monty)"
+sys.version_info # named tuple: (major=3, minor=14, micro=0, ...)
+sys.platform # "monty"
+sys.stdout # marker (gerçek I/O yok)
+sys.stderr # marker (gerçek I/O yok)
+```
+
+#### `typing`
+```python
+from typing import (
+ TYPE_CHECKING, # her zaman False
+ Any, Optional, Union, List, Dict, Tuple, Set,
+ FrozenSet, Callable, Type, Sequence, Mapping,
+ Iterable, Iterator, Generator, ClassVar,
+ Final, Literal, TypeVar, Generic, Protocol,
+ Annotated, Self, Never, NoReturn
+)
+```
+Bunlar runtime'da `Marker` değerleri olarak işlenir — tip anotasyonlarında kullanılabilirler.
+
+#### `asyncio`
+```python
+import asyncio
+asyncio.run(coro) # await coro ile eşdeğer
+asyncio.gather(*coros) # Eşzamanlı birden fazla coroutine çalıştırma
+# create_task, sleep, wait vb. → YOK
+```
+
+#### `os`
+```python
+import os
+os.getenv("KEY", default=None) # host callback üzerinden
+os.environ # host callback üzerinden dict döner
+# os.path, os.listdir, os.system vb. → YOK
+```
+
+#### `pathlib`
+```python
+from pathlib import Path
+p = Path("/data/file.txt")
+
+# Pure methods (I/O gerektirmez — doğrudan çalışır):
+p.name # "file.txt"
+p.stem # "file"
+p.suffix # ".txt"
+p.suffixes # [".txt"]
+p.parent # Path("/data")
+p.parts # ["/", "data", "file.txt"]
+p / "subdir" # Path birleştirme (/ operatörü)
+str(p) # "/data/file.txt"
+
+# Filesystem methods (OSAccess host callback gerektirir):
+p.exists() read_text() read_bytes()
+p.is_file() write_text() write_bytes()
+p.is_dir() mkdir() unlink()
+p.is_symlink() rmdir() iterdir()
+p.stat() rename() resolve()
+p.absolute()
+```
+
+### 5.6. Tip Metodları — Detay
+
+#### `str` metodları
+```
+capitalize casefold center count encode
+endswith find index isalnum isalpha
+isascii isdecimal isdigit isidentifier islower
+isnumeric isspace istitle isupper join
+ljust lower lstrip partition removeprefix
+removesuffix replace rfind rindex rjust
+rpartition rsplit rstrip split splitlines
+startswith strip swapcase title upper zfill
+```
+Ayrıca: `+` (concat), `*` (repeat), `in` (contains), `[]` (index/slice), `len()`, `str()` constructor
+
+#### `list` metodları
+```
+append clear copy count extend index insert pop remove reverse sort
+```
+Ayrıca: `+`, `*`, `in`, `[]`, `len()`, comprehension, unpacking
+
+#### `dict` metodları
+```
+clear copy fromkeys get items keys pop popitem setdefault update values
+```
+Ayrıca: `in`, `[]`, `len()`, comprehension
+
+#### `set` / `frozenset` metodları
+```
+add clear copy difference discard intersection isdisjoint
+issubset issuperset pop remove symmetric_difference union update
+```
+Ayrıca: `|`, `&`, `-`, `^` operatörleri
+
+#### `tuple` metodları
+```
+count index
+```
+Ayrıca: `+`, `*`, `in`, `[]`, `len()`, unpacking
+
+#### `bytes` metodları
+```
+capitalize center count decode endswith
+find fromhex hex index isalnum
+isalpha isascii isdigit islower isspace
+istitle isupper join ljust lower
+lstrip partition removeprefix removesuffix replace
+rfind rindex rjust rpartition rsplit
+rstrip split splitlines startswith strip
+swapcase title upper zfill
+```
+
+#### `int` metodları
+```
+bit_length bit_count to_bytes from_bytes
+```
+Ayrıca: tüm aritmetik ve bitwise operatörler
+
+#### `range`
+```
+range(stop)
+range(start, stop)
+range(start, stop, step)
+```
+Iteration, `in`, `len()`, `list(range(...))` desteklenir.
+
+### 5.7. Desteklenmeyen Özellikler
+
+| Özellik | Durum |
+|---------|-------|
+| **`class` tanımı** | ❌ Henüz yok (geliyor) |
+| **`match` / `case`** | ❌ Planlanmamış |
+| **`with` / bağlam yöneticisi** | ❌ Henüz yok |
+| **`del` deyimi** | ❌ Henüz yok |
+| **`yield from`** | ❌ Henüz yok |
+| **`*args` spread in comprehension** | ⚠️ Kısıtlı |
+| **`eval()`, `exec()`** | ❌ Hiçbir zaman olmayacak |
+| **`__import__`** | ❌ Hiçbir zaman olmayacak |
+| **Third-party kütüphaneler** | ❌ Sandbox içinde kullanılamaz |
+| **`json` modülü** | ❌ Henüz yok (geliyor) |
+| **`dataclasses` modülü (import)** | ❌ Henüz yok; dataclass desteği var ama host'tan |
+| **`collections`, `itertools`, `math`** | ❌ Yok |
+| **`re` (regex)** | ❌ Yok |
+| **`datetime`** | ❌ Yok |
+| **`functools`** | ❌ Yok |
+| **`enum`** | ❌ Yok |
+| **Decorator'lar** | ⚠️ Sadece basit fonksiyon decorator'ları |
+| **`super()`** | ❌ Yok |
+| **`classmethod`, `staticmethod`** | ❌ Yok |
+
+## 6. Mimari (Dahili)
+
+- **Parser:** Ruff'un `ruff_python_parser`'ı kullanılır → AST üretilir
+- **Prepare phase:** AST'den Scope analizi yapılır, isimler namespace index'lerine çözümlenir
+- **Bytecode:** Hazırlanan AST doğrudan bytecode VM'e beslenir (CPython benzeri register VM)
+- **Bellek:** Manuel reference counting (`drop_with_heap`, `clone_with_heap`); GC configurable intervals ile
+- **Serileştirme:** `serde` ile binary format (parsed code + snapshot)
+
+### Crate yapısı
+
+| Crate | İçerik |
+|-------|--------|
+| `crates/monty/` | Çekirdek interpreter (VM, types, builtins, modules) |
+| `crates/monty-python/` | PyO3 Python bindings |
+| `crates/monty-js/` | napi-rs JavaScript bindings |
+| `crates/monty-cli/` | CLI aracı |
+| `crates/monty-type-checking/` | Statik tip analizi |
+| `crates/monty-typeshed/` | Tip stub dosyaları (vendor + custom) |
+| `crates/fuzz/` | Fuzzing testleri |
+
+### Modül whitelist
+
+`import` ifadesi sadece şu modülleri yükleyebilir (kaynak: `modules/mod.rs`):
+
+```
+sys typing asyncio pathlib os
+```
+
+Başka herhangi bir `import X` → `ModuleNotFoundError`.
+
+## 7. PydanticAI Entegrasyonu
+
+Monty, PydanticAI'de **CodeMode** özelliğini güçlendirecek şekilde tasarlanmıştır. LLM sıralı tool çağrıları yapmak yerine, tool'ları fonksiyon olarak çağıran Python kodu yazar ve Monty bunu güvenli şekilde çalıştırır.
+
+```python
+from pydantic_ai import Agent
+from pydantic_ai.toolsets.code_mode import CodeModeToolset
+from pydantic_ai.toolsets.function import FunctionToolset
+
+# Araçları tanımla
+tools = FunctionToolset()
+
+@tools.tool
+async def get_weather(location: str) -> dict:
+ ...
+
+# Agent'ı CodeMode ile oluştur
+agent = Agent(
+ 'anthropic:claude-sonnet-4-5',
+ toolsets=[CodeModeToolset(tools)], # Monty-powered code execution
+)
+
+# Agent Python kodu yazarak tool'ları çağırır
+result = await agent.run("Compare weather in London and Paris")
+```
+
+## 8. Alternatiflere Karşı Pozisyon
+
+| Tech | Dil Tamamlığı | Güvenlik | Başlatma | Maliyet |
+|------|---------------|----------|----------|---------|
+| **Monty** | Kısmi | Katı | 0.06ms | Ücretsiz/OSS |
+| Docker | Tam | İyi | 195ms | Ücretsiz/OSS |
+| Pyodide | Tam | Zayıf | 2800ms | Ücretsiz/OSS |
+| starlark-rust | Çok kısıtlı | İyi | 1.7ms | Ücretsiz/OSS |
+| WASI/Wasmer | Neredeyse tam | Katı | 66ms | Ücretsiz* |
+| Sandboxing servisi (E2B, Modal) | Tam | Katı | 1033ms | Ücretli |
+| YOLO Python (exec) | Tam | Yok | 0.1ms | Ücretsiz/OSS |
+
+**Monty'nin avantajları:** En düşük başlatma süresi + katı güvenlik + kolay kurulum + serileştirme desteği.
+
+## 9. Eval Generation İçin Kullanım Senaryosu
+
+### Problem
+
+Eval generation pipeline'larında (hem tek seferlik generation hem de optimization döngüsünde) LLM agent expected değerleri **tahmin ediyor** — bu özellikle algoritmik fonksiyonlarda halüsinasyona yol açar (ör. `binary_search([1,3,5,7], 5)` için yanlış index döndürme).
+
+### Çözüm: CodeMode Eval Generation
+
+CodeMode, **tüm eval generation pipeline'larında** kullanılabilecek genel bir mekanizmadır. Agent expected değerleri tahmin etmek yerine, Monty sandbox'ında **gerçek fonksiyonu çalıştırarak** ground-truth değerleri elde eder.
+
+**Kullanım alanları:**
+- **Tek seferlik eval generation** — `vowel` CLI veya API ile bir fonksiyon için eval dosyası üretirken
+- **Optimization döngüsü** — GEPA ile prompt optimize ederken her iterasyonda (burada özellikle etkili çünkü yüzlerce eval üretiliyor)
+- **CI/CD pipeline'ları** — Otomatik test üretimi akışlarında
+- **Herhangi bir eval generation çağrısı** — CodeMode, pipeline'dan bağımsız bir altyapı katmanıdır
+
+### Temel Mimari
+
+```
+┌─────────────────────────────────────────────────────────┐
+│ LLM Agent │
+│ "Bu fonksiyon için ilginç test girdileri tasarla" │
+│ │
+│ Agent üretir: │
+│ inputs = [ │
+│ {"x": [1,3,5,7,9], "target": 5}, │
+│ {"x": [], "target": 1}, │
+│ {"x": [1], "target": 1}, │
+│ ] │
+└──────────────────────┬──────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────┐
+│ Monty Sandbox │
+│ │
+│ # Agent tarafından üretilen test harness │
+│ results = [] │
+│ results.append(target_func([1,3,5,7,9], 5)) │
+│ results.append(target_func([], 1)) │
+│ results.append(target_func([1], 1)) │
+│ results │
+│ │
+│ external_functions = {"target_func": real_function} │
+│ limits = ResourceLimits(max_duration_secs=5.0) │
+└──────────────────────┬──────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────┐
+│ Ground-Truth Sonuçlar │
+│ │
+│ results = [2, -1, 0] ← gerçek fonksiyon çıktıları │
+│ │
+│ Bu değerler YAML eval dosyasındaki expected alanına │
+│ yazılır — hallüsinasyon riski sıfır. │
+└─────────────────────────────────────────────────────────┘
+```
+
+### Neden External Function Mekanizması Kritik?
+
+Hedef fonksiyon (ör. `binary_search`) şunları kullanabilir:
+- Stdlib modülleri (`collections`, `itertools`, `math` vb.)
+- Third-party kütüphaneler (`numpy`, `pandas` vb.)
+- Dosya sistemi, network vb.
+
+Monty sandbox'ı bunların hiçbirini desteklemez. **AMA** external function olarak inject edildiğinde, `target_func(...)` çağrısı host tarafındaki gerçek Python fonksiyonunu çalıştırır — yani tüm bağımlılıklar sorunsuz çalışır.
+
+### ExecutorAdapter Protokolü (Taslak)
+
+```python
+from dataclasses import dataclass
+from typing import Any, Protocol
+
+@dataclass
+class ExecutionResult:
+ """Sandbox çalıştırma sonucu."""
+ output: Any # Kodun döndürdüğü değer
+ stdout: str # print() çıktısı
+ success: bool # Hatasız tamamlandı mı
+ error: str | None = None # Hata mesajı (varsa)
+ error_type: str | None = None # Hata tipi (ör. "ValueError")
+ duration_ms: float = 0.0 # Çalışma süresi
+
+class ExecutorAdapter(Protocol):
+ """Kod çalıştırma adaptör protokolü."""
+ async def execute(
+ self,
+ code: str,
+ *,
+ target_function: callable | None = None,
+ inputs: dict[str, Any] | None = None,
+ timeout: float = 5.0,
+ max_memory: int = 10 * 1024 * 1024, # 10MB
+ ) -> ExecutionResult: ...
+
+class MontyExecutor:
+ """Monty tabanlı güvenli kod çalıştırıcı."""
+
+ def __init__(self):
+ import pydantic_monty
+ self._monty = pydantic_monty
+
+ async def execute(
+ self,
+ code: str,
+ *,
+ target_function: callable | None = None,
+ inputs: dict[str, Any] | None = None,
+ timeout: float = 5.0,
+ max_memory: int = 10 * 1024 * 1024,
+ ) -> ExecutionResult:
+ import time
+
+ stdout_lines: list[str] = []
+
+ def print_callback(stream: str, text: str):
+ stdout_lines.append(text)
+
+ # External function listesi oluştur
+ ext_names = ["target_func"] if target_function else []
+ ext_impls = {"target_func": target_function} if target_function else {}
+
+ # Input isimleri
+ input_names = list(inputs.keys()) if inputs else []
+
+ try:
+ m = self._monty.Monty(
+ code,
+ inputs=input_names or None,
+ external_functions=ext_names or None,
+ )
+
+ limits = self._monty.ResourceLimits(
+ max_duration_secs=timeout,
+ max_memory=max_memory,
+ )
+
+ start = time.perf_counter()
+ result = m.run(
+ inputs=inputs,
+ limits=limits,
+ external_functions=ext_impls,
+ print_callback=print_callback,
+ )
+ duration = (time.perf_counter() - start) * 1000
+
+ return ExecutionResult(
+ output=result,
+ stdout="\n".join(stdout_lines),
+ success=True,
+ duration_ms=duration,
+ )
+
+ except self._monty.MontyRuntimeError as e:
+ inner = e.exception()
+ return ExecutionResult(
+ output=None,
+ stdout="\n".join(stdout_lines),
+ success=False,
+ error=str(e),
+ error_type=type(inner).__name__,
+ duration_ms=0.0,
+ )
+ except self._monty.MontySyntaxError as e:
+ return ExecutionResult(
+ output=None,
+ stdout="",
+ success=False,
+ error=str(e),
+ error_type="SyntaxError",
+ duration_ms=0.0,
+ )
+```
+
+### BuiltinExecutor (Geliştirme/Fallback)
+
+```python
+class BuiltinExecutor:
+ """exec() tabanlı çalıştırıcı — sadece güvenilir kodlar için."""
+
+ async def execute(
+ self,
+ code: str,
+ *,
+ target_function: callable | None = None,
+ inputs: dict[str, Any] | None = None,
+ timeout: float = 5.0,
+ max_memory: int = 10 * 1024 * 1024,
+ ) -> ExecutionResult:
+ import io, contextlib, time
+
+ namespace = dict(inputs or {})
+ if target_function:
+ namespace["target_func"] = target_function
+
+ stdout = io.StringIO()
+ start = time.perf_counter()
+
+ try:
+ with contextlib.redirect_stdout(stdout):
+ exec(code, namespace)
+ duration = (time.perf_counter() - start) * 1000
+
+ # Son ifadenin değerini al (eğer varsa)
+ result = namespace.get("__result__", namespace.get("results"))
+
+ return ExecutionResult(
+ output=result,
+ stdout=stdout.getvalue(),
+ success=True,
+ duration_ms=duration,
+ )
+ except Exception as e:
+ return ExecutionResult(
+ output=None,
+ stdout=stdout.getvalue(),
+ success=False,
+ error=str(e),
+ error_type=type(e).__name__,
+ duration_ms=0.0,
+ )
+```
+
+## 10. Entegrasyon Tasarım Kararları
+
+### Açık Sorular
+
+1. **Agent kodu nasıl üretecek?**
+ - Seçenek A: Agent sadece input listesi üretir, harness kodu otomatik oluşturulur
+ - Seçenek B: Agent tam test harness kodunu yazar (daha esnek ama hata riski daha yüksek)
+ - Seçenek C: Hibrit — Agent input + beklenen davranış tanımlar, edge case'ler için raises testi de yazabilir
+
+2. **Exception test etme nasıl olacak?**
+ - `raises` assertion'ları için agent'ın exception beklediğini belirtmesi gerekir
+ - Monty'de try/except destekleniyor, agent try/except yazarak exception tipini yakalayabilir
+
+3. **Mevcut pipeline ile entegrasyon noktası neresi?**
+ - `task.py`'daki `generate_and_score()` akışında, agent YAML ürettikten sonra expected değerleri doğrulamak için Monty kullanılabilir
+ - Veya: Agent doğrudan Monty ile çalışan bir "CodeMode" prompt ile yönlendirilir
+
+4. **Performans etkisi?**
+ - Monty başlatma: ~0.06ms
+ - Her test case çalıştırma: fonksiyonun karmaşıklığına bağlı (host'ta çalışır)
+ - 25 fonksiyon × 20 test case = 500 çalıştırma → toplam <1 saniye ek maliyet
+
+5. **Hangi fonksiyonlar CodeMode'a uygun?**
+ - Deterministik fonksiyonlar (aynı input → aynı output): ✅ İdeal
+ - Yan etkili fonksiyonlar (dosya yazma, API çağrısı): ⚠️ Dikkatli olunmalı
+ - Rastgele çıktılı fonksiyonlar: ❌ Uygun değil (expected value sabitlenmeli)
+
+### Kısıtlamalar ve Çözümler
+
+| Kısıtlama | Etki | Çözüm |
+|-----------|------|-------|
+| Class tanımı yok | Agent class kullanamaz | Fonksiyon + dict / NamedTuple kullan |
+| `json` modülü yok | String serialization zor | Host'a external function olarak delege et |
+| `match` statement yok | Pattern matching yok | if/elif zincirleri kullan |
+| `with` statement yok | Context manager yok | İstisnai durum; hedef fonksiyon host'ta çalışır |
+| `math`, `collections`, `itertools` yok | Sandbox içi hesaplama kısıtlı | Tüm asıl hesaplama host fonksiyonunda yapılır |
+| Sadece 5 modül import edilebilir | `sys`, `typing`, `asyncio`, `pathlib`, `os` | Yeterli — sandbox kodu sadece orkestrasyon yapıyor |
+
+**En kritik çözüm:** Sandbox kodunun amacı karmaşık hesaplama yapmak değil — **sadece test girdilerini organize edip hedef fonksiyonu çağırmak**. Asıl hesaplama external function (hedef fonksiyon) içinde, host tarafında yapılır.
+
+## 11. Örnek: Tam Çalışma Akışı
+
+```python
+# 1. Hedef fonksiyon (test edilecek)
+def binary_search(arr: list[int], target: int) -> int:
+ lo, hi = 0, len(arr) - 1
+ while lo <= hi:
+ mid = (lo + hi) // 2
+ if arr[mid] == target:
+ return mid
+ elif arr[mid] < target:
+ lo = mid + 1
+ else:
+ hi = mid - 1
+ return -1
+
+# 2. Agent'ın ürettiği Monty kodu
+agent_code = """
+results = []
+
+# Normal cases
+results.append({"input": {"arr": [1,3,5,7,9], "target": 5}, "expected": target_func([1,3,5,7,9], 5)})
+results.append({"input": {"arr": [1,3,5,7,9], "target": 1}, "expected": target_func([1,3,5,7,9], 1)})
+results.append({"input": {"arr": [1,3,5,7,9], "target": 9}, "expected": target_func([1,3,5,7,9], 9)})
+
+# Not found
+results.append({"input": {"arr": [1,3,5,7,9], "target": 4}, "expected": target_func([1,3,5,7,9], 4)})
+
+# Edge cases
+results.append({"input": {"arr": [], "target": 1}, "expected": target_func([], 1)})
+results.append({"input": {"arr": [1], "target": 1}, "expected": target_func([1], 1)})
+results.append({"input": {"arr": [1], "target": 2}, "expected": target_func([1], 2)})
+
+results
+"""
+
+# 3. Monty'de çalıştır
+import pydantic_monty
+
+m = pydantic_monty.Monty(
+ agent_code,
+ external_functions=["target_func"],
+)
+
+results = m.run(
+ external_functions={"target_func": binary_search},
+ limits=pydantic_monty.ResourceLimits(max_duration_secs=5.0),
+)
+
+# 4. Sonuç: Ground-truth expected değerlerle test case'ler
+# results = [
+# {"input": {"arr": [1,3,5,7,9], "target": 5}, "expected": 2},
+# {"input": {"arr": [1,3,5,7,9], "target": 1}, "expected": 0},
+# {"input": {"arr": [1,3,5,7,9], "target": 9}, "expected": 4},
+# {"input": {"arr": [1,3,5,7,9], "target": 4}, "expected": -1},
+# {"input": {"arr": [], "target": 1}, "expected": -1},
+# {"input": {"arr": [1], "target": 1}, "expected": 0},
+# {"input": {"arr": [1], "target": 2}, "expected": -1},
+# ]
+```
+
+**Hiçbir expected değer halüsine edilmedi — hepsi gerçek fonksiyon çıktısı.**
+
+## 12. Sonraki Adımlar
+
+1. ~~Monty API'yi tam anla~~ ✅
+2. `ExecutorAdapter` protokolünü finalize et
+3. `MontyExecutor` implementasyonunu yaz
+4. `task.py`'ye CodeMode akışını entegre et
+5. Agent prompt'unu CodeMode için güncelle
+6. 25 referans fonksiyon üzerinde test et
+7. Mevcut "tahmin" modu ile CodeMode'u karşılaştır (A/B)
diff --git a/pyproject.toml b/pyproject.toml
index 3e5db9e..bc13a87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,7 +55,9 @@ logfire = [
]
monty = [
- "pydantic-monty>=0.0.7"
+ "pydantic-monty>=0.0.8"
+ # Require at least 0.0.8: the MontyRepl API changed in that release.
+ # Note that `>=` is a lower bound, not an exact pin.
]
optimize = [
"vowel-optimization"
@@ -84,7 +86,7 @@ target-version = ["py311"]
[tool.ruff]
line-length = 100
target-version = "py311"
-exclude = ["vowel-optimization"]
+exclude = ["vowel-optimization", "benchmark_v1"]
[tool.ruff.lint]
select = [
@@ -118,7 +120,7 @@ ignore_missing_imports = true
python-version = "3.11"
[tool.ty.src]
-exclude = ["vowel-optimization"]
+exclude = ["vowel-optimization", "benchmark_v1"]
[tool.ty.rules]
unresolved-import = "ignore"
@@ -142,3 +144,8 @@ markers = [
"integration: integration tests",
"llm: tests that require LLM API calls",
]
+
+[tool.uv.workspace]
+members = [
+ "pydantic-acp",
+]
diff --git a/pyrightconfig.json b/pyrightconfig.json
index e93018b..19456c7 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -11,6 +11,7 @@
"reportUnknownVariableType": "none",
"reportUnknownMemberType": "none",
"reportUnknownParameterType": "none",
+ "reportAttributeAccessIssue": "none",
"reportAny": "none",
"reportExplicitAny": "none",
"reportMissingParameterType": "none",
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 4ecb1ad..0000000
--- a/pytest.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-[pytest]
-testpaths = tests
-python_files = test_*.py
-python_classes = Test*
-python_functions = test_*
diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
index c01915e..f890ed4 100644
--- a/src/vowel/__init__.py
+++ b/src/vowel/__init__.py
@@ -45,6 +45,7 @@
MontyExecutor,
MontyReplSession,
get_executor,
+ resolve_executors,
)
from .runner import Function, RunEvals
from .utils import (
@@ -93,6 +94,7 @@
"DefaultExecutor",
"DefaultSession",
"get_executor",
+ "resolve_executors",
# CodeMode pipeline
"CodeModeGenerator",
"CodeModeResult",
diff --git a/src/vowel/cli.py b/src/vowel/cli.py
index 114f08b..868b574 100644
--- a/src/vowel/cli.py
+++ b/src/vowel/cli.py
@@ -740,7 +740,7 @@ def on_modified(self, event):
if export_json:
import json
- json_data = summary.json()
+ json_data = summary.to_json()
with open(export_json, "w") as f:
json.dump(json_data, f, indent=2)
if not quiet:
@@ -749,6 +749,10 @@ def on_modified(self, event):
# Failed assertions detail
if summary.failed_results:
console.print()
+
+ all_failures_are_duration = True
+ has_any_failures = False
+
for result in summary.failed_results:
console.print(Panel(result.eval_id, title="Failed Assertions", border_style="yellow"))
@@ -758,6 +762,7 @@ def on_modified(self, event):
]
if failed_assertions:
+ has_any_failures = True
total_assertions = len(case.assertions)
failed_count = len(failed_assertions)
@@ -767,6 +772,12 @@ def on_modified(self, event):
)
for assertion_name, res in failed_assertions:
+ if (
+ "duration" not in assertion_name.lower()
+ and "maxduration" not in assertion_name.lower()
+ ):
+ all_failures_are_duration = False
+
console.print(f"\n [red]x {assertion_name}[/red]")
if res.reason:
reason_lines = str(res.reason).split("\n")
@@ -774,6 +785,20 @@ def on_modified(self, event):
if line.strip():
console.print(f" [dim]{line.strip()}[/dim]")
+ # Inform user if all errors are just duration errors
+ if has_any_failures and all_failures_are_duration:
+ console.print()
+ console.print(
+ Panel(
+ "All failing evaluators are related to duration (MaxDuration). "
+ "You can run the command with `--ignore-duration` to skip performance constraints "
+ "and get a more accurate evaluation of functional correctness.",
+ title="Insight",
+ border_style="cyan",
+ style="cyan",
+ )
+ )
+
console.print()
# CI mode
diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py
index e488827..6ef8256 100644
--- a/src/vowel/codemode.py
+++ b/src/vowel/codemode.py
@@ -31,7 +31,7 @@
from vowel.context import EVAL_SPEC_CONTEXT
from vowel.eval_types import EvalsSource
-from vowel.executor import ExecutionResult, Executor, get_executor
+from vowel.executor import ExecutionResult, Executor, resolve_executors
from vowel.monitoring import enable_monitoring
from vowel.runner import Function, RunEvals
from vowel.spec_validation import (
@@ -192,16 +192,29 @@ class CodeModeGenerator:
def __init__(
self,
- model: str | None = None,
- executor: Executor | None = None,
+ spec_model: str | None = None,
+ exploration_model: str | None = None,
+ default_executor: Executor | None = None,
+ fallback_executor: Executor | None = None,
additional_context: str = "",
min_snippets: int = 15,
**opts,
) -> None:
- self.model = model or os.getenv("MODEL_NAME", "")
- if not self.model:
- logfire.warn("No model specified; set MODEL_NAME env var or pass model=")
- self.executor = executor or get_executor("auto")
+ # Default fallback from kwargs (for backwards compatibility) or environment
+ base_fallback = opts.pop("model", None) or os.getenv("MODEL_NAME", "")
+
+ self.spec_model = spec_model or os.getenv("SPEC_MODEL") or base_fallback
+ self.exploration_model = (
+ exploration_model or os.getenv("EXPLORATION_MODEL") or base_fallback
+ )
+
+ if not self.spec_model or not self.exploration_model:
+ raise ValueError(
+ "Both spec_model and exploration_model must be specified. "
+ "Provide them via constructor/kwargs, or set SPEC_MODEL, EXPLORATION_MODEL, or MODEL_NAME environment variables."
+ )
+
+ self.executor = resolve_executors(default_executor, fallback_executor)
self.additional_context = additional_context
self.min_snippets = min_snippets
self._opts = opts
@@ -212,7 +225,8 @@ def __init__(
logfire.info(
"CodeModeGenerator initialized",
- model=self.model,
+ spec_model=self.spec_model,
+ exploration_model=self.exploration_model,
executor=type(self.executor).__name__,
)
@@ -222,7 +236,7 @@ def __init__(
def explorer_agent(self) -> Agent[None, ExplorationPlan]:
if self._explorer_agent is None:
self._explorer_agent = Agent(
- self.model,
+ self.exploration_model,
output_type=ExplorationPlan,
system_prompt=self._explorer_system_prompt(),
**self._opts,
@@ -233,7 +247,7 @@ def explorer_agent(self) -> Agent[None, ExplorationPlan]:
def spec_agent(self) -> Agent[None, EvalsSource]:
if self._spec_agent is None:
self._spec_agent = Agent(
- self.model,
+ self.spec_model,
output_type=EvalsSource,
system_prompt=self._spec_system_prompt(),
**self._opts,
@@ -306,12 +320,21 @@ def _spec_system_prompt(self) -> str:
async def explore(
self,
func: Function,
+ *,
+ exploration_rounds: int = 2,
) -> list[SnippetResult]:
"""Phase 1: Generate and execute exploration snippets.
- Uses ``create_session()`` to compile the function source **once**,
- then feeds each snippet against the preserved runtime state —
- zero re-parse overhead per snippet.
+ Supports multi-round feedback-guided exploration. Round 1 uses
+ static reasoning (speculation-based). Round 2+ receives a
+ programmatic cluster summary of prior results so the LLM can
+ target unexplored behaviour classes (evidence-based).
+
+ Parameters
+ ----------
+ exploration_rounds:
+ Number of exploration rounds (default 2). Set to 1 to
+ restore single-shot behaviour.
Returns a list of ``SnippetResult`` with real outputs from the
executor.
@@ -320,63 +343,178 @@ async def explore(
"codemode.explore",
func_name=func.name,
executor=type(self.executor).__name__,
+ exploration_rounds=exploration_rounds,
):
- # 1. Ask the LLM for exploration snippets
- plan = await self._get_exploration_plan(func)
+ all_results: list[SnippetResult] = []
- # 2. Compile function source once, feed each snippet
- all_snippets = [
- *((s, "normal") for s in plan.snippets),
- *((s, "error") for s in plan.error_snippets),
- ]
- total = len(all_snippets)
- results: list[SnippetResult] = []
- with self.executor.create_session(func.code) as session:
- for i, (snippet, kind) in enumerate(all_snippets):
- with logfire.span(
- "codemode.execute_snippet",
- index=i,
- kind=kind,
- description=snippet.description,
- ):
- logfire.info(
- "Executing snippet {index}/{total} [{kind}]: {description}",
- index=i + 1,
- total=total,
- kind=kind,
- description=snippet.description,
- code=snippet.code,
+ for round_num in range(1, exploration_rounds + 1):
+ with logfire.span(
+ "codemode.explore_round",
+ round=round_num,
+ prior_results=len(all_results),
+ ):
+ # Get exploration plan (round 2+ includes prior context)
+ if round_num == 1:
+ plan = await self._get_exploration_plan(func)
+ else:
+ cluster_summary = self._build_cluster_summary(all_results)
+ plan = await self._get_targeted_exploration_plan(
+ func,
+ all_results,
+ cluster_summary,
)
+ # Early exit: if no new snippets were produced
+ if not plan.snippets and not plan.error_snippets:
+ logfire.info(
+ "Round {round} produced no new snippets, stopping",
+ round=round_num,
+ )
+ break
- exec_result = session.feed(snippet.code)
-
- sr = SnippetResult.from_execution(snippet, exec_result)
- results.append(sr)
+ # Execute snippets
+ new_results = self._execute_plan(func, plan, round_num)
+ all_results.extend(new_results)
+ # Round 2+: count newly observed behaviour classes (logged only, no early exit)
+ if round_num > 1:
+ new_behaviors = self._count_new_behaviors(
+ all_results[: -len(new_results)],
+ new_results,
+ )
logfire.info(
- "Snippet result: success={success}, output={output}, "
- "duration={duration_ms:.2f}ms",
- success=sr.success,
- output=repr(sr.output)[:200],
- duration_ms=sr.duration_ms,
- error=sr.error,
- error_type=sr.error_type,
+ "Round {round}: {new} new behaviour classes discovered",
+ round=round_num,
+ new=new_behaviors,
)
# Summary log
- successes = sum(1 for r in results if r.success)
- failures = len(results) - successes
+ successes = sum(1 for r in all_results if r.success)
+ failures = len(all_results) - successes
logfire.info(
"Exploration complete: {successes} succeeded, {failures} raised errors",
successes=successes,
failures=failures,
)
- return results
+ return all_results
+
+ def _execute_plan(
+ self,
+ func: Function,
+ plan: ExplorationPlan,
+ round_num: int = 1,
+ ) -> list[SnippetResult]:
+ """Execute all snippets in a plan and return results."""
+ all_snippets = [
+ *((s, "normal") for s in plan.snippets),
+ *((s, "error") for s in plan.error_snippets),
+ ]
+ total = len(all_snippets)
+ results: list[SnippetResult] = []
+ with self.executor.create_session(func.code) as session:
+ for i, (snippet, kind) in enumerate(all_snippets):
+ with logfire.span(
+ "codemode.execute_snippet",
+ index=i,
+ kind=kind,
+ round=round_num,
+ description=snippet.description,
+ ):
+ logfire.info(
+ "Executing snippet {index}/{total} R{round} [{kind}]: {description}",
+ index=i + 1,
+ total=total,
+ round=round_num,
+ kind=kind,
+ description=snippet.description,
+ code=snippet.code,
+ )
+
+ exec_result = session.feed(snippet.code)
+
+ sr = SnippetResult.from_execution(snippet, exec_result)
+ results.append(sr)
+
+ logfire.info(
+ "Snippet result: success={success}, output={output}, "
+ "duration={duration_ms:.2f}ms",
+ success=sr.success,
+ output=repr(sr.output)[:200],
+ duration_ms=sr.duration_ms,
+ error=sr.error,
+ error_type=sr.error_type,
+ )
+ return results
+
+ @staticmethod
+ def _build_cluster_summary(results: list[SnippetResult]) -> str:
+ """Build a deterministic cluster summary from exploration results.
+
+ Groups results by output type / error type and formats a concise
+ summary for the Round 2 exploration prompt.
+ """
+ # -- Success clusters --
+ success_types: dict[str, int] = {}
+ for r in results:
+ if r.success:
+ t = type(r.output).__name__
+ success_types[t] = success_types.get(t, 0) + 1
+
+ # -- Error clusters --
+ error_clusters: dict[str, list[str]] = {}
+ for r in results:
+ if not r.success and r.error_type:
+ msgs = error_clusters.setdefault(r.error_type, [])
+ prefix = (r.error or "")[:60]
+ if prefix not in msgs:
+ msgs.append(prefix)
+
+ # -- Already-tried snippets (to avoid repeats) --
+ tried_codes = [r.code.strip() for r in results]
+
+ lines = ["## Observed Behaviour Clusters\n"]
+
+ lines.append("### Success clusters")
+ if success_types:
+ for t, count in sorted(success_types.items()):
+ lines.append(f"- output type `{t}`: {count} cases")
+ else:
+ lines.append("- (none)")
+
+ lines.append("\n### Error clusters")
+ if error_clusters:
+ for etype, msgs in sorted(error_clusters.items()):
+ lines.append(f"- `{etype}` ({len(msgs)} distinct messages):")
+ for m in msgs:
+ lines.append(f' - "{m}"')
+ else:
+ lines.append("- (none)")
+
+ lines.append(f"\n### Already tried ({len(tried_codes)} snippets — do NOT repeat these)")
+ for code in tried_codes:
+ lines.append(f"- `{code}`")
+
+ return "\n".join(lines)
+
+ @staticmethod
+ def _count_new_behaviors(
+ prior: list[SnippetResult],
+ new: list[SnippetResult],
+ ) -> int:
+ """Count how many new behaviour classes the new results introduced."""
+
+ def _behavior_key(r: SnippetResult) -> str:
+ if r.success:
+ return f"ok:{type(r.output).__name__}"
+ return f"err:{r.error_type}:{(r.error or '')[:40]}"
+
+ prior_keys = {_behavior_key(r) for r in prior}
+ new_keys = {_behavior_key(r) for r in new}
+ return len(new_keys - prior_keys)
async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
- """Ask the LLM for exploration snippets."""
- with logfire.span("codemode.llm_explore", func_name=func.name):
+ """Ask the LLM for exploration snippets (Round 1 — static reasoning)."""
+ with logfire.span("codemode.llm_explore", func_name=func.name, round=1):
prompt = f"""Explore the following function by writing test snippets:
{func.name}
@@ -393,7 +531,58 @@ async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
plan = result.output
logfire.info(
- "LLM produced {normal} normal + {error} error snippets",
+ "Round 1: LLM produced {normal} normal + {error} error snippets",
+ normal=len(plan.snippets),
+ error=len(plan.error_snippets),
+ snippets=[s.description for s in plan.snippets],
+ error_snippets=[s.description for s in plan.error_snippets],
+ )
+ return plan
+
+ async def _get_targeted_exploration_plan(
+ self,
+ func: Function,
+ prior_results: list[SnippetResult],
+ cluster_summary: str,
+ ) -> ExplorationPlan:
+ """Ask the LLM for targeted snippets (Round 2 — evidence-based)."""
+ with logfire.span("codemode.llm_explore", func_name=func.name, round=2):
+ prompt = f"""You previously explored `{func.name}` and the snippets were
+executed. Below are the ACTUAL results and a cluster summary.
+
+Your job now is to find **new behaviour classes** that were NOT covered
+in Round 1. Focus on:
+- Syntax / input combinations not yet tried
+- Edge cases at boundaries between observed clusters
+- Error paths whose exact error type or message differs from expectation
+- Interactions between parameters / sub-expressions
+
+{func.name}
+
+{func.code}
+
+{func.description}
+
+
+{chr(10).join(r.to_context_block() for r in prior_results)}
+
+
+
+{cluster_summary}
+
+
+RULES:
+- Do NOT repeat any snippet from the "Already tried" list.
+- Produce 8–12 NEW normal snippets targeting uncovered behaviour.
+- Produce 3–5 NEW error snippets targeting untried error paths.
+- Same strict rules as before: no try/except, real function name,
+ one scenario per snippet, last expression captured."""
+
+ result = await self.explorer_agent.run(prompt)
+ plan = result.output
+
+ logfire.info(
+ "Round 2: LLM produced {normal} normal + {error} error snippets",
normal=len(plan.snippets),
error=len(plan.error_snippets),
snippets=[s.description for s in plan.snippets],
@@ -471,13 +660,21 @@ async def generate_spec(
REQUIREMENTS:
-- Use {func.name} as eval_id.
+- The top-level YAML key MUST be `{func.name}` (the function name).
- Generate at least {max(len(exploration_results), 5)} diverse test cases.
- Use the EXACT outputs from the execution results above.
- You MUST generate exactly {len(error_results)} raises cases — one for
each RAISED result above. The spec is invalid without them.
- Cover normal, edge, and error cases.
- In assertions, use `input` (NOT `inputs`) for accessing input values.
+
+YAML FORMAT — STRICT RULES (violations cause parse failure):
+- NEVER use YAML tags: `!!python/tuple`, `!!python/object`, `!!binary`,
+ `!!omap`, `!!str`, `!!int`, `!!float`, or ANY `!!` tag whatsoever.
+ Plain YAML scalars and sequences only. `yaml.safe_load()` will be used
+ to parse the output — it rejects all `!!` tags and will hard-fail.
+- Represent tuples as YAML sequences (lists).
+- NEVER emit `!!python/...` or any non-standard YAML type annotation.
{refinement_block}"""
logfire.info(
@@ -490,11 +687,13 @@ async def generate_spec(
result = await self.spec_agent.run(prompt)
yaml_spec = result.output.yaml_spec
- # Sanitize: strip YAML tags that safe_load rejects
+ # Sanitize: strip every `!!tag` that starts a YAML node — safe_load
+ # rejects anything outside its core set (!!python/tuple, !!binary,
+ # !!omap, ...), and untagged values fall back to plain YAML types.
+ # The lookbehind keeps "!!" inside a scalar (e.g. "wow!!") untouched.
import re
- yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec)
- yaml_spec = re.sub(r"!!binary\b", "", yaml_spec)
+ yaml_spec = re.sub(r"(?<![^\s\[{,])!![^\s\[\]{},]+", "", yaml_spec)
# Validate YAML syntax
yaml.safe_load(yaml_spec)
@@ -581,15 +780,18 @@ async def generate(
Pipeline::
- Phase 1: explore() (once)
+ Phase 1: explore() (2 rounds by default)
+ Round 1 — static reasoning (speculation-based)
+ Round 2 — targeted exploration (evidence-based)
Phase 2: generate_spec() (may loop)
Phase 3: validate via RunEvals (per attempt)
Phase 4: refine on failure (up to N rounds)
Phase 5: inject_durations() (once, at end)
- Exploration (Phase 1) runs once — the ground-truth snippet results
- don't change. Only spec generation (Phase 2) is re-run on failure,
- with a failure report injected into the prompt.
+ Exploration (Phase 1) runs in two rounds. Round 1 uses static
+ reasoning; Round 2 receives a cluster summary of Round 1 results
+ and targets uncovered behaviour classes. Only spec generation
+ (Phase 2) is re-run on validation failure.
Parameters
----------
@@ -618,7 +820,8 @@ async def generate(
with logfire.span(
"codemode.pipeline",
func_name=func.name,
- model=self.model,
+ spec_model=self.spec_model,
+ exploration_model=self.exploration_model,
executor=type(self.executor).__name__,
):
t0 = time.perf_counter()
@@ -726,7 +929,7 @@ async def generate(
elapsed = (time.perf_counter() - t0) * 1000
logfire.info(
- "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={rounds})",
+ "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})",
elapsed=elapsed,
func_name=func.name,
exploration_count=len(exploration_results),
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
index 241a71d..e9cc14d 100644
--- a/src/vowel/eval_types.py
+++ b/src/vowel/eval_types.py
@@ -20,14 +20,14 @@
EvalsFile: Root model for YAML file parsing
"""
-import logfire
import os
+import typing
from typing import Any, Literal
+import logfire
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic.experimental.missing_sentinel import MISSING
-
# =============================================================================
# LLM Output Models
# =============================================================================
@@ -106,6 +106,63 @@
assertion: "len(output) == 2"
"""
+SAFE_ASSERTION_BUILTINS = {
+ "abs": abs,
+ "all": all,
+ "any": any,
+ "bool": bool,
+ "dict": dict,
+ "enumerate": enumerate,
+ "float": float,
+ "int": int,
+ "isinstance": isinstance,
+ "len": len,
+ "list": list,
+ "max": max,
+ "min": min,
+ "range": range,
+ "round": round,
+ "set": set,
+ "sorted": sorted,
+ "str": str,
+ "sum": sum,
+ "tuple": tuple,
+ "type": type,
+ "zip": zip,
+}
+
+SAFE_TYPE_NAMES: dict[str, Any] = {
+ "Any": Any,
+ "None": None,
+ "bool": bool,
+ "bytes": bytes,
+ "dict": dict,
+ "float": float,
+ "frozenset": frozenset,
+ "int": int,
+ "list": list,
+ "object": object,
+ "set": set,
+ "str": str,
+ "tuple": tuple,
+ "typing": typing,
+}
+# Public typing names only: dir(typing) also lists typing's own imports (sys, ...).
+# NOTE: the "typing" entry above still exposes them via attribute access.
+SAFE_TYPE_NAMES.update({name: getattr(typing, name) for name in typing.__all__})
+
+
+def _eval_assertion_restricted(assertion: str, env: dict[str, Any]) -> bool:
+ namespace = {"__builtins__": SAFE_ASSERTION_BUILTINS}
+ namespace.update(env)
+ return bool(eval(assertion, namespace, namespace))
+
+
+def _eval_type_restricted(type_expr: str) -> Any:
+ namespace = {"__builtins__": {}}
+ namespace.update(SAFE_TYPE_NAMES)
+ return eval(type_expr, namespace, namespace)
+
class EvalsSource(BaseModel):
"""LLM output model for YAML eval specification."""
@@ -203,7 +260,11 @@ class IsInstanceCase(BaseModel):
)
def evaluate(self, output: Any) -> bool:
- return isinstance(output, eval(self.type))
+ try:
+ expected = _eval_type_restricted(self.type)
+ except Exception:
+ expected = eval(self.type)
+ return isinstance(output, expected)
class AssertionCase(BaseModel):
@@ -258,7 +319,10 @@ class AssertionCase(BaseModel):
def evaluate(self, input: Any, output: Any) -> bool:
env = {"input": input, "output": output}
- return eval(self.assertion, env, env)
+ try:
+ return _eval_assertion_restricted(self.assertion, env)
+ except Exception:
+ return bool(eval(self.assertion, env, env))
class DurationCase(BaseModel):
@@ -774,7 +838,7 @@ def model_validate(cls, obj, **kwargs):
def get_evals(self) -> dict[str, Evals]:
result = {}
- extras = getattr(self, "__pydantic_extra__", None) or {}
+ extras = getattr(self, "__pydantic_extra__", {})
for key, value in extras.items():
if key == "fixtures":
continue
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index f9fc79b..5e78ecd 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -23,6 +23,7 @@
from contextlib import suppress
from dataclasses import dataclass
+import logfire
from pydantic import ValidationError
from pydantic.type_adapter import TypeAdapter
from pydantic_ai.settings import ModelSettings
@@ -30,6 +31,63 @@
MONTY_AVAILABLE = bool(importlib.util.find_spec("pydantic-monty"))
+SAFE_ASSERTION_BUILTINS = {
+ "abs": abs,
+ "all": all,
+ "any": any,
+ "bool": bool,
+ "dict": dict,
+ "enumerate": enumerate,
+ "float": float,
+ "int": int,
+ "isinstance": isinstance,
+ "len": len,
+ "list": list,
+ "max": max,
+ "min": min,
+ "range": range,
+ "round": round,
+ "set": set,
+ "sorted": sorted,
+ "str": str,
+ "sum": sum,
+ "tuple": tuple,
+ "type": type,
+ "zip": zip,
+}
+
+SAFE_TYPE_NAMES = {
+ "Any": typing.Any,
+ "None": None,
+ "bool": bool,
+ "bytes": bytes,
+ "dict": dict,
+ "float": float,
+ "frozenset": frozenset,
+ "int": int,
+ "list": list,
+ "object": object,
+ "set": set,
+ "str": str,
+ "tuple": tuple,
+ "typing": typing,
+}
+# Public typing names only: dir(typing) also lists typing's own imports (sys, ...).
+# NOTE: the "typing" entry above still exposes them via attribute access.
+SAFE_TYPE_NAMES.update({name: getattr(typing, name) for name in typing.__all__})
+
+
+def _eval_assertion_restricted(condition: str, inputs: dict[str, typing.Any]) -> bool:
+ env = {"__builtins__": SAFE_ASSERTION_BUILTINS}
+ env.update(inputs)
+ return bool(eval(condition, env, env))
+
+
+def _eval_type_restricted(type_expr: str) -> typing.Any:
+ env = {"__builtins__": {}}
+ env.update(SAFE_TYPE_NAMES)
+ return eval(type_expr, env, env)
+
def prepare_env_and_condition(ctx: EvaluatorContext, condition: str) -> tuple[dict, str]:
"""
@@ -123,6 +181,20 @@ def eval_python(self, condition: str, inputs: dict) -> EvaluationReason:
)
except Exception:
+ pass
+
+ try:
+ if _eval_assertion_restricted(self.condition, inputs):
+ return EvaluationReason(
+ value=True, reason=f"Assertion passed for condition: {condition}"
+ )
+ except Exception as exc:
+ logfire.info(
+ "Restricted assertion eval failed; falling back to raw eval",
+ condition=self.condition,
+ error_type=type(exc).__name__,
+ error=str(exc),
+ )
with suppress(Exception):
if eval(self.condition, inputs, inputs):
return EvaluationReason(
@@ -147,15 +219,8 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
"""Validate that output matches the expected type."""
if isinstance(ctx.output, dict) and "_exception" in ctx.output:
return EvaluationReason(value=True, reason="Skipped (exception case)")
- type_env = {
- "typing": typing,
- "__import__": None,
- "eval": None,
- "exec": None,
- "compile": None,
- }
try:
- expected_type = eval(self.type, type_env, type_env)
+ expected_type = _eval_type_restricted(self.type)
ta = TypeAdapter(expected_type)
except Exception:
return EvaluationReason(
@@ -325,7 +390,9 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
)
if self.expected_exception_match:
exception_message = str(actual_exception)
- if not re.search(self.expected_exception_match, exception_message, re.I):
+ if self.expected_exception_match != exception_message and not re.search(
+ self.expected_exception_match, exception_message, re.I
+ ):
return EvaluationReason(
value=False,
reason=(
diff --git a/src/vowel/executor.py b/src/vowel/executor.py
index 9de81e1..fe14dbd 100644
--- a/src/vowel/executor.py
+++ b/src/vowel/executor.py
@@ -79,6 +79,8 @@
from dataclasses import dataclass
from typing import Any, Literal, Protocol, runtime_checkable
+import logfire as _logfire
+
NEST_AVAILABLE = importlib.util.find_spec("nest_asyncio") is not None
MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None
@@ -298,12 +300,9 @@ def __init__(
def _print_callback(_stream: str, text: str) -> None:
stdout_lines.append(text)
- # Compile + execute setup code (function definitions, imports, etc.)
- self._repl, _init_output = pydantic_monty.MontyRepl.create(
- setup_code,
- limits=self._limits,
- print_callback=_print_callback,
- )
+ # Create empty REPL and initialize it with setup code
+ self._repl = pydantic_monty.MontyRepl(limits=self._limits)
+ self._repl.feed_run(setup_code, print_callback=_print_callback)
self._setup_stdout = "\n".join(stdout_lines)
def feed(self, code: str) -> ExecutionResult:
@@ -315,11 +314,11 @@ def _print_callback(_stream: str, text: str) -> None:
t0 = time.perf_counter()
try:
- if not self._repl:
+ if not getattr(self, "_repl", None):
# TODO: wrap with custom exception and detailed message
raise ValueError("Repl not found.")
else:
- output = self._repl.feed(code, print_callback=_print_callback)
+ output = self._repl.feed_run(code, print_callback=_print_callback)
duration_ms = (time.perf_counter() - t0) * 1000
return ExecutionResult(
output=output,
@@ -364,7 +363,8 @@ def _print_callback(_stream: str, text: str) -> None:
def close(self) -> None:
"""Release the REPL instance."""
- self._repl = None # type: ignore[assignment]
+ # TODO: confirm whether the REPL instance needs to be released explicitly
+ # self._repl = None # type: ignore
def __enter__(self) -> MontyReplSession:
return self
@@ -377,8 +377,6 @@ def __exit__(self, *_: Any) -> None:
# FallbackSession — Monty with auto-fallback to DefaultSession
# ---------------------------------------------------------------------------
-import logfire as _logfire
-
class FallbackSession:
"""Session that tries MontyReplSession first, falls back to DefaultSession.
@@ -401,12 +399,14 @@ def __init__(
*,
timeout: float = 5.0,
max_memory: int = 10 * 1024 * 1024,
+ fallback_executor: Executor | None = None,
) -> None:
self._setup_code = setup_code
self._timeout = timeout
self._max_memory = max_memory
+ self._fallback_executor = fallback_executor or DefaultExecutor()
self._monty_session: MontyReplSession | None = None
- self._default_session: DefaultSession | None = None
+ self._fallback_session: ExecutionSession | None = None
self._monty_failed_permanently = False
try:
@@ -417,52 +417,54 @@ def __init__(
)
except Exception as exc:
_logfire.info(
- "Monty session creation failed ({exc_type}: {exc_msg}), falling back to DefaultSession",
+ "Monty session creation failed ({exc_type}: {exc_msg}), falling back to {fallback}",
exc_type=type(exc).__name__,
exc_msg=str(exc),
+ fallback=type(self._fallback_executor).__name__,
)
self._monty_failed_permanently = True
- self._default_session = DefaultSession(
+ self._fallback_session = self._fallback_executor.create_session(
setup_code,
timeout=timeout,
max_memory=max_memory,
)
- def _get_default_session(self) -> DefaultSession:
- """Lazily create the DefaultSession (only when first needed)."""
- if self._default_session is None:
- self._default_session = DefaultSession(
+ def _get_fallback_session(self) -> ExecutionSession:
+ """Lazily create the fallback session (only when first needed)."""
+ if self._fallback_session is None:
+ self._fallback_session = self._fallback_executor.create_session(
self._setup_code,
timeout=self._timeout,
max_memory=self._max_memory,
)
- return self._default_session
+ return self._fallback_session
def feed(self, code: str) -> ExecutionResult:
- """Execute *code*, falling back to DefaultSession on Monty gaps."""
+ """Execute *code*, falling back to the configured session on Monty gaps."""
# Session-level fallback — Monty never worked
if self._monty_failed_permanently:
- return self._get_default_session().feed(code)
+ return self._get_fallback_session().feed(code)
assert self._monty_session is not None
result = self._monty_session.feed(code)
# Snippet-level fallback — ModuleNotFoundError means Monty
- # doesn't have that stdlib module; retry with DefaultSession.
+ # doesn't have that stdlib module; retry with fallback session.
if not result.success and result.error_type == "ModuleNotFoundError":
_logfire.info(
- "Monty ModuleNotFoundError, retrying snippet with DefaultSession: {error}",
+ "Monty ModuleNotFoundError, retrying snippet with {fallback}: {error}",
+ fallback=type(self._fallback_executor).__name__,
error=result.error,
)
- return self._get_default_session().feed(code)
+ return self._get_fallback_session().feed(code)
return result
def close(self) -> None:
if self._monty_session is not None:
self._monty_session.close()
- if self._default_session is not None:
- self._default_session.close()
+ if self._fallback_session is not None:
+ self._fallback_session.close()
def __enter__(self) -> FallbackSession:
return self
@@ -500,11 +502,12 @@ class MontyExecutor:
If ``pydantic-monty`` is not installed.
"""
- def __init__(self) -> None:
+ def __init__(self, fallback_executor: Executor | None = None) -> None:
if not MONTY_AVAILABLE:
raise ImportError(
'MontyExecutor requires pydantic-monty. Install it with: pip install "vowel[monty]"'
)
+ self._fallback_executor = fallback_executor or DefaultExecutor()
async def execute(
self,
@@ -675,6 +678,7 @@ def create_session(
setup_code,
timeout=timeout,
max_memory=max_memory,
+ fallback_executor=self._fallback_executor,
)
@@ -918,6 +922,142 @@ def create_session(
)
+class ResolvedExecutor:
+    """Executor wrapper that repeats a call on ``fallback`` whenever ``primary`` raises."""
+
+    def __init__(self, primary: Executor, fallback: Executor) -> None:
+        self.primary = primary
+        self.fallback = fallback
+
+    def _log_fallback(self, event: str, exc: Exception) -> None:
+        """Log the primary failure, including its message, before switching executors."""
+        _logfire.info(
+            "Primary executor {primary} {event} {exc_type}: {exc_msg}; falling back to {fallback}",
+            primary=type(self.primary).__name__,
+            event=event,
+            exc_type=type(exc).__name__,
+            exc_msg=str(exc),
+            fallback=type(self.fallback).__name__,
+        )
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Run *code* on ``primary``; if it raises, log the error and await ``fallback``."""
+        try:
+            return await self.primary.execute(
+                code,
+                inputs=inputs,
+                external_functions=external_functions,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        except Exception as exc:  # noqa: BLE001
+            # Any primary failure triggers the fallback, so the catch is deliberately broad.
+            self._log_fallback("raised", exc)
+            return await self.fallback.execute(
+                code,
+                inputs=inputs,
+                external_functions=external_functions,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+
+    def execute_sync(
+        self,
+        code: str,
+        *,
+        inputs: dict[str, Any] | None = None,
+        external_functions: dict[str, Callable[..., Any]] | None = None,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionResult:
+        """Blocking counterpart of ``execute``: try ``primary``, then ``fallback`` on error."""
+        try:
+            return self.primary.execute_sync(
+                code,
+                inputs=inputs,
+                external_functions=external_functions,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        except Exception as exc:  # noqa: BLE001
+            self._log_fallback("raised", exc)
+            return self.fallback.execute_sync(
+                code,
+                inputs=inputs,
+                external_functions=external_functions,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+
+    def create_session(
+        self,
+        setup_code: str,
+        *,
+        timeout: float = 5.0,
+        max_memory: int = 10 * 1024 * 1024,
+    ) -> ExecutionSession:
+        """Open a session on ``primary``; if that raises, open it on ``fallback`` instead."""
+        try:
+            return self.primary.create_session(
+                setup_code,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+        except Exception as exc:  # noqa: BLE001
+            # The session returned from here is created by ``fallback`` only.
+            self._log_fallback("session creation raised", exc)
+            return self.fallback.create_session(
+                setup_code,
+                timeout=timeout,
+                max_memory=max_memory,
+            )
+
+
+def resolve_executors(
+    executor: Executor | None = None,
+    fallback_executor: Executor | None = None,
+) -> Executor:
+    """Return one Executor pairing *executor* with *fallback_executor* (Monty-first defaults)."""
+    fallback = fallback_executor or DefaultExecutor()
+
+    if isinstance(executor, ResolvedExecutor):
+        if fallback_executor is None:
+            return executor
+        return ResolvedExecutor(executor.primary, fallback)
+
+    if executor is None:
+        if MONTY_AVAILABLE:
+            return MontyExecutor(fallback_executor=fallback)
+        import warnings
+
+        warnings.warn(
+            "pydantic-monty not installed; using fallback executor "
+            f'{type(fallback).__name__} (no sandboxing). Install with: pip install "vowel[monty]"',
+            stacklevel=2,
+        )
+        return fallback
+
+    if executor is fallback:
+        return executor  # one object in both roles: never wire an executor to itself
+
+    if isinstance(executor, DefaultExecutor) and fallback_executor is None:
+        return executor
+
+    if isinstance(executor, MontyExecutor):
+        if fallback_executor is not None:  # otherwise keep the fallback it already has
+            executor._fallback_executor = fallback_executor  # type: ignore[attr-defined]
+        return executor
+    return ResolvedExecutor(executor, fallback)
+
+
# ---------------------------------------------------------------------------
# Factory
# ---------------------------------------------------------------------------
diff --git a/src/vowel/runner.py b/src/vowel/runner.py
index 4018fcd..9906018 100644
--- a/src/vowel/runner.py
+++ b/src/vowel/runner.py
@@ -36,7 +36,8 @@ def my_func(x):
from pydantic import BaseModel, Field
from .eval_types import Evals, EvalsFile, FixtureDefinition
-from .utils import EvalSummary
+from .executor import Executor
+from .utils import EvalSummary, EvalsBundle
from .utils import run_evals as _run_evals
_T = TypeVar("_T", bound=Any)
@@ -129,7 +130,7 @@ def _sanitize_code(code: str) -> str:
code = code.replace('\\"', '"').replace("\\'", "'")
# 2. Remove typing imports of builtin generics
- _BUILTIN_GENERICS = {
+ _builtin_generics = {
"Dict",
"List",
"Tuple",
@@ -146,7 +147,7 @@ def _sanitize_code(code: str) -> str:
def _clean_typing_import(m: _re.Match) -> str:
names = [n.strip() for n in m.group(1).split(",")]
- remaining = [n for n in names if n not in _BUILTIN_GENERICS]
+ remaining = [n for n in names if n not in _builtin_generics]
if not remaining:
return "" # remove the entire import line
return f"from typing import {', '.join(remaining)}"
@@ -281,7 +282,7 @@ class RunEvals:
def __init__(
self,
- source: str | Path | dict | EvalsFile | Evals | Sequence[Evals],
+ source: str | Path | dict | EvalsFile | EvalsBundle | Evals | Sequence[Evals],
*,
functions: dict[str, Callable] | None = None,
filter_funcs: list[str] | None = None,
@@ -292,6 +293,8 @@ def __init__(
dict[str, Callable | tuple[Callable, Callable | None] | FixtureDefinition] | None
) = None,
ignore_duration: bool = False,
+ executor: Executor | None = None,
+ fallback_executor: Executor | None = None,
):
self._source = source
self._functions = functions or {}
@@ -301,6 +304,8 @@ def __init__(
self._serial_fn = serial_fn or {}
self._fixtures = fixtures or {}
self._ignore_duration = ignore_duration
+ self._executor = executor
+ self._fallback_executor = fallback_executor
@classmethod
def from_file(cls, path: str | Path) -> "RunEvals":
@@ -318,6 +323,22 @@ def from_file(cls, path: str | Path) -> "RunEvals":
"""
return cls(str(path))
+ @classmethod
+ def from_bundle(cls, bundle: EvalsBundle) -> "RunEvals":
+ """
+ Create from an EvalsBundle object.
+
+ Args:
+ bundle: EvalsBundle instance, passed to the constructor as its source
+
+ Returns:
+ RunEvals instance
+
+ Example:
+ RunEvals.from_bundle(bundle).run()
+ """
+ return cls(bundle)
+
@classmethod
def from_source(cls, source: str | dict | EvalsFile) -> "RunEvals":
"""
@@ -565,6 +586,8 @@ def run(self) -> EvalSummary:
serial_fn=self._serial_fn,
fixtures=self._fixtures,
ignore_duration=self._ignore_duration,
+ executor=self._executor,
+ fallback_executor=self._fallback_executor,
)
def ignore_duration(self) -> "RunEvals":
@@ -579,3 +602,14 @@ def ignore_duration(self) -> "RunEvals":
"""
self._ignore_duration = True
return self
+
+ def with_executor(
+ self,
+ executor: Executor | None = None,
+ *,
+ fallback_executor: Executor | None = None,
+ ) -> "RunEvals":
+ """Overwrite both executor preferences (omitted ones become None) and return self."""
+ self._executor = executor
+ self._fallback_executor = fallback_executor
+ return self
diff --git a/src/vowel/spec_validation.py b/src/vowel/spec_validation.py
index d559dda..293762c 100644
--- a/src/vowel/spec_validation.py
+++ b/src/vowel/spec_validation.py
@@ -12,7 +12,7 @@
import logfire
import yaml
-from vowel.executor import Executor, get_executor
+from vowel.executor import Executor, resolve_executors
from vowel.runner import Function
from vowel.utils import EvalSummary
@@ -70,6 +70,7 @@ def inject_durations(
func: Function,
executor: Executor,
*,
+ fallback_executor: Executor | None = None,
buffer_pct: float = 0.5,
floor_ms: float = 10.0,
) -> str:
@@ -97,6 +98,8 @@ def inject_durations(
if not isinstance(spec, dict):
return yaml_spec
+ executor = resolve_executors(executor, fallback_executor)
+
try:
session = executor.create_session(func.code)
except Exception:
@@ -127,13 +130,14 @@ def inject_durations(
)
case["duration"] = round(dur, 1)
- return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+ return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
def validate_expected_values(
yaml_spec: str,
func: Function,
executor: Executor | None = None,
+ fallback_executor: Executor | None = None,
) -> str:
"""Validate and fix expected values in a YAML spec by executing cases.
@@ -152,14 +156,14 @@ def validate_expected_values(
func:
Function to execute.
executor:
- Executor backend. Defaults to ``get_executor("auto")``.
+ Executor backend. Defaults to Monty-first with Default fallback.
Returns
-------
str
Fixed YAML spec with corrected expected values.
"""
- executor = executor or get_executor("auto")
+ executor = resolve_executors(executor, fallback_executor)
spec = yaml.safe_load(yaml_spec)
if not isinstance(spec, dict):
@@ -189,15 +193,19 @@ def validate_expected_values(
result = session.feed(call_code)
# --- Fix expected values ---
- if "expected" in case and not case.get("raises"):
- if result.success and result.output != case["expected"]:
- logfire.info(
- "Fixing expected value for case: {expected} → {actual}",
- expected=repr(case["expected"]),
- actual=repr(result.output),
- )
- case["expected"] = result.output
- fixes_applied += 1
+ if (
+ "expected" in case
+ and not case.get("raises")
+ and result.success
+ and result.output != case["expected"]
+ ):
+ logfire.info(
+ "Fixing expected value for case: {expected} → {actual}",
+ expected=repr(case["expected"]),
+ actual=repr(result.output),
+ )
+ case["expected"] = result.output
+ fixes_applied += 1
# --- Fix raises cases ---
if case.get("raises"):
@@ -226,7 +234,7 @@ def validate_expected_values(
if fixes_applied > 0:
logfire.info("Validated spec: {count} fixes applied", count=fixes_applied)
- return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+ return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
return yaml_spec
@@ -307,6 +315,11 @@ def inject_missing_error_cases(
"raises": error_type,
}
elif len(args) == 1:
+ # Tuples cannot be represented in yaml.safe_load()-compatible YAML.
+ # Other non-list inputs (None, int, str, dict) already cover the
+ # same TypeError path, so skip rather than convert and break semantics.
+ if isinstance(args[0], tuple):
+ continue
input_repr = repr((args[0], None))
if input_repr in existing_raises_inputs:
continue
@@ -337,6 +350,6 @@ def inject_missing_error_cases(
if injected > 0:
logfire.info("Injected {count} missing error cases into spec", count=injected)
- return yaml.dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+ return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
return yaml_spec
diff --git a/src/vowel/tdd.py b/src/vowel/tdd.py
index 2f637f1..005fc27 100644
--- a/src/vowel/tdd.py
+++ b/src/vowel/tdd.py
@@ -36,7 +36,7 @@
from vowel.context import EVAL_SPEC_CONTEXT
from vowel.eval_types import EvalsSource
-from vowel.executor import Executor, get_executor
+from vowel.executor import Executor, resolve_executors
from vowel.monitoring import enable_monitoring
from vowel.runner import Function, RunEvals
from vowel.spec_validation import (
@@ -229,6 +229,7 @@ def __init__(
additional_context: str | list[str] | None = None,
load_env: bool = False,
executor: Executor | None = None,
+ fallback_executor: Executor | None = None,
**opts,
):
if load_env:
@@ -256,6 +257,7 @@ def __init__(
# Optional executor for expected-value validation
self._executor = executor
+ self._fallback_executor = fallback_executor
self._opts = opts
@@ -792,7 +794,10 @@ def generate_evals_from_signature(
code=real_code,
description=signature.description,
)
- executor = getattr(self, "_executor", None) or get_executor("auto")
+ executor = resolve_executors(
+ getattr(self, "_executor", None),
+ getattr(self, "_fallback_executor", None),
+ )
yaml_spec = validate_expected_values(
yaml_spec,
val_func,
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index c6c4f67..b092d8a 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -26,9 +26,9 @@
import importlib
import importlib.util
import inspect
-import logfire
import os
import sys
+import threading
import types
from collections.abc import Callable, Mapping, Sequence
from datetime import date, datetime, time, timedelta
@@ -38,6 +38,7 @@
from typing import Any, Literal, Optional, Union, get_args, get_origin
import click
+import logfire
import yaml
from pydantic import BaseModel, ConfigDict, Field
from pydantic_ai import format_as_xml
@@ -55,9 +56,7 @@
TypeAdapterEvaluator,
create_llm_judge,
)
-
-_SYS_PATH_MODIFIED = False
-
+from .executor import Executor
# =============================================================================
# Evals Bundle - Container for evals and fixtures
@@ -334,14 +333,24 @@ def check_compatibility(func: Callable) -> tuple[bool, list[str]]:
return False, issues
-def _ensure_cwd_in_path() -> None:
- """Ensure current working directory is in sys.path (run once)."""
- global _SYS_PATH_MODIFIED
- if not _SYS_PATH_MODIFIED:
- cwd = os.getcwd()
- if cwd not in sys.path:
- sys.path.insert(0, cwd)
- _SYS_PATH_MODIFIED = True
+@contextlib.contextmanager
+def _cwd_on_syspath() -> Any:
+ """Put the cwd first on ``sys.path`` for the block; on exit remove it only if added here."""
+ cwd = os.getcwd()
+ inserted = cwd not in sys.path
+ if inserted:
+ sys.path.insert(0, cwd)
+ try:
+ yield
+ finally:
+ if inserted:
+ with contextlib.suppress(ValueError):
+ sys.path.remove(cwd)
+
+
+def _is_yaml_source_string(source_str: str) -> bool:
+ """Heuristic for inline YAML vs. a file path: a newline, a ":" or a leading "{" means YAML."""
+ return "\n" in source_str or source_str.strip().startswith("{") or ":" in source_str
def _apply_serializer(
@@ -592,9 +601,9 @@ def __init__(
):
self.definitions = fixtures
self._fixture_funcs = fixture_funcs or {}
- self._instances: dict[str, Any] = {} # Cached fixture instances (all scopes)
- self._scope_counts: dict[str, int] = {} # Reference counts for scoped fixtures
+ self._instances: dict[str, Any] = {} # Cached fixture instances
self._generators: dict[str, Any] = {} # Active generator fixtures for cleanup
+ self._lock = threading.RLock()
def setup(self, fixture_name: str) -> Any:
"""
@@ -613,19 +622,19 @@ def setup(self, fixture_name: str) -> Any:
f"Available fixtures: {available if available else '(none defined)'}"
)
- defn = self.definitions[fixture_name]
+ with self._lock:
+ defn = self.definitions[fixture_name]
- # For module/session scope, return cached instance if exists
- if defn.scope in ("module", "session") and fixture_name in self._instances:
- self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1
- return self._instances[fixture_name]
+ # For module/session scope, return cached instance if exists.
+ if defn.scope in ("module", "session") and fixture_name in self._instances:
+ return self._instances[fixture_name]
- # Class-based fixture
- if defn.cls:
- return self._setup_class_fixture(fixture_name, defn)
+ # Class-based fixture
+ if defn.cls:
+ return self._setup_class_fixture(fixture_name, defn)
- # Function-based fixture
- return self._setup_function_fixture(fixture_name, defn)
+ # Function-based fixture
+ return self._setup_function_fixture(fixture_name, defn)
def _setup_class_fixture(self, fixture_name: str, defn: FixtureDefinition) -> Any:
"""Setup a class-based fixture by instantiating the class."""
@@ -644,9 +653,7 @@ def _setup_class_fixture(self, fixture_name: str, defn: FixtureDefinition) -> An
except Exception as e:
raise RuntimeError(f"Failed to instantiate {defn.cls}: {e}") from e
- # Cache instance
self._instances[fixture_name] = instance
- self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1
return instance
@@ -682,9 +689,7 @@ def _setup_function_fixture(self, fixture_name: str, defn: FixtureDefinition) ->
except Exception as e:
raise RuntimeError(f"Failed to setup fixture '{fixture_name}': {e}") from e
- # Cache instance (all scopes - function scope will be cleared on teardown)
self._instances[fixture_name] = instance
- self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1
return instance
@@ -699,22 +704,17 @@ def teardown(self, fixture_name: str, scope_trigger: str = "function") -> None:
if fixture_name not in self.definitions:
return
- defn = self.definitions[fixture_name]
+ with self._lock:
+ defn = self.definitions[fixture_name]
- # Only teardown if scope matches
- if defn.scope != scope_trigger:
- return
+ # Only teardown if scope matches
+ if defn.scope != scope_trigger:
+ return
- # Decrement reference count
- if fixture_name in self._scope_counts:
- self._scope_counts[fixture_name] -= 1
- # For module/session scope, only teardown when count reaches 0
- if defn.scope in ("module", "session") and self._scope_counts[fixture_name] > 0:
- return # Still in use
+ instance = self._instances.pop(fixture_name, None)
+ if instance is None:
+ return
- # Perform teardown
- instance = self._instances.pop(fixture_name, None)
- if instance is not None:
# Check if this is a generator fixture (pytest-style yield)
gen = self._generators.pop(fixture_name, None)
if gen is not None:
@@ -729,7 +729,7 @@ def teardown(self, fixture_name: str, scope_trigger: str = "function") -> None:
_, teardown_func = self._fixture_funcs[fixture_name]
elif defn.teardown:
# Check if teardown is a class method (e.g., 'Connection.close')
- if "." in defn.teardown and defn.cls:
+ if "." in defn.teardown and defn.cls and instance is not None:
parts = defn.teardown.split(".")
if len(parts) == 2:
class_name, method_name = parts
@@ -799,7 +799,6 @@ def import_function(module_path: str) -> Any:
ImportError: If the module cannot be imported
AttributeError: If the function is not found in the module
"""
- _ensure_cwd_in_path()
tried_combinations = []
if "." not in module_path:
@@ -813,49 +812,50 @@ def import_function(module_path: str) -> Any:
parts = module_path.split(".")
- for i in range(len(parts) - 1, 0, -1):
- module_name = ".".join(parts[:i])
- remaining_parts = parts[i:]
- tried_combinations.append(f"module='{module_name}', attr='{'.'.join(remaining_parts)}'")
+ with _cwd_on_syspath():
+ for i in range(len(parts) - 1, 0, -1):
+ module_name = ".".join(parts[:i])
+ remaining_parts = parts[i:]
+ tried_combinations.append(f"module='{module_name}', attr='{'.'.join(remaining_parts)}'")
- module = None
+ module = None
- try:
- module = importlib.import_module(module_name)
- except ImportError as e:
- logfire.debug(
- "Standard import failed for '{module_name}': {error}",
- module_name=module_name,
- error=str(e),
- )
- relative_path = module_name.replace(".", os.sep) + ".py"
- file_path = os.path.join(os.getcwd(), relative_path)
+ try:
+ module = importlib.import_module(module_name)
+ except ImportError as e:
+ logfire.debug(
+ "Standard import failed for '{module_name}': {error}",
+ module_name=module_name,
+ error=str(e),
+ )
+ relative_path = module_name.replace(".", os.sep) + ".py"
+ file_path = os.path.join(os.getcwd(), relative_path)
- if os.path.exists(file_path):
- try:
- spec = importlib.util.spec_from_file_location(module_name, file_path)
- if spec and spec.loader:
- module = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(module)
+ if os.path.exists(file_path):
+ try:
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec and spec.loader:
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+ logfire.debug(
+ "File-based import succeeded for '{file_path}'", file_path=file_path
+ )
+ except Exception as e:
logfire.debug(
- "File-based import succeeded for '{file_path}'", file_path=file_path
+ "File-based import failed for '{file_path}': {error}",
+ file_path=file_path,
+ error=str(e),
)
- except Exception as e:
- logfire.debug(
- "File-based import failed for '{file_path}': {error}",
- file_path=file_path,
- error=str(e),
- )
- if module:
- try:
- obj: Any = module
- for part in remaining_parts:
- obj = getattr(obj, part)
- return obj
- except AttributeError as e:
- logfire.debug("Attribute lookup failed: {error}", error=str(e))
- continue
+ if module:
+ try:
+ obj: Any = module
+ for part in remaining_parts:
+ obj = getattr(obj, part)
+ return obj
+ except AttributeError as e:
+ logfire.debug("Attribute lookup failed: {error}", error=str(e))
+ continue
try:
obj = getattr(builtins, parts[0])
@@ -885,8 +885,6 @@ def import_class(class_path: str) -> type:
ImportError: If the module cannot be imported
AttributeError: If the class is not found in the module
"""
- _ensure_cwd_in_path()
-
parts = class_path.split(".")
if len(parts) < 2 or any(not p for p in parts):
raise ImportError(f"Invalid class path '{class_path}'. Expected format 'module.ClassName'.")
@@ -895,7 +893,8 @@ def import_class(class_path: str) -> type:
class_name = parts[-1]
try:
- module = importlib.import_module(module_name)
+ with _cwd_on_syspath():
+ module = importlib.import_module(module_name)
except ImportError as e:
raise ImportError(f"Cannot import module '{module_name}': {e}") from e
@@ -943,7 +942,7 @@ def load_evals(source: str | Path | dict | EvalsFile) -> dict[str, Evals]:
# Check if it's an existing file path first, before YAML heuristics
if os.path.exists(source_str):
return load_evals_file(source_str)
- if "\n" in source_str or source_str.strip().startswith("{") or ":" in source_str:
+ if _is_yaml_source_string(source_str):
return load_evals_from_yaml_string(source_str)
else:
return load_evals_file(source_str)
@@ -1003,7 +1002,9 @@ def load_bundle(source: str | Path | dict | EvalsFile) -> EvalsBundle:
return load_bundle_from_dict(source)
elif isinstance(source, (str, Path)):
source_str = str(source)
- if "\n" in source_str or source_str.strip().startswith("{") or ":" in source_str:
+ if os.path.exists(source_str):
+ return load_bundle_file(source_str)
+ if _is_yaml_source_string(source_str):
return load_bundle_from_yaml_string(source_str)
else:
return load_bundle_file(source_str)
@@ -1983,7 +1984,7 @@ def xml(self) -> str:
def run_evals(
- source: str | Path | dict | EvalsFile,
+ source: str | Path | dict | EvalsFile | EvalsBundle,
*,
filter_funcs: list[str] | None = None,
functions: dict[str, Callable] | None = None,
@@ -1994,6 +1995,8 @@ def run_evals(
dict[str, Callable | tuple[Callable, Callable | None] | FixtureDefinition] | None
) = None,
ignore_duration: bool = False,
+ executor: Executor | None = None,
+ fallback_executor: Executor | None = None,
) -> EvalSummary:
"""
Run evaluations from various sources.
@@ -2007,12 +2010,18 @@ def run_evals(
serial_fn: Optional dict of serializer functions (receive full input dict)
fixtures: Optional dict of fixture functions {name: setup_fn} or {name: (setup_fn, teardown_fn)}
ignore_duration: If True, skip duration constraints
+ executor: Optional primary executor configuration for execution-aware subflows
+ fallback_executor: Optional fallback executor paired with ``executor``
Returns:
EvalSummary with aggregated results
"""
# Load both evals and fixtures from YAML
- bundle = load_bundle(source)
+ _ = (executor, fallback_executor)
+ if isinstance(source, EvalsBundle):
+ bundle = source
+ else:
+ bundle = load_bundle(source)
all_evals = bundle.evals
yaml_fixtures = bundle.fixtures
diff --git a/src/vowel/validation.py b/src/vowel/validation.py
index 73d942b..b989f9e 100644
--- a/src/vowel/validation.py
+++ b/src/vowel/validation.py
@@ -388,7 +388,7 @@ def validate_and_fix_spec(
modified = True
if modified:
- result.fixed_yaml = yaml.dump(
+ result.fixed_yaml = yaml.safe_dump(
data, default_flow_style=False, allow_unicode=True, sort_keys=False
)
logfire.info(
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..cfd2e2d
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,35 @@
+"""Tests for CLI behavior outside watch mode."""
+
+import json
+
+from click.testing import CliRunner
+
+from vowel.cli import main
+
+
+class TestCliExportJson:
+ """Test JSON export behavior."""
+
+ def test_export_json_writes_object_payload(self, tmp_path):
+ """--export-json should write a JSON object, not a quoted string."""
+ yaml_path = tmp_path / "evals.yml"
+ export_path = tmp_path / "results.json"
+ yaml_path.write_text(
+ """
+len:
+ dataset:
+ - case:
+ input: [1, 2, 3]
+ expected: 3
+"""
+ )
+
+ runner = CliRunner()
+ result = runner.invoke(main, [str(yaml_path), "--export-json", str(export_path), "--quiet"])
+
+ assert result.exit_code == 0
+
+ payload = json.loads(export_path.read_text())
+ assert isinstance(payload, dict)
+ assert "summary" in payload
+ assert "results" in payload
diff --git a/tests/test_evaluators.py b/tests/test_evaluators.py
index 272f053..931916c 100644
--- a/tests/test_evaluators.py
+++ b/tests/test_evaluators.py
@@ -144,6 +144,21 @@ def test_case_level_assertion(self):
assert summary.all_passed
+ def test_assertion_raw_fallback_preserves_compatibility(self):
+ """Assertions outside the restricted builtins set should still work via fallback."""
+ spec = {
+ "identity": {
+ "evals": {"Assertion": {"assertion": "pow(output, 2) == 16"}},
+ "dataset": [
+ {"case": {"input": 4}},
+ ],
+ }
+ }
+
+ summary = RunEvals.from_dict(spec).with_functions({"identity": lambda x: x}).run()
+
+ assert summary.all_passed
+
class TestTypeEvaluator:
"""Tests for type checking evaluator."""
diff --git a/tests/test_executor.py b/tests/test_executor.py
new file mode 100644
index 0000000..a23252c
--- /dev/null
+++ b/tests/test_executor.py
@@ -0,0 +1,457 @@
+"""Tests for vowel.executor — CodeMode execution backends.
+
+Covers MontyExecutor, DefaultExecutor, and get_executor factory across
+all injection modes: external_functions, inputs, both, and pure code.
+
+Tests:
+ 1. External functions only
+ 2. Inputs only
+ 3. Inputs + external functions combined
+ 4. Pure code (no injection)
+ 5. Stdout capture
+ 6. Error handling
+ 7. ExecutionResult structure
+ 8. Protocol conformance
+ 9. get_executor factory
+ 10. Parity — both executors produce the same output
+"""
+
+from __future__ import annotations
+
+import asyncio
+import importlib.util
+from typing import TYPE_CHECKING
+
+import pytest
+
+from vowel.executor import (
+ DefaultExecutor,
+ Executor,
+ get_executor,
+ resolve_executors,
+)
+
+if TYPE_CHECKING:
+ from vowel.executor import MontyExecutor
+
+# MontyExecutor requires pydantic-monty; skip gracefully if unavailable.
+_MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None
+
+if _MONTY_AVAILABLE:
+ from vowel.executor import MontyExecutor # noqa: F811
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+def _binary_search(arr: list[int], target: int) -> int:
+ """Reference binary search used across test classes."""
+ lo, hi = 0, len(arr) - 1
+ while lo <= hi:
+ mid = (lo + hi) // 2
+ if arr[mid] == target:
+ return mid
+ elif arr[mid] < target:
+ lo = mid + 1
+ else:
+ hi = mid - 1
+ return -1
+
+
+def _add(a, b):
+ return a + b
+
+
+def _build_executors() -> tuple[list[Executor], list[str]]:
+ instances: list[Executor] = [DefaultExecutor()]
+ ids = ["default"]
+ if _MONTY_AVAILABLE:
+ instances.insert(0, MontyExecutor())
+ ids.insert(0, "monty")
+ return instances, ids
+
+
+EXECUTOR_INSTANCES, EXECUTOR_IDS = _build_executors()
+
+
+@pytest.fixture(params=EXECUTOR_INSTANCES, ids=EXECUTOR_IDS)
+def executor(request) -> Executor:
+ """Parametrised fixture yielding each executor backend."""
+ return request.param
+
+
+# ---------------------------------------------------------------------------
+# 1. External functions only
+# ---------------------------------------------------------------------------
+
+
+class TestExternalFunctions:
+ """Snippet calls host-side callbacks via external_functions."""
+
+ def test_single_function(self, executor: Executor):
+ code = "_binary_search([1, 3, 5, 7, 9], 5)"
+ r = asyncio.run(
+ executor.execute(code, external_functions={"_binary_search": _binary_search})
+ )
+ assert r.success is True
+ assert r.output == 2
+
+ def test_multiple_calls(self, executor: Executor):
+ code = (
+ "results = []\n"
+ "results.append(search([1, 3, 5, 7, 9], 5))\n"
+ "results.append(search([1, 3, 5, 7, 9], 4))\n"
+ "results.append(search([1], 1))\n"
+ "results.append(search([], 1))\n"
+ "results\n"
+ )
+ r = asyncio.run(executor.execute(code, external_functions={"search": _binary_search}))
+ assert r.success is True
+ assert r.output == [2, -1, 0, -1]
+
+ def test_multiple_functions(self, executor: Executor):
+ code = (
+ "results = []\n"
+ "results.append(search([10, 20, 30], 20))\n"
+ "results.append(add(3, 4))\n"
+ "results\n"
+ )
+ r = asyncio.run(
+ executor.execute(
+ code,
+ external_functions={"search": _binary_search, "add": _add},
+ )
+ )
+ assert r.success is True
+ assert r.output == [1, 7]
+
+
+# ---------------------------------------------------------------------------
+# 2. Inputs only
+# ---------------------------------------------------------------------------
+
+
+class TestInputs:
+ """Snippet uses injected values via inputs."""
+
+ def test_arithmetic(self, executor: Executor):
+ r = asyncio.run(executor.execute("x * y + z", inputs={"x": 10, "y": 3, "z": 5}))
+ assert r.success is True
+ assert r.output == 35
+
+ def test_list_input(self, executor: Executor):
+ r = asyncio.run(executor.execute("sorted(data)", inputs={"data": [3, 1, 2]}))
+ assert r.success is True
+ assert r.output == [1, 2, 3]
+
+ def test_string_input(self, executor: Executor):
+ r = asyncio.run(executor.execute("name.upper()", inputs={"name": "hello"}))
+ assert r.success is True
+ assert r.output == "HELLO"
+
+ def test_dict_input(self, executor: Executor):
+ r = asyncio.run(executor.execute("len(d)", inputs={"d": {"a": 1, "b": 2}}))
+ assert r.success is True
+ assert r.output == 2
+
+
+# ---------------------------------------------------------------------------
+# 3. Inputs + external functions combined
+# ---------------------------------------------------------------------------
+
+
+class TestCombined:
+ """Snippet uses both inputs and external_functions."""
+
+ def test_search_with_data(self, executor: Executor):
+ r = asyncio.run(
+ executor.execute(
+ "search(data, query)",
+ inputs={"data": [2, 4, 6, 8, 10], "query": 6},
+ external_functions={"search": _binary_search},
+ )
+ )
+ assert r.success is True
+ assert r.output == 2
+
+ def test_function_with_multiple_inputs(self, executor: Executor):
+ code = (
+ "results = []\n"
+ "for item in items:\n"
+ " results.append(transform(item, factor))\n"
+ "results\n"
+ )
+ r = asyncio.run(
+ executor.execute(
+ code,
+ inputs={"items": [1, 2, 3], "factor": 10},
+ external_functions={"transform": lambda x, f: x * f},
+ )
+ )
+ assert r.success is True
+ assert r.output == [10, 20, 30]
+
+
+# ---------------------------------------------------------------------------
+# 4. Pure code (no injection)
+# ---------------------------------------------------------------------------
+
+
+class TestPureCode:
+ """Snippet needs no external injection."""
+
+ def test_comprehension(self, executor: Executor):
+ r = asyncio.run(executor.execute("[i**2 for i in range(5)]"))
+ assert r.success is True
+ assert r.output == [0, 1, 4, 9, 16]
+
+ def test_arithmetic_expression(self, executor: Executor):
+ r = asyncio.run(executor.execute("2 ** 10"))
+ assert r.success is True
+ assert r.output == 1024
+
+ def test_multiline_with_last_expr(self, executor: Executor):
+ code = "x = [1, 2, 3]\ny = [i * 2 for i in x]\nsum(y)\n"
+ r = asyncio.run(executor.execute(code))
+ assert r.success is True
+ assert r.output == 12
+
+ def test_no_trailing_expression(self, executor: Executor):
+ """When the last statement is not an expression output should be None."""
+ r = asyncio.run(executor.execute("x = 42"))
+ assert r.success is True
+ assert r.output is None
+
+
+# ---------------------------------------------------------------------------
+# 5. Stdout capture
+# ---------------------------------------------------------------------------
+
+
+class TestStdout:
+ """print() output is captured in ExecutionResult.stdout."""
+
+ def test_print_captured(self, executor: Executor):
+ r = asyncio.run(executor.execute('print("hello")'))
+ assert r.success is True
+ assert "hello" in r.stdout
+
+
+# ---------------------------------------------------------------------------
+# 6. Error handling
+# ---------------------------------------------------------------------------
+
+
+class TestErrors:
+ """Errors are returned as structured results, never raised."""
+
+ def test_runtime_error(self, executor: Executor):
+ r = asyncio.run(executor.execute("1 / 0"))
+ assert r.success is False
+ assert r.error_type == "ZeroDivisionError"
+ assert r.output is None
+
+ def test_type_error_in_external(self, executor: Executor):
+ r = asyncio.run(
+ executor.execute(
+ 'search("not_a_list", 5)',
+ external_functions={"search": _binary_search},
+ )
+ )
+ assert r.success is False
+ assert r.error_type == "TypeError"
+
+ def test_name_error(self, executor: Executor):
+ r = asyncio.run(executor.execute("undefined_var + 1"))
+ assert r.success is False
+ assert r.error_type == "NameError"
+
+ def test_syntax_error(self, executor: Executor):
+ r = asyncio.run(executor.execute("def foo(:"))
+ assert r.success is False
+ assert r.error_type == "SyntaxError"
+
+ def test_error_has_message(self, executor: Executor):
+ r = asyncio.run(executor.execute("1 / 0"))
+ assert r.error is not None
+ assert len(r.error) > 0
+
+
+# ---------------------------------------------------------------------------
+# 7. ExecutionResult structure
+# ---------------------------------------------------------------------------
+
+
+class TestExecutionResult:
+ """ExecutionResult fields are correctly populated."""
+
+ def test_duration_is_positive(self, executor: Executor):
+ r = asyncio.run(executor.execute("42"))
+ assert r.duration_ms > 0
+
+ def test_success_fields(self, executor: Executor):
+ r = asyncio.run(executor.execute("42"))
+ assert r.success is True
+ assert r.error is None
+ assert r.error_type is None
+
+ def test_failure_fields(self, executor: Executor):
+ r = asyncio.run(executor.execute("1/0"))
+ assert r.success is False
+ assert r.error is not None
+ assert r.error_type is not None
+ assert r.output is None
+
+
+# ---------------------------------------------------------------------------
+# 8. Protocol conformance
+# ---------------------------------------------------------------------------
+
+
+class TestProtocol:
+ """Both executors satisfy the Executor protocol."""
+
+ @pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed")
+ def test_monty_is_executor(self):
+ assert isinstance(MontyExecutor(), Executor)
+
+ def test_default_is_executor(self):
+ assert isinstance(DefaultExecutor(), Executor)
+
+
+# ---------------------------------------------------------------------------
+# 9. get_executor factory
+# ---------------------------------------------------------------------------
+
+
+class TestFactory:
+ """get_executor returns the correct backend."""
+
+ def test_auto(self):
+ ex = get_executor("auto")
+ assert isinstance(ex, Executor)
+
+ @pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed")
+ def test_monty(self):
+ ex = get_executor("monty")
+ assert isinstance(ex, MontyExecutor)
+
+ def test_default(self):
+ ex = get_executor("default")
+ assert isinstance(ex, DefaultExecutor)
+
+ def test_invalid_backend(self):
+ with pytest.raises(ValueError, match="Unknown executor backend"):
+ get_executor("invalid") # type: ignore
+
+
+class _StaticSession:
+ def __init__(self, value):
+ self.value = value
+
+ def feed(self, code):
+ from vowel.executor import ExecutionResult
+
+ return ExecutionResult(output=self.value, stdout="", success=True)
+
+ def close(self):
+ return None
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *_):
+ self.close()
+
+
+class _RaisingExecutor:
+ async def execute(self, code, **kwargs):
+ raise RuntimeError("boom")
+
+ def execute_sync(self, code, **kwargs):
+ raise RuntimeError("boom")
+
+ def create_session(self, setup_code, **kwargs):
+ raise RuntimeError("boom")
+
+
+class _StaticExecutor:
+ def __init__(self, value):
+ self.value = value
+
+ async def execute(self, code, **kwargs):
+ from vowel.executor import ExecutionResult
+
+ return ExecutionResult(output=self.value, stdout="", success=True)
+
+ def execute_sync(self, code, **kwargs):
+ from vowel.executor import ExecutionResult
+
+ return ExecutionResult(output=self.value, stdout="", success=True)
+
+ def create_session(self, setup_code, **kwargs):
+ return _StaticSession(self.value)
+
+
+class TestResolveExecutors:
+ def test_custom_executor_uses_default_fallback_on_session_failure(self):
+ ex = resolve_executors(_RaisingExecutor())
+
+ with ex.create_session("x = 1") as session:
+ result = session.feed("x + 1")
+
+ assert result.success is True
+ assert result.output == 2
+
+ def test_custom_fallback_executor_is_used(self):
+ ex = resolve_executors(_RaisingExecutor(), _StaticExecutor("fallback"))
+
+ with ex.create_session("ignored") as session:
+ result = session.feed("ignored")
+
+ assert result.success is True
+ assert result.output == "fallback"
+
+
+# ---------------------------------------------------------------------------
+# 10. Parity — both executors produce the same output
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed")
+class TestParity:
+ """MontyExecutor and DefaultExecutor must agree on output."""
+
+ CASES = [
+ ("pure_arithmetic", "2 + 3", {}, {}),
+ ("list_ops", "[1,2,3] + [4,5]", {}, {}),
+ ("string_method", '"hello world".split()', {}, {}),
+ ("with_inputs", "a + b", {"a": 10, "b": 20}, {}),
+ ("with_ext_func", "f(3, 4)", {}, {"f": _add}),
+ ("combined", "f(x, y)", {"x": 5, "y": 6}, {"f": _add}),
+ ]
+
+ @pytest.mark.parametrize(
+ "label,code,inputs,ext_fns",
+ CASES,
+ ids=[c[0] for c in CASES],
+ )
+ def test_output_matches(self, label, code, inputs, ext_fns):
+ monty = MontyExecutor()
+ default = DefaultExecutor()
+ kwargs: dict = {}
+ if inputs:
+ kwargs["inputs"] = inputs
+ if ext_fns:
+ kwargs["external_functions"] = ext_fns
+
+ r_monty = asyncio.run(monty.execute(code, **kwargs))
+ r_default = asyncio.run(default.execute(code, **kwargs))
+
+ assert r_monty.success is True, f"Monty failed: {r_monty.error}"
+ assert r_default.success is True, f"Default failed: {r_default.error}"
+ assert r_monty.output == r_default.output, (
+ f"Parity mismatch for '{label}': "
+ f"monty={r_monty.output!r} vs default={r_default.output!r}"
+ )
diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py
index 2fe4b2a..da7ed08 100644
--- a/tests/test_fixtures.py
+++ b/tests/test_fixtures.py
@@ -107,6 +107,7 @@ def test_missing_fixture_param(self):
_db_instances = []
_cache_instances = []
+_session_fixture_events: list[str] = []
def setup_db(host: str = "localhost", port: int = 5432):
@@ -134,11 +135,23 @@ def teardown_cache(instance):
_cache_instances.remove(instance)
+def setup_session_counter():
+ """Track session fixture setup calls."""
+ _session_fixture_events.append("setup")
+ return {"bonus": 10}
+
+
+def teardown_session_counter(instance):
+ """Track session fixture teardown calls."""
+ _session_fixture_events.append(f"teardown:{instance['bonus']}")
+
+
class TestFixtureManager:
def setup_method(self):
"""Clear instances before each test."""
_db_instances.clear()
_cache_instances.clear()
+ _session_fixture_events.clear()
def test_setup_function_scope(self):
"""Should setup function-scoped fixture."""
@@ -340,6 +353,18 @@ def test_eval_fixture_field(self):
assert evals.fixture == ["db"]
+ def test_load_bundle_prefers_existing_file_before_yaml_heuristic(self, monkeypatch):
+ """Existing file paths should not be misclassified as inline YAML."""
+ import vowel.utils as utils
+
+ monkeypatch.setattr(utils.os.path, "exists", lambda path: True)
+ monkeypatch.setattr(utils, "load_bundle_file", lambda path: ("file", path))
+ monkeypatch.setattr(utils, "load_bundle_from_yaml_string", lambda src: ("yaml", src))
+
+ result = utils.load_bundle(r"C:\tmp\spec.yml")
+
+ assert result == ("file", r"C:\tmp\spec.yml")
+
def function_with_db(a: int, b: int, *, db: dict) -> int:
"""Test function that uses a db fixture."""
@@ -352,6 +377,7 @@ class TestIntegration:
def setup_method(self):
_db_instances.clear()
_cache_instances.clear()
+ _session_fixture_events.clear()
def test_fixture_injection_valid_signature(self):
"""Should validate and use fixtures correctly."""
@@ -386,6 +412,7 @@ class TestProgrammaticFixtures:
def setup_method(self):
_db_instances.clear()
_cache_instances.clear()
+ _session_fixture_events.clear()
def test_with_fixtures_setup_only(self):
"""Should work with setup-only fixtures via with_fixtures."""
@@ -520,6 +547,84 @@ def test_fixture_missing_error(self):
assert not summary.all_passed
assert summary.error_count == 1
+ def test_session_scope_fixture_runs_setup_and_teardown_once_per_eval_run(self):
+ """Session-scoped fixtures should setup once and teardown once across all cases."""
+ yaml_content = """
+add_with_db:
+ fixture:
+ - db
+ dataset:
+ - case:
+ inputs: {a: 1, b: 2}
+ expected: 13
+ - case:
+ inputs: {a: 3, b: 4}
+ expected: 17
+"""
+
+ summary = (
+ RunEvals.from_source(yaml_content)
+ .with_functions({"add_with_db": add_with_db})
+ .with_fixtures(
+ {
+ "db": FixtureDefinition(
+ setup="test_fixtures.setup_session_counter",
+ teardown="test_fixtures.teardown_session_counter",
+ scope="session",
+ )
+ }
+ )
+ .run()
+ )
+
+ assert summary.all_passed
+ assert _session_fixture_events == ["setup", "teardown:10"]
+
+ def test_session_scope_fixture_is_shared_across_multiple_functions(self):
+ """Session-scoped fixtures should teardown once after the full run ends."""
+ yaml_content = """
+add_with_db:
+ fixture:
+ - db
+ dataset:
+ - case:
+ inputs: {a: 1, b: 2}
+ expected: 13
+subtract_with_db:
+ fixture:
+ - db
+ dataset:
+ - case:
+ inputs: {a: 10, b: 3}
+ expected: 17
+"""
+
+ def subtract_with_db(a: int, b: int, *, db: dict) -> int:
+ return a - b + db["bonus"]
+
+ summary = (
+ RunEvals.from_source(yaml_content)
+ .with_functions(
+ {
+ "add_with_db": add_with_db,
+ "subtract_with_db": subtract_with_db,
+ }
+ )
+ .with_fixtures(
+ {
+ "db": FixtureDefinition(
+ setup="test_fixtures.setup_session_counter",
+ teardown="test_fixtures.teardown_session_counter",
+ scope="session",
+ )
+ }
+ )
+ .run()
+ )
+
+ assert summary.all_passed
+ assert _session_fixture_events == ["setup", "teardown:10"]
+
def setup_db_with_args(host: str, port: int):
"""Setup that requires positional args."""
diff --git a/tests/test_import_function.py b/tests/test_import_function.py
index f0b945b..dabbc30 100644
--- a/tests/test_import_function.py
+++ b/tests/test_import_function.py
@@ -206,6 +206,25 @@ def helper(x):
sys.path = original_path
os.chdir(original_cwd)
+ def test_import_local_module_does_not_mutate_sys_path(self, tmp_path, monkeypatch):
+ """Local imports should not leave the working directory on sys.path."""
+ module_file = tmp_path / "my_module.py"
+ module_file.write_text(
+ """
+def my_function(x):
+ return x * 2
+"""
+ )
+
+ monkeypatch.chdir(tmp_path)
+ monkeypatch.setattr(sys, "path", [p for p in sys.path if p != str(tmp_path)])
+ before = sys.path.copy()
+
+ func = import_function("my_module.my_function")
+
+ assert func(5) == 10
+ assert sys.path == before
+
class TestImportErrors:
"""Tests for import error handling."""
diff --git a/tests/test_run_evals.py b/tests/test_run_evals.py
index 49fac6d..0659958 100644
--- a/tests/test_run_evals.py
+++ b/tests/test_run_evals.py
@@ -5,6 +5,7 @@
import pytest
from vowel import EvalSummary, RunEvals, run_evals
+from vowel.executor import DefaultExecutor
class TestRunEvalsFromFile:
@@ -146,6 +147,28 @@ def test_with_functions_chained(self, simple_yaml_spec: str):
assert summary.all_passed
+ def test_with_executor_preserves_existing_run_behavior(self, simple_yaml_spec: str):
+ """Executor preferences should be accepted without changing normal eval behavior."""
+ summary = (
+ RunEvals.from_source(simple_yaml_spec)
+ .with_functions({"add": lambda a, b: a + b})
+ .with_executor(DefaultExecutor(), fallback_executor=DefaultExecutor())
+ .run()
+ )
+
+ assert summary.all_passed
+
+ def test_run_evals_accepts_executor_preferences(self, simple_yaml_spec: str):
+ """Top-level run_evals should accept executor preferences."""
+ summary = run_evals(
+ simple_yaml_spec,
+ functions={"add": lambda a, b: a + b},
+ executor=DefaultExecutor(),
+ fallback_executor=DefaultExecutor(),
+ )
+
+ assert summary.all_passed
+
class TestRunEvalsFilter:
"""Tests for filter() method."""
diff --git a/tests/test_session.py b/tests/test_session.py
new file mode 100644
index 0000000..b4b9d10
--- /dev/null
+++ b/tests/test_session.py
@@ -0,0 +1,232 @@
+"""Tests for ExecutionSession API — DefaultSession and MontyReplSession.
+
+Covers:
+ - Basic feed() results (binary search)
+ - Error handling (ZeroDivisionError)
+ - Syntax error reporting
+ - State preservation across feed() calls
+ - Stdout capture through sessions
+ - Context-manager lifecycle
+ - Session isolation (fresh state per session)
+"""
+
+from __future__ import annotations
+
+import importlib.util
+from typing import TYPE_CHECKING
+
+import pytest
+
+from vowel.executor import (
+ DefaultExecutor,
+ DefaultSession,
+ ExecutionSession,
+)
+
+if TYPE_CHECKING:
+ from vowel.executor import FallbackSession, MontyExecutor
+
+# MontyExecutor requires pydantic-monty; skip gracefully if unavailable.
+_MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None
+
+if _MONTY_AVAILABLE:
+ from vowel.executor import FallbackSession, MontyExecutor # noqa: F811
+
+# ---------------------------------------------------------------------------
+# Shared test data
+# ---------------------------------------------------------------------------
+
+FUNC_CODE = """\
+def binary_search(arr, target):
+ lo, hi = 0, len(arr) - 1
+ while lo <= hi:
+ mid = (lo + hi) // 2
+ if arr[mid] == target:
+ return mid
+ elif arr[mid] < target:
+ lo = mid + 1
+ else:
+ hi = mid - 1
+ return -1
+"""
+
+SEARCH_CASES = [
+ ("binary_search([1, 3, 5, 7, 9], 5)", 2),
+ ("binary_search([], 1)", -1),
+ ("binary_search([1, 2, 3], 4)", -1),
+ ("binary_search([10, 20, 30], 10)", 0),
+]
+
+
+def _build_executor_params() -> tuple[list, list[str]]:
+ params = [(DefaultExecutor, DefaultSession)]
+ ids = ["default"]
+ if _MONTY_AVAILABLE:
+ params.insert(0, (MontyExecutor, FallbackSession)) # type: ignore
+ ids.insert(0, "monty")
+ return params, ids
+
+
+EXECUTOR_CLASSES, EXECUTOR_IDS = _build_executor_params()
+
+
+@pytest.fixture(params=EXECUTOR_CLASSES, ids=EXECUTOR_IDS)
+def executor_and_session(request):
+ """Yield (executor_instance, expected_session_class)."""
+ cls, session_cls = request.param
+ return cls(), session_cls
+
+
+# ---------------------------------------------------------------------------
+# Basic session correctness
+# ---------------------------------------------------------------------------
+
+
+class TestSessionBasic:
+ """feed() returns correct outputs for a simple function."""
+
+ def test_binary_search_cases(self, executor_and_session):
+ executor, _ = executor_and_session
+ with executor.create_session(FUNC_CODE) as session:
+ for snippet, expected in SEARCH_CASES:
+ r = session.feed(snippet)
+ assert r.success, f"Failed: {snippet} => {r.error}"
+ assert r.output == expected, f"{snippet}: got {r.output!r}, expected {expected!r}"
+
+ def test_session_type(self, executor_and_session):
+ """create_session() returns the correct session class."""
+ executor, session_cls = executor_and_session
+ with executor.create_session("x = 1") as session:
+ assert isinstance(session, session_cls)
+
+
+# ---------------------------------------------------------------------------
+# Error handling
+# ---------------------------------------------------------------------------
+
+
+class TestSessionErrors:
+ """Errors are returned structured, not raised."""
+
+ def test_zero_division(self, executor_and_session):
+ executor, _ = executor_and_session
+ with executor.create_session("def foo(x): return 1/x") as session:
+ r = session.feed("foo(0)")
+ assert not r.success
+ assert r.error_type == "ZeroDivisionError"
+ assert r.error is not None
+
+ def test_name_error(self, executor_and_session):
+ executor, _ = executor_and_session
+ with executor.create_session("x = 1") as session:
+ r = session.feed("undefined_var + 1")
+ assert not r.success
+ assert r.error_type == "NameError"
+
+ def test_syntax_error(self, executor_and_session):
+ executor, _ = executor_and_session
+ with executor.create_session("def foo(): return 42") as session:
+ r = session.feed("foo(")
+ assert not r.success
+ assert r.error_type == "SyntaxError"
+
+ def test_error_does_not_break_session(self, executor_and_session):
+ """A single error in feed() should not corrupt the session."""
+ executor, _ = executor_and_session
+ with executor.create_session("def foo(x): return 1/x") as session:
+ r_bad = session.feed("foo(0)")
+ assert not r_bad.success
+ # Session should still work after error:
+ r_ok = session.feed("foo(2)")
+ assert r_ok.success
+ assert r_ok.output == 0.5
+
+
+# ---------------------------------------------------------------------------
+# State preservation
+# ---------------------------------------------------------------------------
+
+
+class TestStatePreservation:
+ """State persists across feed() calls within a single session."""
+
+ def test_mutation_persists(self, executor_and_session):
+ executor, _ = executor_and_session
+ with executor.create_session("x = 10") as session:
+ r1 = session.feed("x + 5")
+ assert r1.output == 15
+
+ session.feed("x = x * 2")
+
+ r3 = session.feed("x")
+ assert r3.output == 20
+
+ def test_function_defined_in_session(self, executor_and_session):
+ """Functions defined in one feed() are available in subsequent feeds."""
+ executor, _ = executor_and_session
+ with executor.create_session("y = 100") as session:
+ session.feed("def double(n): return n * 2")
+ r = session.feed("double(y)")
+ assert r.success
+ assert r.output == 200
+
+ def test_list_accumulation(self, executor_and_session):
+ executor, _ = executor_and_session
+ with executor.create_session("items = []") as session:
+ session.feed("items.append(1)")
+ session.feed("items.append(2)")
+ session.feed("items.append(3)")
+ r = session.feed("items")
+ assert r.output == [1, 2, 3]
+
+
+# ---------------------------------------------------------------------------
+# Stdout capture
+# ---------------------------------------------------------------------------
+
+
+class TestSessionStdout:
+ """print() output is captured through the session."""
+
+ def test_stdout_captured(self, executor_and_session):
+ executor, _ = executor_and_session
+ with executor.create_session("def greet(name): print(f'Hello {name}')") as session:
+ r = session.feed("greet('World')")
+ assert "Hello World" in r.stdout
+
+
+# ---------------------------------------------------------------------------
+# Session isolation
+# ---------------------------------------------------------------------------
+
+
+class TestSessionIsolation:
+ """Each session starts with a clean namespace."""
+
+ def test_separate_sessions_isolated(self, executor_and_session):
+ executor, _ = executor_and_session
+
+ with executor.create_session("x = 42") as s1:
+ r1 = s1.feed("x")
+ assert r1.output == 42
+
+ # A new session should NOT see x from the previous one:
+ with executor.create_session("y = 99") as s2:
+ r2 = s2.feed("y")
+ assert r2.output == 99
+ r_x = s2.feed("x")
+ assert not r_x.success # x should not exist
+
+
+# ---------------------------------------------------------------------------
+# Protocol conformance
+# ---------------------------------------------------------------------------
+
+
+class TestSessionProtocol:
+ """Sessions satisfy the ExecutionSession protocol."""
+
+ def test_protocol(self, executor_and_session):
+ executor, _ = executor_and_session
+ with executor.create_session("x = 1") as session:
+ assert isinstance(session, ExecutionSession)
diff --git a/tests/test_tdd_eval_retries.py b/tests/test_tdd_eval_retries.py
index efa5fdf..b3c0fdc 100644
--- a/tests/test_tdd_eval_retries.py
+++ b/tests/test_tdd_eval_retries.py
@@ -4,6 +4,7 @@
from unittest.mock import MagicMock, PropertyMock, patch
from vowel.eval_types import EvalsSource
+from vowel.spec_validation import build_failure_context
from vowel.tdd import FunctionSignature, Param, TDDGenerator
@@ -35,7 +36,8 @@ def _make_signature() -> FunctionSignature:
dataset:
- case:
inputs: [1, 2]
- expected: 999
+ expected: 3
+ assertion: "output > 100"
- case:
inputs: [0, 0]
expected: 0
@@ -166,27 +168,21 @@ def test_partial_coverage_accepted(self, mock_agent_prop):
mock_agent.run_sync.assert_called_once()
-class TestBuildEvalFailureContext(unittest.TestCase):
- """Test the failure context builder."""
+class TestBuildFailureContext(unittest.TestCase):
+ """Test the shared failure context builder."""
def test_builds_context_from_failures(self):
- gen = TDDGenerator.__new__(TDDGenerator)
- gen.model = "test"
-
# Run actual evals with a bad spec to get real summary
from vowel.runner import RunEvals
summary = RunEvals.from_source(BAD_YAML).with_functions({"add": add}).run()
- context = gen._build_eval_failure_context(summary)
+ context = build_failure_context(summary)
assert "FAILED" in context
def test_unknown_failures_fallback(self):
- gen = TDDGenerator.__new__(TDDGenerator)
- gen.model = "test"
-
# Mock summary with no useful info
mock_summary = MagicMock()
mock_summary.results = []
- context = gen._build_eval_failure_context(mock_summary)
+ context = build_failure_context(mock_summary)
assert context == "Unknown failures"
From 60b602f308752f689f17b9800a600f1bfe259e93 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Tue, 17 Mar 2026 20:19:01 +0300
Subject: [PATCH 3/8] final_commit
---
db_fixture.yml | 2 +-
pyrightconfig.json | 1 +
src/vowel/__init__.py | 55 +--
src/vowel/ai.py | 36 +-
src/vowel/cli.py | 99 +++++-
src/vowel/codemode.py | 211 +++++------
src/vowel/context.py | 11 +-
src/vowel/eval_types.py | 65 ++--
src/vowel/evals.py | 18 +-
src/vowel/executor.py | 76 +---
src/vowel/mcp_server.py | 68 +---
src/vowel/runner.py | 35 +-
src/vowel/schema.py | 115 ++++++
src/vowel/spec_validation.py | 355 -------------------
src/vowel/tdd.py | 28 +-
src/vowel/utils.py | 103 ++----
src/vowel/validation.py | 279 ++++++++++++++-
tests/test_executor.py | 18 +-
tests/test_llm_integration.py | 6 +-
tests/test_llm_judge.py | 5 +-
tests/test_session.py | 12 +-
tests/test_tdd_eval_retries.py | 2 +-
tests/test_yaml_loading.py | 110 +++---
vowel-schema.json | 623 +++++++++++++++++++++++++++++----
24 files changed, 1277 insertions(+), 1056 deletions(-)
create mode 100644 src/vowel/schema.py
delete mode 100644 src/vowel/spec_validation.py
diff --git a/db_fixture.yml b/db_fixture.yml
index 32fa6a1..e8be58c 100644
--- a/db_fixture.yml
+++ b/db_fixture.yml
@@ -29,4 +29,4 @@ db.Connection.execute:
- case:
input: "SELECT * FROM developers" # (buggy query - invalid table)
raises: NoTableError
- match: "no such table" ## must match the exception message (case ignored)
\ No newline at end of file
+ match: "no such table" ## must match the exception message (case ignored)
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 19456c7..98a899b 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -4,6 +4,7 @@
"exclude": [
"vowel-optimization",
"tmp",
+ "benchmark*",
"**/.*"
],
"reportMissingModuleSource": "none",
diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
index f890ed4..b3d98d2 100644
--- a/src/vowel/__init__.py
+++ b/src/vowel/__init__.py
@@ -1,33 +1,7 @@
-"""
-vowel - A modular evaluation framework for testing functions with YAML-based specifications.
-
-This package provides a comprehensive evaluation framework for testing Python functions
-using YAML-based specifications. It supports various evaluation types including:
-
-- Type checking (isinstance validation)
-- Custom assertions (Python expressions)
-- Performance constraints (duration limits)
-- Input containment checks
-- Regex pattern matching
-- Exception validation
-- LLM-based semantic evaluation
-
-Quick Start:
- # Run evaluations from a YAML file
- from vowel import run_evals
- summary = run_evals("evals.yml")
-
- # Generate evals for a function using LLM
- from vowel import EvalGenerator, Function
- gen = EvalGenerator(model="openai:gpt-4o")
- func = Function(name="add", code="def add(a, b): return a + b", description="Add two numbers")
- summary = gen.generate_and_run(func, auto_retry=True)
-
-For more information, see the documentation at:
-https://github.com/fswair/vowel
-"""
+"""Public package exports for the vowel evaluation framework."""
import importlib.metadata
+from contextlib import suppress
__version__ = importlib.metadata.version("vowel")
@@ -48,27 +22,28 @@
resolve_executors,
)
from .runner import Function, RunEvals
+from .schema import ensure_cached_schema
from .utils import (
EvalResult,
EvalSummary,
check_compatibility,
get_unsupported_params,
is_yaml_serializable_type,
- load_evals,
- load_evals_file,
- load_evals_from_dict,
- load_evals_from_object,
- load_evals_from_yaml_string,
+ load_bundle,
+ load_bundle_file,
+ load_bundle_from_dict,
+ load_bundle_from_object,
+ load_bundle_from_yaml_string,
run_evals,
to_dataset,
)
__all__ = [
- "load_evals_file",
- "load_evals_from_yaml_string",
- "load_evals_from_dict",
- "load_evals_from_object",
- "load_evals",
+ "load_bundle_file",
+ "load_bundle_from_yaml_string",
+ "load_bundle_from_dict",
+ "load_bundle_from_object",
+ "load_bundle",
"to_dataset",
"run_evals",
"RunEvals",
@@ -101,3 +76,7 @@
"ExplorationPlan",
"SnippetResult",
]
+
+
+with suppress(Exception):
+ ensure_cached_schema(__version__)
diff --git a/src/vowel/ai.py b/src/vowel/ai.py
index 6abcc0e..50937ac 100644
--- a/src/vowel/ai.py
+++ b/src/vowel/ai.py
@@ -1,34 +1,4 @@
-"""LLM-powered evaluation specification generator and function healer.
-
-This module provides:
-- EvalGenerator: Generate eval specs and heal buggy functions using LLMs
-- generate_eval_spec: Generate YAML eval specs from function definitions
-- prepare_agent: Create a pydantic_ai Agent for eval generation
-
-Key Features:
-- Auto-generate YAML eval specs from function code and description
-- Heal buggy functions based on failing test inputs
-- Retry logic with configurable coverage thresholds
-- Support for async and sync function generation
-
-Example:
- from vowel import EvalGenerator, Function
-
- generator = EvalGenerator(model="openai:gpt-4o")
-
- func = Function(
- name="factorial",
- description="Calculate factorial of n",
- code="def factorial(n): return 1 if n <= 1 else n * factorial(n - 1)"
- )
-
- summary = generator.generate_and_run(
- func,
- auto_retry=True,
- heal_function=True,
- min_coverage=0.9
- )
-"""
+"""LLM-backed eval generation and function healing utilities."""
import os
import time
@@ -43,6 +13,7 @@
from vowel.eval_types import EvalsSource
from vowel.monitoring import enable_monitoring
from vowel.runner import Function, RunEvals
+from vowel.schema import materialize_yaml_with_schema_header
from vowel.utils import EvalSummary, check_compatibility, import_function
from vowel.validation import validate_and_fix_spec
@@ -563,8 +534,9 @@ def generate_eval_spec(
)
if save_to_file:
+ spec_to_write = materialize_yaml_with_schema_header(spec_to_use)
with open(f"{func.name}_evals.yml", "w") as f:
- f.write(spec_to_use)
+ f.write(spec_to_write)
runner = RunEvals.from_source(spec_to_use)
if func.func:
diff --git a/src/vowel/cli.py b/src/vowel/cli.py
index 868b574..a5bf430 100644
--- a/src/vowel/cli.py
+++ b/src/vowel/cli.py
@@ -1,18 +1,13 @@
-"""Command-line interface for the vowel evaluation framework.
-
-Usage:
- vowel <yaml_file> Run evaluations from a YAML spec
- vowel -d <dir> Run all YAML files in a directory
- vowel -v Detailed summary with spec semantics
- vowel --hide-report Hide pydantic_evals report output
-"""
+"""Command-line entry points for running and managing vowel eval specs."""
+import json
import sys
import time
from pathlib import Path
import click
import dotenv
+import yaml
from rich import box
from rich.console import Console
from rich.panel import Panel
@@ -27,6 +22,7 @@
LLMJudgeCase,
PatternMatchCase,
)
+from .schema import build_yaml_schema_from_bundle, materialize_yaml_with_schema_header
from .utils import EvalsBundle, EvalSummary, load_bundle, run_evals
dotenv.load_dotenv()
@@ -249,7 +245,9 @@ def validate_coverage(ctx, param, value):
@click.command()
-@click.argument("yaml_file", type=click.Path(exists=True, path_type=Path), required=False)
+@click.argument("arg1", type=click.Path(path_type=Path), required=False)
+@click.argument("arg2", type=click.Path(path_type=Path), required=False)
+@click.argument("arg3", type=click.Path(path_type=Path), required=False)
@click.option("--ci", is_flag=True, help="Enable CI mode")
@click.option(
"--coverage",
@@ -278,8 +276,16 @@ def validate_coverage(ctx, param, value):
@click.option("--watch", "-w", is_flag=True, help="Watch mode: re-run on file changes")
@click.option("--verbose", "-v", is_flag=True, help="Show detailed evaluation summary")
@click.option("--hide-report", is_flag=True, help="Hide pydantic_evals report output")
+@click.option(
+ "--create",
+ "schema_create",
+ is_flag=True,
+ help="With 'vowel schema': generate vowel-schema.json in current directory",
+)
def main(
- yaml_file: Path | None,
+ arg1: Path | None,
+ arg2: Path | None,
+ arg3: Path | None,
debug: bool,
coverage: float,
filter_func: str | None,
@@ -295,10 +301,75 @@ def main(
watch: bool,
verbose: bool,
hide_report: bool,
+ schema_create: bool,
):
"""vowel — YAML-based evaluation framework for Python functions."""
console = Console(force_terminal=False, no_color=True) if no_color else Console()
+ # Command mode: vowel schema
+ if arg1 is not None and str(arg1) == "schema":
+ # Command mode: vowel schema --create [output_path]
+ if schema_create:
+ output_path = arg2 if arg2 is not None else Path.cwd() / "vowel-schema.json"
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ schema = build_yaml_schema_from_bundle()
+ output_path.write_text(
+ json.dumps(schema, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
+ )
+ console.print(f"[green]✓[/green] Generated schema: [cyan]{output_path}[/cyan]")
+ return
+
+ if arg2 is None:
+ click.secho("ERROR: vowel schema requires <yaml_file> or --create", fg="red", err=True)
+ raise click.Abort()
+
+ target_path = arg2
+ if not target_path.exists():
+ click.secho(f"ERROR: File not found: {target_path}", fg="red", err=True)
+ raise SystemExit(1)
+
+ if target_path.suffix.lower() == ".json":
+ click.secho(
+ "ERROR: JSON files are not supported by 'vowel schema <yaml_file>'. "
+ "Use a YAML file (.yml/.yaml).",
+ fg="red",
+ err=True,
+ )
+ raise SystemExit(1)
+
+ existing = target_path.read_text(encoding="utf-8")
+
+ # Do not inject schema header into invalid YAML files.
+ try:
+ yaml.safe_load(existing)
+ except Exception as e:
+ click.secho(
+ f"ERROR: File is not valid YAML, schema header not added: {e}",
+ fg="red",
+ err=True,
+ )
+ raise SystemExit(1) from None
+
+ # Do not inject schema header if content is not a valid vowel spec.
+ try:
+ load_bundle(existing)
+ except Exception as e:
+ click.secho(
+ f"ERROR: Pydantic validation failed, schema header not added: {e}",
+ fg="red",
+ err=True,
+ )
+ raise SystemExit(1) from None
+
+ updated = materialize_yaml_with_schema_header(existing)
+ target_path.write_text(updated, encoding="utf-8")
+ console.print(f"[green]✓[/green] Updated schema header: [cyan]{target_path}[/cyan]")
+
+ console.print("[green]✓[/green] Pydantic validation passed")
+ return
+
+ yaml_file = arg1
+
# Validate incompatible options
if directory and filter_func:
click.secho("ERROR: --filter cannot be used with --dir", fg="red", err=True)
@@ -486,6 +557,9 @@ def main(
if not yaml_file:
click.secho("ERROR: --watch requires a YAML file", fg="red", err=True)
raise click.Abort()
+ if not yaml_file.exists():
+ click.secho(f"ERROR: YAML file not found: {yaml_file}", fg="red", err=True)
+ raise click.Abort()
try:
from watchdog.events import FileSystemEventHandler
@@ -565,6 +639,9 @@ def on_modified(self, event):
if not quiet:
console.print(f"Found [cyan]{len(yaml_files)}[/cyan] YAML file(s)")
elif yaml_file:
+ if not yaml_file.exists():
+ click.secho(f"ERROR: YAML file not found: {yaml_file}", fg="red", err=True)
+ raise click.Abort()
yaml_files = [yaml_file]
else:
click.secho("ERROR: Either YAML_FILE or --dir is required", fg="red", err=True)
@@ -738,8 +815,6 @@ def on_modified(self, event):
# Export JSON
if export_json:
- import json
-
json_data = summary.to_json()
with open(export_json, "w") as f:
json.dump(json_data, f, indent=2)
diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py
index 6ef8256..e02f421 100644
--- a/src/vowel/codemode.py
+++ b/src/vowel/codemode.py
@@ -1,21 +1,11 @@
-"""CodeMode eval generation pipeline.
+"""CodeMode pipeline for execution-aware eval spec generation.
-This module provides ``CodeModeGenerator`` — a two-phase pipeline that uses
-a sandboxed code executor to produce ground-truth expected values before
-generating YAML eval specs.
+CodeMode uses real execution feedback to generate robust vowel eval specs:
+1. Explore behavior by running LLM-generated snippets against the target code.
+2. Generate and refine a spec from verified outputs/errors.
-Phase 1 — **Exploration**
- The LLM writes small Python snippets that call ``target_func`` with various
- inputs. Each snippet is executed via ``Executor`` (Monty sandbox by default)
- and the real outputs are collected. This replaces guesswork with empirical
- observation.
-
-Phase 2 — **Spec Generation**
- The exploration results (inputs → outputs, edge cases, exceptions) are fed
- back to the LLM together with the eval spec context. The LLM produces the
- final YAML spec with verified expected values.
-
-All steps are instrumented with ``logfire`` for full observability.
+The pipeline supports both YAML output and structured bundle output, and keeps
+traceability via logfire spans.
"""
from __future__ import annotations
@@ -34,15 +24,16 @@
from vowel.executor import ExecutionResult, Executor, resolve_executors
from vowel.monitoring import enable_monitoring
from vowel.runner import Function, RunEvals
-from vowel.spec_validation import (
+from vowel.schema import materialize_yaml_with_schema_header
+from vowel.utils import EvalsBundle, EvalSummary
+from vowel.validation import (
build_call_code,
build_failure_context,
inject_durations,
inject_missing_error_cases,
+ validate_and_fix_spec,
validate_expected_values,
)
-from vowel.utils import EvalSummary
-from vowel.validation import validate_and_fix_spec
enable_monitoring(service_name="vowel-codemode")
@@ -177,17 +168,10 @@ class CodeModeResult(BaseModel):
class CodeModeGenerator:
- """Two-phase eval generator: explore with executor, then generate spec.
-
- Parameters
- ----------
- model:
- LLM model identifier (e.g. ``"openai:gpt-4o"``).
- executor:
- Code execution backend. Defaults to ``get_executor("auto")``
- which prefers MontyExecutor when available.
- additional_context:
- Extra instructions appended to the system prompt.
+ """Execution-guided eval generator.
+
+ The generator first discovers behavior by running snippets, then produces
+ a validated eval spec (YAML or bundle) from those verified results.
"""
def __init__(
@@ -198,6 +182,7 @@ def __init__(
fallback_executor: Executor | None = None,
additional_context: str = "",
min_snippets: int = 15,
+ use_model_spec: bool = False,
**opts,
) -> None:
# Default fallback from kwargs (for backwards compatibility) or environment
@@ -217,11 +202,12 @@ def __init__(
self.executor = resolve_executors(default_executor, fallback_executor)
self.additional_context = additional_context
self.min_snippets = min_snippets
+ self.use_model_spec = use_model_spec
self._opts = opts
# Lazy agents
self._explorer_agent: Agent[None, ExplorationPlan] | None = None
- self._spec_agent: Agent[None, EvalsSource] | None = None
+ self._spec_agent: Agent[None, EvalsSource | EvalsBundle] | None = None
logfire.info(
"CodeModeGenerator initialized",
@@ -244,11 +230,12 @@ def explorer_agent(self) -> Agent[None, ExplorationPlan]:
return self._explorer_agent
@property
- def spec_agent(self) -> Agent[None, EvalsSource]:
+ def spec_agent(self) -> Agent[None, EvalsSource | EvalsBundle]:
if self._spec_agent is None:
+ output_type = EvalsBundle if self.use_model_spec else EvalsSource
self._spec_agent = Agent(
self.spec_model,
- output_type=EvalsSource,
+ output_type=output_type,
system_prompt=self._spec_system_prompt(),
**self._opts,
)
@@ -323,21 +310,10 @@ async def explore(
*,
exploration_rounds: int = 2,
) -> list[SnippetResult]:
- """Phase 1: Generate and execute exploration snippets.
-
- Supports multi-round feedback-guided exploration. Round 1 uses
- static reasoning (speculation-based). Round 2+ receives a
- programmatic cluster summary of prior results so the LLM can
- target unexplored behaviour classes (evidence-based).
+ """Generate and execute exploration snippets.
- Parameters
- ----------
- exploration_rounds:
- Number of exploration rounds (default 2). Set to 1 to
- restore single-shot behaviour.
-
- Returns a list of ``SnippetResult`` with real outputs from the
- executor.
+ Round 1 discovers baseline behavior. Subsequent rounds receive prior
+ execution evidence and target uncovered behavior classes.
"""
with logfire.span(
"codemode.explore",
@@ -404,7 +380,7 @@ def _execute_plan(
plan: ExplorationPlan,
round_num: int = 1,
) -> list[SnippetResult]:
- """Execute all snippets in a plan and return results."""
+ """Execute all snippets in an exploration plan and collect results."""
all_snippets = [
*((s, "normal") for s in plan.snippets),
*((s, "error") for s in plan.error_snippets),
@@ -448,11 +424,7 @@ def _execute_plan(
@staticmethod
def _build_cluster_summary(results: list[SnippetResult]) -> str:
- """Build a deterministic cluster summary from exploration results.
-
- Groups results by output type / error type and formats a concise
- summary for the Round 2 exploration prompt.
- """
+ """Summarize observed output/error clusters for targeted exploration."""
# -- Success clusters --
success_types: dict[str, int] = {}
for r in results:
@@ -501,7 +473,7 @@ def _count_new_behaviors(
prior: list[SnippetResult],
new: list[SnippetResult],
) -> int:
- """Count how many new behaviour classes the new results introduced."""
+ """Count new behavior signatures introduced by a round."""
def _behavior_key(r: SnippetResult) -> str:
if r.success:
@@ -513,7 +485,7 @@ def _behavior_key(r: SnippetResult) -> str:
return len(new_keys - prior_keys)
async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
- """Ask the LLM for exploration snippets (Round 1 — static reasoning)."""
+ """Request initial exploration snippets from the model."""
with logfire.span("codemode.llm_explore", func_name=func.name, round=1):
prompt = f"""Explore the following function by writing test snippets:
@@ -545,7 +517,7 @@ async def _get_targeted_exploration_plan(
prior_results: list[SnippetResult],
cluster_summary: str,
) -> ExplorationPlan:
- """Ask the LLM for targeted snippets (Round 2 — evidence-based)."""
+ """Request targeted snippets using prior execution evidence."""
with logfire.span("codemode.llm_explore", func_name=func.name, round=2):
prompt = f"""You previously explored `{func.name}` and the snippets were
executed. Below are the ACTUAL results and a cluster summary.
@@ -597,14 +569,11 @@ async def generate_spec(
func: Function,
exploration_results: list[SnippetResult],
failure_context: str | None = None,
- ) -> str:
- """Phase 2: Generate YAML spec using verified exploration results.
+ ) -> str | EvalsBundle:
+ """Generate a spec from verified exploration results.
- Parameters
- ----------
- failure_context:
- When provided (on refinement rounds), appended to the prompt so
- the LLM can fix specific failures from the previous attempt.
+ Returns YAML text in default mode, or ``EvalsBundle`` when
+ ``use_model_spec=True``.
"""
with logfire.span(
"codemode.generate_spec",
@@ -685,6 +654,18 @@ async def generate_spec(
)
result = await self.spec_agent.run(prompt)
+
+ if self.use_model_spec:
+ bundle = result.output
+ assert isinstance(bundle, EvalsBundle)
+ logfire.info(
+ "Model spec bundle generated",
+ func_name=func.name,
+ eval_count=len(bundle.evals),
+ fixture_count=len(bundle.fixtures),
+ )
+ return bundle
+
yaml_spec = result.output.yaml_spec
# Sanitize: strip ALL !! annotations — safe_load only accepts
@@ -739,7 +720,7 @@ async def generate_spec(
@staticmethod
def _build_failure_context(summary: EvalSummary) -> str:
- """Build a concise failure report to inject into the retry prompt."""
+ """Build retry context from failed assertions/errors."""
return build_failure_context(summary)
def _inject_durations(
@@ -750,7 +731,7 @@ def _inject_durations(
buffer_pct: float = 0.5,
floor_ms: float = 10.0,
) -> str:
- """Add per-case ``duration`` fields based on actual execution times."""
+ """Inject measured duration thresholds into cases."""
return inject_durations(
yaml_spec,
func,
@@ -761,7 +742,7 @@ def _inject_durations(
@staticmethod
def _build_call_code(func_name: str, case: dict) -> str | None:
- """Build a ``func(args...)`` call string from a case dict."""
+ """Build a callable expression from a dataset case."""
return build_call_code(func_name, case)
# -- Full pipeline -----------------------------------------------------
@@ -776,46 +757,11 @@ async def generate(
min_coverage: float = 1.0,
inject_durations: bool = True,
) -> CodeModeResult:
- """Run the full CodeMode pipeline with post-generation validation.
-
- Pipeline::
-
- Phase 1: explore() (2 rounds by default)
- Round 1 — static reasoning (speculation-based)
- Round 2 — targeted exploration (evidence-based)
- Phase 2: generate_spec() (may loop)
- Phase 3: validate via RunEvals (per attempt)
- Phase 4: refine on failure (up to N rounds)
- Phase 5: inject_durations() (once, at end)
-
- Exploration (Phase 1) runs in two rounds. Round 1 uses static
- reasoning; Round 2 receives a cluster summary of Round 1 results
- and targets uncovered behaviour classes. Only spec generation
- (Phase 2) is re-run on validation failure.
-
- Parameters
- ----------
- func:
- The function to generate evals for.
- run_evals:
- Whether to run the generated evals and include the summary.
- save_to_file:
- Whether to save the YAML spec to ``{func.name}_evals.yml``.
- max_refinement_rounds:
- Maximum number of spec-regeneration attempts after the initial
- generation (0 = single attempt, no refinement).
- min_coverage:
- Target pass-rate in 0.0–1.0 (default 1.0 = 100 %). The loop
- exits early when coverage meets or exceeds this threshold.
- inject_durations:
- Whether to measure and inject per-case ``duration`` fields
- into the final YAML spec.
-
- Returns
- -------
- CodeModeResult
- Contains exploration results, YAML spec, summary, and
- the number of refinement rounds used.
+ """Run full CodeMode generation, validation, and optional refinement.
+
+ Flow: explore -> generate spec -> validate -> refine (optional) ->
+ inject durations (optional). Returns exploration artifacts, final spec,
+ and evaluation summary when ``run_evals`` is enabled.
"""
with logfire.span(
"codemode.pipeline",
@@ -831,6 +777,7 @@ async def generate(
# Phase 2–4 — generate spec + validate + refine
yaml_spec = ""
+ generated_bundle: EvalsBundle | None = None
summary: EvalSummary | None = None
refinement_rounds = 0
failure_context: str | None = None
@@ -843,18 +790,25 @@ async def generate(
is_refinement=attempt > 0,
):
try:
- yaml_spec = await self.generate_spec(
+ bundle = await self.generate_spec(
func,
exploration_results,
failure_context,
)
- except Exception as gen_exc:
+
+ if isinstance(bundle, EvalsBundle):
+ generated_bundle = bundle
+ yaml_spec = bundle.to_yaml()
+ else:
+ generated_bundle = None
+ yaml_spec = bundle
+ except Exception as exc:
logfire.warn(
"Spec generation failed on attempt {attempt}, retrying",
attempt=attempt + 1,
- error=str(gen_exc),
+ error=str(exc),
)
- failure_context = f"Generation error: {gen_exc}"
+ failure_context = f"Generation error: {exc}"
refinement_rounds = attempt + 1
continue
@@ -863,11 +817,18 @@ async def generate(
# Validate: run evals with ignore_duration=True
try:
- runner = (
- RunEvals.from_source(yaml_spec)
- .with_functions({func.name: func.impl})
- .ignore_duration()
- )
+ if generated_bundle is not None:
+ runner = (
+ RunEvals.from_bundle(generated_bundle)
+ .with_functions({func.name: func.impl})
+ .ignore_duration()
+ )
+ else:
+ runner = (
+ RunEvals.from_source(yaml_spec)
+ .with_functions({func.name: func.impl})
+ .ignore_duration()
+ )
summary = runner.run()
logfire.info(
@@ -912,19 +873,27 @@ async def generate(
# Final summary run (with durations now present, but still ignored)
if run_evals and summary is not None:
try:
- final_runner = (
- RunEvals.from_source(yaml_spec)
- .with_functions({func.name: func.impl})
- .ignore_duration()
- )
+ if generated_bundle is not None:
+ final_runner = (
+ RunEvals.from_bundle(generated_bundle)
+ .with_functions({func.name: func.impl})
+ .ignore_duration()
+ )
+ else:
+ final_runner = (
+ RunEvals.from_source(yaml_spec)
+ .with_functions({func.name: func.impl})
+ .ignore_duration()
+ )
summary = final_runner.run()
except Exception: # noqa: BLE001
pass # keep last good summary
if save_to_file:
path = f"{func.name}_evals.yml"
+ spec_to_write = materialize_yaml_with_schema_header(yaml_spec)
with open(path, "w") as f:
- f.write(yaml_spec)
+ f.write(spec_to_write)
logfire.info("Saved spec to {path}", path=path)
elapsed = (time.perf_counter() - t0) * 1000
diff --git a/src/vowel/context.py b/src/vowel/context.py
index 4e584b0..36fa283 100644
--- a/src/vowel/context.py
+++ b/src/vowel/context.py
@@ -1,13 +1,4 @@
-"""
-Context definitions for vowel eval specification generation.
-
-This module contains the EVAL_SPEC_CONTEXT which provides comprehensive
-documentation about vowel's YAML-based evaluation specification format.
-This context is used by EvalGenerator to guide LLM-based eval generation.
-
-Set VOWEL_CONTEXT_VERSION=legacy to use the pre-optimization prompt.
-Default is "v3" (GEPA-optimized with Sonnet proposer).
-"""
+"""Prompt context strings used for LLM-driven eval specification generation."""
import os
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
index e9cc14d..17f8495 100644
--- a/src/vowel/eval_types.py
+++ b/src/vowel/eval_types.py
@@ -1,24 +1,4 @@
-"""Pydantic models for vowel evaluation specifications.
-
-This module defines the data models used to parse and validate
-YAML evaluation specifications. These models ensure type safety
-and provide clear schemas for evaluation definitions.
-
-Main evaluation types:
- IsInstanceCase: Type checking validation
- AssertionCase: Custom Python assertion evaluation
- DurationCase: Performance/timing validation
- ContainsInputCase: Input containment check
- PatternMatchCase: Regex pattern matching
- RaisesCase: Exception validation
- LLMJudgeCase: LLM-based semantic evaluation
-
-Container models:
- MatchCase: Individual test case with input/expected output
- DatasetCase: Wrapper for test cases in dataset
- Evals: Complete evaluation specification for a function
- EvalsFile: Root model for YAML file parsing
-"""
+"""Pydantic models for parsing and validating vowel YAML specifications."""
import os
import typing
@@ -181,15 +161,33 @@ class EvalsSource(BaseModel):
# Fixture Models
# =============================================================================
-FixtureScope = Literal["function", "module", "session"]
-"""Scope for fixture lifecycle.
+FixtureScope = Literal["case", "eval", "file", "function", "module", "session"]
+"""Supported fixture scope names.
+
+Canonical user-facing names:
+- case: per dataset case
+- eval: per function eval block
+- file: per YAML file / run invocation
+
+Compatibility aliases:
+- function -> case
+- module -> eval
+- session -> file
-- function: Setup/teardown for each test case (default)
-- module: Setup once per eval file, teardown after all cases
-- session: Setup once per run_evals call, teardown at end
+Note:
+Runtime lifecycle currently uses legacy internal values
+(`function`/`module`/`session`). New names are normalized to these
+internal values for behavior-preserving migration.
"""
+_FIXTURE_SCOPE_ALIASES: dict[str, str] = {
+ "case": "function",
+ "eval": "module",
+ "file": "session",
+}
+
+
class FixtureDefinition(BaseModel):
"""Definition of a single fixture with setup/teardown lifecycle."""
@@ -218,9 +216,22 @@ class FixtureDefinition(BaseModel):
)
scope: FixtureScope = Field(
default="function",
- description="Lifecycle scope: 'function' (per case), 'module' (per eval), or 'session' (per run)",
+ description=(
+ "Fixture lifecycle scope. Preferred names: 'case', 'eval', 'file'. "
+ "Compatibility aliases are accepted: 'function', 'module', 'session'. "
+ "Current runtime normalization maps case->function, eval->module, file->session."
+ ),
)
+ @field_validator("scope", mode="before")
+ @classmethod
+ def normalize_scope_aliases(cls, value: Any) -> Any:
+ """Normalize new scope names to legacy internal values."""
+ if value is None or not isinstance(value, str):
+ return value
+ normalized = value.strip().lower()
+ return _FIXTURE_SCOPE_ALIASES.get(normalized, normalized)
+
@model_validator(mode="after")
def validate_setup_or_cls(self):
if not self.setup and not self.cls:
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index 5e78ecd..722dc55 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -1,20 +1,4 @@
-"""Evaluator implementations for the vowel framework.
-
-This module contains the concrete evaluator classes that implement
-the evaluation logic defined in eval_types.py. Each evaluator
-integrates with pydantic-evals to provide result reporting.
-
-Evaluators:
- AssertionEvaluator: Runs Python assertion expressions
- TypeAdapterEvaluator: Validates output types using Pydantic
- ContainsInputEvaluator: Checks if output contains input value
- PatternMatchingEvaluator: Validates output against regex patterns
- RaisesEvaluator: Validates expected exception raising
-
-Factory functions:
- create_llm_judge: Creates an LLM-based judge evaluator
- prepare_env_and_condition: Prepares evaluation context
-"""
+"""Concrete evaluator implementations used by the vowel runtime."""
import importlib.util
import os
diff --git a/src/vowel/executor.py b/src/vowel/executor.py
index fe14dbd..1046744 100644
--- a/src/vowel/executor.py
+++ b/src/vowel/executor.py
@@ -1,71 +1,4 @@
-"""Code execution backends for CodeMode eval generation.
-
-CodeMode allows the eval generation agent to *run* code inside a sandbox
-rather than guessing expected values. This produces ground-truth outputs
-and lets the agent empirically explore function behaviour (edge cases,
-exception boundaries, return types) before writing test cases.
-
-Architecture
-------------
-``Executor`` is a runtime Protocol — any object that implements ``execute()``
-qualifies. Two concrete implementations are provided:
-
-* ``MontyExecutor`` — uses ``pydantic-monty`` (Rust-based sandbox, <0.1 ms
- startup, no filesystem/network access). **Recommended
- for production and the optimization loop.**
-* ``DefaultExecutor`` — uses Python's built-in ``exec()`` with stdout capture.
- No sandboxing. Safe only for trusted, local code;
- useful during development when Monty is not installed.
-
-The ``execute()`` method accepts two orthogonal injection mechanisms that
-mirror Monty's native API:
-
-* ``inputs`` — ``dict[str, Any]`` of *values* injected as
- top-level variables visible to the snippet.
-* ``external_functions`` — ``dict[str, Callable]`` of *host-side callbacks*
- the snippet can call by name. In the Monty
- backend each call exits the sandbox, runs on
- the host, and returns the result.
-
-Session API
------------
-For batch exploration (e.g. CodeMode), use ``create_session()`` to compile
-the function source **once**, then ``feed()`` each snippet against the
-preserved runtime state.
-
-* ``MontyReplSession`` — backed by ``MontyRepl``: zero re-parse overhead
- per snippet, heap/globals preserved across feeds.
-* ``DefaultSession`` — backed by a persistent ``exec()`` namespace.
-
-Usage examples
---------------
-**External functions** — inject one or more real functions::
-
- await executor.execute(
- '''
- results = []
- results.append(target_func([1, 3, 5, 7, 9], 5))
- results.append(target_func([], 1))
- results
- ''',
- external_functions={"target_func": binary_search},
- )
-
-**Inputs** — inject plain values::
-
- await executor.execute(
- "x + y",
- inputs={"x": 10, "y": 20},
- )
-
-**Session** — compile once, feed many snippets::
-
- async with executor.create_session(func_code) as session:
- r1 = session.feed("binary_search([1,3,5], 3)")
- r2 = session.feed("binary_search([], 1)")
-
-The value of the last expression becomes ``ExecutionResult.output``.
-"""
+"""Execution backends used by CodeMode for sandboxed and local code runs."""
from __future__ import annotations
@@ -90,12 +23,7 @@
def run_sync(coro: Any) -> Any:
- """Run a coroutine synchronously, even inside a running event loop.
-
- Tries ``asyncio.run()`` first (clean, no patching). If there is
- already a running loop (e.g. Jupyter, async framework), falls back
- to ``nest_asyncio`` + ``loop.run_until_complete()``.
- """
+ """Run a coroutine from sync code, including running-loop environments."""
try:
return asyncio.run(coro)
except RuntimeError as exc:
diff --git a/src/vowel/mcp_server.py b/src/vowel/mcp_server.py
index 729a44e..fc84660 100644
--- a/src/vowel/mcp_server.py
+++ b/src/vowel/mcp_server.py
@@ -1,63 +1,4 @@
-"""Vowel MCP Server - Model Context Protocol server for eval generation.
-
-This module exposes vowel's full evaluation, generation, and TDD capabilities via
-MCP (Model Context Protocol), enabling AI assistants to run evaluations, generate
-functions, create test specs, and perform TDD workflows.
-
-Configuration is set via the ``env`` field in your MCP client JSON config.
-The env field should contain API keys and model names only. All other parameters
-(auto_retry, min_coverage, etc.) are tool parameters with sensible defaults.
-
-Usage:
- # Add to MCP client config (e.g., Claude Desktop, VS Code Copilot)
- {
- "mcpServers": {
- "vowel": {
- "command": "python",
- "args": ["-m", "vowel.mcp_server"],
- "env": {
- "MODEL_NAME": "openai:gpt-4o",
- "OPENAI_API_KEY": "sk-..."
- }
- }
- }
- }
-
- # Or run directly (reads env vars from shell)
- python -m vowel.mcp_server
-
-Supported env vars:
- MODEL_NAME — Default LLM model (e.g. "openai:gpt-4o", "gemini-3-flash-preview")
- JUDGE_MODEL — Model for LLM Judge evaluator
- OPENAI_API_KEY — OpenAI API key
- ANTHROPIC_API_KEY — Anthropic API key
- GOOGLE_API_KEY — Google AI API key
-
-Available Tools (14):
- Eval Runner:
- - run_evals_from_file: Run evaluations from a YAML file
- - run_evals_from_yaml: Run evaluations from YAML content string
- - run_evals_with_fixtures: Run evaluations with fixture injection
- - validate_yaml_spec: Validate a YAML eval specification
- - check_function_compatibility: Check function compatibility with eval generation
- - list_yaml_files: List YAML files in a directory
-
- EvalGenerator:
- - generate_function: Generate a Python function from description
- - generate_eval_spec: Generate eval spec for a function
- - generate_and_run_evals: Generate spec + run + auto-retry + heal
-
- TDDGenerator:
- - tdd_generate_signature: Generate function signature from description
- - tdd_generate_evals: Generate eval spec from a signature
- - tdd_generate_implementation: Generate implementation from signature + spec
- - tdd_generate_all: Full TDD flow: description → signature → evals → implementation
- - tdd_generate_and_validate: TDD with eval validation against implementation
-
-Available Resources:
- - vowel://context: Eval specification documentation
- - vowel://example: Example YAML eval specification
-"""
+"""MCP server exposing vowel evaluation, generation, and TDD tools."""
from __future__ import annotations
@@ -67,11 +8,12 @@
import nest_asyncio
from mcp.server.fastmcp import FastMCP
-from vowel import check_compatibility, load_evals_from_yaml_string, run_evals
+from vowel import check_compatibility, run_evals
from vowel.ai import EVAL_SPEC_CONTEXT, EvalGenerator
from vowel.monitoring import enable_monitoring
from vowel.runner import Function, RunEvals
from vowel.tdd import TDDGenerator
+from vowel.utils import load_bundle_from_yaml_string
enable_monitoring(service_name="vowel-mcp")
@@ -204,8 +146,8 @@ def validate_yaml_spec(yaml_content: str) -> dict[str, Any]:
yaml_content: YAML eval specification to validate
"""
try:
- evals = load_evals_from_yaml_string(yaml_content)
- function_names = list(evals.keys())
+ bundle = load_bundle_from_yaml_string(yaml_content)
+ function_names = list(bundle.evals.keys())
return {
"valid": True,
"functions": function_names,
diff --git a/src/vowel/runner.py b/src/vowel/runner.py
index 9906018..66f39c2 100644
--- a/src/vowel/runner.py
+++ b/src/vowel/runner.py
@@ -1,28 +1,4 @@
-"""RunEvals - A fluent API for running evaluations.
-
-This module provides:
-- Function: Pydantic model representing a function with code and metadata
-- RunEvals: Fluent API for loading and running evaluations
-
-Example:
- # Run from YAML file
- from vowel import RunEvals
-
- summary = RunEvals.from_file("evals.yml").run()
- print(f"All passed: {summary.all_passed}")
-
- # Run with custom functions
- def my_func(x):
- return x * 2
-
- summary = (
- RunEvals.from_file("evals.yml")
- .with_functions({"my_func": my_func})
- .filter(["my_func"])
- .debug()
- .run()
- )
-"""
+"""Fluent APIs and models for loading and running evals."""
import ast
import codecs
@@ -37,7 +13,7 @@ def my_func(x):
from .eval_types import Evals, EvalsFile, FixtureDefinition
from .executor import Executor
-from .utils import EvalSummary, EvalsBundle
+from .utils import EvalsBundle, EvalSummary
from .utils import run_evals as _run_evals
_T = TypeVar("_T", bound=Any)
@@ -76,12 +52,7 @@ def __name__(self) -> str: # pyright: ignore[reportIncompatibleVariableOverride
@property
def impl(self) -> Callable[..., _RT]:
- """
- Get the function implementation as a callable.
-
- Returns:
- Callable: The function implementation.
- """
+ """Return the executable function object for this definition."""
if not self.func:
self.execute()
return cast(Callable, self.func)
diff --git a/src/vowel/schema.py b/src/vowel/schema.py
new file mode 100644
index 0000000..80e4647
--- /dev/null
+++ b/src/vowel/schema.py
@@ -0,0 +1,115 @@
+"""Versioned JSON Schema cache and YAML header helpers."""
+
+from __future__ import annotations
+
+import importlib.metadata
+import json
+import re
+from copy import deepcopy
+from pathlib import Path
+from typing import Any
+
+from .utils import EvalsBundle
+
+SCHEMA_CACHE_DIR = Path.home() / ".vowel"
+
+
+def _schema_version_token(version: str | None = None) -> str:
+    """Collapse a version string into a digits-only token for cache file names."""
+    if version is None:
+        # Default to the installed package version, or "0.0.0" when it is not installed.
+        try:
+            version = importlib.metadata.version("vowel")
+        except importlib.metadata.PackageNotFoundError:
+            version = "0.0.0"
+
+    # Concatenate every digit run in order; "000" stands in when there is none.
+    digits = "".join(re.findall(r"\d+", version))
+    return digits or "000"
+
+
+def build_yaml_schema_from_bundle() -> dict[str, Any]:
+    """Derive the JSON Schema for a vowel YAML file from the runtime models.
+
+    No repository reference file is used. The schema generated for
+    ``EvalsBundle`` is reshaped so that the root object matches vowel's
+    YAML file layout:
+    - an optional top-level `fixtures` property
+    - every other top-level key validated as a per-function `Evals` entry
+    """
+    bundle_schema = EvalsBundle.model_json_schema(ref_template="#/$defs/{model}")
+    defs = bundle_schema.get("$defs", {})
+    properties = bundle_schema.get("properties", {})
+    # Fall back to a bare object schema when the bundle exposes no `fixtures` property.
+    default_fixtures = {"type": "object", "title": "Fixtures"}
+    fixtures_schema = properties.get("fixtures", default_fixtures)
+
+    per_function: dict[str, Any]
+    if "Evals" not in defs:
+        # No named `Evals` definition: reuse the value schema of the `evals` map.
+        evals_schema = properties.get("evals", {"type": "object"})
+        per_function = evals_schema.get("additionalProperties", {"type": "object"})
+    else:
+        # The function name is the top-level YAML key, so `id` must not be
+        # required in each map value even though the runtime Evals model has it.
+        entry = deepcopy(defs["Evals"])
+        required = entry.get("required")
+        if isinstance(required, list):
+            entry["required"] = [name for name in required if name != "id"]
+        entry["title"] = "Function"
+        entry["description"] = (
+            "Function evaluation specification keyed by function import path/name. "
+            "Contains fixture dependencies, global evaluators (`evals`), and dataset cases."
+        )
+        # Publish the relaxed copy under its own name; `Evals` itself stays untouched.
+        defs["EvalsMapValue"] = entry
+        per_function = {"$ref": "#/$defs/EvalsMapValue"}
+
+    # Root: `fixtures` is the only named property; all other keys are functions.
+    schema: dict[str, Any] = {
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "type": "object",
+        "properties": {
+            "fixtures": fixtures_schema,
+        },
+        "additionalProperties": per_function,
+        "$defs": defs,
+    }
+
+    return schema
+
+
+def ensure_cached_schema(version: str | None = None) -> Path:
+    """Return the versioned schema path, (re)writing the file if missing or stale."""
+    token = _schema_version_token(version)
+    target = SCHEMA_CACHE_DIR / f"vowel-schema_{token}.json"
+    target.parent.mkdir(parents=True, exist_ok=True)
+
+    rendered = json.dumps(build_yaml_schema_from_bundle(), indent=2, ensure_ascii=False) + "\n"
+
+    # Rewrite only on a missing or stale file so an up-to-date cache is left untouched.
+    is_current = target.exists() and target.read_text(encoding="utf-8") == rendered
+    if not is_current:
+        target.write_text(rendered, encoding="utf-8")
+    return target
+
+
+def add_schema_header(yaml_spec: str, schema_path: Path | str) -> str:
+    """Return *yaml_spec* prefixed with a yaml-language-server ``$schema`` comment."""
+    prefix = "# yaml-language-server: $schema="
+    header = prefix + str(schema_path)
+
+    rows = yaml_spec.splitlines()
+    if rows and rows[0].startswith(prefix):
+        # Replace an existing header; also swallow the single blank line after it.
+        drop = 2 if rows[1:2] == [""] else 1
+        rows = rows[drop:]
+
+    body = "\n".join(rows).rstrip("\n")
+    return f"{header}\n\n{body}\n"
+
+
+def materialize_yaml_with_schema_header(yaml_spec: str, version: str | None = None) -> str:
+    """Refresh the schema cache for *version* and return *yaml_spec* headed by its path."""
+    cached_schema = ensure_cached_schema(version)
+    return add_schema_header(yaml_spec, cached_schema)
diff --git a/src/vowel/spec_validation.py b/src/vowel/spec_validation.py
deleted file mode 100644
index 293762c..0000000
--- a/src/vowel/spec_validation.py
+++ /dev/null
@@ -1,355 +0,0 @@
-"""Shared spec validation utilities for eval generation pipelines.
-
-Functions in this module are used by both ``CodeModeGenerator`` and
-``TDDGenerator`` to validate generated YAML specs against real execution
-and to inject measured durations.
-"""
-
-from __future__ import annotations
-
-from typing import Any
-
-import logfire
-import yaml
-
-from vowel.executor import Executor, resolve_executors
-from vowel.runner import Function
-from vowel.utils import EvalSummary
-
-
-def build_failure_context(summary: EvalSummary) -> str:
- """Build a concise failure report to inject into a retry prompt.
-
- Iterates over :class:`EvalSummary` results and formats each failed
- case/assertion as a single line. Returns a multi-line string suitable
- for LLM prompts.
- """
- lines: list[str] = []
- for result in summary.results:
- if result.report:
- for case in result.report.cases:
- failed_assertions = {k: v for k, v in case.assertions.items() if not v.value}
- if failed_assertions:
- parts = []
- for k, v in failed_assertions.items():
- if v.reason:
- parts.append(f"{k}: {v.reason}")
- else:
- parts.append(f"{k}: FAILED")
- lines.append(f"- Case '{case.name}' FAILED [{', '.join(parts)}]")
- if result.error:
- lines.append(f"- Error: {result.error}")
- return "\n".join(lines) if lines else "Unknown failures"
-
-
-def build_call_code(
- func_name: str, case: dict
-) -> (
- str | None
-): # TODO: intead of building call code, consider passing arguments through executor inputs
- """Build a ``func(args...)`` call string from a YAML case dict.
-
- Returns ``None`` when no input is present (e.g. raises-only case
- without input).
- """
- if "inputs" in case and case["inputs"] is not None:
- args = case["inputs"]
- if isinstance(args, list):
- arg_strs = ", ".join(repr(a) for a in args)
- return f"{func_name}({arg_strs})"
- if isinstance(args, dict):
- kwarg_strs = ", ".join(f"{k}={v!r}" for k, v in args.items())
- return f"{func_name}({kwarg_strs})"
- elif "input" in case and case["input"] is not None:
- return f"{func_name}({case['input']!r})"
- return None
-
-
-def inject_durations(
- yaml_spec: str,
- func: Function,
- executor: Executor,
- *,
- fallback_executor: Executor | None = None,
- buffer_pct: float = 0.5,
- floor_ms: float = 10.0,
-) -> str:
- """Add per-case ``duration`` fields based on actual execution times.
-
- Each non-raises case is executed once via the executor session.
- The measured ``duration_ms`` is inflated by *buffer_pct* (default 50%)
- with a minimum of *floor_ms* (default 10 ms) to absorb noise.
-
- Parameters
- ----------
- yaml_spec:
- YAML string to augment.
- func:
- Function to execute cases against.
- executor:
- Executor backend to use for timing.
- buffer_pct:
- Fractional buffer added on top of measured time (0.5 = +50%).
- floor_ms:
- Absolute minimum duration in ms — protects sub-ms cases from
- flaky failures due to measurement noise.
- """
- spec = yaml.safe_load(yaml_spec)
- if not isinstance(spec, dict):
- return yaml_spec
-
- executor = resolve_executors(executor, fallback_executor)
-
- try:
- session = executor.create_session(func.code)
- except Exception:
- logfire.warn("Could not create session for duration injection")
- return yaml_spec
-
- with session:
- for eval_id, eval_def in spec.items():
- if not isinstance(eval_def, dict):
- continue
- for case_entry in eval_def.get("dataset", []):
- case = case_entry.get("case", {})
- if not isinstance(case, dict):
- continue
- # Skip cases that expect exceptions
- if case.get("raises"):
- continue
-
- call_code = build_call_code(eval_id, case)
- if call_code is None:
- continue
-
- result = session.feed(call_code)
- if result.success:
- dur = max(
- result.duration_ms * (1 + buffer_pct),
- floor_ms,
- )
- case["duration"] = round(dur, 1)
-
- return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
-
-
-def validate_expected_values(
- yaml_spec: str,
- func: Function,
- executor: Executor | None = None,
- fallback_executor: Executor | None = None,
-) -> str:
- """Validate and fix expected values in a YAML spec by executing cases.
-
- For each case that has ``expected`` and no ``raises``, executes the
- function call and compares the result. If the actual output differs
- from the YAML expected value, the YAML is updated to the real value.
-
- Also validates ``raises`` cases: if the case expects an exception but
- the function doesn't raise (or raises a different type), the case is
- corrected.
-
- Parameters
- ----------
- yaml_spec:
- YAML spec string to validate.
- func:
- Function to execute.
- executor:
- Executor backend. Defaults to Monty-first with Default fallback.
-
- Returns
- -------
- str
- Fixed YAML spec with corrected expected values.
- """
- executor = resolve_executors(executor, fallback_executor)
-
- spec = yaml.safe_load(yaml_spec)
- if not isinstance(spec, dict):
- return yaml_spec
-
- try:
- session = executor.create_session(func.code)
- except Exception:
- logfire.warn("Could not create session for expected value validation")
- return yaml_spec
-
- fixes_applied = 0
-
- with session:
- for eval_id, eval_def in spec.items():
- if not isinstance(eval_def, dict):
- continue
- for case_entry in eval_def.get("dataset", []):
- case = case_entry.get("case", {})
- if not isinstance(case, dict):
- continue
-
- call_code = build_call_code(eval_id, case)
- if call_code is None:
- continue
-
- result = session.feed(call_code)
-
- # --- Fix expected values ---
- if (
- "expected" in case
- and not case.get("raises")
- and result.success
- and result.output != case["expected"]
- ):
- logfire.info(
- "Fixing expected value for case: {expected} → {actual}",
- expected=repr(case["expected"]),
- actual=repr(result.output),
- )
- case["expected"] = result.output
- fixes_applied += 1
-
- # --- Fix raises cases ---
- if case.get("raises"):
- expected_exc = case["raises"]
- if result.success:
- # Function didn't raise — remove raises, set expected
- logfire.info(
- "Case expected {exc} but function returned {output}, fixing",
- exc=expected_exc,
- output=repr(result.output),
- )
- del case["raises"]
- if "match" in case:
- del case["match"]
- case["expected"] = result.output
- fixes_applied += 1
- elif result.error_type and result.error_type != expected_exc:
- # Wrong exception type
- logfire.info(
- "Case expected {expected} but got {actual}, fixing",
- expected=expected_exc,
- actual=result.error_type,
- )
- case["raises"] = result.error_type
- fixes_applied += 1
-
- if fixes_applied > 0:
- logfire.info("Validated spec: {count} fixes applied", count=fixes_applied)
- return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
-
- return yaml_spec
-
-
-def inject_missing_error_cases(
- yaml_spec: str,
- func_name: str,
- error_snippets: list[dict],
-) -> str:
- """Inject error cases from exploration into the spec if the LLM missed them.
-
- Each item in *error_snippets* should have keys:
-
- - ``code``: Python snippet that triggered the error (e.g. ``"flatten(None)"``)
- - ``error_type``: Exception class name (e.g. ``"TypeError"``)
- - ``error``: Full error message
- - ``description``: One-line description
-
- Uses :mod:`ast` to extract function call arguments from the snippet
- code. If parsing fails (multi-line code, complex expressions), the
- snippet is silently skipped.
-
- Returns the (possibly modified) YAML spec string.
- """
- import ast
-
- if not error_snippets:
- return yaml_spec
-
- spec = yaml.safe_load(yaml_spec)
- if not isinstance(spec, dict) or func_name not in spec:
- return yaml_spec
-
- eval_def = spec[func_name]
- dataset = eval_def.setdefault("dataset", [])
-
- # Collect existing raises case inputs to avoid duplicates
- existing_raises_inputs: set[str] = set()
- for entry in dataset:
- case = entry.get("case", {})
- if isinstance(case, dict) and case.get("raises"):
- # Normalise existing input for comparison
- inp = case.get("input")
- inps = case.get("inputs")
- existing_raises_inputs.add(repr((inp, inps)))
-
- injected = 0
-
- for snippet in error_snippets:
- code = snippet["code"].strip()
- error_type = snippet["error_type"]
- description = snippet.get("description", "")
-
- # Try to extract arguments from a simple function call
- try:
- tree = ast.parse(code, mode="eval")
- except SyntaxError:
- continue
-
- if not isinstance(tree.body, ast.Call):
- continue
-
- try:
- args = [ast.literal_eval(a) for a in tree.body.args]
- kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in tree.body.keywords}
- except (ValueError, TypeError):
- # Complex expression that can't be literal-evaluted — skip
- continue
-
- # Determine input/inputs format
- if kwargs:
- input_repr = repr((None, kwargs))
- if input_repr in existing_raises_inputs:
- continue
- case_dict: dict[str, Any] = {
- "id": f"error_{error_type.lower()}_{injected}",
- "inputs": kwargs,
- "raises": error_type,
- }
- elif len(args) == 1:
- # Tuples cannot be represented in yaml.safe_load()-compatible YAML.
- # Other non-list inputs (None, int, str, dict) already cover the
- # same TypeError path, so skip rather than convert and break semantics.
- if isinstance(args[0], tuple):
- continue
- input_repr = repr((args[0], None))
- if input_repr in existing_raises_inputs:
- continue
- case_dict = {
- "id": f"error_{error_type.lower()}_{injected}",
- "input": args[0],
- "raises": error_type,
- }
- elif len(args) > 1:
- input_repr = repr((None, args))
- if input_repr in existing_raises_inputs:
- continue
- case_dict = {
- "id": f"error_{error_type.lower()}_{injected}",
- "inputs": args,
- "raises": error_type,
- }
- else:
- continue
-
- dataset.append({"case": case_dict})
- injected += 1
- logfire.info(
- "Injected error case: {desc} → raises {exc}",
- desc=description,
- exc=error_type,
- )
-
- if injected > 0:
- logfire.info("Injected {count} missing error cases into spec", count=injected)
- return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
-
- return yaml_spec
diff --git a/src/vowel/tdd.py b/src/vowel/tdd.py
index 005fc27..5a69bdf 100644
--- a/src/vowel/tdd.py
+++ b/src/vowel/tdd.py
@@ -1,24 +1,4 @@
-"""TDD-based eval generation: Intent -> Signature -> Evals -> Implementation.
-
-This module provides a true TDD approach where:
-1. LLM generates function signature from description (intent)
-2. LLM generates eval spec from signature (tests first)
-3. LLM generates implementation that passes the evals (code last)
-
-Example:
- from vowel.tdd import TDDGenerator
-
- generator = TDDGenerator(model="openai:gpt-4o")
-
- result = generator.generate_all(
- description="Binary search for target in sorted list. Returns index or -1.",
- name="binary_search"
- )
-
- print(result.signature.to_signature_str())
- print(result.yaml_spec)
- print(result.func.code)
-"""
+"""TDD pipeline for generating signatures, evals, and implementations."""
import inspect
import os
@@ -39,12 +19,12 @@
from vowel.executor import Executor, resolve_executors
from vowel.monitoring import enable_monitoring
from vowel.runner import Function, RunEvals
-from vowel.spec_validation import (
+from vowel.utils import EvalSummary
+from vowel.validation import (
build_failure_context,
+ validate_and_fix_spec,
validate_expected_values,
)
-from vowel.utils import EvalSummary
-from vowel.validation import validate_and_fix_spec
# Configure logfire for tracing
dotenv.load_dotenv()
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index b092d8a..1c710c8 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -1,23 +1,4 @@
-"""Utility functions for the vowel evaluation framework.
-
-This module provides core utilities for:
-- Loading and parsing YAML evaluation specifications
-- Type compatibility checking for YAML serialization
-- Function import and execution helpers
-- Dataset creation and evaluation running
-- Result aggregation and reporting
-
-Key classes:
- EvalResult: Result of a single function evaluation
- EvalSummary: Aggregated results from multiple evaluations
-
-Key functions:
- run_evals: Main entry point for running evaluations
- load_evals: Load evaluations from various sources
- to_dataset: Convert Evals to pydantic-evals Dataset
- is_yaml_serializable_type: Check if a type can be serialized to YAML
- check_compatibility: Validate function parameters for YAML compatibility
-"""
+"""Shared utilities for loading specs, building datasets, and running evals."""
import asyncio
import builtins
@@ -68,9 +49,40 @@ class EvalsBundle(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
- evals: dict[str, Evals] = Field(default_factory=dict)
+ evals: dict[str, Evals] = Field(min_length=1)
fixtures: dict[str, FixtureDefinition] = Field(default_factory=dict)
+    def to_yaml(self) -> str:
+        """Render the bundle as a vowel YAML spec string.
+
+        Each function id becomes a top-level key holding its dumped ``Evals``
+        (without ``id``); a non-empty fixture map is added under ``fixtures``.
+        """
+        dump_options: dict[str, Any] = {
+            "mode": "python",
+            "exclude_none": True,
+            "exclude_defaults": True,
+        }
+        document: dict[str, Any] = {}
+        for func_id, evals in self.evals.items():
+            entry = evals.model_dump(**dump_options)
+            # The top-level YAML key already carries the function id.
+            entry.pop("id", None)
+            document[func_id] = entry
+
+        if self.fixtures:
+            document["fixtures"] = {
+                name: definition.model_dump(**dump_options)
+                for name, definition in self.fixtures.items()
+            }
+
+        return yaml.safe_dump(
+            document,
+            sort_keys=False,
+            allow_unicode=True,
+            default_flow_style=False,
+        )
+
# =============================================================================
# YAML Serializable Types
@@ -909,50 +921,6 @@ def import_class(class_path: str) -> type:
return cls
-def load_evals_file(yaml_path: str) -> dict[str, Evals]:
- with open(yaml_path) as f:
- loaded = yaml.safe_load(f)
-
- evals_file = EvalsFile.model_validate(loaded)
- return evals_file.get_evals()
-
-
-def load_evals_from_yaml_string(yaml_content: str) -> dict[str, Evals]:
- loaded = yaml.safe_load(yaml_content)
- evals_file = EvalsFile.model_validate(loaded)
- return evals_file.get_evals()
-
-
-def load_evals_from_dict(data: dict) -> dict[str, Evals]:
- evals_file = EvalsFile.model_validate(data)
- return evals_file.get_evals()
-
-
-def load_evals_from_object(evals_obj: EvalsFile) -> dict[str, Evals]:
- return evals_obj.get_evals()
-
-
-def load_evals(source: str | Path | dict | EvalsFile) -> dict[str, Evals]:
- if isinstance(source, EvalsFile):
- return load_evals_from_object(source)
- elif isinstance(source, dict):
- return load_evals_from_dict(source)
- elif isinstance(source, (str, Path)):
- source_str = str(source)
- # Check if it's an existing file path first, before YAML heuristics
- if os.path.exists(source_str):
- return load_evals_file(source_str)
- if _is_yaml_source_string(source_str):
- return load_evals_from_yaml_string(source_str)
- else:
- return load_evals_file(source_str)
- else:
- raise TypeError(
- f"source must be a file path (str/Path), YAML string (str), dict, "
- f"or EvalsFile object, got {type(source)}"
- )
-
-
# =============================================================================
# Bundle Loading Functions (with fixtures)
# =============================================================================
@@ -2018,10 +1986,7 @@ def run_evals(
"""
# Load both evals and fixtures from YAML
_ = (executor, fallback_executor)
- if isinstance(source, EvalsBundle):
- bundle = source
- else:
- bundle = load_bundle(source)
+ bundle = source if isinstance(source, EvalsBundle) else load_bundle(source)
all_evals = bundle.evals
yaml_fixtures = bundle.fixtures
diff --git a/src/vowel/validation.py b/src/vowel/validation.py
index b989f9e..20636dd 100644
--- a/src/vowel/validation.py
+++ b/src/vowel/validation.py
@@ -1,24 +1,17 @@
-"""Static validator for LLM-generated eval specifications.
-
-Catches common LLM generation mistakes BEFORE the spec is used:
-1. Extra fields in cases (comment, note, description, etc.)
-2. YAML-unparseable type remnants (set literals, tuple strings, float('inf'), etc.)
-3. Invented exception types not in function code
-4. Removes or fixes problematic cases, returns clean YAML
-
-Usage:
- from vowel.validation import validate_and_fix_spec
-
- fixed_yaml, warnings = validate_and_fix_spec(yaml_spec, function_code="def foo(x): ...")
-"""
+"""Validation and normalization helpers for generated eval specs."""
+import ast
import re
from dataclasses import dataclass, field
-from typing import Literal
+from typing import Any, Literal
import logfire
import yaml
+from vowel.executor import Executor, resolve_executors
+from vowel.runner import Function
+from vowel.utils import EvalSummary
+
# Fields allowed in a case block (from MatchCase model)
ALLOWED_CASE_FIELDS = frozenset(
{
@@ -399,3 +392,261 @@ def validate_and_fix_spec(
)
return result
+
+
+def build_failure_context(summary: EvalSummary) -> str:
+    """Summarise failed assertions and errors, one line each, for a retry prompt."""
+    report_lines: list[str] = []
+    for result in summary.results:
+        cases = result.report.cases if result.report else []
+        for case in cases:
+            # One part per failed assertion; "FAILED" stands in for a missing reason.
+            parts = [
+                f"{name}: {outcome.reason}" if outcome.reason else f"{name}: FAILED"
+                for name, outcome in case.assertions.items()
+                if not outcome.value
+            ]
+            if parts:
+                report_lines.append(f"- Case '{case.name}' FAILED [{', '.join(parts)}]")
+        if result.error:
+            report_lines.append(f"- Error: {result.error}")
+
+    return "\n".join(report_lines) if report_lines else "Unknown failures"
+
+
+def build_call_code(func_name: str, case: dict) -> str | None:
+    """Build a ``func(args...)`` call string from a YAML case dict.
+
+    ``inputs`` (list → positional, dict → keyword) takes precedence over
+    ``input``; returns ``None`` when neither yields a call.
+    """
+    # TODO: instead of building call code, consider passing arguments through executor inputs
+    many = case.get("inputs")
+    if many is not None:
+        if isinstance(many, list):
+            return f"{func_name}({', '.join(repr(a) for a in many)})"
+        if isinstance(many, dict):
+            return f"{func_name}({', '.join(f'{k}={v!r}' for k, v in many.items())})"
+        return None
+    if case.get("input") is not None:
+        return f"{func_name}({case['input']!r})"
+    return None
+
+
+def inject_durations(
+    yaml_spec: str,
+    func: Function,
+    executor: Executor,
+    *,
+    fallback_executor: Executor | None = None,
+    buffer_pct: float = 0.5,
+    floor_ms: float = 10.0,
+) -> str:
+    """Add per-case ``duration`` fields based on actual execution times.
+
+    Every non-``raises`` case that yields a call is fed to an executor session
+    once; its measured ``duration_ms`` is scaled by ``1 + buffer_pct`` and kept
+    at no less than *floor_ms*. Returns *yaml_spec* itself when it is not a
+    mapping or no session could be created.
+    """
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict):
+        return yaml_spec
+
+    runner = resolve_executors(executor, fallback_executor)
+    try:
+        session = runner.create_session(func.code)
+    except Exception:
+        logfire.warn("Could not create session for duration injection")
+        return yaml_spec
+
+    with session:
+        for eval_id, eval_def in spec.items():
+            if not isinstance(eval_def, dict):
+                continue
+            for case_entry in eval_def.get("dataset", []):
+                case = case_entry.get("case", {})
+                # Cases that expect an exception are not timed.
+                if not isinstance(case, dict) or case.get("raises"):
+                    continue
+                call_code = build_call_code(eval_id, case)
+                if call_code is None:
+                    continue
+
+                outcome = session.feed(call_code)
+                if outcome.success:
+                    padded = outcome.duration_ms * (1 + buffer_pct)
+                    case["duration"] = round(max(padded, floor_ms), 1)
+
+    return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+
+def validate_expected_values(
+    yaml_spec: str,
+    func: Function,
+    executor: Executor | None = None,
+    fallback_executor: Executor | None = None,
+) -> str:
+    """Validate and fix expected values in a YAML spec by executing cases.
+
+    Every case that yields a call is run once in an executor session, then:
+
+    - an ``expected`` that differs from the actual output is overwritten;
+    - a ``raises`` case that returns normally loses ``raises``/``match`` and
+      gets ``expected`` set to the output;
+    - a ``raises`` case failing with another error type gets that type.
+
+    Returns *yaml_spec* itself when nothing was fixed or no session could be
+    created; otherwise the corrected spec re-serialized as YAML.
+    """
+    executor = resolve_executors(executor, fallback_executor)
+
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict):
+        return yaml_spec
+
+    try:
+        session = executor.create_session(func.code)
+    except Exception:
+        logfire.warn("Could not create session for expected value validation")
+        return yaml_spec
+
+    fixes_applied = 0
+    with session:
+        for eval_id, eval_def in spec.items():
+            if not isinstance(eval_def, dict):
+                continue
+            for case_entry in eval_def.get("dataset", []):
+                case = case_entry.get("case", {})
+                if not isinstance(case, dict):
+                    continue
+                call_code = build_call_code(eval_id, case)
+                if call_code is None:
+                    continue
+
+                result = session.feed(call_code)
+                expected_exc = case.get("raises")
+                if expected_exc:
+                    if result.success:
+                        logfire.info(
+                            "Case expected {exc} but function returned {output}, fixing",
+                            exc=expected_exc,
+                            output=repr(result.output),
+                        )
+                        del case["raises"]
+                        case.pop("match", None)
+                        case["expected"] = result.output
+                        fixes_applied += 1
+                    elif result.error_type and result.error_type != expected_exc:
+                        logfire.info(
+                            "Case expected {expected} but got {actual}, fixing",
+                            expected=expected_exc,
+                            actual=result.error_type,
+                        )
+                        case["raises"] = result.error_type
+                        fixes_applied += 1
+                elif "expected" in case and result.success and result.output != case["expected"]:
+                    logfire.info(
+                        "Fixing expected value for case: {expected} → {actual}",
+                        expected=repr(case["expected"]),
+                        actual=repr(result.output),
+                    )
+                    case["expected"] = result.output
+                    fixes_applied += 1
+
+    if not fixes_applied:
+        return yaml_spec
+    logfire.info("Validated spec: {count} fixes applied", count=fixes_applied)
+    return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+
+def inject_missing_error_cases(
+    yaml_spec: str,
+    func_name: str,
+    error_snippets: list[dict],
+) -> str:
+    """Inject error cases from exploration into the spec if the LLM missed them.
+
+    Each snippet needs ``code`` (one call with literal arguments, such as
+    ``"flatten(None)"``) and ``error_type``; ``description`` is only logged.
+    Unparseable snippets and inputs that already have a ``raises`` case are
+    skipped. Returns *yaml_spec* itself when nothing was injected.
+    """
+    if not error_snippets:
+        return yaml_spec
+
+    spec = yaml.safe_load(yaml_spec)
+    if not isinstance(spec, dict) or func_name not in spec:
+        return yaml_spec
+
+    eval_def = spec[func_name]
+    dataset = eval_def.setdefault("dataset", [])
+
+    # Inputs already covered by a raises case, keyed as repr((input, inputs)).
+    seen_inputs: set[str] = set()
+    for entry in dataset:
+        case = entry.get("case", {})
+        if isinstance(case, dict) and case.get("raises"):
+            seen_inputs.add(repr((case.get("input"), case.get("inputs"))))
+
+    injected = 0
+    for snippet in error_snippets:
+        code = snippet["code"].strip()
+        error_type = snippet["error_type"]
+        description = snippet.get("description", "")
+
+        try:
+            tree = ast.parse(code, mode="eval")
+        except (SyntaxError, ValueError):
+            continue
+
+        call = tree.body
+        if not isinstance(call, ast.Call):
+            continue
+        # ``f(**mapping)`` has no keyword name, so it cannot become an ``inputs`` key.
+        if any(kw.arg is None for kw in call.keywords):
+            continue
+
+        try:
+            args = [ast.literal_eval(a) for a in call.args]
+            kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
+        except (ValueError, TypeError, RecursionError):
+            # Not a plain literal (name, nested call, ``*args`` ...): skip the snippet.
+            continue
+
+        if kwargs:
+            # NOTE: positional arguments mixed with keywords are ignored here.
+            field, value, key = "inputs", kwargs, repr((None, kwargs))
+        elif len(args) == 1:
+            # Tuples do not round-trip through safe YAML, so a lone tuple is skipped.
+            if isinstance(args[0], tuple):
+                continue
+            field, value, key = "input", args[0], repr((args[0], None))
+        elif args:
+            field, value, key = "inputs", args, repr((None, args))
+        else:
+            continue
+
+        if key in seen_inputs:
+            continue
+        # Record the input so a repeated snippet is not injected twice.
+        seen_inputs.add(key)
+
+        case_dict: dict[str, Any] = {
+            "id": f"error_{error_type.lower()}_{injected}",
+            field: value,
+            "raises": error_type,
+        }
+        dataset.append({"case": case_dict})
+        injected += 1
+        logfire.info(
+            "Injected error case: {desc} → raises {exc}",
+            desc=description,
+            exc=error_type,
+        )
+
+    if injected > 0:
+        logfire.info("Injected {count} missing error cases into spec", count=injected)
+        return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+    return yaml_spec
diff --git a/tests/test_executor.py b/tests/test_executor.py
index a23252c..1f99246 100644
--- a/tests/test_executor.py
+++ b/tests/test_executor.py
@@ -1,20 +1,4 @@
-"""Tests for vowel.executor — CodeMode execution backends.
-
-Covers MontyExecutor, DefaultExecutor, and get_executor factory across
-all injection modes: external_functions, inputs, both, and pure code.
-
-Tests:
- 1. External functions only
- 2. Inputs only
- 3. Inputs + external functions combined
- 4. Pure code (no injection)
- 5. Stdout capture
- 6. Error handling
- 7. ExecutionResult structure
- 8. Protocol conformance
- 9. get_executor factory
- 10. Parity — both executors produce the same output
-"""
+"""Tests for executor backends, factory selection, and output parity."""
from __future__ import annotations
diff --git a/tests/test_llm_integration.py b/tests/test_llm_integration.py
index e20b2e8..bcce51b 100644
--- a/tests/test_llm_integration.py
+++ b/tests/test_llm_integration.py
@@ -1,8 +1,4 @@
-"""LLM-based integration tests with cassette caching.
-
-These tests use real LLM calls but cache responses for reproducibility.
-Run with --update-cassettes to refresh cached responses.
-"""
+"""LLM integration tests with cassette-backed response caching."""
import hashlib
import json
diff --git a/tests/test_llm_judge.py b/tests/test_llm_judge.py
index 6a2996f..be4e30f 100644
--- a/tests/test_llm_judge.py
+++ b/tests/test_llm_judge.py
@@ -1,7 +1,4 @@
-"""LLM Judge evaluator tests with cassette caching.
-
-These tests specifically test the LLMJudge evaluator functionality.
-"""
+"""Tests for LLMJudge evaluator behavior using cassette caching."""
import hashlib
import json
diff --git a/tests/test_session.py b/tests/test_session.py
index b4b9d10..400749b 100644
--- a/tests/test_session.py
+++ b/tests/test_session.py
@@ -1,14 +1,4 @@
-"""Tests for ExecutionSession API — DefaultSession and MontyReplSession.
-
-Covers:
- - Basic feed() results (binary search)
- - Error handling (ZeroDivisionError)
- - Syntax error reporting
- - State preservation across feed() calls
- - Stdout capture through sessions
- - Context-manager lifecycle
- - Session isolation (fresh state per session)
-"""
+"""Tests for ExecutionSession behavior across default and Monty-backed sessions."""
from __future__ import annotations
diff --git a/tests/test_tdd_eval_retries.py b/tests/test_tdd_eval_retries.py
index b3c0fdc..9ab9b72 100644
--- a/tests/test_tdd_eval_retries.py
+++ b/tests/test_tdd_eval_retries.py
@@ -4,8 +4,8 @@
from unittest.mock import MagicMock, PropertyMock, patch
from vowel.eval_types import EvalsSource
-from vowel.spec_validation import build_failure_context
from vowel.tdd import FunctionSignature, Param, TDDGenerator
+from vowel.validation import build_failure_context
def _make_signature() -> FunctionSignature:
diff --git a/tests/test_yaml_loading.py b/tests/test_yaml_loading.py
index 2e56e3b..b2fb12b 100644
--- a/tests/test_yaml_loading.py
+++ b/tests/test_yaml_loading.py
@@ -6,59 +6,59 @@
from vowel import (
EvalsFile,
- load_evals,
- load_evals_file,
- load_evals_from_dict,
- load_evals_from_object,
- load_evals_from_yaml_string,
+ load_bundle,
+ load_bundle_file,
+ load_bundle_from_dict,
+ load_bundle_from_object,
+ load_bundle_from_yaml_string,
)
-class TestLoadEvalsFromYamlString:
- """Tests for load_evals_from_yaml_string function."""
+class TestLoadBundleFromYamlString:
+ """Tests for load_bundle_from_yaml_string function."""
def test_simple_yaml_loading(self, simple_yaml_spec: str):
"""Test loading a simple YAML spec."""
- evals = load_evals_from_yaml_string(simple_yaml_spec)
+ bundle = load_bundle_from_yaml_string(simple_yaml_spec)
- assert "add" in evals
- assert len(evals["add"].dataset) == 2
+ assert "add" in bundle.evals
+ assert len(bundle.evals["add"].dataset) == 2
def test_yaml_with_evaluators(self, yaml_with_evaluators: str):
"""Test loading YAML with evaluators."""
- evals = load_evals_from_yaml_string(yaml_with_evaluators)
+ bundle = load_bundle_from_yaml_string(yaml_with_evaluators)
- assert "is_even" in evals
- assert evals["is_even"].evals is not None
+ assert "is_even" in bundle.evals
+ assert bundle.evals["is_even"].evals is not None
def test_yaml_with_type_check(self, yaml_with_type_check: str):
"""Test loading YAML with type checking."""
- evals = load_evals_from_yaml_string(yaml_with_type_check)
+ bundle = load_bundle_from_yaml_string(yaml_with_type_check)
- assert "divide" in evals
- assert len(evals["divide"].dataset) == 2
+ assert "divide" in bundle.evals
+ assert len(bundle.evals["divide"].dataset) == 2
def test_yaml_with_raises(self, yaml_with_raises: str):
"""Test loading YAML with exception testing."""
- evals = load_evals_from_yaml_string(yaml_with_raises)
+ bundle = load_bundle_from_yaml_string(yaml_with_raises)
- assert "divide" in evals
- raises_cases = [c for c in evals["divide"].dataset if c.case.raises]
+ assert "divide" in bundle.evals
+ raises_cases = [c for c in bundle.evals["divide"].dataset if c.case.raises]
assert len(raises_cases) == 1
def test_empty_yaml_raises_error(self):
"""Test that empty YAML raises an error."""
with pytest.raises(Exception): # noqa: B017
- load_evals_from_yaml_string("")
+ load_bundle_from_yaml_string("")
def test_invalid_yaml_raises_error(self):
"""Test that invalid YAML raises an error."""
with pytest.raises(Exception): # noqa: B017
- load_evals_from_yaml_string("invalid: [unclosed")
+ load_bundle_from_yaml_string("invalid: [unclosed")
-class TestLoadEvalsFromDict:
- """Tests for load_evals_from_dict function."""
+class TestLoadBundleFromDict:
+ """Tests for load_bundle_from_dict function."""
def test_dict_loading(self):
"""Test loading from a dictionary."""
@@ -71,10 +71,10 @@ def test_dict_loading(self):
}
}
- evals = load_evals_from_dict(spec_dict)
+ bundle = load_bundle_from_dict(spec_dict)
- assert "multiply" in evals
- assert len(evals["multiply"].dataset) == 2
+ assert "multiply" in bundle.evals
+ assert len(bundle.evals["multiply"].dataset) == 2
def test_dict_with_evaluators(self):
"""Test loading dict with evaluators."""
@@ -88,60 +88,60 @@ def test_dict_with_evaluators(self):
}
}
- evals = load_evals_from_dict(spec_dict)
+ bundle = load_bundle_from_dict(spec_dict)
- assert "square" in evals
- assert evals["square"].evals is not None
+ assert "square" in bundle.evals
+ assert bundle.evals["square"].evals is not None
-class TestLoadEvalsFile:
- """Tests for load_evals_file function."""
+class TestLoadBundleFile:
+ """Tests for load_bundle_file function."""
def test_load_from_file(self, temp_yaml_file: Path):
"""Test loading from a YAML file."""
- evals = load_evals_file(str(temp_yaml_file))
+ bundle = load_bundle_file(str(temp_yaml_file))
- assert "add" in evals
+ assert "add" in bundle.evals
def test_nonexistent_file_raises_error(self):
"""Test that loading non-existent file raises error."""
with pytest.raises(FileNotFoundError):
- load_evals_file("nonexistent_file.yml")
+ load_bundle_file("nonexistent_file.yml")
-class TestLoadEvals:
- """Tests for the unified load_evals function."""
+class TestLoadBundle:
+ """Tests for the unified load_bundle function."""
def test_load_from_string(self, simple_yaml_spec: str):
- """Test load_evals with YAML string."""
- evals = load_evals(simple_yaml_spec)
- assert "add" in evals
+ """Test load_bundle with YAML string."""
+ bundle = load_bundle(simple_yaml_spec)
+ assert "add" in bundle.evals
def test_load_from_dict(self):
- """Test load_evals with dict."""
+ """Test load_bundle with dict."""
spec_dict = {"test": {"dataset": [{"case": {"input": 1, "expected": 1}}]}}
- evals = load_evals(spec_dict)
- assert "test" in evals
+ bundle = load_bundle(spec_dict)
+ assert "test" in bundle.evals
def test_load_from_path(self, temp_yaml_file: Path):
- """Test load_evals with Path object."""
- evals = load_evals(temp_yaml_file)
- assert "add" in evals
+ """Test load_bundle with Path object."""
+ bundle = load_bundle(temp_yaml_file)
+ assert "add" in bundle.evals
def test_load_from_evals_file_object(self, simple_yaml_spec: str):
- """Test load_evals with EvalsFile object."""
+ """Test load_bundle with EvalsFile object."""
import yaml
data = yaml.safe_load(simple_yaml_spec)
evals_file = EvalsFile.model_validate(data)
- evals = load_evals_from_object(evals_file)
- assert "add" in evals
+ bundle = load_bundle_from_object(evals_file)
+ assert "add" in bundle.evals
def test_invalid_source_type_raises_error(self):
"""Test that invalid source type raises TypeError."""
with pytest.raises(TypeError):
- load_evals(12345) # type: ignore[arg-type]
+ load_bundle(12345) # type: ignore[arg-type]
class TestInputFormats:
@@ -156,8 +156,8 @@ def test_single_input(self):
input: 5
expected: 10
"""
- evals = load_evals_from_yaml_string(yaml_spec)
- case = evals["double"].dataset[0].case
+ bundle = load_bundle_from_yaml_string(yaml_spec)
+ case = bundle.evals["double"].dataset[0].case
assert case.input == 5
def test_inputs_dict(self):
@@ -169,8 +169,8 @@ def test_inputs_dict(self):
inputs: { x: 1, y: 2 }
expected: 3
"""
- evals = load_evals_from_yaml_string(yaml_spec)
- case = evals["add"].dataset[0].case
+ bundle = load_bundle_from_yaml_string(yaml_spec)
+ case = bundle.evals["add"].dataset[0].case
assert case.inputs == {"x": 1, "y": 2}
def test_inputs_list(self):
@@ -182,6 +182,6 @@ def test_inputs_list(self):
inputs: [1, 2, 3]
expected: 6
"""
- evals = load_evals_from_yaml_string(yaml_spec)
- case = evals["add"].dataset[0].case
+ bundle = load_bundle_from_yaml_string(yaml_spec)
+ case = bundle.evals["add"].dataset[0].case
assert case.inputs == [1, 2, 3]
diff --git a/vowel-schema.json b/vowel-schema.json
index eded93b..241bbeb 100644
--- a/vowel-schema.json
+++ b/vowel-schema.json
@@ -3,16 +3,15 @@
"type": "object",
"properties": {
"fixtures": {
- "type": "object",
"additionalProperties": {
"$ref": "#/$defs/FixtureDefinition"
},
"title": "Fixtures",
- "description": "Dictionary of fixture definitions. Each key is the fixture name."
+ "type": "object"
}
},
"additionalProperties": {
- "$ref": "#/$defs/Evals"
+ "$ref": "#/$defs/EvalsMapValue"
},
"$defs": {
"AssertionCase": {
@@ -43,7 +42,9 @@
"type": "string"
}
},
- "required": ["assertion"],
+ "required": [
+ "assertion"
+ ],
"title": "AssertionCase",
"type": "object"
},
@@ -80,7 +81,9 @@
"description": "The test case containing input, expected output, and constraints."
}
},
- "required": ["case"],
+ "required": [
+ "case"
+ ],
"title": "DatasetCase",
"type": "object"
},
@@ -89,13 +92,20 @@
"properties": {
"duration": {
"description": "Maximum allowed duration in seconds. Test fails if execution takes longer.",
- "examples": [0.1, 1.0, 5.0, 0.001],
+ "examples": [
+ 0.1,
+ 1.0,
+ 5.0,
+ 0.001
+ ],
"exclusiveMinimum": 0,
"title": "Duration",
"type": "number"
}
},
- "required": ["duration"],
+ "required": [
+ "duration"
+ ],
"title": "DurationCase",
"type": "object"
},
@@ -105,13 +115,29 @@
"properties": {
"id": {
"description": "Function name to evaluate. Must match the actual function name.",
- "examples": ["is_prime", "calculate_sum", "process_data", "validate_email"],
+ "examples": [
+ "is_prime",
+ "calculate_sum",
+ "process_data",
+ "validate_email"
+ ],
"title": "Id",
"type": "string"
},
"fixture": {
"description": "List of fixture names this function depends on. Fixtures must be defined in the top-level 'fixtures' section. They will be injected as keyword-only arguments to the function.",
- "examples": [["db"], ["db", "cache"], ["redis"]],
+ "examples": [
+ [
+ "db"
+ ],
+ [
+ "db",
+ "cache"
+ ],
+ [
+ "redis"
+ ]
+ ],
"items": {
"type": "string"
},
@@ -121,28 +147,54 @@
"evals": {
"additionalProperties": {
"anyOf": [
- {"$ref": "#/$defs/IsInstanceCase"},
- {"$ref": "#/$defs/AssertionCase"},
- {"$ref": "#/$defs/DurationCase"},
- {"$ref": "#/$defs/ContainsInputCase"},
- {"$ref": "#/$defs/PatternMatchCase"},
- {"$ref": "#/$defs/LLMJudgeCase"}
+ {
+ "$ref": "#/$defs/IsInstanceCase"
+ },
+ {
+ "$ref": "#/$defs/AssertionCase"
+ },
+ {
+ "$ref": "#/$defs/DurationCase"
+ },
+ {
+ "$ref": "#/$defs/ContainsInputCase"
+ },
+ {
+ "$ref": "#/$defs/PatternMatchCase"
+ },
+ {
+ "$ref": "#/$defs/LLMJudgeCase"
+ }
]
},
"description": "Dictionary of evaluation rules that apply to ALL test cases. Each key is a descriptive name, value is the evaluation case. Use IsInstanceCase for type checks, AssertionCase for custom logic, DurationCase for performance constraints, ContainsInputCase for input containment, PatternMatchCase for regex pattern matching.",
"examples": [
{
- "IsInteger": {"type": "int"},
- "IsPositive": {"assertion": "output > 0"}
+ "IsInteger": {
+ "type": "int"
+ },
+ "IsPositive": {
+ "assertion": "output > 0"
+ }
},
{
- "IsUppercase": {"assertion": "output.isupper()"},
- "NotEmpty": {"assertion": "len(output) > 0"},
- "TypeCheck": {"type": "str"}
+ "IsUppercase": {
+ "assertion": "output.isupper()"
+ },
+ "NotEmpty": {
+ "assertion": "len(output) > 0"
+ },
+ "TypeCheck": {
+ "type": "str"
+ }
},
{
- "CorrectLogic": {"assertion": "(output and input > 0) or (not output and input <= 0)"},
- "IsBoolean": {"type": "bool"}
+ "CorrectLogic": {
+ "assertion": "(output and input > 0) or (not output and input <= 0)"
+ },
+ "IsBoolean": {
+ "type": "bool"
+ }
}
],
"title": "Evals",
@@ -152,17 +204,58 @@
"description": "List of test cases. Each case has input, expected output, and optional constraints. Should cover normal cases, edge cases, and corner cases.",
"examples": [
[
- {"case": {"expected": 4, "input": 2}},
- {"case": {"expected": 0, "input": 0}},
- {"case": {"expected": 9, "input": -3}}
+ {
+ "case": {
+ "expected": 4,
+ "input": 2
+ }
+ },
+ {
+ "case": {
+ "expected": 0,
+ "input": 0
+ }
+ },
+ {
+ "case": {
+ "expected": 9,
+ "input": -3
+ }
+ }
],
[
- {"case": {"expected": "HELLO", "input": "hello"}},
- {"case": {"expected": "WORLD", "input": "world"}}
+ {
+ "case": {
+ "expected": "HELLO",
+ "input": "hello"
+ }
+ },
+ {
+ "case": {
+ "expected": "WORLD",
+ "input": "world"
+ }
+ }
],
[
- {"case": {"expected": 5, "input": {"x": 2, "y": 3}}},
- {"case": {"expected": 30, "input": {"x": 10, "y": 20}}}
+ {
+ "case": {
+ "expected": 5,
+ "input": {
+ "x": 2,
+ "y": 3
+ }
+ }
+ },
+ {
+ "case": {
+ "expected": 30,
+ "input": {
+ "x": 10,
+ "y": 20
+ }
+ }
+ }
]
],
"items": {
@@ -173,7 +266,10 @@
"type": "array"
}
},
- "required": ["dataset"],
+ "required": [
+ "id",
+ "dataset"
+ ],
"title": "Evals",
"type": "object"
},
@@ -181,47 +277,69 @@
"description": "Definition of a single fixture with setup/teardown lifecycle.",
"properties": {
"setup": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Import path to setup function (e.g., 'fixtures.create_db'). Required if 'cls' is not specified.",
"title": "Setup"
},
"cls": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Import path to class (e.g., 'myapp.Database'). Class will be instantiated with args/kwargs.",
"title": "Cls"
},
"args": {
- "description": "Positional arguments to pass to class constructor (used with 'cls')",
+ "description": "Positional arguments unpacked into the callable: setup_func(*args) or MyClass(*args)",
"items": {},
"title": "Args",
"type": "array"
},
"kwargs": {
"additionalProperties": true,
- "description": "Keyword arguments to pass to class constructor (used with 'cls')",
+ "description": "Keyword arguments unpacked into the callable: setup_func(**kwargs) or MyClass(**kwargs)",
"title": "Kwargs",
"type": "object"
},
"teardown": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Import path to teardown function (e.g., 'fixtures.drop_db'). Can also be a class method (e.g., 'Connection.close') which will be called on the instance.",
"title": "Teardown"
},
"scope": {
"default": "function",
- "description": "Lifecycle scope: 'function' (per case), 'module' (per eval), or 'session' (per run)",
- "enum": ["function", "module", "session"],
+ "description": "Fixture lifecycle scope. Preferred names: 'case', 'eval', 'file'. Compatibility aliases are accepted: 'function', 'module', 'session'. Current runtime normalization maps case->function, eval->module, file->session.",
+ "enum": [
+ "case",
+ "eval",
+ "file",
+ "function",
+ "module",
+ "session"
+ ],
"title": "Scope",
"type": "string"
- },
- "params": {
- "additionalProperties": true,
- "description": "Parameters to pass to the setup function",
- "title": "Params",
- "type": "object"
}
},
"title": "FixtureDefinition",
@@ -232,18 +350,35 @@
"properties": {
"type": {
"description": "Python type as string to check against. Can use union types with '|'.",
- "examples": ["int", "str", "bool", "list", "dict", "int | float", "str | None"],
+ "examples": [
+ "int",
+ "str",
+ "bool",
+ "list",
+ "dict",
+ "int | float",
+ "str | None"
+ ],
"title": "Type",
"type": "string"
},
"strict": {
- "anyOf": [{"type": "boolean"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Whether to use strict mode for type validation. When True, performs stricter type checking.",
"title": "Strict"
}
},
- "required": ["type"],
+ "required": [
+ "type"
+ ],
"title": "IsInstanceCase",
"type": "object"
},
@@ -262,7 +397,18 @@
},
"include": {
"description": "List of context variables to include in the evaluation. Valid options: 'input', 'expected_output'.",
- "examples": [["input"], ["expected_output"], ["input", "expected_output"]],
+ "examples": [
+ [
+ "input"
+ ],
+ [
+ "expected_output"
+ ],
+ [
+ "input",
+ "expected_output"
+ ]
+ ],
"items": {
"type": "string"
},
@@ -276,7 +422,9 @@
"type": "object"
}
},
- "required": ["rubric"],
+ "required": [
+ "rubric"
+ ],
"title": "LLMJudgeCase",
"type": "object"
},
@@ -285,61 +433,181 @@
"description": "Test case with input, expected output, and optional constraints.",
"properties": {
"id": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Optional unique identifier for this test case.",
- "examples": ["test_positive_numbers", "edge_case_empty_list", "error_invalid_input"],
+ "examples": [
+ "test_positive_numbers",
+ "edge_case_empty_list",
+ "error_invalid_input"
+ ],
"title": "Id"
},
"input": {
- "anyOf": [{}, {"type": "null"}],
+ "anyOf": [
+ {},
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Single input value to pass to the function as the only argument. Use this when the function takes a single argument. Cannot be used together with 'inputs'.",
- "examples": [5, "hello", [1, 2, 3], {"x": 10, "y": 20}, {"name": "test", "value": 42}],
+ "examples": [
+ 5,
+ "hello",
+ [
+ 1,
+ 2,
+ 3
+ ],
+ {
+ "x": 10,
+ "y": 20
+ },
+ {
+ "name": "test",
+ "value": 42
+ }
+ ],
"title": "Input"
},
"inputs": {
"anyOf": [
- {"items": {}, "type": "array"},
- {"additionalProperties": true, "type": "object"},
- {"type": "null"}
+ {
+ "items": {},
+ "type": "array"
+ },
+ {
+ "additionalProperties": true,
+ "type": "object"
+ },
+ {
+ "type": "null"
+ }
],
"default": null,
"description": "Multiple input values to pass to the function as separate arguments (*args). Use this when the function takes multiple arguments. Cannot be used together with 'input'.",
- "examples": [[1, 2], [10, 20, 30], ["hello", "world"], [{"x": 1}, {"y": 2}]],
+ "examples": [
+ [
+ 1,
+ 2
+ ],
+ [
+ 10,
+ 20,
+ 30
+ ],
+ [
+ "hello",
+ "world"
+ ],
+ [
+ {
+ "x": 1
+ },
+ {
+ "y": 2
+ }
+ ]
+ ],
"title": "Inputs"
},
"expected": {
"description": "Expected output value. If provided, output will be compared for equality. Use `null` to expect None.",
- "examples": [25, "HELLO", [1, 3, 5], true, {"result": 30}, null],
+ "examples": [
+ 25,
+ "HELLO",
+ [
+ 1,
+ 3,
+ 5
+ ],
+ true,
+ {
+ "result": 30
+ },
+ null
+ ],
"title": "Expected"
},
"duration": {
- "anyOf": [{"exclusiveMinimum": 0, "type": "number"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "exclusiveMinimum": 0,
+ "type": "number"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Maximum allowed execution time in milliseconds for this specific case.",
- "examples": [100, 500, 1000, 50],
+ "examples": [
+ 100,
+ 500,
+ 1000,
+ 50
+ ],
"title": "Duration"
},
"contains": {
- "anyOf": [{}, {"type": "null"}],
+ "anyOf": [
+ {},
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Value that should be contained in the output.",
- "examples": ["substring", 42, "expected_key"],
+ "examples": [
+ "substring",
+ 42,
+ "expected_key"
+ ],
"title": "Contains"
},
"assertion": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Optional case-specific Python assertion expression. Same as global assertions but only for this case.\nAvailable variables: input, output, expected, duration, metadata.\nExamples: 'output > 0', 'len(output) == 3', 'output == input * 2'",
- "examples": ["output > 0", "len(output) == 3", "output % 2 == 0", "output in input"],
+ "examples": [
+ "output > 0",
+ "len(output) == 3",
+ "output % 2 == 0",
+ "output in input"
+ ],
"title": "Assertion"
},
"pattern": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Optional regex pattern to match against the output (converted to string) for this specific case.",
- "examples": ["^\\d+$", "^[A-Z]+$", ".*@.*\\.com$"],
+ "examples": [
+ "^\\d+$",
+ "^[A-Z]+$",
+ ".*@.*\\.com$"
+ ],
"title": "Pattern"
},
"case_sensitive": {
@@ -349,17 +617,43 @@
"type": "boolean"
},
"raises": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Expected exception type for this case. If specified, the test expects the function to raise this exception. Append '?' for optional raises (e.g., 'TypeError?') — passes if the exception is raised OR if the function returns normally.",
- "examples": ["ValueError", "TypeError", "KeyError", "ZeroDivisionError", "TypeError?"],
+ "examples": [
+ "ValueError",
+ "TypeError",
+ "KeyError",
+ "ZeroDivisionError",
+ "TypeError?"
+ ],
"title": "Raises"
},
"type": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Expected output type for this specific case. Can be a simple type name or a complex type annotation.",
- "examples": ["int", "str", "list[int]", "dict[str, Any]", "Optional[int]"],
+ "examples": [
+ "int",
+ "str",
+ "list[int]",
+ "dict[str, Any]",
+ "Optional[int]"
+ ],
"title": "Type"
},
"strict_type": {
@@ -369,10 +663,21 @@
"type": "boolean"
},
"match": {
- "anyOf": [{"type": "string"}, {"type": "null"}],
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
"default": null,
"description": "Optional regex pattern to match against the exception message (only used if raises is specified).",
- "examples": ["invalid input", "must be positive", "not found"],
+ "examples": [
+ "invalid input",
+ "must be positive",
+ "not found"
+ ],
"title": "Match"
}
},
@@ -384,7 +689,12 @@
"properties": {
"pattern": {
"description": "Regular expression pattern to match against the output (converted to string).",
- "examples": ["^\\d+$", "^[A-Z]+$", ".*@.*\\.com$", "id: \\d+"],
+ "examples": [
+ "^\\d+$",
+ "^[A-Z]+$",
+ ".*@.*\\.com$",
+ "id: \\d+"
+ ],
"title": "Pattern",
"type": "string"
},
@@ -395,9 +705,174 @@
"type": "boolean"
}
},
- "required": ["pattern"],
+ "required": [
+ "pattern"
+ ],
"title": "PatternMatchCase",
"type": "object"
+ },
+ "EvalsMapValue": {
+ "additionalProperties": false,
+ "description": "Function evaluation specification keyed by function import path/name. Contains fixture dependencies, global evaluators (`evals`), and dataset cases.",
+ "properties": {
+ "id": {
+ "description": "Function name to evaluate. Must match the actual function name.",
+ "examples": [
+ "is_prime",
+ "calculate_sum",
+ "process_data",
+ "validate_email"
+ ],
+ "title": "Id",
+ "type": "string"
+ },
+ "fixture": {
+ "description": "List of fixture names this function depends on. Fixtures must be defined in the top-level 'fixtures' section. They will be injected as keyword-only arguments to the function.",
+ "examples": [
+ [
+ "db"
+ ],
+ [
+ "db",
+ "cache"
+ ],
+ [
+ "redis"
+ ]
+ ],
+ "items": {
+ "type": "string"
+ },
+ "title": "Fixture",
+ "type": "array"
+ },
+ "evals": {
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "$ref": "#/$defs/IsInstanceCase"
+ },
+ {
+ "$ref": "#/$defs/AssertionCase"
+ },
+ {
+ "$ref": "#/$defs/DurationCase"
+ },
+ {
+ "$ref": "#/$defs/ContainsInputCase"
+ },
+ {
+ "$ref": "#/$defs/PatternMatchCase"
+ },
+ {
+ "$ref": "#/$defs/LLMJudgeCase"
+ }
+ ]
+ },
+ "description": "Dictionary of evaluation rules that apply to ALL test cases. Each key is a descriptive name, value is the evaluation case. Use IsInstanceCase for type checks, AssertionCase for custom logic, DurationCase for performance constraints, ContainsInputCase for input containment, PatternMatchCase for regex pattern matching.",
+ "examples": [
+ {
+ "IsInteger": {
+ "type": "int"
+ },
+ "IsPositive": {
+ "assertion": "output > 0"
+ }
+ },
+ {
+ "IsUppercase": {
+ "assertion": "output.isupper()"
+ },
+ "NotEmpty": {
+ "assertion": "len(output) > 0"
+ },
+ "TypeCheck": {
+ "type": "str"
+ }
+ },
+ {
+ "CorrectLogic": {
+ "assertion": "(output and input > 0) or (not output and input <= 0)"
+ },
+ "IsBoolean": {
+ "type": "bool"
+ }
+ }
+ ],
+ "title": "Evals",
+ "type": "object"
+ },
+ "dataset": {
+ "description": "List of test cases. Each case has input, expected output, and optional constraints. Should cover normal cases, edge cases, and corner cases.",
+ "examples": [
+ [
+ {
+ "case": {
+ "expected": 4,
+ "input": 2
+ }
+ },
+ {
+ "case": {
+ "expected": 0,
+ "input": 0
+ }
+ },
+ {
+ "case": {
+ "expected": 9,
+ "input": -3
+ }
+ }
+ ],
+ [
+ {
+ "case": {
+ "expected": "HELLO",
+ "input": "hello"
+ }
+ },
+ {
+ "case": {
+ "expected": "WORLD",
+ "input": "world"
+ }
+ }
+ ],
+ [
+ {
+ "case": {
+ "expected": 5,
+ "input": {
+ "x": 2,
+ "y": 3
+ }
+ }
+ },
+ {
+ "case": {
+ "expected": 30,
+ "input": {
+ "x": 10,
+ "y": 20
+ }
+ }
+ }
+ ]
+ ],
+ "items": {
+ "$ref": "#/$defs/DatasetCase"
+ },
+ "minItems": 1,
+ "title": "Dataset",
+ "type": "array"
+ }
+ },
+ "required": [
+ "dataset"
+ ],
+ "title": "Function",
+ "type": "object"
}
}
-}
\ No newline at end of file
+}
From 596754e070e4281a927f49a331642f7897087572 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 00:13:59 +0300
Subject: [PATCH 4/8] commit before native serializers
---
.gitignore | 12 +-
README.md | 4 +
docs/SERIALIZERS.md | 2 +
src/vowel/__init__.py | 2 +
src/vowel/cli.py | 264 +++++++++++++
src/vowel/codemode.py | 345 ++++++++++-------
src/vowel/costs.py | 358 ++++++++++++++++++
src/vowel/evals.py | 21 +-
src/vowel/utils.py | 65 +++-
tests/cassettes/llm_judge_custom_model.json | 2 +-
tests/cassettes/test_generate_and_run.json | 4 +-
tests/cassettes/test_generate_factorial.json | 6 +-
tests/cassettes/test_generate_palindrome.json | 4 +-
.../cassettes/test_generate_spec_simple.json | 2 +-
.../cassettes/test_generate_spec_string.json | 2 +-
tests/test_generation.py | 36 --
tests/test_llm_integration.py | 4 +-
tests/test_llm_judge_env_refs.py | 33 ++
tests/test_run_evals.py | 18 +
tests/test_serializer.py | 22 ++
20 files changed, 1010 insertions(+), 196 deletions(-)
create mode 100644 src/vowel/costs.py
delete mode 100644 tests/test_generation.py
create mode 100644 tests/test_llm_judge_env_refs.py
diff --git a/.gitignore b/.gitignore
index 59c3bc4..dc1809e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,14 +70,10 @@ evaluations/
TODO
docs/FIXTURE_GENERATION_RFC.md
-# CodeMode
-monty.py
-monty/
-
# Benchmarks
benchmark*
parse_cron_evals.yml
-PLAN.md
-codegen.py
-bundle_*.py
-*test.py
+
+# Known Models with Costs
+costs.yml
+db_fixture_serializers.yml
diff --git a/README.md b/README.md
index 0ce4681..f3ec800 100644
--- a/README.md
+++ b/README.md
@@ -122,6 +122,8 @@ summary = (
summary.print()
```
+> **Name matching note:** If your YAML uses `module.function`, programmatic mappings can use either the exact key (`module.function`) or the short function name (`function`) in `.with_functions(...)`.
+
---
## Features
@@ -196,6 +198,8 @@ summary = (
)
```
+> **Serializer key matching:** Serializer mappings follow the same rule as `.with_functions(...)` — both `module.function` and short `function` keys are accepted.
+
> **Full reference:** [docs/SERIALIZERS.md](https://github.com/fswair/vowel/blob/main/docs/SERIALIZERS.md)
### AI-Powered Generation
diff --git a/docs/SERIALIZERS.md b/docs/SERIALIZERS.md
index de92302..deb621e 100644
--- a/docs/SERIALIZERS.md
+++ b/docs/SERIALIZERS.md
@@ -58,6 +58,8 @@ summary = (
)
```
+> **Key matching note:** If YAML eval ids use `module.function`, the programmatic mappings accept either the exact id (`module.function`) or the short name (`function`) as keys. This applies to `.with_functions(...)`, `.with_serializer(...)`, and `serial_fn={...}`.
+
---
## Advanced Examples
diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
index b3d98d2..d60bdbf 100644
--- a/src/vowel/__init__.py
+++ b/src/vowel/__init__.py
@@ -8,6 +8,7 @@
from .ai import EvalGenerator, GenerationResult, UnsupportedParameterTypeError
from .codemode import CodeModeGenerator, CodeModeResult, ExplorationPlan, SnippetResult
from .context import EVAL_SPEC_CONTEXT
+from .costs import CostManager
from .errors import FixturePathError, SignatureError
from .eval_types import EvalsFile
from .executor import (
@@ -75,6 +76,7 @@
"CodeModeResult",
"ExplorationPlan",
"SnippetResult",
+ "CostManager",
]
diff --git a/src/vowel/cli.py b/src/vowel/cli.py
index a5bf430..6b2ed4c 100644
--- a/src/vowel/cli.py
+++ b/src/vowel/cli.py
@@ -27,6 +27,239 @@
dotenv.load_dotenv()
console = Console()
+COSTS_FILE = Path.home() / ".vowel" / "codemode" / "generation_costs.json"
+
+
+def _load_cost_store() -> dict:
+ if not COSTS_FILE.exists():
+ return {"schema_version": 1, "generations": {}}
+ try:
+ data = json.loads(COSTS_FILE.read_text(encoding="utf-8"))
+ except Exception:
+ return {"schema_version": 1, "generations": {}}
+ if not isinstance(data, dict):
+ return {"schema_version": 1, "generations": {}}
+ generations = data.get("generations")
+ if not isinstance(generations, dict):
+ data["generations"] = {}
+ return data
+
+
+def _flatten_runs(store: dict) -> list[tuple[str, dict, dict]]:
+ rows: list[tuple[str, dict, dict]] = []
+ for gid, gen in store.get("generations", {}).items():
+ runs = gen.get("runs", {}) if isinstance(gen, dict) else {}
+ if not isinstance(runs, dict):
+ continue
+ for run_id, run in runs.items():
+ rows.append((gid, gen, {"run_id": run_id, **run}))
+ return rows
+
+
+def _print_generation_table(store: dict) -> list[str]:
+ generations = store.get("generations", {})
+ ordered = sorted(
+ generations.items(),
+ key=lambda x: str(x[1].get("created_at", "")),
+ reverse=True,
+ )
+ table = Table(title="Generations", box=box.ROUNDED)
+ table.add_column("#", style="cyan", no_wrap=True)
+ table.add_column("Generation ID", style="white")
+ table.add_column("Created", style="dim")
+ table.add_column("Runs", justify="right")
+ table.add_column("USD", justify="right", style="green")
+
+ ids: list[str] = []
+ for idx, (gid, gen) in enumerate(ordered, start=1):
+ totals = gen.get("totals", {})
+ run_count = len(gen.get("runs", {})) if isinstance(gen.get("runs", {}), dict) else 0
+ table.add_row(
+ str(idx),
+ gid,
+ str(gen.get("created_at", "-")),
+ str(run_count),
+ f"{float(totals.get('usd', 0.0) or 0.0):.6f}",
+ )
+ ids.append(gid)
+
+ console.print(table)
+ return ids
+
+
+def _print_generation_detail(generation_id: str, generation: dict) -> None:
+ totals = generation.get("totals", {})
+ info = Table.grid(padding=(0, 2))
+ info.add_column(style="bold")
+ info.add_column()
+ info.add_row("Generation", generation_id)
+ info.add_row("Created", str(generation.get("created_at", "-")))
+ info.add_row("Spec model", str(generation.get("spec_model", "-")))
+ info.add_row("Exploration model", str(generation.get("exploration_model", "-")))
+ info.add_row("USD", f"{float(totals.get('usd', 0.0) or 0.0):.6f}")
+ info.add_row("Input tokens", str(int(totals.get("input_tokens", 0) or 0)))
+ info.add_row("Output tokens", str(int(totals.get("output_tokens", 0) or 0)))
+ info.add_row("Requests", str(int(totals.get("requests", 0) or 0)))
+ console.print(Panel(info, title="Generation Summary", border_style="bright_cyan"))
+
+ run_table = Table(title="Runs", box=box.ROUNDED)
+ run_table.add_column("Run ID", style="white")
+ run_table.add_column("Function", style="cyan")
+ run_table.add_column("Status")
+ run_table.add_column("USD", justify="right", style="green")
+ run_table.add_column("Input", justify="right")
+ run_table.add_column("Output", justify="right")
+ run_table.add_column("Requests", justify="right")
+ run_table.add_column("Created", style="dim")
+
+ runs = generation.get("runs", {}) if isinstance(generation.get("runs", {}), dict) else {}
+ for run_id, run in runs.items():
+ rt = run.get("totals", {})
+ run_table.add_row(
+ run_id,
+ str(run.get("func_name", "-")),
+ str(run.get("status", "-")),
+ f"{float(rt.get('usd', 0.0) or 0.0):.6f}",
+ str(int(rt.get("input_tokens", 0) or 0)),
+ str(int(rt.get("output_tokens", 0) or 0)),
+ str(int(rt.get("requests", 0) or 0)),
+ str(run.get("created_at", "-")),
+ )
+ console.print(run_table)
+
+
+def _print_run_detail(generation_id: str, run: dict) -> None:
+ totals = run.get("totals", {})
+ info = Table.grid(padding=(0, 2))
+ info.add_column(style="bold")
+ info.add_column()
+ info.add_row("Generation", generation_id)
+ info.add_row("Run", str(run.get("run_id", "-")))
+ info.add_row("Function", str(run.get("func_name", "-")))
+ info.add_row("Status", str(run.get("status", "-")))
+ info.add_row("USD", f"{float(totals.get('usd', 0.0) or 0.0):.6f}")
+ info.add_row("Input tokens", str(int(totals.get("input_tokens", 0) or 0)))
+ info.add_row("Output tokens", str(int(totals.get("output_tokens", 0) or 0)))
+ info.add_row("Requests", str(int(totals.get("requests", 0) or 0)))
+ console.print(Panel(info, title="Run Summary", border_style="bright_cyan"))
+
+ step_table = Table(title="Steps", box=box.ROUNDED)
+ step_table.add_column("Step", style="white")
+ step_table.add_column("Calls", justify="right")
+ step_table.add_column("USD", justify="right", style="green")
+ step_table.add_column("Input", justify="right")
+ step_table.add_column("Output", justify="right")
+ step_table.add_column("Requests", justify="right")
+
+ steps = run.get("steps", {}) if isinstance(run.get("steps", {}), dict) else {}
+ for step_name, step_data in steps.items():
+ usages = step_data.get("usages", []) if isinstance(step_data, dict) else []
+ usd = 0.0
+ input_tokens = 0
+ output_tokens = 0
+ requests = 0
+ for u in usages:
+ usage = u.get("usage", {}) if isinstance(u, dict) else {}
+ usd += float(u.get("usd", 0.0) or 0.0)
+ input_tokens += int(usage.get("input_tokens", 0) or 0)
+ output_tokens += int(usage.get("output_tokens", 0) or 0)
+ requests += int(usage.get("requests", 0) or 0)
+
+ step_table.add_row(
+ step_name,
+ str(len(usages)),
+ f"{usd:.6f}",
+ str(input_tokens),
+ str(output_tokens),
+ str(requests),
+ )
+
+ console.print(step_table)
+
+
+def _handle_costs_command(
+ *,
+ list_costs: bool,
+ by_generation: bool,
+ by_run: bool,
+ generation_id: str | None,
+ run_id: str | None,
+) -> None:
+ store = _load_cost_store()
+ generations = store.get("generations", {})
+ if not generations:
+ console.print("[yellow]No cost records found yet.[/yellow]")
+ return
+
+ if generation_id:
+ generation = generations.get(generation_id)
+ if not isinstance(generation, dict):
+ click.secho(f"ERROR: Generation not found: {generation_id}", fg="red", err=True)
+ raise SystemExit(1)
+ _print_generation_detail(generation_id, generation)
+ return
+
+ if run_id:
+ for gid, gen in generations.items():
+ runs = gen.get("runs", {}) if isinstance(gen, dict) else {}
+ if isinstance(runs, dict) and run_id in runs:
+ run = {"run_id": run_id, **runs[run_id]}
+ _print_run_detail(gid, run)
+ return
+ click.secho(f"ERROR: Run not found: {run_id}", fg="red", err=True)
+ raise SystemExit(1)
+
+ if not list_costs:
+ _print_generation_table(store)
+ return
+
+ if by_generation:
+ ids = _print_generation_table(store)
+ if not ids:
+ return
+ choice = click.prompt("Select generation number", type=int)
+ if choice < 1 or choice > len(ids):
+ click.secho("ERROR: Invalid selection", fg="red", err=True)
+ raise SystemExit(1)
+ selected = ids[choice - 1]
+ _print_generation_detail(selected, generations[selected])
+ return
+
+ if by_run:
+ rows = _flatten_runs(store)
+ if not rows:
+ console.print("[yellow]No runs found.[/yellow]")
+ return
+
+ table = Table(title="Runs", box=box.ROUNDED)
+ table.add_column("#", style="cyan", no_wrap=True)
+ table.add_column("Run ID", style="white")
+ table.add_column("Generation", style="dim")
+ table.add_column("Function", style="cyan")
+ table.add_column("Status")
+ table.add_column("USD", justify="right", style="green")
+
+ for idx, (gid, _, run) in enumerate(rows, start=1):
+ totals = run.get("totals", {})
+ table.add_row(
+ str(idx),
+ str(run.get("run_id", "-")),
+ gid,
+ str(run.get("func_name", "-")),
+ str(run.get("status", "-")),
+ f"{float(totals.get('usd', 0.0) or 0.0):.6f}",
+ )
+ console.print(table)
+
+ choice = click.prompt("Select run number", type=int)
+ if choice < 1 or choice > len(rows):
+ click.secho("ERROR: Invalid selection", fg="red", err=True)
+ raise SystemExit(1)
+ gid, _, run = rows[choice - 1]
+ _print_run_detail(gid, run)
+ return
+
+ _print_generation_table(store)
def _eval_type_label(case) -> str:
@@ -282,6 +515,21 @@ def validate_coverage(ctx, param, value):
is_flag=True,
help="With 'vowel schema': generate vowel-schema.json in current directory",
)
+@click.option("--list", "list_costs", is_flag=True, help="With 'vowel costs': list records")
+@click.option(
+ "-g",
+ "--by-generation",
+ is_flag=True,
+ help="With 'vowel costs --list': browse generations interactively",
+)
+@click.option(
+ "-r",
+ "--by-run",
+ is_flag=True,
+ help="With 'vowel costs --list': browse runs interactively",
+)
+@click.option("--generation", "generation_id", help="With 'vowel costs': show generation id")
+@click.option("--run", "run_id_option", help="With 'vowel costs': show run id")
def main(
arg1: Path | None,
arg2: Path | None,
@@ -302,6 +550,11 @@ def main(
verbose: bool,
hide_report: bool,
schema_create: bool,
+ list_costs: bool,
+ by_generation: bool,
+ by_run: bool,
+ generation_id: str | None,
+ run_id_option: str | None,
):
"""vowel — YAML-based evaluation framework for Python functions."""
console = Console(force_terminal=False, no_color=True) if no_color else Console()
@@ -368,6 +621,17 @@ def main(
console.print("[green]✓[/green] Pydantic validation passed")
return
+ # Command mode: vowel costs [--list -g|-r] [--generation GENERATION_ID] [--run RUN_ID]
+ if arg1 is not None and str(arg1) == "costs":
+ _handle_costs_command(
+ list_costs=list_costs,
+ by_generation=by_generation,
+ by_run=by_run,
+ generation_id=generation_id,
+ run_id=run_id_option,
+ )
+ return
+
yaml_file = arg1
# Validate incompatible options
diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py
index e02f421..33269c5 100644
--- a/src/vowel/codemode.py
+++ b/src/vowel/codemode.py
@@ -20,6 +20,7 @@
from pydantic_ai import Agent
from vowel.context import EVAL_SPEC_CONTEXT
+from vowel.costs import CostManager
from vowel.eval_types import EvalsSource
from vowel.executor import ExecutionResult, Executor, resolve_executors
from vowel.monitoring import enable_monitoring
@@ -178,6 +179,7 @@ def __init__(
self,
spec_model: str | None = None,
exploration_model: str | None = None,
+ generation_id: str | None = None,
default_executor: Executor | None = None,
fallback_executor: Executor | None = None,
additional_context: str = "",
@@ -203,6 +205,13 @@ def __init__(
self.additional_context = additional_context
self.min_snippets = min_snippets
self.use_model_spec = use_model_spec
+ self.cost_manager = CostManager(
+ spec_model=self.spec_model,
+ exploration_model=self.exploration_model,
+ generation_id=generation_id,
+ )
+ self.generation_id = self.cost_manager.generation_id
+ self._active_run_id: str | None = None
self._opts = opts
# Lazy agents
@@ -213,9 +222,13 @@ def __init__(
"CodeModeGenerator initialized",
spec_model=self.spec_model,
exploration_model=self.exploration_model,
+ generation_id=self.generation_id,
executor=type(self.executor).__name__,
)
+ def print_total_cost(self, run_id: str | None = None) -> None:
+ self.cost_manager.print_total_cost(run_id=run_id)
+
# -- Agent properties --------------------------------------------------
@property
@@ -280,6 +293,13 @@ def _explorer_system_prompt(self) -> str:
- Use the function's REAL NAME — the function source code will be prepended
automatically at runtime. Do NOT define the function yourself.
- Keep each snippet focused on ONE scenario.
+- Do NOT produce duplicate snippets. Two snippets are duplicates if they test
+ the same input shape and same behavior class.
+- For `error_snippets`, each snippet must map to a DISTINCT error mode
+ (different guard/branch, exception type, or message pattern).
+- If the function signature has no positional-only (`/`) or keyword-only (`*`)
+ limiters, prefer positional call style for multi-argument calls and avoid
+ equivalent keyword-style duplicates.
- Do NOT guess outputs — the snippets will be executed and the real
outputs collected automatically.
- NEVER use try/except in your snippets. Let exceptions propagate
@@ -309,6 +329,7 @@ async def explore(
func: Function,
*,
exploration_rounds: int = 2,
+ run_id: str | None = None,
) -> list[SnippetResult]:
"""Generate and execute exploration snippets.
@@ -331,13 +352,15 @@ async def explore(
):
# Get exploration plan (round 2+ includes prior context)
if round_num == 1:
- plan = await self._get_exploration_plan(func)
+ plan = await self._get_exploration_plan(func, run_id=run_id)
else:
cluster_summary = self._build_cluster_summary(all_results)
plan = await self._get_targeted_exploration_plan(
func,
all_results,
cluster_summary,
+ run_id=run_id,
+ round_num=round_num,
)
# Early exit: if no new snippets were produced
if not plan.snippets and not plan.error_snippets:
@@ -484,7 +507,12 @@ def _behavior_key(r: SnippetResult) -> str:
new_keys = {_behavior_key(r) for r in new}
return len(new_keys - prior_keys)
- async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
+ async def _get_exploration_plan(
+ self,
+ func: Function,
+ *,
+ run_id: str | None = None,
+ ) -> ExplorationPlan:
"""Request initial exploration snippets from the model."""
with logfire.span("codemode.llm_explore", func_name=func.name, round=1):
prompt = f"""Explore the following function by writing test snippets:
@@ -500,6 +528,13 @@ async def _get_exploration_plan(self, func: Function) -> ExplorationPlan:
`{func.name}` — the implementation will be prepended automatically."""
result = await self.explorer_agent.run(prompt)
+ if run_id:
+ self.cost_manager.record_agent_usage(
+ run_id=run_id,
+ step_key="exploration_round_1",
+ result=result,
+ model_name=self.exploration_model,
+ )
plan = result.output
logfire.info(
@@ -516,6 +551,9 @@ async def _get_targeted_exploration_plan(
func: Function,
prior_results: list[SnippetResult],
cluster_summary: str,
+ *,
+ run_id: str | None = None,
+ round_num: int = 2,
) -> ExplorationPlan:
"""Request targeted snippets using prior execution evidence."""
with logfire.span("codemode.llm_explore", func_name=func.name, round=2):
@@ -547,10 +585,21 @@ async def _get_targeted_exploration_plan(
- Do NOT repeat any snippet from the "Already tried" list.
- Produce 8–12 NEW normal snippets targeting uncovered behaviour.
- Produce 3–5 NEW error snippets targeting untried error paths.
+- Prefer diversity over volume: no semantically duplicate cases.
+- Each new error snippet should cover a unique failure mode.
+- If signature has no `/` or `*` limiters, use positional calling style for
+ multi-argument calls and avoid keyword/positional duplicates of same scenario.
- Same strict rules as before: no try/except, real function name,
one scenario per snippet, last expression captured."""
result = await self.explorer_agent.run(prompt)
+ if run_id:
+ self.cost_manager.record_agent_usage(
+ run_id=run_id,
+ step_key=f"targeted_exploration_round_{round_num}",
+ result=result,
+ model_name=self.exploration_model,
+ )
plan = result.output
logfire.info(
@@ -569,6 +618,9 @@ async def generate_spec(
func: Function,
exploration_results: list[SnippetResult],
failure_context: str | None = None,
+ *,
+ run_id: str | None = None,
+ attempt: int = 1,
) -> str | EvalsBundle:
"""Generate a spec from verified exploration results.
@@ -632,10 +684,28 @@ async def generate_spec(
- The top-level YAML key MUST be `{func.name}` (the function name).
- Generate at least {max(len(exploration_results), 5)} diverse test cases.
- Use the EXACT outputs from the execution results above.
-- You MUST generate exactly {len(error_results)} raises cases — one for
- each RAISED result above. The spec is invalid without them.
+- Error coverage rule: include AT LEAST one raises case for each UNIQUE
+ observed error mode (exception type + meaningfully distinct call pattern).
+- Do NOT duplicate semantically equivalent error cases. If two cases represent
+ the same failing input semantics, keep only one (prefer the one with `match`
+ when message is deterministic from observed execution).
- Cover normal, edge, and error cases.
- In assertions, use `input` (NOT `inputs`) for accessing input values.
+- Prefer `expected` over `assertion` whenever the exact output is known from
+ verified execution results.
+- Use `assertion` only for true invariants/properties that are not redundant
+ with exact `expected` values.
+- Do NOT use broad/trivial assertions (e.g. `output >= 0`, `output <= len(input)`)
+ when a precise expected value can be asserted.
+- Keep the dataset compact and non-redundant: no duplicate cases with the same
+ effective behavior.
+- If function signature has no positional-only (`/`) or keyword-only (`*`)
+ limiters, prefer positional style for multi-argument calls and do not include
+ both positional and keyword variants of the same scenario.
+- Stay aligned with function contract/type hints: do not add contract-irrelevant
+ cases that only test incidental duck-typing unless explicitly motivated.
+- For `raises` cases, only claim exception type/message patterns that are present
+ in observed execution results; do not invent unsupported error expectations.
YAML FORMAT — STRICT RULES (violations cause parse failure):
- NEVER use YAML tags: `!!python/tuple`, `!!python/object`, `!!binary`,
@@ -654,6 +724,13 @@ async def generate_spec(
)
result = await self.spec_agent.run(prompt)
+ if run_id:
+ self.cost_manager.record_agent_usage(
+ run_id=run_id,
+ step_key=f"spec_generation_attempt_{attempt}",
+ result=result,
+ model_name=self.spec_model,
+ )
if self.use_model_spec:
bundle = result.output
@@ -751,6 +828,7 @@ async def generate(
self,
func: Function,
*,
+ run_id: str | None = None,
run_evals: bool = True,
save_to_file: bool = False,
max_refinement_rounds: int = 2,
@@ -766,149 +844,162 @@ async def generate(
with logfire.span(
"codemode.pipeline",
func_name=func.name,
+ generation_id=self.generation_id,
spec_model=self.spec_model,
exploration_model=self.exploration_model,
executor=type(self.executor).__name__,
):
+ run_id = self.cost_manager.start_run(run_id=run_id, func_name=func.name)
+ self._active_run_id = run_id
+
t0 = time.perf_counter()
- # Phase 1 — explore (once)
- exploration_results = await self.explore(func)
+ try:
+ # Phase 1 — explore (once)
+ exploration_results = await self.explore(func, run_id=run_id)
+
+ # Phase 2–4 — generate spec + validate + refine
+ yaml_spec = ""
+ generated_bundle: EvalsBundle | None = None
+ summary: EvalSummary | None = None
+ refinement_rounds = 0
+ failure_context: str | None = None
+ total_attempts = max_refinement_rounds + 1 if run_evals else 1
+
+ for attempt in range(total_attempts):
+ with logfire.span(
+ "codemode.spec_attempt",
+ attempt=attempt + 1,
+ is_refinement=attempt > 0,
+ ):
+ try:
+ bundle = await self.generate_spec(
+ func,
+ exploration_results,
+ failure_context,
+ run_id=run_id,
+ attempt=attempt + 1,
+ )
- # Phase 2–4 — generate spec + validate + refine
- yaml_spec = ""
- generated_bundle: EvalsBundle | None = None
- summary: EvalSummary | None = None
- refinement_rounds = 0
- failure_context: str | None = None
- total_attempts = max_refinement_rounds + 1 if run_evals else 1
+ if isinstance(bundle, EvalsBundle):
+ generated_bundle = bundle
+ yaml_spec = bundle.to_yaml()
+ else:
+ generated_bundle = None
+ yaml_spec = bundle
+ except Exception as exc:
+ logfire.warn(
+ "Spec generation failed on attempt {attempt}, retrying",
+ attempt=attempt + 1,
+ error=str(exc),
+ )
+ failure_context = f"Generation error: {exc}"
+ refinement_rounds = attempt + 1
+ continue
- for attempt in range(total_attempts):
- with logfire.span(
- "codemode.spec_attempt",
- attempt=attempt + 1,
- is_refinement=attempt > 0,
- ):
- try:
- bundle = await self.generate_spec(
- func,
- exploration_results,
- failure_context,
- )
+ if not run_evals:
+ break
- if isinstance(bundle, EvalsBundle):
- generated_bundle = bundle
- yaml_spec = bundle.to_yaml()
- else:
- generated_bundle = None
- yaml_spec = bundle
- except Exception as exc:
- logfire.warn(
- "Spec generation failed on attempt {attempt}, retrying",
- attempt=attempt + 1,
- error=str(exc),
- )
- failure_context = f"Generation error: {exc}"
- refinement_rounds = attempt + 1
- continue
+ # Validate: run evals with ignore_duration=True
+ try:
+ if generated_bundle is not None:
+ runner = (
+ RunEvals.from_bundle(generated_bundle)
+ .with_functions({func.name: func.impl})
+ .ignore_duration()
+ )
+ else:
+ runner = (
+ RunEvals.from_source(yaml_spec)
+ .with_functions({func.name: func.impl})
+ .ignore_duration()
+ )
+ summary = runner.run()
- if not run_evals:
- break
+ logfire.info(
+ "Attempt {attempt}: {passed}/{total} passed, coverage={coverage:.1f}%",
+ attempt=attempt + 1,
+ passed=summary.success_count,
+ total=summary.total_count,
+ failed=summary.failed_count,
+ errors=summary.error_count,
+ coverage=summary.coverage * 100,
+ )
- # Validate: run evals with ignore_duration=True
+ if summary.coverage >= min_coverage:
+ break
+
+ # Build failure context for next attempt
+ failure_context = self._build_failure_context(summary)
+ refinement_rounds = attempt + 1
+ logfire.warn(
+ "Coverage {coverage:.0f}% below target {target:.0f}%, refining",
+ coverage=summary.coverage * 100,
+ target=min_coverage * 100,
+ attempt=attempt + 1,
+ )
+
+ except Exception as exc:
+ logfire.warn(
+ "Failed to run evals on attempt {attempt}, retrying",
+ attempt=attempt + 1,
+ func_name=func.name,
+ error=str(exc),
+ )
+ failure_context = f"Eval run error: {exc}"
+ refinement_rounds = attempt + 1
+ continue
+
+ # Phase 5 — inject per-case durations
+ if inject_durations:
+ with logfire.span("codemode.inject_durations", func_name=func.name):
+ yaml_spec = self._inject_durations(yaml_spec, func)
+
+ # Final summary run (with durations now present, but still ignored)
+ if run_evals and summary is not None:
try:
if generated_bundle is not None:
- runner = (
+ final_runner = (
RunEvals.from_bundle(generated_bundle)
.with_functions({func.name: func.impl})
.ignore_duration()
)
else:
- runner = (
+ final_runner = (
RunEvals.from_source(yaml_spec)
.with_functions({func.name: func.impl})
.ignore_duration()
)
- summary = runner.run()
-
- logfire.info(
- "Attempt {attempt}: {passed}/{total} passed, coverage={coverage:.1f}%",
- attempt=attempt + 1,
- passed=summary.success_count,
- total=summary.total_count,
- failed=summary.failed_count,
- errors=summary.error_count,
- coverage=summary.coverage * 100,
- )
-
- if summary.coverage >= min_coverage:
- break
-
- # Build failure context for next attempt
- failure_context = self._build_failure_context(summary)
- refinement_rounds = attempt + 1
- logfire.warn(
- "Coverage {coverage:.0f}% below target {target:.0f}%, refining",
- coverage=summary.coverage * 100,
- target=min_coverage * 100,
- attempt=attempt + 1,
- )
-
- except Exception as exc:
- logfire.warn(
- "Failed to run evals on attempt {attempt}, retrying",
- attempt=attempt + 1,
- func_name=func.name,
- error=str(exc),
- )
- failure_context = f"Eval run error: {exc}"
- refinement_rounds = attempt + 1
- continue
-
- # Phase 5 — inject per-case durations
- if inject_durations:
- with logfire.span("codemode.inject_durations", func_name=func.name):
- yaml_spec = self._inject_durations(yaml_spec, func)
-
- # Final summary run (with durations now present, but still ignored)
- if run_evals and summary is not None:
- try:
- if generated_bundle is not None:
- final_runner = (
- RunEvals.from_bundle(generated_bundle)
- .with_functions({func.name: func.impl})
- .ignore_duration()
- )
- else:
- final_runner = (
- RunEvals.from_source(yaml_spec)
- .with_functions({func.name: func.impl})
- .ignore_duration()
- )
- summary = final_runner.run()
- except Exception: # noqa: BLE001
- pass # keep last good summary
-
- if save_to_file:
- path = f"{func.name}_evals.yml"
- spec_to_write = materialize_yaml_with_schema_header(yaml_spec)
- with open(path, "w") as f:
- f.write(spec_to_write)
- logfire.info("Saved spec to {path}", path=path)
-
- elapsed = (time.perf_counter() - t0) * 1000
- logfire.info(
- "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})",
- elapsed=elapsed,
- func_name=func.name,
- exploration_count=len(exploration_results),
- refinement_rounds=refinement_rounds,
- has_summary=summary is not None,
- )
+ summary = final_runner.run()
+ except Exception: # noqa: BLE001
+ pass # keep last good summary
+
+ if save_to_file:
+ path = f"{func.name}_evals.yml"
+ spec_to_write = materialize_yaml_with_schema_header(yaml_spec)
+ with open(path, "w") as f:
+ f.write(spec_to_write)
+ logfire.info("Saved spec to {path}", path=path)
+
+ elapsed = (time.perf_counter() - t0) * 1000
+ self.cost_manager.mark_run_completed(run_id)
+ logfire.info(
+ "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})",
+ elapsed=elapsed,
+ func_name=func.name,
+ generation_id=self.generation_id,
+ run_id=run_id,
+ exploration_count=len(exploration_results),
+ refinement_rounds=refinement_rounds,
+ has_summary=summary is not None,
+ )
- return CodeModeResult(
- exploration_results=exploration_results,
- yaml_spec=yaml_spec,
- summary=summary,
- refinement_rounds=refinement_rounds,
- )
+ return CodeModeResult(
+ exploration_results=exploration_results,
+ yaml_spec=yaml_spec,
+ summary=summary,
+ refinement_rounds=refinement_rounds,
+ )
+ except Exception as exc:
+ self.cost_manager.mark_run_failed(run_id, str(exc))
+ raise
diff --git a/src/vowel/costs.py b/src/vowel/costs.py
new file mode 100644
index 0000000..72d90f7
--- /dev/null
+++ b/src/vowel/costs.py
@@ -0,0 +1,358 @@
+"""Cost tracking and persistence utilities for CodeMode runs."""
+
+from __future__ import annotations
+
+import fcntl
+import json
+import os
+import tempfile
+import uuid
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+import logfire
+import yaml
+
+
+class CostManager:
+ """Manage generation/run cost records, pricing, and persistence."""
+
+ def __init__(
+ self,
+ *,
+ spec_model: str,
+ exploration_model: str,
+ generation_id: str | None = None,
+ costs_file: Path | None = None,
+ ) -> None:
+ self.spec_model = spec_model
+ self.exploration_model = exploration_model
+ self.generation_id = generation_id or self._new_generation_id()
+ self._costs_file = (
+ costs_file or Path.home() / ".vowel" / "codemode" / "generation_costs.json"
+ )
+ self._price_table = self._load_costs_yml()
+ self._cost_records: dict[str, Any] = self._load_cost_records()
+ self._ensure_generation_record()
+
+ @staticmethod
+ def _new_generation_id() -> str:
+ ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
+ return f"gen_{ts}_{uuid.uuid4().hex[:8]}"
+
+ @staticmethod
+ def _new_run_id() -> str:
+ return f"run_{uuid.uuid4().hex}"
+
+ @staticmethod
+ def _default_cost_store() -> dict[str, Any]:
+ return {"schema_version": 1, "generations": {}}
+
+ def _load_cost_records(self) -> dict[str, Any]:
+ if not self._costs_file.exists():
+ return self._default_cost_store()
+ try:
+ data = json.loads(self._costs_file.read_text(encoding="utf-8"))
+ except Exception:
+ logfire.warn("Failed to parse cost records, resetting store")
+ return self._default_cost_store()
+
+ if not isinstance(data, dict) or "generations" not in data:
+ return self._default_cost_store()
+ return data
+
+ def _ensure_generation_record(self) -> None:
+ generations = self._cost_records.setdefault("generations", {})
+ if self.generation_id in generations:
+ return
+ generations[self.generation_id] = {
+ "generation_id": self.generation_id,
+ "created_at": datetime.now(UTC).isoformat(),
+ "spec_model": self.spec_model,
+ "exploration_model": self.exploration_model,
+ "totals": {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0},
+ "runs": {},
+ }
+
+ @staticmethod
+ def _normalize_models(data: Any) -> dict[str, dict[str, float]]:
+ if not isinstance(data, dict):
+ return {}
+
+ models_obj = data.get("models")
+ normalized: dict[str, dict[str, float]] = {}
+
+ if isinstance(models_obj, dict):
+ items = models_obj.items()
+ elif isinstance(models_obj, list):
+ items = []
+ for item in models_obj:
+ if isinstance(item, dict):
+ items.extend(item.items())
+ else:
+ items = []
+
+ for model_name, model_data in items:
+ if not isinstance(model_name, str) or not isinstance(model_data, dict):
+ continue
+ normalized[model_name] = {
+ "input_per_million": float(model_data.get("input_per_million", 0.0) or 0.0),
+ "output_per_million": float(model_data.get("output_per_million", 0.0) or 0.0),
+ "cached_input_per_million": float(
+ model_data.get("cached_input_per_million", 0.0) or 0.0
+ ),
+ }
+
+ return normalized
+
+ def _load_costs_yml(self) -> dict[str, Any]:
+ candidates = [Path.cwd() / "costs.yml", Path(__file__).resolve().parents[2] / "costs.yml"]
+ for path in candidates:
+ if not path.exists():
+ continue
+ try:
+ data = yaml.safe_load(path.read_text(encoding="utf-8"))
+ except Exception:
+ continue
+ models = self._normalize_models(data)
+ if models:
+ return {"models": models}
+ return {}
+
+ def _persist_costs_atomic(self) -> None:
+ self._costs_file.parent.mkdir(parents=True, exist_ok=True)
+ payload = json.dumps(self._cost_records, ensure_ascii=False, indent=2) + "\n"
+ lock_path = self._costs_file.parent / ".generation_costs.lock"
+
+ with open(lock_path, "a+", encoding="utf-8") as lock_file:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
+ try:
+ with tempfile.NamedTemporaryFile(
+ mode="w", encoding="utf-8", dir=self._costs_file.parent, delete=False
+ ) as tmp:
+ tmp.write(payload)
+ tmp.flush()
+ os.fsync(tmp.fileno())
+ tmp_path = Path(tmp.name)
+ os.replace(tmp_path, self._costs_file)
+ finally:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
+
+ def _ensure_run_record(self, run_id: str, func_name: str) -> None:
+ generation = self._cost_records["generations"][self.generation_id]
+ runs = generation.setdefault("runs", {})
+ if run_id in runs:
+ return
+ runs[run_id] = {
+ "run_id": run_id,
+ "func_name": func_name,
+ "created_at": datetime.now(UTC).isoformat(),
+ "status": "running",
+ "error": None,
+ "steps": {},
+ "totals": {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0},
+ }
+
+ def _get_run_record(self, run_id: str) -> dict[str, Any]:
+ return self._cost_records["generations"][self.generation_id]["runs"][run_id]
+
+ @staticmethod
+ def _run_usage_dict(usage: Any) -> dict[str, int]:
+ return {
+ "requests": int(getattr(usage, "requests", 0) or 0),
+ "input_tokens": int(getattr(usage, "input_tokens", 0) or 0),
+ "output_tokens": int(getattr(usage, "output_tokens", 0) or 0),
+ "cached_input_tokens": int(getattr(usage, "cached_input_tokens", 0) or 0),
+ }
+
+ @staticmethod
+ def _normalize_model_name(model_name: str) -> str:
+ normalized = model_name.strip()
+ if ":" in normalized:
+ normalized = normalized.split(":", 1)[1]
+ if "/" in normalized:
+ normalized = normalized.rsplit("/", 1)[1]
+ return normalized
+
+ def _resolve_price_from_costs_yml(self, model_name: str) -> dict[str, float] | None:
+ models = self._price_table.get("models") if isinstance(self._price_table, dict) else None
+ if not isinstance(models, dict):
+ return None
+
+ normalized = self._normalize_model_name(model_name)
+ for key in (model_name, normalized):
+ data = models.get(key)
+ if not isinstance(data, dict):
+ continue
+ return {
+ "input_per_million": float(data.get("input_per_million", 0.0) or 0.0),
+ "output_per_million": float(data.get("output_per_million", 0.0) or 0.0),
+ "cached_input_per_million": float(data.get("cached_input_per_million", 0.0) or 0.0),
+ }
+ return None
+
+ def _resolve_price(self, model_name: str) -> tuple[dict[str, float] | None, str, bool]:
+ normalized = self._normalize_model_name(model_name)
+
+ try:
+ import genai_prices # type: ignore
+
+ for attr in ("get_price", "lookup_price", "resolve_price"):
+ fn = getattr(genai_prices, attr, None)
+ if callable(fn):
+ for name in (model_name, normalized):
+ out = fn(name)
+ if isinstance(out, dict):
+ return (
+ {
+ "input_per_million": float(
+ out.get("input_per_million")
+ or out.get("input")
+ or out.get("prompt")
+ or 0.0
+ ),
+ "output_per_million": float(
+ out.get("output_per_million")
+ or out.get("output")
+ or out.get("completion")
+ or 0.0
+ ),
+ "cached_input_per_million": float(
+ out.get("cached_input_per_million")
+ or out.get("cached_input")
+ or 0.0
+ ),
+ },
+ "genai-prices",
+ False,
+ )
+ except Exception:
+ pass
+
+ yml_price = self._resolve_price_from_costs_yml(model_name)
+ if yml_price is not None:
+ return yml_price, "costs.yml", False
+
+ return None, "missing", True
+
+ def _estimate_step_usd(self, model_name: str, usage: dict[str, int]) -> tuple[float, str, bool]:
+ price, source, missing = self._resolve_price(model_name)
+ if price is None:
+ return 0.0, source, True
+
+ in_cost = usage["input_tokens"] / 1_000_000 * price["input_per_million"]
+ out_cost = usage["output_tokens"] / 1_000_000 * price["output_per_million"]
+ cached_cost = usage["cached_input_tokens"] / 1_000_000 * price["cached_input_per_million"]
+ return in_cost + out_cost + cached_cost, source, missing
+
+ def _recompute_totals(self) -> None:
+ generation = self._cost_records["generations"][self.generation_id]
+ g_totals = {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0}
+
+ for run in generation.get("runs", {}).values():
+ r_totals = {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0}
+ for step in run.get("steps", {}).values():
+ usages = step.get("usages", [])
+ for item in usages:
+ usage = item.get("usage", {})
+ r_totals["usd"] += float(item.get("usd", 0.0) or 0.0)
+ r_totals["input_tokens"] += int(usage.get("input_tokens", 0) or 0)
+ r_totals["output_tokens"] += int(usage.get("output_tokens", 0) or 0)
+ r_totals["requests"] += int(usage.get("requests", 0) or 0)
+ run["totals"] = r_totals
+
+ g_totals["usd"] += r_totals["usd"]
+ g_totals["input_tokens"] += r_totals["input_tokens"]
+ g_totals["output_tokens"] += r_totals["output_tokens"]
+ g_totals["requests"] += r_totals["requests"]
+
+ generation["totals"] = g_totals
+
+ def start_run(self, *, run_id: str | None, func_name: str) -> str:
+ final_run_id = run_id or self._new_run_id()
+ self._ensure_generation_record()
+ self._ensure_run_record(final_run_id, func_name)
+ self._persist_costs_atomic()
+ return final_run_id
+
+ def record_agent_usage(
+ self, *, run_id: str, step_key: str, result: Any, model_name: str
+ ) -> None:
+ run = self._get_run_record(run_id)
+ step = run.setdefault("steps", {}).setdefault(step_key, {"usages": []})
+
+ usage_obj = result.usage() if callable(getattr(result, "usage", None)) else None
+ usage = (
+ self._run_usage_dict(usage_obj) if usage_obj is not None else self._run_usage_dict(None)
+ )
+ usd, price_source, price_missing = self._estimate_step_usd(model_name, usage)
+
+ step_item = {
+ "timestamp": datetime.now(UTC).isoformat(),
+ "agent_run_id": getattr(result, "run_id", None),
+ "model_name": model_name,
+ "usage": usage,
+ "usd": usd,
+ "price_source": price_source,
+ "price_missing": price_missing,
+ }
+ step["usages"].append(step_item)
+
+ self._recompute_totals()
+ self._persist_costs_atomic()
+
+ logfire.info(
+ "CodeMode step cost recorded",
+ generation_id=self.generation_id,
+ run_id=run_id,
+ step=step_key,
+ model_name=model_name,
+ usd=usd,
+ usage=usage,
+ price_source=price_source,
+ price_missing=price_missing,
+ )
+
+ def mark_run_completed(self, run_id: str) -> None:
+ run_rec = self._get_run_record(run_id)
+ run_rec["status"] = "completed"
+ run_rec["completed_at"] = datetime.now(UTC).isoformat()
+ self._recompute_totals()
+ self._persist_costs_atomic()
+
+ def mark_run_failed(self, run_id: str, error: str) -> None:
+ run_rec = self._get_run_record(run_id)
+ run_rec["status"] = "failed"
+ run_rec["error"] = error
+ run_rec["completed_at"] = datetime.now(UTC).isoformat()
+ self._recompute_totals()
+ self._persist_costs_atomic()
+
+ def print_total_cost(self, run_id: str | None = None) -> None:
+ generation = self._cost_records["generations"].get(self.generation_id, {})
+ if run_id is not None:
+ run = generation.get("runs", {}).get(run_id)
+ if not run:
+ print(f"run not found: {run_id}")
+ return
+ totals = run.get("totals", {})
+ print(
+ "run_cost",
+ run_id,
+ f"usd={totals.get('usd', 0.0):.6f}",
+ f"input_tokens={totals.get('input_tokens', 0)}",
+ f"output_tokens={totals.get('output_tokens', 0)}",
+ f"requests={totals.get('requests', 0)}",
+ )
+ return
+
+ totals = generation.get("totals", {})
+ print(
+ "generation_cost",
+ self.generation_id,
+ f"usd={totals.get('usd', 0.0):.6f}",
+ f"input_tokens={totals.get('input_tokens', 0)}",
+ f"output_tokens={totals.get('output_tokens', 0)}",
+ f"requests={totals.get('requests', 0)}",
+ )
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index 722dc55..f4d1b2f 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -11,7 +11,13 @@
from pydantic import ValidationError
from pydantic.type_adapter import TypeAdapter
from pydantic_ai.settings import ModelSettings
-from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext, LLMJudge
+from pydantic_evals.evaluators import (
+ EvaluationReason,
+ Evaluator,
+ EvaluatorContext,
+ LLMJudge,
+ OutputConfig, # noqa: F401
+)
MONTY_AVAILABLE = bool(importlib.util.find_spec("pydantic-monty"))
@@ -396,6 +402,9 @@ def create_llm_judge(
include: list[str] | None = None,
config: dict | None = None,
) -> LLMJudge:
+ # Imported lazily to avoid circular import at module import time.
+ from .utils import _resolve_env_ref
+
if config is None:
config = {}
@@ -405,14 +414,8 @@ def create_llm_judge(
"'model' must be specified in config or set JUDGE_MODEL environment variable"
)
- if model.strip().startswith("$"):
- env_var = model.strip().lstrip("$")
- model = os.getenv(env_var)
- if not model:
- raise ValueError(
- f"Environment variable {env_var} is not set for judge model, "
- f"set {env_var} to a valid model name."
- )
+ model = _resolve_env_ref(model, field_name="model")
+ rubric = _resolve_env_ref(rubric, field_name="rubric")
include_input = False
include_expected_output = False
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index 1c710c8..60bab83 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -130,6 +130,24 @@ def to_yaml(self) -> str:
}
+def _resolve_env_ref(
+ value: str, *, field_name: str, scope: Literal["judge", "model"] | str = "judge"
+) -> str:
+ """Resolve $ENV_VAR references used in YAML evaluator settings."""
+ value = value.strip()
+ if not value.startswith("$"):
+ return value
+
+ env_var = value.lstrip("$")
+ resolved = os.getenv(env_var)
+ if not resolved:
+ raise ValueError(
+ f"Environment variable {env_var} is not set for {scope} {field_name}, "
+ f"set {env_var} to a valid value."
+ )
+ return resolved
+
+
def is_yaml_serializable_type(type_hint: Any) -> bool:
"""
Check if a type hint represents a YAML-serializable type.
@@ -1225,6 +1243,42 @@ def _merge_programmatic_fixtures(
return merged_fixtures, fixture_funcs
+def _resolve_eval_id_mapping(
+ mapping: Mapping[str, Any] | None,
+ eval_id: str,
+ *,
+ mapping_name: str,
+) -> Any | None:
+ """Resolve mapping entries by exact id first, then by short function name.
+
+ Supports using programmatic keys like ``{"func": fn}`` for specs that use
+ ``module.func`` eval ids.
+ """
+ if not mapping:
+ return None
+
+ if eval_id in mapping:
+ return mapping[eval_id]
+
+ short_name = eval_id.rsplit(".", 1)[-1]
+ if short_name in mapping:
+ return mapping[short_name]
+
+ # Reverse direction: when eval id is bare and mapping uses module.function.
+ if "." not in eval_id:
+ matches = [value for key, value in mapping.items() if key.rsplit(".", 1)[-1] == eval_id]
+ if len(matches) == 1:
+ return matches[0]
+ if len(matches) > 1:
+ candidates = sorted({key for key in mapping if key.rsplit(".", 1)[-1] == eval_id})
+ raise ValueError(
+ f"Ambiguous {mapping_name} mapping for '{eval_id}'. "
+ f"Provide an exact key. Candidates: {candidates}"
+ )
+
+ return None
+
+
def _import_and_detect_class_method(
eval_id: str,
functions: dict[str, Callable] | None,
@@ -1238,8 +1292,9 @@ def _import_and_detect_class_method(
- class_path: Full module.ClassName path for class methods, None otherwise
- class_name: Class name for class methods, None otherwise
"""
- if functions and eval_id in functions:
- func = functions[eval_id]
+ resolved_func = _resolve_eval_id_mapping(functions, eval_id, mapping_name="function")
+ if resolved_func is not None:
+ func = resolved_func
# Check if bound method (exclude builtin functions where __self__ is the module)
self_obj = getattr(func, "__self__", None)
if self_obj is not None and not isinstance(self_obj, types.ModuleType):
@@ -1491,8 +1546,10 @@ def _evaluate_single_function(
)
# Get serializers for this function if defined
- func_schema = schema.get(eval_id)
- func_serial_fn = serial_fn.get(eval_id)
+ func_schema = _resolve_eval_id_mapping(schema, eval_id, mapping_name="serializer schema")
+ func_serial_fn = _resolve_eval_id_mapping(
+ serial_fn, eval_id, mapping_name="serializer function"
+ )
# Setup module-scoped fixtures for this eval
module_fixtures = {}
diff --git a/tests/cassettes/llm_judge_custom_model.json b/tests/cassettes/llm_judge_custom_model.json
index 412c044..4d6c526 100644
--- a/tests/cassettes/llm_judge_custom_model.json
+++ b/tests/cassettes/llm_judge_custom_model.json
@@ -4,7 +4,7 @@
"input_preview": "john doe",
"result": {
"passed": false,
- "model": "openrouter:google/gemini-3-flash-preview"
+ "model": "openrouter:anthropic/claude-opus-4.6"
}
}
}
\ No newline at end of file
diff --git a/tests/cassettes/test_generate_and_run.json b/tests/cassettes/test_generate_and_run.json
index 4ace11b..35414fa 100644
--- a/tests/cassettes/test_generate_and_run.json
+++ b/tests/cassettes/test_generate_and_run.json
@@ -3,9 +3,9 @@
"prompt_preview": "generate_and_run",
"model": "openrouter:google/gemini-3-flash-preview",
"response": {
- "yaml_spec": "double:\n evals:\n CorrectType:\n type: int\n DoubleLogic:\n assertion: output == input * 2\n NonNegativeIfInputNonNegative:\n assertion: input < 0 or output >= input\n dataset:\n - case:\n id: positive_integer\n input: 5\n expected: 10\n - case:\n id: zero\n input: 0\n expected: 0\n - case:\n id: negative_integer\n input: -4\n expected: -8\n - case:\n id: large_integer\n input: 1000000\n expected: 2000000\n - case:\n id: sequence_multiplication_check\n input: 1\n expected: 2\n - case:\n id: invalid_type_string\n input: '10'\n assertion: output == '1010'\n",
+ "yaml_spec": "double:\n evals:\n IsInteger:\n type: int\n CorrectCalculation:\n assertion: output == input * 2\n FastEnough:\n duration: 0.001\n dataset:\n - case:\n id: positive_integer\n input: 10\n expected: 20\n - case:\n id: negative_integer\n input: -5\n expected: -10\n - case:\n id: zero_input\n input: 0\n expected: 0\n - case:\n id: large_integer\n input: 1000000\n expected: 2000000\n - case:\n id: string_input_error\n input: '5'\n assertion: output == '55'\n type: str\n",
"was_healed": false,
- "coverage": 0.8333333333333334
+ "coverage": 0.0
}
}
}
\ No newline at end of file
diff --git a/tests/cassettes/test_generate_factorial.json b/tests/cassettes/test_generate_factorial.json
index 758b00c..094b8db 100644
--- a/tests/cassettes/test_generate_factorial.json
+++ b/tests/cassettes/test_generate_factorial.json
@@ -3,9 +3,9 @@
"prompt_preview": "generate_function",
"model": "openrouter:google/gemini-3-flash-preview",
"response": {
- "name": "calculate_factorial",
- "description": "Calculates the factorial of a non-negative integer n using an iterative approach. Includes input validation for non-integers and negative values.",
- "code": "def calculate_factorial(n: int) -> int:\n \\\"\\\"\\\"\n Calculates the factorial of a non-negative integer n.\n \n Args:\n n (int): A non-negative integer.\n \n Returns:\n int: The factorial of n.\n \n Raises:\n ValueError: If n is negative.\n TypeError: If n is not an integer.\n \\\"\\\"\\\"\n if not isinstance(n, int):\n raise TypeError(\\\"Input must be an integer.\\\")\n if n < 0:\n raise ValueError(\\\"Input must be a non-negative integer.\\\")\n \n result = 1\n for i in range(2, n + 1):\n result *= i\n return result\n"
+ "name": "factorial",
+ "description": "Calculates the factorial of a non-negative integer using an iterative approach to avoid recursion depth issues.",
+ "code": "def factorial(n: int) -> int:\n \\\"\\\"\\\"\n Calculates the factorial of a non-negative integer n.\n \n Args:\n n: A non-negative integer.\n \n Returns:\n The factorial of n.\n \n Raises:\n ValueError: If n is negative.\n TypeError: If n is not an integer.\n \\\"\\\"\\\"\n if not isinstance(n, int):\n raise TypeError(\\\"Input must be an integer.\\\")\n if n < 0:\n raise ValueError(\\\"Factorial is not defined for negative numbers.\\\")\n \n result = 1\n for i in range(2, n + 1):\n result *= i\n return result\n"
}
}
}
\ No newline at end of file
diff --git a/tests/cassettes/test_generate_palindrome.json b/tests/cassettes/test_generate_palindrome.json
index 452654a..437686c 100644
--- a/tests/cassettes/test_generate_palindrome.json
+++ b/tests/cassettes/test_generate_palindrome.json
@@ -4,8 +4,8 @@
"model": "openrouter:google/gemini-3-flash-preview",
"response": {
"name": "is_palindrome",
- "description": "Checks if a string is a palindrome while ignoring case and spaces. Only spaces are ignored, other punctuation is preserved.",
- "code": "def is_palindrome(text: str) -> bool:\n \"\"\"\n Checks if a string is a palindrome, ignoring case and spaces.\n \n Args:\n text: The string to check.\n \n Returns:\n True if the string is a palindrome, False otherwise.\n \"\"\"\n if text is None:\n raise TypeError(\"Input must be a string\")\n \n # Remove spaces and convert to lowercase\n normalized = text.replace(\" \", \"\").lower()\n \n # Check if string matches its reverse\n return normalized == normalized[::-1]\n"
+ "description": "Checks if a string is a palindrome while ignoring case and whitespace.",
+ "code": "def is_palindrome(text: str) -> bool:\n \\\"\\\"\\\"\n Checks if a string is a palindrome, ignoring case and spaces.\n \n Args:\n text (str): The string to check.\n \n Returns:\n bool: True if it is a palindrome, False otherwise.\n \\\"\\\"\\\"\n if not isinstance(text, str):\n return False\n \n # Remove spaces and convert to lowercase\n cleaned = \\\"\\\".join(text.split()).lower()\n \n # Check if string matches its reverse\n return cleaned == cleaned[::-1]\n"
}
}
}
\ No newline at end of file
diff --git a/tests/cassettes/test_generate_spec_simple.json b/tests/cassettes/test_generate_spec_simple.json
index def07eb..6bbf8b9 100644
--- a/tests/cassettes/test_generate_spec_simple.json
+++ b/tests/cassettes/test_generate_spec_simple.json
@@ -3,7 +3,7 @@
"prompt_preview": "generate_spec",
"model": "openrouter:google/gemini-3-flash-preview",
"response": {
- "yaml_spec": "add_numbers:\n evals:\n IsInt:\n type: int\n IdentityProperty:\n assertion: (input[0] == 0 and output == input[1]) or (input[1] == 0 and output\n == input[0]) or True\n CommutativeProperty:\n assertion: output == input[1] + input[0]\n dataset:\n - case:\n id: typical_positive\n inputs:\n - 10\n - 25\n expected: 35\n - case:\n id: negative_numbers\n inputs:\n - -5\n - -15\n expected: -20\n - case:\n id: mixed_signs\n inputs:\n - 100\n - -40\n expected: 60\n - case:\n id: zero_identity\n inputs:\n - 0\n - 42\n expected: 42\n - case:\n id: large_integers\n inputs:\n - 1000000\n - 2000000\n expected: 3000000\n - case:\n id: boundary_zero_sum\n inputs:\n - 50\n - -50\n expected: 0\n",
+ "yaml_spec": "add_numbers:\n evals:\n IsInteger:\n type: int\n CorrectSum:\n assertion: output == input[0] + input[1]\n FastExecution:\n duration: 0.001\n dataset:\n - case:\n id: positive_integers\n inputs:\n - 10\n - 20\n expected: 30\n - case:\n id: negative_integers\n inputs:\n - -5\n - -15\n expected: -20\n - case:\n id: mixed_signs\n inputs:\n - -10\n - 25\n expected: 15\n - case:\n id: zero_addition\n inputs:\n - 0\n - 100\n expected: 100\n - case:\n id: large_integers\n inputs:\n - 1000000\n - 2000000\n expected: 3000000\n - case:\n id: identity_property\n inputs:\n - 42\n - 0\n expected: 42\n",
"func_name": "add_numbers"
}
}
diff --git a/tests/cassettes/test_generate_spec_string.json b/tests/cassettes/test_generate_spec_string.json
index 2968bb9..0a365b1 100644
--- a/tests/cassettes/test_generate_spec_string.json
+++ b/tests/cassettes/test_generate_spec_string.json
@@ -3,7 +3,7 @@
"prompt_preview": "generate_spec",
"model": "openrouter:google/gemini-3-flash-preview",
"response": {
- "yaml_spec": "reverse_string:\n evals:\n IsString:\n type: str\n ReverseProperty:\n assertion: output[::-1] == input\n LengthInvariant:\n assertion: len(output) == len(input)\n dataset:\n - case:\n id: typical_word\n input: hello\n expected: olleh\n - case:\n id: empty_string\n input: ''\n expected: ''\n - case:\n id: single_character\n input: z\n expected: z\n - case:\n id: palindrome\n input: racecar\n expected: racecar\n - case:\n id: strings_with_spaces\n input: abc def\n expected: fed cba\n - case:\n id: numeric_string\n input: '12345'\n expected: '54321'\n - case:\n id: special_characters\n input: '!@#$%^&*()'\n expected: )(*&^%$#@!\n",
+ "yaml_spec": "reverse_string:\n evals:\n IsString:\n type: str\n CorrectLength:\n assertion: len(output) == len(input)\n IdentityProperty:\n assertion: output[::-1] == input\n dataset:\n - case:\n id: normal_word\n input: hello\n expected: olleh\n - case:\n id: empty_string\n input: ''\n expected: ''\n - case:\n id: single_character\n input: A\n expected: A\n - case:\n id: with_spaces\n input: nurses run\n expected: nur sesrun\n - case:\n id: palindrome\n input: racecar\n expected: racecar\n - case:\n id: numeric_string\n input: '123456789'\n expected: '987654321'\n - case:\n id: special_characters\n input: '!@#$%^&*'\n expected: '*&^%$#@!'\n",
"func_name": "reverse_string"
}
}
diff --git a/tests/test_generation.py b/tests/test_generation.py
deleted file mode 100644
index 011546c..0000000
--- a/tests/test_generation.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""Test script for EvalGenerator and GenerationResult."""
-
-from vowel import EvalGenerator, GenerationResult
-
-
-def main():
- generator = EvalGenerator(load_env=True)
-
- print(f"\nUsing model: {generator.model}")
- print("\n🚀 Step 1: Generate a function from prompt\n")
-
- func = generator.generate_function(
- prompt="Create a function called 'is_prime' that checks if a number is prime. Return True if prime, False otherwise.",
- async_func=False,
- )
-
- print(f"Generated: {func.name}")
- func.print()
-
- print("\n🧪 Step 2: Generate spec and run evals\n")
-
- result: GenerationResult = generator.generate_and_run(
- func,
- auto_retry=True,
- max_retries=2,
- min_coverage=0.9,
- heal_function=True,
- )
-
- result.print()
-
- print("✅ Test completed!\n")
-
-
-if __name__ == "__main__":
- main()
diff --git a/tests/test_llm_integration.py b/tests/test_llm_integration.py
index bcce51b..7e55a20 100644
--- a/tests/test_llm_integration.py
+++ b/tests/test_llm_integration.py
@@ -11,10 +11,10 @@
dotenv.load_dotenv()
-DEFAULT_MODEL = os.getenv("MODEL_NAME", "openrouter:google/gemini-3-flash-preview")
+DEFAULT_MODEL = "openrouter:google/gemini-3-flash-preview"
pytestmark = pytest.mark.skipif(
- not os.getenv("OPENROUTER_API_KEY") and not os.getenv("OPENAI_API_KEY"),
+ not os.getenv("OPENROUTER_API_KEY"),
reason="No API key available for LLM tests (need OPENROUTER_API_KEY or OPENAI_API_KEY)",
)
diff --git a/tests/test_llm_judge_env_refs.py b/tests/test_llm_judge_env_refs.py
new file mode 100644
index 0000000..7407426
--- /dev/null
+++ b/tests/test_llm_judge_env_refs.py
@@ -0,0 +1,33 @@
+"""Tests for environment variable references in LLM Judge configuration."""
+
+import pytest
+
+from vowel.evals import create_llm_judge
+
+
+def test_create_llm_judge_resolves_rubric_and_model_env_refs(monkeypatch):
+ """Rubric and model support $ENV_VAR style references."""
+ monkeypatch.setenv("TEST_JUDGE_MODEL", "openrouter:google/gemini-2.5-flash")
+ monkeypatch.setenv("_TEST_JUDGE_RUBRIC", "Output should be concise and accurate")
+
+ judge = create_llm_judge(
+ rubric="$_TEST_JUDGE_RUBRIC",
+ include=["input"],
+ config={"model": "$TEST_JUDGE_MODEL", "temperature": 0.0},
+ )
+
+ assert judge.model == "openrouter:google/gemini-2.5-flash"
+ assert judge.rubric == "Output should be concise and accurate"
+
+
+def test_create_llm_judge_raises_when_rubric_env_ref_missing(monkeypatch):
+ """Missing rubric env var should raise a clear error."""
+ monkeypatch.setenv("TEST_JUDGE_MODEL", "openrouter:google/gemini-2.5-flash")
+ monkeypatch.delenv("_MISSING_RUBRIC", raising=False)
+
+ with pytest.raises(ValueError, match="_MISSING_RUBRIC"):
+ create_llm_judge(
+ rubric="$_MISSING_RUBRIC",
+ include=["input"],
+ config={"model": "$TEST_JUDGE_MODEL"},
+ )
diff --git a/tests/test_run_evals.py b/tests/test_run_evals.py
index 0659958..4ff09e5 100644
--- a/tests/test_run_evals.py
+++ b/tests/test_run_evals.py
@@ -147,6 +147,24 @@ def test_with_functions_chained(self, simple_yaml_spec: str):
assert summary.all_passed
+ def test_with_functions_short_name_matches_module_function_spec(self):
+ """module.function eval ids should match short-name keys from with_functions."""
+
+ def add(a, b):
+ return a + b
+
+ spec = {
+ "pkg.add": {
+ "dataset": [
+ {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}},
+ ]
+ }
+ }
+
+ summary = RunEvals.from_dict(spec).with_functions({"add": add}).run()
+
+ assert summary.all_passed
+
def test_with_executor_preserves_existing_run_behavior(self, simple_yaml_spec: str):
"""Executor preferences should be accepted without changing normal eval behavior."""
summary = (
diff --git a/tests/test_serializer.py b/tests/test_serializer.py
index b38bc56..1ed83ec 100644
--- a/tests/test_serializer.py
+++ b/tests/test_serializer.py
@@ -198,6 +198,28 @@ def test_multiple_cases(self):
assert summary.all_passed
assert summary.total_count == 1
+ def test_serializer_short_name_matches_module_function_spec(self):
+ """Serializer mapping by short name should work for module.function eval ids."""
+ spec = {
+ "pkg.get_user_info": {
+ "dataset": [
+ {
+ "case": {
+ "input": {"id": 1, "name": "Alice", "email": "a@a.com"},
+ "expected": "User Alice has email a@a.com",
+ }
+ },
+ ]
+ }
+ }
+ summary = (
+ RunEvals.from_dict(spec)
+ .with_functions({"get_user_info": get_user_info})
+ .with_serializer({"get_user_info": User})
+ .run()
+ )
+ assert summary.all_passed
+
class TestSerialFn:
"""Tests for serial_fn-based serialization."""
From bd73afdc9f2981570954271c7850ea5dec8bd3b4 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 19:42:00 +0300
Subject: [PATCH 5/8] bump version 0.3.5 -> 0.4.0
---
.env.sample | 14 +-
.gitignore | 6 +-
.gitmodules | 3 +
CHANGELOG.md | 198 +++++
README.md | 46 +-
VERSION | 2 +-
costs.yml | 409 +++++++++++
db_fixture.yml | 32 -
docs/CLI.md | 41 ++
docs/CODEMODE.md | 164 +++++
docs/FIXTURES.md | 17 +-
docs/README.md | 2 +
docs/SERIALIZERS.md | 47 ++
docs/YAML_SPEC.md | 20 +-
examples/basic_usage/evals.yml | 20 +
db.py => examples/db_fixtures/db.py | 4 +-
examples/evals/builtins.yml | 42 +-
examples/evals/math.yml | 24 +-
examples/evals/strings.yml | 33 +-
examples/evals/validation.yml | 32 +-
examples/evaluators/evals.yml | 28 +-
examples/fixtures/evals.yml | 5 +
examples/fluent_api/evals.yml | 26 +-
examples/serializers/__init__.py | 1 +
examples/serializers/app.py | 18 +
examples/serializers/db_query_evals.yml | 53 ++
examples/serializers/defn.py | 41 ++
examples/serializers/fixtures.py | 39 +
examples/serializers/util.py | 0
pyproject.toml | 2 +-
quality-judge/evals.py | 33 +
quality-judge/largestPathValue_evals.yml | 762 ++++++++++++++++++++
quality-judge/largest_color_value_judge.yml | 330 +++++++++
quality-judge/runner.py | 22 +
src/vowel/eval_types.py | 57 +-
src/vowel/evals.py | 103 ++-
src/vowel/schema.py | 19 +-
src/vowel/utils.py | 178 ++++-
tests/test_run_evals.py | 75 ++
tests/test_schema.py | 38 +
tests/test_serializer.py | 177 +++++
tests/test_yaml_loading.py | 37 +
vowel-schema.json | 73 ++
43 files changed, 3105 insertions(+), 168 deletions(-)
create mode 100644 CHANGELOG.md
create mode 100644 costs.yml
delete mode 100644 db_fixture.yml
create mode 100644 docs/CODEMODE.md
rename db.py => examples/db_fixtures/db.py (95%)
create mode 100644 examples/serializers/__init__.py
create mode 100644 examples/serializers/app.py
create mode 100644 examples/serializers/db_query_evals.yml
create mode 100644 examples/serializers/defn.py
create mode 100644 examples/serializers/fixtures.py
create mode 100644 examples/serializers/util.py
create mode 100644 quality-judge/evals.py
create mode 100644 quality-judge/largestPathValue_evals.yml
create mode 100644 quality-judge/largest_color_value_judge.yml
create mode 100644 quality-judge/runner.py
create mode 100644 tests/test_schema.py
diff --git a/.env.sample b/.env.sample
index 8f8c9d7..2d59281 100644
--- a/.env.sample
+++ b/.env.sample
@@ -11,4 +11,16 @@ LOGFIRE_ENABLED=false
JUDGE_MODEL=openrouter:google/gemini-3-flash-preview
# Default model used by Agents
-MODEL_NAME=openrouter:google/gemini-3-flash-preview
\ No newline at end of file
+MODEL_NAME=openrouter:google/gemini-3-flash-preview
+
+# Default spec & exploration models used by CodeMode pipeline
+# Spec agent generates tests
+# Exploration agent generates snippets to discover behaviors (code-execution)
+SPEC_MODEL=openrouter:anthropic/claude-opus-4.6
+EXPLORATION_MODEL=openrouter:anthropic/claude-sonnet-4.6
+
+# Default spec & exploration models used by CodeMode benchmark pipeline
+# NOTE: Models should be comma-separated; the number of spec models must equal the number of exploration models
+# spec[i] will be mapped to exploration[i] (Case N)
+BENCHMARK_SPEC_MODELS=openrouter:anthropic/claude-opus-4.6
+BENCHMARK_EXPLORATION_MODELS=openrouter:anthropic/claude-sonnet-4.6
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index dc1809e..8b40711 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,8 +72,4 @@ docs/FIXTURE_GENERATION_RFC.md
# Benchmarks
benchmark*
-parse_cron_evals.yml
-
-# Known Models with Costs
-costs.yml
-db_fixture_serializers.yml
+important-links.md
diff --git a/.gitmodules b/.gitmodules
index e0064d4..1b9806f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
[submodule "skills/vowel-core"]
path = skills/vowel-core
url = https://github.com/fswair/vowel-core.git
+[submodule "codemode-benchmark"]
+ path = codemode-benchmark
+ url = https://github.com/fswair/codemode-benchmark
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..1c21190
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,198 @@
+# CHANGELOG
+
+## codemode_driven_generation
+
+This document summarizes the main features added or improved on this branch.
+
+## 1) Executor and ExecutionSession protocols
+
+- The code execution interface was formalized using Protocols.
+- The Executor async/sync API was standardized:
+ - execute(...)
+ - execute_sync(...)
+ - create_session(...)
+- ExecutionSession now compiles/executes setup code once and supports multi-snippet feed execution.
+- This reduces repeated parse/compile overhead while exploring the same function.
+- The run_sync helper was hardened for running-loop environments via nest-asyncio.
+
+## 2) MontyExecutor, DefaultExecutor, MontySession, FallbackSession structures
+
+- MontyExecutor was added:
+ - sandboxed execution via pydantic-monty,
+ - ResourceLimits support (timeout/memory),
+ - stdout capture and normalized error typing/messages,
+- DefaultExecutor was added/improved:
+ - pure Python exec-based fallback execution,
+ - last-expression capture (__result__) and stdout capture.
+- MontyReplSession (MontySession role) was added:
+ - one-time setup load, reusable feed-run model.
+- FallbackSession was added:
+ - Session-level fallback: if Monty session initialization fails, switch entirely to DefaultSession.
+ - Snippet-level fallback: if Monty returns ModuleNotFoundError for a snippet, rerun that snippet via fallback executor.
+- Executor/fallback wiring was simplified through resolve_executors.
+
+## 3) Main implementation: CodeModeGenerator
+
+- Two-phase exploration-guided generation flow:
+ - Phase 1: behavior exploration (exploration snippets + error snippets)
+ - Phase 2: spec generation from verified observations
+- Lazy Agent architecture:
+ - explorer_agent (ExplorationPlan)
+ - spec_agent (EvalsSource or EvalsBundle)
+- Prompt layers were clearly separated:
+ - exploration prompt: coverage, diversity, duplicate prevention
+ - spec prompt: expected values from verified outputs only
+- A refinement loop was added:
+ - generate -> run -> failure_context -> regenerate
+- Optional duration injection and a final summary run were added at the end.
+
+## 4) Runtime hierarchy and utility usage
+
+CodeMode hierarchy:
+
+1. explore()
+2. generate_spec()
+3. validate_and_fix_spec()
+4. validate_expected_values()
+5. inject_missing_error_cases()
+6. inject_durations() (optional)
+7. validation/refinement with RunEvals
+
+Utilities used:
+
+- build_call_code
+- build_failure_context
+- validate_and_fix_spec
+- validate_expected_values
+- inject_missing_error_cases
+- inject_durations
+
+## 5) Cost Manager
+
+- Generation/run cost tracking was added for CodeMode.
+- Features:
+ - generation_id and run_id lifecycle management,
+ - step-level usage/cost recording,
+ - model price resolution (genai-prices or costs.yml),
+ - atomic/locked JSON persistence,
+ - generation-level and run-level totals,
+ - status tracking: running/completed/failed.
+- The CLI costs command now supports list/by-generation/by-run views.
+
+## 6) Serializer syntax and YAML-native serializer registry
+
+- Top-level serializers registry support was added at EvalsFile level.
+- Per-eval serializer references are now supported via serializer:.
+- SerializerSpec was clarified with one-of behavior:
+ - schema (string or dict)
+ - serializer (callable import path)
+ - not both at the same time.
+- Runtime resolver additions:
+ - import-path resolution,
+ - cached imports (_import_path_cached),
+ - per-eval resolution (_resolve_yaml_serializer_entry).
+- Precedence between programmatic serializer maps and YAML serializer registry was defined.
+
+## 7) Spec model / Exploration model separation
+
+- Model separation in CodeModeGenerator constructor was formalized:
+ - spec_model
+ - exploration_model
+- use_model_spec output mode was clarified:
+ - use_model_spec=True: structured output mode (schema/model output via EvalsBundle)
+ - use_model_spec=False: YAML string output mode (via EvalsSource.yaml_spec)
+- HIGHLY RECOMMENDED TO KEEP use_model_spec=False.
+- Model resolution order and env fallback logic were added.
+- Cost tracking now supports separate model usage across separate steps.
+
+## 8) Adding executor/fallback executor to utilities
+
+- Utility flows were updated to accept executor and fallback executor parameters.
+- Monty -> Default fallback behavior was generalized in execution-aware paths.
+- Executor behavior was centralized across run_evals and validation stages.
+
+## 9) YAML schema generator
+
+- Runtime-model-driven schema generation was improved:
+ - supports top-level fixtures + serializers,
+ - preserves function-level EvalsMapValue behavior.
+- Schema cache strategy was updated:
+ - content-hash-based filename (reduces stale editor cache issues).
+- File header updates are handled safely via materialize_yaml_with_schema_header.
+
+## 10) CLI commands: schema, costs
+
+- vowel schema :
+ - update schema header after YAML + pydantic validation
+- vowel schema --create [path]:
+ - direct schema JSON generation
+- vowel costs:
+ - --list
+ - --by-generation
+ - --by-run
+ - --generation
+ - --run
+
+## 11) module.function -> function alias support
+
+- Alias support was added for programmatic mapping resolution:
+ - function map
+ - serializer schema map
+ - serializer function map
+- Behavior:
+ - exact match first,
+ - short-name fallback,
+ - explicit error for ambiguous reverse short-name mapping.
+
+## 12) Feedback-guided exploration
+
+- A targeted Round-2 exploration flow was added:
+ - build cluster summaries from Round-1 results,
+ - generate snippets focused on uncovered behavior classes.
+- Duplicate/semantic repetition minimization was reinforced at prompt level.
+- Distinct failure-mode coverage was improved for error snippets.
+- Additional rounds now measure value via new-behavior counting.
+
+## 13) Assertion + serializer integration
+
+- AssertionEvaluator input context is now serializer-aware.
+- Assertions now see serialized input for schema, serial_fn, and nested/dict schema modes.
+- This behavior is covered by regression tests.
+
+## 14) LLM Judge env-ref improvements
+
+- create_llm_judge now supports $ENV_VAR resolution for rubric/model fields.
+- Missing env refs now produce clearer errors.
+
+## 15) Examples, documentation, and test coverage
+
+- A runnable native serializer + fixture example was added.
+- README and serializer docs were updated with serializer/assertion context notes.
+- Meaningful id fields were added to eval cases under examples.
+- New/updated tests include:
+ - test_schema
+ - test_llm_judge_env_refs
+ - serializer assertion regressions
+ - YAML/native serializer parsing tests
+
+## 16) Fixture scope alias support
+
+- Fixture scopes now support clearer canonical names:
+ - case
+ - eval
+ - file
+- Backward-compatible aliases are still accepted:
+ - function (alias of case)
+ - module (alias of eval)
+ - session (alias of file)
+- At parse time, canonical names are normalized to legacy internal runtime values:
+ - case -> function
+ - eval -> module
+ - file -> session
+- This keeps existing runtime lifecycle behavior unchanged while allowing more descriptive scope names in YAML.
+
+Note: The old names will be deprecated after v1.0.0.
+
+## Note
+
+This changelog is based on features observed and validated in code on this branch, without using git history.
diff --git a/README.md b/README.md
index f3ec800..4c2d792 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ pip install -e ".[all]"
## Quick Start
> **Note:**
-> For a deeper understanding of how vowel handles fixtures, see the examples in [`db_fixture.yml`](./db_fixture.yml) and [`db.py`](./db.py). These files demonstrate the underlying mechanics of fixture setup and usage.
+> For a deeper understanding of how vowel handles fixtures, see the examples in [`examples/db_fixtures`](./examples/db_fixtures/). These examples demonstrate the underlying mechanics of fixture setup and usage.
> **Tip:**
> To enable YAML schema validation in your editor, place `vowel-schema.json` in your project directory.
@@ -183,6 +183,29 @@ def query_user(user_id: int, *, db: dict) -> dict | None:
return db["users"].get(user_id)
```
+Fixture scope aliases:
+- Preferred scope names: `case`, `eval`, `file`
+- Backward-compatible aliases: `function`, `module`, `session`
+- Normalization mapping: `case -> function`, `eval -> module`, `file -> session`
+
+Example:
+
+```yaml
+fixtures:
+ temp_data:
+ setup: myapp.make_temp_data
+ scope: case
+
+ db:
+ setup: myapp.setup_db
+ teardown: myapp.close_db
+ scope: eval
+
+ cache:
+ setup: myapp.setup_cache
+ scope: file
+```
+
> **Full reference:** [docs/FIXTURES.md](https://github.com/fswair/vowel/blob/main/docs/FIXTURES.md)
### Input Serializers
@@ -200,6 +223,24 @@ summary = (
> **Serializer key matching:** Serializer mappings follow the same rule as `.with_functions(...)` — both `module.function` and short `function` keys are accepted.
+> **Assertion context and serializers:** When a serializer is configured, assertion evaluators use the serialized value for `input` (not raw YAML). This applies to schema mode, `serial_fn`, and nested/dict schemas.
+
+Runnable example (YAML-native serializers + fixtures):
+
+```bash
+vowel examples/serializers/db_query_evals.yml
+```
+
+This example demonstrates:
+- top-level `serializers:` registry with both `schema` and `serializer` entries,
+- per-eval `serializer:` references,
+- fixture class lifecycle wiring with `cls` + `teardown`,
+- assertion checks that read serialized `input` values.
+
+See:
+- `examples/serializers/db_query_evals.yml`
+- `examples/serializers/util.py`
+
> **Full reference:** [docs/SERIALIZERS.md](https://github.com/fswair/vowel/blob/main/docs/SERIALIZERS.md)
### AI-Powered Generation
@@ -263,6 +304,9 @@ vowel evals.yml --dry-run # Show plan without running
vowel evals.yml --export-json out.json # Export results
vowel evals.yml -v # Verbose summary
vowel evals.yml -v --hide-report # Verbose, hide pydantic_evals report
+vowel schema examples/serializers/db_query_evals.yml # Validate + update schema header
+vowel schema --create # Generate vowel-schema.json
+vowel costs --list # List tracked generation/run costs
```
> **Full reference:** [docs/CLI.md](https://github.com/fswair/vowel/blob/main/docs/CLI.md)
diff --git a/VERSION b/VERSION
index 09e9157..60a2d3e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.3.5
\ No newline at end of file
+0.4.0
\ No newline at end of file
diff --git a/costs.yml b/costs.yml
new file mode 100644
index 0000000..6b02e8a
--- /dev/null
+++ b/costs.yml
@@ -0,0 +1,409 @@
+models:
+- amazon-nova-micro:
+ cached_input_per_million: null
+ input_per_million: 0.035
+ output_per_million: 0.14
+- amazon-nova-lite:
+ cached_input_per_million: null
+ input_per_million: 0.06
+ output_per_million: 0.24
+- amazon-nova-pro:
+ cached_input_per_million: null
+ input_per_million: 0.8
+ output_per_million: 3.2
+- amazon-nova-premier:
+ cached_input_per_million: null
+ input_per_million: 2.5
+ output_per_million: 12.5
+- claude-3.7-sonnet:
+ cached_input_per_million: null
+ input_per_million: 3
+ output_per_million: 15
+- claude-3.5-sonnet:
+ cached_input_per_million: null
+ input_per_million: 3
+ output_per_million: 15
+- claude-3-opus:
+ cached_input_per_million: null
+ input_per_million: 15
+ output_per_million: 75
+- claude-3-haiku:
+ cached_input_per_million: null
+ input_per_million: 0.25
+ output_per_million: 1.25
+- claude-3.5-haiku:
+ cached_input_per_million: null
+ input_per_million: 0.8
+ output_per_million: 4
+- claude-4.5-haiku:
+ cached_input_per_million: null
+ input_per_million: 1
+ output_per_million: 5
+- claude-sonnet-4.5:
+ cached_input_per_million: null
+ input_per_million: 3
+ output_per_million: 15
+- claude-sonnet-4.5-200k:
+ cached_input_per_million: null
+ input_per_million: 6
+ output_per_million: 22.5
+- claude-opus-4:
+ cached_input_per_million: null
+ input_per_million: 15
+ output_per_million: 75
+- claude-opus-4-1:
+ cached_input_per_million: null
+ input_per_million: 15
+ output_per_million: 75
+- claude-opus-4-5:
+ cached_input_per_million: null
+ input_per_million: 5
+ output_per_million: 25
+- claude-opus-4.6:
+ cached_input_per_million: null
+ input_per_million: 5
+ output_per_million: 25
+- deepseek-chat:
+ cached_input_per_million: null
+ input_per_million: 0.27
+ output_per_million: 1.1
+- deepseek-reasoner:
+ cached_input_per_million: null
+ input_per_million: 0.55
+ output_per_million: 2.19
+- gemini-2.5-pro-preview-03-25:
+ cached_input_per_million: null
+ input_per_million: 1.25
+ output_per_million: 10
+- gemini-2.5-pro-preview-03-25-200k:
+ cached_input_per_million: null
+ input_per_million: 2.5
+ output_per_million: 15
+- gemini-2.0-flash-lite:
+ cached_input_per_million: null
+ input_per_million: 0.075
+ output_per_million: 0.3
+- gemini-2.0-flash:
+ cached_input_per_million: null
+ input_per_million: 0.1
+ output_per_million: 0.4
+- gemini-1.5-flash:
+ cached_input_per_million: null
+ input_per_million: 0.075
+ output_per_million: 0.3
+- gemini-1.5-flash-128k:
+ cached_input_per_million: null
+ input_per_million: 0.15
+ output_per_million: 0.6
+- gemini-1.5-flash-8b:
+ cached_input_per_million: null
+ input_per_million: 0.0375
+ output_per_million: 0.15
+- gemini-1.5-flash-8b-128k:
+ cached_input_per_million: null
+ input_per_million: 0.075
+ output_per_million: 0.3
+- gemini-1.5-pro:
+ cached_input_per_million: null
+ input_per_million: 1.25
+ output_per_million: 5
+- gemini-1.5-pro-128k:
+ cached_input_per_million: null
+ input_per_million: 2.5
+ output_per_million: 10
+- gemini-2.5-flash:
+ cached_input_per_million: 0.03
+ input_per_million: 0.3
+ output_per_million: 2.5
+- gemini-2.5-flash-lite:
+ cached_input_per_million: 0.01
+ input_per_million: 0.1
+ output_per_million: 0.4
+- gemini-2.5-flash-preview-09-2025:
+ cached_input_per_million: 0.03
+ input_per_million: 0.3
+ output_per_million: 2.5
+- gemini-2.5-pro:
+ cached_input_per_million: 0.125
+ input_per_million: 1.25
+ output_per_million: 10
+- gemini-2.5-pro-200k:
+ cached_input_per_million: 0.25
+ input_per_million: 2.5
+ output_per_million: 15
+- gemini-3-pro-preview:
+ cached_input_per_million: null
+ input_per_million: 2
+ output_per_million: 12
+- gemini-3-pro-preview-200k:
+ cached_input_per_million: null
+ input_per_million: 4
+ output_per_million: 18
+- gemini-3-flash-preview:
+ cached_input_per_million: null
+ input_per_million: 0.5
+ output_per_million: 3
+- gemini-3-1-pro-preview:
+ cached_input_per_million: null
+ input_per_million: 2
+ output_per_million: 12
+- gemini-3-1-pro-preview-200k:
+ cached_input_per_million: null
+ input_per_million: 4
+ output_per_million: 18
+- gemini-3.1-flash-lite-preview:
+ cached_input_per_million: 0.025
+ input_per_million: 0.25
+ output_per_million: 1.5
+- minimax-m2:
+ cached_input_per_million: null
+ input_per_million: 0.3
+ output_per_million: 1.2
+- pixtral-12b:
+ cached_input_per_million: null
+ input_per_million: 0.15
+ output_per_million: 0.15
+- mistral-small-latest:
+ cached_input_per_million: null
+ input_per_million: 0.1
+ output_per_million: 0.3
+- mistral-medium-2505:
+ cached_input_per_million: null
+ input_per_million: 0.4
+ output_per_million: 2
+- mistral-nemo:
+ cached_input_per_million: null
+ input_per_million: 0.15
+ output_per_million: 0.15
+- open-mistral-7b:
+ cached_input_per_million: null
+ input_per_million: 0.25
+ output_per_million: 0.25
+- open-mixtral-8x7b:
+ cached_input_per_million: null
+ input_per_million: 0.7
+ output_per_million: 0.7
+- open-mixtral-8x22b:
+ cached_input_per_million: null
+ input_per_million: 2
+ output_per_million: 6
+- mistral-large-latest:
+ cached_input_per_million: null
+ input_per_million: 2
+ output_per_million: 6
+- pixtral-large-latest:
+ cached_input_per_million: null
+ input_per_million: 2
+ output_per_million: 6
+- mistral-saba-latest:
+ cached_input_per_million: null
+ input_per_million: 0.2
+ output_per_million: 0.6
+- codestral-latest:
+ cached_input_per_million: null
+ input_per_million: 0.3
+ output_per_million: 0.9
+- ministral-8b-latest:
+ cached_input_per_million: null
+ input_per_million: 0.1
+ output_per_million: 0.1
+- ministral-3b-latest:
+ cached_input_per_million: null
+ input_per_million: 0.04
+ output_per_million: 0.04
+- magistral-medium-latest:
+ cached_input_per_million: null
+ input_per_million: 2
+ output_per_million: 5
+- kimi-k2-0905-preview:
+ cached_input_per_million: 0.15
+ input_per_million: 0.6
+ output_per_million: 2.5
+- kimi-k2-0711-preview:
+ cached_input_per_million: 0.15
+ input_per_million: 0.6
+ output_per_million: 2.5
+- kimi-k2-turbo-preview:
+ cached_input_per_million: 0.15
+ input_per_million: 1.15
+ output_per_million: 8.0
+- kimi-k2-thinking:
+ cached_input_per_million: 0.15
+ input_per_million: 0.6
+ output_per_million: 2.5
+- kimi-k2-thinking-turbo:
+ cached_input_per_million: 0.15
+ input_per_million: 1.15
+ output_per_million: 8.0
+- text-davinci-003:
+ cached_input_per_million: null
+ input_per_million: 20
+ output_per_million: 20
+- gpt-4.5:
+ cached_input_per_million: 37.5
+ input_per_million: 75
+ output_per_million: 150
+- gpt-4o:
+ cached_input_per_million: 1.25
+ input_per_million: 2.5
+ output_per_million: 10
+- gpt-4o-mini:
+ cached_input_per_million: 0.075
+ input_per_million: 0.15
+ output_per_million: 0.6
+- chatgpt-4o-latest:
+ cached_input_per_million: null
+ input_per_million: 5
+ output_per_million: 15
+- o1-preview:
+ cached_input_per_million: 7.5
+ input_per_million: 15
+ output_per_million: 60
+- o1-pro:
+ cached_input_per_million: null
+ input_per_million: 150
+ output_per_million: 600
+- o1-mini:
+ cached_input_per_million: 0.55
+ input_per_million: 1.1
+ output_per_million: 4.4
+- o3-mini:
+ cached_input_per_million: 0.55
+ input_per_million: 1.1
+ output_per_million: 4.4
+- gpt-4.1:
+ cached_input_per_million: 0.5
+ input_per_million: 2
+ output_per_million: 8
+- gpt-4.1-mini:
+ cached_input_per_million: 0.1
+ input_per_million: 0.4
+ output_per_million: 1.6
+- gpt-4.1-nano:
+ cached_input_per_million: 0.025
+ input_per_million: 0.1
+ output_per_million: 0.4
+- o3:
+ cached_input_per_million: 0.5
+ input_per_million: 10
+ output_per_million: 40
+- o4-mini:
+ cached_input_per_million: 0.275
+ input_per_million: 1.1
+ output_per_million: 4.4
+- gpt-5-nano:
+ cached_input_per_million: 0.005
+ input_per_million: 0.05
+ output_per_million: 0.4
+- gpt-5-mini:
+ cached_input_per_million: 0.025
+ input_per_million: 0.25
+ output_per_million: 2
+- gpt-5:
+ cached_input_per_million: 0.125
+ input_per_million: 1.25
+ output_per_million: 10
+- gpt-image-1:
+ cached_input_per_million: 1.25
+ input_per_million: 10
+ output_per_million: 40
+- gpt-image-1-mini:
+ cached_input_per_million: 0.2
+ input_per_million: 2
+ output_per_million: 8
+- gpt-5-pro:
+ cached_input_per_million: null
+ input_per_million: 15
+ output_per_million: 120
+- o3-pro:
+ cached_input_per_million: null
+ input_per_million: 20
+ output_per_million: 80
+- o4-mini-deep-research:
+ cached_input_per_million: 0.5
+ input_per_million: 2
+ output_per_million: 8
+- o3-deep-research:
+ cached_input_per_million: 2.5
+ input_per_million: 10
+ output_per_million: 40
+- gpt-5.1-codex-mini:
+ cached_input_per_million: 0.025
+ input_per_million: 0.25
+ output_per_million: 2.0
+- gpt-5.1-codex:
+ cached_input_per_million: 0.125
+ input_per_million: 1.25
+ output_per_million: 10.0
+- gpt-5.1:
+ cached_input_per_million: 0.125
+ input_per_million: 1.25
+ output_per_million: 10.0
+- gpt-5.2:
+ cached_input_per_million: 0.175
+ input_per_million: 1.75
+ output_per_million: 14.0
+- gpt-5.2-pro:
+ cached_input_per_million: null
+ input_per_million: 21.0
+ output_per_million: 168.0
+- gpt-5.4:
+ cached_input_per_million: 0.25
+ input_per_million: 2.5
+ output_per_million: 15.0
+- gpt-5.4-272k:
+ cached_input_per_million: 0.5
+ input_per_million: 5.0
+ output_per_million: 22.5
+- gpt-5.4-pro:
+ cached_input_per_million: null
+ input_per_million: 30.0
+ output_per_million: 180.0
+- gpt-5.4-pro-272k:
+ cached_input_per_million: null
+ input_per_million: 60.0
+ output_per_million: 270.0
+- grok-3:
+ cached_input_per_million: 0.75
+ input_per_million: 3
+ output_per_million: 15
+- grok-3-mini:
+ cached_input_per_million: 0.075
+ input_per_million: 0.3
+ output_per_million: 0.5
+- grok-4-fast:
+ cached_input_per_million: 0.05
+ input_per_million: 0.2
+ output_per_million: 0.5
+- grok-4:
+ cached_input_per_million: 0.75
+ input_per_million: 3
+ output_per_million: 15
+- grok-4-128k:
+ cached_input_per_million: 0.75
+ input_per_million: 6
+ output_per_million: 30
+- grok-4-fast:
+ cached_input_per_million: 0.05
+ input_per_million: 0.2
+ output_per_million: 0.5
+- grok-4-fast-128k:
+ cached_input_per_million: 0.05
+ input_per_million: 0.4
+ output_per_million: 1.0
+- grok-4-fast-reasoning:
+ cached_input_per_million: 0.05
+ input_per_million: 0.2
+ output_per_million: 0.5
+- grok-4-fast-reasoning-128k:
+ cached_input_per_million: 0.05
+ input_per_million: 0.4
+ output_per_million: 1.0
+- grok-code-fast-1:
+ cached_input_per_million: 0.02
+ input_per_million: 0.2
+ output_per_million: 1.5
+- claude-sonnet-4.6:
+ cached_input_per_million: null
+ input_per_million: 3
+ output_per_million: 15
\ No newline at end of file
diff --git a/db_fixture.yml b/db_fixture.yml
deleted file mode 100644
index e8be58c..0000000
--- a/db_fixture.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-# yaml-language-server: $schema=vowel-schema.json
-
-fixtures:
- db:
- cls: db.Connection # setup cls
- kwargs:
- db_path: users.db ## db.Connection(db_path="users.db")
- teardown: db.Connection.close # teardown method (db.Connection.close())
- scope: module # scope
-
-db.Connection.execute:
- fixture:
- - db # db.execute(query) -> self is the db fixture instance
- evals:
- IsDict:
- type: list[dict[str, typing.Union[str, int]]] # type assertion for output
- dataset:
- - case:
- input: "SELECT * FROM users WHERE id=1"
- assertion: "output and isinstance(output[0], dict)"
- - case:
- input: "SELECT * FROM notes" # (buggy query - invalid table)
- raises: any # if any exception is raised, the case will pass.
- match: "no such table" ## no effect, because any exception is accepted
- - case:
- input: "SELECT * FROM players" # (buggy query - invalid table)
- raises: any? # if any exception is raised/returned normally, the case will pass.
- match: "no such table" ## no effect, because any exception or normal return is enough
- - case:
- input: "SELECT * FROM developers" # (buggy query - invalid table)
- raises: NoTableError
- match: "no such table" ## must match the exception message (case ignored)
diff --git a/docs/CLI.md b/docs/CLI.md
index 0cdd808..0be46f1 100644
--- a/docs/CLI.md
+++ b/docs/CLI.md
@@ -74,4 +74,45 @@ vowel evals.yml -v --hide-report
# Hide report without verbose (still shows Overall Summary panel)
vowel evals.yml --hide-report
+
+# Validate YAML + refresh schema header
+vowel schema evals.yml
+
+# Generate schema JSON file (default: vowel-schema.json)
+vowel schema --create
+
+# Generate schema JSON at a custom path
+vowel schema --create ./schemas/vowel-schema.json
+
+# Show tracked model costs
+vowel costs --list
+vowel costs --by-generation
+vowel costs --by-run
+vowel costs --generation <id>
+vowel costs --run <id>
```
+
+---
+
+## Schema Commands
+
+Use schema commands to validate specs and keep YAML schema headers in sync.
+
+| Command | Description |
+|--------|-------------|
+| `vowel schema <file>` | Validates YAML and updates the file's schema header safely |
+| `vowel schema --create [path]` | Generates `vowel-schema.json` (or writes to custom path) |
+
+---
+
+## Cost Commands
+
+Use cost commands to inspect generation and run cost history.
+
+| Command | Description |
+|--------|-------------|
+| `vowel costs --list` | List all tracked generations and runs |
+| `vowel costs --by-generation` | Aggregate totals by generation id |
+| `vowel costs --by-run` | Aggregate totals by run id |
+| `vowel costs --generation <id>` | Show detailed rows for one generation |
+| `vowel costs --run <id>` | Show detailed rows for one run |
diff --git a/docs/CODEMODE.md b/docs/CODEMODE.md
new file mode 100644
index 0000000..91517bd
--- /dev/null
+++ b/docs/CODEMODE.md
@@ -0,0 +1,164 @@
+# CodeMode
+
+CodeMode is Vowel's exploration-guided evaluation spec generator.
+
+Instead of generating test specs from description only, CodeMode runs exploration snippets against real function code first, then generates and refines eval specs using verified outputs and observed errors.
+
+## Pipeline Overview
+
+CodeMode runs in phases:
+
+1. Explore behavior
+- Generates normal snippets and error snippets.
+- Executes snippets against the target function.
+- Collects real outputs, exceptions, and timings.
+
+2. Generate spec
+- Builds a spec prompt from verified execution results.
+- Produces either YAML text or structured bundle output.
+
+3. Validate and refine
+- Runs generated evals against the function.
+- If coverage is below target, builds failure context and retries.
+- Repeats up to max refinement rounds.
+
+4. Optional duration injection
+- Measures runtime and injects duration thresholds into cases.
+
+5. Final summary
+- Returns a CodeModeResult with exploration artifacts, final YAML spec, and optional EvalSummary.
+
+## Core API
+
+CodeMode class:
+- `vowel.codemode.CodeModeGenerator`
+
+Result type:
+- `vowel.codemode.CodeModeResult`
+
+Main entrypoint:
+- `await CodeModeGenerator.generate(...)`
+
+## Model Configuration
+
+Constructor model resolution order:
+
+- `spec_model` argument, else `SPEC_MODEL`, else fallback `model`/`MODEL_NAME`
+- `exploration_model` argument, else `EXPLORATION_MODEL`, else fallback `model`/`MODEL_NAME`
+
+Both models must resolve to non-empty values.
+
+## Output Modes (`use_model_spec`)
+
+- `use_model_spec=False` (default)
+ - Spec agent output type: `EvalsSource`
+ - Generates YAML string via `yaml_spec`
+
+- `use_model_spec=True`
+ - Spec agent output type: `EvalsBundle`
+ - Generates structured model output first, then can be converted to YAML
+
+Recommendation (used in this repository's benchmark flow):
+- Keeping `use_model_spec=False` is highly recommended.
+
+## Minimal Example
+
+```python
+import asyncio
+
+from vowel.codemode import CodeModeGenerator
+from vowel.runner import Function
+
+func = Function(
+ name="flatten",
+ description="Recursively flatten an arbitrarily nested list.",
+ code="""
+def flatten(lst: list) -> list:
+ if not isinstance(lst, list):
+ raise TypeError(f'Expected list, got {type(lst).__name__}')
+ out = []
+ for item in lst:
+ if isinstance(item, list):
+ out.extend(flatten(item))
+ else:
+ out.append(item)
+ return out
+""",
+)
+
+async def main() -> None:
+ gen = CodeModeGenerator(
+ spec_model="openrouter:google/gemini-3-flash-preview",
+ exploration_model="openrouter:google/gemini-3.1-flash-lite-preview",
+ use_model_spec=False,
+ )
+
+ result = await gen.generate(
+ func,
+ run_evals=True,
+ max_refinement_rounds=2,
+ min_coverage=1.0,
+ inject_durations=False,
+ save_to_file=True,
+ )
+
+ print(result.yaml_spec)
+ if result.summary:
+ result.summary.print()
+
+asyncio.run(main())
+```
+
+## `generate(...)` Parameters
+
+Important flags in `CodeModeGenerator.generate`:
+
+- `run_id`: optional run identifier for cost tracking
+- `run_evals`: run generated spec after generation
+- `save_to_file`: write `<name>_evals.yml`
+- `max_refinement_rounds`: retry/refinement budget
+- `min_coverage`: stop threshold (default 1.0)
+- `inject_durations`: inject measured duration checks
+
+## What `CodeModeResult` Contains
+
+- `exploration_results`: snippet execution results
+- `yaml_spec`: final YAML eval spec
+- `summary`: EvalSummary when `run_evals=True`
+- `refinement_rounds`: number of refinement retries used
+
+## Benchmark Integration (`python -m codemode_benchmark`)
+
+Benchmark runner path:
+- `codemode_benchmark/run_benchmark.py`
+
+Typical usage:
+
+```bash
+python -m codemode_benchmark
+python -m codemode_benchmark --only flatten group_by
+python -m codemode_benchmark --show-config
+python -m codemode_benchmark --replay codemode_benchmark/run_20260312_181510
+```
+
+If you use the Python launcher (`py`) on your machine:
+
+```bash
+py -m codemode_benchmark
+```
+
+The benchmark runner compares model pairs (`spec_model`, `exploration_model`) across built-in scenarios and stores artifacts under `codemode_benchmark/run_<timestamp>/`.
+
+## Troubleshooting
+
+- Error: spec/exploration model not set
+ - Set constructor args or env vars (`SPEC_MODEL`, `EXPLORATION_MODEL`, `MODEL_NAME`).
+
+- Low coverage after generation
+ - Increase `max_refinement_rounds`.
+ - Provide clearer function descriptions.
+ - Check whether the function has non-deterministic behavior.
+
+- YAML parse/validation failures
+ - Keep `use_model_spec=False` for YAML-first flow in this repo.
+ - Let refinement run (`run_evals=True`) so failure context can repair issues.
diff --git a/docs/FIXTURES.md b/docs/FIXTURES.md
index da93794..5bf91ff 100644
--- a/docs/FIXTURES.md
+++ b/docs/FIXTURES.md
@@ -82,24 +82,29 @@ Fixtures support three lifecycle scopes (defined in YAML):
| Scope | Behavior |
|-------|----------|
-| `function` (default) | Setup/teardown for **each** test case |
-| `module` | Setup once per eval spec, teardown after all cases |
-| `session` | Setup once per `run_evals()` call, teardown at end |
+| `case` (preferred) / `function` (alias, default) | Setup/teardown for **each** test case |
+| `eval` (preferred) / `module` (alias) | Setup once per eval spec, teardown after all cases |
+| `file` (preferred) / `session` (alias) | Setup once per `run_evals()` call, teardown at end |
+
+Alias normalization:
+- `case -> function`
+- `eval -> module`
+- `file -> session`
```yaml
fixtures:
temp_file:
setup: my_fixtures.temp_file
- scope: function
+ scope: case
db:
setup: my_fixtures.setup_db
teardown: my_fixtures.teardown_db
- scope: module
+ scope: eval
cache:
setup: my_fixtures.setup_cache
- scope: session
+ scope: file
```
---
diff --git a/docs/README.md b/docs/README.md
index ff567e3..c328c08 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -8,6 +8,7 @@ Welcome to the Vowel framework documentation.
|----------|-------------|
| [User Guide](./USERGUIDE.md) | Complete guide to using Vowel |
| [API Reference](./API.md) | Detailed API documentation |
+| [CodeMode](./CODEMODE.md) | Exploration-guided spec generation pipeline and benchmark usage |
## Quick Links
@@ -16,3 +17,4 @@ Welcome to the Vowel framework documentation.
- **Evaluators**: See [User Guide - Evaluators](./USERGUIDE.md#evaluators)
- **RunEvals API**: See [API - RunEvals](./API.md#runevals)
- **EvalGenerator**: See [API - EvalGenerator](./API.md#evalgenerator)
+- **CodeMode**: See [CodeMode Guide](./CODEMODE.md)
diff --git a/docs/SERIALIZERS.md b/docs/SERIALIZERS.md
index deb621e..7def43c 100644
--- a/docs/SERIALIZERS.md
+++ b/docs/SERIALIZERS.md
@@ -60,6 +60,50 @@ summary = (
> Key matching note: If YAML eval ids use `module.function`, both programmatic maps accept either the exact id (`module.function`) or short name (`function`) keys in `.with_functions(...)`, `.with_serializer(...)`, and `serial_fn={...}`.
+> Assertion context note: When a serializer is active, assertion evaluators see the serialized `input` value (not raw YAML payload).
+>
+> - Schema mode: `input` is the model/callable output.
+> - Serial fn mode: `input` is whatever `serial_fn` returns (single value, tuple, or dict).
+> - Dict/nested schema mode: `input` contains per-parameter serialized values.
+
+---
+
+## YAML-Native Serializer Registry
+
+You can define serializers directly in YAML and reference them per eval.
+
+```yaml
+serializers:
+ query_schema:
+ schema: examples.serializers.util.Query
+ query_serial_fn:
+ serializer: examples.serializers.util.query_from_payload
+
+examples.serializers.util.query_users:
+ serializer: query_schema
+ dataset:
+ - case:
+ input:
+ sql: "SELECT name FROM users WHERE age > ?"
+ params: [30]
+
+examples.serializers.util.query_users_custom:
+ serializer: query_serial_fn
+ dataset:
+ - case:
+ input: "SELECT COUNT(*) AS total FROM users"
+```
+
+One-of rule for each serializer registry entry:
+- use `schema` or `serializer`
+- do not define both in the same entry
+
+Runnable example:
+
+```bash
+vowel examples/serializers/db_query_evals.yml
+```
+
---
## Advanced Examples
@@ -91,6 +135,9 @@ summary = (
.with_serializer({"process": {"user": User, "config": Config}})
.run()
)
+
+# Assertions can access serialized nested values
+# assertion: "input['user'].email.endswith('@a.com') and input['config'].timeout == 30"
```
### Custom Parsing Logic
diff --git a/docs/YAML_SPEC.md b/docs/YAML_SPEC.md
index f467ba6..b7ded00 100644
--- a/docs/YAML_SPEC.md
+++ b/docs/YAML_SPEC.md
@@ -10,7 +10,8 @@ fixtures:
fixture_name:
setup: module.setup_func # Import path to setup function
teardown: module.teardown_func # Import path to teardown (optional)
- scope: function # function | module | session
+ scope: case # preferred: case | eval | file
+ # aliases: function | module | session
kwargs: # Keyword arguments for setup function (optional)
key: value
@@ -116,18 +117,18 @@ fixtures:
db:
setup: myapp.fixtures.setup_db
teardown: myapp.fixtures.close_db
- scope: module # Created once, shared across all cases
+ scope: eval # Created once, shared across all cases
params:
db_name: test_db
cache:
setup: myapp.fixtures.setup_cache
- scope: session # Created once per run_evals call
+ scope: file # Created once per run_evals call
temp_dir:
setup: myapp.fixtures.create_temp_dir
teardown: myapp.fixtures.remove_temp_dir
- scope: function # Created fresh for each case (default)
+ scope: case # Created fresh for each case (default)
# Function depends on 'db' fixture
query_user:
@@ -185,9 +186,14 @@ summary = (
```
**Fixture scopes:**
-- `function` (default): Setup/teardown for **each** test case
-- `module`: Setup once per eval spec, teardown after all cases
-- `session`: Setup once per `run_evals()` call, teardown at end
+- Preferred names:
+ - `case` (default): Setup/teardown for **each** test case
+ - `eval`: Setup once per eval spec, teardown after all cases
+ - `file`: Setup once per `run_evals()` call, teardown at end
+- Backward-compatible aliases:
+ - `function` = `case`
+ - `module` = `eval`
+ - `session` = `file`
> See [FIXTURES.md](./FIXTURES.md) for the complete fixture guide including Python API patterns.
diff --git a/examples/basic_usage/evals.yml b/examples/basic_usage/evals.yml
index a7a9d94..c184afc 100644
--- a/examples/basic_usage/evals.yml
+++ b/examples/basic_usage/evals.yml
@@ -6,9 +6,11 @@
greet:
dataset:
- case:
+ id: greet_world
input: "World"
expected: "Hello, World!"
- case:
+ id: greet_alice
input: "Alice"
expected: "Hello, Alice!"
@@ -16,12 +18,15 @@ greet:
add:
dataset:
- case:
+ id: add_positive_pair
inputs: { x: 1, y: 2 }
expected: 3
- case:
+ id: add_zero_sum
inputs: { x: -5, y: 5 }
expected: 0
- case:
+ id: add_large_values
inputs: { x: 100, y: 200 }
expected: 300
@@ -29,12 +34,15 @@ add:
multiply:
dataset:
- case:
+ id: multiply_basic_product
inputs: [3, 4]
expected: 12
- case:
+ id: multiply_zero_factor
inputs: [0, 999]
expected: 0
- case:
+ id: multiply_negative_product
inputs: [-2, 5]
expected: -10
@@ -42,15 +50,19 @@ multiply:
factorial:
dataset:
- case:
+ id: factorial_zero_base
input: 0
expected: 1
- case:
+ id: factorial_five
input: 5
expected: 120
- case:
+ id: factorial_ten
input: 10
expected: 3628800
- case:
+ id: factorial_negative_raises
input: -1
raises: ValueError
match: "non-negative"
@@ -59,12 +71,15 @@ factorial:
is_even:
dataset:
- case:
+ id: is_even_four
input: 4
expected: true
- case:
+ id: is_even_seven
input: 7
expected: false
- case:
+ id: is_even_zero
input: 0
expected: true
@@ -72,9 +87,11 @@ is_even:
len:
dataset:
- case:
+ id: len_list_three
input: [1, 2, 3]
expected: 3
- case:
+ id: len_string_hello
input: "hello"
expected: 5
@@ -82,14 +99,17 @@ len:
math.sqrt:
dataset:
- case:
+ id: sqrt_16
input: 16
expected: 4.0
- case:
+ id: sqrt_9
input: 9
expected: 3.0
os.path.join:
dataset:
- case:
+ id: join_home_user
inputs: ["/home", "user"]
expected: "/home/user"
diff --git a/db.py b/examples/db_fixtures/db.py
similarity index 95%
rename from db.py
rename to examples/db_fixtures/db.py
index 078f78b..332aa8d 100644
--- a/db.py
+++ b/examples/db_fixtures/db.py
@@ -15,8 +15,10 @@
import logfire
+from vowel.monitoring import enable_monitoring
+
# enable observability (optional)
-# logfire.configure(service_name="db-fixture")
+enable_monitoring(service_name="db-fixture")
class NoTableError(Exception):
diff --git a/examples/evals/builtins.yml b/examples/evals/builtins.yml
index 46e1146..fbf388c 100644
--- a/examples/evals/builtins.yml
+++ b/examples/evals/builtins.yml
@@ -3,53 +3,55 @@
len:
dataset:
- - case: { input: [1, 2, 3], expected: 3 }
- - case: { input: "hello", expected: 5 }
- - case: { input: [], expected: 0 }
+ - case: { id: len_list_three, input: [1, 2, 3], expected: 3 }
+ - case: { id: len_string_hello, input: "hello", expected: 5 }
+ - case: { id: len_empty_list, input: [], expected: 0 }
abs:
dataset:
- - case: { input: -7, expected: 7 }
- - case: { input: 0, expected: 0 }
- - case: { input: 42, expected: 42 }
+ - case: { id: abs_negative, input: -7, expected: 7 }
+ - case: { id: abs_zero, input: 0, expected: 0 }
+ - case: { id: abs_positive, input: 42, expected: 42 }
sorted:
dataset:
- - case: { input: [3, 1, 2], expected: [1, 2, 3] }
- - case: { input: [5, 5, 5], expected: [5, 5, 5] }
- - case: { input: [], expected: [] }
+ - case: { id: sorted_unsorted_numbers, input: [3, 1, 2], expected: [1, 2, 3] }
+ - case: { id: sorted_all_equal, input: [5, 5, 5], expected: [5, 5, 5] }
+ - case: { id: sorted_empty_list, input: [], expected: [] }
sum:
dataset:
- - case: { input: [1, 2, 3], expected: 6 }
- - case: { input: [], expected: 0 }
+ - case: { id: sum_simple_list, input: [1, 2, 3], expected: 6 }
+ - case: { id: sum_empty_list, input: [], expected: 0 }
min:
dataset:
- - case: { input: [3, 1, 2], expected: 1 }
- - case: { input: [99], expected: 99 }
+ - case: { id: min_list_values, input: [3, 1, 2], expected: 1 }
+ - case: { id: min_singleton, input: [99], expected: 99 }
max:
dataset:
- - case: { input: [3, 1, 2], expected: 3 }
+ - case: { id: max_list_values, input: [3, 1, 2], expected: 3 }
math.sqrt:
dataset:
- - case: { input: 16, expected: 4.0 }
- - case: { input: 9, expected: 3.0 }
- - case: { input: 0, expected: 0.0 }
+ - case: { id: sqrt_16, input: 16, expected: 4.0 }
+ - case: { id: sqrt_9, input: 9, expected: 3.0 }
+ - case: { id: sqrt_0, input: 0, expected: 0.0 }
math.factorial:
dataset:
- - case: { input: 0, expected: 1 }
- - case: { input: 5, expected: 120 }
- - case: { input: 10, expected: 3628800 }
+ - case: { id: factorial_0, input: 0, expected: 1 }
+ - case: { id: factorial_5, input: 5, expected: 120 }
+ - case: { id: factorial_10, input: 10, expected: 3628800 }
os.path.join:
dataset:
- case:
+ id: join_two_parts
inputs: ["/home", "user"]
expected: "/home/user"
- case:
+ id: join_three_parts
inputs: ["/var", "log", "app.log"]
expected: "/var/log/app.log"
diff --git a/examples/evals/math.yml b/examples/evals/math.yml
index 05c8447..d0730b7 100644
--- a/examples/evals/math.yml
+++ b/examples/evals/math.yml
@@ -6,11 +6,12 @@ examples.evals.functions.fibonacci:
IsInt:
type: int
dataset:
- - case: { input: 0, expected: 0 }
- - case: { input: 1, expected: 1 }
- - case: { input: 10, expected: 55 }
- - case: { input: 20, expected: 6765 }
+ - case: { id: fib_0, input: 0, expected: 0 }
+ - case: { id: fib_1, input: 1, expected: 1 }
+ - case: { id: fib_10, input: 10, expected: 55 }
+ - case: { id: fib_20, input: 20, expected: 6765 }
- case:
+ id: fib_negative_raises
input: -1
raises: ValueError
match: "non-negative"
@@ -24,10 +25,11 @@ examples.evals.functions.calculate_bmi:
CorrectFormula:
assertion: "abs(output - input[0] / (input[1] ** 2)) < 0.1"
dataset:
- - case: { inputs: [70.0, 1.75], expected: 22.86 }
- - case: { inputs: [85.0, 1.80], expected: 26.23 }
- - case: { inputs: [60.0, 1.65], expected: 22.04 }
+ - case: { id: bmi_normal_weight, inputs: [70.0, 1.75], expected: 22.86 }
+ - case: { id: bmi_overweight_range, inputs: [85.0, 1.80], expected: 26.23 }
+ - case: { id: bmi_light_weight, inputs: [60.0, 1.65], expected: 22.04 }
- case:
+ id: bmi_zero_weight_raises
inputs: [0.0, 1.70]
raises: ValueError
match: "positive"
@@ -39,7 +41,7 @@ examples.evals.functions.clamp:
WithinBounds:
assertion: "input[1] <= output <= input[2]"
dataset:
- - case: { inputs: [5, 0, 10], expected: 5 }
- - case: { inputs: [-5, 0, 10], expected: 0 }
- - case: { inputs: [99, 0, 10], expected: 10 }
- - case: { inputs: [0, 0, 0], expected: 0 }
+ - case: { id: clamp_within_bounds, inputs: [5, 0, 10], expected: 5 }
+ - case: { id: clamp_below_min, inputs: [-5, 0, 10], expected: 0 }
+ - case: { id: clamp_above_max, inputs: [99, 0, 10], expected: 10 }
+ - case: { id: clamp_equal_bounds, inputs: [0, 0, 0], expected: 0 }
diff --git a/examples/evals/strings.yml b/examples/evals/strings.yml
index 07fb578..8866040 100644
--- a/examples/evals/strings.yml
+++ b/examples/evals/strings.yml
@@ -6,12 +6,12 @@ examples.evals.functions.is_palindrome:
IsBool:
type: bool
dataset:
- - case: { input: "racecar", expected: true }
- - case: { input: "hello", expected: false }
- - case: { input: "A man a plan a canal Panama", expected: true }
- - case: { input: "Was it a rat I saw", expected: true }
- - case: { input: "12321", expected: true }
- - case: { input: "", expected: true }
+ - case: { id: palindrome_racecar, input: "racecar", expected: true }
+ - case: { id: palindrome_hello_false, input: "hello", expected: false }
+ - case: { id: palindrome_phrase_panama, input: "A man a plan a canal Panama", expected: true }
+ - case: { id: palindrome_phrase_rat, input: "Was it a rat I saw", expected: true }
+ - case: { id: palindrome_numeric, input: "12321", expected: true }
+ - case: { id: palindrome_empty_string, input: "", expected: true }
examples.evals.functions.count_words:
evals:
@@ -20,10 +20,10 @@ examples.evals.functions.count_words:
NonNegative:
assertion: "output >= 0"
dataset:
- - case: { input: "Hello world from Python", expected: 4 }
- - case: { input: "Single", expected: 1 }
- - case: { input: "", expected: 0 }
- - case: { input: " spaces ", expected: 1 }
+ - case: { id: count_words_sentence, input: "Hello world from Python", expected: 4 }
+ - case: { id: count_words_single, input: "Single", expected: 1 }
+ - case: { id: count_words_empty, input: "", expected: 0 }
+ - case: { id: count_words_trim_spaces, input: " spaces ", expected: 1 }
examples.evals.functions.get_file_extension:
evals:
@@ -32,11 +32,11 @@ examples.evals.functions.get_file_extension:
LowercaseOnly:
pattern: "^[a-z0-9]*$"
dataset:
- - case: { input: "document.txt", expected: "txt" }
- - case: { input: "image.PNG", expected: "png" }
- - case: { input: "archive.tar.gz", expected: "gz" }
- - case: { input: "noextension", expected: "" }
- - case: { input: "script.py", expected: "py" }
+ - case: { id: ext_txt, input: "document.txt", expected: "txt" }
+ - case: { id: ext_png_uppercase, input: "image.PNG", expected: "png" }
+ - case: { id: ext_multi_dot_gz, input: "archive.tar.gz", expected: "gz" }
+ - case: { id: ext_no_extension, input: "noextension", expected: "" }
+ - case: { id: ext_py, input: "script.py", expected: "py" }
examples.evals.functions.extract_hashtags:
evals:
@@ -46,11 +46,14 @@ examples.evals.functions.extract_hashtags:
assertion: "all(tag.startswith('#') for tag in output) if output else True"
dataset:
- case:
+ id: hashtags_two_tags
input: "Learning #python and #coding today!"
expected: ["#python", "#coding"]
- case:
+ id: hashtags_none
input: "No hashtags here"
expected: []
- case:
+ id: hashtags_three_tags
input: "#AI #ML #DL"
expected: ["#AI", "#ML", "#DL"]
diff --git a/examples/evals/validation.yml b/examples/evals/validation.yml
index 6cf9dd3..6a19217 100644
--- a/examples/evals/validation.yml
+++ b/examples/evals/validation.yml
@@ -7,11 +7,11 @@ examples.evals.functions.validate_email:
type: bool
strict: true
dataset:
- - case: { input: "user@example.com", expected: true }
- - case: { input: "invalid.email", expected: false }
- - case: { input: "test@domain.co.uk", expected: true }
- - case: { input: "@nodomain.com", expected: false }
- - case: { input: "spaces @mail.com", expected: false }
+ - case: { id: email_valid_basic, input: "user@example.com", expected: true }
+ - case: { id: email_invalid_missing_at, input: "invalid.email", expected: false }
+ - case: { id: email_valid_subdomain, input: "test@domain.co.uk", expected: true }
+ - case: { id: email_invalid_missing_user, input: "@nodomain.com", expected: false }
+ - case: { id: email_invalid_with_space, input: "spaces @mail.com", expected: false }
examples.evals.functions.classify_age_group:
evals:
@@ -26,22 +26,23 @@ examples.evals.functions.classify_age_group:
(18 <= input < 65 and output == 'adult') or\
(input >= 65 and output == 'senior')
dataset:
- - case: { input: 5, expected: "child" }
- - case: { input: 15, expected: "teenager" }
- - case: { input: 30, expected: "adult" }
- - case: { input: 70, expected: "senior" }
- - case: { input: 12, expected: "child" }
- - case: { input: 18, expected: "adult" }
- - case: { input: 65, expected: "senior" }
+ - case: { id: age_5_child, input: 5, expected: "child" }
+ - case: { id: age_15_teenager, input: 15, expected: "teenager" }
+ - case: { id: age_30_adult, input: 30, expected: "adult" }
+ - case: { id: age_70_senior, input: 70, expected: "senior" }
+ - case: { id: age_12_child_boundary, input: 12, expected: "child" }
+ - case: { id: age_18_adult_boundary, input: 18, expected: "adult" }
+ - case: { id: age_65_senior_boundary, input: 65, expected: "senior" }
examples.evals.functions.format_phone:
evals:
PhoneFormat:
pattern: "^\\(\\d{3}\\) \\d{3}-\\d{4}$"
dataset:
- - case: { input: "5551234567", expected: "(555) 123-4567" }
- - case: { input: "2129876543", expected: "(212) 987-6543" }
+ - case: { id: phone_valid_555, input: "5551234567", expected: "(555) 123-4567" }
+ - case: { id: phone_valid_212, input: "2129876543", expected: "(212) 987-6543" }
- case:
+ id: phone_short_raises
input: "123"
raises: ValueError
match: "10 digits"
@@ -52,11 +53,14 @@ examples.evals.functions.parse_json:
type: dict
dataset:
- case:
+ id: json_valid_object
input: '{"key": "value", "n": 42}'
expected: { key: "value", n: 42 }
- case:
+ id: json_invalid_returns_empty
input: "invalid json"
expected: {}
- case:
+ id: json_nested_object
input: '{"nested": {"ok": true}}'
expected: { nested: { ok: true } }
diff --git a/examples/evaluators/evals.yml b/examples/evaluators/evals.yml
index 3b90ddc..7e85c64 100644
--- a/examples/evaluators/evals.yml
+++ b/examples/evaluators/evals.yml
@@ -11,12 +11,15 @@ validate_email:
strict: true
dataset:
- case:
+ id: email_valid_user_example
input: "user@example.com"
expected: true
- case:
+ id: email_invalid_missing_at
input: "invalid.email"
expected: false
- case:
+ id: email_invalid_missing_user
input: "@nodomain.com"
expected: false
@@ -31,12 +34,15 @@ calculate_discount:
assertion: "output <= input[0]"
dataset:
- case:
+ id: discount_20_percent
inputs: [100.0, 20.0]
expected: 80.0
- case:
+ id: discount_half_price
inputs: [50.0, 50.0]
expected: 25.0
- case:
+ id: discount_zero_percent
inputs: [200.0, 0.0]
expected: 200.0
@@ -48,9 +54,11 @@ format_phone:
pattern: "^\\(\\d{3}\\) \\d{3}-\\d{4}$"
dataset:
- case:
+ id: phone_555_format
input: "5551234567"
expected: "(555) 123-4567"
- case:
+ id: phone_212_format
input: "2129876543"
expected: "(212) 987-6543"
@@ -64,12 +72,15 @@ fibonacci:
duration: 0.01
dataset:
- case:
+ id: fib_0
input: 0
expected: 0
- case:
+ id: fib_10
input: 10
expected: 55
- case:
+ id: fib_20
input: 20
expected: 6765
@@ -83,11 +94,13 @@ extract_hashtags:
assertion: "all(tag.startswith('#') for tag in output) if output else True"
dataset:
- case:
+ id: hashtags_two
input: "Learning #python and #coding today!"
expected:
- "#python"
- "#coding"
- case:
+ id: hashtags_none
input: "No hashtags here"
expected: []
@@ -106,12 +119,12 @@ classify_age_group:
(18 <= input < 65 and output == 'adult') or\
(input >= 65 and output == 'senior')
dataset:
- - case: { input: 5, expected: "child" }
- - case: { input: 15, expected: "teenager" }
- - case: { input: 30, expected: "adult" }
- - case: { input: 70, expected: "senior" }
- - case: { input: 12, expected: "child" }
- - case: { input: 18, expected: "adult" }
+ - case: { id: age_5_child, input: 5, expected: "child" }
+ - case: { id: age_15_teenager, input: 15, expected: "teenager" }
+ - case: { id: age_30_adult, input: 30, expected: "adult" }
+ - case: { id: age_70_senior, input: 70, expected: "senior" }
+ - case: { id: age_12_child_boundary, input: 12, expected: "child" }
+ - case: { id: age_18_adult_boundary, input: 18, expected: "adult" }
# ─── Raises (Exception Testing) ──────────────────────────────
# Verify that specific exceptions are raised with optional message matching.
@@ -125,12 +138,15 @@ calculate_bmi:
assertion: "abs(output - input[0] / (input[1] ** 2)) < 0.1"
dataset:
- case:
+ id: bmi_normal_weight
inputs: [70.0, 1.75]
expected: 22.86
- case:
+ id: bmi_overweight_range
inputs: [85.0, 1.80]
expected: 26.23
- case:
+ id: bmi_zero_weight_raises
inputs: [0.0, 1.70]
raises: ValueError
match: "positive"
diff --git a/examples/fixtures/evals.yml b/examples/fixtures/evals.yml
index 683f9d9..952aa0a 100644
--- a/examples/fixtures/evals.yml
+++ b/examples/fixtures/evals.yml
@@ -12,9 +12,11 @@ write_and_count:
- tmp
dataset:
- case:
+ id: write_count_hello_world
input: "Hello World"
expected: 11
- case:
+ id: write_count_test
input: "Test"
expected: 4
@@ -23,9 +25,11 @@ count_users:
- db
dataset:
- case:
+ id: count_users_alice
input: "Alice"
expected: 2
- case:
+ id: count_users_bob
input: "Bob"
expected: 2
@@ -34,5 +38,6 @@ add_with_bonus:
- config
dataset:
- case:
+ id: add_with_bonus_basic
inputs: { a: 1, b: 2 }
expected: 13
diff --git a/examples/fluent_api/evals.yml b/examples/fluent_api/evals.yml
index 64448cf..94618c6 100644
--- a/examples/fluent_api/evals.yml
+++ b/examples/fluent_api/evals.yml
@@ -5,23 +5,23 @@ double:
IsInt:
type: int
dataset:
- - case: { input: 5, expected: 10 }
- - case: { input: 0, expected: 0 }
- - case: { input: -4, expected: -8 }
+ - case: { id: double_positive, input: 5, expected: 10 }
+ - case: { id: double_zero, input: 0, expected: 0 }
+ - case: { id: double_negative, input: -4, expected: -8 }
triple:
dataset:
- - case: { input: 3, expected: 9 }
- - case: { input: -1, expected: -3 }
+ - case: { id: triple_positive, input: 3, expected: 9 }
+ - case: { id: triple_negative, input: -1, expected: -3 }
reverse:
evals:
IsString:
type: str
dataset:
- - case: { input: "hello", expected: "olleh" }
- - case: { input: "abcba", expected: "abcba" }
- - case: { input: "", expected: "" }
+ - case: { id: reverse_hello, input: "hello", expected: "olleh" }
+ - case: { id: reverse_palindrome, input: "abcba", expected: "abcba" }
+ - case: { id: reverse_empty, input: "", expected: "" }
fizzbuzz:
evals:
@@ -30,8 +30,8 @@ fizzbuzz:
ValidOutput:
pattern: "^(Fizz|Buzz|FizzBuzz|\\d+)$"
dataset:
- - case: { input: 1, expected: "1" }
- - case: { input: 3, expected: "Fizz" }
- - case: { input: 5, expected: "Buzz" }
- - case: { input: 15, expected: "FizzBuzz" }
- - case: { input: 7, expected: "7" }
+ - case: { id: fizzbuzz_1, input: 1, expected: "1" }
+ - case: { id: fizzbuzz_3, input: 3, expected: "Fizz" }
+ - case: { id: fizzbuzz_5, input: 5, expected: "Buzz" }
+ - case: { id: fizzbuzz_15, input: 15, expected: "FizzBuzz" }
+ - case: { id: fizzbuzz_7, input: 7, expected: "7" }
diff --git a/examples/serializers/__init__.py b/examples/serializers/__init__.py
new file mode 100644
index 0000000..8fbeb8d
--- /dev/null
+++ b/examples/serializers/__init__.py
@@ -0,0 +1 @@
+"""Native YAML serializer + fixture example package."""
diff --git a/examples/serializers/app.py b/examples/serializers/app.py
new file mode 100644
index 0000000..6b05f15
--- /dev/null
+++ b/examples/serializers/app.py
@@ -0,0 +1,18 @@
+"""Functions under test for native serializer + fixture example."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .defn import Query
+from .fixtures import DbConnection
+
+
+def query_users(query: Query, *, db: DbConnection) -> list[dict[str, Any]]:
+    """Schema-mode example: run ``query`` (input dict -> Query via serializer schema) on ``db``."""
+    return db.execute_query(query)
+
+
+def query_users_custom(query: Query, *, db: DbConnection) -> list[dict[str, Any]]:
+    """serial_fn-mode example: run ``query`` (built by a custom serializer function) on ``db``."""
+    return db.execute_query(query)
diff --git a/examples/serializers/db_query_evals.yml b/examples/serializers/db_query_evals.yml
new file mode 100644
index 0000000..9cafce2
--- /dev/null
+++ b/examples/serializers/db_query_evals.yml
@@ -0,0 +1,53 @@
+# yaml-language-server: $schema=../../vowel-schema.json
+
+
+serializers:
+  query_schema:
+    schema: examples.serializers.defn.Query
+  query_serial_fn:
+    serializer: examples.serializers.defn.query_from_payload
+
+fixtures:
+  db:
+    cls: examples.serializers.fixtures.DbConnection
+    kwargs:
+      db_path: ":memory:"
+    teardown: examples.serializers.fixtures.DbConnection.close
+    scope: module
+
+examples.serializers.app.query_users:
+  fixture:
+    - db
+  serializer: query_schema
+  evals:
+    ReturnsRows:
+      type: list[dict[str, typing.Any]]
+    CheckSqlIsNotEmpty:
+      assertion: "input.sql is not None"
+  dataset:
+    - case:
+        id: by_age_threshold
+        input:
+          sql: "SELECT name FROM users WHERE age > ? ORDER BY age"
+          params: [30]
+        assertion: "output == [{'name': 'Bob'}, {'name': 'Cara'}]"
+    - case:
+        id: invalid_table_raises
+        input:
+          sql: "SELECT * FROM ghost_table"
+          params: []
+        raises: any
+
+examples.serializers.app.query_users_custom:
+  fixture:
+    - db
+  serializer: query_serial_fn
+  evals:
+    ReturnsRows:
+      type: list[dict[str, typing.Any]]
+  dataset:
+    - case:
+        id: count_users_from_text
+        input: "SELECT COUNT(*) AS total FROM users"
+        expected:
+          - {total: 3}
diff --git a/examples/serializers/defn.py b/examples/serializers/defn.py
new file mode 100644
index 0000000..b0013c3
--- /dev/null
+++ b/examples/serializers/defn.py
@@ -0,0 +1,41 @@
+"""Serializer models and helpers for the native YAML serializer example."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class Query(BaseModel):
+    """SQL statement plus the values to bind to it; the payload type used by the example evals."""
+
+    sql: str  # SQL statement text
+    params: list[Any] = Field(default_factory=list)  # bound values; a new empty list by default
+
+
+def query_from_payload(payload: dict[str, Any]) -> Query:
+    """Build a ``Query`` from a raw case payload (serial_fn mode example).
+
+    The value is taken from ``payload["input"]``, or from ``payload["inputs"]`` when
+    the former is missing or ``None``. Accepted shapes (anything else raises ``ValueError``):
+    - {"input": "SELECT ..."}
+    - {"input": {"sql": "SELECT ...", "params": [...]}}
+    """
+    raw = payload.get("input")
+    raw = payload.get("inputs") if raw is None else raw
+
+    if isinstance(raw, str):
+        return Query(sql=raw)
+    if not isinstance(raw, dict):
+        raise ValueError("Unsupported query payload format")
+
+    statement = raw.get("sql")
+    if not isinstance(statement, str):
+        raise ValueError("Expected 'sql' to be a string in query payload")
+
+    bound = raw.get("params", [])
+    if not isinstance(bound, list):
+        raise ValueError("Expected 'params' to be a list in query payload")
+
+    return Query(sql=statement, params=bound)
diff --git a/examples/serializers/fixtures.py b/examples/serializers/fixtures.py
new file mode 100644
index 0000000..d917f25
--- /dev/null
+++ b/examples/serializers/fixtures.py
@@ -0,0 +1,39 @@
+"""Fixture utilities for the native YAML serializer example."""
+
+from __future__ import annotations
+
+import sqlite3
+from typing import Any
+
+from .defn import Query
+
+
+class DbConnection:
+    """Small SQLite-backed fixture object that vowel injects into the functions under test."""
+
+    def __init__(self, db_path: str = ":memory:"):
+        # check_same_thread=False: vowel can run cases in worker threads that share this object.
+        connection = sqlite3.connect(db_path, check_same_thread=False)
+        connection.row_factory = sqlite3.Row  # lets execute_query() turn each row into a dict
+        self.conn = connection
+        self._seed()
+
+    def _seed(self) -> None:
+        """Ensure the ``users`` table exists and reset it to the three sample rows."""
+        seed_rows = [("Alice", 28), ("Bob", 34), ("Cara", 41)]
+        cursor = self.conn.cursor()
+        cursor.execute(
+            "CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY, name TEXT, age INTEGER)"
+        )
+        cursor.execute("DELETE FROM users")
+        cursor.executemany("INSERT INTO users (name, age) VALUES (?, ?)", seed_rows)
+        self.conn.commit()
+
+    def execute_query(self, query: Query) -> list[dict[str, Any]]:
+        """Run ``query.sql`` with ``query.params`` and return every row as a plain dict."""
+        cursor = self.conn.cursor()
+        cursor.execute(query.sql, query.params)
+        return [dict(record) for record in cursor.fetchall()]
+
+    def close(self) -> None:
+        self.conn.close()
diff --git a/examples/serializers/util.py b/examples/serializers/util.py
new file mode 100644
index 0000000..e69de29
diff --git a/pyproject.toml b/pyproject.toml
index bc13a87..09a7072 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "vowel"
-version = "0.3.5"
+version = "0.4.0"
description = "A modular evaluation framework for testing functions with YAML-based specifications"
readme = "README.md"
requires-python = ">=3.10"
diff --git a/quality-judge/evals.py b/quality-judge/evals.py
new file mode 100644
index 0000000..c9dd454
--- /dev/null
+++ b/quality-judge/evals.py
@@ -0,0 +1,33 @@
+import os
+import pathlib
+
+import dotenv
+
+from vowel.codemode import CodeModeGenerator
+from vowel.runner import Function
+
+dotenv.load_dotenv()  # NOTE(review): presumably loads a local .env before the lookups below
+
+SPEC_MODEL = os.getenv("SPEC_MODEL")  # None when the variable is unset
+EXPLORATION_MODEL = os.getenv("EXPLORATION_MODEL")  # None when the variable is unset
+
+generator = CodeModeGenerator(
+    spec_model=SPEC_MODEL,
+    exploration_model=EXPLORATION_MODEL,
+    generation_id="largest_color_value_judge_spec_quality",
+)
+
+
+async def generate_spec(fn: Function):
+    # Access fn.impl up front as a check that the code can compile (Monty executes it anyway).
+    _ = fn.impl
+    result = await generator.generate(fn, save_to_file=True)
+    print(result)
+    generator.print_total_cost()
+    return result.yaml_spec
+
+
+async def generate_spec_mock(fn: Function):
+    """Return the checked-in largestPathValue spec next to this module; ``fn`` is ignored."""
+    spec_file = pathlib.Path(__file__).with_name("largestPathValue_evals.yml")
+    return spec_file.read_text(encoding="utf-8")
diff --git a/quality-judge/largestPathValue_evals.yml b/quality-judge/largestPathValue_evals.yml
new file mode 100644
index 0000000..b21778e
--- /dev/null
+++ b/quality-judge/largestPathValue_evals.yml
@@ -0,0 +1,762 @@
+# yaml-language-server: $schema=../vowel-schema.json
+
+largestPathValue:
+ evals:
+ ReturnType:
+ type: int
+ ResultRange:
+ assertion: output >= -1
+ dataset:
+ - case:
+ id: example_abaca
+ inputs:
+ colors: abaca
+ edges:
+ - - 0
+ - 1
+ - - 0
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ expected: 3
+ duration: 10.0
+ - case:
+ id: example_cycle_self_loop
+ inputs:
+ colors: a
+ edges:
+ - - 0
+ - 0
+ expected: -1
+ duration: 10.0
+ - case:
+ id: single_node_no_edges_a
+ inputs:
+ colors: a
+ edges: []
+ expected: 1
+ duration: 10.0
+ - case:
+ id: single_node_no_edges_z
+ inputs:
+ colors: z
+ edges: []
+ expected: 1
+ duration: 10.0
+ - case:
+ id: two_nodes_same_color
+ inputs:
+ colors: aa
+ edges:
+ - - 0
+ - 1
+ expected: 2
+ duration: 10.0
+ - case:
+ id: two_nodes_diff_color
+ inputs:
+ colors: ab
+ edges:
+ - - 0
+ - 1
+ expected: 1
+ duration: 10.0
+ - case:
+ id: linear_all_same_color
+ inputs:
+ colors: aaaa
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ expected: 4
+ duration: 10.0
+ - case:
+ id: linear_alternating_colors
+ inputs:
+ colors: abab
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ expected: 2
+ duration: 10.0
+ - case:
+ id: linear_five_same_color
+ inputs:
+ colors: aaaaa
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ expected: 5
+ duration: 10.0
+ - case:
+ id: linear_abcba
+ inputs:
+ colors: abcba
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ expected: 2
+ duration: 10.0
+ - case:
+ id: linear_a_then_bbbbb
+ inputs:
+ colors: abbbbb
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ - - 4
+ - 5
+ expected: 5
+ duration: 10.0
+ - case:
+ id: linear_aba
+ inputs:
+ colors: aba
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ expected: 2
+ duration: 10.0
+ - case:
+ id: dag_fork_aab
+ inputs:
+ colors: aab
+ edges:
+ - - 0
+ - 1
+ - - 0
+ - 2
+ - - 1
+ - 2
+ expected: 2
+ duration: 10.0
+ - case:
+ id: diamond_abba
+ inputs:
+ colors: abba
+ edges:
+ - - 0
+ - 1
+ - - 0
+ - 2
+ - - 1
+ - 3
+ - - 2
+ - 3
+ expected: 2
+ duration: 10.0
+ - case:
+ id: diamond_all_same
+ inputs:
+ colors: aaaa
+ edges:
+ - - 0
+ - 1
+ - - 0
+ - 2
+ - - 1
+ - 3
+ - - 2
+ - 3
+ expected: 3
+ duration: 10.0
+ - case:
+ id: diamond_all_distinct
+ inputs:
+ colors: hecb
+ edges:
+ - - 0
+ - 1
+ - - 0
+ - 2
+ - - 1
+ - 3
+ - - 2
+ - 3
+ expected: 1
+ duration: 10.0
+ - case:
+ id: no_edges_abc
+ inputs:
+ colors: abc
+ edges: []
+ expected: 1
+ duration: 10.0
+ - case:
+ id: no_edges_abcde
+ inputs:
+ colors: abcde
+ edges: []
+ expected: 1
+ duration: 10.0
+ - case:
+ id: two_components_aabba
+ inputs:
+ colors: aabba
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 3
+ - 4
+ expected: 2
+ duration: 10.0
+ - case:
+ id: branching_aabba
+ inputs:
+ colors: aabba
+ edges:
+ - - 0
+ - 2
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 2
+ - 4
+ expected: 2
+ duration: 10.0
+ - case:
+ id: branching_single_color_five
+ inputs:
+ colors: aaaaa
+ edges:
+ - - 0
+ - 1
+ - - 0
+ - 2
+ - - 1
+ - 3
+ - - 2
+ - 4
+ expected: 3
+ duration: 10.0
+ - case:
+ id: fan_in_sink_aaab
+ inputs:
+ colors: aaab
+ edges:
+ - - 0
+ - 3
+ - - 1
+ - 3
+ - - 2
+ - 3
+ expected: 1
+ duration: 10.0
+ - case:
+ id: fan_in_sink_aaaab
+ inputs:
+ colors: aaaab
+ edges:
+ - - 0
+ - 4
+ - - 1
+ - 4
+ - - 2
+ - 4
+ - - 3
+ - 4
+ expected: 1
+ duration: 10.0
+ - case:
+ id: cycle_three_nodes
+ inputs:
+ colors: abc
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 0
+ expected: -1
+ duration: 10.0
+ - case:
+ id: cycle_with_extra_nodes
+ inputs:
+ colors: abcd
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 1
+ - - 1
+ - 3
+ expected: -1
+ duration: 10.0
+ - case:
+ id: two_node_cycle
+ inputs:
+ colors: ab
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 0
+ expected: -1
+ duration: 10.0
+ - case:
+ id: back_edge_cycle
+ inputs:
+ colors: abcde
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ - - 4
+ - 1
+ expected: -1
+ duration: 10.0
+ - case:
+ id: self_loop_non_first_node
+ inputs:
+ colors: abc
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 2
+ expected: -1
+ duration: 10.0
+ - case:
+ id: empty_string_no_edges
+ inputs:
+ colors: ''
+ edges: []
+ expected: 0
+ duration: 10.0
+ - case:
+ id: multi_edge_same_pair
+ inputs:
+ colors: ab
+ edges:
+ - - 0
+ - 1
+ - - 0
+ - 1
+ expected: 1
+ duration: 10.0
+ - case:
+ id: all_26_colors_chain
+ inputs:
+ colors: abcdefghijklmnopqrstuvwxyz
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ - - 4
+ - 5
+ - - 5
+ - 6
+ - - 6
+ - 7
+ - - 7
+ - 8
+ - - 8
+ - 9
+ - - 9
+ - 10
+ - - 10
+ - 11
+ - - 11
+ - 12
+ - - 12
+ - 13
+ - - 13
+ - 14
+ - - 14
+ - 15
+ - - 15
+ - 16
+ - - 16
+ - 17
+ - - 17
+ - 18
+ - - 18
+ - 19
+ - - 19
+ - 20
+ - - 20
+ - 21
+ - - 21
+ - 22
+ - - 22
+ - 23
+ - - 23
+ - 24
+ - - 24
+ - 25
+ expected: 1
+ duration: 10.0
+ - case:
+ id: alternating_ab_chain_10
+ inputs:
+ colors: ababababab
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ - - 4
+ - 5
+ - - 5
+ - 6
+ - - 6
+ - 7
+ - - 7
+ - 8
+ - - 8
+ - 9
+ expected: 5
+ duration: 10.0
+ - case:
+ id: all_same_color_linear_equals_n
+ inputs:
+ colors: aaaaaa
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ - - 4
+ - 5
+ expected: 6
+ duration: 10.0
+ - case:
+ id: list_of_chars_input
+ inputs:
+ colors:
+ - a
+ - b
+ - c
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ expected: 1
+ duration: 10.0
+ - case:
+ id: cycle_returns_minus_one
+ inputs:
+ colors: abcde
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ - - 4
+ - 2
+ assertion: output == -1
+ duration: 10.0
+ - case:
+ id: dag_result_at_least_one
+ inputs:
+ colors: abcdef
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 3
+ - 4
+ - - 4
+ - 5
+ assertion: output >= 1
+ duration: 10.0
+ - case:
+ id: single_path_bounded_by_length
+ inputs:
+ colors: abcabc
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ - - 4
+ - 5
+ assertion: output >= 1 and output <= 6
+ duration: 10.0
+ - case:
+ id: w_shaped_dag
+ inputs:
+ colors: aabaa
+ edges:
+ - - 0
+ - 2
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 2
+ - 4
+ assertion: output >= 1 and output <= 5
+ duration: 10.0
+ - case:
+ id: two_sources_one_sink_same_color
+ inputs:
+ colors: aaa
+ edges:
+ - - 0
+ - 2
+ - - 1
+ - 2
+ assertion: output == 2
+ duration: 10.0
+ - case:
+ id: long_path_single_color_at_ends
+ inputs:
+ colors: abcda
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ assertion: output == 2
+ duration: 10.0
+ - case:
+ id: star_topology_center_unique
+ inputs:
+ colors: baaaa
+ edges:
+ - - 0
+ - 1
+ - - 0
+ - 2
+ - - 0
+ - 3
+ - - 0
+ - 4
+ assertion: output == 1
+ duration: 10.0
+ - case:
+ id: chain_mostly_b_with_one_a
+ inputs:
+ colors: bbbba
+ edges:
+ - - 0
+ - 1
+ - - 1
+ - 2
+ - - 2
+ - 3
+ - - 3
+ - 4
+ expected: 4
+ duration: 10.0
+ - case:
+ id: error_none_colors
+ inputs:
+ colors: null
+ edges:
+ - - 0
+ - 1
+ raises: TypeError
+ match: has no len
+ - case:
+ id: error_none_edges
+ inputs:
+ colors: abc
+ edges: null
+ raises: TypeError
+ match: not iterable
+ - case:
+ id: error_int_colors
+ inputs:
+ colors: 123
+ edges:
+ - - 0
+ - 1
+ raises: TypeError
+ match: has no len
+ - case:
+ id: error_out_of_range_edge
+ inputs:
+ colors: ab
+ edges:
+ - - 0
+ - 5
+ raises: IndexError
+ match: list index out of range
+ - case:
+ id: error_empty_colors_with_edges
+ inputs:
+ colors: ''
+ edges:
+ - - 0
+ - 1
+ raises: IndexError
+ match: list index out of range
+ - case:
+ id: error_none_in_edge_list
+ inputs:
+ colors: abc
+ edges:
+ - null
+ - - 0
+ - 1
+ raises: TypeError
+ match: cannot unpack non-iterable NoneType
+ - case:
+ id: error_int_in_edge_list
+ inputs:
+ colors: abc
+ edges:
+ - 1
+ - - 0
+ - 1
+ raises: TypeError
+ match: cannot unpack non-iterable int
+ - case:
+ id: error_uppercase_color
+ inputs:
+ colors: A
+ edges: []
+ raises: IndexError
+ match: list assignment index out of range
+ - case:
+ id: error_string_in_edge_list
+ inputs:
+ colors: abc
+ edges:
+ - ab
+ - - 0
+ - 1
+ raises: TypeError
+ match: list indices must be integers or slices
+ - case:
+ id: error_typeerror_0
+ inputs:
+ - null
+ - - - 0
+ - 1
+ raises: TypeError
+ - case:
+ id: error_typeerror_1
+ inputs:
+ - abc
+ - null
+ raises: TypeError
+ - case:
+ id: error_typeerror_2
+ inputs:
+ - 123
+ - - - 0
+ - 1
+ raises: TypeError
+ - case:
+ id: error_indexerror_3
+ inputs:
+ - ab
+ - - - 0
+ - 5
+ raises: IndexError
+ - case:
+ id: error_indexerror_4
+ inputs:
+ - ''
+ - - - 0
+ - 1
+ raises: IndexError
+ - case:
+ id: error_typeerror_5
+ inputs:
+ - abc
+ - - null
+ - - 0
+ - 1
+ raises: TypeError
+ - case:
+ id: error_typeerror_6
+ inputs:
+ - abc
+ - - 1
+ - - 0
+ - 1
+ raises: TypeError
+ - case:
+ id: error_valueerror_7
+ inputs:
+ - ab
+ - - - 0
+ raises: ValueError
+ - case:
+ id: error_indexerror_8
+ inputs:
+ - A
+ - []
+ raises: IndexError
+ - case:
+ id: error_typeerror_9
+ inputs:
+ - abc
+ - - ab
+ - - 0
+ - 1
+ raises: TypeError
\ No newline at end of file
diff --git a/quality-judge/largest_color_value_judge.yml b/quality-judge/largest_color_value_judge.yml
new file mode 100644
index 0000000..9e48340
--- /dev/null
+++ b/quality-judge/largest_color_value_judge.yml
@@ -0,0 +1,330 @@
+# yaml-language-server: $schema=/Users/mert/.vowel/vowel-schema_035.json
+
+# costs: https://gist.github.com/fswair/7631878d75d6ed18a4fe3cb9b579600f#file-terminal-txt-L1309
+
+evals.generate_spec:
+ evals:
+ EvalSpecCoversFunction:
+ rubric: |
+ You are grading the quality of a generated evaluation spec against the given function source.
+
+ Your task: assess how well the generated spec covers the function's real behavior, edge cases,
+ error paths, and output contracts. Do not reward verbosity; reward correctness and meaningful coverage.
+
+ Calibration rule (critical):
+ - If trusted run results indicate the generated spec achieved 100% pass rate, do NOT confidently claim
+ that specific passing cases are "wrong" just from static suspicion.
+ - In that situation, treat your uncertainty as potential judge hallucination and shift focus to quality:
+ uniqueness, diversity, coverage depth, evaluator precision, and contract alignment.
+ - You may still flag a case as problematic only when there is strong internal evidence (clear contradiction,
+ invalid expectation syntax, impossible assertion, or direct mismatch with the given function contract).
+
+ Score using this weighted rubric (0-100 total):
+
+ 1) Functional Coverage (0-35)
+ - Core happy paths are tested.
+ - Boundary/edge conditions are tested (empty/min/max/null-ish/degenerate where relevant).
+ - Important behavior branches in source are represented by cases.
+ - Missing major branch or core behavior: deduct heavily.
+
+ 2) Error and Guard Coverage (0-20)
+ - All meaningful raise/guard paths are represented.
+ - Exception type expectations are accurate.
+ - Error-message match checks are used when meaningful.
+ - Missing critical error path: major deduction.
+
+ 3) Case Quality and Dataset Design (0-20)
+ - Cases are concrete, non-redundant, and behavior-focused.
+ - Inputs are realistic and varied (not trivial permutations only).
+ - Dataset is unique, diverse, and comprehensive (not AI slop or cosmetic rewrites of same scenario).
+ - Expected values/assertions are specific and verifiable.
+ - No vague, tautological, or self-fulfilling checks.
+
+ 4) Evaluator Quality (0-15)
+ - Evaluators are appropriate for the function contract (expected/raises/type/assertion/pattern/etc).
+ - Assertions are precise and meaningful (not always-true).
+ - Type and semantic checks are correctly used where needed.
+ - Evaluators reflect what was actually observed from execution evidence when such evidence is provided.
+
+ 5) Spec Correctness and Maintainability (0-10)
+ - YAML is structurally valid and unambiguous.
+ - Case naming/readability is good.
+ - Spec is concise but complete.
+
+ Hard-fail conditions (cap score at 40 max):
+ - Core function behavior is not tested.
+ - Critical error/guard behavior is absent.
+ - Evaluators are mostly weak/tautological/misaligned.
+ - Spec appears invalid or internally inconsistent.
+ - Do NOT trigger hard-fail based only on speculative "this case should fail" claims that conflict with
+ trusted 100% pass execution evidence.
+
+ Return format (mandatory):
+ 1) Criterion scores with reasons:
+ - Functional Coverage: /35
+ Reason:
+ - Error and Guard Coverage: /20
+ Reason:
+ - Case Quality and Dataset Design: /20
+ Reason:
+ - Evaluator Quality: /15
+ Reason:
+ - Spec Correctness and Maintainability: /10
+ Reason:
+
+ 2) Final numeric score: /100
+ - Must equal the sum of criterion scores.
+
+ 3) 3-6 key findings (strengths/weaknesses), each tied to specific evidence.
+
+ 4) Top 3 actionable improvements, prioritized by impact.
+
+ Important:
+ - Do NOT return only a final score.
+ - Every criterion MUST include both score and explicit reason.
+ - If pass-rate evidence is 100%, prioritize quality diagnostics over speculative invalid-case accusations.
+ - When you suspect a case is wrong despite pass evidence, mark it as "low-confidence concern" unless
+ you can cite direct, concrete contradiction from the source/contract.
+ include:
+ - input
+ config:
+ model: $JUDGE_MODEL
+ max_tokens: 4096
+ dataset:
+ - case:
+ input:
+ name: largestPathValue
+ description: |
+ Largest color value in a directed graph
+
+ There is a directed graph of n colored nodes and m edges. The nodes are numbered from 0 to n - 1.
+
+ You are given a string colors where colors[i] is a lowercase English letter representing the color of the ith node in this graph (0-indexed). You are also given a 2D array edges where edges[j] = [aj, bj] indicates that there is a directed edge from node aj to node bj.
+
+ A valid path in the graph is a sequence of nodes x1 -> x2 -> x3 -> ... -> xk such that there is a directed edge from xi to xi+1 for every 1 <= i < k. The color value of the path is the number of nodes that are colored the most frequently occurring color along that path.
+
+ Return the largest color value of any valid path in the given graph, or -1 if the graph contains a cycle.
+
+
+ Input: colors = "abaca", edges = [[0,1],[0,2],[2,3],[3,4]]
+ Output: 3
+ Explanation: The path 0 -> 2 -> 3 -> 4 contains 3 nodes that are colored "a" (red in the above image).
+
+ Input: colors = "a", edges = [[0,0]]
+ Output: -1
+ Explanation: There is a cycle from 0 to 0.
+
+
+ Constraints:
+ n == colors.length
+ m == edges.length
+ 1 <= n <= 10^5
+ 0 <= m <= 10^5
+ colors consists of lowercase English letters.
+ 0 <= aj, bj < n
+ code: |
+ from collections import deque
+
+
+ def largestPathValue(colors: str, edges: list[list[int]]) -> int:
+ n = len(colors)
+ graph = [[] for _ in range(n)]
+ indegree = [0] * n
+
+ for u, v in edges:
+ graph[u].append(v)
+ indegree[v] += 1
+
+ dp = [[0] * 26 for _ in range(n)]
+ queue = deque()
+
+ for i in range(n):
+ if indegree[i] == 0:
+ queue.append(i)
+ dp[i][ord(colors[i]) - ord("a")] = 1
+
+ visited = 0
+ answer = 0
+
+ while queue:
+ node = queue.popleft()
+ visited += 1
+ answer = max(answer, max(dp[node]))
+
+ for nei in graph[node]:
+ for c in range(26):
+ dp[nei][c] = max(
+ dp[nei][c],
+ dp[node][c] + (1 if c == ord(colors[nei]) - ord("a") else 0),
+ )
+
+ indegree[nei] -= 1
+ if indegree[nei] == 0:
+ queue.append(nei)
+
+ return answer if visited == n else -1
+
+
+
+evals.generate_spec_mock:
+ evals:
+ EvalSpecCoversFunction:
+ rubric: |
+ You are grading the quality of a generated evaluation spec against the given function source.
+
+ Your task: assess how well the generated spec covers the function's real behavior, edge cases,
+ error paths, and output contracts. Do not reward verbosity; reward correctness and meaningful coverage.
+
+ Calibration rule (critical):
+ - If trusted run results indicate the generated spec achieved 100% pass rate, do NOT confidently claim
+ that specific passing cases are "wrong" just from static suspicion.
+ - In that situation, treat your uncertainty as potential judge hallucination and shift focus to quality:
+ uniqueness, diversity, coverage depth, evaluator precision, and contract alignment.
+ - You may still flag a case as problematic only when there is strong internal evidence (clear contradiction,
+ invalid expectation syntax, impossible assertion, or direct mismatch with the given function contract).
+
+ Score using this weighted rubric (0-100 total):
+
+ 1) Functional Coverage (0-35)
+ - Core happy paths are tested.
+ - Boundary/edge conditions are tested (empty/min/max/null-ish/degenerate where relevant).
+ - Important behavior branches in source are represented by cases.
+ - Missing major branch or core behavior: deduct heavily.
+
+ 2) Error and Guard Coverage (0-20)
+ - All meaningful raise/guard paths are represented.
+ - Exception type expectations are accurate.
+ - Error-message match checks are used when meaningful.
+ - Missing critical error path: major deduction.
+
+ 3) Case Quality and Dataset Design (0-20)
+ - Cases are concrete, non-redundant, and behavior-focused.
+ - Inputs are realistic and varied (not trivial permutations only).
+ - Dataset is unique, diverse, and comprehensive (not AI slop or cosmetic rewrites of same scenario).
+ - Expected values/assertions are specific and verifiable.
+ - No vague, tautological, or self-fulfilling checks.
+
+ 4) Evaluator Quality (0-15)
+ - Evaluators are appropriate for the function contract (expected/raises/type/assertion/pattern/etc).
+ - Assertions are precise and meaningful (not always-true).
+ - Type and semantic checks are correctly used where needed.
+ - Evaluators reflect what was actually observed from execution evidence when such evidence is provided.
+
+ 5) Spec Correctness and Maintainability (0-10)
+ - YAML is structurally valid and unambiguous.
+ - Case naming/readability is good.
+ - Spec is concise but complete.
+
+ Hard-fail conditions (cap score at 40 max):
+ - Core function behavior is not tested.
+ - Critical error/guard behavior is absent.
+ - Evaluators are mostly weak/tautological/misaligned.
+ - Spec appears invalid or internally inconsistent.
+ - Do NOT trigger hard-fail based only on speculative "this case should fail" claims that conflict with
+ trusted 100% pass execution evidence.
+
+ Return format (mandatory):
+ 1) Criterion scores with reasons:
+ - Functional Coverage: /35
+ Reason:
+ - Error and Guard Coverage: /20
+ Reason:
+ - Case Quality and Dataset Design: /20
+ Reason:
+ - Evaluator Quality: /15
+ Reason:
+ - Spec Correctness and Maintainability: /10
+ Reason:
+
+ 2) Final numeric score: /100
+ - Must equal the sum of criterion scores.
+
+ 3) 3-6 key findings (strengths/weaknesses), each tied to specific evidence.
+
+ 4) Top 3 actionable improvements, prioritized by impact.
+
+ Important:
+ - Do NOT return only a final score.
+ - Every criterion MUST include both score and explicit reason.
+ - If pass-rate evidence is 100%, prioritize quality diagnostics over speculative invalid-case accusations.
+ - When you suspect a case is wrong despite pass evidence, mark it as "low-confidence concern" unless
+ you can cite direct, concrete contradiction from the source/contract.
+ include:
+ - input
+ config:
+ model: $JUDGE_MODEL
+ max_tokens: 4096
+ dataset:
+ - case:
+ input:
+ name: largestPathValue
+ description: |
+ Largest color value in a directed graph
+
+ There is a directed graph of n colored nodes and m edges. The nodes are numbered from 0 to n - 1.
+
+ You are given a string colors where colors[i] is a lowercase English letter representing the color of the ith node in this graph (0-indexed). You are also given a 2D array edges where edges[j] = [aj, bj] indicates that there is a directed edge from node aj to node bj.
+
+ A valid path in the graph is a sequence of nodes x1 -> x2 -> x3 -> ... -> xk such that there is a directed edge from xi to xi+1 for every 1 <= i < k. The color value of the path is the number of nodes that are colored the most frequently occurring color along that path.
+
+ Return the largest color value of any valid path in the given graph, or -1 if the graph contains a cycle.
+
+
+ Input: colors = "abaca", edges = [[0,1],[0,2],[2,3],[3,4]]
+ Output: 3
+ Explanation: The path 0 -> 2 -> 3 -> 4 contains 3 nodes that are colored "a" (red in the above image).
+
+ Input: colors = "a", edges = [[0,0]]
+ Output: -1
+ Explanation: There is a cycle from 0 to 0.
+
+
+ Constraints:
+ n == colors.length
+ m == edges.length
+ 1 <= n <= 10^5
+ 0 <= m <= 10^5
+ colors consists of lowercase English letters.
+ 0 <= aj, bj < n
+ code: |
+ from collections import deque
+
+
+ def largestPathValue(colors: str, edges: list[list[int]]) -> int:
+ n = len(colors)
+ graph = [[] for _ in range(n)]
+ indegree = [0] * n
+
+ for u, v in edges:
+ graph[u].append(v)
+ indegree[v] += 1
+
+ dp = [[0] * 26 for _ in range(n)]
+ queue = deque()
+
+ for i in range(n):
+ if indegree[i] == 0:
+ queue.append(i)
+ dp[i][ord(colors[i]) - ord("a")] = 1
+
+ visited = 0
+ answer = 0
+
+ while queue:
+ node = queue.popleft()
+ visited += 1
+ answer = max(answer, max(dp[node]))
+
+ for nei in graph[node]:
+ for c in range(26):
+ dp[nei][c] = max(
+ dp[nei][c],
+ dp[node][c] + (1 if c == ord(colors[nei]) - ord("a") else 0),
+ )
+
+ indegree[nei] -= 1
+ if indegree[nei] == 0:
+ queue.append(nei)
+
+ return answer if visited == n else -1
+
diff --git a/quality-judge/runner.py b/quality-judge/runner.py
new file mode 100644
index 0000000..587b79c
--- /dev/null
+++ b/quality-judge/runner.py
@@ -0,0 +1,22 @@
+from vowel.monitoring import enable_monitoring
+from vowel.runner import Function, RunEvals
+
+enable_monitoring(
+ logfire_enabled=True,
+ service_name="quality-judge",
+)
+
+runner = RunEvals.from_file("largest_color_value_judge.yml")
+
+main_runner = runner.with_serializer({"evals.generate_spec": Function}).filter(
+ "evals.generate_spec"
+)
+
+# mock_runner = runner.with_serializer({"evals.generate_spec_mock": Function}).filter(
+# "evals.generate_spec_mock"
+# )
+
+
+summary = main_runner.run()
+
+summary.print()
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
index 17f8495..8ad7509 100644
--- a/src/vowel/eval_types.py
+++ b/src/vowel/eval_types.py
@@ -249,6 +249,36 @@ class FixturesConfig(BaseModel):
)
+class SerializerSpec(BaseModel):
+ """Serializer registry entry for YAML-native serializer configuration."""
+
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
+
+ serializer_schema: str | dict[str, str] | None = Field(
+ default=None,
+ alias="schema",
+ serialization_alias="schema",
+ description=(
+ "Schema converter path(s). Use a single import path string for direct mode, "
+ "or a mapping of parameter name to import path for nested mode."
+ ),
+ )
+ serializer: str | None = Field(
+ default=None,
+ description="Import path to custom serializer function (serial_fn mode).",
+ )
+
+ @model_validator(mode="after")
+ def validate_one_of(self):
+ has_schema = self.serializer_schema is not None
+ has_serializer = self.serializer is not None
+ if has_schema and has_serializer:
+ raise ValueError("Serializer spec cannot define both 'schema' and 'serializer'")
+ if not has_schema and not has_serializer:
+ raise ValueError("Serializer spec must define one of: 'schema' or 'serializer'")
+ return self
+
+
# =============================================================================
# Evaluation Case Models
# =============================================================================
@@ -737,6 +767,14 @@ class Evals(BaseModel):
examples=[["db"], ["db", "cache"], ["redis"]],
)
+ serializer: str | None = Field(
+ default=None,
+ description=(
+ "Optional serializer registry key from top-level 'serializers'. "
+ "When set, this eval uses that serializer definition."
+ ),
+ )
+
evals: dict[
str,
IsInstanceCase
@@ -807,20 +845,32 @@ class EvalsFile(BaseModel):
default_factory=dict,
description="Global fixture definitions available to all evals in this file",
)
+ serializers: dict[str, SerializerSpec] = Field(
+ default_factory=dict,
+ description="Global serializer definitions available to evals in this file",
+ )
@classmethod
def model_validate(cls, obj, **kwargs):
# Parse fixtures if present (don't mutate caller's dict)
fixtures_data = obj.get("fixtures", {})
- obj = {k: v for k, v in obj.items() if k != "fixtures"}
+ serializers_data = obj.get("serializers", {})
+ obj = {k: v for k, v in obj.items() if k not in {"fixtures", "serializers"}}
fixtures = {}
+ serializers = {}
for name, defn in fixtures_data.items():
if isinstance(defn, dict):
fixtures[name] = FixtureDefinition(**defn)
elif isinstance(defn, FixtureDefinition):
fixtures[name] = defn
- instance = cls.model_construct(fixtures=fixtures, **obj)
+ for name, defn in serializers_data.items():
+ if isinstance(defn, dict):
+ serializers[name] = SerializerSpec(**defn)
+ elif isinstance(defn, SerializerSpec):
+ serializers[name] = defn
+
+ instance = cls.model_construct(fixtures=fixtures, serializers=serializers, **obj)
return instance
# Pydantic internal attributes to skip when iterating
@@ -843,6 +893,7 @@ def model_validate(cls, obj, **kwargs):
"model_dump",
"model_dump_json",
"fixtures",
+ "serializers",
# Skip fixtures when iterating evals
}
)
@@ -851,7 +902,7 @@ def get_evals(self) -> dict[str, Evals]:
result = {}
extras = getattr(self, "__pydantic_extra__", {})
for key, value in extras.items():
- if key == "fixtures":
+ if key in {"fixtures", "serializers"}:
continue
if isinstance(value, dict) and "dataset" in value:
try:
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
index f4d1b2f..4d7cab4 100644
--- a/src/vowel/evals.py
+++ b/src/vowel/evals.py
@@ -79,7 +79,83 @@ def _eval_type_restricted(type_expr: str) -> typing.Any:
return eval(type_expr, env, env)
-def prepare_env_and_condition(ctx: EvaluatorContext, condition: str) -> tuple[dict, str]:
+def _apply_serializer_for_assertion(
+ value: typing.Any,
+ serializer: type | typing.Callable | dict[str, type | typing.Callable] | None,
+ *,
+ param_name: str | None = None,
+) -> typing.Any:
+ """Apply serializer in assertion path to mirror function call conversions."""
+ if serializer is None:
+ return value
+
+ if isinstance(serializer, dict):
+ if param_name and param_name in serializer:
+ return _apply_serializer_for_assertion(value, serializer[param_name])
+ if isinstance(value, dict):
+ converted: dict[str, typing.Any] = {}
+ for key, item in value.items():
+ if key in serializer:
+ converted[key] = _apply_serializer_for_assertion(item, serializer[key])
+ else:
+ converted[key] = item
+ return converted
+ return value
+
+ if isinstance(value, dict):
+ try:
+ return serializer(**value)
+ except TypeError:
+ return serializer(value)
+
+ return serializer(value)
+
+
+def _normalize_input_for_assertion(
+ raw_inputs: typing.Any,
+ serializer: type | typing.Callable | dict[str, type | typing.Callable] | None,
+ serializer_fn: typing.Callable[[dict], typing.Any] | None,
+) -> typing.Any:
+ """Compute assertion `input` value from raw case inputs using active serializer config."""
+ if not isinstance(raw_inputs, dict):
+ return _apply_serializer_for_assertion(raw_inputs, serializer)
+
+ if serializer_fn is not None:
+ serialized = serializer_fn(raw_inputs)
+ if isinstance(serialized, tuple):
+ return serialized[0] if len(serialized) == 1 else serialized
+ return serialized
+
+ if "input" in raw_inputs:
+ return _apply_serializer_for_assertion(raw_inputs["input"], serializer)
+
+ if "inputs" in raw_inputs:
+ values = raw_inputs["inputs"]
+ if values is None:
+ return None
+ if isinstance(values, dict):
+ if serializer is not None and not isinstance(serializer, dict):
+ return _apply_serializer_for_assertion(values, serializer)
+ if isinstance(serializer, dict):
+ return {
+ key: _apply_serializer_for_assertion(item, serializer, param_name=key)
+ for key, item in values.items()
+ }
+ return values
+ if serializer is None:
+ return values
+ return [_apply_serializer_for_assertion(item, serializer) for item in values]
+
+ return raw_inputs
+
+
+def prepare_env_and_condition(
+ ctx: EvaluatorContext,
+ condition: str,
+ *,
+ serializer: type | typing.Callable | dict[str, type | typing.Callable] | None = None,
+ serializer_fn: typing.Callable[[dict], typing.Any] | None = None,
+) -> tuple[dict, str]:
"""
Prepare environment variables and format condition string for evaluation.
@@ -90,12 +166,7 @@ def prepare_env_and_condition(ctx: EvaluatorContext, condition: str) -> tuple[di
Returns:
Tuple of (environment dict, formatted condition string)
"""
- actual_input = ctx.inputs
- if isinstance(ctx.inputs, dict):
- if "input" in ctx.inputs:
- actual_input = ctx.inputs["input"]
- elif "inputs" in ctx.inputs:
- actual_input = ctx.inputs["inputs"]
+ actual_input = _normalize_input_for_assertion(ctx.inputs, serializer, serializer_fn)
env = {
"input": actual_input,
@@ -122,9 +193,18 @@ class AssertionEvaluator(Evaluator):
metrics, metadata, and duration variables.
"""
- def __init__(self, condition: str, *, evaluation_name: str = "Assertion"):
+ def __init__(
+ self,
+ condition: str,
+ *,
+ evaluation_name: str = "Assertion",
+ serializer: type | typing.Callable | dict[str, type | typing.Callable] | None = None,
+ serializer_fn: typing.Callable[[dict], typing.Any] | None = None,
+ ):
self.condition = condition
self.evaluation_name = evaluation_name
+ self.serializer = serializer
+ self.serializer_fn = serializer_fn
self.interpreter = None
if MONTY_AVAILABLE:
import pydantic_monty
@@ -141,7 +221,12 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
return EvaluationReason(value=True, reason="Skipped (exception case)")
if "__import__" in self.condition:
raise ValueError(f"__import__ is not allowed in assertions: {self.condition}")
- env, condition = prepare_env_and_condition(ctx, self.condition)
+ env, condition = prepare_env_and_condition(
+ ctx,
+ self.condition,
+ serializer=self.serializer,
+ serializer_fn=self.serializer_fn,
+ )
# TL;DR
# BETA API
diff --git a/src/vowel/schema.py b/src/vowel/schema.py
index 80e4647..15e04be 100644
--- a/src/vowel/schema.py
+++ b/src/vowel/schema.py
@@ -2,6 +2,7 @@
from __future__ import annotations
+import hashlib
import importlib.metadata
import json
import re
@@ -34,6 +35,7 @@ def build_yaml_schema_from_bundle() -> dict[str, Any]:
No repository reference file is used. The root shape is forced to match
vowel's YAML file format:
- top-level optional `fixtures`
+ - top-level optional `serializers`
- top-level additionalProperties => per-function `Evals`
"""
bundle_schema = EvalsBundle.model_json_schema(ref_template="#/$defs/{model}")
@@ -46,6 +48,13 @@ def build_yaml_schema_from_bundle() -> dict[str, Any]:
"title": "Fixtures",
},
)
+ serializers_schema = properties.get(
+ "serializers",
+ {
+ "type": "object",
+ "title": "Serializers",
+ },
+ )
additional_properties: dict[str, Any]
if "Evals" in defs:
@@ -71,6 +80,7 @@ def build_yaml_schema_from_bundle() -> dict[str, Any]:
"type": "object",
"properties": {
"fixtures": fixtures_schema,
+ "serializers": serializers_schema,
},
"additionalProperties": additional_properties,
"$defs": defs,
@@ -81,13 +91,14 @@ def build_yaml_schema_from_bundle() -> dict[str, Any]:
def ensure_cached_schema(version: str | None = None) -> Path:
"""Ensure the versioned schema file exists and is up to date."""
- token = _schema_version_token(version)
- schema_path = SCHEMA_CACHE_DIR / f"vowel-schema_{token}.json"
- schema_path.parent.mkdir(parents=True, exist_ok=True)
-
schema_data = build_yaml_schema_from_bundle()
rendered = json.dumps(schema_data, indent=2, ensure_ascii=False) + "\n"
+ token = _schema_version_token(version)
+ digest = hashlib.sha1(rendered.encode("utf-8")).hexdigest()[:8]
+ schema_path = SCHEMA_CACHE_DIR / f"vowel-schema_{token}_{digest}.json"
+ schema_path.parent.mkdir(parents=True, exist_ok=True)
+
if not schema_path.exists() or schema_path.read_text(encoding="utf-8") != rendered:
schema_path.write_text(rendered, encoding="utf-8")
diff --git a/src/vowel/utils.py b/src/vowel/utils.py
index 60bab83..ac54c1b 100644
--- a/src/vowel/utils.py
+++ b/src/vowel/utils.py
@@ -14,7 +14,7 @@
from collections.abc import Callable, Mapping, Sequence
from datetime import date, datetime, time, timedelta
from decimal import Decimal
-from functools import wraps
+from functools import lru_cache, wraps
from pathlib import Path, PurePath
from typing import Any, Literal, Optional, Union, get_args, get_origin
@@ -28,7 +28,7 @@
from pydantic_evals.reporting import EvaluationReport
from .errors import FixturePathError, SignatureError
-from .eval_types import Evals, EvalsFile, FixtureDefinition
+from .eval_types import Evals, EvalsFile, FixtureDefinition, SerializerSpec
from .evals import (
AssertionEvaluator,
ContainsInputEvaluator,
@@ -39,6 +39,8 @@
)
from .executor import Executor
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+
# =============================================================================
# Evals Bundle - Container for evals and fixtures
# =============================================================================
@@ -51,6 +53,7 @@ class EvalsBundle(BaseModel):
evals: dict[str, Evals] = Field(min_length=1)
fixtures: dict[str, FixtureDefinition] = Field(default_factory=dict)
+ serializers: dict[str, SerializerSpec] = Field(default_factory=dict)
def to_yaml(self) -> str:
"""Serialize bundle to current vowel YAML spec format."""
@@ -76,6 +79,17 @@ def to_yaml(self) -> str:
for name, definition in self.fixtures.items()
}
+ if self.serializers:
+ data["serializers"] = {
+ name: serializer.model_dump(
+ mode="python",
+ by_alias=True,
+ exclude_none=True,
+ exclude_defaults=True,
+ )
+ for name, serializer in self.serializers.items()
+ }
+
return yaml.safe_dump(
data,
sort_keys=False,
@@ -365,17 +379,20 @@ def check_compatibility(func: Callable) -> tuple[bool, list[str]]:
@contextlib.contextmanager
def _cwd_on_syspath() -> Any:
- """Temporarily prepend the current working directory to ``sys.path``."""
+ """Temporarily prepend cwd and project root to ``sys.path``."""
cwd = os.getcwd()
- inserted = cwd not in sys.path
- if inserted:
- sys.path.insert(0, cwd)
+ candidates = [cwd, str(PROJECT_ROOT)]
+ inserted: list[str] = []
+ for candidate in candidates:
+ if candidate not in sys.path:
+ sys.path.insert(0, candidate)
+ inserted.append(candidate)
try:
yield
finally:
- if inserted:
+ for candidate in inserted:
with contextlib.suppress(ValueError):
- sys.path.remove(cwd)
+ sys.path.remove(candidate)
def _is_yaml_source_string(source_str: str) -> bool:
@@ -859,9 +876,12 @@ def import_function(module_path: str) -> Any:
error=str(e),
)
relative_path = module_name.replace(".", os.sep) + ".py"
- file_path = os.path.join(os.getcwd(), relative_path)
+ candidate_roots = [os.getcwd(), str(PROJECT_ROOT)]
- if os.path.exists(file_path):
+ for root in candidate_roots:
+ file_path = os.path.join(root, relative_path)
+ if not os.path.exists(file_path):
+ continue
try:
spec = importlib.util.spec_from_file_location(module_name, file_path)
if spec and spec.loader:
@@ -870,6 +890,7 @@ def import_function(module_path: str) -> Any:
logfire.debug(
"File-based import succeeded for '{file_path}'", file_path=file_path
)
+ break
except Exception as e:
logfire.debug(
"File-based import failed for '{file_path}': {error}",
@@ -901,6 +922,46 @@ def import_function(module_path: str) -> Any:
)
+@lru_cache(maxsize=512)
+def _import_path_cached(path: str) -> Any:
+ """Import and cache objects referenced by import paths."""
+ return import_function(path)
+
+
+def _resolve_yaml_serializer_entry(
+ serializers: Mapping[str, SerializerSpec],
+ serializer_name: str,
+) -> tuple[type | Callable | dict[str, type | Callable] | None, Callable[[dict], Any] | None]:
+ """Resolve a serializer registry entry into schema or serial_fn mapping values."""
+ if serializer_name not in serializers:
+ available = sorted(serializers.keys())
+ raise ValueError(
+ f"Unknown serializer '{serializer_name}'. Available serializers: {available}"
+ )
+
+ spec = serializers[serializer_name]
+
+ if spec.serializer is not None:
+ loaded = _import_path_cached(spec.serializer)
+ if not callable(loaded):
+ raise TypeError(f"Serializer '{spec.serializer}' must resolve to a callable")
+ return None, loaded
+
+ schema = spec.serializer_schema
+ if isinstance(schema, str):
+ return _import_path_cached(schema), None
+
+ if isinstance(schema, dict):
+ resolved: dict[str, type | Callable] = {}
+ for key, path in schema.items():
+ resolved[key] = _import_path_cached(path)
+ return resolved, None
+
+ raise ValueError(
+ f"Serializer '{serializer_name}' must define one of: schema (str|dict) or serializer"
+ )
+
+
def import_class(class_path: str) -> type:
"""
Import a class from a module path.
@@ -950,26 +1011,42 @@ def load_bundle_file(yaml_path: str) -> EvalsBundle:
loaded = yaml.safe_load(f)
evals_file = EvalsFile.model_validate(loaded)
- return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures)
+ return EvalsBundle(
+ evals=evals_file.get_evals(),
+ fixtures=evals_file.fixtures,
+ serializers=evals_file.serializers,
+ )
def load_bundle_from_yaml_string(yaml_content: str) -> EvalsBundle:
"""Load evals and fixtures from a YAML string."""
loaded = yaml.safe_load(yaml_content)
evals_file = EvalsFile.model_validate(loaded)
- return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures)
+ return EvalsBundle(
+ evals=evals_file.get_evals(),
+ fixtures=evals_file.fixtures,
+ serializers=evals_file.serializers,
+ )
def load_bundle_from_dict(data: dict) -> EvalsBundle:
"""Load evals and fixtures from a dictionary."""
evals_file = EvalsFile.model_validate(data)
- return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures)
+ return EvalsBundle(
+ evals=evals_file.get_evals(),
+ fixtures=evals_file.fixtures,
+ serializers=evals_file.serializers,
+ )
def load_bundle_from_object(evals_obj: EvalsFile) -> EvalsBundle:
"""Load evals and fixtures from an EvalsFile object."""
assert isinstance(evals_obj, EvalsFile)
- return EvalsBundle(evals=evals_obj.get_evals(), fixtures=evals_obj.fixtures)
+ return EvalsBundle(
+ evals=evals_obj.get_evals(),
+ fixtures=evals_obj.fixtures,
+ serializers=evals_obj.serializers,
+ )
def load_bundle(source: str | Path | dict | EvalsFile) -> EvalsBundle:
@@ -1118,7 +1195,7 @@ def to_dataset(
display_input = f"inputs: {match_case.inputs}"
input_value = {"inputs": match_case.inputs}
else:
- display_input = f"input: {match_case.input}"
+ display_input = f"input: {str(match_case.input)[:300]}"
input_value = {"input": match_case.input}
if any(case for case in dataset_cases if input_value == case.inputs):
@@ -1551,6 +1628,17 @@ def _evaluate_single_function(
serial_fn, eval_id, mapping_name="serializer function"
)
+ for evaluator in dataset.evaluators:
+ if isinstance(evaluator, AssertionEvaluator):
+ evaluator.serializer = func_schema
+ evaluator.serializer_fn = func_serial_fn
+
+ for case in dataset.cases:
+ for evaluator in case.evaluators:
+ if isinstance(evaluator, AssertionEvaluator):
+ evaluator.serializer = func_schema
+ evaluator.serializer_fn = func_serial_fn
+
# Setup module-scoped fixtures for this eval
module_fixtures = {}
if fixture_manager and fixture_names:
@@ -2046,6 +2134,7 @@ def run_evals(
bundle = source if isinstance(source, EvalsBundle) else load_bundle(source)
all_evals = bundle.evals
yaml_fixtures = bundle.fixtures
+ yaml_serializers = bundle.serializers
# Merge programmatic fixtures with YAML fixtures
merged_fixtures, fixture_funcs = _merge_programmatic_fixtures(yaml_fixtures, fixtures)
@@ -2057,13 +2146,37 @@ def run_evals(
serial_fn = serial_fn or {}
if filter_funcs:
- filtered_evals = {k: v for k, v in all_evals.items() if k in filter_funcs}
+ resolved_filter_ids: list[str] = []
+
+ for raw_filter in filter_funcs:
+ if raw_filter in all_evals:
+ resolved_filter_ids.append(raw_filter)
+ continue
+
+ short_name = raw_filter.rsplit(".", 1)[-1]
+ matches = [
+ eval_id for eval_id in all_evals if eval_id.rsplit(".", 1)[-1] == short_name
+ ]
+
+ if len(matches) == 1:
+ resolved_filter_ids.append(matches[0])
+ elif len(matches) > 1:
+ candidates = sorted(matches)
+ raise ValueError(
+ f"Ambiguous filter '{raw_filter}'. Provide an exact eval id. "
+ f"Candidates: {candidates}"
+ )
+ # Keep stable input order while removing duplicates.
+ ordered_unique_filter_ids = list(dict.fromkeys(resolved_filter_ids))
+ filtered_evals = {k: v for k, v in all_evals.items() if k in ordered_unique_filter_ids}
+
if not filtered_evals:
available = list(all_evals.keys())
raise ValueError(
f"No functions found matching filters: {', '.join(filter_funcs)}. "
f"Available: {', '.join(available)}"
)
+
all_evals = filtered_evals
# Create fixture manager if fixtures are defined
@@ -2074,14 +2187,43 @@ def run_evals(
try:
for eval_id, evals in all_evals.items():
try:
+ effective_schema = dict(schema)
+ effective_serial_fn = dict(serial_fn)
+
+ # YAML-native serializer registry (per-eval reference).
+ if evals.serializer:
+ yaml_schema, yaml_serial = _resolve_yaml_serializer_entry(
+ yaml_serializers,
+ evals.serializer,
+ )
+
+ # Programmatic mappings have precedence.
+ has_programmatic_schema = (
+ _resolve_eval_id_mapping(schema, eval_id, mapping_name="serializer schema")
+ is not None
+ )
+ has_programmatic_serial = (
+ _resolve_eval_id_mapping(
+ serial_fn,
+ eval_id,
+ mapping_name="serializer function",
+ )
+ is not None
+ )
+
+ if yaml_schema is not None and not has_programmatic_schema:
+ effective_schema[eval_id] = yaml_schema
+ if yaml_serial is not None and not has_programmatic_serial:
+ effective_serial_fn[eval_id] = yaml_serial
+
result = _evaluate_single_function(
eval_id,
evals,
functions,
merged_fixtures,
fixture_manager,
- schema,
- serial_fn,
+ effective_schema,
+ effective_serial_fn,
ignore_duration,
)
results.append(result)
diff --git a/tests/test_run_evals.py b/tests/test_run_evals.py
index 4ff09e5..e91e35a 100644
--- a/tests/test_run_evals.py
+++ b/tests/test_run_evals.py
@@ -238,6 +238,81 @@ def test_filter_multiple_functions(self):
assert summary.total_count == 2
+ def test_filter_module_name_matches_short_eval_id(self):
+ """module.function filter should match bare function eval ids."""
+ spec = {
+ "add": {"dataset": [{"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}}]},
+ "sub": {"dataset": [{"case": {"inputs": {"a": 5, "b": 3}, "expected": 2}}]},
+ }
+
+ summary = (
+ RunEvals.from_dict(spec)
+ .with_functions(
+ {
+ "add": lambda a, b: a + b,
+ "sub": lambda a, b: a - b,
+ }
+ )
+ .filter(["math.add"])
+ .run()
+ )
+
+ assert summary.total_count == 1
+ assert summary.results[0].eval_id == "add"
+
+ def test_filter_short_name_matches_module_eval_id(self):
+ """bare function filter should match module.function eval ids."""
+ spec = {
+ "pkg.add": {
+ "dataset": [
+ {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}},
+ ]
+ },
+ "pkg.sub": {
+ "dataset": [
+ {"case": {"inputs": {"a": 5, "b": 3}, "expected": 2}},
+ ]
+ },
+ }
+
+ summary = (
+ RunEvals.from_dict(spec)
+ .with_functions(
+ {
+ "add": lambda a, b: a + b,
+ "sub": lambda a, b: a - b,
+ }
+ )
+ .filter(["add"])
+ .run()
+ )
+
+ assert summary.total_count == 1
+ assert summary.results[0].eval_id == "pkg.add"
+
+ def test_filter_short_name_raises_on_ambiguous_matches(self):
+ """Short-name filters should fail fast when multiple eval ids share a suffix."""
+ spec = {
+ "pkg.add": {
+ "dataset": [
+ {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}},
+ ]
+ },
+ "other.add": {
+ "dataset": [
+ {"case": {"inputs": {"a": 2, "b": 3}, "expected": 5}},
+ ]
+ },
+ }
+
+ with pytest.raises(ValueError, match="Ambiguous filter 'add'"):
+ (
+ RunEvals.from_dict(spec)
+ .with_functions({"add": lambda a, b: a + b})
+ .filter(["add"])
+ .run()
+ )
+
class TestRunEvalsDebug:
"""Tests for debug() method."""
diff --git a/tests/test_schema.py b/tests/test_schema.py
new file mode 100644
index 0000000..c90b62d
--- /dev/null
+++ b/tests/test_schema.py
@@ -0,0 +1,38 @@
+"""Tests for generated YAML schema support."""
+
+import json
+from pathlib import Path
+
+from vowel.schema import build_yaml_schema_from_bundle, materialize_yaml_with_schema_header
+
+
+def test_generated_schema_includes_top_level_serializers_property():
+ """Top-level `serializers` should be explicitly supported in generated schema."""
+ schema = build_yaml_schema_from_bundle()
+ properties = schema.get("properties", {})
+
+ assert "fixtures" in properties
+ assert "serializers" in properties
+
+
+def test_generated_schema_keeps_function_additional_properties():
+ """Unknown top-level keys must still map to per-function Evals definitions."""
+ schema = build_yaml_schema_from_bundle()
+
+ additional = schema.get("additionalProperties", {})
+ assert additional == {"$ref": "#/$defs/EvalsMapValue"}
+
+
+def test_materialized_header_uses_hashed_cache_with_serializers():
+ """Schema header should reference a content-addressed cache file that supports serializers."""
+ yaml_text = "len:\n dataset:\n - case:\n id: len_basic\n input: [1]\n expected: 1\n"
+ materialized = materialize_yaml_with_schema_header(yaml_text)
+ first_line = materialized.splitlines()[0]
+
+ assert first_line.startswith("# yaml-language-server: $schema=")
+ schema_path = Path(first_line.split("$schema=", 1)[1])
+ assert schema_path.name.startswith("vowel-schema_")
+ assert schema_path.exists()
+
+ schema_obj = json.loads(schema_path.read_text(encoding="utf-8"))
+ assert "serializers" in schema_obj.get("properties", {})
diff --git a/tests/test_serializer.py b/tests/test_serializer.py
index 1ed83ec..f6516d1 100644
--- a/tests/test_serializer.py
+++ b/tests/test_serializer.py
@@ -28,6 +28,16 @@ def process_with_config(user: User, config: Config) -> str:
return f"{user.name} (timeout={config.timeout})"
+def yaml_serialize_user(data: dict) -> User:
+ """Serializer function used by YAML-native serializer registry tests."""
+ raw = data.get("input") or data.get("inputs")
+ if isinstance(raw, list):
+ raw = raw[0]
+ if not isinstance(raw, dict):
+ raise ValueError("Expected serializer input payload to be a dict")
+ return User(**raw)
+
+
class TestSchemaSerializer:
"""Tests for schema-based serialization."""
@@ -147,6 +157,36 @@ def test_inputs_named_params_different_types(self):
)
assert summary.all_passed
+ def test_assertion_uses_serialized_input_with_dict_schema(self):
+ """Assertion `input` should contain per-param serialized objects for dict schema."""
+ spec = {
+ "process_with_config": {
+ "evals": {
+ "CheckSerializedInput": {
+ "assertion": "input['user'].email.endswith('@a.com') and input['config'].timeout == 30"
+ }
+ },
+ "dataset": [
+ {
+ "case": {
+ "inputs": {
+ "user": {"id": 1, "name": "Alice", "email": "a@a.com"},
+ "config": {"timeout": 30, "verbose": True},
+ },
+ "expected": "Alice (timeout=30)",
+ }
+ },
+ ],
+ }
+ }
+ summary = (
+ RunEvals.from_dict(spec)
+ .with_functions({"process_with_config": process_with_config})
+ .with_serializer({"process_with_config": {"user": User, "config": Config}})
+ .run()
+ )
+ assert summary.all_passed
+
def test_no_serializer_passthrough(self):
"""Without serializer, dict is passed as-is."""
@@ -220,6 +260,29 @@ def test_serializer_short_name_matches_module_function_spec(self):
)
assert summary.all_passed
+ def test_assertion_uses_serialized_input_with_schema(self):
+ """Assertion `input` should be the schema-serialized object, not raw YAML dict."""
+ spec = {
+ "get_user_info": {
+ "evals": {"CheckSerializedInput": {"assertion": "input.email.endswith('@a.com')"}},
+ "dataset": [
+ {
+ "case": {
+ "input": {"id": 1, "name": "Alice", "email": "a@a.com"},
+ "expected": "User Alice has email a@a.com",
+ }
+ },
+ ],
+ }
+ }
+ summary = (
+ RunEvals.from_dict(spec)
+ .with_functions({"get_user_info": get_user_info})
+ .with_serializer({"get_user_info": User})
+ .run()
+ )
+ assert summary.all_passed
+
class TestSerialFn:
"""Tests for serial_fn-based serialization."""
@@ -355,6 +418,35 @@ def get_full_name(user: User) -> str:
)
assert summary.all_passed
+ def test_assertion_uses_serialized_input_with_serial_fn(self):
+ """Assertion `input` should reflect serial_fn output type."""
+
+ def serialize_user(d: dict) -> User:
+ data = d.get("input") or d.get("inputs")
+ assert data is not None
+ return User(**data)
+
+ spec = {
+ "get_user_info": {
+ "evals": {"CheckSerializedInput": {"assertion": "input.id == 7"}},
+ "dataset": [
+ {
+ "case": {
+ "input": {"id": 7, "name": "Ada", "email": "ada@a.com"},
+ "expected": "User Ada has email ada@a.com",
+ }
+ },
+ ],
+ }
+ }
+ summary = (
+ RunEvals.from_dict(spec)
+ .with_functions({"get_user_info": get_user_info})
+ .with_serializer(serial_fn={"get_user_info": serialize_user})
+ .run()
+ )
+ assert summary.all_passed
+
class TestSerializerChaining:
"""Tests for serializer method chaining."""
@@ -467,3 +559,88 @@ def test_serializer_validation_error(self):
)
assert not summary.all_passed
assert summary.failed_count == 1
+
+
+class TestYamlNativeSerializerRegistry:
+ """Tests for YAML-native top-level serializer registry."""
+
+ def test_yaml_registry_schema_mode(self):
+ yaml_spec = """
+serializers:
+ user_schema:
+ schema: tests.test_serializer.User
+
+get_user_info:
+ serializer: user_schema
+ dataset:
+ - case:
+ input: {id: 1, name: Alice, email: a@a.com}
+ expected: "User Alice has email a@a.com"
+"""
+ summary = (
+ RunEvals.from_source(yaml_spec).with_functions({"get_user_info": get_user_info}).run()
+ )
+ assert summary.all_passed
+
+ def test_yaml_registry_serial_fn_mode(self):
+ yaml_spec = """
+serializers:
+ user_custom:
+ serializer: tests.test_serializer.yaml_serialize_user
+
+get_user_info:
+ serializer: user_custom
+ dataset:
+ - case:
+ inputs: {id: 2, name: Bob, email: b@b.com}
+ expected: "User Bob has email b@b.com"
+"""
+ summary = (
+ RunEvals.from_source(yaml_spec).with_functions({"get_user_info": get_user_info}).run()
+ )
+ assert summary.all_passed
+
+ def test_yaml_registry_imports_are_cached(self, monkeypatch):
+ """Same serializer path used by multiple evals should be imported once."""
+ from vowel import utils as utils_module
+
+ calls: list[str] = []
+ original_import_function = utils_module.import_function
+
+ def counting_import(path: str):
+ calls.append(path)
+ return original_import_function(path)
+
+ utils_module._import_path_cached.cache_clear()
+ monkeypatch.setattr(utils_module, "import_function", counting_import)
+
+ yaml_spec = """
+serializers:
+ user_schema:
+ schema: tests.test_serializer.User
+
+get_user_info:
+ serializer: user_schema
+ dataset:
+ - case:
+ input: {id: 1, name: Alice, email: a@a.com}
+ expected: "User Alice has email a@a.com"
+
+get_user_name:
+ serializer: user_schema
+ dataset:
+ - case:
+ input: {id: 2, name: Bob, email: b@b.com}
+ expected: "Bob"
+"""
+
+ def get_user_name(user: User) -> str:
+ return user.name
+
+ summary = (
+ RunEvals.from_source(yaml_spec)
+ .with_functions({"get_user_info": get_user_info, "get_user_name": get_user_name})
+ .run()
+ )
+ assert summary.all_passed
+ assert calls.count("tests.test_serializer.User") == 1
diff --git a/tests/test_yaml_loading.py b/tests/test_yaml_loading.py
index b2fb12b..031911f 100644
--- a/tests/test_yaml_loading.py
+++ b/tests/test_yaml_loading.py
@@ -56,6 +56,43 @@ def test_invalid_yaml_raises_error(self):
with pytest.raises(Exception): # noqa: B017
load_bundle_from_yaml_string("invalid: [unclosed")
+ def test_yaml_with_top_level_serializers(self):
+ """Test loading top-level serializer registry and eval references."""
+ yaml_spec = """
+serializers:
+ user_schema:
+ schema: tests.test_serializer.User
+
+get_user_info:
+ serializer: user_schema
+ dataset:
+ - case:
+ input: {id: 1, name: Alice, email: a@a.com}
+ expected: "User Alice has email a@a.com"
+"""
+ bundle = load_bundle_from_yaml_string(yaml_spec)
+
+ assert "user_schema" in bundle.serializers
+ assert bundle.evals["get_user_info"].serializer == "user_schema"
+
+ def test_yaml_invalid_serializer_spec_raises_error(self):
+ """Serializer specs cannot define both schema and serializer at once."""
+ yaml_spec = """
+serializers:
+ invalid:
+ schema: tests.test_serializer.User
+ serializer: tests.test_serializer.yaml_serialize_user
+
+get_user_info:
+ serializer: invalid
+ dataset:
+ - case:
+ input: {id: 1, name: Alice, email: a@a.com}
+ expected: "User Alice has email a@a.com"
+"""
+ with pytest.raises(Exception): # noqa: B017
+ load_bundle_from_yaml_string(yaml_spec)
+
class TestLoadBundleFromDict:
"""Tests for load_bundle_from_dict function."""
diff --git a/vowel-schema.json b/vowel-schema.json
index 241bbeb..15ee5e6 100644
--- a/vowel-schema.json
+++ b/vowel-schema.json
@@ -8,6 +8,13 @@
},
"title": "Fixtures",
"type": "object"
+ },
+ "serializers": {
+ "additionalProperties": {
+ "$ref": "#/$defs/SerializerSpec"
+ },
+ "title": "Serializers",
+ "type": "object"
}
},
"additionalProperties": {
@@ -144,6 +151,19 @@
"title": "Fixture",
"type": "array"
},
+ "serializer": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional serializer registry key from top-level 'serializers'. When set, this eval uses that serializer definition.",
+ "title": "Serializer"
+ },
"evals": {
"additionalProperties": {
"anyOf": [
@@ -711,6 +731,46 @@
"title": "PatternMatchCase",
"type": "object"
},
+ "SerializerSpec": {
+ "additionalProperties": false,
+ "description": "Serializer registry entry for YAML-native serializer configuration.",
+ "properties": {
+ "schema": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "additionalProperties": {
+ "type": "string"
+ },
+ "type": "object"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Schema converter path(s). Use a single import path string for direct mode, or a mapping of parameter name to import path for nested mode.",
+ "title": "Schema"
+ },
+ "serializer": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Import path to custom serializer function (serial_fn mode).",
+ "title": "Serializer"
+ }
+ },
+ "title": "SerializerSpec",
+ "type": "object"
+ },
"EvalsMapValue": {
"additionalProperties": false,
"description": "Function evaluation specification keyed by function import path/name. Contains fixture dependencies, global evaluators (`evals`), and dataset cases.",
@@ -746,6 +806,19 @@
"title": "Fixture",
"type": "array"
},
+ "serializer": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional serializer registry key from top-level 'serializers'. When set, this eval uses that serializer definition.",
+ "title": "Serializer"
+ },
"evals": {
"additionalProperties": {
"anyOf": [
From 49491a5a02d470d5ef8cfa8edc579845c903c899 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 19:47:12 +0300
Subject: [PATCH 6/8] Update Python version matrix in tests.yml
Removed Python 3.10 from the test matrix.
---
.github/workflows/tests.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 01835e8..2e922ac 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+ python-version: ["3.11", "3.12", "3.13", "3.14"]
steps:
- uses: actions/checkout@v4
From 5d4c6d2b795ca8f43bf55785b6dcc8ac6326e4c9 Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 19:49:01 +0300
Subject: [PATCH 7/8] Change pip install target from 'dev' to 'all'
---
.github/workflows/tests.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2e922ac..975ef0f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -30,7 +30,7 @@ jobs:
- name: Install dependencies
run: |
source venv/bin/activate
- uv pip install -e ".[dev]"
+ uv pip install -e ".[all]"
- name: Run tests
run: |
From 567743287194656d576a618b87092f5f9d81880b Mon Sep 17 00:00:00 2001
From: Mert <62549656+fswair@users.noreply.github.com>
Date: Thu, 19 Mar 2026 20:05:14 +0300
Subject: [PATCH 8/8] Potential fix for pull request finding
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
.env.sample | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.env.sample b/.env.sample
index 2d59281..a925a7f 100644
--- a/.env.sample
+++ b/.env.sample
@@ -20,7 +20,7 @@ SPEC_MODEL=openrouter:anthropic/claude-opus-4.6
EXPLORATION_MODEL=openrouter:anthropic/claude-sonnet-4.6
# Default spec & exploration models used by CodeMode benchmark pipeline
-# NOTE: Models should be comma seperated, length of spec models must equals to exploration models
+# NOTE: Models should be comma-separated; the number of spec models must equal the number of exploration models
# spec[i] will be mapped to exploration[i] (Case N)
BENCHMARK_SPEC_MODELS=openrouter:anthropic/claude-opus-4.6
BENCHMARK_EXPLORATION_MODELS=openrouter:anthropic/claude-sonnet-4.6
\ No newline at end of file