diff --git a/.env.sample b/.env.sample index 8f8c9d7..a925a7f 100644 --- a/.env.sample +++ b/.env.sample @@ -11,4 +11,16 @@ LOGFIRE_ENABLED=false JUDGE_MODEL=openrouter:google/gemini-3-flash-preview # Default model used by Agents -MODEL_NAME=openrouter:google/gemini-3-flash-preview \ No newline at end of file +MODEL_NAME=openrouter:google/gemini-3-flash-preview + +# Default spec & exploration models used by CodeMode pipeline +# Spec agent generates tests +# Exploration agent generates snippets to discover behaviors (code-execution) +SPEC_MODEL=openrouter:anthropic/claude-opus-4.6 +EXPLORATION_MODEL=openrouter:anthropic/claude-sonnet-4.6 + +# Default spec & exploration models used by CodeMode benchmark pipeline +# NOTE: Models should be comma-separated, and the number of spec models must equal the number of exploration models +# spec[i] will be mapped to exploration[i] (Case N) +BENCHMARK_SPEC_MODELS=openrouter:anthropic/claude-opus-4.6 +BENCHMARK_EXPLORATION_MODELS=openrouter:anthropic/claude-sonnet-4.6 \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 01835e8..975ef0f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + python-version: ["3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | source venv/bin/activate - uv pip install -e ".[dev]" + uv pip install -e ".[all]" - name: Run tests run: | diff --git a/.gitignore b/.gitignore index eb9b4d5..8b40711 100644 --- a/.gitignore +++ b/.gitignore @@ -69,3 +69,7 @@ evaluations/ # !! 
TODO docs/FIXTURE_GENERATION_RFC.md + +# Benchmarks +benchmark* +important-links.md diff --git a/.gitmodules b/.gitmodules index e0064d4..1b9806f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "skills/vowel-core"] path = skills/vowel-core url = https://github.com/fswair/vowel-core.git +[submodule "codemode-benchmark"] + path = codemode-benchmark + url = https://github.com/fswair/codemode-benchmark diff --git a/AGENTS.md b/AGENTS.md index 9de1c04..5b79f13 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -30,5 +30,14 @@ This document contains concise rules for how agents should inspect and use this - If you have questions or uncertainty, consult `README.md` and the relevant docs pages. - Check `TODO` for pending tasks or known issues. +## Critical Thinking & Intellectual Honesty + +- **Never defer to the user's idea just because they said it.** Evaluate every proposal — yours or the user's — on its own merits: trade-offs, costs, complexity, correctness. +- **If the user's idea has flaws, say so.** Explain why with concrete reasoning (performance, token cost, latency, maintainability, correctness risk). Do not soften criticism to be agreeable. +- **If your own idea has flaws, admit it first.** Don't wait for the user to find the holes. Present disadvantages upfront. +- **When comparing approaches, use structured analysis:** list pros/cons for each, identify the real trade-offs, and state which you'd pick and why — before asking for input. +- **"You're right" must be earned.** If you catch yourself agreeing immediately, stop and ask: "Did I actually evaluate this, or am I just being agreeable?" If the latter, go back and do the analysis. +- **The user is a collaborator, not an authority.** Good ideas win regardless of who proposed them. Bad ideas lose regardless of who proposed them. + These rules help agents use the project consistently and safely. 
diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1c21190 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,198 @@ +# CHANGELOG + +## codemode_driven_generation + +This document summarizes the main features added or improved on this branch. + +## 1) Executor and ExecutionSession protocols + +- The code execution interface was formalized using Protocols. +- The Executor async/sync API was standardized: + - execute(...) + - execute_sync(...) + - create_session(...) +- ExecutionSession now compiles/executes setup code once and supports multi-snippet feed execution. +- This reduces repeated parse/compile overhead while exploring the same function. +- The run_sync helper was hardened for running-loop environments via nest-asyncio. + +## 2) MontyExecutor, DefaultExecutor, MontySession, FallbackSession structures + +- MontyExecutor was added: + - sandboxed execution via pydantic-monty, + - ResourceLimits support (timeout/memory), + - stdout capture and normalized error typing/messages, +- DefaultExecutor was added/improved: + - pure Python exec-based fallback execution, + - last-expression capture (__result__) and stdout capture. +- MontyReplSession (MontySession role) was added: + - one-time setup load, reusable feed-run model. +- FallbackSession was added: + - Session-level fallback: if Monty session initialization fails, switch entirely to DefaultSession. + - Snippet-level fallback: if Monty returns ModuleNotFoundError for a snippet, rerun that snippet via fallback executor. +- Executor/fallback wiring was simplified through resolve_executors. 
+ +## 3) Main implementation: CodeModeGenerator + +- Two-phase exploration-guided generation flow: + - Phase 1: behavior exploration (exploration snippets + error snippets) + - Phase 2: spec generation from verified observations +- Lazy Agent architecture: + - explorer_agent (ExplorationPlan) + - spec_agent (EvalsSource or EvalsBundle) +- Prompt layers were clearly separated: + - exploration prompt: coverage, diversity, duplicate prevention + - spec prompt: expected values from verified outputs only +- A refinement loop was added: + - generate -> run -> failure_context -> regenerate +- Optional duration injection and a final summary run were added at the end. + +## 4) Runtime hierarchy and utility usage + +CodeMode hierarchy: + +1. explore() +2. generate_spec() +3. validate_and_fix_spec() +4. validate_expected_values() +5. inject_missing_error_cases() +6. inject_durations() (optional) +7. validation/refinement with RunEvals + +Utilities used: + +- build_call_code +- build_failure_context +- validate_and_fix_spec +- validate_expected_values +- inject_missing_error_cases +- inject_durations + +## 5) Cost Manager + +- Generation/run cost tracking was added for CodeMode. +- Features: + - generation_id and run_id lifecycle management, + - step-level usage/cost recording, + - model price resolution (genai-prices or costs.yml), + - atomic/locked JSON persistence, + - generation-level and run-level totals, + - status tracking: running/completed/failed. +- The CLI costs command now supports list/by-generation/by-run views. + +## 6) Serializer syntax and YAML-native serializer registry + +- Top-level serializers registry support was added at EvalsFile level. +- Per-eval serializer references are now supported via serializer:. +- SerializerSpec was clarified with one-of behavior: + - schema (string or dict) + - serializer (callable import path) + - not both at the same time. 
+- Runtime resolver additions: + - import-path resolution, + - cached imports (_import_path_cached), + - per-eval resolution (_resolve_yaml_serializer_entry). +- Precedence between programmatic serializer maps and YAML serializer registry was defined. + +## 7) Spec model / Exploration model separation + +- Model separation in CodeModeGenerator constructor was formalized: + - spec_model + - exploration_model +- use_model_spec output mode was clarified: + - use_model_spec=True: structured output mode (schema/model output via EvalsBundle) + - use_model_spec=False: YAML string output mode (via EvalsSource.yaml_spec) +- HIGHLY RECOMMENDED TO KEEP use_model_spec=False. +- Model resolution order and env fallback logic were added. +- Cost tracking now supports separate model usage across separate steps. + +## 8) Adding executor/fallback executor to utilities + +- Utility flows were updated to accept executor and fallback executor parameters. +- Monty -> Default fallback behavior was generalized in execution-aware paths. +- Executor behavior was centralized across run_evals and validation stages. + +## 9) YAML schema generator + +- Runtime-model-driven schema generation was improved: + - supports top-level fixtures + serializers, + - preserves function-level EvalsMapValue behavior. +- Schema cache strategy was updated: + - content-hash-based filename (reduces stale editor cache issues). +- File header updates are handled safely via materialize_yaml_with_schema_header. 
+ +## 10) CLI commands: schema, costs + +- vowel schema : + - update schema header after YAML + pydantic validation +- vowel schema --create [path]: + - direct schema JSON generation +- vowel costs: + - --list + - --by-generation + - --by-run + - --generation + - --run + +## 11) module.function -> function alias support + +- Alias support was added for programmatic mapping resolution: + - function map + - serializer schema map + - serializer function map +- Behavior: + - exact match first, + - short-name fallback, + - explicit error for ambiguous reverse short-name mapping. + +## 12) Feedback-guided exploration + +- A targeted Round-2 exploration flow was added: + - build cluster summaries from Round-1 results, + - generate snippets focused on uncovered behavior classes. +- Duplicate/semantic repetition minimization was reinforced at prompt level. +- Distinct failure-mode coverage was improved for error snippets. +- Additional rounds now measure value via new-behavior counting. + +## 13) Assertion + serializer integration + +- AssertionEvaluator input context is now serializer-aware. +- Assertions now see serialized input for schema, serial_fn, and nested/dict schema modes. +- This behavior is covered by regression tests. + +## 14) LLM Judge env-ref improvements + +- create_llm_judge now supports $ENV_VAR resolution for rubric/model fields. +- Missing env refs now produce clearer errors. + +## 15) Examples, documentation, and test coverage + +- A runnable native serializer + fixture example was added. +- README and serializer docs were updated with serializer/assertion context notes. +- Meaningful id fields were added to eval cases under examples. 
+- New/updated tests include: + - test_schema + - test_llm_judge_env_refs + - serializer assertion regressions + - YAML/native serializer parsing tests + +## 16) Fixture scope alias support + +- Fixture scopes now support clearer canonical names: + - case + - eval + - file +- Backward-compatible aliases are still accepted: + - function (alias of case) + - module (alias of eval) + - session (alias of file) +- At parse time, canonical names are normalized to legacy internal runtime values: + - case -> function + - eval -> module + - file -> session +- This keeps existing runtime lifecycle behavior unchanged while allowing more descriptive scope names in YAML. + +Note: Old names will be deprecated after v1.0.0 + +## Note + +This changelog is based on features observed and validated in code on this branch, without using git history. diff --git a/CLAUDE.md b/CLAUDE.md index 28360e4..78ebc3b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,5 +30,14 @@ Claude-type agents working with this repository should follow these steps: - If you have questions or uncertainty, consult `README.md` and the relevant docs pages. - Check `TODO` for pending tasks or known issues. +## Critical Thinking & Intellectual Honesty + +- **Never defer to the user's idea just because they said it.** Evaluate every proposal — yours or the user's — on its own merits: trade-offs, costs, complexity, correctness. +- **If the user's idea has flaws, say so.** Explain why with concrete reasoning (performance, token cost, latency, maintainability, correctness risk). Do not soften criticism to be agreeable. +- **If your own idea has flaws, admit it first.** Don't wait for the user to find the holes. Present disadvantages upfront. +- **When comparing approaches, use structured analysis:** list pros/cons for each, identify the real trade-offs, and state which you'd pick and why — before asking for input. 
+- **"You're right" must be earned.** If you catch yourself agreeing immediately, stop and ask: "Did I actually evaluate this, or am I just being agreeable?" If the latter, go back and do the analysis. +- **The user is a collaborator, not an authority.** Good ideas win regardless of who proposed them. Bad ideas lose regardless of who proposed them. + These guidelines are intended to help Claude agents use the repository consistently. diff --git a/README.md b/README.md index 0ce4681..4c2d792 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ pip install -e ".[all]" ## Quick Start > **Note:** -> For a deeper understanding of how vowel handles fixtures, see the examples in [`db_fixture.yml`](./db_fixture.yml) and [`db.py`](./db.py). These files demonstrate the underlying mechanics of fixture setup and usage. +> For a deeper understanding of how vowel handles fixtures, see the examples in [`examples/db_fixtures`](./examples/db_fixtures/). These examples demonstrate the underlying mechanics of fixture setup and usage. > **Tip:** > To enable YAML schema validation in your editor, place `vowel-schema.json` in your project directory. @@ -122,6 +122,8 @@ summary = ( summary.print() ``` +> **Name matching note:** If your YAML uses `module.function`, programmatic mappings can use either the exact key (`module.function`) or the short function name (`function`) in `.with_functions(...)`. 
+ --- ## Features @@ -181,6 +183,29 @@ def query_user(user_id: int, *, db: dict) -> dict | None: return db["users"].get(user_id) ``` +Fixture scope aliases: +- Preferred scope names: `case`, `eval`, `file` +- Backward-compatible aliases: `function`, `module`, `session` +- Normalization mapping: `case -> function`, `eval -> module`, `file -> session` + +Example: + +```yaml +fixtures: + temp_data: + setup: myapp.make_temp_data + scope: case + + db: + setup: myapp.setup_db + teardown: myapp.close_db + scope: eval + + cache: + setup: myapp.setup_cache + scope: file +``` + > **Full reference:** [docs/FIXTURES.md](https://github.com/fswair/vowel/blob/main/docs/FIXTURES.md) ### Input Serializers @@ -196,6 +221,26 @@ summary = ( ) ``` +> **Serializer key matching:** Serializer mappings follow the same rule as `.with_functions(...)` — both `module.function` and short `function` keys are accepted. + +> **Assertion context and serializers:** When a serializer is configured, assertion evaluators use the serialized value for `input` (not raw YAML). This applies to schema mode, `serial_fn`, and nested/dict schemas. + +Runnable example (YAML-native serializers + fixtures): + +```bash +vowel examples/serializers/db_query_evals.yml +``` + +This example demonstrates: +- top-level `serializers:` registry with both `schema` and `serializer` entries, +- per-eval `serializer:` references, +- fixture class lifecycle wiring with `cls` + `teardown`, +- assertion checks that read serialized `input` values. 
+ +See: +- `examples/serializers/db_query_evals.yml` +- `examples/serializers/util.py` + > **Full reference:** [docs/SERIALIZERS.md](https://github.com/fswair/vowel/blob/main/docs/SERIALIZERS.md) ### AI-Powered Generation @@ -259,6 +304,9 @@ vowel evals.yml --dry-run # Show plan without running vowel evals.yml --export-json out.json # Export results vowel evals.yml -v # Verbose summary vowel evals.yml -v --hide-report # Verbose, hide pydantic_evals report +vowel schema examples/serializers/db_query_evals.yml # Validate + update schema header +vowel schema --create # Generate vowel-schema.json +vowel costs --list # List tracked generation/run costs ``` > **Full reference:** [docs/CLI.md](https://github.com/fswair/vowel/blob/main/docs/CLI.md) diff --git a/VERSION b/VERSION index 09e9157..60a2d3e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.5 \ No newline at end of file +0.4.0 \ No newline at end of file diff --git a/costs.yml b/costs.yml new file mode 100644 index 0000000..6b02e8a --- /dev/null +++ b/costs.yml @@ -0,0 +1,409 @@ +models: +- amazon-nova-micro: + cached_input_per_million: null + input_per_million: 0.035 + output_per_million: 0.14 +- amazon-nova-lite: + cached_input_per_million: null + input_per_million: 0.06 + output_per_million: 0.24 +- amazon-nova-pro: + cached_input_per_million: null + input_per_million: 0.8 + output_per_million: 3.2 +- amazon-nova-premier: + cached_input_per_million: null + input_per_million: 2.5 + output_per_million: 12.5 +- claude-3.7-sonnet: + cached_input_per_million: null + input_per_million: 3 + output_per_million: 15 +- claude-3.5-sonnet: + cached_input_per_million: null + input_per_million: 3 + output_per_million: 15 +- claude-3-opus: + cached_input_per_million: null + input_per_million: 15 + output_per_million: 75 +- claude-3-haiku: + cached_input_per_million: null + input_per_million: 0.25 + output_per_million: 1.25 +- claude-3.5-haiku: + cached_input_per_million: null + input_per_million: 0.8 + output_per_million: 
4 +- claude-4.5-haiku: + cached_input_per_million: null + input_per_million: 1 + output_per_million: 5 +- claude-sonnet-4.5: + cached_input_per_million: null + input_per_million: 3 + output_per_million: 15 +- claude-sonnet-4.5-200k: + cached_input_per_million: null + input_per_million: 6 + output_per_million: 22.5 +- claude-opus-4: + cached_input_per_million: null + input_per_million: 15 + output_per_million: 75 +- claude-opus-4-1: + cached_input_per_million: null + input_per_million: 15 + output_per_million: 75 +- claude-opus-4-5: + cached_input_per_million: null + input_per_million: 5 + output_per_million: 25 +- claude-opus-4.6: + cached_input_per_million: null + input_per_million: 5 + output_per_million: 25 +- deepseek-chat: + cached_input_per_million: null + input_per_million: 0.27 + output_per_million: 1.1 +- deepseek-reasoner: + cached_input_per_million: null + input_per_million: 0.55 + output_per_million: 2.19 +- gemini-2.5-pro-preview-03-25: + cached_input_per_million: null + input_per_million: 1.25 + output_per_million: 10 +- gemini-2.5-pro-preview-03-25-200k: + cached_input_per_million: null + input_per_million: 2.5 + output_per_million: 15 +- gemini-2.0-flash-lite: + cached_input_per_million: null + input_per_million: 0.075 + output_per_million: 0.3 +- gemini-2.0-flash: + cached_input_per_million: null + input_per_million: 0.1 + output_per_million: 0.4 +- gemini-1.5-flash: + cached_input_per_million: null + input_per_million: 0.075 + output_per_million: 0.3 +- gemini-1.5-flash-128k: + cached_input_per_million: null + input_per_million: 0.15 + output_per_million: 0.6 +- gemini-1.5-flash-8b: + cached_input_per_million: null + input_per_million: 0.0375 + output_per_million: 0.15 +- gemini-1.5-flash-8b-128k: + cached_input_per_million: null + input_per_million: 0.075 + output_per_million: 0.3 +- gemini-1.5-pro: + cached_input_per_million: null + input_per_million: 1.25 + output_per_million: 5 +- gemini-1.5-pro-128k: + cached_input_per_million: null + 
input_per_million: 2.5 + output_per_million: 10 +- gemini-2.5-flash: + cached_input_per_million: 0.03 + input_per_million: 0.3 + output_per_million: 2.5 +- gemini-2.5-flash-lite: + cached_input_per_million: 0.01 + input_per_million: 0.1 + output_per_million: 0.4 +- gemini-2.5-flash-preview-09-2025: + cached_input_per_million: 0.03 + input_per_million: 0.3 + output_per_million: 2.5 +- gemini-2.5-pro: + cached_input_per_million: 0.125 + input_per_million: 1.25 + output_per_million: 10 +- gemini-2.5-pro-200k: + cached_input_per_million: 0.25 + input_per_million: 2.5 + output_per_million: 15 +- gemini-3-pro-preview: + cached_input_per_million: null + input_per_million: 2 + output_per_million: 12 +- gemini-3-pro-preview-200k: + cached_input_per_million: null + input_per_million: 4 + output_per_million: 18 +- gemini-3-flash-preview: + cached_input_per_million: null + input_per_million: 0.5 + output_per_million: 3 +- gemini-3-1-pro-preview: + cached_input_per_million: null + input_per_million: 2 + output_per_million: 12 +- gemini-3-1-pro-preview-200k: + cached_input_per_million: null + input_per_million: 4 + output_per_million: 18 +- gemini-3.1-flash-lite-preview: + cached_input_per_million: 0.025 + input_per_million: 0.25 + output_per_million: 1.5 +- minimax-m2: + cached_input_per_million: null + input_per_million: 0.3 + output_per_million: 1.2 +- pixtral-12b: + cached_input_per_million: null + input_per_million: 0.15 + output_per_million: 0.15 +- mistral-small-latest: + cached_input_per_million: null + input_per_million: 0.1 + output_per_million: 0.3 +- mistral-medium-2505: + cached_input_per_million: null + input_per_million: 0.4 + output_per_million: 2 +- mistral-nemo: + cached_input_per_million: null + input_per_million: 0.15 + output_per_million: 0.15 +- open-mistral-7b: + cached_input_per_million: null + input_per_million: 0.25 + output_per_million: 0.25 +- open-mixtral-8x7b: + cached_input_per_million: null + input_per_million: 0.7 + output_per_million: 0.7 +- 
open-mixtral-8x22b: + cached_input_per_million: null + input_per_million: 2 + output_per_million: 6 +- mistral-large-latest: + cached_input_per_million: null + input_per_million: 2 + output_per_million: 6 +- pixtral-large-latest: + cached_input_per_million: null + input_per_million: 2 + output_per_million: 6 +- mistral-saba-latest: + cached_input_per_million: null + input_per_million: 0.2 + output_per_million: 0.6 +- codestral-latest: + cached_input_per_million: null + input_per_million: 0.3 + output_per_million: 0.9 +- ministral-8b-latest: + cached_input_per_million: null + input_per_million: 0.1 + output_per_million: 0.1 +- ministral-3b-latest: + cached_input_per_million: null + input_per_million: 0.04 + output_per_million: 0.04 +- magistral-medium-latest: + cached_input_per_million: null + input_per_million: 2 + output_per_million: 5 +- kimi-k2-0905-preview: + cached_input_per_million: 0.15 + input_per_million: 0.6 + output_per_million: 2.5 +- kimi-k2-0711-preview: + cached_input_per_million: 0.15 + input_per_million: 0.6 + output_per_million: 2.5 +- kimi-k2-turbo-preview: + cached_input_per_million: 0.15 + input_per_million: 1.15 + output_per_million: 8.0 +- kimi-k2-thinking: + cached_input_per_million: 0.15 + input_per_million: 0.6 + output_per_million: 2.5 +- kimi-k2-thinking-turbo: + cached_input_per_million: 0.15 + input_per_million: 1.15 + output_per_million: 8.0 +- text-davinci-003: + cached_input_per_million: null + input_per_million: 20 + output_per_million: 20 +- gpt-4.5: + cached_input_per_million: 37.5 + input_per_million: 75 + output_per_million: 150 +- gpt-4o: + cached_input_per_million: 1.25 + input_per_million: 2.5 + output_per_million: 10 +- gpt-4o-mini: + cached_input_per_million: 0.075 + input_per_million: 0.15 + output_per_million: 0.6 +- chatgpt-4o-latest: + cached_input_per_million: null + input_per_million: 5 + output_per_million: 15 +- o1-preview: + cached_input_per_million: 7.5 + input_per_million: 15 + output_per_million: 60 +- o1-pro: 
+ cached_input_per_million: null + input_per_million: 150 + output_per_million: 600 +- o1-mini: + cached_input_per_million: 0.55 + input_per_million: 1.1 + output_per_million: 4.4 +- o3-mini: + cached_input_per_million: 0.55 + input_per_million: 1.1 + output_per_million: 4.4 +- gpt-4.1: + cached_input_per_million: 0.5 + input_per_million: 2 + output_per_million: 8 +- gpt-4.1-mini: + cached_input_per_million: 0.1 + input_per_million: 0.4 + output_per_million: 1.6 +- gpt-4.1-nano: + cached_input_per_million: 0.025 + input_per_million: 0.1 + output_per_million: 0.4 +- o3: + cached_input_per_million: 0.5 + input_per_million: 10 + output_per_million: 40 +- o4-mini: + cached_input_per_million: 0.275 + input_per_million: 1.1 + output_per_million: 4.4 +- gpt-5-nano: + cached_input_per_million: 0.005 + input_per_million: 0.05 + output_per_million: 0.4 +- gpt-5-mini: + cached_input_per_million: 0.025 + input_per_million: 0.25 + output_per_million: 2 +- gpt-5: + cached_input_per_million: 0.125 + input_per_million: 1.25 + output_per_million: 10 +- gpt-image-1: + cached_input_per_million: 1.25 + input_per_million: 10 + output_per_million: 40 +- gpt-image-1-mini: + cached_input_per_million: 0.2 + input_per_million: 2 + output_per_million: 8 +- gpt-5-pro: + cached_input_per_million: null + input_per_million: 15 + output_per_million: 120 +- o3-pro: + cached_input_per_million: null + input_per_million: 20 + output_per_million: 80 +- o4-mini-deep-research: + cached_input_per_million: 0.5 + input_per_million: 2 + output_per_million: 8 +- o3-deep-research: + cached_input_per_million: 2.5 + input_per_million: 10 + output_per_million: 40 +- gpt-5.1-codex-mini: + cached_input_per_million: 0.025 + input_per_million: 0.25 + output_per_million: 2.0 +- gpt-5.1-codex: + cached_input_per_million: 0.125 + input_per_million: 1.25 + output_per_million: 10.0 +- gpt-5.1: + cached_input_per_million: 0.125 + input_per_million: 1.25 + output_per_million: 10.0 +- gpt-5.2: + cached_input_per_million: 
0.175 + input_per_million: 1.75 + output_per_million: 14.0 +- gpt-5.2-pro: + cached_input_per_million: null + input_per_million: 21.0 + output_per_million: 168.0 +- gpt-5.4: + cached_input_per_million: 0.25 + input_per_million: 2.5 + output_per_million: 15.0 +- gpt-5.4-272k: + cached_input_per_million: 0.5 + input_per_million: 5.0 + output_per_million: 22.5 +- gpt-5.4-pro: + cached_input_per_million: null + input_per_million: 30.0 + output_per_million: 180.0 +- gpt-5.4-pro-272k: + cached_input_per_million: null + input_per_million: 60.0 + output_per_million: 270.0 +- grok-3: + cached_input_per_million: 0.75 + input_per_million: 3 + output_per_million: 15 +- grok-3-mini: + cached_input_per_million: 0.075 + input_per_million: 0.3 + output_per_million: 0.5 +- grok-4-fast: + cached_input_per_million: 0.05 + input_per_million: 0.2 + output_per_million: 0.5 +- grok-4: + cached_input_per_million: 0.75 + input_per_million: 3 + output_per_million: 15 +- grok-4-128k: + cached_input_per_million: 0.75 + input_per_million: 6 + output_per_million: 30 +- grok-4-fast: + cached_input_per_million: 0.05 + input_per_million: 0.2 + output_per_million: 0.5 +- grok-4-fast-128k: + cached_input_per_million: 0.05 + input_per_million: 0.4 + output_per_million: 1.0 +- grok-4-fast-reasoning: + cached_input_per_million: 0.05 + input_per_million: 0.2 + output_per_million: 0.5 +- grok-4-fast-reasoning-128k: + cached_input_per_million: 0.05 + input_per_million: 0.4 + output_per_million: 1.0 +- grok-code-fast-1: + cached_input_per_million: 0.02 + input_per_million: 0.2 + output_per_million: 1.5 +- claude-sonnet-4.6: + cached_input_per_million: null + input_per_million: 3 + output_per_million: 15 \ No newline at end of file diff --git a/db_fixture.yml b/db_fixture.yml deleted file mode 100644 index 32fa6a1..0000000 --- a/db_fixture.yml +++ /dev/null @@ -1,32 +0,0 @@ -# yaml-language-server: $schema=vowel-schema.json - -fixtures: - db: - cls: db.Connection # setup cls - kwargs: - db_path: users.db ## 
db.Connection(db_path="users.db") - teardown: db.Connection.close # teardown method (db.Connection.close()) - scope: module # scope - -db.Connection.execute: - fixture: - - db # db.execute(query) -> self is the db fixture instance - evals: - IsDict: - type: list[dict[str, typing.Union[str, int]]] # type assertion for output - dataset: - - case: - input: "SELECT * FROM users WHERE id=1" - assertion: "output and isinstance(output[0], dict)" - - case: - input: "SELECT * FROM notes" # (buggy query - invalid table) - raises: any # if any exception is raised, the case will pass. - match: "no such table" ## no effect, because any exception is accepted - - case: - input: "SELECT * FROM players" # (buggy query - invalid table) - raises: any? # if any exception is raised/returned normally, the case will pass. - match: "no such table" ## no effect, because any exception or normal return is enough - - case: - input: "SELECT * FROM developers" # (buggy query - invalid table) - raises: NoTableError - match: "no such table" ## must match the exception message (case ignored) \ No newline at end of file diff --git a/docs/CLI.md b/docs/CLI.md index 0cdd808..0be46f1 100644 --- a/docs/CLI.md +++ b/docs/CLI.md @@ -74,4 +74,45 @@ vowel evals.yml -v --hide-report # Hide report without verbose (still shows Overall Summary panel) vowel evals.yml --hide-report + +# Validate YAML + refresh schema header +vowel schema evals.yml + +# Generate schema JSON file (default: vowel-schema.json) +vowel schema --create + +# Generate schema JSON at a custom path +vowel schema --create ./schemas/vowel-schema.json + +# Show tracked model costs +vowel costs --list +vowel costs --by-generation +vowel costs --by-run +vowel costs --generation +vowel costs --run ``` + +--- + +## Schema Commands + +Use schema commands to validate specs and keep YAML schema headers in sync. 
+ +| Command | Description | +|--------|-------------| +| `vowel schema ` | Validates YAML and updates the file's schema header safely | +| `vowel schema --create [path]` | Generates `vowel-schema.json` (or writes to custom path) | + +--- + +## Cost Commands + +Use cost commands to inspect generation and run cost history. + +| Command | Description | +|--------|-------------| +| `vowel costs --list` | List all tracked generations and runs | +| `vowel costs --by-generation` | Aggregate totals by generation id | +| `vowel costs --by-run` | Aggregate totals by run id | +| `vowel costs --generation ` | Show detailed rows for one generation | +| `vowel costs --run ` | Show detailed rows for one run | diff --git a/docs/CODEMODE.md b/docs/CODEMODE.md new file mode 100644 index 0000000..91517bd --- /dev/null +++ b/docs/CODEMODE.md @@ -0,0 +1,164 @@ +# CodeMode + +CodeMode is Vowel's exploration-guided evaluation spec generator. + +Instead of generating test specs from description only, CodeMode runs exploration snippets against real function code first, then generates and refines eval specs using verified outputs and observed errors. + +## Pipeline Overview + +CodeMode runs in phases: + +1. Explore behavior +- Generates normal snippets and error snippets. +- Executes snippets against the target function. +- Collects real outputs, exceptions, and timings. + +2. Generate spec +- Builds a spec prompt from verified execution results. +- Produces either YAML text or structured bundle output. + +3. Validate and refine +- Runs generated evals against the function. +- If coverage is below target, builds failure context and retries. +- Repeats up to max refinement rounds. + +4. Optional duration injection +- Measures runtime and injects duration thresholds into cases. + +5. Final summary +- Returns a CodeModeResult with exploration artifacts, final YAML spec, and optional EvalSummary. 
+ +## Core API + +CodeMode class: +- `vowel.codemode.CodeModeGenerator` + +Result type: +- `vowel.codemode.CodeModeResult` + +Main entrypoint: +- `await CodeModeGenerator.generate(...)` + +## Model Configuration + +Constructor model resolution order: + +- `spec_model` argument, else `SPEC_MODEL`, else fallback `model`/`MODEL_NAME` +- `exploration_model` argument, else `EXPLORATION_MODEL`, else fallback `model`/`MODEL_NAME` + +Both models must resolve to non-empty values. + +## Output Modes (`use_model_spec`) + +- `use_model_spec=False` (default) + - Spec agent output type: `EvalsSource` + - Generates YAML string via `yaml_spec` + +- `use_model_spec=True` + - Spec agent output type: `EvalsBundle` + - Generates structured model output first, then can be converted to YAML + +Recommendation used in this repository benchmark flow: +- HIGHLY RECOMMENDED TO KEEP `use_model_spec=False`. + +## Minimal Example + +```python +import asyncio + +from vowel.codemode import CodeModeGenerator +from vowel.runner import Function + +func = Function( + name="flatten", + description="Recursively flatten an arbitrarily nested list.", + code=""" +def flatten(lst: list) -> list: + if not isinstance(lst, list): + raise TypeError(f'Expected list, got {type(lst).__name__}') + out = [] + for item in lst: + if isinstance(item, list): + out.extend(flatten(item)) + else: + out.append(item) + return out +""", +) + +async def main() -> None: + gen = CodeModeGenerator( + spec_model="openrouter:google/gemini-3-flash-preview", + exploration_model="openrouter:google/gemini-3.1-flash-lite-preview", + use_model_spec=False, + ) + + result = await gen.generate( + func, + run_evals=True, + max_refinement_rounds=2, + min_coverage=1.0, + inject_durations=False, + save_to_file=True, + ) + + print(result.yaml_spec) + if result.summary: + result.summary.print() + +asyncio.run(main()) +``` + +## `generate(...)` Parameters + +Important flags in `CodeModeGenerator.generate`: + +- `run_id`: optional run identifier 
for cost tracking +- `run_evals`: run generated spec after generation +- `save_to_file`: write `<function_name>_evals.yml` +- `max_refinement_rounds`: retry/refinement budget +- `min_coverage`: stop threshold (default 1.0) +- `inject_durations`: inject measured duration checks + +## What `CodeModeResult` Contains + +- `exploration_results`: snippet execution results +- `yaml_spec`: final YAML eval spec +- `summary`: EvalSummary when `run_evals=True` +- `refinement_rounds`: number of refinement retries used + +## Benchmark Integration (`python -m codemode_benchmark`) + +Benchmark runner path: +- `codemode_benchmark/run_benchmark.py` + +Typical usage: + +```bash +python -m codemode_benchmark +python -m codemode_benchmark --only flatten group_by +python -m codemode_benchmark --show-config +python -m codemode_benchmark --replay codemode_benchmark/run_20260312_181510 +``` + +If you use the Python launcher (`py`) on your machine: + +```bash +py -m codemode_benchmark +``` + +The benchmark runner compares model pairs (`spec_model`, `exploration_model`) across built-in scenarios and stores artifacts under `codemode_benchmark/run_<timestamp>/`. + +## Troubleshooting + +- Error: spec/exploration model not set + - Set constructor args or env vars (`SPEC_MODEL`, `EXPLORATION_MODEL`, `MODEL_NAME`). + +- Low coverage after generation + - Increase `max_refinement_rounds`. + - Provide clearer function descriptions. + - Check whether the function has non-deterministic behavior. + +- YAML parse/validation failures + - Keep `use_model_spec=False` for YAML-first flow in this repo. + - Let refinement run (`run_evals=True`) so failure context can repair issues. 
diff --git a/docs/FEEDBACK_GUIDED_EXPLORATION.md b/docs/FEEDBACK_GUIDED_EXPLORATION.md new file mode 100644 index 0000000..7aff3af --- /dev/null +++ b/docs/FEEDBACK_GUIDED_EXPLORATION.md @@ -0,0 +1,250 @@ +# Feedback-Guided Exploration + +## The Problem: Single-Shot Exploration is Blind + +Prior to this change, the CodeMode pipeline ran exploration in a single LLM call: + +``` +Function source code → LLM (one call) → N snippets → Execute all → Done +``` + +The LLM never saw execution results during exploration. It generated all snippets based purely on **static reasoning** — reading the source code and inferring what inputs would be interesting. This is "speculation-based exploration." + +This works surprisingly well with strong models. In our benchmark, Claude Opus 4.6 produced 44 snippets for `parse_cron` in a single call and achieved 100% coverage with zero refinements. But the approach has structural limitations that no amount of model intelligence can overcome: + +### What single-shot exploration misses + +**1. Exact error messages** + +The LLM reads a `raise ValueError(...)` statement and guesses the error message. But the actual message depends on runtime state — string interpolation, variable values, branch ordering. Example: + +```python +# LLM expects: +parse_cron('-1 0 1 1 0') → ValueError("minute: -1 out of range 0-59") + +# Reality: +parse_cron('-1 0 1 1 0') → ValueError("invalid literal for int() with base 10: ''") +``` + +The minus sign is consumed by the range parser (`-` is the range delimiter), leaving an empty string that fails `int()` conversion. This is a parsing precedence issue that can only be discovered by execution. + +**2. Input combination explosions** + +For grammar-heavy functions (parsers, validators, DSLs), each syntax element works in isolation, but **combinations** of elements may trigger different code paths. 
Example from cron parsing: + +- `*/15` works (step with wildcard) +- `1-10` works (range) +- `1,5,10` works (comma-separated) +- `1,5-7,*/20` — comma + range + step in one field — was never tried + +The LLM tests each primitive but rarely discovers multi-primitive combinations without seeing prior execution results. + +**3. Error path ordering** + +When a function has multiple validation layers, the order matters: + +```python +# Does step validation happen before or after range validation? +parse_cron('0-60/0 * * * *') +# Could be: "Step must be positive" or "invalid range 0-60" +``` + +Only execution reveals which guard fires first. + +## The Solution: Two-Round Evidence-Based Exploration + +The new pipeline adds a second exploration round that receives **actual execution results** from Round 1: + +``` +Round 1: LLM (static reasoning) → 15-30 snippets → Execute + ↓ + Deterministic cluster summary + ↓ +Round 2: LLM (evidence-based) → 8-12 snippets → Execute + ↓ + Combined results → Spec Generation +``` + +Round 2 sees: +- Every snippet that was tried and its exact output +- A programmatic cluster summary grouping results by behavior class +- An explicit "do not repeat" list + +This transforms exploration from **speculation** into **hypothesis refinement under feedback** — the LLM reasons about what it _hasn't_ seen, informed by what it _has_ seen. + +## Design Decisions + +### Why programmatic clustering (not LLM-based)? + +We considered two approaches for building the cluster summary between rounds: + +| | Programmatic (chosen) | LLM-based | +|---|---|---| +| Cost | Zero — no LLM call | 1 additional call | +| Determinism | Always produces same output for same input | Non-deterministic | +| Speed | Microseconds | Seconds | +| Depth | Surface-level (type + message prefix) | Semantic understanding | + +We chose programmatic clustering because the goal is not "perfect semantic grouping" — it's "sufficient signal to guide Round 2." 
The Round 2 LLM is intelligent enough to infer gaps from a simple type+message summary. Adding a clustering LLM call would introduce cost and non-determinism without proportional benefit. + +### Why exactly 2 rounds (not 3+)? + +Three considerations: + +1. **Diminishing returns**: Round 1 covers ~80-90% of behavior space through static reasoning. Round 2 targets the remaining gaps. A Round 3 would operate on an already-dense behavior map with very few remaining gaps — the ROI drops sharply. + +2. **Reasoning fragmentation**: Strong models like Opus do their best reasoning in large, focused context windows. Splitting reasoning across many small rounds can actually degrade quality. Two rounds is the sweet spot: one large reasoning pass, one targeted refinement. + +3. **Cost predictability**: Fixed 2-round means exactly 2 exploration LLM calls. This is predictable and benchmarkable. Variable rounds (3-5) make cost unpredictable and harder to compare across models. + +The `exploration_rounds` parameter allows override (`=1` restores old behavior, `=3` for complex domains if needed), but the default of 2 is intentional. + +### Why early exit conditions? + +Two conditions can terminate exploration before Round 2 completes: + +1. **No snippets produced**: If the Round 2 LLM returns an empty plan, it believes Round 1 was already comprehensive. Forcing it to produce snippets would yield duplicates. + +2. **No new behavior classes discovered**: After executing Round 2 snippets, we compare behavior keys (`ok:dict`, `err:ValueError:minute: 60 out of range`) between prior and new results. If every new snippet produced a behavior we already had, the exploration space is saturated. 
+ +## Implementation Details + +### Cluster Summary Format + +The `_build_cluster_summary()` method produces a structured text summary: + +```markdown +## Observed Behaviour Clusters + +### Success clusters +- output type `dict`: 18 cases +- output type `bool`: 3 cases +- output type `list`: 1 case + +### Error clusters +- `ValueError` (8 distinct messages): + - "Expected 5 fields, got 3" + - "minute: 60 out of range 0-59" + - "Step must be positive, got -1" + - ... +- `AttributeError` (2 distinct messages): + - "'NoneType' object has no attribute 'strip'" + - "'int' object has no attribute 'strip'" + +### Already tried (25 snippets — do NOT repeat these) +- `parse_cron('* * * * *')` +- `parse_cron('5 14 1 6 3')` +- ... +``` + +This is deterministic, costs zero LLM tokens to produce, and provides exactly the signal Round 2 needs: +- What **output shapes** have been seen (so the LLM can target new ones) +- What **error types and messages** were discovered (so the LLM can find adjacent error paths) +- What **exact code** was already tried (so the LLM won't duplicate) + +### Round 2 Prompt Structure + +The Round 2 prompt includes: + +``` +[function source] — same source code as Round 1 +[prior results] — full execution results (code + output/error for each snippet) +[cluster summary] — the programmatic summary above + +RULES: +- Do NOT repeat any snippet from the "Already tried" list. +- Produce 8–12 NEW normal snippets targeting uncovered behaviour. +- Produce 3–5 NEW error snippets targeting untried error paths. +``` + +The snippet count targets (8-12 normal, 3-5 error) are intentionally smaller than Round 1 (15+ normal, 3+ error). Round 2 is surgical, not broad. 
+ +### Behavior Key Format + +For early exit detection, each result is hashed into a behavior key: + +``` +Success: "ok:{output_type}" → "ok:dict", "ok:bool", "ok:list" +Error: "err:{error_type}:{msg40}" → "err:ValueError:minute: 60 out of range 0-59" +``` + +The message prefix is truncated to 40 characters — enough to distinguish error paths without being sensitive to minor wording variations. + +## Code Changes + +All changes are in `src/vowel/codemode.py`. No new files, no new dependencies. + +### Modified methods + +| Method | Change | +|---|---| +| `explore()` | 2-round loop with early exit; delegates execution to `_execute_plan()` | +| `_get_exploration_plan()` | Unchanged logic, updated docstring and logfire tags | + +### New methods + +| Method | Purpose | +|---|---| +| `_execute_plan()` | Extracted snippet execution loop (reused by both rounds) | +| `_get_targeted_exploration_plan()` | Round 2 prompt with prior results + cluster summary | +| `_build_cluster_summary()` | Programmatic clustering of results into text summary | +| `_count_new_behaviors()` | Compares behavior keys between prior and new results | + +### Backward compatibility + +- `explore(func, exploration_rounds=1)` restores exact single-shot behavior +- Default is `exploration_rounds=2` — existing callers get the improvement automatically +- `generate()` calls `explore()` without arguments, so it automatically benefits +- All 478 existing unit tests pass without modification + +## Expected Impact + +### On strong models (Opus-class) + +- Round 1 already produces excellent coverage +- Round 2 adds **combination discovery** and **exact error message alignment** +- Net: ~10-15% more snippets, potentially fewer spec refinement rounds (error messages will match exactly) + +### On weaker models (Flash/Lite-class) + +- Round 1 produces decent but shallow coverage — misses edge cases +- Round 2 **compensates for weaker static reasoning** by showing actual execution results +- Net: significant quality 
improvement, likely converting some FAIL scenarios to PASS + +### On benchmark discriminability + +With Layer 3 (behavioral discovery) added, benchmarks now measure a higher-order capability: **adaptive reasoning under feedback**. This separates models that can merely read code from models that can learn from execution traces — a much more meaningful distinction for agentic coding systems. + +## Relationship to the Full Pipeline + +The evidence flow through the pipeline is now: + +``` +Round 1 (speculation) → snippets → execute → results + ↓ +Round 2 (evidence-based) → snippets → execute → results + ↓ + all exploration results + ↓ +Spec Generation ← VerifiedExecutionResults + ErrorResults + ↓ + YAML eval spec + ↓ +Validation → RunEvals → coverage check + ↓ +Refinement (if needed) ← failure context +``` + +Evidence-based reasoning now starts at the **exploration phase** rather than only at spec generation. Since exploration results feed directly into spec generation, any improvement in exploration quality cascades through the entire downstream pipeline. + +## Origin + +This feature was designed through a three-way analysis between the developer, the implementation agent (GitHub Copilot / Claude Opus 4.6), and ChatGPT. ChatGPT identified the core insight: the pipeline was doing "speculation-based exploration" when it could be doing "evidence-based exploration." The implementation agent confirmed this against the actual codebase, proposed the programmatic clustering approach (Option A) over LLM-based clustering, and implemented the 2-round design. 
+ +The key framing that guided the design: + +``` +Layer 1: Domain awareness (from function description) ✅ already strong +Layer 2: Grammar inference (from source code) ✅ already strong +Layer 3: Behavioral discovery (from runtime feedback) ✅ now added +``` diff --git a/docs/FIXTURES.md b/docs/FIXTURES.md index da93794..5bf91ff 100644 --- a/docs/FIXTURES.md +++ b/docs/FIXTURES.md @@ -82,24 +82,29 @@ Fixtures support three lifecycle scopes (defined in YAML): | Scope | Behavior | |-------|----------| -| `function` (default) | Setup/teardown for **each** test case | -| `module` | Setup once per eval spec, teardown after all cases | -| `session` | Setup once per `run_evals()` call, teardown at end | +| `case` (preferred) / `function` (alias, default) | Setup/teardown for **each** test case | +| `eval` (preferred) / `module` (alias) | Setup once per eval spec, teardown after all cases | +| `file` (preferred) / `session` (alias) | Setup once per `run_evals()` call, teardown at end | + +Alias normalization: +- `case -> function` +- `eval -> module` +- `file -> session` ```yaml fixtures: temp_file: setup: my_fixtures.temp_file - scope: function + scope: case db: setup: my_fixtures.setup_db teardown: my_fixtures.teardown_db - scope: module + scope: eval cache: setup: my_fixtures.setup_cache - scope: session + scope: file ``` --- diff --git a/docs/MONTY_RESEARCH.md b/docs/MONTY_RESEARCH.md new file mode 100644 index 0000000..3742755 --- /dev/null +++ b/docs/MONTY_RESEARCH.md @@ -0,0 +1,984 @@ +# Monty Research Notes + +> Bu doküman, `pydantic-monty` projesinin evalspec ekosistemi (vowel eval generation pipeline'ları ve vowel-optimization) ile entegrasyonu için yapılan araştırmanın özetidir. CodeMode, tüm eval generation pipeline'larında kullanılabilecek genel bir mekanizmadır — optimizasyon bunlardan sadece biridir. + +## 1. Genel Bakış + +**Monty**, Pydantic ekibi tarafından Rust ile yazılmış, minimal ve güvenli bir Python yorumlayıcısıdır. 
Temel amacı: **AI tarafından üretilen kodu güvenli bir sandbox ortamında çalıştırmak.** + +- **Repo:** `pydantic/monty` (GitHub) +- **PyPI paketi:** `pydantic-monty` +- **NPM paketi:** `@pydantic/monty` +- **Lisans:** MIT +- **Dil:** Rust (PyO3 ile Python bindings, napi-rs ile JS bindings) +- **Hedef Python sürümü:** 3.14 + +### Temel Özellikler + +| Özellik | Detay | +|---------|-------| +| Güvenlik | Filesystem, network, env vars tamamen bloklu — sadece kontrollü external function callbacks | +| Başlatma süresi | <0.06ms (~60 mikrosaniye) | +| Performans | CPython'a benzer çalışma hızı | +| Boyut | ~4.5MB download | +| Serileştirme | `dump()`/`load()` ile parsed code ve execution state kaydedilebilir | +| Kaynak limitleri | Süre, bellek, allocation sayısı, recursion derinliği sınırlandırılabilir | +| Tip kontrolü | Opsiyonel statik tip analizi (Monty'nin kendi type checker'ı) | + +## 2. Güvenlik Modeli + +Monty, **untrusted/potentially malicious** kodun çalıştırılması için tasarlanmıştır. Güvenlik garantileri: + +- **Filesystem erişimi YOK** — Sadece `OSAccess` ile kontrollü sanal dosya sistemi +- **Network erişimi YOK** — Socket, HTTP vb. hiçbir ağ işlemi yapılamaz +- **Ortam değişkenleri YOK** — `os.environ`, `os.getenv` yalnızca host callback ile +- **Subprocess/shell YOK** — `os.system`, `subprocess` vb. yok +- **Import sistemi kısıtlı** — Sadece izin verilen modüller (sys, typing, asyncio) +- **C FFI yok** — Tamamen Rust ile implement edilmiş, unsafe yok + +Tüm dış dünya erişimi **external functions** mekanizması üzerinden olur — host tarafı bu fonksiyonları sağlar, sandbox kodu bunları çağırır, host gerçek işlemi yapar ve sonucu sandbox'a döndürür. + +## 3. Python API + +### 3.1. Kurulum + +```bash +pip install pydantic-monty +``` + +### 3.2. 
Temel Kullanım + +```python +import pydantic_monty + +# Basit ifade çalıştırma +m = pydantic_monty.Monty('1 + 2 * 3') +result = m.run() # -> 7 + +# Input değişkenleri ile +m = pydantic_monty.Monty('x + y', inputs=['x', 'y']) +result = m.run(inputs={"x": 10, "y": 20}) # -> 30 + +# Aynı parsed code farklı girdilerle tekrar çalıştırılabilir +result2 = m.run(inputs={"x": 100, "y": 200}) # -> 300 +``` + +### 3.3. `Monty` Sınıfı — Constructor + +```python +pydantic_monty.Monty( + code: str, # Çalıştırılacak Python kodu + *, + script_name: str = 'main.py', # Traceback'lerde görünecek isim + inputs: list[str] | None = None, # Kod içinde kullanılabilecek input değişken isimleri + external_functions: list[str] | None = None, # Kod içinden çağrılabilecek harici fonksiyon isimleri + type_check: bool = False, # Statik tip kontrolü yapılsın mı + type_check_stubs: str | None = None, # Tip kontrolü için ek stub tanımları + dataclass_registry: list[type] | None = None, # Dataclass tip kayıtları +) +``` + +**Raises:** +- `MontySyntaxError` — Kod parse edilemezse +- `MontyTypingError` — `type_check=True` ise ve tip hataları varsa + +### 3.4. `Monty.run()` — Senkron Çalıştırma + +```python +m.run( + *, + inputs: dict[str, Any] | None = None, # Input değerleri + limits: ResourceLimits | None = None, # Kaynak limitleri + external_functions: dict[str, Callable[..., Any]] | None = None, # Harici fonksiyon implementasyonları + print_callback: Callable[[Literal['stdout'], str], None] | None = None, # print() çıktısı callback + os: Callable[[OsFunction, tuple[Any, ...]], Any] | None = None, # OS erişimi callback +) -> Any +``` + +**Önemli:** GIL serbest bırakılır — paralel çalıştırma mümkün. + +### 3.5. External Functions (Harici Fonksiyonlar) + +Bu, Monty'nin en güçlü mekanizmasıdır. Sandbox kodu bir fonksiyon çağırdığında, çalışma durur, host taraftaki gerçek Python fonksiyonu çalışır ve sonuç sandbox'a döndürülür. 
+ +```python +# Sandbox kodunda "fetch" fonksiyonu çağrılabilir +m = pydantic_monty.Monty( + 'fetch("https://example.com")', + external_functions=['fetch'] +) + +# Host tarafında gerçek implementasyon +def fetch(url: str) -> str: + return f'Fetched: {url}' + +result = m.run(external_functions={"fetch": fetch}) +# -> "Fetched: https://example.com" +``` + +**Kritik nokta:** External fonksiyonlar host ortamında çalışır — yani hedef fonksiyonun stdlib, third-party lib, dosya sistemi vb. kullanması sorun olmaz. Monty sadece orkestrasyonu yapar. + +### 3.6. İteratif Çalıştırma (start/resume) + +External fonksiyon çağrılarında adım adım kontrol sağlar: + +```python +m = pydantic_monty.Monty( + 'result = fetch(url)', + inputs=['url'], + external_functions=['fetch'] +) + +# Çalıştırmayı başlat +progress = m.start(inputs={"url": "https://example.com"}) + +if isinstance(progress, pydantic_monty.MontySnapshot): + # Bir external function çağrısında durdu + print(progress.function_name) # -> "fetch" + print(progress.args) # -> ("https://example.com",) + print(progress.kwargs) # -> {} + + # Sonucu döndürerek devam et + progress = progress.resume(return_value="response data") + +if isinstance(progress, pydantic_monty.MontyComplete): + print(progress.output) # -> Son ifadenin değeri +``` + +**İlerleme tipleri:** +- `MontySnapshot` — External function çağrısı bekliyor +- `MontyFutureSnapshot` — Birden fazla async future bekliyor +- `MontyComplete` — Çalışma tamamlandı, `.output` ile sonuç alınır + +### 3.7. Asenkron Çalıştırma + +```python +async def main(): + m = pydantic_monty.Monty( + 'await fetch(url)', + inputs=['url'], + external_functions=['fetch'] + ) + + async def real_fetch(url: str) -> str: + async with httpx.AsyncClient() as client: + r = await client.get(url) + return r.text + + result = await pydantic_monty.run_monty_async( + m, + inputs={"url": "https://example.com"}, + external_functions={"fetch": real_fetch}, + ) +``` + +### 3.8. 
REPL Modu + +Durum korunarak ardışık kod parçaları çalıştırılabilir: + +```python +repl, output = pydantic_monty.MontyRepl.create('x = 10', inputs=['x']) +# output = 10 (veya None — son ifadenin değeri) + +result1 = repl.feed('x + 5') # -> 15 +result2 = repl.feed('x * 2') # -> 20 +# x hâlâ 10, önceki state korunur +``` + +### 3.9. Kaynak Limitleri (ResourceLimits) + +```python +limits = pydantic_monty.ResourceLimits( + max_duration_secs=5.0, # Maksimum çalışma süresi (saniye) + max_memory=1024 * 1024, # Maksimum heap bellek (byte) — 1MB + max_allocations=10000, # Maksimum heap allocation sayısı + max_recursion_depth=1000, # Maksimum recursion derinliği (default: 1000) + gc_interval=100, # Her N allocation'da GC çalıştır +) + +m = pydantic_monty.Monty('fib(30)', external_functions=['fib']) +result = m.run( + external_functions={"fib": my_fib}, + limits=limits, +) +``` + +### 3.10. Serileştirme (dump/load) + +Parsed code veya çalışma durumu (snapshot) kaydedilebilir: + +```python +# Parsed code'u kaydet +m = pydantic_monty.Monty('x + 1', inputs=['x']) +data = m.dump() # -> bytes + +# Daha sonra geri yükle (parse maliyeti sıfır) +m2 = pydantic_monty.Monty.load(data) +result = m2.run(inputs={"x": 41}) # -> 42 + +# Snapshot'ü da kaydedebilirsin +progress = m.start(inputs={"x": 10}) +if isinstance(progress, pydantic_monty.MontySnapshot): + snapshot_data = progress.dump() # -> bytes + # Farklı process'te bile geri yüklenebilir + restored = pydantic_monty.MontySnapshot.load(snapshot_data) +``` + +### 3.11. Sanal Dosya Sistemi (OSAccess) + +```python +from pydantic_monty import OSAccess, MemoryFile, CallbackFile + +# Bellekte sanal dosyalar oluştur +fs = OSAccess([ + MemoryFile('/data/input.csv', content='col1,col2\n1,2\n3,4'), + MemoryFile('/data/config.json', content='{"key": "value"}'), +]) + +# Sandbox kodunda Path.read_text() vb. 
kullanılabilir +m = pydantic_monty.Monty(""" +from pathlib import Path +data = Path('/data/input.csv').read_text() +data.split('\\n') +""") + +result = await pydantic_monty.run_monty_async(m, os=fs) +``` + +### 3.12. Tip Kontrolü + +```python +# Opsiyonel statik analiz +m = pydantic_monty.Monty( + 'x + "hello"', + inputs=['x'], + type_check=True, + type_check_stubs='x: int', # Input tiplerini belirt +) +# MontyTypingError fırlatabilir + +# Hata formatları +try: + m.type_check(prefix_code='x: int') +except pydantic_monty.MontyTypingError as e: + print(e.display(format='full', color=True)) + # format seçenekleri: 'full', 'concise', 'azure', 'json', 'jsonlines', + # 'rdjson', 'pylint', 'gitlab', 'github' +``` + +## 4. Hata Tipleri + +``` +MontyError (base) +├── MontySyntaxError — Parse hataları +├── MontyRuntimeError — Çalışma zamanı hataları (ZeroDivisionError, ValueError vb.) +└── MontyTypingError — Statik tip analizi hataları +``` + +### MontyRuntimeError Detayları + +```python +try: + m = pydantic_monty.Monty('1 / 0') + m.run() +except pydantic_monty.MontyRuntimeError as e: + # İç exception'a eriş + inner = e.exception() # -> ZeroDivisionError instance + + # Traceback al + frames = e.traceback() # -> list[Frame] + for frame in frames: + print(f" {frame.filename}:{frame.line}:{frame.column} in {frame.function_name}") + print(f" {frame.source_line}") + + # Formatlanmış çıktı + print(e.display(format='traceback')) # Full traceback + print(e.display(format='type-msg')) # "ZeroDivisionError: division by zero" + print(e.display(format='msg')) # "division by zero" +``` + +**ÖNEMLİ:** Monty, Python exception'larını birebir eşleştirir. `ZeroDivisionError`, `ValueError`, `TypeError` vb. host tarafında doğru exception tipleri olarak yakalanabilir. + +## 5. Dil Destekleri ve Kısıtlamalar + +### 5.1. 
Desteklenen Python Deyimleri (Statements) + +Kaynak: `crates/monty/src/expressions.rs` — `Node` ve `Expr` enum'ları + +| Deyim | Notlar | +|-------|--------| +| `x = expr` | Basit atama | +| `x, y = expr` | Tuple unpacking (iç içe dahil: `(a, b), c = ...`) | +| `first, *rest = expr` | Starred unpacking | +| `x += expr` (augmented assigns) | `+=`, `-=`, `*=`, `/=`, `//=`, `%=`, `**=`, `&=`, `\|=`, `^=`, `<<=`, `>>=` | +| `obj[i] = val` | Subscript assignment | +| `obj.attr = val` | Attribute assignment (dataclass alanları) | +| `if / elif / else` | Tam destekli | +| `for target in iter` | `else` bloğu dahil | +| `while test` | `else` bloğu dahil | +| `break` | ✅ | +| `continue` | ✅ | +| `return` / `return expr` | ✅ | +| `raise` / `raise Exception(...)` | ✅ | +| `try / except / else / finally` | Tam hiyerarşi destekli, çoklu `except` | +| `assert test, msg` | ✅ | +| `pass` | ✅ | +| `def func(...)` | `async def` dahil | +| `global x` | ✅ | +| `nonlocal x` | ✅ | +| `import sys` | Sadece whitelist'teki modüller | +| `from typing import X` | Sadece whitelist'teki modüller | +| `del` | ❌ Henüz yok | +| `class MyClass:` | ❌ Henüz yok | +| `match x:` | ❌ Desteklenmiyor | +| `with ... as ...:` | ❌ Henüz yok | + +### 5.2. 
Desteklenen İfadeler (Expressions) + +| İfade | Notlar | +|-------|--------| +| Literaller | `int`, `float`, `str`, `bytes`, `bool`, `None`, `...` | +| Büyük int'ler | `2**200` gibi i64 aşan değerler (arbitrary precision) | +| f-string | `f"hello {name!r}"` — format spec dahil | +| Aritmetik | `+`, `-`, `*`, `/`, `//`, `%`, `**` | +| Bitwise | `&`, `\|`, `^`, `~`, `<<`, `>>` | +| Karşılaştırma | `==`, `!=`, `<`, `<=`, `>`, `>=`, `is`, `is not`, `in`, `not in` | +| Zincirleme karşılaştırma | `a < b < c` — kısa devre değerlendirmeli | +| Boolean | `and`, `or`, `not` | +| Unary | `-x`, `+x`, `~x` | +| Ternary | `x if cond else y` | +| Walrus | `(x := expr)` | +| `await expr` | Modül seviyesinde de kullanılabilir (Jupyter tarzı) | +| List/dict/set literali | `[1,2]`, `{k:v}`, `{1,2}` | +| List/set/dict comprehension | `[x for x in iter if cond]` | +| Generator expression | `(x for x in iter)` | +| Lambda | `lambda x, y: x + y` | +| Subscript | `obj[i]`, `obj[a:b:c]` | +| Slice | `obj[::2]` | +| Attribute erişimi | `obj.attr` (zincirli dahil) | +| Fonksiyon çağrısı | `f(a, b, *args, key=val, **kwargs)` | +| Method çağrısı | `obj.method(args)` | +| `isinstance(obj, Type)` | ✅ | + +### 5.3. Desteklenen Yerleşik Tipler (Built-in Types) + +``` +bool int float str bytes +list tuple dict set frozenset +range slice iter +type property +``` + +Ayrıca: +- `None`, `True`, `False`, `...` (Ellipsis) +- `LongInt` — arbitrarily large integers +- `NamedTuple` — `collections.namedtuple` benzeri (built-in desteği var) +- `Dataclass` — `@dataclass` decorator'ı ile (host'tan registry ile) +- `pathlib.Path` — `from pathlib import Path` ile + +### 5.4. 
Desteklenen Builtin Fonksiyonlar + +Kaynak: `crates/monty/src/builtins/mod.rs` — `BuiltinsFunctions` enum'u + +**Mevcut (✅):** +``` +abs() all() any() bin() chr() +divmod() enumerate() filter() getattr() hash() +hex() id() isinstance() len() map() +max() min() next() oct() ord() +pow() print() repr() reversed() round() +sorted() sum() type() zip() +``` + +**Henüz yok / yorum satırı (❌):** +``` +aiter() anext() ascii() breakpoint() +callable() compile() dir() eval() +exec() format() globals() hasattr() +help() input() issubclass() iter() [kısmen] +locals() open() setattr() staticmethod() +classmethod() super() vars() __import__() +``` + +**Type constructor olarak kullanılabilenler:** +``` +bool() int() float() str() bytes() +list() tuple() dict() set() frozenset() +range() slice() iter() type() property() +``` + +**Exception constructor'ları:** +``` +Exception BaseException SystemExit KeyboardInterrupt +ArithmeticError OverflowError ZeroDivisionError +LookupError IndexError KeyError +RuntimeError NotImplementedError RecursionError +AttributeError FrozenInstanceError +NameError UnboundLocalError +ValueError UnicodeDecodeError +ImportError ModuleNotFoundError +OSError FileNotFoundError FileExistsError +IsADirectoryError NotADirectoryError +AssertionError MemoryError StopIteration +SyntaxError TimeoutError TypeError +``` + +### 5.5. Desteklenen Stdlib Modülleri + +#### `sys` +```python +import sys +sys.version # "3.14.0 (Monty)" +sys.version_info # named tuple: (major=3, minor=14, micro=0, ...) 
+sys.platform # "monty" +sys.stdout # marker (gerçek I/O yok) +sys.stderr # marker (gerçek I/O yok) +``` + +#### `typing` +```python +from typing import ( + TYPE_CHECKING, # her zaman False + Any, Optional, Union, List, Dict, Tuple, Set, + FrozenSet, Callable, Type, Sequence, Mapping, + Iterable, Iterator, Generator, ClassVar, + Final, Literal, TypeVar, Generic, Protocol, + Annotated, Self, Never, NoReturn +) +``` +Bunlar runtime'da `Marker` değerleri olarak işlenir — tip anotasyonlarda kullanılabilirler. + +#### `asyncio` +```python +import asyncio +asyncio.run(coro) # await coro ile eşdeğer +asyncio.gather(*coros) # Eşzamanlı birden fazla coroutine çalıştırma +# create_task, sleep, wait vb. → YOK +``` + +#### `os` +```python +import os +os.getenv("KEY", default=None) # host callback üzerinden +os.environ # host callback üzerinden dict döner +# os.path, os.listdir, os.system vb. → YOK +``` + +#### `pathlib` +```python +from pathlib import Path +p = Path("/data/file.txt") + +# Pure methods (I/O gerektirmez — doğrudan çalışır): +p.name # "file.txt" +p.stem # "file" +p.suffix # ".txt" +p.suffixes # [".txt"] +p.parent # Path("/data") +p.parts # ["/", "data", "file.txt"] +p / "subdir" # Path birleştirme (/ operatörü) +str(p) # "/data/file.txt" + +# Filesystem methods (OSAccess host callback gerektirir): +p.exists() read_text() read_bytes() +p.is_file() write_text() write_bytes() +p.is_dir() mkdir() unlink() +p.is_symlink() rmdir() iterdir() +p.stat() rename() resolve() +p.absolute() +``` + +### 5.6. 
Tip Metodları — Detay + +#### `str` metodları +``` +capitalize casefold center count encode +endswith find index isalnum isalpha +isascii isdecimal isdigit isidentifier islower +isnumeric isspace istitle isupper join +ljust lower lstrip partition removeprefix +removesuffix replace rfind rindex rjust +rpartition rsplit rstrip split splitlines +startswith strip swapcase title upper zfill +``` +Ayrıca: `+` (concat), `*` (repeat), `in` (contains), `[]` (index/slice), `len()`, `str()` constructor + +#### `list` metodları +``` +append clear copy count extend index insert pop remove reverse sort +``` +Ayrıca: `+`, `*`, `in`, `[]`, `len()`, comprehension, unpacking + +#### `dict` metodları +``` +clear copy fromkeys get items keys pop popitem setdefault update values +``` +Ayrıca: `in`, `[]`, `len()`, comprehension + +#### `set` / `frozenset` metodları +``` +add clear copy difference discard intersection isdisjoint +issubset issuperset pop remove symmetric_difference union update +``` +Ayrıca: `|`, `&`, `-`, `^` operatörleri + +#### `tuple` metodları +``` +count index +``` +Ayrıca: `+`, `*`, `in`, `[]`, `len()`, unpacking + +#### `bytes` metodları +``` +capitalize center count decode endswith +find fromhex hex index isalnum +isalpha isascii isdigit islower isspace +istitle isupper join ljust lower +lstrip partition removeprefix removesuffix replace +rfind rindex rjust rpartition rsplit +rstrip split splitlines startswith strip +swapcase title upper zfill +``` + +#### `int` metodları +``` +bit_length bit_count to_bytes from_bytes +``` +Ayrıca: tüm aritmetik ve bitwise operatörler + +#### `range` +``` +range(stop) +range(start, stop) +range(start, stop, step) +``` +Iteration, `in`, `len()`, `list(range(...))` desteklenir. + +### 5.7. 
Desteklenmeyen Özellikler + +| Özellik | Durum | +|---------|-------| +| **`class` tanımı** | ❌ Henüz yok (geliyor) | +| **`match` / `case`** | ❌ Planlanmamış | +| **`with` / bağlam yöneticisi** | ❌ Henüz yok | +| **`del` deyimi** | ❌ Henüz yok | +| **`yield from`** | ❌ Henüz yok | +| **`*args` spread in comprehension** | ⚠️ Kısıtlı | +| **`eval()`, `exec()`** | ❌ Hiçbir zaman olmayacak | +| **`__import__`** | ❌ Hiçbir zaman olmayacak | +| **Third-party kütüphaneler** | ❌ Sandbox içinde kullanılamaz | +| **`json` modülü** | ❌ Henüz yok (geliyor) | +| **`dataclasses` modülü (import)** | ❌ Henüz yok; dataclass desteği var ama host'tan | +| **`collections`, `itertools`, `math`** | ❌ Yok | +| **`re` (regex)** | ❌ Yok | +| **`datetime`** | ❌ Yok | +| **`functools`** | ❌ Yok | +| **`enum`** | ❌ Yok | +| **Decorator'lar** | ⚠️ Sadece basit fonksiyon decorator'ları | +| **`super()`** | ❌ Yok | +| **`classmethod`, `staticmethod`** | ❌ Yok | + +## 6. Mimari (Dahili) + +- **Parser:** Ruff'un `ruff_python_parser`'ı kullanılır → AST üretilir +- **Prepare phase:** AST'den Scope analizi yapılır, isimler namespace index'lerine çözümlenir +- **Bytecode:** Hazırlanan AST doğrudan bytecode VM'e beslenir (CPython benzeri register VM) +- **Bellek:** Manuel reference counting (`drop_with_heap`, `clone_with_heap`); GC configurable intervals ile +- **Serileştirme:** `serde` ile binary format (parsed code + snapshot) + +### Crate yapısı + +| Crate | İçerik | +|-------|--------| +| `crates/monty/` | Çekirdek interpreter (VM, types, builtins, modules) | +| `crates/monty-python/` | PyO3 Python bindings | +| `crates/monty-js/` | napi-rs JavaScript bindings | +| `crates/monty-cli/` | CLI aracı | +| `crates/monty-type-checking/` | Statik tip analizi | +| `crates/monty-typeshed/` | Tip stub dosyaları (vendor + custom) | +| `crates/fuzz/` | Fuzzing testleri | + +### Modül whitelist + +`import` ifadesi sadece şu modülleri yükleyebilir (kaynak: `modules/mod.rs`): + +``` +sys typing asyncio pathlib 
os +``` + +Başka herhangi bir `import X` → `ModuleNotFoundError`. + +## 7. PydanticAI Entegrasyonu + +Monty, PydanticAI'de **CodeMode** özelliğini güçlendirecek şekilde tasarlanmıştır. LLM sıralı tool çağrıları yapmak yerine, tool'ları fonksiyon olarak çağıran Python kodu yazar ve Monty bunu güvenli şekilde çalıştırır. + +```python +from pydantic_ai import Agent +from pydantic_ai.toolsets.code_mode import CodeModeToolset +from pydantic_ai.toolsets.function import FunctionToolset + +# Araçları tanımla +tools = FunctionToolset() + +@tools.tool +async def get_weather(location: str) -> dict: + ... + +# Agent'ı CodeMode ile oluştur +agent = Agent( + 'anthropic:claude-sonnet-4-5', + toolsets=[CodeModeToolset(tools)], # Monty-powered code execution +) + +# Agent Python kodu yazarak tool'ları çağırır +result = await agent.run("Compare weather in London and Paris") +``` + +## 8. Alternatiflere Karşı Pozisyon + +| Tech | Dil Tamamlığı | Güvenlik | Başlatma | Maliyet | +|------|---------------|----------|----------|---------| +| **Monty** | Kısmi | Katı | 0.06ms | Ücretsiz/OSS | +| Docker | Tam | İyi | 195ms | Ücretsiz/OSS | +| Pyodide | Tam | Zayıf | 2800ms | Ücretsiz/OSS | +| starlark-rust | Çok kısıtlı | İyi | 1.7ms | Ücretsiz/OSS | +| WASI/Wasmer | Neredeyse tam | Katı | 66ms | Ücretsiz* | +| Sandboxing servisi (E2B, Modal) | Tam | Katı | 1033ms | Ücretli | +| YOLO Python (exec) | Tam | Yok | 0.1ms | Ücretsiz/OSS | + +**Monty'nin avantajları:** En düşük başlatma süresi + katı güvenlik + kolay kurulum + serileştirme desteği. + +## 9. Eval Generation İçin Kullanım Senaryosu + +### Problem + +Eval generation pipeline'larında (hem tek seferlik generation hem de optimization döngüsünde) LLM agent expected değerleri **tahmin ediyor** — bu özellikle algoritmik fonksiyonlarda halüsinasyona yol açar (ör. `binary_search([1,3,5,7], 5)` için yanlış index döndürme).
+ +### Çözüm: CodeMode Eval Generation + +CodeMode, **tüm eval generation pipeline'larında** kullanılabilecek genel bir mekanizmadır. Agent expected değerleri tahmin etmek yerine, Monty sandbox'ında **gerçek fonksiyonu çalıştırarak** ground-truth değerleri elde eder. + +**Kullanım alanları:** +- **Tek seferlik eval generation** — `vowel` CLI veya API ile bir fonksiyon için eval dosyası üretirken +- **Optimization döngüsü** — GEPA ile prompt optimize ederken her iterasyonda (burada özellikle etkili çünkü yüzlerce eval üretiliyor) +- **CI/CD pipeline'ları** — Otomatik test üretimi akışlarında +- **Herhangi bir eval generation çağrısı** — CodeMode, pipeline'dan bağımsız bir altyapı katmanıdır + +### Temel Mimari + +``` +┌─────────────────────────────────────────────────────────┐ +│ LLM Agent │ +│ "Bu fonksiyon için ilginç test girdileri tasarla" │ +│ │ +│ Agent üretir: │ +│ inputs = [ │ +│ {"x": [1,3,5,7,9], "target": 5}, │ +│ {"x": [], "target": 1}, │ +│ {"x": [1], "target": 1}, │ +│ ] │ +└──────────────────────┬──────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Monty Sandbox │ +│ │ +│ # Agent tarafından üretilen test harness │ +│ results = [] │ +│ results.append(target_func([1,3,5,7,9], 5)) │ +│ results.append(target_func([], 1)) │ +│ results.append(target_func([1], 1)) │ +│ results │ +│ │ +│ external_functions = {"target_func": real_function} │ +│ limits = ResourceLimits(max_duration_secs=5.0) │ +└──────────────────────┬──────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Ground-Truth Sonuçlar │ +│ │ +│ results = [2, -1, 0] ← gerçek fonksiyon çıktıları │ +│ │ +│ Bu değerler YAML eval dosyasındaki expected alanına │ +│ yazılır — hallüsinasyon riski sıfır. │ +└─────────────────────────────────────────────────────────┘ +``` + +### Neden External Function Mekanizması Kritik? + +Hedef fonksiyon (ör. 
`binary_search`) şunları kullanabilir: +- Stdlib modülleri (`collections`, `itertools`, `math` vb.) +- Third-party kütüphaneler (`numpy`, `pandas` vb.) +- Dosya sistemi, network vb. + +Monty sandbox'ı bunların hiçbirini desteklemez. **AMA** external function olarak inject edildiğinde, `target_func(...)` çağrısı host tarafındaki gerçek Python fonksiyonunu çalıştırır — yani tüm bağımlılıklar sorunsuz çalışır. + +### ExecutorAdapter Protokolü (Taslak) + +```python +from dataclasses import dataclass +from typing import Any, Protocol + +@dataclass +class ExecutionResult: + """Sandbox çalıştırma sonucu.""" + output: Any # Kodun döndürdüğü değer + stdout: str # print() çıktısı + success: bool # Hatasız tamamlandı mı + error: str | None = None # Hata mesajı (varsa) + error_type: str | None = None # Hata tipi (ör. "ValueError") + duration_ms: float = 0.0 # Çalışma süresi + +class ExecutorAdapter(Protocol): + """Kod çalıştırma adaptör protokolü.""" + async def execute( + self, + code: str, + *, + target_function: callable | None = None, + inputs: dict[str, Any] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, # 10MB + ) -> ExecutionResult: ... 
+ +class MontyExecutor: + """Monty tabanlı güvenli kod çalıştırıcı.""" + + def __init__(self): + import pydantic_monty + self._monty = pydantic_monty + + async def execute( + self, + code: str, + *, + target_function: callable | None = None, + inputs: dict[str, Any] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + import time + + stdout_lines: list[str] = [] + + def print_callback(stream: str, text: str): + stdout_lines.append(text) + + # External function listesi oluştur + ext_names = ["target_func"] if target_function else [] + ext_impls = {"target_func": target_function} if target_function else {} + + # Input isimleri + input_names = list(inputs.keys()) if inputs else [] + + try: + m = self._monty.Monty( + code, + inputs=input_names or None, + external_functions=ext_names or None, + ) + + limits = self._monty.ResourceLimits( + max_duration_secs=timeout, + max_memory=max_memory, + ) + + start = time.perf_counter() + result = m.run( + inputs=inputs, + limits=limits, + external_functions=ext_impls, + print_callback=print_callback, + ) + duration = (time.perf_counter() - start) * 1000 + + return ExecutionResult( + output=result, + stdout="\n".join(stdout_lines), + success=True, + duration_ms=duration, + ) + + except self._monty.MontyRuntimeError as e: + inner = e.exception() + return ExecutionResult( + output=None, + stdout="\n".join(stdout_lines), + success=False, + error=str(e), + error_type=type(inner).__name__, + duration_ms=0.0, + ) + except self._monty.MontySyntaxError as e: + return ExecutionResult( + output=None, + stdout="", + success=False, + error=str(e), + error_type="SyntaxError", + duration_ms=0.0, + ) +``` + +### BuiltinExecutor (Geliştirme/Fallback) + +```python +class BuiltinExecutor: + """exec() tabanlı çalıştırıcı — sadece güvenilir kodlar için.""" + + async def execute( + self, + code: str, + *, + target_function: callable | None = None, + inputs: dict[str, Any] | None = None, + timeout: float 
= 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + import io, contextlib, time + + namespace = dict(inputs or {}) + if target_function: + namespace["target_func"] = target_function + + stdout = io.StringIO() + start = time.perf_counter() + + try: + with contextlib.redirect_stdout(stdout): + exec(code, namespace) + duration = (time.perf_counter() - start) * 1000 + + # Son ifadenin değerini al (eğer varsa) + result = namespace.get("__result__", namespace.get("results")) + + return ExecutionResult( + output=result, + stdout=stdout.getvalue(), + success=True, + duration_ms=duration, + ) + except Exception as e: + return ExecutionResult( + output=None, + stdout=stdout.getvalue(), + success=False, + error=str(e), + error_type=type(e).__name__, + duration_ms=0.0, + ) +``` + +## 10. Entegrasyon Tasarım Kararları + +### Açık Sorular + +1. **Agent kodu nasıl üretecek?** + - Seçenek A: Agent sadece input listesi üretir, harness kodu otomatik oluşturulur + - Seçenek B: Agent tam test harness kodunu yazar (daha esnek ama hata riski daha yüksek) + - Seçenek C: Hibrit — Agent input + beklenen davranış tanımlar, edge case'ler için raises testi de yazabilir + +2. **Exception test etme nasıl olacak?** + - `raises` assertion'ları için agent'ın exception beklediğini belirtmesi gerekir + - Monty'de try/except destekleniyor, agent try/except yazarak exception tipini yakalayabilir + +3. **Mevcut pipeline ile entegrasyon noktası neresi?** + - `task.py`'daki `generate_and_score()` akışında, agent YAML ürettikten sonra expected değerleri doğrulamak için Monty kullanılabilir + - Veya: Agent doğrudan Monty ile çalışan bir "CodeMode" prompt ile yönlendirilir + +4. **Performans etkisi?** + - Monty başlatma: ~0.06ms + - Her test case çalıştırma: fonksiyonun karmaşıklığına bağlı (host'ta çalışır) + - 25 fonksiyon × 20 test case = 500 çalıştırma → toplam <1 saniye ek maliyet + +5.
**Hangi fonksiyonlar CodeMode'a uygun?** + - Deterministik fonksiyonlar (aynı input → aynı output): ✅ İdeal + - Yan etkili fonksiyonlar (dosya yazma, API çağrısı): ⚠️ Dikkatli olunmalı + - Rastgele çıktılı fonksiyonlar: ❌ Uygun değil (expected value sabitlenmeli) + +### Kısıtlamalar ve Çözümler + +| Kısıtlama | Etki | Çözüm | +|-----------|------|-------| +| Class tanımı yok | Agent class kullanamaz | Fonksiyon + dict / NamedTuple kullan | +| `json` modülü yok | String serialization zor | Host'a external function olarak delege et | +| `match` statement yok | Pattern matching yok | if/elif zincirleri kullan | +| `with` statement yok | Context manager yok | İstisnai durum; hedef fonksiyon host'ta çalışır | +| `math`, `collections`, `itertools` yok | Sandbox içi hesaplama kısıtlı | Tüm asıl hesaplama host fonksiyonunda yapılır | +| Sadece 5 modül import edilebilir | `sys`, `typing`, `asyncio`, `pathlib`, `os` | Yeterli — sandbox kodu sadece orkestrasyon yapıyor | + +**En kritik çözüm:** Sandbox kodunun amacı karmaşık hesaplama yapmak değil — **sadece test girdilerini organize edip hedef fonksiyonu çağırmak**. Asıl hesaplama external function (hedef fonksiyon) içinde, host tarafında yapılır. + +## 11. Örnek: Tam Çalışma Akışı + +```python +# 1. Hedef fonksiyon (test edilecek) +def binary_search(arr: list[int], target: int) -> int: + lo, hi = 0, len(arr) - 1 + while lo <= hi: + mid = (lo + hi) // 2 + if arr[mid] == target: + return mid + elif arr[mid] < target: + lo = mid + 1 + else: + hi = mid - 1 + return -1 + +# 2. 
Agent'ın ürettiği Monty kodu +agent_code = """ +results = [] + +# Normal cases +results.append({"input": {"arr": [1,3,5,7,9], "target": 5}, "expected": target_func([1,3,5,7,9], 5)}) +results.append({"input": {"arr": [1,3,5,7,9], "target": 1}, "expected": target_func([1,3,5,7,9], 1)}) +results.append({"input": {"arr": [1,3,5,7,9], "target": 9}, "expected": target_func([1,3,5,7,9], 9)}) + +# Not found +results.append({"input": {"arr": [1,3,5,7,9], "target": 4}, "expected": target_func([1,3,5,7,9], 4)}) + +# Edge cases +results.append({"input": {"arr": [], "target": 1}, "expected": target_func([], 1)}) +results.append({"input": {"arr": [1], "target": 1}, "expected": target_func([1], 1)}) +results.append({"input": {"arr": [1], "target": 2}, "expected": target_func([1], 2)}) + +results +""" + +# 3. Monty'de çalıştır +import pydantic_monty + +m = pydantic_monty.Monty( + agent_code, + external_functions=["target_func"], +) + +results = m.run( + external_functions={"target_func": binary_search}, + limits=pydantic_monty.ResourceLimits(max_duration_secs=5.0), +) + +# 4. Sonuç: Ground-truth expected değerlerle test case'ler +# results = [ +# {"input": {"arr": [1,3,5,7,9], "target": 5}, "expected": 2}, +# {"input": {"arr": [1,3,5,7,9], "target": 1}, "expected": 0}, +# {"input": {"arr": [1,3,5,7,9], "target": 9}, "expected": 4}, +# {"input": {"arr": [1,3,5,7,9], "target": 4}, "expected": -1}, +# {"input": {"arr": [], "target": 1}, "expected": -1}, +# {"input": {"arr": [1], "target": 1}, "expected": 0}, +# {"input": {"arr": [1], "target": 2}, "expected": -1}, +# ] +``` + +**Hiçbir expected değer halüsine edilmedi — hepsi gerçek fonksiyon çıktısı.** + +## 12. Sonraki Adımlar + +1. ~~Monty API'yi tam anla~~ ✅ +2. `ExecutorAdapter` protokolünü finalize et +3. `MontyExecutor` implementasyonunu yaz +4. `task.py`'ye CodeMode akışını entegre et +5. Agent prompt'unu CodeMode için güncelle +6. 25 referans fonksiyon üzerinde test et +7.
Mevcut "tahmin" modu ile CodeMode'u karşılaştır (A/B) diff --git a/docs/README.md b/docs/README.md index ff567e3..c328c08 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,6 +8,7 @@ Welcome to the Vowel framework documentation. |----------|-------------| | [User Guide](./USERGUIDE.md) | Complete guide to using Vowel | | [API Reference](./API.md) | Detailed API documentation | +| [CodeMode](./CODEMODE.md) | Exploration-guided spec generation pipeline and benchmark usage | ## Quick Links @@ -16,3 +17,4 @@ Welcome to the Vowel framework documentation. - **Evaluators**: See [User Guide - Evaluators](./USERGUIDE.md#evaluators) - **RunEvals API**: See [API - RunEvals](./API.md#runevals) - **EvalGenerator**: See [API - EvalGenerator](./API.md#evalgenerator) +- **CodeMode**: See [CodeMode Guide](./CODEMODE.md) diff --git a/docs/SERIALIZERS.md b/docs/SERIALIZERS.md index de92302..7def43c 100644 --- a/docs/SERIALIZERS.md +++ b/docs/SERIALIZERS.md @@ -58,6 +58,52 @@ summary = ( ) ``` +> Key matching note: If YAML eval ids use `module.function`, both programmatic maps accept either the exact id (`module.function`) or short name (`function`) keys in `.with_functions(...)`, `.with_serializer(...)`, and `serial_fn={...}`. + +> Assertion context note: When a serializer is active, assertion evaluators see the serialized `input` value (not raw YAML payload). +> +> - Schema mode: `input` is the model/callable output. +> - Serial fn mode: `input` is whatever `serial_fn` returns (single value, tuple, or dict). +> - Dict/nested schema mode: `input` contains per-parameter serialized values. + +--- + +## YAML-Native Serializer Registry + +You can define serializers directly in YAML and reference them per eval. 
+ +```yaml +serializers: + query_schema: + schema: examples.serializers.util.Query + query_serial_fn: + serializer: examples.serializers.util.query_from_payload + +examples.serializers.util.query_users: + serializer: query_schema + dataset: + - case: + input: + sql: "SELECT name FROM users WHERE age > ?" + params: [30] + +examples.serializers.util.query_users_custom: + serializer: query_serial_fn + dataset: + - case: + input: "SELECT COUNT(*) AS total FROM users" +``` + +One-of rule for each serializer registry entry: +- use `schema` or `serializer` +- do not define both in the same entry + +Runnable example: + +```bash +vowel examples/serializers/db_query_evals.yml +``` + --- ## Advanced Examples @@ -89,6 +135,9 @@ summary = ( .with_serializer({"process": {"user": User, "config": Config}}) .run() ) + +# Assertions can access serialized nested values +# assertion: "input['user'].email.endswith('@a.com') and input['config'].timeout == 30" ``` ### Custom Parsing Logic diff --git a/docs/YAML_SPEC.md b/docs/YAML_SPEC.md index f467ba6..b7ded00 100644 --- a/docs/YAML_SPEC.md +++ b/docs/YAML_SPEC.md @@ -10,7 +10,8 @@ fixtures: fixture_name: setup: module.setup_func # Import path to setup function teardown: module.teardown_func # Import path to teardown (optional) - scope: function # function | module | session + scope: case # preferred: case | eval | file + # aliases: function | module | session kwargs: # Keyword arguments for setup function (optional) key: value @@ -116,18 +117,18 @@ fixtures: db: setup: myapp.fixtures.setup_db teardown: myapp.fixtures.close_db - scope: module # Created once, shared across all cases + scope: eval # Created once, shared across all cases params: db_name: test_db cache: setup: myapp.fixtures.setup_cache - scope: session # Created once per run_evals call + scope: file # Created once per run_evals call temp_dir: setup: myapp.fixtures.create_temp_dir teardown: myapp.fixtures.remove_temp_dir - scope: function # Created fresh for each case 
(default) + scope: case # Created fresh for each case (default) # Function depends on 'db' fixture query_user: @@ -185,9 +186,14 @@ summary = ( ``` **Fixture scopes:** -- `function` (default): Setup/teardown for **each** test case -- `module`: Setup once per eval spec, teardown after all cases -- `session`: Setup once per `run_evals()` call, teardown at end +- Preferred names: + - `case` (default): Setup/teardown for **each** test case + - `eval`: Setup once per eval spec, teardown after all cases + - `file`: Setup once per `run_evals()` call, teardown at end +- Backward-compatible aliases: + - `function` = `case` + - `module` = `eval` + - `session` = `file` > See [FIXTURES.md](./FIXTURES.md) for the complete fixture guide including Python API patterns. diff --git a/examples/basic_usage/evals.yml b/examples/basic_usage/evals.yml index a7a9d94..c184afc 100644 --- a/examples/basic_usage/evals.yml +++ b/examples/basic_usage/evals.yml @@ -6,9 +6,11 @@ greet: dataset: - case: + id: greet_world input: "World" expected: "Hello, World!" - case: + id: greet_alice input: "Alice" expected: "Hello, Alice!" 
@@ -16,12 +18,15 @@ greet: add: dataset: - case: + id: add_positive_pair inputs: { x: 1, y: 2 } expected: 3 - case: + id: add_zero_sum inputs: { x: -5, y: 5 } expected: 0 - case: + id: add_large_values inputs: { x: 100, y: 200 } expected: 300 @@ -29,12 +34,15 @@ add: multiply: dataset: - case: + id: multiply_basic_product inputs: [3, 4] expected: 12 - case: + id: multiply_zero_factor inputs: [0, 999] expected: 0 - case: + id: multiply_negative_product inputs: [-2, 5] expected: -10 @@ -42,15 +50,19 @@ multiply: factorial: dataset: - case: + id: factorial_zero_base input: 0 expected: 1 - case: + id: factorial_five input: 5 expected: 120 - case: + id: factorial_ten input: 10 expected: 3628800 - case: + id: factorial_negative_raises input: -1 raises: ValueError match: "non-negative" @@ -59,12 +71,15 @@ factorial: is_even: dataset: - case: + id: is_even_four input: 4 expected: true - case: + id: is_even_seven input: 7 expected: false - case: + id: is_even_zero input: 0 expected: true @@ -72,9 +87,11 @@ is_even: len: dataset: - case: + id: len_list_three input: [1, 2, 3] expected: 3 - case: + id: len_string_hello input: "hello" expected: 5 @@ -82,14 +99,17 @@ len: math.sqrt: dataset: - case: + id: sqrt_16 input: 16 expected: 4.0 - case: + id: sqrt_9 input: 9 expected: 3.0 os.path.join: dataset: - case: + id: join_home_user inputs: ["/home", "user"] expected: "/home/user" diff --git a/db.py b/examples/db_fixtures/db.py similarity index 95% rename from db.py rename to examples/db_fixtures/db.py index 078f78b..332aa8d 100644 --- a/db.py +++ b/examples/db_fixtures/db.py @@ -15,8 +15,10 @@ import logfire +from vowel.monitoring import enable_monitoring + # enable observability (optional) -# logfire.configure(service_name="db-fixture") +enable_monitoring(service_name="db-fixture") class NoTableError(Exception): diff --git a/examples/evals/builtins.yml b/examples/evals/builtins.yml index 46e1146..fbf388c 100644 --- a/examples/evals/builtins.yml +++ b/examples/evals/builtins.yml 
@@ -3,53 +3,55 @@ len: dataset: - - case: { input: [1, 2, 3], expected: 3 } - - case: { input: "hello", expected: 5 } - - case: { input: [], expected: 0 } + - case: { id: len_list_three, input: [1, 2, 3], expected: 3 } + - case: { id: len_string_hello, input: "hello", expected: 5 } + - case: { id: len_empty_list, input: [], expected: 0 } abs: dataset: - - case: { input: -7, expected: 7 } - - case: { input: 0, expected: 0 } - - case: { input: 42, expected: 42 } + - case: { id: abs_negative, input: -7, expected: 7 } + - case: { id: abs_zero, input: 0, expected: 0 } + - case: { id: abs_positive, input: 42, expected: 42 } sorted: dataset: - - case: { input: [3, 1, 2], expected: [1, 2, 3] } - - case: { input: [5, 5, 5], expected: [5, 5, 5] } - - case: { input: [], expected: [] } + - case: { id: sorted_unsorted_numbers, input: [3, 1, 2], expected: [1, 2, 3] } + - case: { id: sorted_all_equal, input: [5, 5, 5], expected: [5, 5, 5] } + - case: { id: sorted_empty_list, input: [], expected: [] } sum: dataset: - - case: { input: [1, 2, 3], expected: 6 } - - case: { input: [], expected: 0 } + - case: { id: sum_simple_list, input: [1, 2, 3], expected: 6 } + - case: { id: sum_empty_list, input: [], expected: 0 } min: dataset: - - case: { input: [3, 1, 2], expected: 1 } - - case: { input: [99], expected: 99 } + - case: { id: min_list_values, input: [3, 1, 2], expected: 1 } + - case: { id: min_singleton, input: [99], expected: 99 } max: dataset: - - case: { input: [3, 1, 2], expected: 3 } + - case: { id: max_list_values, input: [3, 1, 2], expected: 3 } math.sqrt: dataset: - - case: { input: 16, expected: 4.0 } - - case: { input: 9, expected: 3.0 } - - case: { input: 0, expected: 0.0 } + - case: { id: sqrt_16, input: 16, expected: 4.0 } + - case: { id: sqrt_9, input: 9, expected: 3.0 } + - case: { id: sqrt_0, input: 0, expected: 0.0 } math.factorial: dataset: - - case: { input: 0, expected: 1 } - - case: { input: 5, expected: 120 } - - case: { input: 10, expected: 3628800 } + - 
case: { id: factorial_0, input: 0, expected: 1 } + - case: { id: factorial_5, input: 5, expected: 120 } + - case: { id: factorial_10, input: 10, expected: 3628800 } os.path.join: dataset: - case: + id: join_two_parts inputs: ["/home", "user"] expected: "/home/user" - case: + id: join_three_parts inputs: ["/var", "log", "app.log"] expected: "/var/log/app.log" diff --git a/examples/evals/math.yml b/examples/evals/math.yml index 05c8447..d0730b7 100644 --- a/examples/evals/math.yml +++ b/examples/evals/math.yml @@ -6,11 +6,12 @@ examples.evals.functions.fibonacci: IsInt: type: int dataset: - - case: { input: 0, expected: 0 } - - case: { input: 1, expected: 1 } - - case: { input: 10, expected: 55 } - - case: { input: 20, expected: 6765 } + - case: { id: fib_0, input: 0, expected: 0 } + - case: { id: fib_1, input: 1, expected: 1 } + - case: { id: fib_10, input: 10, expected: 55 } + - case: { id: fib_20, input: 20, expected: 6765 } - case: + id: fib_negative_raises input: -1 raises: ValueError match: "non-negative" @@ -24,10 +25,11 @@ examples.evals.functions.calculate_bmi: CorrectFormula: assertion: "abs(output - input[0] / (input[1] ** 2)) < 0.1" dataset: - - case: { inputs: [70.0, 1.75], expected: 22.86 } - - case: { inputs: [85.0, 1.80], expected: 26.23 } - - case: { inputs: [60.0, 1.65], expected: 22.04 } + - case: { id: bmi_normal_weight, inputs: [70.0, 1.75], expected: 22.86 } + - case: { id: bmi_overweight_range, inputs: [85.0, 1.80], expected: 26.23 } + - case: { id: bmi_light_weight, inputs: [60.0, 1.65], expected: 22.04 } - case: + id: bmi_zero_weight_raises inputs: [0.0, 1.70] raises: ValueError match: "positive" @@ -39,7 +41,7 @@ examples.evals.functions.clamp: WithinBounds: assertion: "input[1] <= output <= input[2]" dataset: - - case: { inputs: [5, 0, 10], expected: 5 } - - case: { inputs: [-5, 0, 10], expected: 0 } - - case: { inputs: [99, 0, 10], expected: 10 } - - case: { inputs: [0, 0, 0], expected: 0 } + - case: { id: clamp_within_bounds, inputs: [5, 
0, 10], expected: 5 } + - case: { id: clamp_below_min, inputs: [-5, 0, 10], expected: 0 } + - case: { id: clamp_above_max, inputs: [99, 0, 10], expected: 10 } + - case: { id: clamp_equal_bounds, inputs: [0, 0, 0], expected: 0 } diff --git a/examples/evals/strings.yml b/examples/evals/strings.yml index 07fb578..8866040 100644 --- a/examples/evals/strings.yml +++ b/examples/evals/strings.yml @@ -6,12 +6,12 @@ examples.evals.functions.is_palindrome: IsBool: type: bool dataset: - - case: { input: "racecar", expected: true } - - case: { input: "hello", expected: false } - - case: { input: "A man a plan a canal Panama", expected: true } - - case: { input: "Was it a rat I saw", expected: true } - - case: { input: "12321", expected: true } - - case: { input: "", expected: true } + - case: { id: palindrome_racecar, input: "racecar", expected: true } + - case: { id: palindrome_hello_false, input: "hello", expected: false } + - case: { id: palindrome_phrase_panama, input: "A man a plan a canal Panama", expected: true } + - case: { id: palindrome_phrase_rat, input: "Was it a rat I saw", expected: true } + - case: { id: palindrome_numeric, input: "12321", expected: true } + - case: { id: palindrome_empty_string, input: "", expected: true } examples.evals.functions.count_words: evals: @@ -20,10 +20,10 @@ examples.evals.functions.count_words: NonNegative: assertion: "output >= 0" dataset: - - case: { input: "Hello world from Python", expected: 4 } - - case: { input: "Single", expected: 1 } - - case: { input: "", expected: 0 } - - case: { input: " spaces ", expected: 1 } + - case: { id: count_words_sentence, input: "Hello world from Python", expected: 4 } + - case: { id: count_words_single, input: "Single", expected: 1 } + - case: { id: count_words_empty, input: "", expected: 0 } + - case: { id: count_words_trim_spaces, input: " spaces ", expected: 1 } examples.evals.functions.get_file_extension: evals: @@ -32,11 +32,11 @@ examples.evals.functions.get_file_extension: 
LowercaseOnly: pattern: "^[a-z0-9]*$" dataset: - - case: { input: "document.txt", expected: "txt" } - - case: { input: "image.PNG", expected: "png" } - - case: { input: "archive.tar.gz", expected: "gz" } - - case: { input: "noextension", expected: "" } - - case: { input: "script.py", expected: "py" } + - case: { id: ext_txt, input: "document.txt", expected: "txt" } + - case: { id: ext_png_uppercase, input: "image.PNG", expected: "png" } + - case: { id: ext_multi_dot_gz, input: "archive.tar.gz", expected: "gz" } + - case: { id: ext_no_extension, input: "noextension", expected: "" } + - case: { id: ext_py, input: "script.py", expected: "py" } examples.evals.functions.extract_hashtags: evals: @@ -46,11 +46,14 @@ examples.evals.functions.extract_hashtags: assertion: "all(tag.startswith('#') for tag in output) if output else True" dataset: - case: + id: hashtags_two_tags input: "Learning #python and #coding today!" expected: ["#python", "#coding"] - case: + id: hashtags_none input: "No hashtags here" expected: [] - case: + id: hashtags_three_tags input: "#AI #ML #DL" expected: ["#AI", "#ML", "#DL"] diff --git a/examples/evals/validation.yml b/examples/evals/validation.yml index 6cf9dd3..6a19217 100644 --- a/examples/evals/validation.yml +++ b/examples/evals/validation.yml @@ -7,11 +7,11 @@ examples.evals.functions.validate_email: type: bool strict: true dataset: - - case: { input: "user@example.com", expected: true } - - case: { input: "invalid.email", expected: false } - - case: { input: "test@domain.co.uk", expected: true } - - case: { input: "@nodomain.com", expected: false } - - case: { input: "spaces @mail.com", expected: false } + - case: { id: email_valid_basic, input: "user@example.com", expected: true } + - case: { id: email_invalid_missing_at, input: "invalid.email", expected: false } + - case: { id: email_valid_subdomain, input: "test@domain.co.uk", expected: true } + - case: { id: email_invalid_missing_user, input: "@nodomain.com", expected: false } + - 
case: { id: email_invalid_with_space, input: "spaces @mail.com", expected: false } examples.evals.functions.classify_age_group: evals: @@ -26,22 +26,23 @@ examples.evals.functions.classify_age_group: (18 <= input < 65 and output == 'adult') or\ (input >= 65 and output == 'senior') dataset: - - case: { input: 5, expected: "child" } - - case: { input: 15, expected: "teenager" } - - case: { input: 30, expected: "adult" } - - case: { input: 70, expected: "senior" } - - case: { input: 12, expected: "child" } - - case: { input: 18, expected: "adult" } - - case: { input: 65, expected: "senior" } + - case: { id: age_5_child, input: 5, expected: "child" } + - case: { id: age_15_teenager, input: 15, expected: "teenager" } + - case: { id: age_30_adult, input: 30, expected: "adult" } + - case: { id: age_70_senior, input: 70, expected: "senior" } + - case: { id: age_12_child_boundary, input: 12, expected: "child" } + - case: { id: age_18_adult_boundary, input: 18, expected: "adult" } + - case: { id: age_65_senior_boundary, input: 65, expected: "senior" } examples.evals.functions.format_phone: evals: PhoneFormat: pattern: "^\\(\\d{3}\\) \\d{3}-\\d{4}$" dataset: - - case: { input: "5551234567", expected: "(555) 123-4567" } - - case: { input: "2129876543", expected: "(212) 987-6543" } + - case: { id: phone_valid_555, input: "5551234567", expected: "(555) 123-4567" } + - case: { id: phone_valid_212, input: "2129876543", expected: "(212) 987-6543" } - case: + id: phone_short_raises input: "123" raises: ValueError match: "10 digits" @@ -52,11 +53,14 @@ examples.evals.functions.parse_json: type: dict dataset: - case: + id: json_valid_object input: '{"key": "value", "n": 42}' expected: { key: "value", n: 42 } - case: + id: json_invalid_returns_empty input: "invalid json" expected: {} - case: + id: json_nested_object input: '{"nested": {"ok": true}}' expected: { nested: { ok: true } } diff --git a/examples/evaluators/evals.yml b/examples/evaluators/evals.yml index 3b90ddc..7e85c64 
100644 --- a/examples/evaluators/evals.yml +++ b/examples/evaluators/evals.yml @@ -11,12 +11,15 @@ validate_email: strict: true dataset: - case: + id: email_valid_user_example input: "user@example.com" expected: true - case: + id: email_invalid_missing_at input: "invalid.email" expected: false - case: + id: email_invalid_missing_user input: "@nodomain.com" expected: false @@ -31,12 +34,15 @@ calculate_discount: assertion: "output <= input[0]" dataset: - case: + id: discount_20_percent inputs: [100.0, 20.0] expected: 80.0 - case: + id: discount_half_price inputs: [50.0, 50.0] expected: 25.0 - case: + id: discount_zero_percent inputs: [200.0, 0.0] expected: 200.0 @@ -48,9 +54,11 @@ format_phone: pattern: "^\\(\\d{3}\\) \\d{3}-\\d{4}$" dataset: - case: + id: phone_555_format input: "5551234567" expected: "(555) 123-4567" - case: + id: phone_212_format input: "2129876543" expected: "(212) 987-6543" @@ -64,12 +72,15 @@ fibonacci: duration: 0.01 dataset: - case: + id: fib_0 input: 0 expected: 0 - case: + id: fib_10 input: 10 expected: 55 - case: + id: fib_20 input: 20 expected: 6765 @@ -83,11 +94,13 @@ extract_hashtags: assertion: "all(tag.startswith('#') for tag in output) if output else True" dataset: - case: + id: hashtags_two input: "Learning #python and #coding today!" 
expected: - "#python" - "#coding" - case: + id: hashtags_none input: "No hashtags here" expected: [] @@ -106,12 +119,12 @@ classify_age_group: (18 <= input < 65 and output == 'adult') or\ (input >= 65 and output == 'senior') dataset: - - case: { input: 5, expected: "child" } - - case: { input: 15, expected: "teenager" } - - case: { input: 30, expected: "adult" } - - case: { input: 70, expected: "senior" } - - case: { input: 12, expected: "child" } - - case: { input: 18, expected: "adult" } + - case: { id: age_5_child, input: 5, expected: "child" } + - case: { id: age_15_teenager, input: 15, expected: "teenager" } + - case: { id: age_30_adult, input: 30, expected: "adult" } + - case: { id: age_70_senior, input: 70, expected: "senior" } + - case: { id: age_12_child_boundary, input: 12, expected: "child" } + - case: { id: age_18_adult_boundary, input: 18, expected: "adult" } # ─── Raises (Exception Testing) ────────────────────────────── # Verify that specific exceptions are raised with optional message matching. 
@@ -125,12 +138,15 @@ calculate_bmi: assertion: "abs(output - input[0] / (input[1] ** 2)) < 0.1" dataset: - case: + id: bmi_normal_weight inputs: [70.0, 1.75] expected: 22.86 - case: + id: bmi_overweight_range inputs: [85.0, 1.80] expected: 26.23 - case: + id: bmi_zero_weight_raises inputs: [0.0, 1.70] raises: ValueError match: "positive" diff --git a/examples/fixtures/evals.yml b/examples/fixtures/evals.yml index 683f9d9..952aa0a 100644 --- a/examples/fixtures/evals.yml +++ b/examples/fixtures/evals.yml @@ -12,9 +12,11 @@ write_and_count: - tmp dataset: - case: + id: write_count_hello_world input: "Hello World" expected: 11 - case: + id: write_count_test input: "Test" expected: 4 @@ -23,9 +25,11 @@ count_users: - db dataset: - case: + id: count_users_alice input: "Alice" expected: 2 - case: + id: count_users_bob input: "Bob" expected: 2 @@ -34,5 +38,6 @@ add_with_bonus: - config dataset: - case: + id: add_with_bonus_basic inputs: { a: 1, b: 2 } expected: 13 diff --git a/examples/fluent_api/evals.yml b/examples/fluent_api/evals.yml index 64448cf..94618c6 100644 --- a/examples/fluent_api/evals.yml +++ b/examples/fluent_api/evals.yml @@ -5,23 +5,23 @@ double: IsInt: type: int dataset: - - case: { input: 5, expected: 10 } - - case: { input: 0, expected: 0 } - - case: { input: -4, expected: -8 } + - case: { id: double_positive, input: 5, expected: 10 } + - case: { id: double_zero, input: 0, expected: 0 } + - case: { id: double_negative, input: -4, expected: -8 } triple: dataset: - - case: { input: 3, expected: 9 } - - case: { input: -1, expected: -3 } + - case: { id: triple_positive, input: 3, expected: 9 } + - case: { id: triple_negative, input: -1, expected: -3 } reverse: evals: IsString: type: str dataset: - - case: { input: "hello", expected: "olleh" } - - case: { input: "abcba", expected: "abcba" } - - case: { input: "", expected: "" } + - case: { id: reverse_hello, input: "hello", expected: "olleh" } + - case: { id: reverse_palindrome, input: "abcba", expected: 
"abcba" } + - case: { id: reverse_empty, input: "", expected: "" } fizzbuzz: evals: @@ -30,8 +30,8 @@ fizzbuzz: ValidOutput: pattern: "^(Fizz|Buzz|FizzBuzz|\\d+)$" dataset: - - case: { input: 1, expected: "1" } - - case: { input: 3, expected: "Fizz" } - - case: { input: 5, expected: "Buzz" } - - case: { input: 15, expected: "FizzBuzz" } - - case: { input: 7, expected: "7" } + - case: { id: fizzbuzz_1, input: 1, expected: "1" } + - case: { id: fizzbuzz_3, input: 3, expected: "Fizz" } + - case: { id: fizzbuzz_5, input: 5, expected: "Buzz" } + - case: { id: fizzbuzz_15, input: 15, expected: "FizzBuzz" } + - case: { id: fizzbuzz_7, input: 7, expected: "7" } diff --git a/examples/serializers/__init__.py b/examples/serializers/__init__.py new file mode 100644 index 0000000..8fbeb8d --- /dev/null +++ b/examples/serializers/__init__.py @@ -0,0 +1 @@ +"""Native YAML serializer + fixture example package.""" diff --git a/examples/serializers/app.py b/examples/serializers/app.py new file mode 100644 index 0000000..6b05f15 --- /dev/null +++ b/examples/serializers/app.py @@ -0,0 +1,18 @@ +"""Functions under test for native serializer + fixture example.""" + +from __future__ import annotations + +from typing import Any + +from .defn import Query +from .fixtures import DbConnection + + +def query_users(query: Query, *, db: DbConnection) -> list[dict[str, Any]]: + """Schema mode example: input dict -> Query model via serializer schema.""" + return db.execute_query(query) + + +def query_users_custom(query: Query, *, db: DbConnection) -> list[dict[str, Any]]: + """serial_fn mode example: raw payload -> Query via custom serializer function.""" + return db.execute_query(query) diff --git a/examples/serializers/db_query_evals.yml b/examples/serializers/db_query_evals.yml new file mode 100644 index 0000000..9cafce2 --- /dev/null +++ b/examples/serializers/db_query_evals.yml @@ -0,0 +1,53 @@ +# yaml-language-server: $schema=../../vowel-schema.json + + +serializers: + query_schema: + 
schema: examples.serializers.defn.Query + query_serial_fn: + serializer: examples.serializers.defn.query_from_payload + +fixtures: + db: + cls: examples.serializers.fixtures.DbConnection + kwargs: + db_path: ":memory:" + teardown: examples.serializers.fixtures.DbConnection.close + scope: module + +examples.serializers.app.query_users: + fixture: + - db + serializer: query_schema + evals: + ReturnsRows: + type: list[dict[str, typing.Any]] + CheckSqlIsNotEmpty: + assertion: "input.sql is not None" + dataset: + - case: + id: by_age_threshold + input: + sql: "SELECT name FROM users WHERE age > ? ORDER BY age" + params: [30] + assertion: "output == [{'name': 'Bob'}, {'name': 'Cara'}]" + - case: + id: invalid_table_raises + input: + sql: "SELECT * FROM ghost_table" + params: [] + raises: any + +examples.serializers.app.query_users_custom: + fixture: + - db + serializer: query_serial_fn + evals: + ReturnsRows: + type: list[dict[str, typing.Any]] + dataset: + - case: + id: count_users_from_text + input: "SELECT COUNT(*) AS total FROM users" + expected: + - {total: 3} diff --git a/examples/serializers/defn.py b/examples/serializers/defn.py new file mode 100644 index 0000000..b0013c3 --- /dev/null +++ b/examples/serializers/defn.py @@ -0,0 +1,41 @@ +"""Serializer models and helpers for the native YAML serializer example.""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + + +class Query(BaseModel): + """Simple SQL query payload used by example evals.""" + + sql: str + params: list[Any] = Field(default_factory=list) + + +def query_from_payload(payload: dict[str, Any]) -> Query: + """serial_fn mode example for YAML-native serializer registry. 
+ + Accepts both: + - {"input": "SELECT ..."} + - {"input": {"sql": "SELECT ...", "params": [...]}} + """ + + value = payload.get("input") + if value is None: + value = payload.get("inputs") + + if isinstance(value, str): + return Query(sql=value) + + if isinstance(value, dict): + sql = value.get("sql") + params = value.get("params", []) + if not isinstance(sql, str): + raise ValueError("Expected 'sql' to be a string in query payload") + if not isinstance(params, list): + raise ValueError("Expected 'params' to be a list in query payload") + return Query(sql=sql, params=params) + + raise ValueError("Unsupported query payload format") diff --git a/examples/serializers/fixtures.py b/examples/serializers/fixtures.py new file mode 100644 index 0000000..d917f25 --- /dev/null +++ b/examples/serializers/fixtures.py @@ -0,0 +1,39 @@ +"""Fixture utilities for the native YAML serializer example.""" + +from __future__ import annotations + +import sqlite3 +from typing import Any + +from .defn import Query + + +class DbConnection: + """Tiny sqlite fixture class used by vowel fixture injection.""" + + def __init__(self, db_path: str = ":memory:"): + # Vowel can execute cases in worker threads; allow sqlite usage across them. 
+ self.conn = sqlite3.connect(db_path, check_same_thread=False) + self.conn.row_factory = sqlite3.Row + self._seed() + + def _seed(self) -> None: + cur = self.conn.cursor() + cur.execute( + "CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY, name TEXT, age INTEGER)" + ) + cur.execute("DELETE FROM users") + cur.executemany( + "INSERT INTO users (name, age) VALUES (?, ?)", + [("Alice", 28), ("Bob", 34), ("Cara", 41)], + ) + self.conn.commit() + + def execute_query(self, query: Query) -> list[dict[str, Any]]: + cur = self.conn.cursor() + cur.execute(query.sql, query.params) + rows = cur.fetchall() + return [dict(row) for row in rows] + + def close(self) -> None: + self.conn.close() diff --git a/examples/serializers/util.py b/examples/serializers/util.py new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml index 3e5db9e..09a7072 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "vowel" -version = "0.3.5" +version = "0.4.0" description = "A modular evaluation framework for testing functions with YAML-based specifications" readme = "README.md" requires-python = ">=3.10" @@ -55,7 +55,9 @@ logfire = [ ] monty = [ - "pydantic-monty>=0.0.7" + "pydantic-monty>=0.0.8" + # require at least the known-working version + # because of the MontyRepl changes introduced in version 0.0.8 ] optimize = [ "vowel-optimization" @@ -84,7 +86,7 @@ target-version = ["py311"] [tool.ruff] line-length = 100 target-version = "py311" -exclude = ["vowel-optimization"] +exclude = ["vowel-optimization", "benchmark_v1"] [tool.ruff.lint] select = [ @@ -118,7 +120,7 @@ ignore_missing_imports = true python-version = "3.11" [tool.ty.src] -exclude = ["vowel-optimization"] +exclude = ["vowel-optimization", "benchmark_v1"] [tool.ty.rules] unresolved-import = "ignore" @@ -142,3 +144,8 @@ markers = [ "integration: integration tests", "llm: tests that require LLM API calls", ] + +[tool.uv.workspace] +members = [ + 
"pydantic-acp", +] diff --git a/pyrightconfig.json b/pyrightconfig.json index e93018b..98a899b 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -4,6 +4,7 @@ "exclude": [ "vowel-optimization", "tmp", + "benchmark*", "**/.*" ], "reportMissingModuleSource": "none", @@ -11,6 +12,7 @@ "reportUnknownVariableType": "none", "reportUnknownMemberType": "none", "reportUnknownParameterType": "none", + "reportAttributeAccessIssue": "none", "reportAny": "none", "reportExplicitAny": "none", "reportMissingParameterType": "none", diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 4ecb1ad..0000000 --- a/pytest.ini +++ /dev/null @@ -1,5 +0,0 @@ -[pytest] -testpaths = tests -python_files = test_*.py -python_classes = Test* -python_functions = test_* diff --git a/quality-judge/evals.py b/quality-judge/evals.py new file mode 100644 index 0000000..c9dd454 --- /dev/null +++ b/quality-judge/evals.py @@ -0,0 +1,33 @@ +import os +import pathlib + +import dotenv + +from vowel.codemode import CodeModeGenerator +from vowel.runner import Function + +dotenv.load_dotenv() + +SPEC_MODEL = os.getenv("SPEC_MODEL") +EXPLORATION_MODEL = os.getenv("EXPLORATION_MODEL") + +generator = CodeModeGenerator( + spec_model=SPEC_MODEL, + exploration_model=EXPLORATION_MODEL, + generation_id="largest_color_value_judge_spec_quality", +) + + +async def generate_spec(fn: Function): + # check that the code compiles (it will be executed in monty anyway) + _ = fn.impl + result = await generator.generate(fn, save_to_file=True) + print(result) + generator.print_total_cost() + return result.yaml_spec + + +async def generate_spec_mock(fn: Function): + return pathlib.Path(__file__).with_name( + "largestPathValue_evals.yml" + ).read_text() diff --git a/quality-judge/largestPathValue_evals.yml b/quality-judge/largestPathValue_evals.yml new file mode 100644 index 0000000..b21778e --- /dev/null +++ b/quality-judge/largestPathValue_evals.yml @@ -0,0 +1,762 @@ +# 
yaml-language-server: $schema=/Users/mert/.vowel/vowel-schema_035.json + +largestPathValue: + evals: + ReturnType: + type: int + ResultRange: + assertion: output >= -1 + dataset: + - case: + id: example_abaca + inputs: + colors: abaca + edges: + - - 0 + - 1 + - - 0 + - 2 + - - 2 + - 3 + - - 3 + - 4 + expected: 3 + duration: 10.0 + - case: + id: example_cycle_self_loop + inputs: + colors: a + edges: + - - 0 + - 0 + expected: -1 + duration: 10.0 + - case: + id: single_node_no_edges_a + inputs: + colors: a + edges: [] + expected: 1 + duration: 10.0 + - case: + id: single_node_no_edges_z + inputs: + colors: z + edges: [] + expected: 1 + duration: 10.0 + - case: + id: two_nodes_same_color + inputs: + colors: aa + edges: + - - 0 + - 1 + expected: 2 + duration: 10.0 + - case: + id: two_nodes_diff_color + inputs: + colors: ab + edges: + - - 0 + - 1 + expected: 1 + duration: 10.0 + - case: + id: linear_all_same_color + inputs: + colors: aaaa + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + expected: 4 + duration: 10.0 + - case: + id: linear_alternating_colors + inputs: + colors: abab + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + expected: 2 + duration: 10.0 + - case: + id: linear_five_same_color + inputs: + colors: aaaaa + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + expected: 5 + duration: 10.0 + - case: + id: linear_abcba + inputs: + colors: abcba + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + expected: 2 + duration: 10.0 + - case: + id: linear_a_then_bbbbb + inputs: + colors: abbbbb + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + - - 4 + - 5 + expected: 5 + duration: 10.0 + - case: + id: linear_aba + inputs: + colors: aba + edges: + - - 0 + - 1 + - - 1 + - 2 + expected: 2 + duration: 10.0 + - case: + id: dag_fork_aab + inputs: + colors: aab + edges: + - - 0 + - 1 + - - 0 + - 2 + - - 1 + - 2 + expected: 2 + duration: 10.0 + - case: + id: diamond_abba + inputs: + colors: abba + edges: + - - 0 + - 1 + - - 
0 + - 2 + - - 1 + - 3 + - - 2 + - 3 + expected: 2 + duration: 10.0 + - case: + id: diamond_all_same + inputs: + colors: aaaa + edges: + - - 0 + - 1 + - - 0 + - 2 + - - 1 + - 3 + - - 2 + - 3 + expected: 3 + duration: 10.0 + - case: + id: diamond_all_distinct + inputs: + colors: hecb + edges: + - - 0 + - 1 + - - 0 + - 2 + - - 1 + - 3 + - - 2 + - 3 + expected: 1 + duration: 10.0 + - case: + id: no_edges_abc + inputs: + colors: abc + edges: [] + expected: 1 + duration: 10.0 + - case: + id: no_edges_abcde + inputs: + colors: abcde + edges: [] + expected: 1 + duration: 10.0 + - case: + id: two_components_aabba + inputs: + colors: aabba + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 3 + - 4 + expected: 2 + duration: 10.0 + - case: + id: branching_aabba + inputs: + colors: aabba + edges: + - - 0 + - 2 + - - 1 + - 2 + - - 2 + - 3 + - - 2 + - 4 + expected: 2 + duration: 10.0 + - case: + id: branching_single_color_five + inputs: + colors: aaaaa + edges: + - - 0 + - 1 + - - 0 + - 2 + - - 1 + - 3 + - - 2 + - 4 + expected: 3 + duration: 10.0 + - case: + id: fan_in_sink_aaab + inputs: + colors: aaab + edges: + - - 0 + - 3 + - - 1 + - 3 + - - 2 + - 3 + expected: 1 + duration: 10.0 + - case: + id: fan_in_sink_aaaab + inputs: + colors: aaaab + edges: + - - 0 + - 4 + - - 1 + - 4 + - - 2 + - 4 + - - 3 + - 4 + expected: 1 + duration: 10.0 + - case: + id: cycle_three_nodes + inputs: + colors: abc + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 0 + expected: -1 + duration: 10.0 + - case: + id: cycle_with_extra_nodes + inputs: + colors: abcd + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 1 + - - 1 + - 3 + expected: -1 + duration: 10.0 + - case: + id: two_node_cycle + inputs: + colors: ab + edges: + - - 0 + - 1 + - - 1 + - 0 + expected: -1 + duration: 10.0 + - case: + id: back_edge_cycle + inputs: + colors: abcde + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + - - 4 + - 1 + expected: -1 + duration: 10.0 + - case: + id: self_loop_non_first_node + inputs: + colors: abc + 
edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 2 + expected: -1 + duration: 10.0 + - case: + id: empty_string_no_edges + inputs: + colors: '' + edges: [] + expected: 0 + duration: 10.0 + - case: + id: multi_edge_same_pair + inputs: + colors: ab + edges: + - - 0 + - 1 + - - 0 + - 1 + expected: 1 + duration: 10.0 + - case: + id: all_26_colors_chain + inputs: + colors: abcdefghijklmnopqrstuvwxyz + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + - - 4 + - 5 + - - 5 + - 6 + - - 6 + - 7 + - - 7 + - 8 + - - 8 + - 9 + - - 9 + - 10 + - - 10 + - 11 + - - 11 + - 12 + - - 12 + - 13 + - - 13 + - 14 + - - 14 + - 15 + - - 15 + - 16 + - - 16 + - 17 + - - 17 + - 18 + - - 18 + - 19 + - - 19 + - 20 + - - 20 + - 21 + - - 21 + - 22 + - - 22 + - 23 + - - 23 + - 24 + - - 24 + - 25 + expected: 1 + duration: 10.0 + - case: + id: alternating_ab_chain_10 + inputs: + colors: ababababab + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + - - 4 + - 5 + - - 5 + - 6 + - - 6 + - 7 + - - 7 + - 8 + - - 8 + - 9 + expected: 5 + duration: 10.0 + - case: + id: all_same_color_linear_equals_n + inputs: + colors: aaaaaa + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + - - 4 + - 5 + expected: 6 + duration: 10.0 + - case: + id: list_of_chars_input + inputs: + colors: + - a + - b + - c + edges: + - - 0 + - 1 + - - 1 + - 2 + expected: 1 + duration: 10.0 + - case: + id: cycle_returns_minus_one + inputs: + colors: abcde + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + - - 4 + - 2 + assertion: output == -1 + duration: 10.0 + - case: + id: dag_result_at_least_one + inputs: + colors: abcdef + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 3 + - 4 + - - 4 + - 5 + assertion: output >= 1 + duration: 10.0 + - case: + id: single_path_bounded_by_length + inputs: + colors: abcabc + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + - - 4 + - 5 + assertion: output >= 1 and output <= 6 + duration: 10.0 + - case: + id: w_shaped_dag + inputs: + 
colors: aabaa + edges: + - - 0 + - 2 + - - 1 + - 2 + - - 2 + - 3 + - - 2 + - 4 + assertion: output >= 1 and output <= 5 + duration: 10.0 + - case: + id: two_sources_one_sink_same_color + inputs: + colors: aaa + edges: + - - 0 + - 2 + - - 1 + - 2 + assertion: output == 2 + duration: 10.0 + - case: + id: long_path_single_color_at_ends + inputs: + colors: abcda + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + assertion: output == 2 + duration: 10.0 + - case: + id: star_topology_center_unique + inputs: + colors: baaaa + edges: + - - 0 + - 1 + - - 0 + - 2 + - - 0 + - 3 + - - 0 + - 4 + assertion: output == 1 + duration: 10.0 + - case: + id: chain_mostly_b_with_one_a + inputs: + colors: bbbba + edges: + - - 0 + - 1 + - - 1 + - 2 + - - 2 + - 3 + - - 3 + - 4 + expected: 4 + duration: 10.0 + - case: + id: error_none_colors + inputs: + colors: null + edges: + - - 0 + - 1 + raises: TypeError + match: has no len + - case: + id: error_none_edges + inputs: + colors: abc + edges: null + raises: TypeError + match: not iterable + - case: + id: error_int_colors + inputs: + colors: 123 + edges: + - - 0 + - 1 + raises: TypeError + match: has no len + - case: + id: error_out_of_range_edge + inputs: + colors: ab + edges: + - - 0 + - 5 + raises: IndexError + match: list index out of range + - case: + id: error_empty_colors_with_edges + inputs: + colors: '' + edges: + - - 0 + - 1 + raises: IndexError + match: list index out of range + - case: + id: error_none_in_edge_list + inputs: + colors: abc + edges: + - null + - - 0 + - 1 + raises: TypeError + match: cannot unpack non-iterable NoneType + - case: + id: error_int_in_edge_list + inputs: + colors: abc + edges: + - 1 + - - 0 + - 1 + raises: TypeError + match: cannot unpack non-iterable int + - case: + id: error_uppercase_color + inputs: + colors: A + edges: [] + raises: IndexError + match: list assignment index out of range + - case: + id: error_string_in_edge_list + inputs: + colors: abc + edges: + - ab + - - 0 + - 1 + 
raises: TypeError + match: list indices must be integers or slices + - case: + id: error_typeerror_0 + inputs: + - null + - - - 0 + - 1 + raises: TypeError + - case: + id: error_typeerror_1 + inputs: + - abc + - null + raises: TypeError + - case: + id: error_typeerror_2 + inputs: + - 123 + - - - 0 + - 1 + raises: TypeError + - case: + id: error_indexerror_3 + inputs: + - ab + - - - 0 + - 5 + raises: IndexError + - case: + id: error_indexerror_4 + inputs: + - '' + - - - 0 + - 1 + raises: IndexError + - case: + id: error_typeerror_5 + inputs: + - abc + - - null + - - 0 + - 1 + raises: TypeError + - case: + id: error_typeerror_6 + inputs: + - abc + - - 1 + - - 0 + - 1 + raises: TypeError + - case: + id: error_valueerror_7 + inputs: + - ab + - - - 0 + raises: ValueError + - case: + id: error_indexerror_8 + inputs: + - A + - [] + raises: IndexError + - case: + id: error_typeerror_9 + inputs: + - abc + - - ab + - - 0 + - 1 + raises: TypeError \ No newline at end of file diff --git a/quality-judge/largest_color_value_judge.yml b/quality-judge/largest_color_value_judge.yml new file mode 100644 index 0000000..9e48340 --- /dev/null +++ b/quality-judge/largest_color_value_judge.yml @@ -0,0 +1,330 @@ +# yaml-language-server: $schema=/Users/mert/.vowel/vowel-schema_035.json + +# costs: https://gist.github.com/fswair/7631878d75d6ed18a4fe3cb9b579600f#file-terminal-txt-L1309 + +evals.generate_spec: + evals: + EvalSpecCoversFunction: + rubric: | + You are grading the quality of a generated evaluation spec against the given function source. + + Your task: assess how well the generated spec covers the function's real behavior, edge cases, + error paths, and output contracts. Do not reward verbosity; reward correctness and meaningful coverage. + + Calibration rule (critical): + - If trusted run results indicate the generated spec achieved 100% pass rate, do NOT confidently claim + that specific passing cases are "wrong" just from static suspicion. 
+ - In that situation, treat your uncertainty as potential judge hallucination and shift focus to quality: + uniqueness, diversity, coverage depth, evaluator precision, and contract alignment. + - You may still flag a case as problematic only when there is strong internal evidence (clear contradiction, + invalid expectation syntax, impossible assertion, or direct mismatch with the given function contract). + + Score using this weighted rubric (0-100 total): + + 1) Functional Coverage (0-35) + - Core happy paths are tested. + - Boundary/edge conditions are tested (empty/min/max/null-ish/degenerate where relevant). + - Important behavior branches in source are represented by cases. + - Missing major branch or core behavior: deduct heavily. + + 2) Error and Guard Coverage (0-20) + - All meaningful raise/guard paths are represented. + - Exception type expectations are accurate. + - Error-message match checks are used when meaningful. + - Missing critical error path: major deduction. + + 3) Case Quality and Dataset Design (0-20) + - Cases are concrete, non-redundant, and behavior-focused. + - Inputs are realistic and varied (not trivial permutations only). + - Dataset is unique, diverse, and comprehensive (not AI slop or cosmetic rewrites of same scenario). + - Expected values/assertions are specific and verifiable. + - No vague, tautological, or self-fulfilling checks. + + 4) Evaluator Quality (0-15) + - Evaluators are appropriate for the function contract (expected/raises/type/assertion/pattern/etc). + - Assertions are precise and meaningful (not always-true). + - Type and semantic checks are correctly used where needed. + - Evaluators reflect what was actually observed from execution evidence when such evidence is provided. + + 5) Spec Correctness and Maintainability (0-10) + - YAML is structurally valid and unambiguous. + - Case naming/readability is good. + - Spec is concise but complete. 
+ Hard-fail conditions (cap score at 40 max): + - Core function behavior is not tested. + - Critical error/guard behavior is absent. + - Evaluators are mostly weak/tautological/misaligned. + - Spec appears invalid or internally inconsistent. + - Do NOT trigger hard-fail based only on speculative "this case should fail" claims that conflict with + trusted 100% pass execution evidence. + + Return format (mandatory): + 1) Criterion scores with reasons: + - Functional Coverage: /35 + Reason: + - Error and Guard Coverage: /20 + Reason: + - Case Quality and Dataset Design: /20 + Reason: + - Evaluator Quality: /15 + Reason: + - Spec Correctness and Maintainability: /10 + Reason: + + 2) Final numeric score: /100 + - Must equal the sum of criterion scores. + + 3) 3-6 key findings (strengths/weaknesses), each tied to specific evidence. + + 4) Top 3 actionable improvements, prioritized by impact. + + Important: + - Do NOT return only a final score. + - Every criterion MUST include both score and explicit reason. + - If pass-rate evidence is 100%, prioritize quality diagnostics over speculative invalid-case accusations. + - When you suspect a case is wrong despite pass evidence, mark it as "low-confidence concern" unless + you can cite direct, concrete contradiction from the source/contract. + include: + - input + config: + model: $JUDGE_MODEL + max_tokens: 4096 + dataset: + - case: + input: + name: largestPathValue + description: | + Largest color value in a directed graph + + There is a directed graph of n colored nodes and m edges. The nodes are numbered from 0 to n - 1. + + You are given a string colors where colors[i] is a lowercase English letter representing the color of the ith node in this graph (0-indexed). You are also given a 2D array edges where edges[j] = [aj, bj] indicates that there is a directed edge from node aj to node bj. + + A valid path in the graph is a sequence of nodes x1 -> x2 -> x3 -> ... 
-> xk such that there is a directed edge from xi to xi+1 for every 1 <= i < k. The color value of the path is the number of nodes that are colored the most frequently occurring color along that path. + + Return the largest color value of any valid path in the given graph, or -1 if the graph contains a cycle. + + + Input: colors = "abaca", edges = [[0,1],[0,2],[2,3],[3,4]] + Output: 3 + Explanation: The path 0 -> 2 -> 3 -> 4 contains 3 nodes that are colored "a" (red in the above image). + + Input: colors = "a", edges = [[0,0]] + Output: -1 + Explanation: There is a cycle from 0 to 0. + + + Constraints: + n == colors.length + m == edges.length + 1 <= n <= 10^5 + 0 <= m <= 10^5 + colors consists of lowercase English letters. + 0 <= aj, bj < n + code: | + from collections import deque + + + def largestPathValue(colors: str, edges: list[list[int]]) -> int: + n = len(colors) + graph = [[] for _ in range(n)] + indegree = [0] * n + + for u, v in edges: + graph[u].append(v) + indegree[v] += 1 + + dp = [[0] * 26 for _ in range(n)] + queue = deque() + + for i in range(n): + if indegree[i] == 0: + queue.append(i) + dp[i][ord(colors[i]) - ord("a")] = 1 + + visited = 0 + answer = 0 + + while queue: + node = queue.popleft() + visited += 1 + answer = max(answer, max(dp[node])) + + for nei in graph[node]: + for c in range(26): + dp[nei][c] = max( + dp[nei][c], + dp[node][c] + (1 if c == ord(colors[nei]) - ord("a") else 0), + ) + + indegree[nei] -= 1 + if indegree[nei] == 0: + queue.append(nei) + + return answer if visited == n else -1 + + + +evals.generate_spec_mock: + evals: + EvalSpecCoversFunction: + rubric: | + You are grading the quality of a generated evaluation spec against the given function source. + + Your task: assess how well the generated spec covers the function's real behavior, edge cases, + error paths, and output contracts. Do not reward verbosity; reward correctness and meaningful coverage. 
+ + Calibration rule (critical): + - If trusted run results indicate the generated spec achieved 100% pass rate, do NOT confidently claim + that specific passing cases are "wrong" just from static suspicion. + - In that situation, treat your uncertainty as potential judge hallucination and shift focus to quality: + uniqueness, diversity, coverage depth, evaluator precision, and contract alignment. + - You may still flag a case as problematic only when there is strong internal evidence (clear contradiction, + invalid expectation syntax, impossible assertion, or direct mismatch with the given function contract). + + Score using this weighted rubric (0-100 total): + + 1) Functional Coverage (0-35) + - Core happy paths are tested. + - Boundary/edge conditions are tested (empty/min/max/null-ish/degenerate where relevant). + - Important behavior branches in source are represented by cases. + - Missing major branch or core behavior: deduct heavily. + + 2) Error and Guard Coverage (0-20) + - All meaningful raise/guard paths are represented. + - Exception type expectations are accurate. + - Error-message match checks are used when meaningful. + - Missing critical error path: major deduction. + + 3) Case Quality and Dataset Design (0-20) + - Cases are concrete, non-redundant, and behavior-focused. + - Inputs are realistic and varied (not trivial permutations only). + - Dataset is unique, diverse, and comprehensive (not AI slop or cosmetic rewrites of same scenario). + - Expected values/assertions are specific and verifiable. + - No vague, tautological, or self-fulfilling checks. + + 4) Evaluator Quality (0-15) + - Evaluators are appropriate for the function contract (expected/raises/type/assertion/pattern/etc). + - Assertions are precise and meaningful (not always-true). + - Type and semantic checks are correctly used where needed. + - Evaluators reflect what was actually observed from execution evidence when such evidence is provided. 
+ + 5) Spec Correctness and Maintainability (0-10) + - YAML is structurally valid and unambiguous. + - Case naming/readability is good. + - Spec is concise but complete. + + Hard-fail conditions (cap score at 40 max): + - Core function behavior is not tested. + - Critical error/guard behavior is absent. + - Evaluators are mostly weak/tautological/misaligned. + - Spec appears invalid or internally inconsistent. + - Do NOT trigger hard-fail based only on speculative "this case should fail" claims that conflict with + trusted 100% pass execution evidence. + + Return format (mandatory): + 1) Criterion scores with reasons: + - Functional Coverage: /35 + Reason: + - Error and Guard Coverage: /20 + Reason: + - Case Quality and Dataset Design: /20 + Reason: + - Evaluator Quality: /15 + Reason: + - Spec Correctness and Maintainability: /10 + Reason: + + 2) Final numeric score: /100 + - Must equal the sum of criterion scores. + + 3) 3-6 key findings (strengths/weaknesses), each tied to specific evidence. + + 4) Top 3 actionable improvements, prioritized by impact. + + Important: + - Do NOT return only a final score. + - Every criterion MUST include both score and explicit reason. + - If pass-rate evidence is 100%, prioritize quality diagnostics over speculative invalid-case accusations. + - When you suspect a case is wrong despite pass evidence, mark it as "low-confidence concern" unless + you can cite direct, concrete contradiction from the source/contract. + include: + - input + config: + model: $JUDGE_MODEL + max_tokens: 4096 + dataset: + - case: + input: + name: largestPathValue + description: | + Largest color value in a directed graph + + There is a directed graph of n colored nodes and m edges. The nodes are numbered from 0 to n - 1. + + You are given a string colors where colors[i] is a lowercase English letter representing the color of the ith node in this graph (0-indexed). 
You are also given a 2D array edges where edges[j] = [aj, bj] indicates that there is a directed edge from node aj to node bj. + + A valid path in the graph is a sequence of nodes x1 -> x2 -> x3 -> ... -> xk such that there is a directed edge from xi to xi+1 for every 1 <= i < k. The color value of the path is the number of nodes that are colored the most frequently occurring color along that path. + + Return the largest color value of any valid path in the given graph, or -1 if the graph contains a cycle. + + + Input: colors = "abaca", edges = [[0,1],[0,2],[2,3],[3,4]] + Output: 3 + Explanation: The path 0 -> 2 -> 3 -> 4 contains 3 nodes that are colored "a" (red in the above image). + + Input: colors = "a", edges = [[0,0]] + Output: -1 + Explanation: There is a cycle from 0 to 0. + + + Constraints: + n == colors.length + m == edges.length + 1 <= n <= 10^5 + 0 <= m <= 10^5 + colors consists of lowercase English letters. + 0 <= aj, bj < n + code: | + from collections import deque + + + def largestPathValue(colors: str, edges: list[list[int]]) -> int: + n = len(colors) + graph = [[] for _ in range(n)] + indegree = [0] * n + + for u, v in edges: + graph[u].append(v) + indegree[v] += 1 + + dp = [[0] * 26 for _ in range(n)] + queue = deque() + + for i in range(n): + if indegree[i] == 0: + queue.append(i) + dp[i][ord(colors[i]) - ord("a")] = 1 + + visited = 0 + answer = 0 + + while queue: + node = queue.popleft() + visited += 1 + answer = max(answer, max(dp[node])) + + for nei in graph[node]: + for c in range(26): + dp[nei][c] = max( + dp[nei][c], + dp[node][c] + (1 if c == ord(colors[nei]) - ord("a") else 0), + ) + + indegree[nei] -= 1 + if indegree[nei] == 0: + queue.append(nei) + + return answer if visited == n else -1 + diff --git a/quality-judge/runner.py b/quality-judge/runner.py new file mode 100644 index 0000000..587b79c --- /dev/null +++ b/quality-judge/runner.py @@ -0,0 +1,22 @@ +from vowel.monitoring import enable_monitoring +from vowel.runner import Function, 
RunEvals + +enable_monitoring( + logfire_enabled=True, + service_name="quality-judge", +) + +runner = RunEvals.from_file("largest_color_value_judge.yml") + +main_runner = runner.with_serializer({"evals.generate_spec": Function}).filter( + "evals.generate_spec" +) + +# mock_runner = runner.with_serializer({"evals.generate_spec_mock": Function}).filter( +# "evals.generate_spec_mock" +# ) + + +summary = main_runner.run() + +summary.print() diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py index 8c065a7..d60bdbf 100644 --- a/src/vowel/__init__.py +++ b/src/vowel/__init__.py @@ -1,62 +1,50 @@ -""" -vowel - A modular evaluation framework for testing functions with YAML-based specifications. - -This package provides a comprehensive evaluation framework for testing Python functions -using YAML-based specifications. It supports various evaluation types including: - -- Type checking (isinstance validation) -- Custom assertions (Python expressions) -- Performance constraints (duration limits) -- Input containment checks -- Regex pattern matching -- Exception validation -- LLM-based semantic evaluation - -Quick Start: - # Run evaluations from a YAML file - from vowel import run_evals - summary = run_evals("evals.yml") - - # Generate evals for a function using LLM - from vowel import EvalGenerator, Function - gen = EvalGenerator(model="openai:gpt-4o") - func = Function(name="add", code="def add(a, b): return a + b", description="Add two numbers") - summary = gen.generate_and_run(func, auto_retry=True) - -For more information, see the documentation at: -https://github.com/fswair/vowel -""" +"""Public package exports for the vowel evaluation framework.""" import importlib.metadata +from contextlib import suppress __version__ = importlib.metadata.version("vowel") from .ai import EvalGenerator, GenerationResult, UnsupportedParameterTypeError +from .codemode import CodeModeGenerator, CodeModeResult, ExplorationPlan, SnippetResult from .context import EVAL_SPEC_CONTEXT +from 
.costs import CostManager from .errors import FixturePathError, SignatureError from .eval_types import EvalsFile +from .executor import ( + DefaultExecutor, + DefaultSession, + ExecutionResult, + ExecutionSession, + Executor, + MontyExecutor, + MontyReplSession, + get_executor, + resolve_executors, +) from .runner import Function, RunEvals +from .schema import ensure_cached_schema from .utils import ( EvalResult, EvalSummary, check_compatibility, get_unsupported_params, is_yaml_serializable_type, - load_evals, - load_evals_file, - load_evals_from_dict, - load_evals_from_object, - load_evals_from_yaml_string, + load_bundle, + load_bundle_file, + load_bundle_from_dict, + load_bundle_from_object, + load_bundle_from_yaml_string, run_evals, to_dataset, ) __all__ = [ - "load_evals_file", - "load_evals_from_yaml_string", - "load_evals_from_dict", - "load_evals_from_object", - "load_evals", + "load_bundle_file", + "load_bundle_from_yaml_string", + "load_bundle_from_dict", + "load_bundle_from_object", + "load_bundle", "to_dataset", "run_evals", "RunEvals", @@ -73,4 +61,24 @@ "check_compatibility", "get_unsupported_params", "is_yaml_serializable_type", + # CodeMode executor + "Executor", + "ExecutionResult", + "ExecutionSession", + "MontyExecutor", + "MontyReplSession", + "DefaultExecutor", + "DefaultSession", + "get_executor", + "resolve_executors", + # CodeMode pipeline + "CodeModeGenerator", + "CodeModeResult", + "ExplorationPlan", + "SnippetResult", + "CostManager", ] + + +with suppress(Exception): + ensure_cached_schema(__version__) diff --git a/src/vowel/ai.py b/src/vowel/ai.py index 6abcc0e..50937ac 100644 --- a/src/vowel/ai.py +++ b/src/vowel/ai.py @@ -1,34 +1,4 @@ -"""LLM-powered evaluation specification generator and function healer. 
- -This module provides: -- EvalGenerator: Generate eval specs and heal buggy functions using LLMs -- generate_eval_spec: Generate YAML eval specs from function definitions -- prepare_agent: Create a pydantic_ai Agent for eval generation - -Key Features: -- Auto-generate YAML eval specs from function code and description -- Heal buggy functions based on failing test inputs -- Retry logic with configurable coverage thresholds -- Support for async and sync function generation - -Example: - from vowel import EvalGenerator, Function - - generator = EvalGenerator(model="openai:gpt-4o") - - func = Function( - name="factorial", - description="Calculate factorial of n", - code="def factorial(n): return 1 if n <= 1 else n * factorial(n - 1)" - ) - - summary = generator.generate_and_run( - func, - auto_retry=True, - heal_function=True, - min_coverage=0.9 - ) -""" +"""LLM-backed eval generation and function healing utilities.""" import os import time @@ -43,6 +13,7 @@ from vowel.eval_types import EvalsSource from vowel.monitoring import enable_monitoring from vowel.runner import Function, RunEvals +from vowel.schema import materialize_yaml_with_schema_header from vowel.utils import EvalSummary, check_compatibility, import_function from vowel.validation import validate_and_fix_spec @@ -563,8 +534,9 @@ def generate_eval_spec( ) if save_to_file: + spec_to_write = materialize_yaml_with_schema_header(spec_to_use) with open(f"{func.name}_evals.yml", "w") as f: - f.write(spec_to_use) + f.write(spec_to_write) runner = RunEvals.from_source(spec_to_use) if func.func: diff --git a/src/vowel/cli.py b/src/vowel/cli.py index 114f08b..6b2ed4c 100644 --- a/src/vowel/cli.py +++ b/src/vowel/cli.py @@ -1,18 +1,13 @@ -"""Command-line interface for the vowel evaluation framework. 
- -Usage: - vowel Run evaluations from a YAML spec - vowel -d Run all YAML files in a directory - vowel -v Detailed summary with spec semantics - vowel --hide-report Hide pydantic_evals report output -""" +"""Command-line entry points for running and managing vowel eval specs.""" +import json import sys import time from pathlib import Path import click import dotenv +import yaml from rich import box from rich.console import Console from rich.panel import Panel @@ -27,10 +22,244 @@ LLMJudgeCase, PatternMatchCase, ) +from .schema import build_yaml_schema_from_bundle, materialize_yaml_with_schema_header from .utils import EvalsBundle, EvalSummary, load_bundle, run_evals dotenv.load_dotenv() console = Console() +COSTS_FILE = Path.home() / ".vowel" / "codemode" / "generation_costs.json" + + +def _load_cost_store() -> dict: + if not COSTS_FILE.exists(): + return {"schema_version": 1, "generations": {}} + try: + data = json.loads(COSTS_FILE.read_text(encoding="utf-8")) + except Exception: + return {"schema_version": 1, "generations": {}} + if not isinstance(data, dict): + return {"schema_version": 1, "generations": {}} + generations = data.get("generations") + if not isinstance(generations, dict): + data["generations"] = {} + return data + + +def _flatten_runs(store: dict) -> list[tuple[str, dict, dict]]: + rows: list[tuple[str, dict, dict]] = [] + for gid, gen in store.get("generations", {}).items(): + runs = gen.get("runs", {}) if isinstance(gen, dict) else {} + if not isinstance(runs, dict): + continue + for run_id, run in runs.items(): + rows.append((gid, gen, {"run_id": run_id, **run})) + return rows + + +def _print_generation_table(store: dict) -> list[str]: + generations = store.get("generations", {}) + ordered = sorted( + generations.items(), + key=lambda x: str(x[1].get("created_at", "")), + reverse=True, + ) + table = Table(title="Generations", box=box.ROUNDED) + table.add_column("#", style="cyan", no_wrap=True) + table.add_column("Generation ID", style="white") 
+ table.add_column("Created", style="dim") + table.add_column("Runs", justify="right") + table.add_column("USD", justify="right", style="green") + + ids: list[str] = [] + for idx, (gid, gen) in enumerate(ordered, start=1): + totals = gen.get("totals", {}) + run_count = len(gen.get("runs", {})) if isinstance(gen.get("runs", {}), dict) else 0 + table.add_row( + str(idx), + gid, + str(gen.get("created_at", "-")), + str(run_count), + f"{float(totals.get('usd', 0.0) or 0.0):.6f}", + ) + ids.append(gid) + + console.print(table) + return ids + + +def _print_generation_detail(generation_id: str, generation: dict) -> None: + totals = generation.get("totals", {}) + info = Table.grid(padding=(0, 2)) + info.add_column(style="bold") + info.add_column() + info.add_row("Generation", generation_id) + info.add_row("Created", str(generation.get("created_at", "-"))) + info.add_row("Spec model", str(generation.get("spec_model", "-"))) + info.add_row("Exploration model", str(generation.get("exploration_model", "-"))) + info.add_row("USD", f"{float(totals.get('usd', 0.0) or 0.0):.6f}") + info.add_row("Input tokens", str(int(totals.get("input_tokens", 0) or 0))) + info.add_row("Output tokens", str(int(totals.get("output_tokens", 0) or 0))) + info.add_row("Requests", str(int(totals.get("requests", 0) or 0))) + console.print(Panel(info, title="Generation Summary", border_style="bright_cyan")) + + run_table = Table(title="Runs", box=box.ROUNDED) + run_table.add_column("Run ID", style="white") + run_table.add_column("Function", style="cyan") + run_table.add_column("Status") + run_table.add_column("USD", justify="right", style="green") + run_table.add_column("Input", justify="right") + run_table.add_column("Output", justify="right") + run_table.add_column("Requests", justify="right") + run_table.add_column("Created", style="dim") + + runs = generation.get("runs", {}) if isinstance(generation.get("runs", {}), dict) else {} + for run_id, run in runs.items(): + rt = run.get("totals", {}) + 
run_table.add_row( + run_id, + str(run.get("func_name", "-")), + str(run.get("status", "-")), + f"{float(rt.get('usd', 0.0) or 0.0):.6f}", + str(int(rt.get("input_tokens", 0) or 0)), + str(int(rt.get("output_tokens", 0) or 0)), + str(int(rt.get("requests", 0) or 0)), + str(run.get("created_at", "-")), + ) + console.print(run_table) + + +def _print_run_detail(generation_id: str, run: dict) -> None: + totals = run.get("totals", {}) + info = Table.grid(padding=(0, 2)) + info.add_column(style="bold") + info.add_column() + info.add_row("Generation", generation_id) + info.add_row("Run", str(run.get("run_id", "-"))) + info.add_row("Function", str(run.get("func_name", "-"))) + info.add_row("Status", str(run.get("status", "-"))) + info.add_row("USD", f"{float(totals.get('usd', 0.0) or 0.0):.6f}") + info.add_row("Input tokens", str(int(totals.get("input_tokens", 0) or 0))) + info.add_row("Output tokens", str(int(totals.get("output_tokens", 0) or 0))) + info.add_row("Requests", str(int(totals.get("requests", 0) or 0))) + console.print(Panel(info, title="Run Summary", border_style="bright_cyan")) + + step_table = Table(title="Steps", box=box.ROUNDED) + step_table.add_column("Step", style="white") + step_table.add_column("Calls", justify="right") + step_table.add_column("USD", justify="right", style="green") + step_table.add_column("Input", justify="right") + step_table.add_column("Output", justify="right") + step_table.add_column("Requests", justify="right") + + steps = run.get("steps", {}) if isinstance(run.get("steps", {}), dict) else {} + for step_name, step_data in steps.items(): + usages = step_data.get("usages", []) if isinstance(step_data, dict) else [] + usd = 0.0 + input_tokens = 0 + output_tokens = 0 + requests = 0 + for u in usages: + usage = u.get("usage", {}) if isinstance(u, dict) else {} + usd += float(u.get("usd", 0.0) or 0.0) + input_tokens += int(usage.get("input_tokens", 0) or 0) + output_tokens += int(usage.get("output_tokens", 0) or 0) + requests += 
int(usage.get("requests", 0) or 0) + + step_table.add_row( + step_name, + str(len(usages)), + f"{usd:.6f}", + str(input_tokens), + str(output_tokens), + str(requests), + ) + + console.print(step_table) + + +def _handle_costs_command( + *, + list_costs: bool, + by_generation: bool, + by_run: bool, + generation_id: str | None, + run_id: str | None, +) -> None: + store = _load_cost_store() + generations = store.get("generations", {}) + if not generations: + console.print("[yellow]No cost records found yet.[/yellow]") + return + + if generation_id: + generation = generations.get(generation_id) + if not isinstance(generation, dict): + click.secho(f"ERROR: Generation not found: {generation_id}", fg="red", err=True) + raise SystemExit(1) + _print_generation_detail(generation_id, generation) + return + + if run_id: + for gid, gen in generations.items(): + runs = gen.get("runs", {}) if isinstance(gen, dict) else {} + if isinstance(runs, dict) and run_id in runs: + run = {"run_id": run_id, **runs[run_id]} + _print_run_detail(gid, run) + return + click.secho(f"ERROR: Run not found: {run_id}", fg="red", err=True) + raise SystemExit(1) + + if not list_costs: + _print_generation_table(store) + return + + if by_generation: + ids = _print_generation_table(store) + if not ids: + return + choice = click.prompt("Select generation number", type=int) + if choice < 1 or choice > len(ids): + click.secho("ERROR: Invalid selection", fg="red", err=True) + raise SystemExit(1) + selected = ids[choice - 1] + _print_generation_detail(selected, generations[selected]) + return + + if by_run: + rows = _flatten_runs(store) + if not rows: + console.print("[yellow]No runs found.[/yellow]") + return + + table = Table(title="Runs", box=box.ROUNDED) + table.add_column("#", style="cyan", no_wrap=True) + table.add_column("Run ID", style="white") + table.add_column("Generation", style="dim") + table.add_column("Function", style="cyan") + table.add_column("Status") + table.add_column("USD", justify="right", 
style="green") + + for idx, (gid, _, run) in enumerate(rows, start=1): + totals = run.get("totals", {}) + table.add_row( + str(idx), + str(run.get("run_id", "-")), + gid, + str(run.get("func_name", "-")), + str(run.get("status", "-")), + f"{float(totals.get('usd', 0.0) or 0.0):.6f}", + ) + console.print(table) + + choice = click.prompt("Select run number", type=int) + if choice < 1 or choice > len(rows): + click.secho("ERROR: Invalid selection", fg="red", err=True) + raise SystemExit(1) + gid, _, run = rows[choice - 1] + _print_run_detail(gid, run) + return + + _print_generation_table(store) def _eval_type_label(case) -> str: @@ -249,7 +478,9 @@ def validate_coverage(ctx, param, value): @click.command() -@click.argument("yaml_file", type=click.Path(exists=True, path_type=Path), required=False) +@click.argument("arg1", type=click.Path(path_type=Path), required=False) +@click.argument("arg2", type=click.Path(path_type=Path), required=False) +@click.argument("arg3", type=click.Path(path_type=Path), required=False) @click.option("--ci", is_flag=True, help="Enable CI mode") @click.option( "--coverage", @@ -278,8 +509,31 @@ def validate_coverage(ctx, param, value): @click.option("--watch", "-w", is_flag=True, help="Watch mode: re-run on file changes") @click.option("--verbose", "-v", is_flag=True, help="Show detailed evaluation summary") @click.option("--hide-report", is_flag=True, help="Hide pydantic_evals report output") +@click.option( + "--create", + "schema_create", + is_flag=True, + help="With 'vowel schema': generate vowel-schema.json in current directory", +) +@click.option("--list", "list_costs", is_flag=True, help="With 'vowel costs': list records") +@click.option( + "-g", + "--by-generation", + is_flag=True, + help="With 'vowel costs --list': browse generations interactively", +) +@click.option( + "-r", + "--by-run", + is_flag=True, + help="With 'vowel costs --list': browse runs interactively", +) +@click.option("--generation", "generation_id", help="With 
'vowel costs': show generation id") +@click.option("--run", "run_id_option", help="With 'vowel costs': show run id") def main( - yaml_file: Path | None, + arg1: Path | None, + arg2: Path | None, + arg3: Path | None, debug: bool, coverage: float, filter_func: str | None, @@ -295,10 +549,91 @@ def main( watch: bool, verbose: bool, hide_report: bool, + schema_create: bool, + list_costs: bool, + by_generation: bool, + by_run: bool, + generation_id: str | None, + run_id_option: str | None, ): """vowel — YAML-based evaluation framework for Python functions.""" console = Console(force_terminal=False, no_color=True) if no_color else Console() + # Command mode: vowel schema + if arg1 is not None and str(arg1) == "schema": + # Command mode: vowel schema --create [output_path] + if schema_create: + output_path = arg2 if arg2 is not None else Path.cwd() / "vowel-schema.json" + output_path.parent.mkdir(parents=True, exist_ok=True) + schema = build_yaml_schema_from_bundle() + output_path.write_text( + json.dumps(schema, indent=2, ensure_ascii=False) + "\n", encoding="utf-8" + ) + console.print(f"[green]✓[/green] Generated schema: [cyan]{output_path}[/cyan]") + return + + if arg2 is None: + click.secho("ERROR: vowel schema requires or --create", fg="red", err=True) + raise click.Abort() + + target_path = arg2 + if not target_path.exists(): + click.secho(f"ERROR: File not found: {target_path}", fg="red", err=True) + raise SystemExit(1) + + if target_path.suffix.lower() == ".json": + click.secho( + "ERROR: JSON files are not supported by 'vowel schema '. " + "Use a YAML file (.yml/.yaml).", + fg="red", + err=True, + ) + raise SystemExit(1) + + existing = target_path.read_text(encoding="utf-8") + + # Do not inject schema header into invalid YAML files. 
+ try: + yaml.safe_load(existing) + except Exception as e: + click.secho( + f"ERROR: File is not valid YAML, schema header not added: {e}", + fg="red", + err=True, + ) + raise SystemExit(1) from None + + # Do not inject schema header if content is not a valid vowel spec. + try: + load_bundle(existing) + except Exception as e: + click.secho( + f"ERROR: Pydantic validation failed, schema header not added: {e}", + fg="red", + err=True, + ) + raise SystemExit(1) from None + + updated = materialize_yaml_with_schema_header(existing) + target_path.write_text(updated, encoding="utf-8") + console.print(f"[green]✓[/green] Updated schema header: [cyan]{target_path}[/cyan]") + + console.print("[green]✓[/green] Pydantic validation passed") + return + + # Command mode: vowel costs [--list -g|-r] [--generation ] [--run ] + if arg1 is not None and str(arg1) == "costs": + _handle_costs_command( + list_costs=list_costs, + by_generation=by_generation, + by_run=by_run, + generation_id=generation_id, + run_id=run_id_option, + ) + return + + yaml_file = arg1 + # Validate incompatible options if directory and filter_func: click.secho("ERROR: --filter cannot be used with --dir", fg="red", err=True) @@ -486,6 +821,9 @@ def main( if not yaml_file: click.secho("ERROR: --watch requires a YAML file", fg="red", err=True) raise click.Abort() + if not yaml_file.exists(): + click.secho(f"ERROR: YAML file not found: {yaml_file}", fg="red", err=True) + raise click.Abort() try: from watchdog.events import FileSystemEventHandler @@ -565,6 +903,9 @@ def on_modified(self, event): if not quiet: console.print(f"Found [cyan]{len(yaml_files)}[/cyan] YAML file(s)") elif yaml_file: + if not yaml_file.exists(): + click.secho(f"ERROR: YAML file not found: {yaml_file}", fg="red", err=True) + raise click.Abort() yaml_files = [yaml_file] else: click.secho("ERROR: Either YAML_FILE or --dir is required", fg="red", err=True) @@ -738,9 +1079,7 @@ def on_modified(self, event): # Export JSON if export_json: - import 
json - - json_data = summary.json() + json_data = summary.to_json() with open(export_json, "w") as f: json.dump(json_data, f, indent=2) if not quiet: @@ -749,6 +1088,10 @@ def on_modified(self, event): # Failed assertions detail if summary.failed_results: console.print() + + all_failures_are_duration = True + has_any_failures = False + for result in summary.failed_results: console.print(Panel(result.eval_id, title="Failed Assertions", border_style="yellow")) @@ -758,6 +1101,7 @@ def on_modified(self, event): ] if failed_assertions: + has_any_failures = True total_assertions = len(case.assertions) failed_count = len(failed_assertions) @@ -767,6 +1111,12 @@ def on_modified(self, event): ) for assertion_name, res in failed_assertions: + if ( + "duration" not in assertion_name.lower() + and "maxduration" not in assertion_name.lower() + ): + all_failures_are_duration = False + console.print(f"\n [red]x {assertion_name}[/red]") if res.reason: reason_lines = str(res.reason).split("\n") @@ -774,6 +1124,20 @@ def on_modified(self, event): if line.strip(): console.print(f" [dim]{line.strip()}[/dim]") + # Inform user if all errors are just duration errors + if has_any_failures and all_failures_are_duration: + console.print() + console.print( + Panel( + "All failing evaluators are related to duration (MaxDuration). " + "You can run the command with `--ignore-duration` to skip performance constraints " + "and get a more accurate evaluation of functional correctness.", + title="Insight", + border_style="cyan", + style="cyan", + ) + ) + console.print() # CI mode diff --git a/src/vowel/codemode.py b/src/vowel/codemode.py new file mode 100644 index 0000000..33269c5 --- /dev/null +++ b/src/vowel/codemode.py @@ -0,0 +1,1005 @@ +"""CodeMode pipeline for execution-aware eval spec generation. + +CodeMode uses real execution feedback to generate robust vowel eval specs: +1. Explore behavior by running LLM-generated snippets against the target code. +2. 
# ---------------------------------------------------------------------------
# Exploration output model — what the LLM returns in Phase 1
# ---------------------------------------------------------------------------


class ExplorationSnippet(BaseModel):
    """A single exploration snippet that tests normal (non-error) behaviour.

    The field descriptions are part of the schema handed to the exploration
    model, so they must agree with the explorer system prompt.
    """

    description: str = Field(
        description="One-line description of what this snippet tests "
        "(e.g. 'empty list edge case', 'negative numbers').",
    )
    # The calling convention matches ErrorSnippet.code and the explorer
    # system prompt: the function is called by its real name.
    code: str = Field(
        description="Python code to execute. "
        "Use the function's real name — the source is prepended at runtime. "
        "The value of the last expression is captured as output.",
    )


class ErrorSnippet(BaseModel):
    """A snippet that should trigger an exception from the function."""

    description: str = Field(
        description="What error scenario this tests "
        "(e.g. 'None input', 'division by zero', 'wrong type').",
    )
    code: str = Field(
        description="Python code that should trigger an exception. "
        "Use the function's real name — the source is prepended at runtime.",
    )


class ExplorationPlan(BaseModel):
    """LLM output for Phase 1: normal snippets + error-triggering snippets."""

    # Schema-level lower bounds; the explorer prompt may ask for more.
    snippets: list[ExplorationSnippet] = Field(
        description="Snippets that test NORMAL (succeeding) behaviour: "
        "happy-path, boundary values, return type exploration, "
        "equivalence partitioning, invariants, composition.",
        min_length=10,
    )
    error_snippets: list[ErrorSnippet] = Field(
        description="Snippets that should TRIGGER EXCEPTIONS: wrong types, "
        "invalid values, None inputs, out-of-range arguments. "
        "Every guard clause and raise statement in the function "
        "must be exercised by at least one error snippet.",
        min_length=3,
    )


# ---------------------------------------------------------------------------
# Exploration result — what we feed back to Phase 2
# ---------------------------------------------------------------------------


class SnippetResult(BaseModel):
    """Result of executing a single exploration snippet."""

    description: str
    code: str
    success: bool
    output: Any = None
    stdout: str = ""
    error: str | None = None
    error_type: str | None = None
    duration_ms: float = 0.0

    # `output` can hold any value returned by the executed snippet.
    model_config = {"arbitrary_types_allowed": True}

    @classmethod
    def from_execution(
        cls,
        snippet: ExplorationSnippet | ErrorSnippet,
        result: ExecutionResult,
    ) -> SnippetResult:
        """Combine a snippet with its execution result.

        Args:
            snippet: The snippet that was run; supplies description and code.
            result: The executor's result; supplies outcome, output, captured
                stdout, error details and duration.
        """
        return cls(
            description=snippet.description,
            code=snippet.code,
            success=result.success,
            output=result.output,
            stdout=result.stdout,
            error=result.error,
            error_type=result.error_type,
            duration_ms=result.duration_ms,
        )

    def to_context_block(self) -> str:
        """Format as a context block for the spec-generation prompt.

        A successful snippet renders its ``repr`` output and duration; a
        failing one renders the raised error type and message.
        """
        header = f"# {self.description}\n>>> {self.code.strip()}\n"
        if self.success:
            return f"{header}Output: {self.output!r} ({self.duration_ms:.2f} ms)"
        return f"{header}RAISED {self.error_type}: {self.error}"


# ---------------------------------------------------------------------------
# Pipeline result
# ---------------------------------------------------------------------------


class CodeModeResult(BaseModel):
    """Full result of the CodeMode generation pipeline."""

    exploration_results: list[SnippetResult] = Field(
        description="Results from Phase 1 exploration.",
    )
    yaml_spec: str = Field(description="Final YAML eval specification.")
    summary: EvalSummary | None = Field(
        default=None,
        description="Eval run summary (if run_evals=True).",
    )
    refinement_rounds: int = Field(
        default=0,
        description="Number of refinement iterations needed (0 = first-pass success).",
    )

    model_config = {"arbitrary_types_allowed": True}
+ """ + + def __init__( + self, + spec_model: str | None = None, + exploration_model: str | None = None, + generation_id: str | None = None, + default_executor: Executor | None = None, + fallback_executor: Executor | None = None, + additional_context: str = "", + min_snippets: int = 15, + use_model_spec: bool = False, + **opts, + ) -> None: + # Default fallback from kwargs (for backwards compatibility) or environment + base_fallback = opts.pop("model", None) or os.getenv("MODEL_NAME", "") + + self.spec_model = spec_model or os.getenv("SPEC_MODEL") or base_fallback + self.exploration_model = ( + exploration_model or os.getenv("EXPLORATION_MODEL") or base_fallback + ) + + if not self.spec_model or not self.exploration_model: + raise ValueError( + "Both spec_model and exploration_model must be specified. " + "Provide them via constructor/kwargs, or set SPEC_MODEL, EXPLORATION_MODEL, or MODEL_NAME environment variables." + ) + + self.executor = resolve_executors(default_executor, fallback_executor) + self.additional_context = additional_context + self.min_snippets = min_snippets + self.use_model_spec = use_model_spec + self.cost_manager = CostManager( + spec_model=self.spec_model, + exploration_model=self.exploration_model, + generation_id=generation_id, + ) + self.generation_id = self.cost_manager.generation_id + self._active_run_id: str | None = None + self._opts = opts + + # Lazy agents + self._explorer_agent: Agent[None, ExplorationPlan] | None = None + self._spec_agent: Agent[None, EvalsSource | EvalsBundle] | None = None + + logfire.info( + "CodeModeGenerator initialized", + spec_model=self.spec_model, + exploration_model=self.exploration_model, + generation_id=self.generation_id, + executor=type(self.executor).__name__, + ) + + def print_total_cost(self, run_id: str | None = None) -> None: + self.cost_manager.print_total_cost(run_id=run_id) + + # -- Agent properties -------------------------------------------------- + + @property + def explorer_agent(self) -> 
Agent[None, ExplorationPlan]: + if self._explorer_agent is None: + self._explorer_agent = Agent( + self.exploration_model, + output_type=ExplorationPlan, + system_prompt=self._explorer_system_prompt(), + **self._opts, + ) + return self._explorer_agent + + @property + def spec_agent(self) -> Agent[None, EvalsSource | EvalsBundle]: + if self._spec_agent is None: + output_type = EvalsBundle if self.use_model_spec else EvalsSource + self._spec_agent = Agent( + self.spec_model, + output_type=output_type, + system_prompt=self._spec_system_prompt(), + **self._opts, + ) + return self._spec_agent + + # -- System prompts ---------------------------------------------------- + + def _explorer_system_prompt(self) -> str: + return f"""You are a Python testing expert. Your job is to write small +code snippets that explore a function's behaviour empirically. + +You will receive: +- The function's source code (with its real name) +- The function's description + +You produce TWO separate lists of snippets: + +## `snippets` — Normal / succeeding behaviour +These snippets call the function with VALID inputs and capture the return +value. They MUST cover: +1. Normal / happy-path behaviour (typical valid inputs) +2. Boundary values (empty collections, zero, negative, very large, min/max) +3. Return type and structure exploration +4. Equivalence partitioning (representative from each input class) +5. Invariant verification (e.g. idempotency, commutativity, sort stability) +6. Composition / interaction (combining parameters, dependent arguments) + +Produce AT LEAST {self.min_snippets} normal snippets. + +## `error_snippets` — Exception-triggering inputs +These snippets call the function with inputs that SHOULD RAISE exceptions. +They MUST cover: +1. Wrong types (None, int instead of list, str instead of int, etc.) +2. Invalid values (out-of-range, malformed strings, empty when not allowed) +3. 
Every `raise` statement and guard clause in the function source code + +Produce AT LEAST 3 error snippets. If the function has more raise +statements or guard clauses, produce MORE — one per distinct error path. + +STRICT RULES: +- Each snippet MUST end with an expression whose value will be captured. +- Use the function's REAL NAME — the function source code will be prepended + automatically at runtime. Do NOT define the function yourself. +- Keep each snippet focused on ONE scenario. +- Do NOT produce duplicate snippets. Two snippets are duplicates if they test + the same input shape and same behavior class. +- For `error_snippets`, each snippet must map to a DISTINCT error mode + (different guard/branch, exception type, or message pattern). +- If the function signature has no positional-only (`/`) or keyword-only (`*`) + limiters, prefer positional call style for multi-argument calls and avoid + equivalent keyword-style duplicates. +- Do NOT guess outputs — the snippets will be executed and the real + outputs collected automatically. +- NEVER use try/except in your snippets. Let exceptions propagate + naturally — the execution environment captures raised errors + automatically. For example, write `flatten(None)` NOT + `try: flatten(None) except Exception as e: type(e)`. +- `snippets` must contain ONLY inputs expected to SUCCEED. +- `error_snippets` must contain ONLY inputs expected to RAISE exceptions. + Do NOT mix them.""" + + def _spec_system_prompt(self) -> str: + ctx = "" + if self.additional_context: + ctx = f"\n\n\n{self.additional_context}\n" + return f"""You are an expert vowel YAML SPEC generator. + +{EVAL_SPEC_CONTEXT}{ctx} + +CRITICAL: You have access to VERIFIED execution results below. Use the +EXACT outputs shown — do NOT guess or calculate expected values yourself. 
+The execution results are ground-truth from running the real function.""" + + # -- Phase 1: Exploration ---------------------------------------------- + + async def explore( + self, + func: Function, + *, + exploration_rounds: int = 2, + run_id: str | None = None, + ) -> list[SnippetResult]: + """Generate and execute exploration snippets. + + Round 1 discovers baseline behavior. Subsequent rounds receive prior + execution evidence and target uncovered behavior classes. + """ + with logfire.span( + "codemode.explore", + func_name=func.name, + executor=type(self.executor).__name__, + exploration_rounds=exploration_rounds, + ): + all_results: list[SnippetResult] = [] + + for round_num in range(1, exploration_rounds + 1): + with logfire.span( + "codemode.explore_round", + round=round_num, + prior_results=len(all_results), + ): + # Get exploration plan (round 2+ includes prior context) + if round_num == 1: + plan = await self._get_exploration_plan(func, run_id=run_id) + else: + cluster_summary = self._build_cluster_summary(all_results) + plan = await self._get_targeted_exploration_plan( + func, + all_results, + cluster_summary, + run_id=run_id, + round_num=round_num, + ) + # Early exit: if no new snippets were produced + if not plan.snippets and not plan.error_snippets: + logfire.info( + "Round {round} produced no new snippets, stopping", + round=round_num, + ) + break + + # Execute snippets + new_results = self._execute_plan(func, plan, round_num) + all_results.extend(new_results) + + # Early exit: round 2+ found no new behaviour + if round_num > 1: + new_behaviors = self._count_new_behaviors( + all_results[: -len(new_results)], + new_results, + ) + logfire.info( + "Round {round}: {new} new behaviour classes discovered", + round=round_num, + new=new_behaviors, + ) + + # Summary log + successes = sum(1 for r in all_results if r.success) + failures = len(all_results) - successes + logfire.info( + "Exploration complete: {successes} succeeded, {failures} raised errors", + 
successes=successes, + failures=failures, + ) + + return all_results + + def _execute_plan( + self, + func: Function, + plan: ExplorationPlan, + round_num: int = 1, + ) -> list[SnippetResult]: + """Execute all snippets in an exploration plan and collect results.""" + all_snippets = [ + *((s, "normal") for s in plan.snippets), + *((s, "error") for s in plan.error_snippets), + ] + total = len(all_snippets) + results: list[SnippetResult] = [] + with self.executor.create_session(func.code) as session: + for i, (snippet, kind) in enumerate(all_snippets): + with logfire.span( + "codemode.execute_snippet", + index=i, + kind=kind, + round=round_num, + description=snippet.description, + ): + logfire.info( + "Executing snippet {index}/{total} R{round} [{kind}]: {description}", + index=i + 1, + total=total, + round=round_num, + kind=kind, + description=snippet.description, + code=snippet.code, + ) + + exec_result = session.feed(snippet.code) + + sr = SnippetResult.from_execution(snippet, exec_result) + results.append(sr) + + logfire.info( + "Snippet result: success={success}, output={output}, " + "duration={duration_ms:.2f}ms", + success=sr.success, + output=repr(sr.output)[:200], + duration_ms=sr.duration_ms, + error=sr.error, + error_type=sr.error_type, + ) + return results + + @staticmethod + def _build_cluster_summary(results: list[SnippetResult]) -> str: + """Summarize observed output/error clusters for targeted exploration.""" + # -- Success clusters -- + success_types: dict[str, int] = {} + for r in results: + if r.success: + t = type(r.output).__name__ + success_types[t] = success_types.get(t, 0) + 1 + + # -- Error clusters -- + error_clusters: dict[str, list[str]] = {} + for r in results: + if not r.success and r.error_type: + msgs = error_clusters.setdefault(r.error_type, []) + prefix = (r.error or "")[:60] + if prefix not in msgs: + msgs.append(prefix) + + # -- Already-tried snippets (to avoid repeats) -- + tried_codes = [r.code.strip() for r in results] + + lines 
= ["## Observed Behaviour Clusters\n"] + + lines.append("### Success clusters") + if success_types: + for t, count in sorted(success_types.items()): + lines.append(f"- output type `{t}`: {count} cases") + else: + lines.append("- (none)") + + lines.append("\n### Error clusters") + if error_clusters: + for etype, msgs in sorted(error_clusters.items()): + lines.append(f"- `{etype}` ({len(msgs)} distinct messages):") + for m in msgs: + lines.append(f' - "{m}"') + else: + lines.append("- (none)") + + lines.append(f"\n### Already tried ({len(tried_codes)} snippets — do NOT repeat these)") + for code in tried_codes: + lines.append(f"- `{code}`") + + return "\n".join(lines) + + @staticmethod + def _count_new_behaviors( + prior: list[SnippetResult], + new: list[SnippetResult], + ) -> int: + """Count new behavior signatures introduced by a round.""" + + def _behavior_key(r: SnippetResult) -> str: + if r.success: + return f"ok:{type(r.output).__name__}" + return f"err:{r.error_type}:{(r.error or '')[:40]}" + + prior_keys = {_behavior_key(r) for r in prior} + new_keys = {_behavior_key(r) for r in new} + return len(new_keys - prior_keys) + + async def _get_exploration_plan( + self, + func: Function, + *, + run_id: str | None = None, + ) -> ExplorationPlan: + """Request initial exploration snippets from the model.""" + with logfire.span("codemode.llm_explore", func_name=func.name, round=1): + prompt = f"""Explore the following function by writing test snippets: + +{func.name} + +{func.code} + +{func.description} + +Write diverse snippets that call {func.name}(...) to discover the function's +behaviour across all important scenarios. 
Use the real function name +`{func.name}` — the implementation will be prepended automatically.""" + + result = await self.explorer_agent.run(prompt) + if run_id: + self.cost_manager.record_agent_usage( + run_id=run_id, + step_key="exploration_round_1", + result=result, + model_name=self.exploration_model, + ) + plan = result.output + + logfire.info( + "Round 1: LLM produced {normal} normal + {error} error snippets", + normal=len(plan.snippets), + error=len(plan.error_snippets), + snippets=[s.description for s in plan.snippets], + error_snippets=[s.description for s in plan.error_snippets], + ) + return plan + + async def _get_targeted_exploration_plan( + self, + func: Function, + prior_results: list[SnippetResult], + cluster_summary: str, + *, + run_id: str | None = None, + round_num: int = 2, + ) -> ExplorationPlan: + """Request targeted snippets using prior execution evidence.""" + with logfire.span("codemode.llm_explore", func_name=func.name, round=2): + prompt = f"""You previously explored `{func.name}` and the snippets were +executed. Below are the ACTUAL results and a cluster summary. + +Your job now is to find **new behaviour classes** that were NOT covered +in Round 1. Focus on: +- Syntax / input combinations not yet tried +- Edge cases at boundaries between observed clusters +- Error paths whose exact error type or message differs from expectation +- Interactions between parameters / sub-expressions + +{func.name} + +{func.code} + +{func.description} + + +{chr(10).join(r.to_context_block() for r in prior_results)} + + + +{cluster_summary} + + +RULES: +- Do NOT repeat any snippet from the "Already tried" list. +- Produce 8–12 NEW normal snippets targeting uncovered behaviour. +- Produce 3–5 NEW error snippets targeting untried error paths. +- Prefer diversity over volume: no semantically duplicate cases. +- Each new error snippet should cover a unique failure mode. 
+- If signature has no `/` or `*` limiters, use positional calling style for + multi-argument calls and avoid keyword/positional duplicates of same scenario. +- Same strict rules as before: no try/except, real function name, + one scenario per snippet, last expression captured.""" + + result = await self.explorer_agent.run(prompt) + if run_id: + self.cost_manager.record_agent_usage( + run_id=run_id, + step_key=f"targeted_exploration_round_{round_num}", + result=result, + model_name=self.exploration_model, + ) + plan = result.output + + logfire.info( + "Round 2: LLM produced {normal} normal + {error} error snippets", + normal=len(plan.snippets), + error=len(plan.error_snippets), + snippets=[s.description for s in plan.snippets], + error_snippets=[s.description for s in plan.error_snippets], + ) + return plan + + # -- Phase 2: Spec Generation ------------------------------------------ + + async def generate_spec( + self, + func: Function, + exploration_results: list[SnippetResult], + failure_context: str | None = None, + *, + run_id: str | None = None, + attempt: int = 1, + ) -> str | EvalsBundle: + """Generate a spec from verified exploration results. + + Returns YAML text in default mode, or ``EvalsBundle`` when + ``use_model_spec=True``. 
+ """ + with logfire.span( + "codemode.generate_spec", + func_name=func.name, + n_results=len(exploration_results), + is_refinement=failure_context is not None, + ): + # Build exploration context for the prompt + success_results = [r for r in exploration_results if r.success] + error_results = [r for r in exploration_results if not r.success] + + success_context = ( + "\n\n".join(r.to_context_block() for r in success_results) + if success_results + else "(none)" + ) + error_context = ( + "\n\n".join(r.to_context_block() for r in error_results) + if error_results + else "(none)" + ) + + refinement_block = "" + if failure_context: + refinement_block = f""" + +⚠️ PREVIOUS ATTEMPT FAILED — fix these issues: +{failure_context} + +Regenerate the YAML spec addressing every failure above. Keep all +passing cases intact — only fix the broken ones.""" + + prompt = f"""Generate vowel evals YAML spec for `{func.name}`: + + +{func.code} + + +{func.description} + + +The following results are from ACTUALLY RUNNING the function — use these +exact outputs as expected values: + +{success_context} + + + +These inputs RAISED exceptions when run against the real function. +Each one MUST become a `raises:` case in the spec — no exceptions. + +{error_context} + + +REQUIREMENTS: +- The top-level YAML key MUST be `{func.name}` (the function name). +- Generate at least {max(len(exploration_results), 5)} diverse test cases. +- Use the EXACT outputs from the execution results above. +- Error coverage rule: include AT LEAST one raises case for each UNIQUE + observed error mode (exception type + meaningfully distinct call pattern). +- Do NOT duplicate semantically equivalent error cases. If two cases represent + the same failing input semantics, keep only one (prefer the one with `match` + when message is deterministic from observed execution). +- Cover normal, edge, and error cases. +- In assertions, use `input` (NOT `inputs`) for accessing input values. 
+- Prefer `expected` over `assertion` whenever the exact output is known from + verified execution results. +- Use `assertion` only for true invariants/properties that are not redundant + with exact `expected` values. +- Do NOT use broad/trivial assertions (e.g. `output >= 0`, `output <= len(input)`) + when a precise expected value can be asserted. +- Keep the dataset compact and non-redundant: no duplicate cases with the same + effective behavior. +- If function signature has no positional-only (`/`) or keyword-only (`*`) + limiters, prefer positional style for multi-argument calls and do not include + both positional and keyword variants of the same scenario. +- Stay aligned with function contract/type hints: do not add contract-irrelevant + cases that only test incidental duck-typing unless explicitly motivated. +- For `raises` cases, only claim exception type/message patterns that are present + in observed execution results; do not invent unsupported error expectations. + +YAML FORMAT — STRICT RULES (violations cause parse failure): +- NEVER use YAML tags: `!!python/tuple`, `!!python/object`, `!!binary`, + `!!omap`, `!!str`, `!!int`, `!!float`, or ANY `!!` tag whatsoever. + Plain YAML scalars and sequences only. `yaml.safe_load()` will be used + to parse the output — it rejects all `!!` tags and will hard-fail. +- Represent tuples as YAML sequences (lists). +- NEVER emit `!!python/...` or any non-standard YAML type annotation. 
+{refinement_block}""" + + logfire.info( + "Sending spec generation prompt", + func_name=func.name, + success_results=len(success_results), + error_results=len(error_results), + ) + + result = await self.spec_agent.run(prompt) + if run_id: + self.cost_manager.record_agent_usage( + run_id=run_id, + step_key=f"spec_generation_attempt_{attempt}", + result=result, + model_name=self.spec_model, + ) + + if self.use_model_spec: + bundle = result.output + assert isinstance(bundle, EvalsBundle) + logfire.info( + "Model spec bundle generated", + func_name=func.name, + eval_count=len(bundle.evals), + fixture_count=len(bundle.fixtures), + ) + return bundle + + yaml_spec = result.output.yaml_spec + + # Sanitize: strip ALL !! annotations — safe_load only accepts + # a tiny subset (str/int/float/bool/null/seq/map) and rejects + # anything else (!!python/tuple, !!binary, !!omap, etc.). + # Stripping them is safe: scalar values fall back to plain YAML types. + import re + + yaml_spec = re.sub(r"!![^\s\[\]{},]+", "", yaml_spec) + + # Validate YAML syntax + yaml.safe_load(yaml_spec) + + # Validate and auto-fix + validation = validate_and_fix_spec( + yaml_spec, + function_code=func.code, + ) + if validation.has_warnings: + logfire.info( + "Spec validation applied fixes", + summary=validation.summary(), + ) + final_spec = validation.fixed_yaml if validation.was_modified else yaml_spec + + # Executor-based validation: fix expected values against real execution + final_spec = validate_expected_values(final_spec, func, self.executor) + + # Inject missing error cases from exploration + error_snippet_dicts = [ + { + "code": r.code, + "error_type": r.error_type, + "error": r.error, + "description": r.description, + } + for r in exploration_results + if not r.success and r.error_type + ] + final_spec = inject_missing_error_cases(final_spec, func.name, error_snippet_dicts) + + logfire.info( + "YAML spec generated", + func_name=func.name, + spec_length=len(final_spec), + 
spec_preview=final_spec[:500], + ) + + return final_spec + + # -- Helpers ----------------------------------------------------------- + + @staticmethod + def _build_failure_context(summary: EvalSummary) -> str: + """Build retry context from failed assertions/errors.""" + return build_failure_context(summary) + + def _inject_durations( + self, + yaml_spec: str, + func: Function, + *, + buffer_pct: float = 0.5, + floor_ms: float = 10.0, + ) -> str: + """Inject measured duration thresholds into cases.""" + return inject_durations( + yaml_spec, + func, + self.executor, + buffer_pct=buffer_pct, + floor_ms=floor_ms, + ) + + @staticmethod + def _build_call_code(func_name: str, case: dict) -> str | None: + """Build a callable expression from a dataset case.""" + return build_call_code(func_name, case) + + # -- Full pipeline ----------------------------------------------------- + + async def generate( + self, + func: Function, + *, + run_id: str | None = None, + run_evals: bool = True, + save_to_file: bool = False, + max_refinement_rounds: int = 2, + min_coverage: float = 1.0, + inject_durations: bool = True, + ) -> CodeModeResult: + """Run full CodeMode generation, validation, and optional refinement. + + Flow: explore -> generate spec -> validate -> refine (optional) -> + inject durations (optional). Returns exploration artifacts, final spec, + and evaluation summary when ``run_evals`` is enabled. 
+ """ + with logfire.span( + "codemode.pipeline", + func_name=func.name, + generation_id=self.generation_id, + spec_model=self.spec_model, + exploration_model=self.exploration_model, + executor=type(self.executor).__name__, + ): + run_id = self.cost_manager.start_run(run_id=run_id, func_name=func.name) + self._active_run_id = run_id + + t0 = time.perf_counter() + + try: + # Phase 1 — explore (once) + exploration_results = await self.explore(func, run_id=run_id) + + # Phase 2–4 — generate spec + validate + refine + yaml_spec = "" + generated_bundle: EvalsBundle | None = None + summary: EvalSummary | None = None + refinement_rounds = 0 + failure_context: str | None = None + total_attempts = max_refinement_rounds + 1 if run_evals else 1 + + for attempt in range(total_attempts): + with logfire.span( + "codemode.spec_attempt", + attempt=attempt + 1, + is_refinement=attempt > 0, + ): + try: + bundle = await self.generate_spec( + func, + exploration_results, + failure_context, + run_id=run_id, + attempt=attempt + 1, + ) + + if isinstance(bundle, EvalsBundle): + generated_bundle = bundle + yaml_spec = bundle.to_yaml() + else: + generated_bundle = None + yaml_spec = bundle + except Exception as exc: + logfire.warn( + "Spec generation failed on attempt {attempt}, retrying", + attempt=attempt + 1, + error=str(exc), + ) + failure_context = f"Generation error: {exc}" + refinement_rounds = attempt + 1 + continue + + if not run_evals: + break + + # Validate: run evals with ignore_duration=True + try: + if generated_bundle is not None: + runner = ( + RunEvals.from_bundle(generated_bundle) + .with_functions({func.name: func.impl}) + .ignore_duration() + ) + else: + runner = ( + RunEvals.from_source(yaml_spec) + .with_functions({func.name: func.impl}) + .ignore_duration() + ) + summary = runner.run() + + logfire.info( + "Attempt {attempt}: {passed}/{total} passed, coverage={coverage:.1f}%", + attempt=attempt + 1, + passed=summary.success_count, + total=summary.total_count, + 
failed=summary.failed_count, + errors=summary.error_count, + coverage=summary.coverage * 100, + ) + + if summary.coverage >= min_coverage: + break + + # Build failure context for next attempt + failure_context = self._build_failure_context(summary) + refinement_rounds = attempt + 1 + logfire.warn( + "Coverage {coverage:.0f}% below target {target:.0f}%, refining", + coverage=summary.coverage * 100, + target=min_coverage * 100, + attempt=attempt + 1, + ) + + except Exception as exc: + logfire.warn( + "Failed to run evals on attempt {attempt}, retrying", + attempt=attempt + 1, + func_name=func.name, + error=str(exc), + ) + failure_context = f"Eval run error: {exc}" + refinement_rounds = attempt + 1 + continue + + # Phase 5 — inject per-case durations + if inject_durations: + with logfire.span("codemode.inject_durations", func_name=func.name): + yaml_spec = self._inject_durations(yaml_spec, func) + + # Final summary run (with durations now present, but still ignored) + if run_evals and summary is not None: + try: + if generated_bundle is not None: + final_runner = ( + RunEvals.from_bundle(generated_bundle) + .with_functions({func.name: func.impl}) + .ignore_duration() + ) + else: + final_runner = ( + RunEvals.from_source(yaml_spec) + .with_functions({func.name: func.impl}) + .ignore_duration() + ) + summary = final_runner.run() + except Exception: # noqa: BLE001 + pass # keep last good summary + + if save_to_file: + path = f"{func.name}_evals.yml" + spec_to_write = materialize_yaml_with_schema_header(yaml_spec) + with open(path, "w") as f: + f.write(spec_to_write) + logfire.info("Saved spec to {path}", path=path) + + elapsed = (time.perf_counter() - t0) * 1000 + self.cost_manager.mark_run_completed(run_id) + logfire.info( + "CodeMode pipeline complete in {elapsed:.0f}ms (refinements={refinement_rounds})", + elapsed=elapsed, + func_name=func.name, + generation_id=self.generation_id, + run_id=run_id, + exploration_count=len(exploration_results), + 
refinement_rounds=refinement_rounds, + has_summary=summary is not None, + ) + + return CodeModeResult( + exploration_results=exploration_results, + yaml_spec=yaml_spec, + summary=summary, + refinement_rounds=refinement_rounds, + ) + except Exception as exc: + self.cost_manager.mark_run_failed(run_id, str(exc)) + raise diff --git a/src/vowel/context.py b/src/vowel/context.py index 4e584b0..36fa283 100644 --- a/src/vowel/context.py +++ b/src/vowel/context.py @@ -1,13 +1,4 @@ -""" -Context definitions for vowel eval specification generation. - -This module contains the EVAL_SPEC_CONTEXT which provides comprehensive -documentation about vowel's YAML-based evaluation specification format. -This context is used by EvalGenerator to guide LLM-based eval generation. - -Set VOWEL_CONTEXT_VERSION=legacy to use the pre-optimization prompt. -Default is "v3" (GEPA-optimized with Sonnet proposer). -""" +"""Prompt context strings used for LLM-driven eval specification generation.""" import os diff --git a/src/vowel/costs.py b/src/vowel/costs.py new file mode 100644 index 0000000..72d90f7 --- /dev/null +++ b/src/vowel/costs.py @@ -0,0 +1,358 @@ +"""Cost tracking and persistence utilities for CodeMode runs.""" + +from __future__ import annotations + +import fcntl +import json +import os +import tempfile +import uuid +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +import logfire +import yaml + + +class CostManager: + """Manage generation/run cost records, pricing, and persistence.""" + + def __init__( + self, + *, + spec_model: str, + exploration_model: str, + generation_id: str | None = None, + costs_file: Path | None = None, + ) -> None: + self.spec_model = spec_model + self.exploration_model = exploration_model + self.generation_id = generation_id or self._new_generation_id() + self._costs_file = ( + costs_file or Path.home() / ".vowel" / "codemode" / "generation_costs.json" + ) + self._price_table = self._load_costs_yml() + 
self._cost_records: dict[str, Any] = self._load_cost_records() + self._ensure_generation_record() + + @staticmethod + def _new_generation_id() -> str: + ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + return f"gen_{ts}_{uuid.uuid4().hex[:8]}" + + @staticmethod + def _new_run_id() -> str: + return f"run_{uuid.uuid4().hex}" + + @staticmethod + def _default_cost_store() -> dict[str, Any]: + return {"schema_version": 1, "generations": {}} + + def _load_cost_records(self) -> dict[str, Any]: + if not self._costs_file.exists(): + return self._default_cost_store() + try: + data = json.loads(self._costs_file.read_text(encoding="utf-8")) + except Exception: + logfire.warn("Failed to parse cost records, resetting store") + return self._default_cost_store() + + if not isinstance(data, dict) or "generations" not in data: + return self._default_cost_store() + return data + + def _ensure_generation_record(self) -> None: + generations = self._cost_records.setdefault("generations", {}) + if self.generation_id in generations: + return + generations[self.generation_id] = { + "generation_id": self.generation_id, + "created_at": datetime.now(UTC).isoformat(), + "spec_model": self.spec_model, + "exploration_model": self.exploration_model, + "totals": {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0}, + "runs": {}, + } + + @staticmethod + def _normalize_models(data: Any) -> dict[str, dict[str, float]]: + if not isinstance(data, dict): + return {} + + models_obj = data.get("models") + normalized: dict[str, dict[str, float]] = {} + + if isinstance(models_obj, dict): + items = models_obj.items() + elif isinstance(models_obj, list): + items = [] + for item in models_obj: + if isinstance(item, dict): + items.extend(item.items()) + else: + items = [] + + for model_name, model_data in items: + if not isinstance(model_name, str) or not isinstance(model_data, dict): + continue + normalized[model_name] = { + "input_per_million": float(model_data.get("input_per_million", 0.0) 
or 0.0), + "output_per_million": float(model_data.get("output_per_million", 0.0) or 0.0), + "cached_input_per_million": float( + model_data.get("cached_input_per_million", 0.0) or 0.0 + ), + } + + return normalized + + def _load_costs_yml(self) -> dict[str, Any]: + candidates = [Path.cwd() / "costs.yml", Path(__file__).resolve().parents[2] / "costs.yml"] + for path in candidates: + if not path.exists(): + continue + try: + data = yaml.safe_load(path.read_text(encoding="utf-8")) + except Exception: + continue + models = self._normalize_models(data) + if models: + return {"models": models} + return {} + + def _persist_costs_atomic(self) -> None: + self._costs_file.parent.mkdir(parents=True, exist_ok=True) + payload = json.dumps(self._cost_records, ensure_ascii=False, indent=2) + "\n" + lock_path = self._costs_file.parent / ".generation_costs.lock" + + with open(lock_path, "a+", encoding="utf-8") as lock_file: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX) + try: + with tempfile.NamedTemporaryFile( + mode="w", encoding="utf-8", dir=self._costs_file.parent, delete=False + ) as tmp: + tmp.write(payload) + tmp.flush() + os.fsync(tmp.fileno()) + tmp_path = Path(tmp.name) + os.replace(tmp_path, self._costs_file) + finally: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) + + def _ensure_run_record(self, run_id: str, func_name: str) -> None: + generation = self._cost_records["generations"][self.generation_id] + runs = generation.setdefault("runs", {}) + if run_id in runs: + return + runs[run_id] = { + "run_id": run_id, + "func_name": func_name, + "created_at": datetime.now(UTC).isoformat(), + "status": "running", + "error": None, + "steps": {}, + "totals": {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0}, + } + + def _get_run_record(self, run_id: str) -> dict[str, Any]: + return self._cost_records["generations"][self.generation_id]["runs"][run_id] + + @staticmethod + def _run_usage_dict(usage: Any) -> dict[str, int]: + return { + "requests": 
int(getattr(usage, "requests", 0) or 0), + "input_tokens": int(getattr(usage, "input_tokens", 0) or 0), + "output_tokens": int(getattr(usage, "output_tokens", 0) or 0), + "cached_input_tokens": int(getattr(usage, "cached_input_tokens", 0) or 0), + } + + @staticmethod + def _normalize_model_name(model_name: str) -> str: + normalized = model_name.strip() + if ":" in normalized: + normalized = normalized.split(":", 1)[1] + if "/" in normalized: + normalized = normalized.rsplit("/", 1)[1] + return normalized + + def _resolve_price_from_costs_yml(self, model_name: str) -> dict[str, float] | None: + models = self._price_table.get("models") if isinstance(self._price_table, dict) else None + if not isinstance(models, dict): + return None + + normalized = self._normalize_model_name(model_name) + for key in (model_name, normalized): + data = models.get(key) + if not isinstance(data, dict): + continue + return { + "input_per_million": float(data.get("input_per_million", 0.0) or 0.0), + "output_per_million": float(data.get("output_per_million", 0.0) or 0.0), + "cached_input_per_million": float(data.get("cached_input_per_million", 0.0) or 0.0), + } + return None + + def _resolve_price(self, model_name: str) -> tuple[dict[str, float] | None, str, bool]: + normalized = self._normalize_model_name(model_name) + + try: + import genai_prices # type: ignore + + for attr in ("get_price", "lookup_price", "resolve_price"): + fn = getattr(genai_prices, attr, None) + if callable(fn): + for name in (model_name, normalized): + out = fn(name) + if isinstance(out, dict): + return ( + { + "input_per_million": float( + out.get("input_per_million") + or out.get("input") + or out.get("prompt") + or 0.0 + ), + "output_per_million": float( + out.get("output_per_million") + or out.get("output") + or out.get("completion") + or 0.0 + ), + "cached_input_per_million": float( + out.get("cached_input_per_million") + or out.get("cached_input") + or 0.0 + ), + }, + "genai-prices", + False, + ) + except 
Exception: + pass + + yml_price = self._resolve_price_from_costs_yml(model_name) + if yml_price is not None: + return yml_price, "costs.yml", False + + return None, "missing", True + + def _estimate_step_usd(self, model_name: str, usage: dict[str, int]) -> tuple[float, str, bool]: + price, source, missing = self._resolve_price(model_name) + if price is None: + return 0.0, source, True + + in_cost = usage["input_tokens"] / 1_000_000 * price["input_per_million"] + out_cost = usage["output_tokens"] / 1_000_000 * price["output_per_million"] + cached_cost = usage["cached_input_tokens"] / 1_000_000 * price["cached_input_per_million"] + return in_cost + out_cost + cached_cost, source, missing + + def _recompute_totals(self) -> None: + generation = self._cost_records["generations"][self.generation_id] + g_totals = {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0} + + for run in generation.get("runs", {}).values(): + r_totals = {"usd": 0.0, "input_tokens": 0, "output_tokens": 0, "requests": 0} + for step in run.get("steps", {}).values(): + usages = step.get("usages", []) + for item in usages: + usage = item.get("usage", {}) + r_totals["usd"] += float(item.get("usd", 0.0) or 0.0) + r_totals["input_tokens"] += int(usage.get("input_tokens", 0) or 0) + r_totals["output_tokens"] += int(usage.get("output_tokens", 0) or 0) + r_totals["requests"] += int(usage.get("requests", 0) or 0) + run["totals"] = r_totals + + g_totals["usd"] += r_totals["usd"] + g_totals["input_tokens"] += r_totals["input_tokens"] + g_totals["output_tokens"] += r_totals["output_tokens"] + g_totals["requests"] += r_totals["requests"] + + generation["totals"] = g_totals + + def start_run(self, *, run_id: str | None, func_name: str) -> str: + final_run_id = run_id or self._new_run_id() + self._ensure_generation_record() + self._ensure_run_record(final_run_id, func_name) + self._persist_costs_atomic() + return final_run_id + + def record_agent_usage( + self, *, run_id: str, step_key: str, result: 
Any, model_name: str + ) -> None: + run = self._get_run_record(run_id) + step = run.setdefault("steps", {}).setdefault(step_key, {"usages": []}) + + usage_obj = result.usage() if callable(getattr(result, "usage", None)) else None + usage = ( + self._run_usage_dict(usage_obj) if usage_obj is not None else self._run_usage_dict(None) + ) + usd, price_source, price_missing = self._estimate_step_usd(model_name, usage) + + step_item = { + "timestamp": datetime.now(UTC).isoformat(), + "agent_run_id": getattr(result, "run_id", None), + "model_name": model_name, + "usage": usage, + "usd": usd, + "price_source": price_source, + "price_missing": price_missing, + } + step["usages"].append(step_item) + + self._recompute_totals() + self._persist_costs_atomic() + + logfire.info( + "CodeMode step cost recorded", + generation_id=self.generation_id, + run_id=run_id, + step=step_key, + model_name=model_name, + usd=usd, + usage=usage, + price_source=price_source, + price_missing=price_missing, + ) + + def mark_run_completed(self, run_id: str) -> None: + run_rec = self._get_run_record(run_id) + run_rec["status"] = "completed" + run_rec["completed_at"] = datetime.now(UTC).isoformat() + self._recompute_totals() + self._persist_costs_atomic() + + def mark_run_failed(self, run_id: str, error: str) -> None: + run_rec = self._get_run_record(run_id) + run_rec["status"] = "failed" + run_rec["error"] = error + run_rec["completed_at"] = datetime.now(UTC).isoformat() + self._recompute_totals() + self._persist_costs_atomic() + + def print_total_cost(self, run_id: str | None = None) -> None: + generation = self._cost_records["generations"].get(self.generation_id, {}) + if run_id is not None: + run = generation.get("runs", {}).get(run_id) + if not run: + print(f"run not found: {run_id}") + return + totals = run.get("totals", {}) + print( + "run_cost", + run_id, + f"usd={totals.get('usd', 0.0):.6f}", + f"input_tokens={totals.get('input_tokens', 0)}", + f"output_tokens={totals.get('output_tokens', 
0)}", + f"requests={totals.get('requests', 0)}", + ) + return + + totals = generation.get("totals", {}) + print( + "generation_cost", + self.generation_id, + f"usd={totals.get('usd', 0.0):.6f}", + f"input_tokens={totals.get('input_tokens', 0)}", + f"output_tokens={totals.get('output_tokens', 0)}", + f"requests={totals.get('requests', 0)}", + ) diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py index a19c2e7..8ad7509 100644 --- a/src/vowel/eval_types.py +++ b/src/vowel/eval_types.py @@ -1,35 +1,13 @@ -"""Pydantic models for vowel evaluation specifications. - -This module defines the data models used to parse and validate -YAML evaluation specifications. These models ensure type safety -and provide clear schemas for evaluation definitions. - -Main evaluation types: - IsInstanceCase: Type checking validation - AssertionCase: Custom Python assertion evaluation - DurationCase: Performance/timing validation - ContainsInputCase: Input containment check - PatternMatchCase: Regex pattern matching - RaisesCase: Exception validation - LLMJudgeCase: LLM-based semantic evaluation - -Container models: - MatchCase: Individual test case with input/expected output - DatasetCase: Wrapper for test cases in dataset - Evals: Complete evaluation specification for a function - EvalsFile: Root model for YAML file parsing -""" +"""Pydantic models for parsing and validating vowel YAML specifications.""" -import logging import os +import typing from typing import Any, Literal +import logfire from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from pydantic.experimental.missing_sentinel import MISSING -logger = logging.getLogger(__name__) - - # ============================================================================= # LLM Output Models # ============================================================================= @@ -108,6 +86,63 @@ assertion: "len(output) == 2" """ +SAFE_ASSERTION_BUILTINS = { + "abs": abs, + "all": all, + "any": any, + 
"bool": bool, + "dict": dict, + "enumerate": enumerate, + "float": float, + "int": int, + "isinstance": isinstance, + "len": len, + "list": list, + "max": max, + "min": min, + "range": range, + "round": round, + "set": set, + "sorted": sorted, + "str": str, + "sum": sum, + "tuple": tuple, + "type": type, + "zip": zip, +} + +SAFE_TYPE_NAMES: dict[str, Any] = { + "Any": Any, + "None": None, + "bool": bool, + "bytes": bytes, + "dict": dict, + "float": float, + "frozenset": frozenset, + "int": int, + "list": list, + "object": object, + "set": set, + "str": str, + "tuple": tuple, + "typing": typing, +} +SAFE_TYPE_NAMES.update( + {name: getattr(typing, name) for name in dir(typing) if not name.startswith("_")} +) + + +def _eval_assertion_restricted(assertion: str, env: dict[str, Any]) -> bool: + namespace = {"__builtins__": SAFE_ASSERTION_BUILTINS} + namespace.update(env) + return bool(eval(assertion, namespace, namespace)) + + +def _eval_type_restricted(type_expr: str) -> Any: + namespace = {"__builtins__": {}} + namespace.update(SAFE_TYPE_NAMES) + return eval(type_expr, namespace, namespace) + class EvalsSource(BaseModel): """LLM output model for YAML eval specification.""" @@ -126,15 +161,33 @@ class EvalsSource(BaseModel): # Fixture Models # ============================================================================= -FixtureScope = Literal["function", "module", "session"] -"""Scope for fixture lifecycle. +FixtureScope = Literal["case", "eval", "file", "function", "module", "session"] +"""Supported fixture scope names. 
+ +Canonical user-facing names: +- case: per dataset case +- eval: per function eval block +- file: per YAML file / run invocation -- function: Setup/teardown for each test case (default) -- module: Setup once per eval file, teardown after all cases -- session: Setup once per run_evals call, teardown at end +Compatibility aliases: +- function -> case +- module -> eval +- session -> file + +Note: +Runtime lifecycle currently uses legacy internal values +(`function`/`module`/`session`). New names are normalized to these +internal values for behavior-preserving migration. """ +_FIXTURE_SCOPE_ALIASES: dict[str, str] = { + "case": "function", + "eval": "module", + "file": "session", +} + + class FixtureDefinition(BaseModel): """Definition of a single fixture with setup/teardown lifecycle.""" @@ -163,9 +216,22 @@ class FixtureDefinition(BaseModel): ) scope: FixtureScope = Field( default="function", - description="Lifecycle scope: 'function' (per case), 'module' (per eval), or 'session' (per run)", + description=( + "Fixture lifecycle scope. Preferred names: 'case', 'eval', 'file'. " + "Compatibility aliases are accepted: 'function', 'module', 'session'. " + "Current runtime normalization maps case->function, eval->module, file->session." 
+ ), ) + @field_validator("scope", mode="before") + @classmethod + def normalize_scope_aliases(cls, value: Any) -> Any: + """Normalize new scope names to legacy internal values.""" + if value is None or not isinstance(value, str): + return value + normalized = value.strip().lower() + return _FIXTURE_SCOPE_ALIASES.get(normalized, normalized) + @model_validator(mode="after") def validate_setup_or_cls(self): if not self.setup and not self.cls: @@ -183,6 +249,36 @@ class FixturesConfig(BaseModel): ) +class SerializerSpec(BaseModel): + """Serializer registry entry for YAML-native serializer configuration.""" + + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + serializer_schema: str | dict[str, str] | None = Field( + default=None, + alias="schema", + serialization_alias="schema", + description=( + "Schema converter path(s). Use a single import path string for direct mode, " + "or a mapping of parameter name to import path for nested mode." + ), + ) + serializer: str | None = Field( + default=None, + description="Import path to custom serializer function (serial_fn mode).", + ) + + @model_validator(mode="after") + def validate_one_of(self): + has_schema = self.serializer_schema is not None + has_serializer = self.serializer is not None + if has_schema and has_serializer: + raise ValueError("Serializer spec cannot define both 'schema' and 'serializer'") + if not has_schema and not has_serializer: + raise ValueError("Serializer spec must define one of: 'schema' or 'serializer'") + return self + + # ============================================================================= # Evaluation Case Models # ============================================================================= @@ -205,7 +301,11 @@ class IsInstanceCase(BaseModel): ) def evaluate(self, output: Any) -> bool: - return isinstance(output, eval(self.type)) + try: + expected = _eval_type_restricted(self.type) + except Exception: + expected = eval(self.type) + return isinstance(output, 
expected) class AssertionCase(BaseModel): @@ -260,7 +360,10 @@ class AssertionCase(BaseModel): def evaluate(self, input: Any, output: Any) -> bool: env = {"input": input, "output": output} - return eval(self.assertion, env, env) + try: + return _eval_assertion_restricted(self.assertion, env) + except Exception: + return bool(eval(self.assertion, env, env)) class DurationCase(BaseModel): @@ -664,6 +767,14 @@ class Evals(BaseModel): examples=[["db"], ["db", "cache"], ["redis"]], ) + serializer: str | None = Field( + default=None, + description=( + "Optional serializer registry key from top-level 'serializers'. " + "When set, this eval uses that serializer definition." + ), + ) + evals: dict[ str, IsInstanceCase @@ -734,20 +845,32 @@ class EvalsFile(BaseModel): default_factory=dict, description="Global fixture definitions available to all evals in this file", ) + serializers: dict[str, SerializerSpec] = Field( + default_factory=dict, + description="Global serializer definitions available to evals in this file", + ) @classmethod def model_validate(cls, obj, **kwargs): # Parse fixtures if present (don't mutate caller's dict) fixtures_data = obj.get("fixtures", {}) - obj = {k: v for k, v in obj.items() if k != "fixtures"} + serializers_data = obj.get("serializers", {}) + obj = {k: v for k, v in obj.items() if k not in {"fixtures", "serializers"}} fixtures = {} + serializers = {} for name, defn in fixtures_data.items(): if isinstance(defn, dict): fixtures[name] = FixtureDefinition(**defn) elif isinstance(defn, FixtureDefinition): fixtures[name] = defn - instance = cls.model_construct(fixtures=fixtures, **obj) + for name, defn in serializers_data.items(): + if isinstance(defn, dict): + serializers[name] = SerializerSpec(**defn) + elif isinstance(defn, SerializerSpec): + serializers[name] = defn + + instance = cls.model_construct(fixtures=fixtures, serializers=serializers, **obj) return instance # Pydantic internal attributes to skip when iterating @@ -770,20 +893,21 @@ def 
model_validate(cls, obj, **kwargs): "model_dump", "model_dump_json", "fixtures", + "serializers", # Skip fixtures when iterating evals } ) def get_evals(self) -> dict[str, Evals]: result = {} - extras = getattr(self, "__pydantic_extra__", None) or {} + extras = getattr(self, "__pydantic_extra__", {}) for key, value in extras.items(): - if key == "fixtures": + if key in {"fixtures", "serializers"}: continue if isinstance(value, dict) and "dataset" in value: try: result[key] = Evals(id=key, **value) except Exception as e: - logger.warning(f"Failed to process eval '{key}': {e}") + logfire.warn("Failed to process eval '{key}': {error}", key=key, error=str(e)) return result diff --git a/src/vowel/evals.py b/src/vowel/evals.py index 6065042..4d7cab4 100644 --- a/src/vowel/evals.py +++ b/src/vowel/evals.py @@ -1,40 +1,161 @@ -"""Evaluator implementations for the vowel framework. - -This module contains the concrete evaluator classes that implement -the evaluation logic defined in eval_types.py. Each evaluator -integrates with pydantic-evals to provide result reporting. 
- -Evaluators: - AssertionEvaluator: Runs Python assertion expressions - TypeAdapterEvaluator: Validates output types using Pydantic - ContainsInputEvaluator: Checks if output contains input value - PatternMatchingEvaluator: Validates output against regex patterns - RaisesEvaluator: Validates expected exception raising - -Factory functions: - create_llm_judge: Creates an LLM-based judge evaluator - prepare_env_and_condition: Prepares evaluation context -""" +"""Concrete evaluator implementations used by the vowel runtime.""" import importlib.util -import logging import os import re import typing from contextlib import suppress from dataclasses import dataclass +import logfire from pydantic import ValidationError from pydantic.type_adapter import TypeAdapter from pydantic_ai.settings import ModelSettings -from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext, LLMJudge - -logger = logging.getLogger(__name__) +from pydantic_evals.evaluators import ( + EvaluationReason, + Evaluator, + EvaluatorContext, + LLMJudge, + OutputConfig, # noqa: F401 +) MONTY_AVAILABLE = bool(importlib.util.find_spec("pydantic-monty")) +SAFE_ASSERTION_BUILTINS = { + "abs": abs, + "all": all, + "any": any, + "bool": bool, + "dict": dict, + "enumerate": enumerate, + "float": float, + "int": int, + "isinstance": isinstance, + "len": len, + "list": list, + "max": max, + "min": min, + "range": range, + "round": round, + "set": set, + "sorted": sorted, + "str": str, + "sum": sum, + "tuple": tuple, + "type": type, + "zip": zip, +} + +SAFE_TYPE_NAMES = { + "Any": typing.Any, + "None": None, + "bool": bool, + "bytes": bytes, + "dict": dict, + "float": float, + "frozenset": frozenset, + "int": int, + "list": list, + "object": object, + "set": set, + "str": str, + "tuple": tuple, + "typing": typing, +} +SAFE_TYPE_NAMES.update( + {name: getattr(typing, name) for name in dir(typing) if not name.startswith("_")} +) + + +def _eval_assertion_restricted(condition: str, inputs: 
dict[str, typing.Any]) -> bool: + env = {"__builtins__": SAFE_ASSERTION_BUILTINS} + env.update(inputs) + return bool(eval(condition, env, env)) + + +def _eval_type_restricted(type_expr: str) -> typing.Any: + env = {"__builtins__": {}} + env.update(SAFE_TYPE_NAMES) + return eval(type_expr, env, env) + + +def _apply_serializer_for_assertion( + value: typing.Any, + serializer: type | typing.Callable | dict[str, type | typing.Callable] | None, + *, + param_name: str | None = None, +) -> typing.Any: + """Apply serializer in assertion path to mirror function call conversions.""" + if serializer is None: + return value + + if isinstance(serializer, dict): + if param_name and param_name in serializer: + return _apply_serializer_for_assertion(value, serializer[param_name]) + if isinstance(value, dict): + converted: dict[str, typing.Any] = {} + for key, item in value.items(): + if key in serializer: + converted[key] = _apply_serializer_for_assertion(item, serializer[key]) + else: + converted[key] = item + return converted + return value -def prepare_env_and_condition(ctx: EvaluatorContext, condition: str) -> tuple[dict, str]: + if isinstance(value, dict): + try: + return serializer(**value) + except TypeError: + return serializer(value) + + return serializer(value) + + +def _normalize_input_for_assertion( + raw_inputs: typing.Any, + serializer: type | typing.Callable | dict[str, type | typing.Callable] | None, + serializer_fn: typing.Callable[[dict], typing.Any] | None, +) -> typing.Any: + """Compute assertion `input` value from raw case inputs using active serializer config.""" + if not isinstance(raw_inputs, dict): + return _apply_serializer_for_assertion(raw_inputs, serializer) + + if serializer_fn is not None: + serialized = serializer_fn(raw_inputs) + if isinstance(serialized, tuple): + return serialized[0] if len(serialized) == 1 else serialized + return serialized + + if "input" in raw_inputs: + return _apply_serializer_for_assertion(raw_inputs["input"], serializer) + 
+ if "inputs" in raw_inputs: + values = raw_inputs["inputs"] + if values is None: + return None + if isinstance(values, dict): + if serializer is not None and not isinstance(serializer, dict): + return _apply_serializer_for_assertion(values, serializer) + if isinstance(serializer, dict): + return { + key: _apply_serializer_for_assertion(item, serializer, param_name=key) + for key, item in values.items() + } + return values + if serializer is None: + return values + return [_apply_serializer_for_assertion(item, serializer) for item in values] + + return raw_inputs + + +def prepare_env_and_condition( + ctx: EvaluatorContext, + condition: str, + *, + serializer: type | typing.Callable | dict[str, type | typing.Callable] | None = None, + serializer_fn: typing.Callable[[dict], typing.Any] | None = None, +) -> tuple[dict, str]: """ Prepare environment variables and format condition string for evaluation. @@ -45,12 +166,7 @@ def prepare_env_and_condition(ctx: EvaluatorContext, condition: str) -> tuple[di Returns: Tuple of (environment dict, formatted condition string) """ - actual_input = ctx.inputs - if isinstance(ctx.inputs, dict): - if "input" in ctx.inputs: - actual_input = ctx.inputs["input"] - elif "inputs" in ctx.inputs: - actual_input = ctx.inputs["inputs"] + actual_input = _normalize_input_for_assertion(ctx.inputs, serializer, serializer_fn) env = { "input": actual_input, @@ -77,9 +193,18 @@ class AssertionEvaluator(Evaluator): metrics, metadata, and duration variables. 
""" - def __init__(self, condition: str, *, evaluation_name: str = "Assertion"): + def __init__( + self, + condition: str, + *, + evaluation_name: str = "Assertion", + serializer: type | typing.Callable | dict[str, type | typing.Callable] | None = None, + serializer_fn: typing.Callable[[dict], typing.Any] | None = None, + ): self.condition = condition self.evaluation_name = evaluation_name + self.serializer = serializer + self.serializer_fn = serializer_fn self.interpreter = None if MONTY_AVAILABLE: import pydantic_monty @@ -96,7 +221,12 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason: return EvaluationReason(value=True, reason="Skipped (exception case)") if "__import__" in self.condition: raise ValueError(f"__import__ is not allowed in assertions: {self.condition}") - env, condition = prepare_env_and_condition(ctx, self.condition) + env, condition = prepare_env_and_condition( + ctx, + self.condition, + serializer=self.serializer, + serializer_fn=self.serializer_fn, + ) # TL;DR # BETA API @@ -126,6 +256,20 @@ def eval_python(self, condition: str, inputs: dict) -> EvaluationReason: ) except Exception: + pass + + try: + if _eval_assertion_restricted(self.condition, inputs): + return EvaluationReason( + value=True, reason=f"Assertion passed for condition: {condition}" + ) + except Exception as exc: + logfire.info( + "Restricted assertion eval failed; falling back to raw eval", + condition=self.condition, + error_type=type(exc).__name__, + error=str(exc), + ) with suppress(Exception): if eval(self.condition, inputs, inputs): return EvaluationReason( @@ -150,15 +294,8 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason: """Validate that output matches the expected type.""" if isinstance(ctx.output, dict) and "_exception" in ctx.output: return EvaluationReason(value=True, reason="Skipped (exception case)") - type_env = { - "typing": typing, - "__import__": None, - "eval": None, - "exec": None, - "compile": None, - } try: - expected_type = 
eval(self.type, type_env, type_env) + expected_type = _eval_type_restricted(self.type) ta = TypeAdapter(expected_type) except Exception: return EvaluationReason( @@ -328,7 +465,9 @@ def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason: ) if self.expected_exception_match: exception_message = str(actual_exception) - if not re.search(self.expected_exception_match, exception_message, re.I): + if self.expected_exception_match != exception_message and not re.search( + self.expected_exception_match, exception_message, re.I + ): return EvaluationReason( value=False, reason=( @@ -348,6 +487,9 @@ def create_llm_judge( include: list[str] | None = None, config: dict | None = None, ) -> LLMJudge: + # Imported lazily to avoid circular import at module import time. + from .utils import _resolve_env_ref + if config is None: config = {} @@ -357,14 +499,8 @@ def create_llm_judge( "'model' must be specified in config or set JUDGE_MODEL environment variable" ) - if model.strip().startswith("$"): - env_var = model.strip().lstrip("$") - model = os.getenv(env_var) - if not model: - raise ValueError( - f"Environment variable {env_var} is not set for judge model, " - f"set {env_var} to a valid model name." 
- ) + model = _resolve_env_ref(model, field_name="model") + rubric = _resolve_env_ref(rubric, field_name="rubric") include_input = False include_expected_output = False diff --git a/src/vowel/executor.py b/src/vowel/executor.py new file mode 100644 index 0000000..1046744 --- /dev/null +++ b/src/vowel/executor.py @@ -0,0 +1,1030 @@ +"""Execution backends used by CodeMode for sandboxed and local code runs.""" + +from __future__ import annotations + +import ast +import asyncio +import contextlib +import importlib.util +import io +import time +from collections.abc import Callable +from dataclasses import dataclass +from typing import Any, Literal, Protocol, runtime_checkable + +import logfire as _logfire + +NEST_AVAILABLE = importlib.util.find_spec("nest_asyncio") is not None +MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def run_sync(coro: Any) -> Any: + """Run a coroutine from sync code, including running-loop environments.""" + try: + return asyncio.run(coro) + except RuntimeError as exc: + if "running event loop" not in str(exc) and "cannot be called from a running" not in str( + exc + ): + raise + # Already inside an event loop — patch and retry + if not NEST_AVAILABLE: + raise RuntimeError( + "execute_sync() was called from inside a running event loop. " + "Install nest-asyncio to support this: pip install nest-asyncio" + ) from exc + + import nest_asyncio + + nest_asyncio.apply() + loop = asyncio.get_event_loop() + return loop.run_until_complete(coro) + + +# --------------------------------------------------------------------------- +# Result type +# --------------------------------------------------------------------------- + + +@dataclass +class ExecutionResult: + """Result of running a code snippet through an executor. 
+ + Attributes + ---------- + output: + The value of the last expression evaluated in the snippet, or the + value assigned to ``__result__`` in the namespace. ``None`` when + execution fails. + stdout: + Everything written to stdout during execution (via ``print()``). + success: + ``True`` when the snippet completed without raising an exception. + error: + Human-readable error message when ``success is False``. + error_type: + The Python exception class name (e.g. ``"ValueError"``) when + ``success is False``. + duration_ms: + Wall-clock time spent executing the snippet, in milliseconds. + """ + + output: Any + stdout: str + success: bool + error: str | None = None + error_type: str | None = None + duration_ms: float = 0.0 + + +# --------------------------------------------------------------------------- +# Protocol +# --------------------------------------------------------------------------- + + +@runtime_checkable +class Executor(Protocol): + """Protocol for code execution backends used by CodeMode. + + Any callable object that matches this signature qualifies — concrete + classes do *not* need to inherit from ``Executor``. + + Parameters + ---------- + code: + Python source code to execute. + inputs: + ``dict[str, Any]`` of values injected as top-level variables + visible to the snippet. For example ``{"x": 42}`` makes ``x`` + available inside the code. + external_functions: + ``dict[str, Callable]`` of host-side callbacks the snippet can + call by name. In the Monty backend each call exits the sandbox, + runs on the host, and returns the result — so the real function + can use any library. + timeout: + Maximum wall-clock seconds allowed for the snippet. Execution is + interrupted (or the result discarded) after this duration. + max_memory: + Maximum heap memory in bytes available to the sandbox. Ignored by + ``DefaultExecutor`` which has no memory isolation. 
+ + Returns + ------- + ExecutionResult + """ + + async def execute( + self, + code: str, + *, + inputs: dict[str, Any] | None = None, + external_functions: dict[str, Callable[..., Any]] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + raise NotImplementedError + + def execute_sync( + self, + code: str, + *, + inputs: dict[str, Any] | None = None, + external_functions: dict[str, Callable[..., Any]] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + raise NotImplementedError + + def create_session( + self, + setup_code: str, + *, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionSession: + raise NotImplementedError + + +# --------------------------------------------------------------------------- +# ExecutionSession — compile once, feed many snippets +# --------------------------------------------------------------------------- + + +@runtime_checkable +class ExecutionSession(Protocol): + """A reusable execution context with pre-compiled setup code. + + The session compiles the *setup_code* (typically a function definition) + once, then each ``feed()`` call runs a snippet against the preserved + runtime state without re-parsing the setup code. + + This is the key optimisation for CodeMode exploration: when testing + N edge-case snippets against the same function, the function is parsed + and compiled only once instead of N times. + + The session is a context manager — use ``async with`` or ``with`` to + ensure proper cleanup. + """ + + def feed(self, code: str) -> ExecutionResult: + """Execute *code* against the session's pre-compiled state. + + Returns an ``ExecutionResult`` with the last expression value, + stdout, and error info (if any). 
+ """ + raise NotImplementedError + + def close(self) -> None: + """Release resources held by the session.""" + raise NotImplementedError + + def __enter__(self) -> ExecutionSession: + return self + + def __exit__(self, *_: Any) -> None: + self.close() + + +# --------------------------------------------------------------------------- +# MontyReplSession — sandboxed session using MontyRepl +# --------------------------------------------------------------------------- + + +class MontyReplSession: + """Session backed by ``MontyRepl`` — compile once, feed many snippets. + + On construction the *setup_code* is parsed, compiled and executed once + via ``MontyRepl.create()``. Each subsequent ``feed()`` call runs a + snippet against the preserved heap/globals without re-parsing the setup + code. + + This is the recommended path for CodeMode exploration with Monty. For + a function with N edge-case snippets, the function source is compiled + only once. + """ + + def __init__( + self, + setup_code: str, + *, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> None: + import pydantic_monty + + self._pydantic_monty = pydantic_monty + self._limits = pydantic_monty.ResourceLimits( + max_duration_secs=timeout, + max_memory=max_memory, + ) + + stdout_lines: list[str] = [] + + def _print_callback(_stream: str, text: str) -> None: + stdout_lines.append(text) + + # Create empty REPL and initialize it with setup code + self._repl = pydantic_monty.MontyRepl(limits=self._limits) + self._repl.feed_run(setup_code, print_callback=_print_callback) + self._setup_stdout = "\n".join(stdout_lines) + + def feed(self, code: str) -> ExecutionResult: + """Execute *code* against the REPL's preserved state.""" + stdout_lines: list[str] = [] + + def _print_callback(_stream: str, text: str) -> None: + stdout_lines.append(text) + + t0 = time.perf_counter() + try: + if not getattr(self, "_repl", None): + # TODO: wrap with custom exception and detailed message + raise ValueError("Repl not 
found.") + else: + output = self._repl.feed_run(code, print_callback=_print_callback) + duration_ms = (time.perf_counter() - t0) * 1000 + return ExecutionResult( + output=output, + stdout="\n".join(stdout_lines), + success=True, + duration_ms=duration_ms, + ) + + except self._pydantic_monty.MontyRuntimeError as exc: + duration_ms = (time.perf_counter() - t0) * 1000 + inner = exc.exception() + return ExecutionResult( + output=None, + stdout="\n".join(stdout_lines), + success=False, + error=exc.display(format="type-msg"), + error_type=type(inner).__name__, + duration_ms=duration_ms, + ) + + except self._pydantic_monty.MontySyntaxError as exc: + duration_ms = (time.perf_counter() - t0) * 1000 + return ExecutionResult( + output=None, + stdout="", + success=False, + error=str(exc), + error_type="SyntaxError", + duration_ms=duration_ms, + ) + + except Exception as exc: # noqa: BLE001 + duration_ms = (time.perf_counter() - t0) * 1000 + return ExecutionResult( + output=None, + stdout="\n".join(stdout_lines), + success=False, + error=str(exc), + error_type=type(exc).__name__, + duration_ms=duration_ms, + ) + + def close(self) -> None: + """Release the REPL instance.""" + # TODO: not sure about releasing the REPL instance is needed + # self._repl = None # type: ignore + + def __enter__(self) -> MontyReplSession: + return self + + def __exit__(self, *_: Any) -> None: + self.close() + + +# --------------------------------------------------------------------------- +# FallbackSession — Monty with auto-fallback to DefaultSession +# --------------------------------------------------------------------------- + + +class FallbackSession: + """Session that tries MontyReplSession first, falls back to DefaultSession. + + Two fallback modes: + + 1. **Session-level**: If ``MontyReplSession.__init__`` raises (e.g. + ``MontySyntaxError`` for unsupported syntax like f-string ``!r``), + the entire session transparently switches to ``DefaultSession``. + + 2. 
**Snippet-level**: If a ``feed()`` call returns a + ``ModuleNotFoundError`` (Monty doesn't have the module), that single + snippet is re-executed via a ``DefaultSession``. Subsequent Monty + feeds continue normally — only the failing snippet falls back. + """ + + def __init__( + self, + setup_code: str, + *, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + fallback_executor: Executor | None = None, + ) -> None: + self._setup_code = setup_code + self._timeout = timeout + self._max_memory = max_memory + self._fallback_executor = fallback_executor or DefaultExecutor() + self._monty_session: MontyReplSession | None = None + self._fallback_session: ExecutionSession | None = None + self._monty_failed_permanently = False + + try: + self._monty_session = MontyReplSession( + setup_code, + timeout=timeout, + max_memory=max_memory, + ) + except Exception as exc: + _logfire.info( + "Monty session creation failed ({exc_type}: {exc_msg}), falling back to {fallback}", + exc_type=type(exc).__name__, + exc_msg=str(exc), + fallback=type(self._fallback_executor).__name__, + ) + self._monty_failed_permanently = True + self._fallback_session = self._fallback_executor.create_session( + setup_code, + timeout=timeout, + max_memory=max_memory, + ) + + def _get_fallback_session(self) -> ExecutionSession: + """Lazily create the fallback session (only when first needed).""" + if self._fallback_session is None: + self._fallback_session = self._fallback_executor.create_session( + self._setup_code, + timeout=self._timeout, + max_memory=self._max_memory, + ) + return self._fallback_session + + def feed(self, code: str) -> ExecutionResult: + """Execute *code*, falling back to the configured session on Monty gaps.""" + # Session-level fallback — Monty never worked + if self._monty_failed_permanently: + return self._get_fallback_session().feed(code) + + assert self._monty_session is not None + result = self._monty_session.feed(code) + + # Snippet-level fallback — ModuleNotFoundError 
means Monty + # doesn't have that stdlib module; retry with fallback session. + if not result.success and result.error_type == "ModuleNotFoundError": + _logfire.info( + "Monty ModuleNotFoundError, retrying snippet with {fallback}: {error}", + fallback=type(self._fallback_executor).__name__, + error=result.error, + ) + return self._get_fallback_session().feed(code) + + return result + + def close(self) -> None: + if self._monty_session is not None: + self._monty_session.close() + if self._fallback_session is not None: + self._fallback_session.close() + + def __enter__(self) -> FallbackSession: + return self + + def __exit__(self, *_: Any) -> None: + self.close() + + +# --------------------------------------------------------------------------- +# MontyExecutor — sandboxed, production-grade +# --------------------------------------------------------------------------- + + +class MontyExecutor: + """Sandboxed executor backed by ``pydantic-monty`` (Rust interpreter). + + Monty provides strict isolation: no filesystem access, no network, no + environment variables. External functions are injected as host-side + callbacks — they run on the *host* Python process with full access to + stdlib and third-party libraries. + + Uses ``pydantic_monty.run_monty_async`` which implements Monty's step + protocol (``start()`` → ``MontySnapshot`` → ``resume()``) with proper + async support. External functions can be sync or async — Monty handles + both transparently. The GIL is released during execution and Monty + steps are offloaded to a thread pool. + + Requires the ``pydantic-monty`` package:: + + pip install "vowel[monty]" # or: pip install pydantic-monty + + Raises + ------ + ImportError + If ``pydantic-monty`` is not installed. + """ + + def __init__(self, fallback_executor: Executor | None = None) -> None: + if not MONTY_AVAILABLE: + raise ImportError( + 'MontyExecutor requires pydantic-monty. 
Install it with: pip install "vowel[monty]"' + ) + self._fallback_executor = fallback_executor or DefaultExecutor() + + async def execute( + self, + code: str, + *, + inputs: dict[str, Any] | None = None, + external_functions: dict[str, Callable[..., Any]] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + """Execute *code* inside the Monty sandbox. + + Delegates to ``pydantic_monty.run_monty_async`` which handles the + full step protocol (``start()`` → snapshot → ``resume()``). + + ``NameLookupSnapshot`` (undefined variables) is not handled by + ``run_monty_async`` — it raises ``AssertionError``. We catch that + and use ``isinstance`` to detect the snapshot type cleanly. + + Parameters + ---------- + code: + Python source to run. + inputs: + Values injected as top-level variables (Monty ``inputs``). + external_functions: + Host-side callbacks the snippet can call by name. + timeout / max_memory: + Resource limits forwarded to Monty. + """ + import pydantic_monty + + stdout_lines: list[str] = [] + + def _print_callback(_stream: str, text: str) -> None: + stdout_lines.append(text) + + input_names = list(inputs) if inputs else None + + limits = pydantic_monty.ResourceLimits( + max_duration_secs=timeout, + max_memory=max_memory, + ) + + t0 = time.perf_counter() + try: + m = pydantic_monty.Monty( + code, + inputs=input_names, + ) + output = await pydantic_monty.run_monty_async( + m, + inputs=inputs, + limits=limits, + external_functions=external_functions, + print_callback=_print_callback, + ) + duration_ms = (time.perf_counter() - t0) * 1000 + return ExecutionResult( + output=output, + stdout="\n".join(stdout_lines), + success=True, + duration_ms=duration_ms, + ) + + except pydantic_monty.MontyRuntimeError as exc: + duration_ms = (time.perf_counter() - t0) * 1000 + inner = exc.exception() + return ExecutionResult( + output=None, + stdout="\n".join(stdout_lines), + success=False, + 
error=exc.display(format="type-msg"), + error_type=type(inner).__name__, + duration_ms=duration_ms, + ) + + except pydantic_monty.MontySyntaxError as exc: + duration_ms = (time.perf_counter() - t0) * 1000 + return ExecutionResult( + output=None, + stdout="", + success=False, + error=str(exc), + error_type="SyntaxError", + duration_ms=duration_ms, + ) + + except AssertionError as exc: + duration_ms = (time.perf_counter() - t0) * 1000 + # run_monty_async doesn't handle NameLookupSnapshot — it hits + # `assert isinstance(progress, FutureSnapshot)` and the repr + # of the snapshot is embedded in the assertion message. + exc_msg = str(exc) + if "NameLookupSnapshot" in exc_msg: + marker = 'variable_name="' + start = exc_msg.find(marker) + if start != -1: + start += len(marker) + end = exc_msg.find('"', start) + var = exc_msg[start:end] + error = f"name '{var}' is not defined" + else: + error = "name is not defined" + return ExecutionResult( + output=None, + stdout="\n".join(stdout_lines), + success=False, + error=error, + error_type="NameError", + duration_ms=duration_ms, + ) + return ExecutionResult( + output=None, + stdout="\n".join(stdout_lines), + success=False, + error=exc_msg, + error_type=type(exc).__name__, + duration_ms=duration_ms, + ) + + except Exception as exc: # noqa: BLE001 — catch-all for unexpected errors + duration_ms = (time.perf_counter() - t0) * 1000 + return ExecutionResult( + output=None, + stdout="\n".join(stdout_lines), + success=False, + error=str(exc), + error_type=type(exc).__name__, + duration_ms=duration_ms, + ) + + def execute_sync( + self, + code: str, + *, + inputs: dict[str, Any] | None = None, + external_functions: dict[str, Callable[..., Any]] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + """Synchronous wrapper around :meth:`execute`.""" + return run_sync( + self.execute( + code, + inputs=inputs, + external_functions=external_functions, + timeout=timeout, + max_memory=max_memory, 
+ ) + ) + + def create_session( + self, + setup_code: str, + *, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> FallbackSession: + """Create a session that uses Monty with auto-fallback to DefaultSession. + + The *setup_code* (typically a function definition) is compiled and + executed **once**. If Monty cannot handle the code (e.g. unsupported + syntax), the session transparently falls back to ``DefaultSession``. + Individual ``feed()`` calls also fall back on ``ModuleNotFoundError``. + """ + return FallbackSession( + setup_code, + timeout=timeout, + max_memory=max_memory, + fallback_executor=self._fallback_executor, + ) + + +# --------------------------------------------------------------------------- +# DefaultSession — unsandboxed session using persistent namespace +# --------------------------------------------------------------------------- + + +def _rewrite_last_expr(code: str) -> tuple[Any, bool]: + """Parse *code* and rewrite the last expression to capture its value. + + Returns ``(compiled_code, has_result)`` where *has_result* is True when + the last statement was an expression that was rewritten to assign to + ``__result__``. + """ + tree = ast.parse(code, "", "exec") + has_result = False + if tree.body and isinstance(tree.body[-1], ast.Expr): + last_expr = tree.body.pop() + assign = ast.Assign( + targets=[ast.Name(id="__result__", ctx=ast.Store())], + value=last_expr.value, # type: ignore[attr-defined] + ) + ast.copy_location(last_expr, assign) + tree.body.append(assign) + ast.fix_missing_locations(tree) + has_result = True + return compile(tree, "", "exec"), has_result + + +class DefaultSession: + """Session backed by a persistent ``exec()`` namespace. + + The *setup_code* is executed once into a namespace dict on construction. + Each ``feed()`` call executes a snippet in the **same** namespace, so + functions and variables defined in the setup remain available. 
+ + This mirrors ``MontyReplSession`` semantics for environments where Monty + is not installed. + """ + + def __init__( + self, + setup_code: str, + *, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> None: + self._namespace: dict[str, Any] = {} + self._timeout = timeout + # Execute setup code to define functions/variables + exec(compile(setup_code, "", "exec"), self._namespace) # noqa: S102 + + def feed(self, code: str) -> ExecutionResult: + """Execute *code* against the session's persistent namespace.""" + # Remove any leftover __result__ from previous feed + self._namespace.pop("__result__", None) + + try: + compiled, _has_result = _rewrite_last_expr(code) + except SyntaxError as exc: + return ExecutionResult( + output=None, + stdout="", + success=False, + error=str(exc), + error_type="SyntaxError", + duration_ms=0.0, + ) + + stdout_buf = io.StringIO() + t0 = time.perf_counter() + try: + with contextlib.redirect_stdout(stdout_buf): + exec(compiled, self._namespace) # noqa: S102 + duration_ms = (time.perf_counter() - t0) * 1000 + output = self._namespace.get("__result__") + return ExecutionResult( + output=output, + stdout=stdout_buf.getvalue(), + success=True, + duration_ms=duration_ms, + ) + + except Exception as exc: # noqa: BLE001 + duration_ms = (time.perf_counter() - t0) * 1000 + return ExecutionResult( + output=None, + stdout=stdout_buf.getvalue(), + success=False, + error=str(exc), + error_type=type(exc).__name__, + duration_ms=duration_ms, + ) + + def close(self) -> None: + """Clear the namespace.""" + self._namespace.clear() + + def __enter__(self) -> DefaultSession: + return self + + def __exit__(self, *_: Any) -> None: + self.close() + + +# --------------------------------------------------------------------------- +# DefaultExecutor — exec()-based, no sandbox +# --------------------------------------------------------------------------- + + +class DefaultExecutor: + """Unsandboxed executor backed by Python's built-in ``exec()``. 
+ + **WARNING: runs code with full host privileges.** Only suitable for + development, local testing, or environments where the code being executed + is fully trusted. + + Both ``inputs`` and ``external_functions`` are merged into the execution + namespace so the snippet can reference them as plain names. The last + assigned value of ``__result__``, or the module-level name ``results`` + if present, is returned as ``output``. + + No additional dependencies required — works with plain Python. + + Notes + ----- + * ``timeout`` and ``max_memory`` parameters are accepted for API + compatibility but are **not enforced**. + * Stdout is captured via ``contextlib.redirect_stdout``. + """ + + async def execute( + self, + code: str, + *, + inputs: dict[str, Any] | None = None, + external_functions: dict[str, Callable[..., Any]] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + """Execute *code* using ``exec()`` — no sandbox, no isolation. + + To match Monty's behaviour, the value of the *last expression* in + the snippet is captured automatically using ``ast`` rewriting. If + the snippet explicitly sets ``__result__``, that takes priority. + """ + namespace: dict[str, Any] = {} + if inputs: + namespace.update(inputs) + if external_functions: + namespace.update(external_functions) + + # Rewrite the last expression statement to capture its value. 
+ try: + tree = ast.parse(code, "", "exec") + if tree.body and isinstance(tree.body[-1], ast.Expr): + last_expr = tree.body.pop() + assign = ast.Assign( + targets=[ast.Name(id="__result__", ctx=ast.Store())], + value=last_expr.value, # type: ignore[attr-defined] + ) + ast.copy_location(last_expr, assign) + tree.body.append(assign) + ast.fix_missing_locations(tree) + compiled = compile(tree, "", "exec") + except SyntaxError as exc: + return ExecutionResult( + output=None, + stdout="", + success=False, + error=str(exc), + error_type="SyntaxError", + duration_ms=0.0, + ) + + stdout_buf = io.StringIO() + t0 = time.perf_counter() + try: + with contextlib.redirect_stdout(stdout_buf): + exec(compiled, namespace) # noqa: S102 + duration_ms = (time.perf_counter() - t0) * 1000 + + output = namespace.get("__result__") + + return ExecutionResult( + output=output, + stdout=stdout_buf.getvalue(), + success=True, + duration_ms=duration_ms, + ) + + except Exception as exc: # noqa: BLE001 + duration_ms = (time.perf_counter() - t0) * 1000 + return ExecutionResult( + output=None, + stdout=stdout_buf.getvalue(), + success=False, + error=str(exc), + error_type=type(exc).__name__, + duration_ms=duration_ms, + ) + + def execute_sync( + self, + code: str, + *, + inputs: dict[str, Any] | None = None, + external_functions: dict[str, Callable[..., Any]] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + """Synchronous wrapper around :meth:`execute`.""" + return run_sync( + self.execute( + code, + inputs=inputs, + external_functions=external_functions, + timeout=timeout, + max_memory=max_memory, + ) + ) + + def create_session( + self, + setup_code: str, + *, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> DefaultSession: + """Create an unsandboxed session with a persistent namespace. + + The *setup_code* is executed once into a shared namespace dict. 
+ Each ``session.feed(snippet)`` call runs in the same namespace, + preserving functions and variables across calls. + """ + return DefaultSession( + setup_code, + timeout=timeout, + max_memory=max_memory, + ) + + +class ResolvedExecutor: + """Executor wrapper that falls back when the primary executor raises.""" + + def __init__(self, primary: Executor, fallback: Executor) -> None: + self.primary = primary + self.fallback = fallback + + async def execute( + self, + code: str, + *, + inputs: dict[str, Any] | None = None, + external_functions: dict[str, Callable[..., Any]] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + try: + return await self.primary.execute( + code, + inputs=inputs, + external_functions=external_functions, + timeout=timeout, + max_memory=max_memory, + ) + except Exception as exc: # noqa: BLE001 + _logfire.info( + "Primary executor {primary} raised {exc_type}; falling back to {fallback}", + primary=type(self.primary).__name__, + exc_type=type(exc).__name__, + fallback=type(self.fallback).__name__, + ) + return await self.fallback.execute( + code, + inputs=inputs, + external_functions=external_functions, + timeout=timeout, + max_memory=max_memory, + ) + + def execute_sync( + self, + code: str, + *, + inputs: dict[str, Any] | None = None, + external_functions: dict[str, Callable[..., Any]] | None = None, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionResult: + try: + return self.primary.execute_sync( + code, + inputs=inputs, + external_functions=external_functions, + timeout=timeout, + max_memory=max_memory, + ) + except Exception as exc: # noqa: BLE001 + _logfire.info( + "Primary executor {primary} raised {exc_type}; falling back to {fallback}", + primary=type(self.primary).__name__, + exc_type=type(exc).__name__, + fallback=type(self.fallback).__name__, + ) + return self.fallback.execute_sync( + code, + inputs=inputs, + external_functions=external_functions, + 
timeout=timeout, + max_memory=max_memory, + ) + + def create_session( + self, + setup_code: str, + *, + timeout: float = 5.0, + max_memory: int = 10 * 1024 * 1024, + ) -> ExecutionSession: + try: + return self.primary.create_session( + setup_code, + timeout=timeout, + max_memory=max_memory, + ) + except Exception as exc: # noqa: BLE001 + _logfire.info( + "Primary executor {primary} session creation raised {exc_type}; " + "falling back to {fallback}", + primary=type(self.primary).__name__, + exc_type=type(exc).__name__, + fallback=type(self.fallback).__name__, + ) + return self.fallback.create_session( + setup_code, + timeout=timeout, + max_memory=max_memory, + ) + + +def resolve_executors( + executor: Executor | None = None, + fallback_executor: Executor | None = None, +) -> Executor: + """Resolve primary/fallback executors while preserving Monty-first defaults.""" + fallback = fallback_executor or DefaultExecutor() + + if isinstance(executor, ResolvedExecutor): + if fallback_executor is None: + return executor + return ResolvedExecutor(executor.primary, fallback) + + if executor is None: + if MONTY_AVAILABLE: + return MontyExecutor(fallback_executor=fallback) + import warnings + + warnings.warn( + "pydantic-monty not installed; using fallback executor " + f'{type(fallback).__name__} (no sandboxing). 
Install with: pip install "vowel[monty]"', + stacklevel=2, + ) + return fallback + + if isinstance(executor, DefaultExecutor) and fallback_executor is None: + return executor + + if isinstance(executor, MontyExecutor): + executor._fallback_executor = fallback # type: ignore[attr-defined] + return executor + + if executor is fallback: + return executor + + return ResolvedExecutor(executor, fallback) + + +# --------------------------------------------------------------------------- +# Factory +# --------------------------------------------------------------------------- + + +def get_executor(backend: Literal["monty", "auto", "default"] = "auto") -> Executor: + """Return a configured executor instance. + + Parameters + ---------- + backend: + ``"monty"`` — always use ``MontyExecutor`` (raises if not installed). + ``"default"`` — always use ``DefaultExecutor``. + ``"auto"`` — use ``MontyExecutor`` when available, fall back to + ``DefaultExecutor`` with a warning. + + Returns + ------- + Executor + A ready-to-use executor instance. + """ + if backend == "monty": + return MontyExecutor() + + if backend == "default": + return DefaultExecutor() + + if backend == "auto": + if MONTY_AVAILABLE: + return MontyExecutor() + import warnings + + warnings.warn( + "pydantic-monty not installed; falling back to DefaultExecutor " + '(no sandboxing). Install with: pip install "vowel[monty]"', + stacklevel=2, + ) + return DefaultExecutor() + + raise ValueError( + f"Unknown executor backend: {backend!r}. Choose 'monty', 'default', or 'auto'." + ) diff --git a/src/vowel/mcp_server.py b/src/vowel/mcp_server.py index 729a44e..fc84660 100644 --- a/src/vowel/mcp_server.py +++ b/src/vowel/mcp_server.py @@ -1,63 +1,4 @@ -"""Vowel MCP Server - Model Context Protocol server for eval generation. 
- -This module exposes vowel's full evaluation, generation, and TDD capabilities via -MCP (Model Context Protocol), enabling AI assistants to run evaluations, generate -functions, create test specs, and perform TDD workflows. - -Configuration is set via the ``env`` field in your MCP client JSON config. -The env field should contain API keys and model names only. All other parameters -(auto_retry, min_coverage, etc.) are tool parameters with sensible defaults. - -Usage: - # Add to MCP client config (e.g., Claude Desktop, VS Code Copilot) - { - "mcpServers": { - "vowel": { - "command": "python", - "args": ["-m", "vowel.mcp_server"], - "env": { - "MODEL_NAME": "openai:gpt-4o", - "OPENAI_API_KEY": "sk-..." - } - } - } - } - - # Or run directly (reads env vars from shell) - python -m vowel.mcp_server - -Supported env vars: - MODEL_NAME — Default LLM model (e.g. "openai:gpt-4o", "gemini-3-flash-preview") - JUDGE_MODEL — Model for LLM Judge evaluator - OPENAI_API_KEY — OpenAI API key - ANTHROPIC_API_KEY — Anthropic API key - GOOGLE_API_KEY — Google AI API key - -Available Tools (14): - Eval Runner: - - run_evals_from_file: Run evaluations from a YAML file - - run_evals_from_yaml: Run evaluations from YAML content string - - run_evals_with_fixtures: Run evaluations with fixture injection - - validate_yaml_spec: Validate a YAML eval specification - - check_function_compatibility: Check function compatibility with eval generation - - list_yaml_files: List YAML files in a directory - - EvalGenerator: - - generate_function: Generate a Python function from description - - generate_eval_spec: Generate eval spec for a function - - generate_and_run_evals: Generate spec + run + auto-retry + heal - - TDDGenerator: - - tdd_generate_signature: Generate function signature from description - - tdd_generate_evals: Generate eval spec from a signature - - tdd_generate_implementation: Generate implementation from signature + spec - - tdd_generate_all: Full TDD flow: description → signature 
→ evals → implementation - - tdd_generate_and_validate: TDD with eval validation against implementation - -Available Resources: - - vowel://context: Eval specification documentation - - vowel://example: Example YAML eval specification -""" +"""MCP server exposing vowel evaluation, generation, and TDD tools.""" from __future__ import annotations @@ -67,11 +8,12 @@ import nest_asyncio from mcp.server.fastmcp import FastMCP -from vowel import check_compatibility, load_evals_from_yaml_string, run_evals +from vowel import check_compatibility, run_evals from vowel.ai import EVAL_SPEC_CONTEXT, EvalGenerator from vowel.monitoring import enable_monitoring from vowel.runner import Function, RunEvals from vowel.tdd import TDDGenerator +from vowel.utils import load_bundle_from_yaml_string enable_monitoring(service_name="vowel-mcp") @@ -204,8 +146,8 @@ def validate_yaml_spec(yaml_content: str) -> dict[str, Any]: yaml_content: YAML eval specification to validate """ try: - evals = load_evals_from_yaml_string(yaml_content) - function_names = list(evals.keys()) + bundle = load_bundle_from_yaml_string(yaml_content) + function_names = list(bundle.evals.keys()) return { "valid": True, "functions": function_names, diff --git a/src/vowel/runner.py b/src/vowel/runner.py index a69c3e5..66f39c2 100644 --- a/src/vowel/runner.py +++ b/src/vowel/runner.py @@ -1,28 +1,4 @@ -"""RunEvals - A fluent API for running evaluations. 
- -This module provides: -- Function: Pydantic model representing a function with code and metadata -- RunEvals: Fluent API for loading and running evaluations - -Example: - # Run from YAML file - from vowel import RunEvals - - summary = RunEvals.from_file("evals.yml").run() - print(f"All passed: {summary.all_passed}") - - # Run with custom functions - def my_func(x): - return x * 2 - - summary = ( - RunEvals.from_file("evals.yml") - .with_functions({"my_func": my_func}) - .filter(["my_func"]) - .debug() - .run() - ) -""" +"""Fluent APIs and models for loading and running evals.""" import ast import codecs @@ -36,7 +12,8 @@ def my_func(x): from pydantic import BaseModel, Field from .eval_types import Evals, EvalsFile, FixtureDefinition -from .utils import EvalSummary +from .executor import Executor +from .utils import EvalsBundle, EvalSummary from .utils import run_evals as _run_evals _T = TypeVar("_T", bound=Any) @@ -75,12 +52,7 @@ def __name__(self) -> str: # pyright: ignore[reportIncompatibleVariableOverride @property def impl(self) -> Callable[..., _RT]: - """ - Get the function implementation as a callable. - - Returns: - Callable: The function implementation. - """ + """Return the executable function object for this definition.""" if not self.func: self.execute() return cast(Callable, self.func) @@ -97,6 +69,7 @@ def execute(self) -> None: local_scope: dict[str, object] = {} try: code = self.code + code = self._sanitize_code(code) try: exec(code, local_scope, local_scope) except Exception: @@ -105,12 +78,62 @@ def execute(self) -> None: exec(code, local_scope, local_scope) else: raise + self.code = code # persist cleaned code for downstream use except Exception as e: raise RuntimeError(f"Error executing code for function '{self.name}'.") from e self.func = local_scope[self.name] + @staticmethod + def _sanitize_code(code: str) -> str: + """Fix common LLM code-generation artefacts before exec. + + 1. Strip escaped quotes (``\\\"``) that break docstrings. + 2. 
Remove redundant ``from typing import`` of Python 3.9+ builtins + (dict, list, tuple, set, frozenset, type) that cause ImportError + on Python ≥ 3.11. + """ + import re as _re + + # 1. Un-escape literal backslash-quote sequences + if '\\"' in code or "\\'" in code: + code = code.replace('\\"', '"').replace("\\'", "'") + + # 2. Remove typing imports of builtin generics + _builtin_generics = { + "Dict", + "List", + "Tuple", + "Set", + "FrozenSet", + "Type", + "dict", + "list", + "tuple", + "set", + "frozenset", + "type", + } + + def _clean_typing_import(m: _re.Match) -> str: + names = [n.strip() for n in m.group(1).split(",")] + remaining = [n for n in names if n not in _builtin_generics] + if not remaining: + return "" # remove the entire import line + return f"from typing import {', '.join(remaining)}" + + code = _re.sub( + r"^from\s+typing\s+import\s+(.+)$", + _clean_typing_import, + code, + flags=_re.MULTILINE, + ) + # Remove any blank lines left behind + code = _re.sub(r"\n{3,}", "\n\n", code) + + return code + def __call__(self, *args, **kwargs) -> _RT: """ Call the function implementation with the provided arguments. 
@@ -230,7 +253,7 @@ class RunEvals: def __init__( self, - source: str | Path | dict | EvalsFile | Evals | Sequence[Evals], + source: str | Path | dict | EvalsFile | EvalsBundle | Evals | Sequence[Evals], *, functions: dict[str, Callable] | None = None, filter_funcs: list[str] | None = None, @@ -241,6 +264,8 @@ def __init__( dict[str, Callable | tuple[Callable, Callable | None] | FixtureDefinition] | None ) = None, ignore_duration: bool = False, + executor: Executor | None = None, + fallback_executor: Executor | None = None, ): self._source = source self._functions = functions or {} @@ -250,6 +275,8 @@ def __init__( self._serial_fn = serial_fn or {} self._fixtures = fixtures or {} self._ignore_duration = ignore_duration + self._executor = executor + self._fallback_executor = fallback_executor @classmethod def from_file(cls, path: str | Path) -> "RunEvals": @@ -267,6 +294,22 @@ def from_file(cls, path: str | Path) -> "RunEvals": """ return cls(str(path)) + @classmethod + def from_bundle(cls, bundle: EvalsBundle) -> "RunEvals": + """ + Create from a EvalsBundle object. 
+ + Args: + bundle: EvalsBundle object + + Returns: + RunEvals instance + + Example: + RunEvals.from_bundle(bundle).run() + """ + return cls(bundle) + @classmethod def from_source(cls, source: str | dict | EvalsFile) -> "RunEvals": """ @@ -514,6 +557,8 @@ def run(self) -> EvalSummary: serial_fn=self._serial_fn, fixtures=self._fixtures, ignore_duration=self._ignore_duration, + executor=self._executor, + fallback_executor=self._fallback_executor, ) def ignore_duration(self) -> "RunEvals": @@ -528,3 +573,14 @@ def ignore_duration(self) -> "RunEvals": """ self._ignore_duration = True return self + + def with_executor( + self, + executor: Executor | None = None, + *, + fallback_executor: Executor | None = None, + ) -> "RunEvals": + """Store executor preferences for downstream execution-aware flows.""" + self._executor = executor + self._fallback_executor = fallback_executor + return self diff --git a/src/vowel/schema.py b/src/vowel/schema.py new file mode 100644 index 0000000..15e04be --- /dev/null +++ b/src/vowel/schema.py @@ -0,0 +1,126 @@ +"""Versioned JSON Schema cache and YAML header helpers.""" + +from __future__ import annotations + +import hashlib +import importlib.metadata +import json +import re +from copy import deepcopy +from pathlib import Path +from typing import Any + +from .utils import EvalsBundle + +SCHEMA_CACHE_DIR = Path.home() / ".vowel" + + +def _schema_version_token(version: str | None = None) -> str: + if version is None: + try: + version = importlib.metadata.version("vowel") + except importlib.metadata.PackageNotFoundError: + version = "0.0.0" + + ver = version + nums = re.findall(r"\d+", ver) + if not nums: + return "000" + return "".join(nums) + + +def build_yaml_schema_from_bundle() -> dict[str, Any]: + """Build YAML-file schema directly from runtime models. + + No repository reference file is used. 
The root shape is forced to match + vowel's YAML file format: + - top-level optional `fixtures` + - top-level optional `serializers` + - top-level additionalProperties => per-function `Evals` + """ + bundle_schema = EvalsBundle.model_json_schema(ref_template="#/$defs/{model}") + defs = bundle_schema.get("$defs", {}) + properties = bundle_schema.get("properties", {}) + fixtures_schema = properties.get( + "fixtures", + { + "type": "object", + "title": "Fixtures", + }, + ) + serializers_schema = properties.get( + "serializers", + { + "type": "object", + "title": "Serializers", + }, + ) + + additional_properties: dict[str, Any] + if "Evals" in defs: + # Top-level YAML uses function name as key, so `id` should not be + # required in each map value even though runtime Evals model has it. + evals_map_value = deepcopy(defs["Evals"]) + required = evals_map_value.get("required") + if isinstance(required, list): + evals_map_value["required"] = [k for k in required if k != "id"] + evals_map_value["title"] = "Function" + evals_map_value["description"] = ( + "Function evaluation specification keyed by function import path/name. " + "Contains fixture dependencies, global evaluators (`evals`), and dataset cases." 
+ ) + defs["EvalsMapValue"] = evals_map_value + additional_properties = {"$ref": "#/$defs/EvalsMapValue"} + else: + evals_schema = properties.get("evals", {"type": "object"}) + additional_properties = evals_schema.get("additionalProperties", {"type": "object"}) + + schema: dict[str, Any] = { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "fixtures": fixtures_schema, + "serializers": serializers_schema, + }, + "additionalProperties": additional_properties, + "$defs": defs, + } + + return schema + + +def ensure_cached_schema(version: str | None = None) -> Path: + """Ensure the versioned schema file exists and is up to date.""" + schema_data = build_yaml_schema_from_bundle() + rendered = json.dumps(schema_data, indent=2, ensure_ascii=False) + "\n" + + token = _schema_version_token(version) + digest = hashlib.sha1(rendered.encode("utf-8")).hexdigest()[:8] + schema_path = SCHEMA_CACHE_DIR / f"vowel-schema_{token}_{digest}.json" + schema_path.parent.mkdir(parents=True, exist_ok=True) + + if not schema_path.exists() or schema_path.read_text(encoding="utf-8") != rendered: + schema_path.write_text(rendered, encoding="utf-8") + + return schema_path + + +def add_schema_header(yaml_spec: str, schema_path: Path | str) -> str: + """Prepend YAML language-server schema reference header to YAML content.""" + schema_str = str(schema_path) + header = f"# yaml-language-server: $schema={schema_str}" + + lines = yaml_spec.splitlines() + if lines and lines[0].startswith("# yaml-language-server: $schema="): + lines = lines[1:] + if lines and lines[0] == "": + lines = lines[1:] + + body = "\n".join(lines).rstrip("\n") + return f"{header}\n\n{body}\n" + + +def materialize_yaml_with_schema_header(yaml_spec: str, version: str | None = None) -> str: + """Create/refresh versioned schema cache and return header-prefixed YAML.""" + schema_path = ensure_cached_schema(version) + return add_schema_header(yaml_spec, schema_path) diff --git 
a/src/vowel/tdd.py b/src/vowel/tdd.py index e123a2c..5a69bdf 100644 --- a/src/vowel/tdd.py +++ b/src/vowel/tdd.py @@ -1,25 +1,6 @@ -"""TDD-based eval generation: Intent -> Signature -> Evals -> Implementation. - -This module provides a true TDD approach where: -1. LLM generates function signature from description (intent) -2. LLM generates eval spec from signature (tests first) -3. LLM generates implementation that passes the evals (code last) - -Example: - from vowel.tdd import TDDGenerator - - generator = TDDGenerator(model="openai:gpt-4o") - - result = generator.generate_all( - description="Binary search for target in sorted list. Returns index or -1.", - name="binary_search" - ) - - print(result.signature.to_signature_str()) - print(result.yaml_spec) - print(result.func.code) -""" +"""TDD pipeline for generating signatures, evals, and implementations.""" +import inspect import os import re import time @@ -31,13 +12,19 @@ import yaml from pydantic import BaseModel, Field from pydantic_ai import Agent +from pydantic_ai.models import Model from vowel.context import EVAL_SPEC_CONTEXT from vowel.eval_types import EvalsSource +from vowel.executor import Executor, resolve_executors from vowel.monitoring import enable_monitoring from vowel.runner import Function, RunEvals from vowel.utils import EvalSummary -from vowel.validation import validate_and_fix_spec +from vowel.validation import ( + build_failure_context, + validate_and_fix_spec, + validate_expected_values, +) # Configure logfire for tracing dotenv.load_dotenv() @@ -218,9 +205,12 @@ class TDDGenerator: def __init__( self, - model: str | None = None, + model: str | Model | None = None, additional_context: str | list[str] | None = None, load_env: bool = False, + executor: Executor | None = None, + fallback_executor: Executor | None = None, + **opts, ): if load_env: import dotenv @@ -245,6 +235,12 @@ def __init__( self._impl_agent: Any = None self._signature_agent: Any = None + # Optional executor for 
expected-value validation + self._executor = executor + self._fallback_executor = fallback_executor + + self._opts = opts + logfire.info("TDDGenerator initialized", model=self.model) @property @@ -264,6 +260,7 @@ def signature_agent(self) -> Agent[None, FunctionSignature]: - Specify return type accurately - Write a clear, complete description """, + **self._opts, ) return cast(Agent[None, FunctionSignature], self._signature_agent) @@ -463,6 +460,7 @@ def eval_agent(self) -> Agent[None, EvalsSource]: For complex validations, use case-specific assertions instead. """, + **self._opts, ) return cast(Agent[None, EvalsSource], self._eval_agent) @@ -631,6 +629,7 @@ def my_func(data: list, target: int) -> int: - [ ] For path parsing: handle both `key` and `[index]` formats - [ ] For nested access: check type at EACH level before accessing """, + **self._opts, ) return cast(Agent[None, Function], self._impl_agent) @@ -737,34 +736,79 @@ def generate_evals_from_signature( IMPORTANT: In assertions, use `input[0]`, `input[1]` to access positional args. 
{extra_context} {f"{additional_context}" if additional_context else ""}""" - result = self.eval_agent.run_sync(prompt) - yaml_spec = result.output.yaml_spec # type: ignore[attr-defined] - - # Sanitize: strip YAML tags that safe_load rejects - yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec) - yaml_spec = re.sub(r"!!binary\b", "", yaml_spec) - - # Validate YAML syntax - yaml.safe_load(yaml_spec) - - # Static validation: fix common LLM generation mistakes - validation = validate_and_fix_spec(yaml_spec) - if validation.has_warnings: - logfire.info("Spec validation results", summary=validation.summary()) - if validation.was_modified: - yaml_spec = validation.fixed_yaml - - runner = RunEvals.from_source(yaml_spec) - logfire.info( - "Evals generated", cases=len(yaml_spec.split("- case:")), attempt=attempt + 1 - ) + try: + result = self.eval_agent.run_sync(prompt) + yaml_spec = result.output.yaml_spec # type: ignore[attr-defined] + + # Sanitize: strip YAML tags that safe_load rejects + yaml_spec = re.sub(r"!!python/[\w.:]+", "", yaml_spec) + yaml_spec = re.sub(r"!!binary\b", "", yaml_spec) + + # Validate YAML syntax + yaml.safe_load(yaml_spec) + + # Static validation: fix common LLM generation mistakes + validation = validate_and_fix_spec(yaml_spec) + if validation.has_warnings: + logfire.info("Spec validation results", summary=validation.summary()) + if validation.was_modified: + yaml_spec = validation.fixed_yaml + + # Executor-based validation: fix expected values by executing + # each case through the sandbox and correcting mismatches. 
+ if func is not None: + # Resolve source code for validation + if isinstance(func, Function): + real_code = func.code + elif callable(func): + try: + real_code = inspect.getsource(func) + except OSError: + real_code = None + else: + real_code = None + + if real_code is not None: + val_func = Function( + name=signature.name, + code=real_code, + description=signature.description, + ) + executor = resolve_executors( + getattr(self, "_executor", None), + getattr(self, "_fallback_executor", None), + ) + yaml_spec = validate_expected_values( + yaml_spec, + val_func, + executor, + ) + + runner = RunEvals.from_source(yaml_spec) + logfire.info( + "Evals generated", + cases=len(yaml_spec.split("- case:")), + attempt=attempt + 1, + ) + + except Exception as gen_exc: + logfire.warn( + "Eval spec generation failed on attempt {attempt}, retrying", + attempt=attempt + 1, + error=str(gen_exc), + ) + last_failure_context = f"Generation error: {gen_exc}" + if attempt < max_retries: + time.sleep(retry_delay) + continue # If no func provided, return without validation if func is None: return runner, yaml_spec # Run spec against the provided function - test_runner = runner.with_functions({signature.name: func}) + func_callable = func.impl if isinstance(func, Function) else func + test_runner = runner.with_functions({signature.name: func_callable}) if ignore_duration: test_runner = test_runner.ignore_duration() summary = test_runner.run() @@ -778,7 +822,7 @@ def generate_evals_from_signature( return runner, yaml_spec # Build failure context for next attempt - last_failure_context = self._build_eval_failure_context(summary) + last_failure_context = build_failure_context(summary) logfire.warn( "Eval spec below coverage, retrying", coverage=f"{summary.coverage * 100:.0f}%", @@ -789,31 +833,24 @@ def generate_evals_from_signature( if attempt < max_retries: time.sleep(retry_delay) - # Exhausted retries — return last generated spec - # (summary/runner/yaml_spec are always set when func is 
not None and loop ran at least once) - assert summary is not None and runner is not None # noqa: S101 - logfire.warn( - "Eval generation exhausted retries", - final_coverage=f"{summary.coverage * 100:.0f}%", - target=f"{min_coverage * 100:.0f}%", - ) - return runner, yaml_spec + # Exhausted retries — return last generated spec if we have one + if runner is not None and summary is not None: + logfire.warn( + "Eval generation exhausted retries", + final_coverage=f"{summary.coverage * 100:.0f}%", + target=f"{min_coverage * 100:.0f}%", + ) + return runner, yaml_spec - def _build_eval_failure_context(self, summary: EvalSummary) -> str: - """Build a concise failure report to inject into the retry prompt.""" - lines: list[str] = [] - for result in summary.results: - if result.report: - for case in result.report.cases: - failed_assertions = {k: v for k, v in case.assertions.items() if not v.value} - if failed_assertions: - reasons = ", ".join( - f"{k}: {v.reason}" for k, v in failed_assertions.items() if v.reason - ) - lines.append(f"- Case '{case.name}' FAILED [{reasons}]") - if result.error: - lines.append(f"- Error: {result.error}") - return "\n".join(lines) if lines else "Unknown failures" + # All attempts failed with generation errors — return whatever we have + if yaml_spec: + runner = RunEvals.from_source(yaml_spec) + return runner, yaml_spec + + raise RuntimeError( + f"Failed to generate valid eval spec for '{signature.name}' " + f"after {max_retries + 1} attempts" + ) def generate_implementation( self, @@ -875,6 +912,15 @@ def generate_all( ) -> TDDResult: """Run complete TDD flow: Signature -> Evals -> Implementation. + 1. Generate function signature from description + 2. Generate eval spec from signature (tests first) + 3. Generate implementation that passes the evals (code last) + 4. 
Run evals & retry implementation on failure + + When ``executor`` is set (at init), generated expected values are + validated against actual execution and auto-corrected before the + coverage check. + Args: description: What the function should do name: Function name @@ -898,7 +944,7 @@ def generate_all( for flow_attempt in range(max_flow_retries + 1): with logfire.span("TDD generation flow", name=name, flow_attempt=flow_attempt + 1): - # Step 2: Generate evals + # Step 2: Generate evals from signature logfire.info("Step 2: Generating evals", flow_attempt=flow_attempt + 1) runner, yaml_spec = self.generate_evals_from_signature( signature, @@ -912,16 +958,27 @@ def generate_all( summary: EvalSummary | None = None for impl_attempt in range(max_impl_retries + 1): - func = self.generate_implementation( - signature, yaml_spec, additional_context, description - ) + try: + func = self.generate_implementation( + signature, yaml_spec, additional_context, description + ) + except RuntimeError as exc: + logfire.warn( + "Implementation failed to compile, retrying", + impl_attempt=impl_attempt + 1, + error=str(exc), + ) + if impl_attempt < max_impl_retries: + time.sleep(retry_delay) + continue + raise # If max_eval_retries > 0, re-validate evals against this impl if max_eval_retries > 0 and impl_attempt == 0: runner, yaml_spec = self.generate_evals_from_signature( signature, min_cases, - func=func.impl, + func=func, max_retries=max_eval_retries, min_coverage=min_coverage, retry_delay=retry_delay, diff --git a/src/vowel/utils.py b/src/vowel/utils.py index 1b8be82..ac54c1b 100644 --- a/src/vowel/utils.py +++ b/src/vowel/utils.py @@ -1,23 +1,4 @@ -"""Utility functions for the vowel evaluation framework. 
- -This module provides core utilities for: -- Loading and parsing YAML evaluation specifications -- Type compatibility checking for YAML serialization -- Function import and execution helpers -- Dataset creation and evaluation running -- Result aggregation and reporting - -Key classes: - EvalResult: Result of a single function evaluation - EvalSummary: Aggregated results from multiple evaluations - -Key functions: - run_evals: Main entry point for running evaluations - load_evals: Load evaluations from various sources - to_dataset: Convert Evals to pydantic-evals Dataset - is_yaml_serializable_type: Check if a type can be serialized to YAML - check_compatibility: Validate function parameters for YAML compatibility -""" +"""Shared utilities for loading specs, building datasets, and running evals.""" import asyncio import builtins @@ -26,18 +7,19 @@ import importlib import importlib.util import inspect -import logging import os import sys +import threading import types from collections.abc import Callable, Mapping, Sequence from datetime import date, datetime, time, timedelta from decimal import Decimal -from functools import wraps +from functools import lru_cache, wraps from pathlib import Path, PurePath from typing import Any, Literal, Optional, Union, get_args, get_origin import click +import logfire import yaml from pydantic import BaseModel, ConfigDict, Field from pydantic_ai import format_as_xml @@ -46,7 +28,7 @@ from pydantic_evals.reporting import EvaluationReport from .errors import FixturePathError, SignatureError -from .eval_types import Evals, EvalsFile, FixtureDefinition +from .eval_types import Evals, EvalsFile, FixtureDefinition, SerializerSpec from .evals import ( AssertionEvaluator, ContainsInputEvaluator, @@ -55,11 +37,9 @@ TypeAdapterEvaluator, create_llm_judge, ) +from .executor import Executor -logger = logging.getLogger(__name__) - -_SYS_PATH_MODIFIED = False - +PROJECT_ROOT = Path(__file__).resolve().parents[2] # 
============================================================================= # Evals Bundle - Container for evals and fixtures @@ -71,8 +51,51 @@ class EvalsBundle(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - evals: dict[str, Evals] = Field(default_factory=dict) + evals: dict[str, Evals] = Field(min_length=1) fixtures: dict[str, FixtureDefinition] = Field(default_factory=dict) + serializers: dict[str, SerializerSpec] = Field(default_factory=dict) + + def to_yaml(self) -> str: + """Serialize bundle to current vowel YAML spec format.""" + data: dict[str, Any] = {} + + for func_id, evals in self.evals.items(): + evals_dict = evals.model_dump( + mode="python", + exclude_none=True, + exclude_defaults=True, + ) + # Function id is represented by the top-level YAML key. + evals_dict.pop("id", None) + data[func_id] = evals_dict + + if self.fixtures: + data["fixtures"] = { + name: definition.model_dump( + mode="python", + exclude_none=True, + exclude_defaults=True, + ) + for name, definition in self.fixtures.items() + } + + if self.serializers: + data["serializers"] = { + name: serializer.model_dump( + mode="python", + by_alias=True, + exclude_none=True, + exclude_defaults=True, + ) + for name, serializer in self.serializers.items() + } + + return yaml.safe_dump( + data, + sort_keys=False, + allow_unicode=True, + default_flow_style=False, + ) # ============================================================================= @@ -121,6 +144,24 @@ class EvalsBundle(BaseModel): } +def _resolve_env_ref( + value: str, *, field_name: str, scope: Literal["judge", "model"] | str = "judge" +) -> str: + """Resolve $ENV_VAR references used in YAML evaluator settings.""" + value = value.strip() + if not value.startswith("$"): + return value + + env_var = value.lstrip("$") + resolved = os.getenv(env_var) + if not resolved: + raise ValueError( + f"Environment variable {env_var} is not set for {scope} {field_name}, " + f"set {env_var} to a valid value." 
+ ) + return resolved + + def is_yaml_serializable_type(type_hint: Any) -> bool: """ Check if a type hint represents a YAML-serializable type. @@ -336,14 +377,27 @@ def check_compatibility(func: Callable) -> tuple[bool, list[str]]: return False, issues -def _ensure_cwd_in_path() -> None: - """Ensure current working directory is in sys.path (run once).""" - global _SYS_PATH_MODIFIED - if not _SYS_PATH_MODIFIED: - cwd = os.getcwd() - if cwd not in sys.path: - sys.path.insert(0, cwd) - _SYS_PATH_MODIFIED = True +@contextlib.contextmanager +def _cwd_on_syspath() -> Any: + """Temporarily prepend cwd and project root to ``sys.path``.""" + cwd = os.getcwd() + candidates = [cwd, str(PROJECT_ROOT)] + inserted: list[str] = [] + for candidate in candidates: + if candidate not in sys.path: + sys.path.insert(0, candidate) + inserted.append(candidate) + try: + yield + finally: + for candidate in inserted: + with contextlib.suppress(ValueError): + sys.path.remove(candidate) + + +def _is_yaml_source_string(source_str: str) -> bool: + """Best-effort heuristic for distinguishing inline YAML from file paths.""" + return "\n" in source_str or source_str.strip().startswith("{") or ":" in source_str def _apply_serializer( @@ -594,9 +648,9 @@ def __init__( ): self.definitions = fixtures self._fixture_funcs = fixture_funcs or {} - self._instances: dict[str, Any] = {} # Cached fixture instances (all scopes) - self._scope_counts: dict[str, int] = {} # Reference counts for scoped fixtures + self._instances: dict[str, Any] = {} # Cached fixture instances self._generators: dict[str, Any] = {} # Active generator fixtures for cleanup + self._lock = threading.RLock() def setup(self, fixture_name: str) -> Any: """ @@ -615,19 +669,19 @@ def setup(self, fixture_name: str) -> Any: f"Available fixtures: {available if available else '(none defined)'}" ) - defn = self.definitions[fixture_name] + with self._lock: + defn = self.definitions[fixture_name] - # For module/session scope, return cached instance 
if exists - if defn.scope in ("module", "session") and fixture_name in self._instances: - self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1 - return self._instances[fixture_name] + # For module/session scope, return cached instance if exists. + if defn.scope in ("module", "session") and fixture_name in self._instances: + return self._instances[fixture_name] - # Class-based fixture - if defn.cls: - return self._setup_class_fixture(fixture_name, defn) + # Class-based fixture + if defn.cls: + return self._setup_class_fixture(fixture_name, defn) - # Function-based fixture - return self._setup_function_fixture(fixture_name, defn) + # Function-based fixture + return self._setup_function_fixture(fixture_name, defn) def _setup_class_fixture(self, fixture_name: str, defn: FixtureDefinition) -> Any: """Setup a class-based fixture by instantiating the class.""" @@ -646,9 +700,7 @@ def _setup_class_fixture(self, fixture_name: str, defn: FixtureDefinition) -> An except Exception as e: raise RuntimeError(f"Failed to instantiate {defn.cls}: {e}") from e - # Cache instance self._instances[fixture_name] = instance - self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1 return instance @@ -684,9 +736,7 @@ def _setup_function_fixture(self, fixture_name: str, defn: FixtureDefinition) -> except Exception as e: raise RuntimeError(f"Failed to setup fixture '{fixture_name}': {e}") from e - # Cache instance (all scopes - function scope will be cleared on teardown) self._instances[fixture_name] = instance - self._scope_counts[fixture_name] = self._scope_counts.get(fixture_name, 0) + 1 return instance @@ -701,22 +751,17 @@ def teardown(self, fixture_name: str, scope_trigger: str = "function") -> None: if fixture_name not in self.definitions: return - defn = self.definitions[fixture_name] + with self._lock: + defn = self.definitions[fixture_name] - # Only teardown if scope matches - if defn.scope != scope_trigger: - return + # Only 
teardown if scope matches + if defn.scope != scope_trigger: + return - # Decrement reference count - if fixture_name in self._scope_counts: - self._scope_counts[fixture_name] -= 1 - # For module/session scope, only teardown when count reaches 0 - if defn.scope in ("module", "session") and self._scope_counts[fixture_name] > 0: - return # Still in use + instance = self._instances.pop(fixture_name, None) + if instance is None: + return - # Perform teardown - instance = self._instances.pop(fixture_name, None) - if instance is not None: # Check if this is a generator fixture (pytest-style yield) gen = self._generators.pop(fixture_name, None) if gen is not None: @@ -731,7 +776,7 @@ def teardown(self, fixture_name: str, scope_trigger: str = "function") -> None: _, teardown_func = self._fixture_funcs[fixture_name] elif defn.teardown: # Check if teardown is a class method (e.g., 'Connection.close') - if "." in defn.teardown and defn.cls: + if "." in defn.teardown and defn.cls and instance is not None: parts = defn.teardown.split(".") if len(parts) == 2: class_name, method_name = parts @@ -801,7 +846,6 @@ def import_function(module_path: str) -> Any: ImportError: If the module cannot be imported AttributeError: If the function is not found in the module """ - _ensure_cwd_in_path() tried_combinations = [] if "." 
not in module_path: @@ -815,39 +859,54 @@ def import_function(module_path: str) -> Any: parts = module_path.split(".") - for i in range(len(parts) - 1, 0, -1): - module_name = ".".join(parts[:i]) - remaining_parts = parts[i:] - tried_combinations.append(f"module='{module_name}', attr='{'.'.join(remaining_parts)}'") + with _cwd_on_syspath(): + for i in range(len(parts) - 1, 0, -1): + module_name = ".".join(parts[:i]) + remaining_parts = parts[i:] + tried_combinations.append(f"module='{module_name}', attr='{'.'.join(remaining_parts)}'") - module = None + module = None - try: - module = importlib.import_module(module_name) - except ImportError as e: - logger.debug(f"Standard import failed for '{module_name}': {e}") - relative_path = module_name.replace(".", os.sep) + ".py" - file_path = os.path.join(os.getcwd(), relative_path) + try: + module = importlib.import_module(module_name) + except ImportError as e: + logfire.debug( + "Standard import failed for '{module_name}': {error}", + module_name=module_name, + error=str(e), + ) + relative_path = module_name.replace(".", os.sep) + ".py" + candidate_roots = [os.getcwd(), str(PROJECT_ROOT)] + + for root in candidate_roots: + file_path = os.path.join(root, relative_path) + if not os.path.exists(file_path): + continue + try: + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec and spec.loader: + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + logfire.debug( + "File-based import succeeded for '{file_path}'", file_path=file_path + ) + break + except Exception as e: + logfire.debug( + "File-based import failed for '{file_path}': {error}", + file_path=file_path, + error=str(e), + ) - if os.path.exists(file_path): + if module: try: - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec and spec.loader: - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - logger.debug(f"File-based import succeeded for 
'{file_path}'") - except Exception as e: - logger.debug(f"File-based import failed for '{file_path}': {e}") - - if module: - try: - obj: Any = module - for part in remaining_parts: - obj = getattr(obj, part) - return obj - except AttributeError as e: - logger.debug(f"Attribute lookup failed: {e}") - continue + obj: Any = module + for part in remaining_parts: + obj = getattr(obj, part) + return obj + except AttributeError as e: + logfire.debug("Attribute lookup failed: {error}", error=str(e)) + continue try: obj = getattr(builtins, parts[0]) @@ -863,6 +922,46 @@ def import_function(module_path: str) -> Any: ) +@lru_cache(maxsize=512) +def _import_path_cached(path: str) -> Any: + """Import and cache objects referenced by import paths.""" + return import_function(path) + + +def _resolve_yaml_serializer_entry( + serializers: Mapping[str, SerializerSpec], + serializer_name: str, +) -> tuple[type | Callable | dict[str, type | Callable] | None, Callable[[dict], Any] | None]: + """Resolve a serializer registry entry into schema or serial_fn mapping values.""" + if serializer_name not in serializers: + available = sorted(serializers.keys()) + raise ValueError( + f"Unknown serializer '{serializer_name}'. Available serializers: {available}" + ) + + spec = serializers[serializer_name] + + if spec.serializer is not None: + loaded = _import_path_cached(spec.serializer) + if not callable(loaded): + raise TypeError(f"Serializer '{spec.serializer}' must resolve to a callable") + return None, loaded + + schema = spec.serializer_schema + if isinstance(schema, str): + return _import_path_cached(schema), None + + if isinstance(schema, dict): + resolved: dict[str, type | Callable] = {} + for key, path in schema.items(): + resolved[key] = _import_path_cached(path) + return resolved, None + + raise ValueError( + f"Serializer '{serializer_name}' must define one of: schema (str|dict) or serializer" + ) + + def import_class(class_path: str) -> type: """ Import a class from a module path. 
@@ -877,8 +976,6 @@ def import_class(class_path: str) -> type: ImportError: If the module cannot be imported AttributeError: If the class is not found in the module """ - _ensure_cwd_in_path() - parts = class_path.split(".") if len(parts) < 2 or any(not p for p in parts): raise ImportError(f"Invalid class path '{class_path}'. Expected format 'module.ClassName'.") @@ -887,7 +984,8 @@ def import_class(class_path: str) -> type: class_name = parts[-1] try: - module = importlib.import_module(module_name) + with _cwd_on_syspath(): + module = importlib.import_module(module_name) except ImportError as e: raise ImportError(f"Cannot import module '{module_name}': {e}") from e @@ -902,50 +1000,6 @@ def import_class(class_path: str) -> type: return cls -def load_evals_file(yaml_path: str) -> dict[str, Evals]: - with open(yaml_path) as f: - loaded = yaml.safe_load(f) - - evals_file = EvalsFile.model_validate(loaded) - return evals_file.get_evals() - - -def load_evals_from_yaml_string(yaml_content: str) -> dict[str, Evals]: - loaded = yaml.safe_load(yaml_content) - evals_file = EvalsFile.model_validate(loaded) - return evals_file.get_evals() - - -def load_evals_from_dict(data: dict) -> dict[str, Evals]: - evals_file = EvalsFile.model_validate(data) - return evals_file.get_evals() - - -def load_evals_from_object(evals_obj: EvalsFile) -> dict[str, Evals]: - return evals_obj.get_evals() - - -def load_evals(source: str | Path | dict | EvalsFile) -> dict[str, Evals]: - if isinstance(source, EvalsFile): - return load_evals_from_object(source) - elif isinstance(source, dict): - return load_evals_from_dict(source) - elif isinstance(source, (str, Path)): - source_str = str(source) - # Check if it's an existing file path first, before YAML heuristics - if os.path.exists(source_str): - return load_evals_file(source_str) - if "\n" in source_str or source_str.strip().startswith("{") or ":" in source_str: - return load_evals_from_yaml_string(source_str) - else: - return 
load_evals_file(source_str) - else: - raise TypeError( - f"source must be a file path (str/Path), YAML string (str), dict, " - f"or EvalsFile object, got {type(source)}" - ) - - # ============================================================================= # Bundle Loading Functions (with fixtures) # ============================================================================= @@ -957,26 +1011,42 @@ def load_bundle_file(yaml_path: str) -> EvalsBundle: loaded = yaml.safe_load(f) evals_file = EvalsFile.model_validate(loaded) - return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures) + return EvalsBundle( + evals=evals_file.get_evals(), + fixtures=evals_file.fixtures, + serializers=evals_file.serializers, + ) def load_bundle_from_yaml_string(yaml_content: str) -> EvalsBundle: """Load evals and fixtures from a YAML string.""" loaded = yaml.safe_load(yaml_content) evals_file = EvalsFile.model_validate(loaded) - return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures) + return EvalsBundle( + evals=evals_file.get_evals(), + fixtures=evals_file.fixtures, + serializers=evals_file.serializers, + ) def load_bundle_from_dict(data: dict) -> EvalsBundle: """Load evals and fixtures from a dictionary.""" evals_file = EvalsFile.model_validate(data) - return EvalsBundle(evals=evals_file.get_evals(), fixtures=evals_file.fixtures) + return EvalsBundle( + evals=evals_file.get_evals(), + fixtures=evals_file.fixtures, + serializers=evals_file.serializers, + ) def load_bundle_from_object(evals_obj: EvalsFile) -> EvalsBundle: """Load evals and fixtures from an EvalsFile object.""" assert isinstance(evals_obj, EvalsFile) - return EvalsBundle(evals=evals_obj.get_evals(), fixtures=evals_obj.fixtures) + return EvalsBundle( + evals=evals_obj.get_evals(), + fixtures=evals_obj.fixtures, + serializers=evals_obj.serializers, + ) def load_bundle(source: str | Path | dict | EvalsFile) -> EvalsBundle: @@ -995,7 +1065,9 @@ def load_bundle(source: str | Path 
| dict | EvalsFile) -> EvalsBundle: return load_bundle_from_dict(source) elif isinstance(source, (str, Path)): source_str = str(source) - if "\n" in source_str or source_str.strip().startswith("{") or ":" in source_str: + if os.path.exists(source_str): + return load_bundle_file(source_str) + if _is_yaml_source_string(source_str): return load_bundle_from_yaml_string(source_str) else: return load_bundle_file(source_str) @@ -1123,11 +1195,11 @@ def to_dataset( display_input = f"inputs: {match_case.inputs}" input_value = {"inputs": match_case.inputs} else: - display_input = f"input: {match_case.input}" + display_input = f"input: {str(match_case.input)[:300]}" input_value = {"input": match_case.input} if any(case for case in dataset_cases if input_value == case.inputs): - logger.warning("Already exists in dataset, skipping duplicate case.") + logfire.warn("Already exists in dataset, skipping duplicate case.") continue dataset_cases.append( @@ -1248,6 +1320,42 @@ def _merge_programmatic_fixtures( return merged_fixtures, fixture_funcs +def _resolve_eval_id_mapping( + mapping: Mapping[str, Any] | None, + eval_id: str, + *, + mapping_name: str, +) -> Any | None: + """Resolve mapping entries by exact id first, then by short function name. + + Supports using programmatic keys like ``{"func": fn}`` for specs that use + ``module.func`` eval ids. + """ + if not mapping: + return None + + if eval_id in mapping: + return mapping[eval_id] + + short_name = eval_id.rsplit(".", 1)[-1] + if short_name in mapping: + return mapping[short_name] + + # Reverse direction: when eval id is bare and mapping uses module.function. + if "." not in eval_id: + matches = [value for key, value in mapping.items() if key.rsplit(".", 1)[-1] == eval_id] + if len(matches) == 1: + return matches[0] + if len(matches) > 1: + candidates = sorted({key for key in mapping if key.rsplit(".", 1)[-1] == eval_id}) + raise ValueError( + f"Ambiguous {mapping_name} mapping for '{eval_id}'. " + f"Provide an exact key. 
Candidates: {candidates}" + ) + + return None + + def _import_and_detect_class_method( eval_id: str, functions: dict[str, Callable] | None, @@ -1261,8 +1369,9 @@ def _import_and_detect_class_method( - class_path: Full module.ClassName path for class methods, None otherwise - class_name: Class name for class methods, None otherwise """ - if functions and eval_id in functions: - func = functions[eval_id] + resolved_func = _resolve_eval_id_mapping(functions, eval_id, mapping_name="function") + if resolved_func is not None: + func = resolved_func # Check if bound method (exclude builtin functions where __self__ is the module) self_obj = getattr(func, "__self__", None) if self_obj is not None and not isinstance(self_obj, types.ModuleType): @@ -1514,8 +1623,21 @@ def _evaluate_single_function( ) # Get serializers for this function if defined - func_schema = schema.get(eval_id) - func_serial_fn = serial_fn.get(eval_id) + func_schema = _resolve_eval_id_mapping(schema, eval_id, mapping_name="serializer schema") + func_serial_fn = _resolve_eval_id_mapping( + serial_fn, eval_id, mapping_name="serializer function" + ) + + for evaluator in dataset.evaluators: + if isinstance(evaluator, AssertionEvaluator): + evaluator.serializer = func_schema + evaluator.serializer_fn = func_serial_fn + + for case in dataset.cases: + for evaluator in case.evaluators: + if isinstance(evaluator, AssertionEvaluator): + evaluator.serializer = func_schema + evaluator.serializer_fn = func_serial_fn # Setup module-scoped fixtures for this eval module_fixtures = {} @@ -1975,7 +2097,7 @@ def xml(self) -> str: def run_evals( - source: str | Path | dict | EvalsFile, + source: str | Path | dict | EvalsFile | EvalsBundle, *, filter_funcs: list[str] | None = None, functions: dict[str, Callable] | None = None, @@ -1986,6 +2108,8 @@ def run_evals( dict[str, Callable | tuple[Callable, Callable | None] | FixtureDefinition] | None ) = None, ignore_duration: bool = False, + executor: Executor | None = None, + 
fallback_executor: Executor | None = None, ) -> EvalSummary: """ Run evaluations from various sources. @@ -1999,14 +2123,18 @@ def run_evals( serial_fn: Optional dict of serializer functions (receive full input dict) fixtures: Optional dict of fixture functions {name: setup_fn} or {name: (setup_fn, teardown_fn)} ignore_duration: If True, skip duration constraints + executor: Optional primary executor configuration for execution-aware subflows + fallback_executor: Optional fallback executor paired with ``executor`` Returns: EvalSummary with aggregated results """ # Load both evals and fixtures from YAML - bundle = load_bundle(source) + _ = (executor, fallback_executor) + bundle = source if isinstance(source, EvalsBundle) else load_bundle(source) all_evals = bundle.evals yaml_fixtures = bundle.fixtures + yaml_serializers = bundle.serializers # Merge programmatic fixtures with YAML fixtures merged_fixtures, fixture_funcs = _merge_programmatic_fixtures(yaml_fixtures, fixtures) @@ -2018,13 +2146,37 @@ def run_evals( serial_fn = serial_fn or {} if filter_funcs: - filtered_evals = {k: v for k, v in all_evals.items() if k in filter_funcs} + resolved_filter_ids: list[str] = [] + + for raw_filter in filter_funcs: + if raw_filter in all_evals: + resolved_filter_ids.append(raw_filter) + continue + + short_name = raw_filter.rsplit(".", 1)[-1] + matches = [ + eval_id for eval_id in all_evals if eval_id.rsplit(".", 1)[-1] == short_name + ] + + if len(matches) == 1: + resolved_filter_ids.append(matches[0]) + elif len(matches) > 1: + candidates = sorted(matches) + raise ValueError( + f"Ambiguous filter '{raw_filter}'. Provide an exact eval id. " + f"Candidates: {candidates}" + ) + # Keep stable input order while removing duplicates. 
+ ordered_unique_filter_ids = list(dict.fromkeys(resolved_filter_ids)) + filtered_evals = {k: v for k, v in all_evals.items() if k in ordered_unique_filter_ids} + if not filtered_evals: available = list(all_evals.keys()) raise ValueError( f"No functions found matching filters: {', '.join(filter_funcs)}. " f"Available: {', '.join(available)}" ) + all_evals = filtered_evals # Create fixture manager if fixtures are defined @@ -2035,14 +2187,43 @@ def run_evals( try: for eval_id, evals in all_evals.items(): try: + effective_schema = dict(schema) + effective_serial_fn = dict(serial_fn) + + # YAML-native serializer registry (per-eval reference). + if evals.serializer: + yaml_schema, yaml_serial = _resolve_yaml_serializer_entry( + yaml_serializers, + evals.serializer, + ) + + # Programmatic mappings have precedence. + has_programmatic_schema = ( + _resolve_eval_id_mapping(schema, eval_id, mapping_name="serializer schema") + is not None + ) + has_programmatic_serial = ( + _resolve_eval_id_mapping( + serial_fn, + eval_id, + mapping_name="serializer function", + ) + is not None + ) + + if yaml_schema is not None and not has_programmatic_schema: + effective_schema[eval_id] = yaml_schema + if yaml_serial is not None and not has_programmatic_serial: + effective_serial_fn[eval_id] = yaml_serial + result = _evaluate_single_function( eval_id, evals, functions, merged_fixtures, fixture_manager, - schema, - serial_fn, + effective_schema, + effective_serial_fn, ignore_duration, ) results.append(result) diff --git a/src/vowel/validation.py b/src/vowel/validation.py index 73d942b..20636dd 100644 --- a/src/vowel/validation.py +++ b/src/vowel/validation.py @@ -1,24 +1,17 @@ -"""Static validator for LLM-generated eval specifications. - -Catches common LLM generation mistakes BEFORE the spec is used: -1. Extra fields in cases (comment, note, description, etc.) -2. YAML-unparseable type remnants (set literals, tuple strings, float('inf'), etc.) -3. 
Invented exception types not in function code -4. Removes or fixes problematic cases, returns clean YAML - -Usage: - from vowel.validation import validate_and_fix_spec - - fixed_yaml, warnings = validate_and_fix_spec(yaml_spec, function_code="def foo(x): ...") -""" +"""Validation and normalization helpers for generated eval specs.""" +import ast import re from dataclasses import dataclass, field -from typing import Literal +from typing import Any, Literal import logfire import yaml +from vowel.executor import Executor, resolve_executors +from vowel.runner import Function +from vowel.utils import EvalSummary + # Fields allowed in a case block (from MatchCase model) ALLOWED_CASE_FIELDS = frozenset( { @@ -388,7 +381,7 @@ def validate_and_fix_spec( modified = True if modified: - result.fixed_yaml = yaml.dump( + result.fixed_yaml = yaml.safe_dump( data, default_flow_style=False, allow_unicode=True, sort_keys=False ) logfire.info( @@ -399,3 +392,261 @@ def validate_and_fix_spec( ) return result + + +def build_failure_context(summary: EvalSummary) -> str: + """Build a concise failure report to inject into a retry prompt.""" + lines: list[str] = [] + for result in summary.results: + if result.report: + for case in result.report.cases: + failed_assertions = {k: v for k, v in case.assertions.items() if not v.value} + if failed_assertions: + parts = [] + for k, v in failed_assertions.items(): + if v.reason: + parts.append(f"{k}: {v.reason}") + else: + parts.append(f"{k}: FAILED") + lines.append(f"- Case '{case.name}' FAILED [{', '.join(parts)}]") + if result.error: + lines.append(f"- Error: {result.error}") + return "\n".join(lines) if lines else "Unknown failures" + + +def build_call_code( + func_name: str, case: dict +) -> ( + str | None +): # TODO: intead of building call code, consider passing arguments through executor inputs + """Build a ``func(args...)`` call string from a YAML case dict.""" + if "inputs" in case and case["inputs"] is not None: + args = case["inputs"] 
+ if isinstance(args, list): + arg_strs = ", ".join(repr(a) for a in args) + return f"{func_name}({arg_strs})" + if isinstance(args, dict): + kwarg_strs = ", ".join(f"{k}={v!r}" for k, v in args.items()) + return f"{func_name}({kwarg_strs})" + elif "input" in case and case["input"] is not None: + return f"{func_name}({case['input']!r})" + return None + + +def inject_durations( + yaml_spec: str, + func: Function, + executor: Executor, + *, + fallback_executor: Executor | None = None, + buffer_pct: float = 0.5, + floor_ms: float = 10.0, +) -> str: + """Add per-case ``duration`` fields based on actual execution times.""" + spec = yaml.safe_load(yaml_spec) + if not isinstance(spec, dict): + return yaml_spec + + executor = resolve_executors(executor, fallback_executor) + + try: + session = executor.create_session(func.code) + except Exception: + logfire.warn("Could not create session for duration injection") + return yaml_spec + + with session: + for eval_id, eval_def in spec.items(): + if not isinstance(eval_def, dict): + continue + for case_entry in eval_def.get("dataset", []): + case = case_entry.get("case", {}) + if not isinstance(case, dict): + continue + if case.get("raises"): + continue + + call_code = build_call_code(eval_id, case) + if call_code is None: + continue + + result = session.feed(call_code) + if result.success: + dur = max( + result.duration_ms * (1 + buffer_pct), + floor_ms, + ) + case["duration"] = round(dur, 1) + + return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False) + + +def validate_expected_values( + yaml_spec: str, + func: Function, + executor: Executor | None = None, + fallback_executor: Executor | None = None, +) -> str: + """Validate and fix expected values in a YAML spec by executing cases.""" + executor = resolve_executors(executor, fallback_executor) + + spec = yaml.safe_load(yaml_spec) + if not isinstance(spec, dict): + return yaml_spec + + try: + session = executor.create_session(func.code) + 
except Exception: + logfire.warn("Could not create session for expected value validation") + return yaml_spec + + fixes_applied = 0 + + with session: + for eval_id, eval_def in spec.items(): + if not isinstance(eval_def, dict): + continue + for case_entry in eval_def.get("dataset", []): + case = case_entry.get("case", {}) + if not isinstance(case, dict): + continue + + call_code = build_call_code(eval_id, case) + if call_code is None: + continue + + result = session.feed(call_code) + + if ( + "expected" in case + and not case.get("raises") + and result.success + and result.output != case["expected"] + ): + logfire.info( + "Fixing expected value for case: {expected} → {actual}", + expected=repr(case["expected"]), + actual=repr(result.output), + ) + case["expected"] = result.output + fixes_applied += 1 + + if case.get("raises"): + expected_exc = case["raises"] + if result.success: + logfire.info( + "Case expected {exc} but function returned {output}, fixing", + exc=expected_exc, + output=repr(result.output), + ) + del case["raises"] + if "match" in case: + del case["match"] + case["expected"] = result.output + fixes_applied += 1 + elif result.error_type and result.error_type != expected_exc: + logfire.info( + "Case expected {expected} but got {actual}, fixing", + expected=expected_exc, + actual=result.error_type, + ) + case["raises"] = result.error_type + fixes_applied += 1 + + if fixes_applied > 0: + logfire.info("Validated spec: {count} fixes applied", count=fixes_applied) + return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False) + + return yaml_spec + + +def inject_missing_error_cases( + yaml_spec: str, + func_name: str, + error_snippets: list[dict], +) -> str: + """Inject error cases from exploration into the spec if the LLM missed them.""" + if not error_snippets: + return yaml_spec + + spec = yaml.safe_load(yaml_spec) + if not isinstance(spec, dict) or func_name not in spec: + return yaml_spec + + eval_def = spec[func_name] + 
dataset = eval_def.setdefault("dataset", []) + + existing_raises_inputs: set[str] = set() + for entry in dataset: + case = entry.get("case", {}) + if isinstance(case, dict) and case.get("raises"): + inp = case.get("input") + inps = case.get("inputs") + existing_raises_inputs.add(repr((inp, inps))) + + injected = 0 + + for snippet in error_snippets: + code = snippet["code"].strip() + error_type = snippet["error_type"] + description = snippet.get("description", "") + + try: + tree = ast.parse(code, mode="eval") + except SyntaxError: + continue + + if not isinstance(tree.body, ast.Call): + continue + + try: + args = [ast.literal_eval(a) for a in tree.body.args] + kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in tree.body.keywords} + except (ValueError, TypeError): + continue + + if kwargs: + input_repr = repr((None, kwargs)) + if input_repr in existing_raises_inputs: + continue + case_dict: dict[str, Any] = { + "id": f"error_{error_type.lower()}_{injected}", + "inputs": kwargs, + "raises": error_type, + } + elif len(args) == 1: + if isinstance(args[0], tuple): + continue + input_repr = repr((args[0], None)) + if input_repr in existing_raises_inputs: + continue + case_dict = { + "id": f"error_{error_type.lower()}_{injected}", + "input": args[0], + "raises": error_type, + } + elif len(args) > 1: + input_repr = repr((None, args)) + if input_repr in existing_raises_inputs: + continue + case_dict = { + "id": f"error_{error_type.lower()}_{injected}", + "inputs": args, + "raises": error_type, + } + else: + continue + + dataset.append({"case": case_dict}) + injected += 1 + logfire.info( + "Injected error case: {desc} → raises {exc}", + desc=description, + exc=error_type, + ) + + if injected > 0: + logfire.info("Injected {count} missing error cases into spec", count=injected) + return yaml.safe_dump(spec, default_flow_style=False, allow_unicode=True, sort_keys=False) + + return yaml_spec diff --git a/tests/cassettes/llm_judge_custom_model.json 
b/tests/cassettes/llm_judge_custom_model.json index 412c044..4d6c526 100644 --- a/tests/cassettes/llm_judge_custom_model.json +++ b/tests/cassettes/llm_judge_custom_model.json @@ -4,7 +4,7 @@ "input_preview": "john doe", "result": { "passed": false, - "model": "openrouter:google/gemini-3-flash-preview" + "model": "openrouter:anthropic/claude-opus-4.6" } } } \ No newline at end of file diff --git a/tests/cassettes/test_generate_and_run.json b/tests/cassettes/test_generate_and_run.json index 4ace11b..35414fa 100644 --- a/tests/cassettes/test_generate_and_run.json +++ b/tests/cassettes/test_generate_and_run.json @@ -3,9 +3,9 @@ "prompt_preview": "generate_and_run", "model": "openrouter:google/gemini-3-flash-preview", "response": { - "yaml_spec": "double:\n evals:\n CorrectType:\n type: int\n DoubleLogic:\n assertion: output == input * 2\n NonNegativeIfInputNonNegative:\n assertion: input < 0 or output >= input\n dataset:\n - case:\n id: positive_integer\n input: 5\n expected: 10\n - case:\n id: zero\n input: 0\n expected: 0\n - case:\n id: negative_integer\n input: -4\n expected: -8\n - case:\n id: large_integer\n input: 1000000\n expected: 2000000\n - case:\n id: sequence_multiplication_check\n input: 1\n expected: 2\n - case:\n id: invalid_type_string\n input: '10'\n assertion: output == '1010'\n", + "yaml_spec": "double:\n evals:\n IsInteger:\n type: int\n CorrectCalculation:\n assertion: output == input * 2\n FastEnough:\n duration: 0.001\n dataset:\n - case:\n id: positive_integer\n input: 10\n expected: 20\n - case:\n id: negative_integer\n input: -5\n expected: -10\n - case:\n id: zero_input\n input: 0\n expected: 0\n - case:\n id: large_integer\n input: 1000000\n expected: 2000000\n - case:\n id: string_input_error\n input: '5'\n assertion: output == '55'\n type: str\n", "was_healed": false, - "coverage": 0.8333333333333334 + "coverage": 0.0 } } } \ No newline at end of file diff --git a/tests/cassettes/test_generate_factorial.json 
b/tests/cassettes/test_generate_factorial.json index 758b00c..094b8db 100644 --- a/tests/cassettes/test_generate_factorial.json +++ b/tests/cassettes/test_generate_factorial.json @@ -3,9 +3,9 @@ "prompt_preview": "generate_function", "model": "openrouter:google/gemini-3-flash-preview", "response": { - "name": "calculate_factorial", - "description": "Calculates the factorial of a non-negative integer n using an iterative approach. Includes input validation for non-integers and negative values.", - "code": "def calculate_factorial(n: int) -> int:\n \\\"\\\"\\\"\n Calculates the factorial of a non-negative integer n.\n \n Args:\n n (int): A non-negative integer.\n \n Returns:\n int: The factorial of n.\n \n Raises:\n ValueError: If n is negative.\n TypeError: If n is not an integer.\n \\\"\\\"\\\"\n if not isinstance(n, int):\n raise TypeError(\\\"Input must be an integer.\\\")\n if n < 0:\n raise ValueError(\\\"Input must be a non-negative integer.\\\")\n \n result = 1\n for i in range(2, n + 1):\n result *= i\n return result\n" + "name": "factorial", + "description": "Calculates the factorial of a non-negative integer using an iterative approach to avoid recursion depth issues.", + "code": "def factorial(n: int) -> int:\n \\\"\\\"\\\"\n Calculates the factorial of a non-negative integer n.\n \n Args:\n n: A non-negative integer.\n \n Returns:\n The factorial of n.\n \n Raises:\n ValueError: If n is negative.\n TypeError: If n is not an integer.\n \\\"\\\"\\\"\n if not isinstance(n, int):\n raise TypeError(\\\"Input must be an integer.\\\")\n if n < 0:\n raise ValueError(\\\"Factorial is not defined for negative numbers.\\\")\n \n result = 1\n for i in range(2, n + 1):\n result *= i\n return result\n" } } } \ No newline at end of file diff --git a/tests/cassettes/test_generate_palindrome.json b/tests/cassettes/test_generate_palindrome.json index 452654a..437686c 100644 --- a/tests/cassettes/test_generate_palindrome.json +++ 
b/tests/cassettes/test_generate_palindrome.json @@ -4,8 +4,8 @@ "model": "openrouter:google/gemini-3-flash-preview", "response": { "name": "is_palindrome", - "description": "Checks if a string is a palindrome while ignoring case and spaces. Only spaces are ignored, other punctuation is preserved.", - "code": "def is_palindrome(text: str) -> bool:\n \"\"\"\n Checks if a string is a palindrome, ignoring case and spaces.\n \n Args:\n text: The string to check.\n \n Returns:\n True if the string is a palindrome, False otherwise.\n \"\"\"\n if text is None:\n raise TypeError(\"Input must be a string\")\n \n # Remove spaces and convert to lowercase\n normalized = text.replace(\" \", \"\").lower()\n \n # Check if string matches its reverse\n return normalized == normalized[::-1]\n" + "description": "Checks if a string is a palindrome while ignoring case and whitespace.", + "code": "def is_palindrome(text: str) -> bool:\n \\\"\\\"\\\"\n Checks if a string is a palindrome, ignoring case and spaces.\n \n Args:\n text (str): The string to check.\n \n Returns:\n bool: True if it is a palindrome, False otherwise.\n \\\"\\\"\\\"\n if not isinstance(text, str):\n return False\n \n # Remove spaces and convert to lowercase\n cleaned = \\\"\\\".join(text.split()).lower()\n \n # Check if string matches its reverse\n return cleaned == cleaned[::-1]\n" } } } \ No newline at end of file diff --git a/tests/cassettes/test_generate_spec_simple.json b/tests/cassettes/test_generate_spec_simple.json index def07eb..6bbf8b9 100644 --- a/tests/cassettes/test_generate_spec_simple.json +++ b/tests/cassettes/test_generate_spec_simple.json @@ -3,7 +3,7 @@ "prompt_preview": "generate_spec", "model": "openrouter:google/gemini-3-flash-preview", "response": { - "yaml_spec": "add_numbers:\n evals:\n IsInt:\n type: int\n IdentityProperty:\n assertion: (input[0] == 0 and output == input[1]) or (input[1] == 0 and output\n == input[0]) or True\n CommutativeProperty:\n assertion: output == input[1] + 
input[0]\n dataset:\n - case:\n id: typical_positive\n inputs:\n - 10\n - 25\n expected: 35\n - case:\n id: negative_numbers\n inputs:\n - -5\n - -15\n expected: -20\n - case:\n id: mixed_signs\n inputs:\n - 100\n - -40\n expected: 60\n - case:\n id: zero_identity\n inputs:\n - 0\n - 42\n expected: 42\n - case:\n id: large_integers\n inputs:\n - 1000000\n - 2000000\n expected: 3000000\n - case:\n id: boundary_zero_sum\n inputs:\n - 50\n - -50\n expected: 0\n", + "yaml_spec": "add_numbers:\n evals:\n IsInteger:\n type: int\n CorrectSum:\n assertion: output == input[0] + input[1]\n FastExecution:\n duration: 0.001\n dataset:\n - case:\n id: positive_integers\n inputs:\n - 10\n - 20\n expected: 30\n - case:\n id: negative_integers\n inputs:\n - -5\n - -15\n expected: -20\n - case:\n id: mixed_signs\n inputs:\n - -10\n - 25\n expected: 15\n - case:\n id: zero_addition\n inputs:\n - 0\n - 100\n expected: 100\n - case:\n id: large_integers\n inputs:\n - 1000000\n - 2000000\n expected: 3000000\n - case:\n id: identity_property\n inputs:\n - 42\n - 0\n expected: 42\n", "func_name": "add_numbers" } } diff --git a/tests/cassettes/test_generate_spec_string.json b/tests/cassettes/test_generate_spec_string.json index 2968bb9..0a365b1 100644 --- a/tests/cassettes/test_generate_spec_string.json +++ b/tests/cassettes/test_generate_spec_string.json @@ -3,7 +3,7 @@ "prompt_preview": "generate_spec", "model": "openrouter:google/gemini-3-flash-preview", "response": { - "yaml_spec": "reverse_string:\n evals:\n IsString:\n type: str\n ReverseProperty:\n assertion: output[::-1] == input\n LengthInvariant:\n assertion: len(output) == len(input)\n dataset:\n - case:\n id: typical_word\n input: hello\n expected: olleh\n - case:\n id: empty_string\n input: ''\n expected: ''\n - case:\n id: single_character\n input: z\n expected: z\n - case:\n id: palindrome\n input: racecar\n expected: racecar\n - case:\n id: strings_with_spaces\n input: abc def\n expected: fed cba\n - case:\n id: 
numeric_string\n input: '12345'\n expected: '54321'\n - case:\n id: special_characters\n input: '!@#$%^&*()'\n expected: )(*&^%$#@!\n", + "yaml_spec": "reverse_string:\n evals:\n IsString:\n type: str\n CorrectLength:\n assertion: len(output) == len(input)\n IdentityProperty:\n assertion: output[::-1] == input\n dataset:\n - case:\n id: normal_word\n input: hello\n expected: olleh\n - case:\n id: empty_string\n input: ''\n expected: ''\n - case:\n id: single_character\n input: A\n expected: A\n - case:\n id: with_spaces\n input: nurses run\n expected: nur sesrun\n - case:\n id: palindrome\n input: racecar\n expected: racecar\n - case:\n id: numeric_string\n input: '123456789'\n expected: '987654321'\n - case:\n id: special_characters\n input: '!@#$%^&*'\n expected: '*&^%$#@!'\n", "func_name": "reverse_string" } } diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..cfd2e2d --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,35 @@ +"""Tests for CLI behavior outside watch mode.""" + +import json + +from click.testing import CliRunner + +from vowel.cli import main + + +class TestCliExportJson: + """Test JSON export behavior.""" + + def test_export_json_writes_object_payload(self, tmp_path): + """--export-json should write a JSON object, not a quoted string.""" + yaml_path = tmp_path / "evals.yml" + export_path = tmp_path / "results.json" + yaml_path.write_text( + """ +len: + dataset: + - case: + input: [1, 2, 3] + expected: 3 +""" + ) + + runner = CliRunner() + result = runner.invoke(main, [str(yaml_path), "--export-json", str(export_path), "--quiet"]) + + assert result.exit_code == 0 + + payload = json.loads(export_path.read_text()) + assert isinstance(payload, dict) + assert "summary" in payload + assert "results" in payload diff --git a/tests/test_evaluators.py b/tests/test_evaluators.py index 272f053..931916c 100644 --- a/tests/test_evaluators.py +++ b/tests/test_evaluators.py @@ -144,6 +144,21 @@ def test_case_level_assertion(self): 
assert summary.all_passed + def test_assertion_raw_fallback_preserves_compatibility(self): + """Assertions outside the restricted builtins set should still work via fallback.""" + spec = { + "identity": { + "evals": {"Assertion": {"assertion": "pow(output, 2) == 16"}}, + "dataset": [ + {"case": {"input": 4}}, + ], + } + } + + summary = RunEvals.from_dict(spec).with_functions({"identity": lambda x: x}).run() + + assert summary.all_passed + class TestTypeEvaluator: """Tests for type checking evaluator.""" diff --git a/tests/test_executor.py b/tests/test_executor.py new file mode 100644 index 0000000..1f99246 --- /dev/null +++ b/tests/test_executor.py @@ -0,0 +1,441 @@ +"""Tests for executor backends, factory selection, and output parity.""" + +from __future__ import annotations + +import asyncio +import importlib.util +from typing import TYPE_CHECKING + +import pytest + +from vowel.executor import ( + DefaultExecutor, + Executor, + get_executor, + resolve_executors, +) + +if TYPE_CHECKING: + from vowel.executor import MontyExecutor + +# MontyExecutor requires pydantic-monty; skip gracefully if unavailable. 
+_MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None + +if _MONTY_AVAILABLE: + from vowel.executor import MontyExecutor # noqa: F811 + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _binary_search(arr: list[int], target: int) -> int: + """Reference binary search used across test classes.""" + lo, hi = 0, len(arr) - 1 + while lo <= hi: + mid = (lo + hi) // 2 + if arr[mid] == target: + return mid + elif arr[mid] < target: + lo = mid + 1 + else: + hi = mid - 1 + return -1 + + +def _add(a, b): + return a + b + + +def _build_executors() -> tuple[list[Executor], list[str]]: + instances: list[Executor] = [DefaultExecutor()] + ids = ["default"] + if _MONTY_AVAILABLE: + instances.insert(0, MontyExecutor()) + ids.insert(0, "monty") + return instances, ids + + +EXECUTOR_INSTANCES, EXECUTOR_IDS = _build_executors() + + +@pytest.fixture(params=EXECUTOR_INSTANCES, ids=EXECUTOR_IDS) +def executor(request) -> Executor: + """Parametrised fixture yielding each executor backend.""" + return request.param + + +# --------------------------------------------------------------------------- +# 1. 
External functions only +# --------------------------------------------------------------------------- + + +class TestExternalFunctions: + """Snippet calls host-side callbacks via external_functions.""" + + def test_single_function(self, executor: Executor): + code = "_binary_search([1, 3, 5, 7, 9], 5)" + r = asyncio.run( + executor.execute(code, external_functions={"_binary_search": _binary_search}) + ) + assert r.success is True + assert r.output == 2 + + def test_multiple_calls(self, executor: Executor): + code = ( + "results = []\n" + "results.append(search([1, 3, 5, 7, 9], 5))\n" + "results.append(search([1, 3, 5, 7, 9], 4))\n" + "results.append(search([1], 1))\n" + "results.append(search([], 1))\n" + "results\n" + ) + r = asyncio.run(executor.execute(code, external_functions={"search": _binary_search})) + assert r.success is True + assert r.output == [2, -1, 0, -1] + + def test_multiple_functions(self, executor: Executor): + code = ( + "results = []\n" + "results.append(search([10, 20, 30], 20))\n" + "results.append(add(3, 4))\n" + "results\n" + ) + r = asyncio.run( + executor.execute( + code, + external_functions={"search": _binary_search, "add": _add}, + ) + ) + assert r.success is True + assert r.output == [1, 7] + + +# --------------------------------------------------------------------------- +# 2. 
Inputs only +# --------------------------------------------------------------------------- + + +class TestInputs: + """Snippet uses injected values via inputs.""" + + def test_arithmetic(self, executor: Executor): + r = asyncio.run(executor.execute("x * y + z", inputs={"x": 10, "y": 3, "z": 5})) + assert r.success is True + assert r.output == 35 + + def test_list_input(self, executor: Executor): + r = asyncio.run(executor.execute("sorted(data)", inputs={"data": [3, 1, 2]})) + assert r.success is True + assert r.output == [1, 2, 3] + + def test_string_input(self, executor: Executor): + r = asyncio.run(executor.execute("name.upper()", inputs={"name": "hello"})) + assert r.success is True + assert r.output == "HELLO" + + def test_dict_input(self, executor: Executor): + r = asyncio.run(executor.execute("len(d)", inputs={"d": {"a": 1, "b": 2}})) + assert r.success is True + assert r.output == 2 + + +# --------------------------------------------------------------------------- +# 3. Inputs + external functions combined +# --------------------------------------------------------------------------- + + +class TestCombined: + """Snippet uses both inputs and external_functions.""" + + def test_search_with_data(self, executor: Executor): + r = asyncio.run( + executor.execute( + "search(data, query)", + inputs={"data": [2, 4, 6, 8, 10], "query": 6}, + external_functions={"search": _binary_search}, + ) + ) + assert r.success is True + assert r.output == 2 + + def test_function_with_multiple_inputs(self, executor: Executor): + code = ( + "results = []\n" + "for item in items:\n" + " results.append(transform(item, factor))\n" + "results\n" + ) + r = asyncio.run( + executor.execute( + code, + inputs={"items": [1, 2, 3], "factor": 10}, + external_functions={"transform": lambda x, f: x * f}, + ) + ) + assert r.success is True + assert r.output == [10, 20, 30] + + +# --------------------------------------------------------------------------- +# 4. 
Pure code (no injection) +# --------------------------------------------------------------------------- + + +class TestPureCode: + """Snippet needs no external injection.""" + + def test_comprehension(self, executor: Executor): + r = asyncio.run(executor.execute("[i**2 for i in range(5)]")) + assert r.success is True + assert r.output == [0, 1, 4, 9, 16] + + def test_arithmetic_expression(self, executor: Executor): + r = asyncio.run(executor.execute("2 ** 10")) + assert r.success is True + assert r.output == 1024 + + def test_multiline_with_last_expr(self, executor: Executor): + code = "x = [1, 2, 3]\ny = [i * 2 for i in x]\nsum(y)\n" + r = asyncio.run(executor.execute(code)) + assert r.success is True + assert r.output == 12 + + def test_no_trailing_expression(self, executor: Executor): + """When the last statement is not an expression output should be None.""" + r = asyncio.run(executor.execute("x = 42")) + assert r.success is True + assert r.output is None + + +# --------------------------------------------------------------------------- +# 5. Stdout capture +# --------------------------------------------------------------------------- + + +class TestStdout: + """print() output is captured in ExecutionResult.stdout.""" + + def test_print_captured(self, executor: Executor): + r = asyncio.run(executor.execute('print("hello")')) + assert r.success is True + assert "hello" in r.stdout + + +# --------------------------------------------------------------------------- +# 6. 
Error handling +# --------------------------------------------------------------------------- + + +class TestErrors: + """Errors are returned as structured results, never raised.""" + + def test_runtime_error(self, executor: Executor): + r = asyncio.run(executor.execute("1 / 0")) + assert r.success is False + assert r.error_type == "ZeroDivisionError" + assert r.output is None + + def test_type_error_in_external(self, executor: Executor): + r = asyncio.run( + executor.execute( + 'search("not_a_list", 5)', + external_functions={"search": _binary_search}, + ) + ) + assert r.success is False + assert r.error_type == "TypeError" + + def test_name_error(self, executor: Executor): + r = asyncio.run(executor.execute("undefined_var + 1")) + assert r.success is False + assert r.error_type == "NameError" + + def test_syntax_error(self, executor: Executor): + r = asyncio.run(executor.execute("def foo(:")) + assert r.success is False + assert r.error_type == "SyntaxError" + + def test_error_has_message(self, executor: Executor): + r = asyncio.run(executor.execute("1 / 0")) + assert r.error is not None + assert len(r.error) > 0 + + +# --------------------------------------------------------------------------- +# 7. 
ExecutionResult structure +# --------------------------------------------------------------------------- + + +class TestExecutionResult: + """ExecutionResult fields are correctly populated.""" + + def test_duration_is_positive(self, executor: Executor): + r = asyncio.run(executor.execute("42")) + assert r.duration_ms > 0 + + def test_success_fields(self, executor: Executor): + r = asyncio.run(executor.execute("42")) + assert r.success is True + assert r.error is None + assert r.error_type is None + + def test_failure_fields(self, executor: Executor): + r = asyncio.run(executor.execute("1/0")) + assert r.success is False + assert r.error is not None + assert r.error_type is not None + assert r.output is None + + +# --------------------------------------------------------------------------- +# 8. Protocol conformance +# --------------------------------------------------------------------------- + + +class TestProtocol: + """Both executors satisfy the Executor protocol.""" + + @pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed") + def test_monty_is_executor(self): + assert isinstance(MontyExecutor(), Executor) + + def test_default_is_executor(self): + assert isinstance(DefaultExecutor(), Executor) + + +# --------------------------------------------------------------------------- +# 9. 
get_executor factory +# --------------------------------------------------------------------------- + + +class TestFactory: + """get_executor returns the correct backend.""" + + def test_auto(self): + ex = get_executor("auto") + assert isinstance(ex, Executor) + + @pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed") + def test_monty(self): + ex = get_executor("monty") + assert isinstance(ex, MontyExecutor) + + def test_default(self): + ex = get_executor("default") + assert isinstance(ex, DefaultExecutor) + + def test_invalid_backend(self): + with pytest.raises(ValueError, match="Unknown executor backend"): + get_executor("invalid") # type: ignore + + +class _StaticSession: + def __init__(self, value): + self.value = value + + def feed(self, code): + from vowel.executor import ExecutionResult + + return ExecutionResult(output=self.value, stdout="", success=True) + + def close(self): + return None + + def __enter__(self): + return self + + def __exit__(self, *_): + self.close() + + +class _RaisingExecutor: + async def execute(self, code, **kwargs): + raise RuntimeError("boom") + + def execute_sync(self, code, **kwargs): + raise RuntimeError("boom") + + def create_session(self, setup_code, **kwargs): + raise RuntimeError("boom") + + +class _StaticExecutor: + def __init__(self, value): + self.value = value + + async def execute(self, code, **kwargs): + from vowel.executor import ExecutionResult + + return ExecutionResult(output=self.value, stdout="", success=True) + + def execute_sync(self, code, **kwargs): + from vowel.executor import ExecutionResult + + return ExecutionResult(output=self.value, stdout="", success=True) + + def create_session(self, setup_code, **kwargs): + return _StaticSession(self.value) + + +class TestResolveExecutors: + def test_custom_executor_uses_default_fallback_on_session_failure(self): + ex = resolve_executors(_RaisingExecutor()) + + with ex.create_session("x = 1") as session: + result = session.feed("x + 1") + + 
assert result.success is True + assert result.output == 2 + + def test_custom_fallback_executor_is_used(self): + ex = resolve_executors(_RaisingExecutor(), _StaticExecutor("fallback")) + + with ex.create_session("ignored") as session: + result = session.feed("ignored") + + assert result.success is True + assert result.output == "fallback" + + +# --------------------------------------------------------------------------- +# 10. Parity — both executors produce the same output +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not _MONTY_AVAILABLE, reason="pydantic-monty not installed") +class TestParity: + """MontyExecutor and DefaultExecutor must agree on output.""" + + CASES = [ + ("pure_arithmetic", "2 + 3", {}, {}), + ("list_ops", "[1,2,3] + [4,5]", {}, {}), + ("string_method", '"hello world".split()', {}, {}), + ("with_inputs", "a + b", {"a": 10, "b": 20}, {}), + ("with_ext_func", "f(3, 4)", {}, {"f": _add}), + ("combined", "f(x, y)", {"x": 5, "y": 6}, {"f": _add}), + ] + + @pytest.mark.parametrize( + "label,code,inputs,ext_fns", + CASES, + ids=[c[0] for c in CASES], + ) + def test_output_matches(self, label, code, inputs, ext_fns): + monty = MontyExecutor() + default = DefaultExecutor() + kwargs: dict = {} + if inputs: + kwargs["inputs"] = inputs + if ext_fns: + kwargs["external_functions"] = ext_fns + + r_monty = asyncio.run(monty.execute(code, **kwargs)) + r_default = asyncio.run(default.execute(code, **kwargs)) + + assert r_monty.success is True, f"Monty failed: {r_monty.error}" + assert r_default.success is True, f"Default failed: {r_default.error}" + assert r_monty.output == r_default.output, ( + f"Parity mismatch for '{label}': " + f"monty={r_monty.output!r} vs default={r_default.output!r}" + ) diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py index 2fe4b2a..da7ed08 100644 --- a/tests/test_fixtures.py +++ b/tests/test_fixtures.py @@ -107,6 +107,7 @@ def test_missing_fixture_param(self): 
_db_instances = [] _cache_instances = [] +_session_fixture_events: list[str] = [] def setup_db(host: str = "localhost", port: int = 5432): @@ -134,11 +135,23 @@ def teardown_cache(instance): _cache_instances.remove(instance) +def setup_session_counter(): + """Track session fixture setup calls.""" + _session_fixture_events.append("setup") + return {"bonus": 10} + + +def teardown_session_counter(instance): + """Track session fixture teardown calls.""" + _session_fixture_events.append(f"teardown:{instance['bonus']}") + + class TestFixtureManager: def setup_method(self): """Clear instances before each test.""" _db_instances.clear() _cache_instances.clear() + _session_fixture_events.clear() def test_setup_function_scope(self): """Should setup function-scoped fixture.""" @@ -340,6 +353,18 @@ def test_eval_fixture_field(self): assert evals.fixture == ["db"] + def test_load_bundle_prefers_existing_file_before_yaml_heuristic(self, monkeypatch): + """Existing file paths should not be misclassified as inline YAML.""" + import vowel.utils as utils + + monkeypatch.setattr(utils.os.path, "exists", lambda path: True) + monkeypatch.setattr(utils, "load_bundle_file", lambda path: ("file", path)) + monkeypatch.setattr(utils, "load_bundle_from_yaml_string", lambda src: ("yaml", src)) + + result = utils.load_bundle(r"C:\tmp\spec.yml") + + assert result == ("file", r"C:\tmp\spec.yml") + def function_with_db(a: int, b: int, *, db: dict) -> int: """Test function that uses a db fixture.""" @@ -352,6 +377,7 @@ class TestIntegration: def setup_method(self): _db_instances.clear() _cache_instances.clear() + _session_fixture_events.clear() def test_fixture_injection_valid_signature(self): """Should validate and use fixtures correctly.""" @@ -386,6 +412,7 @@ class TestProgrammaticFixtures: def setup_method(self): _db_instances.clear() _cache_instances.clear() + _session_fixture_events.clear() def test_with_fixtures_setup_only(self): """Should work with setup-only fixtures via with_fixtures.""" 
@@ -520,6 +547,84 @@ def test_fixture_missing_error(self): assert not summary.all_passed assert summary.error_count == 1 + def test_session_scope_fixture_runs_setup_and_teardown_once_per_eval_run(self): + """Session-scoped fixtures should setup once and teardown once across all cases.""" + yaml_content = """ +add_with_db: + fixture: + - db + dataset: + - case: + inputs: {a: 1, b: 2} + expected: 13 + - case: + inputs: {a: 3, b: 4} + expected: 17 +""" + + summary = ( + RunEvals.from_source(yaml_content) + .with_functions({"add_with_db": add_with_db}) + .with_fixtures( + { + "db": FixtureDefinition( + setup="test_fixtures.setup_session_counter", + teardown="test_fixtures.teardown_session_counter", + scope="session", + ) + } + ) + .run() + ) + + assert summary.all_passed + assert _session_fixture_events == ["setup", "teardown:10"] + + def test_session_scope_fixture_is_shared_across_multiple_functions(self): + """Session-scoped fixtures should teardown once after the full run ends.""" + yaml_content = """ +add_with_db: + fixture: + - db + dataset: + - case: + inputs: {a: 1, b: 2} + expected: 13 +subtract_with_db: + fixture: + - db + dataset: + - case: + inputs: {a: 10, b: 3} + expected: 17 +""" + + def subtract_with_db(a: int, b: int, *, db: dict) -> int: + return a - b + db["bonus"] + + summary = ( + RunEvals.from_source(yaml_content) + .with_functions( + { + "add_with_db": add_with_db, + "subtract_with_db": subtract_with_db, + } + ) + .with_fixtures( + { + "db": FixtureDefinition( + setup="test_fixtures.setup_session_counter", + teardown="test_fixtures.teardown_session_counter", + scope="session", + ) + } + ) + .run() + ) + + assert summary.all_passed + assert _session_fixture_events == ["setup", "teardown:10"] + def setup_db_with_args(host: str, port: int): """Setup that requires positional args.""" diff --git a/tests/test_generation.py b/tests/test_generation.py deleted file mode 100644 index 011546c..0000000 --- a/tests/test_generation.py +++ /dev/null @@ -1,36 
+0,0 @@ -"""Test script for EvalGenerator and GenerationResult.""" - -from vowel import EvalGenerator, GenerationResult - - -def main(): - generator = EvalGenerator(load_env=True) - - print(f"\nUsing model: {generator.model}") - print("\n🚀 Step 1: Generate a function from prompt\n") - - func = generator.generate_function( - prompt="Create a function called 'is_prime' that checks if a number is prime. Return True if prime, False otherwise.", - async_func=False, - ) - - print(f"Generated: {func.name}") - func.print() - - print("\n🧪 Step 2: Generate spec and run evals\n") - - result: GenerationResult = generator.generate_and_run( - func, - auto_retry=True, - max_retries=2, - min_coverage=0.9, - heal_function=True, - ) - - result.print() - - print("✅ Test completed!\n") - - -if __name__ == "__main__": - main() diff --git a/tests/test_import_function.py b/tests/test_import_function.py index f0b945b..dabbc30 100644 --- a/tests/test_import_function.py +++ b/tests/test_import_function.py @@ -206,6 +206,25 @@ def helper(x): sys.path = original_path os.chdir(original_cwd) + def test_import_local_module_does_not_mutate_sys_path(self, tmp_path, monkeypatch): + """Local imports should not leave the working directory on sys.path.""" + module_file = tmp_path / "my_module.py" + module_file.write_text( + """ +def my_function(x): + return x * 2 +""" + ) + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(sys, "path", [p for p in sys.path if p != str(tmp_path)]) + before = sys.path.copy() + + func = import_function("my_module.my_function") + + assert func(5) == 10 + assert sys.path == before + class TestImportErrors: """Tests for import error handling.""" diff --git a/tests/test_llm_integration.py b/tests/test_llm_integration.py index e20b2e8..7e55a20 100644 --- a/tests/test_llm_integration.py +++ b/tests/test_llm_integration.py @@ -1,8 +1,4 @@ -"""LLM-based integration tests with cassette caching. - -These tests use real LLM calls but cache responses for reproducibility. 
-Run with --update-cassettes to refresh cached responses. -""" +"""LLM integration tests with cassette-backed response caching.""" import hashlib import json @@ -15,10 +11,10 @@ dotenv.load_dotenv() -DEFAULT_MODEL = os.getenv("MODEL_NAME", "openrouter:google/gemini-3-flash-preview") +DEFAULT_MODEL = "openrouter:google/gemini-3-flash-preview" pytestmark = pytest.mark.skipif( - not os.getenv("OPENROUTER_API_KEY") and not os.getenv("OPENAI_API_KEY"), + not os.getenv("OPENROUTER_API_KEY"), reason="No API key available for LLM tests (need OPENROUTER_API_KEY or OPENAI_API_KEY)", ) diff --git a/tests/test_llm_judge.py b/tests/test_llm_judge.py index 6a2996f..be4e30f 100644 --- a/tests/test_llm_judge.py +++ b/tests/test_llm_judge.py @@ -1,7 +1,4 @@ -"""LLM Judge evaluator tests with cassette caching. - -These tests specifically test the LLMJudge evaluator functionality. -""" +"""Tests for LLMJudge evaluator behavior using cassette caching.""" import hashlib import json diff --git a/tests/test_llm_judge_env_refs.py b/tests/test_llm_judge_env_refs.py new file mode 100644 index 0000000..7407426 --- /dev/null +++ b/tests/test_llm_judge_env_refs.py @@ -0,0 +1,33 @@ +"""Tests for environment variable references in LLM Judge configuration.""" + +import pytest + +from vowel.evals import create_llm_judge + + +def test_create_llm_judge_resolves_rubric_and_model_env_refs(monkeypatch): + """Rubric and model support $ENV_VAR style references.""" + monkeypatch.setenv("TEST_JUDGE_MODEL", "openrouter:google/gemini-2.5-flash") + monkeypatch.setenv("_TEST_JUDGE_RUBRIC", "Output should be concise and accurate") + + judge = create_llm_judge( + rubric="$_TEST_JUDGE_RUBRIC", + include=["input"], + config={"model": "$TEST_JUDGE_MODEL", "temperature": 0.0}, + ) + + assert judge.model == "openrouter:google/gemini-2.5-flash" + assert judge.rubric == "Output should be concise and accurate" + + +def test_create_llm_judge_raises_when_rubric_env_ref_missing(monkeypatch): + """Missing rubric env var 
should raise a clear error.""" + monkeypatch.setenv("TEST_JUDGE_MODEL", "openrouter:google/gemini-2.5-flash") + monkeypatch.delenv("_MISSING_RUBRIC", raising=False) + + with pytest.raises(ValueError, match="_MISSING_RUBRIC"): + create_llm_judge( + rubric="$_MISSING_RUBRIC", + include=["input"], + config={"model": "$TEST_JUDGE_MODEL"}, + ) diff --git a/tests/test_run_evals.py b/tests/test_run_evals.py index 49fac6d..e91e35a 100644 --- a/tests/test_run_evals.py +++ b/tests/test_run_evals.py @@ -5,6 +5,7 @@ import pytest from vowel import EvalSummary, RunEvals, run_evals +from vowel.executor import DefaultExecutor class TestRunEvalsFromFile: @@ -146,6 +147,46 @@ def test_with_functions_chained(self, simple_yaml_spec: str): assert summary.all_passed + def test_with_functions_short_name_matches_module_function_spec(self): + """module.function eval ids should match short-name keys from with_functions.""" + + def add(a, b): + return a + b + + spec = { + "pkg.add": { + "dataset": [ + {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}}, + ] + } + } + + summary = RunEvals.from_dict(spec).with_functions({"add": add}).run() + + assert summary.all_passed + + def test_with_executor_preserves_existing_run_behavior(self, simple_yaml_spec: str): + """Executor preferences should be accepted without changing normal eval behavior.""" + summary = ( + RunEvals.from_source(simple_yaml_spec) + .with_functions({"add": lambda a, b: a + b}) + .with_executor(DefaultExecutor(), fallback_executor=DefaultExecutor()) + .run() + ) + + assert summary.all_passed + + def test_run_evals_accepts_executor_preferences(self, simple_yaml_spec: str): + """Top-level run_evals should accept executor preferences.""" + summary = run_evals( + simple_yaml_spec, + functions={"add": lambda a, b: a + b}, + executor=DefaultExecutor(), + fallback_executor=DefaultExecutor(), + ) + + assert summary.all_passed + class TestRunEvalsFilter: """Tests for filter() method.""" @@ -197,6 +238,81 @@ def 
test_filter_multiple_functions(self): assert summary.total_count == 2 + def test_filter_module_name_matches_short_eval_id(self): + """module.function filter should match bare function eval ids.""" + spec = { + "add": {"dataset": [{"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}}]}, + "sub": {"dataset": [{"case": {"inputs": {"a": 5, "b": 3}, "expected": 2}}]}, + } + + summary = ( + RunEvals.from_dict(spec) + .with_functions( + { + "add": lambda a, b: a + b, + "sub": lambda a, b: a - b, + } + ) + .filter(["math.add"]) + .run() + ) + + assert summary.total_count == 1 + assert summary.results[0].eval_id == "add" + + def test_filter_short_name_matches_module_eval_id(self): + """bare function filter should match module.function eval ids.""" + spec = { + "pkg.add": { + "dataset": [ + {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}}, + ] + }, + "pkg.sub": { + "dataset": [ + {"case": {"inputs": {"a": 5, "b": 3}, "expected": 2}}, + ] + }, + } + + summary = ( + RunEvals.from_dict(spec) + .with_functions( + { + "add": lambda a, b: a + b, + "sub": lambda a, b: a - b, + } + ) + .filter(["add"]) + .run() + ) + + assert summary.total_count == 1 + assert summary.results[0].eval_id == "pkg.add" + + def test_filter_short_name_raises_on_ambiguous_matches(self): + """Short-name filters should fail fast when multiple eval ids share a suffix.""" + spec = { + "pkg.add": { + "dataset": [ + {"case": {"inputs": {"a": 1, "b": 2}, "expected": 3}}, + ] + }, + "other.add": { + "dataset": [ + {"case": {"inputs": {"a": 2, "b": 3}, "expected": 5}}, + ] + }, + } + + with pytest.raises(ValueError, match="Ambiguous filter 'add'"): + ( + RunEvals.from_dict(spec) + .with_functions({"add": lambda a, b: a + b}) + .filter(["add"]) + .run() + ) + class TestRunEvalsDebug: """Tests for debug() method.""" diff --git a/tests/test_schema.py b/tests/test_schema.py new file mode 100644 index 0000000..c90b62d --- /dev/null +++ b/tests/test_schema.py @@ -0,0 +1,38 @@ +"""Tests for generated YAML schema 
support.""" + +import json +from pathlib import Path + +from vowel.schema import build_yaml_schema_from_bundle, materialize_yaml_with_schema_header + + +def test_generated_schema_includes_top_level_serializers_property(): + """Top-level `serializers` should be explicitly supported in generated schema.""" + schema = build_yaml_schema_from_bundle() + properties = schema.get("properties", {}) + + assert "fixtures" in properties + assert "serializers" in properties + + +def test_generated_schema_keeps_function_additional_properties(): + """Unknown top-level keys must still map to per-function Evals definitions.""" + schema = build_yaml_schema_from_bundle() + + additional = schema.get("additionalProperties", {}) + assert additional == {"$ref": "#/$defs/EvalsMapValue"} + + +def test_materialized_header_uses_hashed_cache_with_serializers(): + """Schema header should reference a content-addressed cache file that supports serializers.""" + yaml_text = "len:\n dataset:\n - case:\n id: len_basic\n input: [1]\n expected: 1\n" + materialized = materialize_yaml_with_schema_header(yaml_text) + first_line = materialized.splitlines()[0] + + assert first_line.startswith("# yaml-language-server: $schema=") + schema_path = Path(first_line.split("$schema=", 1)[1]) + assert schema_path.name.startswith("vowel-schema_") + assert schema_path.exists() + + schema_obj = json.loads(schema_path.read_text(encoding="utf-8")) + assert "serializers" in schema_obj.get("properties", {}) diff --git a/tests/test_serializer.py b/tests/test_serializer.py index b38bc56..f6516d1 100644 --- a/tests/test_serializer.py +++ b/tests/test_serializer.py @@ -28,6 +28,16 @@ def process_with_config(user: User, config: Config) -> str: return f"{user.name} (timeout={config.timeout})" +def yaml_serialize_user(data: dict) -> User: + """Serializer function used by YAML-native serializer registry tests.""" + raw = data.get("input") or data.get("inputs") + if isinstance(raw, list): + raw = raw[0] + if not isinstance(raw, 
dict): + raise ValueError("Expected serializer input payload to be a dict") + return User(**raw) + + class TestSchemaSerializer: """Tests for schema-based serialization.""" @@ -147,6 +157,36 @@ def test_inputs_named_params_different_types(self): ) assert summary.all_passed + def test_assertion_uses_serialized_input_with_dict_schema(self): + """Assertion `input` should contain per-param serialized objects for dict schema.""" + spec = { + "process_with_config": { + "evals": { + "CheckSerializedInput": { + "assertion": "input['user'].email.endswith('@a.com') and input['config'].timeout == 30" + } + }, + "dataset": [ + { + "case": { + "inputs": { + "user": {"id": 1, "name": "Alice", "email": "a@a.com"}, + "config": {"timeout": 30, "verbose": True}, + }, + "expected": "Alice (timeout=30)", + } + }, + ], + } + } + summary = ( + RunEvals.from_dict(spec) + .with_functions({"process_with_config": process_with_config}) + .with_serializer({"process_with_config": {"user": User, "config": Config}}) + .run() + ) + assert summary.all_passed + def test_no_serializer_passthrough(self): """Without serializer, dict is passed as-is.""" @@ -198,6 +238,51 @@ def test_multiple_cases(self): assert summary.all_passed assert summary.total_count == 1 + def test_serializer_short_name_matches_module_function_spec(self): + """Serializer mapping by short name should work for module.function eval ids.""" + spec = { + "pkg.get_user_info": { + "dataset": [ + { + "case": { + "input": {"id": 1, "name": "Alice", "email": "a@a.com"}, + "expected": "User Alice has email a@a.com", + } + }, + ] + } + } + summary = ( + RunEvals.from_dict(spec) + .with_functions({"get_user_info": get_user_info}) + .with_serializer({"get_user_info": User}) + .run() + ) + assert summary.all_passed + + def test_assertion_uses_serialized_input_with_schema(self): + """Assertion `input` should be the schema-serialized object, not raw YAML dict.""" + spec = { + "get_user_info": { + "evals": {"CheckSerializedInput": {"assertion": 
"input.email.endswith('@a.com')"}}, + "dataset": [ + { + "case": { + "input": {"id": 1, "name": "Alice", "email": "a@a.com"}, + "expected": "User Alice has email a@a.com", + } + }, + ], + } + } + summary = ( + RunEvals.from_dict(spec) + .with_functions({"get_user_info": get_user_info}) + .with_serializer({"get_user_info": User}) + .run() + ) + assert summary.all_passed + class TestSerialFn: """Tests for serial_fn-based serialization.""" @@ -333,6 +418,35 @@ def get_full_name(user: User) -> str: ) assert summary.all_passed + def test_assertion_uses_serialized_input_with_serial_fn(self): + """Assertion `input` should reflect serial_fn output type.""" + + def serialize_user(d: dict) -> User: + data = d.get("input") or d.get("inputs") + assert data is not None + return User(**data) + + spec = { + "get_user_info": { + "evals": {"CheckSerializedInput": {"assertion": "input.id == 7"}}, + "dataset": [ + { + "case": { + "input": {"id": 7, "name": "Ada", "email": "ada@a.com"}, + "expected": "User Ada has email ada@a.com", + } + }, + ], + } + } + summary = ( + RunEvals.from_dict(spec) + .with_functions({"get_user_info": get_user_info}) + .with_serializer(serial_fn={"get_user_info": serialize_user}) + .run() + ) + assert summary.all_passed + class TestSerializerChaining: """Tests for serializer method chaining.""" @@ -445,3 +559,88 @@ def test_serializer_validation_error(self): ) assert not summary.all_passed assert summary.failed_count == 1 + + +class TestYamlNativeSerializerRegistry: + """Tests for YAML-native top-level serializer registry.""" + + def test_yaml_registry_schema_mode(self): + yaml_spec = """ +serializers: + user_schema: + schema: tests.test_serializer.User + +get_user_info: + serializer: user_schema + dataset: + - case: + input: {id: 1, name: Alice, email: a@a.com} + expected: "User Alice has email a@a.com" +""" + summary = ( + RunEvals.from_source(yaml_spec).with_functions({"get_user_info": get_user_info}).run() + ) + assert summary.all_passed + + def 
test_yaml_registry_serial_fn_mode(self): + yaml_spec = """ +serializers: + user_custom: + serializer: tests.test_serializer.yaml_serialize_user + +get_user_info: + serializer: user_custom + dataset: + - case: + inputs: {id: 2, name: Bob, email: b@b.com} + expected: "User Bob has email b@b.com" +""" + summary = ( + RunEvals.from_source(yaml_spec).with_functions({"get_user_info": get_user_info}).run() + ) + assert summary.all_passed + + def test_yaml_registry_imports_are_cached(self, monkeypatch): + """Same serializer path used by multiple evals should be imported once.""" + from vowel import utils as utils_module + + calls: list[str] = [] + original_import_function = utils_module.import_function + + def counting_import(path: str): + calls.append(path) + return original_import_function(path) + + utils_module._import_path_cached.cache_clear() + monkeypatch.setattr(utils_module, "import_function", counting_import) + + yaml_spec = """ +serializers: + user_schema: + schema: tests.test_serializer.User + +get_user_info: + serializer: user_schema + dataset: + - case: + input: {id: 1, name: Alice, email: a@a.com} + expected: "User Alice has email a@a.com" + +get_user_name: + serializer: user_schema + dataset: + - case: + input: {id: 2, name: Bob, email: b@b.com} + expected: "Bob" +""" + + def get_user_name(user: User) -> str: + return user.name + + summary = ( + RunEvals.from_source(yaml_spec) + .with_functions({"get_user_info": get_user_info, "get_user_name": get_user_name}) + .run() + ) + assert summary.all_passed + assert calls.count("tests.test_serializer.User") == 1 diff --git a/tests/test_session.py b/tests/test_session.py new file mode 100644 index 0000000..400749b --- /dev/null +++ b/tests/test_session.py @@ -0,0 +1,222 @@ +"""Tests for ExecutionSession behavior across default and Monty-backed sessions.""" + +from __future__ import annotations + +import importlib.util +from typing import TYPE_CHECKING + +import pytest + +from vowel.executor import ( + 
DefaultExecutor, + DefaultSession, + ExecutionSession, +) + +if TYPE_CHECKING: + from vowel.executor import FallbackSession, MontyExecutor + +# MontyExecutor requires pydantic-monty; skip gracefully if unavailable. +_MONTY_AVAILABLE = importlib.util.find_spec("pydantic_monty") is not None + +if _MONTY_AVAILABLE: + from vowel.executor import FallbackSession, MontyExecutor # noqa: F811 + +# --------------------------------------------------------------------------- +# Shared test data +# --------------------------------------------------------------------------- + +FUNC_CODE = """\ +def binary_search(arr, target): + lo, hi = 0, len(arr) - 1 + while lo <= hi: + mid = (lo + hi) // 2 + if arr[mid] == target: + return mid + elif arr[mid] < target: + lo = mid + 1 + else: + hi = mid - 1 + return -1 +""" + +SEARCH_CASES = [ + ("binary_search([1, 3, 5, 7, 9], 5)", 2), + ("binary_search([], 1)", -1), + ("binary_search([1, 2, 3], 4)", -1), + ("binary_search([10, 20, 30], 10)", 0), +] + + +def _build_executor_params() -> tuple[list, list[str]]: + params = [(DefaultExecutor, DefaultSession)] + ids = ["default"] + if _MONTY_AVAILABLE: + params.insert(0, (MontyExecutor, FallbackSession)) # type: ignore + ids.insert(0, "monty") + return params, ids + + +EXECUTOR_CLASSES, EXECUTOR_IDS = _build_executor_params() + + +@pytest.fixture(params=EXECUTOR_CLASSES, ids=EXECUTOR_IDS) +def executor_and_session(request): + """Yield (executor_instance, expected_session_class).""" + cls, session_cls = request.param + return cls(), session_cls + + +# --------------------------------------------------------------------------- +# Basic session correctness +# --------------------------------------------------------------------------- + + +class TestSessionBasic: + """feed() returns correct outputs for a simple function.""" + + def test_binary_search_cases(self, executor_and_session): + executor, _ = executor_and_session + with executor.create_session(FUNC_CODE) as session: + for snippet, expected in 
SEARCH_CASES: + r = session.feed(snippet) + assert r.success, f"Failed: {snippet} => {r.error}" + assert r.output == expected, f"{snippet}: got {r.output!r}, expected {expected!r}" + + def test_session_type(self, executor_and_session): + """create_session() returns the correct session class.""" + executor, session_cls = executor_and_session + with executor.create_session("x = 1") as session: + assert isinstance(session, session_cls) + + +# --------------------------------------------------------------------------- +# Error handling +# --------------------------------------------------------------------------- + + +class TestSessionErrors: + """Errors are returned structured, not raised.""" + + def test_zero_division(self, executor_and_session): + executor, _ = executor_and_session + with executor.create_session("def foo(x): return 1/x") as session: + r = session.feed("foo(0)") + assert not r.success + assert r.error_type == "ZeroDivisionError" + assert r.error is not None + + def test_name_error(self, executor_and_session): + executor, _ = executor_and_session + with executor.create_session("x = 1") as session: + r = session.feed("undefined_var + 1") + assert not r.success + assert r.error_type == "NameError" + + def test_syntax_error(self, executor_and_session): + executor, _ = executor_and_session + with executor.create_session("def foo(): return 42") as session: + r = session.feed("foo(") + assert not r.success + assert r.error_type == "SyntaxError" + + def test_error_does_not_break_session(self, executor_and_session): + """A single error in feed() should not corrupt the session.""" + executor, _ = executor_and_session + with executor.create_session("def foo(x): return 1/x") as session: + r_bad = session.feed("foo(0)") + assert not r_bad.success + # Session should still work after error: + r_ok = session.feed("foo(2)") + assert r_ok.success + assert r_ok.output == 0.5 + + +# --------------------------------------------------------------------------- +# State 
preservation +# --------------------------------------------------------------------------- + + +class TestStatePreservation: + """State persists across feed() calls within a single session.""" + + def test_mutation_persists(self, executor_and_session): + executor, _ = executor_and_session + with executor.create_session("x = 10") as session: + r1 = session.feed("x + 5") + assert r1.output == 15 + + session.feed("x = x * 2") + + r3 = session.feed("x") + assert r3.output == 20 + + def test_function_defined_in_session(self, executor_and_session): + """Functions defined in one feed() are available in subsequent feeds.""" + executor, _ = executor_and_session + with executor.create_session("y = 100") as session: + session.feed("def double(n): return n * 2") + r = session.feed("double(y)") + assert r.success + assert r.output == 200 + + def test_list_accumulation(self, executor_and_session): + executor, _ = executor_and_session + with executor.create_session("items = []") as session: + session.feed("items.append(1)") + session.feed("items.append(2)") + session.feed("items.append(3)") + r = session.feed("items") + assert r.output == [1, 2, 3] + + +# --------------------------------------------------------------------------- +# Stdout capture +# --------------------------------------------------------------------------- + + +class TestSessionStdout: + """print() output is captured through the session.""" + + def test_stdout_captured(self, executor_and_session): + executor, _ = executor_and_session + with executor.create_session("def greet(name): print(f'Hello {name}')") as session: + r = session.feed("greet('World')") + assert "Hello World" in r.stdout + + +# --------------------------------------------------------------------------- +# Session isolation +# --------------------------------------------------------------------------- + + +class TestSessionIsolation: + """Each session starts with a clean namespace.""" + + def test_separate_sessions_isolated(self, 
executor_and_session): + executor, _ = executor_and_session + + with executor.create_session("x = 42") as s1: + r1 = s1.feed("x") + assert r1.output == 42 + + # A new session should NOT see x from the previous one: + with executor.create_session("y = 99") as s2: + r2 = s2.feed("y") + assert r2.output == 99 + r_x = s2.feed("x") + assert not r_x.success # x should not exist + + +# --------------------------------------------------------------------------- +# Protocol conformance +# --------------------------------------------------------------------------- + + +class TestSessionProtocol: + """Sessions satisfy the ExecutionSession protocol.""" + + def test_protocol(self, executor_and_session): + executor, _ = executor_and_session + with executor.create_session("x = 1") as session: + assert isinstance(session, ExecutionSession) diff --git a/tests/test_tdd_eval_retries.py b/tests/test_tdd_eval_retries.py index efa5fdf..9ab9b72 100644 --- a/tests/test_tdd_eval_retries.py +++ b/tests/test_tdd_eval_retries.py @@ -5,6 +5,7 @@ from vowel.eval_types import EvalsSource from vowel.tdd import FunctionSignature, Param, TDDGenerator +from vowel.validation import build_failure_context def _make_signature() -> FunctionSignature: @@ -35,7 +36,8 @@ def _make_signature() -> FunctionSignature: dataset: - case: inputs: [1, 2] - expected: 999 + expected: 3 + assertion: "output > 100" - case: inputs: [0, 0] expected: 0 @@ -166,27 +168,21 @@ def test_partial_coverage_accepted(self, mock_agent_prop): mock_agent.run_sync.assert_called_once() -class TestBuildEvalFailureContext(unittest.TestCase): - """Test the failure context builder.""" +class TestBuildFailureContext(unittest.TestCase): + """Test the shared failure context builder.""" def test_builds_context_from_failures(self): - gen = TDDGenerator.__new__(TDDGenerator) - gen.model = "test" - # Run actual evals with a bad spec to get real summary from vowel.runner import RunEvals summary = 
RunEvals.from_source(BAD_YAML).with_functions({"add": add}).run() - context = gen._build_eval_failure_context(summary) + context = build_failure_context(summary) assert "FAILED" in context def test_unknown_failures_fallback(self): - gen = TDDGenerator.__new__(TDDGenerator) - gen.model = "test" - # Mock summary with no useful info mock_summary = MagicMock() mock_summary.results = [] - context = gen._build_eval_failure_context(mock_summary) + context = build_failure_context(mock_summary) assert context == "Unknown failures" diff --git a/tests/test_yaml_loading.py b/tests/test_yaml_loading.py index 2e56e3b..031911f 100644 --- a/tests/test_yaml_loading.py +++ b/tests/test_yaml_loading.py @@ -6,59 +6,96 @@ from vowel import ( EvalsFile, - load_evals, - load_evals_file, - load_evals_from_dict, - load_evals_from_object, - load_evals_from_yaml_string, + load_bundle, + load_bundle_file, + load_bundle_from_dict, + load_bundle_from_object, + load_bundle_from_yaml_string, ) -class TestLoadEvalsFromYamlString: - """Tests for load_evals_from_yaml_string function.""" +class TestLoadBundleFromYamlString: + """Tests for load_bundle_from_yaml_string function.""" def test_simple_yaml_loading(self, simple_yaml_spec: str): """Test loading a simple YAML spec.""" - evals = load_evals_from_yaml_string(simple_yaml_spec) + bundle = load_bundle_from_yaml_string(simple_yaml_spec) - assert "add" in evals - assert len(evals["add"].dataset) == 2 + assert "add" in bundle.evals + assert len(bundle.evals["add"].dataset) == 2 def test_yaml_with_evaluators(self, yaml_with_evaluators: str): """Test loading YAML with evaluators.""" - evals = load_evals_from_yaml_string(yaml_with_evaluators) + bundle = load_bundle_from_yaml_string(yaml_with_evaluators) - assert "is_even" in evals - assert evals["is_even"].evals is not None + assert "is_even" in bundle.evals + assert bundle.evals["is_even"].evals is not None def test_yaml_with_type_check(self, yaml_with_type_check: str): """Test loading YAML with type 
checking.""" - evals = load_evals_from_yaml_string(yaml_with_type_check) + bundle = load_bundle_from_yaml_string(yaml_with_type_check) - assert "divide" in evals - assert len(evals["divide"].dataset) == 2 + assert "divide" in bundle.evals + assert len(bundle.evals["divide"].dataset) == 2 def test_yaml_with_raises(self, yaml_with_raises: str): """Test loading YAML with exception testing.""" - evals = load_evals_from_yaml_string(yaml_with_raises) + bundle = load_bundle_from_yaml_string(yaml_with_raises) - assert "divide" in evals - raises_cases = [c for c in evals["divide"].dataset if c.case.raises] + assert "divide" in bundle.evals + raises_cases = [c for c in bundle.evals["divide"].dataset if c.case.raises] assert len(raises_cases) == 1 def test_empty_yaml_raises_error(self): """Test that empty YAML raises an error.""" with pytest.raises(Exception): # noqa: B017 - load_evals_from_yaml_string("") + load_bundle_from_yaml_string("") def test_invalid_yaml_raises_error(self): """Test that invalid YAML raises an error.""" with pytest.raises(Exception): # noqa: B017 - load_evals_from_yaml_string("invalid: [unclosed") + load_bundle_from_yaml_string("invalid: [unclosed") + + def test_yaml_with_top_level_serializers(self): + """Test loading top-level serializer registry and eval references.""" + yaml_spec = """ +serializers: + user_schema: + schema: tests.test_serializer.User + +get_user_info: + serializer: user_schema + dataset: + - case: + input: {id: 1, name: Alice, email: a@a.com} + expected: "User Alice has email a@a.com" +""" + bundle = load_bundle_from_yaml_string(yaml_spec) + + assert "user_schema" in bundle.serializers + assert bundle.evals["get_user_info"].serializer == "user_schema" + + def test_yaml_invalid_serializer_spec_raises_error(self): + """Serializer specs cannot define both schema and serializer at once.""" + yaml_spec = """ +serializers: + invalid: + schema: tests.test_serializer.User + serializer: tests.test_serializer.yaml_serialize_user + 
+get_user_info: + serializer: invalid + dataset: + - case: + input: {id: 1, name: Alice, email: a@a.com} + expected: "User Alice has email a@a.com" +""" + with pytest.raises(Exception): # noqa: B017 + load_bundle_from_yaml_string(yaml_spec) -class TestLoadEvalsFromDict: - """Tests for load_evals_from_dict function.""" +class TestLoadBundleFromDict: + """Tests for load_bundle_from_dict function.""" def test_dict_loading(self): """Test loading from a dictionary.""" @@ -71,10 +108,10 @@ def test_dict_loading(self): } } - evals = load_evals_from_dict(spec_dict) + bundle = load_bundle_from_dict(spec_dict) - assert "multiply" in evals - assert len(evals["multiply"].dataset) == 2 + assert "multiply" in bundle.evals + assert len(bundle.evals["multiply"].dataset) == 2 def test_dict_with_evaluators(self): """Test loading dict with evaluators.""" @@ -88,60 +125,60 @@ def test_dict_with_evaluators(self): } } - evals = load_evals_from_dict(spec_dict) + bundle = load_bundle_from_dict(spec_dict) - assert "square" in evals - assert evals["square"].evals is not None + assert "square" in bundle.evals + assert bundle.evals["square"].evals is not None -class TestLoadEvalsFile: - """Tests for load_evals_file function.""" +class TestLoadBundleFile: + """Tests for load_bundle_file function.""" def test_load_from_file(self, temp_yaml_file: Path): """Test loading from a YAML file.""" - evals = load_evals_file(str(temp_yaml_file)) + bundle = load_bundle_file(str(temp_yaml_file)) - assert "add" in evals + assert "add" in bundle.evals def test_nonexistent_file_raises_error(self): """Test that loading non-existent file raises error.""" with pytest.raises(FileNotFoundError): - load_evals_file("nonexistent_file.yml") + load_bundle_file("nonexistent_file.yml") -class TestLoadEvals: - """Tests for the unified load_evals function.""" +class TestLoadBundle: + """Tests for the unified load_bundle function.""" def test_load_from_string(self, simple_yaml_spec: str): - """Test load_evals with YAML 
string.""" - evals = load_evals(simple_yaml_spec) - assert "add" in evals + """Test load_bundle with YAML string.""" + bundle = load_bundle(simple_yaml_spec) + assert "add" in bundle.evals def test_load_from_dict(self): - """Test load_evals with dict.""" + """Test load_bundle with dict.""" spec_dict = {"test": {"dataset": [{"case": {"input": 1, "expected": 1}}]}} - evals = load_evals(spec_dict) - assert "test" in evals + bundle = load_bundle(spec_dict) + assert "test" in bundle.evals def test_load_from_path(self, temp_yaml_file: Path): - """Test load_evals with Path object.""" - evals = load_evals(temp_yaml_file) - assert "add" in evals + """Test load_bundle with Path object.""" + bundle = load_bundle(temp_yaml_file) + assert "add" in bundle.evals def test_load_from_evals_file_object(self, simple_yaml_spec: str): - """Test load_evals with EvalsFile object.""" + """Test load_bundle with EvalsFile object.""" import yaml data = yaml.safe_load(simple_yaml_spec) evals_file = EvalsFile.model_validate(data) - evals = load_evals_from_object(evals_file) - assert "add" in evals + bundle = load_bundle_from_object(evals_file) + assert "add" in bundle.evals def test_invalid_source_type_raises_error(self): """Test that invalid source type raises TypeError.""" with pytest.raises(TypeError): - load_evals(12345) # type: ignore[arg-type] + load_bundle(12345) # type: ignore[arg-type] class TestInputFormats: @@ -156,8 +193,8 @@ def test_single_input(self): input: 5 expected: 10 """ - evals = load_evals_from_yaml_string(yaml_spec) - case = evals["double"].dataset[0].case + bundle = load_bundle_from_yaml_string(yaml_spec) + case = bundle.evals["double"].dataset[0].case assert case.input == 5 def test_inputs_dict(self): @@ -169,8 +206,8 @@ def test_inputs_dict(self): inputs: { x: 1, y: 2 } expected: 3 """ - evals = load_evals_from_yaml_string(yaml_spec) - case = evals["add"].dataset[0].case + bundle = load_bundle_from_yaml_string(yaml_spec) + case = bundle.evals["add"].dataset[0].case 
assert case.inputs == {"x": 1, "y": 2} def test_inputs_list(self): @@ -182,6 +219,6 @@ def test_inputs_list(self): inputs: [1, 2, 3] expected: 6 """ - evals = load_evals_from_yaml_string(yaml_spec) - case = evals["add"].dataset[0].case + bundle = load_bundle_from_yaml_string(yaml_spec) + case = bundle.evals["add"].dataset[0].case assert case.inputs == [1, 2, 3] diff --git a/vowel-schema.json b/vowel-schema.json index eded93b..15ee5e6 100644 --- a/vowel-schema.json +++ b/vowel-schema.json @@ -3,16 +3,22 @@ "type": "object", "properties": { "fixtures": { - "type": "object", "additionalProperties": { "$ref": "#/$defs/FixtureDefinition" }, "title": "Fixtures", - "description": "Dictionary of fixture definitions. Each key is the fixture name." + "type": "object" + }, + "serializers": { + "additionalProperties": { + "$ref": "#/$defs/SerializerSpec" + }, + "title": "Serializers", + "type": "object" } }, "additionalProperties": { - "$ref": "#/$defs/Evals" + "$ref": "#/$defs/EvalsMapValue" }, "$defs": { "AssertionCase": { @@ -43,7 +49,9 @@ "type": "string" } }, - "required": ["assertion"], + "required": [ + "assertion" + ], "title": "AssertionCase", "type": "object" }, @@ -80,7 +88,9 @@ "description": "The test case containing input, expected output, and constraints." } }, - "required": ["case"], + "required": [ + "case" + ], "title": "DatasetCase", "type": "object" }, @@ -89,13 +99,20 @@ "properties": { "duration": { "description": "Maximum allowed duration in seconds. Test fails if execution takes longer.", - "examples": [0.1, 1.0, 5.0, 0.001], + "examples": [ + 0.1, + 1.0, + 5.0, + 0.001 + ], "exclusiveMinimum": 0, "title": "Duration", "type": "number" } }, - "required": ["duration"], + "required": [ + "duration" + ], "title": "DurationCase", "type": "object" }, @@ -105,44 +122,99 @@ "properties": { "id": { "description": "Function name to evaluate. 
Must match the actual function name.", - "examples": ["is_prime", "calculate_sum", "process_data", "validate_email"], + "examples": [ + "is_prime", + "calculate_sum", + "process_data", + "validate_email" + ], "title": "Id", "type": "string" }, "fixture": { "description": "List of fixture names this function depends on. Fixtures must be defined in the top-level 'fixtures' section. They will be injected as keyword-only arguments to the function.", - "examples": [["db"], ["db", "cache"], ["redis"]], + "examples": [ + [ + "db" + ], + [ + "db", + "cache" + ], + [ + "redis" + ] + ], "items": { "type": "string" }, "title": "Fixture", "type": "array" }, + "serializer": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional serializer registry key from top-level 'serializers'. When set, this eval uses that serializer definition.", + "title": "Serializer" + }, "evals": { "additionalProperties": { "anyOf": [ - {"$ref": "#/$defs/IsInstanceCase"}, - {"$ref": "#/$defs/AssertionCase"}, - {"$ref": "#/$defs/DurationCase"}, - {"$ref": "#/$defs/ContainsInputCase"}, - {"$ref": "#/$defs/PatternMatchCase"}, - {"$ref": "#/$defs/LLMJudgeCase"} + { + "$ref": "#/$defs/IsInstanceCase" + }, + { + "$ref": "#/$defs/AssertionCase" + }, + { + "$ref": "#/$defs/DurationCase" + }, + { + "$ref": "#/$defs/ContainsInputCase" + }, + { + "$ref": "#/$defs/PatternMatchCase" + }, + { + "$ref": "#/$defs/LLMJudgeCase" + } ] }, "description": "Dictionary of evaluation rules that apply to ALL test cases. Each key is a descriptive name, value is the evaluation case. 
Use IsInstanceCase for type checks, AssertionCase for custom logic, DurationCase for performance constraints, ContainsInputCase for input containment, PatternMatchCase for regex pattern matching.", "examples": [ { - "IsInteger": {"type": "int"}, - "IsPositive": {"assertion": "output > 0"} + "IsInteger": { + "type": "int" + }, + "IsPositive": { + "assertion": "output > 0" + } }, { - "IsUppercase": {"assertion": "output.isupper()"}, - "NotEmpty": {"assertion": "len(output) > 0"}, - "TypeCheck": {"type": "str"} + "IsUppercase": { + "assertion": "output.isupper()" + }, + "NotEmpty": { + "assertion": "len(output) > 0" + }, + "TypeCheck": { + "type": "str" + } }, { - "CorrectLogic": {"assertion": "(output and input > 0) or (not output and input <= 0)"}, - "IsBoolean": {"type": "bool"} + "CorrectLogic": { + "assertion": "(output and input > 0) or (not output and input <= 0)" + }, + "IsBoolean": { + "type": "bool" + } } ], "title": "Evals", @@ -152,17 +224,58 @@ "description": "List of test cases. Each case has input, expected output, and optional constraints. 
Should cover normal cases, edge cases, and corner cases.", "examples": [ [ - {"case": {"expected": 4, "input": 2}}, - {"case": {"expected": 0, "input": 0}}, - {"case": {"expected": 9, "input": -3}} + { + "case": { + "expected": 4, + "input": 2 + } + }, + { + "case": { + "expected": 0, + "input": 0 + } + }, + { + "case": { + "expected": 9, + "input": -3 + } + } ], [ - {"case": {"expected": "HELLO", "input": "hello"}}, - {"case": {"expected": "WORLD", "input": "world"}} + { + "case": { + "expected": "HELLO", + "input": "hello" + } + }, + { + "case": { + "expected": "WORLD", + "input": "world" + } + } ], [ - {"case": {"expected": 5, "input": {"x": 2, "y": 3}}}, - {"case": {"expected": 30, "input": {"x": 10, "y": 20}}} + { + "case": { + "expected": 5, + "input": { + "x": 2, + "y": 3 + } + } + }, + { + "case": { + "expected": 30, + "input": { + "x": 10, + "y": 20 + } + } + } ] ], "items": { @@ -173,7 +286,10 @@ "type": "array" } }, - "required": ["dataset"], + "required": [ + "id", + "dataset" + ], "title": "Evals", "type": "object" }, @@ -181,47 +297,69 @@ "description": "Definition of a single fixture with setup/teardown lifecycle.", "properties": { "setup": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Import path to setup function (e.g., 'fixtures.create_db'). Required if 'cls' is not specified.", "title": "Setup" }, "cls": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Import path to class (e.g., 'myapp.Database'). 
Class will be instantiated with args/kwargs.", "title": "Cls" }, "args": { - "description": "Positional arguments to pass to class constructor (used with 'cls')", + "description": "Positional arguments unpacked into the callable: setup_func(*args) or MyClass(*args)", "items": {}, "title": "Args", "type": "array" }, "kwargs": { "additionalProperties": true, - "description": "Keyword arguments to pass to class constructor (used with 'cls')", + "description": "Keyword arguments unpacked into the callable: setup_func(**kwargs) or MyClass(**kwargs)", "title": "Kwargs", "type": "object" }, "teardown": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Import path to teardown function (e.g., 'fixtures.drop_db'). Can also be a class method (e.g., 'Connection.close') which will be called on the instance.", "title": "Teardown" }, "scope": { "default": "function", - "description": "Lifecycle scope: 'function' (per case), 'module' (per eval), or 'session' (per run)", - "enum": ["function", "module", "session"], + "description": "Fixture lifecycle scope. Preferred names: 'case', 'eval', 'file'. Compatibility aliases are accepted: 'function', 'module', 'session'. Current runtime normalization maps case->function, eval->module, file->session.", + "enum": [ + "case", + "eval", + "file", + "function", + "module", + "session" + ], "title": "Scope", "type": "string" - }, - "params": { - "additionalProperties": true, - "description": "Parameters to pass to the setup function", - "title": "Params", - "type": "object" } }, "title": "FixtureDefinition", @@ -232,18 +370,35 @@ "properties": { "type": { "description": "Python type as string to check against. 
Can use union types with '|'.", - "examples": ["int", "str", "bool", "list", "dict", "int | float", "str | None"], + "examples": [ + "int", + "str", + "bool", + "list", + "dict", + "int | float", + "str | None" + ], "title": "Type", "type": "string" }, "strict": { - "anyOf": [{"type": "boolean"}, {"type": "null"}], + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], "default": null, "description": "Whether to use strict mode for type validation. When True, performs stricter type checking.", "title": "Strict" } }, - "required": ["type"], + "required": [ + "type" + ], "title": "IsInstanceCase", "type": "object" }, @@ -262,7 +417,18 @@ }, "include": { "description": "List of context variables to include in the evaluation. Valid options: 'input', 'expected_output'.", - "examples": [["input"], ["expected_output"], ["input", "expected_output"]], + "examples": [ + [ + "input" + ], + [ + "expected_output" + ], + [ + "input", + "expected_output" + ] + ], "items": { "type": "string" }, @@ -276,7 +442,9 @@ "type": "object" } }, - "required": ["rubric"], + "required": [ + "rubric" + ], "title": "LLMJudgeCase", "type": "object" }, @@ -285,61 +453,181 @@ "description": "Test case with input, expected output, and optional constraints.", "properties": { "id": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Optional unique identifier for this test case.", - "examples": ["test_positive_numbers", "edge_case_empty_list", "error_invalid_input"], + "examples": [ + "test_positive_numbers", + "edge_case_empty_list", + "error_invalid_input" + ], "title": "Id" }, "input": { - "anyOf": [{}, {"type": "null"}], + "anyOf": [ + {}, + { + "type": "null" + } + ], "default": null, "description": "Single input value to pass to the function as the only argument. Use this when the function takes a single argument. 
Cannot be used together with 'inputs'.", - "examples": [5, "hello", [1, 2, 3], {"x": 10, "y": 20}, {"name": "test", "value": 42}], + "examples": [ + 5, + "hello", + [ + 1, + 2, + 3 + ], + { + "x": 10, + "y": 20 + }, + { + "name": "test", + "value": 42 + } + ], "title": "Input" }, "inputs": { "anyOf": [ - {"items": {}, "type": "array"}, - {"additionalProperties": true, "type": "object"}, - {"type": "null"} + { + "items": {}, + "type": "array" + }, + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } ], "default": null, "description": "Multiple input values to pass to the function as separate arguments (*args). Use this when the function takes multiple arguments. Cannot be used together with 'input'.", - "examples": [[1, 2], [10, 20, 30], ["hello", "world"], [{"x": 1}, {"y": 2}]], + "examples": [ + [ + 1, + 2 + ], + [ + 10, + 20, + 30 + ], + [ + "hello", + "world" + ], + [ + { + "x": 1 + }, + { + "y": 2 + } + ] + ], "title": "Inputs" }, "expected": { "description": "Expected output value. If provided, output will be compared for equality. 
Use `null` to expect None.", - "examples": [25, "HELLO", [1, 3, 5], true, {"result": 30}, null], + "examples": [ + 25, + "HELLO", + [ + 1, + 3, + 5 + ], + true, + { + "result": 30 + }, + null + ], "title": "Expected" }, "duration": { - "anyOf": [{"exclusiveMinimum": 0, "type": "number"}, {"type": "null"}], + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], "default": null, "description": "Maximum allowed execution time in milliseconds for this specific case.", - "examples": [100, 500, 1000, 50], + "examples": [ + 100, + 500, + 1000, + 50 + ], "title": "Duration" }, "contains": { - "anyOf": [{}, {"type": "null"}], + "anyOf": [ + {}, + { + "type": "null" + } + ], "default": null, "description": "Value that should be contained in the output.", - "examples": ["substring", 42, "expected_key"], + "examples": [ + "substring", + 42, + "expected_key" + ], "title": "Contains" }, "assertion": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Optional case-specific Python assertion expression. 
Same as global assertions but only for this case.\nAvailable variables: input, output, expected, duration, metadata.\nExamples: 'output > 0', 'len(output) == 3', 'output == input * 2'", - "examples": ["output > 0", "len(output) == 3", "output % 2 == 0", "output in input"], + "examples": [ + "output > 0", + "len(output) == 3", + "output % 2 == 0", + "output in input" + ], "title": "Assertion" }, "pattern": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Optional regex pattern to match against the output (converted to string) for this specific case.", - "examples": ["^\\d+$", "^[A-Z]+$", ".*@.*\\.com$"], + "examples": [ + "^\\d+$", + "^[A-Z]+$", + ".*@.*\\.com$" + ], "title": "Pattern" }, "case_sensitive": { @@ -349,17 +637,43 @@ "type": "boolean" }, "raises": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Expected exception type for this case. If specified, the test expects the function to raise this exception. Append '?' for optional raises (e.g., 'TypeError?') — passes if the exception is raised OR if the function returns normally.", - "examples": ["ValueError", "TypeError", "KeyError", "ZeroDivisionError", "TypeError?"], + "examples": [ + "ValueError", + "TypeError", + "KeyError", + "ZeroDivisionError", + "TypeError?" + ], "title": "Raises" }, "type": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Expected output type for this specific case. 
Can be a simple type name or a complex type annotation.", - "examples": ["int", "str", "list[int]", "dict[str, Any]", "Optional[int]"], + "examples": [ + "int", + "str", + "list[int]", + "dict[str, Any]", + "Optional[int]" + ], "title": "Type" }, "strict_type": { @@ -369,10 +683,21 @@ "type": "boolean" }, "match": { - "anyOf": [{"type": "string"}, {"type": "null"}], + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": null, "description": "Optional regex pattern to match against the exception message (only used if raises is specified).", - "examples": ["invalid input", "must be positive", "not found"], + "examples": [ + "invalid input", + "must be positive", + "not found" + ], "title": "Match" } }, @@ -384,7 +709,12 @@ "properties": { "pattern": { "description": "Regular expression pattern to match against the output (converted to string).", - "examples": ["^\\d+$", "^[A-Z]+$", ".*@.*\\.com$", "id: \\d+"], + "examples": [ + "^\\d+$", + "^[A-Z]+$", + ".*@.*\\.com$", + "id: \\d+" + ], "title": "Pattern", "type": "string" }, @@ -395,9 +725,227 @@ "type": "boolean" } }, - "required": ["pattern"], + "required": [ + "pattern" + ], "title": "PatternMatchCase", "type": "object" + }, + "SerializerSpec": { + "additionalProperties": false, + "description": "Serializer registry entry for YAML-native serializer configuration.", + "properties": { + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Schema converter path(s). 
Use a single import path string for direct mode, or a mapping of parameter name to import path for nested mode.", + "title": "Schema" + }, + "serializer": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Import path to custom serializer function (serial_fn mode).", + "title": "Serializer" + } + }, + "title": "SerializerSpec", + "type": "object" + }, + "EvalsMapValue": { + "additionalProperties": false, + "description": "Function evaluation specification keyed by function import path/name. Contains fixture dependencies, global evaluators (`evals`), and dataset cases.", + "properties": { + "id": { + "description": "Function name to evaluate. Must match the actual function name.", + "examples": [ + "is_prime", + "calculate_sum", + "process_data", + "validate_email" + ], + "title": "Id", + "type": "string" + }, + "fixture": { + "description": "List of fixture names this function depends on. Fixtures must be defined in the top-level 'fixtures' section. They will be injected as keyword-only arguments to the function.", + "examples": [ + [ + "db" + ], + [ + "db", + "cache" + ], + [ + "redis" + ] + ], + "items": { + "type": "string" + }, + "title": "Fixture", + "type": "array" + }, + "serializer": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional serializer registry key from top-level 'serializers'. When set, this eval uses that serializer definition.", + "title": "Serializer" + }, + "evals": { + "additionalProperties": { + "anyOf": [ + { + "$ref": "#/$defs/IsInstanceCase" + }, + { + "$ref": "#/$defs/AssertionCase" + }, + { + "$ref": "#/$defs/DurationCase" + }, + { + "$ref": "#/$defs/ContainsInputCase" + }, + { + "$ref": "#/$defs/PatternMatchCase" + }, + { + "$ref": "#/$defs/LLMJudgeCase" + } + ] + }, + "description": "Dictionary of evaluation rules that apply to ALL test cases. Each key is a descriptive name, value is the evaluation case. 
Use IsInstanceCase for type checks, AssertionCase for custom logic, DurationCase for performance constraints, ContainsInputCase for input containment, PatternMatchCase for regex pattern matching.", + "examples": [ + { + "IsInteger": { + "type": "int" + }, + "IsPositive": { + "assertion": "output > 0" + } + }, + { + "IsUppercase": { + "assertion": "output.isupper()" + }, + "NotEmpty": { + "assertion": "len(output) > 0" + }, + "TypeCheck": { + "type": "str" + } + }, + { + "CorrectLogic": { + "assertion": "(output and input > 0) or (not output and input <= 0)" + }, + "IsBoolean": { + "type": "bool" + } + } + ], + "title": "Evals", + "type": "object" + }, + "dataset": { + "description": "List of test cases. Each case has input, expected output, and optional constraints. Should cover normal cases, edge cases, and corner cases.", + "examples": [ + [ + { + "case": { + "expected": 4, + "input": 2 + } + }, + { + "case": { + "expected": 0, + "input": 0 + } + }, + { + "case": { + "expected": 9, + "input": -3 + } + } + ], + [ + { + "case": { + "expected": "HELLO", + "input": "hello" + } + }, + { + "case": { + "expected": "WORLD", + "input": "world" + } + } + ], + [ + { + "case": { + "expected": 5, + "input": { + "x": 2, + "y": 3 + } + } + }, + { + "case": { + "expected": 30, + "input": { + "x": 10, + "y": 20 + } + } + } + ] + ], + "items": { + "$ref": "#/$defs/DatasetCase" + }, + "minItems": 1, + "title": "Dataset", + "type": "array" + } + }, + "required": [ + "dataset" + ], + "title": "Function", + "type": "object" } } -} \ No newline at end of file +}