diff --git a/.gitignore b/.gitignore index 65d1fa7e..1ead0904 100644 --- a/.gitignore +++ b/.gitignore @@ -214,4 +214,9 @@ environments/healthbench/test* .vscode/ pyrightconfig.json - +.claude +.codex +.devcontainer +plans/ +verifiers/ +.gitmodules \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 4c204e41..a32bbc6c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,7 +4,7 @@ - `medarc_verifiers/`: Core Python package (CLI entrypoints, parsers, rewards, orchestration utilities). - `environments//`: Individual Verifiers environments (each is a small Python package with `.py` and its own `pyproject.toml`). -- `configs/`: YAML configs for `medarc-eval bench` (job matrices, env configs, judge configs). +- `configs/`: TOML configs for `medarc-eval bench`, endpoint registries, and environment/judge configs. - `docs/`: Usage docs for `medarc-eval` and related workflows. - `tests/`: `pytest` suite. @@ -12,12 +12,12 @@ - **IMPORTANT: Read `docs/medarc-verifiers-architecture.md` before writing or modifying any code.** - Quick workflow: eval → process → winrate - - raw outputs: `runs/raw//...` + - eval outputs: `runs/evals////...` - processed parquet: `runs/processed//.parquet` + `runs/processed/env_index.json` - - winrate outputs: `runs/winrate/latest.json` and `runs/winrate/latest.csv` + - winrate outputs: `runs/processed/winrate/latest.json` and `runs/processed/winrate/latest.csv` - `medarc-eval` CLI entrypoint/router: (`medarc_verifiers/cli/main.py`; docs: `docs/medarc-eval.md`) - `medarc-orchestrate` CLI entrypoint: (`medarc_verifiers/orchestrate/cli.py`; docs: `docs/medarc-orchestrate.md`) -- Batch resume/restart state lives in `runs/raw//run_manifest.json` +- Old YAML-runner `runs/raw` artifacts must be converted with `scripts/convert_legacy_raw_runs.py` before processing. - Environment `load_environment()` params become CLI flags (see `medarc-eval --help`). 
- Environment authoring utilities (used by `environments/*`): - parsing/prompts: `medarc_verifiers/parsers/`, `medarc_verifiers/prompts.py` (XML preferred; BOXED supported) @@ -32,7 +32,7 @@ - `uv pip install -e .`: Install `medarc-verifiers` in editable mode. - `vf-install `: Install an environment from `environments//` in editable mode. - `uv run medarc-eval -m -n 5`: Run a small evaluation. -- `uv run medarc-eval bench --config configs/job.yaml`: Run a batch evaluation from a YAML config. +- `uv run medarc-eval bench --config configs/medmarks-smoke.toml`: Run a batch evaluation from a TOML config. - `uv run pytest tests/`: Run the full test suite. - `uv run ruff check medarc_verifiers/ && uv run ruff format medarc_verifiers/`: Lint/format. diff --git a/README.md b/README.md index f98c7d13..2a66fbda 100644 --- a/README.md +++ b/README.md @@ -1,206 +1,167 @@ -# MedARC Medical Language Model Environments +# Medmarks -This repository is used to build verifiers environments and tools for the MedARC medical language model project. +[![Website](https://img.shields.io/badge/website-medmarks.ai-0f766e)](https://medmarks.ai) +[![arXiv](https://img.shields.io/badge/arXiv-2605.01417-b31b1b.svg)](https://arxiv.org/abs/2605.01417) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) +[![Python](https://img.shields.io/badge/python-3.12-blue.svg)](pyproject.toml) -It also contains the medarc-verifiers package, which provides additional tools for creating verifiers environments. +Open-source LLM benchmark suite for medical tasks. -## Getting Started with Verifiers Environments +[medmarks.ai](https://medmarks.ai) | [arXiv:2605.01417](https://arxiv.org/abs/2605.01417) -The steps below guide you through creating a new environment package under `environments/[my-new-env]`, installing it locally, testing it with Verifiers tooling, and optionally publishing it through Prime Intellect's Environments Hub. 
+Medmarks is a comprehensive benchmark suite for evaluating medical capabilities in large language models. It includes 30 open-source benchmarks spanning question answering, information extraction, consumer health questions, clinical reasoning, EHR interactions, medical calculations, and open-ended medical tasks. -### 1. Prerequisites -- Python 3.11 or 3.12 -- [`uv`](https://docs.astral.sh/uv/) for dependency management -- The [`prime` CLI](https://github.com/PrimeIntellect-ai/prime-cli) for scaffolding and publishing -- An OpenAI-compatible API key (export it as `OPENAI_API_KEY`) or OpenAI compatible model for testing the environment with `vf-eval` +This repository contains the runnable benchmark environments, evaluation configs, result processing tools, and win-rate analysis pipeline used for Medmarks. It also contains the [`medarc_verifiers` Python library](docs/README.md), which provides the shared CLI, parsers, rewards, judging utilities, and orchestration helpers used by the benchmark environments. -### 2. Setup +## Benchmark Suite -Create and activate a virtual environment, then install the required tooling: +Medmarks is organized into three practical subsets: -```bash -uv venv --python 3.12 -source .venv/bin/activate -uv tool install prime -uv pip install verifiers -``` +| Subset | Description | +|--------|-------------| +| Medmarks-V | Verifiable tasks, including multiple-choice QA and other tasks with deterministic or programmatic grading | +| Medmarks-OE | Open-ended tasks evaluated with LLM-as-a-Judge | +| Medmarks-T | Experimental training-capable environments with train/test splits for post-training and RL experiments | -After this setup the `prime env`, `vf-install`, and `vf-eval` commands will be available (or runnable via `uv run `). +The benchmark suite is implemented as [verifiers](https://github.com/primeintellect-ai/verifiers) environments under [`environments/`](environments/). The main runnable suite configs are: -### 3. 
Create a New Environment -Always place new Verifiers packages inside `environments/my-new-env`. The Prime CLI ensures this by default: +| Config | Purpose | +|--------|---------| +| [`configs/medmarks-verified.toml`](configs/medmarks-verified.toml) | Medmarks-V suite | +| [`configs/medmarks-open_ended.toml`](configs/medmarks-open_ended.toml) | Medmarks-OE suite | +| [`configs/medmarks-endpoints.toml`](configs/medmarks-endpoints.toml) | Portable model aliases and sampling defaults for Medmarks runs | +| [`configs/medmarks-smoke.toml`](configs/medmarks-smoke.toml) | Small Medmarks-V sanity-check run | -```bash -# from the repository root -prime env init my-new-env -``` +## Quick Start -The template produces: -``` -environments/my_new_env/ -├── my_new_env.py -├── pyproject.toml -└── README.md -``` - -Edit `my_new_env.py` to configure datasets, parsers, and rubrics, and update the package metadata in `pyproject.toml` (name, version, dependencies, tags, etc.). - -If the `prime env init` command doesn't add it, you'll want to add the following prime env metadata so prime/verifiers knows where the environment is in a flat repo: - -```toml -[tool.prime.environment] -loader = "my_new_env:load_environment" -display_name = "My New Env" -visibility = "PUBLIC" +```bash +uv venv --python 3.12 +source .venv/bin/activate +uv sync ``` -### 4. Install the Environment for Local Development -Install your new environment in editable mode so changes are picked up immediately: +Run a single benchmark: ```bash -vf-install my-new-env -# equivalent to: -# uv pip install -e ./environments/my_new_env +uv run medarc-eval medqa -m openai/gpt-4.1-mini -n 25 ``` -You can now import it from Python or let Verifiers discover it with `verifiers.load_environment("my-new-env")`. - -### 5. Smoke-Test with `vf-eval` -Run a small batch of rollouts to confirm the environment behaves as expected. 
Set `OPENAI_API_KEY` (or whichever OpenAI client compatible credentials you plan to use) before invoking the CLI. +Run a Medmarks suite config: ```bash -export OPENAI_API_KEY=sk-... -vf-eval my-new-env -m gpt-4.1-mini -n 5 -s +uv run medarc-eval bench --config configs/medmarks-verified.toml ``` -A few useful arguments: - -- -m selects the inference model -- -n controls dataset size -- -s saves results locally. - -Use vf-eval -h for the full set of options (rollouts per example, max concurrency, etc.) - -During development you can iterate quickly by tweaking prompts, parser logic, or reward functions, reinstalling with `vf-install` if dependencies change, and rerunning `vf-eval` to view the results. - -After running with `-s`, inspect saved runs with `vf-tui`, which provides a terminal UI for browsing prompts, completions, and rewards under the generated `outputs/evals` folders. - -## Using an Existing MedARC Environment - -Once your tooling is set up you can install MedARC-maintained environments directly from the Prime Environments Hub (for example [`medarc/medcasereasoning`](https://app.primeintellect.ai/dashboard/environments/medarc/medcasereasoning) or [`medarc/metamedqa`](https://app.primeintellect.ai/dashboard/environments/medarc/metamedqa)). - -- **Install from the Hub.** Run `prime env install medarc/medcasereasoning` to pull the latest published version (add `@version` to pin a release). -- **Run an evaluation.** Execute `vf-eval medcasereasoning -m gpt-4.1-mini -n 10 -s` to generate a small batch of rollouts. -- **Load programmatically.** Environments installed via the Hub are importable like any other Verifiers module: - - ```python - import verifiers as vf - - env = vf.load_environment("medcasereasoning", split="validation") - results = env.evaluate(model_client, "gpt-4.1-mini", num_examples=5) - ``` - -## medarc-eval CLI - -`medarc-eval` wraps the upstream `vf-eval` flow, adding environment-specific flags and batch orchestration. 
See [full documentation](docs/medarc-eval.md). - -| Command | Description | -|---------|-------------| -| [`medarc-eval `](docs/medarc-eval-single-run.md) | Run a single benchmark with auto-discovered environment flags | -| [`medarc-eval bench`](docs/medarc-eval-bench.md) | Run multiple benchmarks from a YAML config with resume support | -| [`medarc-eval process`](docs/medarc-eval-process.md) | Convert raw outputs to parquet for analysis | -| [`medarc-eval winrate`](docs/medarc-eval-winrate.md) | Compute HELM-style win rates across models | - -### Quick Start +Run a Medmarks suite with one of the published model aliases: ```bash -# Run a single benchmark -uv run medarc-eval medqa -m gpt-4.1-mini -n 25 - -# Run batch evaluations from config -uv run medarc-eval bench --config configs/job-gpt-oss-20b.yaml - -# Process results and compute win rates -uv run medarc-eval process -uv run medarc-eval winrate +uv run medarc-eval bench \ + --config configs/medmarks-verified.toml \ + --endpoints-path configs/medmarks-endpoints.toml \ + -m gpt-oss-20b-low \ + --api-base-url https://api.pinference.ai/api/v1 \ + --api-key-var PRIME_API_KEY ``` -### Environment-Specific Flags +[`configs/medmarks-endpoints.toml`](configs/medmarks-endpoints.toml) is an alias registry, not a deployment config. It maps names such as `gpt-oss-20b-low` or `medgemma-27b-text` to provider model IDs, client types, and model-specific sampling defaults. It intentionally omits `url`, `key`, and `max_concurrent`; supply those with `--provider` or with `--api-base-url` and `--api-key-var` for your deployment. The gpt-oss aliases use the Verifiers `openai_responses` client type. 
-Each environment's `load_environment()` parameters become CLI flags automatically: +Preview the resolved jobs before running: ```bash -# Discover available flags -uv run medarc-eval longhealth --help - -# Use environment-specific options -uv run medarc-eval longhealth --task task1 --shuffle-answers -m gpt-4.1-mini -n 10 +uv run medarc-eval bench \ + --config configs/medmarks-verified.toml \ + --endpoints-path configs/medmarks-endpoints.toml \ + -m gpt-oss-20b-low \ + --api-base-url https://api.pinference.ai/api/v1 \ + --api-key-var PRIME_API_KEY \ + --dry-run ``` -For complex arguments (dicts, nested structures), use `--env-args`: +Preview the same alias against a local vLLM server exposing an OpenAI-compatible API (remove `--dry-run` to execute the run): ```bash -uv run medarc-eval careqa --env-args '{"split": "open", "judge_model": "gpt-4o"}' +VLLM_API_KEY=local-key uv run medarc-eval bench \ + --config configs/medmarks-verified.toml \ + --endpoints-path configs/medmarks-endpoints.toml \ + -m gpt-oss-20b-low \ + --api-base-url http://127.0.0.1:8000/v1 \ + --api-key-var VLLM_API_KEY \ + --dry-run ``` -## Batch Evaluations - -Use `medarc-eval bench` to run multiple model × environment evaluations from a config file. See [full batch mode documentation](docs/medarc-eval-bench.md). - -```yaml -name: gpt-oss-20b-med - -models: - gpt-oss-20b: - model: openai/gpt-oss-20b - api_base_url: http://localhost:8000/v1 - sampling_args: - temperature: 1.0 - reasoning_effort: medium - -jobs: - - model: gpt-oss-20b - env: [m_arc, medcalc_bench, medxpertqa] -``` +Process outputs and compute win rates: ```bash -# Run the batch -uv run medarc-eval bench --config configs/job-gpt-oss-20b.yaml - -# Preview without executing -uv run medarc-eval bench --config configs/job-gpt-oss-20b.yaml --dry-run +uv run medarc-eval process --runs-dir runs/evals +uv run medarc-eval winrate ``` -Batch mode supports automatic resume, job manifests, and matrix sweeps for parameter grids. 
See the [batch mode documentation](docs/medarc-eval-bench.md) for config file format, resume/restart options, and advanced features. - -### Matrix Sweeps - -Environment configs support matrix expansion for parameter grid runs: - -```yaml -- id: medconceptsqa - module: medconceptsqa - num_examples: -1 - env_args: - shuffle_answers: true - matrix: - difficulty: [easy, medium, hard] - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-{difficulty}-s{shuffle_seed}" +Evaluation outputs are written under `runs/evals/`, processed parquet files under `runs/processed/`, and win-rate summaries under `runs/processed/winrate/`. + +## Documentation + +| Page | Description | +|------|-------------| +| [`docs/developer-guide.md`](docs/developer-guide.md) | Developer setup, environment authoring, and local workflow | +| [`docs/medarc-eval.md`](docs/medarc-eval.md) | Full `medarc-eval` CLI documentation | +| [`docs/medarc-eval-bench.md`](docs/medarc-eval-bench.md) | TOML benchmark suite execution | +| [`docs/medarc-eval-process.md`](docs/medarc-eval-process.md) | Processing eval outputs into parquet | +| [`docs/medarc-eval-winrate.md`](docs/medarc-eval-winrate.md) | HELM-style win-rate computation | +| [`docs/medarc-orchestrate.md`](docs/medarc-orchestrate.md) | Running local vLLM benchmark jobs with Docker or Slurm/Pyxis | + +## Datasets + +`--` indicates no dedicated training split. `Not specified` means we found no explicit dataset license in the dataset source. Evaluated counts reflect the effective Medmarks evaluation split or configured subset; MedDialog is intentionally capped at the first 2,500 examples. + +| Dataset | Description | License / terms | # Evaluated | # Training | +|---------|-------------|-----------------|------------:|-----------:| +| **Medmarks-V (Verifiable)** | | | | | +| CareQA | Healthcare QA exam questions with multiple-choice reasoning questions, English subset. 
| Apache-2.0 | 5,621 | -- | +| HEAD-QA v2 | Extended healthcare questions spanning 10 years of Spanish professional exams, English subset. | MIT | 12,751 | -- | +| LongHealth | Long-context synthetic patient cases with information extraction and sorting tasks, task1 and task2 splits. | Apache-2.0 | 1,200 | -- | +| M-ARC | Long-tail medical questions designed to test model resistance to inflexible clinical reasoning patterns. | Apache-2.0 | 100 | -- | +| Med-HALT | Clinical reasoning hallucination detection via false-confidence tests and "none of the above" recognition. | Apache-2.0 | 22,152 | -- | +| MedCalc-Bench | Clinical calculator questions evaluating medical computation and formula application skills. | CC-BY-SA-4.0 | 1,100 | 10,538 | +| MedConceptsQA | Multiple-choice questions on medical coding systems (e.g., ICD-9 and ICD-10); only ICD-10-CM subsamples are evaluated. | Not specified | 6,000 | -- | +| Medbullets | USMLE Step 2 and Step 3 style clinical reasoning questions sourced from social media. | Not specified | 308 | -- | +| MedHallu | Medical hallucination detection benchmark with four domain-specific error categories derived from the PubMedQA dataset. | MIT | 2,000 | -- | +| MedMCQA | Multiple-choice questions from Indian medical entrance exams across 21 medical subjects. | Apache-2.0 | 4,183 | 182,822 | +| MedQA | Multiple-choice questions from USMLE medical licensing exams. | CC-BY-4.0 | 1,273 | 10,178 | +| MedXpertQA | High-difficulty multiple-choice questions with ~10 options across 17 specialties to evaluate expert-level medical knowledge, text subset. | MIT | 2,450 | -- | +| MetaMedQA | Questions testing a model's awareness and recognition of unanswerable medical queries using uncertainty options. | CC-BY-4.0 | 1,373 | -- | +| MMLU-Pro-Health | Health subset of MMLU-Pro benchmark featuring general health-related questions with up to 10 answer options per question. 
| MIT | 818 | -- | +| PubHealthBench | Multiple-choice questions derived from UK government public health guidance documents, reviewed subset. | CC-BY-4.0 | 760 | -- | +| PubMedQA | Yes/no/maybe question answering requiring reasoning over biomedical research abstracts, labeled subset. | MIT | 500 | 211,269 | +| SCTPublic | Script Concordance Tests evaluating clinical reasoning under diagnostic uncertainty. | MIT | 174 | -- | +| SuperGPQA-Med | Graduate-level questions spanning 6 medical fields, easy and hard difficulty subsets. | ODC-BY | 1,126 | -- | +| **Medmarks-OE (Open-Ended)** | | | | | +| ACI-Bench | Clinical dialogue transcripts paired with corresponding structured clinical notes. | CC-BY-4.0 | 210 | 114 | +| AgentClinic | Multimodal multi-agent OSCE-style clinical dialogues for interactive diagnostic reasoning evaluation. | MIT | 214 | -- | +| CareQA Open | Healthcare QA exam questions with open-ended reasoning questions, English subset. | Apache-2.0 | 2,769 | -- | +| HealthBench | Multi-turn healthcare conversations evaluated using physician-written scoring rubrics. | MIT | 5,000 | -- | +| MedAgentBench v2 | Agentic electronic health record tasks requiring FHIR API interactions. | Not specified; V1 MIT | 600 | -- | +| MedCaseReasoning | Diagnostic QA with clinician-authored reasoning traces from clinical case reports. | MIT | 500 | 13,092 | +| MedDialog | Large-scale patient-doctor conversations for medical dialogue generation and understanding; Medmarks evaluates a small subsample. | Not specified | 2,500 | 205,973 | +| MedExQA | Questions with dual expert explanations across 5 underrepresented medical specialties. | CC-BY-NC-SA-4.0 | 940 | -- | +| MedicationQA | Consumer-style medication questions with expert-validated answers from MedlinePlus. | CC-BY-4.0 | 690 | -- | +| MEDEC | Medical dataset for clinical error detection, extraction, and correction in synthetic medical notes. 
| CC-BY-4.0 | 597 | 2,189 | +| MedR-Bench | Clinical reasoning benchmark with step-by-step diagnostic and treatment planning traces on rare disease cases. | CC-BY-SA-4.0 | 1,453 | -- | +| MTSamples | Transcribed medical operative notes and reports evaluating models on procedural summaries and clinically appropriate treatment plans. | Not specified | 559 | -- | + +## Citation + +```bibtex +@misc{warner2026medmarkscomprehensiveopensourcellm, + title={Medmarks: A Comprehensive Open-Source LLM Benchmark Suite for Medical Tasks}, + author={Benjamin Warner and Ratna Sagari Grandhi and Max Kieffer and Aymane Ouraq and Saurav Panigrahi and Geetu Ambwani and Kunal Bagga and Nikhil Khandekar and Arya Hariharan and Nishant Mishra and Manish Ram and Shamus Sim Zi Yang and Ahmed Essouaied and Adepoju Jeremiah Moyondafoluwa and Robert Scholz and Bofeng Huang and Molly Beavers and Srishti Gureja and Anish Mahishi and Sameed Khan and Maxime Griot and Hunar Batra and Jean-Benoit Delbrouck and Siddhant Bharadwaj and Ronald Clark and Ashish Vashist and Anas Zafar and Leema Krishna Murali and Harsh Deshpande and Ameen Patel and William Brown and Johannes Hagemann and Connor Lane and Paul Steven Scotti and Tanishq Mathew Abraham}, + year={2026}, + eprint={2605.01417}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2605.01417}, +} ``` -This expands into six variants (`medconceptsqa-base-easy-s1618`, …). See [batch mode docs](docs/medarc-eval-bench.md) for full details on matrix expansion, exclusions, and split config files. 
- -## Processing and Win Rates - -After running benchmarks, convert results to parquet and compute model comparisons: - -```bash -# Process raw outputs to parquet -uv run medarc-eval process - -# Compute HELM-style win rates -uv run medarc-eval winrate -``` +## License -See [processing documentation](docs/medarc-eval-process.md) and [win rate documentation](docs/medarc-eval-winrate.md) for configuration options, HuggingFace integration, and output formats. +Medmarks code in this repository is released under the [MIT License](LICENSE). Individual benchmark datasets may have their own licenses or terms of use; consult the corresponding dataset sources and environment documentation before redistribution or commercial use. diff --git a/configs/README.md b/configs/README.md new file mode 100644 index 00000000..6d91fc41 --- /dev/null +++ b/configs/README.md @@ -0,0 +1,50 @@ +# MedARC Eval TOML Configs + +These configs use upstream `verifiers` TOML semantics. Repeated `env_id` entries +and `[[ablation]]` sweeps intentionally keep the upstream environment id stable; +`medarc-eval bench` writes deterministic variant directories for differing +`env_args` and `sampling_args`. + +```bash +medarc-eval bench --config configs/medmarks-smoke.toml --dry-run +medarc-eval bench --config configs/medmarks-verified.toml +medarc-eval process --runs-dir runs/evals --output-dir runs/processed +``` + +Use `medmarks-endpoints.toml` when you want one of the Medmarks model aliases +and its sampling defaults: + +```bash +medarc-eval bench \ + --config configs/medmarks-verified.toml \ + --endpoints-path configs/medmarks-endpoints.toml \ + -m gpt-oss-20b-low \ + --api-base-url https://api.pinference.ai/api/v1 \ + --api-key-var PRIME_API_KEY \ + --dry-run +``` + +`medmarks-endpoints.toml` is a portable alias registry. It maps endpoint IDs to +model IDs, client types, and sampling defaults, but intentionally omits `url`, +`key`, and `max_concurrent` because those are deployment-specific. 
Supply those +settings with `--provider` or with `--api-base-url` and `--api-key-var`. +The gpt-oss aliases use the Verifiers `openai_responses` client type. + +For a local vLLM server exposing an OpenAI-compatible API, keep using the same +alias registry and override only the deployment settings: + +```bash +VLLM_API_KEY=local-key medarc-eval bench \ + --config configs/medmarks-verified.toml \ + --endpoints-path configs/medmarks-endpoints.toml \ + -m gpt-oss-20b-low \ + --api-base-url http://127.0.0.1:8000/v1 \ + --api-key-var VLLM_API_KEY \ + --dry-run +``` + +Per-environment `[tool.verifiers.eval]` defaults are read from editable installs +where the environment `pyproject.toml` is discoverable next to the module. Wheel +installs may ignore those defaults unless the package includes `pyproject.toml`, +so production suite configs keep explicit `num_examples` and +`rollouts_per_example` values. diff --git a/configs/endpoints.toml b/configs/endpoints.toml new file mode 100644 index 00000000..363e12ca --- /dev/null +++ b/configs/endpoints.toml @@ -0,0 +1,4 @@ +# Default upstream verifiers endpoint registry. +# +# Add [[endpoint]] entries here to resolve endpoint_id aliases. An empty registry +# is valid; provider/model defaults are used when no alias matches. 
diff --git a/configs/envs/agentclinic.yaml b/configs/envs/agentclinic.yaml deleted file mode 100644 index a152a7f7..00000000 --- a/configs/envs/agentclinic.yaml +++ /dev/null @@ -1,41 +0,0 @@ -- id: agentclinic - module: agentclinic - num_examples: -1 - verbose: false - env_args: - judge_model: - - openai/gpt-5-mini - - google/gemini-3-flash-preview - judge_base_url: https://api.pinference.ai/api/v1 - patient_model: openai/gpt-5-mini - patient_base_url: https://api.pinference.ai/api/v1 - measurement_model: openai/gpt-5-mini - measurement_base_url: https://api.pinference.ai/api/v1 - -- id: agentclinic_rollout_1 - module: agentclinic - num_examples: -1 - verbose: false - env_args: - judge_model: - - openai/gpt-5-mini - - google/gemini-3-flash-preview - judge_base_url: https://api.pinference.ai/api/v1 - patient_model: openai/gpt-5-mini - patient_base_url: https://api.pinference.ai/api/v1 - measurement_model: openai/gpt-5-mini - measurement_base_url: https://api.pinference.ai/api/v1 - -- id: agentclinic_rollout_2 - module: agentclinic - num_examples: -1 - verbose: false - env_args: - judge_model: - - openai/gpt-5-mini - - google/gemini-3-flash-preview - judge_base_url: https://api.pinference.ai/api/v1 - patient_model: openai/gpt-5-mini - patient_base_url: https://api.pinference.ai/api/v1 - measurement_model: openai/gpt-5-mini - measurement_base_url: https://api.pinference.ai/api/v1 \ No newline at end of file diff --git a/configs/envs/careqa_en.yaml b/configs/envs/careqa_en.yaml deleted file mode 100644 index c39f52da..00000000 --- a/configs/envs/careqa_en.yaml +++ /dev/null @@ -1,17 +0,0 @@ -- id: careqa_en - module: careqa - num_examples: -1 - verbose: false - env_args: - split: en - -- id: careqa_en - module: careqa - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - split: en - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-rollout{shuffle_seed}" \ No newline at end of file diff --git a/configs/envs/careqa_open.yaml 
b/configs/envs/careqa_open.yaml deleted file mode 100644 index 623259a6..00000000 --- a/configs/envs/careqa_open.yaml +++ /dev/null @@ -1,11 +0,0 @@ -- id: careqa_open - module: careqa - num_examples: -1 - verbose: false - rerun: true - env_args: - split: open - judge_model: - - openai/gpt-5-mini - - google/gemini-3-flash-preview - judge_base_url: https://api.pinference.ai/api/v1 \ No newline at end of file diff --git a/configs/envs/head_qa_v2.yaml b/configs/envs/head_qa_v2.yaml deleted file mode 100644 index 8daa0191..00000000 --- a/configs/envs/head_qa_v2.yaml +++ /dev/null @@ -1,4 +0,0 @@ -- id: head_qa_v2 - module: head_qa_v2 - num_examples: -1 - verbose: false \ No newline at end of file diff --git a/configs/envs/healthbench.yaml b/configs/envs/healthbench.yaml deleted file mode 100644 index e6ea873a..00000000 --- a/configs/envs/healthbench.yaml +++ /dev/null @@ -1,9 +0,0 @@ -- id: healthbench - module: healthbench - num_examples: -1 - verbose: false - rerun: true - env_args: - judge_model: openai/gpt-5-mini - judge_base_url: https://api.pinference.ai/api/v1 - difficulty: all \ No newline at end of file diff --git a/configs/envs/longhealth.yaml b/configs/envs/longhealth.yaml deleted file mode 100644 index 2f85f1a7..00000000 --- a/configs/envs/longhealth.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Base variants (no answer shuffling) -- id: longhealth - module: longhealth - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - doc_shuffle_seed: 2718 - matrix: - task: [task1, task2] - matrix_id_format: "{base}-{task}" - max_concurrent: 64 - -# Shuffled variants with different seeds -- id: longhealth - module: longhealth - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - shuffle_seed: 1618 - doc_shuffle_seed: 1618 - matrix: - task: [task1, task2] - matrix_id_format: "{base}-{task}-rollout1618" - max_concurrent: 64 - -# Shuffled variants with different seeds -- id: longhealth - module: longhealth - 
rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - shuffle_seed: 9331 - doc_shuffle_seed: 9331 - matrix: - task: [task1, task2] - matrix_id_format: "{base}-{task}-rollout9331" - max_concurrent: 64 diff --git a/configs/envs/m_arc.yaml b/configs/envs/m_arc.yaml deleted file mode 100644 index 0c8bb132..00000000 --- a/configs/envs/m_arc.yaml +++ /dev/null @@ -1,14 +0,0 @@ -- id: m_arc - module: m_arc - num_examples: -1 - verbose: false - -- id: m_arc - module: m_arc - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-rollout{shuffle_seed}" diff --git a/configs/envs/med_dialog.yaml b/configs/envs/med_dialog.yaml deleted file mode 100644 index de7626db..00000000 --- a/configs/envs/med_dialog.yaml +++ /dev/null @@ -1,10 +0,0 @@ -- id: med_dialog - module: med_dialog - num_examples: 2500 - verbose: false - rerun: true - env_args: - judge_model: - - openai/gpt-5-mini - - x-ai/grok-4.1-fast - judge_base_url: https://api.pinference.ai/api/v1 \ No newline at end of file diff --git a/configs/envs/med_halt.yaml b/configs/envs/med_halt.yaml deleted file mode 100644 index 3853e922..00000000 --- a/configs/envs/med_halt.yaml +++ /dev/null @@ -1,8 +0,0 @@ -- id: med_halt - module: med_halt - rollouts_per_example: 1 - num_examples: -1 - verbose: false - matrix: - question_type: [reasoning_fct, reasoning_nota] - matrix_id_format: "{base}-{question_type}" \ No newline at end of file diff --git a/configs/envs/med_mcqa.yaml b/configs/envs/med_mcqa.yaml deleted file mode 100644 index 06ccfd02..00000000 --- a/configs/envs/med_mcqa.yaml +++ /dev/null @@ -1,16 +0,0 @@ -- id: med_mcqa - module: med_mcqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - -- id: med_mcqa - module: med_mcqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: 
"{base}-rollout{shuffle_seed}" diff --git a/configs/envs/medagentbench.yaml b/configs/envs/medagentbench.yaml deleted file mode 100644 index 4b8578e0..00000000 --- a/configs/envs/medagentbench.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- id: medagentbench - module: medagentbench - num_examples: -1 - verbose: false - env_args: - fhir_api_base: http://localhost:8080/fhir/ \ No newline at end of file diff --git a/configs/envs/medagentbenchv2.yaml b/configs/envs/medagentbenchv2.yaml deleted file mode 100644 index 450c2a3f..00000000 --- a/configs/envs/medagentbenchv2.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- id: medagentbenchv2 - module: medagentbenchv2 - num_examples: -1 - verbose: false - env_args: - fhir_api_base: http://localhost:8080/fhir/ \ No newline at end of file diff --git a/configs/envs/medbullets.yaml b/configs/envs/medbullets.yaml deleted file mode 100644 index 56442158..00000000 --- a/configs/envs/medbullets.yaml +++ /dev/null @@ -1,20 +0,0 @@ -- id: medbullets - module: medbullets - rollouts_per_example: 1 - num_examples: -1 - verbose: false - matrix: - num_options: [4, 5] - matrix_id_format: "{base}-op{num_options}" - -- id: medbullets - module: medbullets - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - matrix: - num_options: [4, 5] - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-op{num_options}-rollout{shuffle_seed}" diff --git a/configs/envs/medcalc_bench.yaml b/configs/envs/medcalc_bench.yaml deleted file mode 100644 index 487a95a0..00000000 --- a/configs/envs/medcalc_bench.yaml +++ /dev/null @@ -1,21 +0,0 @@ -- id: medcalc_bench - module: medcalc_bench - rollouts_per_example: 1 - verbose: false - num_examples: -1 - env_args: - version: "1.2" - -- id: medcalc_bench_tools - module: medcalc_bench - rollouts_per_example: 1 - verbose: false - num_examples: -1 - env_args: - version: "verified" - add_python_tool: true - add_calculator_tool: true - - export: - extra_columns: [lower_bound, upper_bound] - 
answer_column: ground_truth \ No newline at end of file diff --git a/configs/envs/medcasereasoning.yaml b/configs/envs/medcasereasoning.yaml deleted file mode 100644 index d0c300ce..00000000 --- a/configs/envs/medcasereasoning.yaml +++ /dev/null @@ -1,7 +0,0 @@ -- id: medcasereasoning - module: medcasereasoning - num_examples: -1 - verbose: false - env_args: - judge_model: openai/gpt-5-nano - judge_base_url: https://api.pinference.ai/api/v1 \ No newline at end of file diff --git a/configs/envs/medconceptsqa_sample.yaml b/configs/envs/medconceptsqa_sample.yaml deleted file mode 100644 index a1e09a92..00000000 --- a/configs/envs/medconceptsqa_sample.yaml +++ /dev/null @@ -1,23 +0,0 @@ -- id: medconceptsqa_sample - module: medconceptsqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - vocab: icd10cm_sample - matrix: - difficulty: [easy, medium, hard] - matrix_id_format: "{base}-{difficulty}" - -- id: medconceptsqa_sample - module: medconceptsqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - vocab: icd10cm_sample - shuffle_answers: true - matrix: - difficulty: [easy, medium, hard] - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-{difficulty}-rollout{shuffle_seed}" diff --git a/configs/envs/medec.yaml b/configs/envs/medec.yaml deleted file mode 100644 index 3d2a5f03..00000000 --- a/configs/envs/medec.yaml +++ /dev/null @@ -1,9 +0,0 @@ -- id: medec - module: medec - num_examples: -1 - verbose: false - env_args: - judge_model: - - openai/gpt-5-mini - - google/gemini-3-flash-preview - judge_base_url: https://api.pinference.ai/api/v1 \ No newline at end of file diff --git a/configs/envs/medexqa.yaml b/configs/envs/medexqa.yaml deleted file mode 100644 index 4317830e..00000000 --- a/configs/envs/medexqa.yaml +++ /dev/null @@ -1,11 +0,0 @@ -- id: medexqa - module: medexqa - num_examples: -1 - verbose: false - rerun: true - env_args: - judge_model: - - openai/gpt-5-mini - - google/gemini-3-flash-preview - 
judge_base_url: https://api.pinference.ai/api/v1 - use_judge: true \ No newline at end of file diff --git a/configs/envs/medhallu.yaml b/configs/envs/medhallu.yaml deleted file mode 100644 index bb313c1e..00000000 --- a/configs/envs/medhallu.yaml +++ /dev/null @@ -1,8 +0,0 @@ -- id: medhallu - module: medhallu - rollouts_per_example: 1 - num_examples: -1 - verbose: false - matrix: - difficulty: [easy, medium, hard] - matrix_id_format: "{base}-{difficulty}" diff --git a/configs/envs/medicationqa.yaml b/configs/envs/medicationqa.yaml deleted file mode 100644 index 3f3d12cf..00000000 --- a/configs/envs/medicationqa.yaml +++ /dev/null @@ -1,10 +0,0 @@ -- id: medicationqa - module: medicationqa - num_examples: -1 - verbose: false - rerun: true - env_args: - judge_model: - - openai/gpt-5-mini - - x-ai/grok-4.1-fast - judge_base_url: https://api.pinference.ai/api/v1 \ No newline at end of file diff --git a/configs/envs/medqa.yaml b/configs/envs/medqa.yaml deleted file mode 100644 index a0ff9abc..00000000 --- a/configs/envs/medqa.yaml +++ /dev/null @@ -1,16 +0,0 @@ -- id: medqa - module: medqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - -- id: medqa - module: medqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-rollout{shuffle_seed}" diff --git a/configs/envs/medrbench.yaml b/configs/envs/medrbench.yaml deleted file mode 100644 index 597852ae..00000000 --- a/configs/envs/medrbench.yaml +++ /dev/null @@ -1,14 +0,0 @@ -- id: medrbench - module: medrbench - num_examples: -1 - verbose: false - env_args: - judge_model: - - openai/gpt-5-mini - - google/gemini-3-flash-preview - judge_base_url: https://api.pinference.ai/api/v1 - patient_agent_model: openai/gpt-5-mini - patient_agent_base_url: https://api.pinference.ai/api/v1 - matrix: - task: [oracle, 1turn, free_turn] - matrix_id_format: "{base}-{task}" \ No newline at end of file diff --git 
a/configs/envs/medxpertqa.yaml b/configs/envs/medxpertqa.yaml deleted file mode 100644 index 595c1b4f..00000000 --- a/configs/envs/medxpertqa.yaml +++ /dev/null @@ -1,32 +0,0 @@ -- id: medxpertqa - module: medxpertqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - matrix: - question_type: [reasoning, understanding] - matrix_id_format: "{base}-{question_type}" - -- id: medxpertqa - module: medxpertqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - shuffle_seed: 1618 - matrix: - question_type: [reasoning, understanding] - matrix_id_format: "{base}-{question_type}-rollout1618" - -- id: medxpertqa - module: medxpertqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - shuffle_seed: 9331 - matrix: - question_type: [reasoning, understanding] - matrix_id_format: "{base}-{question_type}-rollout9331" \ No newline at end of file diff --git a/configs/envs/meqsum.yaml b/configs/envs/meqsum.yaml deleted file mode 100644 index 83e0cd24..00000000 --- a/configs/envs/meqsum.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# MeQSum - Consumer Health Question Summarization -# Dataset: medarc/MeQSum-patient-consumer-health-questions - -meqsum: - env_name: meqsum - env_args: - split: test - compute_auto_metrics: true - -meqsum_val: - env_name: meqsum - env_args: - split: validation - compute_auto_metrics: true - -meqsum_fast: - env_name: meqsum - env_args: - split: test - compute_auto_metrics: false diff --git a/configs/envs/metamedqa.yaml b/configs/envs/metamedqa.yaml deleted file mode 100644 index ae12c4ea..00000000 --- a/configs/envs/metamedqa.yaml +++ /dev/null @@ -1,16 +0,0 @@ -- id: metamedqa - module: metamedqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - -- id: metamedqa - module: metamedqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: 
"{base}-rollout{shuffle_seed}" diff --git a/configs/envs/mmlu_pro_health.yaml b/configs/envs/mmlu_pro_health.yaml deleted file mode 100644 index 28b7576e..00000000 --- a/configs/envs/mmlu_pro_health.yaml +++ /dev/null @@ -1,16 +0,0 @@ -- id: mmlu_pro_health - module: mmlu_pro_health - rollouts_per_example: 1 - num_examples: -1 - verbose: false - -- id: mmlu_pro_health - module: mmlu_pro_health - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-rollout{shuffle_seed}" diff --git a/configs/envs/mtsamples.yaml b/configs/envs/mtsamples.yaml deleted file mode 100644 index d709d3d6..00000000 --- a/configs/envs/mtsamples.yaml +++ /dev/null @@ -1,19 +0,0 @@ -- id: mtsamples_procedures - module: mtsamples_procedures - verbose: false - num_examples: -1 - env_args: - judge_model: - - openai/gpt-5-mini - - x-ai/grok-4.1-fast - judge_base_url: https://api.pinference.ai/api/v1 - -- id: mtsamples_replicate - module: mtsamples_replicate - verbose: false - num_examples: -1 - env_args: - judge_model: - - openai/gpt-5-mini - - x-ai/grok-4.1-fast - judge_base_url: https://api.pinference.ai/api/v1 \ No newline at end of file diff --git a/configs/envs/pubhealthbench_free.yaml b/configs/envs/pubhealthbench_free.yaml deleted file mode 100644 index b3fcea38..00000000 --- a/configs/envs/pubhealthbench_free.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# Full test set - single unshuffled run -- id: pubhealthbench_freeform - module: pubhealthbench - num_examples: -1 - verbose: false - env_args: - split: freeform - judge_model: - - openai/gpt-5-mini - - google/gemini-3-flash-preview - judge_base_url: https://api.pinference.ai/api/v1 \ No newline at end of file diff --git a/configs/envs/pubhealthbench_mcq.yaml b/configs/envs/pubhealthbench_mcq.yaml deleted file mode 100644 index f7347e3a..00000000 --- a/configs/envs/pubhealthbench_mcq.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Reviewed set - 
shuffled with matrix -- id: pubhealthbench_reviewed - module: pubhealthbench - num_examples: -1 - verbose: false - env_args: - split: reviewed - -- id: pubhealthbench_reviewed - module: pubhealthbench - num_examples: -1 - verbose: false - env_args: - split: reviewed - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-rollout{shuffle_seed}" diff --git a/configs/envs/pubmedqa.yaml b/configs/envs/pubmedqa.yaml deleted file mode 100644 index d71c01e1..00000000 --- a/configs/envs/pubmedqa.yaml +++ /dev/null @@ -1,16 +0,0 @@ -- id: pubmedqa - module: pubmedqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - -- id: pubmedqa - module: pubmedqa - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-rollout{shuffle_seed}" diff --git a/configs/envs/sctpublic.yaml b/configs/envs/sctpublic.yaml deleted file mode 100644 index 533ab0d2..00000000 --- a/configs/envs/sctpublic.yaml +++ /dev/null @@ -1,5 +0,0 @@ -- id: sctpublic - module: sctpublic - num_examples: -1 - rollouts_per_example: 1 - verbose: false \ No newline at end of file diff --git a/configs/envs/supergpqa_medicine.yaml b/configs/envs/supergpqa_medicine.yaml deleted file mode 100644 index ba1e922a..00000000 --- a/configs/envs/supergpqa_medicine.yaml +++ /dev/null @@ -1,32 +0,0 @@ -- id: supergpqa_medicine - module: supergpqa_medicine - rollouts_per_example: 1 - num_examples: -1 - verbose: false - matrix: - difficulty: [easy, hard] - matrix_id_format: "{base}-{difficulty}" - -- id: supergpqa_medicine - module: supergpqa_medicine - rollouts_per_example: 1 - num_examples: -1 - verbose: false - env_args: - shuffle_answers: true - shuffle_seed: 1618 - matrix: - difficulty: [easy, hard] - matrix_id_format: "{base}-{difficulty}-rollout1618" - -- id: supergpqa_medicine - module: supergpqa_medicine - rollouts_per_example: 1 - num_examples: -1 - verbose: false - 
env_args: - shuffle_answers: true - shuffle_seed: 9331 - matrix: - difficulty: [easy, hard] - matrix_id_format: "{base}-{difficulty}-rollout9331" diff --git a/configs/medmarks-endpoints.toml b/configs/medmarks-endpoints.toml new file mode 100644 index 00000000..be95de32 --- /dev/null +++ b/configs/medmarks-endpoints.toml @@ -0,0 +1,942 @@ +# MedMarks model alias registry ported from configs/old model sections. +# URL, key, max_concurrent, orchestration, and job matrix settings are intentionally omitted. +# Supply deployment-specific API settings with --provider or --api-base-url/--api-key-var. + +[[endpoint]] +endpoint_id = "afm-4-5b" +model = "arcee-ai/AFM-4.5B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.5 +top_p = 0.95 +top_k = 50 + + +[[endpoint]] +endpoint_id = "ai21-jamba2-mini" +model = "ai21labs/AI21-Jamba2-Mini" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 1.0 + + +[[endpoint]] +endpoint_id = "antangelmed" +model = "MedAIBase/AntAngelMed" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 + + +[[endpoint]] +endpoint_id = "baichuan-m2" +model = "baichuan-inc/Baichuan-M2-32B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 + + +[[endpoint]] +endpoint_id = "baichuan-m3" +model = "baichuan-inc/Baichuan-M3-235B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 + + +[[endpoint]] +endpoint_id = "dasd-30b-a3b" +model = "Alibaba-Apsara/DASD-30B-A3B-Thinking-Preview" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 + + +[[endpoint]] +endpoint_id = "dasd-4b-thinking" +model = "Alibaba-Apsara/DASD-4B-Thinking" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 
1.0 + + +[[endpoint]] +endpoint_id = "deepseek-v3.2-speciale" +model = "deepseek/deepseek-v3.2-speciale" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1 +top_p = 0.95 + +[endpoint.sampling_args.extra_body] + +[endpoint.sampling_args.extra_body.usage] +include = true + + +[[endpoint]] +endpoint_id = "fiercefalcon" +model = "fiercefalcon" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +reasoning_effort = "low" +temperature = 1 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "gemini-3-pro-preview" +model = "gemini-3-pro-preview" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1 + + +[[endpoint]] +endpoint_id = "gemma-3-12b-it" +model = "google/gemma-3-12b-it" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 +top_k = 60 + + +[[endpoint]] +endpoint_id = "gemma-3-27b-it" +model = "google/gemma-3-27b-it" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 +top_k = 60 + + +[[endpoint]] +endpoint_id = "gemma-3-4b-it" +model = "google/gemma-3-4b-it" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 +top_k = 60 + + +[[endpoint]] +endpoint_id = "glm-4.5-air" +model = "zai-org/GLM-4.5-Air" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "glm-4.7-fp8" +model = "zai-org/GLM-4.7-FP8" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "glm-4_7-flash" +model = "zai-org/GLM-4.7-Flash" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 + + +# Medmarks 1.0 GPT runs predated Verifiers Responses API support. 
+# These aliases now use openai_responses to better match OpenAI GPT model behavior. +[[endpoint]] +endpoint_id = "gpt-5-nano" +model = "openai/gpt-5-nano" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +reasoning_effort = "medium" + +[endpoint.sampling_args.extra_body] + +[endpoint.sampling_args.extra_body.usage] +include = true + + +[[endpoint]] +endpoint_id = "gpt-5_2" +model = "openai/gpt-5.2" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +reasoning_effort = "medium" + +[endpoint.sampling_args.extra_body] + +[endpoint.sampling_args.extra_body.usage] +include = true + + +[[endpoint]] +endpoint_id = "gpt-5_2-high" +model = "openai/gpt-5.2" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +reasoning_effort = "high" + +[endpoint.sampling_args.extra_body] + +[endpoint.sampling_args.extra_body.usage] +include = true + + +# Medmarks 1.0 GPT-OSS runs predated Verifiers Responses API support. +# These aliases now use openai_responses to better match gpt-oss model behavior. 
+[[endpoint]] +endpoint_id = "gpt-oss-120b" +model = "openai/gpt-oss-120b" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "medium" + + +[[endpoint]] +endpoint_id = "gpt-oss-120b-high" +model = "openai/gpt-oss-120b" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "high" + + +[[endpoint]] +endpoint_id = "gpt-oss-120b-low" +model = "openai/gpt-oss-120b" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "low" + + +[[endpoint]] +endpoint_id = "gpt-oss-20b" +model = "openai/gpt-oss-20b" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "medium" + + +[[endpoint]] +endpoint_id = "gpt-oss-20b-high" +model = "openai/gpt-oss-20b" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "high" + + +[[endpoint]] +endpoint_id = "gpt-oss-20b-low" +model = "openai/gpt-oss-20b" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "low" + + +# Medmarks 1.0 GPT runs predated Verifiers Responses API support. +# These aliases now use openai_responses to better match OpenAI GPT model behavior. 
+[[endpoint]] +endpoint_id = "gpt_5_1" +model = "openai/gpt-5.1" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +reasoning_effort = "medium" + +[endpoint.sampling_args.extra_body] + +[endpoint.sampling_args.extra_body.usage] +include = true + + +[[endpoint]] +endpoint_id = "gpt_5_mini" +model = "openai/gpt-5-mini" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +reasoning_effort = "low" + +[endpoint.sampling_args.extra_body] + +[endpoint.sampling_args.extra_body.usage] +include = true + + +[[endpoint]] +endpoint_id = "granite-4-0-h-small" +model = "ibm-granite/granite-4.0-h-small" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.0 +top_p = 1.0 +top_k = 0 + + +[[endpoint]] +endpoint_id = "granite-4-0-h-tiny" +model = "ibm-granite/granite-4.0-h-tiny" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.0 +top_p = 1.0 +top_k = 0 + + +[[endpoint]] +endpoint_id = "grok-4" +model = "x-ai/grok-4" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1 +top_p = 0.95 + +[endpoint.sampling_args.extra_body] + +[endpoint.sampling_args.extra_body.usage] +include = true + + +[[endpoint]] +endpoint_id = "hermes-4-14b" +model = "NousResearch/Hermes-4-14B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 + + +[[endpoint]] +endpoint_id = "hermes-4-70b" +model = "NousResearch/Hermes-4-70B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 + + +[[endpoint]] +endpoint_id = "intellect3" +model = "PrimeIntellect/INTELLECT-3" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "kimi-k2-thinking" +model = "moonshotai/Kimi-K2-Thinking" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 
1.0 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "kimi-linear" +model = "moonshotai/Kimi-Linear-48B-A3B-Instruct" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "ling-2-flash" +model = "inclusionAI/Ling-flash-2.0" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.8 + + +[[endpoint]] +endpoint_id = "llama-3-70b-instruct" +model = "meta-llama/Llama-3.3-70B-Instruct" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.95 +top_k = 0 + + +[[endpoint]] +endpoint_id = "llama-3-8b-instruct" +model = "meta-llama/Llama-3.1-8B-Instruct" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.95 +top_k = 0 + + +[[endpoint]] +endpoint_id = "magistral-small" +model = "mistralai/Magistral-Small-2509" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "medgemma-27b-text" +model = "google/medgemma-27b-text-it" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.0 +top_p = 1.0 +top_k = 0 + + +[[endpoint]] +endpoint_id = "medgemma-4b-1_5-it" +model = "google/medgemma-1.5-4b-it" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.0 +top_p = 0.95 +top_k = 64 + + +[[endpoint]] +endpoint_id = "medgemma-4b-it" +model = "google/medgemma-4b-it" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.0 +top_p = 1.0 +top_k = 0 + + +[[endpoint]] +endpoint_id = "mediphi" +model = "microsoft/MediPhi-Instruct" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.0 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "minimax-m2" +model = "MiniMaxAI/MiniMax-M2" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] 
+temperature = 1.0 +top_p = 0.95 +top_k = 40 + + +[[endpoint]] +endpoint_id = "minimax-m2.1" +model = "MiniMaxAI/MiniMax-M2.1" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 +top_k = 40 + + +[[endpoint]] +endpoint_id = "ministral3-14b-instruct" +model = "mistralai/Ministral-3-14B-Instruct-2512" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.1 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "ministral3-14b-reason" +model = "mistralai/Ministral-3-14B-Reasoning-2512" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "ministral3-3b-instruct" +model = "mistralai/Ministral-3-3B-Instruct-2512" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.1 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "ministral3-3b-reason" +model = "mistralai/Ministral-3-3B-Reasoning-2512" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "ministral3-8b-instruct" +model = "mistralai/Ministral-3-8B-Instruct-2512" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.1 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "ministral3-8b-reason" +model = "mistralai/Ministral-3-8B-Reasoning-2512" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "mirothinker-235b-a22b" +model = "miromind-ai/MiroThinker-v1.5-235B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "mirothinker-30b-a3b" +model = "miromind-ai/MiroThinker-v1.5-30B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "nemotron-nano-v2" 
+model = "nvidia/NVIDIA-Nemotron-Nano-12B-v2" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "nemotron-nano-v3" +model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 + + +[[endpoint]] +endpoint_id = "olmo-3-32b-think" +model = "allenai/Olmo-3-32B-Think" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "olmo-3-7b-instruct" +model = "allenai/Olmo-3-7B-Instruct" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "olmo-3-7b-think" +model = "allenai/Olmo-3-7B-Think" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "olmo-3_1-32b-instruct" +model = "allenai/Olmo-3.1-32B-Instruct" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "olmo-3_1-32b-think" +model = "allenai/Olmo-3.1-32B-Think" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "phi-4-reasoning" +model = "microsoft/Phi-4-reasoning" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.8 +top_p = 0.95 +top_k = 50 + + +[[endpoint]] +endpoint_id = "phi-4-reasoning-plus" +model = "microsoft/Phi-4-reasoning-plus" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.8 +top_p = 0.95 +top_k = 50 + + +[[endpoint]] +endpoint_id = "qwen-235b-a22b-thinking" +model = "Qwen/Qwen3-235B-A22B-Thinking-2507" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 
+min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-3-14b-thinking" +model = "Qwen/Qwen3-14B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-3-4b-thinking" +model = "Qwen/Qwen3-4B-Thinking-2507" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-3-8b-thinking" +model = "Qwen/Qwen3-8B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-30b-a3b-instruct" +model = "Qwen/Qwen3-30B-A3B-Instruct-2507" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.8 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-30b-a3b-instruct-awq-4bit" +model = "cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.8 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-30b-a3b-instruct-awq-8bit" +model = "cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-8bit" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.8 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-30b-a3b-instruct-fp8" +model = "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.8 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-30b-a3b-thinking" +model = "Qwen/Qwen3-30B-A3B-Thinking-2507" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-30b-a3b-thinking-awq-4bit" +model = "cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit" +api_client_type = 
"openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-30b-a3b-thinking-awq-8bit" +model = "cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-8bit" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-30b-a3b-thinking-fp8" +model = "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-next-80b-a3b-instruct" +model = "Qwen/Qwen3-Next-80B-A3B-Instruct" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.8 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen-next-80b-a3b-thinking" +model = "Qwen/Qwen3-Next-80B-A3B-Thinking" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +# Qwen3-VL temperature guidance varies across official sources and examples; +# 1.0, 0.8, and 0.6 are all referenced. This preserves the legacy run config. 
+[[endpoint]] +endpoint_id = "qwen-vl-30b-a3b-thinking" +model = "Qwen/Qwen3-VL-30B-A3B-Thinking" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 0.95 +top_k = 20 +min_p = 0 + + +[[endpoint]] +endpoint_id = "qwen2_5-32b-instruct" +model = "Qwen/Qwen2.5-32B-Instruct" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.7 +top_p = 0.8 +top_k = 20 + + +[[endpoint]] +endpoint_id = "qwen3-max" +model = "qwen/qwen3-max" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 +top_k = 20 +min_p = 0 + +[endpoint.sampling_args.extra_body] + +[endpoint.sampling_args.extra_body.usage] +include = true + + +[[endpoint]] +endpoint_id = "smollm3-3b" +model = "HuggingFaceTB/SmolLM3-3B" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.6 +top_p = 0.95 + + +[[endpoint]] +endpoint_id = "sonnet-4_5" +model = "claude-sonnet-4-5-20250929" +api_client_type = "anthropic_messages" + +[endpoint.sampling_args] +temperature = 0.7 + + +[[endpoint]] +endpoint_id = "trinity-mini" +model = "arcee-ai/Trinity-Mini" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.15 +top_k = 50 +top_p = 0.75 +min_p = 0.06 + + +[[endpoint]] +endpoint_id = "trinity-nano-preview" +model = "arcee-ai/Trinity-Nano-Preview" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +temperature = 0.5 +top_k = 50 +top_p = 0.95 diff --git a/configs/medmarks-open_ended.toml b/configs/medmarks-open_ended.toml new file mode 100644 index 00000000..fc0e5327 --- /dev/null +++ b/configs/medmarks-open_ended.toml @@ -0,0 +1,94 @@ +# MedARC judge- and free-form-heavy benchmark suite. 
+ +save_results = true +output_dir = "runs/evals" + +[[eval]] +env_id = "agentclinic" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "agentclinic" +name = "rollout_1" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "agentclinic" +name = "rollout_2" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "careqa" +num_examples = -1 +rollouts_per_example = 1 +env_args = { split = "open", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "healthbench" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = "openai/gpt-5-mini", judge_base_url = "https://api.pinference.ai/api/v1", difficulty = "all" } + +[[eval]] +env_id = "med_dialog" +num_examples = 2500 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = 
"medcasereasoning" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = "openai/gpt-5-nano", judge_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "medec" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "medexqa" +num_examples = -1 +rollouts_per_example = 1 +env_args = { use_judge = true, judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "medicationqa" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" } + +[[ablation]] +env_id = "medrbench" +name = "{env_args.task}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_agent_model = "openai/gpt-5-mini", patient_agent_base_url = "https://api.pinference.ai/api/v1" } + +[ablation.sweep.env_args] +task = ["oracle", "1turn", "free_turn"] + +[[eval]] +env_id = "mtsamples_procedures" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "mtsamples_replicate" +num_examples = -1 +rollouts_per_example = 1 +env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" } + +[[eval]] +env_id = "pubhealthbench" +num_examples = -1 +rollouts_per_example = 1 +env_args = { split = "freeform", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" } diff --git a/configs/medmarks-smoke.toml b/configs/medmarks-smoke.toml new file 
mode 100644 index 00000000..24e0d047 --- /dev/null +++ b/configs/medmarks-smoke.toml @@ -0,0 +1,105 @@ +# Small Medmarks-V smoke run. +# Runs 10 examples from each verified environment without ablations. + +save_results = true +output_dir = "runs/smoke" + +[[eval]] +env_id = "careqa" +num_examples = 10 +rollouts_per_example = 1 +env_args = { split = "en" } + +[[eval]] +env_id = "head_qa_v2" +num_examples = 10 +rollouts_per_example = 1 + +[[eval]] +env_id = "longhealth" +num_examples = 10 +rollouts_per_example = 1 +env_args = { task = "task1", doc_shuffle_seed = 2718 } + +[[eval]] +env_id = "m_arc" +num_examples = 10 +rollouts_per_example = 1 + +[[eval]] +env_id = "med_halt" +num_examples = 10 +rollouts_per_example = 1 +env_args = { question_type = "reasoning_fct" } + +[[eval]] +env_id = "med_mcqa" +num_examples = 10 +rollouts_per_example = 1 + +[[eval]] +env_id = "medbullets" +num_examples = 10 +rollouts_per_example = 1 +env_args = { num_options = 4 } + +[[eval]] +env_id = "medcalc_bench" +num_examples = 10 +rollouts_per_example = 1 +env_args = { version = "1.2" } + +[[eval]] +env_id = "medconceptsqa" +num_examples = 10 +rollouts_per_example = 1 +env_args = { vocab = "icd10cm_sample", difficulty = "easy" } + +[[eval]] +env_id = "medhallu" +num_examples = 10 +rollouts_per_example = 1 +env_args = { difficulty = "easy" } + +[[eval]] +env_id = "medqa" +num_examples = 10 +rollouts_per_example = 1 + +[[eval]] +env_id = "medxpertqa" +num_examples = 10 +rollouts_per_example = 1 +env_args = { question_type = "reasoning" } + +[[eval]] +env_id = "metamedqa" +num_examples = 10 +rollouts_per_example = 1 + +[[eval]] +env_id = "mmlu_pro_health" +num_examples = 10 +rollouts_per_example = 1 + +[[eval]] +env_id = "pubhealthbench" +num_examples = 10 +rollouts_per_example = 1 +env_args = { split = "reviewed" } + +[[eval]] +env_id = "pubmedqa" +num_examples = 10 +rollouts_per_example = 1 + +[[eval]] +env_id = "sctpublic" +num_examples = 10 +rollouts_per_example = 1 + +[[eval]] +env_id 
= "supergpqa_medicine" +num_examples = 10 +rollouts_per_example = 1 +env_args = { difficulty = "easy" } diff --git a/configs/medmarks-verified.toml b/configs/medmarks-verified.toml new file mode 100644 index 00000000..709782a6 --- /dev/null +++ b/configs/medmarks-verified.toml @@ -0,0 +1,281 @@ +# MedARC multiple-choice benchmark suite. +# Ablations become deterministic variant directories such as +# runs/evals/<model>/<env>/shuffle_seed-1618/. + +save_results = true +output_dir = "runs/evals" + +[[eval]] +env_id = "careqa" +num_examples = -1 +rollouts_per_example = 1 +env_args = { split = "en" } + +[[ablation]] +env_id = "careqa" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { split = "en", shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] + +[[eval]] +env_id = "head_qa_v2" +num_examples = -1 +rollouts_per_example = 1 + +[[ablation]] +env_id = "longhealth" +name = "{env_args.task}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { doc_shuffle_seed = 2718 } + +[ablation.sweep.env_args] +task = ["task1", "task2"] + +[[ablation]] +env_id = "longhealth" +name = "{env_args.task}__shuffle_seed-1618" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true, shuffle_seed = 1618, doc_shuffle_seed = 1618 } + +[ablation.sweep.env_args] +task = ["task1", "task2"] + +[[ablation]] +env_id = "longhealth" +name = "{env_args.task}__shuffle_seed-9331" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true, shuffle_seed = 9331, doc_shuffle_seed = 9331 } + +[ablation.sweep.env_args] +task = ["task1", "task2"] + +[[eval]] +env_id = "m_arc" +num_examples = -1 +rollouts_per_example = 1 + +[[ablation]] +env_id = "m_arc" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] + +[[ablation]] +env_id = "med_halt" 
+name = "{env_args.question_type}" +num_examples = -1 +rollouts_per_example = 1 + +[ablation.sweep.env_args] +question_type = ["reasoning_fct", "reasoning_nota"] + +[[eval]] +env_id = "med_mcqa" +num_examples = -1 +rollouts_per_example = 1 + +[[ablation]] +env_id = "med_mcqa" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] + +[[ablation]] +env_id = "medbullets" +name = "num_options-{env_args.num_options}" +num_examples = -1 +rollouts_per_example = 1 + +[ablation.sweep.env_args] +num_options = [4, 5] + +[[ablation]] +env_id = "medbullets" +name = "num_options-{env_args.num_options}__shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +num_options = [4, 5] +shuffle_seed = [1618, 9331] + +[[eval]] +env_id = "medcalc_bench" +name = "version-1.2" +num_examples = -1 +rollouts_per_example = 1 +env_args = { version = "1.2" } + +[[eval]] +env_id = "medcalc_bench" +name = "tools" +num_examples = -1 +rollouts_per_example = 1 +env_args = { version = "verified", add_python_tool = true, add_calculator_tool = true } + +[[ablation]] +env_id = "medconceptsqa" +name = "{env_args.difficulty}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { vocab = "icd10cm_sample" } + +[ablation.sweep.env_args] +difficulty = ["easy", "medium", "hard"] + +[[ablation]] +env_id = "medconceptsqa" +name = "{env_args.difficulty}__shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { vocab = "icd10cm_sample", shuffle_answers = true } + +[ablation.sweep.env_args] +difficulty = ["easy", "medium", "hard"] +shuffle_seed = [1618, 9331] + +[[ablation]] +env_id = "medhallu" +name = "{env_args.difficulty}" +num_examples = -1 +rollouts_per_example = 1 + +[ablation.sweep.env_args] +difficulty = ["easy", "medium", "hard"] + 
+[[eval]] +env_id = "medqa" +num_examples = -1 +rollouts_per_example = 1 + +[[ablation]] +env_id = "medqa" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] + +[[ablation]] +env_id = "medxpertqa" +name = "{env_args.question_type}" +num_examples = -1 +rollouts_per_example = 1 + +[ablation.sweep.env_args] +question_type = ["reasoning", "understanding"] + +[[ablation]] +env_id = "medxpertqa" +name = "{env_args.question_type}__shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +question_type = ["reasoning", "understanding"] +shuffle_seed = [1618, 9331] + +[[eval]] +env_id = "metamedqa" +num_examples = -1 +rollouts_per_example = 1 + +[[ablation]] +env_id = "metamedqa" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] + +[[eval]] +env_id = "mmlu_pro_health" +num_examples = -1 +rollouts_per_example = 1 + +[[ablation]] +env_id = "mmlu_pro_health" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] + +[[eval]] +env_id = "pubhealthbench" +num_examples = -1 +rollouts_per_example = 1 +env_args = { split = "reviewed" } + +[[ablation]] +env_id = "pubhealthbench" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { split = "reviewed", shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] + +[[eval]] +env_id = "pubmedqa" +num_examples = -1 +rollouts_per_example = 1 + +[[ablation]] +env_id = "pubmedqa" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 
+env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] + +[[eval]] +env_id = "sctpublic" +num_examples = -1 +rollouts_per_example = 1 + +[[ablation]] +env_id = "supergpqa_medicine" +name = "{env_args.difficulty}" +num_examples = -1 +rollouts_per_example = 1 + +[ablation.sweep.env_args] +difficulty = ["easy", "hard"] + +[[ablation]] +env_id = "supergpqa_medicine" +name = "{env_args.difficulty}__shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +difficulty = ["easy", "hard"] +shuffle_seed = [1618, 9331] diff --git a/docs/README.md b/docs/README.md index ff72a226..dee7b530 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ # medarc-verifiers -Utilities and CLI for running medical LLM benchmarks with [verifiers](https://github.com/primeintellect-ai/verifiers). Provides batch orchestration, result processing, and shared building blocks for authoring environments. +Utilities and CLI for running medical LLM benchmarks with [verifiers](https://github.com/primeintellect-ai/verifiers). Provides TOML bench execution, result processing, and shared building blocks for authoring environments. ## Install @@ -17,8 +17,12 @@ Environments are installed separately via `prime env install ` (from | Command | Description | |---------|-------------| | `medarc-eval ` | Run a single benchmark; env-specific flags inferred from `load_environment()` | -| `medarc-eval bench` | Run multiple model × environment jobs from a YAML config, with resume support | -| `medarc-eval process` | Convert raw outputs to analysis-ready parquet | +| `medarc-eval bench` | Run upstream TOML eval configs with deterministic MedARC paths | +| `medarc-eval process` | Convert eval outputs to analysis-ready parquet | | `medarc-eval winrate` | Compute HELM-style win rates across models | See [medarc-eval.md](medarc-eval.md) for full documentation. 
+ +## Developer workflow + +See [developer-guide.md](developer-guide.md) for local setup, environment authoring, and development workflow notes. diff --git a/docs/developer-guide.md b/docs/developer-guide.md new file mode 100644 index 00000000..311e78c4 --- /dev/null +++ b/docs/developer-guide.md @@ -0,0 +1,204 @@ +# Getting Started + +This guide covers the developer workflow for Medmarks benchmark environments and the `medarc-verifiers` tooling in this repository. + +## Getting Started with Verifiers Environments + +The steps below guide you through creating a new environment package under `environments/[my-new-env]`, installing it locally, testing it with Verifiers tooling, and optionally publishing it through Prime Intellect's Environments Hub. + +### 1. Prerequisites + +- Python 3.11 or 3.12 +- [`uv`](https://docs.astral.sh/uv/) for dependency management +- The [`prime` CLI](https://github.com/PrimeIntellect-ai/prime-cli) for scaffolding and publishing +- An OpenAI-compatible API key, exported as `OPENAI_API_KEY`, or another OpenAI-compatible model endpoint for testing environments with `vf-eval` + +### 2. Setup + +Create and activate a virtual environment, then install the required tooling: + +```bash +uv venv --python 3.12 +source .venv/bin/activate +uv sync +uv tool install prime +``` + +After this setup the `prime env`, `vf-install`, `vf-eval`, and `medarc-eval` commands will be available, or runnable via `uv run <command>`. + +### 3. Create a New Environment + +Always place new Verifiers packages inside `environments/my-new-env`. The Prime CLI ensures this by default: + +```bash +# from the repository root +prime env init my-new-env +``` + +The template produces: + +```text +environments/my_new_env/ +|-- my_new_env.py +|-- pyproject.toml +`-- README.md +``` + +Edit `my_new_env.py` to configure datasets, parsers, and rubrics, and update the package metadata in `pyproject.toml` with the package name, version, dependencies, tags, and related fields. 
+ +If the `prime env init` command does not add it, add the following Prime environment metadata so Prime and Verifiers know where the environment lives in a flat repo: + +```toml +[tool.prime.environment] +loader = "my_new_env:load_environment" +display_name = "My New Env" +visibility = "PUBLIC" +``` + +### 4. Install the Environment for Local Development + +Install your new environment in editable mode so changes are picked up immediately: + +```bash +vf-install my-new-env +# equivalent to: +# uv pip install -e ./environments/my_new_env +``` + +You can now import it from Python or let Verifiers discover it with `verifiers.load_environment("my-new-env")`. + +### 5. Smoke-Test with `vf-eval` + +Run a small batch of rollouts to confirm the environment behaves as expected. Set `OPENAI_API_KEY`, or whichever OpenAI-compatible credentials you plan to use, before invoking the CLI. + +```bash +export OPENAI_API_KEY=sk-... +vf-eval my-new-env -m gpt-4.1-mini -n 5 -s +``` + +A few useful arguments: + +- `-m` selects the inference model. +- `-n` controls dataset size. +- `-s` saves results locally. + +Use `vf-eval -h` for the full set of options, including rollouts per example and max concurrency. + +During development you can iterate quickly by tweaking prompts, parser logic, or reward functions, reinstalling with `vf-install` if dependencies change, and rerunning `vf-eval` to view the results. + +After running with `-s`, inspect saved runs with `vf-tui`, which provides a terminal UI for browsing prompts, completions, and rewards under the generated `outputs/evals` folders. + +## Using an Existing Medmarks Environment + +Once your tooling is set up you can install MedARC-maintained environments directly from the Prime Environments Hub, for example [`medarc/medcasereasoning`](https://app.primeintellect.ai/dashboard/environments/medarc/medcasereasoning) or [`medarc/metamedqa`](https://app.primeintellect.ai/dashboard/environments/medarc/metamedqa). 
+ +- Install from the Hub: run `prime env install medarc/medcasereasoning` to pull the latest published version. Add `@version` to pin a release. +- Run an evaluation: execute `vf-eval medcasereasoning -m gpt-4.1-mini -n 10 -s` to generate a small batch of rollouts. +- Load programmatically: + +```python +import verifiers as vf + +env = vf.load_environment("medcasereasoning", split="validation") +results = env.evaluate(model_client, "gpt-4.1-mini", num_examples=5) +``` + +## medarc-eval CLI + +`medarc-eval` wraps the upstream `verifiers` eval flow, adding environment-specific flags and a TOML bench workflow. See the [full documentation](medarc-eval.md). + +| Command | Description | +|---------|-------------| +| [`medarc-eval `](medarc-eval-single-run.md) | Run a single benchmark with auto-discovered environment flags | +| [`medarc-eval bench`](medarc-eval-bench.md) | Run upstream TOML eval configs with deterministic MedARC paths | +| [`medarc-eval process`](medarc-eval-process.md) | Convert eval outputs to parquet for analysis | +| [`medarc-eval winrate`](medarc-eval-winrate.md) | Compute HELM-style win rates across models | + +### Quick Start + +```bash +# Run a single benchmark +uv run medarc-eval medqa -m gpt-4.1-mini -n 25 + +# Run batch evaluations from config +uv run medarc-eval bench --config configs/medmarks-smoke.toml + +# Process results and compute win rates +uv run medarc-eval process --runs-dir runs/evals +uv run medarc-eval winrate +``` + +### Environment-Specific Flags + +Each environment's `load_environment()` parameters become CLI flags automatically: + +```bash +# Discover available flags +uv run medarc-eval longhealth --help + +# Use environment-specific options +uv run medarc-eval longhealth --task task1 --shuffle-answers -m gpt-4.1-mini -n 10 +``` + +For complex arguments such as dicts and nested structures, use `--env-args`: + +```bash +uv run medarc-eval careqa --env-args '{"split": "open", "judge_model": "gpt-4o"}' +``` + +## Batch 
Evaluations + +Use `medarc-eval bench` to run upstream `verifiers` TOML eval configs sequentially with deterministic MedARC output paths. See the [bench mode documentation](medarc-eval-bench.md). + +```toml +model = "openai/gpt-4.1-mini" +save_results = true +output_dir = "runs/evals" + +[[eval]] +env_id = "medqa" +num_examples = 25 +rollouts_per_example = 1 +env_args = { shuffle_answers = true, shuffle_seed = 1618 } +``` + +```bash +# Run the batch +uv run medarc-eval bench --config configs/medmarks-verified.toml + +# Preview without executing +uv run medarc-eval bench --config configs/medmarks-verified.toml --dry-run +``` + +Bench mode resumes matching deterministic result directories and supports `[[ablation]]` sweeps for parameter grids. The removed YAML job/manifest runner is documented only in the migration notes in the [bench mode docs](medarc-eval-bench.md). + +### Ablation Sweeps + +Use upstream TOML ablations for parameter grid runs: + +```toml +[[ablation]] +env_id = "medconceptsqa" +num_examples = -1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +difficulty = ["easy", "medium", "hard"] +shuffle_seed = [1618, 9331] +``` + +This expands into deterministic variant directories under `runs/evals/<model>/medconceptsqa/`. See the [bench mode docs](medarc-eval-bench.md) for details. + +## Processing and Win Rates + +After running benchmarks, convert results to parquet and compute model comparisons: + +```bash +# Process eval outputs to parquet +uv run medarc-eval process --runs-dir runs/evals + +# Compute HELM-style win rates +uv run medarc-eval winrate +``` + +See the [processing documentation](medarc-eval-process.md) and [win rate documentation](medarc-eval-winrate.md) for configuration options, Hugging Face integration, and output formats. 
diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md index c32e313e..ed0a6444 100644 --- a/docs/medarc-eval-bench.md +++ b/docs/medarc-eval-bench.md @@ -1,300 +1,322 @@ -# Batch Mode +# TOML Bench Mode -Run multiple benchmarks across multiple models using a configuration file. Batch mode handles job scheduling, progress tracking, and automatic resume. +`medarc-eval bench` runs upstream `verifiers` TOML eval configs sequentially with +MedARC-specific deterministic output paths. It is the supported path for +systematic local benchmark runs. -Each job invokes the verifiers [`vf-eval`](https://github.com/primeintellect-ai/verifiers) evaluation loop under the hood, with configuration-driven environment and sampling arguments. +The old MedARC YAML benchmark runner has been removed. `bench --config` now +accepts `.toml` files only. ## Quick Start ```bash -# Run all jobs from config -medarc-eval bench --config configs/job-gpt-oss-20b.yaml +# Preview the repository smoke config +medarc-eval bench --config configs/medmarks-smoke.toml --dry-run -# Preview what would run -medarc-eval bench --config configs/job-gpt-oss-20b.yaml --dry-run +# Run the verified production suite +medarc-eval bench --config configs/medmarks-verified.toml -# Force all jobs to use a specific API endpoint -medarc-eval bench --config configs/job-gpt-oss-20b.yaml --api-base-url http://127.0.0.1:8000/v1 -``` +# Require all selected env packages to already be installed +medarc-eval bench --config configs/medmarks-verified.toml --no-auto-install -## Writing a Config File - -A minimal config defines models and which benchmarks to run: - -```yaml -name: gpt-oss-20b-med - -models: - gpt-oss-20b: - model: openai/gpt-oss-20b - api_base_url: http://localhost:8000/v1 - sampling_args: - temperature: 1.0 - top_p: 1.0 - top_k: 0 - reasoning_effort: medium - -jobs: - - model: gpt-oss-20b - env: - - m_arc - - medcalc_bench - - medxpertqa +# Run the verified suite against a local OpenAI-compatible server 
+medarc-eval bench \ + --config configs/medmarks-verified.toml \ + --api-base-url http://127.0.0.1:8000/v1 \ + --provider local \ + --model openai/my-local-model ``` -This creates 3 jobs: gpt-oss-20b evaluated on m_arc, medcalc_bench, and medxpertqa. - -### Config Structure - -| Field | Description | -|-------|-------------| -| `name` | Human-readable run name | -| `output_dir` | Where to save results (default: `runs/raw`) | -| `models` | Map of model ID → model configuration | -| `jobs` | List of model + environment combinations to run | - -### Model Configuration - -```yaml -models: - gpt-oss-20b: - model: openai/gpt-oss-20b # Model identifier - api_base_url: http://localhost:8000/v1 # API endpoint (local or remote) - api_key_var: OPENAI_API_KEY # Optional: env var for API key - max_concurrent: 10 # Optional: parallel request limit - timeout: 120 # Optional: request timeout (seconds) - sampling_args: - temperature: 1.0 - top_p: 1.0 - reasoning_effort: medium +Repository suite configs live in `configs/`: + +| Config | Purpose | +|--------|---------| +| `medmarks-smoke.toml` | Small Medmarks-V smoke test used by CLI tests | +| `medmarks-verified.toml` | Verified benchmark suite | +| `medmarks-open_ended.toml` | Open-ended benchmark suite | + +## Config Format + +Bench configs use upstream `verifiers` TOML semantics: top-level defaults plus +one or more `[[eval]]` blocks. MedARC adds deterministic output planning around +selected raw eval configs; it does not use YAML `models`, `envs`, or `jobs` +sections. 
+ +```toml +model = "openai/gpt-4.1-mini" +save_results = true +output_dir = "runs/evals" + +[[eval]] +env_id = "medqa" +num_examples = 25 +rollouts_per_example = 1 +env_args = { shuffle_answers = true, shuffle_seed = 1618 } +sampling_args = { temperature = 0.0 } + +[[eval]] +env_id = "pubmedqa" +num_examples = 25 +rollouts_per_example = 1 ``` -### Runtime API Base URL Override +Per-environment defaults can also live in an environment package +`pyproject.toml` under `[tool.verifiers.eval]`. Production suite configs keep +explicit `num_examples` and `rollouts_per_example` values so they remain stable +across editable and wheel installs. + +## Local Environment Install Lifecycle + +By default, TOML bench auto-installs selected local environment packages that +are not already importable in the active Python environment. Auto-install only +applies to missing local packages resolved from `--env-dir`; selected envs that +are already importable keep the normal in-process execution path. -Use `--api-base-url` to override `models.*.api_base_url` for all jobs at runtime: +`--env-dir` defaults to `environments/`. When auto-install is needed, bench +creates a system temporary directory with a `medarc-bench-venv-` prefix, creates +a venv inside it with `uv venv`, installs the selected local env package +editable into that venv, runs one eval through the private bench child, and then +removes the temporary venv. ```bash -medarc-eval bench --config my-config.yaml --api-base-url http://127.0.0.1:8000/v1 +medarc-eval bench \ + --config configs/medmarks-verified.toml \ + --eval-index "$SLURM_ARRAY_TASK_ID" ``` -### Environment Configuration +When a selected env package is missing, bench prints a warning to stderr and +runs that eval in an isolated temporary venv. 
The parent process loads and +expands the TOML config, applies `--eval-index` / `--start-at` / `--stop-after`, +plans deterministic output paths from raw TOML and CLI values, creates a temp +venv, installs MedARC into it, installs the target env package into it, runs the +bench child with the parent-planned `resume_path`, and deletes the temp venv. -Benchmarks are configured in `configs/envs/`. Each file defines one or more environment variants: +If the active `medarc-verifiers` install is editable, isolated mode installs +that same checkout editable into the temp venv. If the active install is not +editable, isolated mode installs `medarc-verifiers==<version>` and +requires that package/version to be resolvable by the normal package resolver. +If resolution fails, run from an editable checkout or preinstall env packages +and pass `--no-auto-install`. -```yaml -# configs/envs/medqa.yaml -- id: medqa - module: medqa - num_examples: -1 # -1 = all examples - rollouts_per_example: 1 - env_args: - shuffle_answers: true - shuffle_seed: 1618 +For faster strict local iteration, preinstall environments and opt out: + +```bash +vf-install medqa +vf-install pubmedqa +medarc-eval bench --config configs/medmarks-verified.toml --no-auto-install ``` -#### Matrix Sweeps +`--dry-run` does not create venvs, install packages, or spawn child processes. +If selected env packages are missing, dry run says they would be auto-installed. +Dry-run identity and deterministic paths are based on TOML and CLI values only; +environment package `[tool.verifiers.eval]` defaults are execution-time defaults +and do not affect dry-run display or path planning. + +Isolated mode removes shared Python package metadata mutation from auto-install, +but it is not full filesystem or side-effect isolation. Concurrent runs can +still collide if they target the same deterministic output directory without +unique selections, output roots, or variants. 
Hugging Face caches, judge caches, +cwd-relative artifacts, temp files created by environment code, and network/API +side effects can also remain shared. + +## Ablations and Variants + +Use upstream `[[ablation]]` tables to sweep values. The upstream env id stays +unchanged, and MedARC writes each differing config to a deterministic variant +directory. + +```toml +model = "openai/gpt-4.1-mini" +save_results = true +output_dir = "runs/evals" + +[[ablation]] +env_id = "medqa" +name = "shuffle_seed-{env_args.shuffle_seed}" +num_examples = -1 +rollouts_per_example = 1 +env_args = { shuffle_answers = true } + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] +``` -Run the same benchmark with different parameter combinations: +Example output paths: -```yaml -- id: medqa - module: medqa - num_examples: -1 - env_args: - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331, 2718] - matrix_id_format: "{base}-seed{shuffle_seed}" +```text +runs/evals/openai-gpt-4.1-mini/medqa/shuffle_seed-1618/ +runs/evals/openai-gpt-4.1-mini/medqa/shuffle_seed-9331/ ``` -This creates three variants: `medqa-seed1618`, `medqa-seed9331`, `medqa-seed2718`. - -## Resume and Restart +Non-variant evals use the reserved variant id `base` and write to +`runs/evals/<model>/<env>/base/`. Duplicate `(model, env)` evals must provide +an explicit `variant_id` or `name`. `name` may use simple templates such as +`shuffle_seed-{env_args.shuffle_seed}` after ablation expansion. -Batch mode automatically tracks job status and can resume interrupted runs. +`variant_id` and `name` are path identities. They must already be path-safe: +use only letters, numbers, `.`, `_`, and `-`. For example, +`variant_id = "shuffle_seed-1618"` is valid, while +`variant_id = "shuffle seed = 1618"` fails with a clear error. -### Automatic Resume (Default) +## Metadata -When you re-run the same config, completed jobs are skipped: +Upstream `metadata.json` remains a normal `verifiers` file. 
MedARC does not +write separate bench metadata. Processing recovers exact model and environment +identity from upstream metadata, and recovers variant identity from the +deterministic path segment. -```bash -# First run - runs all jobs -medarc-eval bench --config my-config.yaml +## Output Root, Resume, and Force -# Interrupted, re-run - skips completed jobs -medarc-eval bench --config my-config.yaml -``` +Bench writes each eval to a deterministic result directory. If neither +`--output-dir` nor TOML `output_dir` is set, the output root defaults to +`runs/evals`. -### Force Fresh Run +Existing valid outputs resume automatically. This makes Slurm retries +idempotent for a fixed `--eval-index`: ```bash -# Disable auto-resume, create new run directory -medarc-eval bench --config my-config.yaml --no-auto-resume - -# Re-run everything, even completed jobs -medarc-eval bench --config my-config.yaml --force +medarc-eval bench --config configs/medmarks-verified.toml --eval-index "$SLURM_ARRAY_TASK_ID" ``` -### Restart from Previous Run - -Copy completed jobs from an old run to seed a new one: +If the deterministic target already contains both `metadata.json` and +`results.jsonl`, MedARC passes that path to upstream `verifiers` as +`resume_path` and lets upstream resume. If the target exists but is malformed or +partial, bench fails unless `--force` is set: ```bash -medarc-eval bench --config updated-config.yaml --restart old-run-id +# Archive existing deterministic outputs and rerun +medarc-eval bench --config configs/medmarks-verified.toml --force ``` -### Re-run Specific Environments - -```bash -# Re-run only medqa jobs (keep other completed jobs) -medarc-eval bench --config my-config.yaml --forced medqa - -# Re-run multiple environments -medarc-eval bench --config my-config.yaml --forced medqa,pubmedqa -``` +`--resume` is still accepted for compatibility, but deterministic bench outputs +resume automatically when valid artifacts exist. 
MedARC does not maintain a +sampling-argument allowlist or fingerprint blocker for resume safety. New +provider arguments pass through to upstream. ## Common Flags -### Job Selection - | Flag | Description | |------|-------------| -| `--config PATH` | **Required.** Path to config YAML | -| `--endpoints-path PATH` | Endpoint registry path (default: `configs/endpoints.toml`) | -| `--job-id ID` | Run only specific job(s) by ID (repeatable) | -| `--dry-run` | Show plan without executing | +| `--config PATH` | Required path to an upstream TOML eval config | +| `--dry-run` | Resolve evals and print the deterministic plan | +| `--force` | Archive existing deterministic output and rerun | +| `--resume` | Compatibility flag; valid deterministic outputs resume automatically | +| `--output-dir PATH` | Override the config output directory, default `runs/evals` | +| `--env-dir PATH` | Directory containing local environments, default `environments` | +| `--auto-install` / `--no-auto-install` | Auto-install missing local env packages in isolated temp venvs (default) or require selected envs to be preinstalled | +| `--endpoints-path PATH` | Endpoint registry path, default `configs/endpoints.toml` | +| `--api-base-url URL` | Override API base URL for every eval | +| `--api-key-var NAME` | Override API key environment variable | +| `--provider NAME` | Override upstream provider shorthand | +| `--model MODEL` | Override model for every eval | +| `--eval-index N` | Run one resolved eval by 1-based index | +| `--start-at N` / `--stop-after N` | Run a contiguous 1-based eval range | +| `--continue-on-error` | Continue after a failed eval | +| `--env-arg KEY=VALUE` / `--env-args JSON` | Apply environment arg overrides | +| `--sampling-arg KEY=VALUE` / `--sampling-args JSON` | Apply sampling arg overrides | +| `--max-concurrent N` | Override max concurrency for every eval | +| `--timeout SEC` | Override request timeout for every eval | +| `--max-retries N` | Override upstream rollout 
retries for every eval | +| `--sleep SEC` | Sleep after each eval | + +## Endpoint Sampling Profiles + +MedARC extends upstream `verifiers` TOML endpoint registries with optional +endpoint-level `sampling_args`. Use these for model/provider defaults and +compatibility knobs, such as vLLM-only parameters. Put benchmark experiment +settings in the eval TOML or CLI overrides. + +```toml +[[endpoint]] +endpoint_id = "gpt-oss-20b-low-local" +model = "openai/gpt-oss-20b" +url = "http://host.docker.internal:8010/v1" +key = "VLLM_API_KEY" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "low" + +[[endpoint]] +endpoint_id = "another-model" +model = "openai/another-model" +url = "http://host.docker.internal:8011/v1" +key = "VLLM_API_KEY" +``` -### Output Control +Inline tables are also supported: -| Flag | Description | -|------|-------------| -| `--output-dir PATH` | Override output directory | -| `--run-id ID` | Force specific run directory name | -| `--name NAME` | Override run name in manifest | +```toml +sampling_args = { temperature = 1.0, top_p = 1.0, top_k = 0, reasoning_effort = "low" } +``` -### Override All Jobs +Precedence is: Prime Inference defaults, endpoint `sampling_args`, raw scalar +`temperature` / `max_tokens`, raw TOML `sampling_args`, then CLI +`--sampling-args` / `--sampling-arg`. Unknown OpenAI parameters such as `top_k` +are still moved under `extra_body` after the merge. 
-| Flag | Description | -|------|-------------| -| `--env-args JSON` | Override environment args for all jobs | -| `--sampling-args JSON` | Override sampling args for all jobs | -| `--max-concurrent N` | Override concurrency for all jobs | -| `--timeout SEC` | Override timeout for all jobs | -| `--include-usage` / `--no-include-usage` | Enable/disable usage reporting (auto-detected for Prime Inference) | +After `[endpoint.sampling_args]`, TOML keys remain inside that nested table +until the next table header. Start a new `[[endpoint]]` before defining another +endpoint. -### Prime Inference +## Prime Inference -When using Prime Inference (`https://api.pinference.ai/api/v1`), the CLI automatically: -- Uses `PRIME_API_KEY` for authentication (if set) -- Adds `X-Prime-Team-ID` header from the `PRIME_TEAM_ID` env var -- Enables usage reporting in API requests +When `--api-base-url` or a config points at Prime Inference +(`https://api.pinference.ai/api/v1`), MedARC applies the same Prime helpers used +by single-run mode: -Just set the environment variables and the config stays simple: +- `PRIME_API_KEY` is preferred when available. +- `X-Prime-Team-ID` is added from `PRIME_TEAM_ID`. +- Usage reporting is enabled unless `MEDARC_INCLUDE_USAGE=false` is set. ```bash -export PRIME_API_KEY=your-api-key -export PRIME_TEAM_ID=your-team-id -``` - -```yaml -models: - my-model: - model: openai/gpt-5-nano - api_base_url: https://api.pinference.ai/api/v1 -``` +export PRIME_API_KEY=... +export PRIME_TEAM_ID=... 
-Manual configuration is only needed to override auto-detection: - -```yaml -models: - my-model: - model: openai/gpt-5-nano - api_base_url: https://api.pinference.ai/api/v1 - api_key_var: PRIME_API_KEY - headers: - X-Prime-Team-ID: override-team-id - sampling_args: - extra_body: - usage: - include: false # disable usage reporting +medarc-eval bench \ + --config configs/medmarks-verified.toml \ + --api-base-url https://api.pinference.ai/api/v1 ``` -### Endpoints Registry Migration (`endpoints.py` -> `endpoints.toml`) +## Processing Outputs -Batch mode now defaults `--endpoints-path` to `configs/endpoints.toml`. - -If your project still uses a Python registry, pass it explicitly: +After a TOML bench run, process the deterministic eval outputs: ```bash -medarc-eval bench --config my-config.yaml --endpoints-path configs/endpoints.py +medarc-eval process --runs-dir runs/evals --output-dir runs/processed +medarc-eval winrate --processed-dir runs/processed ``` -## Output Structure - -``` -runs/raw// -├── run_manifest.json # Run metadata, job status, checksums -├── / -│ ├── results.jsonl # Per-example results -│ ├── summary.json # Aggregate metrics -│ └── metadata.json # Job configuration snapshot -└── / - └── ... -``` +Processing reads eval-output directories under `runs/evals`. Legacy +`runs/raw/<run_id>/run_manifest.json` outputs must be converted with +`scripts/convert_legacy_raw_runs.py` before processing. New bench runs should +use `runs/evals`. 
-The manifest tracks: -- Job status (pending, running, completed, failed) -- Configuration checksums for resume detection -- Timing information -- Output paths - -## Example Workflows - -### Evaluate Multiple Models on Core Benchmarks - -```yaml -name: model-comparison - -models: - gpt-oss-20b: - model: openai/gpt-oss-20b - api_base_url: http://192.168.1.152:8000/v1 - sampling_args: - temperature: 1.0 - reasoning_effort: medium - - gpt-oss-20b-low: - model: openai/gpt-oss-20b - api_base_url: http://192.168.1.152:8000/v1 - sampling_args: - temperature: 0.7 - reasoning_effort: low - -jobs: - - model: gpt-oss-20b - env: [m_arc, medcalc_bench, medxpertqa] - - model: gpt-oss-20b-low - env: [m_arc, medcalc_bench, medxpertqa] -``` +## Migrating from the Removed YAML Runner -### Override Parameters at Runtime +Move old YAML `models` entries into top-level TOML defaults or explicit +`[[eval]]` blocks. Move old `envs` and matrix variants into repeated `[[eval]]` +blocks or upstream `[[ablation]]` sweeps. 
-```bash -# Lower concurrency for rate-limited API -medarc-eval bench --config my-config.yaml --max-concurrent 5 +Removed YAML-runner concepts no longer exist in `medarc-eval bench`: -# Change temperature for all jobs -medarc-eval bench --config my-config.yaml --sampling-args '{"temperature": 0.5}' +- YAML `models`, `envs`, and `jobs` schemas +- `run_manifest.json` creation for new bench runs +- `--run-id`, `--restart`, `--auto-resume`, `--no-auto-resume` +- `--job-id`, `--forced`, `--on-complete` +- custom YAML job status and manifest planning -# Enable usage reporting for all jobs -medarc-eval bench --config my-config.yaml --include-usage +Old raw outputs must be converted before processing: -# Disable usage reporting (overrides auto-detection for Prime Inference) -medarc-eval bench --config my-config.yaml --no-include-usage +```bash +uv run python scripts/convert_legacy_raw_runs.py \ + --raw-dir runs/raw \ + --output-dir runs/evals \ + --dry-run ``` -## Next Steps - -After batch runs complete: -1. [Process results](medarc-eval-process.md) into parquet format -2. [Compute win rates](medarc-eval-winrate.md) to compare models +The converter is an operator migration helper. It does not mutate `runs/raw` and +defaults to dry-run; pass `--no-dry-run` to write converted eval outputs. diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md index 1b57c7ba..5a4b6ffe 100644 --- a/docs/medarc-eval-process.md +++ b/docs/medarc-eval-process.md @@ -1,15 +1,19 @@ # Processing Results -Convert raw benchmark outputs into analysis-ready parquet files. This step prepares data for win rate computation and other analyses. +Convert eval outputs into analysis-ready parquet files. This step prepares data +for win rate computation and other analyses. 
## Quick Start ```bash -# Process all completed jobs (uses defaults) +# Process outputs from the current TOML bench runner +medarc-eval process --runs-dir runs/evals --output-dir runs/processed + +# Process outputs from the default runs/evals directory medarc-eval process -# Specify directories explicitly -medarc-eval process --runs-dir runs/raw --output-dir runs/processed +# Convert old YAML-runner raw outputs first +uv run python scripts/convert_legacy_raw_runs.py --raw-dir runs/raw --output-dir runs/evals --dry-run # Preview what would be processed medarc-eval process --dry-run @@ -17,8 +21,9 @@ medarc-eval process --dry-run ## What Processing Does -1. **Discovers** jobs in `runs/raw/` and filters by manifest status (default: `completed`) -2. **Extracts** results from each job's output files +1. **Discovers** eval outputs in `runs/evals/` by scanning output directories + containing `metadata.json` and `results.jsonl` +2. **Extracts** results from each eval output directory 3. **Normalizes** data into a fixed output schema 4. **Writes** parquet files organized by model and environment 5. 
**Creates** an index (`env_index.json`) for downstream tools @@ -43,27 +48,24 @@ On-disk model and env path components are slugified, so filenames may not exactl | Flag | Description | Default | |------|-------------|---------| -| `--runs-dir PATH` | Directory containing raw runs | `runs/raw` | +| `--runs-dir PATH` | Directory containing eval output directories | `runs/evals` | | `--output-dir PATH` | Where to write processed files | `runs/processed` | | `--max-workers N` | Parallel worker processes | 4 | | `--dry-run` | Show what would be processed | - | | `--yes` | Skip confirmation prompts | - | | `--exclude-dataset NAME` | Skip processing specific datasets/env ids (repeatable) | - | | `--exclude-model MODEL` | Skip processing specific model ids (repeatable) | - | +| `--replace-env NAME` | Rebuild existing processed outputs for specific env ids (repeatable) | - | +| `--replace-model MODEL` | Rebuild existing processed outputs for specific model ids (repeatable) | - | +| `--max-results-missing-pct N` | Fail latest selected outputs missing more than this percentage of expected rows | 2.5 | +| `--winrate PATH` | Run winrate after processing with the provided config file | - | ## Filtering Runs -### By Completion Status - -By default, `medarc-eval process` only selects jobs whose manifest status is `completed`. - -Note: successful jobs are written to `run_manifest.json` with `status: completed`. - -To override that default, pass one or more explicit status filters: - -```bash -medarc-eval process --status completed --status failed -``` +For current TOML bench outputs, processing scans for directories containing +`metadata.json` and `results.jsonl`. Model and environment identity come from +upstream metadata when available; variant identity comes from the deterministic +path segment. Ad hoc upstream outputs fall back to metadata/path inference. 
You can also gate partially complete outputs by missing `results.jsonl` rows: @@ -75,14 +77,17 @@ medarc-eval process --max-results-missing-pct 2.5 medarc-eval process --max-results-missing-pct 100 ``` -This gate uses manifest job metadata only: +This gate uses `metadata.json` values for expected rows and the observed +`results.jsonl` row count: - `expected_rows = num_examples * rollouts_per_example` -- `observed_rows = row_count` +- `observed_rows = results.jsonl row count` -It is computed per selected job record and enforced only on the latest selected run for each processed model/environment output. It does not use manifest `summary.completed` / `summary.total`, and it does not fall back to older runs if the latest one is too incomplete. +It is computed per selected output and enforced only on the latest selected run +for each processed model/environment output. It does not fall back to older runs +if the latest one is too incomplete. -Selected records with missing `results.jsonl` fail processing immediately. +Directories without `results.jsonl` are not process candidates. ### Latest Runs Only @@ -106,7 +111,7 @@ Store common options in a YAML file: ```yaml # process-config.yaml -runs_dir: runs/raw +runs_dir: runs/evals process: dir: processed @@ -130,10 +135,11 @@ CLI flags override config values. Supported config schema for `medarc-eval process`: -- Top-level `runs_dir`: raw run root. +- Top-level `runs_dir`: eval output root, usually `runs/evals`. - Top-level `process:`: process-specific defaults. - Optional top-level `winrate:`: embedded post-process winrate step. - Optional top-level `hf:`: shared HF settings. For embedded winrate uploads, use `hf.winrate_dir`. +- Removed process config keys are rejected: use `max_results_missing_pct` instead of `max_run_missing_pct`; status filtering is no longer supported for current eval outputs. 
Path shortcuts: @@ -143,7 +149,7 @@ Path shortcuts: Example: ```yaml -runs_dir: runs/raw +runs_dir: runs/evals process: dir: processed @@ -163,7 +169,7 @@ Sync processed datasets to/from the Hugging Face Hub: ```yaml # process-config.yaml -runs_dir: runs/raw +runs_dir: runs/evals process: dir: processed @@ -225,10 +231,10 @@ This runs `medarc-eval winrate` automatically after processing completes when th ```bash # 1. Run benchmarks -medarc-eval bench --config my-eval.yaml +medarc-eval bench --config configs/medmarks-verified.toml # 2. Process results -medarc-eval process +medarc-eval process --runs-dir runs/evals # 3. Compute win rates medarc-eval winrate @@ -249,8 +255,8 @@ medarc-eval process \ ### Incremental Updates ```bash -# Process only new runs (default behavior) -medarc-eval process +# Process only new TOML bench outputs +medarc-eval process --runs-dir runs/evals # env_index.json tracks what's already processed ``` @@ -280,20 +286,40 @@ When both flags are present, processing only rebuilds outputs that match both fi Check that: 1. `--runs-dir` points to the correct location -2. Runs have completed (check `run_manifest.json` `jobs[*].status`) -3. Use `--status pending` or `--status running` to include non-completed jobs +2. For TOML bench outputs, each eval directory contains `results.jsonl` and `metadata.json` +3. Legacy `runs/raw` outputs have been converted with `scripts/convert_legacy_raw_runs.py` ### Missing data in output -By default, only jobs with `completed` status are included. In addition, `--max-results-missing-pct` fails if a selected latest job record is missing more than 2.5% of its expected `results.jsonl` rows, using manifest job fields: +By default, TOML bench outputs are selected from valid eval directories. +`--max-results-missing-pct` fails if a selected latest output is missing more +than 2.5% of its expected `results.jsonl` rows.
Processing uses eval metadata +plus the observed JSONL row count: -- `row_count` - `num_examples` - `rollouts_per_example` -The gate is per selected record, not per whole run manifest. If the latest selected run for a model/dataset is too incomplete, processing fails fast instead of silently falling back to an older run. Records with unknown expected rows or unknown `row_count` are not gated. +The gate is per selected output. If the latest selected run for a model/dataset +is too incomplete, processing fails fast instead of silently falling back to an +older run. Records with unknown expected rows are not gated. + +Use `--max-results-missing-pct 100` to disable the gate. + +### Migrating Old Raw Runs + +`medarc-eval process` no longer reads `runs/raw//run_manifest.json` +directly. Convert old local artifacts into the current eval-output shape first: + +```bash +uv run python scripts/convert_legacy_raw_runs.py \ + --raw-dir runs/raw \ + --output-dir runs/evals \ + --dry-run +``` -Use `--max-results-missing-pct 100` to disable the gate, or pass explicit `--status` values to include other statuses. +The converter defaults to dry-run, never mutates `runs/raw`, and fails on +existing target paths. Re-run with `--no-dry-run` to write converted +`metadata.json` and `results.jsonl` directories under `runs/evals`. 
### Integrity-check failures for existing parquet files diff --git a/docs/medarc-eval-winrate.md b/docs/medarc-eval-winrate.md index 47c28f92..b56e6d02 100644 --- a/docs/medarc-eval-winrate.md +++ b/docs/medarc-eval-winrate.md @@ -21,7 +21,7 @@ Win rate computation requires processed parquet files with an `env_index.json`: ```bash # If you haven't processed yet: -medarc-eval process +medarc-eval process --runs-dir runs/evals ``` ## How Win Rates Work @@ -103,7 +103,7 @@ The JSON output includes: ```yaml # process-config.yaml -runs_dir: runs/raw +runs_dir: runs/evals process: dir: processed @@ -192,7 +192,7 @@ medarc-eval winrate \ ```yaml # process-config.yaml -runs_dir: runs/raw +runs_dir: runs/evals process: dir: processed diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md index 395d251f..844b8027 100644 --- a/docs/medarc-eval.md +++ b/docs/medarc-eval.md @@ -2,7 +2,7 @@ `medarc-eval` is a command-line tool for evaluating language models on medical benchmarks. It handles the full pipeline: running benchmarks, processing results, and computing model comparisons. -> **Note:** `medarc-eval ` and `medarc-eval bench` are wrappers around the [verifiers](https://github.com/primeintellect-ai/verifiers) `vf-eval` command, adding medical-specific environments, batch orchestration, and environment-specific CLI flags inferred from each benchmark's `load_environment()` signature. +> **Note:** `medarc-eval ` and `medarc-eval bench` are wrappers around the [verifiers](https://github.com/primeintellect-ai/verifiers) eval flow. Single-run mode adds environment-specific CLI flags inferred from each benchmark's `load_environment()` signature; bench mode runs upstream TOML eval configs sequentially with deterministic MedARC output paths. 
## Quick Start @@ -11,10 +11,10 @@ medarc-eval medqa -m gpt-4.1-mini -n 25 # Run a batch of benchmarks from a config file -medarc-eval bench --config configs/job-gpt-oss-20b.yaml +medarc-eval bench --config configs/medmarks-smoke.toml -# Process raw results into analysis-ready parquet files -medarc-eval process +# Process eval outputs into analysis-ready parquet files +medarc-eval process --runs-dir runs/evals # Compute win rates across models medarc-eval winrate @@ -27,7 +27,7 @@ medarc-eval winrate (bench or single) (process) (winrate) | | | v v v - runs/raw/ runs/processed/ runs/processed/winrate/ + runs/evals/ runs/processed/ runs/processed/winrate/ ``` ## Commands @@ -36,7 +36,7 @@ medarc-eval winrate |---------|---------| | `medarc-eval ` | Run a single benchmark interactively | | `medarc-eval bench` | Run multiple benchmarks from a config file | -| `medarc-eval process` | Convert raw results to parquet for analysis | +| `medarc-eval process` | Convert eval outputs to parquet for analysis | | `medarc-eval winrate` | Compute model comparisons from processed data | ## Command Structure @@ -46,8 +46,8 @@ medarc-eval winrate medarc-eval medqa -m gpt-4.1-mini -n 50 # Subcommands: keyword comes first -medarc-eval bench --config configs/my-run.yaml -medarc-eval process --runs-dir runs/raw +medarc-eval bench --config configs/medmarks-verified.toml +medarc-eval process --runs-dir runs/evals medarc-eval winrate --processed-dir runs/processed ``` @@ -70,17 +70,17 @@ medarc-eval longhealth --help ### Batch Mode (`medarc-eval bench`) -**Best for:** Systematic evaluation across multiple models and benchmarks. +**Best for:** Systematic evaluation across TOML eval configs. 
```bash # Run all jobs defined in config -medarc-eval bench --config configs/job-gpt-oss-20b.yaml +medarc-eval bench --config configs/medmarks-verified.toml # Preview what would run without executing -medarc-eval bench --config configs/job-gpt-oss-20b.yaml --dry-run +medarc-eval bench --config configs/medmarks-verified.toml --dry-run # Force all jobs to use a specific API endpoint -medarc-eval bench --config configs/job-gpt-oss-20b.yaml --api-base-url http://127.0.0.1:8000/v1 +medarc-eval bench --config configs/medmarks-verified.toml --api-base-url http://127.0.0.1:8000/v1 --provider local ``` ### Processing Mode (`medarc-eval process`) @@ -88,11 +88,11 @@ medarc-eval bench --config configs/job-gpt-oss-20b.yaml --api-base-url http://12 **Best for:** Preparing results for analysis after batch runs complete. ```bash -# Process all completed runs -medarc-eval process +# Process current TOML bench outputs +medarc-eval process --runs-dir runs/evals # Process specific directory -medarc-eval process --runs-dir runs/raw --output-dir runs/processed +medarc-eval process --runs-dir runs/evals --output-dir runs/processed ``` ### Win Rate Mode (`medarc-eval winrate`) @@ -111,25 +111,27 @@ medarc-eval winrate --list-models ``` runs/ -├── raw/ # Raw benchmark outputs (from bench/single-run) -│ └── / -│ ├── run_manifest.json # Run metadata and job status -│ └── / # Per-job results -│ ├── results.jsonl -│ └── summary.json -├── processed/ # Analysis-ready parquet files (from process) -│ ├── env_index.json # Dataset inventory -│ └── /.parquet -└── winrate/ # Model comparison outputs (from winrate) - ├── latest.json - └── latest.csv +├── evals/ # Raw TOML bench outputs +│ └── / +│ └── / +│ └── / +│ ├── results.jsonl +│ └── metadata.json +└── processed/ # Analysis-ready parquet files (from process) + ├── env_index.json # Dataset inventory + ├── /.parquet + └── winrate/ # Model comparison outputs (from winrate) + ├── winrates-.json + ├── winrates-.csv + ├── latest.json + └── latest.csv 
``` ## Getting Help ```bash medarc-eval --help # General usage -medarc-eval bench --help # Batch mode options +medarc-eval bench --help # TOML bench options medarc-eval process --help # Processing options medarc-eval winrate --help # Win rate options medarc-eval medqa --help # Environment-specific options @@ -166,6 +168,6 @@ prime env install owner/environment-name@0.1.3 ## Detailed Documentation - [Single-Run Mode](medarc-eval-single-run.md) - Run individual benchmarks with custom options -- [Batch Mode](medarc-eval-bench.md) - Configure and run systematic evaluations +- [TOML Bench Mode](medarc-eval-bench.md) - Configure and run systematic evaluations - [Processing](medarc-eval-process.md) - Prepare results for analysis - [Win Rates](medarc-eval-winrate.md) - Compare models across benchmarks diff --git a/docs/medarc-orchestrate.md b/docs/medarc-orchestrate.md index 028dd1da..8a9accfd 100644 --- a/docs/medarc-orchestrate.md +++ b/docs/medarc-orchestrate.md @@ -39,7 +39,7 @@ Create a plan YAML listing the job configs you want to orchestrate: ```yaml name: local-vllm job_configs: - - configs/job-gpt-oss-20b.yaml + - configs/eval/local-qwen.toml env_file: .env gpu_range: "0-3" port_range: "8000-8999" @@ -49,41 +49,42 @@ resume: false rerun_failed: false ``` -Each job config must define exactly one model under `models:` and include a top-level -`orchestrate:` block with per-model serve settings. +Each job config should be an upstream `medarc-eval bench` TOML config with a top-level +`model` and a namespaced `[medarc.orchestrate]` table. The `env_file` is a dotenv file that is loaded for every Docker launch. If unset and a repo-level `.env` exists, it is used automatically. You can also override it via `--env-file`. -Optional: set `orchestrate.restart` to reuse completed jobs from a previous `medarc-eval` run (it is forwarded as -`medarc-eval bench --restart ...`). 
- Shared container config: -```yaml -orchestrate: - qwen-30b-a3b: - gpus: 2 - tensor_parallel_size: 2 - serve: - max_model_len: 40960 - vllm-container: - image: vllm/vllm-openai:latest - container_port: 8000 - volumes: - - /data/huggingface:/root/.cache/huggingface - ipc_mode: host - pyxis: - srun_extra_args: [] +```toml +model = "Qwen/Qwen3-30B-A3B" + +[[eval]] +env_id = "medqa" + +[medarc.orchestrate.qwen-30b-a3b] +gpus = 2 +tensor_parallel_size = 2 + +[medarc.orchestrate.qwen-30b-a3b.serve] +max_model_len = 40960 + +[medarc.orchestrate.vllm-container] +image = "vllm/vllm-openai:latest" +container_port = 8000 +volumes = ["/data/huggingface:/root/.cache/huggingface"] +ipc_mode = "host" + +[medarc.orchestrate.pyxis] +srun_extra_args = [] ``` Config notes: -- `orchestrate.vllm-container` is the preferred key. -- `orchestrate.vllm-docker` is still accepted as a deprecated alias. -- Do not set both keys in the same job config. +- `medarc.orchestrate.vllm-container` is required. - `ipc_mode` is Docker-only and is ignored in `--runtime pyxis`. -- `orchestrate.pyxis` is Pyxis-only and is ignored in `--runtime docker`. +- `medarc.orchestrate.pyxis` is Pyxis-only and is ignored in `--runtime docker`. - In Pyxis mode, Slurm allocates GPUs per `srun` step. The orchestrator only reserves localhost ports. ### CLI usage @@ -126,10 +127,18 @@ runtime: pyxis Artifacts are written under `outputs/orchestrator//`: - `summary.json` aggregates task states. -- per-task folders contain `run_manifest.json`, `serve/` logs, `bench/` outputs, and `result.json`. +- per-task folders contain orchestrator task state, `serve/` logs, `bench/` outputs, and `result.json`. ### Runtime behavior +For each task, the orchestrator launches vLLM, waits for readiness, then runs: + +```bash +medarc-eval bench --config <job-config.toml> --api-base-url <local-vllm-url> --provider local +``` + +The bench command exits naturally on completion; the orchestrator passes TOML bench flags only.
+ Docker mode: - The orchestrator reserves concrete local GPU IDs and host ports. diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md index d9f25cd2..41b8b005 100644 --- a/docs/medarc-verifiers-architecture.md +++ b/docs/medarc-verifiers-architecture.md @@ -4,264 +4,265 @@ This is a coding agents guide to `medarc_verifiers/`. ## What `medarc_verifiers` is -`medarc_verifiers` is the repository’s Python package that wraps and extends the upstream `verifiers` evaluation framework with: +`medarc_verifiers` wraps and extends the upstream `verifiers` evaluation +framework with: -- A unified CLI (`medarc-eval`) for running many medical benchmark environments consistently. -- Batch orchestration with durable run manifests (resume/restart/force). -- A processing pipeline that converts raw run artifacts into analysis-ready Parquet datasets. +- A unified CLI (`medarc-eval`) for medical benchmark environments. +- A TOML bench wrapper for sequential local benchmark runs with deterministic output paths. +- A processing pipeline that converts eval output artifacts into analysis-ready Parquet datasets. - HELM-style win rate computation across models from processed outputs. -- Shared building blocks used by environments (parsers, rewards, shuffling utilities, judge helpers). +- Shared environment utilities for parsers, rewards, shuffling, and judging. -At a high level, everything funnels into a three-stage workflow: +The current workflow is: -1. **Run** evals (single or batch) → `runs/raw//...` -2. **Process** raw outputs → `runs/processed//.parquet` + `env_index.json` -3. **Winrate** on processed outputs → `runs/processed/winrate/*.json` and `*.csv` +1. **Run** evals with single-run mode or TOML bench -> `runs/evals///...` +2. **Process** eval outputs -> `runs/processed//.parquet` plus `env_index.json` +3. 
**Winrate** on processed outputs -> `runs/processed/winrate/*.json` and `*.csv` -## Important side effects (auto-installed patches) +Historical YAML-runner outputs under `runs/raw//...` must be converted +with `scripts/convert_legacy_raw_runs.py` before `medarc-eval process` can read +them. The YAML benchmark runner itself has been removed. -Importing `medarc_verifiers` installs monkey patches into `verifiers` by default (`medarc_verifiers/__init__.py`): +## Import Side Effects -- **Judge cache namespacing**: cached judge responses are keyed by `base_url::model` so multi-judge runs don’t collide (`medarc_verifiers/judging/judge_cache_fix.py`). +Importing `medarc_verifiers` installs monkey patches into `verifiers` by default +(`medarc_verifiers/__init__.py`): -`token_usage` is now produced by upstream `verifiers` output serialization and is flattened into explicit columns during `medarc-eval process`. +- **Judge cache namespacing**: cached judge responses are keyed by + `base_url::model` so multi-judge runs do not collide + (`medarc_verifiers/judging/judge_cache_fix.py`). -## `medarc-eval` CLI: modes and code layout +`token_usage` is produced by upstream `verifiers` output serialization and is +flattened into explicit columns during `medarc-eval process`. + +## `medarc-eval` CLI Entry point and router: `medarc_verifiers/cli/main.py`. It supports: - **Single-run mode**: `medarc-eval ...` - - Special rule: the environment name must be the first token. + - The environment name must be the first token. - Implemented in `medarc_verifiers/cli/_single_run.py`. -- **Batch mode**: `medarc-eval bench --config ` - - Loads config, expands job matrix, creates/updates a run manifest, then executes jobs. 
- - Implemented across: - - Config loading + matrix expansion: `medarc_verifiers/cli/_config_loader.py` - - Schemas: `medarc_verifiers/cli/_schemas.py` - - Job expansion: `medarc_verifiers/cli/_job_builder.py` - - Manifest creation + conflict detection: `medarc_verifiers/cli/_manifest.py` - - Resume/restart planning: `medarc_verifiers/cli/_manifest_planner.py` - - Execution loop: `medarc_verifiers/cli/_job_executor.py` +- **TOML bench mode**: `medarc-eval bench --config ` + - Loads upstream `verifiers` TOML eval configs, expands ablations, plans + deterministic output directories from selected raw configs, then runs evals + sequentially through upstream execution. + - Missing selected local environment packages are auto-installed by default + from `--env-dir` (default `environments`) in isolated system temporary + venvs with a `medarc-bench-venv-` prefix. Importable envs stay on the + in-process path. `--no-auto-install` requires selected envs to already be + importable. + - Main implementation: `medarc_verifiers/cli/main.py` + - Isolated auto-install helper: `medarc_verifiers/cli/isolated_env.py` + - Isolated child runner: `medarc_verifiers/cli/bench_child.py` + - Upstream eval boundary: `medarc_verifiers/cli/upstream_eval.py` + - Deterministic identity/path helpers: `medarc_verifiers/cli/eval_identity.py` - **Processing**: `medarc-eval process ...` - Pipeline wiring: `medarc_verifiers/cli/process/pipeline.py` - **Win rates**: `medarc-eval winrate ...` - - Runner that reads processed datasets and writes results: `medarc_verifiers/cli/winrate/runner.py` - - Core computations live in `medarc_verifiers/cli/winrate/api.py`. + - Runner: `medarc_verifiers/cli/winrate/runner.py` + - Core math: `medarc_verifiers/cli/winrate/api.py` -Shared CLI constants (paths, command strings): `medarc_verifiers/cli/_constants.py`. +Shared CLI constants live in `medarc_verifiers/cli/_constants.py`. 
-### How single-run “dynamic env flags” works +## Dynamic Env Flags -Single-run mode introspects each environment’s `load_environment()` signature (and docstring) to generate argparse flags on the fly: +Single-run mode introspects each environment's `load_environment()` signature +and docstring to generate argparse flags dynamically: -- Introspection + validation: `medarc_verifiers/cli/utils/env_args.py` +- Introspection and validation: `medarc_verifiers/cli/utils/env_args.py` -That’s why `medarc-eval longhealth --help` shows environment-specific flags even though they aren’t hardcoded. For anything too complex for flags, both single/batch support: +That is why `medarc-eval longhealth --help` shows environment-specific flags +even though they are not hardcoded. For anything too complex for flags, +single-run and TOML bench both support: - `--env-args '{...json...}'` - `--env-arg key=value` (repeatable; smart type coercion) -Override parsing helper: `medarc_verifiers/cli/utils/overrides.py`. - -## Config + override semantics (batch mode) - -Batch configs (YAML) validate into pydantic models in `medarc_verifiers/cli/_schemas.py`. After validation: +Override parsing lives in `medarc_verifiers/cli/utils/overrides.py`. -- Environment matrices expand into multiple env variants (IDs can be formatted) in `medarc_verifiers/cli/_config_loader.py`. -- Jobs expand into concrete “model × env variant” runs in `medarc_verifiers/cli/_job_builder.py`. +## TOML Bench Config Semantics -### `env_args` precedence +Bench configs use upstream `verifiers` TOML shape: top-level defaults plus one +or more `[[eval]]` entries. Upstream `[[ablation]]` tables expand into repeated +eval configs. MedARC adds deterministic paths around selected raw eval configs +before importing env packages. Duplicate `(model, env)` outputs must use +explicit `variant_id` or `name` identity; the reserved default variant id is +`base`. -`env_args` are merged in layers. 
Think “low → high priority”: +`env_args` precedence is low to high: -1. Environment config `env.env_args` (from `configs/envs/*.yaml`) -2. Model config `model.env_args` -3. Model env-specific override `model.env_overrides[...]` (lookup tries: env id → matrix base id → module) -4. Job-level overrides `job.env_args` -5. CLI overrides (`--env-args` / `--env-arg`) applied later when building `EvalConfig` +1. Environment package `[tool.verifiers.eval]` defaults, when discoverable +2. TOML top-level defaults +3. Per-`[[eval]]` values +4. Expanded `[[ablation]]` values +5. CLI overrides (`--env-args` / `--env-arg`) -The merge is handled by `medarc_verifiers/cli/utils/env_args.py` (with optional metadata validation). +Environment package `[tool.verifiers.eval]` defaults are execution-time +defaults. They do not affect deterministic path planning or dry-run display, +because bench plans from TOML and CLI values before importing env packages. -### `sampling_args` precedence and sanitation +`sampling_args` follow the same TOML -> eval -> ablation -> CLI override model, +then are sanitized once for the resolved Verifiers client type: -`sampling_args` merge from model → job → CLI, and are then sanitized for OpenAI-compatible clients: - -- Unknown parameters are moved under `extra_body` so they can be forwarded to compatible servers (e.g., vLLM). +- Unknown parameters move under `extra_body` for compatible servers such as vLLM. +- OpenAI Chat Completions keeps `reasoning_effort` as a top-level request field. +- OpenAI Responses maps `reasoning_effort` to `reasoning = {"effort": ...}`. +- Anthropic Messages uses adaptive thinking only: + `thinking = {"type": "adaptive"}` plus + `output_config = {"effort": ...}`. Manual `budget_tokens` thinking configs + are rejected before execution. 
- Sanitizer: `medarc_verifiers/utils/sampling_args.py` -- Merge point: `medarc_verifiers/cli/_eval_builder.py` +- Import boundary: `medarc_verifiers/cli/upstream_eval.py` +- Temporary merge/adaptation adapter behind that boundary: + `medarc_verifiers/cli/verifiers_adapter.py` + +The old YAML `models`, `envs`, `jobs`, matrix expansion, job builder, and +manifest planner modules have been deleted. -## Endpoints and Prime Inference integration +## Endpoints and Prime Inference There are two related concepts: -1. **Endpoint registry** (optional): resolves a model alias to an endpoint URL and key env var. - - Loader + cache: `medarc_verifiers/cli/utils/endpoint_utils.py` - - CLI default path: `configs/endpoints.toml` (TOML-first, aligned with upstream verifiers) - - Legacy Python registries remain usable via explicit `--endpoints-path configs/endpoints.py`. +1. **Endpoint registry**: optional aliases for endpoint URL and key env var. + - Loader and cache: `medarc_verifiers/cli/utils/endpoint_utils.py` + - CLI default path: `configs/endpoints.toml` 2. **Prime Inference overrides**: - - Adds `X-Prime-Team-ID` header (if `PRIME_TEAM_ID` is set and base URL is Prime Inference). - - Optionally injects `extra_body.usage.include = true` for usage reporting. + - Adds `X-Prime-Team-ID` from `PRIME_TEAM_ID`. - Selects `PRIME_API_KEY` when available for Prime Inference endpoints. + - Enables usage reporting unless disabled by `MEDARC_INCLUDE_USAGE=false`. 
- Implementation: `medarc_verifiers/utils/prime_inference.py` Relevant env vars: -- `OPENAI_API_KEY` (default model key var) -- `PRIME_API_KEY`, `PRIME_TEAM_ID` (Prime Inference) -- `MEDARC_INCLUDE_USAGE` (force usage reporting true/false globally) +- `OPENAI_API_KEY` +- `PRIME_API_KEY`, `PRIME_TEAM_ID` +- `MEDARC_INCLUDE_USAGE` -Programmatic usage (build headers/sampling overrides for a base URL): +## Resume and Deterministic Paths -```python -from medarc_verifiers.utils.prime_inference import prime_inference_overrides +TOML bench writes eval outputs under deterministic directories: -headers, sampling_overrides, api_key_var = prime_inference_overrides(base_url) -``` +- Non-variant evals: `runs/evals/<model>/<env>/base/` +- Variant evals: `runs/evals/<model>/<env>/<variant_id>/` -### Judge defaults and judge API keys +If neither `--output-dir` nor TOML `output_dir` is set, the output root +defaults to `runs/evals`. Existing valid outputs resume automatically: bench +passes the deterministic target as upstream `EvalConfig.resume_path` and trusts +upstream resume validation. Partial or malformed existing targets fail unless +`--force` archives the existing target and reruns. -Judging defaults are centralized and provider-tuned: +For missing local envs, auto-install creates a temporary venv, mirrors the +current `medarc-verifiers` install into that venv, installs the target env +package, and only then prepares or archives the deterministic output directory. +Editable MedARC installs mirror the same checkout from package metadata. +Non-editable installs use `medarc-verifiers==<version>` and require +that distribution to be resolvable. -- `medarc_verifiers/utils/judge_helpers.py` +`medarc-eval bench` does not monkey-patch upstream metadata saving and does not +write MedARC identity into upstream `metadata.json`. Variant identity is the +deterministic path segment, so `variant_id` / `name` values must already be +path-safe. -Key env vars: +Historical raw-run manifest schemas are not part of the runtime package.
Use +`scripts/convert_legacy_raw_runs.py` as a one-off migration helper for old +`runs/raw` artifacts. -- `JUDGE_API_KEY` (preferred for judge calls) -- fallback to `PRIME_API_KEY` (if judging via Prime Inference) or `OPENAI_API_KEY`. +## Eval Outputs -## Resume, restart, and manifests (batch mode) +TOML bench outputs include: -Batch mode writes `runs/raw//run_manifest.json` (manifest v3). +- `results.jsonl`: per-example rollouts +- `metadata.json`: eval configuration and metrics snapshot -- Manifest schema + update methods: `medarc_verifiers/cli/_manifest.py` -- Planning which jobs to run vs reuse: `medarc_verifiers/cli/_manifest_planner.py` +The runner executes via `verifiers.utils.eval_utils.run_evaluation()` from +single-run mode and the TOML bench code in `medarc_verifiers/cli/main.py`. -Important concepts: +## Processing Pipeline -- A **job** is a resolved combination of model + environment variant + args (plus sampling args). -- Auto-resume tries to find the newest run matching the config checksum and skip completed jobs. -- Restart can “seed” a new run from an old run, reusing outputs when job signatures match. -- Conflict detection is conservative for most fields, but treats some model fields as “resume tolerant” (e.g., base URLs/timeouts) so you can move between providers without being blocked. +Docs: `docs/medarc-eval-process.md`. -## Raw outputs (what eval produces) +Entry point: `medarc_verifiers/cli/process/pipeline.py`. -Raw outputs are expected under `runs/raw///` and include: +Processing: -- `results.jsonl`: per-example rollouts -- `summary.json`: aggregated job metrics -- `metadata.json`: job configuration snapshot (env/model/sampling args, etc.) +1. Discovers eval outputs from `runs/evals` by scanning directories containing + `metadata.json` and `results.jsonl`. +2. Normalizes identity from upstream `metadata.json` and deterministic paths. +3. Loads rows from `results.jsonl`, drops large prompt/completion fields, and + flattens `token_usage`. +4. 
Aggregates rows per model and environment, preserving variant ids. +5. Writes Parquet files plus `env_index.json` and `dataset_infos.json`. -The runner executes via `verifiers.utils.eval_utils.run_evaluation()` (called from `medarc_verifiers/cli/_single_run.py` and `medarc_verifiers/cli/_job_executor.py`). +Important modules: -## Processing pipeline (raw → parquet) - -Docs: `docs/medarc-eval-process.md`. +- Discovery: `medarc_verifiers/cli/process/discovery.py` +- Metadata normalization: `medarc_verifiers/cli/process/metadata.py` +- Row loading: `medarc_verifiers/cli/process/rows.py` +- Aggregation: `medarc_verifiers/cli/process/aggregate.py` +- Writing/indexing: `medarc_verifiers/cli/process/writer.py`, + `medarc_verifiers/cli/process/env_index.py` -Entry point: `medarc_verifiers/cli/process/pipeline.py` (via `run_process()`). - -### What processing does - -1. **Discover** job outputs from `runs/raw` by reading run manifests: - - `medarc_verifiers/cli/process/discovery.py` -2. **Normalize metadata** by merging manifest fields with `metadata.json`: - - `medarc_verifiers/cli/process/metadata.py` -3. **Handle rollouts**: - - MedARC sometimes “fakes” multiple rollouts by running the same base environment multiple times with different settings (e.g., different seeds). - - These fake rollouts are identified by a rollout suffix in the **manifest env id** like `env-a-rollout7` or `env-a-r7` (fallback: parse the results directory name). - - This suffix-derived rollout index is only used when rollouts are faked this way. Native verifiers rollouts (below) use the per-row JSONL field. - - `medarc_verifiers/cli/process/rollout.py` -4. **Load rows from `results.jsonl`**: - - Always drops large fields (`prompt`, `completion`). - - Allows selecting extra per-env columns into a JSON-encoded `extras` column. - - If the JSONL provides a per-row `rollout_index` (native verifiers multi-rollout runs), it is treated as authoritative and preserved. 
- - If `rollout_index` is missing but the JSONL contains multiple rows per `example_id`, computes a data-driven `rollout_index` based on occurrence count. - - Flattens `token_usage` into explicit columns like `model_token_total`, `judge_cost`, etc. - - `medarc_verifiers/cli/process/rows.py` -5. **Aggregate** rows per `(model_id, base_env_id)` and union schemas: - - `medarc_verifiers/cli/process/aggregate.py` - - When aggregating fake rollouts (manifest env ids include rollout suffixes), ensures every row has a `rollout_index` (derived from the suffix if missing) and normalizes indices to `0..K-1` within the dataset. - - When aggregating native verifiers rollouts (no rollout suffixes), preserves `rollout_index` values as provided by `results.jsonl` (no normalization). -6. **Write Parquet**: - - Output path is `//.parquet`. - - Output columns are restricted to a fixed allowlist schema for downstream compatibility. - - Adds exporter metadata under a Parquet schema metadata key. - - Writes `env_index.json` (v2) and `dataset_infos.json` for HF datasets UX. - - `medarc_verifiers/cli/process/writer.py`, `medarc_verifiers/cli/process/env_index.py` - -### Delta processing and HF baselines - -Processing can use `env_index.json` to do incremental updates (delta processing). It also supports pulling/pushing processed artifacts to/from Hugging Face: - -- HF baseline management (download/copy policies): `medarc_verifiers/cli/process/workspace.py` -- HF sync operations: `medarc_verifiers/cli/hf/sync.py` - -## Win rates (processed parquet → comparisons) +## Win Rates Docs: `docs/medarc-eval-winrate.md`. -`medarc-eval winrate` reads dataset inventory from `env_index.json`, averages rollouts per `(example_id, model_id)`, then computes pairwise model comparisons. +`medarc-eval winrate` reads dataset inventory from `env_index.json`, averages +rollouts per `(example_id, model_id)`, and computes pairwise model comparisons. 
-- Dataset discovery via `env_index.json`: `medarc_verifiers/cli/winrate/runner.py` -- Core math + weighting policies: `medarc_verifiers/cli/winrate/api.py` -- Outputs: - - timestamped `winrates-.json` and `.csv` - - `latest.json` and `latest.csv` - - JSON shape is model-centric: top-level `models` and `datasets` - - CSV contains aggregate winrates plus per-dataset average rewards, not pairwise `vs_*` columns +- Dataset discovery: `medarc_verifiers/cli/winrate/runner.py` +- Core math and weighting policies: `medarc_verifiers/cli/winrate/api.py` +- Outputs: timestamped `winrates-.json` / `.csv` plus + `latest.json` / `latest.csv` -## Shared building blocks used by environments +## Environment Utilities -These utilities are frequently imported by environment packages under `environments/*`: +Frequently imported utilities under `environments/*`: - Prompts and answer format constants: `medarc_verifiers/prompts.py` -- Parsers: - - XML parser (supports raw string or chat messages): `medarc_verifiers/parsers/xml_parser.py` - - JSON parser (field alternatives, optional pydantic schema validation, “format reward”): `medarc_verifiers/parsers/json_parser.py` -- Rewards: - - Robust MCQ grading with CoT/anchored patterns + answer-text fallback: `medarc_verifiers/rewards/multiple_choice_accuracy.py` - - Normalize judge dimension scores (1–5 → 0–1): `medarc_verifiers/rewards/normalize_helm_reward.py` -- MCQ shuffling with deterministic seeding and “anchor option” preservation: - - Skips shuffling entirely if options reference other labels (“A or B”, “Both A and C”), to avoid corrupting the question. 
- - `medarc_verifiers/utils/randomize_multiple_choice.py` +- XML parser: `medarc_verifiers/parsers/xml_parser.py` +- JSON parser: `medarc_verifiers/parsers/json_parser.py` +- MCQ grading: `medarc_verifiers/rewards/multiple_choice_accuracy.py` +- HELM reward normalization: `medarc_verifiers/rewards/normalize_helm_reward.py` +- Deterministic MCQ shuffling: `medarc_verifiers/utils/randomize_multiple_choice.py` +- Judge helpers: `medarc_verifiers/utils/judge_helpers.py` -## Judging and multi-judge support +## Judging and Multi-Judge Support -Some environments use “LLM-as-judge” scoring. `medarc_verifiers` provides: +Some environments use LLM-as-judge scoring. `medarc_verifiers` provides: -- A safer judge call wrapper with clearer errors: `medarc_verifiers/judging/judge_core.py` -- A `MultiJudge` that runs multiple judge models concurrently: `medarc_verifiers/judging/multi_judge.py` -- A `verifiers`-compatible rubric wrapper: `medarc_verifiers/judging/multi_judge_rubric.py` +- Judge call wrapper: `medarc_verifiers/judging/judge_core.py` +- Multi-judge runner: `medarc_verifiers/judging/multi_judge.py` +- Verifiers-compatible rubric wrapper: `medarc_verifiers/judging/multi_judge_rubric.py` -## vLLM orchestrator (local Docker) – separate CLI +## vLLM Orchestrator Docs: `docs/medarc-orchestrate.md`. -This is a separate tool (`medarc-orchestrate`) for running batch configs against locally hosted vLLM containers with GPU/port scheduling across Docker or Slurm+Pyxis runtimes. +`medarc-orchestrate` runs TOML bench configs against locally hosted vLLM +containers with GPU/port scheduling across Docker or Slurm+Pyxis runtimes. - CLI entry: `medarc_verifiers/orchestrate/cli.py` - Runtime loop: `medarc_verifiers/orchestrate/run.py` -It essentially: +It: -1. Launches vLLM containers -2. Waits for readiness -3. Runs `uv run medarc-eval bench --config ... --api-base-url ` -4. Tracks orchestration state under `outputs/orchestrator//` +1. Launches vLLM containers. +2. 
Waits for readiness. +3. Runs `uv run medarc-eval bench --config --api-base-url --provider local`. +4. Tracks orchestration state under `outputs/orchestrator//`. -## Where to change things (quick mental index) +## Where To Change Things -- Add/adjust CLI flags or command behavior: +- CLI flags or routing: - `medarc_verifiers/cli/main.py`, `medarc_verifiers/cli/_single_run.py` -- Change config semantics (matrix, normalization, validation): - - `medarc_verifiers/cli/_config_loader.py`, `medarc_verifiers/cli/_schemas.py` -- Fix resume/restart quirks: - - `medarc_verifiers/cli/_manifest.py`, `medarc_verifiers/cli/_manifest_planner.py` -- Add new columns or modify processed dataset schema: - - extraction: `medarc_verifiers/cli/process/rows.py` - - allowed columns/output schema: `medarc_verifiers/cli/process/writer.py` -- Change winrate math/output: +- TOML bench behavior, deterministic paths, or bench sidecar identity: + - `medarc_verifiers/cli/main.py`, `medarc_verifiers/cli/eval_identity.py`, + `medarc_verifiers/cli/upstream_eval.py`, `medarc_verifiers/cli/verifiers_adapter.py` +- Processed dataset schema: + - `medarc_verifiers/cli/process/rows.py`, `medarc_verifiers/cli/process/writer.py` +- Winrate math/output: - `medarc_verifiers/cli/winrate/api.py`, `medarc_verifiers/cli/winrate/runner.py` -- Adjust judging defaults/provider behaviors: +- Judging/provider behavior: - `medarc_verifiers/utils/judge_helpers.py`, `medarc_verifiers/utils/prime_inference.py` diff --git a/environments/aci_bench/aci_bench/aci_bench.py b/environments/aci_bench/aci_bench/aci_bench.py index 4f7346db..842e8f66 100644 --- a/environments/aci_bench/aci_bench/aci_bench.py +++ b/environments/aci_bench/aci_bench/aci_bench.py @@ -29,11 +29,11 @@ def _to_vf_format(dataset: Dataset) -> Dataset: lambda row: { "question": prompt.format(conversation=row["dialogue"]), "answer": row["note"], - "task": "aci-bench", "info": { "conversation": row["dialogue"], "reference_response": row["note"], 
"transcript_version": row["transcript_version"], + "aci_bench_task": "aci-bench", }, } ) diff --git a/environments/aci_bench/pyproject.toml b/environments/aci_bench/pyproject.toml index 37a7afb2..9bc9bd4b 100644 --- a/environments/aci_bench/pyproject.toml +++ b/environments/aci_bench/pyproject.toml @@ -6,7 +6,7 @@ tags = ["medical", "clinical", "dialogue", "summarization", "llm-judge", "single version = "0.1.0" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.5.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] diff --git a/environments/agentclinic/agentclinic/agentclinic.py b/environments/agentclinic/agentclinic/agentclinic.py index ca4a4a6a..e9899327 100644 --- a/environments/agentclinic/agentclinic/agentclinic.py +++ b/environments/agentclinic/agentclinic/agentclinic.py @@ -423,12 +423,12 @@ def load_environment( "reference_response": scenario.diagnosis_information(), "case_id": i, "dataset_type": dataset_type, + "agentclinic_task": f"agentclinic-{dataset_type}", } records.append( { "question": question, "answer": scenario.diagnosis_information(), - "task": f"agentclinic-{dataset_type}", "info": info, } ) diff --git a/environments/agentclinic/pyproject.toml b/environments/agentclinic/pyproject.toml index 742b5244..d97e1ebb 100644 --- a/environments/agentclinic/pyproject.toml +++ b/environments/agentclinic/pyproject.toml @@ -8,7 +8,7 @@ version = "0.1.0" requires-python = ">=3.11" dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers", + "verifiers>=0.1.12,<0.2", "datasets", ] diff --git a/environments/careqa/careqa.py b/environments/careqa/careqa.py index fe5c833c..d2488ebb 100644 --- a/environments/careqa/careqa.py +++ b/environments/careqa/careqa.py @@ -230,10 +230,10 @@ def _load_open_ended_environment( def _map(ex): info = {} info["question"] = ex["question"].strip() + info["careqa_task"] = "careqa_open" return { "question": ex["question"].strip(), "answer": ex.get("answer_explanation", ex.get("answer", "")), - "task": 
"careqa_open", "info": info, } diff --git a/environments/careqa/pyproject.toml b/environments/careqa/pyproject.toml index eae6dfa2..8b1a793e 100644 --- a/environments/careqa/pyproject.toml +++ b/environments/careqa/pyproject.toml @@ -5,7 +5,7 @@ tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn", "open-ende version = "0.1.1" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "datasets>=2.13.0", "medarc-verifiers>=0.1.0", ] diff --git a/environments/head_qa/pyproject.toml b/environments/head_qa/pyproject.toml index c7fbf46b..2645e2b7 100644 --- a/environments/head_qa/pyproject.toml +++ b/environments/head_qa/pyproject.toml @@ -10,7 +10,7 @@ authors = [ ] dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.6.post0", + "verifiers>=0.1.12,<0.2", ] [build-system] diff --git a/environments/head_qa_v2/pyproject.toml b/environments/head_qa_v2/pyproject.toml index c35a3d4f..e42339c5 100644 --- a/environments/head_qa_v2/pyproject.toml +++ b/environments/head_qa_v2/pyproject.toml @@ -10,7 +10,7 @@ authors = [ ] dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.5.post0", + "verifiers>=0.1.12,<0.2", ] [build-system] diff --git a/environments/healthbench/pyproject.toml b/environments/healthbench/pyproject.toml index d2a35374..c968a509 100644 --- a/environments/healthbench/pyproject.toml +++ b/environments/healthbench/pyproject.toml @@ -6,7 +6,7 @@ version = "0.1.0" requires-python = ">=3.11" dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "datasets>=4.1.1", "openai>=2.1.0", ] @@ -20,3 +20,7 @@ dev = [ [build-system] requires = ["hatchling"] build-backend = "hatchling.build" + +[tool.verifiers.eval] +num_examples = -1 +rollouts_per_example = 1 diff --git a/environments/longhealth/README.md b/environments/longhealth/README.md index 96095872..d033659f 100644 --- a/environments/longhealth/README.md +++ b/environments/longhealth/README.md @@ -63,7 +63,7 @@ 
medarc-eval longhealth -m "openai/gpt-5-mini" -n 10 -s --task all --doc-shuffle- | Metric | Meaning | | ------ | ------- | | `reward` | Exact match accuracy (1.0 if correct letter, 0.0 otherwise) | -| `info.task` | Which sub-task: `task1`, `task2_negation`, or `task2_identification` | +| `info.longhealth_task` | Which sub-task: `task1`, `task2_negation`, or `task2_identification` | | `info.has_answer_docs` | Whether answer-containing documents were included | | `info.num_docs` | Number of documents in the context | diff --git a/environments/longhealth/longhealth.py b/environments/longhealth/longhealth.py index 173b9bc6..ab188896 100644 --- a/environments/longhealth/longhealth.py +++ b/environments/longhealth/longhealth.py @@ -276,7 +276,7 @@ def _prepare_task1_data( info = { "patient_id": patient_id, "question_no": question.get("No"), - "task": "task1", + "longhealth_task": "task1", "correct_answer_text": correct_answer, "num_docs": len(selected_docs), "has_answer_docs": len(answer_docs) > 0, @@ -401,7 +401,7 @@ def _prepare_task2_data( info_neg = { "patient_id": patient_id, "question_no": question.get("No"), - "task": "task2_negation", + "longhealth_task": "task2_negation", "correct_answer_text": "Question cannot be answered with provided documents", "num_docs": len(selected_docs_neg), "has_answer_docs": False, @@ -444,7 +444,7 @@ def _prepare_task2_data( info_ident = { "patient_id": patient_id, "question_no": question.get("No"), - "task": "task2_identification", + "longhealth_task": "task2_identification", "correct_answer_text": correct_answer, "num_docs": len(selected_docs_ident), "has_answer_docs": True, diff --git a/environments/longhealth/pyproject.toml b/environments/longhealth/pyproject.toml index 01cd8e0a..1e7ae6f0 100644 --- a/environments/longhealth/pyproject.toml +++ b/environments/longhealth/pyproject.toml @@ -8,7 +8,7 @@ authors = [ { name = "Shamus Sim Zi Yang", email = "shamus.sim@monash.edu" }, ] dependencies = [ - "verifiers>=0.1.5.post0", + 
"verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] diff --git a/environments/m_arc/m_arc.py b/environments/m_arc/m_arc.py index 595bf01e..43064afb 100644 --- a/environments/m_arc/m_arc.py +++ b/environments/m_arc/m_arc.py @@ -118,6 +118,7 @@ def _format_row(row: dict, idx: int) -> dict: # question and answer have been moved to top-level, so remove them here info = dict(row) + info.pop("task", None) # update shuffled answer choices in the info dict if shuffle_answers: diff --git a/environments/m_arc/pyproject.toml b/environments/m_arc/pyproject.toml index b69e2337..37d15b69 100644 --- a/environments/m_arc/pyproject.toml +++ b/environments/m_arc/pyproject.toml @@ -5,7 +5,7 @@ tags = ["medical", "clinical", "single-turn", "multiple-choice", "evaluation", " version = "0.1.1" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.3.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] @@ -21,3 +21,6 @@ build-backend = "hatchling.build" [tool.hatch.build] include = ["m_arc.py"] +[tool.verifiers.eval] +num_examples = -1 +rollouts_per_example = 1 diff --git a/environments/med_dialog/med_dialog/med_dialog.py b/environments/med_dialog/med_dialog/med_dialog.py index 23cab690..a805dccf 100644 --- a/environments/med_dialog/med_dialog/med_dialog.py +++ b/environments/med_dialog/med_dialog/med_dialog.py @@ -56,6 +56,7 @@ def _format_row(row: dict[str, Any], *, subset: str = subset) -> dict[str, Any]: response = str(row.get("tgt", "")) info = dict(row) + info.pop("task", None) info["conversation"] = prompt info["reference_response"] = response info["subset"] = subset diff --git a/environments/med_dialog/pyproject.toml b/environments/med_dialog/pyproject.toml index 92389a26..f141f505 100644 --- a/environments/med_dialog/pyproject.toml +++ b/environments/med_dialog/pyproject.toml @@ -5,7 +5,7 @@ tags = ["medical", "dialogue", "summarization", "single-turn", "llm-judge"] version = "0.1.0" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.4", + 
"verifiers>=0.1.12,<0.2", "datasets>=4.1.1", "openai>=2.1.0", "medarc-verifiers>=0.1.0", diff --git a/environments/med_halt/pyproject.toml b/environments/med_halt/pyproject.toml index c5ac0939..82f9f372 100644 --- a/environments/med_halt/pyproject.toml +++ b/environments/med_halt/pyproject.toml @@ -6,7 +6,7 @@ version = "0.1.1" requires-python = ">=3.11" dependencies = [ "datasets>=4.0.0", - "verifiers>=0.1.2.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] diff --git a/environments/med_mcqa/pyproject.toml b/environments/med_mcqa/pyproject.toml index ebd2998d..016aab20 100644 --- a/environments/med_mcqa/pyproject.toml +++ b/environments/med_mcqa/pyproject.toml @@ -9,7 +9,7 @@ authors = [ { name = "Ratna Sagari Grandhi", email = "sagari.grandhi@gmail.com" }, ] dependencies = [ - "verifiers>=0.1.5.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] @@ -21,4 +21,8 @@ build-backend = "hatchling.build" [tool.prime.environment] loader = "med_mcqa:load_environment" display_name = "MedMCQA" -visibility = "PUBLIC" \ No newline at end of file +visibility = "PUBLIC" + +[tool.verifiers.eval] +num_examples = -1 +rollouts_per_example = 1 diff --git a/environments/medagentbench/pyproject.toml b/environments/medagentbench/pyproject.toml index 766e07e8..2a0cfaad 100644 --- a/environments/medagentbench/pyproject.toml +++ b/environments/medagentbench/pyproject.toml @@ -6,7 +6,7 @@ version = "0.1.2" dependencies = [ "medarc-verifiers>=0.1.0", "datasets", - "verifiers>=0.1.2.post1", + "verifiers>=0.1.12,<0.2", ] [build-system] diff --git a/environments/medagentbenchv2/medagentbenchv2/env.py b/environments/medagentbenchv2/medagentbenchv2/env.py index a181f1d9..cd26d6e5 100644 --- a/environments/medagentbenchv2/medagentbenchv2/env.py +++ b/environments/medagentbenchv2/medagentbenchv2/env.py @@ -362,12 +362,14 @@ def _map(task: dict[str, Any]) -> dict[str, Any]: "content": _build_user_message(task["instruction"], task.get("context")), }, ] + info = dict(task) 
+ info.pop("task", None) + info["medagentbench_task"] = "medagentbenchv2" return { "id": task["id"], "prompt": prompt, - "info": dict(task), + "info": info, "answer": "", - "task": "medagentbenchv2", } eval_dataset = Dataset.from_list([_map(task) for task in tasks]) diff --git a/environments/medagentbenchv2/pyproject.toml b/environments/medagentbenchv2/pyproject.toml index 92f59484..a135be24 100644 --- a/environments/medagentbenchv2/pyproject.toml +++ b/environments/medagentbenchv2/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.11,<3.13" dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.10", + "verifiers>=0.1.12,<0.2", "datasets", "requests", "pydantic", diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py index f0d3431e..fb183b3d 100644 --- a/environments/medbullets/medbullets.py +++ b/environments/medbullets/medbullets.py @@ -60,6 +60,7 @@ def _format_row(row: dict) -> dict: # question and answer have been moved to top-level, so remove them here info = dict(row) + info.pop("task", None) # update shuffled answer choices in the info dict if shuffle_answers: diff --git a/environments/medbullets/pyproject.toml b/environments/medbullets/pyproject.toml index bca2a05b..488d0d91 100644 --- a/environments/medbullets/pyproject.toml +++ b/environments/medbullets/pyproject.toml @@ -5,7 +5,7 @@ tags = ["medical", "clinical", "single-turn", "multiple-choice", "usmle", "train version = "0.1.1" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.3.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] diff --git a/environments/medcalc_bench/medcalc_bench/medcalc_bench.py b/environments/medcalc_bench/medcalc_bench/medcalc_bench.py index b4f0090d..2ecbab62 100644 --- a/environments/medcalc_bench/medcalc_bench/medcalc_bench.py +++ b/environments/medcalc_bench/medcalc_bench/medcalc_bench.py @@ -363,12 +363,12 @@ def _map(row: dict): answer_format=answer_format, ), "answer": row["Ground 
Truth Answer"], - "task": "medcalc_bench", "info": { "calc_id": row["Calculator ID"], "ground_truth": row["Ground Truth Answer"], "lower_bound": row["Lower Limit"], "upper_bound": row["Upper Limit"], + "medcalc_task": "medcalc_bench", }, } diff --git a/environments/medcalc_bench/pyproject.toml b/environments/medcalc_bench/pyproject.toml index 4e37b302..79e45490 100644 --- a/environments/medcalc_bench/pyproject.toml +++ b/environments/medcalc_bench/pyproject.toml @@ -5,7 +5,7 @@ description = "MedCalc-Bench clinical calculator evaluation" readme = "README.md" requires-python = ">=3.11,<3.13" dependencies = [ - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "datasets>=4.0.0", "numpy>=1.26.0", "simpleeval>=0.9.10", diff --git a/environments/medcasereasoning/medcasereasoning.py b/environments/medcasereasoning/medcasereasoning.py index 87afc915..e386a0e1 100644 --- a/environments/medcasereasoning/medcasereasoning.py +++ b/environments/medcasereasoning/medcasereasoning.py @@ -59,8 +59,7 @@ def load_environment( lambda x: { "question": QUESTION_TEMPLATE.format(question=x["case_prompt"]), "answer": x["final_diagnosis"], - "task": "medcasereasoning", - "info": {"case_prompt": x["case_prompt"]}, + "info": {"case_prompt": x["case_prompt"], "medcasereasoning_task": "medcasereasoning"}, } ) @@ -68,8 +67,7 @@ def load_environment( lambda x: { "question": QUESTION_TEMPLATE.format(question=x["case_prompt"]), "answer": x["final_diagnosis"], - "task": "medcasereasoning", - "info": {"case_prompt": x["case_prompt"]}, + "info": {"case_prompt": x["case_prompt"], "medcasereasoning_task": "medcasereasoning"}, } ) diff --git a/environments/medcasereasoning/pyproject.toml b/environments/medcasereasoning/pyproject.toml index 2d37215a..3cd81193 100644 --- a/environments/medcasereasoning/pyproject.toml +++ b/environments/medcasereasoning/pyproject.toml @@ -5,7 +5,7 @@ description = "MedCaseReasoning medical diagnosis evaluation" tags = ["medical", "reasoning", "single-turn", "llm-judge", 
"diagnosis"] dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.2.post0", + "verifiers>=0.1.12,<0.2", "datasets", "openai", ] diff --git a/environments/medconceptsqa/pyproject.toml b/environments/medconceptsqa/pyproject.toml index 752878e8..773b22d8 100644 --- a/environments/medconceptsqa/pyproject.toml +++ b/environments/medconceptsqa/pyproject.toml @@ -8,7 +8,7 @@ authors = [ { name = "Anish Mahishi", email = "anish.mahishi@gmail.com" }, ] dependencies = [ - "verifiers>=0.1.5.post0", + "verifiers>=0.1.12,<0.2", "datasets>=4.1.1", "medarc-verifiers>=0.1.0", ] diff --git a/environments/medec/pyproject.toml b/environments/medec/pyproject.toml index 5931bf0a..ba2c971a 100644 --- a/environments/medec/pyproject.toml +++ b/environments/medec/pyproject.toml @@ -6,7 +6,7 @@ version = "0.1.0" requires-python = ">=3.11" dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "datasets", "openai>=1.3.0", "numpy", diff --git a/environments/medexqa/medexqa.py b/environments/medexqa/medexqa.py index aa866392..abde2acc 100644 --- a/environments/medexqa/medexqa.py +++ b/environments/medexqa/medexqa.py @@ -175,6 +175,7 @@ def _format_row(row: dict, idx: int | None = None) -> dict: # Keep original data in info info = dict(row) + info.pop("task", None) info["answer_text"] = answer_text info["answer"] = answer_letter info["question"] = question diff --git a/environments/medexqa/pyproject.toml b/environments/medexqa/pyproject.toml index bbcdd4d6..6ba9135f 100644 --- a/environments/medexqa/pyproject.toml +++ b/environments/medexqa/pyproject.toml @@ -9,7 +9,7 @@ authors = [ ] dependencies = [ "datasets>=4.0.0", - "verifiers>=0.1.2.post0", + "verifiers>=0.1.12,<0.2", "pandas>=2.0.0", "medarc-verifiers>=0.1.0", ] diff --git a/environments/medhallu/pyproject.toml b/environments/medhallu/pyproject.toml index e335d88e..c5bc5063 100644 --- a/environments/medhallu/pyproject.toml +++ b/environments/medhallu/pyproject.toml @@ -6,7 +6,7 @@ version 
= "0.1.0" requires-python = ">=3.10" dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", ] [dependency-groups] diff --git a/environments/medicationqa/pyproject.toml b/environments/medicationqa/pyproject.toml index 06f523ee..f729fea8 100644 --- a/environments/medicationqa/pyproject.toml +++ b/environments/medicationqa/pyproject.toml @@ -5,7 +5,7 @@ tags = ["medical", "qa", "consumer", "single-turn", "llm-judge"] version = "0.1.0" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "requests", "medarc-verifiers>=0.1.0", "openpyxl" diff --git a/environments/medqa/pyproject.toml b/environments/medqa/pyproject.toml index 6994e36e..658dfbc8 100644 --- a/environments/medqa/pyproject.toml +++ b/environments/medqa/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "datasets>=4.0.0", - "verifiers>=0.1.2.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] @@ -23,3 +23,7 @@ include = ["medqa.py"] loader = "medqa:load_environment" display_name = "MedQA" visibility = "PUBLIC" + +[tool.verifiers.eval] +num_examples = -1 +rollouts_per_example = 1 diff --git a/environments/medrbench/medrbench/medrbench.py b/environments/medrbench/medrbench/medrbench.py index d729a9a6..74ca5b96 100644 --- a/environments/medrbench/medrbench/medrbench.py +++ b/environments/medrbench/medrbench/medrbench.py @@ -105,11 +105,11 @@ def _to_vf_format_diagnosis(data: dict[str, Any], rare_disease_only: bool = Fals else: question = MULTI_TURN_FIRST_TURN_PROMPT.format(case=case_without_tests) + medrbench_task = f"medrbench-diagnosis-{task.value}" records.append( { "question": question, "answer": diagnosis_results, - "task": f"medrbench-diagnosis-{task.value}", "info": { "pmc_id": pmc_id, "case_summary": case_summary, @@ -118,6 +118,7 @@ def _to_vf_format_diagnosis(data: dict[str, Any], rare_disease_only: bool = Fals "differential_diagnosis": differential_diagnosis, 
"reference_response": diagnosis_results, "task_type": "medrbench-diagnosis", + "medrbench_task": medrbench_task, "body_category": case.get("body_category", []), "disorder_category": case.get("disorder_category", []), "checked_rare_disease": case.get("checked_rare_disease", []), @@ -149,13 +150,13 @@ def _to_vf_format_treatment(data: dict[str, Any], rare_disease_only: bool = Fals { "question": question, "answer": treatment_plan_results, - "task": "medrbench-treatment", "info": { "pmc_id": pmc_id, "case_summary": case_summary, "treatment_planning_analysis": treatment_planning_analysis, "reference_response": treatment_plan_results, "task_type": "medrbench-treatment", + "medrbench_task": "medrbench-treatment", "body_category": case.get("body_category", []), "disorder_category": case.get("disorder_category", []), "checked_rare_disease": case.get("checked_rare_disease", []), @@ -497,7 +498,7 @@ async def judge_rubric_reward(completion: Messages, info: Info, state: State, ** gold_response = str(info.get("reference_response") or "") extracted_answer = parser.parse_answer(completion) or "" - task_name = str(state.get("task") or info.get("task_type") or "medrbench-diagnosis") + task_name = str(info.get("medrbench_task") or info.get("task_type") or "medrbench-diagnosis") if task_name.startswith("medrbench-diagnosis-free_turn"): info.setdefault("turns_used", _turn_count(state)) diff --git a/environments/medrbench/pyproject.toml b/environments/medrbench/pyproject.toml index ac12160d..81fd9e2e 100644 --- a/environments/medrbench/pyproject.toml +++ b/environments/medrbench/pyproject.toml @@ -8,7 +8,7 @@ authors = [ { name = "Hunar Batra", email = "i@hunarbatra.com" }, ] dependencies = [ - "verifiers>=0.1.5.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", "requests>=2.28.0", ] diff --git a/environments/medredqa/medredqa.py b/environments/medredqa/medredqa.py index 50810dab..1d8d616c 100644 --- a/environments/medredqa/medredqa.py +++ 
b/environments/medredqa/medredqa.py @@ -47,8 +47,7 @@ def load_environment( lambda x: { "question": x["title"] + "\n" + x["body"] if x["title"] else x["body"], "answer": x["response"], - "task": "medredqa", - "info": {"judge_response": "Pending.."}, + "info": {"judge_response": "Pending..", "medredqa_task": "medredqa"}, } ) @@ -56,8 +55,7 @@ def load_environment( lambda x: { "question": x["title"] + "\n" + x["body"] if x["title"] else x["body"], "answer": x["response"], - "task": "medredqa", - "info": {"judge_response": "Pending.."}, + "info": {"judge_response": "Pending..", "medredqa_task": "medredqa"}, } ) diff --git a/environments/medredqa/pyproject.toml b/environments/medredqa/pyproject.toml index 4e2ba001..a92d3366 100644 --- a/environments/medredqa/pyproject.toml +++ b/environments/medredqa/pyproject.toml @@ -9,7 +9,7 @@ authors = [ ] dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.2.post0", + "verifiers>=0.1.12,<0.2", "datasets", "openai", ] diff --git a/environments/medxpertqa/medxpertqa.py b/environments/medxpertqa/medxpertqa.py index 2ef3334e..a581c641 100644 --- a/environments/medxpertqa/medxpertqa.py +++ b/environments/medxpertqa/medxpertqa.py @@ -79,6 +79,8 @@ def _map(example: dict) -> dict: answer_text = options.get(answer_letter) info = dict(example) + info.pop("task", None) + info["medxpertqa_question_type"] = question_type.value if shuffle_answers: info["options"] = options info["label"] = answer_letter @@ -88,7 +90,6 @@ def _map(example: dict) -> dict: "question": _format_question_with_options(example.get("question", ""), options), "answer": answer_letter if answer_letter else "", "info": info, - "task": question_type.value, } # Disable the Datasets cache when shuffling answers diff --git a/environments/medxpertqa/pyproject.toml b/environments/medxpertqa/pyproject.toml index 021c6dd6..0a651ecc 100644 --- a/environments/medxpertqa/pyproject.toml +++ b/environments/medxpertqa/pyproject.toml @@ -5,7 +5,7 @@ tags = ["eval"] version = 
"0.1.1" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "datasets>=4.0.0", "medarc-verifiers>=0.1.0", ] diff --git a/environments/meqsum/pyproject.toml b/environments/meqsum/pyproject.toml index cfef2e00..dbc48a61 100644 --- a/environments/meqsum/pyproject.toml +++ b/environments/meqsum/pyproject.toml @@ -5,7 +5,7 @@ tags = ["medical", "nlp", "summarization", "single-turn", "llm-judge", "nlg-metr version = "0.1.0" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "datasets", "medarc-verifiers>=0.1.0" ] diff --git a/environments/metamedqa/pyproject.toml b/environments/metamedqa/pyproject.toml index 8924dec7..8dec10b8 100644 --- a/environments/metamedqa/pyproject.toml +++ b/environments/metamedqa/pyproject.toml @@ -8,7 +8,7 @@ authors = [ { name = "Aymane Ouraq", email = "ouraqaymane@gmail.com" }, ] dependencies = [ - "verifiers>=0.1.2.post0", # same major/minor line as your working env + "verifiers>=0.1.12,<0.2", "datasets>=4.0.0", "medarc-verifiers>=0.1.0", ] diff --git a/environments/mmlu_pro_health/mmlu_pro_health.py b/environments/mmlu_pro_health/mmlu_pro_health.py index 6ad0feb1..626a5c94 100644 --- a/environments/mmlu_pro_health/mmlu_pro_health.py +++ b/environments/mmlu_pro_health/mmlu_pro_health.py @@ -114,6 +114,7 @@ def _format_row(row: dict, idx: int) -> dict: # question and answer have been moved to top-level, so remove them here info = dict(row) + info.pop("task", None) # update shuffled answer choices in the info dict if shuffle_answers: diff --git a/environments/mmlu_pro_health/pyproject.toml b/environments/mmlu_pro_health/pyproject.toml index 15c36e1e..64391770 100644 --- a/environments/mmlu_pro_health/pyproject.toml +++ b/environments/mmlu_pro_health/pyproject.toml @@ -5,7 +5,7 @@ tags = ["medical", "clinical", "single-turn", "multiple-choice", "mmlu", "evalua version = "0.1.1" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.3.post0", + 
"verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] diff --git a/environments/mtsamples_procedures/mtsamples_procedures/mtsamples_procedures.py b/environments/mtsamples_procedures/mtsamples_procedures/mtsamples_procedures.py index 68d04d58..50111719 100644 --- a/environments/mtsamples_procedures/mtsamples_procedures/mtsamples_procedures.py +++ b/environments/mtsamples_procedures/mtsamples_procedures/mtsamples_procedures.py @@ -1,5 +1,6 @@ import json import os +import shutil from pathlib import Path from typing import Any from urllib.parse import quote @@ -74,35 +75,36 @@ def _download_txt_files(cache_path: Path) -> list[Path]: txt_dir = cache_path / "txt_files" txt_dir.mkdir(parents=True, exist_ok=True) - existing_files = list(txt_dir.glob("*.txt")) - if len(existing_files) > 0: - return existing_files - files_json = download_file(API_URL, cache_path / "files.json") files_data = json.loads(files_json.read_text(encoding="utf-8")) + expected_names = sorted(file_info["name"] for file_info in files_data if file_info["name"].endswith(".txt")) - downloaded_files = [] - for file_info in files_data: - if file_info["name"].endswith(".txt"): - encoded_name = quote(file_info["name"]) - file_url = f"{BASE_URL}/{encoded_name}" - dest_path = txt_dir / file_info["name"] + txt_files = [] + for name in expected_names: + encoded_name = quote(name) + file_url = f"{BASE_URL}/{encoded_name}" + dest_path = txt_dir / name + if not dest_path.exists(): download_file(file_url, dest_path) - downloaded_files.append(dest_path) + txt_files.append(dest_path) - return downloaded_files + return txt_files def _load_dataset(cache_dir: Path | str | None = None) -> Dataset: cache_path = _resolve_cache_dir(cache_dir) cache_path.mkdir(parents=True, exist_ok=True) + txt_files = _download_txt_files(cache_path) dataset_cache = cache_path / "dataset" + metadata_path = dataset_cache / "medarc_cache_metadata.json" if dataset_cache.exists(): - return Dataset.load_from_disk(str(dataset_cache)) - - txt_files 
= _download_txt_files(cache_path) + if metadata_path.exists(): + metadata = json.loads(metadata_path.read_text(encoding="utf-8")) + if metadata.get("source_files") == len(txt_files): + return Dataset.load_from_disk(str(dataset_cache)) + shutil.rmtree(dataset_cache) examples = [] @@ -145,6 +147,10 @@ def _load_dataset(cache_dir: Path | str | None = None) -> Dataset: dataset = Dataset.from_list(examples) dataset.save_to_disk(str(dataset_cache)) + metadata_path.write_text( + json.dumps({"source_files": len(txt_files), "examples": len(dataset)}, indent=2), + encoding="utf-8", + ) return dataset diff --git a/environments/mtsamples_procedures/pyproject.toml b/environments/mtsamples_procedures/pyproject.toml index e9a119fd..84bd5818 100644 --- a/environments/mtsamples_procedures/pyproject.toml +++ b/environments/mtsamples_procedures/pyproject.toml @@ -5,7 +5,7 @@ tags = ["medical", "procedures", "plan_generation", "single-turn", "llm-judge"] version = "0.1.0" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "datasets>=4.1.1", "openai>=2.1.0", "medarc-verifiers>=0.1.0", diff --git a/environments/mtsamples_replicate/mtsamples_replicate/mtsamples_replicate.py b/environments/mtsamples_replicate/mtsamples_replicate/mtsamples_replicate.py index a1d7384e..3642d306 100644 --- a/environments/mtsamples_replicate/mtsamples_replicate/mtsamples_replicate.py +++ b/environments/mtsamples_replicate/mtsamples_replicate/mtsamples_replicate.py @@ -1,5 +1,6 @@ import json import os +import shutil from pathlib import Path from typing import Any from urllib.parse import quote @@ -81,35 +82,36 @@ def _download_txt_files(cache_path: Path) -> list[Path]: txt_dir = cache_path / "txt_files" txt_dir.mkdir(parents=True, exist_ok=True) - existing_files = list(txt_dir.glob("*.txt")) - if len(existing_files) > 0: - return existing_files - files_json = download_file(API_URL, cache_path / "files.json") files_data = 
json.loads(files_json.read_text(encoding="utf-8")) + expected_names = sorted(file_info["name"] for file_info in files_data if file_info["name"].endswith(".txt")) - downloaded_files = [] - for file_info in files_data: - if file_info["name"].endswith(".txt"): - encoded_name = quote(file_info["name"]) - file_url = f"{BASE_URL}/{encoded_name}" - dest_path = txt_dir / file_info["name"] + txt_files = [] + for name in expected_names: + encoded_name = quote(name) + file_url = f"{BASE_URL}/{encoded_name}" + dest_path = txt_dir / name + if not dest_path.exists(): download_file(file_url, dest_path) - downloaded_files.append(dest_path) + txt_files.append(dest_path) - return downloaded_files + return txt_files def _load_dataset(cache_dir: Path | str | None = None) -> Dataset: cache_path = _resolve_cache_dir(cache_dir) cache_path.mkdir(parents=True, exist_ok=True) + txt_files = _download_txt_files(cache_path) dataset_cache = cache_path / "dataset" + metadata_path = dataset_cache / "medarc_cache_metadata.json" if dataset_cache.exists(): - return Dataset.load_from_disk(str(dataset_cache)) - - txt_files = _download_txt_files(cache_path) + if metadata_path.exists(): + metadata = json.loads(metadata_path.read_text(encoding="utf-8")) + if metadata.get("source_files") == len(txt_files): + return Dataset.load_from_disk(str(dataset_cache)) + shutil.rmtree(dataset_cache) examples = [] @@ -152,6 +154,10 @@ def _load_dataset(cache_dir: Path | str | None = None) -> Dataset: dataset = Dataset.from_list(examples) dataset.save_to_disk(str(dataset_cache)) + metadata_path.write_text( + json.dumps({"source_files": len(txt_files), "examples": len(dataset)}, indent=2), + encoding="utf-8", + ) return dataset diff --git a/environments/mtsamples_replicate/pyproject.toml b/environments/mtsamples_replicate/pyproject.toml index 5c854dd7..07a2dc0b 100644 --- a/environments/mtsamples_replicate/pyproject.toml +++ b/environments/mtsamples_replicate/pyproject.toml @@ -12,7 +12,7 @@ tags = [ version = "0.1.0" 
requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.4", + "verifiers>=0.1.12,<0.2", "datasets>=4.1.1", "openai>=2.1.0", "medarc-verifiers>=0.1.0", diff --git a/environments/pubhealthbench/pyproject.toml b/environments/pubhealthbench/pyproject.toml index c283fd19..fd5b671a 100644 --- a/environments/pubhealthbench/pyproject.toml +++ b/environments/pubhealthbench/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "datasets>=4.0.0", - "verifiers>=0.1.2.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] diff --git a/environments/pubmedqa/pubmedqa.py b/environments/pubmedqa/pubmedqa.py index dc733550..a5ee4ab9 100644 --- a/environments/pubmedqa/pubmedqa.py +++ b/environments/pubmedqa/pubmedqa.py @@ -74,6 +74,7 @@ def map_row_to_mcq_prompt( # required fields: question (for the prompt), and answer (for the scoring) info = { "answer_text": options.get(correct_answer_letter, final_decision), + "pubmedqa_task": "pubmedqa", } if shuffle_answers: info["options"] = options @@ -81,7 +82,6 @@ def map_row_to_mcq_prompt( return { "question": complete_prompt, "answer": correct_answer_letter, - "task": "pubmedqa", "info": info, } diff --git a/environments/pubmedqa/pyproject.toml b/environments/pubmedqa/pyproject.toml index 216a6e6d..85982185 100644 --- a/environments/pubmedqa/pyproject.toml +++ b/environments/pubmedqa/pyproject.toml @@ -8,7 +8,7 @@ authors = [ { name = "Robert Scholz", email = "robert.scholz@maxplanckschools.de" }, ] dependencies = [ - "verifiers>=0.1.3.post0", + "verifiers>=0.1.12,<0.2", "datasets>= 4.0.0", "medarc-verifiers>=0.1.0", ] @@ -25,3 +25,7 @@ build-backend = "hatchling.build" [tool.hatch.build] include = ["pubmedqa.py", "data/"] + +[tool.verifiers.eval] +num_examples = -1 +rollouts_per_example = 1 diff --git a/environments/sctpublic/pyproject.toml b/environments/sctpublic/pyproject.toml index 94a3e395..949c9cc3 100644 --- a/environments/sctpublic/pyproject.toml +++ 
b/environments/sctpublic/pyproject.toml @@ -9,7 +9,7 @@ authors = [ ] dependencies = [ "medarc-verifiers>=0.1.0", - "verifiers>=0.1.6.post0", + "verifiers>=0.1.12,<0.2", ] [build-system] diff --git a/environments/supergpqa_medicine/pyproject.toml b/environments/supergpqa_medicine/pyproject.toml index db8b268a..0bb6b46e 100644 --- a/environments/supergpqa_medicine/pyproject.toml +++ b/environments/supergpqa_medicine/pyproject.toml @@ -5,7 +5,7 @@ tags = ["medicine", "single-turn", "multiple-choice", "supergpqa", "evaluation", version = "0.1.0" requires-python = ">=3.11" dependencies = [ - "verifiers>=0.1.3.post0", + "verifiers>=0.1.12,<0.2", "medarc-verifiers>=0.1.0", ] diff --git a/environments/supergpqa_medicine/supergpqa_medicine.py b/environments/supergpqa_medicine/supergpqa_medicine.py index 404a6220..b7b55cbb 100644 --- a/environments/supergpqa_medicine/supergpqa_medicine.py +++ b/environments/supergpqa_medicine/supergpqa_medicine.py @@ -12,14 +12,14 @@ disable_progress_bar() # suppress datasets mapping progress bar -ZERO_SHOT_PROMPT_TEMPLATE = """ -Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: \\boxed{{$LETTER}}' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. +ZERO_SHOT_PROMPT_TEMPLATE = r""" +Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: \boxed{{$LETTER}}' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. {} """.strip() -FIVE_SHOT_PROMPT_TEMPLATE = """ -Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: \\boxed{{$LETTER}}' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. +FIVE_SHOT_PROMPT_TEMPLATE = r""" +Answer the following multiple choice question. There is only one correct answer. 
The last line of your response should be in the format 'Answer: \boxed{{$LETTER}}' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. Question: A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is @@ -35,7 +35,7 @@ J) 20 Answer: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4. -Answer: \\boxed{{H}}. +Answer: \boxed{{H}}. Question: Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye? @@ -54,7 +54,7 @@ \[ \frac{{\left(\frac{{50 \text{{ cm}}}}{{2}}\right)^2}}{{\left(\frac{{5 \text{{ mm}}}}{{2}}\right)^2}} = \frac{{\left(\frac{{50 \text{{ cm}}}}{{0.1 \text{{ cm}}}}\right)^2}}{{\left(\frac{{5 \text{{ mm}}}}{{0.1 \text{{ cm}}}}\right)^2}} = \frac{{500^2}}{{5^2}} = 10000. \] -Answer: \\boxed{{E}}. +Answer: \boxed{{E}}. Question: Where do most short-period comets come from and how do we know? @@ -67,7 +67,7 @@ G) The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the asteroid belt. Answer: Let's think step by step. Most short-period comets originate from the Kuiper belt. This is deduced from the observation that these comets tend to follow orbits that lie in the plane of the solar system, similar to the distribution of objects in the Kuiper belt itself. Thus, the alignment of these cometary orbits with the ecliptic plane points to their Kuiper belt origin. -Answer: \\boxed{{A}}. +Answer: \boxed{{A}}. 
Question: Colors in a soap bubble result from light @@ -83,7 +83,7 @@ J) transmission Answer: Let's think step by step. The colorful patterns observed in a soap bubble are caused by the phenomenon of light interference. This occurs when light waves bounce between the two surfaces of the soap film, combining constructively or destructively based on their phase differences and the varying thickness of the film. These interactions result in vibrant color patterns due to variations in the intensity of different wavelengths of light. -Answer: \\boxed{{E}}. +Answer: \boxed{{E}}. Question: A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven? @@ -103,7 +103,7 @@ \text{{Power}} = \text{{Voltage}} \times \text{{Current}} = 120 \, \text{{V}} \times 2 \, \text{{A}} = 240 \, \text{{W}}. \] Therefore, the microwave oven uses energy at a rate of 240 watts. -Answer: \\boxed{{A}}. +Answer: \boxed{{A}}. Question: {} @@ -202,6 +202,7 @@ def _format_row(row: dict, idx: int) -> dict: # question and answer have been moved to top-level, so remove them here info = dict(row) + info.pop("task", None) # update shuffled answer choices in the info dict if shuffle_answers: diff --git a/medarc_verifiers/__init__.py b/medarc_verifiers/__init__.py index 7bf106ce..be21dcdf 100644 --- a/medarc_verifiers/__init__.py +++ b/medarc_verifiers/__init__.py @@ -1,6 +1,6 @@ import logging -__version__ = "0.1.0" +__version__ = "0.2.0" # Always install judge cache namespacing. 
try: diff --git a/medarc_verifiers/cli/_config_loader.py b/medarc_verifiers/cli/_config_loader.py deleted file mode 100644 index 62097dea..00000000 --- a/medarc_verifiers/cli/_config_loader.py +++ /dev/null @@ -1,512 +0,0 @@ -"""Config loader utilities bridging OmegaConf YAML files and Pydantic schemas.""" - -from __future__ import annotations - -import logging -from collections.abc import Iterable, Mapping -from itertools import product -from pathlib import Path -from typing import Any, Callable - -from omegaconf import OmegaConf - -from ._schemas import RESERVED_MATRIX_KEYS, EnvironmentConfigSchema, RunConfigSchema -from .utils.endpoint_utils import EnvMetadataCache, load_env_metadata -from .utils.env_args import validate_env_args_or_raise - -logger = logging.getLogger(__name__) -DEFAULT_ENV_FILE_SUFFIXES = (".yaml", ".yml") - -# Scalar fields (non-env_args) that may be overridden by matrix combos. -SCALAR_FIELD_NAMES = { - name for name in EnvironmentConfigSchema.model_fields if name not in RESERVED_MATRIX_KEYS and name != "env_args" -} - - -class ConfigFormatError(ValueError): - """Raised when a configuration file cannot be interpreted as a mapping.""" - - -def _load_raw_config(path: Path) -> Any: - """Load and resolve an OmegaConf configuration file.""" - cfg = OmegaConf.load(path) - OmegaConf.resolve(cfg) - return OmegaConf.to_container(cfg, resolve=True) - - -def load_run_config(path: str | Path, *, env_default_root: str | Path | None = None) -> RunConfigSchema: - """Load a run configuration file into the top-level schema.""" - # Loader responsibilities: - # 1. Read and resolve OmegaConf input (supporting includes and defaults). - # 2. Normalize models/envs/jobs into canonical mappings or lists. - # 3. Let Pydantic schemas handle structural validation and coercion. - # 4. Expand environment matrices into concrete variants. - # 5. Perform lightweight env_args validation using environment metadata. 
- resolved_path = Path(path).expanduser().resolve() - env_default_root_path = Path(env_default_root).expanduser().resolve() if env_default_root is not None else None - data = _load_raw_config(resolved_path) - - if not isinstance(data, dict): - msg = f"Configuration root must be a mapping, got {type(data).__name__}." - raise ConfigFormatError(msg) - - if "envs" not in data or data["envs"] in (None, [], {}): - if env_default_root_path is None: - raise ConfigFormatError( - "Configuration must define 'envs' or --env-config-root must supply a discovery directory." - ) - data = dict(data) - data["envs"] = str(env_default_root_path) - - data = _normalize_config_fields(data, base_dir=resolved_path.parent, env_default_root=env_default_root_path) - - run_config = RunConfigSchema(**data) - expanded_envs = _expand_env_matrices(run_config.envs) - _validate_env_args(expanded_envs.values()) - return run_config.model_copy(update={"envs": expanded_envs}) - - -def _expand_env_matrices(envs: dict[str, EnvironmentConfigSchema]) -> dict[str, EnvironmentConfigSchema]: - scalar_fields = SCALAR_FIELD_NAMES - expanded: dict[str, EnvironmentConfigSchema] = {} - for env_id, env in envs.items(): - env_with_id = env if env.id else env.model_copy(update={"id": env_id}) - for variant in _expand_single_environment(env_with_id, scalar_fields): - if variant.id in expanded: - raise ValueError(f"environment '{variant.id}' defined multiple times after expansion.") - expanded[variant.id] = variant - return expanded - - -def _expand_single_environment( - env: EnvironmentConfigSchema, - scalar_fields: Iterable[str], -) -> list[EnvironmentConfigSchema]: - if not env.matrix: - return [ - env.model_copy( - update={ - "env_args": dict(env.env_args), - "matrix": None, - "matrix_exclude": None, - "matrix_id_format": None, - } - ) - ] - - matrix = env.matrix - base_id = env.id - if not base_id: - raise ValueError("environment entries must specify an id.") - - matrix_keys = list(matrix.keys()) - matrix_values = 
[matrix[key] for key in matrix_keys] - variants: list[EnvironmentConfigSchema] = [] - seen_ids: set[str] = set() - - base_env_args = dict(env.env_args) - module_name = env.module or env.id # prefer explicit module override when present - - exclude_patterns = env.matrix_exclude or [] - - combos: Iterable[tuple[Any, ...]] - if matrix_keys: - combos = product(*matrix_values) - else: - combos = [()] - - for combo_values in combos: - combo = dict(zip(matrix_keys, combo_values)) - if any(_matches_matrix_pattern(combo, pattern) for pattern in exclude_patterns): - continue - - env_args = dict(base_env_args) - updates: dict[str, Any] = {} - for key, value in combo.items(): - if value is None: - continue - if key in scalar_fields: - updates[key] = value - else: - env_args[key] = value - - variant_id = _build_matrix_variant_id(base_id, combo, env.matrix_id_format) - if variant_id in seen_ids: - raise ValueError(f"environment '{base_id}' matrix generated duplicate id '{variant_id}'.") - seen_ids.add(variant_id) - - variant_data = env.model_dump() - variant_data.update(updates) - variant_data["id"] = variant_id - variant_data["env_args"] = env_args - variant_data["module"] = module_name - variant_data["matrix"] = None - variant_data["matrix_exclude"] = None - variant_data["matrix_id_format"] = None - variant_data["matrix_base_id"] = base_id - - variants.append(EnvironmentConfigSchema(**variant_data)) - - if not variants: - raise ValueError(f"environment '{base_id}' matrix produced no variants after exclusions.") - - return variants - - -def _normalize_config_fields( - data: Mapping[str, Any], *, base_dir: Path, env_default_root: Path | None -) -> dict[str, Any]: - """Apply include expansion and shape normalization before schema validation.""" - - normalized = dict(data) - - if "models" in normalized: - normalized["models"] = _normalize_models_field(normalized["models"], base_dir=base_dir) - - if "envs" in normalized: - normalized["envs"] = _normalize_envs_field( - 
normalized["envs"], - base_dir=base_dir, - env_default_root=env_default_root, - ) - - if "jobs" in normalized: - normalized["jobs"] = _normalize_jobs_field(normalized["jobs"], base_dir=base_dir) - - return normalized - - -def _normalize_models_field(value: Any, *, base_dir: Path) -> dict[str, Any]: - return _normalize_section( - value, - base_dir=base_dir, - context="models", - entry_description="models", - default_id_from_key=True, - allow_duplicate_ids=False, - env_default_root=None, - ) - - -def _normalize_envs_field(value: Any, *, base_dir: Path, env_default_root: Path | None) -> dict[str, Any]: - # Env configs intentionally allow duplicate "id" entries so multiple blocks can - # share a common base id (e.g., m_arc + rollout variants). We de-duplicate only - # the internal map key while preserving each entry's explicit "id". - return _normalize_section( - value, - base_dir=base_dir, - context="envs", - entry_description="envs", - default_id_from_key=True, - allow_duplicate_ids=True, - duplicate_key_fn=_make_duplicate_key, - env_default_root=env_default_root, - ) - - -def _make_duplicate_key(base: str, count: int, existing: Mapping[str, Any]) -> str: - suffix = count - while True: - candidate = f"{base}__dup__{suffix}" - if candidate not in existing: - return candidate - suffix += 1 - - -def _normalize_jobs_field(value: Any, *, base_dir: Path) -> list[dict[str, Any]]: - entries = _collect_entries( - value, - base_dir=base_dir, - context="jobs", - entry_description="jobs", - env_default_root=None, - ) - return [_adapt_job_entry(entry) for entry in entries] - - -def _normalize_section( - value: Any, - *, - base_dir: Path, - context: str, - entry_description: str, - default_id_from_key: bool, - allow_duplicate_ids: bool, - duplicate_key_fn: Callable[[str, int, Mapping[str, Any]], str] | None = None, - env_default_root: Path | None, -) -> dict[str, Any]: - """Normalize section entries (models/envs) with shared include handling.""" - if value is None: - return {} - - 
normalized: dict[str, Any] = {} - - def _add_entry( - entry: Mapping[str, Any], *, key_hint: str | None = None, count_map: dict[str, int] | None = None - ) -> None: - if not isinstance(entry, Mapping): - raise ValueError(f"{context} entries must be mappings.") - adapted = dict(entry) - item_id = adapted.get("id") or key_hint - if not item_id: - raise ValueError(f"{context} entries must include an 'id'.") - key = str(item_id) - if count_map is not None: - count_map.setdefault(key, 1) - if key in normalized: - if not allow_duplicate_ids: - raise ValueError(f"Duplicate {entry_description.rstrip('s')} id '{key}' in configuration.") - if duplicate_key_fn is None: - raise ValueError(f"Duplicate {entry_description.rstrip('s')} id '{key}' in configuration.") - # Env entries can intentionally repeat ids to group variants under a common - # base id; we only de-duplicate the internal map key, not the entry's id. - counter = 2 - if count_map is not None: - counter = count_map.get(key, 1) + 1 - count_map[key] = counter - key = duplicate_key_fn(key, counter, normalized) - normalized[key] = adapted - - if isinstance(value, Mapping) and all(isinstance(v, Mapping) for v in value.values()): - for key, entry in value.items(): - adapted = dict(entry) - if default_id_from_key and "id" not in adapted: - adapted["id"] = str(key) - _add_entry(adapted) - return normalized - - entries = _collect_entries( - value, - base_dir=base_dir, - context=context, - entry_description=entry_description, - env_default_root=env_default_root, - ) - duplicate_counts: dict[str, int] = {} - for entry in entries: - _add_entry(entry, count_map=duplicate_counts) - return normalized - - -def _collect_entries( - source: Any, - *, - base_dir: Path, - context: str, - entry_description: str, - env_default_root: Path | None, -) -> list[dict[str, Any]]: - if source is None: - return [] - if isinstance(source, Mapping): - return [dict(source)] - if isinstance(source, (str, Path)): - return _collect_entries_from_path( - 
source, - base_dir=base_dir, - context=context, - entry_description=entry_description, - env_default_root=env_default_root, - ) - if isinstance(source, list): - entries: list[dict[str, Any]] = [] - for index, item in enumerate(source): - item_context = f"{context}[{index}]" - if isinstance(item, Mapping): - entries.append(dict(item)) - elif isinstance(item, (str, Path)): - entries.extend( - _collect_entries_from_path( - item, - base_dir=base_dir, - context=item_context, - entry_description=entry_description, - env_default_root=env_default_root, - ) - ) - else: - raise ValueError(f"{item_context} must be a mapping or path.") - return entries - raise ValueError(f"{context} must be provided as a mapping, list, or path.") - - -def _collect_entries_from_path( - source: str | Path, - *, - base_dir: Path, - context: str, - entry_description: str, - env_default_root: Path | None, -) -> list[dict[str, Any]]: - path = _resolve_include_path(source, base_dir=base_dir) - if not path.exists() and entry_description == "envs": - fallback = _resolve_default_env_path(source, base_dir=base_dir, env_default_root=env_default_root) - if fallback is not None: - path = fallback - if not path.exists(): - raise FileNotFoundError(f"{context} path '{path}' does not exist.") - if path.is_dir(): - if entry_description not in {"envs", "jobs"}: - msg = f"{context} path '{path}' must be a file. Directory includes are only supported for envs and jobs." 
- raise ValueError(msg) - entries: list[dict[str, Any]] = [] - for child in sorted(path.iterdir()): - if child.is_file() and child.suffix.lower() in {".yaml", ".yml"}: - entries.extend( - _collect_entries_from_path( - child, - base_dir=child.parent, - context=f"{context}/{child.name}", - entry_description=entry_description, - env_default_root=env_default_root, - ) - ) - return entries - - loaded = _load_raw_config(path) - if isinstance(loaded, Mapping): - if not loaded: - return [] - if not all(isinstance(v, Mapping) for v in loaded.values()): - msg = f"{context} included {entry_description} must be a mapping of id→mapping or a list of mappings." - raise ValueError(msg) - entries: list[dict[str, Any]] = [] - for key, value in loaded.items(): - entry = dict(value) - entry.setdefault("id", str(key)) - entries.append(entry) - return entries - if isinstance(loaded, list): - entries: list[dict[str, Any]] = [] - for index, item in enumerate(loaded): - if not isinstance(item, Mapping): - raise ValueError(f"{context}[{index}] in included {entry_description} must be a mapping.") - entries.append(dict(item)) - return entries - if loaded is None: - return [] - raise ValueError(f"{context} included {entry_description} must be a mapping of id→mapping or a list of mappings.") - - -def _resolve_include_path(source: str | Path, *, base_dir: Path) -> Path: - path = Path(source).expanduser() - if not path.is_absolute(): - path = (base_dir / path).resolve() - else: - path = path.resolve() - return path - - -def _resolve_default_env_path(source: str | Path, *, base_dir: Path, env_default_root: Path | None) -> Path | None: - raw_source = Path(source) - if raw_source.is_absolute() or env_default_root is None: - return None - - normalized = env_default_root if env_default_root.is_absolute() else env_default_root.resolve() - candidates = _candidate_env_paths(normalized, raw_source) - for candidate in candidates: - if candidate.exists(): - return candidate - return None - - -def 
_candidate_env_paths(root: Path, relative_entry: Path) -> list[Path]: - base = root / relative_entry - candidates = [base] - if not relative_entry.suffix: - for suffix in DEFAULT_ENV_FILE_SUFFIXES: - candidates.append((root / relative_entry).with_suffix(suffix)) - return [candidate.resolve() for candidate in candidates] - - -def _adapt_job_entry(entry: Any) -> Any: - if not isinstance(entry, dict): - return entry - - normalized = dict(entry) - for key in ("env_args", "sampling_args"): - value = normalized.get(key) - if value is None: - normalized[key] = {} - elif isinstance(value, dict): - normalized[key] = dict(value) - else: - raise ValueError(f"job {key} must be a mapping when provided.") - - return normalized - - -def _build_matrix_variant_id( - base_id: str, - combo: dict[str, Any], - id_format: str | None, -) -> str: - format_values = {key: _format_matrix_value(value) for key, value in combo.items()} - format_values["base"] = base_id - - if id_format: - try: - variant_id = id_format.format(**format_values) - except KeyError as exc: # noqa: F841 - missing = exc.args[0] - raise ValueError(f"environment '{base_id}' matrix_id_format references unknown key '{missing}'.") from exc - else: - suffix_parts = [f"{key}-{_format_matrix_value(value)}" for key, value in combo.items() if value is not None] - variant_id = base_id if not suffix_parts else f"{base_id}-{'-'.join(suffix_parts)}" - - if not isinstance(variant_id, str) or not variant_id: - raise ValueError(f"environment '{base_id}' matrix generated an invalid id '{variant_id!r}'.") - - return variant_id - - -def _format_matrix_value(value: Any) -> str: - if value is None: - return "base" - if isinstance(value, bool): - return "true" if value else "false" - return str(value) - - -def _matches_matrix_pattern(combo: dict[str, Any], pattern: dict[str, Any]) -> bool: - return all(combo.get(key) == value for key, value in pattern.items()) - - -def _validate_env_args(envs: Iterable[EnvironmentConfigSchema]) -> None: - 
"""Validate env_args at config load time (lenient - no required param enforcement). - - This is the first of two validation phases: - Phase 1 (here): Check for unknown parameters and type mismatches - Do NOT enforce required parameters (allow partial configs) - Phase 2 (executor): Enforce required parameters after CLI overrides applied - - Why two phases? - - Matrix expansion can create variants with different required params - - Users might load a config with 100 jobs but only run 5 with --job-id - - Failing at load time for jobs we won't run would be frustrating - - This phase catches obvious mistakes (typos, wrong types) early while deferring - required parameter checks until execution when we know what will actually run. - """ - cache: EnvMetadataCache = {} - for env in envs: - env_module = env.module or env.matrix_base_id or env.id - if not env_module: - continue - try: - metadata = load_env_metadata(env_module, cache=cache) - except ImportError as exc: - logger.warning("Skipping env_args validation for '%s': %s", env_module, exc) - continue - # Phase 1 validation: unknown/type checks only; do not enforce requireds at load time. 
- validate_env_args_or_raise( - env_module, - env.env_args, - metadata=metadata, - metadata_cache=cache, - allow_unknown=False, - enforce_required=False, # Deferred to execution time - ) - - -__all__ = ["ConfigFormatError", "load_run_config"] diff --git a/medarc_verifiers/cli/_constants.py b/medarc_verifiers/cli/_constants.py index a466e47b..fd65f19e 100644 --- a/medarc_verifiers/cli/_constants.py +++ b/medarc_verifiers/cli/_constants.py @@ -18,6 +18,6 @@ DEFAULT_ENDPOINTS_PATH = Path("configs") / "endpoints.toml" DEFAULT_ENV_DIR = Path("environments") DEFAULT_ENV_CONFIG_ROOT = Path("configs") / "envs" -DEFAULT_RUNS_RAW_DIR = Path("runs") / "raw" +DEFAULT_EVALS_DIR = Path("runs") / "evals" DEFAULT_PROCESSED_DIR = Path("runs") / "processed" DEFAULT_WINRATE_DIR = DEFAULT_PROCESSED_DIR / "winrate" diff --git a/medarc_verifiers/cli/_eval_builder.py b/medarc_verifiers/cli/_eval_builder.py deleted file mode 100644 index 94462c8c..00000000 --- a/medarc_verifiers/cli/_eval_builder.py +++ /dev/null @@ -1,306 +0,0 @@ -"""Shared helpers for building client and eval configs.""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Any, Callable, Mapping - -from verifiers.types import ClientConfig, EndpointClientConfig, EvalConfig - -from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, ModelConfigSchema -from medarc_verifiers.cli.utils.endpoint_utils import ( - EndpointRegistry, - EnvMetadataCache, - load_env_metadata, - resolve_model_endpoint, -) -from medarc_verifiers.cli.utils.env_args import merge_env_args -from medarc_verifiers.cli.utils.shared import ( - DEFAULT_BATCH_MAX_CONCURRENT, - merge_sampling_overrides, - normalize_headers, - resolve_env_identifier, - resolve_max_concurrent, -) -from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL, prime_inference_overrides - -logger = logging.getLogger(__name__) - - -def build_client_config( - model_cfg: ModelConfigSchema, - *, - endpoints: 
EndpointRegistry, - default_api_key_var: str, - default_api_key_var_explicit: bool, - default_api_base_url: str, - api_base_url_override: str | None, - http_max_retries_override: int | None, - timeout_override: float | None, - headers: list[str] | dict[str, str] | None, -) -> tuple[str, ClientConfig, dict[str, Any]]: - """Resolve model alias + endpoint settings into a ClientConfig. - - Returns: - A tuple of (resolved_model, client_config, sampling_overrides). - - resolved_model: The resolved model identifier - - client_config: The ClientConfig for API calls - - sampling_overrides: Prime Inference sampling args to merge (e.g., usage reporting) - """ - normalized_headers = normalize_headers(headers if headers is not None else model_cfg.headers) - model_alias = model_cfg.model or model_cfg.id - if not model_alias: - raise ValueError("Model entries must define 'id' or 'model'.") - - model_api_key_var_explicit = model_cfg.api_key_var is not None - default_key_var = model_cfg.api_key_var if model_api_key_var_explicit else default_api_key_var - default_base_url = model_cfg.api_base_url or default_api_base_url - endpoint_group = endpoints.get(model_alias, []) - resolved_model, api_key_var, api_base_url = resolve_model_endpoint( - model_alias, - endpoints, - default_key_var=default_key_var, - default_base_url=default_base_url, - ) - if api_base_url_override is not None: - logger.debug("Forcing api_base_url override for model '%s'.", model_alias) - api_base_url = api_base_url_override - - # Get Prime Inference-specific overrides (headers + sampling args). - prime_headers, sampling_overrides = prime_inference_overrides(api_base_url) - - effective_api_key_var = api_key_var - # MedARC defaults to OPENAI_API_KEY. For Prime URLs, force PRIME_API_KEY only when - # neither model config nor CLI explicitly selected a key var, and no endpoint - # registry group resolved this alias (which may intentionally provide a custom key var). 
- if ( - api_base_url == PRIME_INFERENCE_URL - and not model_api_key_var_explicit - and not default_api_key_var_explicit - and not endpoint_group - ): - effective_api_key_var = "PRIME_API_KEY" - - # Merge headers: user-provided headers take precedence over Prime auto-detected - merged_headers = {**prime_headers, **(normalized_headers or {})} - - endpoint_configs: list[EndpointClientConfig] = [] - if api_base_url_override is None and len(endpoint_group) > 1: - first_entry = endpoint_group[0] - expected_model = first_entry.get("model", model_alias) - expected_key = first_entry.get("key", default_key_var) - for idx, endpoint in enumerate(endpoint_group[1:], start=1): - entry_model = endpoint.get("model", model_alias) - entry_key = endpoint.get("key", default_key_var) - if entry_model != expected_model or entry_key != expected_key: - raise ValueError( - "Endpoint replicas for " - f"'{model_alias}' must agree on 'model' and 'key'; " - f"variant 0 has model={expected_model!r}, key={expected_key!r}, " - f"variant {idx} has model={entry_model!r}, key={entry_key!r}." 
- ) - endpoint_configs = [ - EndpointClientConfig( - api_key_var=effective_api_key_var, - api_base_url=endpoint["url"], - extra_headers=merged_headers, - ) - for endpoint in endpoint_group - ] - - client_kwargs: dict[str, Any] = { - "api_key_var": effective_api_key_var, - "api_base_url": api_base_url, - "endpoint_configs": endpoint_configs, - "extra_headers": merged_headers, - } - timeout = timeout_override if timeout_override is not None else model_cfg.timeout - if timeout is not None: - client_kwargs["timeout"] = timeout - if model_cfg.max_connections is not None: - client_kwargs["max_connections"] = model_cfg.max_connections - if model_cfg.max_keepalive_connections is not None: - client_kwargs["max_keepalive_connections"] = model_cfg.max_keepalive_connections - if http_max_retries_override is not None: - client_kwargs["max_retries"] = http_max_retries_override - elif model_cfg.max_retries is not None: - client_kwargs["max_retries"] = model_cfg.max_retries - - return resolved_model, ClientConfig(**client_kwargs), sampling_overrides - - -def build_eval_config( - *, - job_label: str | None, - model_cfg: ModelConfigSchema, - env_cfg: EnvironmentConfigSchema, - env_args: Mapping[str, Any], - sampling_args: Mapping[str, Any], - cli_env_args: Mapping[str, Any] | None, - cli_sampling_args: Mapping[str, Any] | None, - resolved_model: str, - client_config: ClientConfig, - env_dir: Path, - max_concurrent_override: int | None, - max_concurrent_generation: int | None, - max_concurrent_scoring: int | None, - rollout_max_retries: int = 0, - resume_path: Path | None = None, - default_max_concurrent: int = DEFAULT_BATCH_MAX_CONCURRENT, - save_results: bool = True, - save_to_hf_hub: bool = False, - hf_hub_dataset_name: str | None = None, - verbose: bool = False, - env_metadata_cache: EnvMetadataCache | None = None, - env_metadata_loader: Callable[..., Any] = load_env_metadata, - enforce_required_env_args: bool = True, - allow_unknown_env_args: bool = False, -) -> EvalConfig: - 
"""Assemble EvalConfig with shared env/sampling override handling.""" - env_id = resolve_env_identifier(env_cfg) - try: - metadata = _call_env_metadata_loader(env_metadata_loader, env_id, env_metadata_cache) - except ImportError as exc: - logger.warning("Skipping env_args validation for '%s': %s", env_id, exc) - metadata = None - - merged_env_args = merge_env_args( - env_id, - sources=[env_args, cli_env_args or {}], - metadata=metadata, - metadata_cache=env_metadata_cache, - allow_unknown=allow_unknown_env_args, - enforce_required=enforce_required_env_args, - verbose=verbose, - ) - - merged_sampling = dict(sampling_args) - merged_sampling = merge_sampling_overrides(merged_sampling, cli_sampling_args) - - _warn_deprecated_eval_knobs( - env_cfg=env_cfg, - env_id=env_id, - job_label=job_label, - max_concurrent_generation=max_concurrent_generation, - max_concurrent_scoring=max_concurrent_scoring, - ) - - max_concurrent = resolve_max_concurrent( - cli_override=max_concurrent_override, - model_max=model_cfg.max_concurrent, - env_max=env_cfg.max_concurrent, - default_max=default_max_concurrent, - ) - effective_save_results = save_results - if resume_path is not None and not effective_save_results: - logger.warning("Enabling save_results (required for resume support).") - effective_save_results = True - - verbose_flag = env_cfg.verbose if env_cfg.verbose is not None else verbose - state_columns = list(env_cfg.state_columns) if env_cfg.state_columns else None - eval_config_fields = _pydantic_field_names(EvalConfig) - - eval_kwargs: dict[str, Any] = { - "env_id": env_id, - "env_args": merged_env_args, - "env_dir_path": str(env_dir), - "model": resolved_model, - "client_config": client_config, - "sampling_args": merged_sampling, - "num_examples": env_cfg.num_examples, - "rollouts_per_example": env_cfg.rollouts_per_example, - "max_concurrent": max_concurrent, - "verbose": verbose_flag, - "state_columns": state_columns, - "save_results": effective_save_results, - 
"save_to_hf_hub": save_to_hf_hub, - "hf_hub_dataset_name": hf_hub_dataset_name, - } - if "max_retries" in eval_config_fields: - eval_kwargs["max_retries"] = rollout_max_retries - if "resume_path" in eval_config_fields: - eval_kwargs["resume_path"] = resume_path - - independent_scoring = getattr(env_cfg, "independent_scoring", None) - interleave_scoring = getattr(env_cfg, "interleave_scoring", None) - - if interleave_scoring is not None: - raise ValueError( - f"Environment '{env_id}' uses interleave_scoring, which is no longer supported; use independent_scoring." - ) - - if "independent_scoring" in eval_config_fields: - if independent_scoring is None: - independent_scoring = True - eval_kwargs["independent_scoring"] = bool(independent_scoring) - elif independent_scoring is not None: - logger.warning( - "Environment '%s' set independent_scoring=%s, but installed verifiers does not support it; ignoring.", - env_id, - independent_scoring, - ) - - if "extra_env_kwargs" in eval_config_fields: - extra_env_kwargs = getattr(env_cfg, "extra_env_kwargs", None) - if extra_env_kwargs is not None: - eval_kwargs["extra_env_kwargs"] = dict(extra_env_kwargs) - - return EvalConfig(**eval_kwargs) - - -__all__ = ["build_client_config", "build_eval_config"] - - -def _call_env_metadata_loader(loader: Callable[..., Any], env_id: str, cache: EnvMetadataCache | None) -> Any: - """Invoke env metadata loader tolerant of positional-only stubs used in tests.""" - try: - return loader(env_id, cache=cache) - except TypeError: - return loader(env_id) - - -def _pydantic_field_names(model_type: type[Any]) -> set[str]: - fields = getattr(model_type, "model_fields", None) - if isinstance(fields, dict): - return set(fields.keys()) - fields = getattr(model_type, "__fields__", None) - if isinstance(fields, dict): - return set(fields.keys()) - return set() - - -def _warn_deprecated_eval_knobs( - *, - env_cfg: Any, - env_id: str, - job_label: str | None, - max_concurrent_generation: int | None, - 
max_concurrent_scoring: int | None, -) -> None: - env_fields_set = set(getattr(env_cfg, "model_fields_set", set())) - - deprecated_env_knobs: list[str] = [] - if "save_every" in env_fields_set and getattr(env_cfg, "save_every", None) is not None: - deprecated_env_knobs.append("save_every") - if "print_results" in env_fields_set: - deprecated_env_knobs.append("print_results") - if deprecated_env_knobs: - logger.warning( - "Environment '%s' sets deprecated eval knob(s): %s. These options are ignored.", - env_id, - ", ".join(sorted(deprecated_env_knobs)), - ) - - deprecated_concurrency_knobs: list[str] = [] - if max_concurrent_generation is not None: - deprecated_concurrency_knobs.append("max_concurrent_generation") - if max_concurrent_scoring is not None: - deprecated_concurrency_knobs.append("max_concurrent_scoring") - if deprecated_concurrency_knobs: - label = job_label or env_id - logger.warning( - "Job '%s' sets deprecated eval knob(s): %s. These options are ignored.", - label, - ", ".join(sorted(deprecated_concurrency_knobs)), - ) diff --git a/medarc_verifiers/cli/_job_builder.py b/medarc_verifiers/cli/_job_builder.py deleted file mode 100644 index f04a4c6e..00000000 --- a/medarc_verifiers/cli/_job_builder.py +++ /dev/null @@ -1,204 +0,0 @@ -"""Resolve validated run configurations into executable job definitions.""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any, Iterable - -from ._schemas import EnvironmentConfigSchema, ModelConfigSchema, RunConfigSchema -from .utils.env_args import merge_env_args -from .utils.shared import compute_checksum, slugify - - -@dataclass(slots=True) -class ResolvedJob: - """Executable job produced from a run configuration.""" - - job_id: str - name: str - model: ModelConfigSchema - env: EnvironmentConfigSchema - env_args: dict[str, Any] - sampling_args: dict[str, Any] - sleep: float | None = None - - -def build_jobs(config: RunConfigSchema) -> list[ResolvedJob]: - """Expand a 
validated run configuration into concrete jobs.""" - matrix_index = _build_matrix_index(config.envs.values()) - models: dict[str, ModelConfigSchema] = config.models - resolved: list[ResolvedJob] = [] - used_ids: set[str] = set() - - for job_cfg in config.jobs: - model_id, model = _resolve_model(job_cfg.model, models) - if model.id is None: - model = model.model_copy(update={"id": model_id}) - models[model_id] = model - env_targets = _coerce_iterable(job_cfg.env) - for env_target in env_targets: - for env_id in _resolve_env_ids(env_target, config.envs, matrix_index): - env = config.envs[env_id] - if env.id is None: - env = env.model_copy(update={"id": env_id}) - config.envs[env_id] = env - env_args = _compose_env_args(env, model, job_cfg.env_args) - sampling_args = _compose_sampling_args(model.sampling_args, job_cfg.sampling_args) - name = job_cfg.name or f"{model_id}-{env.id}" - job_id = _build_job_id( - model_id=model_id, - env_id=env.id, - job_name=job_cfg.name, - env_overrides=job_cfg.env_args, - sampling_overrides=job_cfg.sampling_args, - used_ids=used_ids, - ) - used_ids.add(job_id) - resolved.append( - ResolvedJob( - job_id=job_id, - name=name, - model=model, - env=env, - env_args=env_args, - sampling_args=sampling_args, - sleep=job_cfg.sleep, - ) - ) - - return resolved - - -def _resolve_model( - model_ref: str | dict[str, Any], - models: dict[str, ModelConfigSchema], -) -> tuple[str, ModelConfigSchema]: - if isinstance(model_ref, str): - model = models.get(model_ref) - if model is None: - raise ValueError(f"Job references unknown model '{model_ref}'.") - return model_ref, model - - inline = ModelConfigSchema(**model_ref) - if not inline.id: - raise ValueError("Inline model definitions must include an 'id'.") - existing = models.get(inline.id) - if existing is not None and existing != inline: - raise ValueError(f"Conflicting inline model definition for id '{inline.id}'.") - models[inline.id] = inline - return inline.id, inline - - -def _resolve_env_ids( - 
env_ref: str, - envs: dict[str, EnvironmentConfigSchema], - matrix_index: dict[str, list[str]], -) -> list[str]: - candidates: list[str] = [] - if env_ref in envs: - candidates.append(env_ref) - if env_ref in matrix_index: - candidates.extend(matrix_index[env_ref]) - if not candidates: - raise ValueError(f"Job references unknown environment '{env_ref}'.") - # Preserve order while removing duplicates - unique: list[str] = [] - seen: set[str] = set() - for env_id in candidates: - if env_id not in seen: - unique.append(env_id) - seen.add(env_id) - return unique - - -def _resolve_env_override(model: ModelConfigSchema, env: EnvironmentConfigSchema) -> dict[str, Any] | None: - """Resolve env-specific overrides from model config. - - Tries in order: - 1. env.id (exact match for the environment identifier) - 2. env.matrix_base_id (for matrix-expanded variants like 'medqa-seed-1') - 3. env.module (fallback for module-based lookup) - - Returns the override dict if found, None otherwise. - """ - for key in (env.id, env.matrix_base_id, env.module): - if key and key in model.env_overrides: - return model.env_overrides[key] - return None - - -def _compose_env_args( - env: EnvironmentConfigSchema, - model: ModelConfigSchema, - job_env_args: dict[str, Any], -) -> dict[str, Any]: - """Compose env_args up to job overrides (CLI is applied later).""" - return merge_env_args( - None, - sources=[ - env.env_args, - model.env_args, - _resolve_env_override(model, env) or {}, - job_env_args, - ], - ) - - -def _compose_sampling_args( - model_sampling: dict[str, Any], - job_sampling: dict[str, Any], -) -> dict[str, Any]: - merged = dict(model_sampling) - merged.update(job_sampling) - return merged - - -def _build_matrix_index(envs: Iterable[EnvironmentConfigSchema]) -> dict[str, list[str]]: - index: dict[str, list[str]] = {} - for env in envs: - base_id = env.matrix_base_id - if base_id: - index.setdefault(base_id, []).append(env.id) - return index - - -def _coerce_iterable(value: str | 
list[str]) -> list[str]: - if isinstance(value, str): - return [value] - return list(value) - - -def _build_job_id( - *, - model_id: str, - env_id: str, - job_name: str | None, - env_overrides: dict[str, Any], - sampling_overrides: dict[str, Any], - used_ids: set[str], -) -> str: - segments = [slugify(model_id), slugify(env_id)] - if job_name: - segments.append(slugify(job_name)) - base = "-".join(filter(None, segments)) or "job" - job_id = base - if job_id not in used_ids: - return job_id - - payload = { - "model_id": model_id, - "env_id": env_id, - "job_name": job_name, - "env_overrides": env_overrides, - "sampling_overrides": sampling_overrides, - } - fingerprint = compute_checksum(payload)[:10] - job_id = f"{base}-{fingerprint}" - suffix = 1 - while job_id in used_ids: - suffix += 1 - job_id = f"{base}-{fingerprint}{suffix}" - return job_id - - -__all__ = ["ResolvedJob", "build_jobs"] diff --git a/medarc_verifiers/cli/_job_executor.py b/medarc_verifiers/cli/_job_executor.py deleted file mode 100644 index 31dd8f4b..00000000 --- a/medarc_verifiers/cli/_job_executor.py +++ /dev/null @@ -1,584 +0,0 @@ -"""Job execution utilities for the unified CLI.""" - -from __future__ import annotations - -import asyncio -import contextlib -import logging -import shutil -from datetime import UTC, datetime -from pathlib import Path -from time import perf_counter, sleep -from typing import Any, Literal, Mapping, Sequence -from pydantic import BaseModel, Field, field_validator - -from verifiers.types import GenerateOutputs -from verifiers.utils.eval_utils import run_evaluation - -from medarc_verifiers.cli._constants import DEFAULT_ENDPOINTS_PATH -from medarc_verifiers.cli._eval_builder import build_client_config, build_eval_config -from medarc_verifiers.cli._job_builder import ResolvedJob -from medarc_verifiers.cli._manifest import RunManifest -from medarc_verifiers.cli._schemas import ModelConfigSchema -from medarc_verifiers.cli.utils.endpoint_utils import ( - EndpointRegistry, - 
EndpointRegistryCache, - EnvMetadataCache, - load_endpoint_registry, - load_env_metadata, -) -from medarc_verifiers.cli.utils.resume import ( - format_resume_mismatch_lines, - is_resume_metadata_mismatch_error, - is_valid_resume_results_path, - load_resume_metadata_values, -) -from medarc_verifiers.cli.utils.shared import DEFAULT_BATCH_MAX_CONCURRENT, ensure_root_logging, resolve_env_identifier - -try: - from rich import print as rich_print # type: ignore -except ImportError: # pragma: no cover - rich is optional - rich_print = None - -logger = logging.getLogger(__name__) - - -class ExecutorSettings(BaseModel): - """Run-level options controlling how jobs are executed.""" - - run_id: str - output_dir: Path - env_dir: Path - endpoints_path: Path | None = None - endpoints_path_explicit: bool = False - default_api_key_var: str - default_api_key_var_explicit: bool = False - default_api_base_url: str - api_base_url_override: str | None = None - log_level: str = "INFO" - verbose: bool = False - save_results: bool = True - save_to_hf_hub: bool = False - hf_hub_dataset_name: str | None = None - max_concurrent_generation: int | None = None # Deprecated; accepted for compatibility and ignored. - max_concurrent_scoring: int | None = None # Deprecated; accepted for compatibility and ignored. 
- max_concurrent: int | None = None # CLI override for max_concurrent - http_max_retries: int | None = None # CLI override for ClientConfig.max_retries - rollout_max_retries: int = 0 # CLI override for EvalConfig.max_retries - timeout: float | None = None - sleep: float = 0.0 - dry_run: bool = False - cli_env_args: dict[str, Any] | None = None - cli_sampling_args: dict[str, Any] | None = None - forced_job_ids: set[str] = Field(default_factory=set) - - @field_validator("output_dir", "env_dir", mode="before") - @classmethod - def _expand_path(cls, value: Path | str) -> Path: - return Path(value).expanduser() - - @field_validator("endpoints_path", mode="before") - @classmethod - def _expand_optional_path(cls, value: Path | str | None) -> Path | None: - if value is None: - return None - return Path(value).expanduser() - - -class JobExecutionResult(BaseModel): - """Outcome emitted for each executed job.""" - - job_id: str - status: Literal["succeeded", "failed", "skipped"] - error: str | None = None - duration_seconds: float | None = None - output_path: Path | None = None - result: Any | None = None - - -def execute_jobs( - jobs: Sequence[ResolvedJob], - settings: ExecutorSettings, - *, - endpoints_cache: EndpointRegistryCache | None = None, - env_metadata_cache: EnvMetadataCache | None = None, - manifest: RunManifest | None = None, -) -> list[JobExecutionResult]: - """Execute a sequence of resolved jobs.""" - ensure_root_logging(settings.log_level) - logger.info("Starting run '%s' with %d job(s).", settings.run_id, len(jobs)) - - run_dir = settings.output_dir / settings.run_id - run_dir.mkdir(parents=True, exist_ok=True) - - job_statuses: dict[str, str] = {job.job_id: "pending" for job in jobs} - results: list[JobExecutionResult] = [] - interrupted = False - - for index, job in enumerate(jobs): - is_last_job = index == len(jobs) - 1 - env_identifier = resolve_env_identifier(job.env) - model_identifier = job.model.id or job.model.model or job.job_id - job_label = 
f"{job.job_id} (env={env_identifier}, model={model_identifier})" - logger.info("Job %d/%d starting: %s", index + 1, len(jobs), job_label) - job_dir = (run_dir / job.job_id).resolve() - job_dir.mkdir(parents=True, exist_ok=True) - job_statuses[job.job_id] = "running" - forced_clean = job.job_id in settings.forced_job_ids - - if settings.dry_run: - logger.info("Dry run enabled; skipping execution for job '%s'.", job.job_id) - results.append( - JobExecutionResult( - job_id=job.job_id, - status="skipped", - output_path=job_dir, - ) - ) - job_statuses[job.job_id] = "skipped" - _log_job_progress_window(jobs, index, job_statuses, event="dry-run skip") - continue - - if manifest is not None: - manifest.record_job_start(job.job_id) - - try: - _prepare_job_dir_for_resume(job_id=job.job_id, job_dir=job_dir, forced_clean=forced_clean) - except Exception as exc: # noqa: BLE001 - error_message = f"{job_label} preflight failed: {exc}" - logger.exception("%s", error_message) - _record_job_failure( - results=results, - job_statuses=job_statuses, - jobs=jobs, - center_index=index, - manifest=manifest, - job_id=job.job_id, - output_path=job_dir, - error_message=error_message, - manifest_error=str(exc), - duration_seconds=None, - status_label="failed", - event="failure", - note="during preflight", - ) - _maybe_sleep_between_jobs(job, settings, is_last=is_last_job) - continue - - try: - endpoints = _load_endpoints_for_model(job.model, settings, cache=endpoints_cache) - resolved_model, client_config, prime_sampling_overrides = build_client_config( - job.model, - endpoints=endpoints, - default_api_key_var=settings.default_api_key_var, - default_api_key_var_explicit=settings.default_api_key_var_explicit, - default_api_base_url=settings.default_api_base_url, - api_base_url_override=settings.api_base_url_override, - http_max_retries_override=settings.http_max_retries, - timeout_override=settings.timeout, - headers=job.model.headers, - ) - # Merge Prime Inference overrides with job sampling 
args (job args take precedence) - merged_sampling_args = {**prime_sampling_overrides, **job.sampling_args} - eval_config = build_eval_config( - job_label=job.job_id, - model_cfg=job.model, - env_cfg=job.env, - env_args=job.env_args, - sampling_args=merged_sampling_args, - cli_env_args=settings.cli_env_args, - cli_sampling_args=settings.cli_sampling_args, - resolved_model=resolved_model, - client_config=client_config, - env_dir=settings.env_dir, - max_concurrent_override=settings.max_concurrent, - max_concurrent_generation=settings.max_concurrent_generation, - max_concurrent_scoring=settings.max_concurrent_scoring, - rollout_max_retries=settings.rollout_max_retries, - resume_path=job_dir, - default_max_concurrent=DEFAULT_BATCH_MAX_CONCURRENT, - save_results=settings.save_results, - save_to_hf_hub=settings.save_to_hf_hub, - hf_hub_dataset_name=settings.hf_hub_dataset_name, - verbose=settings.verbose, - env_metadata_cache=env_metadata_cache, - env_metadata_loader=load_env_metadata, - enforce_required_env_args=True, - ) - except KeyboardInterrupt: - logger.warning("Interrupted while preparing job %s.", job_label) - interruption_message = f"{job_label} interrupted by user" - _record_job_failure( - results=results, - job_statuses=job_statuses, - jobs=jobs, - center_index=index, - manifest=manifest, - job_id=job.job_id, - output_path=job_dir, - error_message=interruption_message, - manifest_error="interrupted by user", - duration_seconds=None, - status_label="interrupted", - event="interruption", - note="during preparation", - ) - interrupted = True - break - except Exception as exc: # noqa: BLE001 - error_message = f"{job_label} preparation failed: {exc}" - logger.exception("%s", error_message) - _record_job_failure( - results=results, - job_statuses=job_statuses, - jobs=jobs, - center_index=index, - manifest=manifest, - job_id=job.job_id, - output_path=job_dir, - error_message=error_message, - manifest_error=str(exc), - duration_seconds=None, - status_label="failed", - 
event="failure", - note="during preparation", - ) - _maybe_sleep_between_jobs(job, settings, is_last=is_last_job) - continue - - start = perf_counter() - try: - eval_result = asyncio.run(run_evaluation(eval_config)) - except KeyboardInterrupt: - duration = perf_counter() - start - logger.warning("Job %s interrupted by user after %.2fs.", job_label, duration) - interruption_message = f"{job_label} interrupted by user" - _record_job_failure( - results=results, - job_statuses=job_statuses, - jobs=jobs, - center_index=index, - manifest=manifest, - job_id=job.job_id, - output_path=job_dir, - error_message=interruption_message, - manifest_error="interrupted by user", - duration_seconds=duration, - status_label="interrupted", - event="interruption", - ) - interrupted = True - break - except Exception as exc: # noqa: BLE001 - duration = perf_counter() - start - if is_resume_metadata_mismatch_error(exc): - _log_resume_mismatch_diagnostics(job_id=job.job_id, resume_path=job_dir, eval_config=eval_config) - prescriptive = ( - "Job output dir contains incompatible prior results; " - "use --force to rerun cleanly or start a new run_id." 
- ) - error_message = f"{job_label} failed after {duration:.2f}s: {prescriptive}" - logger.error("%s", error_message) - manifest_error = prescriptive - else: - error_message = f"{job_label} failed after {duration:.2f}s: {exc}" - logger.exception("%s", error_message) - manifest_error = str(exc) - _record_job_failure( - results=results, - job_statuses=job_statuses, - jobs=jobs, - center_index=index, - manifest=manifest, - job_id=job.job_id, - output_path=job_dir, - error_message=error_message, - manifest_error=manifest_error, - duration_seconds=duration, - status_label="failed", - event="failure", - ) - _maybe_sleep_between_jobs(job, settings, is_last=is_last_job) - continue - - duration = perf_counter() - start - logger.info("Job '%s' completed in %.2fs.", job.job_id, duration) - - _materialize_results(job_dir, eval_result) - avg_reward = _extract_avg_reward(eval_result) - metrics_avg = _extract_avg_metrics(eval_result) - metadata = _safe_get(eval_result, "metadata", None) - num_examples = _safe_get(metadata, "num_examples", None) - rollouts_per_example = _safe_get(metadata, "rollouts_per_example", None) - - if manifest is not None: - manifest.record_job_completion( - job.job_id, - duration_seconds=duration, - results_dir=job_dir, - avg_reward=avg_reward, - metrics=metrics_avg, - num_examples=num_examples, - rollouts_per_example=rollouts_per_example, - ) - - results.append( - JobExecutionResult( - job_id=job.job_id, - status="succeeded", - duration_seconds=duration, - output_path=job_dir, - result=eval_result, - ) - ) - job_statuses[job.job_id] = "completed" - _log_job_progress_window(jobs, index, job_statuses, event="completion") - _maybe_sleep_between_jobs(job, settings, is_last=is_last_job) - - if interrupted: - logger.warning("Execution interrupted by user; %d job(s) left pending.", len(jobs) - len(results)) - - return results - - -def _load_endpoints_for_model( - model_cfg: ModelConfigSchema, - settings: ExecutorSettings, - *, - cache: EndpointRegistryCache | 
None, -) -> EndpointRegistry: - """Load the endpoint registry to use for a model.""" - registry_path = model_cfg.endpoints_path or settings.endpoints_path - if registry_path is None: - return {} - - registry_path_obj = Path(registry_path).expanduser() - default_registry_path = Path(DEFAULT_ENDPOINTS_PATH).expanduser() - explicit_path = bool(model_cfg.endpoints_path) or settings.endpoints_path_explicit - - if not registry_path_obj.exists(): - if explicit_path: - raise FileNotFoundError(f"Endpoint registry not found at {registry_path_obj}") - if _same_path(registry_path_obj, default_registry_path): - logger.warning( - "Default endpoints registry '%s' not found; continuing without endpoint aliases.", - registry_path_obj, - ) - return {} - - endpoints = load_endpoint_registry(registry_path_obj, cache=cache) - if explicit_path and not endpoints: - raise ValueError(f"Failed to load endpoint registry from explicit path '{registry_path_obj}'") - return endpoints - - -def _record_job_failure( - *, - results: list[JobExecutionResult], - job_statuses: dict[str, str], - jobs: Sequence[ResolvedJob], - center_index: int, - manifest: RunManifest | None, - job_id: str, - output_path: Path, - error_message: str, - manifest_error: str, - duration_seconds: float | None, - status_label: str, - event: str, - note: str | None = None, -) -> None: - if manifest is not None: - manifest.record_job_failure(job_id, error=manifest_error, duration_seconds=duration_seconds) - results.append( - JobExecutionResult( - job_id=job_id, - status="failed", - error=error_message, - duration_seconds=duration_seconds, - output_path=output_path, - ) - ) - job_statuses[job_id] = status_label - _log_job_progress_window(jobs, center_index, job_statuses, event=event, note=note) - - -def _safe_get(obj: Any, key: str, default: Any = None) -> Any: - """Retrieve attribute or dict key, allowing newer dict-style GenerateOutputs.""" - if isinstance(obj, dict): - return obj.get(key, default) - return getattr(obj, key, 
default) - - -def _same_path(left: Path, right: Path) -> bool: - try: - return left.resolve(strict=False) == right.resolve(strict=False) - except OSError: - return left == right - - -def _materialize_results(job_dir: Path, results: GenerateOutputs) -> None: - """Move evaluation artifacts into the job directory.""" - metadata = _safe_get(results, "metadata", None) - raw_path = _safe_get(metadata, "path_to_save", None) - src_path = Path(raw_path) if raw_path else job_dir - try: - resolved_src = src_path.resolve() - except OSError: - resolved_src = src_path - try: - resolved_job_dir = job_dir.resolve() - except OSError: - resolved_job_dir = job_dir - - if resolved_src == resolved_job_dir: - logger.debug("Results already in job_dir; _materialize_results no-op for %s.", job_dir) - return - - if src_path.exists() and resolved_src != resolved_job_dir: - logger.warning( - "Unexpected results source path for job '%s': src=%s, job_dir=%s. Materializing as a safety net.", - job_dir.name, - src_path, - job_dir, - ) - for item in src_path.iterdir(): - target = job_dir / item.name - if target.exists(): - if target.is_dir(): - shutil.rmtree(target) - else: - target.unlink() - shutil.move(str(item), target) - with contextlib.suppress(OSError): - src_path.rmdir() - - -def _prepare_job_dir_for_resume(*, job_id: str, job_dir: Path, forced_clean: bool) -> None: - if not job_dir.exists(): - job_dir.mkdir(parents=True, exist_ok=True) - return - if not job_dir.is_dir(): - msg = f"Job output dir '{job_dir}' is not a directory. Use --force to rerun cleanly or choose a new run_id." 
- raise ValueError(msg) - if not any(job_dir.iterdir()): - return - - if forced_clean: - archive_path = _archive_job_dir(job_dir) - logger.info("Forced rerun for job '%s': archived '%s' -> '%s'.", job_id, job_dir, archive_path) - job_dir.mkdir(parents=True, exist_ok=True) - return - - if is_valid_resume_results_path(job_dir): - return - - msg = ( - f"Job output dir '{job_dir}' is non-empty but not a valid evaluation results path " - "(expected results.jsonl and metadata.json). " - "Use --force to rerun cleanly or choose a new run_id." - ) - raise ValueError(msg) - - -def _archive_job_dir(job_dir: Path) -> Path: - timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - candidate = job_dir.with_name(f"{job_dir.name}__old_{timestamp}") - suffix = 1 - while candidate.exists(): - candidate = job_dir.with_name(f"{job_dir.name}__old_{timestamp}_{suffix}") - suffix += 1 - job_dir.rename(candidate) - return candidate - - -def _log_resume_mismatch_diagnostics(*, job_id: str, resume_path: Path, eval_config: Any) -> None: - logger.error("Resume metadata mismatch for job '%s' at %s.", job_id, resume_path) - saved_values = load_resume_metadata_values(resume_path) - current_values = { - "env_id": getattr(eval_config, "env_id", ""), - "model": getattr(eval_config, "model", ""), - "rollouts_per_example": getattr(eval_config, "rollouts_per_example", ""), - "num_examples": getattr(eval_config, "num_examples", ""), - } - for line in format_resume_mismatch_lines(saved_values=saved_values, current_values=current_values): - logger.error(" %s", line) - logger.error("Resume supports increasing num_examples, but not decreasing it.") - - -def _extract_avg_reward(results: GenerateOutputs) -> float | None: - """Return metadata-level average reward from GenerateOutputs.""" - metadata = _safe_get(results, "metadata", None) - metadata_avg = _safe_get(metadata, "avg_reward", None) - if metadata_avg is not None: - return float(metadata_avg) - return None - - -def _extract_avg_metrics(results: 
GenerateOutputs) -> dict[str, float]: - """Return metadata-level average metrics from GenerateOutputs.""" - metadata = _safe_get(results, "metadata", None) - raw_metrics = _safe_get(metadata, "avg_metrics", None) - if not isinstance(raw_metrics, Mapping): - return {} - - metrics: dict[str, float] = {} - for key, value in raw_metrics.items(): - if value is None: - continue - try: - metrics[str(key)] = float(value) - except (TypeError, ValueError): - continue - return metrics - - -def _log_job_progress_window( - jobs: Sequence[ResolvedJob], - center_index: int, - job_statuses: Mapping[str, str], - *, - event: str, - note: str | None = None, -) -> None: - if not jobs: - return - start = max(0, center_index - 1) - end = min(len(jobs), center_index + 2) - lines: list[str] = [] - header = "Segment | Job ID | Status | Model | Env | Name" - divider = "-" * len(header) - lines.append(header) - lines.append(divider) - for idx in range(start, end): - job = jobs[idx] - segment = "current" if idx == center_index else ("previous" if idx < center_index else "next") - status = job_statuses.get(job.job_id, "pending") - model_label = job.model.id or job.model.model or "-" - try: - env_label = resolve_env_identifier(job.env) - except ValueError: - env_label = job.env.id or job.job_id - lines.append( - f"{segment:8} | {job.job_id:20} | {status:10} | {model_label:15} | {env_label:20} | {job.name or '-'}" - ) - label = f"Job progress after {event}" - if note: - label = f"{label} ({note})" - logger.info("%s:\n%s", label, "\n".join(lines)) - - -def _maybe_sleep_between_jobs(job: ResolvedJob, settings: ExecutorSettings, *, is_last: bool) -> None: - """Optionally pause between jobs to spread out environment runs.""" - if settings.dry_run or is_last: - return - delay = job.sleep if job.sleep is not None else settings.sleep - if delay is None or delay <= 0: - return - if rich_print: - rich_print(f"[cyan]Sleeping {delay:.2f} second(s) before next job...[/cyan]") - logger.info("Sleeping %.2f 
second(s) before next job...", delay) - sleep(delay) - - -__all__ = ["ExecutorSettings", "JobExecutionResult", "execute_jobs"] diff --git a/medarc_verifiers/cli/_manifest.py b/medarc_verifiers/cli/_manifest.py deleted file mode 100644 index f3827d4f..00000000 --- a/medarc_verifiers/cli/_manifest.py +++ /dev/null @@ -1,892 +0,0 @@ -"""Run manifest helpers for the unified CLI.""" - -from __future__ import annotations - -import json -import logging -from collections import Counter -from dataclasses import dataclass -from datetime import UTC, datetime -from pathlib import Path -from typing import Any, Mapping, Sequence - -from pydantic import BaseModel, ConfigDict, Field, model_validator - -from medarc_verifiers.cli._job_builder import ResolvedJob -from medarc_verifiers.cli._schemas import ModelConfigSchema -from medarc_verifiers.cli.utils.json_io import dumps_json -from medarc_verifiers.cli.utils.shared import count_jsonl_rows, compute_checksum, resolve_env_identifier_or -from medarc_verifiers.utils.pathing import normalize_results_dir_for_manifest - -MANIFEST_FILENAME = "run_manifest.json" -MANIFEST_VERSION = 3 -SUPPORTED_MANIFEST_VERSIONS = {3} - -logger = logging.getLogger(__name__) - - -class ManifestConflictError(ValueError): - """Raised when an existing manifest conflicts with the current config.""" - - -def _normalize_model_slug(value: str) -> str: - """Normalize model slugs for restart comparisons. - - Some providers expose the same model under different namespaces (e.g. - `google/gemini-3-pro-preview` vs `gemini-3-pro-preview`). For now, we only - normalize Gemini model slugs by stripping a single leading namespace. 
- """ - if not value: - return value - if "/" not in value: - return value - candidate = value.rsplit("/", 1)[-1] - if candidate.startswith("gemini-"): - return candidate - return value - - -class ManifestJobEntry(BaseModel): - """Pydantic model describing a single manifest job entry.""" - - model_config = ConfigDict(extra="ignore") - - job_id: str - env_id: str | None = None - model_id: str | None = None - env_template_id: str - env_variant_id: str - env_args: dict[str, Any] - sampling_args: dict[str, Any] | None = None - status: str = "pending" - reason: str | None = None - attempt: int = 0 - started_at: str | None = None - ended_at: str | None = None - duration_seconds: float | None = None - results_dir: str | None = None - results_relpath: str | None = None - metadata_relpath: str | None = None - row_count: int | None = None - metrics: dict[str, Any] | None = None - avg_reward: float | None = None - num_examples: int | None = None - rollouts_per_example: int | None = None - - -# Keep this list aligned with runtime/progress fields mutated by record_job_* methods. -# Fields here must survive ensure_job() refreshes when config/env metadata changes. 
-_ENSURE_JOB_RUNTIME_STATE_FIELDS = ( - "status", - "reason", - "attempt", - "started_at", - "ended_at", - "duration_seconds", - "row_count", - "metrics", - "avg_reward", - "num_examples", - "rollouts_per_example", -) - - -def _validate_ensure_job_runtime_state_fields() -> None: - missing = set(_ENSURE_JOB_RUNTIME_STATE_FIELDS) - set(ManifestJobEntry.model_fields) - if missing: - msg = f"Unknown manifest fields in _ENSURE_JOB_RUNTIME_STATE_FIELDS: {sorted(missing)}" - raise ValueError(msg) - - -_validate_ensure_job_runtime_state_fields() - - -class RunManifestModel(BaseModel): - """Root manifest payload persisted to disk.""" - - model_config = ConfigDict(extra="allow") - - version: int = MANIFEST_VERSION - run_id: str - name: str - config_source: str - config_checksum: str - created_at: str - updated_at: str - restart_source: str | None = None - artifacts_root: str = "." - models: dict[str, dict[str, Any]] = Field(default_factory=dict) - env_templates: dict[str, dict[str, Any]] = Field(default_factory=dict) - jobs: list[ManifestJobEntry] = Field(default_factory=list) - summary: dict[str, int] = Field(default_factory=dict) - - @model_validator(mode="after") - def _check_version(self) -> RunManifestModel: - if self.version not in SUPPORTED_MANIFEST_VERSIONS: - msg = ( - f"Manifest version {self.version} is not supported; " - f"expected one of {sorted(SUPPORTED_MANIFEST_VERSIONS)}." 
- ) - raise ValueError(msg) - return self - - -def timestamp() -> str: - """Return an ISO8601 timestamp in UTC.""" - return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") - - -def compute_snapshot_checksum(snapshot: Mapping[str, Any]) -> str: - """Public helper to compute the checksum for a config snapshot.""" - sanitized = dict(snapshot) - models = sanitized.get("models") - if isinstance(models, Mapping): - sanitized_models: dict[str, Any] = {} - for model_id, payload in models.items(): - if isinstance(payload, Mapping): - sanitized_models[str(model_id)] = { - key: value for key, value in payload.items() if key not in ModelConfigSchema.resume_tolerant_fields - } - else: - sanitized_models[str(model_id)] = payload - sanitized["models"] = sanitized_models - return compute_checksum(sanitized) - - -def _drop_resume_tolerant_fields(payload: Mapping[str, Any]) -> dict[str, Any]: - cleaned = dict(payload) - model_payload = cleaned.get("model") - if isinstance(model_payload, Mapping): - cleaned["model"] = { - key: value for key, value in model_payload.items() if key not in ModelConfigSchema.resume_tolerant_fields - } - return cleaned - - -def _relativize_results_dir(value: str | Path, *, run_dir: Path) -> str: - """Ensure results directories are stored relative to the project root.""" - return normalize_results_dir_for_manifest(value, run_dir=run_dir) - - -def _to_jsonable(value: Any) -> Any: - """Convert arbitrary data to JSON-serializable structures (default=str).""" - return json.loads(json.dumps(value, default=str)) - - -def _normalize_payload(payload: Mapping[str, Any]) -> dict[str, Any]: - def _drop(value: Any) -> Any: - if isinstance(value, dict): - return {k: _drop(v) for k, v in value.items() if v is not None} - if isinstance(value, list): - return [_drop(v) for v in value] - return value - - return _drop(_to_jsonable(payload)) - - -def _require_manifest_v3(payload: Mapping[str, Any], *, path: Path | None = None) -> None: - version = 
payload.get("version") - if version not in SUPPORTED_MANIFEST_VERSIONS: - location = f" '{path}'" if path else "" - msg = f"Manifest{location} uses version {version}; expected one of {sorted(SUPPORTED_MANIFEST_VERSIONS)}." - raise ValueError(msg) - - -def _sanitize_model_payload(model_payload: Mapping[str, Any]) -> dict[str, Any]: - sanitized = { - key: value for key, value in model_payload.items() if key not in ModelConfigSchema.resume_tolerant_fields - } - - model_slug = sanitized.get("model") - if isinstance(model_slug, str): - sanitized["model"] = _normalize_model_slug(model_slug) - - # Provider quirks: OpenAI-compatible endpoints vary widely in what they accept when - # we forward `sampling_args.extra_body`. Treat *all* of extra_body as resume-tolerant - # for the purposes of manifest conflict detection so users can switch providers - # without getting blocked by payload drift. - sampling_args = sanitized.get("sampling_args") - if isinstance(sampling_args, Mapping): - updated_sampling_args = dict(sampling_args) - updated_sampling_args.pop("extra_body", None) - if updated_sampling_args: - sanitized["sampling_args"] = updated_sampling_args - else: - sanitized.pop("sampling_args", None) - - return sanitized - - -def _sampling_extra_body(model_payload: Mapping[str, Any]) -> dict[str, Any] | None: - sampling_args = model_payload.get("sampling_args") - if not isinstance(sampling_args, Mapping): - return None - extra_body = sampling_args.get("extra_body") - if not isinstance(extra_body, Mapping): - return None - normalized = _normalize_payload(extra_body) - return normalized or None - - -def _warn_extra_body_change(key: str, existing: Mapping[str, Any], payload: Mapping[str, Any]) -> None: - existing_extra = _sampling_extra_body(existing) - payload_extra = _sampling_extra_body(payload) - if existing_extra is None and payload_extra is None: - return - if compute_checksum(existing_extra or {}) == compute_checksum(payload_extra or {}): - return - logger.warning( - 
"Model '%s' sampling_args.extra_body changed; allowing restart, but providers may reject unknown fields.", - key, - ) - - -def _sampling_args_payload(model_payload: Mapping[str, Any]) -> dict[str, Any] | None: - sampling_args = model_payload.get("sampling_args") - if not isinstance(sampling_args, Mapping): - return None - normalized = _normalize_payload(sampling_args) - return normalized or None - - -def _warn_sampling_args_change(key: str, existing: Mapping[str, Any], payload: Mapping[str, Any]) -> None: - existing_sampling = _sampling_args_payload(existing) - payload_sampling = _sampling_args_payload(payload) - if existing_sampling is None and payload_sampling is None: - return - if compute_checksum(existing_sampling or {}) == compute_checksum(payload_sampling or {}): - return - logger.warning( - "Model '%s' sampling_args changed; allowing restart, but providers may reject unsupported parameters.", - key, - ) - - -def _effective_sampling_args(entry: ManifestJobEntry, model_payload: Mapping[str, Any]) -> Mapping[str, Any]: - if entry.sampling_args is not None: - return _normalize_payload(entry.sampling_args) - return _normalize_payload(model_payload.get("sampling_args") or {}) - - -def _canonical_manifest_parts( - *, - model_payload: Mapping[str, Any], - env_payload: Mapping[str, Any], - env_args: Mapping[str, Any], - sampling_args: Mapping[str, Any], - env_identifier: str | None, - env_variant_id: str, - env_payload_is_template: bool = False, - sampling_args_already_resolved: bool = False, -) -> dict[str, Any]: - model_normalized = _normalize_payload(model_payload) - env_normalized = _normalize_payload(env_payload) - if env_payload_is_template: - env_template_payload = dict(env_normalized) - else: - env_template_payload = _build_env_template_payload(env_normalized) - if "module" not in env_template_payload: - env_template_payload["module"] = env_identifier - if sampling_args_already_resolved: - sampling_override = None - sampling_payload = sampling_args - else: - 
sampling_override = _sampling_args_override(sampling_args=sampling_args, model_payload=model_normalized) - sampling_payload = sampling_override or model_normalized.get("sampling_args") or {} - effective_env_payload = { - **env_template_payload, - "module": env_identifier, - "id": env_variant_id, - "env_args": _normalize_payload(env_args), - } - return { - "model_sanitized": _sanitize_model_payload(model_normalized), - "env_template_payload": _normalize_payload(env_template_payload), - "effective_env_payload": _normalize_payload(effective_env_payload), - "sampling_payload": _normalize_payload(sampling_payload), - "sampling_override": sampling_override, - } - - -def manifest_job_signature(manifest: RunManifestModel, entry: ManifestJobEntry) -> dict[str, Any]: - model_payload = _normalize_payload(manifest.models.get(entry.model_id or "", {}) or {}) - env_template = _normalize_payload(manifest.env_templates.get(entry.env_template_id, {}) or {}) - env_identifier = entry.env_id or env_template.get("module") - canonical = _canonical_manifest_parts( - model_payload=model_payload, - env_payload=env_template, - env_args=entry.env_args, - sampling_args=_effective_sampling_args(entry, model_payload), - env_identifier=env_identifier, - env_variant_id=entry.env_variant_id, - env_payload_is_template=True, - sampling_args_already_resolved=True, - ) - signature = { - "model": canonical["model_sanitized"], - "env": canonical["effective_env_payload"], - "sampling_args": canonical["sampling_payload"], - } - return _normalize_payload(signature) - - -def resolved_job_signature( - job: ResolvedJob, - *, - env_args: Mapping[str, Any], - sampling_args: Mapping[str, Any], -) -> dict[str, Any]: - model_payload = _normalize_payload(json.loads(job.model.model_dump_json(exclude_none=True))) - env_payload = _normalize_payload(json.loads(job.env.model_dump_json(exclude_none=True))) - env_id = env_payload.get("module") or _resolve_env_identifier(job) - env_variant_id = str(env_payload.get("id") or 
job.job_id) - canonical = _canonical_manifest_parts( - model_payload=model_payload, - env_payload=env_payload, - env_args=env_args, - sampling_args=sampling_args, - env_identifier=env_id, - env_variant_id=env_variant_id, - ) - signature = { - "model": canonical["model_sanitized"], - "env": canonical["effective_env_payload"], - "sampling_args": canonical["sampling_payload"], - } - return _normalize_payload(signature) - - -def _maybe_store_results_dir(value: str | Path | None, *, run_dir: Path, job_id: str) -> str | None: - if value is None: - return None - normalized = _relativize_results_dir(value, run_dir=run_dir) - default_value = _relativize_results_dir(run_dir / job_id, run_dir=run_dir) - if normalized == default_value: - return None - return normalized - - -def _manifest_relative_artifacts(*, run_dir: Path, job_id: str, results_dir: Path | str | None) -> tuple[str, str]: - if results_dir is None: - base_rel = Path(job_id) - else: - candidate = Path(results_dir) - if not candidate.is_absolute(): - candidate = (run_dir / candidate).resolve() - else: - candidate = candidate.resolve() - try: - base_rel = candidate.relative_to(run_dir) - except ValueError: - base_rel = Path(job_id) - base_rel = Path(base_rel.as_posix()) - return ( - (base_rel / "results.jsonl").as_posix(), - (base_rel / "metadata.json").as_posix(), - ) - - -def _build_env_template_payload(env_payload: Mapping[str, Any]) -> dict[str, Any]: - payload = dict(env_payload) - payload.pop("id", None) - payload.pop("env_args", None) - return _normalize_payload(payload) - - -def _env_template_id(env_id: str, env_template_payload: Mapping[str, Any]) -> str: - digest = compute_checksum(_normalize_payload(env_template_payload))[:12] - return f"{env_id}:{digest}" - - -def _sampling_args_override( - *, - sampling_args: Mapping[str, Any], - model_payload: Mapping[str, Any], -) -> dict[str, Any] | None: - normalized_sampling = _normalize_payload(sampling_args) - model_sampling = model_payload.get("sampling_args") 
or {} - normalized_model_sampling = _normalize_payload(model_sampling) - if compute_checksum(normalized_sampling) == compute_checksum(normalized_model_sampling): - return None - return normalized_sampling - - -def _merge_unique_model_payload( - container: dict[str, dict[str, Any]], - key: str, - payload: dict[str, Any], - *, - allow_mismatch: bool, -) -> None: - existing = container.get(key) - if existing is None: - container[key] = payload - return - if existing == payload: - return - if allow_mismatch: - container[key] = payload - return - sanitized_existing = _sanitize_model_payload(existing) - sanitized_payload = _sanitize_model_payload(payload) - if sanitized_existing == sanitized_payload: - _warn_extra_body_change(key, existing, payload) - container[key] = payload - return - - stripped_existing = dict(sanitized_existing) - stripped_payload = dict(sanitized_payload) - stripped_existing.pop("sampling_args", None) - stripped_payload.pop("sampling_args", None) - if stripped_existing == stripped_payload: - _warn_sampling_args_change(key, existing, payload) - _warn_extra_body_change(key, existing, payload) - container[key] = payload - return - - all_keys = set(sanitized_existing) | set(sanitized_payload) - diff_keys = sorted(key for key in all_keys if sanitized_existing.get(key) != sanitized_payload.get(key)) - suffix = f" (conflicting keys: {', '.join(diff_keys)})" if diff_keys else "" - msg = f"Conflicting model payload for '{key}'{suffix}." - raise ManifestConflictError(msg) - - -def _merge_unique_payload( - container: dict[str, dict[str, Any]], - key: str, - payload: dict[str, Any], - *, - allow_mismatch: bool, - label: str, -) -> None: - existing = container.get(key) - if existing is None: - container[key] = payload - return - if existing != payload and not allow_mismatch: - msg = f"Conflicting {label} payload for '{key}'." 
- raise ValueError(msg) - container[key] = payload - - -def _resolve_env_identifier(job: ResolvedJob) -> str: - return resolve_env_identifier_or(job.env, job.job_id) - - -def _resolve_model_identifier(job: ResolvedJob) -> str: - mid = getattr(job.model, "id", None) - if mid: - return mid - if getattr(job.model, "model", None): - return job.model.model # type: ignore[return-value] - return job.job_id - - -def build_job_entry( - job: ResolvedJob, - *, - env_args: Mapping[str, Any], - sampling_args: Mapping[str, Any], - results_dir: str | None, - models: dict[str, dict[str, Any]] | None = None, - env_templates: dict[str, dict[str, Any]] | None = None, - allow_model_mismatch: bool = False, -) -> ManifestJobEntry: - """Build the manifest entry recorded for a job.""" - model_payload = _normalize_payload(json.loads(job.model.model_dump_json(exclude_none=True))) - env_payload = _normalize_payload(json.loads(job.env.model_dump_json(exclude_none=True))) - env_id = env_payload.get("module") or _resolve_env_identifier(job) - env_variant_id = str(env_payload.get("id") or job.job_id) - canonical = _canonical_manifest_parts( - model_payload=model_payload, - env_payload=env_payload, - env_args=env_args, - sampling_args=sampling_args, - env_identifier=env_id, - env_variant_id=env_variant_id, - ) - env_template_payload = canonical["env_template_payload"] - env_template_id = _env_template_id(env_id, env_template_payload) - if models is not None: - _merge_unique_model_payload( - models, - _resolve_model_identifier(job), - model_payload, - allow_mismatch=allow_model_mismatch, - ) - if env_templates is not None: - _merge_unique_payload( - env_templates, - env_template_id, - env_template_payload, - allow_mismatch=False, - label="manifest template", - ) - results_relpath, metadata_relpath = _manifest_relative_artifacts( - run_dir=Path("."), - job_id=job.job_id, - results_dir=job.job_id, - ) - return ManifestJobEntry( - job_id=job.job_id, - env_id=env_id, - 
model_id=_resolve_model_identifier(job), - env_template_id=env_template_id, - env_variant_id=env_variant_id, - env_args=_normalize_payload(env_args), - sampling_args=canonical["sampling_override"], - status="pending", - reason=None, - attempt=0, - started_at=None, - ended_at=None, - duration_seconds=None, - results_dir=results_dir, - results_relpath=results_relpath, - metadata_relpath=metadata_relpath, - row_count=None, - metrics=None, - avg_reward=None, - num_examples=None, - rollouts_per_example=None, - ) - - -def _summarize_jobs(entries: Sequence[ManifestJobEntry]) -> dict[str, int]: - counter = Counter((entry.status or "pending") for entry in entries) - skipped = sum(1 for entry in entries if entry.reason in {"up_to_date", "skipped"}) - summary = { - "total": len(entries), - "pending": counter.get("pending", 0), - "running": counter.get("running", 0), - "completed": counter.get("completed", 0), - "failed": counter.get("failed", 0), - "skipped": skipped, - } - return summary - - -@dataclass -class RunManifest: - """In-memory representation of a run manifest.""" - - path: Path - model: RunManifestModel - persist: bool = True - - def __post_init__(self) -> None: - self._jobs: list[ManifestJobEntry] = list(self.model.jobs) - self.model.jobs = self._jobs - self._index: dict[str, ManifestJobEntry] = {entry.job_id: entry for entry in self._jobs if entry.job_id} - if not self.model.summary: - self.model.summary = _summarize_jobs(self._jobs) - - @property - def jobs(self) -> list[ManifestJobEntry]: - return self._jobs - - @property - def summary(self) -> Mapping[str, Any]: - return self.model.summary - - @property - def payload(self) -> dict[str, Any]: - """Dictionary representation (back-compat).""" - return self.model.model_dump() - - def job_entry(self, job_id: str) -> ManifestJobEntry | None: - return self._index.get(job_id) - - @property - def run_dir(self) -> Path: - return self.path.parent - - def ensure_job( - self, - job: ResolvedJob, - *, - env_args: 
Mapping[str, Any], - sampling_args: Mapping[str, Any], - results_dir: Path, - ) -> ManifestJobEntry: - entry = self._index.get(job.job_id) - normalized_results_dir = _maybe_store_results_dir(results_dir, run_dir=self.run_dir, job_id=job.job_id) - results_relpath, metadata_relpath = _manifest_relative_artifacts( - run_dir=self.run_dir, - job_id=job.job_id, - results_dir=results_dir, - ) - if entry is None: - entry = build_job_entry( - job, - env_args=env_args, - sampling_args=sampling_args, - results_dir=normalized_results_dir, - models=self.model.models, - env_templates=self.model.env_templates, - ) - entry.results_relpath = results_relpath - entry.metadata_relpath = metadata_relpath - self._jobs.append(entry) - self._index[job.job_id] = entry - self._refresh_summary(save=False) - return entry - - updated = build_job_entry( - job, - env_args=env_args, - sampling_args=sampling_args, - results_dir=normalized_results_dir, - models=self.model.models, - env_templates=self.model.env_templates, - ) - runtime_state = {field: getattr(entry, field) for field in _ENSURE_JOB_RUNTIME_STATE_FIELDS} - if entry.results_dir is not None: - results_dir_value = entry.results_dir - else: - results_dir_value = updated.results_dir - replacement = updated.model_copy( - update={ - **runtime_state, - "results_dir": results_dir_value, - "results_relpath": results_relpath, - "metadata_relpath": metadata_relpath, - } - ) - # Preserve object identity so external references to `entry` remain live. 
- for field_name, value in replacement.model_dump().items(): - setattr(entry, field_name, value) - self._index[job.job_id] = entry - return entry - - def record_job_start(self, job_id: str) -> None: - entry = self._index.get(job_id) - if not entry: - return - entry.status = "running" - entry.reason = None - entry.started_at = timestamp() - entry.attempt = int(entry.attempt or 0) + 1 - self._refresh_summary() - - def record_job_completion( - self, - job_id: str, - *, - duration_seconds: float, - results_dir: Path, - avg_reward: float | None, - metrics: Mapping[str, Any], - num_examples: int | None, - rollouts_per_example: int | None, - ) -> None: - entry = self._index.get(job_id) - if not entry: - return - entry.status = "completed" - entry.reason = None - entry.ended_at = timestamp() - entry.duration_seconds = duration_seconds - entry.results_dir = _maybe_store_results_dir(results_dir, run_dir=self.run_dir, job_id=job_id) - results_relpath, metadata_relpath = _manifest_relative_artifacts( - run_dir=self.run_dir, - job_id=job_id, - results_dir=results_dir, - ) - entry.results_relpath = results_relpath - entry.metadata_relpath = metadata_relpath - entry.avg_reward = avg_reward - entry.metrics = dict(metrics) if metrics else None - entry.num_examples = num_examples - entry.rollouts_per_example = rollouts_per_example - results_path = results_dir / "results.jsonl" - entry.row_count = count_jsonl_rows(results_path) - self._refresh_summary() - - def record_job_failure(self, job_id: str, *, error: str, duration_seconds: float | None = None) -> None: - entry = self._index.get(job_id) - if not entry: - return - entry.status = "failed" - entry.reason = error - entry.ended_at = timestamp() - entry.duration_seconds = duration_seconds - self._refresh_summary() - - def record_job_skip( - self, - job_id: str, - *, - reason: str, - results_dir: str | Path | None = None, - source_entry: Mapping[str, Any] | None = None, - ) -> None: - entry = self._index.get(job_id) - if not entry: - 
return - entry.status = "completed" - entry.reason = reason - entry.ended_at = entry.ended_at or timestamp() - - if source_entry: - is_mapping = isinstance(source_entry, Mapping) - for key in ( - "duration_seconds", - "avg_reward", - "metrics", - "num_examples", - "rollouts_per_example", - "row_count", - ): - if is_mapping: - if key in source_entry: - setattr(entry, key, source_entry[key]) - else: - setattr(entry, key, getattr(source_entry, key)) - if results_dir: - entry.results_dir = _maybe_store_results_dir(results_dir, run_dir=self.run_dir, job_id=job_id) - results_relpath, metadata_relpath = _manifest_relative_artifacts( - run_dir=self.run_dir, - job_id=job_id, - results_dir=results_dir, - ) - entry.results_relpath = results_relpath - entry.metadata_relpath = metadata_relpath - if entry.metrics == {}: - entry.metrics = None - self._refresh_summary() - - def _refresh_summary(self, *, save: bool = True) -> None: - self.model.summary = _summarize_jobs(self._jobs) - self.model.updated_at = timestamp() - if save: - self.save() - - def save(self) -> None: - if not self.persist: - return - tmp_path = self.path.with_suffix(".tmp") - self.path.parent.mkdir(parents=True, exist_ok=True) - text = dumps_json(self.model.model_dump(exclude_none=True)) - tmp_path.write_text(text, encoding="utf-8") - tmp_path.replace(self.path) - - @classmethod - def load(cls, path: Path, *, persist: bool = True) -> RunManifest: - if not path.exists(): - raise FileNotFoundError(f"Run manifest '{path}' not found.") - with path.open("r", encoding="utf-8") as handle: - payload = json.load(handle) - payload, _ = _upgrade_manifest_payload(payload) - model = RunManifestModel.model_validate(payload) - return cls(path=path, model=model, persist=persist) - - @classmethod - def create( - cls, - *, - run_dir: Path, - run_id: str, - run_name: str, - config_source: Path, - config_checksum: str, - jobs: Sequence[ResolvedJob], - env_args_map: Mapping[str, Mapping[str, Any]], - sampling_args_map: Mapping[str, 
Mapping[str, Any]], - persist: bool = True, - restart_source: str | None = None, - ) -> RunManifest: - run_dir.mkdir(parents=True, exist_ok=True) - path = run_dir / MANIFEST_FILENAME - payload: Mapping[str, Any] = { - "version": MANIFEST_VERSION, - "run_id": run_id, - "name": run_name, - "config_source": str(config_source), - "config_checksum": config_checksum, - "created_at": timestamp(), - "updated_at": timestamp(), - "restart_source": restart_source, - "artifacts_root": ".", - "models": {}, - "env_templates": {}, - "jobs": [], - "summary": {}, - } - model = RunManifestModel.model_validate(payload) - manifest = cls(path=path, model=model, persist=persist) - for job in jobs: - env_args = env_args_map[job.job_id] - sampling_args = sampling_args_map[job.job_id] - manifest.ensure_job( - job, - env_args=env_args, - sampling_args=sampling_args, - results_dir=(run_dir / job.job_id), - ) - manifest._refresh_summary(save=True) - return manifest - - -__all__ = [ - "MANIFEST_FILENAME", - "RunManifest", - "RunManifestModel", - "ManifestJobEntry", - "build_job_entry", - "compute_snapshot_checksum", - "manifest_job_signature", - "resolved_job_signature", - "timestamp", -] - - -def _upgrade_manifest_payload(payload: Any) -> tuple[Any, bool]: - """Apply in-memory migrations for older manifest payloads.""" - if not isinstance(payload, dict): - return payload, False - - changed = False - version = payload.get("version") - jobs = payload.get("jobs") - if version == 2: - payload["version"] = MANIFEST_VERSION - payload.setdefault("artifacts_root", ".") - changed = True - if isinstance(jobs, list): - for index, job in enumerate(jobs): - if not isinstance(job, dict): - continue - job_id = str(job.get("job_id") or "") - base_dir = Path(str(job.get("results_dir") or job_id or "")) - if not base_dir.as_posix(): - base_dir = Path(job_id or f"job-{index}") - if "results_relpath" not in job: - job["results_relpath"] = (base_dir / "results.jsonl").as_posix() - changed = True - if 
"metadata_relpath" not in job: - job["metadata_relpath"] = (base_dir / "metadata.json").as_posix() - changed = True - if "summary_relpath" in job: - job.pop("summary_relpath", None) - changed = True - if "artifacts_checksum" in job: - job.pop("artifacts_checksum", None) - changed = True - if "artifacts" in job: - job.pop("artifacts", None) - changed = True - - env_templates = payload.get("env_templates") - if isinstance(env_templates, dict): - for template_id, template in env_templates.items(): - if not isinstance(template, dict): - continue - if "interleave_scoring" not in template: - continue - interleave_value = template.pop("interleave_scoring") - template.setdefault("independent_scoring", interleave_value) - env_templates[template_id] = template - changed = True - - return payload, changed diff --git a/medarc_verifiers/cli/_manifest_planner.py b/medarc_verifiers/cli/_manifest_planner.py deleted file mode 100644 index 64897260..00000000 --- a/medarc_verifiers/cli/_manifest_planner.py +++ /dev/null @@ -1,414 +0,0 @@ -"""Manifest planning helpers separating selection from runnable computation.""" - -from __future__ import annotations - -import json -import logging -from dataclasses import dataclass -from datetime import UTC, datetime -from pathlib import Path -from typing import Any, Mapping, Sequence - -from medarc_verifiers.cli._job_builder import ResolvedJob -from medarc_verifiers.cli._manifest import ( - MANIFEST_FILENAME, - RunManifest, - manifest_job_signature, - resolved_job_signature, -) -from medarc_verifiers.cli.utils.shared import slugify -from medarc_verifiers.utils.pathing import resolve_results_dir_from_manifest - -logger = logging.getLogger(__name__) - - -@dataclass -class ManifestPlan: - manifest: RunManifest - runnable_job_ids: set[str] - reused_job_ids: set[str] - - -@dataclass -class ManifestSelection: - manifest: RunManifest - seed_manifest: RunManifest | None - strategy: str - - -class ManifestPlanner: - """Resolve a manifest for a run and 
compute runnable/reused job sets.""" - - def __init__( - self, - *, - output_dir: Path, - run_id: str | None, - run_name: str, - config_path: Path, - config_checksum: str, - jobs: Sequence[ResolvedJob], - env_args_map: Mapping[str, Mapping[str, Any]], - sampling_args_map: Mapping[str, Mapping[str, Any]], - restart_source: str | None, - auto_resume: bool, - persist: bool, - ) -> None: - self.output_dir = Path(output_dir) - self.run_id = run_id - self.run_name = run_name - self.config_path = Path(config_path) - self.config_checksum = config_checksum - self.jobs = jobs - self.env_args_map = env_args_map - self.sampling_args_map = sampling_args_map - self.restart_source = restart_source - self.auto_resume = auto_resume - self.persist = persist - - def plan(self, *, force_all: bool, forced_envs: set[str]) -> ManifestPlan: - selection = self._select_manifest() - runnable, reused = self._compute_runnable(selection, force_all=force_all, forced_envs=forced_envs) - return ManifestPlan(manifest=selection.manifest, runnable_job_ids=runnable, reused_job_ids=reused) - - # Selection helpers - def _select_manifest(self) -> ManifestSelection: - if self.restart_source: - restart = self._select_restart_manifest(self.restart_source) - if restart: - return restart - - if self.auto_resume: - resumed = self._select_auto_resume_manifest() - if resumed: - return resumed - - manifest = self._create_fresh_manifest() - return ManifestSelection(manifest=manifest, seed_manifest=None, strategy="fresh") - - def _select_restart_manifest(self, restart_source: str) -> ManifestSelection | None: - persist = self.persist - restart_path = Path(restart_source).expanduser() - seed_dir: Path | None = None - if restart_path.exists() and restart_path.is_dir(): - seed_dir = restart_path.resolve() - else: - candidate = self.output_dir / restart_source - if candidate.exists() and candidate.is_dir(): - seed_dir = candidate.resolve() - if seed_dir and (seed_dir / MANIFEST_FILENAME).exists(): - seed_manifest = 
RunManifest.load(seed_dir / MANIFEST_FILENAME, persist=persist) - logger.info( - "Restart in-place: extending existing run '%s' with any new jobs from current config.", - seed_manifest.model.run_id, - ) - self._ensure_jobs(seed_manifest, seed_manifest.run_dir) - return ManifestSelection( - manifest=seed_manifest, - seed_manifest=seed_manifest, - strategy="restart_in_place", - ) - - if seed_dir is None: - return None - if not (seed_dir / MANIFEST_FILENAME).exists(): - msg = f"Invalid --restart '{seed_dir}': missing {MANIFEST_FILENAME}" - raise ValueError(msg) - seed_manifest = RunManifest.load(seed_dir / MANIFEST_FILENAME, persist=False) - dest_run_id = self.run_id or _generate_run_id(self.run_name) - run_dir = self._run_dir_for(dest_run_id) - manifest_path = run_dir / MANIFEST_FILENAME - if run_dir.exists() and manifest_path.exists() and persist: - msg = f"Run directory '{run_dir}' already exists; choose a different --run-id." - raise ValueError(msg) - logger.info("Restarting run '%s' from prior run '%s'.", dest_run_id, restart_source) - manifest = RunManifest.create( - run_dir=run_dir, - run_id=dest_run_id, - run_name=self.run_name, - config_source=self.config_path, - config_checksum=self.config_checksum, - jobs=self.jobs, - env_args_map=self.env_args_map, - sampling_args_map=self.sampling_args_map, - persist=persist, - restart_source=restart_source, - ) - self._ensure_jobs(manifest, run_dir) - return ManifestSelection(manifest=manifest, seed_manifest=seed_manifest, strategy="restart_new") - - def _select_auto_resume_manifest(self) -> ManifestSelection | None: - persist = self.persist - if self.run_id: - run_dir = self._run_dir_for(self.run_id) - manifest_path = run_dir / MANIFEST_FILENAME - if manifest_path.exists(): - manifest = RunManifest.load(manifest_path, persist=persist) - existing_checksum = manifest.model.config_checksum - if existing_checksum and existing_checksum != self.config_checksum: - msg = ( - f"Run '{self.run_id}' was created from a different 
configuration. " - f"To start fresh, pick a different --run-id or pass --no-auto-resume. " - f"To reuse completed jobs from this run, pass --restart {self.run_id}." - ) - raise ValueError(msg) - self._ensure_jobs(manifest, run_dir) - return ManifestSelection(manifest=manifest, seed_manifest=None, strategy="auto_resume") - if run_dir.exists(): - msg = f"Run '{self.run_id}' is missing {MANIFEST_FILENAME}; cannot auto-resume." - raise ValueError(msg) - logger.info( - "Auto-resume requested for run '%s', but no prior run exists. Starting a fresh run with this id.", - self.run_id, - ) - manifest = self._create_fresh_manifest(run_id=self.run_id) - return ManifestSelection(manifest=manifest, seed_manifest=None, strategy="fresh") - - candidate = _find_auto_resume_candidate(self.output_dir, expected_checksum=self.config_checksum) - if candidate is None: - logger.info( - "Auto-resume enabled but no matching run exists in %s; starting a fresh run. " - "Use --no-auto-resume to always start new runs.", - self.output_dir, - ) - return None - manifest = RunManifest.load(candidate / MANIFEST_FILENAME, persist=persist) - self._ensure_jobs(manifest, manifest.run_dir) - return ManifestSelection(manifest=manifest, seed_manifest=None, strategy="auto_resume") - - def _create_fresh_manifest(self, run_id: str | None = None) -> RunManifest: - dest_run_id = run_id or _generate_run_id(self.run_name) - run_dir = self._run_dir_for(dest_run_id) - manifest = RunManifest.create( - run_dir=run_dir, - run_id=dest_run_id, - run_name=self.run_name, - config_source=self.config_path, - config_checksum=self.config_checksum, - jobs=self.jobs, - env_args_map=self.env_args_map, - sampling_args_map=self.sampling_args_map, - persist=self.persist, - restart_source=None, - ) - self._ensure_jobs(manifest, run_dir) - return manifest - - def _ensure_jobs(self, manifest: RunManifest, run_dir: Path) -> None: - for job in self.jobs: - manifest.ensure_job( - job, - env_args=self.env_args_map[job.job_id], - 
sampling_args=self.sampling_args_map[job.job_id], - results_dir=run_dir / job.job_id, - ) - - def _run_dir_for(self, run_id: str) -> Path: - return Path(self.output_dir) / run_id - - # Runnable computation - def _compute_runnable( - self, - selection: ManifestSelection, - *, - force_all: bool, - forced_envs: set[str], - ) -> tuple[set[str], set[str]]: - manifest = selection.manifest - strategy = selection.strategy - if strategy in {"restart_in_place", "restart_new"} and selection.seed_manifest is not None: - runnable, reused = _plan_regen_jobs( - manifest=manifest, - seed_manifest=selection.seed_manifest, - jobs=self.jobs, - force_all=force_all, - forced_envs=forced_envs, - ) - if strategy == "restart_new" and reused: - logger.info("Reused %d completed job(s) from '%s'.", len(reused), self.restart_source) - return runnable, reused - - if strategy == "auto_resume": - runnable = _plan_auto_resume_jobs( - manifest=manifest, - jobs=self.jobs, - env_args_map=self.env_args_map, - sampling_args_map=self.sampling_args_map, - force_all=force_all, - forced_envs=forced_envs, - ) - return runnable, set() - - runnable = {job.job_id for job in self.jobs} - return runnable, set() - - -def _find_auto_resume_candidate(output_dir: Path, *, expected_checksum: str) -> Path | None: - """Pick the best prior run directory to auto-resume for the given checksum. - - Preference order: - 1) Matching config checksum and incomplete (completed < total) - 2) Matching config checksum and most recent updated_at - Returns the run directory Path or None if no candidates. 
- """ - candidates: list[tuple[bool, float, Path]] = [] - for child in sorted(output_dir.iterdir() if output_dir.exists() else [], key=lambda p: p.name): - if not child.is_dir(): - continue - manifest_path = child / MANIFEST_FILENAME - if not manifest_path.exists(): - continue - try: - with manifest_path.open("r", encoding="utf-8") as fh: - payload = json.load(fh) - except Exception: # noqa: BLE001 - continue - if payload.get("config_checksum") != expected_checksum: - continue - summary = payload.get("summary") or {} - total = int(summary.get("total", 0)) - completed = int(summary.get("completed", 0)) - incomplete = completed < total if total > 0 else True - updated_at = payload.get("updated_at") or payload.get("created_at") - try: - ts = _parse_iso_ts(updated_at) if isinstance(updated_at, str) else (manifest_path.stat().st_mtime) - except Exception: # noqa: BLE001 - ts = manifest_path.stat().st_mtime - candidates.append((incomplete, float(ts), child)) - - if not candidates: - return None - candidates.sort(key=lambda t: (t[0], t[1])) - return candidates[-1][2] - - -def _parse_iso_ts(value: str) -> float: - # Accept timestamps like '2025-11-07T01:23:45Z' or ISO with offset - try: - normalized = value.replace("Z", "+00:00") - return datetime.fromisoformat(normalized).timestamp() - except Exception: # noqa: BLE001 - return 0.0 - - -def _plan_auto_resume_jobs( - *, - manifest: RunManifest, - jobs: Sequence[ResolvedJob], - env_args_map: Mapping[str, Mapping[str, Any]], - sampling_args_map: Mapping[str, Mapping[str, Any]], - force_all: bool, - forced_envs: set[str], -) -> set[str]: - job_lookup = {job.job_id: job for job in jobs} - manifest_signatures: dict[str, dict[str, Any]] = {} - resolved_signatures: dict[str, dict[str, Any]] = {} - runnable: set[str] = set() - manifest_job_ids = {entry.job_id for entry in manifest.jobs if entry.job_id} - new_jobs = set(job_lookup) - manifest_job_ids - if new_jobs: - logger.info( - "Auto-resume ignoring %d new job(s) not present in 
the manifest: %s", - len(new_jobs), - ", ".join(sorted(new_jobs)), - ) - for entry in manifest.jobs: - job_id = entry.job_id - if not job_id: - continue - job = job_lookup.get(job_id) - if job is None: - logger.debug("Manifest contains job '%s' that is absent from the current config; skipping.", job_id) - continue - manifest_signature = manifest_signatures.get(job_id) - if manifest_signature is None: - manifest_signature = manifest_job_signature(manifest.model, entry) - manifest_signatures[job_id] = manifest_signature - resolved_signature = resolved_signatures.get(job_id) - if resolved_signature is None: - resolved_signature = resolved_job_signature( - job, - env_args=env_args_map[job_id], - sampling_args=sampling_args_map[job_id], - ) - resolved_signatures[job_id] = resolved_signature - if manifest_signature != resolved_signature: - msg = ( - f"Job '{job_id}' arguments changed since the manifest was recorded. " - "Start a fresh run by choosing a different --run-id or passing --no-auto-resume. " - "To reuse completed jobs from this run, pass --restart ." 
- ) - raise ValueError(msg) - env_id = (entry.env_id or job.env.id or job.job_id).lower() - forced = force_all or env_id in forced_envs - if forced or entry.status != "completed": - runnable.add(job_id) - return runnable - - -def _plan_regen_jobs( - *, - manifest: RunManifest, - seed_manifest: RunManifest, - jobs: Sequence[ResolvedJob], - force_all: bool, - forced_envs: set[str], -) -> tuple[set[str], set[str]]: - runnable: set[str] = set() - reused: set[str] = set() - manifest_signatures: dict[str, dict[str, Any]] = {} - seed_signatures: dict[str, dict[str, Any]] = {} - for job in jobs: - entry = manifest.job_entry(job.job_id) - if entry is None: - continue - seed_entry = seed_manifest.job_entry(job.job_id) - env_id = (entry.env_id or job.env.id or job.job_id).lower() - forced = force_all or env_id in forced_envs - if ( - not forced - and seed_entry is not None - and seed_entry.status == "completed" - and _manifest_job_signature_cached(seed_manifest, seed_entry, seed_signatures) - == _manifest_job_signature_cached(manifest, entry, manifest_signatures) - ): - seed_results_dir = seed_entry.results_dir - if seed_results_dir is None: - seed_results_dir = seed_manifest.run_dir / seed_entry.job_id - if isinstance(seed_results_dir, Path): - resolved_results_dir: Path | str | None = seed_results_dir - else: - resolved_results_dir = resolve_results_dir_from_manifest( - str(seed_results_dir) if seed_results_dir is not None else None, - job_id=seed_entry.job_id, - run_dir=seed_manifest.run_dir, - ) - manifest.record_job_skip( - job.job_id, - reason="up_to_date", - results_dir=resolved_results_dir or seed_results_dir, - source_entry=seed_entry, - ) - reused.add(job.job_id) - continue - runnable.add(job.job_id) - return runnable, reused - - -def _manifest_job_signature_cached( - manifest: RunManifest, - entry: Any, - cache: dict[str, dict[str, Any]], -) -> dict[str, Any]: - job_id = entry.job_id - signature = cache.get(job_id) - if signature is None: - signature = 
manifest_job_signature(manifest.model, entry) - cache[job_id] = signature - return signature - - -def _generate_run_id(name: str) -> str: - base = slugify(name or "run") - timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - return f"{base}-{timestamp}" diff --git a/medarc_verifiers/cli/_manifest_tools.py b/medarc_verifiers/cli/_manifest_tools.py deleted file mode 100644 index 836fd9d2..00000000 --- a/medarc_verifiers/cli/_manifest_tools.py +++ /dev/null @@ -1,389 +0,0 @@ -"""Utilities for manifest validation and migration.""" - -from __future__ import annotations - -import os -import json -import logging -import sys -from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Mapping, Sequence - -from medarc_verifiers.cli._manifest import MANIFEST_FILENAME, RunManifestModel, SUPPORTED_MANIFEST_VERSIONS - -logger = logging.getLogger(__name__) - - -@dataclass(slots=True) -class ManifestValidationIssue: - run_id: str - job_id: str - kind: str - message: str - - -@dataclass(slots=True) -class ManifestValidationResult: - manifests_checked: int - jobs_checked: int - issues: list[ManifestValidationIssue] - - @property - def has_errors(self) -> bool: - return any(issue.kind == "error" for issue in self.issues) - - -def validate_manifests_in_runs(runs_dir: Path | str, *, strict: bool = False) -> ManifestValidationResult: - runs_path = Path(runs_dir) - issues: list[ManifestValidationIssue] = [] - manifests_checked = 0 - jobs_checked = 0 - if not runs_path.exists(): - return ManifestValidationResult(manifests_checked=0, jobs_checked=0, issues=[]) - - run_dirs = sorted(path for path in runs_path.iterdir() if path.is_dir()) - logger.info("Scanning manifests under %s...", runs_path) - - manifest_run_dirs = [run_dir for run_dir in run_dirs if (run_dir / MANIFEST_FILENAME).exists()] - if not manifest_run_dirs: - return ManifestValidationResult(manifests_checked=0, jobs_checked=0, 
issues=[]) - - max_workers = min(len(manifest_run_dirs), max(1, (os.cpu_count() or 4) * 4)) - if max_workers <= 1: - results = [_validate_run_dir(run_dir, strict=strict) for run_dir in manifest_run_dirs] - else: - results = list(_validate_run_dirs_parallel(manifest_run_dirs, strict=strict, max_workers=max_workers)) - - for result in results: - manifests_checked += result.manifests_checked - jobs_checked += result.jobs_checked - issues.extend(result.issues) - - issues.sort(key=lambda item: (item.run_id, item.job_id, item.kind, item.message)) - return ManifestValidationResult(manifests_checked=manifests_checked, jobs_checked=jobs_checked, issues=issues) - - -def _validate_run_dirs_parallel( - run_dirs: Sequence[Path], - *, - strict: bool, - max_workers: int, -) -> list[ManifestValidationResult]: - results: list[ManifestValidationResult] = [] - progress, task_id = _create_manifest_scan_progress(len(run_dirs)) - executor: ThreadPoolExecutor | None = None - futures = [] - try: - executor = ThreadPoolExecutor(max_workers=max_workers) - futures = [executor.submit(_validate_run_dir, run_dir, strict=strict) for run_dir in run_dirs] - if progress is not None and task_id is not None: - with progress: - for future in as_completed(futures): - results.append(future.result()) - progress.update(task_id, advance=1) - else: - for future in as_completed(futures): - results.append(future.result()) - except KeyboardInterrupt: - logger.warning("Manifest scanning interrupted; cancelling validation workers.") - for future in futures: - future.cancel() - if executor is not None: - executor.shutdown(wait=False, cancel_futures=True) - executor = None - raise - finally: - if executor is not None: - executor.shutdown(wait=True, cancel_futures=False) - return results - - -def _create_manifest_scan_progress(total: int) -> tuple[object | None, object | None]: - if total <= 0 or not sys.stderr.isatty(): - return None, None - try: - from rich.progress import BarColumn, Progress, SpinnerColumn, 
TaskProgressColumn, TextColumn, TimeElapsedColumn - - progress = Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TimeElapsedColumn(), - transient=True, - ) - task_id = progress.add_task("Scanning manifests", total=total) - return progress, task_id - except Exception: - return None, None - - -def _validate_run_dir(run_dir: Path, *, strict: bool) -> ManifestValidationResult: - issues: list[ManifestValidationIssue] = [] - manifest_path = run_dir / MANIFEST_FILENAME - if not manifest_path.exists(): - return ManifestValidationResult(manifests_checked=0, jobs_checked=0, issues=[]) - - try: - payload = json.loads(manifest_path.read_text(encoding="utf-8")) - except Exception as exc: # noqa: BLE001 - return ManifestValidationResult( - manifests_checked=1, - jobs_checked=0, - issues=[ - ManifestValidationIssue( - run_id=run_dir.name, - job_id="", - kind="error", - message=f"Failed to parse manifest: {exc}", - ) - ], - ) - - version = payload.get("version") - if version not in SUPPORTED_MANIFEST_VERSIONS: - return ManifestValidationResult( - manifests_checked=1, - jobs_checked=0, - issues=[ - ManifestValidationIssue( - run_id=run_dir.name, - job_id="", - kind="error", - message=f"Unsupported manifest version: {version}", - ) - ], - ) - - model = RunManifestModel.model_validate(payload) - artifacts_root = str(getattr(model, "artifacts_root", ".") or ".") - jobs_checked = 0 - - for entry in model.jobs: - jobs_checked += 1 - results_path, metadata_path, used_fallback = _resolve_job_artifact_paths( - run_dir=run_dir, - artifacts_root=artifacts_root, - job_id=entry.job_id, - results_relpath=entry.results_relpath, - metadata_relpath=entry.metadata_relpath, - ) - if used_fallback: - issues.append( - ManifestValidationIssue( - run_id=model.run_id, - job_id=entry.job_id, - kind="warning", - message="Manifest artifact path missing; fallback to run-relative job directory would be used.", - ) - ) - if not 
results_path.exists(): - kind = "error" if strict else "warning" - issues.append( - ManifestValidationIssue( - run_id=model.run_id, - job_id=entry.job_id, - kind=kind, - message=f"Missing results.jsonl at {results_path}", - ) - ) - if results_path.exists(): - for message in _quick_validate_results_jsonl( - results_path, - num_examples=entry.num_examples, - rollouts_per_example=entry.rollouts_per_example, - ): - kind = "error" if strict else "warning" - issues.append( - ManifestValidationIssue( - run_id=model.run_id, - job_id=entry.job_id, - kind=kind, - message=message, - ) - ) - if entry.metadata_relpath and not metadata_path.exists(): - kind = "error" if strict else "warning" - issues.append( - ManifestValidationIssue( - run_id=model.run_id, - job_id=entry.job_id, - kind=kind, - message=f"Missing metadata.json at {metadata_path}", - ) - ) - - return ManifestValidationResult(manifests_checked=1, jobs_checked=jobs_checked, issues=issues) - - -def _resolve_job_artifact_paths( - *, - run_dir: Path, - artifacts_root: str, - job_id: str, - results_relpath: str | None, - metadata_relpath: str | None, -) -> tuple[Path, Path, bool]: - used_fallback = False - if results_relpath: - root = (run_dir / artifacts_root).resolve() - results_path = (root / results_relpath).resolve() - metadata_path = ( - root / (metadata_relpath or f"{Path(results_relpath).parent.as_posix()}/metadata.json") - ).resolve() - else: - base_dir = (run_dir / job_id).resolve() - results_path = base_dir / "results.jsonl" - metadata_path = base_dir / "metadata.json" - if not results_path.exists() and (run_dir / job_id / "results.jsonl").exists(): - used_fallback = True - results_path = (run_dir / job_id / "results.jsonl").resolve() - metadata_path = (run_dir / job_id / "metadata.json").resolve() - return results_path, metadata_path, used_fallback - - -def _quick_validate_results_jsonl( - path: Path, - *, - num_examples: int | None, - rollouts_per_example: int | None, -) -> list[str]: - first_line = 
_read_first_nonempty_line(path) - last_line = _read_last_nonempty_line(path) - if first_line is None or last_line is None: - return [f"results.jsonl at {path} is empty"] - - issues: list[str] = [] - first_payload = _decode_probe_line(first_line, path=path, position="first", issues=issues) - last_payload = _decode_probe_line(last_line, path=path, position="last", issues=issues) - if first_payload is None or last_payload is None: - return issues - - for position, payload in (("first", first_payload), ("last", last_payload)): - if "example_id" not in payload: - issues.append(f"{position} JSONL row in {path} is missing example_id") - _validate_rollout_index( - first_payload, - path=path, - position="first", - rollouts_per_example=rollouts_per_example, - issues=issues, - ) - _validate_rollout_index( - last_payload, - path=path, - position="last", - rollouts_per_example=rollouts_per_example, - issues=issues, - ) - - return issues - - -def _decode_probe_line( - raw_line: str, - *, - path: Path, - position: str, - issues: list[str], -) -> Mapping[str, Any] | None: - try: - payload = json.loads(raw_line) - except json.JSONDecodeError as exc: - issues.append(f"failed to parse {position} JSONL row in {path}: {exc.msg}") - return None - if not isinstance(payload, Mapping): - issues.append(f"{position} JSONL row in {path} is not a JSON object") - return None - return payload - - -def _read_first_nonempty_line(path: Path) -> str | None: - with path.open("r", encoding="utf-8") as handle: - for line in handle: - candidate = line.strip() - if candidate: - return candidate - return None - - -def _read_last_nonempty_line(path: Path) -> str | None: - with path.open("rb") as handle: - handle.seek(0, os.SEEK_END) - file_size = handle.tell() - if file_size <= 0: - return None - - chunk_size = 8192 - buffer = b"" - position = file_size - while position > 0: - read_size = min(chunk_size, position) - position -= read_size - handle.seek(position) - buffer = handle.read(read_size) + buffer - 
lines = buffer.splitlines() - for raw_line in reversed(lines): - candidate = raw_line.strip() - if candidate: - return candidate.decode("utf-8") - return None - - -def _validate_rollout_index( - payload: Mapping[str, Any], - *, - path: Path, - position: str, - rollouts_per_example: int | None, - issues: list[str], -) -> None: - rollout_index = _coerce_int(payload.get("rollout_index")) - if rollout_index is None: - return - if rollout_index < 0: - issues.append(f"{position} JSONL row in {path} has negative rollout_index={payload.get('rollout_index')!r}") - return - if rollouts_per_example and rollout_index >= rollouts_per_example: - issues.append( - f"{position} JSONL row in {path} has out-of-range rollout_index={payload.get('rollout_index')!r}; " - f"expected < {rollouts_per_example}" - ) - - -def _coerce_int(value: Any) -> int | None: - if value is None or isinstance(value, bool): - return None - if isinstance(value, int): - return value - if isinstance(value, float): - if value.is_integer(): - return int(value) - return None - if isinstance(value, str): - try: - return int(value.strip()) - except ValueError: - return None - return None - - -def format_validation_issues(issues: Sequence[ManifestValidationIssue]) -> list[str]: - lines: list[str] = [] - for issue in issues: - prefix = issue.kind.upper() - target = f"run={issue.run_id}" - if issue.job_id: - target += f" job={issue.job_id}" - lines.append(f"[{prefix}] {target}: {issue.message}") - return lines - - -__all__ = [ - "ManifestValidationIssue", - "ManifestValidationResult", - "validate_manifests_in_runs", - "format_validation_issues", -] diff --git a/medarc_verifiers/cli/_schemas.py b/medarc_verifiers/cli/_schemas.py index 05dc139e..1958cdc7 100644 --- a/medarc_verifiers/cli/_schemas.py +++ b/medarc_verifiers/cli/_schemas.py @@ -1,127 +1,14 @@ -"""Pydantic schema stubs for the unified CLI configuration system.""" +"""Small schemas still shared by process export configuration.""" from __future__ import 
annotations -from pathlib import Path -from typing import Any, ClassVar +from typing import Any -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator - -RESERVED_MATRIX_KEYS = { - "id", - "module", - "env_args", - "extra_env_kwargs", - "independent_scoring", - "interleave_scoring", - "matrix", - "matrix_exclude", - "matrix_id_format", - "matrix_base_id", - "state_columns", -} - - -# NOTE: These schema definitions are intentionally incomplete. They provide -# the structural scaffolding required to start wiring the config loader and -# will be expanded in subsequent steps of the integration plan. - - -class ModelConfigSchema(BaseModel): - """Schema for model configuration entries (keyed by identifier).""" - - resume_tolerant_fields: ClassVar[set[str]] = frozenset( - { - "api_key_var", - "api_base_url", - "endpoints_path", - "headers", - "timeout", - "max_connections", - "max_keepalive_connections", - "max_retries", - "max_concurrent", - } - ) - - id: str | None = Field( - None, - description="Optional model identifier (legacy list format).", - ) - model: str | None = Field(None, description="Provider-specific model slug.") - headers: list[str] | dict[str, str] | None = Field( - None, - description="Optional HTTP headers to attach to requests.", - ) - sampling_args: dict[str, Any] = Field(default_factory=dict) - env_args: dict[str, Any] = Field(default_factory=dict) - env_overrides: dict[str, dict[str, Any]] = Field(default_factory=dict) - api_key_var: str | None = None - api_base_url: str | None = None - endpoints_path: str | None = None - timeout: float | None = Field(None, ge=0) - max_connections: int | None = Field(None, ge=1) - max_keepalive_connections: int | None = Field(None, ge=1) - max_retries: int | None = Field(None, ge=0) - max_concurrent: int | None = Field(None, ge=1) - - @model_validator(mode="before") - @classmethod - def merge_legacy_params(cls, data: Any) -> Any: - if not isinstance(data, dict): - return data - params 
= data.get("params") - if not isinstance(params, dict): - return data - merged = dict(params) - for key, value in data.items(): - if key == "params": - continue - merged[key] = value - merged.setdefault("id", data.get("id")) - return merged - - @field_validator("headers") - @classmethod - def validate_headers(cls, value: list[str] | dict[str, str] | None) -> list[str] | dict[str, str] | None: - if value is None: - return None - if isinstance(value, dict): - return {str(key): str(item) for key, item in value.items()} - if isinstance(value, list): - for entry in value: - if not isinstance(entry, str): - msg = "Header entries must be strings when provided as a list." - raise ValueError(msg) - else: - msg = "Headers must be provided as a list of strings or a mapping." - raise ValueError(msg) - return value - - @field_validator("env_args") - @classmethod - def default_model_env_args(cls, value: dict[str, Any]) -> dict[str, Any]: - return dict(value) - - @field_validator("env_overrides", mode="before") - @classmethod - def validate_env_overrides(cls, value: Any) -> dict[str, dict[str, Any]]: - if value is None: - return {} - if not isinstance(value, dict): - raise ValueError("env_overrides must be a mapping of environment ids to mappings.") - normalized: dict[str, dict[str, Any]] = {} - for env_id, override in value.items(): - if not isinstance(env_id, str) or not env_id: - raise ValueError("env_overrides keys must be non-empty strings.") - if not isinstance(override, dict): - raise ValueError(f"env_overrides['{env_id}'] must be a mapping.") - normalized[env_id] = dict(override) - return normalized +from pydantic import BaseModel, ConfigDict, Field, field_validator class EnvironmentExportConfig(BaseModel): - """Optional export customization embedded in environment configs.""" + """Optional export customization embedded in legacy environment configs.""" model_config = ConfigDict(populate_by_name=True) @@ -163,197 +50,23 @@ def validate_answer_column(cls, value: Any) -> 
str | None: class EnvironmentConfigSchema(BaseModel): - """Schema for environment configuration entries (keyed by identifier).""" + """Legacy environment YAML entry schema used for process export overrides.""" - id: str | None = Field(None, description="Optional environment identifier (legacy list format).") - module: str | None = Field(None, description="Optional module override when the ID differs from the import path.") - num_examples: int = Field(5, description="Number of examples to evaluate (-1 for all).") - rollouts_per_example: int = Field(1, description="Number of rollouts to perform per example.") - max_concurrent: int | None = Field( - None, description="Maximum number of concurrent requests when running the environment." - ) - independent_scoring: bool | None = Field( - default=None, - description=( - "Whether to score each rollout independently (verifiers>=0.1.9). " - "When unset, defaults to rollout-level scoring." - ), - ) - interleave_scoring: bool | None = Field( - default=None, - description="No longer supported; use independent_scoring instead.", - ) - state_columns: list[str] | None = Field( - default=None, description="Optional state columns to persist in job outputs." - ) - save_every: int | None = Field(default=None, description="Deprecated; accepted for compatibility and ignored.") - print_results: bool = Field(False, description="Deprecated; accepted for compatibility and ignored.") - verbose: bool | None = Field(None, description="Override per-environment verbosity.") + model_config = ConfigDict(extra="ignore") + + id: str | None = None + module: str | None = None env_args: dict[str, Any] = Field(default_factory=dict) - extra_env_kwargs: dict[str, Any] | None = Field( - default=None, - description="Optional kwargs forwarded to verifiers Environment.set_kwargs(...) 
(verifiers>=0.1.9).", - ) - rerun: bool = Field( - False, - description="Re-run jobs for this environment when resuming/regenerating even if previously completed.", - ) - matrix: dict[str, list[Any]] | None = Field(default=None, description="Parameter sweeps for expansion.") - matrix_exclude: list[dict[str, Any]] | None = Field(default=None, description="List of matrix patterns to exclude.") - matrix_id_format: str | None = Field(default=None, description="Optional format string for matrix variant IDs.") matrix_base_id: str | None = Field(default=None, exclude=True) - export: EnvironmentExportConfig | None = Field( - default=None, - description="Optional export customization (keep/drop columns, prompt settings).", - ) - - @model_validator(mode="after") - def validate_scoring_flags(self) -> EnvironmentConfigSchema: - if self.interleave_scoring is not None: - raise ValueError("interleave_scoring is no longer supported; use independent_scoring instead.") - return self - - @field_validator("num_examples") - @classmethod - def validate_num_examples(cls, value: int) -> int: - if value == -1 or value >= 1: - return value - msg = "num_examples must be -1 (all) or >= 1." 
- raise ValueError(msg) + export: EnvironmentExportConfig | None = None @field_validator("env_args") @classmethod def default_env_args(cls, value: dict[str, Any]) -> dict[str, Any]: return dict(value) - @field_validator("rollouts_per_example") - @classmethod - def validate_rollouts_per_example(cls, value: int) -> int: - if value >= 1: - return value - raise ValueError("rollouts_per_example must be >= 1.") - - @field_validator("max_concurrent") - @classmethod - def validate_max_concurrent(cls, value: int | None) -> int | None: - if value is None or value >= 1: - return value - raise ValueError("max_concurrent must be >= 1 when provided.") - - @field_validator("state_columns") - @classmethod - def validate_state_columns(cls, value: list[str] | None) -> list[str] | None: - if value is None: - return None - if not isinstance(value, list): - raise ValueError("state_columns must be a list of strings when provided.") - return [str(item) for item in value] - - @field_validator("save_every") - @classmethod - def validate_save_every(cls, value: int | None) -> int | None: - if value is None: - return None - if value >= 1: - return value - raise ValueError("save_every must be >= 1 when provided.") - - @field_validator("matrix", mode="before") - @classmethod - def validate_matrix(cls, value: Any) -> dict[str, list[Any]] | None: - if value is None: - return None - if not isinstance(value, dict): - raise ValueError("matrix must be a mapping of parameter names to value lists.") - normalized: dict[str, list[Any]] = {} - for key, items in value.items(): - if not isinstance(key, str) or not key: - raise ValueError("matrix keys must be non-empty strings.") - if isinstance(items, tuple): - items = list(items) - elif not isinstance(items, list): - raise ValueError(f"matrix['{key}'] must be a list of values.") - if not items: - raise ValueError(f"matrix['{key}'] must contain at least one value.") - normalized[key] = list(items) - return normalized - - @field_validator("matrix_exclude", 
mode="before") - @classmethod - def validate_matrix_exclude(cls, value: Any) -> list[dict[str, Any]] | None: - if value is None: - return None - if not isinstance(value, list): - raise ValueError("matrix_exclude must be a list of mappings.") - normalized: list[dict[str, Any]] = [] - for entry in value: - if not isinstance(entry, dict): - raise ValueError("matrix_exclude entries must be mappings.") - normalized.append(dict(entry)) - return normalized - - @field_validator("matrix_id_format") - @classmethod - def validate_matrix_id_format(cls, value: str | None) -> str | None: - if value is None: - return None - if not isinstance(value, str) or not value: - raise ValueError("matrix_id_format must be a non-empty string when provided.") - return value - - @model_validator(mode="after") - def validate_matrix_constraints(self) -> "EnvironmentConfigSchema": - matrix = self.matrix or {} - if matrix: - base_id = self.id or "" - for key in matrix: - if key in RESERVED_MATRIX_KEYS: - raise ValueError(f"environment '{base_id}' matrix cannot vary '{key}'.") - matrix_keys = set(matrix) - if self.matrix_exclude: - for pattern in self.matrix_exclude: - invalid_keys = set(pattern) - matrix_keys - if invalid_keys: - invalid = ", ".join(sorted(invalid_keys)) - raise ValueError( - f"environment '{base_id}' matrix_exclude entry references unknown keys: {invalid}." 
- ) - return self - - -class JobConfigSchema(BaseModel): - """Schema for job entries mapping models to environments.""" - - model_config = ConfigDict(populate_by_name=True) - - model: str | dict[str, Any] = Field(..., description="Reference to a defined model id or inline model definition.") - env: str | list[str] = Field(..., description="Reference to an environment id or list of ids.") - env_args: dict[str, Any] = Field(default_factory=dict) - sampling_args: dict[str, Any] = Field(default_factory=dict) - name: str | None = Field(default=None, description="Optional human-friendly job label.") - sleep: float | None = Field(default=None, ge=0, description="Optional delay (in seconds) after this job.") - - -DEFAULT_RUN_OUTPUT_DIR = Path("runs") / "raw" - - -class RunConfigSchema(BaseModel): - """Top-level configuration for unified CLI runs.""" - - name: str = Field("benchmark", description="Human readable run name.") - models: dict[str, ModelConfigSchema] = Field(default_factory=dict, description="Map of model id -> configuration.") - envs: dict[str, EnvironmentConfigSchema] = Field( - ..., description="Map of environment id -> configuration.", min_length=1 - ) - jobs: list[JobConfigSchema] = Field(default_factory=list) - output_dir: Path = Field(default_factory=lambda: DEFAULT_RUN_OUTPUT_DIR) - __all__ = [ - "ModelConfigSchema", "EnvironmentConfigSchema", "EnvironmentExportConfig", - "JobConfigSchema", - "RunConfigSchema", - "RESERVED_MATRIX_KEYS", ] diff --git a/medarc_verifiers/cli/_single_run.py b/medarc_verifiers/cli/_single_run.py index 1414fd6b..c2578738 100644 --- a/medarc_verifiers/cli/_single_run.py +++ b/medarc_verifiers/cli/_single_run.py @@ -20,10 +20,8 @@ DEFAULT_API_KEY_VAR, DEFAULT_ENDPOINTS_PATH, ) -from medarc_verifiers.cli._eval_builder import build_client_config, build_eval_config -from medarc_verifiers.cli._schemas import ModelConfigSchema +from medarc_verifiers.cli.upstream_eval import build_eval_config from medarc_verifiers.cli.utils.env_args 
import EnvParam, MissingEnvParamError, gather_env_cli_metadata, merge_env_args -from medarc_verifiers.cli.utils.endpoint_utils import load_endpoint_registry from medarc_verifiers.cli.utils.overrides import build_cli_override from medarc_verifiers.cli.utils.resume import ( format_resume_mismatch_lines, @@ -40,6 +38,7 @@ merge_sampling_args, normalize_headers, ) +from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL logger = logging.getLogger(__name__) @@ -53,21 +52,6 @@ class EnvOptionBinding: default: Any -@dataclass -class _SingleRunEnvConfig: - """Lightweight env config to reuse the shared EvalConfig builder.""" - - id: str - module: str | None = None - matrix_base_id: str | None = None - num_examples: int = 5 - rollouts_per_example: int = 1 - max_concurrent: int | None = None - independent_scoring: bool = True - state_columns: list[str] | None = None - verbose: bool | None = False - - def run_single_mode(argv: Sequence[str] | None = None) -> int: """Entry point for single-run (medarc-eval style) execution.""" args_list = list(argv) if argv is not None else sys.argv[1:] @@ -84,6 +68,7 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int: remaining = args_list[1:] endpoints_path_explicit = _option_was_provided(remaining, "--endpoints-path", "-e") api_key_var_explicit = _option_was_provided(remaining, "--api-key-var", "-k") + api_base_url_explicit = _option_was_provided(remaining, "--api-base-url", "-b") parser, env_group, reserved_dests = _build_base_parser_layout(require_env=True, add_help=True, env_id=env_id) try: @@ -97,17 +82,10 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int: args = parser.parse_args([env_id, *remaining]) except SystemExit as exc: # pragma: no cover - argparse already emitted error/help return int(exc.code) - try: - args.model_call_retries = _resolve_model_call_retries( - args.model_call_retries, - args.enable_additional_retries, - ) - if args.http_max_retries is not None and args.http_max_retries 
< 0: - raise ValueError("--http-max-retries must be >= 0.") - if args.rollout_max_retries < 0: - raise ValueError("--rollout-max-retries must be >= 0.") - except ValueError as exc: - parser.error(str(exc)) + if args.http_max_retries is not None and args.http_max_retries < 0: + parser.error("--http-max-retries must be >= 0.") + if args.rollout_max_retries < 0: + parser.error("--rollout-max-retries must be >= 0.") try: env_override_mapping = build_cli_override( @@ -152,7 +130,7 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int: ) try: - headers = normalize_headers(args.header, header_file=args.header_file) + headers = normalize_headers(args.header) except ValueError as exc: parser.error(str(exc)) @@ -160,119 +138,57 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int: ensure_root_logging("DEBUG" if args.verbose else "INFO") - if args.model_call_retries > 0 and not args.dry_run: - from datetime import datetime - - from medarc_verifiers.utils.retry import patch_verifiers_model_response_retry - - cwd = Path.cwd() - ts = datetime.now().strftime("%Y%m%d_%H%M%S") - retry_log_path = cwd / "logs" / f"medarc_model_retry_{ts}.log" - patch_verifiers_model_response_retry( - attempts=args.model_call_retries, - log_path=retry_log_path, - ) - endpoints_path = Path(args.endpoints_path).expanduser() - default_endpoints_path = Path(DEFAULT_ENDPOINTS_PATH).expanduser() - if not endpoints_path.exists(): - if endpoints_path_explicit: - logger.error("Explicit endpoints registry path does not exist: %s", endpoints_path) - return 2 - if _same_path(endpoints_path, default_endpoints_path): - logger.warning( - "Default endpoints registry '%s' not found; continuing without endpoint aliases.", - endpoints_path, - ) - endpoints = {} - else: - try: - endpoints = load_endpoint_registry(endpoints_path) - except Exception as exc: # noqa: BLE001 - if endpoints_path_explicit: - logger.error("Failed to load explicit endpoints registry '%s': %s", endpoints_path, exc) - return 
2 - logger.warning( - "Failed to load default endpoints registry '%s'; continuing without endpoint aliases: %s", - endpoints_path, - exc, - ) - endpoints = {} - - if endpoints_path_explicit and not endpoints: - logger.error("Failed to load endpoint registry from explicit path: %s", endpoints_path) + if endpoints_path_explicit and not endpoints_path.exists(): + logger.error("Explicit endpoints registry path does not exist: %s", endpoints_path) return 2 - model_cfg = ModelConfigSchema(model=args.model) - resolved_model, client_config, prime_sampling_overrides = build_client_config( - model_cfg, - endpoints=endpoints, - default_api_key_var=args.api_key_var, - default_api_key_var_explicit=api_key_var_explicit, - default_api_base_url=args.api_base_url, - api_base_url_override=None, - http_max_retries_override=args.http_max_retries, - timeout_override=args.timeout, - headers=headers, - ) - - # Merge Prime Inference overrides with user sampling args (user args take precedence) - merged_sampling_args = {**prime_sampling_overrides, **merged_sampling_args} - - env_cfg = _SingleRunEnvConfig( - id=args.env, - num_examples=args.num_examples, - rollouts_per_example=args.rollouts_per_example, - max_concurrent=args.max_concurrent, - independent_scoring=not args.group_scoring, - state_columns=state_columns or None, - verbose=args.verbose, - ) + raw_config: dict[str, Any] = { + "env_id": args.env, + "model": args.model, + "env_args": merged_env_args, + "sampling_args": merged_sampling_args, + "include_none_max_tokens": False, + "env_dir_path": str(Path(args.env_dir_path).expanduser()), + "endpoints_path": str(endpoints_path), + "headers": headers, + "num_examples": args.num_examples, + "rollouts_per_example": args.rollouts_per_example, + "max_concurrent": args.max_concurrent, + "max_retries": args.rollout_max_retries, + "http_max_retries": args.http_max_retries, + "client_timeout": args.timeout, + "independent_scoring": not args.group_scoring, + "state_columns": state_columns, + 
"save_results": bool(args.save_results or args.resume), + "resume": args.resume, + "save_to_hf_hub": args.save_to_hf_hub, + "hf_hub_dataset_name": args.hf_hub_dataset_name or "", + "verbose": args.verbose, + } + if api_base_url_explicit: + raw_config["api_base_url"] = args.api_base_url + else: + raw_config["default_api_base_url"] = args.api_base_url + if api_key_var_explicit: + raw_config["api_key_var"] = args.api_key_var + elif not (api_base_url_explicit and args.api_base_url == PRIME_INFERENCE_URL): + raw_config["default_api_key_var"] = args.api_key_var try: + eval_config = build_eval_config(raw_config) resume_path = resolve_resume_path( resume_arg=args.resume, - env_id=args.env, - model=resolved_model, - num_examples=args.num_examples, - rollouts_per_example=args.rollouts_per_example, - env_dir_path=Path(args.env_dir_path).expanduser(), + env_id=eval_config.env_id, + model=eval_config.model, + num_examples=eval_config.num_examples, + rollouts_per_example=eval_config.rollouts_per_example, + env_dir_path=eval_config.env_dir_path, ) except ValueError as exc: parser.error(str(exc)) - - if isinstance(args.resume, str): - logger.info("Resuming from explicit path: %s", resume_path) - elif args.resume is True: - if resume_path is not None: - logger.info("Auto-resuming from: %s", resume_path) - else: - logger.info("No matching incomplete run found for --resume; starting a new run.") - - eval_config = build_eval_config( - job_label=args.env, - model_cfg=model_cfg, - env_cfg=env_cfg, - env_args=merged_env_args, - sampling_args=merged_sampling_args, - cli_env_args=None, - cli_sampling_args=None, - resolved_model=resolved_model, - client_config=client_config, - env_dir=Path(args.env_dir_path).expanduser(), - max_concurrent_override=args.max_concurrent, - max_concurrent_generation=args.max_concurrent_generation, - max_concurrent_scoring=args.max_concurrent_scoring, - rollout_max_retries=args.rollout_max_retries, - resume_path=resume_path, - 
default_max_concurrent=DEFAULT_SINGLE_RUN_MAX_CONCURRENT, - save_results=args.save_results, - save_to_hf_hub=args.save_to_hf_hub, - hf_hub_dataset_name=args.hf_hub_dataset_name or None, - verbose=args.verbose, - env_metadata_cache=None, - enforce_required_env_args=True, - ) + if resume_path is not None: + eval_config = eval_config.model_copy(update={"resume_path": resume_path, "save_results": True}) if args.dry_run: print(eval_config.model_dump_json(indent=2)) @@ -291,9 +207,9 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int: logger.error("Evaluation interrupted by user.") return 1 except Exception as exc: # noqa: BLE001 - if resume_path is not None and is_resume_metadata_mismatch_error(exc): - logger.error("Resume metadata mismatch for %s.", resume_path) - saved_values = load_resume_metadata_values(resume_path) + if eval_config.resume_path is not None and is_resume_metadata_mismatch_error(exc): + logger.error("Resume metadata mismatch for %s.", eval_config.resume_path) + saved_values = load_resume_metadata_values(eval_config.resume_path) current_values = { "env_id": eval_config.env_id, "model": eval_config.model, @@ -382,12 +298,6 @@ def _add_and_track(group, *args: str, **kwargs: Any) -> None: action="append", help=f"Extra HTTP header to send ('Name{HEADER_SEPARATOR} Value'). Repeatable.", ) - _add_and_track( - core_group, - "--header-file", - type=Path, - help="File containing newline-delimited 'Name: Value' header entries. Overrides --header on conflicts.", - ) _add_and_track(core_group, "--num-examples", "-n", type=int, default=5, help="Number of examples to evaluate.") _add_and_track( core_group, "--rollouts-per-example", "-r", type=int, default=3, help="Number of rollouts per example." 
@@ -400,20 +310,6 @@ def _add_and_track(group, *args: str, **kwargs: Any) -> None: default=DEFAULT_SINGLE_RUN_MAX_CONCURRENT, help="Maximum number of concurrent requests.", ) - _add_and_track( - core_group, - "--max-concurrent-generation", - type=int, - default=None, - help="Deprecated: ignored.", - ) - _add_and_track( - core_group, - "--max-concurrent-scoring", - type=int, - default=None, - help="Deprecated: ignored.", - ) _add_and_track( core_group, "--timeout", @@ -435,20 +331,6 @@ def _add_and_track(group, *args: str, **kwargs: Any) -> None: default=0, help="Retry full rollout/group on retryable infra/invalid-response errors.", ) - _add_and_track( - core_group, - "--model-call-retries", - type=int, - default=None, - help="Per-model-call MedARC retry attempts (0 disables the monkeypatch).", - ) - _add_and_track( - core_group, - "--enable-additional-retries", - action=argparse.BooleanOptionalAction, - default=None, - help="Deprecated alias for --model-call-retries (true maps to 3 attempts).", - ) _add_and_track( core_group, "--max-tokens", @@ -649,23 +531,6 @@ def _option_was_provided(argv: Sequence[str], long_flag: str, short_flag: str | return False -def _resolve_model_call_retries(model_call_retries: int | None, deprecated_toggle: bool | None) -> int: - if model_call_retries is not None: - if model_call_retries < 0: - raise ValueError("--model-call-retries must be >= 0.") - if deprecated_toggle is not None: - logger.warning( - "Ignoring deprecated --enable-additional-retries because --model-call-retries was explicitly set." 
- ) - return model_call_retries - - if deprecated_toggle is None: - return 0 - - logger.warning("Flag --enable-additional-retries is deprecated; use --model-call-retries instead.") - return 3 if deprecated_toggle else 0 - - def _same_path(left: Path, right: Path) -> bool: try: return left.resolve(strict=False) == right.resolve(strict=False) diff --git a/medarc_verifiers/cli/bench_child.py b/medarc_verifiers/cli/bench_child.py new file mode 100644 index 00000000..54225465 --- /dev/null +++ b/medarc_verifiers/cli/bench_child.py @@ -0,0 +1,111 @@ +"""Private subprocess runner for one TOML bench eval with env lifecycle.""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import sys +import traceback +from pathlib import Path +from typing import Any + +from verifiers.utils.eval_utils import run_evaluation + +from medarc_verifiers.cli.env_lifecycle import ( + EnvInstallState, + ensure_installed, + resolve_env_package, + uninstall_if_child_installed, +) +from medarc_verifiers.cli.upstream_eval import EvalConfigOverrides, build_eval_config + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Run one TOML bench eval child payload.") + parser.add_argument("payload", type=Path) + args = parser.parse_args(argv) + payload = json.loads(args.payload.read_text(encoding="utf-8")) + status = _run_payload(payload) + status_path = Path(payload["status_path"]) + status_path.parent.mkdir(parents=True, exist_ok=True) + status_path.write_text(json.dumps(status, sort_keys=True), encoding="utf-8") + return int(status["exit_code"]) + + +def _run_payload(payload: dict[str, Any]) -> dict[str, Any]: + installed_state: EnvInstallState | None = None + eval_failed = False + cleanup_failed = False + status: dict[str, Any] = { + "env_id": payload.get("expected_env_id"), + "model": payload.get("expected_model"), + "installed_by_child": False, + "eval_ok": False, + "cleanup_ok": True, + "primary_error": None, + 
"cleanup_error": None, + "exit_code": 1, + "exit_reason": "not_started", + } + + try: + if payload.get("env_preinstalled", False): + status["installed_by_child"] = False + else: + ref = resolve_env_package(payload["raw_config"]["env_id"], payload["env_dir"]) + installed_state = ensure_installed(ref) + status["installed_by_child"] = installed_state.installed_by_child + + config = build_eval_config(payload["raw_config"], overrides=_overrides_from_payload(payload["overrides"])) + planned_resume_path = Path(payload["resume_path"]) + if config.env_id != payload["expected_env_id"]: + raise ValueError(f"Child resolved env_id {config.env_id!r}, expected {payload['expected_env_id']!r}.") + if config.model != payload["expected_model"]: + raise ValueError(f"Child resolved model {config.model!r}, expected {payload['expected_model']!r}.") + config = config.model_copy(update={"resume_path": planned_resume_path, "save_results": True}) + asyncio.run(run_evaluation(config)) + status["eval_ok"] = True + status["exit_code"] = 0 + status["exit_reason"] = "success" + except Exception as exc: # noqa: BLE001 + eval_failed = True + status["primary_error"] = _format_exception(exc) + status["exit_reason"] = "eval_failed" + finally: + try: + if installed_state is not None and payload.get("cleanup_env_package", True): + uninstall_if_child_installed(installed_state) + except Exception as exc: # noqa: BLE001 + cleanup_failed = True + status["cleanup_ok"] = False + status["cleanup_error"] = _format_exception(exc) + + if cleanup_failed and not eval_failed: + status["exit_code"] = 1 + status["exit_reason"] = "cleanup_failed" + elif eval_failed: + status["exit_code"] = 1 + return status + + +def _overrides_from_payload(payload: dict[str, Any]) -> EvalConfigOverrides: + return EvalConfigOverrides( + model=payload.get("model"), + provider=payload.get("provider"), + api_base_url=payload.get("api_base_url"), + api_key_var=payload.get("api_key_var"), + api_client_type=payload.get("api_client_type"), + 
endpoints_path=payload.get("endpoints_path"), + max_concurrent=payload.get("max_concurrent"), + env_args=payload.get("env_args"), + sampling_args=payload.get("sampling_args"), + ) + + +def _format_exception(exc: BaseException) -> str: + return "".join(traceback.format_exception_only(type(exc), exc)).strip() + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/medarc_verifiers/cli/env_lifecycle.py b/medarc_verifiers/cli/env_lifecycle.py new file mode 100644 index 00000000..bd76356c --- /dev/null +++ b/medarc_verifiers/cli/env_lifecycle.py @@ -0,0 +1,148 @@ +"""Local environment package lifecycle helpers for TOML bench subprocesses.""" + +from __future__ import annotations + +import importlib +import importlib.metadata +import importlib.util +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from verifiers.utils.import_utils import load_toml + + +@dataclass(frozen=True) +class EnvPackageRef: + env_id: str + module_name: str + project_name: str + env_path: Path + loader: str | None = None + + +@dataclass(frozen=True) +class EnvInstallState: + ref: EnvPackageRef + installed_by_child: bool + distribution_preexisting: bool + module_preexisting: bool + + +def upstream_module_name(env_id: str) -> str: + return env_id.replace("-", "_").split("/")[-1] + + +def resolve_env_package(env_id: str, env_dir: str | Path) -> EnvPackageRef: + module_name = upstream_module_name(env_id) + env_root = Path(env_dir).expanduser() / module_name + pyproject_path = env_root / "pyproject.toml" + if not env_root.exists(): + raise FileNotFoundError( + f"Environment {env_id!r} is not installed and no local package was found at {env_root}. " + "Install it manually or pass --env-dir." 
+ ) + if not pyproject_path.is_file(): + raise FileNotFoundError(f"Environment {env_id!r} local package at {env_root} is missing pyproject.toml.") + + with pyproject_path.open("rb") as handle: + pyproject_data: dict[str, Any] = load_toml(handle) + + project_name = pyproject_data.get("project", {}).get("name") + if not isinstance(project_name, str) or not project_name: + raise ValueError(f"Environment {env_id!r} pyproject.toml must define [project].name.") + + loader = pyproject_data.get("tool", {}).get("prime", {}).get("environment", {}).get("loader") + if loader is not None and not isinstance(loader, str): + loader = None + + return EnvPackageRef( + env_id=env_id, + module_name=module_name, + project_name=project_name, + env_path=env_root, + loader=loader, + ) + + +def inspect_install_state(ref: EnvPackageRef) -> EnvInstallState: + distribution_preexisting = _distribution_exists(ref.project_name) + module_preexisting = _module_importable(ref.module_name) + + if distribution_preexisting and not module_preexisting: + loader_note = f" Loader metadata is {ref.loader!r}." if ref.loader else "" + raise ModuleNotFoundError( + f"Distribution {ref.project_name!r} is installed, but upstream module " + f"{ref.module_name!r} is not importable.{loader_note}" + ) + + return EnvInstallState( + ref=ref, + installed_by_child=False, + distribution_preexisting=distribution_preexisting, + module_preexisting=module_preexisting, + ) + + +def ensure_installed(ref: EnvPackageRef) -> EnvInstallState: + state = inspect_install_state(ref) + if state.distribution_preexisting or state.module_preexisting: + return state + + subprocess.run( + ["uv", "pip", "install", "--python", sys.executable, "-e", str(ref.env_path)], + check=True, + ) + importlib.invalidate_caches() + if not _module_importable(ref.module_name): + subprocess.run( + ["uv", "pip", "uninstall", "--python", sys.executable, "-y", ref.project_name], + check=False, + ) + loader_note = f" Loader metadata is {ref.loader!r}." 
if ref.loader else "" + raise ModuleNotFoundError( + f"Installed {ref.project_name!r} from {ref.env_path}, but upstream module " + f"{ref.module_name!r} is still not importable.{loader_note}" + ) + return EnvInstallState( + ref=ref, + installed_by_child=True, + distribution_preexisting=False, + module_preexisting=False, + ) + + +def uninstall_if_child_installed(state: EnvInstallState) -> None: + if not state.installed_by_child: + return + subprocess.run( + ["uv", "pip", "uninstall", "--python", sys.executable, "-y", state.ref.project_name], + check=True, + ) + importlib.invalidate_caches() + sys.modules.pop(state.ref.module_name, None) + + +def _distribution_exists(project_name: str) -> bool: + try: + importlib.metadata.distribution(project_name) + except importlib.metadata.PackageNotFoundError: + return False + return True + + +def _module_importable(module_name: str) -> bool: + return importlib.util.find_spec(module_name) is not None + + +__all__ = [ + "EnvInstallState", + "EnvPackageRef", + "ensure_installed", + "inspect_install_state", + "resolve_env_package", + "uninstall_if_child_installed", + "upstream_module_name", +] diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py new file mode 100644 index 00000000..55eddcb7 --- /dev/null +++ b/medarc_verifiers/cli/eval_identity.py @@ -0,0 +1,258 @@ +"""Deterministic eval identity helpers for the TOML bench wrapper.""" + +from __future__ import annotations + +import hashlib +import json +import re +from collections import Counter +from collections.abc import Iterable, Mapping, Sequence +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +MEDARC_VARIANT_ID_KEY = "variant_id" +BASE_VARIANT_ID = "base" + +_SLUG_PATTERN = re.compile(r"[^A-Za-z0-9._-]+") +_MAX_SEGMENT_LENGTH = 80 +_MAX_VARIANT_ID_LENGTH = 160 + + +@dataclass(frozen=True) +class EvalIdentity: + """Resolved model/env identity plus semantic variant metadata.""" + + model_id: str + 
env_id: str + variant_id: str = BASE_VARIANT_ID + + @property + def dataset_id(self) -> str: + return f"{self.env_id}::{self.variant_id}" + + +@dataclass(frozen=True) +class EvalPathPlan: + """Deterministic result location for one eval config.""" + + identity: EvalIdentity + results_path: Path + + +def slug_component(value: Any, *, max_length: int = _MAX_SEGMENT_LENGTH) -> str: + """Return a path-safe slug for one path component.""" + + slug = _SLUG_PATTERN.sub("-", str(value).strip()).strip("-._") + if not slug: + slug = "value" + if len(slug) <= max_length: + return slug + digest = _short_text_digest(str(value), length=10) + return f"{slug[: max_length - 11].rstrip('-._')}-{digest}" + + +def generate_variant_id(payload: Mapping[str, Any]) -> str: + """Generate a stable human-readable variant id for legacy export config keys.""" + + if not payload: + return BASE_VARIANT_ID + + segments: list[str] = [] + for key, value in sorted(payload.items()): + if isinstance(value, Mapping): + for nested_key, nested_value in sorted(value.items()): + segments.append(_variant_segment(f"{key}.{nested_key}", nested_value)) + else: + segments.append(_variant_segment(key, value)) + + if not segments: + return BASE_VARIANT_ID + + variant_id = "__".join(segments) + if len(variant_id) <= _MAX_VARIANT_ID_LENGTH and all(not segment.endswith("-hash") for segment in segments): + return variant_id + return f"{variant_id[:120].rstrip('-._')}__{_short_json_digest(payload, length=12)}" + + +def plan_eval_paths(raw_configs: Sequence[Mapping[str, Any]], *, output_root: str | Path) -> list[EvalPathPlan]: + """Plan deterministic output paths for TOML bench eval configs.""" + + keys = [(_model_id(config), _env_id(config)) for config in raw_configs] + plans: list[EvalPathPlan] = [] + for idx, (config, key) in enumerate(zip(raw_configs, keys)): + model_id, env_id = key + variant_id = _variant_id(config, index=idx + 1) + + identity = EvalIdentity(model_id=model_id, env_id=env_id, variant_id=variant_id) 
+ path = Path(output_root) / slug_component(model_id) / slug_component(env_id) / variant_id + plans.append(EvalPathPlan(identity=identity, results_path=path)) + + _ensure_unique_identities(plans) + _ensure_unique_slugs(plans) + return plans + + +def _ensure_unique_identities(plans: Sequence[EvalPathPlan]) -> None: + identities = [(plan.identity.model_id, plan.identity.env_id, plan.identity.variant_id) for plan in plans] + duplicates = sorted(identity for identity, count in Counter(identities).items() if count > 1) + if duplicates: + rendered = ", ".join( + f"model={model!r}, env_id={env_id!r}, variant_id={variant_id!r}" for model, env_id, variant_id in duplicates + ) + raise ValueError(f"Duplicate TOML eval identity; add a distinct variant_id/name: {rendered}") + + +def _ensure_unique_slugs(plans: Sequence[EvalPathPlan]) -> None: + _raise_slug_collisions( + "model", + ((slug_component(plan.identity.model_id), plan.identity.model_id) for plan in plans), + ) + + _raise_slug_collisions( + "env", + ( + ( + f"{slug_component(plan.identity.model_id)}/{slug_component(plan.identity.env_id)}", + f"{plan.identity.model_id}/{plan.identity.env_id}", + ) + for plan in plans + ), + ) + _raise_slug_collisions( + "variant", + ( + ( + "/".join( + ( + slug_component(plan.identity.model_id), + slug_component(plan.identity.env_id), + slug_component(plan.identity.variant_id, max_length=_MAX_VARIANT_ID_LENGTH), + ) + ), + f"{plan.identity.model_id}/{plan.identity.env_id}/{plan.identity.variant_id}", + ) + for plan in plans + ), + ) + + paths = [plan.results_path for plan in plans] + duplicate_paths = sorted(path for path, count in Counter(paths).items() if count > 1) + if duplicate_paths: + rendered = ", ".join(str(path) for path in duplicate_paths) + raise ValueError(f"Deterministic eval path collision: {rendered}") + + +def _raise_slug_collisions(label: str, pairs: Iterable[tuple[str, str]]) -> None: + values_by_slug: dict[str, set[str]] = {} + for slug, value in pairs: + 
values_by_slug.setdefault(slug, set()).add(value) + collisions = {slug: sorted(values) for slug, values in values_by_slug.items() if len(values) > 1} + if not collisions: + return + rendered = "; ".join(f"{slug}: {values}" for slug, values in sorted(collisions.items())) + raise ValueError(f"Deterministic eval {label} slug collision: {rendered}") + + +def _variant_id(config: Mapping[str, Any], *, index: int) -> str: + raw_variant = config.get("variant_id") + raw_name = config.get("name") + variant = _normalize_variant(raw_variant, config=config, field="variant_id", index=index) + name = _normalize_variant(raw_name, config=config, field="name", index=index) + if variant and name and variant != name: + raise ValueError(f"TOML eval {index} has conflicting variant_id/name values: {variant!r} != {name!r}.") + return variant or name or BASE_VARIANT_ID + + +def _normalize_variant(value: Any, *, config: Mapping[str, Any], field: str, index: int) -> str | None: + if value is None: + return None + text = _expand_variant_template(str(value).strip(), config) + if not text: + raise ValueError(f"TOML eval {index} {field} must not be empty.") + if slug_component(text, max_length=_MAX_VARIANT_ID_LENGTH) != text: + raise ValueError( + f"TOML eval {index} {field} {text!r} is not path-safe. " + 'Use only letters, numbers, ".", "_", and "-", for example "shuffle_seed-1618".' 
+ ) + return text + + +def _expand_variant_template(template: str, config: Mapping[str, Any]) -> str: + def replace(match: re.Match[str]) -> str: + path = match.group(1).strip() + value: Any = config + for part in path.split("."): + if isinstance(value, Mapping) and part in value: + value = value[part] + else: + raise ValueError(f"Variant template references unknown field: {path}") + return str(value) + + return re.sub(r"\{([^{}]+)\}", replace, template).strip() + + +def _model_id(config: Mapping[str, Any]) -> str: + value = config.get("model") + if not value: + raise ValueError( + "Eval config must include resolved 'model' for deterministic identity; build EvalConfig before planning paths." + ) + return str(value) + + +def _env_id(config: Mapping[str, Any]) -> str: + value = config.get("env_id") + if not value: + raise ValueError("Eval config must include 'env_id' for deterministic identity.") + return str(value) + + +def _short_text_digest(value: str, *, length: int) -> str: + return hashlib.sha256(value.encode("utf-8")).hexdigest()[:length] + + +def _short_json_digest(value: Any, *, length: int) -> str: + encoded = json.dumps(_canonicalize(value), sort_keys=True, separators=(",", ":"), default=str).encode("utf-8") + return hashlib.sha256(encoded).hexdigest()[:length] + + +def _variant_segment(key: str, value: Any) -> str: + key_slug = slug_component(key, max_length=40) + value_slug = slug_component(_variant_value_text(value), max_length=80) + if isinstance(value, Mapping | Sequence) and not isinstance(value, str | bytes | bytearray): + return f"{key_slug}-{value_slug}-{_short_json_digest(value, length=8)}" + return f"{key_slug}-{value_slug}" + + +def _variant_value_text(value: Any) -> str: + if isinstance(value, bool): + return str(value).lower() + if value is None: + return "none" + if isinstance(value, int | float | str): + return str(value) + return "hash" + + +def _canonicalize(value: Any) -> Any: + if isinstance(value, Mapping): + return {str(key): 
_canonicalize(value[key]) for key in sorted(value)} + if isinstance(value, list | tuple): + return [_canonicalize(item) for item in value] + if isinstance(value, set): + return [_canonicalize(item) for item in sorted(value, key=str)] + if isinstance(value, Path): + return str(value) + return value + + +__all__ = [ + "BASE_VARIANT_ID", + "EvalIdentity", + "EvalPathPlan", + "MEDARC_VARIANT_ID_KEY", + "generate_variant_id", + "plan_eval_paths", + "slug_component", +] diff --git a/medarc_verifiers/cli/isolated_env.py b/medarc_verifiers/cli/isolated_env.py new file mode 100644 index 00000000..e6ce0326 --- /dev/null +++ b/medarc_verifiers/cli/isolated_env.py @@ -0,0 +1,143 @@ +"""Temporary virtual environment helpers for isolated TOML bench evals.""" + +from __future__ import annotations + +import json +import shutil +import subprocess +import sys +import tempfile +from contextlib import contextmanager +from dataclasses import dataclass +from importlib import metadata +from pathlib import Path +from typing import Iterator +from urllib.parse import unquote, urlparse + + +class IsolatedEnvError(RuntimeError): + """Raised when an isolated bench environment cannot be prepared.""" + + +@dataclass(frozen=True) +class MedarcInstallSpec: + editable: bool + version: str + checkout_root: Path | None = None + + +def venv_python_path(venv_path: Path) -> Path: + posix_path = venv_path / "bin" / "python" + if posix_path.exists(): + return posix_path + return venv_path / "Scripts" / "python.exe" + + +def current_medarc_install_spec() -> MedarcInstallSpec: + try: + dist = metadata.distribution("medarc-verifiers") + except metadata.PackageNotFoundError as exc: + raise IsolatedEnvError("Cannot auto-install isolated envs because medarc-verifiers is not installed.") from exc + + direct_url_text = dist.read_text("direct_url.json") + if direct_url_text: + try: + direct_url = json.loads(direct_url_text) + except json.JSONDecodeError as exc: + raise IsolatedEnvError("Installed medarc-verifiers 
has malformed direct_url.json metadata.") from exc + if direct_url.get("dir_info", {}).get("editable"): + url = direct_url.get("url") + parsed = urlparse(url) if isinstance(url, str) else None + if parsed is None or parsed.scheme != "file": + raise IsolatedEnvError("Editable medarc-verifiers install does not point at a local file:// checkout.") + checkout_root = Path(unquote(parsed.path)).expanduser().resolve() + _validate_editable_checkout(checkout_root) + return MedarcInstallSpec(editable=True, version=dist.version, checkout_root=checkout_root) + + return MedarcInstallSpec(editable=False, version=dist.version) + + +@contextmanager +def temporary_bench_venv(repo_root: Path | None = None) -> Iterator[Path]: + temp_root = Path(tempfile.mkdtemp(prefix="medarc-bench-venv-")) + try: + python_executable = _create_venv(temp_root) + install_medarc_into_venv(python_executable, repo_root=repo_root) + yield python_executable + finally: + shutil.rmtree(temp_root, ignore_errors=True) + + +def install_medarc_into_venv(python_executable: Path, *, repo_root: Path | None = None) -> None: + spec = current_medarc_install_spec() + if spec.editable: + checkout_root = repo_root or spec.checkout_root + if checkout_root is None: + raise IsolatedEnvError("Editable medarc-verifiers checkout path could not be resolved.") + _validate_editable_checkout(checkout_root) + command = ["uv", "pip", "install", "--python", str(python_executable), "-e", str(checkout_root)] + _run_uv(command, "install editable medarc-verifiers into isolated venv") + return + + requirement = f"medarc-verifiers=={spec.version}" + command = ["uv", "pip", "install", "--python", str(python_executable), requirement] + try: + _run_uv(command, f"install {requirement} into isolated venv") + except IsolatedEnvError as exc: + raise IsolatedEnvError( + f"Could not resolve {requirement} for isolated auto-install. Run from an editable checkout, " + "or preinstall environment packages and pass --no-auto-install." 
+ ) from exc + + +def install_env_package(python_executable: Path, env_path: Path) -> None: + _run_uv( + ["uv", "pip", "install", "--python", str(python_executable), "-e", str(env_path)], + f"install environment package {env_path} into isolated venv", + ) + + +def _create_venv(venv_path: Path) -> Path: + _run_uv(["uv", "venv", "--python", sys.executable, str(venv_path)], "create isolated bench venv") + python_executable = venv_python_path(venv_path) + if not python_executable.exists(): + raise IsolatedEnvError(f"uv created {venv_path}, but no Python executable was found in it.") + return python_executable + + +def _validate_editable_checkout(checkout_root: Path) -> None: + if not (checkout_root / "pyproject.toml").is_file() or not (checkout_root / "medarc_verifiers").is_dir(): + raise IsolatedEnvError( + f"Editable medarc-verifiers checkout at {checkout_root} is invalid; expected pyproject.toml " + "and medarc_verifiers/." + ) + + +def _run_uv(command: list[str], action: str) -> None: + try: + completed = subprocess.run(command, check=False, capture_output=True, text=True) + except FileNotFoundError as exc: + raise IsolatedEnvError(f"Cannot {action}: uv is not installed or not on PATH.") from exc + if completed.returncode != 0: + stderr_tail = _tail(completed.stderr) + stdout_tail = _tail(completed.stdout) + detail = "\n".join(part for part in (stderr_tail, stdout_tail) if part) + raise IsolatedEnvError(f"Failed to {action} with exit code {completed.returncode}.\n{detail}".rstrip()) + + +def _tail(text: str, *, lines: int = 20) -> str: + stripped = text.strip() + if not stripped: + return "" + return "\n".join(stripped.splitlines()[-lines:]) + + +__all__ = [ + "IsolatedEnvError", + "MedarcInstallSpec", + "current_medarc_install_spec", + "install_env_package", + "install_medarc_into_venv", + "temporary_bench_venv", + "venv_python_path", +] diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py index 97ca6e50..744c8342 100644 --- 
a/medarc_verifiers/cli/main.py +++ b/medarc_verifiers/cli/main.py @@ -3,48 +3,61 @@ from __future__ import annotations import argparse +import asyncio +import importlib.util +import json import logging import os +import shutil +import subprocess import sys +import tempfile +from datetime import UTC, datetime from pathlib import Path from textwrap import dedent +from types import SimpleNamespace from typing import Any, Literal, Mapping, Sequence import yaml from pydantic import ValidationError from rich.console import Console from rich.table import Table +from verifiers.utils.eval_utils import run_evaluation -from medarc_verifiers.cli._config_loader import ConfigFormatError, load_run_config from medarc_verifiers.cli._constants import ( BENCH_COMMAND, COMMAND, - DEFAULT_API_BASE_URL, - DEFAULT_API_KEY_VAR, DEFAULT_ENDPOINTS_PATH, + DEFAULT_EVALS_DIR, DEFAULT_ENV_CONFIG_ROOT, DEFAULT_ENV_DIR, DEFAULT_PROCESSED_DIR, - DEFAULT_RUNS_RAW_DIR, PROCESS_COMMAND, WINRATE_COMMAND, ) -from medarc_verifiers.cli._job_builder import ResolvedJob, build_jobs -from medarc_verifiers.cli._job_executor import ExecutorSettings, JobExecutionResult, execute_jobs -from medarc_verifiers.cli._manifest import MANIFEST_FILENAME, ManifestJobEntry, RunManifest, compute_snapshot_checksum -from medarc_verifiers.cli._manifest_planner import ManifestPlanner from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, EnvironmentExportConfig from medarc_verifiers.cli._single_run import run_single_mode +from medarc_verifiers.cli.eval_identity import ( + EvalPathPlan, + generate_variant_id, + plan_eval_paths, +) from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub -from medarc_verifiers.cli.process import PROCESS_DEFAULT_STATUS_FILTER, ProcessOptions, ProcessResult, run_process +from medarc_verifiers.cli.env_lifecycle import EnvPackageRef, resolve_env_package, upstream_module_name +from medarc_verifiers.cli.isolated_env import install_env_package, temporary_bench_venv +from 
medarc_verifiers.cli.process import ProcessOptions, ProcessResult, run_process from medarc_verifiers.cli.utils.config_io import load_mapping_file from medarc_verifiers.cli.utils.overrides import build_cli_override from medarc_verifiers.cli.utils.shared import ( dataset_is_excluded, normalize_dataset_ids, normalize_model_ids, - slugify, - validate_simple_name, +) +from medarc_verifiers.cli.upstream_eval import ( + EvalConfigOverrides, + build_eval_config, + build_eval_identity_payload, + load_toml_eval_configs, ) from medarc_verifiers.utils.pathing import resolve_under from medarc_verifiers.cli.winrate import ( @@ -63,44 +76,14 @@ def build_batch_parser() -> argparse.ArgumentParser: """Construct the unified CLI parser.""" parser = argparse.ArgumentParser( prog=COMMAND, - description="Run MedARC evaluations using unified configuration files.", - ) - parser.add_argument("-c", "--config", required=True, type=Path, help="Path to a run configuration YAML file.") - parser.add_argument( - "--run-id", - help="Override the generated run identifier (simple name only: no slashes, no '..', not absolute).", - ) - parser.add_argument("--name", help="Override the human-friendly run name (defaults to the config name).") - parser.add_argument( - "--restart", - help=( - "Seed jobs from a previous run directory or run_manifest.json path; " - "otherwise treated as a run id under output_dir." - ), - ) - parser.add_argument( - "--auto-resume", - action=argparse.BooleanOptionalAction, - default=True, - help=( - "Automatically resume the newest matching run (default: enabled). " - "Pass --no-auto-resume to force a fresh run." - ), - ) - parser.add_argument( - "--on-complete", - choices=("exit", "continue", "rerun", "new", "prompt"), - default="prompt", - help=( - "Action when all selected jobs are already completed. " - "Use 'prompt' for interactive selection (default: prompt)." 
- ), + description="Run MedARC evaluations using upstream verifiers TOML configs.", ) - parser.add_argument("--force", action="store_true", help="Re-run every job regardless of manifest state.") + parser.add_argument("-c", "--config", required=True, type=Path, help="Path to an upstream TOML eval config file.") + parser.add_argument("--force", action="store_true", help="Archive existing deterministic output and rerun.") parser.add_argument( - "--forced", - action="append", - help="Re-run jobs for the specified environment(s); repeat or comma-separate values.", + "--resume", + action="store_true", + help="Accepted for compatibility; deterministic bench outputs resume automatically when valid artifacts exist.", ) parser.add_argument("--output-dir", type=Path, help="Override the output directory from the configuration.") parser.add_argument( @@ -109,11 +92,19 @@ def build_batch_parser() -> argparse.ArgumentParser: default=DEFAULT_ENV_DIR, help="Directory containing environments (default: %(default)s).", ) - parser.add_argument( - "--env-config-root", - type=Path, - default=DEFAULT_ENV_CONFIG_ROOT, - help="Directory containing environment YAMLs for auto-discovery (default: %(default)s).", + auto_install_group = parser.add_mutually_exclusive_group() + auto_install_group.add_argument( + "--auto-install", + dest="auto_install", + action="store_true", + default=True, + help="Auto-install missing local env packages in isolated temporary venvs (default).", + ) + auto_install_group.add_argument( + "--no-auto-install", + dest="auto_install", + action="store_false", + help="Require selected environment packages to already be importable in the active Python environment.", ) parser.add_argument( "--endpoints-path", @@ -121,26 +112,23 @@ def build_batch_parser() -> argparse.ArgumentParser: default=DEFAULT_ENDPOINTS_PATH, help=f"Path to the endpoints registry file (default: {DEFAULT_ENDPOINTS_PATH}).", ) - parser.add_argument( - "--default-api-key-var", - 
default=DEFAULT_API_KEY_VAR, - help=f"Default API key environment variable (default: {DEFAULT_API_KEY_VAR}).", - ) - parser.add_argument( - "--default-api-base-url", - default=DEFAULT_API_BASE_URL, - help=f"Default API base URL (default: {DEFAULT_API_BASE_URL}).", - ) parser.add_argument( "--api-base-url", default=None, - help=( - "Override API base URL for all models (CLI force > model api_base_url > --default-api-base-url). " - "Useful when pointing a config at a dynamically assigned endpoint." - ), + help="Override API base URL for all TOML evals.", + ) + parser.add_argument("--api-key-var", default=None, help="Override API key environment variable for TOML bench.") + parser.add_argument("--provider", default=None, help="Override provider shorthand for TOML bench.") + parser.add_argument("--model", "-m", default=None, help="Override model for every TOML eval.") + parser.add_argument( + "--eval-index", "--job-index", dest="eval_index", type=int, help="Run only one TOML eval by 1-based index." ) + parser.add_argument("--start-at", type=int, help="Start TOML execution at this 1-based eval index.") + parser.add_argument("--stop-after", type=int, help="Stop TOML execution after this 1-based eval index.") parser.add_argument( - "--job-id", action="append", help="Run only the specified job identifier (repeat to select multiple)." + "--continue-on-error", + action="store_true", + help="Continue TOML sequential execution after a failed eval.", ) parser.add_argument( "--env-arg", action="append", help="Override an environment argument with KEY=VALUE (repeatable)." @@ -154,50 +142,24 @@ def build_batch_parser() -> argparse.ArgumentParser: "--dry-run", action="store_true", help="Resolve jobs and report overrides without executing them." 
) parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging.") - parser.add_argument( - "--save-results", - action=argparse.BooleanOptionalAction, - default=True, - help="Persist evaluation outputs (default: enabled).", - ) - parser.add_argument( - "--save-to-hf-hub", - action=argparse.BooleanOptionalAction, - default=False, - help="Upload results to the Hugging Face Hub.", - ) - parser.add_argument("--hf-hub-dataset-name", help="Custom dataset name when uploading to the Hub.") parser.add_argument( "--max-concurrent", type=int, default=None, - help="Override env max_concurrent for all jobs (CLI > model > env > defaults).", + help="Override max_concurrent for every TOML eval.", ) - parser.add_argument("--max-concurrent-generation", type=int, help="Deprecated: ignored.") - parser.add_argument("--max-concurrent-scoring", type=int, help="Deprecated: ignored.") parser.add_argument( "--timeout", type=float, default=None, - help="Override request timeout in seconds for all jobs (CLI > model > default).", - ) - parser.add_argument( - "--http-max-retries", - type=int, - default=None, - help="HTTP/client-level retries for model calls (CLI > model max_retries).", - ) - parser.add_argument( - "--rollout-max-retries", - type=int, - default=0, - help="Retry full rollout/group on retryable infra/invalid-response errors.", + help="Override request timeout in seconds for every TOML eval.", ) parser.add_argument( - "--model-call-retries", + "--max-retries", + dest="rollout_max_retries", type=int, default=None, - help="Per-model-call MedARC retry attempts (0 disables the monkeypatch).", + help="Override upstream rollout max_retries for every TOML eval.", ) parser.add_argument( "--sleep", @@ -207,21 +169,6 @@ def build_batch_parser() -> argparse.ArgumentParser: default=0.0, help="Sleep this many seconds after each job (overridden by per-job sleep).", ) - parser.add_argument( - "--enable-additional-retries", - action=argparse.BooleanOptionalAction, - 
default=None, - help="Deprecated alias for --model-call-retries (true maps to 3 attempts).", - ) - parser.add_argument( - "--include-usage", - action=argparse.BooleanOptionalAction, - default=None, - help=( - "Include usage reporting in API requests (extra_body.usage.include). " - "Default: auto-detect (enabled for Prime Inference, disabled otherwise)." - ), - ) return parser @@ -240,7 +187,7 @@ def build_process_parser() -> argparse.ArgumentParser: "--runs-dir", type=Path, default=None, - help=f"Directory containing raw run outputs (default: {DEFAULT_RUNS_RAW_DIR}).", + help=f"Directory containing eval output directories (default: {DEFAULT_EVALS_DIR}).", ) parser.add_argument( "--output-dir", @@ -254,12 +201,6 @@ def build_process_parser() -> argparse.ArgumentParser: default=None, help=f"Directory containing environment YAMLs for export settings (default: {DEFAULT_ENV_CONFIG_ROOT}).", ) - parser.add_argument( - "--status", - action="append", - default=None, - help="Filter runs by manifest status (repeatable).", - ) parser.add_argument( "--exclude-dataset", action="append", @@ -303,10 +244,10 @@ def build_process_parser() -> argparse.ArgumentParser: type=float, default=None, help=( - "Fail if a selected latest job record is missing more than this percentage of expected results.jsonl rows " - "based on manifest job fields (row_count, num_examples, rollouts_per_example). " - "Computed per selected job record and enforced only on the latest selected run; does not use " - "manifest summary.completed/summary.total or fall back to older runs (default: 2.5)." + "Fail if a selected latest eval output is missing more than this percentage of expected results.jsonl rows " + "based on metadata num_examples and rollouts_per_example. " + "Computed per selected output and enforced only on the latest selected run; does not fall back to older " + "runs (default: 2.5)." 
), ) parser.add_argument( @@ -523,7 +464,6 @@ def _run_batch_mode(argv: Sequence[str]) -> int: parser = build_batch_parser() args = parser.parse_args(argv) args.endpoints_path_explicit = _option_was_provided(argv, "--endpoints-path") - args.default_api_key_var_explicit = _option_was_provided(argv, "--default-api-key-var") try: args.cli_env_args = build_cli_override( @@ -538,33 +478,19 @@ def _run_batch_mode(argv: Sequence[str]) -> int: json_flag="--sampling-args", pair_flag="--sampling-arg", ) - args.model_call_retries = _resolve_model_call_retries( - args.model_call_retries, - args.enable_additional_retries, - ) - if args.http_max_retries is not None and args.http_max_retries < 0: - raise ValueError("--http-max-retries must be >= 0.") - if args.rollout_max_retries < 0: - raise ValueError("--rollout-max-retries must be >= 0.") + if args.rollout_max_retries is not None and args.rollout_max_retries < 0: + raise ValueError("--max-retries must be >= 0.") except ValueError as exc: parser.error(str(exc)) - if args.restart: - args.auto_resume = False - # Restarting is an explicit workflow; disable auto-resume selection when --restart is set. - # The planner may restart in-place when --restart points to an existing run directory. 
- + config_path = Path(args.config).expanduser() + if config_path.suffix.lower() != ".toml": + parser.error("medarc-eval bench now accepts upstream TOML configs only.") try: - return _execute_batch(args) - except KeyboardInterrupt: - logger.warning("Batch run interrupted by user.") - return 1 - except ConfigFormatError as exc: - parser.error(str(exc)) - except SystemExit: # pragma: no cover - argparse already handled messaging - raise + _validate_toml_selection_args(args, parser=parser) + return _run_toml_bench(args) except Exception as exc: # noqa: BLE001 - logger.exception("Unhandled error: %s", exc) + logger.exception("TOML bench failed: %s", exc) return 1 @@ -634,11 +560,8 @@ def _build_process_options(args: argparse.Namespace) -> ProcessOptions: retries=args.hf_retries, max_files_per_commit=args.hf_max_files_per_commit, ) - status_values = list(args.status or []) - status_filter = tuple(status_values) if status_values else PROCESS_DEFAULT_STATUS_FILTER max_results_missing_pct = float(args.max_results_missing_pct) if args.max_results_missing_pct is not None else 2.5 processed_with_args = { - "status": list(status_filter), "max_results_missing_pct": max_results_missing_pct, "exclude_datasets": args.exclude_dataset or [], "exclude_models": args.exclude_model or [], @@ -662,7 +585,6 @@ def _build_process_options(args: argparse.Namespace) -> ProcessOptions: replace_envs=tuple(args.replace_env or ()), processed_at=args.processed_at, processed_with_args=processed_with_args, - status_filter=status_filter, max_results_missing_pct=max_results_missing_pct, dry_run=bool(args.dry_run), clean=bool(args.clean), @@ -829,11 +751,15 @@ def _load_config_payload(path: Path, *, mode: Literal["process", "winrate"]) -> def _reject_removed_process_config_keys(payload: Mapping[str, Any]) -> None: if "max_run_missing_pct" in payload: raise ValueError("Process config field 'max_run_missing_pct' was removed; use 'max_results_missing_pct'.") + if "status" in payload: + raise 
ValueError("Process config field 'status' was removed; process now reads completed eval outputs.") process_section = payload.get("process") if isinstance(process_section, Mapping) and "max_run_missing_pct" in process_section: raise ValueError( "Process config field 'process.max_run_missing_pct' was removed; use 'process.max_results_missing_pct'." ) + if isinstance(process_section, Mapping) and "status" in process_section: + raise ValueError("Process config field 'process.status' was removed; process now reads completed eval outputs.") def _expand_embedded_pipeline_config(payload: dict[str, Any], *, mode: Literal["process", "winrate"]) -> dict[str, Any]: @@ -881,7 +807,6 @@ def _merge_process_section( "output_dir": "output_dir", "env_config_root": "env_config_root", "processed_at": "processed_at", - "status": "status", "exclude_datasets": "exclude_datasets", "exclude_models": "exclude_models", "replace_models": "replace_models", @@ -954,8 +879,9 @@ def _resolve_process_dir_value(value: Any, *, runs_dir: Any | None) -> Path | No candidate = Path(raw) if candidate.is_absolute(): return candidate - runs_base = Path(str(runs_dir)).parent if runs_dir is not None else DEFAULT_RUNS_RAW_DIR.parent - return runs_base / candidate + if runs_dir is not None: + return Path(str(runs_dir)).parent / candidate + return DEFAULT_EVALS_DIR.parent / candidate def _resolve_winrate_dir_value(value: Any, *, process_output_dir: Path | None) -> Path | None: @@ -1081,7 +1007,6 @@ def _load_and_apply_config( }[mode] repeatable_fields = { "process": { - "status": "status", "exclude_datasets": "exclude_dataset", "exclude_models": "exclude_model", "replace_models": "replace_model", @@ -1156,7 +1081,7 @@ def _finalize_config_args(args: argparse.Namespace, *, mode: Literal["process", """Fill any unset process/winrate args with defaults after config + CLI merge.""" defaults = { "process": { - "runs_dir": DEFAULT_RUNS_RAW_DIR, + "runs_dir": DEFAULT_EVALS_DIR, "output_dir": DEFAULT_PROCESSED_DIR, 
"env_config_root": DEFAULT_ENV_CONFIG_ROOT, "max_workers": 4, @@ -1338,249 +1263,412 @@ def _run_winrate_mode(argv: Sequence[str]) -> int: return 0 -def _execute_batch(args: argparse.Namespace) -> int: - # Set the include_usage environment variable if explicitly specified - if getattr(args, "include_usage", None) is not None: - import os +def _validate_toml_selection_args(args: argparse.Namespace, *, parser: argparse.ArgumentParser) -> None: + for attr, flag in (("eval_index", "--eval-index"), ("start_at", "--start-at"), ("stop_after", "--stop-after")): + value = getattr(args, attr, None) + if value is not None and value < 1: + parser.error(f"{flag} must be a 1-based index.") + if args.eval_index is not None and (args.start_at is not None or args.stop_after is not None): + parser.error("--eval-index cannot be combined with --start-at or --stop-after.") + if args.start_at is not None and args.stop_after is not None and args.stop_after < args.start_at: + parser.error("--stop-after must be greater than or equal to --start-at.") - os.environ["MEDARC_INCLUDE_USAGE"] = "true" if args.include_usage else "false" +def _run_toml_bench(args: argparse.Namespace) -> int: config_path = Path(args.config).expanduser() - env_root_override = Path(args.env_config_root).expanduser().resolve() if args.env_config_root else None - run_config = load_run_config(config_path, env_default_root=env_root_override) - - run_name = args.name or run_config.name - output_dir = Path(args.output_dir).expanduser() if args.output_dir else Path(run_config.output_dir).expanduser() - output_dir = output_dir.resolve() - run_id = args.run_id # May be None when using --auto-resume discovery - if run_id is not None: - try: - run_id = validate_simple_name(run_id, flag="--run-id") - except ValueError as exc: - logger.error("Invalid --run-id '%s': %s", run_id, exc) - logger.error("Suggested safe value: --run-id %s", slugify(run_id)) - return 1 - - if args.restart: - restart_raw = args.restart - restart_path = 
def _toml_eval_overrides(args: argparse.Namespace) -> EvalConfigOverrides:
    """Collect the CLI-level overrides that apply to every selected TOML eval.

    ``endpoints_path`` is forwarded only when ``endpoints_path_explicit`` is
    truthy on ``args``; otherwise it is passed as ``None``. ``env_args`` and
    ``sampling_args`` come from ``cli_env_args`` / ``cli_sampling_args`` and
    default to ``None`` when those attributes are absent.
    """
    settings: dict[str, Any] = {
        "model": args.model,
        "provider": args.provider,
        "api_base_url": args.api_base_url,
        "api_key_var": args.api_key_var,
    }
    if getattr(args, "endpoints_path_explicit", False):
        settings["endpoints_path"] = args.endpoints_path
    else:
        settings["endpoints_path"] = None
    settings["max_concurrent"] = args.max_concurrent
    for field_name in ("env_args", "sampling_args"):
        settings[field_name] = getattr(args, f"cli_{field_name}", None)
    return EvalConfigOverrides(**settings)
Mapping[str, EnvPackageRef], *, dry_run: bool) -> None: + verb = "would auto-install" if dry_run else "will auto-install" + console = Console(stderr=True) + console.print( + f"[yellow]Warning:[/yellow] {len(missing_envs)} selected environment package(s) are not installed " + "in the active Python environment." + ) + console.print(f"MedARC {verb} missing local envs in isolated temporary venvs for this run.") + console.print("Preinstall envs with vf-install or pass --no-auto-install to require installed packages.") + for env_id, ref in missing_envs.items(): + console.print(f" - {env_id}: {ref.env_path}") + + +def _missing_envs_error(missing_envs: Mapping[str, EnvPackageRef]) -> str: + lines = [ + "Selected environment packages are not importable and --no-auto-install was passed:", + *[f"- {env_id}: {ref.env_path}" for env_id, ref in missing_envs.items()], + "Preinstall envs with vf-install or rerun without --no-auto-install.", + ] + return "\n".join(lines) + + +def _module_importable(module_name: str) -> bool: + return importlib.util.find_spec(module_name) is not None + + +def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]: + prepared: list[dict[str, Any]] = [] + for raw in raw_configs: + item = dict(raw) + item.setdefault("save_results", True) + item.setdefault("env_dir_path", str(args.env_dir)) + if args.max_concurrent is None and "max_concurrent" not in item: + item["max_concurrent"] = 1 + if args.timeout is not None: + item["timeout"] = args.timeout + if args.rollout_max_retries is not None: + item["max_retries"] = args.rollout_max_retries + if args.verbose: + item["verbose"] = True + prepared.append(item) + return prepared + + +def _select_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]: + indexed = list(raw_configs) + if args.eval_index is not None: + start = args.eval_index - 1 + indexed = indexed[start : start + 1] + else: + if 
args.start_at is not None: + indexed = indexed[args.start_at - 1 :] + if args.stop_after is not None: + indexed = indexed[: args.stop_after - (args.start_at or 1) + 1] + if not indexed: + raise ValueError("No TOML evals matched the requested selection.") + return list(indexed) - planner = ManifestPlanner( - output_dir=output_dir, - run_id=run_id, - run_name=run_name, - config_path=config_path, - config_checksum=config_checksum, - jobs=jobs, - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - restart_source=args.restart, - auto_resume=bool(args.auto_resume), - persist=not bool(args.dry_run), - ) - try: - manifest_plan = planner.plan(force_all=bool(args.force), forced_envs=forced_envs) - except ValueError as exc: - logger.error("%s", exc) - return 1 +def _resolve_toml_output_root_from_raw(raw_configs: Sequence[Mapping[str, Any]], args: argparse.Namespace) -> Path: + if args.output_dir: + return Path(args.output_dir).expanduser() - runnable_ids = manifest_plan.runnable_job_ids - selected_ids = {job.job_id for job in selected_jobs} - planned_jobs = [job for job in jobs if job.job_id in runnable_ids and job.job_id in selected_ids] + configured_roots = {str(config["output_dir"]) for config in raw_configs if config.get("output_dir")} + if len(configured_roots) > 1: + raise ValueError( + "TOML bench deterministic output supports one output_dir per run; use a single global output_dir." 
+ ) + if configured_roots: + return Path(configured_roots.pop()).expanduser() + return DEFAULT_EVALS_DIR + + +def _display_configs_from_plan_inputs(plan_inputs: Sequence[Mapping[str, Any]]) -> list[Any]: + return [ + SimpleNamespace( + model=str(payload["model"]), + env_id=str(payload["env_id"]), + num_examples=payload.get("num_examples", "-"), + rollouts_per_example=payload.get("rollouts_per_example", "-"), + max_concurrent=payload.get("max_concurrent", "-"), + ) + for payload in plan_inputs + ] - _print_job_plan( - selected_jobs, - manifest=manifest_plan.manifest, - runnable_job_ids=runnable_ids, - discovered_total=len(jobs), - dry_run=bool(args.dry_run), - ) - if not planned_jobs: - if manifest_plan.reused_job_ids: +def _execute_selected_toml_plan( + raw_configs: Sequence[Mapping[str, Any]], + plan_inputs: Sequence[Mapping[str, Any]], + path_plans: Sequence[EvalPathPlan], + overrides: EvalConfigOverrides, + args: argparse.Namespace, + *, + missing_envs: Mapping[str, EnvPackageRef], +) -> int: + failures = 0 + for index, (raw, plan_input, path_plan) in enumerate(zip(raw_configs, plan_inputs, path_plans), start=1): + try: + env_id = str(plan_input["env_id"]) + if env_id in missing_envs: + _execute_isolated_toml_eval( + raw, + plan_input, + path_plan, + overrides, + args, + index=index, + total=len(raw_configs), + env_ref=missing_envs[env_id], + ) + else: + config = build_eval_config(raw, overrides=overrides) + if config.env_id != plan_input["env_id"]: + raise ValueError(f"Resolved env_id {config.env_id!r}, expected {plan_input['env_id']!r}.") + if config.model != plan_input["model"]: + raise ValueError(f"Resolved model {config.model!r}, expected {plan_input['model']!r}.") + _prepare_toml_results_dir(path_plan.results_path, force=bool(args.force)) + run_config = config.model_copy(update={"resume_path": path_plan.results_path, "save_results": True}) + logger.info("Running TOML eval %d/%d: %s on %s", index, len(raw_configs), config.env_id, config.model) + 
def _execute_isolated_toml_eval(
    raw: Mapping[str, Any],
    plan_input: Mapping[str, Any],
    path_plan: EvalPathPlan,
    overrides: EvalConfigOverrides,
    args: argparse.Namespace,
    *,
    index: int,
    total: int,
    env_ref: EnvPackageRef,
) -> None:
    """Run one TOML eval in a child interpreter backed by a temporary venv.

    The environment package at ``env_ref.env_path`` is installed into a fresh
    venv, the eval description is written to a JSON payload file, and
    ``medarc_verifiers.cli.bench_child`` is run with that venv's Python. The
    child reports its outcome through a status file in the same scratch
    directory.

    Args:
        raw: Raw TOML eval config for this eval.
        plan_input: Identity payload; ``env_id`` and ``model`` are read from it.
        path_plan: Path plan whose ``results_path`` receives the results.
        overrides: CLI overrides forwarded to the child.
        args: Parsed bench arguments (``force`` and ``env_dir`` are used).
        index: 1-based position of this eval, used for logging and file names.
        total: Number of selected evals, used for logging.
        env_ref: Local environment package to install into the venv.

    Raises:
        RuntimeError: If the child exits non-zero or reports a non-zero
            ``exit_code`` in its status file.
    """
    with tempfile.TemporaryDirectory(prefix="medarc-bench-child-") as temp_dir:
        temp_root = Path(temp_dir)
        payload_path = temp_root / f"eval-{index}.json"
        status_path = temp_root / f"eval-{index}-status.json"
        with temporary_bench_venv() as child_python:
            install_env_package(child_python, env_ref.env_path)
            # Results are prepared only after the venv exists, so a failed
            # install leaves previous results untouched.
            _prepare_toml_results_dir(path_plan.results_path, force=bool(args.force))
            payload = _bench_child_payload(
                raw,
                plan_input,
                path_plan,
                overrides,
                args,
                status_path=status_path,
                cleanup_env_package=False,
                env_preinstalled=True,
            )
            payload_path.write_text(json.dumps(payload, sort_keys=True, default=str), encoding="utf-8")
            logger.info(
                "Running TOML eval %d/%d in isolated venv: %s on %s",
                index,
                total,
                plan_input["env_id"],
                plan_input["model"],
            )
            completed = subprocess.run(
                [str(child_python), "-m", "medarc_verifiers.cli.bench_child", str(payload_path)],
                check=False,
                capture_output=True,
                text=True,
            )
            status = _load_child_status(status_path, completed=completed)
            if completed.returncode != 0 or int(status.get("exit_code", 1)) != 0:
                detail = status.get("primary_error") or status.get("cleanup_error") or status.get("exit_reason")
                output_tail = _completed_process_tail(completed)
                if output_tail:
                    detail = f"{detail}\n{output_tail}" if detail else output_tail
                if not detail:
                    # Without this the message would read "Bench child failed: None".
                    detail = (
                        f"exit code {completed.returncode} "
                        f"(status exit_code={status.get('exit_code')!r}) with no error detail or output"
                    )
                raise RuntimeError(f"Bench child failed: {detail}")
Continue execution below - elif choice == "exit": - logger.info("Exiting without running jobs.") - _log_summary([], manifest_plan.manifest) - return 0 - else: # continue/skip - logger.info("Continuing without running jobs.") - _log_summary([], manifest_plan.manifest) - return 0 +def _overrides_payload(overrides: EvalConfigOverrides) -> dict[str, Any]: + return { + "model": overrides.model, + "provider": overrides.provider, + "api_base_url": overrides.api_base_url, + "api_key_var": overrides.api_key_var, + "api_client_type": overrides.api_client_type, + "endpoints_path": str(overrides.endpoints_path) if overrides.endpoints_path is not None else None, + "max_concurrent": overrides.max_concurrent, + "env_args": dict(overrides.env_args or {}), + "sampling_args": dict(overrides.sampling_args or {}), + } + + +def _jsonable_mapping(value: Mapping[str, Any]) -> dict[str, Any]: + result: dict[str, Any] = {} + for key, item in value.items(): + if isinstance(item, Path): + result[key] = str(item) + elif isinstance(item, Mapping): + result[key] = _jsonable_mapping(item) + elif isinstance(item, list): + result[key] = [str(element) if isinstance(element, Path) else element for element in item] else: - _log_summary([], manifest_plan.manifest) - return 0 + result[key] = item + return result - if not planned_jobs: - # After prompting, still no planned jobs (shouldn't happen, but safety check) - _log_summary([], manifest_plan.manifest) - return 0 - forced_job_ids = _compute_forced_job_ids( - planned_jobs=planned_jobs, - runnable_job_ids=runnable_ids, - manifest=manifest_plan.manifest, - force_all=bool(args.force), - forced_envs=forced_envs, - ) +def _load_child_status( + status_path: Path, *, completed: subprocess.CompletedProcess[str] | None = None +) -> dict[str, Any]: + if not status_path.is_file(): + tail = _completed_process_tail(completed) if completed is not None else "" + detail = f"\n{tail}" if tail else "" + raise RuntimeError(f"Bench child did not write status file: 
{status_path}{detail}") + try: + status = json.loads(status_path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + tail = _completed_process_tail(completed) if completed is not None else "" + detail = f"\n{tail}" if tail else "" + raise RuntimeError(f"Bench child wrote malformed status file {status_path}: {exc}{detail}") from exc + if not isinstance(status, dict): + raise RuntimeError(f"Bench child wrote non-object status file: {status_path}") + return status - settings = ExecutorSettings( - run_id=manifest_plan.manifest.model.run_id or "", - output_dir=output_dir, - env_dir=Path(args.env_dir).expanduser(), - endpoints_path=Path(args.endpoints_path).expanduser() if args.endpoints_path else None, - endpoints_path_explicit=bool(getattr(args, "endpoints_path_explicit", False)), - default_api_key_var=args.default_api_key_var, - default_api_key_var_explicit=bool(getattr(args, "default_api_key_var_explicit", False)), - default_api_base_url=args.default_api_base_url, - api_base_url_override=args.api_base_url, - log_level="DEBUG" if args.verbose else "INFO", - verbose=args.verbose, - save_results=args.save_results, - save_to_hf_hub=args.save_to_hf_hub, - hf_hub_dataset_name=_coerce_optional_str(args.hf_hub_dataset_name), - max_concurrent_generation=args.max_concurrent_generation, - max_concurrent_scoring=args.max_concurrent_scoring, - max_concurrent=args.max_concurrent, # CLI override (None if not provided) - http_max_retries=args.http_max_retries, - rollout_max_retries=args.rollout_max_retries, - timeout=args.timeout, - sleep=args.sleep, - dry_run=args.dry_run, - cli_env_args=getattr(args, "cli_env_args", None), - cli_sampling_args=getattr(args, "cli_sampling_args", None), - forced_job_ids=forced_job_ids, - ) - logger.info( - "Loaded %d job(s); executing %d after filters (%d reusable).", - len(jobs), - len(planned_jobs), - len(manifest_plan.reused_job_ids), - ) +def _completed_process_tail(completed: subprocess.CompletedProcess[str] | None, *, lines: 
int = 20) -> str: + if completed is None: + return "" + parts: list[str] = [] + for label, text in (("stderr", completed.stderr), ("stdout", completed.stdout)): + stripped = (text or "").strip() + if stripped: + parts.append(f"{label} tail:\n" + "\n".join(stripped.splitlines()[-lines:])) + return "\n".join(parts) - endpoints_cache: dict[str, Any] = {} - env_metadata_cache: dict[str, Any] = {} - results = execute_jobs( - planned_jobs, - settings, - endpoints_cache=endpoints_cache, - env_metadata_cache=env_metadata_cache, - manifest=None if args.dry_run else manifest_plan.manifest, - ) +async def _run_one_toml_eval(config: Any) -> Any: + return await run_evaluation(config) - _log_summary(results, manifest_plan.manifest) - has_failures = any(result.status == "failed" for result in results if result.status != "skipped") - return 1 if has_failures else 0 +def _prepare_toml_results_dir( + results_path: Path, + *, + force: bool, +) -> None: + if results_path.exists() and force: + _archive_existing_path(results_path) + metadata_path = results_path / "metadata.json" + results_file = results_path / "results.jsonl" + if results_path.exists(): + if results_path.is_dir() and not any(results_path.iterdir()): + return + has_metadata = metadata_path.is_file() + has_results = results_file.is_file() + if not (has_metadata and has_results): + raise ValueError( + f"Cannot use existing output {results_path}: metadata.json and results.jsonl are both required. " + "Use --force to archive and rerun." 
+ ) + return -def _build_effective_args( - jobs: Sequence[ResolvedJob], -) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: - env_map: dict[str, dict[str, Any]] = {} - sampling_map: dict[str, dict[str, Any]] = {} - for job in jobs: - env_map[job.job_id] = dict(job.env_args) - sampling_map[job.job_id] = dict(job.sampling_args) - return env_map, sampling_map + results_path.mkdir(parents=True, exist_ok=True) -def _parse_forced_envs(values: Sequence[str] | None) -> set[str]: - forced: set[str] = set() - if not values: - return forced - for chunk in values: - if not chunk: - continue - for item in chunk.split(","): - value = item.strip() - if value: - forced.add(value.lower()) - return forced +def _archive_existing_path(path: Path) -> Path: + timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + candidate = path.with_name(f"{path.name}__old_{timestamp}") + suffix = 1 + while candidate.exists(): + candidate = path.with_name(f"{path.name}__old_{timestamp}_{suffix}") + suffix += 1 + shutil.move(str(path), str(candidate)) + return candidate + + +def _print_toml_bench_plan(eval_configs: Sequence[Any], path_plans: Sequence[EvalPathPlan], *, dry_run: bool) -> None: + console = Console(width=240) + action = "dry-run" if dry_run else "run" + table = Table( + title="TOML Bench Dry Run" if dry_run else "TOML Bench Plan", + caption=f"{len(eval_configs)} eval(s) to {action}", + expand=True, + ) + table.add_column("#", justify="right", style="dim") + table.add_column("Model", style="magenta", overflow="fold") + table.add_column("Environment", style="green", overflow="fold") + table.add_column("Variant", style="cyan", overflow="fold") + table.add_column("Examples", justify="right") + table.add_column("Rollouts", justify="right") + table.add_column("Max Concurrency", justify="right") + table.add_column("Output Path", overflow="fold") + + for index, (config, path_plan) in enumerate(zip(eval_configs, path_plans), start=1): + table.add_row( + str(index), + config.model, 
+ config.env_id, + path_plan.identity.variant_id or "-", + str(config.num_examples), + str(config.rollouts_per_example), + str(config.max_concurrent), + str(path_plan.results_path), + ) + + console.print(table) def _parse_repeatable_csv(values: Sequence[str] | None) -> list[str]: @@ -1602,23 +1690,6 @@ def _option_was_provided(argv: Sequence[str], long_flag: str) -> bool: return False -def _resolve_model_call_retries(model_call_retries: int | None, deprecated_toggle: bool | None) -> int: - if model_call_retries is not None: - if model_call_retries < 0: - raise ValueError("--model-call-retries must be >= 0.") - if deprecated_toggle is not None: - logger.warning( - "Ignoring deprecated --enable-additional-retries because --model-call-retries was explicitly set." - ) - return model_call_retries - - if deprecated_toggle is None: - return 0 - - logger.warning("Flag --enable-additional-retries is deprecated; use --model-call-retries instead.") - return 3 if deprecated_toggle else 0 - - def _filter_winrate_datasets( datasets: Sequence[tuple[str, Sequence[Path]]], exclude_datasets: Sequence[str], @@ -1638,132 +1709,16 @@ def _filter_winrate_datasets( return filtered -def _collect_rerun_envs(envs: Mapping[str, EnvironmentConfigSchema]) -> set[str]: - rerun: set[str] = set() - for env in envs.values(): - if getattr(env, "rerun", False): - for key in (env.id, env.module, env.matrix_base_id): - if key: - rerun.add(str(key).lower()) - return rerun - - -def _compute_forced_job_ids( - *, - planned_jobs: Sequence[ResolvedJob], - runnable_job_ids: set[str], - manifest: RunManifest | None, - force_all: bool, - forced_envs: set[str], -) -> set[str]: - forced_ids: set[str] = set() - if force_all: - return {job.job_id for job in planned_jobs} - - for job in planned_jobs: - entry = manifest.job_entry(job.job_id) if manifest is not None else None - env_forced = any(key in forced_envs for key in _force_keys_for_job(job, entry)) - completed_but_runnable = bool( - entry is not None and 
entry.status == "completed" and job.job_id in runnable_job_ids - ) - if env_forced or completed_but_runnable: - forced_ids.add(job.job_id) - return forced_ids - - -def _force_keys_for_job(job: ResolvedJob, entry: ManifestJobEntry | None) -> set[str]: - keys: set[str] = {job.job_id.lower()} - for value in ( - getattr(job.env, "id", None), - getattr(job.env, "module", None), - getattr(job.env, "matrix_base_id", None), - getattr(entry, "env_id", None), - ): - if value: - keys.add(str(value).lower()) - return keys - - -def _filter_jobs(jobs: Sequence[ResolvedJob], job_filters: Sequence[str] | None) -> list[ResolvedJob]: - if not job_filters: - return list(jobs) - filters = set(job_filters) - selected = [job for job in jobs if job.job_id in filters] - missing = filters - {job.job_id for job in selected} - if missing: - logger.warning("Unknown job ids requested: %s", ", ".join(sorted(missing))) - return selected - - -def _coerce_optional_str(value: str | None) -> str | None: - if value is None or value == "": - return None - return value - - -def _prompt_completed_jobs_action() -> str: - """Prompt user to choose what to do when all jobs are completed. 
- - Returns: - "new", "rerun", "continue", or "exit" - """ - console = Console() - - message = "\n[bold yellow]All jobs are already completed.[/bold yellow]\n" - message += "What would you like to do?\n" - message += " [bold cyan]n[/bold cyan] - Create a new run\n" - message += " [bold cyan]r[/bold cyan] - Rerun all jobs (ignore completion status)\n" - message += " [bold cyan]c[/bold cyan] - Continue without running (default)\n" - message += " [bold cyan]e[/bold cyan] - Exit\n" - - console.print(message) - - try: - response = input("Choose [n/r/c/e]: ").strip().lower() - except (EOFError, KeyboardInterrupt): - print() # New line after Ctrl+C - return "exit" - - if response == "n" or response == "new": - return "new" - elif response == "r" or response == "rerun": - return "rerun" - elif response == "e" or response == "exit": - return "exit" - else: - # Default to continue for any other input (including empty/enter) - return "continue" - - -def _log_summary(results: Sequence[JobExecutionResult], manifest: RunManifest | None = None) -> None: - if manifest is not None: - summary = manifest.summary - logger.info( - "Run complete: %d completed, %d pending, %d failed, %d skipped (total %d).", - summary.get("completed", 0), - summary.get("pending", 0), - summary.get("failed", 0), - summary.get("skipped", 0), - summary.get("total", 0), - ) - return - total = len(results) - succeeded = sum(result.status == "succeeded" for result in results) - skipped = sum(result.status == "skipped" for result in results) - failed = sum(result.status == "failed" for result in results) - logger.info("Run complete: %d succeeded, %d skipped, %d failed (total %d).", succeeded, skipped, failed, total) - - def _print_general_help() -> None: message = dedent( f"""\ Usage: {COMMAND} [options] # Single run (ENV must be first; use ENV --help for details) - {COMMAND} {BENCH_COMMAND} --config CONFIG.yaml ... 
# Batch run (see: {COMMAND} {BENCH_COMMAND} --help) - {COMMAND} {PROCESS_COMMAND} [options] # Export raw runs to parquet (see: {COMMAND} {PROCESS_COMMAND} --help) + {COMMAND} {BENCH_COMMAND} --config CONFIG.toml ... # Sequential TOML bench + {COMMAND} {PROCESS_COMMAND} [options] # Export eval outputs to parquet (see: {COMMAND} {PROCESS_COMMAND} --help) {COMMAND} {WINRATE_COMMAND} [options] # Compute win rates from processed parquet outputs - First argument must be the environment slug for single runs. Use '{COMMAND} {BENCH_COMMAND} --help' for batch mode options.""" + First argument must be the environment slug for single runs. Use '{COMMAND} {BENCH_COMMAND} --help' for TOML bench options.""" ) print(message) @@ -1837,78 +1792,13 @@ def _load_env_export_map(root: Path | None) -> dict[str, EnvironmentExportConfig if env_cfg.export is None: continue keys = {env_cfg.id, env_cfg.matrix_base_id} + if env_cfg.module and env_cfg.env_args: + keys.add(f"{env_cfg.module}::{generate_variant_id({'env_args': env_cfg.env_args})}") for key in filter(None, keys): export_map[key] = env_cfg.export return export_map -def _print_job_plan( - jobs: Sequence[ResolvedJob], - *, - manifest: RunManifest | None, - runnable_job_ids: set[str], - discovered_total: int, - dry_run: bool, -) -> None: - """Render a human-friendly summary of the jobs scheduled for execution.""" - listed_total = len(jobs) - scheduled_total = sum(1 for job in jobs if job.job_id in runnable_job_ids) - caption_parts: list[str] = [f"{listed_total} job(s) listed"] - caption_parts.append(f"{scheduled_total} to {'dry-run' if dry_run else 'run'}") - if discovered_total != listed_total: - caption_parts.append(f"{discovered_total} discovered") - caption = " | ".join(part for part in caption_parts if part) - - if not jobs: - logger.info("No jobs to display (%s).", caption) - return - - def _format_label(primary: str | None, secondary: str | None) -> str: - if primary and secondary and primary != secondary: - return f"{primary} 
({secondary})" - return primary or secondary or "-" - - def _resolve_status(job_id: str, entry: ManifestJobEntry | None) -> str: - if job_id in runnable_job_ids: - return "next" - if entry and entry.status == "completed": - return "completed" - return "pending" - - entries = {} - if manifest is not None: - entries = {entry.job_id: entry for entry in manifest.jobs if entry.job_id} - - console = Console() - table = Table(title="Planned Jobs", caption=caption, expand=True) - table.add_column("#", justify="right", style="dim") - table.add_column("Job ID", style="bold cyan", overflow="fold") - table.add_column("Status", style="yellow") - table.add_column("Name", style="white", overflow="fold") - table.add_column("Model", style="magenta", overflow="fold") - table.add_column("Environment", style="green", overflow="fold") - table.add_column("Examples", justify="right") - table.add_column("Rollouts", justify="right") - - for index, job in enumerate(jobs, start=1): - entry = entries.get(job.job_id) - model_label = _format_label(job.model.id, job.model.model) - env_label = _format_label(job.env.id, job.env.module) - status = _resolve_status(job.job_id, entry) - table.add_row( - str(index), - job.job_id, - status, - job.name or "-", - model_label, - env_label, - str(job.env.num_examples), - str(job.env.rollouts_per_example), - ) - - console.print(table) - - if __name__ == "__main__": # pragma: no cover raise SystemExit(main()) diff --git a/medarc_verifiers/cli/process/aggregate.py b/medarc_verifiers/cli/process/aggregate.py index cfca0baf..c8e0776e 100644 --- a/medarc_verifiers/cli/process/aggregate.py +++ b/medarc_verifiers/cli/process/aggregate.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json import logging from dataclasses import dataclass from typing import Any, Iterable, Mapping @@ -19,6 +20,8 @@ class AggregatedEnvRows: env_id: str base_env_id: str model_id: str | None + variant_id: str | None + variant_payload: Mapping[str, Any] | None rows: 
list[Mapping[str, Any]] column_names: tuple[str, ...] job_run_ids: tuple[str, ...] @@ -29,18 +32,22 @@ def aggregate_rows_by_env( *, identities: Iterable[RunIdentity] | None = None, ) -> list[AggregatedEnvRows]: - """Group enriched rows by (model_id, base_env_id), capturing unioned schemas.""" - groups: dict[tuple[str, str], dict[str, Any]] = {} + """Group enriched rows by (model_id, base_env_id, variant_id), capturing unioned schemas.""" + groups: dict[tuple[str, str, str], dict[str, Any]] = {} identity_list = list(identities or ()) fake_rollout_groups = { - (identity.model_id, identity.output_env_id) for identity in identity_list if identity.rollout_index is not None + (identity.model_id, identity.output_env_id, identity.variant_id or "") + for identity in identity_list + if identity.rollout_index is not None } for row in rows: base_env_id = str(row.get("base_env_id") or row.get("env_id") or "") env_id = str(row.get("env_id") or base_env_id) model_id = str(row.get("model_id") or "unknown") - group_key = (model_id, base_env_id or env_id) + variant_id = _string_or_none(row.get("variant_id")) + variant_payload = _decode_variant_payload(row.get("variant_payload")) + group_key = (model_id, base_env_id or env_id, variant_id or "") if not group_key[1]: # no env identifier logger.debug("Skipping row without env identifiers.") continue @@ -50,6 +57,8 @@ def aggregate_rows_by_env( "env_id": env_id if env_id else base_env_id, "base_env_id": base_env_id, "model_id": model_id, + "variant_id": variant_id, + "variant_payload": variant_payload, "rows": [], "column_names": set(), "job_run_ids": set(), @@ -62,6 +71,10 @@ def aggregate_rows_by_env( group["base_env_id"] = base_env_id if not group["model_id"] and model_id: group["model_id"] = model_id + if not group["variant_id"] and variant_id: + group["variant_id"] = variant_id + if group["variant_payload"] is None and variant_payload is not None: + group["variant_payload"] = variant_payload group["rows"].append(row) 
group["column_names"].update(row.keys()) job_run_id = row.get("job_run_id") @@ -81,6 +94,7 @@ def aggregate_rows_by_env( identities=identity_list, model_id=group["model_id"], base_env_id=group["base_env_id"] or key[1], + variant_id=group["variant_id"], ) _normalize_rollout_indices(normalized_rows) elif _group_uses_rollout_suffixes(normalized_rows, base_env_id=group["base_env_id"] or key[1]): @@ -92,6 +106,8 @@ def aggregate_rows_by_env( env_id=candidate_env_id, base_env_id=group["base_env_id"] or key[1], model_id=group["model_id"], + variant_id=group["variant_id"], + variant_payload=group["variant_payload"], rows=normalized_rows, column_names=tuple(sorted(group["column_names"])), job_run_ids=tuple(sorted(group["job_run_ids"])), @@ -106,11 +122,14 @@ def _ensure_rollout_index_from_identities( identities: list[RunIdentity], model_id: str, base_env_id: str, + variant_id: str | None, ) -> None: rollout_by_manifest_env: dict[str, int] = {} for identity in identities: if identity.model_id != model_id or identity.output_env_id != base_env_id: continue + if identity.variant_id != variant_id: + continue if identity.rollout_index is None: continue rollout_by_manifest_env[identity.manifest_env_id] = identity.rollout_index @@ -168,10 +187,39 @@ def _ensure_rollout_index_from_suffix(rows: list[Mapping[str, Any]], *, base_env def _coerce_rollout_index(value: Any) -> int | None: if value is None or isinstance(value, bool): return None + if isinstance(value, int): + return value + if isinstance(value, float): + if value.is_integer(): + return int(value) + return None + if isinstance(value, str): + try: + return int(value.strip()) + except ValueError: + return None + return None + + +def _string_or_none(value: Any) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None + + +def _decode_variant_payload(value: Any) -> Mapping[str, Any] | None: + if isinstance(value, Mapping): + return dict(value) + if not isinstance(value, str) or not 
value.strip(): + return None try: - return int(value) + payload = json.loads(value) except (TypeError, ValueError): return None + if isinstance(payload, Mapping): + return dict(payload) + return None def _normalize_rollout_indices(rows: list[Mapping[str, Any]]) -> None: diff --git a/medarc_verifiers/cli/process/discovery.py b/medarc_verifiers/cli/process/discovery.py index 7aba00f8..a38eb1b7 100644 --- a/medarc_verifiers/cli/process/discovery.py +++ b/medarc_verifiers/cli/process/discovery.py @@ -4,22 +4,18 @@ import json import logging +from datetime import UTC, datetime from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Dict, Iterator, Mapping, Sequence +from typing import Any, Iterator, Mapping, Sequence -from pydantic import ValidationError - -from medarc_verifiers.cli._manifest import ( - MANIFEST_FILENAME, - ManifestJobEntry, - RunManifestModel, - _require_manifest_v3, -) +from medarc_verifiers.cli.eval_identity import MEDARC_VARIANT_ID_KEY logger = logging.getLogger(__name__) DEFAULT_STATUS = "unknown" +RESULTS_FILENAME = "results.jsonl" +METADATA_FILENAME = "metadata.json" @dataclass(frozen=True, slots=True) @@ -91,243 +87,212 @@ def iter_run_records( ) -> Iterator[RunRecord]: """Yield run records for each job entry found under the runs directory.""" runs_path = Path(runs_dir) + normalized_status = _normalize_status_filter(filter_status) if not runs_path.exists(): - logger.debug("Runs directory %s does not exist; nothing to process.", runs_path) + logger.debug("Runs directory %s does not exist.", runs_path) return - normalized_status = _normalize_status_filter(filter_status) + for record in _iter_eval_output_records(runs_path): + if normalized_status and record.status not in normalized_status: + continue + yield record + +def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]: + """Yield synthetic run records for upstream eval output directories.""" try: - run_dirs = sorted(path for path in 
runs_path.iterdir() if path.is_dir()) + results_paths = sorted(evals_root.rglob(RESULTS_FILENAME)) except OSError as exc: # noqa: FBT003 - logger.warning("Failed to list runs directory %s: %s", runs_path, exc) + logger.warning("Failed to scan eval outputs under %s: %s", evals_root, exc) return - for run_dir in run_dirs: - manifest_info, job_entries = _load_manifest(run_dir) - if manifest_info is None: + seen: set[Path] = set() + for results_path in results_paths: + results_dir = results_path.parent + if results_path.name == RESULTS_FILENAME and results_dir.name == "__pycache__": + continue + key = _dedupe_key(results_dir) + if key in seen: continue - summary_map = _load_run_summary(run_dir) - for job_entry in job_entries: - summary_entry = summary_map.get(job_entry.job_id or "") - record = _build_run_record(manifest_info, job_entry, summary_entry) - if record is None: - continue - if normalized_status and record.status not in normalized_status: - continue + seen.add(key) + metadata_path = results_dir / METADATA_FILENAME + if not metadata_path.exists(): + continue + record = _build_eval_output_record(evals_root, results_dir) + if record is not None: yield record -def _build_run_record( - manifest: RunManifestInfo, - job_entry: ManifestJobEntry, - summary_entry: Mapping[str, Any] | None, +def _build_eval_output_record( + evals_root: Path, + results_dir: Path, ) -> RunRecord | None: - job_id = job_entry.job_id - if not job_id: - logger.debug("Skipping job entry without a valid job_id in %s", manifest.manifest_path) + metadata_path = results_dir / METADATA_FILENAME + metadata_payload = _read_metadata_payload(metadata_path) + if metadata_payload is None: return None - results_dir_name, results_dir = _resolve_results_dir( - job_entry.results_relpath, - manifest.artifacts_root, - job_id, - manifest.run_dir, - ) - results_dir_name, results_dir = _fallback_results_dir_if_missing( - results_dir_name, - results_dir, - manifest.run_dir, - job_id, - ) - metadata_path = 
results_dir / "metadata.json" - results_path = results_dir / "results.jsonl" - summary_path = results_dir / "summary.json" - - status = DEFAULT_STATUS - duration_seconds = None - reason: str | None = None - - if summary_entry: - status = (str(summary_entry.get("status", DEFAULT_STATUS)) or DEFAULT_STATUS).lower() - duration_seconds = summary_entry.get("duration_seconds") - reason = summary_entry.get("error") - elif job_entry.status: - status = job_entry.status.lower() - reason = job_entry.reason - - model_config = _ensure_mapping(manifest.models.get(job_entry.model_id) if manifest.models else {}) - env_template = _ensure_mapping( - manifest.env_templates.get(job_entry.env_template_id) if manifest.env_templates else {} + layout = _infer_eval_output_layout(evals_root, results_dir, metadata_payload) + updated_at = _path_timestamp(metadata_path) + job_run_id = layout["job_run_id"] + job_id = layout["job_id"] + model_id = layout["model_id"] + env_id = layout["env_id"] + + manifest = RunManifestInfo( + job_run_id=job_run_id, + run_name=job_run_id, + summary_completed=1, + summary_total=1, + summary_total_known=True, + manifest_path=metadata_path, + run_dir=results_dir, + created_at=updated_at, + updated_at=updated_at, + config_source=None, + config_checksum=None, + run_summary_path=results_dir / "summary.json", + models={model_id: {"sampling_args": _mapping_or_empty(metadata_payload.get("sampling_args"))}}, + env_templates={env_id: {"module": env_id}}, ) - env_config = dict(env_template) - if "module" not in env_config and job_entry.env_id: - env_config["module"] = job_entry.env_id - env_config["id"] = job_entry.env_variant_id - env_config["env_args"] = job_entry.env_args - env_args = _ensure_mapping(job_entry.env_args) - sampling_args = _ensure_mapping(job_entry.sampling_args or model_config.get("sampling_args")) + env_args = _mapping_or_empty(metadata_payload.get("env_args")) + sampling_args = _mapping_or_empty(metadata_payload.get("sampling_args")) + row_count = 
_count_results_rows(results_dir / RESULTS_FILENAME) return RunRecord( manifest=manifest, job_id=job_id, - model_id=job_entry.model_id, - manifest_env_id=job_entry.env_id, - results_dir_name=results_dir_name, + model_id=model_id, + manifest_env_id=env_id, + results_dir_name=results_dir.name, results_dir=results_dir, metadata_path=metadata_path, - results_path=results_path, - summary_path=summary_path, - has_metadata=metadata_path.exists(), - has_results=results_path.exists(), - has_summary=summary_path.exists(), - status=status, - duration_seconds=duration_seconds, - reason=reason or job_entry.reason, - started_at=job_entry.started_at, - ended_at=job_entry.ended_at, - avg_reward=job_entry.avg_reward, - num_examples=job_entry.num_examples, - rollouts_per_example=job_entry.rollouts_per_example, - row_count=job_entry.row_count, + results_path=results_dir / RESULTS_FILENAME, + summary_path=results_dir / "summary.json", + has_metadata=True, + has_results=True, + has_summary=(results_dir / "summary.json").exists(), + status="completed", + duration_seconds=None, + reason=None, + started_at=None, + ended_at=None, + avg_reward=_float_or_none(metadata_payload.get("avg_reward")), + num_examples=_int_or_none(metadata_payload.get("num_examples")), + rollouts_per_example=_int_or_none(metadata_payload.get("rollouts_per_example")), + row_count=row_count, env_args=env_args, sampling_args=sampling_args, - env_config=env_config, - model_config=model_config, + env_config={ + "id": env_id, + "module": env_id, + "variant_id": layout.get("variant_id"), + }, + model_config={"sampling_args": sampling_args}, ) -def _ensure_mapping(value: Any) -> Mapping[str, Any]: - if isinstance(value, Mapping): - return value - return {} +def _infer_eval_output_layout( + evals_root: Path, + results_dir: Path, + metadata_payload: Mapping[str, Any], +) -> dict[str, str]: + try: + parts = results_dir.relative_to(evals_root).parts + except ValueError: + parts = results_dir.parts + + metadata_env_id = 
_string_or_none(metadata_payload.get("env_id")) + metadata_model = _string_or_none(metadata_payload.get("model")) + if len(parts) == 2 and "--" in parts[0]: + env_from_parent, model_from_parent = parts[0].split("--", 1) + env_id = metadata_env_id or env_from_parent + model_id = metadata_model or model_from_parent + job_run_id = results_dir.name + variant_id = None + else: + model_id = metadata_model or (parts[0] if len(parts) >= 1 else "unknown") + env_id = metadata_env_id or (parts[1] if len(parts) >= 2 else results_dir.name) + variant_id = parts[2] if len(parts) >= 3 else _string_or_none(metadata_payload.get(MEDARC_VARIANT_ID_KEY)) + job_run_id = "::".join(part for part in (model_id, env_id, variant_id) if part) + return { + "job_run_id": job_run_id, + "job_id": results_dir.name, + "model_id": model_id, + "env_id": env_id, + "variant_id": variant_id or "", + } -def _resolve_results_dir( - stored_results_relpath: str | None, - artifacts_root: str | None, - job_id: str, - run_dir: Path, -) -> tuple[str, Path]: - """Resolve a job's results directory from v3 manifest artifact fields.""" - if stored_results_relpath: - rel = Path(stored_results_relpath) - base = run_dir / str(artifacts_root or ".") - candidate_file = (base / rel).resolve() - # v3 stores results_relpath to results.jsonl; derive the containing directory. - candidate_dir = candidate_file.parent if candidate_file.name == "results.jsonl" else candidate_file - return candidate_dir.name, candidate_dir - - # Backward-compatible fallback for malformed v3 payloads missing relpaths. 
- fallback = (run_dir / job_id).resolve() - return job_id, fallback - - -def _fallback_results_dir_if_missing( - results_dir_name: str, - results_dir: Path, - run_dir: Path, - job_id: str, -) -> tuple[str, Path]: - metadata_path = results_dir / "metadata.json" - results_path = results_dir / "results.jsonl" - if metadata_path.exists() or results_path.exists(): - return results_dir_name, results_dir - fallback = (run_dir / job_id).resolve() - fallback_metadata = fallback / "metadata.json" - fallback_results = fallback / "results.jsonl" - if fallback_metadata.exists() or fallback_results.exists(): - logger.warning( - "Manifest results path missing for job '%s'; falling back to run-relative directory '%s'.", - job_id, - fallback, - ) - return job_id, fallback - return results_dir_name, results_dir - - -def _load_manifest(run_dir: Path) -> tuple[RunManifestInfo | None, Sequence[ManifestJobEntry]]: - manifest_path = run_dir / MANIFEST_FILENAME - if not manifest_path.exists(): - logger.debug("Skipping %s: no %s present.", run_dir, MANIFEST_FILENAME) - return None, () + +def _read_metadata_payload(path: Path) -> Mapping[str, Any] | None: try: - manifest_payload = json.loads(manifest_path.read_text(encoding="utf-8")) + payload = json.loads(path.read_text(encoding="utf-8")) except (OSError, ValueError) as exc: # noqa: FBT003 - logger.warning("Failed to parse manifest %s: %s", manifest_path, exc) - return None, () + logger.warning("Failed to parse eval metadata %s: %s", path, exc) + return None + if not isinstance(payload, Mapping): + logger.warning("Invalid eval metadata payload type for %s: expected JSON object.", path) + return None + return dict(payload) - _require_manifest_v3(manifest_payload, path=manifest_path) +def _dedupe_key(path: Path) -> Path: try: - manifest_model = RunManifestModel.model_validate(manifest_payload) - except ValidationError as exc: - logger.warning("Manifest schema validation failed for %s: %s", manifest_path, exc) - return None, () + return 
path.resolve() + except OSError: + return path.absolute() + - job_run_id = manifest_model.run_id or run_dir.name - summary_payload = manifest_model.summary or {} +def _path_timestamp(path: Path) -> str: try: - completed_count = int(summary_payload.get("completed", 0)) - except Exception: - completed_count = 0 - total_known = False - if "total" in summary_payload: - try: - total_count = int(summary_payload.get("total", 0)) - except Exception: - total_count = 0 - total_known = total_count > 0 or not manifest_model.jobs - else: - total_count = 0 - if total_count == 0 and manifest_model.jobs: - total_count = len(manifest_model.jobs) - total_known = True + timestamp = path.stat().st_mtime + except OSError: + return "" + return datetime.fromtimestamp(timestamp, UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") - manifest_info = RunManifestInfo( - job_run_id=job_run_id, - run_name=manifest_model.name, - summary_completed=completed_count, - summary_total=total_count, - summary_total_known=total_known, - manifest_path=manifest_path, - run_dir=run_dir, - created_at=manifest_model.created_at, - updated_at=manifest_model.updated_at, - config_source=manifest_model.config_source, - config_checksum=manifest_model.config_checksum, - version=int(manifest_model.version), - artifacts_root=str(getattr(manifest_model, "artifacts_root", ".") or "."), - run_summary_path=run_dir / "run_summary.json", - models=manifest_model.models or {}, - env_templates=manifest_model.env_templates or {}, - ) - if not manifest_model.jobs: - logger.debug("Manifest %s has no jobs array.", manifest_path) - return manifest_info, () - return manifest_info, manifest_model.jobs +def _count_results_rows(path: Path) -> int | None: + count = 0 + try: + with path.open("r", encoding="utf-8") as handle: + for line in handle: + if line.strip(): + count += 1 + except OSError: + return None + return count + + +def _mapping_or_empty(value: Any) -> Mapping[str, Any]: + if isinstance(value, Mapping): + return 
value + return {} + + +def _string_or_none(value: Any) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None -def _load_run_summary(run_dir: Path) -> Mapping[str, Mapping[str, Any]]: - summary_path = run_dir / "run_summary.json" - if not summary_path.exists(): - return {} +def _int_or_none(value: Any) -> int | None: + if value is None or isinstance(value, bool): + return None try: - payload = json.loads(summary_path.read_text(encoding="utf-8")) - except (OSError, ValueError) as exc: # noqa: FBT003 - logger.warning("Failed to parse run summary %s: %s", summary_path, exc) - return {} - jobs = payload.get("jobs") - if not isinstance(jobs, list): - return {} - summary: Dict[str, Mapping[str, Any]] = {} - for entry in jobs: - job_id = entry.get("job_id") if isinstance(entry, Mapping) else None - if not job_id: - continue - summary[job_id] = entry - return summary + return int(value) + except (TypeError, ValueError): + return None + + +def _float_or_none(value: Any) -> float | None: + if value is None or isinstance(value, bool): + return None + try: + return float(value) + except (TypeError, ValueError): + return None def _normalize_status_filter(statuses: Sequence[str] | None) -> tuple[str, ...]: diff --git a/medarc_verifiers/cli/process/env_index.py b/medarc_verifiers/cli/process/env_index.py index 86fecd50..927469f7 100644 --- a/medarc_verifiers/cli/process/env_index.py +++ b/medarc_verifiers/cli/process/env_index.py @@ -43,10 +43,12 @@ def _inventory_from_v2(payload: Mapping[str, Any], base_dir: Path) -> EnvIndexIn env_id = entry.get("env_id") or entry.get("base_env_id") if not env_id: continue + variant_id = entry.get("variant_id") + dataset_id = f"{env_id}::{variant_id}" if variant_id else str(env_id) resolved = _resolve_path(base_dir, str(path_str)) if not resolved: continue - env_paths.setdefault(str(env_id), []).append(resolved) + env_paths.setdefault(dataset_id, []).append(resolved) return 
EnvIndexInventory(env_paths=env_paths, version=2) diff --git a/medarc_verifiers/cli/process/metadata.py b/medarc_verifiers/cli/process/metadata.py index d7bc8c58..1dfd94c9 100644 --- a/medarc_verifiers/cli/process/metadata.py +++ b/medarc_verifiers/cli/process/metadata.py @@ -4,7 +4,6 @@ import json import logging -import math from dataclasses import dataclass from pathlib import Path from typing import Any, Mapping, MutableMapping @@ -16,6 +15,11 @@ logger = logging.getLogger(__name__) +MEDARC_CONFIG_FINGERPRINT_KEY = "medarc_config_fingerprint" +MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY = "medarc_config_fingerprint_payload" +MEDARC_VARIANT_ID_KEY = "variant_id" +MEDARC_VARIANT_PAYLOAD_KEY = "variant_payload" + class _MetadataPayload(BaseModel): """Lightweight schema for metadata.json rows.""" @@ -28,11 +32,15 @@ class _MetadataPayload(BaseModel): num_examples: int | None = None rollouts_per_example: int | None = None sampling_args: dict[str, Any] = Field(default_factory=dict) + medarc_config_fingerprint: str | None = None + medarc_config_fingerprint_payload: dict[str, Any] | None = None + variant_id: str | None = None + variant_payload: dict[str, Any] | None = None @dataclass(slots=True) class NormalizedMetadata: - """Normalized view of metadata.json merged with manifest discovery data.""" + """Normalized view of metadata.json plus output-path discovery data.""" identity: "RunIdentity" record: RunRecord @@ -48,6 +56,10 @@ class NormalizedMetadata: sampling_args: Mapping[str, Any] num_examples: int | None rollouts_per_example: int | None + variant_id: str | None + variant_payload: Mapping[str, Any] | None + medarc_config_fingerprint: str | None + medarc_config_fingerprint_payload: Mapping[str, Any] | None @dataclass(frozen=True, slots=True) @@ -60,6 +72,7 @@ class RunIdentity: rollout_index: int | None job_run_id: str output_env_id: str + variant_id: str | None = None @dataclass(frozen=True, slots=True) @@ -72,6 +85,7 @@ class ResolvedRunIdentity: rollout_index: int | 
None job_run_id: str output_env_id: str + variant_id: str | None = None @dataclass(frozen=True, slots=True) @@ -87,6 +101,10 @@ class _ResolvedMetadataContext: sampling_args: Mapping[str, Any] num_examples: int | None rollouts_per_example: int | None + variant_id: str | None + variant_payload: Mapping[str, Any] | None + medarc_config_fingerprint: str | None + medarc_config_fingerprint_payload: Mapping[str, Any] | None def resolve_run_identity( @@ -106,6 +124,7 @@ def resolve_run_identity( rollout_index=resolved_rollout_index, job_run_id=record.manifest.job_run_id, output_env_id=context.base_env_id or context.manifest_env_id or record.job_id, + variant_id=context.variant_id, ) @@ -114,7 +133,7 @@ def load_normalized_metadata( *, combine_rollouts: bool = True, ) -> NormalizedMetadata: - """Merge manifest fields with metadata.json (when present).""" + """Load and normalize metadata.json with output-path identity.""" context = _resolve_metadata_context(record, combine_rollouts=combine_rollouts) if not context.model_id: raise RuntimeError(format_missing_model_id_error(record)) @@ -128,6 +147,7 @@ def load_normalized_metadata( rollout_index=resolved_rollout_index, job_run_id=record.manifest.job_run_id, output_env_id=context.base_env_id or context.manifest_env_id or record.job_id, + variant_id=context.variant_id, ) return NormalizedMetadata( @@ -145,6 +165,10 @@ def load_normalized_metadata( sampling_args=context.sampling_args, num_examples=context.num_examples, rollouts_per_example=context.rollouts_per_example, + variant_id=context.variant_id, + variant_payload=context.variant_payload, + medarc_config_fingerprint=context.medarc_config_fingerprint, + medarc_config_fingerprint_payload=context.medarc_config_fingerprint_payload, ) @@ -154,16 +178,15 @@ def _resolve_metadata_context( combine_rollouts: bool, ) -> _ResolvedMetadataContext: metadata_payload, raw_metadata = _load_metadata(record) - _warn_manifest_metadata_result_mismatch(record, metadata_payload) metadata_env_id 
= metadata_payload.env_id if metadata_payload else None metadata_model = metadata_payload.model if metadata_payload else None env_args = _merge_mappings( - primary=record.env_args, - fallback=metadata_payload.env_args if metadata_payload else None, + primary=metadata_payload.env_args if metadata_payload else None, + fallback=record.env_args, ) sampling_args = _merge_mappings( - primary=record.sampling_args, - fallback=metadata_payload.sampling_args if metadata_payload else None, + primary=metadata_payload.sampling_args if metadata_payload else None, + fallback=record.sampling_args, ) manifest_env_id = ( _extract_env_config_id(record.env_config) or record.manifest_env_id or metadata_env_id or record.job_id @@ -176,6 +199,8 @@ def _resolve_metadata_context( alt_index = extract_rollout_index(record.results_dir_name) if alt_index: rollout_index = alt_index + record_variant_id = _string_or_none(record.env_config.get("variant_id") if record.env_config else None) + record_variant_payload = _mapping_or_none(record.env_config.get("variant_payload") if record.env_config else None) return _ResolvedMetadataContext( raw_metadata=raw_metadata, manifest_env_id=manifest_env_id, @@ -186,13 +211,37 @@ def _resolve_metadata_context( metadata_model=metadata_model, env_args=env_args, sampling_args=sampling_args, - num_examples=_prefer_manifest_value( - record.num_examples, - metadata_payload.num_examples if metadata_payload else None, + num_examples=( + metadata_payload.num_examples + if metadata_payload and metadata_payload.num_examples is not None + else record.num_examples + ), + rollouts_per_example=( + metadata_payload.rollouts_per_example + if metadata_payload and metadata_payload.rollouts_per_example is not None + else record.rollouts_per_example + ), + variant_id=record_variant_id or _string_or_none(metadata_payload.variant_id if metadata_payload else None), + variant_payload=_mapping_or_none( + _raw_metadata_value( + raw_metadata, + MEDARC_VARIANT_PAYLOAD_KEY, + 
(metadata_payload.variant_payload if metadata_payload else None) or record_variant_payload, + ) ), - rollouts_per_example=_prefer_manifest_value( - record.rollouts_per_example, - metadata_payload.rollouts_per_example if metadata_payload else None, + medarc_config_fingerprint=_string_or_none( + _raw_metadata_value( + raw_metadata, + MEDARC_CONFIG_FINGERPRINT_KEY, + metadata_payload.medarc_config_fingerprint if metadata_payload else None, + ) + ), + medarc_config_fingerprint_payload=_mapping_or_none( + _raw_metadata_value( + raw_metadata, + MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY, + metadata_payload.medarc_config_fingerprint_payload if metadata_payload else None, + ) ), ) @@ -201,7 +250,7 @@ def format_missing_model_id_error(record: RunRecord) -> str: return ( "Missing model_id for run " f"(job_run_id={record.manifest.job_run_id}, job_id={record.job_id}, " - f"results_dir={record.results_dir}, manifest={record.manifest.manifest_path})" + f"results_dir={record.results_dir}, metadata={record.metadata_path})" ) @@ -255,43 +304,23 @@ def _merge_mappings( return result -def _prefer_manifest_value(primary: int | None, fallback: int | None) -> int | None: - if primary is not None: - return primary +def _raw_metadata_value(raw_metadata: Mapping[str, Any], key: str, fallback: Any) -> Any: + if key in raw_metadata: + return raw_metadata.get(key) return fallback -def _warn_manifest_metadata_result_mismatch(record: RunRecord, metadata_payload: _MetadataPayload | None) -> None: - if metadata_payload is None: - return - - mismatches: list[str] = [] - if _has_float_mismatch(record.avg_reward, metadata_payload.avg_reward): - mismatches.append(f"avg_reward manifest={record.avg_reward!r} metadata={metadata_payload.avg_reward!r}") - if _has_int_mismatch(record.num_examples, metadata_payload.num_examples): - mismatches.append(f"num_examples manifest={record.num_examples!r} metadata={metadata_payload.num_examples!r}") - if not mismatches: - return - - logger.warning( - "Manifest/metadata 
result mismatch for process input (job_run_id=%s, job_id=%s, metadata=%s): %s", - record.manifest.job_run_id, - record.job_id, - record.metadata_path, - "; ".join(mismatches), - ) - - -def _has_float_mismatch(left: float | None, right: float | None) -> bool: - if left is None or right is None: - return False - return not math.isclose(left, right, rel_tol=1e-9, abs_tol=1e-9) +def _mapping_or_none(value: Any) -> Mapping[str, Any] | None: + if isinstance(value, Mapping): + return dict(value) + return None -def _has_int_mismatch(left: int | None, right: int | None) -> bool: - if left is None or right is None: - return False - return left != right +def _string_or_none(value: Any) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None def _extract_env_config_id(env_config: Mapping[str, Any] | None) -> str | None: diff --git a/medarc_verifiers/cli/process/pipeline.py b/medarc_verifiers/cli/process/pipeline.py index 7b10a60c..76ea40c9 100644 --- a/medarc_verifiers/cli/process/pipeline.py +++ b/medarc_verifiers/cli/process/pipeline.py @@ -45,7 +45,6 @@ class ProcessOptions: replace_envs: Sequence[str] = field(default_factory=tuple) processed_at: str | None = None processed_with_args: Mapping[str, Any] = field(default_factory=dict) - status_filter: Sequence[str] = field(default_factory=lambda: PROCESS_DEFAULT_STATUS_FILTER) dry_run: bool = False clean: bool = False assume_yes: bool = False @@ -60,7 +59,6 @@ def __post_init__(self) -> None: self.max_workers = max(1, int(self.max_workers)) if not self.processed_at: self.processed_at = datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") - self.status_filter = tuple(str(status) for status in self.status_filter) self.exclude_datasets = tuple(str(value) for value in self.exclude_datasets if str(value).strip()) self.exclude_models = tuple(str(value) for value in self.exclude_models if str(value).strip()) self.replace_models = tuple(str(value) for value in 
self.replace_models if str(value).strip()) @@ -150,10 +148,7 @@ def _run_pipeline() -> ProcessResult: baseline_result = preparation.baseline_result index_files = {} if options.clean else env_index.read_env_index_files(options.output_dir) - discovered = discovery.discover_run_records( - options.runs_dir, - filter_status=options.status_filter or None, - ) + discovered = discovery.discover_run_records(options.runs_dir) selection = select_work_items( discovered, options=options, @@ -338,13 +333,22 @@ def select_work_items( def _resolve_env_export( manifest_env_id: str | None, + variant_id: str | None, env_export_map: Mapping[str, EnvironmentExportConfig], ) -> EnvironmentExportConfig: if not manifest_env_id: return EnvironmentExportConfig() + if variant_id: + variant_key = f"{manifest_env_id}::{variant_id}" + if variant_key in env_export_map: + return env_export_map[variant_key] if manifest_env_id in env_export_map: return env_export_map[manifest_env_id] base_env_id, _ = rollout.derive_base_env_id(manifest_env_id) + if base_env_id and variant_id: + variant_base_key = f"{base_env_id}::{variant_id}" + if variant_base_key in env_export_map: + return env_export_map[variant_base_key] if base_env_id and base_env_id in env_export_map: return env_export_map[base_env_id] return EnvironmentExportConfig() @@ -358,9 +362,14 @@ def _plan_selection_record( record: discovery.RunRecord, env_export_map: Mapping[str, EnvironmentExportConfig], ) -> SelectionRecord: - env_export = _resolve_env_export(record.manifest_env_id, env_export_map) + env_export = _resolve_env_export(record.manifest_env_id, None, env_export_map) combine_rollouts = bool(env_export.combine_rollouts) identity = metadata.resolve_run_identity(record, combine_rollouts=combine_rollouts) + variant_export = _resolve_env_export(record.manifest_env_id, identity.variant_id, env_export_map) + if variant_export != env_export: + env_export = variant_export + combine_rollouts = bool(env_export.combine_rollouts) + identity = 
metadata.resolve_run_identity(record, combine_rollouts=combine_rollouts) return SelectionRecord( record=record, identity=identity, @@ -372,9 +381,9 @@ def _plan_selection_record( def _raise_for_latest_invalid_selection(records: Sequence[SelectionRecord]) -> None: - latest_by_target: dict[tuple[str, str], SelectionRecord] = {} + latest_by_target: dict[tuple[str, str, str], SelectionRecord] = {} for planned in records: - selection_key = (planned.identity.output_env_id, planned.record.job_id) + selection_key = (planned.identity.output_env_id, planned.identity.variant_id or "", planned.record.job_id) current = latest_by_target.get(selection_key) if current is None or _run_sort_key( _source_updated_at(planned.record), @@ -397,14 +406,14 @@ def _raise_for_latest_invalid_selection(records: Sequence[SelectionRecord]) -> N def _select_latest_work_items(records: Sequence[SelectionRecord]) -> list[SelectionWorkItem]: - grouped: dict[tuple[str, str], dict[str, list[SelectionRecord]]] = {} + grouped: dict[tuple[str, str, str], dict[str, list[SelectionRecord]]] = {} run_timestamps: dict[str, str] = {} for planned in records: identity = planned.identity if not identity.model_id: continue - group_key = (identity.model_id, identity.output_env_id) + group_key = (identity.model_id, identity.output_env_id, identity.variant_id or "") grouped.setdefault(group_key, {}).setdefault(identity.job_run_id, []).append(planned) run_timestamps.setdefault(identity.job_run_id, _source_updated_at(planned.record)) @@ -454,7 +463,11 @@ def _apply_exclusions( filtered: list[PlannedWorkItem] = [] skipped = 0 for item in work_items: - if exclude_dataset_set and _env_is_excluded(item.identity.output_env_id, exclude_dataset_set): + if exclude_dataset_set and _env_is_excluded( + item.identity.output_env_id, + exclude_dataset_set, + variant_id=item.identity.variant_id, + ): skipped += 1 continue if exclude_model_set and model_is_excluded(item.identity.model_id, exclude_model_set): @@ -513,6 +526,7 @@ def 
_apply_additive_delta( options.output_dir, model_id=item.identity.model_id, env_id=item.identity.output_env_id, + variant_id=item.identity.variant_id, ) if not output_path.exists(): filtered.append(item) @@ -867,10 +881,15 @@ def _source_updated_at(record: discovery.RunRecord) -> str: return record.manifest.updated_at or record.manifest.created_at or "" -def _env_is_excluded(env_id: str, exclude_set: set[str]) -> bool: +def _env_is_excluded(env_id: str, exclude_set: set[str], *, variant_id: str | None = None) -> bool: env_identifier = str(env_id or "").strip() base_env_id, _ = rollout.derive_base_env_id(env_identifier) - return dataset_is_excluded(env_identifier, exclude_set, base_dataset_id=base_env_id) + dataset_id = f"{env_identifier}::{variant_id}" if variant_id else env_identifier + if dataset_is_excluded(dataset_id, exclude_set, base_dataset_id=base_env_id): + return True + if variant_id: + return dataset_is_excluded(env_identifier, exclude_set, base_dataset_id=base_env_id) + return False def _strip_env_group_rows(group: AggregatedEnvRows) -> AggregatedEnvRows: @@ -878,6 +897,8 @@ def _strip_env_group_rows(group: AggregatedEnvRows) -> AggregatedEnvRows: env_id=group.env_id, base_env_id=group.base_env_id, model_id=group.model_id, + variant_id=group.variant_id, + variant_payload=group.variant_payload, rows=[], column_names=group.column_names, job_run_ids=group.job_run_ids, diff --git a/medarc_verifiers/cli/process/rows.py b/medarc_verifiers/cli/process/rows.py index e27896a7..095018b6 100644 --- a/medarc_verifiers/cli/process/rows.py +++ b/medarc_verifiers/cli/process/rows.py @@ -41,6 +41,7 @@ def load_rows( decoded_rows, example_counts = _decode_results_jsonl(results_path) multi_rollout = _detect_multi_rollout_shape(example_counts) version_info_json = _encode_metadata_json_column(metadata.raw_metadata.get("version_info")) + variant_payload_json = _encode_metadata_json_column(metadata.variant_payload) rows: list[dict[str, Any]] = [] seen_per_example: dict[Any, 
int] = {} @@ -67,6 +68,7 @@ def load_rows( line_number=line_number, rollout_index=rollout_index, version_info_json=version_info_json, + variant_payload_json=variant_payload_json, ) rows.append(enriched) @@ -243,6 +245,7 @@ def _attach_row_metadata( line_number: int, rollout_index: int, version_info_json: str | None, + variant_payload_json: str | None, ) -> MutableMapping[str, Any]: record = metadata.record identity = metadata.identity @@ -258,6 +261,8 @@ def _attach_row_metadata( "run_id": record.job_id, "model_id": identity.model_id, "version_info": version_info_json, + "variant_id": metadata.variant_id, + "variant_payload": variant_payload_json, "status": record.status, "error": error_value, "started_at": record.started_at, diff --git a/medarc_verifiers/cli/process/writer.py b/medarc_verifiers/cli/process/writer.py index a9256cdb..10a51273 100644 --- a/medarc_verifiers/cli/process/writer.py +++ b/medarc_verifiers/cli/process/writer.py @@ -36,6 +36,8 @@ "model_cost", "model_id", "version_info", + "variant_id", + "variant_payload", "model_token_completion", "model_token_prompt", "model_token_total", @@ -63,6 +65,8 @@ "model_cost": pl.Float64, "model_id": pl.String, "version_info": pl.String, + "variant_id": pl.String, + "variant_payload": pl.String, "model_token_completion": pl.Float64, "model_token_prompt": pl.Float64, "model_token_total": pl.Float64, @@ -91,6 +95,8 @@ pa.field("model_cost", pa.float64()), pa.field("model_id", pa.large_string()), pa.field("version_info", pa.large_string()), + pa.field("variant_id", pa.large_string()), + pa.field("variant_payload", pa.large_string()), pa.field("model_token_completion", pa.float64()), pa.field("model_token_prompt", pa.float64()), pa.field("model_token_total", pa.float64()), @@ -147,6 +153,8 @@ class EnvWriteSummary: env_id: str base_env_id: str model_id: str + variant_id: str | None + variant_payload: Mapping[str, Any] | None output_path: Path row_count: int job_run_ids: tuple[str, ...] 
@@ -250,7 +258,7 @@ def _write_group(group: AggregatedEnvRows, config: WriterConfig) -> EnvWriteSumm model_id = group.model_id if not model_id: raise ValueError("model_id is required for parquet output.") - output_path = build_output_path(config.output_dir, model_id=model_id, env_id=env_id) + output_path = build_output_path(config.output_dir, model_id=model_id, env_id=env_id, variant_id=group.variant_id) if not config.dry_run: output_path.parent.mkdir(parents=True, exist_ok=True) file_exists = output_path.exists() @@ -261,6 +269,8 @@ def _write_group(group: AggregatedEnvRows, config: WriterConfig) -> EnvWriteSumm "processed_with_args": dict(config.processed_with_args), "env_id": env_id, "model_id": model_id, + "variant_id": group.variant_id, + "variant_payload": group.variant_payload, } row_count = len(group.rows) job_run_ids_set = set(group.job_run_ids) @@ -271,6 +281,8 @@ def _write_group(group: AggregatedEnvRows, config: WriterConfig) -> EnvWriteSumm env_id=env_id, base_env_id=group.base_env_id, model_id=model_id, + variant_id=group.variant_id, + variant_payload=group.variant_payload, output_path=output_path, row_count=row_count, job_run_ids=group.job_run_ids, @@ -293,6 +305,8 @@ def _write_group(group: AggregatedEnvRows, config: WriterConfig) -> EnvWriteSumm env_id=env_id, base_env_id=group.base_env_id, model_id=model_id, + variant_id=group.variant_id, + variant_payload=group.variant_payload, output_path=output_path, row_count=row_count, job_run_ids=group.job_run_ids, @@ -364,7 +378,10 @@ def _write_env_index( timestamps: list[str] = [] files[path_str] = { "env_id": summary.env_id, + "base_env_id": summary.base_env_id, "model_id": summary.model_id, + "variant_id": summary.variant_id, + "variant_payload": summary.variant_payload, "row_count": summary.row_count, } for job_run_id in summary.job_run_ids: @@ -468,14 +485,17 @@ def _normalize_columns(df: pl.DataFrame) -> pl.DataFrame: return out -def build_output_path(output_dir: Path, *, model_id: str, env_id: str) 
-> Path: - """Return the canonical parquet output path for a (model_id, env_id) dataset.""" +def build_output_path(output_dir: Path, *, model_id: str, env_id: str, variant_id: str | None = None) -> Path: + """Return the canonical parquet output path for a processed dataset.""" if not model_id: raise ValueError("model_id is required for output path.") if not env_id: raise ValueError("env_id is required for output path.") model_dir = output_dir / slugify_filename_component(model_id) - return model_dir / f"{slugify_filename_component(env_id)}.parquet" + env_slug = slugify_filename_component(env_id) + if variant_id: + return model_dir / f"{env_slug}__variants" / f"{slugify_filename_component(variant_id)}.parquet" + return model_dir / f"{env_slug}.parquet" __all__ = ["EnvWriteSummary", "WriterConfig", "build_output_path", "write_env_groups", "write_env_index"] diff --git a/medarc_verifiers/cli/upstream_eval.py b/medarc_verifiers/cli/upstream_eval.py new file mode 100644 index 00000000..eaf7e5c6 --- /dev/null +++ b/medarc_verifiers/cli/upstream_eval.py @@ -0,0 +1,24 @@ +"""Boundary for upstream ``verifiers`` eval configuration. + +``verifiers==0.1.14`` keeps full ``EvalConfig`` construction nested inside +``verifiers.scripts.eval.main()``, so MedARC still uses a temporary adapter. +Import eval config behavior through this module so callers do not depend on the +adapter directly and the deletion point is isolated when upstream exposes a +public builder. 
+""" + +from __future__ import annotations + +from medarc_verifiers.cli.verifiers_adapter import ( + EvalConfigOverrides, + build_eval_config, + build_eval_identity_payload, + load_toml_eval_configs, +) + +__all__ = [ + "EvalConfigOverrides", + "build_eval_config", + "build_eval_identity_payload", + "load_toml_eval_configs", +] diff --git a/medarc_verifiers/cli/utils/endpoint_utils.py b/medarc_verifiers/cli/utils/endpoint_utils.py index d81d9f34..54d0797e 100644 --- a/medarc_verifiers/cli/utils/endpoint_utils.py +++ b/medarc_verifiers/cli/utils/endpoint_utils.py @@ -4,10 +4,11 @@ import logging from pathlib import Path -from typing import MutableMapping, Sequence +from typing import Any, MutableMapping, Sequence, cast from verifiers.types import Endpoints -from verifiers.utils.eval_utils import load_endpoints +from verifiers.utils.eval_utils import load_endpoints, resolve_endpoints_file +from verifiers.utils.import_utils import load_toml from medarc_verifiers.cli.utils.env_args import EnvParam, gather_env_cli_metadata @@ -43,6 +44,54 @@ def load_endpoint_registry( return store[normalized] +def load_endpoint_sampling_profiles(path: str | Path) -> dict[str, list[dict[str, Any]]]: + """Load MedARC endpoint-level sampling defaults from a TOML registry.""" + try: + resolved = resolve_endpoints_file(str(path)) + except ValueError: + if Path(path).suffix == ".py": + return {} + raise + if resolved is None or not resolved.exists() or resolved.suffix != ".toml": + return {} + + with resolved.open("rb") as handle: + raw_toml = load_toml(handle) + if not isinstance(raw_toml, dict): + raise ValueError(f"Expected top-level TOML table in endpoint registry {resolved}") + + raw_entries = raw_toml.get("endpoint", []) + if not isinstance(raw_entries, list): + raise ValueError(f"Expected [[endpoint]] array-of-tables in endpoint registry {resolved}") + + profiles: dict[str, list[dict[str, Any]]] = {} + for index, raw_entry in enumerate(raw_entries): + entry_source = f"{resolved} 
([[endpoint]] index {index})" + if not isinstance(raw_entry, dict): + raise ValueError(f"Each [[endpoint]] entry must be a table in {entry_source}") + + endpoint_id = raw_entry.get("endpoint_id") + if not isinstance(endpoint_id, str) or not endpoint_id: + if "sampling_args" in raw_entry: + raise ValueError( + f"Endpoint profile with sampling_args must include non-empty string endpoint_id in {entry_source}" + ) + continue + + raw_sampling_args = raw_entry.get("sampling_args", {}) + if isinstance(raw_sampling_args, list): + raise ValueError( + f"Endpoint '{endpoint_id}' sampling_args must be a table in {entry_source}; " + "use [endpoint.sampling_args] or an inline table, not [[endpoint.sampling_args]]." + ) + if not isinstance(raw_sampling_args, dict): + raise ValueError(f"Endpoint '{endpoint_id}' sampling_args must be a table in {entry_source}") + + profiles.setdefault(endpoint_id, []).append(dict(cast(dict[str, Any], raw_sampling_args))) + + return profiles + + def load_env_metadata( env_id: str, *, @@ -103,6 +152,7 @@ def resolve_model_endpoint( "EndpointRegistryCache", "EnvMetadataCache", "load_endpoint_registry", + "load_endpoint_sampling_profiles", "load_env_metadata", "resolve_model_endpoint", ] diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py new file mode 100644 index 00000000..81651a42 --- /dev/null +++ b/medarc_verifiers/cli/verifiers_adapter.py @@ -0,0 +1,651 @@ +"""Small adapter for upstream ``verifiers`` eval configuration. + +Upstream ``verifiers`` owns TOML loading and eval execution, but in 0.1.14 the +``EvalConfig`` builder lives inside ``verifiers.scripts.eval.main()`` and cannot +be imported directly. Keep this module deliberately narrow until upstream exposes +a public builder. 
+""" + +from __future__ import annotations + +import importlib.util +import logging +from collections.abc import Mapping +from dataclasses import dataclass +from pathlib import Path +from typing import Any, cast + +from verifiers.types import ( + ClientConfig, + ClientType, + Endpoint, + EndpointClientConfig, + EvalConfig, +) +from verifiers.utils.eval_utils import load_toml_config, resolve_endpoints_file +from verifiers.utils.import_utils import load_toml + +from medarc_verifiers.cli.utils.endpoint_utils import load_endpoint_sampling_profiles +from medarc_verifiers.utils.prime_inference import prime_inference_overrides +from medarc_verifiers.utils.sampling_args import sanitize_sampling_args + +logger = logging.getLogger(__name__) + +DEFAULT_MODEL = "openai/gpt-4.1-mini" +DEFAULT_ENV_DIR_PATH = "./environments" +DEFAULT_ENDPOINTS_PATH = "./configs/endpoints.toml" +DEFAULT_NUM_EXAMPLES = 5 +DEFAULT_ROLLOUTS_PER_EXAMPLE = 3 +DEFAULT_MAX_CONCURRENT = 32 +DEFAULT_CLIENT_TYPE = "openai_chat_completions" +DEFAULT_PROVIDER = "prime" +ADAPTER_TOML_FIELDS = {"debug", "header_from_state", "headers_from_state", "timeout"} +MEDARC_TOML_METADATA_FIELD = "medarc" +MEDARC_TOML_IDENTITY_FIELDS = {"name", "variant_id"} + +PROVIDER_CONFIGS: dict[str, dict[str, str]] = { + "prime": { + "url": "https://api.pinference.ai/api/v1", + "key": "PRIME_API_KEY", + }, + "openrouter": { + "url": "https://openrouter.ai/api/v1", + "key": "OPENROUTER_API_KEY", + }, + "openai": { + "url": "https://api.openai.com/v1", + "key": "OPENAI_API_KEY", + }, + "anthropic": { + "url": "https://api.anthropic.com", + "key": "ANTHROPIC_API_KEY", + "client_type": "anthropic_messages", + }, + "minimax": { + "url": "https://api.minimax.chat/v1", + "key": "MINIMAX_API_KEY", + }, + "deepseek": { + "url": "https://api.deepseek.com/v1", + "key": "DEEPSEEK_API_KEY", + }, + "glm": { + "url": "https://open.bigmodel.cn/api/paas/v4", + "key": "GLM_API_KEY", + }, + "local": { + "url": "http://localhost:8000/v1", + "key": 
"VLLM_API_KEY", + }, + "vllm": { + "url": "http://localhost:8000/v1", + "key": "VLLM_API_KEY", + }, +} + + +@dataclass(frozen=True) +class EvalConfigOverrides: + """CLI-level overrides applied after TOML globals and per-eval fields.""" + + model: str | None = None + provider: str | None = None + api_base_url: str | None = None + api_key_var: str | None = None + api_client_type: str | None = None + endpoints_path: str | Path | None = None + max_concurrent: int | None = None + env_args: Mapping[str, Any] | None = None + sampling_args: Mapping[str, Any] | None = None + + +def load_toml_eval_configs(path: str | Path, *, extra_valid_fields: set[str] | None = None) -> list[dict[str, Any]]: + """Load upstream TOML eval configs, including ``[[ablation]]`` expansion.""" + + valid_fields = ( + ADAPTER_TOML_FIELDS | {MEDARC_TOML_METADATA_FIELD} | MEDARC_TOML_IDENTITY_FIELDS | (extra_valid_fields or set()) + ) + return [_strip_medarc_metadata(raw) for raw in load_toml_config(Path(path), extra_valid_fields=valid_fields)] + + +def _strip_medarc_metadata(raw: Mapping[str, Any]) -> dict[str, Any]: + cleaned = dict(raw) + cleaned.pop(MEDARC_TOML_METADATA_FIELD, None) + return cleaned + + +def _load_endpoint_registry(endpoints_path: str) -> dict[str, list[Endpoint]]: + """Load endpoint aliases, allowing model-only entries for portable alias registries.""" + endpoints_file = resolve_endpoints_file(endpoints_path) + if endpoints_file is None or not endpoints_file.exists(): + return {} + if endpoints_file.suffix != ".toml": + raise ValueError(f"Unsupported endpoints file extension '{endpoints_file.suffix}' at {endpoints_file}") + + with endpoints_file.open("rb") as handle: + raw_toml = load_toml(handle) + if not isinstance(raw_toml, dict): + raise ValueError(f"Expected top-level TOML table in endpoint registry {endpoints_file}") + + raw_entries = raw_toml.get("endpoint", []) + if not isinstance(raw_entries, list): + raise ValueError(f"Expected [[endpoint]] array-of-tables in endpoint 
registry {endpoints_file}") + + endpoints: dict[str, list[Endpoint]] = {} + for index, raw_entry in enumerate(raw_entries): + entry_source = f"{endpoints_file} ([[endpoint]] index {index})" + if not isinstance(raw_entry, dict): + raise ValueError(f"Each [[endpoint]] entry must be a table in {entry_source}") + + endpoint_id = raw_entry.get("endpoint_id") + if not isinstance(endpoint_id, str) or not endpoint_id: + raise ValueError(f"Each [[endpoint]] entry must include non-empty string 'endpoint_id' in {entry_source}") + + model = raw_entry.get("model") + if not isinstance(model, str) or not model: + raise ValueError(f"Endpoint '{endpoint_id}' must include non-empty string 'model' in {entry_source}") + + url = raw_entry.get("url") + api_base_url = raw_entry.get("api_base_url") + if url is not None and api_base_url is not None and url != api_base_url: + raise ValueError(f"Conflicting values for 'url' and 'api_base_url' in {entry_source}") + resolved_url = url if url is not None else api_base_url + if resolved_url is not None and not isinstance(resolved_url, str): + raise ValueError(f"Endpoint '{endpoint_id}' url/api_base_url must be a string in {entry_source}") + + key = raw_entry.get("key") + api_key_var = raw_entry.get("api_key_var") + if key is not None and api_key_var is not None and key != api_key_var: + raise ValueError(f"Conflicting values for 'key' and 'api_key_var' in {entry_source}") + resolved_key = key if key is not None else api_key_var + if resolved_key is not None and not isinstance(resolved_key, str): + raise ValueError(f"Endpoint '{endpoint_id}' key/api_key_var must be a string in {entry_source}") + + short_client_type = raw_entry.get("type") + long_client_type = raw_entry.get("api_client_type") + if short_client_type is not None and long_client_type is not None and short_client_type != long_client_type: + raise ValueError(f"Conflicting values for 'type' and 'api_client_type' in {entry_source}") + client_type = short_client_type if short_client_type 
is not None else long_client_type + if client_type is not None and not isinstance(client_type, str): + raise ValueError(f"Endpoint '{endpoint_id}' api_client_type/type must be a string in {entry_source}") + + endpoint: Endpoint = {"model": model} + if resolved_url is not None: + endpoint["url"] = resolved_url + if resolved_key is not None: + endpoint["key"] = resolved_key + if client_type is not None: + endpoint["api_client_type"] = cast(ClientType, client_type) + + raw_headers = raw_entry.get("headers") + raw_extra_headers = raw_entry.get("extra_headers") + if raw_headers is not None and raw_extra_headers is not None: + raise ValueError(f"Use only one of 'headers' or 'extra_headers' in {entry_source}, not both") + headers = raw_headers if raw_headers is not None else raw_extra_headers + if headers is not None: + endpoint["extra_headers"] = _validate_header_mapping(headers) + + endpoints.setdefault(endpoint_id, []).append(endpoint) + + return endpoints + + +def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides | None = None) -> EvalConfig: + """Build an upstream ``EvalConfig`` from one loaded TOML/CLI eval mapping.""" + + merged_raw = _apply_overrides(dict(raw), overrides) + env_id = merged_raw["env_id"] + + env_defaults = get_env_eval_defaults(env_id) + raw_num_examples = merged_raw.get("num_examples") + raw_rollouts = merged_raw.get("rollouts_per_example") + num_examples = ( + raw_num_examples if raw_num_examples is not None else env_defaults.get("num_examples", DEFAULT_NUM_EXAMPLES) + ) + rollouts_per_example = ( + raw_rollouts + if raw_rollouts is not None + else env_defaults.get("rollouts_per_example", DEFAULT_ROLLOUTS_PER_EXAMPLE) + ) + + endpoints_path = str(merged_raw.get("endpoints_path", DEFAULT_ENDPOINTS_PATH)) + endpoints = _load_endpoint_registry(endpoints_path) + model, resolved_endpoint_id, client_config = _build_client_config(merged_raw, endpoints, endpoints_path) + + endpoint_sampling_profiles = 
load_endpoint_sampling_profiles(endpoints_path) + endpoint_sampling_args = _resolve_endpoint_sampling_args(endpoint_sampling_profiles, resolved_endpoint_id) + cli_sampling_args = overrides.sampling_args if overrides is not None else None + sampling_args = _build_sampling_args( + merged_raw, + client_config.api_base_url, + client_type=client_config.client_type, + endpoint_sampling_args=endpoint_sampling_args, + cli_sampling_args=cli_sampling_args, + ) + + extra_env_kwargs = dict(merged_raw.get("extra_env_kwargs", {})) + if merged_raw.get("timeout") is not None: + extra_env_kwargs["timeout_seconds"] = merged_raw["timeout"] + + eval_config_kwargs: dict[str, Any] = { + "env_id": env_id, + "env_args": merged_raw.get("env_args", {}), + "env_dir_path": merged_raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH), + "output_dir": merged_raw.get("output_dir"), + "extra_env_kwargs": extra_env_kwargs, + "endpoint_id": resolved_endpoint_id, + "model": model, + "client_config": client_config, + "sampling_args": sampling_args, + "num_examples": num_examples, + "rollouts_per_example": rollouts_per_example, + "max_concurrent": merged_raw.get("max_concurrent", DEFAULT_MAX_CONCURRENT), + "max_retries": merged_raw.get("max_retries", 0), + "num_workers": merged_raw.get("num_workers", "auto"), + "verbose": merged_raw.get("verbose", False), + "disable_env_server": merged_raw.get("disable_env_server", False), + "state_columns": merged_raw.get("state_columns", []), + "save_results": merged_raw.get("save_results", False), + "resume_path": None, + "independent_scoring": merged_raw.get("independent_scoring", False), + "save_to_hf_hub": merged_raw.get("save_to_hf_hub", False), + "hf_hub_dataset_name": merged_raw.get("hf_hub_dataset_name", ""), + } + return EvalConfig(**eval_config_kwargs) + + +def build_eval_identity_payload( + raw: Mapping[str, Any], *, overrides: EvalConfigOverrides | None = None +) -> dict[str, Any]: + """Resolve TOML eval identity without importing the environment package.""" + + 
merged_raw = _apply_overrides(dict(raw), overrides) + endpoints_path = str(merged_raw.get("endpoints_path", DEFAULT_ENDPOINTS_PATH)) + endpoints = _load_endpoint_registry(endpoints_path) + model, _resolved_endpoint_id, _client_config = _build_client_config(merged_raw, endpoints, endpoints_path) + + payload = { + "env_args": dict(merged_raw.get("env_args", {})), + "env_id": merged_raw["env_id"], + "model": model, + "num_examples": merged_raw.get("num_examples", DEFAULT_NUM_EXAMPLES), + "rollouts_per_example": merged_raw.get("rollouts_per_example", DEFAULT_ROLLOUTS_PER_EXAMPLE), + "max_concurrent": merged_raw.get("max_concurrent", DEFAULT_MAX_CONCURRENT), + "sampling_args": dict(merged_raw.get("sampling_args", {})), + } + if "variant_id" in raw: + payload["variant_id"] = raw["variant_id"] + if "name" in raw: + payload["name"] = raw["name"] + return payload + + +def get_env_eval_defaults(env_id: str) -> dict[str, Any]: + """Read ``[tool.verifiers.eval]`` defaults from an installed env package.""" + + defaults: dict[str, Any] = {} + module_name = env_id.replace("-", "_").split("/")[-1] + + try: + spec = importlib.util.find_spec(module_name) + if spec is None: + raise ModuleNotFoundError(module_name) + + if spec.submodule_search_locations: + base_dir = Path(next(iter(spec.submodule_search_locations))) + elif spec.origin: + base_dir = Path(spec.origin).parent + else: + logger.debug("Could not determine module path for %s; skipping eval defaults", module_name) + return defaults + + pyproject_file = _find_env_pyproject(base_dir) + if not pyproject_file.is_file(): + logger.debug("pyproject.toml not found for installed module %s", module_name) + return defaults + + with pyproject_file.open("rb") as handle: + pyproject_data = load_toml(handle) + + eval_config = pyproject_data.get("tool", {}).get("verifiers", {}).get("eval", {}) + if "num_examples" in eval_config: + defaults["num_examples"] = eval_config["num_examples"] + if "rollouts_per_example" in eval_config: + 
defaults["rollouts_per_example"] = eval_config["rollouts_per_example"] + except ModuleNotFoundError: + logger.debug("Module %s not installed", module_name) + except Exception as exc: + logger.debug("Could not load eval defaults from %s pyproject.toml: %s", module_name, exc) + + return defaults + + +def _find_env_pyproject(base_dir: Path) -> Path: + candidates = [base_dir / "pyproject.toml", base_dir.parent / "pyproject.toml"] + for candidate in candidates: + if candidate.is_file(): + return candidate + return candidates[0] + + +def _apply_overrides(raw: dict[str, Any], overrides: EvalConfigOverrides | None) -> dict[str, Any]: + if overrides is None: + return raw + + for field in ("provider", "api_base_url", "api_key_var", "api_client_type", "max_concurrent"): + value = getattr(overrides, field) + if value is not None: + raw[field] = value + + if overrides.endpoints_path is not None: + raw["endpoints_path"] = str(overrides.endpoints_path) + + if overrides.model is not None: + raw["model"] = overrides.model + raw.pop("endpoint_id", None) + + if overrides.env_args: + raw["env_args"] = {**dict(raw.get("env_args", {})), **dict(overrides.env_args)} + return raw + + +def _build_client_config( + raw: Mapping[str, Any], endpoints: Mapping[str, list[Endpoint]], endpoints_path: str +) -> tuple[str, str | None, ClientConfig]: + raw_endpoint_id = raw.get("endpoint_id") + raw_model_field = raw.get("model") + if raw_endpoint_id is not None and raw_model_field is not None: + raise ValueError("Cannot set both 'endpoint_id' and 'model' in eval config; choose one.") + if raw_endpoint_id is not None and not isinstance(raw_endpoint_id, str): + raise ValueError("'endpoint_id' must be a string when provided.") + if isinstance(raw_endpoint_id, str) and not raw_endpoint_id: + raise ValueError("'endpoint_id' must be a non-empty string when provided.") + + resolved_endpoints_file = resolve_endpoints_file(endpoints_path) + if raw_endpoint_id is not None and (resolved_endpoints_file is None or 
resolved_endpoints_file.suffix != ".toml"): + raise ValueError( + "'endpoint_id' is only supported with TOML endpoint registries. Set endpoints_path to an endpoints.toml file." + ) + + raw_model = raw_model_field if raw_model_field is not None else DEFAULT_MODEL + endpoint_lookup_id = raw_endpoint_id if raw_endpoint_id is not None else raw_model + raw_api_base_url = raw.get("api_base_url") + if isinstance(raw_api_base_url, list): + raise ValueError( + "api_base_url lists are no longer supported. Use endpoint_id + endpoints.toml for multi-endpoint configuration." + ) + + raw_provider = raw.get("provider") + if raw_provider is not None and raw_provider not in PROVIDER_CONFIGS: + raise ValueError(f"Unknown provider '{raw_provider}'. Valid providers are: {sorted(PROVIDER_CONFIGS)}") + + api_key_override = raw.get("api_key_var") is not None + api_base_url_override = raw_api_base_url is not None + client_type_override = raw.get("api_client_type") is not None + endpoint_group: list[Endpoint] | None = None + resolved_endpoint_id: str | None = None + + if endpoint_lookup_id in endpoints: + endpoint_group = list(endpoints[endpoint_lookup_id]) + resolved_endpoint_id = cast(str, endpoint_lookup_id) + endpoint = endpoint_group[0] + provider_cfg = PROVIDER_CONFIGS[raw_provider or DEFAULT_PROVIDER] + + api_key_var = endpoint.get("key") or raw.get("default_api_key_var", provider_cfg["key"]) + api_base_url = endpoint.get("url") or raw.get("default_api_base_url", provider_cfg["url"]) + client_type = endpoint.get("api_client_type", provider_cfg.get("client_type", DEFAULT_CLIENT_TYPE)) + + endpoint_models = {entry["model"] for entry in endpoint_group} + if len(endpoint_models) > 1: + raise ValueError( + f"Endpoint alias '{endpoint_lookup_id}' maps to multiple model ids {sorted(endpoint_models)}, " + "which is not yet supported by EvalConfig." 
+ ) + model = endpoint["model"] + + if raw_provider is not None: + provider_cfg = PROVIDER_CONFIGS[raw_provider] + api_key_var = provider_cfg["key"] + api_base_url = provider_cfg["url"] + client_type = provider_cfg.get("client_type", client_type) + if api_key_override: + api_key_var = raw["api_key_var"] + if api_base_url_override: + api_base_url = raw_api_base_url + if client_type_override: + client_type = raw["api_client_type"] + else: + if raw_endpoint_id is not None: + raise ValueError(f"Endpoint id '{raw_endpoint_id}' not found in endpoint registry at {endpoints_path}") + provider_cfg = PROVIDER_CONFIGS[raw_provider or DEFAULT_PROVIDER] + model = raw_model + api_key_var = raw["api_key_var"] if api_key_override else raw.get("default_api_key_var", provider_cfg["key"]) + api_base_url = ( + raw_api_base_url if api_base_url_override else raw.get("default_api_base_url", provider_cfg["url"]) + ) + client_type = ( + raw["api_client_type"] if client_type_override else provider_cfg.get("client_type", DEFAULT_CLIENT_TYPE) + ) + + if not isinstance(api_base_url, str): + raise ValueError("api_base_url must be a single string URL") + if not isinstance(api_key_var, str): + raise ValueError("api_key_var must be a string") + + eval_headers_merged = _build_extra_headers(raw) + prime_headers, _ = prime_inference_overrides(api_base_url) + eval_headers_from_state = {"X-Session-ID": "example_id", **_build_extra_headers_from_state(raw)} + + registry_headers_base: dict[str, str] = {} + if endpoint_group is not None: + registry_headers_base = dict(endpoint_group[0].get("extra_headers", {})) + merged_headers = {**prime_headers, **registry_headers_base, **eval_headers_merged} + + endpoint_configs: list[EndpointClientConfig] = [] + if ( + endpoint_group is not None + and not api_base_url_override + and raw_provider is None + and len(endpoint_group) > 1 + and all("url" in endpoint and "key" in endpoint for endpoint in endpoint_group) + ): + endpoint_configs = [ + EndpointClientConfig( + 
api_key_var=api_key_var if api_key_override else endpoint["key"], + api_base_url=endpoint["url"], + extra_headers={**prime_headers, **dict(endpoint.get("extra_headers", {})), **eval_headers_merged}, + ) + for endpoint in endpoint_group + ] + + client_kwargs: dict[str, Any] = { + "client_type": cast(ClientType, client_type), + "api_key_var": api_key_var, + "api_base_url": api_base_url, + "endpoint_configs": endpoint_configs, + "extra_headers": merged_headers, + "extra_headers_from_state": eval_headers_from_state, + } + if raw.get("client_timeout") is not None: + client_kwargs["timeout"] = raw["client_timeout"] + if raw.get("http_max_retries") is not None: + client_kwargs["max_retries"] = raw["http_max_retries"] + + client_config = ClientConfig(**client_kwargs) + return cast(str, model), resolved_endpoint_id, client_config + + +def _resolve_endpoint_sampling_args( + endpoint_sampling_profiles: Mapping[str, list[dict[str, Any]]], endpoint_id: str | None +) -> dict[str, Any]: + if endpoint_id is None: + return {} + + profiles = endpoint_sampling_profiles.get(endpoint_id, []) + if not profiles: + return {} + + first = profiles[0] + for profile in profiles[1:]: + if profile != first: + raise ValueError( + f"Endpoint alias '{endpoint_id}' has conflicting sampling_args across replica entries. " + "Use identical sampling_args for every replica or omit them from every replica." 
+ ) + return dict(first) + + +def _build_sampling_args( + raw: Mapping[str, Any], + api_base_url: str, + *, + client_type: str, + endpoint_sampling_args: Mapping[str, Any] | None = None, + cli_sampling_args: Mapping[str, Any] | None = None, +) -> dict[str, Any]: + _, prime_sampling_overrides = prime_inference_overrides(api_base_url) + endpoint_sampling = _validate_sampling_mapping(endpoint_sampling_args, "endpoint sampling_args") + include_none_max_tokens = raw.get("include_none_max_tokens", True) and ( + "max_tokens" in raw or "max_tokens" not in endpoint_sampling + ) + scalar_sampling_args = _merge_sampling_args( + None, + max_tokens=raw.get("max_tokens"), + temperature=raw.get("temperature"), + include_none_max_tokens=include_none_max_tokens, + ) + merged = _merge_sampling_layer({}, prime_sampling_overrides) + merged = _merge_sampling_layer(merged, endpoint_sampling) + merged = _merge_sampling_layer(merged, scalar_sampling_args) + merged = _merge_sampling_layer(merged, _validate_sampling_mapping(raw.get("sampling_args"), "sampling_args")) + merged = _merge_sampling_layer(merged, _validate_sampling_mapping(cli_sampling_args, "CLI sampling_args")) + return sanitize_sampling_args(merged, client_type=client_type) + + +def _merge_sampling_args( + sampling_args: Mapping[str, Any] | None, + *, + max_tokens: int | None = None, + temperature: float | None = None, + prefer_existing_keys: bool = True, + include_none_max_tokens: bool = False, +) -> dict[str, Any]: + merged_sampling_args = dict(sampling_args or {}) + if (not prefer_existing_keys or "max_tokens" not in merged_sampling_args) and ( + include_none_max_tokens or max_tokens is not None + ): + merged_sampling_args["max_tokens"] = max_tokens + if temperature is not None and (not prefer_existing_keys or "temperature" not in merged_sampling_args): + merged_sampling_args["temperature"] = temperature + return merged_sampling_args + + +def _build_extra_headers(raw: Mapping[str, Any]) -> dict[str, str]: + 
eval_headers_table: dict[str, str] = {} + raw_headers = raw.get("headers") + if raw_headers is not None: + eval_headers_table = _validate_header_mapping(raw_headers) + + raw_header_values = raw.get("header") or [] + if not isinstance(raw_header_values, list): + raise ValueError("'header' must be a list of 'Name: Value' strings") + + eval_headers_from_list: dict[str, str] = {} + for header_value in raw_header_values: + if not isinstance(header_value, str): + raise ValueError(f"Each 'header' entry must be a string 'Name: Value', got: {header_value!r}") + if ":" not in header_value: + raise ValueError(f"--header must be 'Name: Value', got: {header_value!r}") + key, value = header_value.split(":", 1) + key, value = key.strip(), value.strip() + if not key: + raise ValueError("--header name cannot be empty") + eval_headers_from_list[key] = value + + return {**eval_headers_table, **eval_headers_from_list} + + +def _build_extra_headers_from_state(raw: Mapping[str, Any]) -> dict[str, str]: + table: dict[str, str] = {} + raw_table = raw.get("headers_from_state") + if raw_table is not None: + table = _validate_header_mapping(raw_table) + + raw_list = raw.get("header_from_state") or [] + if not isinstance(raw_list, list): + raise ValueError("'header_from_state' must be a list of 'Name: state_key' strings") + + from_list: dict[str, str] = {} + for entry in raw_list: + if not isinstance(entry, str): + raise ValueError(f"Each 'header_from_state' entry must be a string 'Name: state_key', got: {entry!r}") + if ":" not in entry: + raise ValueError(f"--header-from-state must be 'Name: state_key', got: {entry!r}") + key, value = entry.split(":", 1) + key, value = key.strip(), value.strip() + if not key: + raise ValueError("--header-from-state name cannot be empty") + if not value: + raise ValueError("--header-from-state state_key cannot be empty") + from_list[key] = value + + return {**table, **from_list} + + +def _deep_merge(base: Mapping[str, Any], override: Mapping[str, Any]) -> 
dict[str, Any]: + merged = dict(base) + for key, value in override.items(): + if isinstance(value, Mapping) and isinstance(merged.get(key), Mapping): + merged[key] = _deep_merge(cast(Mapping[str, Any], merged[key]), value) + else: + merged[key] = value + return merged + + +def _merge_sampling_layer(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]: + merged = dict(base) + if "reasoning" in override: + merged.pop("reasoning_effort", None) + if "reasoning_effort" in override: + merged.pop("reasoning", None) + direct_override_keys = set(override) - {"extra_body"} + if direct_override_keys and isinstance(merged.get("extra_body"), Mapping): + extra_body = dict(cast(Mapping[str, Any], merged["extra_body"])) + for key in direct_override_keys: + extra_body.pop(key, None) + merged["extra_body"] = extra_body + return _deep_merge(merged, override) + + +def _validate_sampling_mapping(value: object, label: str) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, Mapping): + raise ValueError(f"{label} must be a dict") + return dict(cast(Mapping[str, Any], value)) + + +def _validate_header_mapping(value: object) -> dict[str, str]: + if not isinstance(value, dict): + raise ValueError("headers must be a dict") + + headers: dict[str, str] = {} + for key, header_value in value.items(): + if not isinstance(key, str) or not key.strip(): + raise ValueError("headers keys must be non-empty strings") + if not isinstance(header_value, str): + raise ValueError("headers values must be strings") + headers[key] = header_value + return headers + + +__all__ = [ + "DEFAULT_MAX_CONCURRENT", + "DEFAULT_NUM_EXAMPLES", + "DEFAULT_ROLLOUTS_PER_EXAMPLE", + "EvalConfigOverrides", + "build_eval_config", + "build_eval_identity_payload", + "get_env_eval_defaults", + "load_toml_eval_configs", +] diff --git a/medarc_verifiers/cli/winrate/api.py b/medarc_verifiers/cli/winrate/api.py index 88f375cb..5ae37e69 100644 --- a/medarc_verifiers/cli/winrate/api.py +++ 
b/medarc_verifiers/cli/winrate/api.py @@ -315,12 +315,14 @@ def compute_winrates( seen_model_case_map: dict[str, str] = {} dataset_iter: Iterable[tuple[str, Path | str]] = datasets - try: - from rich.progress import track + console = _get_console() + if console is not None and getattr(console, "is_terminal", False): + try: + from rich.progress import track - dataset_iter = track(datasets, description="Computing win rates", transient=True) - except Exception: - dataset_iter = datasets + dataset_iter = track(datasets, description="Computing win rates", transient=True, console=console) + except Exception: + dataset_iter = datasets for dataset_name, parquet_path in dataset_iter: stats, models_present, missingness = _process_dataset( @@ -759,8 +761,13 @@ def _models_present(df_avg: pl.DataFrame) -> list[str]: def _is_dataset_excluded(dataset_name: str, exclude_set: set[str]) -> bool: - base, _ = derive_base_env_id(dataset_name) - return dataset_is_excluded(dataset_name, exclude_set, base_dataset_id=base) + env_name, _, variant_id = dataset_name.partition("::") + base, _ = derive_base_env_id(env_name) + if dataset_is_excluded(dataset_name, exclude_set, base_dataset_id=base): + return True + if variant_id: + return dataset_is_excluded(env_name, exclude_set, base_dataset_id=base) + return False def _filter_models( diff --git a/medarc_verifiers/orchestrate/cli.py b/medarc_verifiers/orchestrate/cli.py index 53edafb3..b2cab3c9 100644 --- a/medarc_verifiers/orchestrate/cli.py +++ b/medarc_verifiers/orchestrate/cli.py @@ -33,7 +33,7 @@ def build_parser() -> argparse.ArgumentParser: prog="medarc-orchestrate", description="Run vLLM orchestration over job configs.", ) - parser.add_argument("--plan", required=True, type=Path, help="Path to orchestrator plan YAML.") + parser.add_argument("--plan", required=True, type=Path, help="Path to orchestrator plan YAML or JSON.") parser.add_argument( "--env-file", type=Path, diff --git a/medarc_verifiers/orchestrate/config.py 
b/medarc_verifiers/orchestrate/config.py index 264e1db5..07277681 100644 --- a/medarc_verifiers/orchestrate/config.py +++ b/medarc_verifiers/orchestrate/config.py @@ -5,12 +5,15 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Mapping -import warnings +import tomllib from omegaconf import OmegaConf from pydantic import BaseModel, Field, ValidationError +_ORCHESTRATE_NON_MODEL_KEYS = {"restart", "vllm-container", "pyxis"} + + class PlanConfig(BaseModel): """Schema for the orchestrator plan file.""" @@ -76,6 +79,8 @@ def load_plan(path: Path) -> PlanConfig: def load_job_config(path: Path) -> Mapping[str, Any]: resolved = path.expanduser().resolve() + if resolved.suffix != ".toml": + raise ValueError(f"Unsupported job config format: {resolved} (expected .toml)") return _load_mapping(resolved) @@ -84,7 +89,7 @@ def expand_tasks(plan: PlanConfig) -> list[TaskSpec]: for job_path in plan.job_configs: resolved_job_path = job_path.expanduser().resolve() job_cfg = load_job_config(resolved_job_path) - model_key, model_entry = _extract_single_model(job_cfg, source=resolved_job_path) + model_key, model_entry = _extract_task_model(job_cfg, source=resolved_job_path) orchestrate_cfg = _extract_orchestrate_config(job_cfg, model_key=model_key, source=resolved_job_path) model_id = str(model_entry.get("model", "")).strip() if not model_id: @@ -105,10 +110,13 @@ def expand_tasks(plan: PlanConfig) -> list[TaskSpec]: def _load_mapping(path: Path) -> Mapping[str, Any]: if not path.exists(): raise FileNotFoundError(f"Config not found: {path}") - if path.suffix not in {".yaml", ".yml", ".json"}: - raise ValueError(f"Unsupported config format: {path} (expected .yaml/.yml/.json)") + if path.suffix not in {".yaml", ".yml", ".json", ".toml"}: + raise ValueError(f"Unsupported config format: {path} (expected .yaml/.yml/.json/.toml)") try: - data = OmegaConf.to_container(OmegaConf.load(path), resolve=True) + if path.suffix == ".toml": + data = 
tomllib.loads(path.read_text(encoding="utf-8")) + else: + data = OmegaConf.to_container(OmegaConf.load(path), resolve=True) except Exception as exc: # pragma: no cover - OmegaConf error types vary raise ConfigFormatError(f"Failed to load config: {path}") from exc if not isinstance(data, Mapping): @@ -116,42 +124,40 @@ def _load_mapping(path: Path) -> Mapping[str, Any]: return data -def _extract_single_model(payload: Mapping[str, Any], *, source: Path) -> tuple[str, Mapping[str, Any]]: - models = payload.get("models") - if not isinstance(models, Mapping): - raise ValueError(f"Job config {source} must define a models mapping.") - keys = list(models.keys()) - if len(keys) != 1: - raise ValueError(f"Job config {source} must define exactly one model; found {len(keys)}.") - model_key = str(keys[0]) - model_entry = models.get(model_key) - if not isinstance(model_entry, Mapping): - raise ValueError(f"Job config {source} models.{model_key} must be a mapping.") - return model_key, model_entry +def _extract_task_model(payload: Mapping[str, Any], *, source: Path) -> tuple[str, Mapping[str, Any]]: + model_id = str(payload.get("model", "")).strip() + if not model_id: + raise ValueError(f"Job config {source} must define a top-level model.") + orchestrate, table_name = _extract_orchestrate_root(payload, source=source) + model_keys = [str(key) for key, value in orchestrate.items() if key not in _ORCHESTRATE_NON_MODEL_KEYS] + if len(model_keys) != 1: + raise ValueError( + f"Job config {source} must define exactly one {table_name} model settings table; found {len(model_keys)}." 
+ ) + return model_keys[0], {"model": model_id} def _extract_orchestrate_config(payload: Mapping[str, Any], *, model_key: str, source: Path) -> Mapping[str, Any]: - orchestrate = payload.get("orchestrate") - if not isinstance(orchestrate, Mapping): - raise ValueError(f"Job config {source} must define a top-level orchestrate mapping.") - has_container = "vllm-container" in orchestrate - has_docker = "vllm-docker" in orchestrate - if has_container and has_docker: - raise ValueError(f"Job config {source} defines both orchestrate.vllm-container and orchestrate.vllm-docker.") - if not has_container and not has_docker: - raise ValueError(f"Job config {source} must define orchestrate.vllm-container settings.") + orchestrate, table_name = _extract_orchestrate_root(payload, source=source) + if "vllm-container" not in orchestrate: + raise ValueError(f"Job config {source} must define {table_name}.vllm-container settings.") if model_key not in orchestrate: - raise ValueError(f"Job config {source} must define orchestrate.{model_key} settings.") - normalized = dict(orchestrate) - if has_docker: - warnings.warn( - (f"Job config {source} uses deprecated orchestrate.vllm-docker; rename it to orchestrate.vllm-container."), - DeprecationWarning, - stacklevel=2, - ) - normalized["vllm-container"] = orchestrate["vllm-docker"] - del normalized["vllm-docker"] - return normalized + raise ValueError(f"Job config {source} must define {table_name}.{model_key} settings.") + return orchestrate + + +def _extract_orchestrate_root(payload: Mapping[str, Any], *, source: Path) -> tuple[Mapping[str, Any], str]: + medarc = payload.get("medarc") + if medarc is not None: + if not isinstance(medarc, Mapping): + raise ValueError(f"Job config {source} medarc must be a mapping.") + medarc_orchestrate = medarc.get("orchestrate") + if medarc_orchestrate is not None: + if not isinstance(medarc_orchestrate, Mapping): + raise ValueError(f"Job config {source} medarc.orchestrate must be a mapping.") + return 
medarc_orchestrate, "medarc.orchestrate" + + raise ValueError(f"Job config {source} must define a [medarc.orchestrate] mapping.") __all__ = ["ConfigFormatError", "PlanConfig", "TaskSpec", "expand_tasks", "load_job_config", "load_plan"] diff --git a/medarc_verifiers/orchestrate/run.py b/medarc_verifiers/orchestrate/run.py index 9e79aa14..489f350a 100644 --- a/medarc_verifiers/orchestrate/run.py +++ b/medarc_verifiers/orchestrate/run.py @@ -39,10 +39,8 @@ ) from medarc_verifiers.orchestrate.vllm_args import build_container_args, normalize_volume_mounts -_COMMAND_TEMPLATE_UV = ( - "uv run medarc-eval bench --config {job_config_path} --api-base-url {base_url} --on-complete exit" -) -_COMMAND_TEMPLATE_BARE = "medarc-eval bench --config {job_config_path} --api-base-url {base_url} --on-complete exit" +_COMMAND_TEMPLATE_UV = "uv run medarc-eval bench --config {job_config_path} --api-base-url {base_url} --provider local" +_COMMAND_TEMPLATE_BARE = "medarc-eval bench --config {job_config_path} --api-base-url {base_url} --provider local" _TASK_DIR_ALLOWED = re.compile(r"[^a-zA-Z0-9_.-]+") @@ -330,11 +328,6 @@ async def _run_task_once( "job_config_path": str(task.job_config_path), } command = render_command(self._command_template, command_context) - restart_source = orchestrate.get("restart") - if restart_source: - restart_value = str(restart_source) - if "--restart" not in command: - command.extend(["--restart", restart_value]) manifest.bench_command = shlex.join(command) self._dashboard.log(f"JOB bench-start task={task.task_id} cmd={_shorten(manifest.bench_command)}") self._set_state(manifest, paths, JobState.running) diff --git a/medarc_verifiers/rewards/multiple_choice_accuracy.py b/medarc_verifiers/rewards/multiple_choice_accuracy.py index cdee4780..0c86c92f 100644 --- a/medarc_verifiers/rewards/multiple_choice_accuracy.py +++ b/medarc_verifiers/rewards/multiple_choice_accuracy.py @@ -768,10 +768,11 @@ def multiple_choice_accuracy( answer_text: str, prefix: Optional[str] = 
None, accept_answer_text: bool = True, + strict: bool = False, strip_tex: bool = True, return_details: bool = False, ) -> bool | MCQAccuracyResult: - """Grade an MCQ answer using short-mode scans and tail-authoritative long-mode scans.""" + """Grade an MCQ answer using exact matching or permissive MCQ extraction heuristics.""" if not llm_answer: return _result(False, "none", None, None, return_details) @@ -834,6 +835,9 @@ def multiple_choice_accuracy( return_details, ) + if strict: + return _result(False, "none", None, None, return_details) + is_long = len(structural_text) > LONG_RESPONSE_THRESHOLD_CHARS terminal_region = structural_text[-TERMINAL_WINDOW_CHARS:] if is_long else structural_text strong_tail_region = terminal_region[-STRONG_TAIL_WINDOW_CHARS:] if is_long else structural_text diff --git a/medarc_verifiers/utils/__init__.py b/medarc_verifiers/utils/__init__.py index 776aaf86..c7ccc1c9 100644 --- a/medarc_verifiers/utils/__init__.py +++ b/medarc_verifiers/utils/__init__.py @@ -5,7 +5,7 @@ randomize_multiple_choice_hf_map, randomize_multiple_choice_row, ) -from .sampling_args import sanitize_sampling_args_for_openai +from .sampling_args import sanitize_sampling_args, sanitize_sampling_args_for_openai __all__ = [ "download_file", @@ -15,5 +15,6 @@ "randomize_multiple_choice_row", "default_judge_api_key", "judge_sampling_args_and_headers", + "sanitize_sampling_args", "sanitize_sampling_args_for_openai", ] diff --git a/medarc_verifiers/utils/pathing.py b/medarc_verifiers/utils/pathing.py index e53dc69a..8924fe02 100644 --- a/medarc_verifiers/utils/pathing.py +++ b/medarc_verifiers/utils/pathing.py @@ -1,77 +1,10 @@ -"""Shared filesystem helpers for locating and relativizing project paths.""" +"""Shared filesystem helpers for safe relative paths.""" from __future__ import annotations -from functools import lru_cache from pathlib import Path -@lru_cache(maxsize=1) -def project_root() -> Path: - """Best-effort detection of the repository root (directory 
containing pyproject.toml).""" - current = Path(__file__).resolve() - for candidate in (current,) + tuple(current.parents): - if (candidate / "pyproject.toml").exists(): - return candidate - # Fallback to current working directory if no project marker is found. - return Path.cwd().resolve() - - -def to_project_relative(path: Path | str, *, default_base: Path | None = None) -> str: - """Convert an absolute path to a string relative to the project root when possible. - - If `path` is relative, treat it as rooted at `default_base` when provided. - """ - resolved = _resolve_path(path, default_base=default_base) - root = project_root() - try: - return resolved.relative_to(root).as_posix() - except ValueError: - return resolved.as_posix() - - -def from_project_relative(path: Path | str) -> Path: - """Convert a stored manifest path back into an absolute path under the project root.""" - candidate = Path(path) - if candidate.is_absolute(): - return candidate - return (project_root() / candidate).resolve() - - -def normalize_results_dir_for_manifest(value: str | Path, *, run_dir: Path) -> str: - """Normalize results_dir entries before storing them in a manifest.""" - candidate = Path(value) - if not candidate.is_absolute(): - if candidate.parts and candidate.parts[0] == "runs": - candidate = (project_root() / candidate).resolve() - else: - candidate = (run_dir / candidate).resolve() - else: - candidate = candidate.resolve() - return to_project_relative(candidate) - - -def resolve_results_dir_from_manifest(value: str | None, *, job_id: str, run_dir: Path) -> Path: - """Resolve manifest results_dir entries into concrete paths.""" - raw = "" if value is None else str(value) - name = raw.strip() or job_id - candidate = Path(name) - if candidate.is_absolute(): - return candidate - if candidate.parts and candidate.parts[0] == "runs": - return from_project_relative(candidate) - return (run_dir / candidate).resolve() - - -def _resolve_path(path: Path | str, *, default_base: Path | 
None = None) -> Path: - candidate = Path(path) - if candidate.is_absolute(): - return candidate - if default_base is not None: - return (default_base / candidate).resolve() - return candidate.resolve() - - def resolve_under(base_dir: Path, rel_path: str | Path) -> Path | None: """Join rel_path under base_dir, rejecting obvious traversal.""" raw = str(rel_path).strip() @@ -89,10 +22,5 @@ def resolve_under(base_dir: Path, rel_path: str | Path) -> Path | None: __all__ = [ - "project_root", - "to_project_relative", - "from_project_relative", "resolve_under", - "normalize_results_dir_for_manifest", - "resolve_results_dir_from_manifest", ] diff --git a/medarc_verifiers/utils/sampling_args.py b/medarc_verifiers/utils/sampling_args.py index 58ae2aed..ccff5ce7 100644 --- a/medarc_verifiers/utils/sampling_args.py +++ b/medarc_verifiers/utils/sampling_args.py @@ -1,91 +1,251 @@ +from __future__ import annotations + import inspect from collections.abc import Mapping from functools import lru_cache -from typing import Any +from typing import Any, Literal, get_args, get_origin +_OPENAI_REASONING_KEYS = {"reasoning", "reasoning_effort", "thinking", "output_config"} +_FRAMEWORK_REQUEST_KEYS = {"model", "messages", "input", "prompt", "tools", "system", "extra_headers"} -def sanitize_sampling_args_for_openai(sampling_args: Mapping[str, Any] | None) -> dict[str, Any]: - """Return sampling args split into OpenAI-recognized kwargs and extra_body. - Any parameters not recognized by the OpenAI Chat Completions API are moved under - the `extra_body` key so they can be forwarded to compatible servers (e.g., vLLM/Qwen). 
- """ +def sanitize_sampling_args( + sampling_args: Mapping[str, Any] | None, + *, + client_type: str, +) -> dict[str, Any]: + """Return sampling args in the request shape expected by the resolved client.""" if not sampling_args: return {} - allowed_keys = _get_openai_allowed_param_names() + if client_type in {"openai_chat_completions", "openai_chat_completions_token"}: + return _sanitize_openai_chat(sampling_args) + if client_type == "openai_responses": + return _sanitize_openai_responses(sampling_args) + if client_type == "openai_completions": + return _sanitize_openai_completions(sampling_args) + if client_type == "anthropic_messages": + return _sanitize_anthropic_messages(sampling_args) + if client_type in {"renderer", "nemorl_chat_completions"}: + return _drop_none(sampling_args) + return _drop_none(sampling_args) + + +def sanitize_sampling_args_for_openai(sampling_args: Mapping[str, Any] | None) -> dict[str, Any]: + """Compatibility wrapper for existing OpenAI Chat Completions call sites.""" + return sanitize_sampling_args(sampling_args, client_type="openai_chat_completions") + + +def _sanitize_openai_chat(sampling_args: Mapping[str, Any]) -> dict[str, Any]: + cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"}) + _drop_framework_request_keys(cleaned) + return _move_compatible_extras_to_extra_body(cleaned, allowed_top_level_keys=_get_openai_chat_allowed_param_names()) + + +def _sanitize_openai_responses(sampling_args: Mapping[str, Any]) -> dict[str, Any]: + cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"}) + _normalize_openai_responses_sampling_args(cleaned) + _drop_framework_request_keys(cleaned) + reasoning_effort = cleaned.pop("reasoning_effort", None) + if reasoning_effort is not None: + existing_reasoning = cleaned.get("reasoning") + if existing_reasoning is None: + cleaned["reasoning"] = {"effort": reasoning_effort} + elif isinstance(existing_reasoning, Mapping): + reasoning = dict(existing_reasoning) + 
reasoning.setdefault("effort", reasoning_effort) + cleaned["reasoning"] = reasoning + else: + raise ValueError("sampling_args.reasoning must be a dict when used with openai_responses") + return _move_compatible_extras_to_extra_body( + cleaned, allowed_top_level_keys=_get_openai_responses_allowed_param_names() + ) + + +def _normalize_openai_responses_sampling_args(sampling_args: dict[str, Any]) -> None: + n = sampling_args.pop("n", None) + if n not in (None, 1): + raise ValueError("Responses API client only supports n=1") + + max_tokens = sampling_args.pop("max_tokens", None) + max_completion_tokens = sampling_args.pop("max_completion_tokens", None) + if "max_output_tokens" not in sampling_args: + output_tokens = max_tokens if max_tokens is not None else max_completion_tokens + if output_tokens is not None: + sampling_args["max_output_tokens"] = output_tokens + + if sampling_args.get("stop") is not None: + raise ValueError("Responses API client does not support stop sequences") + sampling_args.pop("stop", None) + sampling_args.pop("modalities", None) + +def _sanitize_openai_completions(sampling_args: Mapping[str, Any]) -> dict[str, Any]: + cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"}) + _drop_framework_request_keys(cleaned) + for key in _OPENAI_REASONING_KEYS: + cleaned.pop(key, None) + return _move_compatible_extras_to_extra_body( + cleaned, allowed_top_level_keys=_get_openai_completions_allowed_param_names() + ) + + +def _sanitize_anthropic_messages(sampling_args: Mapping[str, Any]) -> dict[str, Any]: + cleaned = _drop_none(sampling_args) + _drop_framework_request_keys(cleaned) + reasoning_effort = cleaned.pop("reasoning_effort", None) + cleaned.pop("reasoning", None) + cleaned.pop("effort", None) + cleaned.pop("extra_body", None) + + thinking = cleaned.get("thinking") + if thinking is not None: + if not isinstance(thinking, Mapping): + raise ValueError("sampling_args.thinking must be a dict when used with anthropic_messages") + 
thinking_dict = dict(thinking) + if thinking_dict.get("type") != "adaptive": + raise ValueError("anthropic_messages only supports adaptive thinking configs") + if "budget_tokens" in thinking_dict: + raise ValueError("anthropic_messages does not support manual budget_tokens thinking configs") + thinking_dict.pop("effort", None) + cleaned["thinking"] = thinking_dict + + if reasoning_effort is not None: + effort = _validate_anthropic_effort(reasoning_effort) + cleaned["thinking"] = {"type": "adaptive"} + output_config = cleaned.get("output_config") + if output_config is None: + cleaned["output_config"] = {"effort": effort} + elif isinstance(output_config, Mapping): + cleaned["output_config"] = {**dict(output_config), "effort": effort} + else: + raise ValueError("sampling_args.output_config must be a dict when used with anthropic_messages") + elif "output_config" in cleaned: + output_config = cleaned["output_config"] + if not isinstance(output_config, Mapping): + raise ValueError("sampling_args.output_config must be a dict when used with anthropic_messages") + output_config_dict = dict(output_config) + if "effort" in output_config_dict: + output_config_dict["effort"] = _validate_anthropic_effort(output_config_dict["effort"]) + cleaned["output_config"] = output_config_dict + + allowed_keys = _get_anthropic_allowed_param_names() + return {key: value for key, value in cleaned.items() if key in allowed_keys} + + +def _validate_anthropic_effort(value: Any) -> str: + effort_values = _get_anthropic_effort_values() + if not isinstance(value, str) or value not in effort_values: + raise ValueError(f"anthropic_messages reasoning effort must be one of: {', '.join(sorted(effort_values))}") + return value + + +def _drop_framework_request_keys(sampling_args: dict[str, Any]) -> None: + for key in _FRAMEWORK_REQUEST_KEYS: + sampling_args.pop(key, None) + + +def _move_compatible_extras_to_extra_body( + sampling_args: Mapping[str, Any], + *, + allowed_top_level_keys: set[str], +) -> 
dict[str, Any]: filtered: dict[str, Any] = {} extras: dict[str, Any] = {} for key, value in sampling_args.items(): - if key in allowed_keys: + if key in allowed_top_level_keys: filtered[key] = value else: extras[key] = value - if extras: - # OpenAI python client forwards unknown params via `extra_body`. - # If the caller already supplied an `extra_body` (e.g., to request `usage.include`), - # merge rather than overwrite it. - existing = filtered.get("extra_body") - if existing is None: - filtered["extra_body"] = extras - elif isinstance(existing, Mapping): - merged = dict(existing) - for key, value in extras.items(): - merged.setdefault(key, value) - filtered["extra_body"] = merged - else: - filtered["extra_body"] = {"_passthrough_extra_body": existing, **extras} + if not extras: + return filtered + + existing = filtered.get("extra_body") + if existing is None: + filtered["extra_body"] = extras + elif isinstance(existing, Mapping): + filtered["extra_body"] = _deep_merge(extras, existing) + else: + filtered["extra_body"] = {"_passthrough_extra_body": existing, **extras} return filtered @lru_cache(maxsize=1) -def _get_openai_allowed_param_names() -> set[str]: - """Infer allowed kwargs for OpenAI create() by inspecting client signatures. - - We union parameter names from: - - openai.resources.chat.completions.AsyncCompletions.create - - openai.resources.completions.AsyncCompletions.create - - On failure, return a conservative fallback. Always include 'extra_body'. 
- """ - try: - from openai.resources.chat.completions import AsyncCompletions as ChatAsyncCompletions # type: ignore - from openai.resources.completions import AsyncCompletions as TextAsyncCompletions # type: ignore - except Exception: - return { - "temperature", - "top_p", - "max_tokens", - "max_completion_tokens", - "n", - "stop", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "seed", - "response_format", - "tool_choice", - "tools", - "stream", - "extra_body", - } - - def _param_names(callable_obj: Any) -> set[str]: - try: - sig = inspect.signature(callable_obj) - except Exception: - return set() - names: set[str] = set() - for name, param in sig.parameters.items(): - if name == "self": - continue - if param.kind == inspect.Parameter.VAR_POSITIONAL: - continue - names.add(name) - return names - - allowed = _param_names(ChatAsyncCompletions.create) | _param_names(TextAsyncCompletions.create) - allowed.add("extra_body") - return allowed +def _get_anthropic_effort_values() -> set[str]: + from anthropic.types import OutputConfigParam + from typing import get_type_hints + + effort_type = get_type_hints(OutputConfigParam)["effort"] + return _literal_string_values(effort_type) + + +def _literal_string_values(type_hint: Any) -> set[str]: + values: set[str] = set() + origin = get_origin(type_hint) + if origin is None: + return values + if origin is Literal: + return {value for value in get_args(type_hint) if isinstance(value, str)} + for arg in get_args(type_hint): + values.update(_literal_string_values(arg)) + return values + + +@lru_cache(maxsize=1) +def _get_openai_chat_allowed_param_names() -> set[str]: + from openai.resources.chat.completions import AsyncCompletions as ChatAsyncCompletions # type: ignore + + return _param_names(ChatAsyncCompletions.create) + + +@lru_cache(maxsize=1) +def _get_openai_responses_allowed_param_names() -> set[str]: + from openai.resources.responses import AsyncResponses # type: ignore + + return 
_param_names(AsyncResponses.create) + + +@lru_cache(maxsize=1) +def _get_openai_completions_allowed_param_names() -> set[str]: + from openai.resources.completions import AsyncCompletions as TextAsyncCompletions # type: ignore + + return _param_names(TextAsyncCompletions.create) + + +@lru_cache(maxsize=1) +def _get_anthropic_allowed_param_names() -> set[str]: + from anthropic.resources.messages import AsyncMessages + + return _param_names(AsyncMessages.create) + + +def _param_names(callable_obj: Any) -> set[str]: + sig = inspect.signature(callable_obj) + names: set[str] = set() + for name, param in sig.parameters.items(): + if name == "self": + continue + if param.kind == inspect.Parameter.VAR_POSITIONAL: + continue + names.add(name) + return names + + +def _drop_none( + sampling_args: Mapping[str, Any], + *, + preserve_none_keys: set[str] | None = None, +) -> dict[str, Any]: + preserve_none_keys = preserve_none_keys or set() + return {key: value for key, value in sampling_args.items() if value is not None or key in preserve_none_keys} + + +def _deep_merge(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]: + merged = dict(base) + for key, value in override.items(): + if isinstance(value, Mapping) and isinstance(merged.get(key), Mapping): + merged[key] = _deep_merge(merged[key], value) + else: + merged[key] = value + return merged diff --git a/pyproject.toml b/pyproject.toml index cd0c653d..808603d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [project] name = "medarc-verifiers" -version = "0.1.0" +version = "0.2.0" description = "A collection of MedARC utilities and tools for Prime Intellect's verifiers package" readme = "docs/README.md" requires-python = ">=3.11" license = { file = "LICENSE" } dependencies = [ "prime>=0.3.35", - "verifiers>=0.1.10,<0.1.12", + "verifiers>=0.1.14,<0.2", "pyyaml>=6.0.1", "docstring-parser>=0.17.0", "pylatexenc>=2.10", diff --git a/scripts/convert_legacy_raw_runs.py 
"""Convert retired YAML-runner raw outputs into current eval-output directories."""

from __future__ import annotations

import argparse
import json
import shutil
import sys
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Mapping, Sequence

from medarc_verifiers.cli.eval_identity import BASE_VARIANT_ID, slug_component

MANIFEST_FILENAME = "run_manifest.json"
RESULTS_FILENAME = "results.jsonl"
METADATA_FILENAME = "metadata.json"
SUPPORTED_MANIFEST_VERSION = 3
MAX_VARIANT_LENGTH = 160


@dataclass(frozen=True, slots=True)
class ConversionEntry:
    """Outcome of considering one legacy job (or one whole run) for conversion."""

    run_id: str
    job_id: str | None
    # One of "converted", "would_convert", "skipped", "failed".
    status: str
    reason: str
    source_results: str | None = None
    target_dir: str | None = None


@dataclass(frozen=True, slots=True)
class ConversionReport:
    """Immutable summary of a conversion pass."""

    entries: tuple[ConversionEntry, ...]
    dry_run: bool

    def _count(self, status: str) -> int:
        """Return how many entries carry ``status``."""
        return sum(1 for entry in self.entries if entry.status == status)

    @property
    def converted(self) -> int:
        return self._count("converted")

    @property
    def would_convert(self) -> int:
        return self._count("would_convert")

    @property
    def skipped(self) -> int:
        return self._count("skipped")

    @property
    def failed(self) -> int:
        return self._count("failed")

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable view: dry-run flag, status counts, entries."""
        return {
            "dry_run": self.dry_run,
            "summary": {
                "converted": self.converted,
                "would_convert": self.would_convert,
                "skipped": self.skipped,
                "failed": self.failed,
            },
            "entries": [asdict(entry) for entry in self.entries],
        }


@dataclass(frozen=True, slots=True)
class _PlannedConversion:
    """Everything needed to write one converted job directory."""

    run_id: str
    job: Mapping[str, Any]
    source_results: Path
    source_metadata: Path | None
    source_metadata_payload: Mapping[str, Any]
    target_dir: Path
    env_id: str
    model_id: str
    variant_id: str
    manifest: Mapping[str, Any]


def convert_legacy_raw_runs(
    *,
    raw_dir: Path | str,
    output_dir: Path | str,
    dry_run: bool = True,
) -> ConversionReport:
    """Plan and (unless ``dry_run``) write conversions for every run under ``raw_dir``.

    Each ``<raw_dir>/*/run_manifest.json`` is read; completed jobs are planned
    into ``<output_dir>/<model>/<env>/<variant>``. Plans that share a target are
    all failed. Existing targets fail only on a real run, so a dry run reports
    them as ``would_convert``.

    Returns:
        A report with one entry per job or per unreadable/unsupported run.
    """
    raw_path = Path(raw_dir)
    output_path = Path(output_dir)
    entries: list[ConversionEntry] = []
    plans: list[_PlannedConversion] = []

    if not raw_path.exists():
        return ConversionReport(
            entries=(
                ConversionEntry(
                    run_id=str(raw_path),
                    job_id=None,
                    status="failed",
                    reason="raw directory does not exist",
                ),
            ),
            dry_run=dry_run,
        )

    for manifest_path in sorted(raw_path.glob(f"*/{MANIFEST_FILENAME}")):
        run_dir = manifest_path.parent
        try:
            manifest = _read_json_object(manifest_path)
        except ValueError as exc:
            entries.append(
                ConversionEntry(
                    run_id=run_dir.name,
                    job_id=None,
                    status="failed",
                    reason=str(exc),
                )
            )
            continue

        run_id = _run_id(manifest, run_dir)
        if manifest.get("version") != SUPPORTED_MANIFEST_VERSION:
            entries.append(
                ConversionEntry(
                    run_id=run_id,
                    job_id=None,
                    status="failed",
                    reason=(
                        f"unsupported manifest version {manifest.get('version')!r}; "
                        f"expected {SUPPORTED_MANIFEST_VERSION}"
                    ),
                )
            )
            continue

        jobs = manifest.get("jobs")
        if not isinstance(jobs, list):
            entries.append(
                ConversionEntry(
                    run_id=run_id,
                    job_id=None,
                    status="failed",
                    reason="manifest jobs must be a list",
                )
            )
            continue

        for job in jobs:
            if not isinstance(job, Mapping):
                entries.append(
                    ConversionEntry(
                        run_id=run_id,
                        job_id=None,
                        status="skipped",
                        reason="job entry is not an object",
                    )
                )
                continue
            planned = _plan_job(run_dir, manifest, job, output_path)
            if isinstance(planned, ConversionEntry):
                entries.append(planned)
            else:
                plans.append(planned)

    # Collision entries are the only failures that name a target directory, so
    # use them directly instead of pattern-matching on reason strings.
    collisions = _collision_entries(plans, existing_targets_fail=not dry_run)
    entries.extend(collisions)
    failed_targets = {entry.target_dir for entry in collisions if entry.target_dir is not None}
    runnable_plans = [plan for plan in plans if str(plan.target_dir) not in failed_targets]

    for plan in runnable_plans:
        if dry_run:
            entries.append(_entry_for_plan(plan, status="would_convert", reason="dry run"))
            continue
        try:
            _write_conversion(plan)
        except (OSError, ValueError) as exc:
            entries.append(_entry_for_plan(plan, status="failed", reason=f"write failed: {exc}"))
            continue
        entries.append(_entry_for_plan(plan, status="converted", reason="converted"))

    return ConversionReport(entries=tuple(entries), dry_run=dry_run)


def _plan_job(
    run_dir: Path,
    manifest: Mapping[str, Any],
    job: Mapping[str, Any],
    output_dir: Path,
) -> _PlannedConversion | ConversionEntry:
    """Return a plan for ``job``, or a ``skipped`` entry explaining why not."""
    run_id = _run_id(manifest, run_dir)
    job_id = _string_or_none(job.get("job_id"))
    if not job_id:
        return ConversionEntry(run_id=run_id, job_id=None, status="skipped", reason="missing job_id")

    status = (_string_or_none(job.get("status")) or "pending").lower()
    if status != "completed":
        return ConversionEntry(run_id=run_id, job_id=job_id, status="skipped", reason=f"job status is {status!r}")

    model_id = _string_or_none(job.get("model_id"))
    env_id = _string_or_none(job.get("env_id"))
    if not model_id or not env_id:
        return ConversionEntry(run_id=run_id, job_id=job_id, status="skipped", reason="missing model_id or env_id")

    variant = _resolve_variant(job, env_id)
    if not isinstance(variant, str):
        return ConversionEntry(run_id=run_id, job_id=job_id, status="skipped", reason=variant["reason"])
    variant_id = variant

    results_path = _resolve_results_path(run_dir, manifest, job, job_id)
    if not results_path.exists():
        return ConversionEntry(
            run_id=run_id,
            job_id=job_id,
            status="skipped",
            reason="missing results.jsonl",
            source_results=str(results_path),
        )

    source_metadata = _resolve_metadata_path(run_dir, manifest, job, results_path)
    source_metadata_payload: Mapping[str, Any] = {}
    # A declared-but-missing metadata file is treated the same as no metadata.
    if source_metadata is not None and not source_metadata.exists():
        source_metadata = None
    if source_metadata is not None:
        try:
            source_metadata_payload = _read_json_object(source_metadata)
        except ValueError as exc:
            return ConversionEntry(
                run_id=run_id,
                job_id=job_id,
                status="skipped",
                reason=f"invalid metadata.json: {exc}",
                source_results=str(results_path),
            )

    target_dir = output_dir / slug_component(model_id) / slug_component(env_id) / variant_id
    return _PlannedConversion(
        run_id=run_id,
        job=job,
        source_results=results_path,
        source_metadata=source_metadata,
        source_metadata_payload=source_metadata_payload,
        target_dir=target_dir,
        env_id=env_id,
        model_id=model_id,
        variant_id=variant_id,
        manifest=manifest,
    )


def _resolve_variant(job: Mapping[str, Any], env_id: str) -> str | dict[str, str]:
    """Map a legacy ``env_variant_id`` to a variant directory name.

    Returns:
        The variant id, or ``{"reason": ...}`` when the job must be skipped.
        A missing id, or one equal to ``env_id``, maps to ``BASE_VARIANT_ID``.
    """
    raw = _string_or_none(job.get("env_variant_id"))
    if raw is None or raw == env_id:
        return BASE_VARIANT_ID

    split_variant = _resolve_split_variant(job, env_id, raw)
    from_split = split_variant is not None
    if split_variant is not None:
        variant_id = split_variant
    else:
        for separator in ("::", "/", "-", "_"):
            prefix = f"{env_id}{separator}"
            if raw.startswith(prefix):
                variant_id = raw[len(prefix) :]
                break
        else:
            return {"reason": f"ambiguous env_variant_id {raw!r} for env_id {env_id!r}"}

    # The variant becomes a path component, so split-derived variants go
    # through the same emptiness and path-safety checks as prefix-derived ones.
    if not variant_id:
        return {"reason": f"empty parsed variant from env_variant_id {raw!r}"}
    if variant_id == BASE_VARIANT_ID:
        if from_split:
            return BASE_VARIANT_ID
        return {"reason": "variant identity conflict: source variant maps to reserved base"}
    if "/" in variant_id or "\\" in variant_id:
        return {"reason": f"path-unsafe variant {variant_id!r}"}
    if slug_component(variant_id, max_length=MAX_VARIANT_LENGTH) != variant_id:
        return {"reason": f"path-unsafe variant {variant_id!r}"}
    return variant_id


def _resolve_split_variant(job: Mapping[str, Any], env_id: str, raw: str) -> str | None:
    """Handle legacy ids that embed the ``en`` split (``<env>_en`` / ``<env>-en``).

    Returns ``BASE_VARIANT_ID`` for the bare split id, the suffix after
    ``<env><delim>en-`` for rollout variants, and ``None`` when the job's
    ``env_args.split`` is not ``"en"`` or ``raw`` does not match.
    """
    env_args = job.get("env_args")
    split = _string_or_none(env_args.get("split")) if isinstance(env_args, Mapping) else None
    if split != "en":
        return None

    for delimiter in ("_", "-"):
        split_prefix = f"{env_id}{delimiter}{split}"
        if raw == split_prefix:
            return BASE_VARIANT_ID
        rollout_prefix = f"{split_prefix}-"
        if raw.startswith(rollout_prefix):
            return raw[len(rollout_prefix) :]
    return None


def _resolve_results_path(
    run_dir: Path,
    manifest: Mapping[str, Any],
    job: Mapping[str, Any],
    job_id: str,
) -> Path:
    """Return the expected ``results.jsonl`` path for ``job`` (may not exist)."""
    artifacts_root = _string_or_none(manifest.get("artifacts_root")) or "."
    base = run_dir / artifacts_root
    relpath = _string_or_none(job.get("results_relpath")) or _string_or_none(job.get("results_dir"))
    if relpath:
        candidate = base / relpath
        if candidate.name == RESULTS_FILENAME:
            return candidate
        return candidate / RESULTS_FILENAME
    # NOTE(review): this fallback ignores artifacts_root -- confirm that is intended.
    return run_dir / job_id / RESULTS_FILENAME


def _resolve_metadata_path(
    run_dir: Path,
    manifest: Mapping[str, Any],
    job: Mapping[str, Any],
    results_path: Path,
) -> Path | None:
    """Return the job's metadata path.

    An explicit ``metadata_relpath`` is returned without an existence check;
    otherwise the sibling ``metadata.json`` is returned only if it exists.
    """
    artifacts_root = _string_or_none(manifest.get("artifacts_root")) or "."
    relpath = _string_or_none(job.get("metadata_relpath"))
    if relpath:
        return run_dir / artifacts_root / relpath
    candidate = results_path.parent / METADATA_FILENAME
    return candidate if candidate.exists() else None


def _collision_entries(
    plans: Sequence[_PlannedConversion],
    *,
    existing_targets_fail: bool,
) -> list[ConversionEntry]:
    """Return ``failed`` entries for plans that cannot be written safely.

    Every plan sharing a target with another plan fails; a lone plan fails only
    when ``existing_targets_fail`` is set and its target already exists.
    """
    entries: list[ConversionEntry] = []
    by_target: dict[Path, list[_PlannedConversion]] = {}
    for plan in plans:
        by_target.setdefault(plan.target_dir, []).append(plan)

    for target, target_plans in sorted(by_target.items(), key=lambda item: str(item[0])):
        if len(target_plans) > 1:
            entries.extend(
                _entry_for_plan(plan, status="failed", reason="planned output path collision")
                for plan in target_plans
            )
        elif existing_targets_fail and target.exists():
            entries.append(_entry_for_plan(target_plans[0], status="failed", reason="target path already exists"))
    return entries


def _write_conversion(plan: _PlannedConversion) -> None:
    """Create ``plan.target_dir`` and write the converted results and metadata.

    Raises:
        OSError: If the target already exists or a file cannot be written.
        ValueError: If the source results contain an invalid row.
    """
    plan.target_dir.mkdir(parents=True, exist_ok=False)
    try:
        row_stats = _write_converted_results(plan)
        metadata = _converted_metadata(plan, row_stats=row_stats)
        (plan.target_dir / METADATA_FILENAME).write_text(
            json.dumps(metadata, indent=2, sort_keys=True) + "\n",
            encoding="utf-8",
        )
    except BaseException:
        # Do not leave a half-written directory behind: it would look like a
        # finished conversion and make every rerun fail with "already exists".
        shutil.rmtree(plan.target_dir, ignore_errors=True)
        raise


def _write_converted_results(plan: _PlannedConversion) -> dict[str, Any]:
    """Stream-convert the source JSONL into the target and return row statistics.

    Blank lines are skipped.

    Raises:
        ValueError: If a line is not valid JSON or is not a JSON object.
    """
    stats = _RowStats()
    with (
        plan.source_results.open("r", encoding="utf-8") as source,
        (plan.target_dir / RESULTS_FILENAME).open("w", encoding="utf-8") as target,
    ):
        for line_number, line in enumerate(source, start=1):
            if not line.strip():
                continue
            try:
                payload = json.loads(line)
            except ValueError as exc:
                raise ValueError(f"invalid JSON in {plan.source_results} line {line_number}: {exc}") from exc
            if not isinstance(payload, Mapping):
                raise ValueError(f"expected JSON object in {plan.source_results} line {line_number}")
            converted = _converted_result_row(payload)
            stats.add(converted)
            target.write(json.dumps(converted, sort_keys=True) + "\n")
    return stats.to_metadata()


def _converted_result_row(payload: Mapping[str, Any]) -> dict[str, Any]:
    """Return ``payload`` reshaped into the current result-row layout."""
    converted = dict(payload)
    converted["timing"] = _converted_timing(payload)
    # The flat millisecond fields are superseded by the nested timing block.
    for legacy_key in ("generation_ms", "scoring_ms", "total_ms"):
        converted.pop(legacy_key, None)

    converted["is_completed"] = bool(payload.get("is_completed", payload.get("error") is None))
    converted["is_truncated"] = bool(payload.get("is_truncated", False))
    converted["stop_condition"] = payload.get("stop_condition", "max_turns_reached")
    converted["metrics"] = _converted_metrics(payload)
    converted["tool_defs"] = payload.get("tool_defs", [])

    usage = _converted_token_usage(payload.get("token_usage"))
    if usage is not None:
        converted["token_usage"] = usage
    else:
        converted.pop("token_usage", None)

    return converted


def _converted_timing(payload: Mapping[str, Any]) -> dict[str, Any]:
    """Return the row's timing block, synthesizing it from legacy ``*_ms`` fields."""
    timing = payload.get("timing")
    if isinstance(timing, Mapping):
        return dict(timing)

    # NOTE(review): the arithmetic below assumes _milliseconds_to_seconds returns
    # a float even when the field is missing -- confirm against the helper.
    generation = _milliseconds_to_seconds(payload.get("generation_ms"))
    scoring = _milliseconds_to_seconds(payload.get("scoring_ms"))
    total = _milliseconds_to_seconds(payload.get("total_ms"))
    return {
        "setup": {"duration": 0.0, "spans": []},
        "generation": {"duration": generation, "start": 0.0, "end": generation},
        "scoring": {"duration": scoring, "start": generation, "end": generation + scoring},
        "model": {"duration": generation, "spans": [{"duration": generation, "start": 0.0, "end": generation}]},
        "env": {"duration": 0.0, "spans": []},
        "total": total,
        "overhead": max(0.0, total - generation - scoring),
    }


def _converted_metrics(payload: Mapping[str, Any]) -> dict[str, float]:
    """Return numeric metrics, falling back to legacy top-level fields."""
    converted: dict[str, float] = {}
    metrics = payload.get("metrics")
    if isinstance(metrics, Mapping):
        for key, value in metrics.items():
            # Store the helper's own result (as _RowStats.add does) so filtering
            # and conversion can never disagree.
            numeric = _float_or_none(value)
            if numeric is not None:
                converted[str(key)] = numeric
        return converted

    for key in ("accuracy", "num_turns"):
        numeric = _float_or_none(payload.get(key))
        if numeric is not None:
            converted[key] = numeric
    return converted


def _converted_token_usage(value: Any) -> dict[str, float] | None:
    """Normalize token usage to ``input_tokens`` / ``output_tokens`` keys.

    Accepts the current flat layout or the legacy ``{"model": {"prompt": ...,
    "completion": ...}}`` layout. Returns ``None`` when neither is present.
    """
    if not isinstance(value, Mapping):
        return None
    if "input_tokens" in value or "output_tokens" in value:
        input_tokens = _float_or_none(value.get("input_tokens")) or 0.0
        output_tokens = _float_or_none(value.get("output_tokens")) or 0.0
        usage = {"input_tokens": input_tokens, "output_tokens": output_tokens}
        final_input = _float_or_none(value.get("final_input_tokens"))
        final_output = _float_or_none(value.get("final_output_tokens"))
        # Final counts are only carried over as a complete pair.
        if final_input is not None and final_output is not None:
            usage["final_input_tokens"] = final_input
            usage["final_output_tokens"] = final_output
        return usage

    model_usage = value.get("model")
    if not isinstance(model_usage, Mapping):
        return None
    input_tokens = _float_or_none(model_usage.get("prompt")) or 0.0
    output_tokens = _float_or_none(model_usage.get("completion")) or 0.0
    return {
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "final_input_tokens": input_tokens,
        "final_output_tokens": output_tokens,
    }


@dataclass(slots=True)
class _RowStats:
    """Running totals over converted rows, used to rebuild metadata averages."""

    count: int = 0
    reward_total: float = 0.0
    reward_count: int = 0
    error_count: int = 0
    metric_totals: dict[str, float] = field(default_factory=dict)
    metric_counts: dict[str, int] = field(default_factory=dict)
    input_tokens_total: float = 0.0
    output_tokens_total: float = 0.0
    usage_count: int = 0
    final_input_tokens_total: float = 0.0
    final_output_tokens_total: float = 0.0
    final_usage_count: int = 0

    def add(self, row: Mapping[str, Any]) -> None:
        """Fold one converted row into the totals."""
        self.count += 1
        reward = _float_or_none(row.get("reward"))
        if reward is not None:
            self.reward_total += reward
            self.reward_count += 1
        if row.get("error") is not None:
            self.error_count += 1

        metrics = row.get("metrics")
        if isinstance(metrics, Mapping):
            for key, value in metrics.items():
                numeric = _float_or_none(value)
                if numeric is None:
                    continue
                metric_key = str(key)
                self.metric_totals[metric_key] = self.metric_totals.get(metric_key, 0.0) + numeric
                self.metric_counts[metric_key] = self.metric_counts.get(metric_key, 0) + 1

        usage = row.get("token_usage")
        if isinstance(usage, Mapping):
            input_tokens = _float_or_none(usage.get("input_tokens"))
            output_tokens = _float_or_none(usage.get("output_tokens"))
            if input_tokens is not None or output_tokens is not None:
                self.input_tokens_total += input_tokens or 0.0
                self.output_tokens_total += output_tokens or 0.0
                self.usage_count += 1
            final_input = _float_or_none(usage.get("final_input_tokens"))
            final_output = _float_or_none(usage.get("final_output_tokens"))
            if final_input is not None and final_output is not None:
                self.final_input_tokens_total += final_input
                self.final_output_tokens_total += final_output
                self.final_usage_count += 1

    def to_metadata(self) -> dict[str, Any]:
        """Return averages: row count, reward, error rate, metrics and usage.

        ``usage`` is ``None`` when no row reported token usage.
        """
        avg_metrics = {
            key: total / self.metric_counts[key]
            for key, total in sorted(self.metric_totals.items())
            if self.metric_counts.get(key)
        }

        usage: dict[str, float] | None = None
        if self.usage_count:
            usage = {
                "input_tokens": self.input_tokens_total / self.usage_count,
                "output_tokens": self.output_tokens_total / self.usage_count,
            }
            if self.final_usage_count:
                usage["final_input_tokens"] = self.final_input_tokens_total / self.final_usage_count
                usage["final_output_tokens"] = self.final_output_tokens_total / self.final_usage_count

        return {
            "row_count": self.count,
            "avg_reward": self.reward_total / self.reward_count if self.reward_count else 0.0,
            "avg_error": self.error_count / self.count if self.count else 0.0,
            "avg_metrics": avg_metrics,
            "usage": usage,
        }
metadata.get("usage")) + metadata.setdefault("version_info", {}) + metadata.setdefault("state_columns", []) + metadata.setdefault("tools", None) + metadata["env_id"] = plan.env_id + metadata["model"] = plan.model_id + metadata["num_examples"] = int(metadata.get("num_examples") or row_stats.get("row_count") or 0) + metadata["rollouts_per_example"] = int(metadata.get("rollouts_per_example") or 1) + return metadata + + +def _job_metrics(job: Mapping[str, Any]) -> dict[str, float]: + metrics = job.get("metrics") + if not isinstance(metrics, Mapping): + return {} + return {str(key): float(value) for key, value in metrics.items() if _float_or_none(value) is not None} + + +def _milliseconds_to_seconds(value: Any) -> float: + numeric = _float_or_none(value) + return 0.0 if numeric is None else numeric / 1000.0 + + +def _float_or_none(value: Any) -> float | None: + if value is None or isinstance(value, bool): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _entry_for_plan(plan: _PlannedConversion, *, status: str, reason: str) -> ConversionEntry: + return ConversionEntry( + run_id=plan.run_id, + job_id=_string_or_none(plan.job.get("job_id")), + status=status, + reason=reason, + source_results=str(plan.source_results), + target_dir=str(plan.target_dir), + ) + + +def _read_json_object(path: Path) -> dict[str, Any]: + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except (OSError, ValueError) as exc: + raise ValueError(f"failed to parse {path}: {exc}") from exc + if not isinstance(payload, dict): + raise ValueError(f"expected JSON object in {path}") + return payload + + +def _run_id(manifest: Mapping[str, Any], run_dir: Path) -> str: + return _string_or_none(manifest.get("run_id")) or run_dir.name + + +def _string_or_none(value: Any) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None + + +def _run_conversion_cli(*, raw_dir: Path, output_dir: Path, dry_run: bool, 
report_path: Path | None) -> int: + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=dry_run) + encoded = json.dumps(report.to_dict(), indent=2, sort_keys=True) + if report_path: + report_path.parent.mkdir(parents=True, exist_ok=True) + report_path.write_text(encoded + "\n", encoding="utf-8") + print(encoded) + return 1 if report.failed else 0 + + +class _HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): + pass + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=_HelpFormatter, + epilog=""" +Examples: + python scripts/convert_legacy_raw_runs.py + Preview conversion from runs/raw to runs/evals. + + python scripts/convert_legacy_raw_runs.py --no-dry-run --report-path report.json + Write converted eval-output directories and save the JSON report. + + python scripts/convert_legacy_raw_runs.py --raw-dir old/runs/raw --output-dir runs/evals + Preview conversion from a custom legacy raw-run directory. 
+""", + ) + parser.add_argument( + "--raw-dir", + type=Path, + default=Path("runs") / "raw", + help="legacy raw-run root directory containing */run_manifest.json files", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("runs") / "evals", + help="converted eval-output root directory", + ) + parser.add_argument( + "--dry-run", + action=argparse.BooleanOptionalAction, + default=True, + help="plan conversion without writing files; use --no-dry-run to write converted outputs", + ) + parser.add_argument( + "--report-path", + type=Path, + help="optional path for a JSON copy of the conversion report", + ) + return parser + + +def main(argv: Sequence[str] | None = None) -> int: + args = _build_parser().parse_args(argv) + return _run_conversion_cli( + raw_dir=args.raw_dir, + output_dir=args.output_dir, + dry_run=args.dry_run, + report_path=args.report_path, + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_cli/test_bench_child.py b/tests/test_cli/test_bench_child.py new file mode 100644 index 00000000..943c9d41 --- /dev/null +++ b/tests/test_cli/test_bench_child.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from medarc_verifiers.cli import bench_child +from medarc_verifiers.cli.env_lifecycle import EnvInstallState, EnvPackageRef + + +def _payload(tmp_path: Path) -> dict: + return { + "raw_config": {"env_id": "medqa", "model": "parent-model"}, + "overrides": {}, + "env_dir": str(tmp_path / "envs"), + "resume_path": str(tmp_path / "runs" / "evals" / "parent-model" / "medqa" / "base"), + "status_path": str(tmp_path / "status.json"), + "expected_env_id": "medqa", + "expected_model": "parent-model", + } + + +def _state(installed_by_child: bool) -> EnvInstallState: + ref = EnvPackageRef("medqa", "medqa", "medqa", Path("envs/medqa"), None) + return EnvInstallState(ref, installed_by_child, False, False) + + +def 
test_child_installs_builds_runs_and_cleans_up(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + calls: list[str] = [] + config = SimpleNamespace( + env_id="medqa", + model="parent-model", + model_copy=lambda update: SimpleNamespace(env_id="medqa", model="parent-model", **update), + ) + + monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: object()) + monkeypatch.setattr(bench_child, "ensure_installed", lambda ref: calls.append("install") or _state(True)) + monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: calls.append("build") or config) + monkeypatch.setattr(bench_child, "uninstall_if_child_installed", lambda state: calls.append("cleanup")) + + async def fake_run_evaluation(run_config): + calls.append(f"run:{run_config.resume_path}") + + monkeypatch.setattr(bench_child, "run_evaluation", fake_run_evaluation) + + status = bench_child._run_payload(_payload(tmp_path)) + + assert status["exit_code"] == 0 + assert status["installed_by_child"] is True + assert calls == [ + "install", + "build", + f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}", + "cleanup", + ] + + +def test_child_cleanup_env_package_false_skips_uninstall(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + calls: list[str] = [] + config = SimpleNamespace( + env_id="medqa", + model="parent-model", + model_copy=lambda update: SimpleNamespace(env_id="medqa", model="parent-model", **update), + ) + + monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: object()) + monkeypatch.setattr(bench_child, "ensure_installed", lambda ref: calls.append("install") or _state(True)) + monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: calls.append("build") or config) + monkeypatch.setattr(bench_child, "uninstall_if_child_installed", lambda state: calls.append("cleanup")) + + async def fake_run_evaluation(run_config): + calls.append(f"run:{run_config.resume_path}") + + 
monkeypatch.setattr(bench_child, "run_evaluation", fake_run_evaluation) + + payload = _payload(tmp_path) + payload["cleanup_env_package"] = False + status = bench_child._run_payload(payload) + + assert status["exit_code"] == 0 + assert status["installed_by_child"] is True + assert calls == ["install", "build", f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}"] + + +def test_child_env_preinstalled_skips_install_and_cleanup(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + calls: list[str] = [] + config = SimpleNamespace( + env_id="medqa", + model="parent-model", + model_copy=lambda update: SimpleNamespace(env_id="medqa", model="parent-model", **update), + ) + + monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: calls.append("resolve")) + monkeypatch.setattr(bench_child, "ensure_installed", lambda ref: calls.append("install") or _state(True)) + monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: calls.append("build") or config) + monkeypatch.setattr(bench_child, "uninstall_if_child_installed", lambda state: calls.append("cleanup")) + + async def fake_run_evaluation(run_config): + calls.append(f"run:{run_config.resume_path}") + + monkeypatch.setattr(bench_child, "run_evaluation", fake_run_evaluation) + + payload = _payload(tmp_path) + payload["env_preinstalled"] = True + payload["cleanup_env_package"] = False + status = bench_child._run_payload(payload) + + assert status["exit_code"] == 0 + assert status["installed_by_child"] is False + assert calls == ["build", f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}"] + + +def test_child_install_failure_does_not_build_or_cleanup(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + calls: list[str] = [] + + monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: object()) + + def fail_install(ref): + calls.append("install") + raise RuntimeError("install failed") + + 
monkeypatch.setattr(bench_child, "ensure_installed", fail_install) + monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: calls.append("build")) + monkeypatch.setattr(bench_child, "uninstall_if_child_installed", lambda state: calls.append("cleanup")) + + status = bench_child._run_payload(_payload(tmp_path)) + + assert status["exit_code"] == 1 + assert status["exit_reason"] == "eval_failed" + assert "install failed" in status["primary_error"] + assert calls == ["install"] + + +def test_child_cleanup_failure_after_success_is_fatal(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + config = SimpleNamespace( + env_id="medqa", + model="parent-model", + model_copy=lambda update: SimpleNamespace(env_id="medqa", model="parent-model", **update), + ) + + monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: object()) + monkeypatch.setattr(bench_child, "ensure_installed", lambda ref: _state(True)) + monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: config) + + async def fake_run_evaluation(run_config): + return None + + monkeypatch.setattr(bench_child, "run_evaluation", fake_run_evaluation) + + def fail_cleanup(state): + raise RuntimeError("cleanup failed") + + monkeypatch.setattr(bench_child, "uninstall_if_child_installed", fail_cleanup) + + status = bench_child._run_payload(_payload(tmp_path)) + + assert status["eval_ok"] is True + assert status["cleanup_ok"] is False + assert status["exit_code"] == 1 + assert status["exit_reason"] == "cleanup_failed" diff --git a/tests/test_cli/test_config_loader.py b/tests/test_cli/test_config_loader.py deleted file mode 100644 index b39e8d87..00000000 --- a/tests/test_cli/test_config_loader.py +++ /dev/null @@ -1,742 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path - -import pytest -from pydantic import ValidationError - -from medarc_verifiers.cli._config_loader import ConfigFormatError, 
load_run_config -from medarc_verifiers.cli._job_builder import build_jobs -from medarc_verifiers.cli._job_executor import ExecutorSettings, execute_jobs - - -@dataclass -class _FakeParam: - name: str - required: bool = False - choices: tuple | None = None - argparse_type: type | None = None - is_list: bool = False - element_type: type | None = None - action: str | None = None - kind: str | None = None - supports_cli: bool = True - - -def _write_yaml(path: Path, content: str) -> Path: - path.write_text(content) - return path - - -def test_load_run_config_parses_basic_yaml(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - config_path = _write_yaml( - tmp_path / "config.yaml", - """ - name: demo-run - models: - - id: gpt-mini - envs: - - id: medqa - jobs: - - model: gpt-mini - env: medqa - """, - ) - - config = load_run_config(config_path) - - assert config.name == "demo-run" - assert len(config.models) == 1 - assert "gpt-mini" in config.models - assert config.models["gpt-mini"].id == "gpt-mini" - assert len(config.envs) == 1 - assert "medqa" in config.envs - assert config.envs["medqa"].id == "medqa" - assert len(config.jobs) == 1 - assert config.jobs[0].model == "gpt-mini" - - -def test_load_run_config_supports_mapped_format(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - config_path = _write_yaml( - tmp_path / "mapped.yaml", - """ - models: - gpt-mini: - model: openai/gpt-mini - envs: - medqa: - num_examples: 5 - jobs: - - model: gpt-mini - env: medqa - """, - ) - - config = load_run_config(config_path) - - assert set(config.models) == {"gpt-mini"} - assert config.models["gpt-mini"].model == "openai/gpt-mini" - assert set(config.envs) == {"medqa"} - assert config.envs["medqa"].num_examples == 5 - - -def 
test_load_run_config_rejects_non_mapping_root(tmp_path: Path) -> None: - config_path = _write_yaml( - tmp_path / "invalid.yaml", - """ - - not: a-mapping - """, - ) - - with pytest.raises(ConfigFormatError): - load_run_config(config_path) - - -def test_model_headers_validation(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - config_path = _write_yaml( - tmp_path / "headers.yaml", - """ - models: - - id: bad - headers: - - 123 - envs: - - id: medqa - jobs: - - model: bad - env: medqa - """, - ) - - with pytest.raises(ValidationError): - load_run_config(config_path) - - -def test_environment_num_examples_validation(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - config_path = _write_yaml( - tmp_path / "envs.yaml", - """ - envs: - - id: medqa - num_examples: 0 - jobs: - - model: gpt - env: medqa - models: - - id: gpt - """, - ) - - with pytest.raises(ValidationError): - load_run_config(config_path) - - -def test_environment_env_args_unknown(monkeypatch, tmp_path: Path) -> None: - def fake_metadata(_env_id: str, cache=None): - return [_FakeParam("shuffle_seed"), _FakeParam("shuffle_answers")] - - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - fake_metadata, - ) - - config_path = _write_yaml( - tmp_path / "unknown_env_arg.yaml", - """ - envs: - - id: medqa - env_args: - invalid_param: true - models: - - id: gpt - jobs: - - model: gpt - env: medqa - """, - ) - - with pytest.raises(ValueError): - load_run_config(config_path) - - -def test_environment_env_args_known(monkeypatch, tmp_path: Path) -> None: - def fake_metadata(_env_id: str, cache=None): - return [_FakeParam("shuffle_answers"), _FakeParam("shuffle_seed")] - - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - fake_metadata, - ) 
- - config_path = _write_yaml( - tmp_path / "known_env_arg.yaml", - """ - envs: - - id: medqa - env_args: - shuffle_answers: true - models: - - id: gpt - jobs: - - model: gpt - env: medqa - """, - ) - - config = load_run_config(config_path) - - assert config.envs["medqa"].env_args == {"shuffle_answers": True} - - -def test_env_paths_resolve_with_cli_default_root(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - configs_dir = tmp_path / "configs" - envs_dir = configs_dir / "envs" - envs_dir.mkdir(parents=True) - _write_yaml( - envs_dir / "medqa.yaml", - """ - - id: medqa - module: medqa - """, - ) - config_path = _write_yaml( - configs_dir / "jobs.yaml", - """ - models: - - id: gpt - envs: - - medqa - jobs: - - model: gpt - env: medqa - """, - ) - - config = load_run_config(config_path, env_default_root=envs_dir) - - assert "medqa" in config.envs - - -def test_env_paths_use_env_config_root(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - shared_envs = tmp_path / "shared_envs" - shared_envs.mkdir() - _write_yaml( - shared_envs / "custom_env.yaml", - """ - - id: custom_env - module: custom_env - """, - ) - config_path = _write_yaml( - tmp_path / "jobs.yaml", - """ - models: - - id: gpt - envs: - - custom_env - jobs: - - model: gpt - env: custom_env - """, - ) - - config = load_run_config(config_path, env_default_root=shared_envs) - - assert "custom_env" in config.envs - - -def test_envs_auto_discovered_from_env_config_root(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - env_root = tmp_path / "auto_envs" - env_root.mkdir() - _write_yaml( - env_root / "auto.yaml", - """ - - id: auto_env - module: auto_env - """, - ) - config_path = 
_write_yaml( - tmp_path / "config.yaml", - """ - models: - - id: auto-model - jobs: - - model: auto-model - env: auto_env - """, - ) - - config = load_run_config(config_path, env_default_root=env_root) - - assert sorted(config.envs) == ["auto_env"] - - -def test_environment_env_args_missing_required(monkeypatch, tmp_path: Path) -> None: - def fake_metadata(_env_id: str, cache=None): - return [_FakeParam("subset", required=True)] - - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - fake_metadata, - ) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_env_metadata", - fake_metadata, - ) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda _path, cache=None: {}, - ) - - async def fail_if_called(*_args, **_kwargs): # pragma: no cover - sanity guard - raise AssertionError("run_evaluation should not execute when env args are invalid.") - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fail_if_called) - - config_path = _write_yaml( - tmp_path / "missing_required.yaml", - """ - envs: - - id: medqa - models: - - id: gpt - jobs: - - model: gpt - env: medqa - """, - ) - - config = load_run_config(config_path) - jobs = build_jobs(config) - - env_dir = tmp_path / "envs" - env_dir.mkdir() - - settings = ExecutorSettings( - run_id="run-1", - output_dir=tmp_path / "runs", - env_dir=env_dir, - endpoints_path=tmp_path / "endpoints.yaml", - default_api_key_var="API_KEY", - default_api_base_url="https://api.example", - dry_run=False, - ) - - results = execute_jobs(jobs, settings) - - assert results[0].status == "failed" - assert results[0].error is not None - assert "Missing required environment arguments" in results[0].error - - -def test_environment_env_args_type_validation(monkeypatch, tmp_path: Path) -> None: - def fake_metadata(_env_id: str, cache=None): - return [_FakeParam("shuffle_seed", argparse_type=int)] - - monkeypatch.setattr( - 
"medarc_verifiers.cli._config_loader.load_env_metadata", - fake_metadata, - ) - - config_path = _write_yaml( - tmp_path / "invalid_type.yaml", - """ - envs: - - id: medqa - env_args: - shuffle_seed: wrong - models: - - id: gpt - jobs: - - model: gpt - env: medqa - """, - ) - - with pytest.raises(ValueError): - load_run_config(config_path) - - -def test_matrix_expansion_generates_variants(monkeypatch, tmp_path: Path) -> None: - def fake_metadata(env_id: str, cache=None): # noqa: ARG001 - return [ - _FakeParam("shuffle_answers"), - _FakeParam("shuffle_seed", argparse_type=int), - ] - - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - fake_metadata, - ) - - config_path = _write_yaml( - tmp_path / "matrix.yaml", - """ - envs: - - id: medqa - module: medqa - env_args: - shuffle_answers: true - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-r{shuffle_seed}" - models: - - id: gpt - jobs: - - model: gpt - env: medqa - """, - ) - - config = load_run_config(config_path) - - env_ids = list(config.envs.keys()) - assert env_ids == ["medqa-r1618", "medqa-r9331"] - assert all(env.matrix is None for env in config.envs.values()) - assert all(env.matrix_base_id == "medqa" for env in config.envs.values()) - assert {env.env_args["shuffle_seed"] for env in config.envs.values()} == {1618, 9331} - assert all(env.env_args["shuffle_answers"] is True for env in config.envs.values()) - assert all(env.module == "medqa" for env in config.envs.values()) - - -def test_duplicate_env_ids_from_files_expand_variants(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - - env_dir = tmp_path / "envs" - env_dir.mkdir() - _write_yaml( - env_dir / "longhealth.yaml", - """ - - id: longhealth - module: longhealth - matrix: - task: ["task1", "task2"] - matrix_id_format: "{base}-{task}" - - id: longhealth - module: longhealth - matrix: - task: ["task3"] 
- matrix_id_format: "{base}-{task}-alt" - """, - ) - - config_path = _write_yaml( - tmp_path / "config.yaml", - f""" - envs: - - "{env_dir}" - models: - - id: gpt - jobs: - - model: gpt - env: longhealth - """, - ) - - config = load_run_config(config_path) - - assert sorted(config.envs.keys()) == ["longhealth-task1", "longhealth-task2", "longhealth-task3-alt"] - assert all(env.matrix_base_id == "longhealth" for env in config.envs.values()) - - -def test_matrix_exclude_and_scalar_fields(monkeypatch, tmp_path: Path) -> None: - def fake_metadata(env_id: str, cache=None): # noqa: ARG001 - return [_FakeParam("shuffle_seed", argparse_type=int)] - - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - fake_metadata, - ) - - config_path = _write_yaml( - tmp_path / "matrix_scalar.yaml", - """ - envs: - - id: medqa - module: medqa - matrix: - num_examples: [10, 20] - shuffle_seed: [1618, 9331] - matrix_exclude: - - num_examples: 20 - shuffle_seed: 9331 - matrix_id_format: "{base}-n{num_examples}-r{shuffle_seed}" - models: - - id: gpt - jobs: - - model: gpt - env: medqa - """, - ) - - config = load_run_config(config_path) - - env_ids = sorted(config.envs) - assert env_ids == ["medqa-n10-r1618", "medqa-n10-r9331", "medqa-n20-r1618"] - - num_examples = {env_id: env.num_examples for env_id, env in config.envs.items()} - assert num_examples["medqa-n10-r1618"] == 10 - assert num_examples["medqa-n10-r9331"] == 10 - assert num_examples["medqa-n20-r1618"] == 20 - - assert all("shuffle_seed" in env.env_args for env in config.envs.values()) - assert "medqa-n20-r9331" not in env_ids - - -def test_legacy_model_params_adapter(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - - config_path = _write_yaml( - tmp_path / "legacy_model.yaml", - """ - models: - - id: gpt - params: - model: openai/gpt - env_overrides: - medqa: - shuffle_answers: true - envs: - 
- id: medqa - module: medqa - jobs: - - model: gpt - env: medqa - """, - ) - - config = load_run_config(config_path) - - assert len(config.models) == 1 - model_cfg = config.models["gpt"] - assert model_cfg.model == "openai/gpt" - assert model_cfg.env_overrides == {"medqa": {"shuffle_answers": True}} - assert model_cfg.env_args == {} - - -def test_envs_can_reference_yaml_file(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - env_file = tmp_path / "envs.yaml" - env_file.write_text( - """ - - id: included-env - module: included_env - num_examples: 3 - """, - ) - config_path = _write_yaml( - tmp_path / "config.yaml", - f""" - models: - gpt-mini: - model: openai/gpt-mini - envs: "{env_file.name}" - jobs: - - model: gpt-mini - env: included-env - """, - ) - - config = load_run_config(config_path) - - assert set(config.envs) == {"included-env"} - assert config.envs["included-env"].num_examples == 3 - - -def test_envs_can_reference_directory(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - env_dir = tmp_path / "envs" - env_dir.mkdir() - (env_dir / "a.yaml").write_text( - """ - - id: env-a - num_examples: 2 - """, - ) - (env_dir / "b.yaml").write_text( - """ - - id: env-b - num_examples: 4 - """, - ) - - config_path = _write_yaml( - tmp_path / "config.yaml", - """ - models: - gpt-mini: - model: openai/gpt-mini - envs: "envs" - jobs: - - model: gpt-mini - env: env-a - """, - ) - - config = load_run_config(config_path) - - assert set(config.envs) == {"env-a", "env-b"} - assert config.envs["env-b"].num_examples == 4 - - -def test_included_file_strict_shapes(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - - bad_mapping = tmp_path / "bad_mapping.yaml" 
- bad_mapping.write_text( - """ - env-basic: - id: env-basic - env-invalid: 1 - """, - ) - - config_path = _write_yaml( - tmp_path / "config_bad_mapping.yaml", - f""" - models: - gpt-mini: - model: openai/gpt-mini - envs: "{bad_mapping.name}" - jobs: - - model: gpt-mini - env: env-basic - """, - ) - - with pytest.raises(ValueError) as excinfo: - load_run_config(config_path) - assert "mapping of id" in str(excinfo.value) - - bad_list = tmp_path / "bad_list.yaml" - bad_list.write_text( - """ - - id: env-basic - - 42 - """, - ) - - config_path = _write_yaml( - tmp_path / "config_bad_list.yaml", - f""" - models: - gpt-mini: - model: openai/gpt-mini - envs: "{bad_list.name}" - jobs: - - model: gpt-mini - env: env-basic - """, - ) - - with pytest.raises(ValueError) as excinfo: - load_run_config(config_path) - assert "must be a mapping" in str(excinfo.value) - - -def test_models_can_reference_yaml_file(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - models_file = tmp_path / "models.yaml" - models_file.write_text( - """ - - id: model-a - model: openai/gpt-a - sampling_args: - max_tokens: 256 - """, - ) - config_path = _write_yaml( - tmp_path / "config.yaml", - f""" - models: "{models_file.name}" - envs: - env-a: - num_examples: 5 - jobs: - - model: model-a - env: env-a - """, - ) - - config = load_run_config(config_path) - - assert set(config.models) == {"model-a"} - assert config.models["model-a"].sampling_args["max_tokens"] == 256 - - -def test_jobs_can_reference_yaml_file(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - jobs_file = tmp_path / "jobs.yaml" - jobs_file.write_text( - """ - - model: gpt-mini - env: env-a - """, - ) - config_path = _write_yaml( - tmp_path / "config.yaml", - f""" - models: - gpt-mini: - model: openai/gpt-mini - envs: - 
env-a: - num_examples: 5 - jobs: "{jobs_file.name}" - """, - ) - - config = load_run_config(config_path) - - assert len(config.jobs) == 1 - assert config.jobs[0].model == "gpt-mini" diff --git a/tests/test_cli/test_env_lifecycle.py b/tests/test_cli/test_env_lifecycle.py new file mode 100644 index 00000000..5e37ffcb --- /dev/null +++ b/tests/test_cli/test_env_lifecycle.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from medarc_verifiers.cli import env_lifecycle + + +def _write_env(root: Path, folder: str, *, project_name: str, loader: str | None = None) -> Path: + env_path = root / folder + env_path.mkdir(parents=True) + loader_block = "" + if loader is not None: + loader_block = f'\n[tool.prime.environment]\nloader = "{loader}"\n' + (env_path / "pyproject.toml").write_text( + f'[project]\nname = "{project_name}"\n{loader_block}', + encoding="utf-8", + ) + return env_path + + +def test_resolve_env_package_uses_upstream_module_and_project_name(tmp_path: Path) -> None: + env_path = _write_env(tmp_path, "head_qa_v2", project_name="head-qa-v2", loader="other:load_environment") + + ref = env_lifecycle.resolve_env_package("owner/head-qa-v2", tmp_path) + + assert ref.env_id == "owner/head-qa-v2" + assert ref.module_name == "head_qa_v2" + assert ref.project_name == "head-qa-v2" + assert ref.env_path == env_path + assert ref.loader == "other:load_environment" + + +def test_resolve_env_package_errors_for_missing_pyproject(tmp_path: Path) -> None: + (tmp_path / "medqa").mkdir() + + with pytest.raises(FileNotFoundError, match="missing pyproject.toml"): + env_lifecycle.resolve_env_package("medqa", tmp_path) + + +def test_inspect_install_state_rejects_installed_distribution_without_module(monkeypatch: pytest.MonkeyPatch) -> None: + ref = env_lifecycle.EnvPackageRef("medqa", "medqa", "medqa", Path("envs/medqa"), None) + monkeypatch.setattr(env_lifecycle, "_distribution_exists", lambda name: 
True) + monkeypatch.setattr(env_lifecycle, "_module_importable", lambda name: False) + + with pytest.raises(ModuleNotFoundError, match="upstream module 'medqa' is not importable"): + env_lifecycle.inspect_install_state(ref) + + +def test_ensure_installed_installs_missing_package(monkeypatch: pytest.MonkeyPatch) -> None: + ref = env_lifecycle.EnvPackageRef("medqa", "medqa", "medqa", Path("envs/medqa"), None) + calls: list[list[str]] = [] + importable = [False, True] + + monkeypatch.setattr(env_lifecycle, "_distribution_exists", lambda name: False) + monkeypatch.setattr(env_lifecycle, "_module_importable", lambda name: importable.pop(0)) + monkeypatch.setattr(env_lifecycle.importlib, "invalidate_caches", lambda: None) + monkeypatch.setattr( + env_lifecycle.subprocess, + "run", + lambda cmd, check: calls.append(cmd) or SimpleNamespace(returncode=0), + ) + + state = env_lifecycle.ensure_installed(ref) + + assert state.installed_by_child is True + assert calls[0][:4] == ["uv", "pip", "install", "--python"] + + +def test_uninstall_only_child_installed_packages(monkeypatch: pytest.MonkeyPatch) -> None: + ref = env_lifecycle.EnvPackageRef("medqa", "medqa", "medqa", Path("envs/medqa"), None) + state = env_lifecycle.EnvInstallState(ref, True, False, False) + calls: list[list[str]] = [] + + monkeypatch.setattr(env_lifecycle.importlib, "invalidate_caches", lambda: None) + monkeypatch.setattr( + env_lifecycle.subprocess, + "run", + lambda cmd, check: calls.append(cmd) or SimpleNamespace(returncode=0), + ) + + env_lifecycle.uninstall_if_child_installed(state) + + assert calls[0][:4] == ["uv", "pip", "uninstall", "--python"] diff --git a/tests/test_cli/test_eval_builder.py b/tests/test_cli/test_eval_builder.py deleted file mode 100644 index ab9da748..00000000 --- a/tests/test_cli/test_eval_builder.py +++ /dev/null @@ -1,166 +0,0 @@ -from __future__ import annotations - -import pytest - -from medarc_verifiers.cli._eval_builder import build_client_config -from 
medarc_verifiers.cli._schemas import ModelConfigSchema -from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL - - -def test_build_client_config_populates_endpoint_configs_for_replicas() -> None: - model_cfg = ModelConfigSchema(id="alias-model", headers={"X-Test": "1"}) - endpoints = { - "alias-model": [ - {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-a.example/v1"}, - {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-b.example/v1"}, - {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-c.example/v1"}, - ] - } - - resolved_model, client_config, sampling_overrides = build_client_config( - model_cfg, - endpoints=endpoints, - default_api_key_var="DEFAULT_KEY", - default_api_key_var_explicit=False, - default_api_base_url="https://default.example/v1", - api_base_url_override=None, - http_max_retries_override=None, - timeout_override=None, - headers=None, - ) - - assert resolved_model == "resolved-model" - assert client_config.api_base_url == "https://endpoint-a.example/v1" - assert client_config.api_key_var == "MODEL_KEY" - assert sampling_overrides == {} - assert [entry.api_base_url for entry in client_config.endpoint_configs] == [ - "https://endpoint-a.example/v1", - "https://endpoint-b.example/v1", - "https://endpoint-c.example/v1", - ] - assert all(entry.api_key_var == "MODEL_KEY" for entry in client_config.endpoint_configs) - assert all(entry.extra_headers == {"X-Test": "1"} for entry in client_config.endpoint_configs) - - -def test_build_client_config_api_base_url_override_suppresses_endpoint_configs() -> None: - model_cfg = ModelConfigSchema(id="alias-model") - endpoints = { - "alias-model": [ - {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-a.example/v1"}, - {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-b.example/v1"}, - {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-c.example/v1"}, - ] - } - - _, 
client_config, _ = build_client_config( - model_cfg, - endpoints=endpoints, - default_api_key_var="DEFAULT_KEY", - default_api_key_var_explicit=False, - default_api_base_url="https://default.example/v1", - api_base_url_override="http://127.0.0.1:8000/v1", - http_max_retries_override=None, - timeout_override=None, - headers=None, - ) - - assert client_config.api_base_url == "http://127.0.0.1:8000/v1" - assert client_config.endpoint_configs == [] - - -def test_build_client_config_replicas_must_share_model_and_key() -> None: - model_cfg = ModelConfigSchema(id="alias-model") - endpoints = { - "alias-model": [ - {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-a.example/v1"}, - {"model": "resolved-model", "key": "MODEL_KEY_B", "url": "https://endpoint-b.example/v1"}, - ] - } - - with pytest.raises(ValueError, match="must agree on 'model' and 'key'"): - build_client_config( - model_cfg, - endpoints=endpoints, - default_api_key_var="DEFAULT_KEY", - default_api_key_var_explicit=False, - default_api_base_url="https://default.example/v1", - api_base_url_override=None, - http_max_retries_override=None, - timeout_override=None, - headers=None, - ) - - -def test_build_client_config_prime_base_url_forces_prime_key_when_non_explicit() -> None: - model_cfg = ModelConfigSchema(model="prime-model") - - _, client_config, _ = build_client_config( - model_cfg, - endpoints={}, - default_api_key_var="OPENAI_API_KEY", - default_api_key_var_explicit=False, - default_api_base_url=PRIME_INFERENCE_URL, - api_base_url_override=None, - http_max_retries_override=None, - timeout_override=None, - headers=None, - ) - - assert client_config.api_base_url == PRIME_INFERENCE_URL - assert client_config.api_key_var == "PRIME_API_KEY" - - -def test_build_client_config_prime_registry_keeps_endpoint_key_var() -> None: - model_cfg = ModelConfigSchema(model="prime-model") - endpoints = { - "prime-model": [ - { - "model": "prime-model-resolved", - "key": "CUSTOM_KEY", - "url": 
PRIME_INFERENCE_URL, - } - ] - } - - _, client_config, _ = build_client_config( - model_cfg, - endpoints=endpoints, - default_api_key_var="OPENAI_API_KEY", - default_api_key_var_explicit=False, - default_api_base_url="https://default.example/v1", - api_base_url_override=None, - http_max_retries_override=None, - timeout_override=None, - headers=None, - ) - - assert client_config.api_base_url == PRIME_INFERENCE_URL - assert client_config.api_key_var == "CUSTOM_KEY" - - -@pytest.mark.parametrize( - ("model_cfg", "default_key_var", "default_key_var_explicit", "expected"), - [ - (ModelConfigSchema(model="prime-model", api_key_var="MODEL_KEY"), "OPENAI_API_KEY", False, "MODEL_KEY"), - (ModelConfigSchema(model="prime-model"), "CUSTOM_KEY", True, "CUSTOM_KEY"), - ], -) -def test_build_client_config_prime_base_url_respects_explicit_key_var( - model_cfg: ModelConfigSchema, - default_key_var: str, - default_key_var_explicit: bool, - expected: str, -) -> None: - _, client_config, _ = build_client_config( - model_cfg, - endpoints={}, - default_api_key_var=default_key_var, - default_api_key_var_explicit=default_key_var_explicit, - default_api_base_url=PRIME_INFERENCE_URL, - api_base_url_override=None, - http_max_retries_override=None, - timeout_override=None, - headers=None, - ) - - assert client_config.api_key_var == expected diff --git a/tests/test_cli/test_eval_identity.py b/tests/test_cli/test_eval_identity.py new file mode 100644 index 00000000..6172e549 --- /dev/null +++ b/tests/test_cli/test_eval_identity.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from medarc_verifiers.cli.eval_identity import ( + BASE_VARIANT_ID, + generate_variant_id, + plan_eval_paths, + slug_component, +) + + +def test_unique_model_env_path_uses_base_variant_directory(tmp_path: Path) -> None: + [plan] = plan_eval_paths( + [{"model": "openai/gpt-5-mini", "env_id": "medqa"}], + output_root=tmp_path / "runs" / "evals", + ) + + assert 
plan.identity.model_id == "openai/gpt-5-mini" + assert plan.identity.env_id == "medqa" + assert plan.identity.variant_id == BASE_VARIANT_ID + assert plan.results_path == tmp_path / "runs" / "evals" / "openai-gpt-5-mini" / "medqa" / "base" + + +def test_explicit_variant_id_controls_variant_directory(tmp_path: Path) -> None: + [plan] = plan_eval_paths( + [{"model": "gpt-5-mini", "env_id": "medqa", "variant_id": "shuffle_seed-1618"}], + output_root=tmp_path, + ) + + assert plan.identity.variant_id == "shuffle_seed-1618" + assert plan.results_path == tmp_path / "gpt-5-mini" / "medqa" / "shuffle_seed-1618" + + +def test_name_is_variant_id_alias(tmp_path: Path) -> None: + [plan] = plan_eval_paths( + [{"model": "gpt-5-mini", "env_id": "medqa", "name": "seed-1618"}], + output_root=tmp_path, + ) + + assert plan.identity.variant_id == "seed-1618" + assert plan.results_path == tmp_path / "gpt-5-mini" / "medqa" / "seed-1618" + + +def test_name_can_template_expanded_env_args(tmp_path: Path) -> None: + [plan] = plan_eval_paths( + [ + { + "model": "gpt-5-mini", + "env_id": "medqa", + "env_args": {"shuffle_seed": 1618}, + "name": "shuffle_seed-{env_args.shuffle_seed}", + } + ], + output_root=tmp_path, + ) + + assert plan.identity.variant_id == "shuffle_seed-1618" + + +def test_matching_name_and_variant_id_are_allowed(tmp_path: Path) -> None: + [plan] = plan_eval_paths( + [{"model": "gpt-5-mini", "env_id": "medqa", "name": "base", "variant_id": "base"}], + output_root=tmp_path, + ) + + assert plan.identity.variant_id == "base" + + +def test_conflicting_name_and_variant_id_fail(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="conflicting variant_id/name"): + plan_eval_paths( + [{"model": "gpt-5-mini", "env_id": "medqa", "name": "left", "variant_id": "right"}], + output_root=tmp_path, + ) + + +def test_variant_id_must_be_path_safe(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="variant_id .* is not path-safe"): + plan_eval_paths( + [{"model": 
"gpt-5-mini", "env_id": "medqa", "variant_id": "shuffle seed = 1618"}], + output_root=tmp_path, + ) + + +def test_name_template_result_must_be_path_safe(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="name .* is not path-safe"): + plan_eval_paths( + [ + { + "model": "gpt-5-mini", + "env_id": "medqa", + "env_args": {"difficulty": "very hard"}, + "name": "{env_args.difficulty}", + } + ], + output_root=tmp_path, + ) + + +def test_duplicate_model_env_requires_explicit_variant(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="Duplicate TOML eval identity"): + plan_eval_paths( + [ + {"model": "gpt-5-mini", "env_id": "medqa"}, + {"model": "gpt-5-mini", "env_id": "medqa"}, + ], + output_root=tmp_path, + ) + + +def test_same_variant_condition_across_models_keeps_same_variant_id(tmp_path: Path) -> None: + plans = plan_eval_paths( + [ + {"model": "gpt-5-mini", "env_id": "medqa", "variant_id": "seed-1618"}, + {"model": "gpt-5", "env_id": "medqa", "variant_id": "seed-1618"}, + ], + output_root=tmp_path, + ) + + assert [plan.identity.variant_id for plan in plans] == ["seed-1618", "seed-1618"] + assert plans[0].results_path == tmp_path / "gpt-5-mini" / "medqa" / "seed-1618" + assert plans[1].results_path == tmp_path / "gpt-5" / "medqa" / "seed-1618" + + +def test_slug_collisions_fail(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="model slug collision"): + plan_eval_paths( + [ + {"model": "openai/gpt", "env_id": "medqa"}, + {"model": "openai:gpt", "env_id": "pubmedqa"}, + ], + output_root=tmp_path, + ) + + +def test_legacy_variant_generator_remains_stable_for_export_config_lookup() -> None: + payload = {"env_args": {"shuffle_seed": 1618}} + + assert generate_variant_id(payload) == "env_args.shuffle_seed-1618" + + +def test_slug_component_is_path_safe_and_stable_for_long_values() -> None: + slug = slug_component(" openai/gpt-5:mini " + "x" * 120) + + assert "/" not in slug + assert ":" not in slug + assert len(slug) <= 80 + assert 
slug == slug_component(" openai/gpt-5:mini " + "x" * 120) diff --git a/tests/test_cli/test_isolated_env.py b/tests/test_cli/test_isolated_env.py new file mode 100644 index 00000000..1c11b1eb --- /dev/null +++ b/tests/test_cli/test_isolated_env.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +import json +import os +import subprocess +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from medarc_verifiers.cli import isolated_env + + +class FakeDistribution: + def __init__(self, *, version: str = "1.2.3", direct_url: dict | None = None) -> None: + self.version = version + self._direct_url = direct_url + + def read_text(self, name: str) -> str | None: + if name != "direct_url.json" or self._direct_url is None: + return None + return json.dumps(self._direct_url) + + +def test_current_medarc_install_spec_detects_editable_checkout( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + (tmp_path / "pyproject.toml").write_text("[project]\nname = 'medarc-verifiers'\n", encoding="utf-8") + (tmp_path / "medarc_verifiers").mkdir() + direct_url = {"url": tmp_path.as_uri(), "dir_info": {"editable": True}} + + monkeypatch.setattr(isolated_env.metadata, "distribution", lambda name: FakeDistribution(direct_url=direct_url)) + + spec = isolated_env.current_medarc_install_spec() + + assert spec.editable is True + assert spec.checkout_root == tmp_path + + +def test_current_medarc_install_spec_rejects_invalid_editable_checkout( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + direct_url = {"url": tmp_path.as_uri(), "dir_info": {"editable": True}} + monkeypatch.setattr(isolated_env.metadata, "distribution", lambda name: FakeDistribution(direct_url=direct_url)) + + with pytest.raises(isolated_env.IsolatedEnvError, match="invalid"): + isolated_env.current_medarc_install_spec() + + +def test_install_medarc_non_editable_uses_pinned_version(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + commands: 
list[list[str]] = [] + + monkeypatch.setattr( + isolated_env, "current_medarc_install_spec", lambda: isolated_env.MedarcInstallSpec(False, "9.8.7") + ) + monkeypatch.setattr(isolated_env, "_run_uv", lambda command, action: commands.append(command)) + + isolated_env.install_medarc_into_venv(tmp_path / "python") + + assert commands == [["uv", "pip", "install", "--python", str(tmp_path / "python"), "medarc-verifiers==9.8.7"]] + + +def test_install_medarc_non_editable_resolution_failure_is_actionable( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr( + isolated_env, "current_medarc_install_spec", lambda: isolated_env.MedarcInstallSpec(False, "9.8.7") + ) + + def fail(command: list[str], action: str) -> None: + raise isolated_env.IsolatedEnvError("resolver failed") + + monkeypatch.setattr(isolated_env, "_run_uv", fail) + + with pytest.raises(isolated_env.IsolatedEnvError, match="preinstall environment packages"): + isolated_env.install_medarc_into_venv(tmp_path / "python") + + +def test_temporary_bench_venv_cleans_up(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + venv_path = tmp_path / "venv" + python_path = venv_path / "bin" / "python" + created: list[Path] = [] + + def fake_mkdtemp(prefix: str) -> str: + venv_path.mkdir(parents=True) + python_path.parent.mkdir(parents=True) + python_path.write_text("", encoding="utf-8") + return str(venv_path) + + monkeypatch.setattr(isolated_env.tempfile, "mkdtemp", fake_mkdtemp) + monkeypatch.setattr(isolated_env, "_create_venv", lambda path: created.append(path) or python_path) + monkeypatch.setattr(isolated_env, "install_medarc_into_venv", lambda python, repo_root=None: None) + + with isolated_env.temporary_bench_venv() as python: + assert python == python_path + assert venv_path.exists() + + assert created == [venv_path] + assert not venv_path.exists() + + +def test_run_uv_reports_missing_uv(monkeypatch: pytest.MonkeyPatch) -> None: + def missing_uv(*args, **kwargs): + raise 
FileNotFoundError("uv") + + monkeypatch.setattr(isolated_env.subprocess, "run", missing_uv) + + with pytest.raises(isolated_env.IsolatedEnvError, match="uv is not installed"): + isolated_env._run_uv(["uv", "venv"], "create venv") + + +def test_run_uv_reports_failing_command(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + isolated_env.subprocess, + "run", + lambda *args, **kwargs: SimpleNamespace(returncode=2, stderr="bad\nerror", stdout=""), + ) + + with pytest.raises(isolated_env.IsolatedEnvError, match="error"): + isolated_env._run_uv(["uv", "venv"], "create venv") + + +@pytest.mark.skipif( + os.environ.get("MEDARC_RUN_ISOLATED_ENV_SMOKE") != "1", + reason="set MEDARC_RUN_ISOLATED_ENV_SMOKE=1 to run the real uv isolated-env smoke", +) +def test_temporary_bench_venv_real_helper_imports_bench_child() -> None: + with isolated_env.temporary_bench_venv() as python: + completed = subprocess.run( + [str(python), "-m", "medarc_verifiers.cli.bench_child", "--help"], + check=False, + capture_output=True, + text=True, + ) + + assert completed.returncode == 0 + assert "Run one TOML bench eval child payload" in completed.stdout diff --git a/tests/test_cli/test_job_builder.py b/tests/test_cli/test_job_builder.py deleted file mode 100644 index 39b0d22c..00000000 --- a/tests/test_cli/test_job_builder.py +++ /dev/null @@ -1,182 +0,0 @@ -from __future__ import annotations - -from pathlib import Path - -import pytest - -from medarc_verifiers.cli._config_loader import load_run_config -from medarc_verifiers.cli._job_builder import ResolvedJob, build_jobs - - -def _write_yaml(path: Path, content: str) -> Path: - path.write_text(content) - return path - - -def _stub_metadata(monkeypatch) -> None: - monkeypatch.setattr( - "medarc_verifiers.cli._config_loader.load_env_metadata", - lambda _env_id, cache=None: [], - ) - - -def test_build_jobs_basic(monkeypatch, tmp_path: Path) -> None: - _stub_metadata(monkeypatch) - config_path = _write_yaml( - tmp_path / "config.yaml", 
- """ - models: - gpt-mini: - model: openai/gpt-mini - envs: - medqa: - num_examples: 5 - jobs: - - model: gpt-mini - env: medqa - """, - ) - - run_config = load_run_config(config_path) - jobs = build_jobs(run_config) - - assert len(jobs) == 1 - job = jobs[0] - assert isinstance(job, ResolvedJob) - assert job.job_id == "gpt-mini-medqa" - assert job.env.id == "medqa" - assert job.env_args == {} - assert job.sampling_args == {} - - -def test_env_args_precedence(monkeypatch, tmp_path: Path) -> None: - _stub_metadata(monkeypatch) - config_path = _write_yaml( - tmp_path / "config.yaml", - """ - models: - gpt-mini: - env_args: - shared: model - model_only: 1 - env_overrides: - medqa: - shared: override - override_only: true - envs: - medqa: - env_args: - shared: env - env_only: 2 - jobs: - - model: gpt-mini - env: medqa - env_args: - shared: job - job_only: 3 - """, - ) - - run_config = load_run_config(config_path) - jobs = build_jobs(run_config) - - assert len(jobs) == 1 - env_args = jobs[0].env_args - assert env_args["env_only"] == 2 - assert env_args["model_only"] == 1 - assert env_args["override_only"] is True - assert env_args["job_only"] == 3 - assert env_args["shared"] == "job" - - -def test_matrix_base_expansion(monkeypatch, tmp_path: Path) -> None: - _stub_metadata(monkeypatch) - config_path = _write_yaml( - tmp_path / "config.yaml", - """ - models: - gpt-mini: {} - envs: - medqa: - matrix: - shuffle_seed: [1618, 9331] - matrix_id_format: "{base}-r{shuffle_seed}" - jobs: - - model: gpt-mini - env: medqa - """, - ) - - run_config = load_run_config(config_path) - jobs = build_jobs(run_config) - - job_ids = {job.env.id for job in jobs} - assert job_ids == {"medqa-r1618", "medqa-r9331"} - - -def test_duplicate_job_ids_get_fingerprinted(monkeypatch, tmp_path: Path) -> None: - _stub_metadata(monkeypatch) - config_path = _write_yaml( - tmp_path / "config.yaml", - """ - models: - gpt-mini: {} - envs: - medqa: {} - jobs: - - model: gpt-mini - env: medqa - env_args: - 
variant: 1 - - model: gpt-mini - env: medqa - env_args: - variant: 2 - """, - ) - - run_config = load_run_config(config_path) - jobs = build_jobs(run_config) - - assert len(jobs) == 2 - job_ids = {job.job_id for job in jobs} - assert len(job_ids) == 2 - assert any(job_id.startswith("gpt-mini-medqa-") for job_id in job_ids) - - -def test_unknown_model_raises(monkeypatch, tmp_path: Path) -> None: - _stub_metadata(monkeypatch) - config_path = _write_yaml( - tmp_path / "config.yaml", - """ - envs: - medqa: {} - jobs: - - model: missing - env: medqa - """, - ) - - run_config = load_run_config(config_path) - with pytest.raises(ValueError, match="unknown model"): - build_jobs(run_config) - - -def test_unknown_environment_raises(monkeypatch, tmp_path: Path) -> None: - _stub_metadata(monkeypatch) - config_path = _write_yaml( - tmp_path / "config.yaml", - """ - models: - gpt-mini: {} - envs: - medqa: {} - jobs: - - model: gpt-mini - env: missing - """, - ) - - run_config = load_run_config(config_path) - with pytest.raises(ValueError, match="unknown environment"): - build_jobs(run_config) diff --git a/tests/test_cli/test_job_executor.py b/tests/test_cli/test_job_executor.py deleted file mode 100644 index 68c35d27..00000000 --- a/tests/test_cli/test_job_executor.py +++ /dev/null @@ -1,680 +0,0 @@ -from __future__ import annotations - -import logging -from pathlib import Path -from types import SimpleNamespace - -import pytest - -from medarc_verifiers.cli._constants import DEFAULT_ENDPOINTS_PATH -from medarc_verifiers.cli._job_builder import ResolvedJob -from medarc_verifiers.cli._job_executor import ( - ExecutorSettings, - JobExecutionResult, - _load_endpoints_for_model, - execute_jobs, -) -from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, ModelConfigSchema -from medarc_verifiers.cli.utils.env_args import EnvParam - - -def _stub_metadata(required: bool = False) -> list[EnvParam]: - return [ - EnvParam( - name="seed", - cli_name="seed", - kind="int", - 
default=None, - required=required, - help="Seed value", - annotation=int, - argparse_type=int, - choices=None, - action=None, - is_list=False, - element_type=None, - unsupported_reason=None, - ) - ] - - -def _settings(tmp_path: Path, **overrides: object) -> ExecutorSettings: - base_kwargs = dict( - run_id="run-1", - output_dir=tmp_path / "runs", - env_dir=tmp_path / "environments", - endpoints_path=tmp_path / "endpoints.py", - endpoints_path_explicit=False, - default_api_key_var="DEFAULT_KEY", - default_api_base_url="https://api.default", - log_level="INFO", - verbose=False, - save_results=True, - save_to_hf_hub=False, - hf_hub_dataset_name=None, - max_concurrent_generation=None, - max_concurrent_scoring=None, - # New concurrency precedence: CLI (--max-concurrent) > env_cfg.max_concurrent > DEFAULT_BATCH_MAX_CONCURRENT (128) - # Provide a placeholder so tests can inject a CLI override via overrides (max_concurrent=VALUE). - max_concurrent=None, - timeout=None, - sleep=0.0, - dry_run=False, - ) - base_kwargs.update(overrides) - return ExecutorSettings(**base_kwargs) - - -def _stub_results(value: float = 0.5) -> SimpleNamespace: - metadata = SimpleNamespace( - path_to_save="", - avg_reward=value, - num_examples=1, - rollouts_per_example=1, - avg_metrics={"pass_rate": value}, - ) - return SimpleNamespace(metadata=metadata, reward=[value], metrics={"pass_rate": [value]}) - - -def _stub_results_metadata_only(value: float = 0.5) -> SimpleNamespace: - metadata = SimpleNamespace( - path_to_save="", - avg_reward=value, - num_examples=2, - rollouts_per_example=3, - avg_metrics={"pass_rate": value, "accuracy": value / 2}, - ) - return SimpleNamespace(metadata=metadata) - - -def test_execute_jobs_invokes_run_evaluation(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - captured = {} - - async def fake_run(config): - captured["config"] = config - return _stub_results() - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - 
monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda path, cache=None: { - "alias": [{"model": "resolved-model", "key": "MODEL_KEY", "url": "https://api.resolved"}] - }, - ) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_env_metadata", - lambda env_id, cache=None: _stub_metadata(required=True), - ) - - model_cfg = ModelConfigSchema(id="alias", headers={"X-Test": "1"}, sampling_args={"temperature": 0.1}) - env_cfg = EnvironmentConfigSchema(id="medqa", env_args={"seed": 1}, num_examples=3) - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=model_cfg, - env=env_cfg, - env_args={"seed": 1}, - sampling_args={"temperature": 0.1}, - ) - - results = execute_jobs([job], _settings(tmp_path)) - - assert len(results) == 1 - result = results[0] - assert isinstance(result, JobExecutionResult) - assert result.status == "succeeded" - assert result.output_path == (tmp_path / "runs" / "run-1" / job.job_id) - assert "config" in captured - config = captured["config"] - assert config.model == "resolved-model" - assert Path(str(config.resume_path)) == (tmp_path / "runs" / "run-1" / job.job_id) - assert config.client_config.api_key_var == "MODEL_KEY" - assert config.client_config.api_base_url == "https://api.resolved" - assert config.client_config.extra_headers == {"X-Test": "1"} - assert config.env_args == {"seed": 1} - # With no CLI override and no env-level max_concurrent, falls back to DEFAULT_BATCH_MAX_CONCURRENT (128) - assert config.max_concurrent == 128 - - -def test_execute_jobs_records_failures(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - async def failing_run(config): - raise RuntimeError("boom") - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", failing_run) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda path, cache=None: {}, - ) - monkeypatch.setattr( - 
"medarc_verifiers.cli._job_executor.load_env_metadata", - lambda env_id, cache=None: _stub_metadata(required=False), - ) - - model_cfg = ModelConfigSchema(id="alias") - env_cfg = EnvironmentConfigSchema(id="medqa", env_args={"seed": 1}) - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=model_cfg, - env=env_cfg, - env_args={"seed": 1}, - sampling_args={}, - ) - - results = execute_jobs([job], _settings(tmp_path)) - - assert len(results) == 1 - result = results[0] - assert result.status == "failed" - assert result.error is not None - assert "boom" in result.error - assert "alias-medqa" in result.error - assert "env=medqa" in result.error - assert result.output_path == (tmp_path / "runs" / "run-1" / job.job_id) - - -def test_materialize_results_noop_logs_debug_when_source_matches_job_dir( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - caplog: pytest.LogCaptureFixture, -) -> None: - async def fake_run(config): - metadata = SimpleNamespace( - path_to_save=str(config.resume_path), - avg_reward=0.5, - num_examples=1, - rollouts_per_example=1, - avg_metrics={"pass_rate": 0.5}, - ) - return SimpleNamespace(metadata=metadata, reward=[0.5], metrics={"pass_rate": [0.5]}) - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: []) - - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=ModelConfigSchema(id="alias"), - env=EnvironmentConfigSchema(id="medqa"), - env_args={}, - sampling_args={}, - ) - - with caplog.at_level(logging.DEBUG): - results = execute_jobs([job], _settings(tmp_path, log_level="DEBUG")) - - assert results[0].status == "succeeded" - assert "Results already in job_dir; _materialize_results no-op" in caplog.text - - -def 
test_forced_job_archives_and_resets_existing_job_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - captured: dict[str, object] = {} - - async def fake_run(config): - captured["resume_path"] = config.resume_path - metadata = SimpleNamespace( - path_to_save=str(config.resume_path), - avg_reward=0.5, - num_examples=1, - rollouts_per_example=1, - avg_metrics={"pass_rate": 0.5}, - ) - return SimpleNamespace(metadata=metadata, reward=[0.5], metrics={"pass_rate": [0.5]}) - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: []) - - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=ModelConfigSchema(id="alias"), - env=EnvironmentConfigSchema(id="medqa"), - env_args={}, - sampling_args={}, - ) - run_dir = tmp_path / "runs" / "run-1" - job_dir = run_dir / job.job_id - job_dir.mkdir(parents=True, exist_ok=True) - (job_dir / "stale.txt").write_text("stale", encoding="utf-8") - - results = execute_jobs([job], _settings(tmp_path, forced_job_ids={job.job_id})) - - assert results[0].status == "succeeded" - assert Path(str(captured["resume_path"])) == job_dir - archived = sorted(run_dir.glob(f"{job.job_id}__old_*")) - assert len(archived) == 1 - assert (archived[0] / "stale.txt").exists() - assert job_dir.exists() - assert not (job_dir / "stale.txt").exists() - - -def test_non_forced_invalid_nonempty_job_dir_fails_prescriptively( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - async def fail_if_called(_config): - raise AssertionError("run_evaluation should not run when preflight fails") - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fail_if_called) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, 
cache=None: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: []) - - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=ModelConfigSchema(id="alias"), - env=EnvironmentConfigSchema(id="medqa"), - env_args={}, - sampling_args={}, - ) - job_dir = tmp_path / "runs" / "run-1" / job.job_id - job_dir.mkdir(parents=True, exist_ok=True) - (job_dir / "orphan.log").write_text("invalid state", encoding="utf-8") - - results = execute_jobs([job], _settings(tmp_path)) - - assert len(results) == 1 - assert results[0].status == "failed" - assert results[0].error is not None - assert "not a valid evaluation results path" in results[0].error - assert "--force" in results[0].error - assert "new run_id" in results[0].error - - -def test_batch_resume_mismatch_logs_saved_and_current_values( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - caplog: pytest.LogCaptureFixture, -) -> None: - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=ModelConfigSchema(id="alias"), - env=EnvironmentConfigSchema(id="medqa", num_examples=5, rollouts_per_example=3), - env_args={}, - sampling_args={}, - ) - job_dir = tmp_path / "runs" / "run-1" / job.job_id - job_dir.mkdir(parents=True, exist_ok=True) - (job_dir / "results.jsonl").write_text("", encoding="utf-8") - (job_dir / "metadata.json").write_text( - ('{"env_id":"saved-env","model":"saved-model","rollouts_per_example":2,"num_examples":8}'), - encoding="utf-8", - ) - - async def fake_run(_config): - raise ValueError( - f"Cannot resume from {job_dir}: metadata mismatch (env_id: saved='saved-env', current='medqa')" - ) - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: []) - - with 
caplog.at_level(logging.ERROR): - results = execute_jobs([job], _settings(tmp_path)) - - assert len(results) == 1 - assert results[0].status == "failed" - assert results[0].error is not None - assert "incompatible prior results" in results[0].error - assert "Resume metadata mismatch for job 'alias-medqa'" in caplog.text - assert "env_id: saved='saved-env', current='medqa'" in caplog.text - assert "model: saved='saved-model', current='alias'" in caplog.text - assert "rollouts_per_example: saved=2, current=3" in caplog.text - assert "num_examples: saved=8, current=5 (current must be >= saved)" in caplog.text - - -def test_execute_jobs_uses_metadata_averages(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - class _ManifestStub: - def __init__(self) -> None: - self.started: list[str] = [] - self.completed: list[dict[str, object]] = [] - - def record_job_start(self, job_id: str) -> None: - self.started.append(job_id) - - def record_job_completion(self, job_id: str, **kwargs: object) -> None: - payload = {"job_id": job_id} - payload.update(kwargs) - self.completed.append(payload) - - def record_job_failure(self, job_id: str, **kwargs: object) -> None: - raise AssertionError(f"Job should not fail: {job_id}, {kwargs}") - - async def fake_run(config): - return _stub_results_metadata_only(0.8) - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: []) - - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=ModelConfigSchema(id="alias"), - env=EnvironmentConfigSchema(id="medqa"), - env_args={}, - sampling_args={}, - ) - manifest = _ManifestStub() - - results = execute_jobs([job], _settings(tmp_path), manifest=manifest) - - assert results[0].status == "succeeded" - assert manifest.started == 
["alias-medqa"] - assert len(manifest.completed) == 1 - completed = manifest.completed[0] - assert completed["job_id"] == "alias-medqa" - assert completed["avg_reward"] == pytest.approx(0.8) - metrics = completed["metrics"] - assert isinstance(metrics, dict) - assert metrics["pass_rate"] == pytest.approx(0.8) - assert metrics["accuracy"] == pytest.approx(0.4) - assert completed["num_examples"] == 2 - assert completed["rollouts_per_example"] == 3 - - -def test_execute_jobs_respects_dry_run(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - async def raise_if_called(*args, **kwargs): - raise AssertionError("run_evaluation should not be invoked during dry runs.") - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", raise_if_called) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda path, cache=None: {}, - ) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_env_metadata", - lambda env_id, cache=None: _stub_metadata(required=False), - ) - - model_cfg = ModelConfigSchema(id="alias") - env_cfg = EnvironmentConfigSchema(id="medqa") - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=model_cfg, - env=env_cfg, - env_args={}, - sampling_args={}, - ) - - results = execute_jobs([job], _settings(tmp_path, dry_run=True)) - - assert results[0].status == "skipped" - assert results[0].output_path == (tmp_path / "runs" / "run-1" / job.job_id) - - -def test_executor_timeout_precedence(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - captured = {} - - async def fake_run(config): - captured["config"] = config - return _stub_results() - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda path, cache=None: {}, - ) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_env_metadata", - lambda env_id, cache=None: 
_stub_metadata(required=False), - ) - - model_cfg = ModelConfigSchema(id="alias", timeout=5.0) - env_cfg = EnvironmentConfigSchema(id="medqa") - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=model_cfg, - env=env_cfg, - env_args={}, - sampling_args={}, - ) - - # CLI override should take precedence when provided. - execute_jobs([job], _settings(tmp_path, timeout=10.0)) - config = captured["config"] - assert config.client_config.timeout == 10.0 - - # Model-level timeout applies when CLI flag is absent. - captured.clear() - execute_jobs([job], _settings(tmp_path)) - config = captured["config"] - assert config.client_config.timeout == 5.0 - - -def test_cli_env_arg_overrides_yaml(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - captured = {} - - async def fake_run(config): - captured["config"] = config - return _stub_results() - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda path, cache=None: {}, - ) - metadata = [ - EnvParam( - name="flag", - cli_name="flag", - kind="bool", - default=False, - required=False, - help="Boolean flag", - annotation=bool, - argparse_type=None, - choices=None, - action="BooleanOptionalAction", - is_list=False, - element_type=None, - unsupported_reason=None, - ) - ] - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_env_metadata", - lambda env_id, cache=None: metadata, - ) - - model_cfg = ModelConfigSchema(id="alias", env_args={"flag": True}) - env_cfg = EnvironmentConfigSchema(id="medqa", env_args={"flag": False}) - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=model_cfg, - env=env_cfg, - env_args={"flag": False}, - sampling_args={}, - ) - - results = execute_jobs([job], _settings(tmp_path, cli_env_args={"flag": True})) - - assert results[0].status == "succeeded" - assert captured["config"].env_args["flag"] is True - - -def 
test_cli_sampling_arg_overrides_yaml(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - captured = {} - - async def fake_run(config): - captured["config"] = config - return _stub_results() - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda path, cache=None: {}, - ) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_env_metadata", - lambda env_id, cache=None: [], - ) - - model_cfg = ModelConfigSchema(id="alias", sampling_args={"temperature": 0.7}) - env_cfg = EnvironmentConfigSchema(id="medqa") - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=model_cfg, - env=env_cfg, - env_args={}, - sampling_args={"temperature": 0.5}, - ) - - results = execute_jobs( - [job], - _settings(tmp_path, cli_sampling_args={"temperature": 0.2}), - ) - - assert results[0].status == "succeeded" - assert captured["config"].sampling_args["temperature"] == 0.2 - - -def test_execute_jobs_handles_keyboard_interrupt(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - async def interrupting_run(config): # noqa: ARG001 - raise KeyboardInterrupt - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", interrupting_run) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda path, cache=None: {}, - ) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_env_metadata", - lambda env_id, cache=None: [], - ) - - model_cfg = ModelConfigSchema(id="alias") - env_cfg = EnvironmentConfigSchema(id="medqa") - job = ResolvedJob( - job_id="alias-medqa", - name="alias-medqa", - model=model_cfg, - env=env_cfg, - env_args={}, - sampling_args={}, - ) - - results = execute_jobs([job], _settings(tmp_path)) - - assert len(results) == 1 - result = results[0] - assert result.status == "failed" - assert result.error is not None - assert "interrupted" in 
result.error.lower() - - -def test_job_sleep_overrides_cli(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - sleep_calls: list[float] = [] - - async def fake_run(config): # noqa: ARG001 - return _stub_results() - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {}) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_env_metadata", - lambda env_id, cache=None: _stub_metadata(required=False), - ) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.sleep", lambda seconds: sleep_calls.append(seconds)) - - model_cfg = ModelConfigSchema(id="alias") - env_cfg = EnvironmentConfigSchema(id="medqa") - - jobs = [ - ResolvedJob( - job_id="alias-medqa-a", - name="alias-medqa-a", - model=model_cfg, - env=env_cfg, - env_args={}, - sampling_args={}, - sleep=1.5, - ), - ResolvedJob( - job_id="alias-medqa-b", - name="alias-medqa-b", - model=model_cfg, - env=env_cfg, - env_args={}, - sampling_args={}, - sleep=None, - ), - ] - - results = execute_jobs(jobs, _settings(tmp_path, sleep=0.25)) - - assert all(result.status == "succeeded" for result in results) - assert sleep_calls == [pytest.approx(1.5)] - - -def test_execute_jobs_warns_for_deprecated_eval_knobs( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - caplog: pytest.LogCaptureFixture, -) -> None: - async def fake_run(config): # noqa: ARG001 - return _stub_results() - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: []) - - model_cfg = ModelConfigSchema(id="alias") - env_cfg = EnvironmentConfigSchema( - id="medqa", - save_every=5, - print_results=True, - ) - job = ResolvedJob( - job_id="alias-medqa", - 
name="alias-medqa", - model=model_cfg, - env=env_cfg, - env_args={}, - sampling_args={}, - ) - - with caplog.at_level(logging.WARNING): - results = execute_jobs( - [job], - _settings( - tmp_path, - max_concurrent_generation=2, - max_concurrent_scoring=3, - ), - ) - - assert results[0].status == "succeeded" - assert "Environment 'medqa' sets deprecated eval knob(s): print_results, save_every" in caplog.text - assert ( - "Job 'alias-medqa' sets deprecated eval knob(s): max_concurrent_generation, max_concurrent_scoring" - in caplog.text - ) - - -def test_load_endpoints_for_model_missing_default_path_is_non_fatal( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - monkeypatch.chdir(tmp_path) - settings = _settings(tmp_path, endpoints_path=Path(DEFAULT_ENDPOINTS_PATH), endpoints_path_explicit=False) - model_cfg = ModelConfigSchema(id="alias") - - endpoints = _load_endpoints_for_model(model_cfg, settings, cache=None) - - assert endpoints == {} - - -def test_load_endpoints_for_model_missing_explicit_path_raises(tmp_path: Path) -> None: - settings = _settings(tmp_path, endpoints_path=tmp_path / "missing.toml", endpoints_path_explicit=True) - model_cfg = ModelConfigSchema(id="alias") - - with pytest.raises(FileNotFoundError): - _load_endpoints_for_model(model_cfg, settings, cache=None) diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py index c78b6cad..fbb929f8 100644 --- a/tests/test_cli/test_main.py +++ b/tests/test_cli/test_main.py @@ -29,10 +29,6 @@ def _patch_single_run_env(monkeypatch: pytest.MonkeyPatch, metadata: list[EnvPar "medarc_verifiers.cli._single_run.gather_env_cli_metadata", lambda env_id: metadata, ) - monkeypatch.setattr( - "medarc_verifiers.cli._single_run.load_endpoint_registry", - lambda *args, **kwargs: {}, - ) def _patch_single_run_metadata_only(monkeypatch: pytest.MonkeyPatch, metadata: list[EnvParam]) -> None: @@ -42,6 +38,10 @@ def _patch_single_run_metadata_only(monkeypatch: pytest.MonkeyPatch, metadata: l ) 
+def _patch_toml_bench_envs_installed(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(main, "_missing_selected_env_refs", lambda plan_inputs, args: {}) + + def _make_env_param( name: str, *, @@ -104,1148 +104,861 @@ def _write_resume_artifacts( ) -def test_cli_runs_configuration(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" +def test_toml_bench_dry_run_expands_evals_and_ablations( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + config_path = tmp_path / "bench.toml" _write_config( config_path, """ - models: - model-a: - model: alias-model - headers: - X-Test: one - envs: - medqa: - env_args: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + save_results = true + + [[eval]] + env_id = "medqa" + num_examples = 1 + rollouts_per_example = 1 + + [[ablation]] + env_id = "medqa" + name = "shuffle_seed-{env_args.shuffle_seed}" + num_examples = 1 + rollouts_per_example = 1 + env_args = { shuffle_answers = true } + + [ablation.sweep.env_args] + shuffle_seed = [1618, 9331] """, ) - captured = [] - - async def fake_run(config): - captured.append(config) - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() - exit_code = main.main( [ "bench", "--config", str(config_path), + "--dry-run", "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), + str(tmp_path / "evals"), "--max-concurrent", - "5", + "1", ] ) + output = capsys.readouterr().out assert exit_code == 0 - 
assert len(captured) == 1 - config = captured[0] - assert config.model == "alias-model" - assert config.env_dir_path == str(env_dir) - assert config.client_config.extra_headers == {"X-Test": "one"} - assert config.max_concurrent == 5 - run_dirs = list(output_dir.iterdir()) - assert len(run_dirs) == 1 - assert run_dirs[0].is_dir() - manifest_path = run_dirs[0] / "run_manifest.json" - assert manifest_path.exists() - manifest = json.loads(manifest_path.read_text()) - assert manifest["summary"]["completed"] == 1 - assert manifest["jobs"][0]["status"] == "completed" - - -def test_batch_api_base_url_override_forces_endpoint(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - _write_config( - config_path, - """ - models: - model-a: - model: alias-model - api_base_url: https://config.example/v1 - envs: - medqa: - env_args: {} - jobs: - - model: model-a - env: medqa - """, - ) - - captured = [] + assert "TOML Bench Dry Run" in output + assert "3 eval(s) to dry-run" in output + assert "base" in output + assert "shuffle_seed-1618" in output + assert "shuffle_seed-9331" in output + assert str(tmp_path / "evals" / "gpt-5-mini" / "medqa" / "base") in output - async def fake_run(config): - captured.append(config) - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr( - "medarc_verifiers.cli._job_executor.load_endpoint_registry", - lambda *args, **kwargs: { - "alias-model": [{"model": "resolved-model", "url": "https://endpoint.example/v1", "key": "REGISTRY_KEY"}] - }, - ) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() +def test_repository_smoke_toml_config_dry_runs(capsys: pytest.CaptureFixture[str]) -> 
None: + exit_code = main.main(["bench", "--config", "configs/medmarks-smoke.toml", "--dry-run"]) - override_url = "http://127.0.0.1:8000/v1" - assert ( - main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), - "--api-base-url", - override_url, - ] - ) - == 0 - ) - - assert len(captured) == 1 - assert captured[0].client_config.api_base_url == override_url + output = capsys.readouterr().out + assert exit_code == 0 + assert "TOML Bench Dry Run" in output + assert "18 eval(s) to dry-run" in output + assert "medqa" in output + assert "runs/smoke/openai-gpt-4.1-mini/medqa" in output -def test_batch_prime_base_url_forces_prime_api_key_when_default_not_explicit( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, +def test_toml_bench_dry_run_accepts_medarc_orchestrate_metadata( + tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: - config_path = tmp_path / "config.yaml" + config_path = tmp_path / "bench.toml" _write_config( config_path, - f""" - models: - model-a: - model: alias-model - api_base_url: {PRIME_INFERENCE_URL} - envs: - medqa: {{}} - jobs: - - model: model-a - env: medqa - """, - ) - - captured = [] - - async def fake_run(config): - captured.append(config) - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() - - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), - ] - ) + """ + model = "gpt-5-mini" - assert exit_code == 0 - 
assert len(captured) == 1 - assert captured[0].client_config.api_key_var == "PRIME_API_KEY" + [[eval]] + env_id = "medqa" + num_examples = 1 + rollouts_per_example = 1 + [medarc.orchestrate.foo] + gpus = 1 -def test_batch_explicit_default_api_key_var_is_respected_for_prime_base_url( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - config_path = tmp_path / "config.yaml" - _write_config( - config_path, - f""" - models: - model-a: - model: alias-model - api_base_url: {PRIME_INFERENCE_URL} - envs: - medqa: {{}} - jobs: - - model: model-a - env: medqa + [medarc.orchestrate.vllm-container] + image = "vllm/vllm-openai:latest" """, ) - captured = [] - - async def fake_run(config): - captured.append(config) - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() - - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), - "--default-api-key-var", - "OPENAI_API_KEY", - ] - ) + exit_code = main.main(["bench", "--config", str(config_path), "--dry-run"]) + output = capsys.readouterr().out assert exit_code == 0 - assert len(captured) == 1 - assert captured[0].client_config.api_key_var == "OPENAI_API_KEY" + assert "TOML Bench Dry Run" in output + assert "medqa" in output -def test_model_level_max_concurrent_applies(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - _write_config( - config_path, - """ - models: - model-a: - model: alias-model - max_concurrent: 7 - envs: 
- medqa: {} - jobs: - - model: model-a - env: medqa - """, - ) +def test_bench_rejects_non_toml_config(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + config_path = tmp_path / "bench.yaml" + _write_config(config_path, "models: {}\n") - captured = [] + with pytest.raises(SystemExit) as excinfo: + main.main(["bench", "--config", str(config_path), "--dry-run"]) - async def fake_run(config): - captured.append(config) - return _stub_cli_result() + assert excinfo.value.code == 2 + err = capsys.readouterr().err + assert "medarc-eval bench now accepts upstream TOML configs only." in err - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() +def test_bench_rejects_removed_yaml_runner_flags(capsys: pytest.CaptureFixture[str]) -> None: + with pytest.raises(SystemExit) as excinfo: + main.main(["bench", "--config", "configs/medmarks-smoke.toml", "--restart"]) - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), - ] - ) + assert excinfo.value.code == 2 + err = capsys.readouterr().err + assert "unrecognized arguments: --restart" in err - assert exit_code == 0 - assert len(captured) == 1 - config = captured[0] - assert config.max_concurrent == 7 +def test_repository_verified_toml_config_dry_run_shows_ablation_variants(capsys: pytest.CaptureFixture[str]) -> None: + exit_code = main.main(["bench", "--config", "configs/medmarks-verified.toml", "--dry-run", "--eval-index", "45"]) -def test_batch_rollout_max_retries_sets_eval_config(monkeypatch: 
pytest.MonkeyPatch, tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - _write_config( - config_path, - """ - models: - model-a: - model: alias-model - envs: - medqa: {} - jobs: - - model: model-a - env: medqa - """, - ) + output = capsys.readouterr().out + assert exit_code == 0 + assert "medqa" in output + assert "shuffle_seed-1618" in output + assert "runs/evals/openai-gpt-4.1-mini/medqa/shuffle_seed-1618" in output - captured = [] - async def fake_run(config): - captured.append(config) - return _stub_cli_result() +def test_repository_open_ended_toml_config_loads_expected_judge_args() -> None: + configs = main.load_toml_eval_configs("configs/medmarks-open_ended.toml") + healthbench = next(config for config in configs if config["env_id"] == "healthbench") + medrbench = [config for config in configs if config["env_id"] == "medrbench"] - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) + assert healthbench["env_args"]["judge_model"] == "openai/gpt-5-mini" + assert healthbench["env_args"]["judge_base_url"] == "https://api.pinference.ai/api/v1" + assert {config["env_args"]["task"] for config in medrbench} == {"oracle", "1turn", "free_turn"} - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), - "--rollout-max-retries", - "3", - ] - ) - - assert exit_code == 0 - assert len(captured) == 1 - assert captured[0].max_retries == 3 - - -def test_batch_http_max_retries_sets_client_config(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> 
None: - config_path = tmp_path / "config.yaml" +def test_toml_bench_dry_run_model_override( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + config_path = tmp_path / "bench.toml" _write_config( config_path, """ - models: - model-a: - model: alias-model - envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "config-model" + + [[eval]] + env_id = "medqa" + num_examples = 1 + rollouts_per_example = 1 """, ) - captured = [] - - async def fake_run(config): - captured.append(config) - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() - - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), - "--http-max-retries", - "7", - ] - ) + exit_code = main.main(["bench", "--config", str(config_path), "--dry-run", "--model", "cli-model"]) + output = capsys.readouterr().out assert exit_code == 0 - assert len(captured) == 1 - assert captured[0].client_config.max_retries == 7 + assert "cli-model" in output + assert "config-model" not in output -def test_deprecated_enable_additional_retries_warns_and_maps_to_default_attempts( +def test_toml_bench_auto_install_defaults_true_and_dry_run_does_not_build_configs_or_spawn( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, - caplog: pytest.LogCaptureFixture, + capsys: pytest.CaptureFixture[str], ) -> None: - config_path = tmp_path / "config.yaml" + config_path = tmp_path / "bench.toml" _write_config( config_path, """ - models: - model-a: - model: alias-model 
- envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "missing-env" + num_examples = 1 + rollouts_per_example = 1 """, ) - - captured_attempts: list[int] = [] - - def fake_patch(*, attempts=3, backoff_s=1.0, log_path="medarc_model_retry.log"): # noqa: ARG001 - captured_attempts.append(attempts) - - async def fake_run(config): # noqa: ARG001 - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.utils.retry.patch_verifiers_model_response_retry", fake_patch) - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - - output_dir = tmp_path / "runs_out" env_dir = tmp_path / "envs" - env_dir.mkdir() - - with caplog.at_level(logging.WARNING): - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), - "--enable-additional-retries", - ] - ) - + env_pkg = env_dir / "missing_env" + env_pkg.mkdir(parents=True) + (env_pkg / "pyproject.toml").write_text('[project]\nname = "missing-env"\n') + monkeypatch.setattr(main, "build_eval_config", lambda raw, overrides: pytest.fail("parent built EvalConfig")) + monkeypatch.setattr(main.subprocess, "run", lambda *args, **kwargs: pytest.fail("parent spawned child")) + + parser = main.build_batch_parser() + parsed = parser.parse_args(["--config", str(config_path)]) + assert parsed.auto_install is True + parsed_explicit = parser.parse_args(["--config", str(config_path), "--auto-install"]) + assert parsed_explicit.auto_install is True + parsed_disabled = parser.parse_args(["--config", str(config_path), "--no-auto-install"]) + assert 
parsed_disabled.auto_install is False + + exit_code = main.main(["bench", "--config", str(config_path), "--env-dir", str(env_dir), "--dry-run"]) + + captured = capsys.readouterr() + output = captured.out assert exit_code == 0 - assert captured_attempts == [3] - assert "Flag --enable-additional-retries is deprecated" in caplog.text + assert "missing-env" in output + assert "would auto-install" in captured.err -def test_model_call_retries_overrides_deprecated_toggle( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - caplog: pytest.LogCaptureFixture, -) -> None: - config_path = tmp_path / "config.yaml" +def test_toml_bench_rejects_old_install_envs_flag(tmp_path: Path) -> None: + config_path = tmp_path / "bench.toml" _write_config( config_path, """ - models: - model-a: - model: alias-model - envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" """, ) - captured_attempts: list[int] = [] - - def fake_patch(*, attempts=3, backoff_s=1.0, log_path="medarc_model_retry.log"): # noqa: ARG001 - captured_attempts.append(attempts) - - async def fake_run(config): # noqa: ARG001 - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.utils.retry.patch_verifiers_model_response_retry", fake_patch) - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() - - with caplog.at_level(logging.WARNING): - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--env-dir", - str(env_dir), - "--enable-additional-retries", - 
"--model-call-retries", - "5", - ] - ) - - assert exit_code == 0 - assert captured_attempts == [5] - assert "Ignoring deprecated --enable-additional-retries" in caplog.text + with pytest.raises(SystemExit): + main.build_batch_parser().parse_args(["--config", str(config_path), "--install-envs"]) -def test_batch_dry_run_with_model_call_retries_does_not_patch( +def test_toml_bench_no_auto_install_plans_selected_raw_before_building_config( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: - config_path = tmp_path / "config.yaml" + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" _write_config( config_path, """ - models: - model-a: - model: alias-model - envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "bad-unselected" + num_examples = 1 + rollouts_per_example = 1 + + [[eval]] + env_id = "selected-env" + num_examples = 2 + rollouts_per_example = 1 """, ) + built_envs: list[str] = [] + calls: list[Path] = [] - captured_attempts: list[int] = [] - - def fake_patch(*, attempts=3, backoff_s=1.0, log_path="medarc_model_retry.log"): # noqa: ARG001 - captured_attempts.append(attempts) + def fake_build(raw: dict[str, Any], *, overrides: Any) -> SimpleNamespace: + built_envs.append(raw["env_id"]) + return SimpleNamespace( + env_id=raw["env_id"], + model=raw.get("model", "gpt-5-mini"), + model_copy=lambda update: SimpleNamespace( + env_id=raw["env_id"], + model=raw.get("model", "gpt-5-mini"), + **update, + ), + ) - monkeypatch.setattr("medarc_verifiers.utils.retry.patch_verifiers_model_response_retry", fake_patch) - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) + async def fake_run(config, **_kwargs): + 
calls.append(Path(config.resume_path)) + Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + Path(config.resume_path, "metadata.json").write_text( + json.dumps({"env_id": config.env_id, "model": config.model}) + ) + return {"outputs": [], "metadata": {}} - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() + monkeypatch.setattr(main, "build_eval_config", fake_build) + monkeypatch.setattr(main, "run_evaluation", fake_run) + monkeypatch.setattr(main, "_module_importable", lambda module_name: module_name == "selected_env") exit_code = main.main( [ "bench", "--config", str(config_path), + "--no-auto-install", + "--eval-index", + "2", "--output-dir", str(output_dir), - "--env-dir", - str(env_dir), - "--dry-run", - "--model-call-retries", - "3", ] ) assert exit_code == 0 - assert captured_attempts == [] + assert built_envs == ["selected-env"] + assert calls == [output_dir / "gpt-5-mini" / "selected-env" / "base"] + assert (output_dir / "gpt-5-mini" / "selected-env" / "base").is_dir() -def test_env_rerun_flag_forces_completed_jobs(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" +def test_toml_bench_mixed_missing_env_routes_only_missing_to_isolated_child( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" + env_dir = tmp_path / "envs" + missing_pkg = env_dir / "missing_env" + missing_pkg.mkdir(parents=True) + (missing_pkg / "pyproject.toml").write_text('[project]\nname = "missing-env"\n', encoding="utf-8") _write_config( config_path, """ - name: rerun-check - models: - model-a: - model: alias-model - envs: - env-a: - rerun: true - jobs: - - model: model-a - env: env-a + model = "gpt-5-mini" + + [[eval]] + env_id = "installed-env" + num_examples = 1 + rollouts_per_example = 1 + + [[eval]] + env_id = "missing-env" + num_examples = 1 + rollouts_per_example = 1 """, 
) + parent_runs: list[str] = [] + child_commands: list[list[str]] = [] + installed_paths: list[Path] = [] - captured = [] + def fake_build(raw: dict[str, Any], *, overrides: Any) -> SimpleNamespace: + return SimpleNamespace( + env_id=raw["env_id"], + model=raw.get("model", "gpt-5-mini"), + model_copy=lambda update: SimpleNamespace( + env_id=raw["env_id"], + model=raw.get("model", "gpt-5-mini"), + **update, + ), + ) - async def fake_run(config): - captured.append(config) - return _stub_cli_result() + async def fake_run(config, **_kwargs): + parent_runs.append(config.env_id) + Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + Path(config.resume_path, "metadata.json").write_text( + json.dumps({"env_id": config.env_id, "model": config.model}) + ) - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) + class FakeVenv: + def __enter__(self) -> Path: + return tmp_path / "fake-venv" / "bin" / "python" - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() + def __exit__(self, exc_type, exc, tb) -> None: + return None + + def fake_subprocess_run(cmd, check=False, capture_output=False, text=False): + child_commands.append([str(part) for part in cmd]) + payload = json.loads(Path(cmd[-1]).read_text(encoding="utf-8")) + assert payload["cleanup_env_package"] is False + assert payload["env_preinstalled"] is True + Path(payload["status_path"]).write_text(json.dumps({"exit_code": 0}), encoding="utf-8") + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(main, "_module_importable", lambda module_name: module_name == 
"installed_env") + monkeypatch.setattr(main, "build_eval_config", fake_build) + monkeypatch.setattr(main, "run_evaluation", fake_run) + monkeypatch.setattr(main, "temporary_bench_venv", lambda: FakeVenv()) + monkeypatch.setattr(main, "install_env_package", lambda python, env_path: installed_paths.append(Path(env_path))) + monkeypatch.setattr(main.subprocess, "run", fake_subprocess_run) exit_code = main.main( [ "bench", "--config", str(config_path), - "--output-dir", - str(output_dir), "--env-dir", str(env_dir), - ] - ) - assert exit_code == 0 - assert len(captured) == 1 - run_dirs = list(output_dir.iterdir()) - assert len(run_dirs) == 1 - run_dir = run_dirs[0] - - exit_code_second = main.main( - [ - "bench", - "--config", - str(config_path), "--output-dir", str(output_dir), - "--env-dir", - str(env_dir), ] ) - assert exit_code_second == 0 - assert len(captured) == 2 - manifest_path = run_dir / "run_manifest.json" - manifest = json.loads(manifest_path.read_text()) - job_entry = manifest["jobs"][0] - assert job_entry["status"] == "completed" - assert job_entry["attempt"] == 2 - assert manifest["summary"]["completed"] == 1 + assert exit_code == 0 + assert parent_runs == ["installed-env"] + assert installed_paths == [missing_pkg] + assert len(child_commands) == 1 + assert child_commands[0][0] == str(tmp_path / "fake-venv" / "bin" / "python") -def test_on_complete_rerun_marks_completed_jobs_as_forced( +def test_toml_bench_no_auto_install_missing_env_does_not_force_archive_or_execute( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: - config_path = tmp_path / "config.yaml" + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" + env_dir = tmp_path / "envs" + env_pkg = env_dir / "missing_env" + env_pkg.mkdir(parents=True) + (env_pkg / "pyproject.toml").write_text('[project]\nname = "missing-env"\n', encoding="utf-8") + results_path = output_dir / "gpt-5-mini" / "missing-env" / "base" + _write_resume_artifacts(results_path, 
env_id="missing-env", model="gpt-5-mini") _write_config( config_path, """ - name: rerun-on-complete - models: - model-a: - model: alias-model - envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "missing-env" + num_examples = 1 + rollouts_per_example = 1 """, ) - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - - async def fake_run(config): # noqa: ARG001 - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) - - output_dir = tmp_path / "runs_out" - env_dir = tmp_path / "envs" - env_dir.mkdir() + monkeypatch.setattr(main, "_module_importable", lambda module_name: False) + monkeypatch.setattr(main, "build_eval_config", lambda raw, overrides: pytest.fail("built EvalConfig")) + monkeypatch.setattr(main.subprocess, "run", lambda *args, **kwargs: pytest.fail("spawned child")) - first_exit = main.main( + exit_code = main.main( [ "bench", "--config", str(config_path), - "--output-dir", - str(output_dir), "--env-dir", str(env_dir), - "--run-id", - "forced-rerun-test", - ] - ) - assert first_exit == 0 - - captured: dict[str, Any] = {} - - def fake_execute_jobs(planned_jobs, settings, **kwargs): # noqa: ANN001, ARG001 - captured["planned_job_ids"] = [job.job_id for job in planned_jobs] - captured["forced_job_ids"] = set(settings.forced_job_ids) - return [ - main.JobExecutionResult( - job_id=planned_jobs[0].job_id, - status="skipped", - output_path=settings.output_dir / settings.run_id / planned_jobs[0].job_id, - ) - ] - - monkeypatch.setattr("medarc_verifiers.cli.main.execute_jobs", fake_execute_jobs) - - second_exit = main.main( - [ - "bench", - "--config", - str(config_path), "--output-dir", 
str(output_dir), - "--env-dir", - str(env_dir), - "--run-id", - "forced-rerun-test", - "--on-complete", - "rerun", + "--no-auto-install", + "--force", ] ) - assert second_exit == 0 - assert captured["planned_job_ids"] == ["model-a-medqa"] - assert captured["forced_job_ids"] == {"model-a-medqa"} + + assert exit_code == 1 + assert (results_path / "metadata.json").is_file() + assert not list(results_path.parent.glob("base__old_*")) -def test_cli_env_config_root_override(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - config_dir = tmp_path / "configs" - config_dir.mkdir() - config_path = config_dir / "jobs.yaml" +def test_toml_bench_isolated_setup_failure_does_not_force_archive( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" + env_dir = tmp_path / "envs" + env_pkg = env_dir / "missing_env" + env_pkg.mkdir(parents=True) + (env_pkg / "pyproject.toml").write_text('[project]\nname = "missing-env"\n', encoding="utf-8") + results_path = output_dir / "gpt-5-mini" / "missing-env" / "base" + _write_resume_artifacts(results_path, env_id="missing-env", model="gpt-5-mini") _write_config( config_path, """ - models: - model-a: {} - envs: - - custom_env - jobs: - - model: model-a - env: custom_env - """, - ) + model = "gpt-5-mini" - shared_envs = tmp_path / "shared_envs" - shared_envs.mkdir() - (shared_envs / "custom_env.yaml").write_text( - """ - - id: custom_env - module: custom_env + [[eval]] + env_id = "missing-env" + num_examples = 1 + rollouts_per_example = 1 """, - encoding="utf-8", ) - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) + class FakeVenv: + def __enter__(self) -> Path: + return tmp_path / 
"fake-python" - async def fake_run(config): - return _stub_cli_result() + def __exit__(self, exc_type, exc, tb) -> None: + return None + + def fail_install(python: Path, env_path: Path) -> None: + raise RuntimeError("env install failed") - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) + monkeypatch.setattr(main, "_module_importable", lambda module_name: False) + monkeypatch.setattr(main, "temporary_bench_venv", lambda: FakeVenv()) + monkeypatch.setattr(main, "install_env_package", fail_install) + monkeypatch.setattr(main.subprocess, "run", lambda *args, **kwargs: pytest.fail("spawned child")) - output_dir = tmp_path / "runs_out" exit_code = main.main( [ "bench", "--config", str(config_path), + "--env-dir", + str(env_dir), "--output-dir", str(output_dir), - "--env-config-root", - str(shared_envs), + "--force", ] ) - assert exit_code == 0 - - -## + assert exit_code == 1 + assert (results_path / "metadata.json").is_file() + assert not list(results_path.parent.glob("base__old_*")) -def test_regen_reuses_completed_jobs(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" +def test_toml_bench_dry_run_display_ignores_env_package_defaults( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + config_path = tmp_path / "bench.toml" _write_config( config_path, """ - models: - model-a: {} - model-b: {} - envs: - medqa: {} - jobs: - - model: model-a - env: medqa - - model: model-b - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" """, ) + monkeypatch.setattr(main, "build_eval_config", lambda raw, overrides: pytest.fail("parent built EvalConfig")) - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - 
monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) + exit_code = main.main(["bench", "--config", str(config_path), "--dry-run"]) - async def first_run(config): - return _stub_cli_result() + output = capsys.readouterr().out + assert exit_code == 0 + assert "medqa" in output + assert "runs/evals/gpt-5-mini/medqa/base" in output + assert "1000" not in output - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", first_run) - output_dir = tmp_path / "runs_out" - base_run = "base-run" - assert ( - main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", base_run]) == 0 +def test_toml_bench_dry_run_uses_toml_output_dir( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "toml-output" + _write_config( + config_path, + f""" + model = "gpt-5-mini" + output_dir = "{output_dir}" + + [[eval]] + env_id = "medqa" + """, ) - base_manifest_path = output_dir / base_run / "run_manifest.json" - base_manifest = json.loads(base_manifest_path.read_text()) - base_manifest["jobs"][1]["status"] = "failed" - base_manifest["jobs"][1]["reason"] = "boom" - base_manifest_path.write_text(json.dumps(base_manifest, indent=2)) + assert main.main(["bench", "--config", str(config_path), "--dry-run"]) == 0 - calls: list[int] = [] + assert str(output_dir / "gpt-5-mini" / "medqa" / "base") in capsys.readouterr().out - async def regen_run(config): - calls.append(1) - return _stub_cli_result() - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", regen_run) +def test_toml_bench_executes_sequentially_to_deterministic_path( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + _patch_toml_bench_envs_installed(monkeypatch) + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" + _write_config( + config_path, + """ + model = "gpt-5-mini" - # Restart now uses the 
--restart flag and performs in-place extension of the seed run. - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--restart", - base_run, - ] + [[eval]] + env_id = "medqa" + num_examples = 1 + rollouts_per_example = 1 + """, ) - assert exit_code == 0 - assert len(calls) == 1 + calls: list[Path] = [] + + async def fake_run(config, on_progress=None, **_kwargs): + results_path = Path(config.resume_path) + calls.append(results_path) + metadata = {"env_id": config.env_id, "model": config.model} + if on_progress is not None: + on_progress([], [], metadata) + (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0", "reward": 1.0}) + "\n") + (results_path / "metadata.json").write_text(json.dumps(metadata)) + return {"outputs": [], "metadata": metadata} - # Manifest is updated in-place under the original run id (base_run) - updated_manifest = json.loads((output_dir / base_run / "run_manifest.json").read_text()) - reasons = {entry["job_id"]: entry.get("reason") for entry in updated_manifest["jobs"]} - assert reasons["model-a-medqa"] == "up_to_date" - assert updated_manifest["summary"]["completed"] == 2 - # restart_source may remain None for in-place restarts; no assertion on legacy regen_source field. 
+ monkeypatch.setattr(main, "run_evaluation", fake_run) + exit_code = main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) -def test_regen_accepts_path_to_run_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - """--restart can be a direct path to a run directory, not only a run-id under output_dir.""" - config_path = tmp_path / "config.yaml" + results_path = output_dir / "gpt-5-mini" / "medqa" / "base" + assert exit_code == 0 + assert calls == [results_path] + assert (results_path / "results.jsonl").exists() + metadata = json.loads((results_path / "metadata.json").read_text()) + assert "medarc_config_fingerprint" not in metadata + assert "variant_id" not in metadata + assert "variant_payload" not in metadata + assert not (output_dir / "gpt-5-mini" / ".medarc_eval_metadata.json").exists() + + +def test_toml_bench_defaults_max_concurrent_to_one(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _patch_toml_bench_envs_installed(monkeypatch) + config_path = tmp_path / "bench.toml" _write_config( config_path, """ - models: - model-a: {} - envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" """, ) + captured: list[int] = [] - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) + async def fake_run(config, **_kwargs): + captured.append(config.max_concurrent) + Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + return {"outputs": [], "metadata": {}} - async def fake_run(config): - return _stub_cli_result() + monkeypatch.setattr(main, "run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) + assert 
main.main(["bench", "--config", str(config_path), "--output-dir", str(tmp_path / "evals")]) == 0 + assert captured == [1] - output_dir = tmp_path / "runs_out" - base_run = output_dir / "base-run" - # First run to create a seed manifest + captured.clear() assert ( - main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", "base-run"]) == 0 - ) - - # Now use --restart with an explicit path to the run directory - # Use --restart with explicit path to existing run directory; should update in place. - # Mock interactive prompt to avoid stdin capture when all jobs are already completed. - monkeypatch.setattr("medarc_verifiers.cli.main._prompt_completed_jobs_action", lambda: "continue") - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--restart", - str(base_run), - ] + main.main( + [ + "bench", + "--config", + str(config_path), + "--output-dir", + str(tmp_path / "evals-override"), + "--max-concurrent", + "4", + ] + ) + == 0 ) - assert exit_code == 0 - # Ensure manifest exists after restart-in-place; legacy regen_source not asserted. 
- assert (output_dir / "base-run" / "run_manifest.json").exists() + assert captured == [4] -def test_regen_accepts_manifest_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" +def test_toml_bench_defaults_to_runs_evals(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _patch_toml_bench_envs_installed(monkeypatch) + monkeypatch.chdir(tmp_path) + config_path = tmp_path / "bench.toml" _write_config( config_path, """ - models: - model-a: {} - envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" """, ) + calls: list[Path] = [] - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) - - async def fake_run(config): - return _stub_cli_result() - - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run) + async def fake_run(config, **_kwargs): + results_path = Path(config.resume_path) + calls.append(results_path) + (results_path / "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"})) + (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + return {"outputs": [], "metadata": {}} - output_dir = tmp_path / "runs_out" - base_run = "base-run" - assert ( - main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", base_run]) == 0 - ) + monkeypatch.setattr(main, "run_evaluation", fake_run) - manifest_path = output_dir / base_run / "run_manifest.json" - monkeypatch.setattr("medarc_verifiers.cli.main._prompt_completed_jobs_action", lambda: "continue") - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--restart", - 
str(manifest_path), - ] - ) - assert exit_code == 0 + assert main.main(["bench", "--config", str(config_path)]) == 0 + assert calls == [Path("runs/evals/gpt-5-mini/medqa/base")] -def test_invalid_run_id_rejected( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path, caplog: pytest.LogCaptureFixture -) -> None: - config_path = tmp_path / "config.yaml" +def test_toml_bench_auto_resumes_existing_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _patch_toml_bench_envs_installed(monkeypatch) + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" _write_config( config_path, """ - models: - model-a: {} - envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" + env_args = { shuffle_seed = 1618 } """, ) + calls = 0 - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) + async def fake_run(config, **_kwargs): + nonlocal calls + calls += 1 + Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + Path(config.resume_path, "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"})) + return {"outputs": [], "metadata": {}} - output_dir = tmp_path / "runs_out" - caplog.set_level(logging.ERROR) - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--run-id", - "../oops", - ] - ) - assert exit_code == 1 - assert "Invalid --run-id '../oops'" in caplog.text - assert "Suggested safe value: --run-id" in caplog.text + monkeypatch.setattr(main, "run_evaluation", fake_run) - caplog.clear() - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--run-id", - 
"/tmp/elsewhere", - ] - ) - assert exit_code == 1 - assert "Invalid --run-id '/tmp/elsewhere'" in caplog.text + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0 + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0 + assert calls == 2 -def test_restart_run_id_rejects_traversal( +def test_toml_bench_resume_refuses_malformed_existing_output( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, - caplog: pytest.LogCaptureFixture, ) -> None: - config_path = tmp_path / "config.yaml" + _patch_toml_bench_envs_installed(monkeypatch) + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" _write_config( config_path, """ - models: - model-a: {} - envs: - medqa: {} - jobs: - - model: model-a - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" """, ) + results_path = output_dir / "gpt-5-mini" / "medqa" / "base" + (results_path / "metadata.json").mkdir(parents=True) + (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + calls = 0 - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) + async def fake_run(config, **_kwargs): + nonlocal calls + calls += 1 + return {"outputs": [], "metadata": {}} - output_dir = tmp_path / "runs_out" - caplog.set_level(logging.ERROR) - exit_code = main.main( - [ - "bench", - "--config", - str(config_path), - "--output-dir", - str(output_dir), - "--restart", - "../escape", - ] - ) - assert exit_code == 1 - assert "Invalid --restart '../escape'" in caplog.text + monkeypatch.setattr(main, "run_evaluation", fake_run) + + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1 + assert calls == 0 
-def test_auto_resume_discovery_without_run_id(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - """Auto-resume should discover a prior matching run when --run-id is omitted.""" - config_path = tmp_path / "config.yaml" +def test_toml_bench_reuses_empty_existing_output_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _patch_toml_bench_envs_installed(monkeypatch) + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" _write_config( config_path, """ -models: - model-a: {} - model-b: {} -envs: - medqa: {} -jobs: - - model: model-a - env: medqa - - model: model-b - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" """, ) + results_path = output_dir / "gpt-5-mini" / "medqa" / "base" + results_path.mkdir(parents=True) + calls = 0 - # Avoid external dependencies - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) + async def fake_run(config, **_kwargs): + nonlocal calls + calls += 1 + assert Path(config.resume_path) == results_path + Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + Path(config.resume_path, "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"})) + return {"outputs": [], "metadata": {}} - async def first_run(config): - return _stub_cli_result() + monkeypatch.setattr(main, "run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", first_run) + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0 + assert calls == 1 + assert (results_path / "metadata.json").is_file() - output_dir = tmp_path / "runs_out" - run_id = "discover-me" - # Create the prior run - assert 
main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", run_id]) == 0 - # Mark one job as failed to make the run incomplete - manifest_path = output_dir / run_id / "run_manifest.json" - manifest = json.loads(manifest_path.read_text()) - manifest["jobs"][1]["status"] = "failed" - manifest["jobs"][1]["reason"] = "boom" - manifest_path.write_text(json.dumps(manifest, indent=2)) +def test_toml_bench_force_archives_existing_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _patch_toml_bench_envs_installed(monkeypatch) + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" + _write_config( + config_path, + """ + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" + """, + ) - # Now resume without specifying --run-id; it should discover the 'discover-me' run - calls: list[int] = [] + async def fake_run(config, **_kwargs): + Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + return {"outputs": [], "metadata": {}} - async def resume_run(config): - calls.append(1) - return _stub_cli_result() + monkeypatch.setattr(main, "run_evaluation", fake_run) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", resume_run) + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0 + results_path = output_dir / "gpt-5-mini" / "medqa" / "base" + (results_path / "sentinel.txt").write_text("old") - exit_code = main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) - assert exit_code == 0 - assert len(calls) == 1 # only the failed job should be re-run + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--force"]) == 0 - # Verify the discovered run was updated to completion - manifest_after = json.loads(manifest_path.read_text()) - assert manifest_after["summary"]["completed"] == 2 - assert manifest_after["summary"]["failed"] == 0 + archived = 
list((output_dir / "gpt-5-mini" / "medqa").glob("base__old_*")) + assert len(archived) == 1 + assert (archived[0] / "sentinel.txt").read_text() == "old" + assert not (results_path / "sentinel.txt").exists() -def test_no_auto_resume_forces_new_run(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - """Passing --no-auto-resume should ignore existing manifests and start a new run.""" - config_path = tmp_path / "config.yaml" +def test_toml_bench_resume_preserves_existing_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _patch_toml_bench_envs_installed(monkeypatch) + config_path = tmp_path / "bench.toml" + output_dir = tmp_path / "evals" _write_config( config_path, """ -models: - model-a: {} - model-b: {} -envs: - medqa: {} -jobs: - - model: model-a - env: medqa - - model: model-b - env: medqa + model = "gpt-5-mini" + + [[eval]] + env_id = "medqa" """, ) + calls = 0 + + async def fake_run(config, **_kwargs): + nonlocal calls + calls += 1 + results_path = Path(config.resume_path) + if calls == 1: + (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + (results_path / "metadata.json").write_text( + json.dumps( + { + "avg_reward": 0.75, + "avg_metrics": {"accuracy": 0.75}, + "total_tokens": 123, + } + ) + ) + return {"outputs": [], "metadata": {}} - monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: []) - monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {}) + monkeypatch.setattr(main, "run_evaluation", fake_run) + + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0 + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0 + + metadata = json.loads((output_dir / "gpt-5-mini" / "medqa" / "base" / 
"metadata.json").read_text()) + assert metadata["avg_reward"] == 0.75 + assert metadata["avg_metrics"] == {"accuracy": 0.75} + assert metadata["total_tokens"] == 123 + assert "medarc_config_fingerprint" not in metadata - async def first_run(config): - return _stub_cli_result() - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", first_run) +def test_toml_bench_does_not_patch_upstream_metadata_saves( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + _patch_toml_bench_envs_installed(monkeypatch) + import verifiers.envs.environment as environment_module - output_dir = tmp_path / "runs_out" - run_id = "baseline-run" - assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", run_id]) == 0 + config_path = tmp_path / "bench.toml" + _write_config( + config_path, + """ + model = "gpt-5-mini" - manifest_path = output_dir / run_id / "run_manifest.json" - manifest = json.loads(manifest_path.read_text()) - manifest["jobs"][0]["status"] = "failed" - manifest["jobs"][0]["reason"] = "boom" - manifest_path.write_text(json.dumps(manifest, indent=2)) + [[eval]] + env_id = "medqa" + """, + ) + saved_metadata: list[dict[str, Any]] = [] - calls: list[int] = [] + def fake_save_metadata(metadata, result_path): + saved_metadata.append(dict(metadata)) + Path(result_path).mkdir(parents=True, exist_ok=True) + Path(result_path, "metadata.json").write_text(json.dumps(metadata)) - async def fresh_run(config): - calls.append(1) - return _stub_cli_result() + async def fake_run(config, on_progress=None, **_kwargs): + metadata = {} + if on_progress is not None: + on_progress([], [], metadata) + environment_module.save_metadata({}, Path(config.resume_path)) + Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n") + return {"outputs": [], "metadata": metadata} - monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fresh_run) + 
monkeypatch.setattr(environment_module, "save_metadata", fake_save_metadata) + monkeypatch.setattr(main, "run_evaluation", fake_run) - preexisting = {child.name for child in output_dir.iterdir()} - exit_code = main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--no-auto-resume"]) - assert exit_code == 0 - assert len(calls) == 2 # both jobs rerun in the fresh run + assert main.main(["bench", "--config", str(config_path), "--output-dir", str(tmp_path / "evals")]) == 0 - post = {child.name for child in output_dir.iterdir()} - new_runs = post - preexisting - assert run_id in post - assert len(new_runs) == 1 - new_run_id = next(iter(new_runs)) - assert new_run_id != run_id - assert (output_dir / new_run_id / "run_manifest.json").exists() + assert saved_metadata == [{}] + metadata = json.loads((tmp_path / "evals" / "gpt-5-mini" / "medqa" / "base" / "metadata.json").read_text()) + assert "medarc_config_fingerprint" not in metadata + assert "variant_id" not in metadata + assert "variant_payload" not in metadata -def test_single_run_help_lists_env_section_and_header_file( +def test_single_run_help_lists_env_section_and_header_option( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: @@ -1264,7 +977,8 @@ def test_single_run_help_lists_env_section_and_header_file( assert exit_code == 0 captured = capsys.readouterr().out assert "medqa environment options:" in captured - assert "--header-file" in captured + assert "--header" in captured + assert "--header-file" not in captured def test_single_run_help_orders_env_group_before_core_options( @@ -1370,32 +1084,26 @@ async def fake_run(config): assert eval_config.sampling_args["max_tokens"] == 64 -def test_single_run_header_file_overrides_cli_headers( +def test_single_run_headers_pass_through_to_eval_config( monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, capsys: pytest.CaptureFixture[str], ) -> None: metadata: list[EnvParam] = [] _patch_single_run_env(monkeypatch, 
metadata) - header_file = tmp_path / "headers.txt" - header_file.write_text("X-Test: file\n", encoding="utf-8") - exit_code = main.main( [ "medqa", "--dry-run", "--header", "X-Test: cli", - "--header-file", - str(header_file), ] ) assert exit_code == 0 output = capsys.readouterr().out config = json.loads(output) - assert config["client_config"]["extra_headers"] == {"X-Test": "file"} + assert config["client_config"]["extra_headers"] == {"X-Test": "cli"} def test_single_run_auto_adds_prime_team_header( @@ -1470,43 +1178,59 @@ def test_single_run_explicit_api_key_var_is_respected_for_prime_url( assert config["client_config"]["api_key_var"] == "OPENAI_API_KEY" -def test_single_run_dry_run_outputs_config( +def test_single_run_endpoint_alias_uses_registry_url_and_key( monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, capsys: pytest.CaptureFixture[str], ) -> None: metadata: list[EnvParam] = [] _patch_single_run_env(monkeypatch, metadata) + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ + [[endpoint]] + endpoint_id = "openai-alias" + model = "openai/resolved" + url = "https://registry.example/v1" + key = "REGISTRY_KEY" + """, + encoding="utf-8", + ) - async def fail_if_called(*args, **kwargs): - raise AssertionError("run_evaluation should not execute during dry-run.") - - monkeypatch.setattr("medarc_verifiers.cli._single_run.run_evaluation", fail_if_called) - - exit_code = main.main(["medqa", "--dry-run"]) + exit_code = main.main( + [ + "medqa", + "--dry-run", + "--model", + "openai-alias", + "--endpoints-path", + str(endpoints_path), + ] + ) assert exit_code == 0 - output = capsys.readouterr().out - assert '"env_id": "medqa"' in output + config = json.loads(capsys.readouterr().out) + assert config["endpoint_id"] == "openai-alias" + assert config["model"] == "openai/resolved" + assert config["client_config"]["api_base_url"] == "https://registry.example/v1" + assert config["client_config"]["api_key_var"] == "REGISTRY_KEY" -def 
test_single_run_dry_run_with_model_call_retries_does_not_patch( +def test_single_run_dry_run_outputs_config( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: metadata: list[EnvParam] = [] _patch_single_run_env(monkeypatch, metadata) - captured_attempts: list[int] = [] - - def fake_patch(*, attempts=3, backoff_s=1.0, log_path="medarc_model_retry.log"): # noqa: ARG001 - captured_attempts.append(attempts) + async def fail_if_called(*args, **kwargs): + raise AssertionError("run_evaluation should not execute during dry-run.") - monkeypatch.setattr("medarc_verifiers.utils.retry.patch_verifiers_model_response_retry", fake_patch) + monkeypatch.setattr("medarc_verifiers.cli._single_run.run_evaluation", fail_if_called) - exit_code = main.main(["medqa", "--dry-run", "--model-call-retries", "3"]) + exit_code = main.main(["medqa", "--dry-run"]) assert exit_code == 0 - assert captured_attempts == [] output = capsys.readouterr().out assert '"env_id": "medqa"' in output @@ -1764,8 +1488,6 @@ def fake_run(options, env_export_map): str(tmp_path / "processed"), "--env-config-root", str(env_root), - "--status", - "completed", "--hf-repo", "medarc/demo", "--dry-run", @@ -1774,12 +1496,39 @@ def fake_run(options, env_export_map): assert exit_code == 0 options = captured["options"] - assert options.status_filter == ("completed",) assert options.hf_config is not None env_map = captured["env_export_map"] assert "demo-env" in env_map +def test_load_env_export_map_adds_module_variant_keys(tmp_path: Path) -> None: + env_root = tmp_path / "envs" + env_root.mkdir() + (env_root / "medcalc_bench.yaml").write_text( + """ + - id: medcalc_bench_tools + module: medcalc_bench + env_args: + version: verified + add_python_tool: true + add_calculator_tool: true + export: + extra_columns: [lower_bound, upper_bound] + answer_column: ground_truth + """, + encoding="utf-8", + ) + + env_map = main._load_env_export_map(env_root) + + variant_key = ( + 
"medcalc_bench::env_args.add_calculator_tool-true__env_args.add_python_tool-true__env_args.version-verified" + ) + assert "medcalc_bench_tools" in env_map + assert variant_key in env_map + assert env_map[variant_key].answer_column == "ground_truth" + + def test_process_cli_applies_config_defaults(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: env_root = tmp_path / "envs" env_root.mkdir() @@ -1794,7 +1543,7 @@ def test_process_cli_applies_config_defaults(monkeypatch: pytest.MonkeyPatch, tm cfg_path = tmp_path / "process.yaml" cfg_path.write_text( f""" - runs_dir: runs/raw-from-config + runs_dir: runs/evals-from-config process: dir: processed env_config_root: {env_root} @@ -1822,7 +1571,7 @@ def fake_run(options, env_export_map): assert exit_code == 0 options = captured["options"] - assert options.runs_dir == Path("runs/raw-from-config") + assert options.runs_dir == Path("runs/evals-from-config") assert options.output_dir == Path("runs/processed") assert options.max_workers == 2 assert options.hf_pull_policy == "pull" @@ -1848,7 +1597,7 @@ def test_process_cli_resolves_hf_token_env_reference(monkeypatch: pytest.MonkeyP cfg_path = tmp_path / "process.yaml" cfg_path.write_text( """ - runs_dir: runs/raw-from-config + runs_dir: runs/evals-from-config process: dir: processed hf: @@ -1877,7 +1626,7 @@ def test_winrate_cli_applies_config_defaults(monkeypatch: pytest.MonkeyPatch, tm cfg_path = tmp_path / "winrate.yaml" cfg_path.write_text( """ - runs_dir: runs/raw-from-config + runs_dir: runs/evals-from-config process: dir: processed winrate: @@ -2010,7 +1759,7 @@ def test_process_cli_rejects_unset_hf_token_env_reference( cfg_path = tmp_path / "process.yaml" cfg_path.write_text( """ - runs_dir: runs/raw-from-config + runs_dir: runs/evals-from-config process: dir: processed hf: @@ -2030,7 +1779,7 @@ def test_process_cli_rejects_unset_hf_token_env_reference( def test_expand_embedded_process_config_promotes_process_section() -> None: payload = { - "runs_dir": 
"runs/raw", + "runs_dir": "runs/evals", "process": { "dir": "processed", "max_workers": 8, @@ -2041,7 +1790,7 @@ def test_expand_embedded_process_config_promotes_process_section() -> None: expanded = main._expand_embedded_pipeline_config(payload, mode="process") - assert expanded["runs_dir"] == "runs/raw" + assert expanded["runs_dir"] == "runs/evals" assert expanded["output_dir"] == Path("runs/processed") assert expanded["max_workers"] == 8 assert expanded["replace_models"] == ["model-a"] @@ -2049,6 +1798,20 @@ def test_expand_embedded_process_config_promotes_process_section() -> None: assert payload["process"]["dir"] == "processed" +def test_expand_embedded_process_config_uses_default_evals_parent_for_relative_dir() -> None: + payload = { + "process": { + "dir": "processed", + "max_workers": 8, + }, + } + + expanded = main._expand_embedded_pipeline_config(payload, mode="process") + + assert expanded["output_dir"] == Path("runs/processed") + assert expanded["max_workers"] == 8 + + def test_expand_embedded_winrate_config_resolves_relative_dirs() -> None: payload = { "runs_dir": "artifacts/raw", @@ -2099,7 +1862,7 @@ def test_process_cli_requires_winrate_config_path(tmp_path: Path) -> None: ) -def test_process_cli_defaults_status_filter_to_completed(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: +def test_process_cli_records_default_max_results_missing_pct(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: captured: dict[str, Any] = {} def fake_run_process(options, env_export_map): @@ -2121,13 +1884,11 @@ def fake_run_process(options, env_export_map): assert exit_code == 0 options = captured["options"] - assert options.status_filter == ("completed",) - assert options.processed_with_args["status"] == ["completed"] assert options.max_results_missing_pct == pytest.approx(2.5) assert options.processed_with_args["max_results_missing_pct"] == pytest.approx(2.5) -def test_process_cli_uses_explicit_status_filter(monkeypatch: pytest.MonkeyPatch, tmp_path: 
Path) -> None: +def test_process_cli_records_explicit_max_results_missing_pct(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: captured: dict[str, Any] = {} def fake_run_process(options, env_export_map): @@ -2143,8 +1904,6 @@ def fake_run_process(options, env_export_map): str(tmp_path / "runs"), "--output-dir", str(tmp_path / "processed"), - "--status", - "failed", "--max-results-missing-pct", "100", "--dry-run", @@ -2153,8 +1912,6 @@ def fake_run_process(options, env_export_map): assert exit_code == 0 options = captured["options"] - assert options.status_filter == ("failed",) - assert options.processed_with_args["status"] == ["failed"] assert options.max_results_missing_pct == pytest.approx(100.0) @@ -2180,42 +1937,11 @@ def test_process_cli_rejects_negative_max_results_missing_pct( assert "--max-results-missing-pct must be non-negative." in err -def test_process_config_empty_status_uses_default_filter( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - cfg_path = tmp_path / "process.yaml" - cfg_path.write_text( - """ - runs_dir: runs/raw - process: - dir: processed - status: [] - """, - encoding="utf-8", - ) - - captured: dict[str, Any] = {} - - def fake_run_process(options, env_export_map): - captured["options"] = options - return ProcessResult(records_processed=0, rows_processed=0, env_groups=[], env_summaries=[], hf_summary=None) - - monkeypatch.setattr(main, "run_process", fake_run_process) - - exit_code = main.main(["process", "--config", str(cfg_path), "--dry-run"]) - - assert exit_code == 0 - options = captured["options"] - assert options.status_filter == ("completed",) - assert options.processed_with_args["status"] == ["completed"] - - def test_process_cli_runs_embedded_winrate_post_step(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: cfg_path = tmp_path / "process.yaml" cfg_path.write_text( """ - runs_dir: runs/raw + runs_dir: runs/evals process: dir: processed winrate: @@ -2304,7 +2030,7 @@ def 
test_process_cli_defaults_winrate_output_dir_under_processed( cfg_path = tmp_path / "process.yaml" cfg_path.write_text( """ - runs_dir: runs/raw + runs_dir: runs/evals process: dir: processed winrate: @@ -2431,7 +2157,7 @@ def test_process_cli_rejects_invalid_typed_config_values( cfg_path = tmp_path / "process-invalid.yaml" cfg_path.write_text( f""" - runs_dir: runs/raw + runs_dir: runs/evals output_dir: runs/processed {field}: {value} """, @@ -2454,7 +2180,7 @@ def test_process_cli_rejects_removed_top_level_max_run_missing_pct_config_key( cfg_path = tmp_path / "process-removed-top-level.yaml" cfg_path.write_text( """ - runs_dir: runs/raw + runs_dir: runs/evals output_dir: runs/processed max_run_missing_pct: 2.5 """, @@ -2477,7 +2203,7 @@ def test_process_cli_rejects_removed_embedded_max_run_missing_pct_config_key( cfg_path = tmp_path / "process-removed-embedded.yaml" cfg_path.write_text( """ - runs_dir: runs/raw + runs_dir: runs/evals process: dir: processed max_run_missing_pct: 2.5 @@ -2494,6 +2220,29 @@ def test_process_cli_rejects_removed_embedded_max_run_missing_pct_config_key( assert "process.max_results_missing_pct" in err +def test_process_cli_rejects_removed_status_config_key( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + cfg_path = tmp_path / "process-removed-status.yaml" + cfg_path.write_text( + """ + runs_dir: runs/evals + process: + dir: processed + status: [completed] + """, + encoding="utf-8", + ) + + with pytest.raises(SystemExit) as excinfo: + main.main(["process", "--config", str(cfg_path)]) + + assert excinfo.value.code == 2 + err = capsys.readouterr().err + assert "Process config field 'process.status' was removed" in err + + def test_winrate_cli_ignores_removed_process_only_missing_pct_key( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -2577,7 +2326,7 @@ def test_process_cli_allows_cli_override_of_malformed_numeric_config( cfg_path = tmp_path / "process-invalid-override.yaml" cfg_path.write_text( """ - runs_dir: 
runs/raw + runs_dir: runs/evals output_dir: runs/processed max_workers: not-an-int """, diff --git a/tests/test_cli/test_manifest_planner.py b/tests/test_cli/test_manifest_planner.py deleted file mode 100644 index ce44b091..00000000 --- a/tests/test_cli/test_manifest_planner.py +++ /dev/null @@ -1,491 +0,0 @@ -from __future__ import annotations - -import logging -from pathlib import Path - -import pytest - -from medarc_verifiers.cli._job_builder import ResolvedJob -from medarc_verifiers.cli._manifest import RunManifest -from medarc_verifiers.cli._manifest_planner import ManifestPlanner, _find_auto_resume_candidate -from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, ModelConfigSchema - - -def _make_job(job_id: str = "job-a", env_id: str = "env-a", model_id: str = "model-a") -> ResolvedJob: - env = EnvironmentConfigSchema(id=env_id, module=env_id) - model = ModelConfigSchema(id=model_id, model="gpt-4.1-mini") - return ResolvedJob( - job_id=job_id, - name=job_id, - model=model, - env=env, - env_args={}, - sampling_args={}, - ) - - -def _planner( - *, - tmp_path: Path, - jobs: list[ResolvedJob], - config_checksum: str = "abc123", - run_id: str | None = None, - restart_source: str | None = None, - auto_resume: bool = True, - persist: bool = True, -) -> ManifestPlanner: - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", encoding="utf-8") - env_args_map = {job.job_id: {} for job in jobs} - sampling_args_map = {job.job_id: {} for job in jobs} - return ManifestPlanner( - output_dir=tmp_path / "runs", - run_id=run_id, - run_name="demo-run", - config_path=config_path, - config_checksum=config_checksum, - jobs=jobs, - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - restart_source=restart_source, - auto_resume=auto_resume, - persist=persist, - ) - - -def test_restart_in_place_reuses_completed_job(tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", 
encoding="utf-8") - job = _make_job() - env_args_map = {job.job_id: {}} - sampling_args_map = {job.job_id: {}} - run_dir = tmp_path / "runs" / "base-run" - manifest = RunManifest.create( - run_dir=run_dir, - run_id="base-run", - run_name="demo-run", - config_source=config_path, - config_checksum="abc123", - jobs=[job], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - manifest.record_job_completion( - job.job_id, - duration_seconds=1.0, - results_dir=run_dir / job.job_id, - avg_reward=None, - metrics={}, - num_examples=job.env.num_examples, - rollouts_per_example=job.env.rollouts_per_example, - ) - - planner = _planner(tmp_path=tmp_path, jobs=[job], restart_source="base-run") - plan = planner.plan(force_all=False, forced_envs=set()) - - assert plan.manifest.path == manifest.path - assert plan.runnable_job_ids == set() - assert plan.reused_job_ids == {job.job_id} - - -def test_auto_resume_prefers_incomplete_run(tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", encoding="utf-8") - job = _make_job() - env_args_map = {job.job_id: {}} - sampling_args_map = {job.job_id: {}} - output_dir = tmp_path / "runs" - output_dir.mkdir(parents=True, exist_ok=True) - - incomplete_dir = output_dir / "incomplete-run" - RunManifest.create( - run_dir=incomplete_dir, - run_id="incomplete-run", - run_name="demo-run", - config_source=config_path, - config_checksum="abc123", - jobs=[job], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - - complete_dir = output_dir / "complete-run" - complete_manifest = RunManifest.create( - run_dir=complete_dir, - run_id="complete-run", - run_name="demo-run", - config_source=config_path, - config_checksum="abc123", - jobs=[job], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - 
complete_manifest.record_job_completion( - job.job_id, - duration_seconds=1.0, - results_dir=complete_dir / job.job_id, - avg_reward=None, - metrics={}, - num_examples=job.env.num_examples, - rollouts_per_example=job.env.rollouts_per_example, - ) - - candidate = _find_auto_resume_candidate(output_dir, expected_checksum="abc123") - assert candidate == incomplete_dir - - planner = _planner(tmp_path=tmp_path, jobs=[job], auto_resume=True) - plan = planner.plan(force_all=False, forced_envs=set()) - - assert plan.manifest.run_dir == incomplete_dir - assert plan.runnable_job_ids == {job.job_id} - assert plan.reused_job_ids == set() - - -def test_auto_resume_with_checksum_mismatch_raises(tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", encoding="utf-8") - job = _make_job() - env_args_map = {job.job_id: {}} - sampling_args_map = {job.job_id: {}} - run_dir = tmp_path / "runs" / "existing" - RunManifest.create( - run_dir=run_dir, - run_id="existing", - run_name="demo-run", - config_source=config_path, - config_checksum="different", - jobs=[job], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - - planner = _planner(tmp_path=tmp_path, jobs=[job], run_id="existing", auto_resume=True, config_checksum="abc123") - with pytest.raises( - ValueError, - match=( - r"Run 'existing' was created from a different configuration\." 
- r".*--no-auto-resume.*--restart existing" - ), - ): - planner.plan(force_all=False, forced_envs=set()) - - -def test_auto_resume_allows_resume_tolerant_model_fields(tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", encoding="utf-8") - env = EnvironmentConfigSchema(id="env-a", module="env-a") - model = ModelConfigSchema(id="model-a", model="gpt-4.1-mini", max_concurrent=16, timeout=30.0) - job = ResolvedJob( - job_id="job-a", - name="job-a", - model=model, - env=env, - env_args={}, - sampling_args={}, - ) - - env_args_map = {job.job_id: {}} - sampling_args_map = {job.job_id: {}} - run_dir = tmp_path / "runs" / "existing" - manifest = RunManifest.create( - run_dir=run_dir, - run_id="existing", - run_name="demo-run", - config_source=config_path, - config_checksum="abc123", - jobs=[job], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - manifest.record_job_completion( - job.job_id, - duration_seconds=1.0, - results_dir=run_dir / job.job_id, - avg_reward=None, - metrics={}, - num_examples=job.env.num_examples, - rollouts_per_example=job.env.rollouts_per_example, - ) - - planner = _planner(tmp_path=tmp_path, jobs=[job], run_id="existing", auto_resume=True, config_checksum="abc123") - plan = planner.plan(force_all=False, forced_envs=set()) - - assert plan.manifest.run_dir == run_dir - assert plan.runnable_job_ids == set() - - -def test_restart_dir_missing_manifest_raises(tmp_path: Path) -> None: - job = _make_job() - seed_dir = tmp_path / "seed-run" - seed_dir.mkdir(parents=True, exist_ok=True) - planner = _planner(tmp_path=tmp_path, jobs=[job], restart_source=str(seed_dir)) - with pytest.raises(ValueError, match="run_manifest.json"): - planner.plan(force_all=False, forced_envs=set()) - - -def test_auto_resume_allows_provider_overrides(tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", 
encoding="utf-8") - env = EnvironmentConfigSchema(id="env-a", module="env-a") - - job_seed = ResolvedJob( - job_id="job-a", - name="job-a", - model=ModelConfigSchema( - id="model-a", - model="google/gemini-3-pro-preview", - api_base_url="https://api.pinference.ai/api/v1", - api_key_var="PRIME_API_KEY", - headers={"X-Prime-Team-ID": "team-a"}, - ), - env=env, - env_args={}, - sampling_args={}, - ) - - env_args_map = {job_seed.job_id: {}} - sampling_args_map = {job_seed.job_id: {}} - run_dir = tmp_path / "runs" / "existing" - manifest = RunManifest.create( - run_dir=run_dir, - run_id="existing", - run_name="demo-run", - config_source=config_path, - config_checksum="abc123", - jobs=[job_seed], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - manifest.record_job_completion( - job_seed.job_id, - duration_seconds=1.0, - results_dir=run_dir / job_seed.job_id, - avg_reward=None, - metrics={}, - num_examples=job_seed.env.num_examples, - rollouts_per_example=job_seed.env.rollouts_per_example, - ) - - # Same model id, but updated provider settings. These should be resume-tolerant. - job_current = ResolvedJob( - job_id="job-a", - name="job-a", - model=ModelConfigSchema( - id="model-a", - model="google/gemini-3-pro-preview", - api_base_url="https://generativelanguage.googleapis.com/v1beta/openai", - api_key_var="GEMINI_API_KEY", - headers={}, - ), - env=env, - env_args={}, - sampling_args={}, - ) - - planner = _planner( - tmp_path=tmp_path, - jobs=[job_current], - run_id="existing", - auto_resume=True, - config_checksum="abc123", - ) - plan = planner.plan(force_all=False, forced_envs=set()) - - assert plan.manifest.run_dir == run_dir - assert plan.runnable_job_ids == set() - # Auto-resume doesn't populate reused_job_ids (only restart strategies do). 
- assert plan.reused_job_ids == set() - - -def test_restart_in_place_allows_extra_body_usage_override(caplog: pytest.LogCaptureFixture, tmp_path: Path) -> None: - caplog.set_level(logging.WARNING, logger="medarc_verifiers.cli._manifest") - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", encoding="utf-8") - env = EnvironmentConfigSchema(id="env-a", module="env-a") - - job_seed = ResolvedJob( - job_id="job-a", - name="job-a", - model=ModelConfigSchema( - id="model-a", - model="google/gemini-3-pro-preview", - sampling_args={"temperature": 0.2, "extra_body": {"usage": {"include": True}}}, - ), - env=env, - env_args={}, - sampling_args={}, - ) - - env_args_map = {job_seed.job_id: {}} - sampling_args_map = {job_seed.job_id: {}} - run_dir = tmp_path / "runs" / "base-run" - RunManifest.create( - run_dir=run_dir, - run_id="base-run", - run_name="demo-run", - config_source=config_path, - config_checksum="seed", - jobs=[job_seed], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - - # Same model id, but drop/alter extra_body.usage (provider-specific). 
- job_current = ResolvedJob( - job_id="job-a", - name="job-a", - model=ModelConfigSchema( - id="model-a", - model="google/gemini-3-pro-preview", - sampling_args={"temperature": 0.2, "extra_body": {}}, - ), - env=env, - env_args={}, - sampling_args={}, - ) - - planner = _planner( - tmp_path=tmp_path, - jobs=[job_current], - restart_source=str(run_dir), - auto_resume=False, - config_checksum="current", - ) - plan = planner.plan(force_all=False, forced_envs=set()) - - assert plan.manifest.run_dir == run_dir - assert any("sampling_args.extra_body changed" in record.message for record in caplog.records) - - -def test_restart_in_place_allows_sampling_args_override(caplog: pytest.LogCaptureFixture, tmp_path: Path) -> None: - caplog.set_level(logging.WARNING, logger="medarc_verifiers.cli._manifest") - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", encoding="utf-8") - env = EnvironmentConfigSchema(id="env-a", module="env-a") - - job_seed = ResolvedJob( - job_id="job-a", - name="job-a", - model=ModelConfigSchema( - id="model-a", - model="google/gemini-3-pro-preview", - sampling_args={"temperature": 0.2, "top_k": 64}, - ), - env=env, - env_args={}, - sampling_args={}, - ) - - env_args_map = {job_seed.job_id: {}} - sampling_args_map = {job_seed.job_id: {}} - run_dir = tmp_path / "runs" / "base-run" - RunManifest.create( - run_dir=run_dir, - run_id="base-run", - run_name="demo-run", - config_source=config_path, - config_checksum="seed", - jobs=[job_seed], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - - # Same model id, but provider-specific sampling args changed. 
- job_current = ResolvedJob( - job_id="job-a", - name="job-a", - model=ModelConfigSchema( - id="model-a", - model="google/gemini-3-pro-preview", - sampling_args={"temperature": 0.2}, - ), - env=env, - env_args={}, - sampling_args={}, - ) - - planner = _planner( - tmp_path=tmp_path, - jobs=[job_current], - restart_source=str(run_dir), - auto_resume=False, - config_checksum="current", - ) - plan = planner.plan(force_all=False, forced_envs=set()) - - assert plan.manifest.run_dir == run_dir - assert any("sampling_args changed" in record.message for record in caplog.records) - - -def test_restart_in_place_allows_model_namespace_override(tmp_path: Path) -> None: - config_path = tmp_path / "config.yaml" - config_path.write_text("config: test\n", encoding="utf-8") - env = EnvironmentConfigSchema(id="env-a", module="env-a") - - job_seed = ResolvedJob( - job_id="job-a", - name="job-a", - model=ModelConfigSchema( - id="model-a", - model="google/gemini-3-pro-preview", - ), - env=env, - env_args={}, - sampling_args={}, - ) - - env_args_map = {job_seed.job_id: {}} - sampling_args_map = {job_seed.job_id: {}} - run_dir = tmp_path / "runs" / "base-run" - RunManifest.create( - run_dir=run_dir, - run_id="base-run", - run_name="demo-run", - config_source=config_path, - config_checksum="seed", - jobs=[job_seed], - env_args_map=env_args_map, - sampling_args_map=sampling_args_map, - persist=True, - restart_source=None, - ) - - # Same underlying model, but without the provider namespace prefix. 
- job_current = ResolvedJob( - job_id="job-a", - name="job-a", - model=ModelConfigSchema( - id="model-a", - model="gemini-3-pro-preview", - ), - env=env, - env_args={}, - sampling_args={}, - ) - - planner = _planner( - tmp_path=tmp_path, - jobs=[job_current], - restart_source=str(run_dir), - auto_resume=False, - config_checksum="current", - ) - plan = planner.plan(force_all=False, forced_envs=set()) - - assert plan.manifest.run_dir == run_dir diff --git a/tests/test_cli/test_manifest_snapshot.py b/tests/test_cli/test_manifest_snapshot.py deleted file mode 100644 index ccddb9ee..00000000 --- a/tests/test_cli/test_manifest_snapshot.py +++ /dev/null @@ -1,452 +0,0 @@ -from __future__ import annotations - -import json -import os -from pathlib import Path -from typing import Any - -import pytest - -from medarc_verifiers.cli._job_builder import ResolvedJob -from medarc_verifiers.cli._manifest import ( - _ENSURE_JOB_RUNTIME_STATE_FIELDS, - MANIFEST_FILENAME, - MANIFEST_VERSION, - ManifestJobEntry, - RunManifest, - RunManifestModel, - build_job_entry, - compute_snapshot_checksum, - manifest_job_signature, - resolved_job_signature, -) -from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, ModelConfigSchema - -SNAPSHOT_ENV_VAR = "UPDATE_CLI_MANIFEST_SNAPSHOT" -SNAPSHOT_PATH = Path(__file__).parent / "data" / "run_manifest_snapshot.json" - - -def _build_job() -> ResolvedJob: - model = ModelConfigSchema( - id="snapshot-model", - model="gpt-4o-mini", - headers={"X-Test": "one"}, - sampling_args={"max_tokens": 256, "temperature": 0.3}, - env_args={"split": "dev"}, - env_overrides={"snapshot-env": {"temperature": 0.2}}, - ) - env = EnvironmentConfigSchema( - id="snapshot-env", - module="environments.snapshot_env", - num_examples=3, - rollouts_per_example=2, - max_concurrent=4, - independent_scoring=False, - state_columns=["student_answer", "score"], - env_args={"difficulty": "easy", "runner_seed": 99}, - ) - return ResolvedJob( - job_id="snapshot-model-snapshot-env", 
- name="snapshot-eval", - model=model, - env=env, - env_args={"difficulty": "easy", "runner_seed": 99, "split": "dev", "job_seed": 7}, - sampling_args={"max_tokens": 256, "temperature": 0.3, "eval_seed": 17}, - ) - - -def _normalize_manifest(payload: Any, *, base_dir: Path) -> Any: - base_posix = base_dir.as_posix() - base_native = str(base_dir) - - if isinstance(payload, dict): - return {key: _normalize_manifest(value, base_dir=base_dir) for key, value in payload.items()} - if isinstance(payload, list): - return [_normalize_manifest(item, base_dir=base_dir) for item in payload] - if isinstance(payload, str): - return payload.replace(base_posix, "").replace(base_native, "") - return payload - - -def test_run_manifest_snapshot(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - job = _build_job() - monkeypatch.setattr("medarc_verifiers.cli._manifest.timestamp", lambda: "2024-03-01T00:00:00Z") - - run_dir = tmp_path / "snapshot-run" - snapshot_cfg = { - "models": {"snapshot-model": {"model": "gpt-4o-mini"}}, - "envs": {"snapshot-env": {"module": "environments.snapshot_env"}}, - "jobs": [{"model": "snapshot-model", "env": "snapshot-env"}], - } - manifest = RunManifest.create( - run_dir=run_dir, - run_id="snapshot-run", - run_name="Snapshot Run", - config_source=Path("configs/snapshot.yaml"), - config_checksum=compute_snapshot_checksum(snapshot_cfg), - jobs=[job], - env_args_map={job.job_id: job.env_args}, - sampling_args_map={job.job_id: job.sampling_args}, - persist=True, - restart_source="baseline-run", - ) - - manifest_path = manifest.path - assert manifest_path.name == MANIFEST_FILENAME - payload = json.loads(manifest_path.read_text(encoding="utf-8")) - normalized = _normalize_manifest(payload, base_dir=tmp_path) - - SNAPSHOT_PATH.parent.mkdir(parents=True, exist_ok=True) - serialized = json.dumps(normalized, indent=2, sort_keys=True) + "\n" - - if os.environ.get(SNAPSHOT_ENV_VAR): - SNAPSHOT_PATH.write_text(serialized, encoding="utf-8") - - expected = 
json.loads(SNAPSHOT_PATH.read_text(encoding="utf-8")) - assert normalized == expected - - loaded = RunManifest.load(manifest_path, persist=False) - assert loaded.model.config_checksum == expected["config_checksum"] - assert loaded.jobs[0].status == "pending" - - -def test_manifest_load_upgrades_interleave_scoring(tmp_path: Path) -> None: - """Older manifests may store interleave_scoring in env_templates; load should upgrade it.""" - manifest_path = tmp_path / "run_manifest.json" - payload = { - "version": 2, - "run_id": "demo", - "name": "Demo", - "config_source": "configs/demo.yaml", - "config_checksum": "abc", - "created_at": "2024-03-01T00:00:00Z", - "updated_at": "2024-03-01T00:00:00Z", - "models": {}, - "env_templates": { - "env:template": { - "module": "environments.snapshot_env", - "num_examples": 3, - "rollouts_per_example": 2, - "interleave_scoring": False, - } - }, - "jobs": [], - "summary": {}, - } - manifest_path.write_text(json.dumps(payload), encoding="utf-8") - - loaded = RunManifest.load(manifest_path, persist=False) - template = loaded.model.env_templates["env:template"] - assert "interleave_scoring" not in template - assert template["independent_scoring"] is False - - -def test_manifest_serialization_prunes_nones_and_relativizes(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - job = _build_job() - fake_root = tmp_path / "repo" - fake_root.mkdir() - run_dir = fake_root / "runs" / "phase5" - - def fake_to_project_relative(path: Path | str, *, default_base: Path | None = None) -> str: - resolved = Path(path).resolve() - base = fake_root if default_base is None else default_base - return resolved.relative_to(base).as_posix() - - monkeypatch.setattr("medarc_verifiers.utils.pathing.project_root", lambda: fake_root) - monkeypatch.setattr("medarc_verifiers.utils.pathing.to_project_relative", fake_to_project_relative) - - snapshot_cfg = { - "models": {"snapshot-model": {"model": "gpt-4o-mini"}}, - "envs": {"snapshot-env": {"module": 
"environments.snapshot_env"}}, - "jobs": [{"model": "snapshot-model", "env": "snapshot-env"}], - } - manifest = RunManifest.create( - run_dir=run_dir, - run_id="phase5", - run_name="Phase 5 Run", - config_source=fake_root / "configs" / "phase5.yaml", - config_checksum=compute_snapshot_checksum(snapshot_cfg), - jobs=[job], - env_args_map={job.job_id: job.env_args}, - sampling_args_map={job.job_id: job.sampling_args}, - ) - - payload = json.loads(manifest.path.read_text(encoding="utf-8")) - job_payload = payload["jobs"][0] - - assert "results_dir" not in job_payload - assert "reason" not in job_payload - assert "avg_reward" not in job_payload - assert job_payload["env_args"]["job_seed"] == 7 - assert job_payload["sampling_args"]["eval_seed"] == 17 - - -def test_manifest_job_signature_is_stable(tmp_path: Path) -> None: - job = _build_job() - run_dir = tmp_path / "sig-run" - manifest = RunManifest.create( - run_dir=run_dir, - run_id="sig-run", - run_name="Signature Run", - config_source=Path("configs/sig.yaml"), - config_checksum="sig", - jobs=[job], - env_args_map={job.job_id: job.env_args}, - sampling_args_map={job.job_id: job.sampling_args}, - persist=False, - ) - entry = manifest.jobs[0] - - signature = manifest_job_signature(manifest.model, entry) - assert signature == { - "model": { - "id": "snapshot-model", - "model": "gpt-4o-mini", - "sampling_args": {"max_tokens": 256, "temperature": 0.3}, - "env_args": {"split": "dev"}, - "env_overrides": {"snapshot-env": {"temperature": 0.2}}, - }, - "env": { - "module": "environments.snapshot_env", - "num_examples": 3, - "rollouts_per_example": 2, - "max_concurrent": 4, - "independent_scoring": False, - "state_columns": ["student_answer", "score"], - "print_results": False, - "rerun": False, - "id": "snapshot-env", - "env_args": {"difficulty": "easy", "runner_seed": 99, "split": "dev", "job_seed": 7}, - }, - "sampling_args": {"max_tokens": 256, "temperature": 0.3, "eval_seed": 17}, - } - - -def 
test_resolved_job_signature_is_stable() -> None: - job = _build_job() - - signature = resolved_job_signature(job, env_args=job.env_args, sampling_args=job.sampling_args) - assert signature == { - "model": { - "id": "snapshot-model", - "model": "gpt-4o-mini", - "sampling_args": {"max_tokens": 256, "temperature": 0.3}, - "env_args": {"split": "dev"}, - "env_overrides": {"snapshot-env": {"temperature": 0.2}}, - }, - "env": { - "module": "environments.snapshot_env", - "num_examples": 3, - "rollouts_per_example": 2, - "max_concurrent": 4, - "independent_scoring": False, - "state_columns": ["student_answer", "score"], - "print_results": False, - "rerun": False, - "id": "snapshot-env", - "env_args": {"difficulty": "easy", "runner_seed": 99, "split": "dev", "job_seed": 7}, - }, - "sampling_args": {"max_tokens": 256, "temperature": 0.3, "eval_seed": 17}, - } - - -def test_build_job_entry_is_stable() -> None: - job = _build_job() - entry = build_job_entry(job, env_args=job.env_args, sampling_args=job.sampling_args, results_dir=None) - assert entry.model_dump() == { - "job_id": "snapshot-model-snapshot-env", - "env_id": "environments.snapshot_env", - "model_id": "snapshot-model", - "env_template_id": "environments.snapshot_env:6ef485576891", - "env_variant_id": "snapshot-env", - "env_args": {"difficulty": "easy", "runner_seed": 99, "split": "dev", "job_seed": 7}, - "sampling_args": {"max_tokens": 256, "temperature": 0.3, "eval_seed": 17}, - "status": "pending", - "reason": None, - "attempt": 0, - "started_at": None, - "ended_at": None, - "duration_seconds": None, - "results_dir": None, - "results_relpath": "snapshot-model-snapshot-env/results.jsonl", - "metadata_relpath": "snapshot-model-snapshot-env/metadata.json", - "row_count": None, - "metrics": None, - "avg_reward": None, - "num_examples": None, - "rollouts_per_example": None, - } - - -def test_resolved_job_signature_ignores_resume_tolerant_fields() -> None: - base_job = _build_job() - model_variant = 
base_job.model.model_copy(update={"api_key_var": "ALT_KEY"}) - variant_job = ResolvedJob( - job_id=base_job.job_id, - name=base_job.name, - model=model_variant, - env=base_job.env, - env_args=base_job.env_args, - sampling_args=base_job.sampling_args, - sleep=base_job.sleep, - ) - - base_sig = resolved_job_signature(base_job, env_args=base_job.env_args, sampling_args=base_job.sampling_args) - variant_sig = resolved_job_signature( - variant_job, env_args=variant_job.env_args, sampling_args=variant_job.sampling_args - ) - - assert base_sig == variant_sig - - -def test_ensure_job_preserves_runtime_fields_on_update(tmp_path: Path) -> None: - seed_job = _build_job() - run_dir = tmp_path / "runtime-run" - manifest = RunManifest.create( - run_dir=run_dir, - run_id="runtime-run", - run_name="Runtime Run", - config_source=Path("configs/runtime.yaml"), - config_checksum="runtime", - jobs=[seed_job], - env_args_map={seed_job.job_id: seed_job.env_args}, - sampling_args_map={seed_job.job_id: seed_job.sampling_args}, - persist=False, - ) - manifest.record_job_completion( - seed_job.job_id, - duration_seconds=3.5, - results_dir=run_dir / seed_job.job_id, - avg_reward=0.75, - metrics={"pass_rate": 0.75}, - num_examples=12, - rollouts_per_example=2, - ) - entry_before = manifest.job_entry(seed_job.job_id) - assert entry_before is not None - entry_before.row_count = 4 - assert set(_ENSURE_JOB_RUNTIME_STATE_FIELDS) == { - "status", - "reason", - "attempt", - "started_at", - "ended_at", - "duration_seconds", - "row_count", - "metrics", - "avg_reward", - "num_examples", - "rollouts_per_example", - } - before_runtime = { - "status": entry_before.status, - "reason": entry_before.reason, - "attempt": entry_before.attempt, - "started_at": entry_before.started_at, - "ended_at": entry_before.ended_at, - "duration_seconds": entry_before.duration_seconds, - "row_count": entry_before.row_count, - "metrics": entry_before.metrics, - "avg_reward": entry_before.avg_reward, - "num_examples": 
entry_before.num_examples, - "rollouts_per_example": entry_before.rollouts_per_example, - } - - updated_job = ResolvedJob( - job_id=seed_job.job_id, - name=seed_job.name, - model=seed_job.model, - env=seed_job.env, - env_args={**seed_job.env_args, "job_seed": 999}, - sampling_args={**seed_job.sampling_args, "eval_seed": 999}, - sleep=seed_job.sleep, - ) - manifest.ensure_job( - updated_job, - env_args=updated_job.env_args, - sampling_args=updated_job.sampling_args, - results_dir=run_dir / updated_job.job_id, - ) - - entry_after = manifest.job_entry(seed_job.job_id) - assert entry_after is not None - after_runtime = { - "status": entry_after.status, - "reason": entry_after.reason, - "attempt": entry_after.attempt, - "started_at": entry_after.started_at, - "ended_at": entry_after.ended_at, - "duration_seconds": entry_after.duration_seconds, - "row_count": entry_after.row_count, - "metrics": entry_after.metrics, - "avg_reward": entry_after.avg_reward, - "num_examples": entry_after.num_examples, - "rollouts_per_example": entry_after.rollouts_per_example, - } - - assert before_runtime == after_runtime - - -def test_ensure_job_preserves_entry_object_identity(tmp_path: Path) -> None: - seed_job = _build_job() - run_dir = tmp_path / "identity-run" - manifest = RunManifest.create( - run_dir=run_dir, - run_id="identity-run", - run_name="Identity Run", - config_source=Path("configs/identity.yaml"), - config_checksum="identity", - jobs=[seed_job], - env_args_map={seed_job.job_id: seed_job.env_args}, - sampling_args_map={seed_job.job_id: seed_job.sampling_args}, - persist=False, - ) - entry_before = manifest.job_entry(seed_job.job_id) - assert entry_before is not None - - updated_job = ResolvedJob( - job_id=seed_job.job_id, - name=seed_job.name, - model=seed_job.model, - env=seed_job.env, - env_args={**seed_job.env_args, "job_seed": 111}, - sampling_args={**seed_job.sampling_args, "eval_seed": 111}, - sleep=seed_job.sleep, - ) - manifest.ensure_job( - updated_job, - 
env_args=updated_job.env_args, - sampling_args=updated_job.sampling_args, - results_dir=run_dir / updated_job.job_id, - ) - entry_after = manifest.job_entry(seed_job.job_id) - assert entry_after is not None - assert entry_before is entry_after - assert entry_before.env_args["job_seed"] == 111 - - -def test_manifest_job_signature_does_not_fallback_module_to_variant_id() -> None: - model = RunManifestModel( - version=MANIFEST_VERSION, - run_id="r", - name="n", - config_source="cfg.yaml", - config_checksum="x", - created_at="2024-01-01T00:00:00Z", - updated_at="2024-01-01T00:00:00Z", - models={}, - env_templates={"template-no-module": {}}, - jobs=[], - summary={}, - ) - entry = ManifestJobEntry( - job_id="job-x", - env_id=None, - model_id="missing-model", - env_template_id="template-no-module", - env_variant_id="variant-x", - env_args={}, - ) - signature = manifest_job_signature(model, entry) - assert "module" not in signature["env"] - assert signature["env"]["id"] == "variant-x" diff --git a/tests/test_cli/test_manifest_tools.py b/tests/test_cli/test_manifest_tools.py deleted file mode 100644 index 4274fb1e..00000000 --- a/tests/test_cli/test_manifest_tools.py +++ /dev/null @@ -1,151 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path - -from medarc_verifiers.cli._manifest_tools import validate_manifests_in_runs - - -def _write_json(path: Path, payload: dict) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload), encoding="utf-8") - - -def _write_manifest( - run_dir: Path, - *, - num_examples: int | None = None, - rollouts_per_example: int | None = None, -) -> None: - payload = { - "version": 3, - "run_id": "demo-run", - "name": "demo", - "config_source": "cfg.yaml", - "config_checksum": "x", - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "artifacts_root": ".", - "models": {}, - "env_templates": {}, - "jobs": [ - { - "job_id": "job-1", - "model_id": "m", - 
"env_id": "e", - "env_template_id": "e:t", - "env_variant_id": "e", - "env_args": {}, - "results_relpath": "job-1/results.jsonl", - "metadata_relpath": "job-1/metadata.json", - "status": "completed", - "num_examples": num_examples, - "rollouts_per_example": rollouts_per_example, - } - ], - "summary": {"total": 1, "completed": 1, "pending": 0, "failed": 0, "running": 0, "skipped": 0}, - } - _write_json(run_dir / "run_manifest.json", payload) - - -def test_validate_manifests_reports_broken_paths(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" / "raw" - run_dir = runs_dir / "demo-run" - job_dir = run_dir / "job-1" - _write_json(job_dir / "metadata.json", {"env_id": "demo"}) - (job_dir / "results.jsonl").write_text('{"example_id": 1}\n', encoding="utf-8") - - payload = { - "version": 3, - "run_id": "demo-run", - "name": "demo", - "config_source": "cfg.yaml", - "config_checksum": "x", - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "artifacts_root": ".", - "models": {}, - "env_templates": {}, - "jobs": [ - { - "job_id": "job-1", - "model_id": "m", - "env_id": "e", - "env_template_id": "e:t", - "env_variant_id": "e", - "env_args": {}, - "results_relpath": "broken/job-1/results.jsonl", - "status": "completed", - } - ], - "summary": {"total": 1, "completed": 1, "pending": 0, "failed": 0, "running": 0, "skipped": 0}, - } - _write_json(run_dir / "run_manifest.json", payload) - - result = validate_manifests_in_runs(runs_dir, strict=False) - assert result.manifests_checked == 1 - assert result.jobs_checked == 1 - assert any(issue.kind == "warning" and "fallback" in issue.message.lower() for issue in result.issues) - - -def test_validate_manifests_accepts_partial_rollout_file(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" / "raw" - run_dir = runs_dir / "demo-run" - job_dir = run_dir / "job-1" - _write_json(job_dir / "metadata.json", {"env_id": "demo"}) - (job_dir / "results.jsonl").write_text( - "\n".join( - [ - '{"example_id": 
1, "rollout_index": 0}', - '{"example_id": 2, "rollout_index": 0}', - '{"example_id": 1, "rollout_index": 1}', - '{"example_id": 2, "rollout_index": 1}', - '{"example_id": 1, "rollout_index": 2}', - ] - ) - + "\n", - encoding="utf-8", - ) - _write_manifest(run_dir, num_examples=2, rollouts_per_example=3) - - result = validate_manifests_in_runs(runs_dir, strict=False) - - assert result.manifests_checked == 1 - assert result.jobs_checked == 1 - assert result.issues == [] - - -def test_validate_manifests_reports_out_of_range_rollout_index(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" / "raw" - run_dir = runs_dir / "demo-run" - job_dir = run_dir / "job-1" - _write_json(job_dir / "metadata.json", {"env_id": "demo"}) - (job_dir / "results.jsonl").write_text( - "\n".join( - [ - '{"example_id": 1, "rollout_index": 0}', - '{"example_id": 2, "rollout_index": 0}', - '{"example_id": 1, "rollout_index": 3}', - ] - ) - + "\n", - encoding="utf-8", - ) - _write_manifest(run_dir, num_examples=2, rollouts_per_example=3) - - result = validate_manifests_in_runs(runs_dir, strict=False) - - assert any("out-of-range rollout_index" in issue.message for issue in result.issues) - - -def test_validate_manifests_reports_malformed_last_jsonl_row(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" / "raw" - run_dir = runs_dir / "demo-run" - job_dir = run_dir / "job-1" - _write_json(job_dir / "metadata.json", {"env_id": "demo"}) - (job_dir / "results.jsonl").write_text('{"example_id": 1}\n{"example_id": ', encoding="utf-8") - _write_manifest(run_dir, num_examples=1, rollouts_per_example=1) - - result = validate_manifests_in_runs(runs_dir, strict=False) - - assert any("failed to parse last JSONL row" in issue.message for issue in result.issues) diff --git a/tests/test_cli/test_process_discovery.py b/tests/test_cli/test_process_discovery.py index a41a6bed..c4ef7f7d 100644 --- a/tests/test_cli/test_process_discovery.py +++ b/tests/test_cli/test_process_discovery.py @@ -3,7 +3,8 @@ 
import json from pathlib import Path -from medarc_verifiers.cli.process.discovery import RunManifestInfo, discover_run_records +from medarc_verifiers.cli.process.discovery import discover_run_records +from medarc_verifiers.cli.process.metadata import load_normalized_metadata def _write_json(path: Path, payload: dict) -> None: @@ -11,241 +12,178 @@ def _write_json(path: Path, payload: dict) -> None: path.write_text(json.dumps(payload), encoding="utf-8") -def _base_manifest( - job_payloads: list[dict], - *, - models: dict | None = None, - env_templates: dict | None = None, -) -> dict: - return { - "version": 3, - "run_id": "job-run-123", - "name": "example-run", - "config_source": "configs/example.yaml", - "config_checksum": "abc123", - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:05:00Z", - "artifacts_root": ".", - "models": models or {}, - "env_templates": env_templates or {}, - "jobs": job_payloads, - "summary": {"completed": 1}, - } - - -def _manifest_info(*, completed: int, total: int, total_known: bool) -> RunManifestInfo: - return RunManifestInfo( - job_run_id="job-run-123", - run_name="example-run", - summary_completed=completed, - summary_total=total, - summary_total_known=total_known, - manifest_path=Path("/tmp/run_manifest.json"), - run_dir=Path("/tmp/job-run-123"), - created_at="2024-01-01T00:00:00Z", - updated_at="2024-01-01T00:05:00Z", - config_source="configs/example.yaml", - config_checksum="abc123", - run_summary_path=Path("/tmp/run_summary.json"), - ) - - -def test_discover_run_records_basic(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" - run_dir = runs_dir / "job-run-123" - results_dir = run_dir / "model-env-job" - - manifest_payload = _base_manifest( - [ - { - "job_id": "model-env-job", - "job_name": "demo-job", - "model_id": "gpt-4", - "env_id": "demo-env-module", - "env_template_id": "demo-env-template", - "env_variant_id": "demo-env", - "env_args": {"fold": "dev"}, - "results_relpath": 
"model-env-job/results.jsonl", - "metadata_relpath": "model-env-job/metadata.json", - "status": "completed", - "started_at": "2024-01-01T00:00:30Z", - "ended_at": "2024-01-01T00:01:00Z", - "avg_reward": 0.75, - "num_examples": 10, - "rollouts_per_example": 2, - "row_count": 20, - } - ], - models={"gpt-4": {"sampling_args": {"temperature": 0.2}}}, - env_templates={"demo-env-template": {"module": "demo-env-module"}}, - ) - _write_json(run_dir / "run_manifest.json", manifest_payload) - +def _write_eval_output(path: Path, metadata: dict | None = None, *, rows: list[dict] | None = None) -> None: _write_json( - run_dir / "run_summary.json", + path / "metadata.json", { - "jobs": [ - { - "job_id": "model-env-job", - "status": "succeeded", - "duration_seconds": 12.5, - "error": None, - } - ] + "env_id": "medqa", + "model": "gpt-5-mini", + "env_args": {"split": "test"}, + "sampling_args": {"temperature": 0}, + "num_examples": 1, + "rollouts_per_example": 1, + "avg_reward": 1.0, + **(metadata or {}), }, ) + result_rows = rows if rows is not None else [{"example_id": "ex-1", "reward": 1.0}] + with (path / "results.jsonl").open("w", encoding="utf-8") as handle: + for row in result_rows: + handle.write(json.dumps(row) + "\n") + + +def test_discover_run_records_includes_deterministic_base_layout(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base" + _write_eval_output(output_dir) + _write_json(output_dir / "summary.json", {"env_id": "medqa"}) - _write_json(results_dir / "metadata.json", {"env_id": "demo-env"}) - (results_dir / "results.jsonl").write_text("{}", encoding="utf-8") - _write_json(results_dir / "summary.json", {"env_id": "demo-env"}) + records = discover_run_records(evals_dir, filter_status=("completed",)) - records = discover_run_records(runs_dir) assert len(records) == 1 record = records[0] - assert record.status == "succeeded" - assert record.duration_seconds == 12.5 + assert record.status == 
"completed" + assert record.model_id == "gpt-5-mini" + assert record.manifest_env_id == "medqa" + assert record.results_dir == output_dir assert record.has_metadata is True assert record.has_results is True assert record.has_summary is True - assert record.env_args == {"fold": "dev"} - assert record.sampling_args == {"temperature": 0.2} - assert record.avg_reward == 0.75 - assert record.row_count == 20 - assert record.manifest.job_run_id == "job-run-123" - - -def test_discover_run_records_filters_status(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" - run_dir = runs_dir / "job-run-123" - results_dir = run_dir / "model-env-job" - - manifest_payload = _base_manifest( - [ - { - "job_id": "model-env-job", - "model_id": "gpt-4", - "env_id": "demo-env-module", - "env_template_id": "demo-env-template", - "env_variant_id": "demo-env", - "env_args": {}, - "results_relpath": "model-env-job/results.jsonl", - } - ], - models={"gpt-4": {"sampling_args": {}}}, - env_templates={"demo-env-template": {"module": "demo-env-module"}}, - ) - _write_json(run_dir / "run_manifest.json", manifest_payload) - _write_json( - run_dir / "run_summary.json", - {"jobs": [{"job_id": "model-env-job", "status": "failed", "error": "boom"}]}, - ) - results_dir.mkdir(parents=True, exist_ok=True) - (results_dir / "results.jsonl").write_text("{}", encoding="utf-8") - - filtered = discover_run_records(runs_dir, filter_status=("failed",)) - assert len(filtered) == 1 - assert filtered[0].status == "failed" - - filtered_none = discover_run_records(runs_dir, filter_status=("succeeded",)) - assert filtered_none == [] - - -def test_discover_run_records_missing_summary_uses_manifest_status(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" - run_dir = runs_dir / "job-run-123" - results_dir = run_dir / "model-env-job" - - manifest_payload = _base_manifest( - [ - { - "job_id": "model-env-job", - "status": "completed", - "reason": "cached", - "model_id": "gpt-4", - "env_id": "demo-env-module", - 
"env_template_id": "demo-env-template", - "env_variant_id": "demo-env", - "env_args": {}, - "results_relpath": "model-env-job/results.jsonl", - } - ], - models={"gpt-4": {"sampling_args": {}}}, - env_templates={"demo-env-template": {"module": "demo-env-module"}}, - ) - _write_json(run_dir / "run_manifest.json", manifest_payload) + assert record.env_args == {"split": "test"} + assert record.sampling_args == {"temperature": 0} + assert record.avg_reward == 1.0 + assert record.row_count == 1 + assert record.manifest.job_run_id == "gpt-5-mini::medqa::base" + normalized = load_normalized_metadata(record) + assert normalized.variant_id == "base" + - results_dir.mkdir(parents=True, exist_ok=True) - (results_dir / "results.jsonl").write_text("{}", encoding="utf-8") +def test_discover_run_records_includes_deterministic_eval_variants(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + variant_id = "env_args.shuffle_seed-1618" + output_dir = evals_dir / "gpt-5-mini" / "medqa" / variant_id + _write_eval_output(output_dir) + + records = discover_run_records(evals_dir, filter_status=("completed",)) + + assert len(records) == 1 + normalized = load_normalized_metadata(records[0]) + assert normalized.variant_id == variant_id + assert normalized.variant_payload is None + assert normalized.medarc_config_fingerprint is None + assert normalized.medarc_config_fingerprint_payload is None + + +def test_discover_run_records_preserves_path_safe_variant_identity(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + variant_id = "name.with-safe_chars-123" + output_dir = evals_dir / "gpt-5-mini" / "foo--bar" / variant_id + _write_eval_output(output_dir, {"env_id": "foo--bar", "model": "gpt-5-mini"}) + + records = discover_run_records(evals_dir, filter_status=("completed",)) - records = discover_run_records(runs_dir) assert len(records) == 1 record = records[0] - assert record.status == "completed" - assert record.reason == "cached" - assert record.has_summary is 
False - - -def test_discover_run_records_respects_artifacts_root(tmp_path: Path, monkeypatch) -> None: - runs_dir = tmp_path / "runs_llm_judge" / "raw" - run_dir = runs_dir / "job-run-123" - artifacts_dir = run_dir / "artifacts" - results_dir = artifacts_dir / "model-env-job" - - manifest_payload = _base_manifest( - [ - { - "job_id": "model-env-job", - "model_id": "gpt-4", - "env_id": "demo-env-module", - "env_template_id": "demo-env-template", - "env_variant_id": "demo-env", - "env_args": {}, - "results_relpath": "model-env-job/results.jsonl", - "metadata_relpath": "model-env-job/metadata.json", - "status": "completed", - } - ], - models={"gpt-4": {"sampling_args": {"temperature": 0.2}}}, - env_templates={"demo-env-template": {"module": "demo-env-module"}}, - ) - manifest_payload["artifacts_root"] = "artifacts" - _write_json(run_dir / "run_manifest.json", manifest_payload) + assert record.model_id == "gpt-5-mini" + assert record.manifest_env_id == "foo--bar" + normalized = load_normalized_metadata(record) + assert normalized.variant_id == variant_id + + +def test_discover_run_records_includes_direct_upstream_uuid_outputs(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + run_id = "016f4b4a-92a4-4a5b-a7c1-853af3318c52" + upstream_dir = evals_dir / "medqa--gpt-5-mini" / run_id + _write_eval_output(upstream_dir) - results_dir.mkdir(parents=True, exist_ok=True) - _write_json(results_dir / "metadata.json", {"env_id": "demo-env"}) - (results_dir / "results.jsonl").write_text("{}", encoding="utf-8") + records = discover_run_records(evals_dir, filter_status=("completed",)) - records = discover_run_records(runs_dir) assert len(records) == 1 - assert records[0].has_results is True - - -def test_discover_run_records_fallbacks_to_job_dir_when_results_relpath_is_broken(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" / "raw" - run_dir = runs_dir / "job-run-123" - job_dir = run_dir / "model-env-job" - - manifest_payload = _base_manifest( - [ - { - 
"job_id": "model-env-job", - "model_id": "gpt-4", - "env_id": "demo-env-module", - "env_template_id": "demo-env-template", - "env_variant_id": "demo-env", - "env_args": {}, - "results_relpath": "wrong-dir/results.jsonl", - "status": "completed", - } - ], - models={"gpt-4": {"sampling_args": {}}}, - env_templates={"demo-env-template": {"module": "demo-env-module"}}, + record = records[0] + assert record.model_id == "gpt-5-mini" + assert record.manifest_env_id == "medqa" + assert record.manifest.job_run_id == run_id + normalized = load_normalized_metadata(record) + assert normalized.variant_id is None + + +def test_discover_run_records_skips_missing_metadata(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base" + output_dir.mkdir(parents=True) + (output_dir / "results.jsonl").write_text('{"example_id":"ex-1"}\n', encoding="utf-8") + + assert discover_run_records(evals_dir, filter_status=("completed",)) == [] + + +def test_discover_run_records_skips_invalid_metadata(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base" + output_dir.mkdir(parents=True) + (output_dir / "metadata.json").write_text("not json", encoding="utf-8") + (output_dir / "results.jsonl").write_text('{"example_id":"ex-1"}\n', encoding="utf-8") + + assert discover_run_records(evals_dir, filter_status=("completed",)) == [] + + +def test_discover_run_records_skips_metadata_only_directory(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + _write_json( + evals_dir / "gpt-5-mini" / "medqa" / "base" / "metadata.json", + {"env_id": "medqa", "model": "gpt-5-mini"}, ) - _write_json(run_dir / "run_manifest.json", manifest_payload) - _write_json(job_dir / "metadata.json", {"env_id": "demo-env"}) - (job_dir / "results.jsonl").write_text("{}", encoding="utf-8") - records = discover_run_records(runs_dir) + assert discover_run_records(evals_dir, 
filter_status=("completed",)) == [] + + +def test_discover_run_records_counts_empty_results_candidate(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base" + _write_json(output_dir / "metadata.json", {"env_id": "medqa", "model": "gpt-5-mini"}) + (output_dir / "results.jsonl").write_text("", encoding="utf-8") + + records = discover_run_records(evals_dir, filter_status=("completed",)) + + assert len(records) == 1 + assert records[0].row_count == 0 + + +def test_discover_run_records_counts_invalid_jsonl_candidate_for_later_row_validation(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base" + _write_json(output_dir / "metadata.json", {"env_id": "medqa", "model": "gpt-5-mini"}) + (output_dir / "results.jsonl").write_text("{not json}\n", encoding="utf-8") + + records = discover_run_records(evals_dir, filter_status=("completed",)) + assert len(records) == 1 - assert records[0].has_results is True - assert records[0].has_metadata is True + assert records[0].row_count == 1 + + +def test_discover_run_records_filters_current_output_status(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + _write_eval_output(evals_dir / "gpt-5-mini" / "medqa" / "base") + + assert len(discover_run_records(evals_dir, filter_status=("completed",))) == 1 + assert discover_run_records(evals_dir, filter_status=("failed",)) == [] + + +def test_discover_run_records_parent_baseline_and_child_variant_once(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + baseline_dir = evals_dir / "gpt-5-mini" / "medqa" + variant_dir = baseline_dir / "env_args.shuffle_seed-1618" + _write_eval_output(baseline_dir) + _write_eval_output(variant_dir) + + records = discover_run_records(evals_dir, filter_status=("completed",)) + + assert len(records) == 2 + assert {record.results_dir for record in records} == {baseline_dir, variant_dir} + + +def 
test_discover_run_records_scans_only_provided_root(tmp_path: Path) -> None: + evals_dir = tmp_path / "runs" / "evals" + raw_dir = tmp_path / "runs" / "raw" + _write_eval_output(evals_dir / "gpt-5-mini" / "medqa" / "base") + + assert discover_run_records(raw_dir, filter_status=("completed",)) == [] diff --git a/tests/test_cli/test_process_metadata.py b/tests/test_cli/test_process_metadata.py index e69b8d46..4dc1113e 100644 --- a/tests/test_cli/test_process_metadata.py +++ b/tests/test_cli/test_process_metadata.py @@ -1,11 +1,8 @@ from __future__ import annotations import json -import logging from pathlib import Path -import pytest - from medarc_verifiers.cli.process.discovery import RunManifestInfo, RunRecord from medarc_verifiers.cli.process.metadata import load_normalized_metadata @@ -76,7 +73,7 @@ def _make_record( return record -def test_load_normalized_metadata_prefers_manifest_fields(tmp_path: Path) -> None: +def test_load_normalized_metadata_prefers_metadata_fields(tmp_path: Path) -> None: record = _make_record( tmp_path, env_args={"difficulty": "hard"}, @@ -102,9 +99,9 @@ def test_load_normalized_metadata_prefers_manifest_fields(tmp_path: Path) -> Non assert normalized.manifest_env_id == "demo-env-rollout3" assert normalized.base_env_id == "demo-env" assert normalized.rollout_index == 3 - assert normalized.env_args == {"difficulty": "hard", "split": "dev"} - assert normalized.sampling_args == {"temperature": 0.1, "top_p": 0.95} - assert normalized.num_examples == 10 + assert normalized.env_args == {"difficulty": "easy", "split": "dev"} + assert normalized.sampling_args == {"temperature": 0.9, "top_p": 0.95} + assert normalized.num_examples == 20 assert normalized.rollouts_per_example == 2 assert normalized.model_id == "gpt-4o" assert normalized.metadata_model == "gpt-4o-mini" @@ -220,13 +217,13 @@ def test_load_normalized_metadata_validation_failure_sanitizes_raw_metadata(tmp_ } -def 
test_load_normalized_metadata_keeps_zero_num_examples_from_manifest(tmp_path: Path) -> None: +def test_load_normalized_metadata_keeps_zero_num_examples_from_metadata(tmp_path: Path) -> None: record = _make_record(tmp_path, manifest_env_id="demo-env", num_examples=0, rollouts_per_example=1) _write_json( record.metadata_path, { "env_id": "demo-env", - "num_examples": 20, + "num_examples": 0, "rollouts_per_example": 3, }, ) @@ -234,33 +231,33 @@ def test_load_normalized_metadata_keeps_zero_num_examples_from_manifest(tmp_path normalized = load_normalized_metadata(record) assert normalized.num_examples == 0 - assert normalized.rollouts_per_example == 1 + assert normalized.rollouts_per_example == 3 -def test_load_normalized_metadata_keeps_zero_rollouts_from_manifest(tmp_path: Path) -> None: +def test_load_normalized_metadata_keeps_zero_rollouts_from_metadata(tmp_path: Path) -> None: record = _make_record(tmp_path, manifest_env_id="demo-env", num_examples=10, rollouts_per_example=0) _write_json( record.metadata_path, { "env_id": "demo-env", "num_examples": 20, - "rollouts_per_example": 3, + "rollouts_per_example": 0, }, ) normalized = load_normalized_metadata(record) - assert normalized.num_examples == 10 + assert normalized.num_examples == 20 assert normalized.rollouts_per_example == 0 -def test_load_normalized_metadata_keeps_all_examples_sentinel_from_manifest(tmp_path: Path) -> None: +def test_load_normalized_metadata_keeps_all_examples_sentinel_from_metadata(tmp_path: Path) -> None: record = _make_record(tmp_path, manifest_env_id="demo-env", num_examples=-1, rollouts_per_example=1) _write_json( record.metadata_path, { "env_id": "demo-env", - "num_examples": 20, + "num_examples": -1, "rollouts_per_example": 3, }, ) @@ -268,47 +265,4 @@ def test_load_normalized_metadata_keeps_all_examples_sentinel_from_manifest(tmp_ normalized = load_normalized_metadata(record) assert normalized.num_examples == -1 - assert normalized.rollouts_per_example == 1 - - -def 
test_load_normalized_metadata_warns_on_avg_reward_and_num_examples_mismatch( - tmp_path: Path, - caplog: pytest.LogCaptureFixture, -) -> None: - record = _make_record(tmp_path, manifest_env_id="demo-env", avg_reward=0.8, num_examples=10) - _write_json( - record.metadata_path, - { - "env_id": "demo-env", - "avg_reward": 0.7, - "num_examples": 12, - }, - ) - - with caplog.at_level(logging.WARNING): - normalized = load_normalized_metadata(record) - - assert normalized.num_examples == 10 - assert "Manifest/metadata result mismatch for process input" in caplog.text - assert "avg_reward manifest=0.8 metadata=0.7" in caplog.text - assert "num_examples manifest=10 metadata=12" in caplog.text - - -def test_load_normalized_metadata_does_not_warn_when_result_fields_match( - tmp_path: Path, - caplog: pytest.LogCaptureFixture, -) -> None: - record = _make_record(tmp_path, manifest_env_id="demo-env", avg_reward=0.8, num_examples=10) - _write_json( - record.metadata_path, - { - "env_id": "demo-env", - "avg_reward": 0.8, - "num_examples": 10, - }, - ) - - with caplog.at_level(logging.WARNING): - load_normalized_metadata(record) - - assert "Manifest/metadata result mismatch for process input" not in caplog.text + assert normalized.rollouts_per_example == 3 diff --git a/tests/test_cli/test_process_pipeline.py b/tests/test_cli/test_process_pipeline.py index 204ac8bc..926da9a5 100644 --- a/tests/test_cli/test_process_pipeline.py +++ b/tests/test_cli/test_process_pipeline.py @@ -1,12 +1,13 @@ from __future__ import annotations import json +import os +from datetime import datetime from pathlib import Path import pytest import pyarrow.parquet as pq -from medarc_verifiers.cli._manifest import MANIFEST_VERSION from medarc_verifiers.cli._schemas import EnvironmentExportConfig from medarc_verifiers.cli.hf import HFSyncConfig from medarc_verifiers.cli.process import ProcessOptions, run_process @@ -23,6 +24,11 @@ def _write_json(path: Path, payload: dict) -> None: 
path.write_text(json.dumps(payload), encoding="utf-8") +def _set_mtime(path: Path, updated_at: str) -> None: + timestamp = datetime.fromisoformat(updated_at.replace("Z", "+00:00")).timestamp() + os.utime(path, (timestamp, timestamp)) + + def _manifest_info( *, run_id: str, @@ -100,49 +106,15 @@ def _run_record( def _setup_run(tmp_path: Path) -> Path: - runs_dir = tmp_path / "runs" - run_dir = runs_dir / "run-1" - results_dir = run_dir / "demo-job" - manifest = { - "version": MANIFEST_VERSION, - "run_id": "run-1", - "name": "demo", - "config_source": "configs/demo.yaml", - "config_snapshot": {"jobs": []}, - "config_checksum": "abc123", - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "models": {"gpt-mini": {"sampling_args": {}}}, - "env_templates": {"demo-env-template": {"module": "demo-env-rollout3"}}, - "summary": { - "total": 1, - "completed": 1, - "pending": 0, - "running": 0, - "failed": 0, - "skipped": 0, - }, - "jobs": [ - { - "job_id": "demo-job", - "model_id": "gpt-mini", - "env_id": "demo-env-rollout3", - "env_template_id": "demo-env-template", - "env_variant_id": "demo-env-rollout3", - "env_args": {}, - "results_dir": "demo-job", - "status": "completed", - "num_examples": 1, - "rollouts_per_example": 1, - "row_count": 1, - } - ], - } - _write_json(run_dir / "run_manifest.json", manifest) + runs_dir = tmp_path / "runs" / "evals" + results_dir = runs_dir / "demo-env-rollout3--gpt-mini" / "run-1" metadata = { "env_id": "demo-env-rollout3", + "model": "gpt-mini", "env_args": {}, "sampling_args": {}, + "num_examples": 1, + "rollouts_per_example": 1, "version_info": { "vf_version": "0.1.10", "vf_commit": "abc123", @@ -165,6 +137,7 @@ def _setup_run(tmp_path: Path) -> Path: with results_path.open("w", encoding="utf-8") as handle: for row in results: handle.write(json.dumps(row) + "\n") + _set_mtime(results_dir / "metadata.json", "2024-01-01T00:00:00Z") return runs_dir @@ -184,75 +157,56 @@ def _write_run( write_results: bool = 
True, job_id: str = "demo-job", ) -> Path: - runs_dir = tmp_path / "runs" - run_dir = runs_dir / run_id - results_dir = run_dir / job_id - manifest = { - "version": MANIFEST_VERSION, - "run_id": run_id, - "name": "demo", - "config_source": "configs/demo.yaml", - "config_snapshot": {"jobs": []}, - "config_checksum": "abc123", - "created_at": "2024-01-01T00:00:00Z", - "updated_at": updated_at, - "models": {model_id: {"sampling_args": {}}}, - "env_templates": {"demo-env-template": {"module": env_id}}, - "summary": { - "total": 1, - "completed": 1 if status == "completed" else 0, - "pending": 0, - "running": 0, - "failed": 1 if status == "failed" else 0, - "skipped": 0, - }, - "jobs": [ - { - "job_id": job_id, - "model_id": model_id, - "env_id": env_id, - "env_template_id": "demo-env-template", - "env_variant_id": env_id, - "env_args": {}, - "results_dir": job_id, - "status": status, - "row_count": row_count, - "num_examples": num_examples, - "rollouts_per_example": rollouts_per_example, - } - ], - } - _write_json(run_dir / "run_manifest.json", manifest) + runs_dir = tmp_path / "runs" / "evals" + results_dir = runs_dir / f"{env_id}--{model_id}" / run_id metadata = { "env_id": env_id, + "model": model_id, "env_args": {}, "sampling_args": {}, "num_examples": num_examples, "rollouts_per_example": rollouts_per_example, } _write_json(results_dir / "metadata.json", metadata) + _set_mtime(results_dir / "metadata.json", updated_at) results_path = results_dir / "results.jsonl" if write_results: results_path.parent.mkdir(parents=True, exist_ok=True) if results_text is None: - row = {"example_id": f"ex-{run_id}", "reward": reward} - results_text = json.dumps(row) + "\n" + result_rows = 1 if row_count is None else max(int(row_count), 0) + results_text = "".join( + json.dumps({"example_id": f"ex-{run_id}-{index}", "reward": reward}) + "\n" + for index in range(result_rows) + ) results_path.write_text(results_text, encoding="utf-8") + _set_mtime(results_path, updated_at) return 
runs_dir -def _remove_model_id(tmp_path: Path, run_id: str) -> None: - manifest_path = tmp_path / "runs" / run_id / "run_manifest.json" - manifest = json.loads(manifest_path.read_text(encoding="utf-8")) - manifest["jobs"][0]["model_id"] = None - manifest["models"] = {} - manifest_path.write_text(json.dumps(manifest), encoding="utf-8") - - job_id = manifest["jobs"][0]["job_id"] - metadata_path = tmp_path / "runs" / run_id / job_id / "metadata.json" - metadata = json.loads(metadata_path.read_text(encoding="utf-8")) - metadata.pop("model", None) - metadata_path.write_text(json.dumps(metadata), encoding="utf-8") +def _write_deterministic_eval( + tmp_path: Path, + *, + model_id: str = "gpt-mini", + env_id: str = "demo-env", + variant_id: str | None = None, + env_args: dict[str, object] | None = None, + result_row: dict[str, object] | None = None, +) -> Path: + runs_dir = tmp_path / "runs" / "evals" + results_dir = runs_dir / model_id / env_id / (variant_id or "base") + resolved_env_args = env_args or {} + metadata = { + "env_id": env_id, + "model": model_id, + "env_args": resolved_env_args, + "sampling_args": {}, + "num_examples": 1, + "rollouts_per_example": 1, + } + _write_json(results_dir / "metadata.json", metadata) + row = result_row or {"example_id": "ex-1", "reward": 1.0} + (results_dir / "results.jsonl").write_text(json.dumps(row) + "\n", encoding="utf-8") + return runs_dir def test_run_process_respects_env_export_defaults(tmp_path: Path) -> None: @@ -281,6 +235,133 @@ def test_run_process_respects_env_export_defaults(tmp_path: Path) -> None: assert group.model_id == "gpt-mini" +def test_run_process_processes_deterministic_eval_outputs(tmp_path: Path) -> None: + runs_dir = _write_deterministic_eval(tmp_path) + + result = run_process( + ProcessOptions( + runs_dir=runs_dir, + output_dir=tmp_path / "processed", + dry_run=True, + max_workers=1, + ) + ) + + assert result.records_processed == 1 + assert result.rows_processed == 1 + group = result.env_groups[0] + 
assert group.env_id == "demo-env" + assert group.model_id == "gpt-mini" + + +def test_run_process_preserves_deterministic_eval_variants(tmp_path: Path) -> None: + _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-1618") + runs_dir = _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-9331") + output_dir = tmp_path / "processed" + + result = run_process( + ProcessOptions( + runs_dir=runs_dir, + output_dir=output_dir, + dry_run=False, + max_workers=1, + ) + ) + + assert result.records_processed == 2 + rel_paths = sorted(summary.output_path.relative_to(output_dir).as_posix() for summary in result.env_summaries) + assert rel_paths == [ + "gpt-mini/demo-env__variants/env_args.shuffle_seed-1618.parquet", + "gpt-mini/demo-env__variants/env_args.shuffle_seed-9331.parquet", + ] + index_payload = json.loads((output_dir / "env_index.json").read_text(encoding="utf-8")) + assert sorted(index_payload["files"]) == rel_paths + assert {entry["variant_id"] for entry in index_payload["files"].values()} == { + "env_args.shuffle_seed-1618", + "env_args.shuffle_seed-9331", + } + + +def test_run_process_excludes_specific_deterministic_eval_variant(tmp_path: Path) -> None: + _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-1618") + runs_dir = _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-9331") + + result = run_process( + ProcessOptions( + runs_dir=runs_dir, + output_dir=tmp_path / "processed", + exclude_datasets=("demo-env::env_args.shuffle_seed-1618",), + dry_run=True, + max_workers=1, + ) + ) + + assert result.records_processed == 1 + assert result.env_groups[0].variant_id == "env_args.shuffle_seed-9331" + + +def test_run_process_excludes_deterministic_eval_variants_by_base_env(tmp_path: Path) -> None: + _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-1618") + runs_dir = _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-9331") + + result = run_process( + 
ProcessOptions( + runs_dir=runs_dir, + output_dir=tmp_path / "processed", + exclude_datasets=("demo-env",), + dry_run=True, + max_workers=1, + ) + ) + + assert result.records_processed == 0 + assert result.env_groups == [] + + +def test_run_process_applies_variant_export_overrides_to_deterministic_eval(tmp_path: Path) -> None: + variant_id = "env_args.add_calculator_tool-true__env_args.add_python_tool-true__env_args.version-verified" + env_args = { + "version": "verified", + "add_python_tool": True, + "add_calculator_tool": True, + } + runs_dir = _write_deterministic_eval( + tmp_path, + env_id="medcalc_bench", + variant_id=variant_id, + env_args=env_args, + result_row={ + "example_id": "ex-1", + "ground_truth": "42", + "lower_bound": 40, + "upper_bound": 44, + "reward": 1.0, + }, + ) + output_dir = tmp_path / "processed" + + result = run_process( + ProcessOptions( + runs_dir=runs_dir, + output_dir=output_dir, + dry_run=False, + max_workers=1, + ), + env_export_map={ + f"medcalc_bench::{variant_id}": EnvironmentExportConfig( + extra_columns=["lower_bound", "upper_bound"], + answer_column="ground_truth", + ) + }, + ) + + table = pq.read_table(result.env_summaries[0].output_path) + row = table.to_pylist()[0] + assert row["answer"] == "42" + assert json.loads(row["extras"]) == {"lower_bound": 40, "upper_bound": 44} + assert "ground_truth" not in row + + def test_run_process_resolves_base_env_id(tmp_path: Path) -> None: runs_dir = _setup_run(tmp_path) options = ProcessOptions( @@ -412,7 +493,7 @@ def test_process_allows_results_missing_pct_within_threshold(tmp_path: Path) -> result = run_process(options) assert result.records_processed == 1 - assert result.rows_processed == 1 + assert result.rows_processed == 98 def test_process_rejects_results_missing_pct_above_threshold(tmp_path: Path) -> None: @@ -468,14 +549,14 @@ def test_process_allows_ungateable_record_when_expected_rows_unknown(tmp_path: P assert result.records_processed == 1 -def 
test_process_allows_ungateable_record_when_row_count_unknown(tmp_path: Path) -> None: +def test_process_allows_ungateable_record_when_expected_rows_unknown_even_with_observed_rows(tmp_path: Path) -> None: runs_dir = _write_run( tmp_path, run_id="run-unknown-observed", updated_at="2024-01-01T00:00:00Z", reward=1.0, row_count=None, - num_examples=100, + num_examples=None, rollouts_per_example=1, ) options = ProcessOptions( @@ -525,7 +606,7 @@ def test_process_latest_record_that_fails_gate_does_not_fall_back(tmp_path: Path assert "run-older-ok" not in message -def test_process_rejects_missing_results_jsonl_for_selected_latest_record(tmp_path: Path) -> None: +def test_process_ignores_metadata_only_output_without_results_jsonl(tmp_path: Path) -> None: runs_dir = _write_run( tmp_path, run_id="run-missing-results", @@ -543,12 +624,10 @@ def test_process_rejects_missing_results_jsonl_for_selected_latest_record(tmp_pa max_workers=1, ) - with pytest.raises(RuntimeError) as excinfo: - run_process(options) + result = run_process(options) - message = str(excinfo.value) - assert "missing results.jsonl files" in message - assert "run-missing-results" in message + assert result.records_processed == 0 + assert result.rows_processed == 0 def test_process_gate_ignores_excluded_record(tmp_path: Path) -> None: @@ -628,7 +707,7 @@ def test_process_emits_single_warning_for_ungateable_selected_records( updated_at="2024-01-01T00:00:00Z", reward=1.0, row_count=None, - num_examples=100, + num_examples=None, rollouts_per_example=1, ) caplog.set_level("WARNING") @@ -649,7 +728,7 @@ def test_process_emits_single_warning_for_ungateable_selected_records( assert len(warnings) == 1 -def test_process_uses_actual_results_rows_when_manifest_row_count_is_stale( +def test_process_uses_discovered_actual_results_rows_for_completeness_gate( tmp_path: Path, caplog: pytest.LogCaptureFixture, ) -> None: @@ -676,8 +755,7 @@ def test_process_uses_actual_results_rows_when_manifest_row_count_is_stale( ) assert 
result.records_processed == 1 - assert "Manifest row_count mismatch for process input" in caplog.text - assert "manifest row_count=90 actual_rows=100" in caplog.text + assert "row_count mismatch" not in caplog.text def test_select_work_items_rollout_gate_error_includes_output_and_manifest_ids(tmp_path: Path) -> None: @@ -706,7 +784,7 @@ def test_select_work_items_rollout_gate_error_includes_output_and_manifest_ids(t message = str(excinfo.value) assert "output_env_id=demo-env" in message assert "manifest_env_id=demo-env-rollout3" in message - assert "job_id=demo-job" in message + assert "job_id=run-rollout-bad" in message def test_run_process_excludes_models(tmp_path: Path) -> None: @@ -1281,78 +1359,6 @@ def test_process_ignores_invalid_superseded_run(tmp_path: Path) -> None: assert table.column("reward").to_pylist() == [0.9] -def test_process_ignores_superseded_run_missing_model_id(tmp_path: Path) -> None: - runs_dir = _write_run(tmp_path, run_id="run-1", updated_at="2024-01-01T00:00:00Z", reward=0.1) - _remove_model_id(tmp_path, "run-1") - _write_run(tmp_path, run_id="run-2", updated_at="2024-01-02T00:00:00Z", reward=0.9) - - result = run_process( - ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1) - ) - - table = pq.read_table(result.env_summaries[0].output_path) - assert table.column("reward").to_pylist() == [0.9] - - -def test_process_latest_missing_model_id_fails_clearly(tmp_path: Path) -> None: - runs_dir = _write_run(tmp_path, run_id="run-1", updated_at="2024-01-01T00:00:00Z", reward=0.1) - _write_run(tmp_path, run_id="run-2", updated_at="2024-01-02T00:00:00Z", reward=0.9) - _remove_model_id(tmp_path, "run-2") - - with pytest.raises(RuntimeError, match=r"Missing model_id for run \(job_run_id=run-2, job_id=demo-job,"): - run_process(ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1)) - - -def 
test_process_latest_missing_model_id_not_masked_by_newer_other_job(tmp_path: Path) -> None: - runs_dir = _write_run( - tmp_path, - run_id="run-model-a-old", - updated_at="2024-01-01T00:00:00Z", - reward=0.1, - model_id="model-a", - job_id="job-model-a", - ) - _write_run( - tmp_path, - run_id="run-model-a-bad", - updated_at="2024-01-02T00:00:00Z", - reward=0.2, - model_id="model-a", - job_id="job-model-a", - ) - _remove_model_id(tmp_path, "run-model-a-bad") - _write_run( - tmp_path, - run_id="run-model-b-good", - updated_at="2024-01-03T00:00:00Z", - reward=0.9, - model_id="model-b", - job_id="job-model-b", - ) - - with pytest.raises( - RuntimeError, match=r"Missing model_id for run \(job_run_id=run-model-a-bad, job_id=job-model-a," - ): - run_process(ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1)) - - -def test_process_ignores_invalid_incomplete_run_by_default(tmp_path: Path) -> None: - runs_dir = _write_run( - tmp_path, - run_id="run-1", - updated_at="2024-01-01T00:00:00Z", - reward=0.1, - status="running", - results_text='{"example_id": ', - ) - _write_run(tmp_path, run_id="run-2", updated_at="2024-01-02T00:00:00Z", reward=0.9, env_id="other-env") - output_dir = tmp_path / "processed" - - result = run_process(ProcessOptions(runs_dir=runs_dir, output_dir=output_dir, dry_run=False, max_workers=1)) - - assert {summary.env_id for summary in result.env_summaries} == {"other-env"} - - def test_process_selected_invalid_results_still_fail(tmp_path: Path) -> None: runs_dir = _write_run( tmp_path, @@ -1366,15 +1372,6 @@ def test_process_selected_invalid_results_still_fail(tmp_path: Path) -> None: run_process(ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1)) -def test_process_selected_missing_results_still_fail(tmp_path: Path) -> None: - runs_dir = _setup_run(tmp_path) - missing_results = runs_dir / "run-1" / "demo-job" / "results.jsonl" - missing_results.unlink() - - with 
pytest.raises(RuntimeError, match="Selected records are missing results.jsonl files:"): - run_process(ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1)) - - def test_process_clean_clears_outputs(tmp_path: Path) -> None: runs_dir = _setup_run(tmp_path) output_dir = tmp_path / "processed" @@ -1456,11 +1453,6 @@ def fake_read_env_index_files(processed_dir: Path): def test_run_process_ignores_legacy_run_output_path(tmp_path: Path) -> None: runs_dir = _setup_run(tmp_path) - run_dir = runs_dir / "run-1" - manifest_path = run_dir / "run_manifest.json" - manifest = json.loads(manifest_path.read_text(encoding="utf-8")) - manifest["updated_at"] = "2024-01-01T00:10:00Z" - _write_json(manifest_path, manifest) output_dir = tmp_path / "processed" output_dir.mkdir() diff --git a/tests/test_cli/test_process_winrate.py b/tests/test_cli/test_process_winrate.py index a29e8bed..b0dad0ee 100644 --- a/tests/test_cli/test_process_winrate.py +++ b/tests/test_cli/test_process_winrate.py @@ -595,3 +595,82 @@ def test_run_winrate_validates_known_models_from_env_index(tmp_path: Path) -> No output_name=None, config=cfg, ) + + +def test_run_winrate_discovers_variants_as_distinct_datasets(tmp_path: Path) -> None: + processed_dir = tmp_path / "processed" + output_dir = tmp_path / "out" + files: dict[str, dict[str, object]] = {} + rewards = { + ("model-a", "seed-1"): 1.0, + ("model-b", "seed-1"): 0.0, + ("model-a", "seed-2"): 0.0, + ("model-b", "seed-2"): 1.0, + } + for (model_id, variant_id), reward in rewards.items(): + path = processed_dir / model_id / "demo-env__variants" / f"{variant_id}.parquet" + _write_dataset( + path, + [ + { + "example_id": "q1", + "model_id": model_id, + "reward": reward, + } + ], + ) + rel_path = path.relative_to(processed_dir).as_posix() + files[rel_path] = { + "env_id": "demo-env", + "base_env_id": "demo-env", + "model_id": model_id, + "variant_id": variant_id, + "variant_payload": {"env_args": {"shuffle_seed": 
int(variant_id.removeprefix("seed-"))}}, + "row_count": 1, + } + + env_index = { + "version": 2, + "processed_at": "2024-01-01T00:00:00Z", + "schema_version": 1, + "processed_with_args": {}, + "runs": {}, + "files": files, + } + (processed_dir / "env_index.json").write_text(json.dumps(env_index), encoding="utf-8") + + result = winrate.run_winrate( + processed_dir=processed_dir, + output_dir=output_dir, + output_path=None, + output_name=None, + config=winrate.WinrateConfig(dataset_coverage="per-model"), + processed_at="2024-01-01T00:00:00Z", + ) + payload = winrate.to_json(result.result) + + assert [dataset for dataset, _ in result.datasets] == ["demo-env::seed-1", "demo-env::seed-2"] + assert set(payload["datasets"]) == {"demo-env::seed-1", "demo-env::seed-2"} + assert "demo-env" not in payload["datasets"] + assert payload["models"]["model-a"]["avg_reward_per_dataset"] == { + "demo-env::seed-1": 1.0, + "demo-env::seed-2": 0.0, + } + assert payload["models"]["model-b"]["avg_reward_per_dataset"] == { + "demo-env::seed-1": 0.0, + "demo-env::seed-2": 1.0, + } + assert payload["models"]["model-a"]["mean_winrate"]["n_datasets"] == 2 + + excluded_variant = winrate.compute_winrates( + result.datasets, + winrate.WinrateConfig(exclude_datasets=("demo-env::seed-1",), dataset_coverage="per-model"), + ) + excluded_payload = winrate.to_json(excluded_variant) + assert set(excluded_payload["datasets"]) == {"demo-env::seed-2"} + + with pytest.raises(ValueError, match="No datasets remain after applying dataset exclusions"): + winrate.compute_winrates( + result.datasets, + winrate.WinrateConfig(exclude_datasets=("demo-env",), dataset_coverage="per-model"), + ) diff --git a/tests/test_cli/test_process_writer.py b/tests/test_cli/test_process_writer.py index f3229651..e65c99e9 100644 --- a/tests/test_cli/test_process_writer.py +++ b/tests/test_cli/test_process_writer.py @@ -74,6 +74,52 @@ def test_write_env_groups_creates_parquet_and_index(tmp_path: Path) -> None: assert rel_path in 
ds_infos["default"]["data_files"]["train"] +def test_write_env_groups_writes_variant_path_and_metadata(tmp_path: Path) -> None: + rows = [ + { + "env_id": "demo-env", + "base_env_id": "demo-env", + "example_id": "ex-1", + "job_run_id": "run-1", + "model_id": "demo-model", + "variant_id": "env_args.shuffle_seed-1618", + "variant_payload": json.dumps({"env_args": {"shuffle_seed": 1618}}), + "reward": 1.0, + } + ] + group = aggregate_rows_by_env(rows)[0] + config = WriterConfig( + output_dir=tmp_path, + processed_at="2024-01-01T00:00:00Z", + processed_with_args={}, + ) + + summaries = write_env_groups([group], config) + + summary = summaries[0] + rel_path = summary.output_path.relative_to(tmp_path).as_posix() + assert rel_path == "demo-model/demo-env__variants/env_args.shuffle_seed-1618.parquet" + assert summary.variant_id == "env_args.shuffle_seed-1618" + assert summary.variant_payload == {"env_args": {"shuffle_seed": 1618}} + + table = pq.read_table(summary.output_path) + assert table.column("variant_id").to_pylist() == ["env_args.shuffle_seed-1618"] + assert [json.loads(value) for value in table.column("variant_payload").to_pylist()] == [ + {"env_args": {"shuffle_seed": 1618}} + ] + embedded = json.loads((table.schema.metadata or {})[EXPORTER_METADATA_KEY]) + assert embedded["variant_id"] == "env_args.shuffle_seed-1618" + assert embedded["variant_payload"] == {"env_args": {"shuffle_seed": 1618}} + + payload = json.loads((tmp_path / "env_index.json").read_text(encoding="utf-8")) + file_entry = payload["files"][rel_path] + assert file_entry["env_id"] == "demo-env" + assert file_entry["base_env_id"] == "demo-env" + assert file_entry["model_id"] == "demo-model" + assert file_entry["variant_id"] == "env_args.shuffle_seed-1618" + assert file_entry["variant_payload"] == {"env_args": {"shuffle_seed": 1618}} + + def test_write_env_groups_dry_run(tmp_path: Path) -> None: group = _group_for_env() config = WriterConfig( diff --git a/tests/test_cli/test_schemas.py 
b/tests/test_cli/test_schemas.py index 4fe28197..abdf5f33 100644 --- a/tests/test_cli/test_schemas.py +++ b/tests/test_cli/test_schemas.py @@ -5,38 +5,9 @@ from medarc_verifiers.cli._schemas import ( EnvironmentConfigSchema, EnvironmentExportConfig, - ModelConfigSchema, ) -def test_model_params_merge_matches_explicit_definition() -> None: - explicit = ModelConfigSchema( - id="demo", - model="gpt-mini", - env_args={"split": "dev"}, - env_overrides={"medqa": {"temperature": 0.2}}, - ) - legacy = ModelConfigSchema( - id="demo", - params={ - "model": "gpt-mini", - "env_args": {"split": "dev"}, - "env_overrides": {"medqa": {"temperature": 0.2}}, - }, - ) - - assert legacy.model_dump() == explicit.model_dump() - - -def test_environment_matrix_exclude_with_unknown_key_raises() -> None: - with pytest.raises(ValueError, match="matrix_exclude entry references unknown keys"): - EnvironmentConfigSchema( - id="medqa", - matrix={"num_examples": [5]}, - matrix_exclude=[{"unknown_key": 1}], - ) - - def test_environment_export_config_validates_columns() -> None: env = EnvironmentConfigSchema( id="medqa", diff --git a/tests/test_cli/test_upstream_eval.py b/tests/test_cli/test_upstream_eval.py new file mode 100644 index 00000000..5b69dfd8 --- /dev/null +++ b/tests/test_cli/test_upstream_eval.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import verifiers.scripts.eval as upstream_eval_script + +from medarc_verifiers.cli import upstream_eval +from medarc_verifiers.cli import verifiers_adapter + + +def test_upstream_eval_boundary_uses_temporary_adapter_until_public_builder_exists() -> None: + assert not hasattr(upstream_eval_script, "build_eval_config") + assert upstream_eval.build_eval_config is verifiers_adapter.build_eval_config + assert upstream_eval.load_toml_eval_configs is verifiers_adapter.load_toml_eval_configs + + +def test_temporary_adapter_provider_constants_match_upstream() -> None: + assert verifiers_adapter.DEFAULT_MODEL == upstream_eval_script.DEFAULT_MODEL + 
assert verifiers_adapter.DEFAULT_ENV_DIR_PATH == upstream_eval_script.DEFAULT_ENV_DIR_PATH + assert verifiers_adapter.DEFAULT_ENDPOINTS_PATH == upstream_eval_script.DEFAULT_ENDPOINTS_PATH + assert verifiers_adapter.DEFAULT_NUM_EXAMPLES == upstream_eval_script.DEFAULT_NUM_EXAMPLES + assert verifiers_adapter.DEFAULT_ROLLOUTS_PER_EXAMPLE == upstream_eval_script.DEFAULT_ROLLOUTS_PER_EXAMPLE + assert verifiers_adapter.DEFAULT_MAX_CONCURRENT == upstream_eval_script.DEFAULT_MAX_CONCURRENT + assert verifiers_adapter.DEFAULT_CLIENT_TYPE == upstream_eval_script.DEFAULT_CLIENT_TYPE + assert verifiers_adapter.DEFAULT_PROVIDER == upstream_eval_script.DEFAULT_PROVIDER + assert verifiers_adapter.PROVIDER_CONFIGS == upstream_eval_script.PROVIDER_CONFIGS diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py new file mode 100644 index 00000000..c0d179fe --- /dev/null +++ b/tests/test_cli/test_verifiers_adapter.py @@ -0,0 +1,715 @@ +from __future__ import annotations + +import importlib +from pathlib import Path + +import pytest + +from medarc_verifiers.cli.utils.endpoint_utils import load_endpoint_sampling_profiles +from medarc_verifiers.cli.verifiers_adapter import EvalConfigOverrides, build_eval_config, load_toml_eval_configs +from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL + + +def _write_endpoints(path: Path) -> Path: + path.write_text( + """ +[[endpoint]] +endpoint_id = "openai-alias" +model = "openai/resolved" +url = "https://openai.example/v1" +key = "OPENAI_ALIAS_KEY" +headers = { "X-Registry" = "1" } + +[[endpoint]] +endpoint_id = "replica-alias" +model = "replica/resolved" +url = "https://replica-a.example/v1" +key = "REPLICA_KEY_A" +headers = { "X-Replica" = "a" } + +[[endpoint]] +endpoint_id = "replica-alias" +model = "replica/resolved" +url = "https://replica-b.example/v1" +key = "REPLICA_KEY_B" +headers = { "X-Replica" = "b" } +""".strip() + ) + return path + + +def 
test_load_endpoint_sampling_profiles_parses_nested_table(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "gpt-oss-20b-low-local" +model = "openai/gpt-oss-20b" +url = "http://host.docker.internal:8010/v1" +key = "VLLM_API_KEY" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "low" +""".strip() + ) + + profiles = load_endpoint_sampling_profiles(endpoints_path) + + assert profiles == { + "gpt-oss-20b-low-local": [{"temperature": 1.0, "top_p": 1.0, "top_k": 0, "reasoning_effort": "low"}] + } + + +def test_load_toml_eval_configs_expands_ablation(tmp_path: Path) -> None: + config_path = tmp_path / "eval.toml" + endpoints_path = _write_endpoints(tmp_path / "endpoints.toml") + config_path.write_text( + f""" +model = "openai/gpt-4.1-mini" +endpoints_path = "{endpoints_path}" +debug = true +headers_from_state = {{ "X-Trace" = "trace_id" }} +timeout = 30.0 + +[[eval]] +env_id = "medqa" + +[[ablation]] +env_id = "medqa" +env_args = {{ shuffle_answers = true }} + +[ablation.sweep.env_args] +shuffle_seed = [1618, 9331] +""".strip() + ) + + configs = load_toml_eval_configs(config_path) + + assert [config["env_id"] for config in configs] == ["medqa", "medqa", "medqa"] + assert configs[0]["debug"] is True + assert configs[0]["headers_from_state"] == {"X-Trace": "trace_id"} + assert configs[0]["timeout"] == 30.0 + assert configs[1]["env_args"] == {"shuffle_answers": True, "shuffle_seed": 1618} + assert configs[2]["env_args"] == {"shuffle_answers": True, "shuffle_seed": 9331} + + +def test_load_toml_eval_configs_strips_medarc_metadata(tmp_path: Path) -> None: + config_path = tmp_path / "eval.toml" + config_path.write_text( + """ +model = "openai/gpt-4.1-mini" + +[[eval]] +env_id = "medqa" + +[medarc.orchestrate.foo] +gpus = 1 + +[medarc.orchestrate.vllm-container] +image = "vllm/vllm-openai:latest" +""".strip() + ) + + configs = 
load_toml_eval_configs(config_path) + + assert len(configs) == 1 + assert "medarc" not in configs[0] + assert configs[0]["env_id"] == "medqa" + + +def test_build_eval_config_resolves_endpoint_alias_and_core_fields(tmp_path: Path) -> None: + endpoints_path = _write_endpoints(tmp_path / "endpoints.toml") + + config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "openai-alias", + "endpoints_path": str(endpoints_path), + "env_args": {"subset": "dev"}, + "sampling_args": {"temperature": 0.2}, + "max_tokens": 123, + "num_examples": 7, + "rollouts_per_example": 2, + "max_concurrent": 4, + "max_retries": 3, + "num_workers": 2, + "debug": True, + "timeout": 45.0, + "state_columns": ["question_id", "split"], + "save_results": True, + "independent_scoring": True, + "save_to_hf_hub": True, + "hf_hub_dataset_name": "org/dataset", + "headers": {"X-Eval": "table"}, + "header": ["X-Eval: list", "X-Extra: 1"], + "headers_from_state": {"X-Trace": "trace_id"}, + "header_from_state": ["X-User: user_id"], + } + ) + + assert config.env_id == "medqa" + assert config.endpoint_id == "openai-alias" + assert config.model == "openai/resolved" + assert config.env_args == {"subset": "dev"} + assert config.sampling_args["temperature"] == 0.2 + assert config.sampling_args["max_tokens"] == 123 + assert config.num_examples == 7 + assert config.rollouts_per_example == 2 + assert config.max_concurrent == 4 + assert config.max_retries == 3 + assert config.num_workers == 2 + assert "debug" not in type(config).model_fields + assert config.extra_env_kwargs == {"timeout_seconds": 45.0} + assert config.state_columns == ["question_id", "split"] + assert config.save_results is True + assert config.resume_path is None + assert config.independent_scoring is True + assert config.save_to_hf_hub is True + assert config.hf_hub_dataset_name == "org/dataset" + assert config.client_config.api_base_url == "https://openai.example/v1" + assert config.client_config.api_key_var == "OPENAI_ALIAS_KEY" + 
assert config.client_config.extra_headers == {"X-Registry": "1", "X-Eval": "list", "X-Extra": "1"} + assert config.client_config.extra_headers_from_state == { + "X-Session-ID": "example_id", + "X-Trace": "trace_id", + "X-User": "user_id", + } + + +def test_build_eval_config_supports_model_only_endpoint_alias(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "portable-alias" +model = "org/resolved" + +[endpoint.sampling_args] +temperature = 0.4 +top_p = 0.9 +""".strip() + ) + + config = build_eval_config( + { + "env_id": "medqa", + "model": "portable-alias", + "endpoints_path": str(endpoints_path), + "api_base_url": "https://deployment.example/v1", + "api_key_var": "DEPLOYMENT_KEY", + } + ) + + assert config.model == "org/resolved" + assert config.client_config.api_base_url == "https://deployment.example/v1" + assert config.client_config.api_key_var == "DEPLOYMENT_KEY" + assert config.sampling_args["temperature"] == 0.4 + assert config.sampling_args["top_p"] == 0.9 + + +def test_build_eval_config_supports_endpoint_replicas(tmp_path: Path) -> None: + endpoints_path = _write_endpoints(tmp_path / "endpoints.toml") + + config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "replica-alias", + "endpoints_path": str(endpoints_path), + } + ) + + assert config.model == "replica/resolved" + assert [endpoint.api_base_url for endpoint in config.client_config.endpoint_configs] == [ + "https://replica-a.example/v1", + "https://replica-b.example/v1", + ] + assert [endpoint.api_key_var for endpoint in config.client_config.endpoint_configs] == [ + "REPLICA_KEY_A", + "REPLICA_KEY_B", + ] + assert [endpoint.extra_headers for endpoint in config.client_config.endpoint_configs] == [ + {"X-Replica": "a"}, + {"X-Replica": "b"}, + ] + + +def test_build_eval_config_uses_endpoint_sampling_defaults(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + 
endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "gpt-oss" +model = "openai/gpt-oss-20b" +url = "http://localhost:8010/v1" +key = "VLLM_API_KEY" +api_client_type = "openai_responses" + +[endpoint.sampling_args] +temperature = 1.0 +top_p = 1.0 +top_k = 0 +reasoning_effort = "low" +""".strip() + ) + + config = build_eval_config({"env_id": "medqa", "model": "gpt-oss", "endpoints_path": str(endpoints_path)}) + + assert config.model == "openai/gpt-oss-20b" + assert config.client_config.client_type == "openai_responses" + assert config.sampling_args["temperature"] == 1.0 + assert config.sampling_args["top_p"] == 1.0 + assert "reasoning_effort" not in config.sampling_args + assert config.sampling_args["reasoning"] == {"effort": "low"} + assert config.sampling_args["extra_body"]["top_k"] == 0 + + +def test_build_eval_config_chat_client_keeps_reasoning_effort(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "gpt-oss-chat" +model = "openai/gpt-oss-20b" +url = "http://localhost:8010/v1" +key = "VLLM_API_KEY" +api_client_type = "openai_chat_completions" + +[endpoint.sampling_args] +top_k = 0 +reasoning_effort = "low" +""".strip() + ) + + config = build_eval_config({"env_id": "medqa", "model": "gpt-oss-chat", "endpoints_path": str(endpoints_path)}) + + assert config.client_config.client_type == "openai_chat_completions" + assert config.sampling_args["reasoning_effort"] == "low" + assert "reasoning" not in config.sampling_args + assert config.sampling_args["extra_body"]["top_k"] == 0 + + +def test_build_eval_config_sampling_precedence_endpoint_raw_and_cli(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "profiled" +model = "openai/profiled" +url = "https://profiled.example/v1" +key = "PROFILED_KEY" +sampling_args = { temperature = 1.0, top_p = 0.5 } +""".strip() + ) + + toml_config = 
build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "profiled", + "endpoints_path": str(endpoints_path), + "temperature": 0.2, + "sampling_args": {"temperature": 0.7}, + } + ) + cli_config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "profiled", + "endpoints_path": str(endpoints_path), + "temperature": 0.2, + "sampling_args": {"temperature": 0.7}, + }, + overrides=EvalConfigOverrides(sampling_args={"temperature": 0.8}), + ) + + assert toml_config.sampling_args["temperature"] == 0.7 + assert toml_config.sampling_args["top_p"] == 0.5 + assert cli_config.sampling_args["temperature"] == 0.8 + + +def test_build_eval_config_responses_cli_reasoning_overrides_endpoint_reasoning_effort(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "profiled" +model = "openai/profiled" +url = "https://profiled.example/v1" +key = "PROFILED_KEY" +api_client_type = "openai_responses" +sampling_args = { reasoning_effort = "low" } +""".strip() + ) + + config = build_eval_config( + {"env_id": "medqa", "endpoint_id": "profiled", "endpoints_path": str(endpoints_path)}, + overrides=EvalConfigOverrides(sampling_args={"reasoning": {"effort": "high"}}), + ) + + assert config.sampling_args["reasoning"] == {"effort": "high"} + assert "reasoning_effort" not in config.sampling_args + + +def test_build_eval_config_responses_cli_reasoning_effort_overrides_endpoint_reasoning(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "profiled" +model = "openai/profiled" +url = "https://profiled.example/v1" +key = "PROFILED_KEY" +api_client_type = "openai_responses" +sampling_args = { reasoning = { effort = "low" } } +""".strip() + ) + + config = build_eval_config( + {"env_id": "medqa", "endpoint_id": "profiled", "endpoints_path": str(endpoints_path)}, + overrides=EvalConfigOverrides(sampling_args={"reasoning_effort": 
"high"}), + ) + + assert config.sampling_args["reasoning"] == {"effort": "high"} + assert "reasoning_effort" not in config.sampling_args + + +def test_build_eval_config_scalar_temperature_overrides_endpoint_default(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "profiled" +model = "openai/profiled" +url = "https://profiled.example/v1" +key = "PROFILED_KEY" +sampling_args = { temperature = 1.0 } +""".strip() + ) + + config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "profiled", + "endpoints_path": str(endpoints_path), + "temperature": 0.2, + } + ) + + assert config.sampling_args["temperature"] == 0.2 + + +def test_build_eval_config_deep_merges_sampling_extra_body(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("MEDARC_INCLUDE_USAGE", raising=False) + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + f""" +[[endpoint]] +endpoint_id = "prime-profiled" +model = "openai/profiled" +url = "{PRIME_INFERENCE_URL}" +key = "PRIME_API_KEY" +sampling_args = {{ top_k = 0 }} +""".strip() + ) + + config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "prime-profiled", + "endpoints_path": str(endpoints_path), + "sampling_args": {"extra_body": {"guided_choice": ["A", "B"]}}, + } + ) + + assert config.sampling_args["extra_body"] == { + "usage": {"include": True}, + "guided_choice": ["A", "B"], + "top_k": 0, + } + + +def test_build_eval_config_direct_unknown_sampling_arg_overrides_extra_body_key(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "profiled" +model = "openai/profiled" +url = "https://profiled.example/v1" +key = "PROFILED_KEY" +sampling_args = { extra_body = { top_k = 1 } } +""".strip() + ) + + config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "profiled", + "endpoints_path": 
str(endpoints_path), + }, + overrides=EvalConfigOverrides(sampling_args={"top_k": 3}), + ) + + assert config.sampling_args["extra_body"]["top_k"] == 3 + + +def test_build_eval_config_direct_unknown_sampling_arg_overrides_extra_body_key_for_any_extra(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "profiled" +model = "openai/profiled" +url = "https://profiled.example/v1" +key = "PROFILED_KEY" +sampling_args = { extra_body = { repetition_penalty = 1.1 } } +""".strip() + ) + + config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "profiled", + "endpoints_path": str(endpoints_path), + }, + overrides=EvalConfigOverrides(sampling_args={"repetition_penalty": 1.2}), + ) + + assert config.sampling_args["extra_body"]["repetition_penalty"] == 1.2 + + +def test_build_eval_config_extra_body_key_overrides_lower_precedence_direct_unknown_arg(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "profiled" +model = "openai/profiled" +url = "https://profiled.example/v1" +key = "PROFILED_KEY" +sampling_args = { top_k = 0 } +""".strip() + ) + + config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "profiled", + "endpoints_path": str(endpoints_path), + "sampling_args": {"extra_body": {"top_k": 5}}, + } + ) + + assert config.sampling_args["extra_body"]["top_k"] == 5 + + +def test_build_eval_config_endpoint_replica_sampling_profiles_must_match(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "replica-profiled" +model = "openai/profiled" +url = "https://replica-a.example/v1" +key = "REPLICA_A" +sampling_args = { temperature = 1.0 } + +[[endpoint]] +endpoint_id = "replica-profiled" +model = "openai/profiled" +url = "https://replica-b.example/v1" +key = "REPLICA_B" +sampling_args = { temperature = 1.0 } 
+""".strip() + ) + + config = build_eval_config( + {"env_id": "medqa", "endpoint_id": "replica-profiled", "endpoints_path": str(endpoints_path)} + ) + + assert config.sampling_args["temperature"] == 1.0 + + +@pytest.mark.parametrize( + "second_sampling", + [ + "sampling_args = { temperature = 0.5 }", + "", + ], +) +def test_build_eval_config_rejects_conflicting_replica_sampling_profiles(tmp_path: Path, second_sampling: str) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + f""" +[[endpoint]] +endpoint_id = "replica-profiled" +model = "openai/profiled" +url = "https://replica-a.example/v1" +key = "REPLICA_A" +sampling_args = {{ temperature = 1.0 }} + +[[endpoint]] +endpoint_id = "replica-profiled" +model = "openai/profiled" +url = "https://replica-b.example/v1" +key = "REPLICA_B" +{second_sampling} +""".strip() + ) + + with pytest.raises(ValueError, match="conflicting sampling_args"): + build_eval_config({"env_id": "medqa", "endpoint_id": "replica-profiled", "endpoints_path": str(endpoints_path)}) + + +@pytest.mark.parametrize( + "sampling_toml", + [ + 'sampling_args = "bad"', + "[[endpoint.sampling_args]]\ntemperature = 1.0", + ], +) +def test_load_endpoint_sampling_profiles_rejects_invalid_sampling_args(tmp_path: Path, sampling_toml: str) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + f""" +[[endpoint]] +endpoint_id = "bad-profile" +model = "openai/bad" +url = "https://bad.example/v1" +key = "BAD_KEY" +{sampling_toml} +""".strip() + ) + + with pytest.raises(ValueError, match="sampling_args must be a table"): + load_endpoint_sampling_profiles(endpoints_path) + + +def test_load_endpoint_sampling_profiles_ignores_python_registry(tmp_path: Path) -> None: + endpoints_path = tmp_path / "endpoints.py" + endpoints_path.write_text( + """ +ENDPOINTS = { + "profiled": { + "model": "openai/profiled", + "url": "https://profiled.example/v1", + "key": "PROFILED_KEY", + } +} +""".strip() + ) + + assert 
load_endpoint_sampling_profiles(endpoints_path) == {} + + +def test_build_eval_config_already_expanded_ablation_sampling_args_override_endpoint( + tmp_path: Path, +) -> None: + endpoints_path = tmp_path / "endpoints.toml" + endpoints_path.write_text( + """ +[[endpoint]] +endpoint_id = "profiled" +model = "openai/profiled" +url = "https://profiled.example/v1" +key = "PROFILED_KEY" +sampling_args = { temperature = 1.0, top_p = 0.9 } +""".strip() + ) + + config = build_eval_config( + { + "env_id": "medqa", + "endpoint_id": "profiled", + "endpoints_path": str(endpoints_path), + "name": "temp-0.3", + "sampling_args": {"temperature": 0.3}, + } + ) + + assert config.sampling_args["temperature"] == 0.3 + assert config.sampling_args["top_p"] == 0.9 + + +def test_build_eval_config_provider_and_cli_overrides_precede_toml(tmp_path: Path) -> None: + endpoints_path = _write_endpoints(tmp_path / "endpoints.toml") + + config = build_eval_config( + { + "env_id": "medqa", + "model": "openai-alias", + "endpoints_path": str(endpoints_path), + "provider": "openai", + "api_base_url": "https://toml.example/v1", + "api_key_var": "TOML_KEY", + "max_concurrent": 8, + }, + overrides=EvalConfigOverrides( + provider="local", + api_base_url="http://127.0.0.1:9000/v1", + api_key_var="CLI_KEY", + max_concurrent=1, + ), + ) + + assert config.model == "openai/resolved" + assert config.client_config.api_base_url == "http://127.0.0.1:9000/v1" + assert config.client_config.api_key_var == "CLI_KEY" + assert config.max_concurrent == 1 + assert config.client_config.endpoint_configs == [] + + +def test_build_eval_config_unknown_model_uses_prime_provider_by_default(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("PRIME_TEAM_ID", "team-123") + + config = build_eval_config({"env_id": "medqa", "model": "prime-model", "sampling_args": {"top_k": 20}}) + + assert config.model == "prime-model" + assert config.client_config.api_base_url == PRIME_INFERENCE_URL + assert 
config.client_config.api_key_var == "PRIME_API_KEY" + assert config.client_config.extra_headers == {"X-Prime-Team-ID": "team-123"} + assert config.sampling_args["extra_body"]["usage"] == {"include": True} + assert config.sampling_args["extra_body"]["top_k"] == 20 + + +def test_build_eval_config_sanitizes_unknown_sampling_args() -> None: + config = build_eval_config( + { + "env_id": "medqa", + "provider": "openai", + "model": "openai/gpt-4.1-mini", + "sampling_args": {"temperature": 0.4, "top_k": 40, "extra_body": {"known": True}}, + } + ) + + assert config.sampling_args["temperature"] == 0.4 + assert config.sampling_args["extra_body"] == {"known": True, "top_k": 40} + + +def test_build_eval_config_uses_env_pyproject_defaults(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + project_dir = tmp_path / "adapter_default_env_project" + package_dir = project_dir / "adapter_default_env" + package_dir.mkdir(parents=True) + (package_dir / "__init__.py").write_text("") + (project_dir / "pyproject.toml").write_text( + """ +[tool.verifiers.eval] +num_examples = 11 +rollouts_per_example = 4 +""".strip() + ) + monkeypatch.syspath_prepend(str(project_dir)) + importlib.invalidate_caches() + + config = build_eval_config( + { + "env_id": "adapter-default-env", + "provider": "openai", + "model": "openai/gpt-4.1-mini", + } + ) + + assert config.num_examples == 11 + assert config.rollouts_per_example == 4 diff --git a/tests/test_environments/test_longhealth.py b/tests/test_environments/test_longhealth.py new file mode 100644 index 00000000..00da300b --- /dev/null +++ b/tests/test_environments/test_longhealth.py @@ -0,0 +1,96 @@ +import ast +import importlib.util +from pathlib import Path + +from verifiers.types import flatten_task_input + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def _load_longhealth_module(): + module_path = REPO_ROOT / "environments" / "longhealth" / "longhealth.py" + spec = importlib.util.spec_from_file_location("longhealth_local", module_path) 
+ assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def test_environment_code_does_not_emit_reserved_task_key() -> None: + offenders = [] + for path in (REPO_ROOT / "environments").rglob("*.py"): + tree = ast.parse(path.read_text(), filename=str(path)) + for node in ast.walk(tree): + if isinstance(node, ast.Dict): + for key in node.keys: + if isinstance(key, ast.Constant) and key.value == "task": + offenders.append(f"{path.relative_to(REPO_ROOT)}:{key.lineno}") + if isinstance(node, ast.Subscript) and isinstance(node.ctx, ast.Store): + if isinstance(node.slice, ast.Constant) and node.slice.value == "task": + offenders.append(f"{path.relative_to(REPO_ROOT)}:{node.lineno}") + + assert offenders == [] + + +def test_copied_info_payloads_drop_reserved_task_key() -> None: + offenders = [] + for path in (REPO_ROOT / "environments").rglob("*.py"): + tree = ast.parse(path.read_text(), filename=str(path)) + for fn in [node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]: + copies_raw_payload = False + drops_task = False + for node in ast.walk(fn): + if isinstance(node, ast.Assign): + for target in node.targets: + if not (isinstance(target, ast.Name) and target.id == "info"): + continue + value = node.value + if ( + isinstance(value, ast.Call) + and isinstance(value.func, ast.Name) + and value.func.id == "dict" + and value.args + ): + copies_raw_payload = True + if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute): + if not (isinstance(node.func.value, ast.Name) and node.func.value.id == "info"): + continue + if node.func.attr == "pop" and node.args: + arg = node.args[0] + if isinstance(arg, ast.Constant) and arg.value == "task": + drops_task = True + if copies_raw_payload and not drops_task: + offenders.append(f"{path.relative_to(REPO_ROOT)}:{fn.lineno}:{fn.name}") + + assert offenders == [] + + +def 
test_longhealth_task1_metadata_does_not_use_verifiers_task_key() -> None: + module = _load_longhealth_module() + + env = module.load_environment(task="task1", max_examples=3, shuffle_docs=False) + + seen_tasks = set() + for row in env.eval_dataset: + info = row["info"] + assert "task" not in info + seen_tasks.add(info["longhealth_task"]) + assert flatten_task_input(row)["info"]["longhealth_task"] == info["longhealth_task"] + + assert seen_tasks == {"task1"} + + +def test_longhealth_task2_metadata_does_not_use_verifiers_task_key() -> None: + module = _load_longhealth_module() + + env = module.load_environment(task="task2", max_examples=2, shuffle_docs=False) + + seen_tasks = set() + for row in env.eval_dataset: + info = row["info"] + assert "task" not in info + seen_tasks.add(info["longhealth_task"]) + assert flatten_task_input(row)["info"]["longhealth_task"] == info["longhealth_task"] + + assert seen_tasks == {"task2_negation", "task2_identification"} diff --git a/tests/test_mcq_accuracy.py b/tests/test_mcq_accuracy.py index 5365ddc4..77cc9da6 100644 --- a/tests/test_mcq_accuracy.py +++ b/tests/test_mcq_accuracy.py @@ -162,6 +162,34 @@ def test_answer_text_in_sentence(): ) +@pytest.mark.parametrize( + ("response", "answer_letter", "answer_text"), + [ + (" c ", "C", "Correct option"), + ("(2)", "2", "Second option"), + (" Chemotherapy and radiation. ", "C", "chemotherapy and radiation"), + ("B. 
Video-capsule endoscopy", "B", "Video-capsule endoscopy"), + ("**(3)** Third option", "3", "Third option"), + ], +) +def test_strict_accepts_only_exact_option_text_or_both(response: str, answer_letter: str, answer_text: str): + assert multiple_choice_accuracy(response, answer_letter=answer_letter, answer_text=answer_text, strict=True) + + +@pytest.mark.parametrize( + ("response", "answer_letter", "answer_text"), + [ + ("Final answer: C", "C", "Correct option"), + ("I think it's C", "C", "Correct option"), + ("Based on the symptoms, acute myocardial infarction is most likely.", "B", "acute myocardial infarction"), + ("The answer is all of the above.", "D", "All of the above"), + ], +) +def test_strict_rejects_permissive_heuristic_matches(response: str, answer_letter: str, answer_text: str): + assert multiple_choice_accuracy(response, answer_letter=answer_letter, answer_text=answer_text) + assert not multiple_choice_accuracy(response, answer_letter=answer_letter, answer_text=answer_text, strict=True) + + @pytest.mark.parametrize("response", ["All of the above", "The answer is all of the above."]) def test_answer_text_all_of_the_above_is_not_rejected(response: str): assert multiple_choice_accuracy(response, answer_letter="D", answer_text="All of the above") diff --git a/tests/test_orchestrate/test_orchestrate_cli_validation.py b/tests/test_orchestrate/test_orchestrate_cli_validation.py index 638f75fe..f190ed69 100644 --- a/tests/test_orchestrate/test_orchestrate_cli_validation.py +++ b/tests/test_orchestrate/test_orchestrate_cli_validation.py @@ -73,18 +73,17 @@ def test_cli_runtime_flag_parses() -> None: def test_cli_runtime_precedence_cli_over_plan(monkeypatch, tmp_path: Path) -> None: - job_cfg = tmp_path / "job.yaml" + job_cfg = tmp_path / "job.toml" job_cfg.write_text( """ -models: - foo: - model: Foo/Bar -orchestrate: - vllm-container: - image: fake - foo: - gpus: 1 - serve: {} +model = "Foo/Bar" + +[medarc.orchestrate.vllm-container] +image = "fake" + 
+[medarc.orchestrate.foo] +gpus = 1 +serve = {} """.lstrip(), encoding="utf-8", ) @@ -112,6 +111,40 @@ def fake_run(self) -> None: assert captured["runtime"] == "pyxis" +def test_cli_dry_run_accepts_toml_job_config(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + job_cfg = tmp_path / "job.toml" + job_cfg.write_text( + """ +model = "Foo/Bar" + +[[eval]] +env_id = "medqa" + +[medarc.orchestrate.vllm-container] +image = "fake" + +[medarc.orchestrate.foo] +gpus = 1 +serve = {} +""".lstrip(), + encoding="utf-8", + ) + plan_path = tmp_path / "plan.yaml" + plan_path.write_text( + f""" +job_configs: + - {job_cfg.name} +runtime: pyxis +""".lstrip(), + encoding="utf-8", + ) + + rc = main(["--plan", str(plan_path), "--dry-run"]) + + assert rc == 0 + assert f"job:foo\tFoo/Bar\t{job_cfg.resolve()}" in capsys.readouterr().out + + def test_port_only_resource_manager_skips_gpus() -> None: rm = PortOnlyResourceManager(port_range=(9000, 9010)) diff --git a/tests/test_orchestrate/test_orchestrate_config.py b/tests/test_orchestrate/test_orchestrate_config.py index 64d472ee..be32446f 100644 --- a/tests/test_orchestrate/test_orchestrate_config.py +++ b/tests/test_orchestrate/test_orchestrate_config.py @@ -1,4 +1,3 @@ -import warnings from pathlib import Path import pytest @@ -9,20 +8,22 @@ def test_plan_job_configs_resolve_relative_to_plan_file(tmp_path: Path): configs_dir = tmp_path / "configs" configs_dir.mkdir() - job_cfg = configs_dir / "job-foo.yaml" + job_cfg = configs_dir / "job-foo.toml" job_cfg.write_text( """ -models: - foo: - model: Foo/Bar -orchestrate: - restart: runs/raw/example-run - vllm-container: - image: vllm/vllm-openai:latest - foo: - gpus: 1 - serve: - dtype: bfloat16 +model = "Foo/Bar" + +[medarc.orchestrate] +restart = "runs/raw/example-run" + +[medarc.orchestrate.vllm-container] +image = "vllm/vllm-openai:latest" + +[medarc.orchestrate.foo] +gpus = 1 + +[medarc.orchestrate.foo.serve] +dtype = "bfloat16" """.lstrip(), encoding="utf-8", ) @@ -31,7 
+32,7 @@ def test_plan_job_configs_resolve_relative_to_plan_file(tmp_path: Path): """ name: test job_configs: - - configs/job-foo.yaml + - configs/job-foo.toml gpu_range: "0-3" port_range: "8100-8199" run_id: "hello" @@ -63,54 +64,66 @@ def test_plan_job_configs_resolve_relative_to_plan_file(tmp_path: Path): assert "vllm-container" in tasks[0].orchestrate -def test_expand_tasks_accepts_deprecated_vllm_docker_with_warning(tmp_path: Path) -> None: - job_cfg = tmp_path / "job.yaml" +def test_expand_tasks_accepts_toml_eval_config(tmp_path: Path) -> None: + job_cfg = tmp_path / "job.toml" job_cfg.write_text( """ -models: - foo: - model: Foo/Bar -orchestrate: - vllm-docker: - image: vllm/vllm-openai:latest - foo: - gpus: 1 - serve: {} +model = "Foo/Bar" + +[[eval]] +env_id = "medqa" + +[medarc.orchestrate.vllm-container] +image = "vllm/vllm-openai:latest" + +[medarc.orchestrate.foo] +gpus = 1 + +[medarc.orchestrate.foo.serve] +dtype = "bfloat16" """.lstrip(), encoding="utf-8", ) plan_path = tmp_path / "plan.yaml" plan_path.write_text(f"job_configs:\n - {job_cfg.name}\n", encoding="utf-8") - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - tasks = expand_tasks(load_plan(plan_path)) + tasks = expand_tasks(load_plan(plan_path)) + assert tasks[0].job_config_path == job_cfg.resolve() + assert tasks[0].model_key == "foo" + assert tasks[0].model_id == "Foo/Bar" assert tasks[0].orchestrate["vllm-container"]["image"] == "vllm/vllm-openai:latest" - assert "vllm-docker" not in tasks[0].orchestrate - assert any("deprecated orchestrate.vllm-docker" in str(item.message) for item in caught) + assert tasks[0].orchestrate["foo"]["serve"]["dtype"] == "bfloat16" -def test_expand_tasks_rejects_ambiguous_container_keys(tmp_path: Path) -> None: +def test_expand_tasks_rejects_non_toml_job_config(tmp_path: Path) -> None: job_cfg = tmp_path / "job.yaml" job_cfg.write_text( """ -models: - foo: - model: Foo/Bar -orchestrate: - vllm-container: - image: new - 
vllm-docker: - image: old - foo: - gpus: 1 - serve: {} +model: Foo/Bar +""".lstrip(), + encoding="utf-8", + ) + plan_path = tmp_path / "plan.yaml" + plan_path.write_text(f"job_configs:\n - {job_cfg.name}\n", encoding="utf-8") + + with pytest.raises(ValueError, match="Unsupported job config format"): + expand_tasks(load_plan(plan_path)) + + +def test_expand_tasks_rejects_missing_vllm_container(tmp_path: Path) -> None: + job_cfg = tmp_path / "job.toml" + job_cfg.write_text( + """ +model = "Foo/Bar" + +[medarc.orchestrate.foo] +gpus = 1 """.lstrip(), encoding="utf-8", ) plan_path = tmp_path / "plan.yaml" plan_path.write_text(f"job_configs:\n - {job_cfg.name}\n", encoding="utf-8") - with pytest.raises(ValueError, match="defines both orchestrate.vllm-container and orchestrate.vllm-docker"): + with pytest.raises(ValueError, match="must define medarc.orchestrate.vllm-container settings"): expand_tasks(load_plan(plan_path)) diff --git a/tests/test_orchestrate/test_orchestrate_parallel_launch.py b/tests/test_orchestrate/test_orchestrate_parallel_launch.py index ccef080a..aa4e63e3 100644 --- a/tests/test_orchestrate/test_orchestrate_parallel_launch.py +++ b/tests/test_orchestrate/test_orchestrate_parallel_launch.py @@ -103,6 +103,7 @@ def _task(tmp_path: Path, task_id: str) -> TaskSpec: model_key="foo", model_id=f"Foo/{task_id}", orchestrate={ + "restart": "runs/raw/old-run", "vllm-container": {"image": "fake"}, "foo": {"gpus": 2, "tensor_parallel_size": 2, "serve": {}}, }, @@ -129,6 +130,7 @@ async def test_parallel_launch_runs_concurrently( max_parallel=2, ) adapter = FakeRuntimeAdapter() + bench_commands: list[list[str]] = [] runner = OrchestratorRunner( plan, tasks, @@ -164,6 +166,8 @@ class Result: return Result() async def fake_start_benchmark(*args, **kwargs): + bench_commands.append(list(args[0])) + class Proc: pass @@ -190,3 +194,7 @@ async def fake_to_thread(func, /, *args, **kwargs): assert readiness_overlapped assert [call["server_port"] for call in 
adapter.launch_calls] == [8000, 8001] + assert all("--api-base-url" in command for command in bench_commands) + assert all("--provider" in command and "local" in command for command in bench_commands) + assert all("--on-complete" not in command for command in bench_commands) + assert all("--restart" not in command for command in bench_commands) diff --git a/tests/test_process_writer_schema.py b/tests/test_process_writer_schema.py index b4607c5f..5b9aec3a 100644 --- a/tests/test_process_writer_schema.py +++ b/tests/test_process_writer_schema.py @@ -9,6 +9,8 @@ def test_process_writer_emits_stable_schema_with_all_null_values(tmp_path) -> No env_id="medcalc_bench", base_env_id="medcalc_bench", model_id="test-model", + variant_id=None, + variant_payload=None, rows=[ { "env_id": "medcalc_bench", @@ -57,6 +59,8 @@ def test_process_writer_emits_stable_schema_for_empty_groups(tmp_path) -> None: env_id="empty_env", base_env_id="empty_env", model_id="test-model", + variant_id=None, + variant_payload=None, rows=[], column_names=(), job_run_ids=(), diff --git a/tests/test_sampling_args.py b/tests/test_sampling_args.py new file mode 100644 index 00000000..8971ae8b --- /dev/null +++ b/tests/test_sampling_args.py @@ -0,0 +1,225 @@ +from __future__ import annotations + +import pytest + +from medarc_verifiers.utils.sampling_args import sanitize_sampling_args, sanitize_sampling_args_for_openai + + +def test_openai_chat_keeps_reasoning_effort_and_moves_extras() -> None: + result = sanitize_sampling_args( + { + "reasoning_effort": "low", + "top_k": 20, + "min_p": 0.1, + "repetition_penalty": 1.1, + "extra_body": {"usage": {"include": True}}, + }, + client_type="openai_chat_completions", + ) + + assert result["reasoning_effort"] == "low" + assert result["extra_body"] == { + "usage": {"include": True}, + "top_k": 20, + "min_p": 0.1, + "repetition_penalty": 1.1, + } + + +def test_openai_chat_token_uses_chat_shape() -> None: + result = sanitize_sampling_args( + {"reasoning_effort": 
"medium", "top_k": 8}, + client_type="openai_chat_completions_token", + ) + + assert result["reasoning_effort"] == "medium" + assert result["extra_body"] == {"top_k": 8} + + +def test_compatibility_wrapper_uses_chat_shape() -> None: + result = sanitize_sampling_args_for_openai({"reasoning_effort": "low", "top_k": 1}) + + assert result["reasoning_effort"] == "low" + assert result["extra_body"] == {"top_k": 1} + + +def test_openai_chat_drops_framework_owned_request_keys() -> None: + result = sanitize_sampling_args( + {"model": "override", "messages": [], "tools": [], "extra_headers": {"x": "y"}, "top_k": 1}, + client_type="openai_chat_completions", + ) + + assert "model" not in result + assert "messages" not in result + assert "tools" not in result + assert "extra_headers" not in result + assert result["extra_body"] == {"top_k": 1} + + +def test_openai_responses_converts_reasoning_effort() -> None: + result = sanitize_sampling_args( + {"reasoning_effort": "low", "top_k": 20, "max_tokens": 128}, + client_type="openai_responses", + ) + + assert "reasoning_effort" not in result + assert result["reasoning"] == {"effort": "low"} + assert result["max_output_tokens"] == 128 + assert result["extra_body"] == {"top_k": 20} + + +def test_openai_responses_rejects_stop_sequences() -> None: + with pytest.raises(ValueError, match="does not support stop sequences"): + sanitize_sampling_args({"stop": ["END"]}, client_type="openai_responses") + + +def test_openai_responses_preserves_explicit_reasoning_effort() -> None: + result = sanitize_sampling_args( + {"reasoning": {"effort": "high", "summary": "auto"}, "reasoning_effort": "low"}, + client_type="openai_responses", + ) + + assert result["reasoning"] == {"effort": "high", "summary": "auto"} + + +def test_openai_responses_drops_framework_owned_request_keys() -> None: + result = sanitize_sampling_args( + {"model": "override", "input": "x", "prompt": "y", "tools": [], "extra_headers": {"x": "y"}, "top_k": 1}, + 
client_type="openai_responses", + ) + + assert "model" not in result + assert "input" not in result + assert "prompt" not in result + assert "tools" not in result + assert "extra_headers" not in result + assert result["extra_body"] == {"top_k": 1} + + +def test_openai_completions_removes_reasoning_and_moves_extras() -> None: + result = sanitize_sampling_args( + {"prompt": "x", "reasoning_effort": "low", "reasoning": {"effort": "low"}, "top_k": 20}, + client_type="openai_completions", + ) + + assert "reasoning_effort" not in result + assert "reasoning" not in result + assert "prompt" not in result + assert result["extra_body"] == {"top_k": 20} + + +@pytest.mark.parametrize("client_type", ["renderer", "nemorl_chat_completions"]) +def test_passthrough_clients_only_drop_none(client_type: str) -> None: + result = sanitize_sampling_args( + {"reasoning_effort": "low", "top_k": 20, "temperature": None}, + client_type=client_type, + ) + + assert result == {"reasoning_effort": "low", "top_k": 20} + + +def test_anthropic_preserves_adaptive_thinking() -> None: + result = sanitize_sampling_args( + {"thinking": {"type": "adaptive"}, "output_config": {"effort": "medium"}, "top_k": 10}, + client_type="anthropic_messages", + ) + + assert result["thinking"] == {"type": "adaptive"} + assert result["output_config"] == {"effort": "medium"} + assert result["top_k"] == 10 + + +def test_anthropic_maps_reasoning_effort_to_adaptive_output_config() -> None: + result = sanitize_sampling_args({"reasoning_effort": "high"}, client_type="anthropic_messages") + + assert result["thinking"] == {"type": "adaptive"} + assert result["output_config"] == {"effort": "high"} + assert "reasoning_effort" not in result + assert "effort" not in result + + +def test_anthropic_drops_framework_owned_request_keys() -> None: + result = sanitize_sampling_args( + { + "model": "override", + "messages": [], + "system": "override", + "tools": [], + "extra_headers": {"x": "y"}, + "reasoning_effort": "low", + }, + 
client_type="anthropic_messages", + ) + + assert "model" not in result + assert "messages" not in result + assert "system" not in result + assert "tools" not in result + assert "extra_headers" not in result + assert result["thinking"] == {"type": "adaptive"} + assert result["output_config"] == {"effort": "low"} + + +def test_anthropic_does_not_put_effort_inside_thinking() -> None: + result = sanitize_sampling_args( + {"thinking": {"type": "adaptive", "effort": "low"}, "reasoning_effort": "medium"}, + client_type="anthropic_messages", + ) + + assert result["thinking"] == {"type": "adaptive"} + assert result["output_config"] == {"effort": "medium"} + + +@pytest.mark.parametrize( + "sampling_args", + [ + {"thinking": {"type": "enabled", "budget_tokens": 4096}}, + {"thinking": {"type": "adaptive", "budget_tokens": 4096}}, + ], +) +def test_anthropic_rejects_manual_budget_thinking(sampling_args: dict[str, object]) -> None: + with pytest.raises(ValueError, match="thinking"): + sanitize_sampling_args(sampling_args, client_type="anthropic_messages") + + +def test_anthropic_validates_effort_values() -> None: + with pytest.raises(ValueError, match="reasoning effort"): + sanitize_sampling_args({"reasoning_effort": "extreme"}, client_type="anthropic_messages") + + +@pytest.mark.parametrize("effort", ["xhigh", "max"]) +def test_anthropic_accepts_sdk_documented_effort_values(effort: str) -> None: + result = sanitize_sampling_args({"reasoning_effort": effort}, client_type="anthropic_messages") + + assert result["thinking"] == {"type": "adaptive"} + assert result["output_config"] == {"effort": effort} + + +@pytest.mark.asyncio +async def test_openai_responses_client_receives_nested_reasoning() -> None: + from verifiers.clients.openai_responses_client import OpenAIResponsesClient + + class Responses: + def __init__(self) -> None: + self.kwargs: dict[str, object] | None = None + + async def create(self, **kwargs): + self.kwargs = kwargs + return object() + + class Client: + def 
__init__(self) -> None: + self.responses = Responses() + + async def close(self) -> None: + pass + + raw_client = Client() + client = OpenAIResponsesClient(raw_client) + sampling_args = sanitize_sampling_args({"reasoning_effort": "low"}, client_type="openai_responses") + + await client.get_native_response([], "model", sampling_args) + + assert raw_client.responses.kwargs is not None + assert "reasoning_effort" not in raw_client.responses.kwargs + assert raw_client.responses.kwargs["reasoning"] == {"effort": "low"} diff --git a/tests/test_scripts/test_convert_legacy_raw_runs.py b/tests/test_scripts/test_convert_legacy_raw_runs.py new file mode 100644 index 00000000..8748f685 --- /dev/null +++ b/tests/test_scripts/test_convert_legacy_raw_runs.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from scripts.convert_legacy_raw_runs import convert_legacy_raw_runs, main + + +def _write_json(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload), encoding="utf-8") + + +def _write_manifest( + raw_dir: Path, + *, + run_id: str = "run-1", + jobs: list[dict] | None = None, +) -> Path: + run_dir = raw_dir / run_id + manifest = { + "version": 3, + "run_id": run_id, + "name": "legacy", + "config_source": "configs/legacy.yaml", + "config_checksum": "abc123", + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-01T00:01:00Z", + "artifacts_root": ".", + "models": {"gpt/mini": {"sampling_args": {"temperature": 0.1}}}, + "env_templates": {}, + "summary": {"completed": 1, "total": 1}, + "jobs": jobs if jobs is not None else [_job()], + } + _write_json(run_dir / "run_manifest.json", manifest) + return run_dir + + +def _job(**overrides: object) -> dict: + payload = { + "job_id": "job-1", + "model_id": "gpt/mini", + "env_id": "demo/env", + "env_template_id": "demo-template", + "env_variant_id": "demo/env", + "env_args": {"fold": "dev"}, + "sampling_args": 
{"top_p": 0.9}, + "status": "completed", + "results_relpath": "job-1/results.jsonl", + "metadata_relpath": "job-1/metadata.json", + "num_examples": 2, + "rollouts_per_example": 1, + "avg_reward": 0.75, + } + payload.update(overrides) + return payload + + +def _write_artifacts(run_dir: Path, *, job_id: str = "job-1") -> None: + _write_json( + run_dir / job_id / "metadata.json", + { + "env_args": {"fold": "metadata"}, + "sampling_args": {"temperature": 0.2}, + "num_examples": 2, + "rollouts_per_example": 1, + "avg_reward": 0.5, + }, + ) + (run_dir / job_id / "results.jsonl").write_text('{"example_id":"ex-1","reward":1.0}\n', encoding="utf-8") + + +def test_dry_run_lists_jobs_and_writes_nothing(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + output_dir = tmp_path / "runs" / "evals" + run_dir = _write_manifest(raw_dir) + _write_artifacts(run_dir) + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir) + + assert report.would_convert == 1 + assert report.failed == 0 + assert not output_dir.exists() + entry = report.entries[0] + assert entry.target_dir is not None + assert entry.target_dir.endswith("gpt-mini/demo-env/base") + + +def test_converts_valid_manifest_job_to_processable_eval_output(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + output_dir = tmp_path / "runs" / "evals" + run_dir = _write_manifest(raw_dir) + _write_artifacts(run_dir) + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False) + + assert report.converted == 1 + target = output_dir / "gpt-mini" / "demo-env" / "base" + row = json.loads((target / "results.jsonl").read_text(encoding="utf-8")) + assert row["is_completed"] is True + assert row["is_truncated"] is False + assert row["metrics"] == {} + assert row["stop_condition"] == "max_turns_reached" + assert row["timing"]["total"] == 0.0 + assert row["tool_defs"] == [] + metadata = json.loads((target / "metadata.json").read_text(encoding="utf-8")) + assert metadata 
== { + "avg_error": 0.0, + "avg_metrics": {}, + "avg_reward": 1.0, + "base_url": "", + "env_args": {"fold": "metadata"}, + "env_id": "demo/env", + "model": "gpt/mini", + "num_examples": 2, + "pass_all_k": {}, + "pass_at_k": {}, + "pass_threshold": 0.5, + "rollouts_per_example": 1, + "sampling_args": {"temperature": 0.2}, + "state_columns": [], + "time": 0.0, + "tools": None, + "usage": None, + "version_info": {}, + } + assert not (target / "bench_index.json").exists() + assert not (target / ".medarc_eval_metadata.json").exists() + assert (run_dir / "job-1" / "results.jsonl").exists() + + +def test_skips_missing_results(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + _write_manifest(raw_dir) + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=tmp_path / "evals", dry_run=False) + + assert report.skipped == 1 + assert report.entries[0].reason == "missing results.jsonl" + + +def test_skips_non_completed_jobs(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + run_dir = _write_manifest(raw_dir, jobs=[_job(status="failed")]) + _write_artifacts(run_dir) + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=tmp_path / "evals", dry_run=False) + + assert report.skipped == 1 + assert "failed" in report.entries[0].reason + + +def test_target_collision_fails_without_writing(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + output_dir = tmp_path / "runs" / "evals" + run_dir = _write_manifest(raw_dir) + _write_artifacts(run_dir) + target = output_dir / "gpt-mini" / "demo-env" / "base" + target.mkdir(parents=True) + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False) + + assert report.failed == 1 + assert "already exists" in report.entries[0].reason + assert not (target / "metadata.json").exists() + + +def test_report_includes_valid_jobs_when_another_job_fails(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + output_dir = tmp_path / "runs" / "evals" + run_dir = 
_write_manifest( + raw_dir, + jobs=[ + _job(job_id="valid", results_relpath="valid/results.jsonl"), + _job(job_id="collision", results_relpath="collision/results.jsonl", env_variant_id="demo/env::seed-1"), + ], + ) + _write_artifacts(run_dir, job_id="valid") + _write_artifacts(run_dir, job_id="collision") + (output_dir / "gpt-mini" / "demo-env" / "seed-1").mkdir(parents=True) + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False) + + assert report.failed == 1 + assert report.converted == 1 + by_job = {entry.job_id: entry for entry in report.entries} + assert by_job["collision"].status == "failed" + assert by_job["valid"].status == "converted" + assert (output_dir / "gpt-mini" / "demo-env" / "base" / "metadata.json").exists() + + +def test_invalid_existing_metadata_is_skipped(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + output_dir = tmp_path / "runs" / "evals" + run_dir = _write_manifest(raw_dir) + (run_dir / "job-1").mkdir(parents=True) + (run_dir / "job-1" / "metadata.json").write_text("not json", encoding="utf-8") + (run_dir / "job-1" / "results.jsonl").write_text('{"example_id":"ex-1"}\n', encoding="utf-8") + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False) + + assert report.skipped == 1 + assert "invalid metadata.json" in report.entries[0].reason + assert not output_dir.exists() + + +def test_path_unsafe_or_ambiguous_variants_are_skipped(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + run_dir = _write_manifest( + raw_dir, + jobs=[ + _job(job_id="ambiguous", env_variant_id="other-env::seed-1", results_relpath="ambiguous/results.jsonl"), + _job(job_id="unsafe", env_variant_id="demo/env::bad value", results_relpath="unsafe/results.jsonl"), + _job( + job_id="base-conflict", env_variant_id="demo/env::base", results_relpath="base-conflict/results.jsonl" + ), + ], + ) + for job_id in ("ambiguous", "unsafe", "base-conflict"): + _write_artifacts(run_dir, 
job_id=job_id) + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=tmp_path / "evals", dry_run=False) + + assert report.skipped == 3 + reasons = {entry.job_id: entry.reason for entry in report.entries} + assert "ambiguous env_variant_id" in reasons["ambiguous"] + assert "path-unsafe variant" in reasons["unsafe"] + assert "reserved base" in reasons["base-conflict"] + + +def test_parses_relative_variant_and_cli_report_path(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + output_dir = tmp_path / "runs" / "evals" + report_path = tmp_path / "report.json" + run_dir = _write_manifest(raw_dir, jobs=[_job(env_variant_id="demo/env::shuffle_seed-1618")]) + _write_artifacts(run_dir) + + exit_code = main( + [ + "--raw-dir", + str(raw_dir), + "--output-dir", + str(output_dir), + "--no-dry-run", + "--report-path", + str(report_path), + ] + ) + + assert exit_code == 0 + assert (output_dir / "gpt-mini" / "demo-env" / "shuffle_seed-1618" / "metadata.json").exists() + payload = json.loads(report_path.read_text(encoding="utf-8")) + assert payload["summary"]["converted"] == 1 + + +def test_parses_legacy_delimited_env_variant_ids(tmp_path: Path) -> None: + raw_dir = tmp_path / "runs" / "raw" + output_dir = tmp_path / "runs" / "evals" + run_dir = _write_manifest( + raw_dir, + jobs=[ + _job( + job_id="longhealth-task1", + env_id="longhealth", + env_variant_id="longhealth-task1", + results_relpath="longhealth-task1/results.jsonl", + ), + _job( + job_id="careqa-en", + env_id="careqa", + env_variant_id="careqa_en", + env_args={"split": "en"}, + results_relpath="careqa-en/results.jsonl", + ), + _job( + job_id="pubhealthbench-reviewed", + env_id="pubhealthbench", + env_variant_id="pubhealthbench_reviewed", + env_args={"split": "reviewed"}, + results_relpath="pubhealthbench-reviewed/results.jsonl", + ), + ], + ) + _write_artifacts(run_dir, job_id="longhealth-task1") + _write_artifacts(run_dir, job_id="careqa-en") + _write_artifacts(run_dir, 
job_id="pubhealthbench-reviewed") + + report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False) + + assert report.converted == 3 + assert (output_dir / "gpt-mini" / "longhealth" / "task1" / "metadata.json").exists() + assert (output_dir / "gpt-mini" / "careqa" / "base" / "metadata.json").exists() + assert (output_dir / "gpt-mini" / "pubhealthbench" / "reviewed" / "metadata.json").exists()