From a8f1c102d475050ac837ed7726e2fb0a845e1f34 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 18:22:17 +0000
Subject: [PATCH 01/53] Bump verifiers benchmark dependency

---
 environments/aci_bench/pyproject.toml            | 2 +-
 environments/agentclinic/pyproject.toml          | 2 +-
 environments/careqa/pyproject.toml               | 2 +-
 environments/head_qa/pyproject.toml              | 2 +-
 environments/head_qa_v2/pyproject.toml           | 2 +-
 environments/healthbench/pyproject.toml          | 2 +-
 environments/longhealth/pyproject.toml           | 2 +-
 environments/m_arc/pyproject.toml                | 2 +-
 environments/med_dialog/pyproject.toml           | 2 +-
 environments/med_halt/pyproject.toml             | 2 +-
 environments/med_mcqa/pyproject.toml             | 2 +-
 environments/medagentbench/pyproject.toml        | 2 +-
 environments/medagentbenchv2/pyproject.toml      | 2 +-
 environments/medbullets/pyproject.toml           | 2 +-
 environments/medcalc_bench/pyproject.toml        | 2 +-
 environments/medcasereasoning/pyproject.toml     | 2 +-
 environments/medconceptsqa/pyproject.toml        | 2 +-
 environments/medec/pyproject.toml                | 2 +-
 environments/medexqa/pyproject.toml              | 2 +-
 environments/medhallu/pyproject.toml             | 2 +-
 environments/medicationqa/pyproject.toml         | 2 +-
 environments/medqa/pyproject.toml                | 2 +-
 environments/medrbench/pyproject.toml            | 2 +-
 environments/medredqa/pyproject.toml             | 2 +-
 environments/medxpertqa/pyproject.toml           | 2 +-
 environments/meqsum/pyproject.toml               | 2 +-
 environments/metamedqa/pyproject.toml            | 2 +-
 environments/mmlu_pro_health/pyproject.toml      | 2 +-
 environments/mtsamples_procedures/pyproject.toml | 2 +-
 environments/mtsamples_replicate/pyproject.toml  | 2 +-
 environments/pubhealthbench/pyproject.toml       | 2 +-
 environments/pubmedqa/pyproject.toml             | 2 +-
 environments/sctpublic/pyproject.toml            | 2 +-
 environments/supergpqa_medicine/pyproject.toml   | 2 +-
 pyproject.toml                                   | 2 +-
 35 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/environments/aci_bench/pyproject.toml b/environments/aci_bench/pyproject.toml
index 37a7afb2..9bc9bd4b 100644
--- a/environments/aci_bench/pyproject.toml
+++ b/environments/aci_bench/pyproject.toml
@@ -6,7 +6,7 @@ tags = ["medical", "clinical", "dialogue", "summarization", "llm-judge", "single
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.5.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/agentclinic/pyproject.toml b/environments/agentclinic/pyproject.toml
index 742b5244..d97e1ebb 100644
--- a/environments/agentclinic/pyproject.toml
+++ b/environments/agentclinic/pyproject.toml
@@ -8,7 +8,7 @@ version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers",
+    "verifiers>=0.1.12,<0.2",
     "datasets",
 ]
 
diff --git a/environments/careqa/pyproject.toml b/environments/careqa/pyproject.toml
index eae6dfa2..8b1a793e 100644
--- a/environments/careqa/pyproject.toml
+++ b/environments/careqa/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn", "open-ende
 version = "0.1.1"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets>=2.13.0",
     "medarc-verifiers>=0.1.0",
 ]
diff --git a/environments/head_qa/pyproject.toml b/environments/head_qa/pyproject.toml
index c7fbf46b..2645e2b7 100644
--- a/environments/head_qa/pyproject.toml
+++ b/environments/head_qa/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 ]
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.6.post0",
+    "verifiers>=0.1.12,<0.2",
 ]
 
 [build-system]
diff --git a/environments/head_qa_v2/pyproject.toml b/environments/head_qa_v2/pyproject.toml
index c35a3d4f..e42339c5 100644
--- a/environments/head_qa_v2/pyproject.toml
+++ b/environments/head_qa_v2/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 ]
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.5.post0",
+    "verifiers>=0.1.12,<0.2",
 ]
 
 [build-system]
diff --git a/environments/healthbench/pyproject.toml b/environments/healthbench/pyproject.toml
index d2a35374..90447dbe 100644
--- a/environments/healthbench/pyproject.toml
+++ b/environments/healthbench/pyproject.toml
@@ -6,7 +6,7 @@ version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets>=4.1.1",
     "openai>=2.1.0",
 ]
diff --git a/environments/longhealth/pyproject.toml b/environments/longhealth/pyproject.toml
index 01cd8e0a..1e7ae6f0 100644
--- a/environments/longhealth/pyproject.toml
+++ b/environments/longhealth/pyproject.toml
@@ -8,7 +8,7 @@ authors = [
     { name = "Shamus Sim Zi Yang", email = "shamus.sim@monash.edu" },
 ]
 dependencies = [
-    "verifiers>=0.1.5.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/m_arc/pyproject.toml b/environments/m_arc/pyproject.toml
index b69e2337..c1db3b51 100644
--- a/environments/m_arc/pyproject.toml
+++ b/environments/m_arc/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["medical", "clinical", "single-turn", "multiple-choice", "evaluation", "
 version = "0.1.1"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.3.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/med_dialog/pyproject.toml b/environments/med_dialog/pyproject.toml
index 92389a26..f141f505 100644
--- a/environments/med_dialog/pyproject.toml
+++ b/environments/med_dialog/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["medical", "dialogue", "summarization", "single-turn", "llm-judge"]
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets>=4.1.1",
     "openai>=2.1.0",
     "medarc-verifiers>=0.1.0",
diff --git a/environments/med_halt/pyproject.toml b/environments/med_halt/pyproject.toml
index c5ac0939..82f9f372 100644
--- a/environments/med_halt/pyproject.toml
+++ b/environments/med_halt/pyproject.toml
@@ -6,7 +6,7 @@ version = "0.1.1"
 requires-python = ">=3.11"
 dependencies = [
     "datasets>=4.0.0",
-    "verifiers>=0.1.2.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/med_mcqa/pyproject.toml b/environments/med_mcqa/pyproject.toml
index ebd2998d..9c40e292 100644
--- a/environments/med_mcqa/pyproject.toml
+++ b/environments/med_mcqa/pyproject.toml
@@ -9,7 +9,7 @@ authors = [
     { name = "Ratna Sagari Grandhi", email = "sagari.grandhi@gmail.com" },
 ]
 dependencies = [
-    "verifiers>=0.1.5.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/medagentbench/pyproject.toml b/environments/medagentbench/pyproject.toml
index 766e07e8..2a0cfaad 100644
--- a/environments/medagentbench/pyproject.toml
+++ b/environments/medagentbench/pyproject.toml
@@ -6,7 +6,7 @@ version = "0.1.2"
 dependencies = [
     "medarc-verifiers>=0.1.0",
     "datasets",
-    "verifiers>=0.1.2.post1",
+    "verifiers>=0.1.12,<0.2",
 ]
 
 [build-system]
diff --git a/environments/medagentbenchv2/pyproject.toml b/environments/medagentbenchv2/pyproject.toml
index 92f59484..a135be24 100644
--- a/environments/medagentbenchv2/pyproject.toml
+++ b/environments/medagentbenchv2/pyproject.toml
@@ -6,7 +6,7 @@ readme = "README.md"
 requires-python = ">=3.11,<3.13"
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.10",
+    "verifiers>=0.1.12,<0.2",
     "datasets",
     "requests",
     "pydantic",
diff --git a/environments/medbullets/pyproject.toml b/environments/medbullets/pyproject.toml
index bca2a05b..488d0d91 100644
--- a/environments/medbullets/pyproject.toml
+++ b/environments/medbullets/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["medical", "clinical", "single-turn", "multiple-choice", "usmle", "train
 version = "0.1.1"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.3.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/medcalc_bench/pyproject.toml b/environments/medcalc_bench/pyproject.toml
index 4e37b302..79e45490 100644
--- a/environments/medcalc_bench/pyproject.toml
+++ b/environments/medcalc_bench/pyproject.toml
@@ -5,7 +5,7 @@ description = "MedCalc-Bench clinical calculator evaluation"
 readme = "README.md"
 requires-python = ">=3.11,<3.13"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets>=4.0.0",
     "numpy>=1.26.0",
     "simpleeval>=0.9.10",
diff --git a/environments/medcasereasoning/pyproject.toml b/environments/medcasereasoning/pyproject.toml
index 2d37215a..3cd81193 100644
--- a/environments/medcasereasoning/pyproject.toml
+++ b/environments/medcasereasoning/pyproject.toml
@@ -5,7 +5,7 @@ description = "MedCaseReasoning medical diagnosis evaluation"
 tags = ["medical", "reasoning", "single-turn", "llm-judge", "diagnosis"]
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.2.post0",
+    "verifiers>=0.1.12,<0.2",
     "datasets",
     "openai",
 ]
diff --git a/environments/medconceptsqa/pyproject.toml b/environments/medconceptsqa/pyproject.toml
index 752878e8..773b22d8 100644
--- a/environments/medconceptsqa/pyproject.toml
+++ b/environments/medconceptsqa/pyproject.toml
@@ -8,7 +8,7 @@ authors = [
     { name = "Anish Mahishi", email = "anish.mahishi@gmail.com" },
 ]
 dependencies = [
-    "verifiers>=0.1.5.post0",
+    "verifiers>=0.1.12,<0.2",
     "datasets>=4.1.1",
     "medarc-verifiers>=0.1.0",
 ]
diff --git a/environments/medec/pyproject.toml b/environments/medec/pyproject.toml
index 5931bf0a..ba2c971a 100644
--- a/environments/medec/pyproject.toml
+++ b/environments/medec/pyproject.toml
@@ -6,7 +6,7 @@ version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets",
     "openai>=1.3.0",
     "numpy",
diff --git a/environments/medexqa/pyproject.toml b/environments/medexqa/pyproject.toml
index bbcdd4d6..6ba9135f 100644
--- a/environments/medexqa/pyproject.toml
+++ b/environments/medexqa/pyproject.toml
@@ -9,7 +9,7 @@ authors = [
 ]
 dependencies = [
     "datasets>=4.0.0",
-    "verifiers>=0.1.2.post0",
+    "verifiers>=0.1.12,<0.2",
     "pandas>=2.0.0",
     "medarc-verifiers>=0.1.0",
 ]
diff --git a/environments/medhallu/pyproject.toml b/environments/medhallu/pyproject.toml
index e335d88e..c5bc5063 100644
--- a/environments/medhallu/pyproject.toml
+++ b/environments/medhallu/pyproject.toml
@@ -6,7 +6,7 @@ version = "0.1.0"
 requires-python = ">=3.10"
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
 ]
 
 [dependency-groups]
diff --git a/environments/medicationqa/pyproject.toml b/environments/medicationqa/pyproject.toml
index 06f523ee..f729fea8 100644
--- a/environments/medicationqa/pyproject.toml
+++ b/environments/medicationqa/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["medical", "qa", "consumer", "single-turn", "llm-judge"]
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "requests",
     "medarc-verifiers>=0.1.0",
     "openpyxl"
diff --git a/environments/medqa/pyproject.toml b/environments/medqa/pyproject.toml
index 6994e36e..138d6f8f 100644
--- a/environments/medqa/pyproject.toml
+++ b/environments/medqa/pyproject.toml
@@ -6,7 +6,7 @@ readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
     "datasets>=4.0.0",
-    "verifiers>=0.1.2.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/medrbench/pyproject.toml b/environments/medrbench/pyproject.toml
index ac12160d..81fd9e2e 100644
--- a/environments/medrbench/pyproject.toml
+++ b/environments/medrbench/pyproject.toml
@@ -8,7 +8,7 @@ authors = [
     { name = "Hunar Batra", email = "i@hunarbatra.com" },
 ]
 dependencies = [
-    "verifiers>=0.1.5.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
     "requests>=2.28.0",
 ]
diff --git a/environments/medredqa/pyproject.toml b/environments/medredqa/pyproject.toml
index 4e2ba001..a92d3366 100644
--- a/environments/medredqa/pyproject.toml
+++ b/environments/medredqa/pyproject.toml
@@ -9,7 +9,7 @@ authors = [
 ]
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.2.post0",
+    "verifiers>=0.1.12,<0.2",
     "datasets",
     "openai",
 ]
diff --git a/environments/medxpertqa/pyproject.toml b/environments/medxpertqa/pyproject.toml
index 021c6dd6..0a651ecc 100644
--- a/environments/medxpertqa/pyproject.toml
+++ b/environments/medxpertqa/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["eval"]
 version = "0.1.1"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets>=4.0.0",
     "medarc-verifiers>=0.1.0",
 ]
diff --git a/environments/meqsum/pyproject.toml b/environments/meqsum/pyproject.toml
index cfef2e00..dbc48a61 100644
--- a/environments/meqsum/pyproject.toml
+++ b/environments/meqsum/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["medical", "nlp", "summarization", "single-turn", "llm-judge", "nlg-metr
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets",
     "medarc-verifiers>=0.1.0"
 ]
diff --git a/environments/metamedqa/pyproject.toml b/environments/metamedqa/pyproject.toml
index 8924dec7..8dec10b8 100644
--- a/environments/metamedqa/pyproject.toml
+++ b/environments/metamedqa/pyproject.toml
@@ -8,7 +8,7 @@ authors = [
     { name = "Aymane Ouraq", email = "ouraqaymane@gmail.com" },
 ]
 dependencies = [
-    "verifiers>=0.1.2.post0", # same major/minor line as your working env
+    "verifiers>=0.1.12,<0.2",
     "datasets>=4.0.0",
     "medarc-verifiers>=0.1.0",
 ]
diff --git a/environments/mmlu_pro_health/pyproject.toml b/environments/mmlu_pro_health/pyproject.toml
index 15c36e1e..64391770 100644
--- a/environments/mmlu_pro_health/pyproject.toml
+++ b/environments/mmlu_pro_health/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["medical", "clinical", "single-turn", "multiple-choice", "mmlu", "evalua
 version = "0.1.1"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.3.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/mtsamples_procedures/pyproject.toml b/environments/mtsamples_procedures/pyproject.toml
index e9a119fd..84bd5818 100644
--- a/environments/mtsamples_procedures/pyproject.toml
+++ b/environments/mtsamples_procedures/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["medical", "procedures", "plan_generation", "single-turn", "llm-judge"]
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets>=4.1.1",
     "openai>=2.1.0",
     "medarc-verifiers>=0.1.0",
diff --git a/environments/mtsamples_replicate/pyproject.toml b/environments/mtsamples_replicate/pyproject.toml
index 5c854dd7..07a2dc0b 100644
--- a/environments/mtsamples_replicate/pyproject.toml
+++ b/environments/mtsamples_replicate/pyproject.toml
@@ -12,7 +12,7 @@ tags = [
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.12,<0.2",
     "datasets>=4.1.1",
     "openai>=2.1.0",
     "medarc-verifiers>=0.1.0",
diff --git a/environments/pubhealthbench/pyproject.toml b/environments/pubhealthbench/pyproject.toml
index c283fd19..fd5b671a 100644
--- a/environments/pubhealthbench/pyproject.toml
+++ b/environments/pubhealthbench/pyproject.toml
@@ -6,7 +6,7 @@ readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
     "datasets>=4.0.0",
-    "verifiers>=0.1.2.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/environments/pubmedqa/pyproject.toml b/environments/pubmedqa/pyproject.toml
index 216a6e6d..9955a8c3 100644
--- a/environments/pubmedqa/pyproject.toml
+++ b/environments/pubmedqa/pyproject.toml
@@ -8,7 +8,7 @@ authors = [
     { name = "Robert Scholz", email = "robert.scholz@maxplanckschools.de" },
 ]
 dependencies = [
-    "verifiers>=0.1.3.post0",
+    "verifiers>=0.1.12,<0.2",
     "datasets>= 4.0.0",
     "medarc-verifiers>=0.1.0",
 ]
diff --git a/environments/sctpublic/pyproject.toml b/environments/sctpublic/pyproject.toml
index 94a3e395..949c9cc3 100644
--- a/environments/sctpublic/pyproject.toml
+++ b/environments/sctpublic/pyproject.toml
@@ -9,7 +9,7 @@ authors = [
 ]
 dependencies = [
     "medarc-verifiers>=0.1.0",
-    "verifiers>=0.1.6.post0",
+    "verifiers>=0.1.12,<0.2",
 ]
 
 [build-system]
diff --git a/environments/supergpqa_medicine/pyproject.toml b/environments/supergpqa_medicine/pyproject.toml
index db8b268a..0bb6b46e 100644
--- a/environments/supergpqa_medicine/pyproject.toml
+++ b/environments/supergpqa_medicine/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["medicine", "single-turn", "multiple-choice", "supergpqa", "evaluation",
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.3.post0",
+    "verifiers>=0.1.12,<0.2",
     "medarc-verifiers>=0.1.0",
 ]
 
diff --git a/pyproject.toml b/pyproject.toml
index cd0c653d..a1cbd737 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ requires-python = ">=3.11"
 license = { file = "LICENSE" }
 dependencies = [
     "prime>=0.3.35",
-    "verifiers>=0.1.10,<0.1.12",
+    "verifiers>=0.1.12,<0.2",
     "pyyaml>=6.0.1",
     "docstring-parser>=0.17.0",
     "pylatexenc>=2.10",

From d009bc3f07defb851a0c7c6ed015f767cb8ef49e Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 18:36:17 +0000
Subject: [PATCH 02/53] Add upstream eval config adapter

---
 medarc_verifiers/cli/verifiers_adapter.py | 502 ++++++++++++++++++++++
 tests/test_cli/test_verifiers_adapter.py  | 288 +++++++++++++
 2 files changed, 790 insertions(+)
 create mode 100644 medarc_verifiers/cli/verifiers_adapter.py
 create mode 100644 tests/test_cli/test_verifiers_adapter.py

diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
new file mode 100644
index 00000000..3a979051
--- /dev/null
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -0,0 +1,502 @@
+"""Small adapter for upstream ``verifiers`` eval configuration.
+
+Upstream ``verifiers`` owns TOML loading and eval execution, but in 0.1.12 the
+``EvalConfig`` builder lives inside ``verifiers.scripts.eval.main()`` and cannot
+be imported directly. Keep this module deliberately narrow until upstream exposes
+a public builder.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import logging
+from collections.abc import Mapping
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, cast
+
+from verifiers.types import (
+    ClientConfig,
+    ClientType,
+    Endpoint,
+    EndpointClientConfig,
+    EvalConfig,
+)
+from verifiers.utils.eval_utils import load_endpoints, load_toml_config, resolve_endpoints_file
+from verifiers.utils.import_utils import load_toml
+from verifiers.utils.path_utils import find_latest_incomplete_eval_results_path, is_valid_eval_results_path
+
+from medarc_verifiers.utils.prime_inference import prime_inference_overrides
+from medarc_verifiers.utils.sampling_args import sanitize_sampling_args_for_openai
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_MODEL = "openai/gpt-4.1-mini"  # used when neither "model" nor "endpoint_id" is configured
+DEFAULT_ENV_DIR_PATH = "./environments"
+DEFAULT_ENDPOINTS_PATH = "./configs/endpoints.toml"  # registry path used when "endpoints_path" is absent
+DEFAULT_NUM_EXAMPLES = 5  # last fallback, after the eval mapping and the env package's own defaults
+DEFAULT_ROLLOUTS_PER_EXAMPLE = 3  # same fallback order as DEFAULT_NUM_EXAMPLES
+DEFAULT_MAX_CONCURRENT = 32
+DEFAULT_CLIENT_TYPE = "openai_chat_completions"  # used when no registry entry, preset or override names one
+DEFAULT_PROVIDER = "prime"  # PROVIDER_CONFIGS key used when no provider is given and no registry entry matches
+ADAPTER_TOML_FIELDS = {"debug", "header_from_state", "headers_from_state", "timeout"}  # extra keys allowed in TOML
+
+PROVIDER_CONFIGS: dict[str, dict[str, str]] = {  # provider name -> "url", "key", optional "client_type"
+    "prime": {
+        "url": "https://api.pinference.ai/api/v1",
+        "key": "PRIME_API_KEY",  # "key" values are passed on as ClientConfig.api_key_var
+    },
+    "openrouter": {
+        "url": "https://openrouter.ai/api/v1",
+        "key": "OPENROUTER_API_KEY",
+    },
+    "openai": {
+        "url": "https://api.openai.com/v1",
+        "key": "OPENAI_API_KEY",
+    },
+    "anthropic": {
+        "url": "https://api.anthropic.com",
+        "key": "ANTHROPIC_API_KEY",
+        "client_type": "anthropic_messages",  # the only preset here that sets its own client type
+    },
+    "minimax": {
+        "url": "https://api.minimax.chat/v1",
+        "key": "MINIMAX_API_KEY",
+    },
+    "deepseek": {
+        "url": "https://api.deepseek.com/v1",
+        "key": "DEEPSEEK_API_KEY",
+    },
+    "glm": {
+        "url": "https://open.bigmodel.cn/api/paas/v4",
+        "key": "GLM_API_KEY",
+    },
+    "local": {
+        "url": "http://localhost:8000/v1",
+        "key": "VLLM_API_KEY",
+    },
+    "vllm": {  # same url and key as "local"
+        "url": "http://localhost:8000/v1",
+        "key": "VLLM_API_KEY",
+    },
+}
+
+
+@dataclass(frozen=True)
+class EvalConfigOverrides:
+    """CLI-level overrides applied after TOML globals and per-eval fields."""
+
+    model: str | None = None  # when set, replaces raw["model"] and drops raw["endpoint_id"]
+    provider: str | None = None  # copied over the raw value when not None
+    api_base_url: str | None = None  # copied over the raw value when not None
+    api_key_var: str | None = None  # copied over the raw value when not None
+    api_client_type: str | None = None  # copied over the raw value when not None
+    endpoints_path: str | Path | None = None  # stored as str(...) when not None
+    max_concurrent: int | None = None  # copied over the raw value when not None
+    env_args: Mapping[str, Any] | None = None  # shallow-merged over raw env_args; empty mapping is a no-op
+    sampling_args: Mapping[str, Any] | None = None  # shallow-merged over raw sampling_args; empty is a no-op
+
+def load_toml_eval_configs(path: str | Path, *, extra_valid_fields: set[str] | None = None) -> list[dict[str, Any]]:
+    """Load upstream TOML eval configs, including ``[[ablation]]`` expansion."""
+
+    valid_fields = ADAPTER_TOML_FIELDS | (extra_valid_fields or set())  # adapter keys plus any caller extras
+    return load_toml_config(Path(path), extra_valid_fields=valid_fields)  # upstream loader does the parsing
+
+
+def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides | None = None) -> EvalConfig:
+    """Build an upstream ``EvalConfig`` from one loaded TOML/CLI eval mapping."""
+
+    merged_raw = _apply_overrides(dict(raw), overrides)  # shallow copy: caller's top-level mapping is not mutated
+    env_id = merged_raw["env_id"]  # required; a missing key raises KeyError
+
+    env_defaults = get_env_eval_defaults(env_id)  # may be empty; lookup failures are swallowed there
+    raw_num_examples = merged_raw.get("num_examples")
+    raw_rollouts = merged_raw.get("rollouts_per_example")
+    num_examples = (  # precedence: eval mapping, then env package defaults, then the module default
+        raw_num_examples if raw_num_examples is not None else env_defaults.get("num_examples", DEFAULT_NUM_EXAMPLES)
+    )
+    rollouts_per_example = (  # same precedence as num_examples
+        raw_rollouts
+        if raw_rollouts is not None
+        else env_defaults.get("rollouts_per_example", DEFAULT_ROLLOUTS_PER_EXAMPLE)
+    )
+
+    endpoints_path = str(merged_raw.get("endpoints_path", DEFAULT_ENDPOINTS_PATH))  # configured path or default
+    endpoints = load_endpoints(endpoints_path)
+    model, resolved_endpoint_id, client_config = _build_client_config(merged_raw, endpoints, endpoints_path)
+
+    sampling_args = _build_sampling_args(merged_raw, client_config.api_base_url)  # uses the resolved base URL
+    resume_path = _resolve_resume_path(
+        merged_raw,
+        env_id=env_id,
+        model=model,
+        num_examples=num_examples,
+        rollouts_per_example=rollouts_per_example,
+    )
+
+    extra_env_kwargs = dict(merged_raw.get("extra_env_kwargs", {}))  # copy before adding timeout_seconds
+    if merged_raw.get("timeout") is not None:  # adapter-only "timeout" key is forwarded as timeout_seconds
+        extra_env_kwargs["timeout_seconds"] = merged_raw["timeout"]
+
+    return EvalConfig(
+        env_id=env_id,
+        env_args=merged_raw.get("env_args", {}),
+        env_dir_path=merged_raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
+        output_dir=merged_raw.get("output_dir"),
+        extra_env_kwargs=extra_env_kwargs,
+        endpoint_id=resolved_endpoint_id,  # None when the model was not found in the endpoint registry
+        model=model,
+        client_config=client_config,
+        sampling_args=sampling_args,
+        num_examples=num_examples,
+        rollouts_per_example=rollouts_per_example,
+        max_concurrent=merged_raw.get("max_concurrent", DEFAULT_MAX_CONCURRENT),
+        max_retries=merged_raw.get("max_retries", 0),
+        num_workers=merged_raw.get("num_workers", "auto"),
+        disable_env_server=merged_raw.get("disable_env_server", False),
+        debug=merged_raw.get("debug", False),
+        verbose=merged_raw.get("verbose", False),
+        state_columns=merged_raw.get("state_columns", []),
+        save_results=merged_raw.get("save_results", False),
+        resume_path=resume_path,
+        independent_scoring=merged_raw.get("independent_scoring", False),
+        save_to_hf_hub=merged_raw.get("save_to_hf_hub", False),
+        hf_hub_dataset_name=merged_raw.get("hf_hub_dataset_name", ""),
+    )
+
+
+def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
+    """Read ``[tool.verifiers.eval]`` defaults from an installed env package."""
+
+    defaults: dict[str, Any] = {}
+    module_name = env_id.replace("-", "_").split("/")[-1]  # e.g. "owner/my-env" -> "my_env"
+
+    try:
+        spec = importlib.util.find_spec(module_name)
+        if spec is None:
+            raise ModuleNotFoundError(module_name)  # handled by the except clause below
+
+        if spec.submodule_search_locations:  # package: use its first search location
+            base_dir = Path(next(iter(spec.submodule_search_locations)))
+        elif spec.origin:  # plain module: use the directory containing its file
+            base_dir = Path(spec.origin).parent
+        else:
+            logger.debug("Could not determine module path for %s; skipping eval defaults", module_name)
+            return defaults
+
+        pyproject_file = _find_env_pyproject(base_dir)  # may point at a file that does not exist
+        if not pyproject_file.is_file():
+            logger.debug("pyproject.toml not found for installed module %s", module_name)
+            return defaults
+
+        with pyproject_file.open("rb") as handle:
+            pyproject_data = load_toml(handle)
+
+        eval_config = pyproject_data.get("tool", {}).get("verifiers", {}).get("eval", {})
+        if "num_examples" in eval_config:  # only these two keys are ever copied out
+            defaults["num_examples"] = eval_config["num_examples"]
+        if "rollouts_per_example" in eval_config:
+            defaults["rollouts_per_example"] = eval_config["rollouts_per_example"]
+    except ModuleNotFoundError:
+        logger.debug("Module %s not installed", module_name)
+    except Exception as exc:  # best effort: any other failure is logged at debug level, never raised
+        logger.debug("Could not load eval defaults from %s pyproject.toml: %s", module_name, exc)
+
+    return defaults  # possibly empty
+
+
+def _find_env_pyproject(base_dir: Path) -> Path:  # first existing pyproject.toml near base_dir
+    candidates = [base_dir / "pyproject.toml", base_dir.parent / "pyproject.toml"]  # own dir, then parent
+    for candidate in candidates:
+        if candidate.is_file():
+            return candidate
+    return candidates[0]  # none exists: still return the base_dir path; callers must check is_file()
+
+
+def _apply_overrides(raw: dict[str, Any], overrides: EvalConfigOverrides | None) -> dict[str, Any]:
+    if overrides is None:  # nothing to apply: the same dict object is returned unchanged
+        return raw
+
+    for field in ("provider", "api_base_url", "api_key_var", "api_client_type", "max_concurrent"):
+        value = getattr(overrides, field)
+        if value is not None:  # None means "not overridden"
+            raw[field] = value
+
+    if overrides.endpoints_path is not None:
+        raw["endpoints_path"] = str(overrides.endpoints_path)  # normalise Path to str
+
+    if overrides.model is not None:
+        raw["model"] = overrides.model
+        raw.pop("endpoint_id", None)  # _build_client_config rejects "model" together with "endpoint_id"
+
+    if overrides.env_args:  # truthiness check: an empty mapping is a no-op
+        raw["env_args"] = {**dict(raw.get("env_args", {})), **dict(overrides.env_args)}
+    if overrides.sampling_args:  # shallow merge into a new dict; override keys win
+        raw["sampling_args"] = {**dict(raw.get("sampling_args", {})), **dict(overrides.sampling_args)}
+
+    return raw  # the same dict that was passed in, mutated in place
+
+
+def _build_client_config(
+    raw: Mapping[str, Any], endpoints: Mapping[str, list[Endpoint]], endpoints_path: str
+) -> tuple[str, str | None, ClientConfig]:  # returns (model, resolved endpoint id or None, client config)
+    raw_endpoint_id = raw.get("endpoint_id")
+    raw_model_field = raw.get("model")
+    if raw_endpoint_id is not None and raw_model_field is not None:  # the two keys are mutually exclusive
+        raise ValueError("Cannot set both 'endpoint_id' and 'model' in eval config; choose one.")
+    if raw_endpoint_id is not None and not isinstance(raw_endpoint_id, str):
+        raise ValueError("'endpoint_id' must be a string when provided.")
+    if isinstance(raw_endpoint_id, str) and not raw_endpoint_id:
+        raise ValueError("'endpoint_id' must be a non-empty string when provided.")
+
+    resolved_endpoints_file = resolve_endpoints_file(endpoints_path)
+    if raw_endpoint_id is not None and (resolved_endpoints_file is None or resolved_endpoints_file.suffix != ".toml"):
+        raise ValueError(
+            "'endpoint_id' is only supported with TOML endpoint registries. Set endpoints_path to an endpoints.toml file."
+        )
+
+    raw_model = raw_model_field if raw_model_field is not None else DEFAULT_MODEL
+    endpoint_lookup_id = raw_endpoint_id if raw_endpoint_id is not None else raw_model  # model can be an alias
+    raw_api_base_url = raw.get("api_base_url")
+    if isinstance(raw_api_base_url, list):
+        raise ValueError(
+            "api_base_url lists are no longer supported. Use endpoint_id + endpoints.toml for multi-endpoint configuration."
+        )
+
+    raw_provider = raw.get("provider")
+    if raw_provider is not None and raw_provider not in PROVIDER_CONFIGS:
+        raise ValueError(f"Unknown provider '{raw_provider}'. Valid providers are: {sorted(PROVIDER_CONFIGS)}")
+
+    api_key_override = raw.get("api_key_var") is not None  # flags: which api_* values were set explicitly
+    api_base_url_override = raw_api_base_url is not None
+    client_type_override = raw.get("api_client_type") is not None
+    endpoint_group: list[Endpoint] | None = None
+    resolved_endpoint_id: str | None = None
+
+    if endpoint_lookup_id in endpoints:  # registry hit
+        endpoint_group = list(endpoints[endpoint_lookup_id])
+        resolved_endpoint_id = cast(str, endpoint_lookup_id)
+        endpoint = endpoint_group[0]  # first entry supplies key, url, client type and model
+
+        api_key_var = endpoint["key"]
+        api_base_url = endpoint["url"]
+        client_type = endpoint.get("api_client_type", DEFAULT_CLIENT_TYPE)
+
+        endpoint_models = {entry["model"] for entry in endpoint_group}
+        if len(endpoint_models) > 1:  # every entry behind one alias must name the same model
+            raise ValueError(
+                f"Endpoint alias '{endpoint_lookup_id}' maps to multiple model ids {sorted(endpoint_models)}, "
+                "which is not yet supported by EvalConfig."
+            )
+        model = endpoint["model"]
+
+        if raw_provider is not None:  # precedence: registry entry < provider preset < explicit api_* fields
+            provider_cfg = PROVIDER_CONFIGS[raw_provider]
+            api_key_var = provider_cfg["key"]
+            api_base_url = provider_cfg["url"]
+            client_type = provider_cfg.get("client_type", client_type)
+        if api_key_override:
+            api_key_var = raw["api_key_var"]
+        if api_base_url_override:
+            api_base_url = raw_api_base_url
+        if client_type_override:
+            client_type = raw["api_client_type"]
+    else:  # no registry entry: provider preset plus explicit api_* fields
+        if raw_endpoint_id is not None:  # an explicit endpoint_id must exist in the registry
+            raise ValueError(f"Endpoint id '{raw_endpoint_id}' not found in endpoint registry at {endpoints_path}")
+        provider_cfg = PROVIDER_CONFIGS[raw_provider or DEFAULT_PROVIDER]
+        model = raw_model
+        api_key_var = raw["api_key_var"] if api_key_override else provider_cfg["key"]
+        api_base_url = raw_api_base_url if api_base_url_override else provider_cfg["url"]
+        client_type = (
+            raw["api_client_type"] if client_type_override else provider_cfg.get("client_type", DEFAULT_CLIENT_TYPE)
+        )
+
+    if not isinstance(api_base_url, str):
+        raise ValueError("api_base_url must be a single string URL")
+    if not isinstance(api_key_var, str):
+        raise ValueError("api_key_var must be a string")
+
+    eval_headers_merged = _build_extra_headers(raw)
+    prime_headers, _ = prime_inference_overrides(api_base_url)  # second element (sampling overrides) unused here
+    eval_headers_from_state = {"X-Session-ID": "example_id", **_build_extra_headers_from_state(raw)}
+
+    registry_headers_base: dict[str, str] = {}
+    if endpoint_group is not None:
+        registry_headers_base = dict(endpoint_group[0].get("extra_headers", {}))
+    merged_headers = {**prime_headers, **registry_headers_base, **eval_headers_merged}  # rightmost source wins
+
+    endpoint_configs: list[EndpointClientConfig] = []  # stays empty unless an un-overridden alias has >1 entries
+    if endpoint_group is not None and not api_base_url_override and raw_provider is None and len(endpoint_group) > 1:
+        endpoint_configs = [
+            EndpointClientConfig(
+                api_key_var=api_key_var if api_key_override else endpoint["key"],  # explicit key applies to all
+                api_base_url=endpoint["url"],
+                extra_headers={**prime_headers, **dict(endpoint.get("extra_headers", {})), **eval_headers_merged},
+            )
+            for endpoint in endpoint_group
+        ]
+
+    client_config = ClientConfig(
+        client_type=cast(ClientType, client_type),
+        api_key_var=api_key_var,
+        api_base_url=api_base_url,
+        endpoint_configs=endpoint_configs,
+        extra_headers=merged_headers,
+        extra_headers_from_state=eval_headers_from_state,
+    )
+    return cast(str, model), resolved_endpoint_id, client_config
+
+
+def _build_sampling_args(raw: Mapping[str, Any], api_base_url: str) -> dict[str, Any]:
+    """Merge user sampling args over the prime-inference overrides, then pass them through the sanitizer."""
+    max_tokens, temperature = raw.get("max_tokens"), raw.get("temperature")
+    user_args = _merge_sampling_args(
+        raw.get("sampling_args"), max_tokens=max_tokens, temperature=temperature, include_none_max_tokens=True
+    )
+    _headers, prime_defaults = prime_inference_overrides(api_base_url)
+    # User-supplied values win because they are passed as the override side of the merge.
+    return sanitize_sampling_args_for_openai(_deep_merge(prime_defaults, user_args))
+
+
+def _merge_sampling_args(
+    sampling_args: Mapping[str, Any] | None,
+    *,
+    max_tokens: int | None = None,
+    temperature: float | None = None,
+    prefer_existing_keys: bool = True,
+    include_none_max_tokens: bool = False,
+) -> dict[str, Any]:
+    """Return a copy of ``sampling_args`` with the top-level max_tokens/temperature folded in."""
+    result = dict(sampling_args) if sampling_args else {}
+    may_set_max_tokens = "max_tokens" not in result or not prefer_existing_keys
+    if may_set_max_tokens and (max_tokens is not None or include_none_max_tokens):
+        result["max_tokens"] = max_tokens  # may be None when include_none_max_tokens is set
+    if temperature is not None and ("temperature" not in result or not prefer_existing_keys):
+        result["temperature"] = temperature
+    return result
+
+
+def _resolve_resume_path(
+    raw: Mapping[str, Any],
+    *,
+    env_id: str,
+    model: str,
+    num_examples: int,
+    rollouts_per_example: int,
+) -> Path | None:
+    """Resolve ``resume``/``resume_path``: a str is validated, True looks up the latest incomplete run."""
+    requested = raw.get("resume")
+    if requested is None and raw.get("resume_path") is not None:
+        requested = raw["resume_path"]  # fallback key, only consulted when "resume" is unset
+    if isinstance(requested, str):
+        explicit_path = Path(requested)
+        if is_valid_eval_results_path(explicit_path):
+            logger.info("Resuming from explicit path: %s", explicit_path)
+            return explicit_path
+        raise ValueError(f"Resume path {explicit_path} is not a valid evaluation results path")
+    if requested is not True:
+        if requested in (None, False):
+            return None
+        raise ValueError(f"Invalid value for --resume: {requested!r}")
+    found = find_latest_incomplete_eval_results_path(
+        env_id=env_id,
+        model=model,
+        num_examples=num_examples,
+        rollouts_per_example=rollouts_per_example,
+        env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
+        output_dir=raw.get("output_dir"),
+    )
+    if found is None:
+        logger.info("No matching incomplete run found for --resume; starting a new run")
+    else:
+        logger.info("Auto-resuming from: %s", found)
+    return found
+
+
+def _build_extra_headers(raw: Mapping[str, Any]) -> dict[str, str]:
+    """Combine the ``headers`` table with ``header`` list entries; list entries win on conflicts."""
+    table_value = raw.get("headers")
+    table = {} if table_value is None else _validate_header_mapping(table_value)
+
+    entries = raw.get("header") or []
+    if not isinstance(entries, list):
+        raise ValueError("'header' must be a list of 'Name: Value' strings")
+
+    parsed: dict[str, str] = {}
+    for entry in entries:
+        if not isinstance(entry, str):
+            raise ValueError(f"Each 'header' entry must be a string 'Name: Value', got: {entry!r}")
+        # Only the first colon separates name from value, so values may contain colons.
+        name, separator, content = entry.partition(":")
+        if not separator:
+            raise ValueError(f"--header must be 'Name: Value', got: {entry!r}")
+        name = name.strip()
+        if not name:
+            raise ValueError("--header name cannot be empty")
+        parsed[name] = content.strip()
+
+    return {**table, **parsed}
+
+
+def _build_extra_headers_from_state(raw: Mapping[str, Any]) -> dict[str, str]:
+    """Combine ``headers_from_state`` with ``header_from_state`` entries; list entries win on conflicts."""
+    table_value = raw.get("headers_from_state")
+    table = {} if table_value is None else _validate_header_mapping(table_value)
+
+    entries = raw.get("header_from_state") or []
+    if not isinstance(entries, list):
+        raise ValueError("'header_from_state' must be a list of 'Name: state_key' strings")
+
+    parsed: dict[str, str] = {}
+    for item in entries:
+        if not isinstance(item, str):
+            raise ValueError(f"Each 'header_from_state' entry must be a string 'Name: state_key', got: {item!r}")
+        # Split on the first colon only; both sides are trimmed before they are validated.
+        name, separator, state_key = (part.strip() for part in item.partition(":"))
+        if not separator:
+            raise ValueError(f"--header-from-state must be 'Name: state_key', got: {item!r}")
+        if not name:
+            raise ValueError("--header-from-state name cannot be empty")
+        # An empty state key is rejected as well as an empty header name.
+        if not state_key:
+            raise ValueError("--header-from-state state_key cannot be empty")
+        parsed[name] = state_key
+
+    return {**table, **parsed}
+
+
+def _deep_merge(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]:
+    """Return a new dict: ``base`` updated by ``override``, recursing where both values are mappings."""
+    result: dict[str, Any] = {**base}
+    for name, incoming in override.items():
+        current = result.get(name)
+        both_mappings = isinstance(incoming, Mapping) and isinstance(current, Mapping)
+        result[name] = _deep_merge(cast(Mapping[str, Any], current), incoming) if both_mappings else incoming
+    return result
+
+
+def _validate_header_mapping(value: object) -> dict[str, str]:
+    """Return a copy of ``value`` after checking it is a dict of non-blank str keys to str values."""
+    if not isinstance(value, dict):
+        raise ValueError("headers must be a dict")
+
+    # Validate every entry before building the copy; keys are kept exactly as given (not stripped).
+    for name, content in value.items():
+        if not (isinstance(name, str) and name.strip()):
+            raise ValueError("headers keys must be non-empty strings")
+        if not isinstance(content, str):
+            raise ValueError("headers values must be strings")
+    return dict(value)
+
+
+__all__ = [
+    "DEFAULT_MAX_CONCURRENT",
+    "DEFAULT_NUM_EXAMPLES",
+    "DEFAULT_ROLLOUTS_PER_EXAMPLE",
+    "EvalConfigOverrides",
+    "build_eval_config",
+    "get_env_eval_defaults",
+    "load_toml_eval_configs",
+]
diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py
new file mode 100644
index 00000000..b4caa925
--- /dev/null
+++ b/tests/test_cli/test_verifiers_adapter.py
@@ -0,0 +1,288 @@
+from __future__ import annotations
+
+import importlib
+import json
+from pathlib import Path
+
+import pytest
+
+from medarc_verifiers.cli.verifiers_adapter import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
+from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL
+
+
+def _write_endpoints(path: Path) -> Path:
+    path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "openai-alias"
+model = "openai/resolved"
+url = "https://openai.example/v1"
+key = "OPENAI_ALIAS_KEY"
+headers = { "X-Registry" = "1" }
+
+[[endpoint]]
+endpoint_id = "replica-alias"
+model = "replica/resolved"
+url = "https://replica-a.example/v1"
+key = "REPLICA_KEY_A"
+headers = { "X-Replica" = "a" }
+
+[[endpoint]]
+endpoint_id = "replica-alias"
+model = "replica/resolved"
+url = "https://replica-b.example/v1"
+key = "REPLICA_KEY_B"
+headers = { "X-Replica" = "b" }
+""".strip()
+    )
+    return path
+
+
+def test_load_toml_eval_configs_expands_ablation(tmp_path: Path) -> None:
+    config_path = tmp_path / "eval.toml"
+    endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
+    config_path.write_text(
+        f"""
+model = "openai/gpt-4.1-mini"
+endpoints_path = "{endpoints_path}"
+debug = true
+headers_from_state = {{ "X-Trace" = "trace_id" }}
+timeout = 30.0
+
+[[eval]]
+env_id = "medqa"
+
+[[ablation]]
+env_id = "medqa"
+env_args = {{ shuffle_answers = true }}
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+""".strip()
+    )
+
+    configs = load_toml_eval_configs(config_path)
+
+    assert [config["env_id"] for config in configs] == ["medqa", "medqa", "medqa"]
+    assert configs[0]["debug"] is True
+    assert configs[0]["headers_from_state"] == {"X-Trace": "trace_id"}
+    assert configs[0]["timeout"] == 30.0
+    assert configs[1]["env_args"] == {"shuffle_answers": True, "shuffle_seed": 1618}
+    assert configs[2]["env_args"] == {"shuffle_answers": True, "shuffle_seed": 9331}
+
+
+def test_build_eval_config_resolves_endpoint_alias_and_core_fields(tmp_path: Path) -> None:
+    endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
+    resume_path = tmp_path / "resume"
+    resume_path.mkdir()
+    (resume_path / "results.jsonl").write_text("")
+    (resume_path / "metadata.json").write_text("{}")
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "openai-alias",
+            "endpoints_path": str(endpoints_path),
+            "env_args": {"subset": "dev"},
+            "sampling_args": {"temperature": 0.2},
+            "max_tokens": 123,
+            "num_examples": 7,
+            "rollouts_per_example": 2,
+            "max_concurrent": 4,
+            "max_retries": 3,
+            "num_workers": 2,
+            "debug": True,
+            "timeout": 45.0,
+            "state_columns": ["question_id", "split"],
+            "save_results": True,
+            "resume_path": str(resume_path),
+            "independent_scoring": True,
+            "save_to_hf_hub": True,
+            "hf_hub_dataset_name": "org/dataset",
+            "headers": {"X-Eval": "table"},
+            "header": ["X-Eval: list", "X-Extra: 1"],
+            "headers_from_state": {"X-Trace": "trace_id"},
+            "header_from_state": ["X-User: user_id"],
+        }
+    )
+
+    assert config.env_id == "medqa"
+    assert config.endpoint_id == "openai-alias"
+    assert config.model == "openai/resolved"
+    assert config.env_args == {"subset": "dev"}
+    assert config.sampling_args["temperature"] == 0.2
+    assert config.sampling_args["max_tokens"] == 123
+    assert config.num_examples == 7
+    assert config.rollouts_per_example == 2
+    assert config.max_concurrent == 4
+    assert config.max_retries == 3
+    assert config.num_workers == 2
+    assert config.debug is True
+    assert config.extra_env_kwargs == {"timeout_seconds": 45.0}
+    assert config.state_columns == ["question_id", "split"]
+    assert config.save_results is True
+    assert config.resume_path == resume_path
+    assert config.independent_scoring is True
+    assert config.save_to_hf_hub is True
+    assert config.hf_hub_dataset_name == "org/dataset"
+    assert config.client_config.api_base_url == "https://openai.example/v1"
+    assert config.client_config.api_key_var == "OPENAI_ALIAS_KEY"
+    assert config.client_config.extra_headers == {"X-Registry": "1", "X-Eval": "list", "X-Extra": "1"}
+    assert config.client_config.extra_headers_from_state == {
+        "X-Session-ID": "example_id",
+        "X-Trace": "trace_id",
+        "X-User": "user_id",
+    }
+
+
+def test_build_eval_config_supports_endpoint_replicas(tmp_path: Path) -> None:
+    endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "replica-alias",
+            "endpoints_path": str(endpoints_path),
+        }
+    )
+
+    assert config.model == "replica/resolved"
+    assert [endpoint.api_base_url for endpoint in config.client_config.endpoint_configs] == [
+        "https://replica-a.example/v1",
+        "https://replica-b.example/v1",
+    ]
+    assert [endpoint.api_key_var for endpoint in config.client_config.endpoint_configs] == [
+        "REPLICA_KEY_A",
+        "REPLICA_KEY_B",
+    ]
+    assert [endpoint.extra_headers for endpoint in config.client_config.endpoint_configs] == [
+        {"X-Replica": "a"},
+        {"X-Replica": "b"},
+    ]
+
+
+def test_build_eval_config_provider_and_cli_overrides_precede_toml(tmp_path: Path) -> None:
+    endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "model": "openai-alias",
+            "endpoints_path": str(endpoints_path),
+            "provider": "openai",
+            "api_base_url": "https://toml.example/v1",
+            "api_key_var": "TOML_KEY",
+            "max_concurrent": 8,
+        },
+        overrides=EvalConfigOverrides(
+            provider="local",
+            api_base_url="http://127.0.0.1:9000/v1",
+            api_key_var="CLI_KEY",
+            max_concurrent=1,
+        ),
+    )
+
+    assert config.model == "openai/resolved"
+    assert config.client_config.api_base_url == "http://127.0.0.1:9000/v1"
+    assert config.client_config.api_key_var == "CLI_KEY"
+    assert config.max_concurrent == 1
+    assert config.client_config.endpoint_configs == []
+
+
+def test_build_eval_config_unknown_model_uses_prime_provider_by_default(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("PRIME_TEAM_ID", "team-123")
+
+    config = build_eval_config({"env_id": "medqa", "model": "prime-model", "sampling_args": {"top_k": 20}})
+
+    assert config.model == "prime-model"
+    assert config.client_config.api_base_url == PRIME_INFERENCE_URL
+    assert config.client_config.api_key_var == "PRIME_API_KEY"
+    assert config.client_config.extra_headers == {"X-Prime-Team-ID": "team-123"}
+    assert config.sampling_args["extra_body"]["usage"] == {"include": True}
+    assert config.sampling_args["extra_body"]["top_k"] == 20
+
+
+def test_build_eval_config_sanitizes_unknown_sampling_args() -> None:
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "provider": "openai",
+            "model": "openai/gpt-4.1-mini",
+            "sampling_args": {"temperature": 0.4, "top_k": 40, "extra_body": {"known": True}},
+        }
+    )
+
+    assert config.sampling_args["temperature"] == 0.4
+    assert config.sampling_args["extra_body"] == {"known": True, "top_k": 40}
+
+
+def test_build_eval_config_uses_env_pyproject_defaults(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    project_dir = tmp_path / "adapter_default_env_project"
+    package_dir = project_dir / "adapter_default_env"
+    package_dir.mkdir(parents=True)
+    (package_dir / "__init__.py").write_text("")
+    (project_dir / "pyproject.toml").write_text(
+        """
+[tool.verifiers.eval]
+num_examples = 11
+rollouts_per_example = 4
+""".strip()
+    )
+    monkeypatch.syspath_prepend(str(project_dir))
+    importlib.invalidate_caches()
+
+    config = build_eval_config(
+        {
+            "env_id": "adapter-default-env",
+            "provider": "openai",
+            "model": "openai/gpt-4.1-mini",
+        }
+    )
+
+    assert config.num_examples == 11
+    assert config.rollouts_per_example == 4
+
+
+def test_build_eval_config_rejects_invalid_resume_path(tmp_path: Path) -> None:
+    invalid_resume_path = tmp_path / "missing"
+
+    with pytest.raises(ValueError, match="not a valid evaluation results path"):
+        build_eval_config(
+            {
+                "env_id": "medqa",
+                "provider": "openai",
+                "model": "openai/gpt-4.1-mini",
+                "resume": str(invalid_resume_path),
+            }
+        )
+
+
+def test_build_eval_config_auto_resume_uses_upstream_path_lookup(tmp_path: Path) -> None:
+    output_dir = tmp_path / "outputs"
+    run_dir = output_dir / "evals" / "medqa--openai--gpt-4.1-mini" / "abc12345"
+    run_dir.mkdir(parents=True)
+    (run_dir / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+    (run_dir / "metadata.json").write_text(
+        json.dumps(
+            {
+                "env_id": "medqa",
+                "model": "openai/gpt-4.1-mini",
+                "num_examples": 2,
+                "rollouts_per_example": 1,
+            }
+        )
+    )
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "provider": "openai",
+            "model": "openai/gpt-4.1-mini",
+            "num_examples": 2,
+            "rollouts_per_example": 1,
+            "output_dir": str(output_dir),
+            "resume": True,
+        }
+    )
+
+    assert config.resume_path == run_dir

From 2f02d7298af8cf6ef064b93e59481405046b44ef Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 18:46:01 +0000
Subject: [PATCH 03/53] Add deterministic eval path utilities

---
 medarc_verifiers/cli/eval_identity.py | 361 ++++++++++++++++++++++++++
 tests/test_cli/test_eval_identity.py  | 283 ++++++++++++++++++++
 2 files changed, 644 insertions(+)
 create mode 100644 medarc_verifiers/cli/eval_identity.py
 create mode 100644 tests/test_cli/test_eval_identity.py

diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py
new file mode 100644
index 00000000..c56e41c6
--- /dev/null
+++ b/medarc_verifiers/cli/eval_identity.py
@@ -0,0 +1,361 @@
+"""Deterministic eval identity helpers for the TOML bench wrapper."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+from collections import Counter, defaultdict
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+MEDARC_CONFIG_FINGERPRINT_KEY = "medarc_config_fingerprint"
+MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY = "medarc_config_fingerprint_payload"
+MEDARC_VARIANT_ID_KEY = "variant_id"
+MEDARC_VARIANT_PAYLOAD_KEY = "variant_payload"
+
+_SLUG_PATTERN = re.compile(r"[^A-Za-z0-9._-]+")
+_MAX_SEGMENT_LENGTH = 80
+_MAX_VARIANT_ID_LENGTH = 160
+
+_SEMANTIC_SAMPLING_KEYS = {
+    "frequency_penalty",
+    "logit_bias",
+    "max_completion_tokens",
+    "max_tokens",
+    "min_p",
+    "n",
+    "presence_penalty",
+    "repetition_penalty",
+    "response_format",
+    "seed",
+    "stop",
+    "temperature",
+    "tool_choice",
+    "tools",
+    "top_k",
+    "top_p",
+}
+_EXCLUDED_SAMPLING_KEYS = {
+    "api_base_url",
+    "api_key",
+    "api_key_var",
+    "base_url",
+    "extra_headers",
+    "headers",
+    "max_retries",
+    "metadata",
+    "request_timeout",
+    "stream",
+    "timeout",
+}
+_EXCLUDED_EXTRA_BODY_KEYS = {
+    "metadata",
+    "provider",
+    "usage",
+}
+
+
+class UnclassifiedSamplingArgError(ValueError):
+    """Raised when fingerprinting cannot classify a sampling arg (unknown key or non-mapping extra_body)."""
+
+
+@dataclass(frozen=True)
+class EvalIdentity:
+    """Immutable model/env pair, optionally qualified by a variant id and its payload."""
+
+    model_id: str
+    env_id: str
+    variant_id: str | None = None  # None when no variant was assigned
+    variant_payload: dict[str, Any] | None = None
+
+    @property
+    def dataset_id(self) -> str:
+        """Return ``env_id``, suffixed with ``::variant_id`` when a variant is set."""
+        # An empty-string variant id still counts as set; only None selects the plain env id.
+        return self.env_id if self.variant_id is None else f"{self.env_id}::{self.variant_id}"
+
+
+@dataclass(frozen=True)
+class EvalPathPlan:
+    """Pairing of one eval config's identity with the directory planned for its results."""
+
+    identity: EvalIdentity  # model/env identity, including any variant fields
+    results_path: Path  # output location assigned to this config
+
+
+def slug_component(value: Any, *, max_length: int = _MAX_SEGMENT_LENGTH) -> str:
+    """Return a path-safe slug for one path component, digest-suffixed when it must be truncated."""
+
+    text = str(value)
+    cleaned = _SLUG_PATTERN.sub("-", text.strip()).strip("-._") or "value"
+    if len(cleaned) > max_length:
+        # Reserve 11 characters: a "-" separator plus a 10-character digest of the original text.
+        kept = cleaned[: max_length - 11].rstrip("-._")
+        return f"{kept}-{short_fingerprint(text, length=10)}"
+    return cleaned
+
+
+def plan_eval_paths(raw_configs: Sequence[Mapping[str, Any]], *, output_root: str | Path) -> list[EvalPathPlan]:
+    """Assign each config a deterministic results directory under ``output_root``.
+
+    The layout is ``<model slug>/<env slug>``; configs that share a model/env pair get an
+    extra variant directory derived from the semantic fields that differ within that group.
+    """
+    pairs = [(_model_id(raw), _env_id(raw)) for raw in raw_configs]
+    sources = [_semantic_variant_source(raw) for raw in raw_configs]
+    occurrences = Counter(pairs)
+    differing = _differing_fields_by_key(sources, pairs)
+
+    planned: list[EvalPathPlan] = []
+    for pair, source in zip(pairs, sources):
+        model_id, env_id = pair
+        target = Path(output_root) / slug_component(model_id) / slug_component(env_id)
+        payload: dict[str, Any] | None = None
+        variant: str | None = None
+        # Only pairs that occur more than once need a variant directory.
+        if occurrences[pair] > 1:
+            payload = extract_variant_payload(source, differing[pair])
+            variant = generate_variant_id(payload)
+            target = target / slug_component(variant, max_length=_MAX_VARIANT_ID_LENGTH)
+        identity = EvalIdentity(model_id=model_id, env_id=env_id, variant_id=variant, variant_payload=payload)
+        planned.append(EvalPathPlan(identity=identity, results_path=target))
+
+    _ensure_unique_paths(planned)
+    return planned
+
+
+def extract_variant_payload(config: Mapping[str, Any], field_names: Sequence[str]) -> dict[str, Any]:
+    """Return the canonicalized subset of ``config`` limited to ``field_names``.
+
+    The result follows the order of ``field_names``.
+    """
+
+    # Fields missing from the config are dropped rather than filled with None.
+    return {name: _canonicalize(config[name]) for name in field_names if name in config}
+
+
+def generate_variant_id(payload: Mapping[str, Any]) -> str:
+    """Build a stable, readable variant id of ``key-value`` segments joined by ``__``."""
+
+    if not payload:
+        return f"variant-{short_fingerprint(payload)}"
+
+    # Flatten one level of nesting so a mapping field yields one "field.subkey" entry per item.
+    flattened: list[tuple[str, Any]] = []
+    for field, content in sorted(payload.items()):
+        if isinstance(content, Mapping):
+            flattened.extend((f"{field}.{sub}", item) for sub, item in sorted(content.items()))
+        else:
+            flattened.append((field, content))
+    parts = [_variant_segment(label, item) for label, item in flattened]
+    readable = "__".join(parts)
+    if len(readable) <= _MAX_VARIANT_ID_LENGTH and not any(part.endswith("-hash") for part in parts):
+        return readable
+    return f"{readable[:120].rstrip('-._')}__{short_fingerprint(payload, length=12)}"
+
+
+def build_fingerprint_payload(config: Mapping[str, Any]) -> dict[str, Any]:
+    """Collect the fields that define an eval's identity for config-safe resume checks."""
+
+    # Top-level max_tokens/temperature are folded into sampling_args before normalization.
+    return {
+        "env_args": _canonicalize(config.get("env_args", {})),
+        "env_id": _env_id(config),
+        "model": _model_id(config),
+        "num_examples": config.get("num_examples"),
+        "rollouts_per_example": config.get("rollouts_per_example"),
+        "sampling_args": normalize_semantic_sampling_args(_sampling_args_with_top_level(config)),
+    }
+
+
+def config_fingerprint(config: Mapping[str, Any]) -> str:
+    """Return the 32-hex-character digest of the config's fingerprint payload."""
+    payload = build_fingerprint_payload(config)
+    return short_fingerprint(payload, length=32)
+
+
+def normalize_semantic_sampling_args(sampling_args: Mapping[str, Any] | None) -> dict[str, Any]:
+    """Reduce sampling args to the key-sorted subset that is tracked for fingerprinting.
+
+    Keys listed as excluded are dropped; a key with no policy raises UnclassifiedSamplingArgError.
+    """
+    semantic: dict[str, Any] = {}
+    for name, setting in (sampling_args or {}).items():
+        if name == "extra_body":
+            _merge_extra_body_semantics(semantic, setting)
+            continue
+        if name == "reasoning":
+            # Only the effort level is kept, stored under the flat "reasoning_effort" key.
+            effort = _extract_reasoning_effort(setting)
+            if effort is not None:
+                semantic["reasoning_effort"] = _canonicalize(effort)
+            continue
+        if name == "reasoning_effort" or name in _SEMANTIC_SAMPLING_KEYS:
+            semantic[name] = _canonicalize(setting)
+            continue
+        if name not in _EXCLUDED_SAMPLING_KEYS:
+            raise UnclassifiedSamplingArgError(f"Sampling arg '{name}' is not classified for resume fingerprinting.")
+
+    # Sort so the result does not depend on the key order of the input.
+    return dict(sorted(semantic.items()))
+
+
+def metadata_identity_fields(config: Mapping[str, Any], identity: EvalIdentity) -> dict[str, Any]:
+    """Return the fingerprint and variant fields to write alongside upstream metadata."""
+    fingerprint_payload = build_fingerprint_payload(config)
+    fields: dict[str, Any] = {
+        MEDARC_CONFIG_FINGERPRINT_KEY: short_fingerprint(fingerprint_payload, length=32),
+        MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY: fingerprint_payload,
+    }
+    fields[MEDARC_VARIANT_ID_KEY] = identity.variant_id
+    fields[MEDARC_VARIANT_PAYLOAD_KEY] = identity.variant_payload
+    return fields
+
+
+def short_fingerprint(value: Any, *, length: int = 12) -> str:
+    """Return the first ``length`` hex characters of the SHA-256 of ``value``'s canonical JSON."""
+    return hashlib.sha256(_canonical_json(value).encode("utf-8")).hexdigest()[:length]
+
+
+def _semantic_variant_source(config: Mapping[str, Any]) -> dict[str, Any]:
+    return {  # candidate fields for telling apart configs that share a model/env pair
+        "env_args": _canonicalize(config.get("env_args", {})),
+        "num_examples": config.get("num_examples"),
+        "rollouts_per_example": config.get("rollouts_per_example"),
+        "sampling_args": normalize_semantic_sampling_args(_sampling_args_with_top_level(config)),
+    }
+
+
+def _sampling_args_with_top_level(config: Mapping[str, Any]) -> dict[str, Any]:
+    """Copy ``sampling_args``, adding top-level max_tokens/temperature only where not already set."""
+    merged = dict(config.get("sampling_args") or {})
+    for name in (key for key in ("max_tokens", "temperature") if key in config):
+        merged.setdefault(name, config[name])
+    return merged
+
+
+def _differing_fields_by_key(
+    semantic_payloads: Sequence[Mapping[str, Any]], keys: Sequence[tuple[str, str]]
+) -> dict[tuple[str, str], list[str]]:
+    """Map each model/env key to the sorted payload fields whose values differ within its group."""
+    groups: dict[tuple[str, str], list[Mapping[str, Any]]] = {}
+    for group_key, source in zip(keys, semantic_payloads):
+        groups.setdefault(group_key, []).append(source)
+    result: dict[tuple[str, str], list[str]] = {}
+    for group_key, members in groups.items():
+        varying: list[str] = []
+        if len(members) >= 2:
+            candidates = sorted({name for member in members for name in member})
+            for name in candidates:
+                # Compare canonical JSON so unhashable values work; a missing field compares as null.
+                renderings = {_canonical_json(member.get(name)) for member in members}
+                if len(renderings) > 1:
+                    varying.append(name)
+        result[group_key] = varying
+    return result
+
+
+def _ensure_unique_paths(plans: Sequence[EvalPathPlan]) -> None:
+    """Raise ValueError naming every results path that is planned for more than one config."""
+    repeated = sorted(path for path, uses in Counter(plan.results_path for plan in plans).items() if uses > 1)
+    if repeated:
+        listing = ", ".join(str(path) for path in repeated)
+        raise ValueError(f"Deterministic eval path collision after variant planning: {listing}")
+
+
+def _variant_segment(key: str, value: Any) -> str:
+    """Render one ``key-value`` segment; container values also get an 8-character digest suffix."""
+    is_text = isinstance(value, str | bytes | bytearray)
+    is_container = isinstance(value, Mapping | Sequence) and not is_text
+    segment = f"{slug_component(key, max_length=40)}-{slug_component(_variant_value_text(value), max_length=80)}"
+    return f"{segment}-{short_fingerprint(value, length=8)}" if is_container else segment
+
+
+def _variant_value_text(value: Any) -> str:
+    """Return display text for a scalar variant value, or the placeholder "hash" for anything else."""
+    # bool is tested before the numeric case because bool is a subclass of int.
+    if isinstance(value, bool):
+        return "true" if value else "false"
+    if value is None:
+        return "none"
+    return str(value) if isinstance(value, int | float | str) else "hash"
+
+
+def _merge_extra_body_semantics(normalized: dict[str, Any], extra_body: Any) -> None:
+    """Fold the semantic entries of ``extra_body`` into ``normalized`` in place, raising on unclassified keys."""
+    if not isinstance(extra_body, Mapping):
+        raise UnclassifiedSamplingArgError("sampling_args.extra_body must be a mapping for resume fingerprinting.")
+
+    # Keys excluded at either the extra_body level or the top level are skipped silently.
+    ignored = _EXCLUDED_EXTRA_BODY_KEYS | _EXCLUDED_SAMPLING_KEYS
+    for name, setting in extra_body.items():
+        if name in _SEMANTIC_SAMPLING_KEYS:
+            normalized[name] = _canonicalize(setting)
+        elif name == "reasoning":
+            effort = _extract_reasoning_effort(setting)
+            if effort is not None:
+                normalized["reasoning_effort"] = _canonicalize(effort)
+        elif name not in ignored:
+            message = f"Sampling arg 'extra_body.{name}' is not classified for resume fingerprinting."
+            raise UnclassifiedSamplingArgError(message)
+
+
+def _extract_reasoning_effort(value: Any) -> Any:
+    """Return the reasoning effort from a mapping, or None when ``value`` is not a mapping."""
+    # "effort" is preferred; "reasoning_effort" is the fallback when it is missing or falsy.
+    return (value.get("effort") or value.get("reasoning_effort")) if isinstance(value, Mapping) else None
+
+
+def _model_id(config: Mapping[str, Any]) -> str:
+    """Return the config's ``model`` as a string, raising ValueError when it is missing or empty."""
+    if model := config.get("model"):
+        return str(model)
+    raise ValueError(
+        "Eval config must include resolved 'model' for deterministic identity; build EvalConfig before planning paths."
+    )
+
+
+def _env_id(config: Mapping[str, Any]) -> str:
+    """Return the config's ``env_id`` as a string, raising ValueError when it is missing or empty."""
+    if env_id := config.get("env_id"):
+        return str(env_id)
+    raise ValueError("Eval config must include 'env_id' for deterministic identity.")
+
+
+def _canonical_json(value: Any) -> str:  # compact, key-sorted JSON; non-JSON types fall back to str()
+    return json.dumps(_canonicalize(value), sort_keys=True, separators=(",", ":"), default=str)
+
+
+def _canonicalize(value: Any) -> Any:
+    """Recursively convert ``value`` to plain containers with string-keyed, key-sorted mappings."""
+    if isinstance(value, Mapping):
+        # Sort by str(key), as the set branch does, so mixed-type keys cannot raise TypeError.
+        return {str(key): _canonicalize(value[key]) for key in sorted(value, key=str)}
+    if isinstance(value, list | tuple):
+        return [_canonicalize(item) for item in value]
+    if isinstance(value, set | frozenset):
+        return [_canonicalize(item) for item in sorted(value, key=str)]
+    return str(value) if isinstance(value, Path) else value
+
+
+__all__ = [
+    "EvalIdentity",
+    "EvalPathPlan",
+    "MEDARC_CONFIG_FINGERPRINT_KEY",
+    "MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY",
+    "MEDARC_VARIANT_ID_KEY",
+    "MEDARC_VARIANT_PAYLOAD_KEY",
+    "UnclassifiedSamplingArgError",
+    "build_fingerprint_payload",
+    "config_fingerprint",
+    "extract_variant_payload",
+    "generate_variant_id",
+    "metadata_identity_fields",
+    "normalize_semantic_sampling_args",
+    "plan_eval_paths",
+    "short_fingerprint",
+    "slug_component",
+]
diff --git a/tests/test_cli/test_eval_identity.py b/tests/test_cli/test_eval_identity.py
new file mode 100644
index 00000000..6ce61c72
--- /dev/null
+++ b/tests/test_cli/test_eval_identity.py
@@ -0,0 +1,283 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from medarc_verifiers.cli.eval_identity import (
+    MEDARC_CONFIG_FINGERPRINT_KEY,
+    MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
+    MEDARC_VARIANT_ID_KEY,
+    MEDARC_VARIANT_PAYLOAD_KEY,
+    UnclassifiedSamplingArgError,
+    build_fingerprint_payload,
+    config_fingerprint,
+    generate_variant_id,
+    metadata_identity_fields,
+    normalize_semantic_sampling_args,
+    plan_eval_paths,
+    slug_component,
+)
+
+
+def test_unique_model_env_path_uses_plain_dataset_directory(tmp_path: Path) -> None:
+    [plan] = plan_eval_paths(
+        [{"model": "openai/gpt-5-mini", "env_id": "medqa"}],
+        output_root=tmp_path / "runs" / "evals",
+    )
+
+    assert plan.identity.model_id == "openai/gpt-5-mini"
+    assert plan.identity.env_id == "medqa"
+    assert plan.identity.variant_id is None
+    assert plan.identity.variant_payload is None
+    assert plan.results_path == tmp_path / "runs" / "evals" / "openai-gpt-5-mini" / "medqa"
+
+
+def test_duplicate_model_env_paths_use_deterministic_variants(tmp_path: Path) -> None:
+    plans = plan_eval_paths(
+        [
+            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 1618}},
+            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 9331}},
+        ],
+        output_root=tmp_path,
+    )
+
+    assert [plan.identity.variant_id for plan in plans] == ["env_args.shuffle_seed-1618", "env_args.shuffle_seed-9331"]
+    assert plans[0].identity.variant_payload == {"env_args": {"shuffle_seed": 1618}}
+    assert plans[1].identity.variant_payload == {"env_args": {"shuffle_seed": 9331}}
+    assert plans[0].results_path == tmp_path / "gpt-5-mini" / "medqa" / "env_args.shuffle_seed-1618"
+    assert plans[1].results_path == tmp_path / "gpt-5-mini" / "medqa" / "env_args.shuffle_seed-9331"
+
+
+def test_duplicate_model_env_variant_can_use_sampling_args(tmp_path: Path) -> None:
+    plans = plan_eval_paths(
+        [
+            {"model": "gpt-5-mini", "env_id": "medqa", "sampling_args": {"temperature": 0.0}},
+            {"model": "gpt-5-mini", "env_id": "medqa", "sampling_args": {"temperature": 0.7}},
+        ],
+        output_root=tmp_path,
+    )
+
+    assert [plan.identity.variant_id for plan in plans] == [
+        "sampling_args.temperature-0.0",
+        "sampling_args.temperature-0.7",
+    ]
+
+
+def test_duplicate_model_env_variant_canonicalizes_sampling_args(tmp_path: Path) -> None:
+    plans = plan_eval_paths(
+        [
+            {"model": "gpt-5-mini", "env_id": "medqa", "sampling_args": {"reasoning_effort": "medium"}},
+            {
+                "model": "gpt-5-mini",
+                "env_id": "medqa",
+                "sampling_args": {"extra_body": {"reasoning": {"effort": "high"}}},
+            },
+        ],
+        output_root=tmp_path,
+    )
+
+    assert [plan.identity.variant_id for plan in plans] == [
+        "sampling_args.reasoning_effort-medium",
+        "sampling_args.reasoning_effort-high",
+    ]
+
+
+def test_long_nested_variant_values_use_stable_fingerprint() -> None:
+    payload = {
+        "env_args": {
+            "rubric": {
+                "criteria": ["clinically grounded", "concise", "safe"],
+                "description": "x" * 240,  # long enough that the value cannot be inlined verbatim
+            }
+        }
+    }
+
+    variant_id = generate_variant_id(payload)
+
+    assert len(variant_id) <= 160
+    assert variant_id == generate_variant_id(payload)  # stable: recomputing yields the identical id
+    assert "env_args.rubric-hash" in variant_id
+
+
+def test_fingerprint_stable_across_key_ordering() -> None:
+    left = {
+        "env_id": "medqa",
+        "model": "gpt-5-mini",
+        "env_args": {"b": 2, "a": 1},
+        "sampling_args": {"top_p": 0.9, "temperature": 0.1},
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+    right = {
+        "rollouts_per_example": 1,
+        "num_examples": 10,
+        "sampling_args": {"temperature": 0.1, "top_p": 0.9},
+        "env_args": {"a": 1, "b": 2},
+        "model": "gpt-5-mini",
+        "env_id": "medqa",
+    }
+
+    assert config_fingerprint(left) == config_fingerprint(right)
+    assert build_fingerprint_payload(left) == build_fingerprint_payload(right)
+
+
+@pytest.mark.parametrize(
+    "changed",
+    [
+        {"env_args": {"shuffle_seed": 9331}},
+        {"sampling_args": {"temperature": 0.8}},
+        {"max_tokens": 1024},
+        {"num_examples": 11},
+        {"rollouts_per_example": 2},
+    ],
+)
+def test_fingerprint_changes_for_semantic_benchmark_changes(changed: dict[str, object]) -> None:
+    base = {
+        "env_id": "medqa",
+        "model": "gpt-5-mini",
+        "env_args": {"shuffle_seed": 1618},
+        "sampling_args": {"temperature": 0.2},
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+    candidate = {**base, **changed}
+
+    assert config_fingerprint(base) != config_fingerprint(candidate)
+
+
+@pytest.mark.parametrize(
+    "changed",
+    [
+        {"provider": "openai"},
+        {"api_base_url": "http://localhost:9000/v1"},
+        {"endpoint_id": "local-alias"},
+        {"api_key_var": "LOCAL_KEY"},
+        {"api_client_type": "openai_chat_completions"},
+        {"timeout": 120},
+        {"max_concurrent": 1},
+        {"max_retries": 5},
+        {"headers": {"X-Prime-Team-ID": "team"}},
+        {"sampling_args": {"temperature": 0.2, "extra_body": {"usage": {"include": True}}}},
+    ],
+)
+def test_fingerprint_ignores_provider_transport_and_runtime_changes(changed: dict[str, object]) -> None:
+    base = {
+        "env_id": "medqa",
+        "model": "gpt-5-mini",
+        "env_args": {"shuffle_seed": 1618},
+        "sampling_args": {"temperature": 0.2},
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+    candidate = {**base, **changed}
+
+    assert config_fingerprint(base) == config_fingerprint(candidate)
+
+
+def test_variant_planning_ignores_runtime_fields_in_identity(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="Deterministic eval path collision"):
+        plan_eval_paths(
+            [
+                {"model": "gpt-5-mini", "env_id": "medqa", "max_concurrent": 1},
+                {"model": "gpt-5-mini", "env_id": "medqa", "max_concurrent": 32, "timeout": 120},
+            ],
+            output_root=tmp_path,
+        )
+
+
+def test_reasoning_effort_shapes_fingerprint_identically() -> None:
+    native = {
+        "env_id": "medqa",
+        "model": "gpt-5-mini",
+        "sampling_args": {"reasoning_effort": "medium", "temperature": 0.2},
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+    openrouter = {
+        "env_id": "medqa",
+        "model": "gpt-5-mini",
+        "sampling_args": {"extra_body": {"reasoning": {"effort": "medium"}}, "temperature": 0.2},
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+
+    assert config_fingerprint(native) == config_fingerprint(openrouter)
+    assert build_fingerprint_payload(native)["sampling_args"] == {
+        "reasoning_effort": "medium",
+        "temperature": 0.2,
+    }
+
+
+def test_top_level_sampling_aliases_match_sampling_args_shape() -> None:
+    top_level = {
+        "env_id": "medqa",
+        "model": "gpt-5-mini",
+        "temperature": 0.2,
+        "max_tokens": 256,
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+    nested = {
+        "env_id": "medqa",
+        "model": "gpt-5-mini",
+        "sampling_args": {"temperature": 0.2, "max_tokens": 256},
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+
+    assert config_fingerprint(top_level) == config_fingerprint(nested)
+    assert build_fingerprint_payload(top_level)["sampling_args"] == {"max_tokens": 256, "temperature": 0.2}
+
+
+def test_extra_body_semantic_args_match_top_level_shape() -> None:
+    assert normalize_semantic_sampling_args({"top_k": 20}) == normalize_semantic_sampling_args(
+        {"extra_body": {"top_k": 20}}
+    )
+
+
+def test_unclassified_sampling_args_refuse_fingerprint() -> None:
+    with pytest.raises(UnclassifiedSamplingArgError, match="vendor_knob"):
+        normalize_semantic_sampling_args({"vendor_knob": True})
+
+    with pytest.raises(UnclassifiedSamplingArgError, match="extra_body.vendor_knob"):
+        normalize_semantic_sampling_args({"extra_body": {"vendor_knob": True}})
+
+
+def test_endpoint_alias_without_resolved_model_is_rejected() -> None:
+    with pytest.raises(ValueError, match="resolved 'model'"):
+        config_fingerprint({"endpoint_id": "gpt-alias", "env_id": "medqa"})
+
+
+def test_metadata_identity_fields_include_planned_keys(tmp_path: Path) -> None:
+    plan = plan_eval_paths(
+        [
+            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 1618}},
+            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 9331}},
+        ],
+        output_root=tmp_path,
+    )[0]
+    config = {
+        "model": "gpt-5-mini",
+        "env_id": "medqa",
+        "env_args": {"shuffle_seed": 1618},
+        "sampling_args": {"temperature": 0.2},
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+
+    fields = metadata_identity_fields(config, plan.identity)
+
+    assert fields[MEDARC_CONFIG_FINGERPRINT_KEY] == config_fingerprint(config)
+    assert fields[MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY] == build_fingerprint_payload(config)
+    assert fields[MEDARC_VARIANT_ID_KEY] == "env_args.shuffle_seed-1618"
+    assert fields[MEDARC_VARIANT_PAYLOAD_KEY] == {"env_args": {"shuffle_seed": 1618}}
+
+
+def test_slug_component_is_path_safe_and_stable_for_long_values() -> None:
+    slug = slug_component(" openai/gpt-5:mini " + "x" * 120)
+
+    assert "/" not in slug
+    assert ":" not in slug
+    assert len(slug) <= 80
+    assert slug == slug_component(" openai/gpt-5:mini " + "x" * 120)

From a80e43ae393ba2440297182ef6234926c5a56a78 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 18:55:10 +0000
Subject: [PATCH 04/53] Add TOML bench dry run

---
 configs/endpoints.toml                |  4 ++
 configs/eval/smoke.toml               |  7 ++
 medarc_verifiers/cli/_constants.py    |  1 +
 medarc_verifiers/cli/eval_identity.py |  3 +
 medarc_verifiers/cli/main.py          | 80 +++++++++++++++++++++-
 tests/test_cli/test_eval_identity.py  | 14 ++++
 tests/test_cli/test_main.py           | 96 +++++++++++++++++++++++++++
 7 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 configs/endpoints.toml
 create mode 100644 configs/eval/smoke.toml

diff --git a/configs/endpoints.toml b/configs/endpoints.toml
new file mode 100644
index 00000000..363e12ca
--- /dev/null
+++ b/configs/endpoints.toml
@@ -0,0 +1,4 @@
+# Default upstream verifiers endpoint registry.
+#
+# Add [[endpoint]] entries here to resolve endpoint_id aliases. An empty registry
+# is valid; provider/model defaults are used when no alias matches.
diff --git a/configs/eval/smoke.toml b/configs/eval/smoke.toml
new file mode 100644
index 00000000..20bf5d75
--- /dev/null
+++ b/configs/eval/smoke.toml
@@ -0,0 +1,7 @@
+model = "openai/gpt-4.1-mini"
+save_results = true
+
+[[eval]]
+env_id = "medqa"
+num_examples = 1
+rollouts_per_example = 1
diff --git a/medarc_verifiers/cli/_constants.py b/medarc_verifiers/cli/_constants.py
index a466e47b..faa84092 100644
--- a/medarc_verifiers/cli/_constants.py
+++ b/medarc_verifiers/cli/_constants.py
@@ -18,6 +18,7 @@
 DEFAULT_ENDPOINTS_PATH = Path("configs") / "endpoints.toml"
 DEFAULT_ENV_DIR = Path("environments")
 DEFAULT_ENV_CONFIG_ROOT = Path("configs") / "envs"
+DEFAULT_EVALS_DIR = Path("runs") / "evals"
 DEFAULT_RUNS_RAW_DIR = Path("runs") / "raw"
 DEFAULT_PROCESSED_DIR = Path("runs") / "processed"
 DEFAULT_WINRATE_DIR = DEFAULT_PROCESSED_DIR / "winrate"
diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py
index c56e41c6..f738a186 100644
--- a/medarc_verifiers/cli/eval_identity.py
+++ b/medarc_verifiers/cli/eval_identity.py
@@ -151,6 +151,9 @@ def generate_variant_id(payload: Mapping[str, Any]) -> str:
         else:
             segments.append(_variant_segment(key, value))
 
+    if not segments:
+        return "baseline"
+
     variant_id = "__".join(segments)
     if len(variant_id) <= _MAX_VARIANT_ID_LENGTH and all(not segment.endswith("-hash") for segment in segments):
         return variant_id
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index 97ca6e50..b14b6a20 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -22,6 +22,7 @@
     DEFAULT_API_BASE_URL,
     DEFAULT_API_KEY_VAR,
     DEFAULT_ENDPOINTS_PATH,
+    DEFAULT_EVALS_DIR,
     DEFAULT_ENV_CONFIG_ROOT,
     DEFAULT_ENV_DIR,
     DEFAULT_PROCESSED_DIR,
@@ -35,6 +36,7 @@
 from medarc_verifiers.cli._manifest_planner import ManifestPlanner
 from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, EnvironmentExportConfig
 from medarc_verifiers.cli._single_run import run_single_mode
+from medarc_verifiers.cli.eval_identity import EvalPathPlan, plan_eval_paths
 from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub
 from medarc_verifiers.cli.process import PROCESS_DEFAULT_STATUS_FILTER, ProcessOptions, ProcessResult, run_process
 from medarc_verifiers.cli.utils.config_io import load_mapping_file
@@ -46,6 +48,7 @@
     slugify,
     validate_simple_name,
 )
+from medarc_verifiers.cli.verifiers_adapter import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
 from medarc_verifiers.utils.pathing import resolve_under
 from medarc_verifiers.cli.winrate import (
     WinrateConfig,
@@ -65,7 +68,7 @@ def build_batch_parser() -> argparse.ArgumentParser:
         prog=COMMAND,
         description="Run MedARC evaluations using unified configuration files.",
     )
-    parser.add_argument("-c", "--config", required=True, type=Path, help="Path to a run configuration YAML file.")
+    parser.add_argument("-c", "--config", required=True, type=Path, help="Path to a benchmark configuration file.")
     parser.add_argument(
         "--run-id",
         help="Override the generated run identifier (simple name only: no slashes, no '..', not absolute).",
@@ -139,6 +142,9 @@ def build_batch_parser() -> argparse.ArgumentParser:
             "Useful when pointing a config at a dynamically assigned endpoint."
         ),
     )
+    parser.add_argument("--api-key-var", default=None, help="Override API key environment variable for TOML bench.")
+    parser.add_argument("--provider", default=None, help="Override provider shorthand for TOML bench.")
+    parser.add_argument("--model", "-m", default=None, help="Override model for every TOML eval.")
     parser.add_argument(
         "--job-id", action="append", help="Run only the specified job identifier (repeat to select multiple)."
     )
@@ -549,6 +555,16 @@ def _run_batch_mode(argv: Sequence[str]) -> int:
     except ValueError as exc:
         parser.error(str(exc))
 
+    config_path = Path(args.config).expanduser()
+    if config_path.suffix.lower() == ".toml":
+        if not args.dry_run:
+            parser.error("TOML bench execution is not available yet; use --dry-run in this transition commit.")
+        try:
+            return _dry_run_toml_bench(args)
+        except Exception as exc:  # noqa: BLE001
+            logger.exception("TOML bench dry-run failed: %s", exc)
+            return 1
+
     if args.restart:
         args.auto_resume = False
     # Restarting is an explicit workflow; disable auto-resume selection when --restart is set.
@@ -1346,6 +1362,8 @@ def _execute_batch(args: argparse.Namespace) -> int:
         os.environ["MEDARC_INCLUDE_USAGE"] = "true" if args.include_usage else "false"
 
     config_path = Path(args.config).expanduser()
+    if config_path.suffix.lower() in {".yaml", ".yml"}:
+        logger.warning("YAML benchmark configs will be removed; convert to TOML.")
     env_root_override = Path(args.env_config_root).expanduser().resolve() if args.env_config_root else None
     run_config = load_run_config(config_path, env_default_root=env_root_override)
 
@@ -1558,6 +1576,66 @@ def _execute_batch(args: argparse.Namespace) -> int:
     return 1 if has_failures else 0
 
 
+def _dry_run_toml_bench(args: argparse.Namespace) -> int:
+    config_path = Path(args.config).expanduser()
+    raw_configs = load_toml_eval_configs(config_path)
+    overrides = EvalConfigOverrides(
+        model=args.model,
+        provider=args.provider,
+        api_base_url=args.api_base_url,
+        api_key_var=args.api_key_var,
+        endpoints_path=args.endpoints_path if getattr(args, "endpoints_path_explicit", False) else None,
+        max_concurrent=args.max_concurrent,
+        env_args=getattr(args, "cli_env_args", None),
+        sampling_args=getattr(args, "cli_sampling_args", None),
+    )
+    eval_configs = [build_eval_config(raw, overrides=overrides) for raw in raw_configs]
+    plan_inputs = [_eval_config_identity_payload(config) for config in eval_configs]
+    output_root = Path(args.output_dir).expanduser() if args.output_dir else DEFAULT_EVALS_DIR
+    path_plans = plan_eval_paths(plan_inputs, output_root=output_root)
+
+    _print_toml_bench_plan(eval_configs, path_plans)
+    return 0
+
+
+def _eval_config_identity_payload(config: Any) -> dict[str, Any]:
+    return {
+        "env_args": dict(config.env_args or {}),
+        "env_id": config.env_id,
+        "model": config.model,
+        "num_examples": config.num_examples,
+        "rollouts_per_example": config.rollouts_per_example,
+        "sampling_args": dict(config.sampling_args or {}),
+    }
+
+
+def _print_toml_bench_plan(eval_configs: Sequence[Any], path_plans: Sequence[EvalPathPlan]) -> None:
+    console = Console(width=240)
+    table = Table(title="TOML Bench Dry Run", caption=f"{len(eval_configs)} eval(s) to dry-run", expand=True)
+    table.add_column("#", justify="right", style="dim")
+    table.add_column("Model", style="magenta", overflow="fold")
+    table.add_column("Environment", style="green", overflow="fold")
+    table.add_column("Variant", style="cyan", overflow="fold")
+    table.add_column("Examples", justify="right")
+    table.add_column("Rollouts", justify="right")
+    table.add_column("Max Concurrency", justify="right")
+    table.add_column("Output Path", overflow="fold")
+
+    for index, (config, path_plan) in enumerate(zip(eval_configs, path_plans), start=1):
+        table.add_row(
+            str(index),
+            config.model,
+            config.env_id,
+            path_plan.identity.variant_id or "-",
+            str(config.num_examples),
+            str(config.rollouts_per_example),
+            str(config.max_concurrent),
+            str(path_plan.results_path),
+        )
+
+    console.print(table)
+
+
 def _build_effective_args(
     jobs: Sequence[ResolvedJob],
 ) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]:
diff --git a/tests/test_cli/test_eval_identity.py b/tests/test_cli/test_eval_identity.py
index 6ce61c72..d524aeac 100644
--- a/tests/test_cli/test_eval_identity.py
+++ b/tests/test_cli/test_eval_identity.py
@@ -49,6 +49,20 @@ def test_duplicate_model_env_paths_use_deterministic_variants(tmp_path: Path) ->
     assert plans[1].results_path == tmp_path / "gpt-5-mini" / "medqa" / "env_args.shuffle_seed-9331"
 
 
+def test_duplicate_model_env_baseline_gets_explicit_variant(tmp_path: Path) -> None:
+    plans = plan_eval_paths(
+        [
+            {"model": "gpt-5-mini", "env_id": "medqa"},
+            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 1618}},
+        ],
+        output_root=tmp_path,
+    )
+
+    assert [plan.identity.variant_id for plan in plans] == ["baseline", "env_args.shuffle_seed-1618"]
+    assert plans[0].identity.variant_payload == {"env_args": {}}
+    assert plans[0].results_path == tmp_path / "gpt-5-mini" / "medqa" / "baseline"
+
+
 def test_duplicate_model_env_variant_can_use_sampling_args(tmp_path: Path) -> None:
     plans = plan_eval_paths(
         [
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index c78b6cad..2603932c 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -169,6 +169,102 @@ async def fake_run(config):
     assert manifest["jobs"][0]["status"] == "completed"
 
 
+def test_toml_bench_dry_run_expands_evals_and_ablations(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+        save_results = true
+
+        [[eval]]
+        env_id = "medqa"
+        num_examples = 1
+        rollouts_per_example = 1
+
+        [[ablation]]
+        env_id = "medqa"
+        num_examples = 1
+        rollouts_per_example = 1
+        env_args = { shuffle_answers = true }
+
+        [ablation.sweep.env_args]
+        shuffle_seed = [1618, 9331]
+        """,
+    )
+
+    def fail_execute_jobs(*_args, **_kwargs):
+        raise AssertionError("execute_jobs should not be called for TOML dry-run")
+
+    monkeypatch.setattr(main, "execute_jobs", fail_execute_jobs)
+    exit_code = main.main(
+        [
+            "bench",
+            "--config",
+            str(config_path),
+            "--dry-run",
+            "--output-dir",
+            str(tmp_path / "evals"),
+            "--max-concurrent",
+            "1",
+        ]
+    )
+
+    output = capsys.readouterr().out
+    assert exit_code == 0
+    assert "TOML Bench Dry Run" in output
+    assert "3 eval(s) to dry-run" in output
+    assert "baseline" in output
+    assert "env_args.shuffle_seed-1618" in output
+    assert "env_args.shuffle_seed-9331" in output
+    assert str(tmp_path / "evals" / "gpt-5-mini" / "medqa" / "baseline") in output
+
+
+def test_toml_bench_dry_run_model_override(
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "config-model"
+
+        [[eval]]
+        env_id = "medqa"
+        num_examples = 1
+        rollouts_per_example = 1
+        """,
+    )
+
+    exit_code = main.main(["bench", "--config", str(config_path), "--dry-run", "--model", "cli-model"])
+
+    output = capsys.readouterr().out
+    assert exit_code == 0
+    assert "cli-model" in output
+    assert "config-model" not in output
+
+
+def test_toml_bench_without_dry_run_fails_explicitly(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None:
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+
+    with pytest.raises(SystemExit):
+        main.main(["bench", "--config", str(config_path)])
+
+    assert "TOML bench execution is not available yet" in capsys.readouterr().err
+
+
 def test_batch_api_base_url_override_forces_endpoint(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     config_path = tmp_path / "config.yaml"
     _write_config(

From e63d8da99bf156125796cf7a7e86b718c2c6f358 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 19:07:38 +0000
Subject: [PATCH 05/53] Run TOML bench sequentially

---
 medarc_verifiers/cli/main.py | 234 +++++++++++++++++++++++++++++++--
 tests/test_cli/test_main.py  | 245 ++++++++++++++++++++++++++++++++++-
 2 files changed, 463 insertions(+), 16 deletions(-)

diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index b14b6a20..1a671c1b 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -3,17 +3,23 @@
 from __future__ import annotations
 
 import argparse
+import asyncio
+import json
 import logging
 import os
+import shutil
 import sys
+from datetime import UTC, datetime
 from pathlib import Path
 from textwrap import dedent
-from typing import Any, Literal, Mapping, Sequence
+from typing import Any, Literal, Mapping, MutableMapping, Sequence
 
 import yaml
 from pydantic import ValidationError
 from rich.console import Console
 from rich.table import Table
+from verifiers.utils.eval_utils import run_evaluation
+from verifiers.utils.save_utils import make_serializable
 
 from medarc_verifiers.cli._config_loader import ConfigFormatError, load_run_config
 from medarc_verifiers.cli._constants import (
@@ -37,6 +43,7 @@
 from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, EnvironmentExportConfig
 from medarc_verifiers.cli._single_run import run_single_mode
 from medarc_verifiers.cli.eval_identity import EvalPathPlan, plan_eval_paths
+from medarc_verifiers.cli.eval_identity import metadata_identity_fields
 from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub
 from medarc_verifiers.cli.process import PROCESS_DEFAULT_STATUS_FILTER, ProcessOptions, ProcessResult, run_process
 from medarc_verifiers.cli.utils.config_io import load_mapping_file
@@ -145,6 +152,16 @@ def build_batch_parser() -> argparse.ArgumentParser:
     parser.add_argument("--api-key-var", default=None, help="Override API key environment variable for TOML bench.")
     parser.add_argument("--provider", default=None, help="Override provider shorthand for TOML bench.")
     parser.add_argument("--model", "-m", default=None, help="Override model for every TOML eval.")
+    parser.add_argument(
+        "--eval-index", "--job-index", dest="eval_index", type=int, help="Run only one TOML eval by 1-based index."
+    )
+    parser.add_argument("--start-at", type=int, help="Start TOML execution at this 1-based eval index.")
+    parser.add_argument("--stop-after", type=int, help="Stop TOML execution after this 1-based eval index.")
+    parser.add_argument(
+        "--continue-on-error",
+        action="store_true",
+        help="Continue TOML sequential execution after a failed eval.",
+    )
     parser.add_argument(
         "--job-id", action="append", help="Run only the specified job identifier (repeat to select multiple)."
     )
@@ -557,12 +574,11 @@ def _run_batch_mode(argv: Sequence[str]) -> int:
 
     config_path = Path(args.config).expanduser()
     if config_path.suffix.lower() == ".toml":
-        if not args.dry_run:
-            parser.error("TOML bench execution is not available yet; use --dry-run in this transition commit.")
         try:
-            return _dry_run_toml_bench(args)
+            _validate_toml_selection_args(args, parser=parser)
+            return _run_toml_bench(args)
         except Exception as exc:  # noqa: BLE001
-            logger.exception("TOML bench dry-run failed: %s", exc)
+            logger.exception("TOML bench failed: %s", exc)
             return 1
 
     if args.restart:
@@ -1576,9 +1592,20 @@ def _execute_batch(args: argparse.Namespace) -> int:
     return 1 if has_failures else 0
 
 
-def _dry_run_toml_bench(args: argparse.Namespace) -> int:
+def _validate_toml_selection_args(args: argparse.Namespace, *, parser: argparse.ArgumentParser) -> None:
+    for attr, flag in (("eval_index", "--eval-index"), ("start_at", "--start-at"), ("stop_after", "--stop-after")):
+        value = getattr(args, attr, None)  # an absent attribute is treated like an unset flag
+        if value is not None and value < 1:  # indices are 1-based, so 0 and negatives are rejected
+            parser.error(f"{flag} must be a 1-based index.")
+    if args.eval_index is not None and (args.start_at is not None or args.stop_after is not None):
+        parser.error("--eval-index cannot be combined with --start-at or --stop-after.")  # one index XOR a range
+    if args.start_at is not None and args.stop_after is not None and args.stop_after < args.start_at:
+        parser.error("--stop-after must be greater than or equal to --start-at.")  # range must not be inverted
+
+
+def _run_toml_bench(args: argparse.Namespace) -> int:
     config_path = Path(args.config).expanduser()
-    raw_configs = load_toml_eval_configs(config_path)
+    raw_configs = _prepare_toml_raw_configs(load_toml_eval_configs(config_path), args)
     overrides = EvalConfigOverrides(
         model=args.model,
         provider=args.provider,
@@ -1591,11 +1618,189 @@ def _dry_run_toml_bench(args: argparse.Namespace) -> int:
     )
     eval_configs = [build_eval_config(raw, overrides=overrides) for raw in raw_configs]
     plan_inputs = [_eval_config_identity_payload(config) for config in eval_configs]
-    output_root = Path(args.output_dir).expanduser() if args.output_dir else DEFAULT_EVALS_DIR
+    output_root = _resolve_toml_output_root(eval_configs, args)
     path_plans = plan_eval_paths(plan_inputs, output_root=output_root)
+    eval_configs, path_plans = _select_toml_plan(eval_configs, path_plans, args)
 
-    _print_toml_bench_plan(eval_configs, path_plans)
-    return 0
+    _print_toml_bench_plan(eval_configs, path_plans, dry_run=bool(args.dry_run))
+    if args.dry_run:
+        return 0
+    return _execute_toml_plan(eval_configs, path_plans, args)
+
+
+def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]:
+    prepared: list[dict[str, Any]] = []
+    for raw in raw_configs:
+        item = dict(raw)
+        item.setdefault("save_results", True)
+        if args.max_concurrent is None and "max_concurrent" not in item:
+            item["max_concurrent"] = 1
+        prepared.append(item)
+    return prepared
+
+
+def _resolve_toml_output_root(eval_configs: Sequence[Any], args: argparse.Namespace) -> Path:
+    if args.output_dir:
+        return Path(args.output_dir).expanduser()
+
+    configured_roots = {str(config.output_dir) for config in eval_configs if config.output_dir}
+    if len(configured_roots) > 1:
+        raise ValueError(
+            "TOML bench deterministic output supports one output_dir per run; use a single global output_dir."
+        )
+    if configured_roots:
+        return Path(configured_roots.pop()).expanduser()
+    return DEFAULT_EVALS_DIR
+
+
+def _select_toml_plan(
+    eval_configs: Sequence[Any],
+    path_plans: Sequence[EvalPathPlan],
+    args: argparse.Namespace,
+) -> tuple[list[Any], list[EvalPathPlan]]:
+    indexed = list(zip(eval_configs, path_plans))
+    if args.eval_index is not None:
+        start = args.eval_index - 1
+        indexed = indexed[start : start + 1]
+    else:
+        if args.start_at is not None:
+            indexed = indexed[args.start_at - 1 :]
+        if args.stop_after is not None:
+            indexed = indexed[: args.stop_after - (args.start_at or 1) + 1]
+    if not indexed:
+        raise ValueError("No TOML evals matched the requested selection.")
+    selected_configs, selected_paths = zip(*indexed)
+    return list(selected_configs), list(selected_paths)
+
+
+def _execute_toml_plan(
+    eval_configs: Sequence[Any], path_plans: Sequence[EvalPathPlan], args: argparse.Namespace
+) -> int:
+    failures = 0
+    for index, (config, path_plan) in enumerate(zip(eval_configs, path_plans), start=1):
+        metadata_fields = metadata_identity_fields(_eval_config_identity_payload(config), path_plan.identity)
+        results_path = path_plan.results_path
+        try:
+            _prepare_toml_results_dir(results_path, metadata_fields, config, force=bool(args.force))
+            run_config = config.model_copy(update={"resume_path": results_path, "save_results": True})
+            logger.info("Running TOML eval %d/%d: %s on %s", index, len(eval_configs), config.env_id, config.model)
+            asyncio.run(_run_one_toml_eval(run_config, results_path, metadata_fields))
+            _merge_metadata_fields(results_path, metadata_fields)
+        except Exception as exc:  # noqa: BLE001
+            failures += 1
+            logger.exception("TOML eval %d failed: %s", index, exc)
+            if not args.continue_on_error:
+                return 1
+        if args.sleep and index < len(eval_configs):
+            import time
+
+            time.sleep(float(args.sleep))
+    return 1 if failures else 0
+
+
+async def _run_one_toml_eval(config: Any, results_path: Path, metadata_fields: Mapping[str, Any]) -> Any:
+    import verifiers.envs.environment as environment_module
+
+    def add_medarc_metadata(_all_outputs: Any, _new_outputs: Any, metadata: MutableMapping[str, Any]) -> None:
+        metadata.update(metadata_fields)
+
+    original_save_metadata = environment_module.save_metadata
+
+    def save_metadata_with_medarc_fields(metadata: MutableMapping[str, Any], result_path: Path) -> Any:
+        if Path(result_path) == results_path:
+            metadata.update(metadata_fields)
+        return original_save_metadata(metadata, result_path)
+
+    environment_module.save_metadata = save_metadata_with_medarc_fields
+    try:
+        return await run_evaluation(config, on_progress=add_medarc_metadata)
+    finally:
+        environment_module.save_metadata = original_save_metadata
+
+
+def _prepare_toml_results_dir(
+    results_path: Path,
+    metadata_fields: Mapping[str, Any],
+    config: Any,
+    *,
+    force: bool,
+) -> None:
+    if results_path.exists() and force:
+        _archive_existing_path(results_path)
+
+    metadata_path = results_path / "metadata.json"
+    results_file = results_path / "results.jsonl"
+    has_existing_state = metadata_path.exists() or results_file.exists()
+    if has_existing_state:
+        _validate_toml_resume_metadata(results_path, metadata_fields)
+
+    results_path.mkdir(parents=True, exist_ok=True)
+    results_file.touch(exist_ok=True)
+    if has_existing_state:
+        _merge_metadata_fields(results_path, metadata_fields)
+        return
+
+    metadata = _initial_toml_metadata(config)
+    metadata.update(metadata_fields)
+    _write_json(metadata_path, metadata)
+
+
+def _validate_toml_resume_metadata(results_path: Path, metadata_fields: Mapping[str, Any]) -> None:
+    metadata_path = results_path / "metadata.json"
+    if not metadata_path.exists():
+        raise ValueError(f"Cannot resume {results_path}: metadata.json is missing.")
+    try:
+        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"Cannot resume {results_path}: metadata.json is invalid JSON.") from exc
+    expected = metadata_fields.get("medarc_config_fingerprint")
+    current = metadata.get("medarc_config_fingerprint") if isinstance(metadata, Mapping) else None
+    if current != expected:
+        raise ValueError(
+            f"Cannot resume {results_path}: MedARC config fingerprint mismatch "
+            f"(saved={current!r}, current={expected!r}). Use --force to archive and rerun."
+        )
+
+
+def _initial_toml_metadata(config: Any) -> dict[str, Any]:
+    return {
+        "env_id": config.env_id,
+        "env_args": dict(config.env_args or {}),
+        "model": config.model,
+        "base_url": config.client_config.api_base_url,
+        "num_examples": config.num_examples,
+        "rollouts_per_example": config.rollouts_per_example,
+        "sampling_args": dict(config.sampling_args or {}),
+        "avg_reward": None,
+        "avg_metrics": {},
+        "avg_error": None,
+        "state_columns": list(config.state_columns or []),
+    }
+
+
+def _merge_metadata_fields(results_path: Path, metadata_fields: Mapping[str, Any]) -> None:
+    metadata_path = results_path / "metadata.json"
+    metadata = json.loads(metadata_path.read_text(encoding="utf-8")) if metadata_path.exists() else {}
+    if not isinstance(metadata, dict):
+        metadata = {}
+    metadata.update(metadata_fields)
+    _write_json(metadata_path, metadata)
+
+
+def _archive_existing_path(path: Path) -> Path:
+    timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
+    candidate = path.with_name(f"{path.name}__old_{timestamp}")
+    suffix = 1
+    while candidate.exists():
+        candidate = path.with_name(f"{path.name}__old_{timestamp}_{suffix}")
+        suffix += 1
+    shutil.move(str(path), str(candidate))
+    return candidate
+
+
+def _write_json(path: Path, payload: Mapping[str, Any]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload, default=make_serializable, sort_keys=True), encoding="utf-8")
 
 
 def _eval_config_identity_payload(config: Any) -> dict[str, Any]:
@@ -1609,9 +1814,14 @@ def _eval_config_identity_payload(config: Any) -> dict[str, Any]:
     }
 
 
-def _print_toml_bench_plan(eval_configs: Sequence[Any], path_plans: Sequence[EvalPathPlan]) -> None:
+def _print_toml_bench_plan(eval_configs: Sequence[Any], path_plans: Sequence[EvalPathPlan], *, dry_run: bool) -> None:
     console = Console(width=240)
-    table = Table(title="TOML Bench Dry Run", caption=f"{len(eval_configs)} eval(s) to dry-run", expand=True)
+    action = "dry-run" if dry_run else "run"
+    table = Table(
+        title="TOML Bench Dry Run" if dry_run else "TOML Bench Plan",
+        caption=f"{len(eval_configs)} eval(s) to {action}",
+        expand=True,
+    )
     table.add_column("#", justify="right", style="dim")
     table.add_column("Model", style="magenta", overflow="fold")
     table.add_column("Environment", style="green", overflow="fold")
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 2603932c..8d551e0c 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -249,20 +249,257 @@ def test_toml_bench_dry_run_model_override(
     assert "config-model" not in output
 
 
-def test_toml_bench_without_dry_run_fails_explicitly(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None:
+def test_toml_bench_dry_run_uses_toml_output_dir(
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "toml-output"
+    _write_config(
+        config_path,
+        f"""
+        model = "gpt-5-mini"
+        output_dir = "{output_dir}"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+
+    assert main.main(["bench", "--config", str(config_path), "--dry-run"]) == 0
+
+    assert str(output_dir / "gpt-5-mini" / "medqa") in capsys.readouterr().out
+
+
+def test_toml_bench_executes_sequentially_to_deterministic_path(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
     config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
     _write_config(
         config_path,
         """
+        model = "gpt-5-mini"
+
         [[eval]]
         env_id = "medqa"
+        num_examples = 1
+        rollouts_per_example = 1
         """,
     )
+    calls: list[Path] = []
 
-    with pytest.raises(SystemExit):
-        main.main(["bench", "--config", str(config_path)])
+    async def fake_run(config, on_progress=None, **_kwargs):
+        results_path = Path(config.resume_path)
+        calls.append(results_path)
+        metadata = {"env_id": config.env_id, "model": config.model}
+        if on_progress is not None:
+            on_progress([], [], metadata)
+        (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0", "reward": 1.0}) + "\n")
+        (results_path / "metadata.json").write_text(json.dumps(metadata))
+        return {"outputs": [], "metadata": metadata}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    exit_code = main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)])
+
+    results_path = output_dir / "gpt-5-mini" / "medqa"
+    assert exit_code == 0
+    assert calls == [results_path]
+    assert (results_path / "results.jsonl").exists()
+    metadata = json.loads((results_path / "metadata.json").read_text())
+    assert metadata["medarc_config_fingerprint"]
+    assert metadata["variant_id"] is None
+    assert metadata["variant_payload"] is None
+
+
+def test_toml_bench_defaults_max_concurrent_to_one(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    captured: list[int] = []
+
+    async def fake_run(config, **_kwargs):
+        captured.append(config.max_concurrent)
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(tmp_path / "evals")]) == 0
+    assert captured == [1]
+
+    captured.clear()
+    assert (
+        main.main(
+            [
+                "bench",
+                "--config",
+                str(config_path),
+                "--output-dir",
+                str(tmp_path / "evals-override"),
+                "--max-concurrent",
+                "4",
+            ]
+        )
+        == 0
+    )
+    assert captured == [4]
+
+
+def test_toml_bench_refuses_mismatched_resume(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        env_args = { shuffle_seed = 1618 }
+        """,
+    )
+
+    async def fake_run(config, **_kwargs):
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        env_args = { shuffle_seed = 9331 }
+        """,
+    )
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+
+
+def test_toml_bench_force_archives_existing_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+
+    async def fake_run(config, **_kwargs):
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+    results_path = output_dir / "gpt-5-mini" / "medqa"
+    (results_path / "sentinel.txt").write_text("old")
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--force"]) == 0
+
+    archived = list((output_dir / "gpt-5-mini").glob("medqa__old_*"))
+    assert len(archived) == 1
+    assert (archived[0] / "sentinel.txt").read_text() == "old"
+    assert not (results_path / "sentinel.txt").exists()
+
+
+def test_toml_bench_resume_preserves_existing_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    calls = 0
+
+    async def fake_run(config, **_kwargs):
+        nonlocal calls
+        calls += 1
+        results_path = Path(config.resume_path)
+        if calls == 1:
+            (results_path / "metadata.json").write_text(
+                json.dumps(
+                    {
+                        "avg_reward": 0.75,
+                        "avg_metrics": {"accuracy": 0.75},
+                        "total_tokens": 123,
+                        "medarc_config_fingerprint": json.loads((results_path / "metadata.json").read_text())[
+                            "medarc_config_fingerprint"
+                        ],
+                    }
+                )
+            )
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+
+    metadata = json.loads((output_dir / "gpt-5-mini" / "medqa" / "metadata.json").read_text())
+    assert metadata["avg_reward"] == 0.75
+    assert metadata["avg_metrics"] == {"accuracy": 0.75}
+    assert metadata["total_tokens"] == 123
+    assert metadata["medarc_config_fingerprint"]
+
+
+def test_toml_bench_injects_medarc_fields_into_upstream_metadata_saves(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    import verifiers.envs.environment as environment_module
+
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    saved_metadata: list[dict[str, Any]] = []
+
+    def fake_save_metadata(metadata, _result_path):
+        saved_metadata.append(dict(metadata))
+
+    async def fake_run(config, on_progress=None, **_kwargs):
+        metadata = {}
+        if on_progress is not None:
+            on_progress([], [], metadata)
+        environment_module.save_metadata({}, Path(config.resume_path))
+        return {"outputs": [], "metadata": metadata}
+
+    monkeypatch.setattr(environment_module, "save_metadata", fake_save_metadata)
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(tmp_path / "evals")]) == 0
 
-    assert "TOML bench execution is not available yet" in capsys.readouterr().err
+    assert saved_metadata
+    assert all(item["medarc_config_fingerprint"] for item in saved_metadata)
 
 
 def test_batch_api_base_url_override_forces_endpoint(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:

From 2fa7517ace9afde77ec01a58e8d96121373d0a22 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 19:16:06 +0000
Subject: [PATCH 06/53] Route dynamic single runs through adapter

---
 medarc_verifiers/cli/_single_run.py       | 155 ++++++----------------
 medarc_verifiers/cli/verifiers_adapter.py |  30 +++--
 tests/test_cli/test_main.py               |  44 +++++-
 3 files changed, 98 insertions(+), 131 deletions(-)

diff --git a/medarc_verifiers/cli/_single_run.py b/medarc_verifiers/cli/_single_run.py
index 1414fd6b..dfbb3852 100644
--- a/medarc_verifiers/cli/_single_run.py
+++ b/medarc_verifiers/cli/_single_run.py
@@ -20,16 +20,13 @@
     DEFAULT_API_KEY_VAR,
     DEFAULT_ENDPOINTS_PATH,
 )
-from medarc_verifiers.cli._eval_builder import build_client_config, build_eval_config
-from medarc_verifiers.cli._schemas import ModelConfigSchema
+from medarc_verifiers.cli.verifiers_adapter import build_eval_config
 from medarc_verifiers.cli.utils.env_args import EnvParam, MissingEnvParamError, gather_env_cli_metadata, merge_env_args
-from medarc_verifiers.cli.utils.endpoint_utils import load_endpoint_registry
 from medarc_verifiers.cli.utils.overrides import build_cli_override
 from medarc_verifiers.cli.utils.resume import (
     format_resume_mismatch_lines,
     is_resume_metadata_mismatch_error,
     load_resume_metadata_values,
-    resolve_resume_path,
 )
 from medarc_verifiers.cli.utils.shared import (
     HEADER_SEPARATOR,
@@ -40,6 +37,7 @@
     merge_sampling_args,
     normalize_headers,
 )
+from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL
 
 logger = logging.getLogger(__name__)
 
@@ -53,21 +51,6 @@ class EnvOptionBinding:
     default: Any
 
 
-@dataclass
-class _SingleRunEnvConfig:
-    """Lightweight env config to reuse the shared EvalConfig builder."""
-
-    id: str
-    module: str | None = None
-    matrix_base_id: str | None = None
-    num_examples: int = 5
-    rollouts_per_example: int = 1
-    max_concurrent: int | None = None
-    independent_scoring: bool = True
-    state_columns: list[str] | None = None
-    verbose: bool | None = False
-
-
 def run_single_mode(argv: Sequence[str] | None = None) -> int:
     """Entry point for single-run (medarc-eval style) execution."""
     args_list = list(argv) if argv is not None else sys.argv[1:]
@@ -84,6 +67,7 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int:
     remaining = args_list[1:]
     endpoints_path_explicit = _option_was_provided(remaining, "--endpoints-path", "-e")
     api_key_var_explicit = _option_was_provided(remaining, "--api-key-var", "-k")
+    api_base_url_explicit = _option_was_provided(remaining, "--api-base-url", "-b")
 
     parser, env_group, reserved_dests = _build_base_parser_layout(require_env=True, add_help=True, env_id=env_id)
     try:
@@ -174,106 +158,47 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int:
         )
 
     endpoints_path = Path(args.endpoints_path).expanduser()
-    default_endpoints_path = Path(DEFAULT_ENDPOINTS_PATH).expanduser()
-    if not endpoints_path.exists():
-        if endpoints_path_explicit:
-            logger.error("Explicit endpoints registry path does not exist: %s", endpoints_path)
-            return 2
-        if _same_path(endpoints_path, default_endpoints_path):
-            logger.warning(
-                "Default endpoints registry '%s' not found; continuing without endpoint aliases.",
-                endpoints_path,
-            )
-        endpoints = {}
-    else:
-        try:
-            endpoints = load_endpoint_registry(endpoints_path)
-        except Exception as exc:  # noqa: BLE001
-            if endpoints_path_explicit:
-                logger.error("Failed to load explicit endpoints registry '%s': %s", endpoints_path, exc)
-                return 2
-            logger.warning(
-                "Failed to load default endpoints registry '%s'; continuing without endpoint aliases: %s",
-                endpoints_path,
-                exc,
-            )
-            endpoints = {}
-
-    if endpoints_path_explicit and not endpoints:
-        logger.error("Failed to load endpoint registry from explicit path: %s", endpoints_path)
+    if endpoints_path_explicit and not endpoints_path.exists():
+        logger.error("Explicit endpoints registry path does not exist: %s", endpoints_path)
         return 2
 
-    model_cfg = ModelConfigSchema(model=args.model)
-    resolved_model, client_config, prime_sampling_overrides = build_client_config(
-        model_cfg,
-        endpoints=endpoints,
-        default_api_key_var=args.api_key_var,
-        default_api_key_var_explicit=api_key_var_explicit,
-        default_api_base_url=args.api_base_url,
-        api_base_url_override=None,
-        http_max_retries_override=args.http_max_retries,
-        timeout_override=args.timeout,
-        headers=headers,
-    )
-
-    # Merge Prime Inference overrides with user sampling args (user args take precedence)
-    merged_sampling_args = {**prime_sampling_overrides, **merged_sampling_args}
-
-    env_cfg = _SingleRunEnvConfig(
-        id=args.env,
-        num_examples=args.num_examples,
-        rollouts_per_example=args.rollouts_per_example,
-        max_concurrent=args.max_concurrent,
-        independent_scoring=not args.group_scoring,
-        state_columns=state_columns or None,
-        verbose=args.verbose,
-    )
+    raw_config: dict[str, Any] = {
+        "env_id": args.env,
+        "model": args.model,
+        "env_args": merged_env_args,
+        "sampling_args": merged_sampling_args,
+        "include_none_max_tokens": False,
+        "env_dir_path": str(Path(args.env_dir_path).expanduser()),
+        "endpoints_path": str(endpoints_path),
+        "headers": headers,
+        "num_examples": args.num_examples,
+        "rollouts_per_example": args.rollouts_per_example,
+        "max_concurrent": args.max_concurrent,
+        "max_retries": args.rollout_max_retries,
+        "http_max_retries": args.http_max_retries,
+        "client_timeout": args.timeout,
+        "independent_scoring": not args.group_scoring,
+        "state_columns": state_columns,
+        "save_results": bool(args.save_results or args.resume),
+        "resume": args.resume,
+        "save_to_hf_hub": args.save_to_hf_hub,
+        "hf_hub_dataset_name": args.hf_hub_dataset_name or "",
+        "verbose": args.verbose,
+    }
+    if api_base_url_explicit:
+        raw_config["api_base_url"] = args.api_base_url
+    else:
+        raw_config["default_api_base_url"] = args.api_base_url
+    if api_key_var_explicit:
+        raw_config["api_key_var"] = args.api_key_var
+    elif not (api_base_url_explicit and args.api_base_url == PRIME_INFERENCE_URL):
+        raw_config["default_api_key_var"] = args.api_key_var
 
     try:
-        resume_path = resolve_resume_path(
-            resume_arg=args.resume,
-            env_id=args.env,
-            model=resolved_model,
-            num_examples=args.num_examples,
-            rollouts_per_example=args.rollouts_per_example,
-            env_dir_path=Path(args.env_dir_path).expanduser(),
-        )
+        eval_config = build_eval_config(raw_config)
     except ValueError as exc:
         parser.error(str(exc))
 
-    if isinstance(args.resume, str):
-        logger.info("Resuming from explicit path: %s", resume_path)
-    elif args.resume is True:
-        if resume_path is not None:
-            logger.info("Auto-resuming from: %s", resume_path)
-        else:
-            logger.info("No matching incomplete run found for --resume; starting a new run.")
-
-    eval_config = build_eval_config(
-        job_label=args.env,
-        model_cfg=model_cfg,
-        env_cfg=env_cfg,
-        env_args=merged_env_args,
-        sampling_args=merged_sampling_args,
-        cli_env_args=None,
-        cli_sampling_args=None,
-        resolved_model=resolved_model,
-        client_config=client_config,
-        env_dir=Path(args.env_dir_path).expanduser(),
-        max_concurrent_override=args.max_concurrent,
-        max_concurrent_generation=args.max_concurrent_generation,
-        max_concurrent_scoring=args.max_concurrent_scoring,
-        rollout_max_retries=args.rollout_max_retries,
-        resume_path=resume_path,
-        default_max_concurrent=DEFAULT_SINGLE_RUN_MAX_CONCURRENT,
-        save_results=args.save_results,
-        save_to_hf_hub=args.save_to_hf_hub,
-        hf_hub_dataset_name=args.hf_hub_dataset_name or None,
-        verbose=args.verbose,
-        env_metadata_cache=None,
-        enforce_required_env_args=True,
-    )
-
     if args.dry_run:
         print(eval_config.model_dump_json(indent=2))
         return 0
@@ -291,9 +216,9 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int:
         logger.error("Evaluation interrupted by user.")
         return 1
     except Exception as exc:  # noqa: BLE001
-        if resume_path is not None and is_resume_metadata_mismatch_error(exc):
-            logger.error("Resume metadata mismatch for %s.", resume_path)
-            saved_values = load_resume_metadata_values(resume_path)
+        if eval_config.resume_path is not None and is_resume_metadata_mismatch_error(exc):
+            logger.error("Resume metadata mismatch for %s.", eval_config.resume_path)
+            saved_values = load_resume_metadata_values(eval_config.resume_path)
             current_values = {
                 "env_id": eval_config.env_id,
                 "model": eval_config.model,
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index 3a979051..288c2e6e 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -307,8 +307,10 @@ def _build_client_config(
             raise ValueError(f"Endpoint id '{raw_endpoint_id}' not found in endpoint registry at {endpoints_path}")
         provider_cfg = PROVIDER_CONFIGS[raw_provider or DEFAULT_PROVIDER]
         model = raw_model
-        api_key_var = raw["api_key_var"] if api_key_override else provider_cfg["key"]
-        api_base_url = raw_api_base_url if api_base_url_override else provider_cfg["url"]
+        api_key_var = raw["api_key_var"] if api_key_override else raw.get("default_api_key_var", provider_cfg["key"])
+        api_base_url = (
+            raw_api_base_url if api_base_url_override else raw.get("default_api_base_url", provider_cfg["url"])
+        )
         client_type = (
             raw["api_client_type"] if client_type_override else provider_cfg.get("client_type", DEFAULT_CLIENT_TYPE)
         )
@@ -338,14 +340,20 @@ def _build_client_config(
             for endpoint in endpoint_group
         ]
 
-    client_config = ClientConfig(
-        client_type=cast(ClientType, client_type),
-        api_key_var=api_key_var,
-        api_base_url=api_base_url,
-        endpoint_configs=endpoint_configs,
-        extra_headers=merged_headers,
-        extra_headers_from_state=eval_headers_from_state,
-    )
+    client_kwargs: dict[str, Any] = {
+        "client_type": cast(ClientType, client_type),
+        "api_key_var": api_key_var,
+        "api_base_url": api_base_url,
+        "endpoint_configs": endpoint_configs,
+        "extra_headers": merged_headers,
+        "extra_headers_from_state": eval_headers_from_state,
+    }
+    if raw.get("client_timeout") is not None:
+        client_kwargs["timeout"] = raw["client_timeout"]
+    if raw.get("http_max_retries") is not None:
+        client_kwargs["max_retries"] = raw["http_max_retries"]
+
+    client_config = ClientConfig(**client_kwargs)
     return cast(str, model), resolved_endpoint_id, client_config
 
 
@@ -354,7 +362,7 @@ def _build_sampling_args(raw: Mapping[str, Any], api_base_url: str) -> dict[str,
         raw.get("sampling_args"),
         max_tokens=raw.get("max_tokens"),
         temperature=raw.get("temperature"),
-        include_none_max_tokens=True,
+        include_none_max_tokens=raw.get("include_none_max_tokens", True),
     )
     _, prime_sampling_overrides = prime_inference_overrides(api_base_url)
     return sanitize_sampling_args_for_openai(_deep_merge(prime_sampling_overrides, sampling_args))
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 8d551e0c..215b2041 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -29,10 +29,6 @@ def _patch_single_run_env(monkeypatch: pytest.MonkeyPatch, metadata: list[EnvPar
         "medarc_verifiers.cli._single_run.gather_env_cli_metadata",
         lambda env_id: metadata,
     )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._single_run.load_endpoint_registry",
-        lambda *args, **kwargs: {},
-    )
 
 
 def _patch_single_run_metadata_only(monkeypatch: pytest.MonkeyPatch, metadata: list[EnvParam]) -> None:
@@ -1803,6 +1799,44 @@ def test_single_run_explicit_api_key_var_is_respected_for_prime_url(
     assert config["client_config"]["api_key_var"] == "OPENAI_API_KEY"
 
 
+def test_single_run_endpoint_alias_uses_registry_url_and_key(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    metadata: list[EnvParam] = []
+    _patch_single_run_env(monkeypatch, metadata)
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+        [[endpoint]]
+        endpoint_id = "openai-alias"
+        model = "openai/resolved"
+        url = "https://registry.example/v1"
+        key = "REGISTRY_KEY"
+        """,
+        encoding="utf-8",
+    )
+
+    exit_code = main.main(
+        [
+            "medqa",
+            "--dry-run",
+            "--model",
+            "openai-alias",
+            "--endpoints-path",
+            str(endpoints_path),
+        ]
+    )
+
+    assert exit_code == 0
+    config = json.loads(capsys.readouterr().out)
+    assert config["endpoint_id"] == "openai-alias"
+    assert config["model"] == "openai/resolved"
+    assert config["client_config"]["api_base_url"] == "https://registry.example/v1"
+    assert config["client_config"]["api_key_var"] == "REGISTRY_KEY"
+
+
 def test_single_run_dry_run_outputs_config(
     monkeypatch: pytest.MonkeyPatch,
     capsys: pytest.CaptureFixture[str],
@@ -1910,7 +1944,7 @@ def fake_find_latest_incomplete_eval_results_path(**kwargs: Any) -> Path:
         return discovered
 
     monkeypatch.setattr(
-        "medarc_verifiers.cli.utils.resume.find_latest_incomplete_eval_results_path",
+        "medarc_verifiers.cli.verifiers_adapter.find_latest_incomplete_eval_results_path",
         fake_find_latest_incomplete_eval_results_path,
     )
 

From 829bf9d8f0b43ba5f4aa6c6da4deaee33ad2055c Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 19:21:44 +0000
Subject: [PATCH 07/53] Migrate orchestrate to TOML bench

---
 docs/medarc-orchestrate.md                    | 55 ++++++++++++-------
 medarc_verifiers/orchestrate/cli.py           |  2 +-
 medarc_verifiers/orchestrate/config.py        | 49 ++++++++++++-----
 medarc_verifiers/orchestrate/run.py           | 11 +---
 .../test_orchestrate_cli_validation.py        | 34 ++++++++++++
 .../test_orchestrate_config.py                | 32 +++++++++++
 .../test_orchestrate_parallel_launch.py       |  8 +++
 7 files changed, 145 insertions(+), 46 deletions(-)

diff --git a/docs/medarc-orchestrate.md b/docs/medarc-orchestrate.md
index 028dd1da..786a82d2 100644
--- a/docs/medarc-orchestrate.md
+++ b/docs/medarc-orchestrate.md
@@ -39,7 +39,7 @@ Create a plan YAML listing the job configs you want to orchestrate:
 ```yaml
 name: local-vllm
 job_configs:
-  - configs/job-gpt-oss-20b.yaml
+  - configs/eval/job-gpt-oss-20b.toml
 env_file: .env
 gpu_range: "0-3"
 port_range: "8000-8999"
@@ -49,32 +49,36 @@ resume: false
 rerun_failed: false
 ```
 
-Each job config must define exactly one model under `models:` and include a top-level
-`orchestrate:` block with per-model serve settings.
+Each job config should be an upstream `medarc-eval bench` TOML config with a top-level
+`model` and a top-level `orchestrate` table. Legacy YAML job configs are still loadable
+during migration, but new orchestrated runs should use TOML.
 
 The `env_file` is a dotenv file that is loaded for every Docker launch. If unset and a repo-level `.env` exists,
 it is used automatically. You can also override it via `--env-file`.
 
-Optional: set `orchestrate.restart` to reuse completed jobs from a previous `medarc-eval` run (it is forwarded as
-`medarc-eval bench --restart ...`).
-
 Shared container config:
 
-```yaml
-orchestrate:
-  qwen-30b-a3b:
-    gpus: 2
-    tensor_parallel_size: 2
-    serve:
-      max_model_len: 40960
-  vllm-container:
-    image: vllm/vllm-openai:latest
-    container_port: 8000
-    volumes:
-      - /data/huggingface:/root/.cache/huggingface
-    ipc_mode: host
-  pyxis:
-    srun_extra_args: []
+```toml
+model = "Qwen/Qwen3-30B-A3B"
+
+[[eval]]
+env_id = "medqa"
+
+[orchestrate.qwen-30b-a3b]
+gpus = 2
+tensor_parallel_size = 2
+
+[orchestrate.qwen-30b-a3b.serve]
+max_model_len = 40960
+
+[orchestrate.vllm-container]
+image = "vllm/vllm-openai:latest"
+container_port = 8000
+volumes = ["/data/huggingface:/root/.cache/huggingface"]
+ipc_mode = "host"
+
+[orchestrate.pyxis]
+srun_extra_args = []
 ```
 
 Config notes:
@@ -130,6 +134,15 @@ Artifacts are written under `outputs/orchestrator/<run_id>/`:
 
 ### Runtime behavior
 
+For each task, the orchestrator launches vLLM, waits for readiness, then runs:
+
+```bash
+medarc-eval bench --config <job.toml> --api-base-url <allocated-local-url> --provider local
+```
+
+The bench command exits naturally on completion; the orchestrator no longer passes YAML-runner flags such as
+`--on-complete` or `--restart`.
+
 Docker mode:
 
 - The orchestrator reserves concrete local GPU IDs and host ports.
diff --git a/medarc_verifiers/orchestrate/cli.py b/medarc_verifiers/orchestrate/cli.py
index 53edafb3..b2cab3c9 100644
--- a/medarc_verifiers/orchestrate/cli.py
+++ b/medarc_verifiers/orchestrate/cli.py
@@ -33,7 +33,7 @@ def build_parser() -> argparse.ArgumentParser:
         prog="medarc-orchestrate",
         description="Run vLLM orchestration over job configs.",
     )
-    parser.add_argument("--plan", required=True, type=Path, help="Path to orchestrator plan YAML.")
+    parser.add_argument("--plan", required=True, type=Path, help="Path to orchestrator plan YAML or JSON.")
     parser.add_argument(
         "--env-file",
         type=Path,
diff --git a/medarc_verifiers/orchestrate/config.py b/medarc_verifiers/orchestrate/config.py
index 264e1db5..3e5f0146 100644
--- a/medarc_verifiers/orchestrate/config.py
+++ b/medarc_verifiers/orchestrate/config.py
@@ -5,12 +5,16 @@
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Mapping
+import tomllib
 import warnings
 
 from omegaconf import OmegaConf
 from pydantic import BaseModel, Field, ValidationError
 
 
+_ORCHESTRATE_NON_MODEL_KEYS = {"restart", "vllm-container", "vllm-docker", "pyxis"}
+
+
 class PlanConfig(BaseModel):
     """Schema for the orchestrator plan file."""
 
@@ -84,7 +88,7 @@ def expand_tasks(plan: PlanConfig) -> list[TaskSpec]:
     for job_path in plan.job_configs:
         resolved_job_path = job_path.expanduser().resolve()
         job_cfg = load_job_config(resolved_job_path)
-        model_key, model_entry = _extract_single_model(job_cfg, source=resolved_job_path)
+        model_key, model_entry = _extract_task_model(job_cfg, source=resolved_job_path)
         orchestrate_cfg = _extract_orchestrate_config(job_cfg, model_key=model_key, source=resolved_job_path)
         model_id = str(model_entry.get("model", "")).strip()
         if not model_id:
@@ -105,10 +109,13 @@ def expand_tasks(plan: PlanConfig) -> list[TaskSpec]:
 def _load_mapping(path: Path) -> Mapping[str, Any]:
     if not path.exists():
         raise FileNotFoundError(f"Config not found: {path}")
-    if path.suffix not in {".yaml", ".yml", ".json"}:
-        raise ValueError(f"Unsupported config format: {path} (expected .yaml/.yml/.json)")
+    if path.suffix not in {".yaml", ".yml", ".json", ".toml"}:
+        raise ValueError(f"Unsupported config format: {path} (expected .yaml/.yml/.json/.toml)")
     try:
-        data = OmegaConf.to_container(OmegaConf.load(path), resolve=True)
+        if path.suffix == ".toml":
+            data = tomllib.loads(path.read_text(encoding="utf-8"))
+        else:
+            data = OmegaConf.to_container(OmegaConf.load(path), resolve=True)
     except Exception as exc:  # pragma: no cover - OmegaConf error types vary
         raise ConfigFormatError(f"Failed to load config: {path}") from exc
     if not isinstance(data, Mapping):
@@ -116,18 +123,30 @@ def _load_mapping(path: Path) -> Mapping[str, Any]:
     return data
 
 
-def _extract_single_model(payload: Mapping[str, Any], *, source: Path) -> tuple[str, Mapping[str, Any]]:
+def _extract_task_model(payload: Mapping[str, Any], *, source: Path) -> tuple[str, Mapping[str, Any]]:
+    """Return ``(model_key, model_entry)`` from a legacy ``models`` map or a top-level ``model``."""
     models = payload.get("models")
-    if not isinstance(models, Mapping):
-        raise ValueError(f"Job config {source} must define a models mapping.")
-    keys = list(models.keys())
-    if len(keys) != 1:
-        raise ValueError(f"Job config {source} must define exactly one model; found {len(keys)}.")
-    model_key = str(keys[0])
-    model_entry = models.get(model_key)
-    if not isinstance(model_entry, Mapping):
-        raise ValueError(f"Job config {source} models.{model_key} must be a mapping.")
-    return model_key, model_entry
+    if isinstance(models, Mapping):
+        keys = list(models.keys())
+        if len(keys) != 1:
+            raise ValueError(f"Job config {source} must define exactly one model; found {len(keys)}.")
+        model_key = str(keys[0])
+        model_entry = models.get(model_key)
+        if not isinstance(model_entry, Mapping):
+            raise ValueError(f"Job config {source} models.{model_key} must be a mapping.")
+        return model_key, model_entry
+    # No ``models`` mapping: use the top-level model plus the single non-reserved orchestrate table.
+    model_id = str(payload.get("model") or "").strip()  # ``or ""``: a null model must not become "None"
+    if not model_id:
+        raise ValueError(f"Job config {source} must define either one models entry or a top-level model.")
+    if not isinstance(orchestrate := payload.get("orchestrate"), Mapping):
+        raise ValueError(f"Job config {source} must define a top-level orchestrate mapping.")
+    model_keys = [str(key) for key in orchestrate if key not in _ORCHESTRATE_NON_MODEL_KEYS]
+    if len(model_keys) != 1:
+        raise ValueError(
+            f"Job config {source} must define exactly one orchestrate model settings table; found {len(model_keys)}."
+        )
+    return model_keys[0], {"model": model_id}
 
 
 def _extract_orchestrate_config(payload: Mapping[str, Any], *, model_key: str, source: Path) -> Mapping[str, Any]:
diff --git a/medarc_verifiers/orchestrate/run.py b/medarc_verifiers/orchestrate/run.py
index 9e79aa14..489f350a 100644
--- a/medarc_verifiers/orchestrate/run.py
+++ b/medarc_verifiers/orchestrate/run.py
@@ -39,10 +39,8 @@
 )
 from medarc_verifiers.orchestrate.vllm_args import build_container_args, normalize_volume_mounts
 
-_COMMAND_TEMPLATE_UV = (
-    "uv run medarc-eval bench --config {job_config_path} --api-base-url {base_url} --on-complete exit"
-)
-_COMMAND_TEMPLATE_BARE = "medarc-eval bench --config {job_config_path} --api-base-url {base_url} --on-complete exit"
+_COMMAND_TEMPLATE_BARE = "medarc-eval bench --config {job_config_path} --api-base-url {base_url} --provider local"
+_COMMAND_TEMPLATE_UV = f"uv run {_COMMAND_TEMPLATE_BARE}"  # same command, launched through ``uv run``
 
 _TASK_DIR_ALLOWED = re.compile(r"[^a-zA-Z0-9_.-]+")
 
@@ -330,11 +328,6 @@ async def _run_task_once(
                 "job_config_path": str(task.job_config_path),
             }
             command = render_command(self._command_template, command_context)
-            restart_source = orchestrate.get("restart")
-            if restart_source:
-                restart_value = str(restart_source)
-                if "--restart" not in command:
-                    command.extend(["--restart", restart_value])
             manifest.bench_command = shlex.join(command)
             self._dashboard.log(f"JOB bench-start task={task.task_id} cmd={_shorten(manifest.bench_command)}")
             self._set_state(manifest, paths, JobState.running)
diff --git a/tests/test_orchestrate/test_orchestrate_cli_validation.py b/tests/test_orchestrate/test_orchestrate_cli_validation.py
index 638f75fe..25c9c809 100644
--- a/tests/test_orchestrate/test_orchestrate_cli_validation.py
+++ b/tests/test_orchestrate/test_orchestrate_cli_validation.py
@@ -112,6 +112,40 @@ def fake_run(self) -> None:
     assert captured["runtime"] == "pyxis"
 
 
+def test_cli_dry_run_accepts_toml_job_config(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None:
+    """``--dry-run`` over a plan that references a TOML job config exits 0 and lists the task."""
+    toml_job = tmp_path / "job.toml"
+    toml_job.write_text(
+        """
+model = "Foo/Bar"
+
+[[eval]]
+env_id = "medqa"
+
+[orchestrate.vllm-container]
+image = "fake"
+
+[orchestrate.foo]
+gpus = 1
+serve = {}
+""".lstrip(),
+        encoding="utf-8",
+    )
+    plan_file = tmp_path / "plan.yaml"
+    plan_file.write_text(
+        f"""
+job_configs:
+  - {toml_job.name}
+runtime: pyxis
+""".lstrip(),
+        encoding="utf-8",
+    )
+
+    exit_status = main(["--plan", str(plan_file), "--dry-run"])
+    assert exit_status == 0
+    assert f"job:foo\tFoo/Bar\t{toml_job.resolve()}" in capsys.readouterr().out
+
+
 def test_port_only_resource_manager_skips_gpus() -> None:
     rm = PortOnlyResourceManager(port_range=(9000, 9010))
 
diff --git a/tests/test_orchestrate/test_orchestrate_config.py b/tests/test_orchestrate/test_orchestrate_config.py
index 64d472ee..cf84c8b0 100644
--- a/tests/test_orchestrate/test_orchestrate_config.py
+++ b/tests/test_orchestrate/test_orchestrate_config.py
@@ -63,6 +63,38 @@ def test_plan_job_configs_resolve_relative_to_plan_file(tmp_path: Path):
     assert "vllm-container" in tasks[0].orchestrate
 
 
+def test_expand_tasks_accepts_toml_eval_config(tmp_path: Path) -> None:
+    """A TOML job config expands to a task keyed by its single orchestrate model table."""
+    toml_job = tmp_path / "job.toml"
+    toml_job.write_text(
+        """
+model = "Foo/Bar"
+
+[[eval]]
+env_id = "medqa"
+
+[orchestrate.vllm-container]
+image = "vllm/vllm-openai:latest"
+
+[orchestrate.foo]
+gpus = 1
+
+[orchestrate.foo.serve]
+dtype = "bfloat16"
+""".lstrip(),
+        encoding="utf-8",
+    )
+    plan_file = tmp_path / "plan.yaml"
+    plan_file.write_text(f"job_configs:\n  - {toml_job.name}\n", encoding="utf-8")
+
+    first_task = expand_tasks(load_plan(plan_file))[0]
+    assert first_task.job_config_path == toml_job.resolve()
+    assert first_task.model_key == "foo"
+    assert first_task.model_id == "Foo/Bar"
+    assert first_task.orchestrate["vllm-container"]["image"] == "vllm/vllm-openai:latest"
+    assert first_task.orchestrate["foo"]["serve"]["dtype"] == "bfloat16"
+
+
 def test_expand_tasks_accepts_deprecated_vllm_docker_with_warning(tmp_path: Path) -> None:
     job_cfg = tmp_path / "job.yaml"
     job_cfg.write_text(
diff --git a/tests/test_orchestrate/test_orchestrate_parallel_launch.py b/tests/test_orchestrate/test_orchestrate_parallel_launch.py
index ccef080a..aa4e63e3 100644
--- a/tests/test_orchestrate/test_orchestrate_parallel_launch.py
+++ b/tests/test_orchestrate/test_orchestrate_parallel_launch.py
@@ -103,6 +103,7 @@ def _task(tmp_path: Path, task_id: str) -> TaskSpec:
         model_key="foo",
         model_id=f"Foo/{task_id}",
         orchestrate={
+            "restart": "runs/raw/old-run",
             "vllm-container": {"image": "fake"},
             "foo": {"gpus": 2, "tensor_parallel_size": 2, "serve": {}},
         },
@@ -129,6 +130,7 @@ async def test_parallel_launch_runs_concurrently(
         max_parallel=2,
     )
     adapter = FakeRuntimeAdapter()
+    bench_commands: list[list[str]] = []
     runner = OrchestratorRunner(
         plan,
         tasks,
@@ -164,6 +166,8 @@ class Result:
         return Result()
 
     async def fake_start_benchmark(*args, **kwargs):
+        bench_commands.append(list(args[0]))
+
         class Proc:
             pass
 
@@ -190,3 +194,7 @@ async def fake_to_thread(func, /, *args, **kwargs):
 
     assert readiness_overlapped
     assert [call["server_port"] for call in adapter.launch_calls] == [8000, 8001]
+    assert all("--api-base-url" in command for command in bench_commands)
+    assert all("--provider" in command and "local" in command for command in bench_commands)
+    assert all("--on-complete" not in command for command in bench_commands)
+    assert all("--restart" not in command for command in bench_commands)

From f6312332556dbe343a2e8e45137250ae90505de0 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 19:32:16 +0000
Subject: [PATCH 08/53] Discover deterministic eval outputs

---
 medarc_verifiers/cli/process/discovery.py | 278 ++++++++++++++++++++--
 medarc_verifiers/cli/process/metadata.py  |  67 ++++++
 medarc_verifiers/cli/process/pipeline.py  |  34 ++-
 tests/test_cli/test_process_discovery.py  |  98 ++++++++
 tests/test_cli/test_process_pipeline.py   |  74 ++++++
 5 files changed, 522 insertions(+), 29 deletions(-)

diff --git a/medarc_verifiers/cli/process/discovery.py b/medarc_verifiers/cli/process/discovery.py
index 7aba00f8..3421d0df 100644
--- a/medarc_verifiers/cli/process/discovery.py
+++ b/medarc_verifiers/cli/process/discovery.py
@@ -4,12 +4,19 @@
 
 import json
 import logging
+from datetime import UTC, datetime
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, Iterator, Mapping, Sequence
 
 from pydantic import ValidationError
 
+from medarc_verifiers.cli.eval_identity import (
+    MEDARC_CONFIG_FINGERPRINT_KEY,
+    MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
+    MEDARC_VARIANT_ID_KEY,
+    MEDARC_VARIANT_PAYLOAD_KEY,
+)
 from medarc_verifiers.cli._manifest import (
     MANIFEST_FILENAME,
     ManifestJobEntry,
@@ -20,6 +27,8 @@
 logger = logging.getLogger(__name__)
 
 DEFAULT_STATUS = "unknown"
+RESULTS_FILENAME = "results.jsonl"
+METADATA_FILENAME = "metadata.json"
 
 
 @dataclass(frozen=True, slots=True)
@@ -91,28 +100,39 @@ def iter_run_records(
 ) -> Iterator[RunRecord]:
     """Yield run records for each job entry found under the runs directory."""
     runs_path = Path(runs_dir)
-    if not runs_path.exists():
-        logger.debug("Runs directory %s does not exist; nothing to process.", runs_path)
-        return
-
     normalized_status = _normalize_status_filter(filter_status)
+    emitted_results_dirs: set[Path] = set()
 
-    try:
-        run_dirs = sorted(path for path in runs_path.iterdir() if path.is_dir())
-    except OSError as exc:  # noqa: FBT003
-        logger.warning("Failed to list runs directory %s: %s", runs_path, exc)
-        return
+    if runs_path.exists():
+        try:
+            run_dirs = sorted(path for path in runs_path.iterdir() if path.is_dir())
+        except OSError as exc:  # noqa: FBT003
+            logger.warning("Failed to list runs directory %s: %s", runs_path, exc)
+            run_dirs = []
+
+        for run_dir in run_dirs:
+            manifest_info, job_entries = _load_manifest(run_dir)
+            if manifest_info is None:
+                continue
+            summary_map = _load_run_summary(run_dir)
+            for job_entry in job_entries:
+                summary_entry = summary_map.get(job_entry.job_id or "")
+                record = _build_run_record(manifest_info, job_entry, summary_entry)
+                if record is None:
+                    continue
+                emitted_results_dirs.add(_dedupe_key(record.results_dir))
+                if normalized_status and record.status not in normalized_status:
+                    continue
+                yield record
+    else:
+        logger.debug("Runs directory %s does not exist; checking eval output roots.", runs_path)
 
-    for run_dir in run_dirs:
-        manifest_info, job_entries = _load_manifest(run_dir)
-        if manifest_info is None:
-            continue
-        summary_map = _load_run_summary(run_dir)
-        for job_entry in job_entries:
-            summary_entry = summary_map.get(job_entry.job_id or "")
-            record = _build_run_record(manifest_info, job_entry, summary_entry)
-            if record is None:
+    for evals_root in _candidate_evals_roots(runs_path):
+        for record in _iter_eval_output_records(evals_root):
+            results_key = _dedupe_key(record.results_dir)
+            if results_key in emitted_results_dirs:
                 continue
+            emitted_results_dirs.add(results_key)
             if normalized_status and record.status not in normalized_status:
                 continue
             yield record
@@ -140,8 +160,8 @@ def _build_run_record(
         manifest.run_dir,
         job_id,
     )
-    metadata_path = results_dir / "metadata.json"
-    results_path = results_dir / "results.jsonl"
+    metadata_path = results_dir / METADATA_FILENAME
+    results_path = results_dir / RESULTS_FILENAME
     summary_path = results_dir / "summary.json"
 
     status = DEFAULT_STATUS
@@ -215,7 +235,7 @@ def _resolve_results_dir(
         base = run_dir / str(artifacts_root or ".")
         candidate_file = (base / rel).resolve()
         # v3 stores results_relpath to results.jsonl; derive the containing directory.
-        candidate_dir = candidate_file.parent if candidate_file.name == "results.jsonl" else candidate_file
+        candidate_dir = candidate_file.parent if candidate_file.name == RESULTS_FILENAME else candidate_file
         return candidate_dir.name, candidate_dir
 
     # Backward-compatible fallback for malformed v3 payloads missing relpaths.
@@ -229,13 +249,13 @@ def _fallback_results_dir_if_missing(
     run_dir: Path,
     job_id: str,
 ) -> tuple[str, Path]:
-    metadata_path = results_dir / "metadata.json"
-    results_path = results_dir / "results.jsonl"
+    metadata_path = results_dir / METADATA_FILENAME
+    results_path = results_dir / RESULTS_FILENAME
     if metadata_path.exists() or results_path.exists():
         return results_dir_name, results_dir
     fallback = (run_dir / job_id).resolve()
-    fallback_metadata = fallback / "metadata.json"
-    fallback_results = fallback / "results.jsonl"
+    fallback_metadata = fallback / METADATA_FILENAME
+    fallback_results = fallback / RESULTS_FILENAME
     if fallback_metadata.exists() or fallback_results.exists():
         logger.warning(
             "Manifest results path missing for job '%s'; falling back to run-relative directory '%s'.",
@@ -330,6 +350,214 @@ def _load_run_summary(run_dir: Path) -> Mapping[str, Mapping[str, Any]]:
     return summary
 
 
+def _candidate_evals_roots(runs_path: Path) -> tuple[Path, ...]:
+    """Return the existing ``evals`` directories at, under, or beside ``runs_path``.
+
+    Candidates that resolve to the same directory are reported once; the first probe wins.
+    """
+    # Probe order fixes output order: the path itself, then its child, then its sibling.
+    probes = [runs_path / "evals", runs_path.parent / "evals"]
+    if runs_path.name == "evals":
+        probes.insert(0, runs_path)
+
+    unique_roots: dict[Path, Path] = {}
+    for probe in probes:
+        resolved = _dedupe_key(probe)
+        if resolved not in unique_roots and probe.is_dir():
+            unique_roots[resolved] = probe
+    return tuple(unique_roots.values())
+
+
+def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
+    """Yield a synthetic run record for each eval output directory found under ``evals_root``.
+
+    A directory is considered only when it contains both the results file and the metadata file.
+    """
+    try:
+        found_results = sorted(evals_root.rglob(RESULTS_FILENAME))
+    except OSError as exc:  # noqa: FBT003
+        logger.warning("Failed to scan eval outputs under %s: %s", evals_root, exc)
+        return
+
+    visited: set[Path] = set()
+    for output_dir in (found.parent for found in found_results):
+        dir_key = _dedupe_key(output_dir)
+        if dir_key in visited:
+            continue
+        visited.add(dir_key)
+        has_metadata = (output_dir / METADATA_FILENAME).exists()
+        record = _build_eval_output_record(evals_root, output_dir) if has_metadata else None
+        if record is not None:
+            yield record
+
+
+def _build_eval_output_record(evals_root: Path, results_dir: Path) -> RunRecord | None:
+    """Build a synthetic completed run record for one eval output dir, or None if its metadata is unusable."""
+    metadata_path = results_dir / METADATA_FILENAME
+    results_path = results_dir / RESULTS_FILENAME
+    summary_path = results_dir / "summary.json"
+    payload = _read_metadata_payload(metadata_path)
+    if payload is None:
+        return None
+
+    layout = _infer_eval_output_layout(evals_root, results_dir, payload)
+    model_id, env_id = layout["model_id"], layout["env_id"]
+    # The metadata file's mtime is used for both created_at and updated_at.
+    stamp = _path_timestamp(metadata_path)
+    fingerprint = payload.get(MEDARC_CONFIG_FINGERPRINT_KEY)
+    sampling_args = _mapping_or_empty(payload.get("sampling_args"))
+
+    manifest = RunManifestInfo(
+        job_run_id=layout["job_run_id"],
+        run_name=layout["job_run_id"],
+        summary_completed=1,
+        summary_total=1,
+        summary_total_known=True,
+        manifest_path=metadata_path,
+        run_dir=results_dir,
+        created_at=stamp,
+        updated_at=stamp,
+        config_source=None,
+        config_checksum=_string_or_none(fingerprint),
+        run_summary_path=summary_path,
+        models={model_id: {"sampling_args": sampling_args}},
+        env_templates={env_id: {"module": env_id}},
+    )
+
+    return RunRecord(
+        manifest=manifest,
+        job_id=layout["job_id"],
+        model_id=model_id,
+        manifest_env_id=env_id,
+        results_dir_name=results_dir.name,
+        results_dir=results_dir,
+        metadata_path=metadata_path,
+        results_path=results_path,
+        summary_path=summary_path,
+        has_metadata=True,
+        has_results=True,
+        has_summary=summary_path.exists(),
+        status="completed",
+        duration_seconds=None,
+        reason=None,
+        started_at=None,
+        ended_at=None,
+        avg_reward=_float_or_none(payload.get("avg_reward")),
+        num_examples=_int_or_none(payload.get("num_examples")),
+        rollouts_per_example=_int_or_none(payload.get("rollouts_per_example")),
+        row_count=_count_results_rows(results_path),
+        env_args=_mapping_or_empty(payload.get("env_args")),
+        sampling_args=sampling_args,
+        env_config={
+            "id": env_id,
+            "module": env_id,
+            "variant_id": payload.get(MEDARC_VARIANT_ID_KEY),
+            "variant_payload": payload.get(MEDARC_VARIANT_PAYLOAD_KEY),
+            "medarc_config_fingerprint": fingerprint,
+            "medarc_config_fingerprint_payload": payload.get(MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY),
+        },
+        model_config={"sampling_args": sampling_args},
+    )
+
+
+def _infer_eval_output_layout(evals_root: Path, results_dir: Path, metadata_payload: Mapping[str, Any]) -> dict[str, str]:
+    """Derive run, job, model and env ids for an eval output directory.
+
+    Values present in ``metadata_payload`` take precedence over names taken from the path.
+    """
+    try:
+        rel_parts = results_dir.relative_to(evals_root).parts
+    except ValueError:
+        rel_parts = results_dir.parts
+    env_from_meta = _string_or_none(metadata_payload.get("env_id"))
+    model_from_meta = _string_or_none(metadata_payload.get("model"))
+
+    parent_name = results_dir.parent.name
+    if "--" in parent_name and len(rel_parts) >= 2:
+        # Parent directory is named "<env>--<model>"; the leaf directory names the run.
+        parent_env, _, parent_model = parent_name.partition("--")
+        env_id = env_from_meta or parent_env
+        model_id = model_from_meta or parent_model
+        job_run_id = results_dir.name
+    else:
+        # Otherwise the first two relative path components are read as model, then env.
+        model_id = model_from_meta or (rel_parts[0] if rel_parts else "unknown")
+        env_id = env_from_meta or (rel_parts[1] if len(rel_parts) > 1 else results_dir.name)
+        variant_id = _string_or_none(metadata_payload.get(MEDARC_VARIANT_ID_KEY))
+        job_run_id = "::".join(filter(None, (model_id, env_id, variant_id)))
+    return {"job_run_id": job_run_id, "job_id": results_dir.name, "model_id": model_id, "env_id": env_id}
+
+
+def _read_metadata_payload(path: Path) -> Mapping[str, Any] | None:
+    """Load ``path`` as a JSON object; log a warning and return None when that fails."""
+    try:
+        decoded = json.loads(path.read_text(encoding="utf-8"))
+        if isinstance(decoded, Mapping):
+            return dict(decoded)
+        logger.warning("Invalid eval metadata payload type for %s: expected JSON object.", path)
+    except (OSError, ValueError) as exc:  # noqa: FBT003
+        logger.warning("Failed to parse eval metadata %s: %s", path, exc)
+    return None
+
+
+def _dedupe_key(path: Path) -> Path:
+    try:  # Canonical path, so one directory reached via two spellings maps to one key.
+        return path.resolve()
+    except (OSError, RuntimeError):  # RuntimeError: symlink loop on Python < 3.13; use the unresolved path.
+        return path.absolute()
+
+
+def _path_timestamp(path: Path) -> str:
+    try:  # Result: the file's mtime as whole-second UTC ISO-8601 text ending in "Z".
+        timestamp = path.stat().st_mtime
+    except OSError:  # stat() failed: callers receive an empty string instead of an error.
+        return ""
+    return datetime.fromtimestamp(timestamp, UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def _count_results_rows(path: Path) -> int | None:
+    """Count the lines of ``path`` that are not empty or whitespace-only.
+
+    Returns None when the file cannot be opened or read (``OSError``).
+    """
+    try:
+        with path.open("r", encoding="utf-8") as handle:
+            return sum(1 for line in handle if line.strip())
+    except OSError:
+        return None
+
+
+def _mapping_or_empty(value: Any) -> Mapping[str, Any]:
+    """Return ``value`` itself when it is a mapping, otherwise a new empty dict."""
+    # The mapping is passed through uncopied, so the caller shares it with the source.
+    return value if isinstance(value, Mapping) else {}
+
+
+def _string_or_none(value: Any) -> str | None:
+    """Return ``str(value)`` stripped of surrounding whitespace, or None when nothing is left."""
+    # None is checked first so it is not stringified to "None".
+    stripped = "" if value is None else str(value).strip()
+    return stripped or None
+
+
+def _int_or_none(value: Any) -> int | None:
+    """Coerce ``value`` to int; return None for None, bools, and unconvertible input."""
+    try:
+        # bool is excluded explicitly because it is an int subclass.
+        return None if value is None or isinstance(value, bool) else int(value)
+    except (TypeError, ValueError, OverflowError):  # OverflowError: int(float("inf"))
+        return None
+
+
+def _float_or_none(value: Any) -> float | None:
+    """Coerce ``value`` to float; return None for None, bools, and unconvertible input."""
+    try:
+        # bool is excluded explicitly because it would otherwise convert to 0.0 or 1.0.
+        return None if value is None or isinstance(value, bool) else float(value)
+    except (TypeError, ValueError, OverflowError):  # OverflowError: float() of an oversized int
+        return None
+
+
 def _normalize_status_filter(statuses: Sequence[str] | None) -> tuple[str, ...]:
     if not statuses:
         return ()
diff --git a/medarc_verifiers/cli/process/metadata.py b/medarc_verifiers/cli/process/metadata.py
index d7bc8c58..bcab416b 100644
--- a/medarc_verifiers/cli/process/metadata.py
+++ b/medarc_verifiers/cli/process/metadata.py
@@ -11,6 +11,12 @@
 
 from pydantic import BaseModel, Field, ValidationError
 
+from medarc_verifiers.cli.eval_identity import (
+    MEDARC_CONFIG_FINGERPRINT_KEY,
+    MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
+    MEDARC_VARIANT_ID_KEY,
+    MEDARC_VARIANT_PAYLOAD_KEY,
+)
 from medarc_verifiers.cli.process.discovery import RunRecord
 from medarc_verifiers.cli.process.rollout import derive_base_env_id, extract_rollout_index
 
@@ -28,6 +34,10 @@ class _MetadataPayload(BaseModel):
     num_examples: int | None = None
     rollouts_per_example: int | None = None
     sampling_args: dict[str, Any] = Field(default_factory=dict)
+    medarc_config_fingerprint: str | None = None
+    medarc_config_fingerprint_payload: dict[str, Any] | None = None
+    variant_id: str | None = None
+    variant_payload: dict[str, Any] | None = None
 
 
 @dataclass(slots=True)
@@ -48,6 +58,10 @@ class NormalizedMetadata:
     sampling_args: Mapping[str, Any]
     num_examples: int | None
     rollouts_per_example: int | None
+    variant_id: str | None
+    variant_payload: Mapping[str, Any] | None
+    medarc_config_fingerprint: str | None
+    medarc_config_fingerprint_payload: Mapping[str, Any] | None
 
 
 @dataclass(frozen=True, slots=True)
@@ -72,6 +86,7 @@ class ResolvedRunIdentity:
     rollout_index: int | None
     job_run_id: str
     output_env_id: str
+    variant_id: str | None = None
 
 
 @dataclass(frozen=True, slots=True)
@@ -87,6 +102,10 @@ class _ResolvedMetadataContext:
     sampling_args: Mapping[str, Any]
     num_examples: int | None
     rollouts_per_example: int | None
+    variant_id: str | None
+    variant_payload: Mapping[str, Any] | None
+    medarc_config_fingerprint: str | None
+    medarc_config_fingerprint_payload: Mapping[str, Any] | None
 
 
 def resolve_run_identity(
@@ -106,6 +125,7 @@ def resolve_run_identity(
         rollout_index=resolved_rollout_index,
         job_run_id=record.manifest.job_run_id,
         output_env_id=context.base_env_id or context.manifest_env_id or record.job_id,
+        variant_id=context.variant_id,
     )
 
 
@@ -145,6 +165,10 @@ def load_normalized_metadata(
         sampling_args=context.sampling_args,
         num_examples=context.num_examples,
         rollouts_per_example=context.rollouts_per_example,
+        variant_id=context.variant_id,
+        variant_payload=context.variant_payload,
+        medarc_config_fingerprint=context.medarc_config_fingerprint,
+        medarc_config_fingerprint_payload=context.medarc_config_fingerprint_payload,
     )
 
 
@@ -194,6 +218,30 @@ def _resolve_metadata_context(
             record.rollouts_per_example,
             metadata_payload.rollouts_per_example if metadata_payload else None,
         ),
+        variant_id=_string_or_none(
+            _raw_metadata_value(raw_metadata, MEDARC_VARIANT_ID_KEY, metadata_payload.variant_id if metadata_payload else None)
+        ),
+        variant_payload=_mapping_or_none(
+            _raw_metadata_value(
+                raw_metadata,
+                MEDARC_VARIANT_PAYLOAD_KEY,
+                metadata_payload.variant_payload if metadata_payload else None,
+            )
+        ),
+        medarc_config_fingerprint=_string_or_none(
+            _raw_metadata_value(
+                raw_metadata,
+                MEDARC_CONFIG_FINGERPRINT_KEY,
+                metadata_payload.medarc_config_fingerprint if metadata_payload else None,
+            )
+        ),
+        medarc_config_fingerprint_payload=_mapping_or_none(
+            _raw_metadata_value(
+                raw_metadata,
+                MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
+                metadata_payload.medarc_config_fingerprint_payload if metadata_payload else None,
+            )
+        ),
     )
 
 
@@ -261,6 +309,25 @@ def _prefer_manifest_value(primary: int | None, fallback: int | None) -> int | N
     return fallback
 
 
+def _raw_metadata_value(raw_metadata: Mapping[str, Any], key: str, fallback: Any) -> Any:
+    if key in raw_metadata:
+        return raw_metadata.get(key)
+    return fallback
+
+
+def _mapping_or_none(value: Any) -> Mapping[str, Any] | None:
+    if isinstance(value, Mapping):
+        return dict(value)
+    return None
+
+
+def _string_or_none(value: Any) -> str | None:
+    if value is None:
+        return None
+    text = str(value).strip()
+    return text or None
+
+
 def _warn_manifest_metadata_result_mismatch(record: RunRecord, metadata_payload: _MetadataPayload | None) -> None:
     if metadata_payload is None:
         return
diff --git a/medarc_verifiers/cli/process/pipeline.py b/medarc_verifiers/cli/process/pipeline.py
index 7b10a60c..5c59cc19 100644
--- a/medarc_verifiers/cli/process/pipeline.py
+++ b/medarc_verifiers/cli/process/pipeline.py
@@ -30,6 +30,7 @@
 
 logger = logging.getLogger(__name__)
 PROCESS_DEFAULT_STATUS_FILTER: tuple[str, ...] = ("completed",)
+VARIANT_AGGREGATION_NOT_IMPLEMENTED = "variant aggregation not implemented yet"
 
 
 @dataclass(slots=True)
@@ -324,6 +325,7 @@ def select_work_items(
         exclude_datasets=options.exclude_datasets,
         exclude_models=options.exclude_models,
     )
+    _raise_for_variant_aggregation(work_items)
     _validate_replace_targets(work_items, options)
     work_items, skipped_by_delta = _apply_additive_delta(work_items, options=options, index_files=index_files)
     _validate_selected_results_completeness(work_items, max_results_missing_pct=options.max_results_missing_pct)
@@ -372,9 +374,9 @@ def _plan_selection_record(
 
 
 def _raise_for_latest_invalid_selection(records: Sequence[SelectionRecord]) -> None:
-    latest_by_target: dict[tuple[str, str], SelectionRecord] = {}
+    latest_by_target: dict[tuple[str, str, str], SelectionRecord] = {}
     for planned in records:
-        selection_key = (planned.identity.output_env_id, planned.record.job_id)
+        selection_key = (planned.identity.output_env_id, planned.identity.variant_id or "", planned.record.job_id)
         current = latest_by_target.get(selection_key)
         if current is None or _run_sort_key(
             _source_updated_at(planned.record),
@@ -397,14 +399,14 @@ def _raise_for_latest_invalid_selection(records: Sequence[SelectionRecord]) -> N
 
 
 def _select_latest_work_items(records: Sequence[SelectionRecord]) -> list[SelectionWorkItem]:
-    grouped: dict[tuple[str, str], dict[str, list[SelectionRecord]]] = {}
+    grouped: dict[tuple[str, str, str], dict[str, list[SelectionRecord]]] = {}
     run_timestamps: dict[str, str] = {}
 
     for planned in records:
         identity = planned.identity
         if not identity.model_id:
             continue
-        group_key = (identity.model_id, identity.output_env_id)
+        group_key = (identity.model_id, identity.output_env_id, identity.variant_id or "")
         grouped.setdefault(group_key, {}).setdefault(identity.job_run_id, []).append(planned)
         run_timestamps.setdefault(identity.job_run_id, _source_updated_at(planned.record))
 
@@ -464,6 +466,30 @@ def _apply_exclusions(
     return filtered, skipped
 
 
+def _raise_for_variant_aggregation(work_items: Sequence[PlannedWorkItem]) -> None:
+    variant_records: list[str] = []
+    for item in work_items:
+        for planned in item.records:
+            normalized = planned.normalized
+            if not normalized.variant_id:
+                continue
+            record = normalized.record
+            variant_records.append(
+                "model_id={model_id} output_env_id={output_env_id} variant_id={variant_id} "
+                "job_run_id={job_run_id} results_path={results_path}".format(
+                    model_id=item.identity.model_id,
+                    output_env_id=item.identity.output_env_id,
+                    variant_id=normalized.variant_id,
+                    job_run_id=record.manifest.job_run_id,
+                    results_path=record.results_path,
+                )
+            )
+    if not variant_records:
+        return
+    details = "\n".join(f"  - {record}" for record in sorted(variant_records))
+    raise RuntimeError(f"{VARIANT_AGGREGATION_NOT_IMPLEMENTED}:\n{details}")
+
+
 def _validate_replace_targets(work_items: Sequence[PlannedWorkItem], options: ProcessOptions) -> None:
     if not options.replace_models and not options.replace_envs:
         return
diff --git a/tests/test_cli/test_process_discovery.py b/tests/test_cli/test_process_discovery.py
index a41a6bed..5ff8c70d 100644
--- a/tests/test_cli/test_process_discovery.py
+++ b/tests/test_cli/test_process_discovery.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 from medarc_verifiers.cli.process.discovery import RunManifestInfo, discover_run_records
+from medarc_verifiers.cli.process.metadata import load_normalized_metadata
 
 
 def _write_json(path: Path, payload: dict) -> None:
@@ -249,3 +250,100 @@ def test_discover_run_records_fallbacks_to_job_dir_when_results_relpath_is_broke
     assert len(records) == 1
     assert records[0].has_results is True
     assert records[0].has_metadata is True
+
+
+def _write_eval_output(path: Path, metadata: dict | None = None) -> None:
+    _write_json(
+        path / "metadata.json",
+        {
+            "env_id": "medqa",
+            "model": "gpt-5-mini",
+            "env_args": {"split": "test"},
+            "sampling_args": {"temperature": 0},
+            "num_examples": 1,
+            "rollouts_per_example": 1,
+            **(metadata or {}),
+        },
+    )
+    path.mkdir(parents=True, exist_ok=True)
+    (path / "results.jsonl").write_text(json.dumps({"example_id": "ex-1", "reward": 1.0}) + "\n", encoding="utf-8")
+
+
+def test_discover_run_records_includes_deterministic_eval_outputs(tmp_path: Path) -> None:
+    raw_dir = tmp_path / "runs" / "raw"
+    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa"
+    _write_eval_output(eval_dir)
+
+    records = discover_run_records(raw_dir, filter_status=("completed",))
+
+    assert len(records) == 1
+    record = records[0]
+    assert record.model_id == "gpt-5-mini"
+    assert record.manifest_env_id == "medqa"
+    assert record.results_dir == eval_dir
+    assert record.row_count == 1
+    assert record.env_args == {"split": "test"}
+    assert record.sampling_args == {"temperature": 0}
+
+
+def test_discover_run_records_includes_deterministic_eval_variants(tmp_path: Path) -> None:
+    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa" / "env_args.shuffle_seed-1618"
+    _write_eval_output(
+        eval_dir,
+        {
+            "variant_id": "env_args.shuffle_seed-1618",
+            "variant_payload": {"env_args": {"shuffle_seed": 1618}},
+            "medarc_config_fingerprint": "abc123",
+            "medarc_config_fingerprint_payload": {"env_id": "medqa"},
+        },
+    )
+
+    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+
+    assert len(records) == 1
+    normalized = load_normalized_metadata(records[0])
+    assert normalized.variant_id == "env_args.shuffle_seed-1618"
+    assert normalized.variant_payload == {"env_args": {"shuffle_seed": 1618}}
+    assert normalized.medarc_config_fingerprint == "abc123"
+    assert normalized.medarc_config_fingerprint_payload == {"env_id": "medqa"}
+
+
+def test_discover_run_records_includes_direct_upstream_uuid_outputs(tmp_path: Path) -> None:
+    upstream_dir = tmp_path / "runs" / "evals" / "medqa--gpt-5-mini" / "016f4b4a-92a4-4a5b-a7c1-853af3318c52"
+    _write_eval_output(upstream_dir)
+
+    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+
+    assert len(records) == 1
+    record = records[0]
+    assert record.model_id == "gpt-5-mini"
+    assert record.manifest_env_id == "medqa"
+    assert record.manifest.job_run_id == "016f4b4a-92a4-4a5b-a7c1-853af3318c52"
+
+
+def test_discover_run_records_deduplicates_overlapping_eval_roots(tmp_path: Path) -> None:
+    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa"
+    _write_eval_output(eval_dir)
+
+    records = discover_run_records(tmp_path / "runs", filter_status=("completed",))
+
+    assert len(records) == 1
+    assert records[0].results_dir == eval_dir
+
+
+def test_discover_run_records_parent_baseline_and_child_variant_once(tmp_path: Path) -> None:
+    baseline_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa"
+    variant_dir = baseline_dir / "env_args.shuffle_seed-1618"
+    _write_eval_output(baseline_dir)
+    _write_eval_output(
+        variant_dir,
+        {
+            "variant_id": "env_args.shuffle_seed-1618",
+            "variant_payload": {"env_args": {"shuffle_seed": 1618}},
+        },
+    )
+
+    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+
+    assert len(records) == 2
+    assert {record.results_dir for record in records} == {baseline_dir, variant_dir}
diff --git a/tests/test_cli/test_process_pipeline.py b/tests/test_cli/test_process_pipeline.py
index 204ac8bc..4d091381 100644
--- a/tests/test_cli/test_process_pipeline.py
+++ b/tests/test_cli/test_process_pipeline.py
@@ -241,6 +241,43 @@ def _write_run(
     return runs_dir
 
 
+def _write_deterministic_eval(
+    tmp_path: Path,
+    *,
+    model_id: str = "gpt-mini",
+    env_id: str = "demo-env",
+    variant_id: str | None = None,
+) -> Path:
+    runs_dir = tmp_path / "runs"
+    results_dir = runs_dir / "evals" / model_id / env_id
+    metadata = {
+        "env_id": env_id,
+        "model": model_id,
+        "env_args": {},
+        "sampling_args": {},
+        "num_examples": 1,
+        "rollouts_per_example": 1,
+        "medarc_config_fingerprint": "abc123",
+        "medarc_config_fingerprint_payload": {
+            "env_id": env_id,
+            "model": model_id,
+            "env_args": {},
+            "sampling_args": {},
+            "num_examples": 1,
+            "rollouts_per_example": 1,
+        },
+        "variant_id": None,
+        "variant_payload": None,
+    }
+    if variant_id is not None:
+        results_dir = results_dir / variant_id
+        metadata["variant_id"] = variant_id
+        metadata["variant_payload"] = {"env_args": {"shuffle_seed": 1618}}
+    _write_json(results_dir / "metadata.json", metadata)
+    (results_dir / "results.jsonl").write_text(json.dumps({"example_id": "ex-1", "reward": 1.0}) + "\n", encoding="utf-8")
+    return runs_dir / "raw"
+
+
 def _remove_model_id(tmp_path: Path, run_id: str) -> None:
     manifest_path = tmp_path / "runs" / run_id / "run_manifest.json"
     manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
@@ -281,6 +318,43 @@ def test_run_process_respects_env_export_defaults(tmp_path: Path) -> None:
     assert group.model_id == "gpt-mini"
 
 
+def test_run_process_processes_deterministic_eval_outputs(tmp_path: Path) -> None:
+    runs_dir = _write_deterministic_eval(tmp_path)
+
+    result = run_process(
+        ProcessOptions(
+            runs_dir=runs_dir,
+            output_dir=tmp_path / "processed",
+            dry_run=True,
+            max_workers=1,
+        )
+    )
+
+    assert result.records_processed == 1
+    assert result.rows_processed == 1
+    group = result.env_groups[0]
+    assert group.env_id == "demo-env"
+    assert group.model_id == "gpt-mini"
+
+
+def test_run_process_rejects_variant_aggregation_until_supported(tmp_path: Path) -> None:
+    runs_dir = _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-1618")
+
+    with pytest.raises(RuntimeError) as excinfo:
+        run_process(
+            ProcessOptions(
+                runs_dir=runs_dir,
+                output_dir=tmp_path / "processed",
+                dry_run=True,
+                max_workers=1,
+            )
+        )
+
+    message = str(excinfo.value)
+    assert "variant aggregation not implemented yet" in message
+    assert "variant_id=env_args.shuffle_seed-1618" in message
+
+
 def test_run_process_resolves_base_env_id(tmp_path: Path) -> None:
     runs_dir = _setup_run(tmp_path)
     options = ProcessOptions(

From 8ca8928ce60edc50582d5c25a9d3bd2d12957f40 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 19:45:31 +0000
Subject: [PATCH 09/53] Preserve eval variants in processing

---
 medarc_verifiers/cli/process/aggregate.py | 58 +++++++++++++++--
 medarc_verifiers/cli/process/env_index.py |  4 +-
 medarc_verifiers/cli/process/metadata.py  |  2 +
 medarc_verifiers/cli/process/pipeline.py  | 44 +++++--------
 medarc_verifiers/cli/process/rows.py      |  5 ++
 medarc_verifiers/cli/process/writer.py    | 28 ++++++--
 medarc_verifiers/cli/winrate/api.py       |  9 ++-
 tests/test_cli/test_process_pipeline.py   | 71 ++++++++++++++++----
 tests/test_cli/test_process_winrate.py    | 79 +++++++++++++++++++++++
 tests/test_cli/test_process_writer.py     | 46 +++++++++++++
 tests/test_process_writer_schema.py       |  4 ++
 11 files changed, 296 insertions(+), 54 deletions(-)

diff --git a/medarc_verifiers/cli/process/aggregate.py b/medarc_verifiers/cli/process/aggregate.py
index cfca0baf..c8e0776e 100644
--- a/medarc_verifiers/cli/process/aggregate.py
+++ b/medarc_verifiers/cli/process/aggregate.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import json
 import logging
 from dataclasses import dataclass
 from typing import Any, Iterable, Mapping
@@ -19,6 +20,8 @@ class AggregatedEnvRows:
     env_id: str
     base_env_id: str
     model_id: str | None
+    variant_id: str | None
+    variant_payload: Mapping[str, Any] | None
     rows: list[Mapping[str, Any]]
     column_names: tuple[str, ...]
     job_run_ids: tuple[str, ...]
@@ -29,18 +32,22 @@ def aggregate_rows_by_env(
     *,
     identities: Iterable[RunIdentity] | None = None,
 ) -> list[AggregatedEnvRows]:
-    """Group enriched rows by (model_id, base_env_id), capturing unioned schemas."""
-    groups: dict[tuple[str, str], dict[str, Any]] = {}
+    """Group enriched rows by (model_id, base_env_id, variant_id), capturing unioned schemas."""
+    groups: dict[tuple[str, str, str], dict[str, Any]] = {}
     identity_list = list(identities or ())
     fake_rollout_groups = {
-        (identity.model_id, identity.output_env_id) for identity in identity_list if identity.rollout_index is not None
+        (identity.model_id, identity.output_env_id, identity.variant_id or "")
+        for identity in identity_list
+        if identity.rollout_index is not None
     }
 
     for row in rows:
         base_env_id = str(row.get("base_env_id") or row.get("env_id") or "")
         env_id = str(row.get("env_id") or base_env_id)
         model_id = str(row.get("model_id") or "unknown")
-        group_key = (model_id, base_env_id or env_id)
+        variant_id = _string_or_none(row.get("variant_id"))
+        variant_payload = _decode_variant_payload(row.get("variant_payload"))
+        group_key = (model_id, base_env_id or env_id, variant_id or "")
         if not group_key[1]:  # no env identifier
             logger.debug("Skipping row without env identifiers.")
             continue
@@ -50,6 +57,8 @@ def aggregate_rows_by_env(
                 "env_id": env_id if env_id else base_env_id,
                 "base_env_id": base_env_id,
                 "model_id": model_id,
+                "variant_id": variant_id,
+                "variant_payload": variant_payload,
                 "rows": [],
                 "column_names": set(),
                 "job_run_ids": set(),
@@ -62,6 +71,10 @@ def aggregate_rows_by_env(
             group["base_env_id"] = base_env_id
         if not group["model_id"] and model_id:
             group["model_id"] = model_id
+        if not group["variant_id"] and variant_id:
+            group["variant_id"] = variant_id
+        if group["variant_payload"] is None and variant_payload is not None:
+            group["variant_payload"] = variant_payload
         group["rows"].append(row)
         group["column_names"].update(row.keys())
         job_run_id = row.get("job_run_id")
@@ -81,6 +94,7 @@ def aggregate_rows_by_env(
                 identities=identity_list,
                 model_id=group["model_id"],
                 base_env_id=group["base_env_id"] or key[1],
+                variant_id=group["variant_id"],
             )
             _normalize_rollout_indices(normalized_rows)
         elif _group_uses_rollout_suffixes(normalized_rows, base_env_id=group["base_env_id"] or key[1]):
@@ -92,6 +106,8 @@ def aggregate_rows_by_env(
                 env_id=candidate_env_id,
                 base_env_id=group["base_env_id"] or key[1],
                 model_id=group["model_id"],
+                variant_id=group["variant_id"],
+                variant_payload=group["variant_payload"],
                 rows=normalized_rows,
                 column_names=tuple(sorted(group["column_names"])),
                 job_run_ids=tuple(sorted(group["job_run_ids"])),
@@ -106,11 +122,14 @@ def _ensure_rollout_index_from_identities(
     identities: list[RunIdentity],
     model_id: str,
     base_env_id: str,
+    variant_id: str | None,
 ) -> None:
     rollout_by_manifest_env: dict[str, int] = {}
     for identity in identities:
         if identity.model_id != model_id or identity.output_env_id != base_env_id:
             continue
+        if identity.variant_id != variant_id:
+            continue
         if identity.rollout_index is None:
             continue
         rollout_by_manifest_env[identity.manifest_env_id] = identity.rollout_index
@@ -168,10 +187,39 @@ def _ensure_rollout_index_from_suffix(rows: list[Mapping[str, Any]], *, base_env
 def _coerce_rollout_index(value: Any) -> int | None:
     if value is None or isinstance(value, bool):
         return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, float):
+        if value.is_integer():
+            return int(value)
+        return None
+    if isinstance(value, str):
+        try:
+            return int(value.strip())
+        except ValueError:
+            return None
+    return None
+
+
+def _string_or_none(value: Any) -> str | None:
+    if value is None:
+        return None
+    text = str(value).strip()
+    return text or None
+
+
+def _decode_variant_payload(value: Any) -> Mapping[str, Any] | None:
+    if isinstance(value, Mapping):
+        return dict(value)
+    if not isinstance(value, str) or not value.strip():
+        return None
     try:
-        return int(value)
+        payload = json.loads(value)
     except (TypeError, ValueError):
         return None
+    if isinstance(payload, Mapping):
+        return dict(payload)
+    return None
 
 
 def _normalize_rollout_indices(rows: list[Mapping[str, Any]]) -> None:
diff --git a/medarc_verifiers/cli/process/env_index.py b/medarc_verifiers/cli/process/env_index.py
index 86fecd50..927469f7 100644
--- a/medarc_verifiers/cli/process/env_index.py
+++ b/medarc_verifiers/cli/process/env_index.py
@@ -43,10 +43,12 @@ def _inventory_from_v2(payload: Mapping[str, Any], base_dir: Path) -> EnvIndexIn
         env_id = entry.get("env_id") or entry.get("base_env_id")
         if not env_id:
             continue
+        variant_id = entry.get("variant_id")
+        dataset_id = f"{env_id}::{variant_id}" if variant_id else str(env_id)
         resolved = _resolve_path(base_dir, str(path_str))
         if not resolved:
             continue
-        env_paths.setdefault(str(env_id), []).append(resolved)
+        env_paths.setdefault(dataset_id, []).append(resolved)
     return EnvIndexInventory(env_paths=env_paths, version=2)
 
 
diff --git a/medarc_verifiers/cli/process/metadata.py b/medarc_verifiers/cli/process/metadata.py
index bcab416b..770acda4 100644
--- a/medarc_verifiers/cli/process/metadata.py
+++ b/medarc_verifiers/cli/process/metadata.py
@@ -74,6 +74,7 @@ class RunIdentity:
     rollout_index: int | None
     job_run_id: str
     output_env_id: str
+    variant_id: str | None = None
 
 
 @dataclass(frozen=True, slots=True)
@@ -148,6 +149,7 @@ def load_normalized_metadata(
         rollout_index=resolved_rollout_index,
         job_run_id=record.manifest.job_run_id,
         output_env_id=context.base_env_id or context.manifest_env_id or record.job_id,
+        variant_id=context.variant_id,
     )
 
     return NormalizedMetadata(
diff --git a/medarc_verifiers/cli/process/pipeline.py b/medarc_verifiers/cli/process/pipeline.py
index 5c59cc19..2c5c802c 100644
--- a/medarc_verifiers/cli/process/pipeline.py
+++ b/medarc_verifiers/cli/process/pipeline.py
@@ -30,7 +30,6 @@
 
 logger = logging.getLogger(__name__)
 PROCESS_DEFAULT_STATUS_FILTER: tuple[str, ...] = ("completed",)
-VARIANT_AGGREGATION_NOT_IMPLEMENTED = "variant aggregation not implemented yet"
 
 
 @dataclass(slots=True)
@@ -325,7 +324,6 @@ def select_work_items(
         exclude_datasets=options.exclude_datasets,
         exclude_models=options.exclude_models,
     )
-    _raise_for_variant_aggregation(work_items)
     _validate_replace_targets(work_items, options)
     work_items, skipped_by_delta = _apply_additive_delta(work_items, options=options, index_files=index_files)
     _validate_selected_results_completeness(work_items, max_results_missing_pct=options.max_results_missing_pct)
@@ -456,7 +454,11 @@ def _apply_exclusions(
     filtered: list[PlannedWorkItem] = []
     skipped = 0
     for item in work_items:
-        if exclude_dataset_set and _env_is_excluded(item.identity.output_env_id, exclude_dataset_set):
+        if exclude_dataset_set and _env_is_excluded(
+            item.identity.output_env_id,
+            exclude_dataset_set,
+            variant_id=item.identity.variant_id,
+        ):
             skipped += 1
             continue
         if exclude_model_set and model_is_excluded(item.identity.model_id, exclude_model_set):
@@ -466,30 +468,6 @@ def _apply_exclusions(
     return filtered, skipped
 
 
-def _raise_for_variant_aggregation(work_items: Sequence[PlannedWorkItem]) -> None:
-    variant_records: list[str] = []
-    for item in work_items:
-        for planned in item.records:
-            normalized = planned.normalized
-            if not normalized.variant_id:
-                continue
-            record = normalized.record
-            variant_records.append(
-                "model_id={model_id} output_env_id={output_env_id} variant_id={variant_id} "
-                "job_run_id={job_run_id} results_path={results_path}".format(
-                    model_id=item.identity.model_id,
-                    output_env_id=item.identity.output_env_id,
-                    variant_id=normalized.variant_id,
-                    job_run_id=record.manifest.job_run_id,
-                    results_path=record.results_path,
-                )
-            )
-    if not variant_records:
-        return
-    details = "\n".join(f"  - {record}" for record in sorted(variant_records))
-    raise RuntimeError(f"{VARIANT_AGGREGATION_NOT_IMPLEMENTED}:\n{details}")
-
-
 def _validate_replace_targets(work_items: Sequence[PlannedWorkItem], options: ProcessOptions) -> None:
     if not options.replace_models and not options.replace_envs:
         return
@@ -539,6 +517,7 @@ def _apply_additive_delta(
             options.output_dir,
             model_id=item.identity.model_id,
             env_id=item.identity.output_env_id,
+            variant_id=item.identity.variant_id,
         )
         if not output_path.exists():
             filtered.append(item)
@@ -893,10 +872,15 @@ def _source_updated_at(record: discovery.RunRecord) -> str:
     return record.manifest.updated_at or record.manifest.created_at or ""
 
 
-def _env_is_excluded(env_id: str, exclude_set: set[str]) -> bool:
+def _env_is_excluded(env_id: str, exclude_set: set[str], *, variant_id: str | None = None) -> bool:
     env_identifier = str(env_id or "").strip()
     base_env_id, _ = rollout.derive_base_env_id(env_identifier)
-    return dataset_is_excluded(env_identifier, exclude_set, base_dataset_id=base_env_id)
+    dataset_id = f"{env_identifier}::{variant_id}" if variant_id else env_identifier
+    if dataset_is_excluded(dataset_id, exclude_set, base_dataset_id=base_env_id):
+        return True
+    if variant_id:
+        return dataset_is_excluded(env_identifier, exclude_set, base_dataset_id=base_env_id)
+    return False
 
 
 def _strip_env_group_rows(group: AggregatedEnvRows) -> AggregatedEnvRows:
@@ -904,6 +888,8 @@ def _strip_env_group_rows(group: AggregatedEnvRows) -> AggregatedEnvRows:
         env_id=group.env_id,
         base_env_id=group.base_env_id,
         model_id=group.model_id,
+        variant_id=group.variant_id,
+        variant_payload=group.variant_payload,
         rows=[],
         column_names=group.column_names,
         job_run_ids=group.job_run_ids,
diff --git a/medarc_verifiers/cli/process/rows.py b/medarc_verifiers/cli/process/rows.py
index e27896a7..095018b6 100644
--- a/medarc_verifiers/cli/process/rows.py
+++ b/medarc_verifiers/cli/process/rows.py
@@ -41,6 +41,7 @@ def load_rows(
     decoded_rows, example_counts = _decode_results_jsonl(results_path)
     multi_rollout = _detect_multi_rollout_shape(example_counts)
     version_info_json = _encode_metadata_json_column(metadata.raw_metadata.get("version_info"))
+    variant_payload_json = _encode_metadata_json_column(metadata.variant_payload)
 
     rows: list[dict[str, Any]] = []
     seen_per_example: dict[Any, int] = {}
@@ -67,6 +68,7 @@ def load_rows(
             line_number=line_number,
             rollout_index=rollout_index,
             version_info_json=version_info_json,
+            variant_payload_json=variant_payload_json,
         )
         rows.append(enriched)
 
@@ -243,6 +245,7 @@ def _attach_row_metadata(
     line_number: int,
     rollout_index: int,
     version_info_json: str | None,
+    variant_payload_json: str | None,
 ) -> MutableMapping[str, Any]:
     record = metadata.record
     identity = metadata.identity
@@ -258,6 +261,8 @@ def _attach_row_metadata(
             "run_id": record.job_id,
             "model_id": identity.model_id,
             "version_info": version_info_json,
+            "variant_id": metadata.variant_id,
+            "variant_payload": variant_payload_json,
             "status": record.status,
             "error": error_value,
             "started_at": record.started_at,
diff --git a/medarc_verifiers/cli/process/writer.py b/medarc_verifiers/cli/process/writer.py
index a9256cdb..10a51273 100644
--- a/medarc_verifiers/cli/process/writer.py
+++ b/medarc_verifiers/cli/process/writer.py
@@ -36,6 +36,8 @@
     "model_cost",
     "model_id",
     "version_info",
+    "variant_id",
+    "variant_payload",
     "model_token_completion",
     "model_token_prompt",
     "model_token_total",
@@ -63,6 +65,8 @@
     "model_cost": pl.Float64,
     "model_id": pl.String,
     "version_info": pl.String,
+    "variant_id": pl.String,
+    "variant_payload": pl.String,
     "model_token_completion": pl.Float64,
     "model_token_prompt": pl.Float64,
     "model_token_total": pl.Float64,
@@ -91,6 +95,8 @@
         pa.field("model_cost", pa.float64()),
         pa.field("model_id", pa.large_string()),
         pa.field("version_info", pa.large_string()),
+        pa.field("variant_id", pa.large_string()),
+        pa.field("variant_payload", pa.large_string()),
         pa.field("model_token_completion", pa.float64()),
         pa.field("model_token_prompt", pa.float64()),
         pa.field("model_token_total", pa.float64()),
@@ -147,6 +153,8 @@ class EnvWriteSummary:
     env_id: str
     base_env_id: str
     model_id: str
+    variant_id: str | None
+    variant_payload: Mapping[str, Any] | None
     output_path: Path
     row_count: int
     job_run_ids: tuple[str, ...]
@@ -250,7 +258,7 @@ def _write_group(group: AggregatedEnvRows, config: WriterConfig) -> EnvWriteSumm
     model_id = group.model_id
     if not model_id:
         raise ValueError("model_id is required for parquet output.")
-    output_path = build_output_path(config.output_dir, model_id=model_id, env_id=env_id)
+    output_path = build_output_path(config.output_dir, model_id=model_id, env_id=env_id, variant_id=group.variant_id)
     if not config.dry_run:
         output_path.parent.mkdir(parents=True, exist_ok=True)
     file_exists = output_path.exists()
@@ -261,6 +269,8 @@ def _write_group(group: AggregatedEnvRows, config: WriterConfig) -> EnvWriteSumm
         "processed_with_args": dict(config.processed_with_args),
         "env_id": env_id,
         "model_id": model_id,
+        "variant_id": group.variant_id,
+        "variant_payload": group.variant_payload,
     }
     row_count = len(group.rows)
     job_run_ids_set = set(group.job_run_ids)
@@ -271,6 +281,8 @@ def _write_group(group: AggregatedEnvRows, config: WriterConfig) -> EnvWriteSumm
             env_id=env_id,
             base_env_id=group.base_env_id,
             model_id=model_id,
+            variant_id=group.variant_id,
+            variant_payload=group.variant_payload,
             output_path=output_path,
             row_count=row_count,
             job_run_ids=group.job_run_ids,
@@ -293,6 +305,8 @@ def _write_group(group: AggregatedEnvRows, config: WriterConfig) -> EnvWriteSumm
         env_id=env_id,
         base_env_id=group.base_env_id,
         model_id=model_id,
+        variant_id=group.variant_id,
+        variant_payload=group.variant_payload,
         output_path=output_path,
         row_count=row_count,
         job_run_ids=group.job_run_ids,
@@ -364,7 +378,10 @@ def _write_env_index(
         timestamps: list[str] = []
         files[path_str] = {
             "env_id": summary.env_id,
+            "base_env_id": summary.base_env_id,
             "model_id": summary.model_id,
+            "variant_id": summary.variant_id,
+            "variant_payload": summary.variant_payload,
             "row_count": summary.row_count,
         }
         for job_run_id in summary.job_run_ids:
@@ -468,14 +485,17 @@ def _normalize_columns(df: pl.DataFrame) -> pl.DataFrame:
     return out
 
 
-def build_output_path(output_dir: Path, *, model_id: str, env_id: str) -> Path:
-    """Return the canonical parquet output path for a (model_id, env_id) dataset."""
+def build_output_path(output_dir: Path, *, model_id: str, env_id: str, variant_id: str | None = None) -> Path:
+    """Return the canonical parquet output path for a processed dataset."""
     if not model_id:
         raise ValueError("model_id is required for output path.")
     if not env_id:
         raise ValueError("env_id is required for output path.")
     model_dir = output_dir / slugify_filename_component(model_id)
-    return model_dir / f"{slugify_filename_component(env_id)}.parquet"
+    env_slug = slugify_filename_component(env_id)
+    if variant_id:
+        return model_dir / f"{env_slug}__variants" / f"{slugify_filename_component(variant_id)}.parquet"
+    return model_dir / f"{env_slug}.parquet"
 
 
 __all__ = ["EnvWriteSummary", "WriterConfig", "build_output_path", "write_env_groups", "write_env_index"]
diff --git a/medarc_verifiers/cli/winrate/api.py b/medarc_verifiers/cli/winrate/api.py
index 88f375cb..1f6b2e65 100644
--- a/medarc_verifiers/cli/winrate/api.py
+++ b/medarc_verifiers/cli/winrate/api.py
@@ -759,8 +759,13 @@ def _models_present(df_avg: pl.DataFrame) -> list[str]:
 
 
 def _is_dataset_excluded(dataset_name: str, exclude_set: set[str]) -> bool:
-    base, _ = derive_base_env_id(dataset_name)
-    return dataset_is_excluded(dataset_name, exclude_set, base_dataset_id=base)
+    env_name, _, variant_id = dataset_name.partition("::")
+    base, _ = derive_base_env_id(env_name)
+    if dataset_is_excluded(dataset_name, exclude_set, base_dataset_id=base):
+        return True
+    if variant_id:
+        return dataset_is_excluded(env_name, exclude_set, base_dataset_id=base)
+    return False
 
 
 def _filter_models(
diff --git a/tests/test_cli/test_process_pipeline.py b/tests/test_cli/test_process_pipeline.py
index 4d091381..eed3739a 100644
--- a/tests/test_cli/test_process_pipeline.py
+++ b/tests/test_cli/test_process_pipeline.py
@@ -337,22 +337,67 @@ def test_run_process_processes_deterministic_eval_outputs(tmp_path: Path) -> Non
     assert group.model_id == "gpt-mini"
 
 
-def test_run_process_rejects_variant_aggregation_until_supported(tmp_path: Path) -> None:
-    runs_dir = _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-1618")
+def test_run_process_preserves_deterministic_eval_variants(tmp_path: Path) -> None:
+    _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-1618")
+    runs_dir = _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-9331")
+    output_dir = tmp_path / "processed"
 
-    with pytest.raises(RuntimeError) as excinfo:
-        run_process(
-            ProcessOptions(
-                runs_dir=runs_dir,
-                output_dir=tmp_path / "processed",
-                dry_run=True,
-                max_workers=1,
-            )
+    result = run_process(
+        ProcessOptions(
+            runs_dir=runs_dir,
+            output_dir=output_dir,
+            dry_run=False,
+            max_workers=1,
         )
+    )
 
-    message = str(excinfo.value)
-    assert "variant aggregation not implemented yet" in message
-    assert "variant_id=env_args.shuffle_seed-1618" in message
+    assert result.records_processed == 2
+    rel_paths = sorted(summary.output_path.relative_to(output_dir).as_posix() for summary in result.env_summaries)
+    assert rel_paths == [
+        "gpt-mini/demo-env__variants/env_args.shuffle_seed-1618.parquet",
+        "gpt-mini/demo-env__variants/env_args.shuffle_seed-9331.parquet",
+    ]
+    index_payload = json.loads((output_dir / "env_index.json").read_text(encoding="utf-8"))
+    assert sorted(index_payload["files"]) == rel_paths
+    assert {
+        entry["variant_id"] for entry in index_payload["files"].values()
+    } == {"env_args.shuffle_seed-1618", "env_args.shuffle_seed-9331"}
+
+
+def test_run_process_excludes_specific_deterministic_eval_variant(tmp_path: Path) -> None:
+    _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-1618")
+    runs_dir = _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-9331")
+
+    result = run_process(
+        ProcessOptions(
+            runs_dir=runs_dir,
+            output_dir=tmp_path / "processed",
+            exclude_datasets=("demo-env::env_args.shuffle_seed-1618",),
+            dry_run=True,
+            max_workers=1,
+        )
+    )
+
+    assert result.records_processed == 1
+    assert result.env_groups[0].variant_id == "env_args.shuffle_seed-9331"
+
+
+def test_run_process_excludes_deterministic_eval_variants_by_base_env(tmp_path: Path) -> None:
+    _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-1618")
+    runs_dir = _write_deterministic_eval(tmp_path, variant_id="env_args.shuffle_seed-9331")
+
+    result = run_process(
+        ProcessOptions(
+            runs_dir=runs_dir,
+            output_dir=tmp_path / "processed",
+            exclude_datasets=("demo-env",),
+            dry_run=True,
+            max_workers=1,
+        )
+    )
+
+    assert result.records_processed == 0
+    assert result.env_groups == []
 
 
 def test_run_process_resolves_base_env_id(tmp_path: Path) -> None:
diff --git a/tests/test_cli/test_process_winrate.py b/tests/test_cli/test_process_winrate.py
index a29e8bed..b0dad0ee 100644
--- a/tests/test_cli/test_process_winrate.py
+++ b/tests/test_cli/test_process_winrate.py
@@ -595,3 +595,82 @@ def test_run_winrate_validates_known_models_from_env_index(tmp_path: Path) -> No
             output_name=None,
             config=cfg,
         )
+
+
+def test_run_winrate_discovers_variants_as_distinct_datasets(tmp_path: Path) -> None:
+    processed_dir = tmp_path / "processed"
+    output_dir = tmp_path / "out"
+    files: dict[str, dict[str, object]] = {}
+    rewards = {
+        ("model-a", "seed-1"): 1.0,
+        ("model-b", "seed-1"): 0.0,
+        ("model-a", "seed-2"): 0.0,
+        ("model-b", "seed-2"): 1.0,
+    }
+    for (model_id, variant_id), reward in rewards.items():
+        path = processed_dir / model_id / "demo-env__variants" / f"{variant_id}.parquet"
+        _write_dataset(
+            path,
+            [
+                {
+                    "example_id": "q1",
+                    "model_id": model_id,
+                    "reward": reward,
+                }
+            ],
+        )
+        rel_path = path.relative_to(processed_dir).as_posix()
+        files[rel_path] = {
+            "env_id": "demo-env",
+            "base_env_id": "demo-env",
+            "model_id": model_id,
+            "variant_id": variant_id,
+            "variant_payload": {"env_args": {"shuffle_seed": int(variant_id.removeprefix("seed-"))}},
+            "row_count": 1,
+        }
+
+    env_index = {
+        "version": 2,
+        "processed_at": "2024-01-01T00:00:00Z",
+        "schema_version": 1,
+        "processed_with_args": {},
+        "runs": {},
+        "files": files,
+    }
+    (processed_dir / "env_index.json").write_text(json.dumps(env_index), encoding="utf-8")
+
+    result = winrate.run_winrate(
+        processed_dir=processed_dir,
+        output_dir=output_dir,
+        output_path=None,
+        output_name=None,
+        config=winrate.WinrateConfig(dataset_coverage="per-model"),
+        processed_at="2024-01-01T00:00:00Z",
+    )
+    payload = winrate.to_json(result.result)
+
+    assert [dataset for dataset, _ in result.datasets] == ["demo-env::seed-1", "demo-env::seed-2"]
+    assert set(payload["datasets"]) == {"demo-env::seed-1", "demo-env::seed-2"}
+    assert "demo-env" not in payload["datasets"]
+    assert payload["models"]["model-a"]["avg_reward_per_dataset"] == {
+        "demo-env::seed-1": 1.0,
+        "demo-env::seed-2": 0.0,
+    }
+    assert payload["models"]["model-b"]["avg_reward_per_dataset"] == {
+        "demo-env::seed-1": 0.0,
+        "demo-env::seed-2": 1.0,
+    }
+    assert payload["models"]["model-a"]["mean_winrate"]["n_datasets"] == 2
+
+    excluded_variant = winrate.compute_winrates(
+        result.datasets,
+        winrate.WinrateConfig(exclude_datasets=("demo-env::seed-1",), dataset_coverage="per-model"),
+    )
+    excluded_payload = winrate.to_json(excluded_variant)
+    assert set(excluded_payload["datasets"]) == {"demo-env::seed-2"}
+
+    with pytest.raises(ValueError, match="No datasets remain after applying dataset exclusions"):
+        winrate.compute_winrates(
+            result.datasets,
+            winrate.WinrateConfig(exclude_datasets=("demo-env",), dataset_coverage="per-model"),
+        )
diff --git a/tests/test_cli/test_process_writer.py b/tests/test_cli/test_process_writer.py
index f3229651..e65c99e9 100644
--- a/tests/test_cli/test_process_writer.py
+++ b/tests/test_cli/test_process_writer.py
@@ -74,6 +74,52 @@ def test_write_env_groups_creates_parquet_and_index(tmp_path: Path) -> None:
     assert rel_path in ds_infos["default"]["data_files"]["train"]
 
 
+def test_write_env_groups_writes_variant_path_and_metadata(tmp_path: Path) -> None:
+    rows = [
+        {
+            "env_id": "demo-env",
+            "base_env_id": "demo-env",
+            "example_id": "ex-1",
+            "job_run_id": "run-1",
+            "model_id": "demo-model",
+            "variant_id": "env_args.shuffle_seed-1618",
+            "variant_payload": json.dumps({"env_args": {"shuffle_seed": 1618}}),
+            "reward": 1.0,
+        }
+    ]
+    group = aggregate_rows_by_env(rows)[0]
+    config = WriterConfig(
+        output_dir=tmp_path,
+        processed_at="2024-01-01T00:00:00Z",
+        processed_with_args={},
+    )
+
+    summaries = write_env_groups([group], config)
+
+    summary = summaries[0]
+    rel_path = summary.output_path.relative_to(tmp_path).as_posix()
+    assert rel_path == "demo-model/demo-env__variants/env_args.shuffle_seed-1618.parquet"
+    assert summary.variant_id == "env_args.shuffle_seed-1618"
+    assert summary.variant_payload == {"env_args": {"shuffle_seed": 1618}}
+
+    table = pq.read_table(summary.output_path)
+    assert table.column("variant_id").to_pylist() == ["env_args.shuffle_seed-1618"]
+    assert [json.loads(value) for value in table.column("variant_payload").to_pylist()] == [
+        {"env_args": {"shuffle_seed": 1618}}
+    ]
+    embedded = json.loads((table.schema.metadata or {})[EXPORTER_METADATA_KEY])
+    assert embedded["variant_id"] == "env_args.shuffle_seed-1618"
+    assert embedded["variant_payload"] == {"env_args": {"shuffle_seed": 1618}}
+
+    payload = json.loads((tmp_path / "env_index.json").read_text(encoding="utf-8"))
+    file_entry = payload["files"][rel_path]
+    assert file_entry["env_id"] == "demo-env"
+    assert file_entry["base_env_id"] == "demo-env"
+    assert file_entry["model_id"] == "demo-model"
+    assert file_entry["variant_id"] == "env_args.shuffle_seed-1618"
+    assert file_entry["variant_payload"] == {"env_args": {"shuffle_seed": 1618}}
+
+
 def test_write_env_groups_dry_run(tmp_path: Path) -> None:
     group = _group_for_env()
     config = WriterConfig(
diff --git a/tests/test_process_writer_schema.py b/tests/test_process_writer_schema.py
index b4607c5f..5b9aec3a 100644
--- a/tests/test_process_writer_schema.py
+++ b/tests/test_process_writer_schema.py
@@ -9,6 +9,8 @@ def test_process_writer_emits_stable_schema_with_all_null_values(tmp_path) -> No
         env_id="medcalc_bench",
         base_env_id="medcalc_bench",
         model_id="test-model",
+        variant_id=None,
+        variant_payload=None,
         rows=[
             {
                 "env_id": "medcalc_bench",
@@ -57,6 +59,8 @@ def test_process_writer_emits_stable_schema_for_empty_groups(tmp_path) -> None:
         env_id="empty_env",
         base_env_id="empty_env",
         model_id="test-model",
+        variant_id=None,
+        variant_payload=None,
         rows=[],
         column_names=(),
         job_run_ids=(),

From cc8efc8cb54123527ffd24ab446a2c9218477de3 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 20:07:09 +0000
Subject: [PATCH 10/53] Convert benchmark configs to TOML

---
 configs/eval/README.md                   |  12 +
 configs/eval/medarc-all.toml             | 315 +++++++++++++++++++++++
 configs/eval/medarc-judge.toml           |  80 ++++++
 configs/eval/medarc-mcq.toml             | 185 +++++++++++++
 configs/eval/smoke.toml                  |   1 +
 environments/healthbench/pyproject.toml  |   4 +
 environments/m_arc/pyproject.toml        |   3 +
 environments/med_mcqa/pyproject.toml     |   6 +-
 environments/medqa/pyproject.toml        |   4 +
 environments/pubmedqa/pyproject.toml     |   4 +
 medarc_verifiers/cli/eval_identity.py    |  41 ++-
 medarc_verifiers/cli/main.py             |   4 +-
 medarc_verifiers/cli/process/pipeline.py |  16 +-
 tests/test_cli/test_eval_identity.py     |  23 ++
 tests/test_cli/test_main.py              |  80 +++++-
 tests/test_cli/test_process_pipeline.py  |  56 +++-
 16 files changed, 821 insertions(+), 13 deletions(-)
 create mode 100644 configs/eval/README.md
 create mode 100644 configs/eval/medarc-all.toml
 create mode 100644 configs/eval/medarc-judge.toml
 create mode 100644 configs/eval/medarc-mcq.toml

diff --git a/configs/eval/README.md b/configs/eval/README.md
new file mode 100644
index 00000000..f65809cf
--- /dev/null
+++ b/configs/eval/README.md
@@ -0,0 +1,12 @@
+# MedARC Eval TOML Configs
+
+These configs use upstream `verifiers` TOML semantics. Repeated `env_id` entries
+and `[[ablation]]` sweeps intentionally keep the upstream environment id stable;
+instead, `medarc-eval bench` writes a deterministic variant directory (for example
+`env_args.shuffle_seed-1618/`) for each differing set of `env_args` and `sampling_args`.
+
+Per-environment `[tool.verifiers.eval]` defaults are read when an environment is
+installed in editable mode, where its `pyproject.toml` is discoverable next to the
+module. Wheel installs may ignore those defaults unless the package includes
+`pyproject.toml`, so production suite configs set `num_examples` and
+`rollouts_per_example` explicitly instead of relying on them.
diff --git a/configs/eval/medarc-all.toml b/configs/eval/medarc-all.toml
new file mode 100644
index 00000000..c8cc4318
--- /dev/null
+++ b/configs/eval/medarc-all.toml
@@ -0,0 +1,315 @@
+# Aggregate MedARC benchmark suite. This keeps suite composition in upstream
+# TOML and leaves variant path generation to the MedARC sequential bench wrapper.
+
+model = "openai/gpt-4.1-mini"
+save_results = true
+output_dir = "runs/evals"
+
+[[eval]]
+env_id = "medqa"
+num_examples = -1
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "med_mcqa"
+num_examples = -1
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "pubmedqa"
+num_examples = -1
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "mmlu_pro_health"
+num_examples = -1
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "m_arc"
+num_examples = -1
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "head_qa_v2"
+num_examples = -1
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "sctpublic"
+num_examples = -1
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "healthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = "openai/gpt-5-mini", judge_base_url = "https://api.pinference.ai/api/v1", difficulty = "all" }
+
+[[eval]]
+env_id = "med_dialog"
+num_examples = 2500
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medcasereasoning"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = "openai/gpt-5-nano", judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medec"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medexqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { use_judge = true, judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medicationqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "mtsamples_procedures"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "mtsamples_replicate"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "agentclinic"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "pubhealthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "reviewed" }
+
+[[eval]]
+env_id = "pubhealthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "freeform", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "careqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "en" }
+
+[[eval]]
+env_id = "careqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "open", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medcalc_bench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { version = "1.2" }
+
+[[eval]]
+env_id = "medcalc_bench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { version = "verified", add_python_tool = true, add_calculator_tool = true }
+
+[[eval]]
+env_id = "medagentbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { fhir_api_base = "http://localhost:8080/fhir/" }
+
+[[eval]]
+env_id = "medagentbenchv2"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { fhir_api_base = "http://localhost:8080/fhir/" }
+
+[[eval]]
+env_id = "meqsum"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "test", compute_auto_metrics = true }
+
+[[eval]]
+env_id = "meqsum"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "validation", compute_auto_metrics = true }
+
+[[eval]]
+env_id = "meqsum"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "test", compute_auto_metrics = false }
+
+[[ablation]]
+env_id = "medqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "med_mcqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "pubmedqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "mmlu_pro_health"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "m_arc"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "careqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "en", shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "medbullets"
+num_examples = -1
+rollouts_per_example = 1
+
+[ablation.sweep.env_args]
+num_options = [4, 5]
+
+[[ablation]]
+env_id = "medbullets"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+num_options = [4, 5]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "medxpertqa"
+num_examples = -1
+rollouts_per_example = 1
+
+[ablation.sweep.env_args]
+question_type = ["reasoning", "understanding"]
+
+[[ablation]]
+env_id = "medxpertqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+question_type = ["reasoning", "understanding"]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "supergpqa_medicine"
+num_examples = -1
+rollouts_per_example = 1
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "hard"]
+
+[[ablation]]
+env_id = "supergpqa_medicine"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "hard"]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "medconceptsqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { vocab = "icd10cm_sample" }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "medium", "hard"]
+
+[[ablation]]
+env_id = "medconceptsqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { vocab = "icd10cm_sample", shuffle_answers = true }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "medium", "hard"]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "pubhealthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "reviewed", shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "longhealth"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { doc_shuffle_seed = 2718 }
+
+[ablation.sweep.env_args]
+task = ["task1", "task2"]
+
+[[ablation]]
+env_id = "medrbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_agent_model = "openai/gpt-5-mini", patient_agent_base_url = "https://api.pinference.ai/api/v1" }
+
+[ablation.sweep.env_args]
+task = ["oracle", "1turn", "free_turn"]
diff --git a/configs/eval/medarc-judge.toml b/configs/eval/medarc-judge.toml
new file mode 100644
index 00000000..b295421b
--- /dev/null
+++ b/configs/eval/medarc-judge.toml
@@ -0,0 +1,80 @@
+# MedARC judge- and free-form-heavy benchmark suite.
+
+model = "openai/gpt-4.1-mini"
+save_results = true
+output_dir = "runs/evals"
+
+[[eval]]
+env_id = "healthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = "openai/gpt-5-mini", judge_base_url = "https://api.pinference.ai/api/v1", difficulty = "all" }
+
+[[eval]]
+env_id = "careqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "open", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "med_dialog"
+num_examples = 2500
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medcasereasoning"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = "openai/gpt-5-nano", judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medec"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medexqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { use_judge = true, judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "medicationqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "mtsamples_procedures"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "mtsamples_replicate"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "pubhealthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "freeform", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "agentclinic"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" }
+
+[[ablation]]
+env_id = "medrbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_agent_model = "openai/gpt-5-mini", patient_agent_base_url = "https://api.pinference.ai/api/v1" }
+
+[ablation.sweep.env_args]
+task = ["oracle", "1turn", "free_turn"]
diff --git a/configs/eval/medarc-mcq.toml b/configs/eval/medarc-mcq.toml
new file mode 100644
index 00000000..29cadb7d
--- /dev/null
+++ b/configs/eval/medarc-mcq.toml
@@ -0,0 +1,185 @@
+# MedARC multiple-choice benchmark suite.
+# Ablations become deterministic variant directories such as
+# runs/evals/<model>/<env>/env_args.shuffle_seed-1618/.
+
+model = "openai/gpt-4.1-mini"
+save_results = true
+output_dir = "runs/evals"
+
+[[eval]]
+env_id = "medqa"
+num_examples = -1
+rollouts_per_example = 1
+
+[[ablation]]
+env_id = "medqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[eval]]
+env_id = "med_mcqa"
+num_examples = -1
+rollouts_per_example = 1
+
+[[ablation]]
+env_id = "med_mcqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[eval]]
+env_id = "pubmedqa"
+num_examples = -1
+rollouts_per_example = 1
+
+[[ablation]]
+env_id = "pubmedqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[eval]]
+env_id = "mmlu_pro_health"
+num_examples = -1
+rollouts_per_example = 1
+
+[[ablation]]
+env_id = "mmlu_pro_health"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[eval]]
+env_id = "m_arc"
+num_examples = -1
+rollouts_per_example = 1
+
+[[ablation]]
+env_id = "m_arc"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[eval]]
+env_id = "careqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "en" }
+
+[[ablation]]
+env_id = "careqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "en", shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "medbullets"
+num_examples = -1
+rollouts_per_example = 1
+
+[ablation.sweep.env_args]
+num_options = [4, 5]
+
+[[ablation]]
+env_id = "medbullets"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+num_options = [4, 5]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "medxpertqa"
+num_examples = -1
+rollouts_per_example = 1
+
+[ablation.sweep.env_args]
+question_type = ["reasoning", "understanding"]
+
+[[ablation]]
+env_id = "medxpertqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+question_type = ["reasoning", "understanding"]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "supergpqa_medicine"
+num_examples = -1
+rollouts_per_example = 1
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "hard"]
+
+[[ablation]]
+env_id = "supergpqa_medicine"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "hard"]
+shuffle_seed = [1618, 9331]
+
+[[ablation]]
+env_id = "medconceptsqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { vocab = "icd10cm_sample" }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "medium", "hard"]
+
+[[ablation]]
+env_id = "medconceptsqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { vocab = "icd10cm_sample", shuffle_answers = true }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "medium", "hard"]
+shuffle_seed = [1618, 9331]
+
+[[eval]]
+env_id = "head_qa_v2"
+num_examples = -1
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "pubhealthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "reviewed" }
+
+[[ablation]]
+env_id = "pubhealthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "reviewed", shuffle_answers = true }
+
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
diff --git a/configs/eval/smoke.toml b/configs/eval/smoke.toml
index 20bf5d75..550245de 100644
--- a/configs/eval/smoke.toml
+++ b/configs/eval/smoke.toml
@@ -1,5 +1,6 @@
 model = "openai/gpt-4.1-mini"
 save_results = true
+output_dir = "runs/evals"
 
 [[eval]]
 env_id = "medqa"
diff --git a/environments/healthbench/pyproject.toml b/environments/healthbench/pyproject.toml
index 90447dbe..c968a509 100644
--- a/environments/healthbench/pyproject.toml
+++ b/environments/healthbench/pyproject.toml
@@ -20,3 +20,7 @@ dev = [
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
+
+[tool.verifiers.eval]
+num_examples = -1
+rollouts_per_example = 1
diff --git a/environments/m_arc/pyproject.toml b/environments/m_arc/pyproject.toml
index c1db3b51..37d15b69 100644
--- a/environments/m_arc/pyproject.toml
+++ b/environments/m_arc/pyproject.toml
@@ -21,3 +21,6 @@ build-backend = "hatchling.build"
 [tool.hatch.build]
 include = ["m_arc.py"]
 
+[tool.verifiers.eval]
+num_examples = -1
+rollouts_per_example = 1
diff --git a/environments/med_mcqa/pyproject.toml b/environments/med_mcqa/pyproject.toml
index 9c40e292..016aab20 100644
--- a/environments/med_mcqa/pyproject.toml
+++ b/environments/med_mcqa/pyproject.toml
@@ -21,4 +21,8 @@ build-backend = "hatchling.build"
 [tool.prime.environment]
 loader = "med_mcqa:load_environment"
 display_name = "MedMCQA"
-visibility = "PUBLIC"
\ No newline at end of file
+visibility = "PUBLIC"
+
+[tool.verifiers.eval]
+num_examples = -1
+rollouts_per_example = 1
diff --git a/environments/medqa/pyproject.toml b/environments/medqa/pyproject.toml
index 138d6f8f..658dfbc8 100644
--- a/environments/medqa/pyproject.toml
+++ b/environments/medqa/pyproject.toml
@@ -23,3 +23,7 @@ include = ["medqa.py"]
 loader = "medqa:load_environment"
 display_name = "MedQA"
 visibility = "PUBLIC"
+
+[tool.verifiers.eval]
+num_examples = -1
+rollouts_per_example = 1
diff --git a/environments/pubmedqa/pyproject.toml b/environments/pubmedqa/pyproject.toml
index 9955a8c3..85982185 100644
--- a/environments/pubmedqa/pyproject.toml
+++ b/environments/pubmedqa/pyproject.toml
@@ -25,3 +25,7 @@ build-backend = "hatchling.build"
 
 [tool.hatch.build]
 include = ["pubmedqa.py", "data/"]
+
+[tool.verifiers.eval]
+num_examples = -1
+rollouts_per_example = 1
diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py
index f738a186..4e02138a 100644
--- a/medarc_verifiers/cli/eval_identity.py
+++ b/medarc_verifiers/cli/eval_identity.py
@@ -132,6 +132,16 @@ def extract_variant_payload(config: Mapping[str, Any], field_names: Sequence[str
 
     payload: dict[str, Any] = {}
     for field_name in field_names:
+        if "." in field_name:
+            root, nested_key = field_name.split(".", 1)
+            value = config.get(root)
+            if isinstance(value, Mapping):
+                nested_payload = payload.setdefault(root, {})
+                if isinstance(nested_payload, dict) and nested_key in value:
+                    nested_payload[nested_key] = _canonicalize(value[nested_key])
+            else:
+                payload.setdefault(root, {})
+            continue
         if field_name in config:
             payload[field_name] = _canonicalize(config[field_name])
     return payload
@@ -253,11 +263,32 @@ def _differing_fields_by_key(
             differing[key] = []
             continue
         field_names = sorted(set().union(*(payload.keys() for payload in configs)))
-        differing[key] = [
-            field_name
-            for field_name in field_names
-            if len({_canonical_json(payload.get(field_name)) for payload in configs}) > 1
-        ]
+        differing[key] = []
+        for field_name in field_names:
+            values = [payload.get(field_name) for payload in configs]
+            if all(isinstance(value, Mapping) for value in values if value is not None):
+                nested_names = sorted(
+                    {
+                        str(nested_key)
+                        for value in values
+                        if isinstance(value, Mapping)
+                        for nested_key in value.keys()
+                    }
+                )
+                differing[key].extend(
+                    f"{field_name}.{nested_name}"
+                    for nested_name in nested_names
+                    if len(
+                        {
+                            _canonical_json(value.get(nested_name) if isinstance(value, Mapping) else None)
+                            for value in values
+                        }
+                    )
+                    > 1
+                )
+                continue
+            if len({_canonical_json(value) for value in values}) > 1:
+                differing[key].append(field_name)
     return differing
 
 
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index 1a671c1b..00ec35f1 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -42,7 +42,7 @@
 from medarc_verifiers.cli._manifest_planner import ManifestPlanner
 from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, EnvironmentExportConfig
 from medarc_verifiers.cli._single_run import run_single_mode
-from medarc_verifiers.cli.eval_identity import EvalPathPlan, plan_eval_paths
+from medarc_verifiers.cli.eval_identity import EvalPathPlan, generate_variant_id, plan_eval_paths
 from medarc_verifiers.cli.eval_identity import metadata_identity_fields
 from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub
 from medarc_verifiers.cli.process import PROCESS_DEFAULT_STATUS_FILTER, ProcessOptions, ProcessResult, run_process
@@ -2125,6 +2125,8 @@ def _load_env_export_map(root: Path | None) -> dict[str, EnvironmentExportConfig
             if env_cfg.export is None:
                 continue
             keys = {env_cfg.id, env_cfg.matrix_base_id}
+            if env_cfg.module and env_cfg.env_args:
+                keys.add(f"{env_cfg.module}::{generate_variant_id({'env_args': env_cfg.env_args})}")
             for key in filter(None, keys):
                 export_map[key] = env_cfg.export
 
diff --git a/medarc_verifiers/cli/process/pipeline.py b/medarc_verifiers/cli/process/pipeline.py
index 2c5c802c..4d14c723 100644
--- a/medarc_verifiers/cli/process/pipeline.py
+++ b/medarc_verifiers/cli/process/pipeline.py
@@ -338,13 +338,22 @@ def select_work_items(
 
 def _resolve_env_export(
     manifest_env_id: str | None,
+    variant_id: str | None,
     env_export_map: Mapping[str, EnvironmentExportConfig],
 ) -> EnvironmentExportConfig:
     if not manifest_env_id:
         return EnvironmentExportConfig()
+    if variant_id:
+        variant_key = f"{manifest_env_id}::{variant_id}"
+        if variant_key in env_export_map:
+            return env_export_map[variant_key]
     if manifest_env_id in env_export_map:
         return env_export_map[manifest_env_id]
     base_env_id, _ = rollout.derive_base_env_id(manifest_env_id)
+    if base_env_id and variant_id:
+        variant_base_key = f"{base_env_id}::{variant_id}"
+        if variant_base_key in env_export_map:
+            return env_export_map[variant_base_key]
     if base_env_id and base_env_id in env_export_map:
         return env_export_map[base_env_id]
     return EnvironmentExportConfig()
@@ -358,9 +367,14 @@ def _plan_selection_record(
     record: discovery.RunRecord,
     env_export_map: Mapping[str, EnvironmentExportConfig],
 ) -> SelectionRecord:
-    env_export = _resolve_env_export(record.manifest_env_id, env_export_map)
+    env_export = _resolve_env_export(record.manifest_env_id, None, env_export_map)
     combine_rollouts = bool(env_export.combine_rollouts)
     identity = metadata.resolve_run_identity(record, combine_rollouts=combine_rollouts)
+    variant_export = _resolve_env_export(record.manifest_env_id, identity.variant_id, env_export_map)
+    if variant_export != env_export:
+        env_export = variant_export
+        combine_rollouts = bool(env_export.combine_rollouts)
+        identity = metadata.resolve_run_identity(record, combine_rollouts=combine_rollouts)
     return SelectionRecord(
         record=record,
         identity=identity,
diff --git a/tests/test_cli/test_eval_identity.py b/tests/test_cli/test_eval_identity.py
index d524aeac..4700b89e 100644
--- a/tests/test_cli/test_eval_identity.py
+++ b/tests/test_cli/test_eval_identity.py
@@ -78,6 +78,29 @@ def test_duplicate_model_env_variant_can_use_sampling_args(tmp_path: Path) -> No
     ]
 
 
+def test_duplicate_model_env_variant_uses_only_differing_nested_keys(tmp_path: Path) -> None:
+    common_env_args = {
+        "judge_model": ["openai/gpt-5-mini", "google/gemini-3-flash-preview"],
+        "judge_base_url": "https://api.pinference.ai/api/v1",
+    }
+
+    plans = plan_eval_paths(
+        [
+            {"model": "gpt-5-mini", "env_id": "medrbench", "env_args": {**common_env_args, "task": "oracle"}},
+            {"model": "gpt-5-mini", "env_id": "medrbench", "env_args": {**common_env_args, "task": "1turn"}},
+            {"model": "gpt-5-mini", "env_id": "medrbench", "env_args": {**common_env_args, "task": "free_turn"}},
+        ],
+        output_root=tmp_path,
+    )
+
+    assert [plan.identity.variant_id for plan in plans] == [
+        "env_args.task-oracle",
+        "env_args.task-1turn",
+        "env_args.task-free_turn",
+    ]
+    assert plans[0].identity.variant_payload == {"env_args": {"task": "oracle"}}
+
+
 def test_duplicate_model_env_variant_canonicalizes_sampling_args(tmp_path: Path) -> None:
     plans = plan_eval_paths(
         [
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 215b2041..0d037d08 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -4,7 +4,7 @@
 import logging
 import sys
 from pathlib import Path
-from typing import Any
+from typing import Any, Mapping
 from types import SimpleNamespace
 
 import pytest
@@ -220,6 +220,58 @@ def fail_execute_jobs(*_args, **_kwargs):
     assert str(tmp_path / "evals" / "gpt-5-mini" / "medqa" / "baseline") in output
 
 
+def test_repository_smoke_toml_config_dry_runs(capsys: pytest.CaptureFixture[str]) -> None:
+    exit_code = main.main(["bench", "--config", "configs/eval/smoke.toml", "--dry-run"])
+
+    output = capsys.readouterr().out
+    assert exit_code == 0
+    assert "TOML Bench Dry Run" in output
+    assert "medqa" in output
+    assert "runs/evals/openai-gpt-4.1-mini/medqa" in output
+
+
+def test_repository_mcq_toml_config_dry_run_shows_ablation_variants(capsys: pytest.CaptureFixture[str]) -> None:
+    exit_code = main.main(["bench", "--config", "configs/eval/medarc-mcq.toml", "--dry-run", "--eval-index", "9"])
+
+    output = capsys.readouterr().out
+    assert exit_code == 0
+    assert "medqa" in output
+    assert "env_args.shuffle_seed-1618" in output
+    assert "runs/evals/openai-gpt-4.1-mini/medqa/env_args.shuffle_answers-true__env_args.shuffle_seed-1618" in output
+
+
+def test_repository_judge_toml_config_loads_expected_judge_args() -> None:
+    configs = main.load_toml_eval_configs("configs/eval/medarc-judge.toml")
+    healthbench = next(config for config in configs if config["env_id"] == "healthbench")
+    medrbench = [config for config in configs if config["env_id"] == "medrbench"]
+
+    assert healthbench["env_args"]["judge_model"] == "openai/gpt-5-mini"
+    assert healthbench["env_args"]["judge_base_url"] == "https://api.pinference.ai/api/v1"
+    assert {config["env_args"]["task"] for config in medrbench} == {"oracle", "1turn", "free_turn"}
+
+
+def test_repository_all_toml_contains_production_suite_entries() -> None:
+    def signature(config: Mapping[str, Any]) -> str:
+        return json.dumps(
+            {
+                "env_id": config["env_id"],
+                "env_args": config.get("env_args", {}),
+                "num_examples": config.get("num_examples"),
+                "rollouts_per_example": config.get("rollouts_per_example"),
+            },
+            sort_keys=True,
+        )
+
+    all_configs = {signature(config) for config in main.load_toml_eval_configs("configs/eval/medarc-all.toml")}
+    production_configs = {
+        signature(config)
+        for path in ("configs/eval/medarc-mcq.toml", "configs/eval/medarc-judge.toml")
+        for config in main.load_toml_eval_configs(path)
+    }
+
+    assert production_configs <= all_configs
+
+
 def test_toml_bench_dry_run_model_override(
     tmp_path: Path,
     capsys: pytest.CaptureFixture[str],
@@ -2147,6 +2199,32 @@ def fake_run(options, env_export_map):
     assert "demo-env" in env_map
 
 
+def test_load_env_export_map_adds_module_variant_keys(tmp_path: Path) -> None:
+    env_root = tmp_path / "envs"
+    env_root.mkdir()
+    (env_root / "medcalc_bench.yaml").write_text(
+        """
+        - id: medcalc_bench_tools
+          module: medcalc_bench
+          env_args:
+            version: verified
+            add_python_tool: true
+            add_calculator_tool: true
+          export:
+            extra_columns: [lower_bound, upper_bound]
+            answer_column: ground_truth
+        """,
+        encoding="utf-8",
+    )
+
+    env_map = main._load_env_export_map(env_root)
+
+    variant_key = "medcalc_bench::env_args.add_calculator_tool-true__env_args.add_python_tool-true__env_args.version-verified"
+    assert "medcalc_bench_tools" in env_map
+    assert variant_key in env_map
+    assert env_map[variant_key].answer_column == "ground_truth"
+
+
 def test_process_cli_applies_config_defaults(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     env_root = tmp_path / "envs"
     env_root.mkdir()
diff --git a/tests/test_cli/test_process_pipeline.py b/tests/test_cli/test_process_pipeline.py
index eed3739a..b03cd3f8 100644
--- a/tests/test_cli/test_process_pipeline.py
+++ b/tests/test_cli/test_process_pipeline.py
@@ -247,13 +247,16 @@ def _write_deterministic_eval(
     model_id: str = "gpt-mini",
     env_id: str = "demo-env",
     variant_id: str | None = None,
+    env_args: dict[str, object] | None = None,
+    result_row: dict[str, object] | None = None,
 ) -> Path:
     runs_dir = tmp_path / "runs"
     results_dir = runs_dir / "evals" / model_id / env_id
+    resolved_env_args = env_args or {}
     metadata = {
         "env_id": env_id,
         "model": model_id,
-        "env_args": {},
+        "env_args": resolved_env_args,
         "sampling_args": {},
         "num_examples": 1,
         "rollouts_per_example": 1,
@@ -261,7 +264,7 @@ def _write_deterministic_eval(
         "medarc_config_fingerprint_payload": {
             "env_id": env_id,
             "model": model_id,
-            "env_args": {},
+            "env_args": resolved_env_args,
             "sampling_args": {},
             "num_examples": 1,
             "rollouts_per_example": 1,
@@ -272,9 +275,10 @@ def _write_deterministic_eval(
     if variant_id is not None:
         results_dir = results_dir / variant_id
         metadata["variant_id"] = variant_id
-        metadata["variant_payload"] = {"env_args": {"shuffle_seed": 1618}}
+        metadata["variant_payload"] = {"env_args": resolved_env_args or {"shuffle_seed": 1618}}
     _write_json(results_dir / "metadata.json", metadata)
-    (results_dir / "results.jsonl").write_text(json.dumps({"example_id": "ex-1", "reward": 1.0}) + "\n", encoding="utf-8")
+    row = result_row or {"example_id": "ex-1", "reward": 1.0}
+    (results_dir / "results.jsonl").write_text(json.dumps(row) + "\n", encoding="utf-8")
     return runs_dir / "raw"
 
 
@@ -400,6 +404,50 @@ def test_run_process_excludes_deterministic_eval_variants_by_base_env(tmp_path:
     assert result.env_groups == []
 
 
+def test_run_process_applies_variant_export_overrides_to_deterministic_eval(tmp_path: Path) -> None:
+    variant_id = "env_args.add_calculator_tool-true__env_args.add_python_tool-true__env_args.version-verified"
+    env_args = {
+        "version": "verified",
+        "add_python_tool": True,
+        "add_calculator_tool": True,
+    }
+    runs_dir = _write_deterministic_eval(
+        tmp_path,
+        env_id="medcalc_bench",
+        variant_id=variant_id,
+        env_args=env_args,
+        result_row={
+            "example_id": "ex-1",
+            "ground_truth": "42",
+            "lower_bound": 40,
+            "upper_bound": 44,
+            "reward": 1.0,
+        },
+    )
+    output_dir = tmp_path / "processed"
+
+    result = run_process(
+        ProcessOptions(
+            runs_dir=runs_dir,
+            output_dir=output_dir,
+            dry_run=False,
+            max_workers=1,
+        ),
+        env_export_map={
+            f"medcalc_bench::{variant_id}": EnvironmentExportConfig(
+                extra_columns=["lower_bound", "upper_bound"],
+                answer_column="ground_truth",
+            )
+        },
+    )
+
+    table = pq.read_table(result.env_summaries[0].output_path)
+    row = table.to_pylist()[0]
+    assert row["answer"] == "42"
+    assert json.loads(row["extras"]) == {"lower_bound": 40, "upper_bound": 44}
+    assert "ground_truth" not in row
+
+
 def test_run_process_resolves_base_env_id(tmp_path: Path) -> None:
     runs_dir = _setup_run(tmp_path)
     options = ProcessOptions(

From dafc5b2cbebdcf6f5abe1019c4a9d815af850611 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 20:25:54 +0000
Subject: [PATCH 11/53] Remove YAML benchmark runner

---
 medarc_verifiers/cli/_config_loader.py    |  512 --------
 medarc_verifiers/cli/_eval_builder.py     |  306 -----
 medarc_verifiers/cli/_job_builder.py      |  204 ---
 medarc_verifiers/cli/_job_executor.py     |  584 ---------
 medarc_verifiers/cli/_manifest.py         |  837 +-----------
 medarc_verifiers/cli/_manifest_planner.py |  414 ------
 medarc_verifiers/cli/_manifest_tools.py   |  389 ------
 medarc_verifiers/cli/_schemas.py          |  307 +----
 medarc_verifiers/cli/_single_run.py       |   81 +-
 medarc_verifiers/cli/main.py              |  617 +--------
 tests/test_cli/test_config_loader.py      |  742 -----------
 tests/test_cli/test_eval_builder.py       |  166 ---
 tests/test_cli/test_job_builder.py        |  182 ---
 tests/test_cli/test_job_executor.py       |  680 ----------
 tests/test_cli/test_main.py               | 1435 ++-------------------
 tests/test_cli/test_manifest_planner.py   |  491 -------
 tests/test_cli/test_manifest_snapshot.py  |  452 -------
 tests/test_cli/test_manifest_tools.py     |  151 ---
 tests/test_cli/test_schemas.py            |   29 -
 19 files changed, 198 insertions(+), 8381 deletions(-)
 delete mode 100644 medarc_verifiers/cli/_config_loader.py
 delete mode 100644 medarc_verifiers/cli/_eval_builder.py
 delete mode 100644 medarc_verifiers/cli/_job_builder.py
 delete mode 100644 medarc_verifiers/cli/_job_executor.py
 delete mode 100644 medarc_verifiers/cli/_manifest_planner.py
 delete mode 100644 medarc_verifiers/cli/_manifest_tools.py
 delete mode 100644 tests/test_cli/test_config_loader.py
 delete mode 100644 tests/test_cli/test_eval_builder.py
 delete mode 100644 tests/test_cli/test_job_builder.py
 delete mode 100644 tests/test_cli/test_job_executor.py
 delete mode 100644 tests/test_cli/test_manifest_planner.py
 delete mode 100644 tests/test_cli/test_manifest_snapshot.py
 delete mode 100644 tests/test_cli/test_manifest_tools.py

diff --git a/medarc_verifiers/cli/_config_loader.py b/medarc_verifiers/cli/_config_loader.py
deleted file mode 100644
index 62097dea..00000000
--- a/medarc_verifiers/cli/_config_loader.py
+++ /dev/null
@@ -1,512 +0,0 @@
-"""Config loader utilities bridging OmegaConf YAML files and Pydantic schemas."""
-
-from __future__ import annotations
-
-import logging
-from collections.abc import Iterable, Mapping
-from itertools import product
-from pathlib import Path
-from typing import Any, Callable
-
-from omegaconf import OmegaConf
-
-from ._schemas import RESERVED_MATRIX_KEYS, EnvironmentConfigSchema, RunConfigSchema
-from .utils.endpoint_utils import EnvMetadataCache, load_env_metadata
-from .utils.env_args import validate_env_args_or_raise
-
-logger = logging.getLogger(__name__)
-DEFAULT_ENV_FILE_SUFFIXES = (".yaml", ".yml")
-
-# Scalar fields (non-env_args) that may be overridden by matrix combos.
-SCALAR_FIELD_NAMES = {
-    name for name in EnvironmentConfigSchema.model_fields if name not in RESERVED_MATRIX_KEYS and name != "env_args"
-}
-
-
-class ConfigFormatError(ValueError):
-    """Raised when a configuration file cannot be interpreted as a mapping."""
-
-
-def _load_raw_config(path: Path) -> Any:
-    """Load and resolve an OmegaConf configuration file."""
-    cfg = OmegaConf.load(path)
-    OmegaConf.resolve(cfg)
-    return OmegaConf.to_container(cfg, resolve=True)
-
-
-def load_run_config(path: str | Path, *, env_default_root: str | Path | None = None) -> RunConfigSchema:
-    """Load a run configuration file into the top-level schema."""
-    # Loader responsibilities:
-    # 1. Read and resolve OmegaConf input (supporting includes and defaults).
-    # 2. Normalize models/envs/jobs into canonical mappings or lists.
-    # 3. Let Pydantic schemas handle structural validation and coercion.
-    # 4. Expand environment matrices into concrete variants.
-    # 5. Perform lightweight env_args validation using environment metadata.
-    resolved_path = Path(path).expanduser().resolve()
-    env_default_root_path = Path(env_default_root).expanduser().resolve() if env_default_root is not None else None
-    data = _load_raw_config(resolved_path)
-
-    if not isinstance(data, dict):
-        msg = f"Configuration root must be a mapping, got {type(data).__name__}."
-        raise ConfigFormatError(msg)
-
-    if "envs" not in data or data["envs"] in (None, [], {}):
-        if env_default_root_path is None:
-            raise ConfigFormatError(
-                "Configuration must define 'envs' or --env-config-root must supply a discovery directory."
-            )
-        data = dict(data)
-        data["envs"] = str(env_default_root_path)
-
-    data = _normalize_config_fields(data, base_dir=resolved_path.parent, env_default_root=env_default_root_path)
-
-    run_config = RunConfigSchema(**data)
-    expanded_envs = _expand_env_matrices(run_config.envs)
-    _validate_env_args(expanded_envs.values())
-    return run_config.model_copy(update={"envs": expanded_envs})
-
-
-def _expand_env_matrices(envs: dict[str, EnvironmentConfigSchema]) -> dict[str, EnvironmentConfigSchema]:
-    scalar_fields = SCALAR_FIELD_NAMES
-    expanded: dict[str, EnvironmentConfigSchema] = {}
-    for env_id, env in envs.items():
-        env_with_id = env if env.id else env.model_copy(update={"id": env_id})
-        for variant in _expand_single_environment(env_with_id, scalar_fields):
-            if variant.id in expanded:
-                raise ValueError(f"environment '{variant.id}' defined multiple times after expansion.")
-            expanded[variant.id] = variant
-    return expanded
-
-
-def _expand_single_environment(
-    env: EnvironmentConfigSchema,
-    scalar_fields: Iterable[str],
-) -> list[EnvironmentConfigSchema]:
-    if not env.matrix:
-        return [
-            env.model_copy(
-                update={
-                    "env_args": dict(env.env_args),
-                    "matrix": None,
-                    "matrix_exclude": None,
-                    "matrix_id_format": None,
-                }
-            )
-        ]
-
-    matrix = env.matrix
-    base_id = env.id
-    if not base_id:
-        raise ValueError("environment entries must specify an id.")
-
-    matrix_keys = list(matrix.keys())
-    matrix_values = [matrix[key] for key in matrix_keys]
-    variants: list[EnvironmentConfigSchema] = []
-    seen_ids: set[str] = set()
-
-    base_env_args = dict(env.env_args)
-    module_name = env.module or env.id  # prefer explicit module override when present
-
-    exclude_patterns = env.matrix_exclude or []
-
-    combos: Iterable[tuple[Any, ...]]
-    if matrix_keys:
-        combos = product(*matrix_values)
-    else:
-        combos = [()]
-
-    for combo_values in combos:
-        combo = dict(zip(matrix_keys, combo_values))
-        if any(_matches_matrix_pattern(combo, pattern) for pattern in exclude_patterns):
-            continue
-
-        env_args = dict(base_env_args)
-        updates: dict[str, Any] = {}
-        for key, value in combo.items():
-            if value is None:
-                continue
-            if key in scalar_fields:
-                updates[key] = value
-            else:
-                env_args[key] = value
-
-        variant_id = _build_matrix_variant_id(base_id, combo, env.matrix_id_format)
-        if variant_id in seen_ids:
-            raise ValueError(f"environment '{base_id}' matrix generated duplicate id '{variant_id}'.")
-        seen_ids.add(variant_id)
-
-        variant_data = env.model_dump()
-        variant_data.update(updates)
-        variant_data["id"] = variant_id
-        variant_data["env_args"] = env_args
-        variant_data["module"] = module_name
-        variant_data["matrix"] = None
-        variant_data["matrix_exclude"] = None
-        variant_data["matrix_id_format"] = None
-        variant_data["matrix_base_id"] = base_id
-
-        variants.append(EnvironmentConfigSchema(**variant_data))
-
-    if not variants:
-        raise ValueError(f"environment '{base_id}' matrix produced no variants after exclusions.")
-
-    return variants
-
-
-def _normalize_config_fields(
-    data: Mapping[str, Any], *, base_dir: Path, env_default_root: Path | None
-) -> dict[str, Any]:
-    """Apply include expansion and shape normalization before schema validation."""
-
-    normalized = dict(data)
-
-    if "models" in normalized:
-        normalized["models"] = _normalize_models_field(normalized["models"], base_dir=base_dir)
-
-    if "envs" in normalized:
-        normalized["envs"] = _normalize_envs_field(
-            normalized["envs"],
-            base_dir=base_dir,
-            env_default_root=env_default_root,
-        )
-
-    if "jobs" in normalized:
-        normalized["jobs"] = _normalize_jobs_field(normalized["jobs"], base_dir=base_dir)
-
-    return normalized
-
-
-def _normalize_models_field(value: Any, *, base_dir: Path) -> dict[str, Any]:
-    return _normalize_section(
-        value,
-        base_dir=base_dir,
-        context="models",
-        entry_description="models",
-        default_id_from_key=True,
-        allow_duplicate_ids=False,
-        env_default_root=None,
-    )
-
-
-def _normalize_envs_field(value: Any, *, base_dir: Path, env_default_root: Path | None) -> dict[str, Any]:
-    # Env configs intentionally allow duplicate "id" entries so multiple blocks can
-    # share a common base id (e.g., m_arc + rollout variants). We de-duplicate only
-    # the internal map key while preserving each entry's explicit "id".
-    return _normalize_section(
-        value,
-        base_dir=base_dir,
-        context="envs",
-        entry_description="envs",
-        default_id_from_key=True,
-        allow_duplicate_ids=True,
-        duplicate_key_fn=_make_duplicate_key,
-        env_default_root=env_default_root,
-    )
-
-
-def _make_duplicate_key(base: str, count: int, existing: Mapping[str, Any]) -> str:
-    suffix = count
-    while True:
-        candidate = f"{base}__dup__{suffix}"
-        if candidate not in existing:
-            return candidate
-        suffix += 1
-
-
-def _normalize_jobs_field(value: Any, *, base_dir: Path) -> list[dict[str, Any]]:
-    entries = _collect_entries(
-        value,
-        base_dir=base_dir,
-        context="jobs",
-        entry_description="jobs",
-        env_default_root=None,
-    )
-    return [_adapt_job_entry(entry) for entry in entries]
-
-
-def _normalize_section(
-    value: Any,
-    *,
-    base_dir: Path,
-    context: str,
-    entry_description: str,
-    default_id_from_key: bool,
-    allow_duplicate_ids: bool,
-    duplicate_key_fn: Callable[[str, int, Mapping[str, Any]], str] | None = None,
-    env_default_root: Path | None,
-) -> dict[str, Any]:
-    """Normalize section entries (models/envs) with shared include handling."""
-    if value is None:
-        return {}
-
-    normalized: dict[str, Any] = {}
-
-    def _add_entry(
-        entry: Mapping[str, Any], *, key_hint: str | None = None, count_map: dict[str, int] | None = None
-    ) -> None:
-        if not isinstance(entry, Mapping):
-            raise ValueError(f"{context} entries must be mappings.")
-        adapted = dict(entry)
-        item_id = adapted.get("id") or key_hint
-        if not item_id:
-            raise ValueError(f"{context} entries must include an 'id'.")
-        key = str(item_id)
-        if count_map is not None:
-            count_map.setdefault(key, 1)
-        if key in normalized:
-            if not allow_duplicate_ids:
-                raise ValueError(f"Duplicate {entry_description.rstrip('s')} id '{key}' in configuration.")
-            if duplicate_key_fn is None:
-                raise ValueError(f"Duplicate {entry_description.rstrip('s')} id '{key}' in configuration.")
-            # Env entries can intentionally repeat ids to group variants under a common
-            # base id; we only de-duplicate the internal map key, not the entry's id.
-            counter = 2
-            if count_map is not None:
-                counter = count_map.get(key, 1) + 1
-                count_map[key] = counter
-            key = duplicate_key_fn(key, counter, normalized)
-        normalized[key] = adapted
-
-    if isinstance(value, Mapping) and all(isinstance(v, Mapping) for v in value.values()):
-        for key, entry in value.items():
-            adapted = dict(entry)
-            if default_id_from_key and "id" not in adapted:
-                adapted["id"] = str(key)
-            _add_entry(adapted)
-        return normalized
-
-    entries = _collect_entries(
-        value,
-        base_dir=base_dir,
-        context=context,
-        entry_description=entry_description,
-        env_default_root=env_default_root,
-    )
-    duplicate_counts: dict[str, int] = {}
-    for entry in entries:
-        _add_entry(entry, count_map=duplicate_counts)
-    return normalized
-
-
-def _collect_entries(
-    source: Any,
-    *,
-    base_dir: Path,
-    context: str,
-    entry_description: str,
-    env_default_root: Path | None,
-) -> list[dict[str, Any]]:
-    if source is None:
-        return []
-    if isinstance(source, Mapping):
-        return [dict(source)]
-    if isinstance(source, (str, Path)):
-        return _collect_entries_from_path(
-            source,
-            base_dir=base_dir,
-            context=context,
-            entry_description=entry_description,
-            env_default_root=env_default_root,
-        )
-    if isinstance(source, list):
-        entries: list[dict[str, Any]] = []
-        for index, item in enumerate(source):
-            item_context = f"{context}[{index}]"
-            if isinstance(item, Mapping):
-                entries.append(dict(item))
-            elif isinstance(item, (str, Path)):
-                entries.extend(
-                    _collect_entries_from_path(
-                        item,
-                        base_dir=base_dir,
-                        context=item_context,
-                        entry_description=entry_description,
-                        env_default_root=env_default_root,
-                    )
-                )
-            else:
-                raise ValueError(f"{item_context} must be a mapping or path.")
-        return entries
-    raise ValueError(f"{context} must be provided as a mapping, list, or path.")
-
-
-def _collect_entries_from_path(
-    source: str | Path,
-    *,
-    base_dir: Path,
-    context: str,
-    entry_description: str,
-    env_default_root: Path | None,
-) -> list[dict[str, Any]]:
-    path = _resolve_include_path(source, base_dir=base_dir)
-    if not path.exists() and entry_description == "envs":
-        fallback = _resolve_default_env_path(source, base_dir=base_dir, env_default_root=env_default_root)
-        if fallback is not None:
-            path = fallback
-    if not path.exists():
-        raise FileNotFoundError(f"{context} path '{path}' does not exist.")
-    if path.is_dir():
-        if entry_description not in {"envs", "jobs"}:
-            msg = f"{context} path '{path}' must be a file. Directory includes are only supported for envs and jobs."
-            raise ValueError(msg)
-        entries: list[dict[str, Any]] = []
-        for child in sorted(path.iterdir()):
-            if child.is_file() and child.suffix.lower() in {".yaml", ".yml"}:
-                entries.extend(
-                    _collect_entries_from_path(
-                        child,
-                        base_dir=child.parent,
-                        context=f"{context}/{child.name}",
-                        entry_description=entry_description,
-                        env_default_root=env_default_root,
-                    )
-                )
-        return entries
-
-    loaded = _load_raw_config(path)
-    if isinstance(loaded, Mapping):
-        if not loaded:
-            return []
-        if not all(isinstance(v, Mapping) for v in loaded.values()):
-            msg = f"{context} included {entry_description} must be a mapping of id→mapping or a list of mappings."
-            raise ValueError(msg)
-        entries: list[dict[str, Any]] = []
-        for key, value in loaded.items():
-            entry = dict(value)
-            entry.setdefault("id", str(key))
-            entries.append(entry)
-        return entries
-    if isinstance(loaded, list):
-        entries: list[dict[str, Any]] = []
-        for index, item in enumerate(loaded):
-            if not isinstance(item, Mapping):
-                raise ValueError(f"{context}[{index}] in included {entry_description} must be a mapping.")
-            entries.append(dict(item))
-        return entries
-    if loaded is None:
-        return []
-    raise ValueError(f"{context} included {entry_description} must be a mapping of id→mapping or a list of mappings.")
-
-
-def _resolve_include_path(source: str | Path, *, base_dir: Path) -> Path:
-    path = Path(source).expanduser()
-    if not path.is_absolute():
-        path = (base_dir / path).resolve()
-    else:
-        path = path.resolve()
-    return path
-
-
-def _resolve_default_env_path(source: str | Path, *, base_dir: Path, env_default_root: Path | None) -> Path | None:
-    raw_source = Path(source)
-    if raw_source.is_absolute() or env_default_root is None:
-        return None
-
-    normalized = env_default_root if env_default_root.is_absolute() else env_default_root.resolve()
-    candidates = _candidate_env_paths(normalized, raw_source)
-    for candidate in candidates:
-        if candidate.exists():
-            return candidate
-    return None
-
-
-def _candidate_env_paths(root: Path, relative_entry: Path) -> list[Path]:
-    base = root / relative_entry
-    candidates = [base]
-    if not relative_entry.suffix:
-        for suffix in DEFAULT_ENV_FILE_SUFFIXES:
-            candidates.append((root / relative_entry).with_suffix(suffix))
-    return [candidate.resolve() for candidate in candidates]
-
-
-def _adapt_job_entry(entry: Any) -> Any:
-    if not isinstance(entry, dict):
-        return entry
-
-    normalized = dict(entry)
-    for key in ("env_args", "sampling_args"):
-        value = normalized.get(key)
-        if value is None:
-            normalized[key] = {}
-        elif isinstance(value, dict):
-            normalized[key] = dict(value)
-        else:
-            raise ValueError(f"job {key} must be a mapping when provided.")
-
-    return normalized
-
-
-def _build_matrix_variant_id(
-    base_id: str,
-    combo: dict[str, Any],
-    id_format: str | None,
-) -> str:
-    format_values = {key: _format_matrix_value(value) for key, value in combo.items()}
-    format_values["base"] = base_id
-
-    if id_format:
-        try:
-            variant_id = id_format.format(**format_values)
-        except KeyError as exc:  # noqa: F841
-            missing = exc.args[0]
-            raise ValueError(f"environment '{base_id}' matrix_id_format references unknown key '{missing}'.") from exc
-    else:
-        suffix_parts = [f"{key}-{_format_matrix_value(value)}" for key, value in combo.items() if value is not None]
-        variant_id = base_id if not suffix_parts else f"{base_id}-{'-'.join(suffix_parts)}"
-
-    if not isinstance(variant_id, str) or not variant_id:
-        raise ValueError(f"environment '{base_id}' matrix generated an invalid id '{variant_id!r}'.")
-
-    return variant_id
-
-
-def _format_matrix_value(value: Any) -> str:
-    if value is None:
-        return "base"
-    if isinstance(value, bool):
-        return "true" if value else "false"
-    return str(value)
-
-
-def _matches_matrix_pattern(combo: dict[str, Any], pattern: dict[str, Any]) -> bool:
-    return all(combo.get(key) == value for key, value in pattern.items())
-
-
-def _validate_env_args(envs: Iterable[EnvironmentConfigSchema]) -> None:
-    """Validate env_args at config load time (lenient - no required param enforcement).
-
-    This is the first of two validation phases:
-    Phase 1 (here): Check for unknown parameters and type mismatches
-                    Do NOT enforce required parameters (allow partial configs)
-    Phase 2 (executor): Enforce required parameters after CLI overrides applied
-
-    Why two phases?
-    - Matrix expansion can create variants with different required params
-    - Users might load a config with 100 jobs but only run 5 with --job-id
-    - Failing at load time for jobs we won't run would be frustrating
-
-    This phase catches obvious mistakes (typos, wrong types) early while deferring
-    required parameter checks until execution when we know what will actually run.
-    """
-    cache: EnvMetadataCache = {}
-    for env in envs:
-        env_module = env.module or env.matrix_base_id or env.id
-        if not env_module:
-            continue
-        try:
-            metadata = load_env_metadata(env_module, cache=cache)
-        except ImportError as exc:
-            logger.warning("Skipping env_args validation for '%s': %s", env_module, exc)
-            continue
-        # Phase 1 validation: unknown/type checks only; do not enforce requireds at load time.
-        validate_env_args_or_raise(
-            env_module,
-            env.env_args,
-            metadata=metadata,
-            metadata_cache=cache,
-            allow_unknown=False,
-            enforce_required=False,  # Deferred to execution time
-        )
-
-
-__all__ = ["ConfigFormatError", "load_run_config"]
diff --git a/medarc_verifiers/cli/_eval_builder.py b/medarc_verifiers/cli/_eval_builder.py
deleted file mode 100644
index 94462c8c..00000000
--- a/medarc_verifiers/cli/_eval_builder.py
+++ /dev/null
@@ -1,306 +0,0 @@
-"""Shared helpers for building client and eval configs."""
-
-from __future__ import annotations
-
-import logging
-from pathlib import Path
-from typing import Any, Callable, Mapping
-
-from verifiers.types import ClientConfig, EndpointClientConfig, EvalConfig
-
-from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, ModelConfigSchema
-from medarc_verifiers.cli.utils.endpoint_utils import (
-    EndpointRegistry,
-    EnvMetadataCache,
-    load_env_metadata,
-    resolve_model_endpoint,
-)
-from medarc_verifiers.cli.utils.env_args import merge_env_args
-from medarc_verifiers.cli.utils.shared import (
-    DEFAULT_BATCH_MAX_CONCURRENT,
-    merge_sampling_overrides,
-    normalize_headers,
-    resolve_env_identifier,
-    resolve_max_concurrent,
-)
-from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL, prime_inference_overrides
-
-logger = logging.getLogger(__name__)
-
-
-def build_client_config(
-    model_cfg: ModelConfigSchema,
-    *,
-    endpoints: EndpointRegistry,
-    default_api_key_var: str,
-    default_api_key_var_explicit: bool,
-    default_api_base_url: str,
-    api_base_url_override: str | None,
-    http_max_retries_override: int | None,
-    timeout_override: float | None,
-    headers: list[str] | dict[str, str] | None,
-) -> tuple[str, ClientConfig, dict[str, Any]]:
-    """Resolve model alias + endpoint settings into a ClientConfig.
-
-    Returns:
-        A tuple of (resolved_model, client_config, sampling_overrides).
-        - resolved_model: The resolved model identifier
-        - client_config: The ClientConfig for API calls
-        - sampling_overrides: Prime Inference sampling args to merge (e.g., usage reporting)
-    """
-    normalized_headers = normalize_headers(headers if headers is not None else model_cfg.headers)
-    model_alias = model_cfg.model or model_cfg.id
-    if not model_alias:
-        raise ValueError("Model entries must define 'id' or 'model'.")
-
-    model_api_key_var_explicit = model_cfg.api_key_var is not None
-    default_key_var = model_cfg.api_key_var if model_api_key_var_explicit else default_api_key_var
-    default_base_url = model_cfg.api_base_url or default_api_base_url
-    endpoint_group = endpoints.get(model_alias, [])
-    resolved_model, api_key_var, api_base_url = resolve_model_endpoint(
-        model_alias,
-        endpoints,
-        default_key_var=default_key_var,
-        default_base_url=default_base_url,
-    )
-    if api_base_url_override is not None:
-        logger.debug("Forcing api_base_url override for model '%s'.", model_alias)
-        api_base_url = api_base_url_override
-
-    # Get Prime Inference-specific overrides (headers + sampling args).
-    prime_headers, sampling_overrides = prime_inference_overrides(api_base_url)
-
-    effective_api_key_var = api_key_var
-    # MedARC defaults to OPENAI_API_KEY. For Prime URLs, force PRIME_API_KEY only when
-    # neither model config nor CLI explicitly selected a key var, and no endpoint
-    # registry group resolved this alias (which may intentionally provide a custom key var).
-    if (
-        api_base_url == PRIME_INFERENCE_URL
-        and not model_api_key_var_explicit
-        and not default_api_key_var_explicit
-        and not endpoint_group
-    ):
-        effective_api_key_var = "PRIME_API_KEY"
-
-    # Merge headers: user-provided headers take precedence over Prime auto-detected
-    merged_headers = {**prime_headers, **(normalized_headers or {})}
-
-    endpoint_configs: list[EndpointClientConfig] = []
-    if api_base_url_override is None and len(endpoint_group) > 1:
-        first_entry = endpoint_group[0]
-        expected_model = first_entry.get("model", model_alias)
-        expected_key = first_entry.get("key", default_key_var)
-        for idx, endpoint in enumerate(endpoint_group[1:], start=1):
-            entry_model = endpoint.get("model", model_alias)
-            entry_key = endpoint.get("key", default_key_var)
-            if entry_model != expected_model or entry_key != expected_key:
-                raise ValueError(
-                    "Endpoint replicas for "
-                    f"'{model_alias}' must agree on 'model' and 'key'; "
-                    f"variant 0 has model={expected_model!r}, key={expected_key!r}, "
-                    f"variant {idx} has model={entry_model!r}, key={entry_key!r}."
-                )
-        endpoint_configs = [
-            EndpointClientConfig(
-                api_key_var=effective_api_key_var,
-                api_base_url=endpoint["url"],
-                extra_headers=merged_headers,
-            )
-            for endpoint in endpoint_group
-        ]
-
-    client_kwargs: dict[str, Any] = {
-        "api_key_var": effective_api_key_var,
-        "api_base_url": api_base_url,
-        "endpoint_configs": endpoint_configs,
-        "extra_headers": merged_headers,
-    }
-    timeout = timeout_override if timeout_override is not None else model_cfg.timeout
-    if timeout is not None:
-        client_kwargs["timeout"] = timeout
-    if model_cfg.max_connections is not None:
-        client_kwargs["max_connections"] = model_cfg.max_connections
-    if model_cfg.max_keepalive_connections is not None:
-        client_kwargs["max_keepalive_connections"] = model_cfg.max_keepalive_connections
-    if http_max_retries_override is not None:
-        client_kwargs["max_retries"] = http_max_retries_override
-    elif model_cfg.max_retries is not None:
-        client_kwargs["max_retries"] = model_cfg.max_retries
-
-    return resolved_model, ClientConfig(**client_kwargs), sampling_overrides
-
-
-def build_eval_config(
-    *,
-    job_label: str | None,
-    model_cfg: ModelConfigSchema,
-    env_cfg: EnvironmentConfigSchema,
-    env_args: Mapping[str, Any],
-    sampling_args: Mapping[str, Any],
-    cli_env_args: Mapping[str, Any] | None,
-    cli_sampling_args: Mapping[str, Any] | None,
-    resolved_model: str,
-    client_config: ClientConfig,
-    env_dir: Path,
-    max_concurrent_override: int | None,
-    max_concurrent_generation: int | None,
-    max_concurrent_scoring: int | None,
-    rollout_max_retries: int = 0,
-    resume_path: Path | None = None,
-    default_max_concurrent: int = DEFAULT_BATCH_MAX_CONCURRENT,
-    save_results: bool = True,
-    save_to_hf_hub: bool = False,
-    hf_hub_dataset_name: str | None = None,
-    verbose: bool = False,
-    env_metadata_cache: EnvMetadataCache | None = None,
-    env_metadata_loader: Callable[..., Any] = load_env_metadata,
-    enforce_required_env_args: bool = True,
-    allow_unknown_env_args: bool = False,
-) -> EvalConfig:
-    """Assemble EvalConfig with shared env/sampling override handling."""
-    env_id = resolve_env_identifier(env_cfg)
-    try:
-        metadata = _call_env_metadata_loader(env_metadata_loader, env_id, env_metadata_cache)
-    except ImportError as exc:
-        logger.warning("Skipping env_args validation for '%s': %s", env_id, exc)
-        metadata = None
-
-    merged_env_args = merge_env_args(
-        env_id,
-        sources=[env_args, cli_env_args or {}],
-        metadata=metadata,
-        metadata_cache=env_metadata_cache,
-        allow_unknown=allow_unknown_env_args,
-        enforce_required=enforce_required_env_args,
-        verbose=verbose,
-    )
-
-    merged_sampling = dict(sampling_args)
-    merged_sampling = merge_sampling_overrides(merged_sampling, cli_sampling_args)
-
-    _warn_deprecated_eval_knobs(
-        env_cfg=env_cfg,
-        env_id=env_id,
-        job_label=job_label,
-        max_concurrent_generation=max_concurrent_generation,
-        max_concurrent_scoring=max_concurrent_scoring,
-    )
-
-    max_concurrent = resolve_max_concurrent(
-        cli_override=max_concurrent_override,
-        model_max=model_cfg.max_concurrent,
-        env_max=env_cfg.max_concurrent,
-        default_max=default_max_concurrent,
-    )
-    effective_save_results = save_results
-    if resume_path is not None and not effective_save_results:
-        logger.warning("Enabling save_results (required for resume support).")
-        effective_save_results = True
-
-    verbose_flag = env_cfg.verbose if env_cfg.verbose is not None else verbose
-    state_columns = list(env_cfg.state_columns) if env_cfg.state_columns else None
-    eval_config_fields = _pydantic_field_names(EvalConfig)
-
-    eval_kwargs: dict[str, Any] = {
-        "env_id": env_id,
-        "env_args": merged_env_args,
-        "env_dir_path": str(env_dir),
-        "model": resolved_model,
-        "client_config": client_config,
-        "sampling_args": merged_sampling,
-        "num_examples": env_cfg.num_examples,
-        "rollouts_per_example": env_cfg.rollouts_per_example,
-        "max_concurrent": max_concurrent,
-        "verbose": verbose_flag,
-        "state_columns": state_columns,
-        "save_results": effective_save_results,
-        "save_to_hf_hub": save_to_hf_hub,
-        "hf_hub_dataset_name": hf_hub_dataset_name,
-    }
-    if "max_retries" in eval_config_fields:
-        eval_kwargs["max_retries"] = rollout_max_retries
-    if "resume_path" in eval_config_fields:
-        eval_kwargs["resume_path"] = resume_path
-
-    independent_scoring = getattr(env_cfg, "independent_scoring", None)
-    interleave_scoring = getattr(env_cfg, "interleave_scoring", None)
-
-    if interleave_scoring is not None:
-        raise ValueError(
-            f"Environment '{env_id}' uses interleave_scoring, which is no longer supported; use independent_scoring."
-        )
-
-    if "independent_scoring" in eval_config_fields:
-        if independent_scoring is None:
-            independent_scoring = True
-        eval_kwargs["independent_scoring"] = bool(independent_scoring)
-    elif independent_scoring is not None:
-        logger.warning(
-            "Environment '%s' set independent_scoring=%s, but installed verifiers does not support it; ignoring.",
-            env_id,
-            independent_scoring,
-        )
-
-    if "extra_env_kwargs" in eval_config_fields:
-        extra_env_kwargs = getattr(env_cfg, "extra_env_kwargs", None)
-        if extra_env_kwargs is not None:
-            eval_kwargs["extra_env_kwargs"] = dict(extra_env_kwargs)
-
-    return EvalConfig(**eval_kwargs)
-
-
-__all__ = ["build_client_config", "build_eval_config"]
-
-
-def _call_env_metadata_loader(loader: Callable[..., Any], env_id: str, cache: EnvMetadataCache | None) -> Any:
-    """Invoke env metadata loader tolerant of positional-only stubs used in tests."""
-    try:
-        return loader(env_id, cache=cache)
-    except TypeError:
-        return loader(env_id)
-
-
-def _pydantic_field_names(model_type: type[Any]) -> set[str]:
-    fields = getattr(model_type, "model_fields", None)
-    if isinstance(fields, dict):
-        return set(fields.keys())
-    fields = getattr(model_type, "__fields__", None)
-    if isinstance(fields, dict):
-        return set(fields.keys())
-    return set()
-
-
-def _warn_deprecated_eval_knobs(
-    *,
-    env_cfg: Any,
-    env_id: str,
-    job_label: str | None,
-    max_concurrent_generation: int | None,
-    max_concurrent_scoring: int | None,
-) -> None:
-    env_fields_set = set(getattr(env_cfg, "model_fields_set", set()))
-
-    deprecated_env_knobs: list[str] = []
-    if "save_every" in env_fields_set and getattr(env_cfg, "save_every", None) is not None:
-        deprecated_env_knobs.append("save_every")
-    if "print_results" in env_fields_set:
-        deprecated_env_knobs.append("print_results")
-    if deprecated_env_knobs:
-        logger.warning(
-            "Environment '%s' sets deprecated eval knob(s): %s. These options are ignored.",
-            env_id,
-            ", ".join(sorted(deprecated_env_knobs)),
-        )
-
-    deprecated_concurrency_knobs: list[str] = []
-    if max_concurrent_generation is not None:
-        deprecated_concurrency_knobs.append("max_concurrent_generation")
-    if max_concurrent_scoring is not None:
-        deprecated_concurrency_knobs.append("max_concurrent_scoring")
-    if deprecated_concurrency_knobs:
-        label = job_label or env_id
-        logger.warning(
-            "Job '%s' sets deprecated eval knob(s): %s. These options are ignored.",
-            label,
-            ", ".join(sorted(deprecated_concurrency_knobs)),
-        )
diff --git a/medarc_verifiers/cli/_job_builder.py b/medarc_verifiers/cli/_job_builder.py
deleted file mode 100644
index f04a4c6e..00000000
--- a/medarc_verifiers/cli/_job_builder.py
+++ /dev/null
@@ -1,204 +0,0 @@
-"""Resolve validated run configurations into executable job definitions."""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Any, Iterable
-
-from ._schemas import EnvironmentConfigSchema, ModelConfigSchema, RunConfigSchema
-from .utils.env_args import merge_env_args
-from .utils.shared import compute_checksum, slugify
-
-
-@dataclass(slots=True)
-class ResolvedJob:
-    """Executable job produced from a run configuration."""
-
-    job_id: str
-    name: str
-    model: ModelConfigSchema
-    env: EnvironmentConfigSchema
-    env_args: dict[str, Any]
-    sampling_args: dict[str, Any]
-    sleep: float | None = None
-
-
-def build_jobs(config: RunConfigSchema) -> list[ResolvedJob]:
-    """Expand a validated run configuration into concrete jobs."""
-    matrix_index = _build_matrix_index(config.envs.values())
-    models: dict[str, ModelConfigSchema] = config.models
-    resolved: list[ResolvedJob] = []
-    used_ids: set[str] = set()
-
-    for job_cfg in config.jobs:
-        model_id, model = _resolve_model(job_cfg.model, models)
-        if model.id is None:
-            model = model.model_copy(update={"id": model_id})
-            models[model_id] = model
-        env_targets = _coerce_iterable(job_cfg.env)
-        for env_target in env_targets:
-            for env_id in _resolve_env_ids(env_target, config.envs, matrix_index):
-                env = config.envs[env_id]
-                if env.id is None:
-                    env = env.model_copy(update={"id": env_id})
-                    config.envs[env_id] = env
-                env_args = _compose_env_args(env, model, job_cfg.env_args)
-                sampling_args = _compose_sampling_args(model.sampling_args, job_cfg.sampling_args)
-                name = job_cfg.name or f"{model_id}-{env.id}"
-                job_id = _build_job_id(
-                    model_id=model_id,
-                    env_id=env.id,
-                    job_name=job_cfg.name,
-                    env_overrides=job_cfg.env_args,
-                    sampling_overrides=job_cfg.sampling_args,
-                    used_ids=used_ids,
-                )
-                used_ids.add(job_id)
-                resolved.append(
-                    ResolvedJob(
-                        job_id=job_id,
-                        name=name,
-                        model=model,
-                        env=env,
-                        env_args=env_args,
-                        sampling_args=sampling_args,
-                        sleep=job_cfg.sleep,
-                    )
-                )
-
-    return resolved
-
-
-def _resolve_model(
-    model_ref: str | dict[str, Any],
-    models: dict[str, ModelConfigSchema],
-) -> tuple[str, ModelConfigSchema]:
-    if isinstance(model_ref, str):
-        model = models.get(model_ref)
-        if model is None:
-            raise ValueError(f"Job references unknown model '{model_ref}'.")
-        return model_ref, model
-
-    inline = ModelConfigSchema(**model_ref)
-    if not inline.id:
-        raise ValueError("Inline model definitions must include an 'id'.")
-    existing = models.get(inline.id)
-    if existing is not None and existing != inline:
-        raise ValueError(f"Conflicting inline model definition for id '{inline.id}'.")
-    models[inline.id] = inline
-    return inline.id, inline
-
-
-def _resolve_env_ids(
-    env_ref: str,
-    envs: dict[str, EnvironmentConfigSchema],
-    matrix_index: dict[str, list[str]],
-) -> list[str]:
-    candidates: list[str] = []
-    if env_ref in envs:
-        candidates.append(env_ref)
-    if env_ref in matrix_index:
-        candidates.extend(matrix_index[env_ref])
-    if not candidates:
-        raise ValueError(f"Job references unknown environment '{env_ref}'.")
-    # Preserve order while removing duplicates
-    unique: list[str] = []
-    seen: set[str] = set()
-    for env_id in candidates:
-        if env_id not in seen:
-            unique.append(env_id)
-            seen.add(env_id)
-    return unique
-
-
-def _resolve_env_override(model: ModelConfigSchema, env: EnvironmentConfigSchema) -> dict[str, Any] | None:
-    """Resolve env-specific overrides from model config.
-
-    Tries in order:
-    1. env.id (exact match for the environment identifier)
-    2. env.matrix_base_id (for matrix-expanded variants like 'medqa-seed-1')
-    3. env.module (fallback for module-based lookup)
-
-    Returns the override dict if found, None otherwise.
-    """
-    for key in (env.id, env.matrix_base_id, env.module):
-        if key and key in model.env_overrides:
-            return model.env_overrides[key]
-    return None
-
-
-def _compose_env_args(
-    env: EnvironmentConfigSchema,
-    model: ModelConfigSchema,
-    job_env_args: dict[str, Any],
-) -> dict[str, Any]:
-    """Compose env_args up to job overrides (CLI is applied later)."""
-    return merge_env_args(
-        None,
-        sources=[
-            env.env_args,
-            model.env_args,
-            _resolve_env_override(model, env) or {},
-            job_env_args,
-        ],
-    )
-
-
-def _compose_sampling_args(
-    model_sampling: dict[str, Any],
-    job_sampling: dict[str, Any],
-) -> dict[str, Any]:
-    merged = dict(model_sampling)
-    merged.update(job_sampling)
-    return merged
-
-
-def _build_matrix_index(envs: Iterable[EnvironmentConfigSchema]) -> dict[str, list[str]]:
-    index: dict[str, list[str]] = {}
-    for env in envs:
-        base_id = env.matrix_base_id
-        if base_id:
-            index.setdefault(base_id, []).append(env.id)
-    return index
-
-
-def _coerce_iterable(value: str | list[str]) -> list[str]:
-    if isinstance(value, str):
-        return [value]
-    return list(value)
-
-
-def _build_job_id(
-    *,
-    model_id: str,
-    env_id: str,
-    job_name: str | None,
-    env_overrides: dict[str, Any],
-    sampling_overrides: dict[str, Any],
-    used_ids: set[str],
-) -> str:
-    segments = [slugify(model_id), slugify(env_id)]
-    if job_name:
-        segments.append(slugify(job_name))
-    base = "-".join(filter(None, segments)) or "job"
-    job_id = base
-    if job_id not in used_ids:
-        return job_id
-
-    payload = {
-        "model_id": model_id,
-        "env_id": env_id,
-        "job_name": job_name,
-        "env_overrides": env_overrides,
-        "sampling_overrides": sampling_overrides,
-    }
-    fingerprint = compute_checksum(payload)[:10]
-    job_id = f"{base}-{fingerprint}"
-    suffix = 1
-    while job_id in used_ids:
-        suffix += 1
-        job_id = f"{base}-{fingerprint}{suffix}"
-    return job_id
-
-
-__all__ = ["ResolvedJob", "build_jobs"]
diff --git a/medarc_verifiers/cli/_job_executor.py b/medarc_verifiers/cli/_job_executor.py
deleted file mode 100644
index 31dd8f4b..00000000
--- a/medarc_verifiers/cli/_job_executor.py
+++ /dev/null
@@ -1,584 +0,0 @@
-"""Job execution utilities for the unified CLI."""
-
-from __future__ import annotations
-
-import asyncio
-import contextlib
-import logging
-import shutil
-from datetime import UTC, datetime
-from pathlib import Path
-from time import perf_counter, sleep
-from typing import Any, Literal, Mapping, Sequence
-from pydantic import BaseModel, Field, field_validator
-
-from verifiers.types import GenerateOutputs
-from verifiers.utils.eval_utils import run_evaluation
-
-from medarc_verifiers.cli._constants import DEFAULT_ENDPOINTS_PATH
-from medarc_verifiers.cli._eval_builder import build_client_config, build_eval_config
-from medarc_verifiers.cli._job_builder import ResolvedJob
-from medarc_verifiers.cli._manifest import RunManifest
-from medarc_verifiers.cli._schemas import ModelConfigSchema
-from medarc_verifiers.cli.utils.endpoint_utils import (
-    EndpointRegistry,
-    EndpointRegistryCache,
-    EnvMetadataCache,
-    load_endpoint_registry,
-    load_env_metadata,
-)
-from medarc_verifiers.cli.utils.resume import (
-    format_resume_mismatch_lines,
-    is_resume_metadata_mismatch_error,
-    is_valid_resume_results_path,
-    load_resume_metadata_values,
-)
-from medarc_verifiers.cli.utils.shared import DEFAULT_BATCH_MAX_CONCURRENT, ensure_root_logging, resolve_env_identifier
-
-try:
-    from rich import print as rich_print  # type: ignore
-except ImportError:  # pragma: no cover - rich is optional
-    rich_print = None
-
-logger = logging.getLogger(__name__)
-
-
-class ExecutorSettings(BaseModel):
-    """Run-level options controlling how jobs are executed."""
-
-    run_id: str
-    output_dir: Path
-    env_dir: Path
-    endpoints_path: Path | None = None
-    endpoints_path_explicit: bool = False
-    default_api_key_var: str
-    default_api_key_var_explicit: bool = False
-    default_api_base_url: str
-    api_base_url_override: str | None = None
-    log_level: str = "INFO"
-    verbose: bool = False
-    save_results: bool = True
-    save_to_hf_hub: bool = False
-    hf_hub_dataset_name: str | None = None
-    max_concurrent_generation: int | None = None  # Deprecated; accepted for compatibility and ignored.
-    max_concurrent_scoring: int | None = None  # Deprecated; accepted for compatibility and ignored.
-    max_concurrent: int | None = None  # CLI override for max_concurrent
-    http_max_retries: int | None = None  # CLI override for ClientConfig.max_retries
-    rollout_max_retries: int = 0  # CLI override for EvalConfig.max_retries
-    timeout: float | None = None
-    sleep: float = 0.0
-    dry_run: bool = False
-    cli_env_args: dict[str, Any] | None = None
-    cli_sampling_args: dict[str, Any] | None = None
-    forced_job_ids: set[str] = Field(default_factory=set)
-
-    @field_validator("output_dir", "env_dir", mode="before")
-    @classmethod
-    def _expand_path(cls, value: Path | str) -> Path:
-        return Path(value).expanduser()
-
-    @field_validator("endpoints_path", mode="before")
-    @classmethod
-    def _expand_optional_path(cls, value: Path | str | None) -> Path | None:
-        if value is None:
-            return None
-        return Path(value).expanduser()
-
-
-class JobExecutionResult(BaseModel):
-    """Outcome emitted for each executed job."""
-
-    job_id: str
-    status: Literal["succeeded", "failed", "skipped"]
-    error: str | None = None
-    duration_seconds: float | None = None
-    output_path: Path | None = None
-    result: Any | None = None
-
-
-def execute_jobs(
-    jobs: Sequence[ResolvedJob],
-    settings: ExecutorSettings,
-    *,
-    endpoints_cache: EndpointRegistryCache | None = None,
-    env_metadata_cache: EnvMetadataCache | None = None,
-    manifest: RunManifest | None = None,
-) -> list[JobExecutionResult]:
-    """Execute a sequence of resolved jobs."""
-    ensure_root_logging(settings.log_level)
-    logger.info("Starting run '%s' with %d job(s).", settings.run_id, len(jobs))
-
-    run_dir = settings.output_dir / settings.run_id
-    run_dir.mkdir(parents=True, exist_ok=True)
-
-    job_statuses: dict[str, str] = {job.job_id: "pending" for job in jobs}
-    results: list[JobExecutionResult] = []
-    interrupted = False
-
-    for index, job in enumerate(jobs):
-        is_last_job = index == len(jobs) - 1
-        env_identifier = resolve_env_identifier(job.env)
-        model_identifier = job.model.id or job.model.model or job.job_id
-        job_label = f"{job.job_id} (env={env_identifier}, model={model_identifier})"
-        logger.info("Job %d/%d starting: %s", index + 1, len(jobs), job_label)
-        job_dir = (run_dir / job.job_id).resolve()
-        job_dir.mkdir(parents=True, exist_ok=True)
-        job_statuses[job.job_id] = "running"
-        forced_clean = job.job_id in settings.forced_job_ids
-
-        if settings.dry_run:
-            logger.info("Dry run enabled; skipping execution for job '%s'.", job.job_id)
-            results.append(
-                JobExecutionResult(
-                    job_id=job.job_id,
-                    status="skipped",
-                    output_path=job_dir,
-                )
-            )
-            job_statuses[job.job_id] = "skipped"
-            _log_job_progress_window(jobs, index, job_statuses, event="dry-run skip")
-            continue
-
-        if manifest is not None:
-            manifest.record_job_start(job.job_id)
-
-        try:
-            _prepare_job_dir_for_resume(job_id=job.job_id, job_dir=job_dir, forced_clean=forced_clean)
-        except Exception as exc:  # noqa: BLE001
-            error_message = f"{job_label} preflight failed: {exc}"
-            logger.exception("%s", error_message)
-            _record_job_failure(
-                results=results,
-                job_statuses=job_statuses,
-                jobs=jobs,
-                center_index=index,
-                manifest=manifest,
-                job_id=job.job_id,
-                output_path=job_dir,
-                error_message=error_message,
-                manifest_error=str(exc),
-                duration_seconds=None,
-                status_label="failed",
-                event="failure",
-                note="during preflight",
-            )
-            _maybe_sleep_between_jobs(job, settings, is_last=is_last_job)
-            continue
-
-        try:
-            endpoints = _load_endpoints_for_model(job.model, settings, cache=endpoints_cache)
-            resolved_model, client_config, prime_sampling_overrides = build_client_config(
-                job.model,
-                endpoints=endpoints,
-                default_api_key_var=settings.default_api_key_var,
-                default_api_key_var_explicit=settings.default_api_key_var_explicit,
-                default_api_base_url=settings.default_api_base_url,
-                api_base_url_override=settings.api_base_url_override,
-                http_max_retries_override=settings.http_max_retries,
-                timeout_override=settings.timeout,
-                headers=job.model.headers,
-            )
-            # Merge Prime Inference overrides with job sampling args (job args take precedence)
-            merged_sampling_args = {**prime_sampling_overrides, **job.sampling_args}
-            eval_config = build_eval_config(
-                job_label=job.job_id,
-                model_cfg=job.model,
-                env_cfg=job.env,
-                env_args=job.env_args,
-                sampling_args=merged_sampling_args,
-                cli_env_args=settings.cli_env_args,
-                cli_sampling_args=settings.cli_sampling_args,
-                resolved_model=resolved_model,
-                client_config=client_config,
-                env_dir=settings.env_dir,
-                max_concurrent_override=settings.max_concurrent,
-                max_concurrent_generation=settings.max_concurrent_generation,
-                max_concurrent_scoring=settings.max_concurrent_scoring,
-                rollout_max_retries=settings.rollout_max_retries,
-                resume_path=job_dir,
-                default_max_concurrent=DEFAULT_BATCH_MAX_CONCURRENT,
-                save_results=settings.save_results,
-                save_to_hf_hub=settings.save_to_hf_hub,
-                hf_hub_dataset_name=settings.hf_hub_dataset_name,
-                verbose=settings.verbose,
-                env_metadata_cache=env_metadata_cache,
-                env_metadata_loader=load_env_metadata,
-                enforce_required_env_args=True,
-            )
-        except KeyboardInterrupt:
-            logger.warning("Interrupted while preparing job %s.", job_label)
-            interruption_message = f"{job_label} interrupted by user"
-            _record_job_failure(
-                results=results,
-                job_statuses=job_statuses,
-                jobs=jobs,
-                center_index=index,
-                manifest=manifest,
-                job_id=job.job_id,
-                output_path=job_dir,
-                error_message=interruption_message,
-                manifest_error="interrupted by user",
-                duration_seconds=None,
-                status_label="interrupted",
-                event="interruption",
-                note="during preparation",
-            )
-            interrupted = True
-            break
-        except Exception as exc:  # noqa: BLE001
-            error_message = f"{job_label} preparation failed: {exc}"
-            logger.exception("%s", error_message)
-            _record_job_failure(
-                results=results,
-                job_statuses=job_statuses,
-                jobs=jobs,
-                center_index=index,
-                manifest=manifest,
-                job_id=job.job_id,
-                output_path=job_dir,
-                error_message=error_message,
-                manifest_error=str(exc),
-                duration_seconds=None,
-                status_label="failed",
-                event="failure",
-                note="during preparation",
-            )
-            _maybe_sleep_between_jobs(job, settings, is_last=is_last_job)
-            continue
-
-        start = perf_counter()
-        try:
-            eval_result = asyncio.run(run_evaluation(eval_config))
-        except KeyboardInterrupt:
-            duration = perf_counter() - start
-            logger.warning("Job %s interrupted by user after %.2fs.", job_label, duration)
-            interruption_message = f"{job_label} interrupted by user"
-            _record_job_failure(
-                results=results,
-                job_statuses=job_statuses,
-                jobs=jobs,
-                center_index=index,
-                manifest=manifest,
-                job_id=job.job_id,
-                output_path=job_dir,
-                error_message=interruption_message,
-                manifest_error="interrupted by user",
-                duration_seconds=duration,
-                status_label="interrupted",
-                event="interruption",
-            )
-            interrupted = True
-            break
-        except Exception as exc:  # noqa: BLE001
-            duration = perf_counter() - start
-            if is_resume_metadata_mismatch_error(exc):
-                _log_resume_mismatch_diagnostics(job_id=job.job_id, resume_path=job_dir, eval_config=eval_config)
-                prescriptive = (
-                    "Job output dir contains incompatible prior results; "
-                    "use --force to rerun cleanly or start a new run_id."
-                )
-                error_message = f"{job_label} failed after {duration:.2f}s: {prescriptive}"
-                logger.error("%s", error_message)
-                manifest_error = prescriptive
-            else:
-                error_message = f"{job_label} failed after {duration:.2f}s: {exc}"
-                logger.exception("%s", error_message)
-                manifest_error = str(exc)
-            _record_job_failure(
-                results=results,
-                job_statuses=job_statuses,
-                jobs=jobs,
-                center_index=index,
-                manifest=manifest,
-                job_id=job.job_id,
-                output_path=job_dir,
-                error_message=error_message,
-                manifest_error=manifest_error,
-                duration_seconds=duration,
-                status_label="failed",
-                event="failure",
-            )
-            _maybe_sleep_between_jobs(job, settings, is_last=is_last_job)
-            continue
-
-        duration = perf_counter() - start
-        logger.info("Job '%s' completed in %.2fs.", job.job_id, duration)
-
-        _materialize_results(job_dir, eval_result)
-        avg_reward = _extract_avg_reward(eval_result)
-        metrics_avg = _extract_avg_metrics(eval_result)
-        metadata = _safe_get(eval_result, "metadata", None)
-        num_examples = _safe_get(metadata, "num_examples", None)
-        rollouts_per_example = _safe_get(metadata, "rollouts_per_example", None)
-
-        if manifest is not None:
-            manifest.record_job_completion(
-                job.job_id,
-                duration_seconds=duration,
-                results_dir=job_dir,
-                avg_reward=avg_reward,
-                metrics=metrics_avg,
-                num_examples=num_examples,
-                rollouts_per_example=rollouts_per_example,
-            )
-
-        results.append(
-            JobExecutionResult(
-                job_id=job.job_id,
-                status="succeeded",
-                duration_seconds=duration,
-                output_path=job_dir,
-                result=eval_result,
-            )
-        )
-        job_statuses[job.job_id] = "completed"
-        _log_job_progress_window(jobs, index, job_statuses, event="completion")
-        _maybe_sleep_between_jobs(job, settings, is_last=is_last_job)
-
-    if interrupted:
-        logger.warning("Execution interrupted by user; %d job(s) left pending.", len(jobs) - len(results))
-
-    return results
-
-
-def _load_endpoints_for_model(
-    model_cfg: ModelConfigSchema,
-    settings: ExecutorSettings,
-    *,
-    cache: EndpointRegistryCache | None,
-) -> EndpointRegistry:
-    """Load the endpoint registry to use for a model."""
-    registry_path = model_cfg.endpoints_path or settings.endpoints_path
-    if registry_path is None:
-        return {}
-
-    registry_path_obj = Path(registry_path).expanduser()
-    default_registry_path = Path(DEFAULT_ENDPOINTS_PATH).expanduser()
-    explicit_path = bool(model_cfg.endpoints_path) or settings.endpoints_path_explicit
-
-    if not registry_path_obj.exists():
-        if explicit_path:
-            raise FileNotFoundError(f"Endpoint registry not found at {registry_path_obj}")
-        if _same_path(registry_path_obj, default_registry_path):
-            logger.warning(
-                "Default endpoints registry '%s' not found; continuing without endpoint aliases.",
-                registry_path_obj,
-            )
-            return {}
-
-    endpoints = load_endpoint_registry(registry_path_obj, cache=cache)
-    if explicit_path and not endpoints:
-        raise ValueError(f"Failed to load endpoint registry from explicit path '{registry_path_obj}'")
-    return endpoints
-
-
-def _record_job_failure(
-    *,
-    results: list[JobExecutionResult],
-    job_statuses: dict[str, str],
-    jobs: Sequence[ResolvedJob],
-    center_index: int,
-    manifest: RunManifest | None,
-    job_id: str,
-    output_path: Path,
-    error_message: str,
-    manifest_error: str,
-    duration_seconds: float | None,
-    status_label: str,
-    event: str,
-    note: str | None = None,
-) -> None:
-    if manifest is not None:
-        manifest.record_job_failure(job_id, error=manifest_error, duration_seconds=duration_seconds)
-    results.append(
-        JobExecutionResult(
-            job_id=job_id,
-            status="failed",
-            error=error_message,
-            duration_seconds=duration_seconds,
-            output_path=output_path,
-        )
-    )
-    job_statuses[job_id] = status_label
-    _log_job_progress_window(jobs, center_index, job_statuses, event=event, note=note)
-
-
-def _safe_get(obj: Any, key: str, default: Any = None) -> Any:
-    """Retrieve attribute or dict key, allowing newer dict-style GenerateOutputs."""
-    if isinstance(obj, dict):
-        return obj.get(key, default)
-    return getattr(obj, key, default)
-
-
-def _same_path(left: Path, right: Path) -> bool:
-    try:
-        return left.resolve(strict=False) == right.resolve(strict=False)
-    except OSError:
-        return left == right
-
-
-def _materialize_results(job_dir: Path, results: GenerateOutputs) -> None:
-    """Move evaluation artifacts into the job directory."""
-    metadata = _safe_get(results, "metadata", None)
-    raw_path = _safe_get(metadata, "path_to_save", None)
-    src_path = Path(raw_path) if raw_path else job_dir
-    try:
-        resolved_src = src_path.resolve()
-    except OSError:
-        resolved_src = src_path
-    try:
-        resolved_job_dir = job_dir.resolve()
-    except OSError:
-        resolved_job_dir = job_dir
-
-    if resolved_src == resolved_job_dir:
-        logger.debug("Results already in job_dir; _materialize_results no-op for %s.", job_dir)
-        return
-
-    if src_path.exists() and resolved_src != resolved_job_dir:
-        logger.warning(
-            "Unexpected results source path for job '%s': src=%s, job_dir=%s. Materializing as a safety net.",
-            job_dir.name,
-            src_path,
-            job_dir,
-        )
-        for item in src_path.iterdir():
-            target = job_dir / item.name
-            if target.exists():
-                if target.is_dir():
-                    shutil.rmtree(target)
-                else:
-                    target.unlink()
-            shutil.move(str(item), target)
-        with contextlib.suppress(OSError):
-            src_path.rmdir()
-
-
-def _prepare_job_dir_for_resume(*, job_id: str, job_dir: Path, forced_clean: bool) -> None:
-    if not job_dir.exists():
-        job_dir.mkdir(parents=True, exist_ok=True)
-        return
-    if not job_dir.is_dir():
-        msg = f"Job output dir '{job_dir}' is not a directory. Use --force to rerun cleanly or choose a new run_id."
-        raise ValueError(msg)
-    if not any(job_dir.iterdir()):
-        return
-
-    if forced_clean:
-        archive_path = _archive_job_dir(job_dir)
-        logger.info("Forced rerun for job '%s': archived '%s' -> '%s'.", job_id, job_dir, archive_path)
-        job_dir.mkdir(parents=True, exist_ok=True)
-        return
-
-    if is_valid_resume_results_path(job_dir):
-        return
-
-    msg = (
-        f"Job output dir '{job_dir}' is non-empty but not a valid evaluation results path "
-        "(expected results.jsonl and metadata.json). "
-        "Use --force to rerun cleanly or choose a new run_id."
-    )
-    raise ValueError(msg)
-
-
-def _archive_job_dir(job_dir: Path) -> Path:
-    timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
-    candidate = job_dir.with_name(f"{job_dir.name}__old_{timestamp}")
-    suffix = 1
-    while candidate.exists():
-        candidate = job_dir.with_name(f"{job_dir.name}__old_{timestamp}_{suffix}")
-        suffix += 1
-    job_dir.rename(candidate)
-    return candidate
-
-
-def _log_resume_mismatch_diagnostics(*, job_id: str, resume_path: Path, eval_config: Any) -> None:
-    logger.error("Resume metadata mismatch for job '%s' at %s.", job_id, resume_path)
-    saved_values = load_resume_metadata_values(resume_path)
-    current_values = {
-        "env_id": getattr(eval_config, "env_id", "<missing>"),
-        "model": getattr(eval_config, "model", "<missing>"),
-        "rollouts_per_example": getattr(eval_config, "rollouts_per_example", "<missing>"),
-        "num_examples": getattr(eval_config, "num_examples", "<missing>"),
-    }
-    for line in format_resume_mismatch_lines(saved_values=saved_values, current_values=current_values):
-        logger.error("  %s", line)
-    logger.error("Resume supports increasing num_examples, but not decreasing it.")
-
-
-def _extract_avg_reward(results: GenerateOutputs) -> float | None:
-    """Return metadata-level average reward from GenerateOutputs."""
-    metadata = _safe_get(results, "metadata", None)
-    metadata_avg = _safe_get(metadata, "avg_reward", None)
-    if metadata_avg is not None:
-        return float(metadata_avg)
-    return None
-
-
-def _extract_avg_metrics(results: GenerateOutputs) -> dict[str, float]:
-    """Return metadata-level average metrics from GenerateOutputs."""
-    metadata = _safe_get(results, "metadata", None)
-    raw_metrics = _safe_get(metadata, "avg_metrics", None)
-    if not isinstance(raw_metrics, Mapping):
-        return {}
-
-    metrics: dict[str, float] = {}
-    for key, value in raw_metrics.items():
-        if value is None:
-            continue
-        try:
-            metrics[str(key)] = float(value)
-        except (TypeError, ValueError):
-            continue
-    return metrics
-
-
-def _log_job_progress_window(
-    jobs: Sequence[ResolvedJob],
-    center_index: int,
-    job_statuses: Mapping[str, str],
-    *,
-    event: str,
-    note: str | None = None,
-) -> None:
-    if not jobs:
-        return
-    start = max(0, center_index - 1)
-    end = min(len(jobs), center_index + 2)
-    lines: list[str] = []
-    header = "Segment | Job ID | Status | Model | Env | Name"
-    divider = "-" * len(header)
-    lines.append(header)
-    lines.append(divider)
-    for idx in range(start, end):
-        job = jobs[idx]
-        segment = "current" if idx == center_index else ("previous" if idx < center_index else "next")
-        status = job_statuses.get(job.job_id, "pending")
-        model_label = job.model.id or job.model.model or "-"
-        try:
-            env_label = resolve_env_identifier(job.env)
-        except ValueError:
-            env_label = job.env.id or job.job_id
-        lines.append(
-            f"{segment:8} | {job.job_id:20} | {status:10} | {model_label:15} | {env_label:20} | {job.name or '-'}"
-        )
-    label = f"Job progress after {event}"
-    if note:
-        label = f"{label} ({note})"
-    logger.info("%s:\n%s", label, "\n".join(lines))
-
-
-def _maybe_sleep_between_jobs(job: ResolvedJob, settings: ExecutorSettings, *, is_last: bool) -> None:
-    """Optionally pause between jobs to spread out environment runs."""
-    if settings.dry_run or is_last:
-        return
-    delay = job.sleep if job.sleep is not None else settings.sleep
-    if delay is None or delay <= 0:
-        return
-    if rich_print:
-        rich_print(f"[cyan]Sleeping {delay:.2f} second(s) before next job...[/cyan]")
-    logger.info("Sleeping %.2f second(s) before next job...", delay)
-    sleep(delay)
-
-
-__all__ = ["ExecutorSettings", "JobExecutionResult", "execute_jobs"]
diff --git a/medarc_verifiers/cli/_manifest.py b/medarc_verifiers/cli/_manifest.py
index f3827d4f..afa8ca1d 100644
--- a/medarc_verifiers/cli/_manifest.py
+++ b/medarc_verifiers/cli/_manifest.py
@@ -1,53 +1,24 @@
-"""Run manifest helpers for the unified CLI."""
+"""Legacy run manifest schemas retained for process discovery.
+
+The YAML benchmark runner no longer writes manifests, but `medarc-eval process`
+still supports old `runs/raw/<run_id>/run_manifest.json` directories during the
+transition. Keep this module to the schema pieces needed for that reader.
+"""
 
 from __future__ import annotations
 
-import json
-import logging
-from collections import Counter
-from dataclasses import dataclass
-from datetime import UTC, datetime
 from pathlib import Path
-from typing import Any, Mapping, Sequence
+from typing import Any, Mapping
 
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
-from medarc_verifiers.cli._job_builder import ResolvedJob
-from medarc_verifiers.cli._schemas import ModelConfigSchema
-from medarc_verifiers.cli.utils.json_io import dumps_json
-from medarc_verifiers.cli.utils.shared import count_jsonl_rows, compute_checksum, resolve_env_identifier_or
-from medarc_verifiers.utils.pathing import normalize_results_dir_for_manifest
-
-MANIFEST_FILENAME = "run_manifest.json"
 MANIFEST_VERSION = 3
-SUPPORTED_MANIFEST_VERSIONS = {3}
-
-logger = logging.getLogger(__name__)
-
-
-class ManifestConflictError(ValueError):
-    """Raised when an existing manifest conflicts with the current config."""
-
-
-def _normalize_model_slug(value: str) -> str:
-    """Normalize model slugs for restart comparisons.
-
-    Some providers expose the same model under different namespaces (e.g.
-    `google/gemini-3-pro-preview` vs `gemini-3-pro-preview`). For now, we only
-    normalize Gemini model slugs by stripping a single leading namespace.
-    """
-    if not value:
-        return value
-    if "/" not in value:
-        return value
-    candidate = value.rsplit("/", 1)[-1]
-    if candidate.startswith("gemini-"):
-        return candidate
-    return value
+SUPPORTED_MANIFEST_VERSIONS = {MANIFEST_VERSION}
+MANIFEST_FILENAME = "run_manifest.json"
 
 
 class ManifestJobEntry(BaseModel):
-    """Pydantic model describing a single manifest job entry."""
+    """Pydantic model describing a single legacy manifest job entry."""
 
     model_config = ConfigDict(extra="ignore")
 
@@ -74,35 +45,8 @@ class ManifestJobEntry(BaseModel):
     rollouts_per_example: int | None = None
 
 
-# Keep this list aligned with runtime/progress fields mutated by record_job_* methods.
-# Fields here must survive ensure_job() refreshes when config/env metadata changes.
-_ENSURE_JOB_RUNTIME_STATE_FIELDS = (
-    "status",
-    "reason",
-    "attempt",
-    "started_at",
-    "ended_at",
-    "duration_seconds",
-    "row_count",
-    "metrics",
-    "avg_reward",
-    "num_examples",
-    "rollouts_per_example",
-)
-
-
-def _validate_ensure_job_runtime_state_fields() -> None:
-    missing = set(_ENSURE_JOB_RUNTIME_STATE_FIELDS) - set(ManifestJobEntry.model_fields)
-    if missing:
-        msg = f"Unknown manifest fields in _ENSURE_JOB_RUNTIME_STATE_FIELDS: {sorted(missing)}"
-        raise ValueError(msg)
-
-
-_validate_ensure_job_runtime_state_fields()
-
-
 class RunManifestModel(BaseModel):
-    """Root manifest payload persisted to disk."""
+    """Root legacy manifest payload persisted by the retired YAML runner."""
 
     model_config = ConfigDict(extra="allow")
 
@@ -131,762 +75,19 @@ def _check_version(self) -> RunManifestModel:
         return self
 
 
-def timestamp() -> str:
-    """Return an ISO8601 timestamp in UTC."""
-    return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
-
-
-def compute_snapshot_checksum(snapshot: Mapping[str, Any]) -> str:
-    """Public helper to compute the checksum for a config snapshot."""
-    sanitized = dict(snapshot)
-    models = sanitized.get("models")
-    if isinstance(models, Mapping):
-        sanitized_models: dict[str, Any] = {}
-        for model_id, payload in models.items():
-            if isinstance(payload, Mapping):
-                sanitized_models[str(model_id)] = {
-                    key: value for key, value in payload.items() if key not in ModelConfigSchema.resume_tolerant_fields
-                }
-            else:
-                sanitized_models[str(model_id)] = payload
-        sanitized["models"] = sanitized_models
-    return compute_checksum(sanitized)
-
-
-def _drop_resume_tolerant_fields(payload: Mapping[str, Any]) -> dict[str, Any]:
-    cleaned = dict(payload)
-    model_payload = cleaned.get("model")
-    if isinstance(model_payload, Mapping):
-        cleaned["model"] = {
-            key: value for key, value in model_payload.items() if key not in ModelConfigSchema.resume_tolerant_fields
-        }
-    return cleaned
-
-
-def _relativize_results_dir(value: str | Path, *, run_dir: Path) -> str:
-    """Ensure results directories are stored relative to the project root."""
-    return normalize_results_dir_for_manifest(value, run_dir=run_dir)
-
-
-def _to_jsonable(value: Any) -> Any:
-    """Convert arbitrary data to JSON-serializable structures (default=str)."""
-    return json.loads(json.dumps(value, default=str))
-
-
-def _normalize_payload(payload: Mapping[str, Any]) -> dict[str, Any]:
-    def _drop(value: Any) -> Any:
-        if isinstance(value, dict):
-            return {k: _drop(v) for k, v in value.items() if v is not None}
-        if isinstance(value, list):
-            return [_drop(v) for v in value]
-        return value
-
-    return _drop(_to_jsonable(payload))
-
-
 def _require_manifest_v3(payload: Mapping[str, Any], *, path: Path | None = None) -> None:
+    """Raise when a legacy manifest payload is not version 3."""
     version = payload.get("version")
-    if version not in SUPPORTED_MANIFEST_VERSIONS:
-        location = f" '{path}'" if path else ""
-        msg = f"Manifest{location} uses version {version}; expected one of {sorted(SUPPORTED_MANIFEST_VERSIONS)}."
-        raise ValueError(msg)
-
-
-def _sanitize_model_payload(model_payload: Mapping[str, Any]) -> dict[str, Any]:
-    sanitized = {
-        key: value for key, value in model_payload.items() if key not in ModelConfigSchema.resume_tolerant_fields
-    }
-
-    model_slug = sanitized.get("model")
-    if isinstance(model_slug, str):
-        sanitized["model"] = _normalize_model_slug(model_slug)
-
-    # Provider quirks: OpenAI-compatible endpoints vary widely in what they accept when
-    # we forward `sampling_args.extra_body`. Treat *all* of extra_body as resume-tolerant
-    # for the purposes of manifest conflict detection so users can switch providers
-    # without getting blocked by payload drift.
-    sampling_args = sanitized.get("sampling_args")
-    if isinstance(sampling_args, Mapping):
-        updated_sampling_args = dict(sampling_args)
-        updated_sampling_args.pop("extra_body", None)
-        if updated_sampling_args:
-            sanitized["sampling_args"] = updated_sampling_args
-        else:
-            sanitized.pop("sampling_args", None)
-
-    return sanitized
-
-
-def _sampling_extra_body(model_payload: Mapping[str, Any]) -> dict[str, Any] | None:
-    sampling_args = model_payload.get("sampling_args")
-    if not isinstance(sampling_args, Mapping):
-        return None
-    extra_body = sampling_args.get("extra_body")
-    if not isinstance(extra_body, Mapping):
-        return None
-    normalized = _normalize_payload(extra_body)
-    return normalized or None
-
-
-def _warn_extra_body_change(key: str, existing: Mapping[str, Any], payload: Mapping[str, Any]) -> None:
-    existing_extra = _sampling_extra_body(existing)
-    payload_extra = _sampling_extra_body(payload)
-    if existing_extra is None and payload_extra is None:
-        return
-    if compute_checksum(existing_extra or {}) == compute_checksum(payload_extra or {}):
-        return
-    logger.warning(
-        "Model '%s' sampling_args.extra_body changed; allowing restart, but providers may reject unknown fields.",
-        key,
-    )
-
-
-def _sampling_args_payload(model_payload: Mapping[str, Any]) -> dict[str, Any] | None:
-    sampling_args = model_payload.get("sampling_args")
-    if not isinstance(sampling_args, Mapping):
-        return None
-    normalized = _normalize_payload(sampling_args)
-    return normalized or None
-
-
-def _warn_sampling_args_change(key: str, existing: Mapping[str, Any], payload: Mapping[str, Any]) -> None:
-    existing_sampling = _sampling_args_payload(existing)
-    payload_sampling = _sampling_args_payload(payload)
-    if existing_sampling is None and payload_sampling is None:
-        return
-    if compute_checksum(existing_sampling or {}) == compute_checksum(payload_sampling or {}):
-        return
-    logger.warning(
-        "Model '%s' sampling_args changed; allowing restart, but providers may reject unsupported parameters.",
-        key,
-    )
-
-
-def _effective_sampling_args(entry: ManifestJobEntry, model_payload: Mapping[str, Any]) -> Mapping[str, Any]:
-    if entry.sampling_args is not None:
-        return _normalize_payload(entry.sampling_args)
-    return _normalize_payload(model_payload.get("sampling_args") or {})
-
-
-def _canonical_manifest_parts(
-    *,
-    model_payload: Mapping[str, Any],
-    env_payload: Mapping[str, Any],
-    env_args: Mapping[str, Any],
-    sampling_args: Mapping[str, Any],
-    env_identifier: str | None,
-    env_variant_id: str,
-    env_payload_is_template: bool = False,
-    sampling_args_already_resolved: bool = False,
-) -> dict[str, Any]:
-    model_normalized = _normalize_payload(model_payload)
-    env_normalized = _normalize_payload(env_payload)
-    if env_payload_is_template:
-        env_template_payload = dict(env_normalized)
-    else:
-        env_template_payload = _build_env_template_payload(env_normalized)
-        if "module" not in env_template_payload:
-            env_template_payload["module"] = env_identifier
-    if sampling_args_already_resolved:
-        sampling_override = None
-        sampling_payload = sampling_args
-    else:
-        sampling_override = _sampling_args_override(sampling_args=sampling_args, model_payload=model_normalized)
-        sampling_payload = sampling_override or model_normalized.get("sampling_args") or {}
-    effective_env_payload = {
-        **env_template_payload,
-        "module": env_identifier,
-        "id": env_variant_id,
-        "env_args": _normalize_payload(env_args),
-    }
-    return {
-        "model_sanitized": _sanitize_model_payload(model_normalized),
-        "env_template_payload": _normalize_payload(env_template_payload),
-        "effective_env_payload": _normalize_payload(effective_env_payload),
-        "sampling_payload": _normalize_payload(sampling_payload),
-        "sampling_override": sampling_override,
-    }
-
-
-def manifest_job_signature(manifest: RunManifestModel, entry: ManifestJobEntry) -> dict[str, Any]:
-    model_payload = _normalize_payload(manifest.models.get(entry.model_id or "", {}) or {})
-    env_template = _normalize_payload(manifest.env_templates.get(entry.env_template_id, {}) or {})
-    env_identifier = entry.env_id or env_template.get("module")
-    canonical = _canonical_manifest_parts(
-        model_payload=model_payload,
-        env_payload=env_template,
-        env_args=entry.env_args,
-        sampling_args=_effective_sampling_args(entry, model_payload),
-        env_identifier=env_identifier,
-        env_variant_id=entry.env_variant_id,
-        env_payload_is_template=True,
-        sampling_args_already_resolved=True,
-    )
-    signature = {
-        "model": canonical["model_sanitized"],
-        "env": canonical["effective_env_payload"],
-        "sampling_args": canonical["sampling_payload"],
-    }
-    return _normalize_payload(signature)
-
-
-def resolved_job_signature(
-    job: ResolvedJob,
-    *,
-    env_args: Mapping[str, Any],
-    sampling_args: Mapping[str, Any],
-) -> dict[str, Any]:
-    model_payload = _normalize_payload(json.loads(job.model.model_dump_json(exclude_none=True)))
-    env_payload = _normalize_payload(json.loads(job.env.model_dump_json(exclude_none=True)))
-    env_id = env_payload.get("module") or _resolve_env_identifier(job)
-    env_variant_id = str(env_payload.get("id") or job.job_id)
-    canonical = _canonical_manifest_parts(
-        model_payload=model_payload,
-        env_payload=env_payload,
-        env_args=env_args,
-        sampling_args=sampling_args,
-        env_identifier=env_id,
-        env_variant_id=env_variant_id,
-    )
-    signature = {
-        "model": canonical["model_sanitized"],
-        "env": canonical["effective_env_payload"],
-        "sampling_args": canonical["sampling_payload"],
-    }
-    return _normalize_payload(signature)
-
-
-def _maybe_store_results_dir(value: str | Path | None, *, run_dir: Path, job_id: str) -> str | None:
-    if value is None:
-        return None
-    normalized = _relativize_results_dir(value, run_dir=run_dir)
-    default_value = _relativize_results_dir(run_dir / job_id, run_dir=run_dir)
-    if normalized == default_value:
-        return None
-    return normalized
-
-
-def _manifest_relative_artifacts(*, run_dir: Path, job_id: str, results_dir: Path | str | None) -> tuple[str, str]:
-    if results_dir is None:
-        base_rel = Path(job_id)
-    else:
-        candidate = Path(results_dir)
-        if not candidate.is_absolute():
-            candidate = (run_dir / candidate).resolve()
-        else:
-            candidate = candidate.resolve()
-        try:
-            base_rel = candidate.relative_to(run_dir)
-        except ValueError:
-            base_rel = Path(job_id)
-    base_rel = Path(base_rel.as_posix())
-    return (
-        (base_rel / "results.jsonl").as_posix(),
-        (base_rel / "metadata.json").as_posix(),
-    )
-
-
-def _build_env_template_payload(env_payload: Mapping[str, Any]) -> dict[str, Any]:
-    payload = dict(env_payload)
-    payload.pop("id", None)
-    payload.pop("env_args", None)
-    return _normalize_payload(payload)
-
-
-def _env_template_id(env_id: str, env_template_payload: Mapping[str, Any]) -> str:
-    digest = compute_checksum(_normalize_payload(env_template_payload))[:12]
-    return f"{env_id}:{digest}"
-
-
-def _sampling_args_override(
-    *,
-    sampling_args: Mapping[str, Any],
-    model_payload: Mapping[str, Any],
-) -> dict[str, Any] | None:
-    normalized_sampling = _normalize_payload(sampling_args)
-    model_sampling = model_payload.get("sampling_args") or {}
-    normalized_model_sampling = _normalize_payload(model_sampling)
-    if compute_checksum(normalized_sampling) == compute_checksum(normalized_model_sampling):
-        return None
-    return normalized_sampling
-
-
-def _merge_unique_model_payload(
-    container: dict[str, dict[str, Any]],
-    key: str,
-    payload: dict[str, Any],
-    *,
-    allow_mismatch: bool,
-) -> None:
-    existing = container.get(key)
-    if existing is None:
-        container[key] = payload
-        return
-    if existing == payload:
-        return
-    if allow_mismatch:
-        container[key] = payload
-        return
-    sanitized_existing = _sanitize_model_payload(existing)
-    sanitized_payload = _sanitize_model_payload(payload)
-    if sanitized_existing == sanitized_payload:
-        _warn_extra_body_change(key, existing, payload)
-        container[key] = payload
-        return
-
-    stripped_existing = dict(sanitized_existing)
-    stripped_payload = dict(sanitized_payload)
-    stripped_existing.pop("sampling_args", None)
-    stripped_payload.pop("sampling_args", None)
-    if stripped_existing == stripped_payload:
-        _warn_sampling_args_change(key, existing, payload)
-        _warn_extra_body_change(key, existing, payload)
-        container[key] = payload
-        return
-
-    all_keys = set(sanitized_existing) | set(sanitized_payload)
-    diff_keys = sorted(key for key in all_keys if sanitized_existing.get(key) != sanitized_payload.get(key))
-    suffix = f" (conflicting keys: {', '.join(diff_keys)})" if diff_keys else ""
-    msg = f"Conflicting model payload for '{key}'{suffix}."
-    raise ManifestConflictError(msg)
-
-
-def _merge_unique_payload(
-    container: dict[str, dict[str, Any]],
-    key: str,
-    payload: dict[str, Any],
-    *,
-    allow_mismatch: bool,
-    label: str,
-) -> None:
-    existing = container.get(key)
-    if existing is None:
-        container[key] = payload
-        return
-    if existing != payload and not allow_mismatch:
-        msg = f"Conflicting {label} payload for '{key}'."
-        raise ValueError(msg)
-    container[key] = payload
-
-
-def _resolve_env_identifier(job: ResolvedJob) -> str:
-    return resolve_env_identifier_or(job.env, job.job_id)
-
-
-def _resolve_model_identifier(job: ResolvedJob) -> str:
-    mid = getattr(job.model, "id", None)
-    if mid:
-        return mid
-    if getattr(job.model, "model", None):
-        return job.model.model  # type: ignore[return-value]
-    return job.job_id
-
-
-def build_job_entry(
-    job: ResolvedJob,
-    *,
-    env_args: Mapping[str, Any],
-    sampling_args: Mapping[str, Any],
-    results_dir: str | None,
-    models: dict[str, dict[str, Any]] | None = None,
-    env_templates: dict[str, dict[str, Any]] | None = None,
-    allow_model_mismatch: bool = False,
-) -> ManifestJobEntry:
-    """Build the manifest entry recorded for a job."""
-    model_payload = _normalize_payload(json.loads(job.model.model_dump_json(exclude_none=True)))
-    env_payload = _normalize_payload(json.loads(job.env.model_dump_json(exclude_none=True)))
-    env_id = env_payload.get("module") or _resolve_env_identifier(job)
-    env_variant_id = str(env_payload.get("id") or job.job_id)
-    canonical = _canonical_manifest_parts(
-        model_payload=model_payload,
-        env_payload=env_payload,
-        env_args=env_args,
-        sampling_args=sampling_args,
-        env_identifier=env_id,
-        env_variant_id=env_variant_id,
-    )
-    env_template_payload = canonical["env_template_payload"]
-    env_template_id = _env_template_id(env_id, env_template_payload)
-    if models is not None:
-        _merge_unique_model_payload(
-            models,
-            _resolve_model_identifier(job),
-            model_payload,
-            allow_mismatch=allow_model_mismatch,
-        )
-    if env_templates is not None:
-        _merge_unique_payload(
-            env_templates,
-            env_template_id,
-            env_template_payload,
-            allow_mismatch=False,
-            label="manifest template",
-        )
-    results_relpath, metadata_relpath = _manifest_relative_artifacts(
-        run_dir=Path("."),
-        job_id=job.job_id,
-        results_dir=job.job_id,
-    )
-    return ManifestJobEntry(
-        job_id=job.job_id,
-        env_id=env_id,
-        model_id=_resolve_model_identifier(job),
-        env_template_id=env_template_id,
-        env_variant_id=env_variant_id,
-        env_args=_normalize_payload(env_args),
-        sampling_args=canonical["sampling_override"],
-        status="pending",
-        reason=None,
-        attempt=0,
-        started_at=None,
-        ended_at=None,
-        duration_seconds=None,
-        results_dir=results_dir,
-        results_relpath=results_relpath,
-        metadata_relpath=metadata_relpath,
-        row_count=None,
-        metrics=None,
-        avg_reward=None,
-        num_examples=None,
-        rollouts_per_example=None,
-    )
-
-
-def _summarize_jobs(entries: Sequence[ManifestJobEntry]) -> dict[str, int]:
-    counter = Counter((entry.status or "pending") for entry in entries)
-    skipped = sum(1 for entry in entries if entry.reason in {"up_to_date", "skipped"})
-    summary = {
-        "total": len(entries),
-        "pending": counter.get("pending", 0),
-        "running": counter.get("running", 0),
-        "completed": counter.get("completed", 0),
-        "failed": counter.get("failed", 0),
-        "skipped": skipped,
-    }
-    return summary
-
-
-@dataclass
-class RunManifest:
-    """In-memory representation of a run manifest."""
-
-    path: Path
-    model: RunManifestModel
-    persist: bool = True
-
-    def __post_init__(self) -> None:
-        self._jobs: list[ManifestJobEntry] = list(self.model.jobs)
-        self.model.jobs = self._jobs
-        self._index: dict[str, ManifestJobEntry] = {entry.job_id: entry for entry in self._jobs if entry.job_id}
-        if not self.model.summary:
-            self.model.summary = _summarize_jobs(self._jobs)
-
-    @property
-    def jobs(self) -> list[ManifestJobEntry]:
-        return self._jobs
-
-    @property
-    def summary(self) -> Mapping[str, Any]:
-        return self.model.summary
-
-    @property
-    def payload(self) -> dict[str, Any]:
-        """Dictionary representation (back-compat)."""
-        return self.model.model_dump()
-
-    def job_entry(self, job_id: str) -> ManifestJobEntry | None:
-        return self._index.get(job_id)
-
-    @property
-    def run_dir(self) -> Path:
-        return self.path.parent
-
-    def ensure_job(
-        self,
-        job: ResolvedJob,
-        *,
-        env_args: Mapping[str, Any],
-        sampling_args: Mapping[str, Any],
-        results_dir: Path,
-    ) -> ManifestJobEntry:
-        entry = self._index.get(job.job_id)
-        normalized_results_dir = _maybe_store_results_dir(results_dir, run_dir=self.run_dir, job_id=job.job_id)
-        results_relpath, metadata_relpath = _manifest_relative_artifacts(
-            run_dir=self.run_dir,
-            job_id=job.job_id,
-            results_dir=results_dir,
-        )
-        if entry is None:
-            entry = build_job_entry(
-                job,
-                env_args=env_args,
-                sampling_args=sampling_args,
-                results_dir=normalized_results_dir,
-                models=self.model.models,
-                env_templates=self.model.env_templates,
-            )
-            entry.results_relpath = results_relpath
-            entry.metadata_relpath = metadata_relpath
-            self._jobs.append(entry)
-            self._index[job.job_id] = entry
-            self._refresh_summary(save=False)
-            return entry
-
-        updated = build_job_entry(
-            job,
-            env_args=env_args,
-            sampling_args=sampling_args,
-            results_dir=normalized_results_dir,
-            models=self.model.models,
-            env_templates=self.model.env_templates,
-        )
-        runtime_state = {field: getattr(entry, field) for field in _ENSURE_JOB_RUNTIME_STATE_FIELDS}
-        if entry.results_dir is not None:
-            results_dir_value = entry.results_dir
-        else:
-            results_dir_value = updated.results_dir
-        replacement = updated.model_copy(
-            update={
-                **runtime_state,
-                "results_dir": results_dir_value,
-                "results_relpath": results_relpath,
-                "metadata_relpath": metadata_relpath,
-            }
-        )
-        # Preserve object identity so external references to `entry` remain live.
-        for field_name, value in replacement.model_dump().items():
-            setattr(entry, field_name, value)
-        self._index[job.job_id] = entry
-        return entry
-
-    def record_job_start(self, job_id: str) -> None:
-        entry = self._index.get(job_id)
-        if not entry:
-            return
-        entry.status = "running"
-        entry.reason = None
-        entry.started_at = timestamp()
-        entry.attempt = int(entry.attempt or 0) + 1
-        self._refresh_summary()
-
-    def record_job_completion(
-        self,
-        job_id: str,
-        *,
-        duration_seconds: float,
-        results_dir: Path,
-        avg_reward: float | None,
-        metrics: Mapping[str, Any],
-        num_examples: int | None,
-        rollouts_per_example: int | None,
-    ) -> None:
-        entry = self._index.get(job_id)
-        if not entry:
-            return
-        entry.status = "completed"
-        entry.reason = None
-        entry.ended_at = timestamp()
-        entry.duration_seconds = duration_seconds
-        entry.results_dir = _maybe_store_results_dir(results_dir, run_dir=self.run_dir, job_id=job_id)
-        results_relpath, metadata_relpath = _manifest_relative_artifacts(
-            run_dir=self.run_dir,
-            job_id=job_id,
-            results_dir=results_dir,
-        )
-        entry.results_relpath = results_relpath
-        entry.metadata_relpath = metadata_relpath
-        entry.avg_reward = avg_reward
-        entry.metrics = dict(metrics) if metrics else None
-        entry.num_examples = num_examples
-        entry.rollouts_per_example = rollouts_per_example
-        results_path = results_dir / "results.jsonl"
-        entry.row_count = count_jsonl_rows(results_path)
-        self._refresh_summary()
-
-    def record_job_failure(self, job_id: str, *, error: str, duration_seconds: float | None = None) -> None:
-        entry = self._index.get(job_id)
-        if not entry:
-            return
-        entry.status = "failed"
-        entry.reason = error
-        entry.ended_at = timestamp()
-        entry.duration_seconds = duration_seconds
-        self._refresh_summary()
-
-    def record_job_skip(
-        self,
-        job_id: str,
-        *,
-        reason: str,
-        results_dir: str | Path | None = None,
-        source_entry: Mapping[str, Any] | None = None,
-    ) -> None:
-        entry = self._index.get(job_id)
-        if not entry:
-            return
-        entry.status = "completed"
-        entry.reason = reason
-        entry.ended_at = entry.ended_at or timestamp()
-
-        if source_entry:
-            is_mapping = isinstance(source_entry, Mapping)
-            for key in (
-                "duration_seconds",
-                "avg_reward",
-                "metrics",
-                "num_examples",
-                "rollouts_per_example",
-                "row_count",
-            ):
-                if is_mapping:
-                    if key in source_entry:
-                        setattr(entry, key, source_entry[key])
-                else:
-                    setattr(entry, key, getattr(source_entry, key))
-        if results_dir:
-            entry.results_dir = _maybe_store_results_dir(results_dir, run_dir=self.run_dir, job_id=job_id)
-            results_relpath, metadata_relpath = _manifest_relative_artifacts(
-                run_dir=self.run_dir,
-                job_id=job_id,
-                results_dir=results_dir,
-            )
-            entry.results_relpath = results_relpath
-            entry.metadata_relpath = metadata_relpath
-        if entry.metrics == {}:
-            entry.metrics = None
-        self._refresh_summary()
-
-    def _refresh_summary(self, *, save: bool = True) -> None:
-        self.model.summary = _summarize_jobs(self._jobs)
-        self.model.updated_at = timestamp()
-        if save:
-            self.save()
-
-    def save(self) -> None:
-        if not self.persist:
-            return
-        tmp_path = self.path.with_suffix(".tmp")
-        self.path.parent.mkdir(parents=True, exist_ok=True)
-        text = dumps_json(self.model.model_dump(exclude_none=True))
-        tmp_path.write_text(text, encoding="utf-8")
-        tmp_path.replace(self.path)
-
-    @classmethod
-    def load(cls, path: Path, *, persist: bool = True) -> RunManifest:
-        if not path.exists():
-            raise FileNotFoundError(f"Run manifest '{path}' not found.")
-        with path.open("r", encoding="utf-8") as handle:
-            payload = json.load(handle)
-        payload, _ = _upgrade_manifest_payload(payload)
-        model = RunManifestModel.model_validate(payload)
-        return cls(path=path, model=model, persist=persist)
-
-    @classmethod
-    def create(
-        cls,
-        *,
-        run_dir: Path,
-        run_id: str,
-        run_name: str,
-        config_source: Path,
-        config_checksum: str,
-        jobs: Sequence[ResolvedJob],
-        env_args_map: Mapping[str, Mapping[str, Any]],
-        sampling_args_map: Mapping[str, Mapping[str, Any]],
-        persist: bool = True,
-        restart_source: str | None = None,
-    ) -> RunManifest:
-        run_dir.mkdir(parents=True, exist_ok=True)
-        path = run_dir / MANIFEST_FILENAME
-        payload: Mapping[str, Any] = {
-            "version": MANIFEST_VERSION,
-            "run_id": run_id,
-            "name": run_name,
-            "config_source": str(config_source),
-            "config_checksum": config_checksum,
-            "created_at": timestamp(),
-            "updated_at": timestamp(),
-            "restart_source": restart_source,
-            "artifacts_root": ".",
-            "models": {},
-            "env_templates": {},
-            "jobs": [],
-            "summary": {},
-        }
-        model = RunManifestModel.model_validate(payload)
-        manifest = cls(path=path, model=model, persist=persist)
-        for job in jobs:
-            env_args = env_args_map[job.job_id]
-            sampling_args = sampling_args_map[job.job_id]
-            manifest.ensure_job(
-                job,
-                env_args=env_args,
-                sampling_args=sampling_args,
-                results_dir=(run_dir / job.job_id),
-            )
-        manifest._refresh_summary(save=True)
-        return manifest
+    if version != MANIFEST_VERSION:
+        location = f" at {path}" if path else ""
+        raise ValueError(f"Unsupported legacy run manifest version {version!r}{location}; expected 3.")
 
 
 __all__ = [
     "MANIFEST_FILENAME",
-    "RunManifest",
-    "RunManifestModel",
+    "MANIFEST_VERSION",
+    "SUPPORTED_MANIFEST_VERSIONS",
     "ManifestJobEntry",
-    "build_job_entry",
-    "compute_snapshot_checksum",
-    "manifest_job_signature",
-    "resolved_job_signature",
-    "timestamp",
+    "RunManifestModel",
+    "_require_manifest_v3",
 ]
-
-
-def _upgrade_manifest_payload(payload: Any) -> tuple[Any, bool]:
-    """Apply in-memory migrations for older manifest payloads."""
-    if not isinstance(payload, dict):
-        return payload, False
-
-    changed = False
-    version = payload.get("version")
-    jobs = payload.get("jobs")
-    if version == 2:
-        payload["version"] = MANIFEST_VERSION
-        payload.setdefault("artifacts_root", ".")
-        changed = True
-        if isinstance(jobs, list):
-            for index, job in enumerate(jobs):
-                if not isinstance(job, dict):
-                    continue
-                job_id = str(job.get("job_id") or "")
-                base_dir = Path(str(job.get("results_dir") or job_id or ""))
-                if not base_dir.as_posix():
-                    base_dir = Path(job_id or f"job-{index}")
-                if "results_relpath" not in job:
-                    job["results_relpath"] = (base_dir / "results.jsonl").as_posix()
-                    changed = True
-                if "metadata_relpath" not in job:
-                    job["metadata_relpath"] = (base_dir / "metadata.json").as_posix()
-                    changed = True
-                if "summary_relpath" in job:
-                    job.pop("summary_relpath", None)
-                    changed = True
-                if "artifacts_checksum" in job:
-                    job.pop("artifacts_checksum", None)
-                    changed = True
-                if "artifacts" in job:
-                    job.pop("artifacts", None)
-                    changed = True
-
-    env_templates = payload.get("env_templates")
-    if isinstance(env_templates, dict):
-        for template_id, template in env_templates.items():
-            if not isinstance(template, dict):
-                continue
-            if "interleave_scoring" not in template:
-                continue
-            interleave_value = template.pop("interleave_scoring")
-            template.setdefault("independent_scoring", interleave_value)
-            env_templates[template_id] = template
-            changed = True
-
-    return payload, changed
diff --git a/medarc_verifiers/cli/_manifest_planner.py b/medarc_verifiers/cli/_manifest_planner.py
deleted file mode 100644
index 64897260..00000000
--- a/medarc_verifiers/cli/_manifest_planner.py
+++ /dev/null
@@ -1,414 +0,0 @@
-"""Manifest planning helpers separating selection from runnable computation."""
-
-from __future__ import annotations
-
-import json
-import logging
-from dataclasses import dataclass
-from datetime import UTC, datetime
-from pathlib import Path
-from typing import Any, Mapping, Sequence
-
-from medarc_verifiers.cli._job_builder import ResolvedJob
-from medarc_verifiers.cli._manifest import (
-    MANIFEST_FILENAME,
-    RunManifest,
-    manifest_job_signature,
-    resolved_job_signature,
-)
-from medarc_verifiers.cli.utils.shared import slugify
-from medarc_verifiers.utils.pathing import resolve_results_dir_from_manifest
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class ManifestPlan:
-    manifest: RunManifest
-    runnable_job_ids: set[str]
-    reused_job_ids: set[str]
-
-
-@dataclass
-class ManifestSelection:
-    manifest: RunManifest
-    seed_manifest: RunManifest | None
-    strategy: str
-
-
-class ManifestPlanner:
-    """Resolve a manifest for a run and compute runnable/reused job sets."""
-
-    def __init__(
-        self,
-        *,
-        output_dir: Path,
-        run_id: str | None,
-        run_name: str,
-        config_path: Path,
-        config_checksum: str,
-        jobs: Sequence[ResolvedJob],
-        env_args_map: Mapping[str, Mapping[str, Any]],
-        sampling_args_map: Mapping[str, Mapping[str, Any]],
-        restart_source: str | None,
-        auto_resume: bool,
-        persist: bool,
-    ) -> None:
-        self.output_dir = Path(output_dir)
-        self.run_id = run_id
-        self.run_name = run_name
-        self.config_path = Path(config_path)
-        self.config_checksum = config_checksum
-        self.jobs = jobs
-        self.env_args_map = env_args_map
-        self.sampling_args_map = sampling_args_map
-        self.restart_source = restart_source
-        self.auto_resume = auto_resume
-        self.persist = persist
-
-    def plan(self, *, force_all: bool, forced_envs: set[str]) -> ManifestPlan:
-        selection = self._select_manifest()
-        runnable, reused = self._compute_runnable(selection, force_all=force_all, forced_envs=forced_envs)
-        return ManifestPlan(manifest=selection.manifest, runnable_job_ids=runnable, reused_job_ids=reused)
-
-    # Selection helpers
-    def _select_manifest(self) -> ManifestSelection:
-        if self.restart_source:
-            restart = self._select_restart_manifest(self.restart_source)
-            if restart:
-                return restart
-
-        if self.auto_resume:
-            resumed = self._select_auto_resume_manifest()
-            if resumed:
-                return resumed
-
-        manifest = self._create_fresh_manifest()
-        return ManifestSelection(manifest=manifest, seed_manifest=None, strategy="fresh")
-
-    def _select_restart_manifest(self, restart_source: str) -> ManifestSelection | None:
-        persist = self.persist
-        restart_path = Path(restart_source).expanduser()
-        seed_dir: Path | None = None
-        if restart_path.exists() and restart_path.is_dir():
-            seed_dir = restart_path.resolve()
-        else:
-            candidate = self.output_dir / restart_source
-            if candidate.exists() and candidate.is_dir():
-                seed_dir = candidate.resolve()
-        if seed_dir and (seed_dir / MANIFEST_FILENAME).exists():
-            seed_manifest = RunManifest.load(seed_dir / MANIFEST_FILENAME, persist=persist)
-            logger.info(
-                "Restart in-place: extending existing run '%s' with any new jobs from current config.",
-                seed_manifest.model.run_id,
-            )
-            self._ensure_jobs(seed_manifest, seed_manifest.run_dir)
-            return ManifestSelection(
-                manifest=seed_manifest,
-                seed_manifest=seed_manifest,
-                strategy="restart_in_place",
-            )
-
-        if seed_dir is None:
-            return None
-        if not (seed_dir / MANIFEST_FILENAME).exists():
-            msg = f"Invalid --restart '{seed_dir}': missing {MANIFEST_FILENAME}"
-            raise ValueError(msg)
-        seed_manifest = RunManifest.load(seed_dir / MANIFEST_FILENAME, persist=False)
-        dest_run_id = self.run_id or _generate_run_id(self.run_name)
-        run_dir = self._run_dir_for(dest_run_id)
-        manifest_path = run_dir / MANIFEST_FILENAME
-        if run_dir.exists() and manifest_path.exists() and persist:
-            msg = f"Run directory '{run_dir}' already exists; choose a different --run-id."
-            raise ValueError(msg)
-        logger.info("Restarting run '%s' from prior run '%s'.", dest_run_id, restart_source)
-        manifest = RunManifest.create(
-            run_dir=run_dir,
-            run_id=dest_run_id,
-            run_name=self.run_name,
-            config_source=self.config_path,
-            config_checksum=self.config_checksum,
-            jobs=self.jobs,
-            env_args_map=self.env_args_map,
-            sampling_args_map=self.sampling_args_map,
-            persist=persist,
-            restart_source=restart_source,
-        )
-        self._ensure_jobs(manifest, run_dir)
-        return ManifestSelection(manifest=manifest, seed_manifest=seed_manifest, strategy="restart_new")
-
-    def _select_auto_resume_manifest(self) -> ManifestSelection | None:
-        persist = self.persist
-        if self.run_id:
-            run_dir = self._run_dir_for(self.run_id)
-            manifest_path = run_dir / MANIFEST_FILENAME
-            if manifest_path.exists():
-                manifest = RunManifest.load(manifest_path, persist=persist)
-                existing_checksum = manifest.model.config_checksum
-                if existing_checksum and existing_checksum != self.config_checksum:
-                    msg = (
-                        f"Run '{self.run_id}' was created from a different configuration. "
-                        f"To start fresh, pick a different --run-id or pass --no-auto-resume. "
-                        f"To reuse completed jobs from this run, pass --restart {self.run_id}."
-                    )
-                    raise ValueError(msg)
-                self._ensure_jobs(manifest, run_dir)
-                return ManifestSelection(manifest=manifest, seed_manifest=None, strategy="auto_resume")
-            if run_dir.exists():
-                msg = f"Run '{self.run_id}' is missing {MANIFEST_FILENAME}; cannot auto-resume."
-                raise ValueError(msg)
-            logger.info(
-                "Auto-resume requested for run '%s', but no prior run exists. Starting a fresh run with this id.",
-                self.run_id,
-            )
-            manifest = self._create_fresh_manifest(run_id=self.run_id)
-            return ManifestSelection(manifest=manifest, seed_manifest=None, strategy="fresh")
-
-        candidate = _find_auto_resume_candidate(self.output_dir, expected_checksum=self.config_checksum)
-        if candidate is None:
-            logger.info(
-                "Auto-resume enabled but no matching run exists in %s; starting a fresh run. "
-                "Use --no-auto-resume to always start new runs.",
-                self.output_dir,
-            )
-            return None
-        manifest = RunManifest.load(candidate / MANIFEST_FILENAME, persist=persist)
-        self._ensure_jobs(manifest, manifest.run_dir)
-        return ManifestSelection(manifest=manifest, seed_manifest=None, strategy="auto_resume")
-
-    def _create_fresh_manifest(self, run_id: str | None = None) -> RunManifest:
-        dest_run_id = run_id or _generate_run_id(self.run_name)
-        run_dir = self._run_dir_for(dest_run_id)
-        manifest = RunManifest.create(
-            run_dir=run_dir,
-            run_id=dest_run_id,
-            run_name=self.run_name,
-            config_source=self.config_path,
-            config_checksum=self.config_checksum,
-            jobs=self.jobs,
-            env_args_map=self.env_args_map,
-            sampling_args_map=self.sampling_args_map,
-            persist=self.persist,
-            restart_source=None,
-        )
-        self._ensure_jobs(manifest, run_dir)
-        return manifest
-
-    def _ensure_jobs(self, manifest: RunManifest, run_dir: Path) -> None:
-        for job in self.jobs:
-            manifest.ensure_job(
-                job,
-                env_args=self.env_args_map[job.job_id],
-                sampling_args=self.sampling_args_map[job.job_id],
-                results_dir=run_dir / job.job_id,
-            )
-
-    def _run_dir_for(self, run_id: str) -> Path:
-        return Path(self.output_dir) / run_id
-
-    # Runnable computation
-    def _compute_runnable(
-        self,
-        selection: ManifestSelection,
-        *,
-        force_all: bool,
-        forced_envs: set[str],
-    ) -> tuple[set[str], set[str]]:
-        manifest = selection.manifest
-        strategy = selection.strategy
-        if strategy in {"restart_in_place", "restart_new"} and selection.seed_manifest is not None:
-            runnable, reused = _plan_regen_jobs(
-                manifest=manifest,
-                seed_manifest=selection.seed_manifest,
-                jobs=self.jobs,
-                force_all=force_all,
-                forced_envs=forced_envs,
-            )
-            if strategy == "restart_new" and reused:
-                logger.info("Reused %d completed job(s) from '%s'.", len(reused), self.restart_source)
-            return runnable, reused
-
-        if strategy == "auto_resume":
-            runnable = _plan_auto_resume_jobs(
-                manifest=manifest,
-                jobs=self.jobs,
-                env_args_map=self.env_args_map,
-                sampling_args_map=self.sampling_args_map,
-                force_all=force_all,
-                forced_envs=forced_envs,
-            )
-            return runnable, set()
-
-        runnable = {job.job_id for job in self.jobs}
-        return runnable, set()
-
-
-def _find_auto_resume_candidate(output_dir: Path, *, expected_checksum: str) -> Path | None:
-    """Pick the best prior run directory to auto-resume for the given checksum.
-
-    Preference order:
-    1) Matching config checksum and incomplete (completed < total)
-    2) Matching config checksum and most recent updated_at
-    Returns the run directory Path or None if no candidates.
-    """
-    candidates: list[tuple[bool, float, Path]] = []
-    for child in sorted(output_dir.iterdir() if output_dir.exists() else [], key=lambda p: p.name):
-        if not child.is_dir():
-            continue
-        manifest_path = child / MANIFEST_FILENAME
-        if not manifest_path.exists():
-            continue
-        try:
-            with manifest_path.open("r", encoding="utf-8") as fh:
-                payload = json.load(fh)
-        except Exception:  # noqa: BLE001
-            continue
-        if payload.get("config_checksum") != expected_checksum:
-            continue
-        summary = payload.get("summary") or {}
-        total = int(summary.get("total", 0))
-        completed = int(summary.get("completed", 0))
-        incomplete = completed < total if total > 0 else True
-        updated_at = payload.get("updated_at") or payload.get("created_at")
-        try:
-            ts = _parse_iso_ts(updated_at) if isinstance(updated_at, str) else (manifest_path.stat().st_mtime)
-        except Exception:  # noqa: BLE001
-            ts = manifest_path.stat().st_mtime
-        candidates.append((incomplete, float(ts), child))
-
-    if not candidates:
-        return None
-    candidates.sort(key=lambda t: (t[0], t[1]))
-    return candidates[-1][2]
-
-
-def _parse_iso_ts(value: str) -> float:
-    # Accept timestamps like '2025-11-07T01:23:45Z' or ISO with offset
-    try:
-        normalized = value.replace("Z", "+00:00")
-        return datetime.fromisoformat(normalized).timestamp()
-    except Exception:  # noqa: BLE001
-        return 0.0
-
-
-def _plan_auto_resume_jobs(
-    *,
-    manifest: RunManifest,
-    jobs: Sequence[ResolvedJob],
-    env_args_map: Mapping[str, Mapping[str, Any]],
-    sampling_args_map: Mapping[str, Mapping[str, Any]],
-    force_all: bool,
-    forced_envs: set[str],
-) -> set[str]:
-    job_lookup = {job.job_id: job for job in jobs}
-    manifest_signatures: dict[str, dict[str, Any]] = {}
-    resolved_signatures: dict[str, dict[str, Any]] = {}
-    runnable: set[str] = set()
-    manifest_job_ids = {entry.job_id for entry in manifest.jobs if entry.job_id}
-    new_jobs = set(job_lookup) - manifest_job_ids
-    if new_jobs:
-        logger.info(
-            "Auto-resume ignoring %d new job(s) not present in the manifest: %s",
-            len(new_jobs),
-            ", ".join(sorted(new_jobs)),
-        )
-    for entry in manifest.jobs:
-        job_id = entry.job_id
-        if not job_id:
-            continue
-        job = job_lookup.get(job_id)
-        if job is None:
-            logger.debug("Manifest contains job '%s' that is absent from the current config; skipping.", job_id)
-            continue
-        manifest_signature = manifest_signatures.get(job_id)
-        if manifest_signature is None:
-            manifest_signature = manifest_job_signature(manifest.model, entry)
-            manifest_signatures[job_id] = manifest_signature
-        resolved_signature = resolved_signatures.get(job_id)
-        if resolved_signature is None:
-            resolved_signature = resolved_job_signature(
-                job,
-                env_args=env_args_map[job_id],
-                sampling_args=sampling_args_map[job_id],
-            )
-            resolved_signatures[job_id] = resolved_signature
-        if manifest_signature != resolved_signature:
-            msg = (
-                f"Job '{job_id}' arguments changed since the manifest was recorded. "
-                "Start a fresh run by choosing a different --run-id or passing --no-auto-resume. "
-                "To reuse completed jobs from this run, pass --restart <run-id-or-path>."
-            )
-            raise ValueError(msg)
-        env_id = (entry.env_id or job.env.id or job.job_id).lower()
-        forced = force_all or env_id in forced_envs
-        if forced or entry.status != "completed":
-            runnable.add(job_id)
-    return runnable
-
-
-def _plan_regen_jobs(
-    *,
-    manifest: RunManifest,
-    seed_manifest: RunManifest,
-    jobs: Sequence[ResolvedJob],
-    force_all: bool,
-    forced_envs: set[str],
-) -> tuple[set[str], set[str]]:
-    runnable: set[str] = set()
-    reused: set[str] = set()
-    manifest_signatures: dict[str, dict[str, Any]] = {}
-    seed_signatures: dict[str, dict[str, Any]] = {}
-    for job in jobs:
-        entry = manifest.job_entry(job.job_id)
-        if entry is None:
-            continue
-        seed_entry = seed_manifest.job_entry(job.job_id)
-        env_id = (entry.env_id or job.env.id or job.job_id).lower()
-        forced = force_all or env_id in forced_envs
-        if (
-            not forced
-            and seed_entry is not None
-            and seed_entry.status == "completed"
-            and _manifest_job_signature_cached(seed_manifest, seed_entry, seed_signatures)
-            == _manifest_job_signature_cached(manifest, entry, manifest_signatures)
-        ):
-            seed_results_dir = seed_entry.results_dir
-            if seed_results_dir is None:
-                seed_results_dir = seed_manifest.run_dir / seed_entry.job_id
-            if isinstance(seed_results_dir, Path):
-                resolved_results_dir: Path | str | None = seed_results_dir
-            else:
-                resolved_results_dir = resolve_results_dir_from_manifest(
-                    str(seed_results_dir) if seed_results_dir is not None else None,
-                    job_id=seed_entry.job_id,
-                    run_dir=seed_manifest.run_dir,
-                )
-            manifest.record_job_skip(
-                job.job_id,
-                reason="up_to_date",
-                results_dir=resolved_results_dir or seed_results_dir,
-                source_entry=seed_entry,
-            )
-            reused.add(job.job_id)
-            continue
-        runnable.add(job.job_id)
-    return runnable, reused
-
-
-def _manifest_job_signature_cached(
-    manifest: RunManifest,
-    entry: Any,
-    cache: dict[str, dict[str, Any]],
-) -> dict[str, Any]:
-    job_id = entry.job_id
-    signature = cache.get(job_id)
-    if signature is None:
-        signature = manifest_job_signature(manifest.model, entry)
-        cache[job_id] = signature
-    return signature
-
-
-def _generate_run_id(name: str) -> str:
-    base = slugify(name or "run")
-    timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
-    return f"{base}-{timestamp}"
diff --git a/medarc_verifiers/cli/_manifest_tools.py b/medarc_verifiers/cli/_manifest_tools.py
deleted file mode 100644
index 836fd9d2..00000000
--- a/medarc_verifiers/cli/_manifest_tools.py
+++ /dev/null
@@ -1,389 +0,0 @@
-"""Utilities for manifest validation and migration."""
-
-from __future__ import annotations
-
-import os
-import json
-import logging
-import sys
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Mapping, Sequence
-
-from medarc_verifiers.cli._manifest import MANIFEST_FILENAME, RunManifestModel, SUPPORTED_MANIFEST_VERSIONS
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass(slots=True)
-class ManifestValidationIssue:
-    run_id: str
-    job_id: str
-    kind: str
-    message: str
-
-
-@dataclass(slots=True)
-class ManifestValidationResult:
-    manifests_checked: int
-    jobs_checked: int
-    issues: list[ManifestValidationIssue]
-
-    @property
-    def has_errors(self) -> bool:
-        return any(issue.kind == "error" for issue in self.issues)
-
-
-def validate_manifests_in_runs(runs_dir: Path | str, *, strict: bool = False) -> ManifestValidationResult:
-    runs_path = Path(runs_dir)
-    issues: list[ManifestValidationIssue] = []
-    manifests_checked = 0
-    jobs_checked = 0
-    if not runs_path.exists():
-        return ManifestValidationResult(manifests_checked=0, jobs_checked=0, issues=[])
-
-    run_dirs = sorted(path for path in runs_path.iterdir() if path.is_dir())
-    logger.info("Scanning manifests under %s...", runs_path)
-
-    manifest_run_dirs = [run_dir for run_dir in run_dirs if (run_dir / MANIFEST_FILENAME).exists()]
-    if not manifest_run_dirs:
-        return ManifestValidationResult(manifests_checked=0, jobs_checked=0, issues=[])
-
-    max_workers = min(len(manifest_run_dirs), max(1, (os.cpu_count() or 4) * 4))
-    if max_workers <= 1:
-        results = [_validate_run_dir(run_dir, strict=strict) for run_dir in manifest_run_dirs]
-    else:
-        results = list(_validate_run_dirs_parallel(manifest_run_dirs, strict=strict, max_workers=max_workers))
-
-    for result in results:
-        manifests_checked += result.manifests_checked
-        jobs_checked += result.jobs_checked
-        issues.extend(result.issues)
-
-    issues.sort(key=lambda item: (item.run_id, item.job_id, item.kind, item.message))
-    return ManifestValidationResult(manifests_checked=manifests_checked, jobs_checked=jobs_checked, issues=issues)
-
-
-def _validate_run_dirs_parallel(
-    run_dirs: Sequence[Path],
-    *,
-    strict: bool,
-    max_workers: int,
-) -> list[ManifestValidationResult]:
-    results: list[ManifestValidationResult] = []
-    progress, task_id = _create_manifest_scan_progress(len(run_dirs))
-    executor: ThreadPoolExecutor | None = None
-    futures = []
-    try:
-        executor = ThreadPoolExecutor(max_workers=max_workers)
-        futures = [executor.submit(_validate_run_dir, run_dir, strict=strict) for run_dir in run_dirs]
-        if progress is not None and task_id is not None:
-            with progress:
-                for future in as_completed(futures):
-                    results.append(future.result())
-                    progress.update(task_id, advance=1)
-        else:
-            for future in as_completed(futures):
-                results.append(future.result())
-    except KeyboardInterrupt:
-        logger.warning("Manifest scanning interrupted; cancelling validation workers.")
-        for future in futures:
-            future.cancel()
-        if executor is not None:
-            executor.shutdown(wait=False, cancel_futures=True)
-            executor = None
-        raise
-    finally:
-        if executor is not None:
-            executor.shutdown(wait=True, cancel_futures=False)
-    return results
-
-
-def _create_manifest_scan_progress(total: int) -> tuple[object | None, object | None]:
-    if total <= 0 or not sys.stderr.isatty():
-        return None, None
-    try:
-        from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn, TimeElapsedColumn
-
-        progress = Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            BarColumn(),
-            TaskProgressColumn(),
-            TimeElapsedColumn(),
-            transient=True,
-        )
-        task_id = progress.add_task("Scanning manifests", total=total)
-        return progress, task_id
-    except Exception:
-        return None, None
-
-
-def _validate_run_dir(run_dir: Path, *, strict: bool) -> ManifestValidationResult:
-    issues: list[ManifestValidationIssue] = []
-    manifest_path = run_dir / MANIFEST_FILENAME
-    if not manifest_path.exists():
-        return ManifestValidationResult(manifests_checked=0, jobs_checked=0, issues=[])
-
-    try:
-        payload = json.loads(manifest_path.read_text(encoding="utf-8"))
-    except Exception as exc:  # noqa: BLE001
-        return ManifestValidationResult(
-            manifests_checked=1,
-            jobs_checked=0,
-            issues=[
-                ManifestValidationIssue(
-                    run_id=run_dir.name,
-                    job_id="",
-                    kind="error",
-                    message=f"Failed to parse manifest: {exc}",
-                )
-            ],
-        )
-
-    version = payload.get("version")
-    if version not in SUPPORTED_MANIFEST_VERSIONS:
-        return ManifestValidationResult(
-            manifests_checked=1,
-            jobs_checked=0,
-            issues=[
-                ManifestValidationIssue(
-                    run_id=run_dir.name,
-                    job_id="",
-                    kind="error",
-                    message=f"Unsupported manifest version: {version}",
-                )
-            ],
-        )
-
-    model = RunManifestModel.model_validate(payload)
-    artifacts_root = str(getattr(model, "artifacts_root", ".") or ".")
-    jobs_checked = 0
-
-    for entry in model.jobs:
-        jobs_checked += 1
-        results_path, metadata_path, used_fallback = _resolve_job_artifact_paths(
-            run_dir=run_dir,
-            artifacts_root=artifacts_root,
-            job_id=entry.job_id,
-            results_relpath=entry.results_relpath,
-            metadata_relpath=entry.metadata_relpath,
-        )
-        if used_fallback:
-            issues.append(
-                ManifestValidationIssue(
-                    run_id=model.run_id,
-                    job_id=entry.job_id,
-                    kind="warning",
-                    message="Manifest artifact path missing; fallback to run-relative job directory would be used.",
-                )
-            )
-        if not results_path.exists():
-            kind = "error" if strict else "warning"
-            issues.append(
-                ManifestValidationIssue(
-                    run_id=model.run_id,
-                    job_id=entry.job_id,
-                    kind=kind,
-                    message=f"Missing results.jsonl at {results_path}",
-                )
-            )
-        if results_path.exists():
-            for message in _quick_validate_results_jsonl(
-                results_path,
-                num_examples=entry.num_examples,
-                rollouts_per_example=entry.rollouts_per_example,
-            ):
-                kind = "error" if strict else "warning"
-                issues.append(
-                    ManifestValidationIssue(
-                        run_id=model.run_id,
-                        job_id=entry.job_id,
-                        kind=kind,
-                        message=message,
-                    )
-                )
-        if entry.metadata_relpath and not metadata_path.exists():
-            kind = "error" if strict else "warning"
-            issues.append(
-                ManifestValidationIssue(
-                    run_id=model.run_id,
-                    job_id=entry.job_id,
-                    kind=kind,
-                    message=f"Missing metadata.json at {metadata_path}",
-                )
-            )
-
-    return ManifestValidationResult(manifests_checked=1, jobs_checked=jobs_checked, issues=issues)
-
-
-def _resolve_job_artifact_paths(
-    *,
-    run_dir: Path,
-    artifacts_root: str,
-    job_id: str,
-    results_relpath: str | None,
-    metadata_relpath: str | None,
-) -> tuple[Path, Path, bool]:
-    used_fallback = False
-    if results_relpath:
-        root = (run_dir / artifacts_root).resolve()
-        results_path = (root / results_relpath).resolve()
-        metadata_path = (
-            root / (metadata_relpath or f"{Path(results_relpath).parent.as_posix()}/metadata.json")
-        ).resolve()
-    else:
-        base_dir = (run_dir / job_id).resolve()
-        results_path = base_dir / "results.jsonl"
-        metadata_path = base_dir / "metadata.json"
-    if not results_path.exists() and (run_dir / job_id / "results.jsonl").exists():
-        used_fallback = True
-        results_path = (run_dir / job_id / "results.jsonl").resolve()
-        metadata_path = (run_dir / job_id / "metadata.json").resolve()
-    return results_path, metadata_path, used_fallback
-
-
-def _quick_validate_results_jsonl(
-    path: Path,
-    *,
-    num_examples: int | None,
-    rollouts_per_example: int | None,
-) -> list[str]:
-    first_line = _read_first_nonempty_line(path)
-    last_line = _read_last_nonempty_line(path)
-    if first_line is None or last_line is None:
-        return [f"results.jsonl at {path} is empty"]
-
-    issues: list[str] = []
-    first_payload = _decode_probe_line(first_line, path=path, position="first", issues=issues)
-    last_payload = _decode_probe_line(last_line, path=path, position="last", issues=issues)
-    if first_payload is None or last_payload is None:
-        return issues
-
-    for position, payload in (("first", first_payload), ("last", last_payload)):
-        if "example_id" not in payload:
-            issues.append(f"{position} JSONL row in {path} is missing example_id")
-    _validate_rollout_index(
-        first_payload,
-        path=path,
-        position="first",
-        rollouts_per_example=rollouts_per_example,
-        issues=issues,
-    )
-    _validate_rollout_index(
-        last_payload,
-        path=path,
-        position="last",
-        rollouts_per_example=rollouts_per_example,
-        issues=issues,
-    )
-
-    return issues
-
-
-def _decode_probe_line(
-    raw_line: str,
-    *,
-    path: Path,
-    position: str,
-    issues: list[str],
-) -> Mapping[str, Any] | None:
-    try:
-        payload = json.loads(raw_line)
-    except json.JSONDecodeError as exc:
-        issues.append(f"failed to parse {position} JSONL row in {path}: {exc.msg}")
-        return None
-    if not isinstance(payload, Mapping):
-        issues.append(f"{position} JSONL row in {path} is not a JSON object")
-        return None
-    return payload
-
-
-def _read_first_nonempty_line(path: Path) -> str | None:
-    with path.open("r", encoding="utf-8") as handle:
-        for line in handle:
-            candidate = line.strip()
-            if candidate:
-                return candidate
-    return None
-
-
-def _read_last_nonempty_line(path: Path) -> str | None:
-    with path.open("rb") as handle:
-        handle.seek(0, os.SEEK_END)
-        file_size = handle.tell()
-        if file_size <= 0:
-            return None
-
-        chunk_size = 8192
-        buffer = b""
-        position = file_size
-        while position > 0:
-            read_size = min(chunk_size, position)
-            position -= read_size
-            handle.seek(position)
-            buffer = handle.read(read_size) + buffer
-            lines = buffer.splitlines()
-            for raw_line in reversed(lines):
-                candidate = raw_line.strip()
-                if candidate:
-                    return candidate.decode("utf-8")
-        return None
-
-
-def _validate_rollout_index(
-    payload: Mapping[str, Any],
-    *,
-    path: Path,
-    position: str,
-    rollouts_per_example: int | None,
-    issues: list[str],
-) -> None:
-    rollout_index = _coerce_int(payload.get("rollout_index"))
-    if rollout_index is None:
-        return
-    if rollout_index < 0:
-        issues.append(f"{position} JSONL row in {path} has negative rollout_index={payload.get('rollout_index')!r}")
-        return
-    if rollouts_per_example and rollout_index >= rollouts_per_example:
-        issues.append(
-            f"{position} JSONL row in {path} has out-of-range rollout_index={payload.get('rollout_index')!r}; "
-            f"expected < {rollouts_per_example}"
-        )
-
-
-def _coerce_int(value: Any) -> int | None:
-    if value is None or isinstance(value, bool):
-        return None
-    if isinstance(value, int):
-        return value
-    if isinstance(value, float):
-        if value.is_integer():
-            return int(value)
-        return None
-    if isinstance(value, str):
-        try:
-            return int(value.strip())
-        except ValueError:
-            return None
-    return None
-
-
-def format_validation_issues(issues: Sequence[ManifestValidationIssue]) -> list[str]:
-    lines: list[str] = []
-    for issue in issues:
-        prefix = issue.kind.upper()
-        target = f"run={issue.run_id}"
-        if issue.job_id:
-            target += f" job={issue.job_id}"
-        lines.append(f"[{prefix}] {target}: {issue.message}")
-    return lines
-
-
-__all__ = [
-    "ManifestValidationIssue",
-    "ManifestValidationResult",
-    "validate_manifests_in_runs",
-    "format_validation_issues",
-]
diff --git a/medarc_verifiers/cli/_schemas.py b/medarc_verifiers/cli/_schemas.py
index 05dc139e..1958cdc7 100644
--- a/medarc_verifiers/cli/_schemas.py
+++ b/medarc_verifiers/cli/_schemas.py
@@ -1,127 +1,14 @@
-"""Pydantic schema stubs for the unified CLI configuration system."""
+"""Small schemas still shared by process export configuration."""
 
 from __future__ import annotations
 
-from pathlib import Path
-from typing import Any, ClassVar
+from typing import Any
 
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-
-RESERVED_MATRIX_KEYS = {
-    "id",
-    "module",
-    "env_args",
-    "extra_env_kwargs",
-    "independent_scoring",
-    "interleave_scoring",
-    "matrix",
-    "matrix_exclude",
-    "matrix_id_format",
-    "matrix_base_id",
-    "state_columns",
-}
-
-
-# NOTE: These schema definitions are intentionally incomplete. They provide
-# the structural scaffolding required to start wiring the config loader and
-# will be expanded in subsequent steps of the integration plan.
-
-
-class ModelConfigSchema(BaseModel):
-    """Schema for model configuration entries (keyed by identifier)."""
-
-    resume_tolerant_fields: ClassVar[set[str]] = frozenset(
-        {
-            "api_key_var",
-            "api_base_url",
-            "endpoints_path",
-            "headers",
-            "timeout",
-            "max_connections",
-            "max_keepalive_connections",
-            "max_retries",
-            "max_concurrent",
-        }
-    )
-
-    id: str | None = Field(
-        None,
-        description="Optional model identifier (legacy list format).",
-    )
-    model: str | None = Field(None, description="Provider-specific model slug.")
-    headers: list[str] | dict[str, str] | None = Field(
-        None,
-        description="Optional HTTP headers to attach to requests.",
-    )
-    sampling_args: dict[str, Any] = Field(default_factory=dict)
-    env_args: dict[str, Any] = Field(default_factory=dict)
-    env_overrides: dict[str, dict[str, Any]] = Field(default_factory=dict)
-    api_key_var: str | None = None
-    api_base_url: str | None = None
-    endpoints_path: str | None = None
-    timeout: float | None = Field(None, ge=0)
-    max_connections: int | None = Field(None, ge=1)
-    max_keepalive_connections: int | None = Field(None, ge=1)
-    max_retries: int | None = Field(None, ge=0)
-    max_concurrent: int | None = Field(None, ge=1)
-
-    @model_validator(mode="before")
-    @classmethod
-    def merge_legacy_params(cls, data: Any) -> Any:
-        if not isinstance(data, dict):
-            return data
-        params = data.get("params")
-        if not isinstance(params, dict):
-            return data
-        merged = dict(params)
-        for key, value in data.items():
-            if key == "params":
-                continue
-            merged[key] = value
-        merged.setdefault("id", data.get("id"))
-        return merged
-
-    @field_validator("headers")
-    @classmethod
-    def validate_headers(cls, value: list[str] | dict[str, str] | None) -> list[str] | dict[str, str] | None:
-        if value is None:
-            return None
-        if isinstance(value, dict):
-            return {str(key): str(item) for key, item in value.items()}
-        if isinstance(value, list):
-            for entry in value:
-                if not isinstance(entry, str):
-                    msg = "Header entries must be strings when provided as a list."
-                    raise ValueError(msg)
-        else:
-            msg = "Headers must be provided as a list of strings or a mapping."
-            raise ValueError(msg)
-        return value
-
-    @field_validator("env_args")
-    @classmethod
-    def default_model_env_args(cls, value: dict[str, Any]) -> dict[str, Any]:
-        return dict(value)
-
-    @field_validator("env_overrides", mode="before")
-    @classmethod
-    def validate_env_overrides(cls, value: Any) -> dict[str, dict[str, Any]]:
-        if value is None:
-            return {}
-        if not isinstance(value, dict):
-            raise ValueError("env_overrides must be a mapping of environment ids to mappings.")
-        normalized: dict[str, dict[str, Any]] = {}
-        for env_id, override in value.items():
-            if not isinstance(env_id, str) or not env_id:
-                raise ValueError("env_overrides keys must be non-empty strings.")
-            if not isinstance(override, dict):
-                raise ValueError(f"env_overrides['{env_id}'] must be a mapping.")
-            normalized[env_id] = dict(override)
-        return normalized
+from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 
 class EnvironmentExportConfig(BaseModel):
-    """Optional export customization embedded in environment configs."""
+    """Optional export customization embedded in legacy environment configs."""
 
     model_config = ConfigDict(populate_by_name=True)
 
@@ -163,197 +50,23 @@ def validate_answer_column(cls, value: Any) -> str | None:
 
 
 class EnvironmentConfigSchema(BaseModel):
-    """Schema for environment configuration entries (keyed by identifier)."""
+    """Legacy environment YAML entry schema used for process export overrides."""
 
-    id: str | None = Field(None, description="Optional environment identifier (legacy list format).")
-    module: str | None = Field(None, description="Optional module override when the ID differs from the import path.")
-    num_examples: int = Field(5, description="Number of examples to evaluate (-1 for all).")
-    rollouts_per_example: int = Field(1, description="Number of rollouts to perform per example.")
-    max_concurrent: int | None = Field(
-        None, description="Maximum number of concurrent requests when running the environment."
-    )
-    independent_scoring: bool | None = Field(
-        default=None,
-        description=(
-            "Whether to score each rollout independently (verifiers>=0.1.9). "
-            "When unset, defaults to rollout-level scoring."
-        ),
-    )
-    interleave_scoring: bool | None = Field(
-        default=None,
-        description="No longer supported; use independent_scoring instead.",
-    )
-    state_columns: list[str] | None = Field(
-        default=None, description="Optional state columns to persist in job outputs."
-    )
-    save_every: int | None = Field(default=None, description="Deprecated; accepted for compatibility and ignored.")
-    print_results: bool = Field(False, description="Deprecated; accepted for compatibility and ignored.")
-    verbose: bool | None = Field(None, description="Override per-environment verbosity.")
+    model_config = ConfigDict(extra="ignore")
+
+    id: str | None = None
+    module: str | None = None
     env_args: dict[str, Any] = Field(default_factory=dict)
-    extra_env_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description="Optional kwargs forwarded to verifiers Environment.set_kwargs(...) (verifiers>=0.1.9).",
-    )
-    rerun: bool = Field(
-        False,
-        description="Re-run jobs for this environment when resuming/regenerating even if previously completed.",
-    )
-    matrix: dict[str, list[Any]] | None = Field(default=None, description="Parameter sweeps for expansion.")
-    matrix_exclude: list[dict[str, Any]] | None = Field(default=None, description="List of matrix patterns to exclude.")
-    matrix_id_format: str | None = Field(default=None, description="Optional format string for matrix variant IDs.")
     matrix_base_id: str | None = Field(default=None, exclude=True)
-    export: EnvironmentExportConfig | None = Field(
-        default=None,
-        description="Optional export customization (keep/drop columns, prompt settings).",
-    )
-
-    @model_validator(mode="after")
-    def validate_scoring_flags(self) -> EnvironmentConfigSchema:
-        if self.interleave_scoring is not None:
-            raise ValueError("interleave_scoring is no longer supported; use independent_scoring instead.")
-        return self
-
-    @field_validator("num_examples")
-    @classmethod
-    def validate_num_examples(cls, value: int) -> int:
-        if value == -1 or value >= 1:
-            return value
-        msg = "num_examples must be -1 (all) or >= 1."
-        raise ValueError(msg)
+    export: EnvironmentExportConfig | None = None
 
     @field_validator("env_args")
     @classmethod
     def default_env_args(cls, value: dict[str, Any]) -> dict[str, Any]:
         return dict(value)
 
-    @field_validator("rollouts_per_example")
-    @classmethod
-    def validate_rollouts_per_example(cls, value: int) -> int:
-        if value >= 1:
-            return value
-        raise ValueError("rollouts_per_example must be >= 1.")
-
-    @field_validator("max_concurrent")
-    @classmethod
-    def validate_max_concurrent(cls, value: int | None) -> int | None:
-        if value is None or value >= 1:
-            return value
-        raise ValueError("max_concurrent must be >= 1 when provided.")
-
-    @field_validator("state_columns")
-    @classmethod
-    def validate_state_columns(cls, value: list[str] | None) -> list[str] | None:
-        if value is None:
-            return None
-        if not isinstance(value, list):
-            raise ValueError("state_columns must be a list of strings when provided.")
-        return [str(item) for item in value]
-
-    @field_validator("save_every")
-    @classmethod
-    def validate_save_every(cls, value: int | None) -> int | None:
-        if value is None:
-            return None
-        if value >= 1:
-            return value
-        raise ValueError("save_every must be >= 1 when provided.")
-
-    @field_validator("matrix", mode="before")
-    @classmethod
-    def validate_matrix(cls, value: Any) -> dict[str, list[Any]] | None:
-        if value is None:
-            return None
-        if not isinstance(value, dict):
-            raise ValueError("matrix must be a mapping of parameter names to value lists.")
-        normalized: dict[str, list[Any]] = {}
-        for key, items in value.items():
-            if not isinstance(key, str) or not key:
-                raise ValueError("matrix keys must be non-empty strings.")
-            if isinstance(items, tuple):
-                items = list(items)
-            elif not isinstance(items, list):
-                raise ValueError(f"matrix['{key}'] must be a list of values.")
-            if not items:
-                raise ValueError(f"matrix['{key}'] must contain at least one value.")
-            normalized[key] = list(items)
-        return normalized
-
-    @field_validator("matrix_exclude", mode="before")
-    @classmethod
-    def validate_matrix_exclude(cls, value: Any) -> list[dict[str, Any]] | None:
-        if value is None:
-            return None
-        if not isinstance(value, list):
-            raise ValueError("matrix_exclude must be a list of mappings.")
-        normalized: list[dict[str, Any]] = []
-        for entry in value:
-            if not isinstance(entry, dict):
-                raise ValueError("matrix_exclude entries must be mappings.")
-            normalized.append(dict(entry))
-        return normalized
-
-    @field_validator("matrix_id_format")
-    @classmethod
-    def validate_matrix_id_format(cls, value: str | None) -> str | None:
-        if value is None:
-            return None
-        if not isinstance(value, str) or not value:
-            raise ValueError("matrix_id_format must be a non-empty string when provided.")
-        return value
-
-    @model_validator(mode="after")
-    def validate_matrix_constraints(self) -> "EnvironmentConfigSchema":
-        matrix = self.matrix or {}
-        if matrix:
-            base_id = self.id or "<environment>"
-            for key in matrix:
-                if key in RESERVED_MATRIX_KEYS:
-                    raise ValueError(f"environment '{base_id}' matrix cannot vary '{key}'.")
-            matrix_keys = set(matrix)
-            if self.matrix_exclude:
-                for pattern in self.matrix_exclude:
-                    invalid_keys = set(pattern) - matrix_keys
-                    if invalid_keys:
-                        invalid = ", ".join(sorted(invalid_keys))
-                        raise ValueError(
-                            f"environment '{base_id}' matrix_exclude entry references unknown keys: {invalid}."
-                        )
-        return self
-
-
-class JobConfigSchema(BaseModel):
-    """Schema for job entries mapping models to environments."""
-
-    model_config = ConfigDict(populate_by_name=True)
-
-    model: str | dict[str, Any] = Field(..., description="Reference to a defined model id or inline model definition.")
-    env: str | list[str] = Field(..., description="Reference to an environment id or list of ids.")
-    env_args: dict[str, Any] = Field(default_factory=dict)
-    sampling_args: dict[str, Any] = Field(default_factory=dict)
-    name: str | None = Field(default=None, description="Optional human-friendly job label.")
-    sleep: float | None = Field(default=None, ge=0, description="Optional delay (in seconds) after this job.")
-
-
-DEFAULT_RUN_OUTPUT_DIR = Path("runs") / "raw"
-
-
-class RunConfigSchema(BaseModel):
-    """Top-level configuration for unified CLI runs."""
-
-    name: str = Field("benchmark", description="Human readable run name.")
-    models: dict[str, ModelConfigSchema] = Field(default_factory=dict, description="Map of model id -> configuration.")
-    envs: dict[str, EnvironmentConfigSchema] = Field(
-        ..., description="Map of environment id -> configuration.", min_length=1
-    )
-    jobs: list[JobConfigSchema] = Field(default_factory=list)
-    output_dir: Path = Field(default_factory=lambda: DEFAULT_RUN_OUTPUT_DIR)
-
 
 __all__ = [
-    "ModelConfigSchema",
     "EnvironmentConfigSchema",
     "EnvironmentExportConfig",
-    "JobConfigSchema",
-    "RunConfigSchema",
-    "RESERVED_MATRIX_KEYS",
 ]
diff --git a/medarc_verifiers/cli/_single_run.py b/medarc_verifiers/cli/_single_run.py
index dfbb3852..6d803f43 100644
--- a/medarc_verifiers/cli/_single_run.py
+++ b/medarc_verifiers/cli/_single_run.py
@@ -81,17 +81,10 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int:
         args = parser.parse_args([env_id, *remaining])
     except SystemExit as exc:  # pragma: no cover - argparse already emitted error/help
         return int(exc.code)
-    try:
-        args.model_call_retries = _resolve_model_call_retries(
-            args.model_call_retries,
-            args.enable_additional_retries,
-        )
-        if args.http_max_retries is not None and args.http_max_retries < 0:
-            raise ValueError("--http-max-retries must be >= 0.")
-        if args.rollout_max_retries < 0:
-            raise ValueError("--rollout-max-retries must be >= 0.")
-    except ValueError as exc:
-        parser.error(str(exc))
+    if args.http_max_retries is not None and args.http_max_retries < 0:
+        parser.error("--http-max-retries must be >= 0.")
+    if args.rollout_max_retries < 0:
+        parser.error("--rollout-max-retries must be >= 0.")
 
     try:
         env_override_mapping = build_cli_override(
@@ -136,7 +129,7 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int:
     )
 
     try:
-        headers = normalize_headers(args.header, header_file=args.header_file)
+        headers = normalize_headers(args.header)
     except ValueError as exc:
         parser.error(str(exc))
 
@@ -144,19 +137,6 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int:
 
     ensure_root_logging("DEBUG" if args.verbose else "INFO")
 
-    if args.model_call_retries > 0 and not args.dry_run:
-        from datetime import datetime
-
-        from medarc_verifiers.utils.retry import patch_verifiers_model_response_retry
-
-        cwd = Path.cwd()
-        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
-        retry_log_path = cwd / "logs" / f"medarc_model_retry_{ts}.log"
-        patch_verifiers_model_response_retry(
-            attempts=args.model_call_retries,
-            log_path=retry_log_path,
-        )
-
     endpoints_path = Path(args.endpoints_path).expanduser()
     if endpoints_path_explicit and not endpoints_path.exists():
         logger.error("Explicit endpoints registry path does not exist: %s", endpoints_path)
@@ -307,12 +287,6 @@ def _add_and_track(group, *args: str, **kwargs: Any) -> None:
         action="append",
         help=f"Extra HTTP header to send ('Name{HEADER_SEPARATOR} Value'). Repeatable.",
     )
-    _add_and_track(
-        core_group,
-        "--header-file",
-        type=Path,
-        help="File containing newline-delimited 'Name: Value' header entries. Overrides --header on conflicts.",
-    )
     _add_and_track(core_group, "--num-examples", "-n", type=int, default=5, help="Number of examples to evaluate.")
     _add_and_track(
         core_group, "--rollouts-per-example", "-r", type=int, default=3, help="Number of rollouts per example."
@@ -325,20 +299,6 @@ def _add_and_track(group, *args: str, **kwargs: Any) -> None:
         default=DEFAULT_SINGLE_RUN_MAX_CONCURRENT,
         help="Maximum number of concurrent requests.",
     )
-    _add_and_track(
-        core_group,
-        "--max-concurrent-generation",
-        type=int,
-        default=None,
-        help="Deprecated: ignored.",
-    )
-    _add_and_track(
-        core_group,
-        "--max-concurrent-scoring",
-        type=int,
-        default=None,
-        help="Deprecated: ignored.",
-    )
     _add_and_track(
         core_group,
         "--timeout",
@@ -360,20 +320,6 @@ def _add_and_track(group, *args: str, **kwargs: Any) -> None:
         default=0,
         help="Retry full rollout/group on retryable infra/invalid-response errors.",
     )
-    _add_and_track(
-        core_group,
-        "--model-call-retries",
-        type=int,
-        default=None,
-        help="Per-model-call MedARC retry attempts (0 disables the monkeypatch).",
-    )
-    _add_and_track(
-        core_group,
-        "--enable-additional-retries",
-        action=argparse.BooleanOptionalAction,
-        default=None,
-        help="Deprecated alias for --model-call-retries (true maps to 3 attempts).",
-    )
     _add_and_track(
         core_group,
         "--max-tokens",
@@ -574,23 +520,6 @@ def _option_was_provided(argv: Sequence[str], long_flag: str, short_flag: str |
     return False
 
 
-def _resolve_model_call_retries(model_call_retries: int | None, deprecated_toggle: bool | None) -> int:
-    if model_call_retries is not None:
-        if model_call_retries < 0:
-            raise ValueError("--model-call-retries must be >= 0.")
-        if deprecated_toggle is not None:
-            logger.warning(
-                "Ignoring deprecated --enable-additional-retries because --model-call-retries was explicitly set."
-            )
-        return model_call_retries
-
-    if deprecated_toggle is None:
-        return 0
-
-    logger.warning("Flag --enable-additional-retries is deprecated; use --model-call-retries <attempts> instead.")
-    return 3 if deprecated_toggle else 0
-
-
 def _same_path(left: Path, right: Path) -> bool:
     try:
         return left.resolve(strict=False) == right.resolve(strict=False)
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index 00ec35f1..d12b2127 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -21,12 +21,9 @@
 from verifiers.utils.eval_utils import run_evaluation
 from verifiers.utils.save_utils import make_serializable
 
-from medarc_verifiers.cli._config_loader import ConfigFormatError, load_run_config
 from medarc_verifiers.cli._constants import (
     BENCH_COMMAND,
     COMMAND,
-    DEFAULT_API_BASE_URL,
-    DEFAULT_API_KEY_VAR,
     DEFAULT_ENDPOINTS_PATH,
     DEFAULT_EVALS_DIR,
     DEFAULT_ENV_CONFIG_ROOT,
@@ -36,10 +33,6 @@
     PROCESS_COMMAND,
     WINRATE_COMMAND,
 )
-from medarc_verifiers.cli._job_builder import ResolvedJob, build_jobs
-from medarc_verifiers.cli._job_executor import ExecutorSettings, JobExecutionResult, execute_jobs
-from medarc_verifiers.cli._manifest import MANIFEST_FILENAME, ManifestJobEntry, RunManifest, compute_snapshot_checksum
-from medarc_verifiers.cli._manifest_planner import ManifestPlanner
 from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, EnvironmentExportConfig
 from medarc_verifiers.cli._single_run import run_single_mode
 from medarc_verifiers.cli.eval_identity import EvalPathPlan, generate_variant_id, plan_eval_paths
@@ -52,8 +45,6 @@
     dataset_is_excluded,
     normalize_dataset_ids,
     normalize_model_ids,
-    slugify,
-    validate_simple_name,
 )
 from medarc_verifiers.cli.verifiers_adapter import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
 from medarc_verifiers.utils.pathing import resolve_under
@@ -73,45 +64,10 @@ def build_batch_parser() -> argparse.ArgumentParser:
     """Construct the unified CLI parser."""
     parser = argparse.ArgumentParser(
         prog=COMMAND,
-        description="Run MedARC evaluations using unified configuration files.",
-    )
-    parser.add_argument("-c", "--config", required=True, type=Path, help="Path to a benchmark configuration file.")
-    parser.add_argument(
-        "--run-id",
-        help="Override the generated run identifier (simple name only: no slashes, no '..', not absolute).",
-    )
-    parser.add_argument("--name", help="Override the human-friendly run name (defaults to the config name).")
-    parser.add_argument(
-        "--restart",
-        help=(
-            "Seed jobs from a previous run directory or run_manifest.json path; "
-            "otherwise treated as a run id under output_dir."
-        ),
-    )
-    parser.add_argument(
-        "--auto-resume",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help=(
-            "Automatically resume the newest matching run (default: enabled). "
-            "Pass --no-auto-resume to force a fresh run."
-        ),
-    )
-    parser.add_argument(
-        "--on-complete",
-        choices=("exit", "continue", "rerun", "new", "prompt"),
-        default="prompt",
-        help=(
-            "Action when all selected jobs are already completed. "
-            "Use 'prompt' for interactive selection (default: prompt)."
-        ),
-    )
-    parser.add_argument("--force", action="store_true", help="Re-run every job regardless of manifest state.")
-    parser.add_argument(
-        "--forced",
-        action="append",
-        help="Re-run jobs for the specified environment(s); repeat or comma-separate values.",
+        description="Run MedARC evaluations using upstream verifiers TOML configs.",
     )
+    parser.add_argument("-c", "--config", required=True, type=Path, help="Path to an upstream TOML eval config file.")
+    parser.add_argument("--force", action="store_true", help="Archive existing deterministic output and rerun.")
     parser.add_argument("--output-dir", type=Path, help="Override the output directory from the configuration.")
     parser.add_argument(
         "--env-dir",
@@ -119,35 +75,16 @@ def build_batch_parser() -> argparse.ArgumentParser:
         default=DEFAULT_ENV_DIR,
         help="Directory containing environments (default: %(default)s).",
     )
-    parser.add_argument(
-        "--env-config-root",
-        type=Path,
-        default=DEFAULT_ENV_CONFIG_ROOT,
-        help="Directory containing environment YAMLs for auto-discovery (default: %(default)s).",
-    )
     parser.add_argument(
         "--endpoints-path",
         type=Path,
         default=DEFAULT_ENDPOINTS_PATH,
         help=f"Path to the endpoints registry file (default: {DEFAULT_ENDPOINTS_PATH}).",
     )
-    parser.add_argument(
-        "--default-api-key-var",
-        default=DEFAULT_API_KEY_VAR,
-        help=f"Default API key environment variable (default: {DEFAULT_API_KEY_VAR}).",
-    )
-    parser.add_argument(
-        "--default-api-base-url",
-        default=DEFAULT_API_BASE_URL,
-        help=f"Default API base URL (default: {DEFAULT_API_BASE_URL}).",
-    )
     parser.add_argument(
         "--api-base-url",
         default=None,
-        help=(
-            "Override API base URL for all models (CLI force > model api_base_url > --default-api-base-url). "
-            "Useful when pointing a config at a dynamically assigned endpoint."
-        ),
+        help="Override API base URL for all TOML evals.",
     )
     parser.add_argument("--api-key-var", default=None, help="Override API key environment variable for TOML bench.")
     parser.add_argument("--provider", default=None, help="Override provider shorthand for TOML bench.")
@@ -162,9 +99,6 @@ def build_batch_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Continue TOML sequential execution after a failed eval.",
     )
-    parser.add_argument(
-        "--job-id", action="append", help="Run only the specified job identifier (repeat to select multiple)."
-    )
     parser.add_argument(
         "--env-arg", action="append", help="Override an environment argument with KEY=VALUE (repeatable)."
     )
@@ -177,50 +111,24 @@ def build_batch_parser() -> argparse.ArgumentParser:
         "--dry-run", action="store_true", help="Resolve jobs and report overrides without executing them."
     )
     parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging.")
-    parser.add_argument(
-        "--save-results",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help="Persist evaluation outputs (default: enabled).",
-    )
-    parser.add_argument(
-        "--save-to-hf-hub",
-        action=argparse.BooleanOptionalAction,
-        default=False,
-        help="Upload results to the Hugging Face Hub.",
-    )
-    parser.add_argument("--hf-hub-dataset-name", help="Custom dataset name when uploading to the Hub.")
     parser.add_argument(
         "--max-concurrent",
         type=int,
         default=None,
-        help="Override env max_concurrent for all jobs (CLI > model > env > defaults).",
+        help="Override max_concurrent for every TOML eval.",
     )
-    parser.add_argument("--max-concurrent-generation", type=int, help="Deprecated: ignored.")
-    parser.add_argument("--max-concurrent-scoring", type=int, help="Deprecated: ignored.")
     parser.add_argument(
         "--timeout",
         type=float,
         default=None,
-        help="Override request timeout in seconds for all jobs (CLI > model > default).",
-    )
-    parser.add_argument(
-        "--http-max-retries",
-        type=int,
-        default=None,
-        help="HTTP/client-level retries for model calls (CLI > model max_retries).",
-    )
-    parser.add_argument(
-        "--rollout-max-retries",
-        type=int,
-        default=0,
-        help="Retry full rollout/group on retryable infra/invalid-response errors.",
+        help="Override request timeout in seconds for every TOML eval.",
     )
     parser.add_argument(
-        "--model-call-retries",
+        "--max-retries",
+        dest="rollout_max_retries",
         type=int,
         default=None,
-        help="Per-model-call MedARC retry attempts (0 disables the monkeypatch).",
+        help="Override upstream rollout max_retries for every TOML eval.",
     )
     parser.add_argument(
         "--sleep",
@@ -230,21 +138,6 @@ def build_batch_parser() -> argparse.ArgumentParser:
         default=0.0,
         help="Sleep this many seconds after each job (overridden by per-job sleep).",
     )
-    parser.add_argument(
-        "--enable-additional-retries",
-        action=argparse.BooleanOptionalAction,
-        default=None,
-        help="Deprecated alias for --model-call-retries (true maps to 3 attempts).",
-    )
-    parser.add_argument(
-        "--include-usage",
-        action=argparse.BooleanOptionalAction,
-        default=None,
-        help=(
-            "Include usage reporting in API requests (extra_body.usage.include). "
-            "Default: auto-detect (enabled for Prime Inference, disabled otherwise)."
-        ),
-    )
     return parser
 
 
@@ -561,42 +454,19 @@ def _run_batch_mode(argv: Sequence[str]) -> int:
             json_flag="--sampling-args",
             pair_flag="--sampling-arg",
         )
-        args.model_call_retries = _resolve_model_call_retries(
-            args.model_call_retries,
-            args.enable_additional_retries,
-        )
-        if args.http_max_retries is not None and args.http_max_retries < 0:
-            raise ValueError("--http-max-retries must be >= 0.")
-        if args.rollout_max_retries < 0:
-            raise ValueError("--rollout-max-retries must be >= 0.")
+        if args.rollout_max_retries is not None and args.rollout_max_retries < 0:
+            raise ValueError("--max-retries must be >= 0.")
     except ValueError as exc:
         parser.error(str(exc))
 
     config_path = Path(args.config).expanduser()
-    if config_path.suffix.lower() == ".toml":
-        try:
-            _validate_toml_selection_args(args, parser=parser)
-            return _run_toml_bench(args)
-        except Exception as exc:  # noqa: BLE001
-            logger.exception("TOML bench failed: %s", exc)
-            return 1
-
-    if args.restart:
-        args.auto_resume = False
-    # Restarting is an explicit workflow; disable auto-resume selection when --restart is set.
-    # The planner may restart in-place when --restart points to an existing run directory.
-
+    if config_path.suffix.lower() != ".toml":
+        parser.error("medarc-eval bench now accepts upstream TOML configs only.")
     try:
-        return _execute_batch(args)
-    except KeyboardInterrupt:
-        logger.warning("Batch run interrupted by user.")
-        return 1
-    except ConfigFormatError as exc:
-        parser.error(str(exc))
-    except SystemExit:  # pragma: no cover - argparse already handled messaging
-        raise
+        _validate_toml_selection_args(args, parser=parser)
+        return _run_toml_bench(args)
     except Exception as exc:  # noqa: BLE001
-        logger.exception("Unhandled error: %s", exc)
+        logger.exception("TOML bench failed: %s", exc)
         return 1
 
 
@@ -1370,228 +1240,6 @@ def _run_winrate_mode(argv: Sequence[str]) -> int:
     return 0
 
 
-def _execute_batch(args: argparse.Namespace) -> int:
-    # Set the include_usage environment variable if explicitly specified
-    if getattr(args, "include_usage", None) is not None:
-        import os
-
-        os.environ["MEDARC_INCLUDE_USAGE"] = "true" if args.include_usage else "false"
-
-    config_path = Path(args.config).expanduser()
-    if config_path.suffix.lower() in {".yaml", ".yml"}:
-        logger.warning("YAML benchmark configs will be removed; convert to TOML.")
-    env_root_override = Path(args.env_config_root).expanduser().resolve() if args.env_config_root else None
-    run_config = load_run_config(config_path, env_default_root=env_root_override)
-
-    run_name = args.name or run_config.name
-    output_dir = Path(args.output_dir).expanduser() if args.output_dir else Path(run_config.output_dir).expanduser()
-    output_dir = output_dir.resolve()
-    run_id = args.run_id  # May be None when using --auto-resume discovery
-    if run_id is not None:
-        try:
-            run_id = validate_simple_name(run_id, flag="--run-id")
-        except ValueError as exc:
-            logger.error("Invalid --run-id '%s': %s", run_id, exc)
-            logger.error("Suggested safe value: --run-id %s", slugify(run_id))
-            return 1
-
-    if args.restart:
-        restart_raw = args.restart
-        restart_path = Path(restart_raw).expanduser()
-        try:
-            if restart_path.exists():
-                if restart_path.is_dir():
-                    args.restart = str(restart_path)
-                elif restart_path.is_file() and restart_path.name == MANIFEST_FILENAME:
-                    args.restart = str(restart_path.parent)
-                else:
-                    logger.error(
-                        "Invalid --restart '%s': expected a run directory or %s file.",
-                        restart_raw,
-                        MANIFEST_FILENAME,
-                    )
-                    return 1
-            else:
-                args.restart = validate_simple_name(restart_raw, flag="--restart")
-        except OSError as exc:
-            logger.error("Invalid --restart '%s': %s", restart_raw, exc)
-            return 1
-        except ValueError as exc:
-            logger.error("Invalid --restart '%s': %s", restart_raw, exc)
-            return 1
-
-    if args.model_call_retries > 0 and not args.dry_run:
-        from datetime import datetime
-
-        from medarc_verifiers.utils.retry import patch_verifiers_model_response_retry
-
-        cwd = Path.cwd()
-        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
-        retry_log_path = cwd / "logs" / f"medarc_model_retry_{ts}.log"
-        patch_verifiers_model_response_retry(
-            attempts=args.model_call_retries,
-            log_path=retry_log_path,
-        )
-
-    jobs = build_jobs(run_config)
-    if not jobs:
-        logger.error("Configuration %s did not produce any jobs.", config_path)
-        return 1
-
-    selected_jobs = _filter_jobs(jobs, args.job_id)
-    if not selected_jobs:
-        logger.error("No jobs matched the provided filters.")
-        return 1
-
-    env_args_map, sampling_args_map = _build_effective_args(jobs)
-    config_checksum = compute_snapshot_checksum(run_config.model_dump())
-    forced_envs = _parse_forced_envs(args.forced)
-    forced_envs.update(_collect_rerun_envs(run_config.envs))
-
-    planner = ManifestPlanner(
-        output_dir=output_dir,
-        run_id=run_id,
-        run_name=run_name,
-        config_path=config_path,
-        config_checksum=config_checksum,
-        jobs=jobs,
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        restart_source=args.restart,
-        auto_resume=bool(args.auto_resume),
-        persist=not bool(args.dry_run),
-    )
-
-    try:
-        manifest_plan = planner.plan(force_all=bool(args.force), forced_envs=forced_envs)
-    except ValueError as exc:
-        logger.error("%s", exc)
-        return 1
-
-    runnable_ids = manifest_plan.runnable_job_ids
-    selected_ids = {job.job_id for job in selected_jobs}
-    planned_jobs = [job for job in jobs if job.job_id in runnable_ids and job.job_id in selected_ids]
-
-    _print_job_plan(
-        selected_jobs,
-        manifest=manifest_plan.manifest,
-        runnable_job_ids=runnable_ids,
-        discovered_total=len(jobs),
-        dry_run=bool(args.dry_run),
-    )
-
-    if not planned_jobs:
-        if manifest_plan.reused_job_ids:
-            logger.info(
-                "All jobs already completed (reused %d job(s) from prior manifests).",
-                len(manifest_plan.reused_job_ids),
-            )
-        else:
-            logger.info("No jobs were scheduled after applying filters and resume settings.")
-
-        # Check if all selected jobs are completed (not just filtered out)
-        all_completed = all(
-            manifest_plan.manifest.job_entry(job.job_id)
-            and manifest_plan.manifest.job_entry(job.job_id).status == "completed"
-            for job in selected_jobs
-        )
-
-        if all_completed and selected_jobs and not args.dry_run and not args.force:
-            choice = args.on_complete
-            if choice == "prompt":
-                choice = _prompt_completed_jobs_action()
-            if choice == "new":
-                logger.info("Creating a new run with all jobs...")
-                # Create a fresh run by disabling auto-resume and forcing a new run_id
-                # Recursively call with updated args to create new manifest
-                new_args = argparse.Namespace(**vars(args))
-                new_args.auto_resume = False
-                new_args.run_id = None  # Force generation of new run_id
-                new_args.restart = None
-                return _execute_batch(new_args)
-            elif choice == "rerun":
-                logger.info("Rerunning all completed jobs...")
-                # Set all selected jobs to runnable
-                runnable_ids = {job.job_id for job in selected_jobs}
-                planned_jobs = [job for job in jobs if job.job_id in runnable_ids and job.job_id in selected_ids]
-                # Continue execution below
-            elif choice == "exit":
-                logger.info("Exiting without running jobs.")
-                _log_summary([], manifest_plan.manifest)
-                return 0
-            else:  # continue/skip
-                logger.info("Continuing without running jobs.")
-                _log_summary([], manifest_plan.manifest)
-                return 0
-        else:
-            _log_summary([], manifest_plan.manifest)
-            return 0
-
-    if not planned_jobs:
-        # After prompting, still no planned jobs (shouldn't happen, but safety check)
-        _log_summary([], manifest_plan.manifest)
-        return 0
-
-    forced_job_ids = _compute_forced_job_ids(
-        planned_jobs=planned_jobs,
-        runnable_job_ids=runnable_ids,
-        manifest=manifest_plan.manifest,
-        force_all=bool(args.force),
-        forced_envs=forced_envs,
-    )
-
-    settings = ExecutorSettings(
-        run_id=manifest_plan.manifest.model.run_id or "",
-        output_dir=output_dir,
-        env_dir=Path(args.env_dir).expanduser(),
-        endpoints_path=Path(args.endpoints_path).expanduser() if args.endpoints_path else None,
-        endpoints_path_explicit=bool(getattr(args, "endpoints_path_explicit", False)),
-        default_api_key_var=args.default_api_key_var,
-        default_api_key_var_explicit=bool(getattr(args, "default_api_key_var_explicit", False)),
-        default_api_base_url=args.default_api_base_url,
-        api_base_url_override=args.api_base_url,
-        log_level="DEBUG" if args.verbose else "INFO",
-        verbose=args.verbose,
-        save_results=args.save_results,
-        save_to_hf_hub=args.save_to_hf_hub,
-        hf_hub_dataset_name=_coerce_optional_str(args.hf_hub_dataset_name),
-        max_concurrent_generation=args.max_concurrent_generation,
-        max_concurrent_scoring=args.max_concurrent_scoring,
-        max_concurrent=args.max_concurrent,  # CLI override (None if not provided)
-        http_max_retries=args.http_max_retries,
-        rollout_max_retries=args.rollout_max_retries,
-        timeout=args.timeout,
-        sleep=args.sleep,
-        dry_run=args.dry_run,
-        cli_env_args=getattr(args, "cli_env_args", None),
-        cli_sampling_args=getattr(args, "cli_sampling_args", None),
-        forced_job_ids=forced_job_ids,
-    )
-
-    logger.info(
-        "Loaded %d job(s); executing %d after filters (%d reusable).",
-        len(jobs),
-        len(planned_jobs),
-        len(manifest_plan.reused_job_ids),
-    )
-
-    endpoints_cache: dict[str, Any] = {}
-    env_metadata_cache: dict[str, Any] = {}
-
-    results = execute_jobs(
-        planned_jobs,
-        settings,
-        endpoints_cache=endpoints_cache,
-        env_metadata_cache=env_metadata_cache,
-        manifest=None if args.dry_run else manifest_plan.manifest,
-    )
-
-    _log_summary(results, manifest_plan.manifest)
-
-    has_failures = any(result.status == "failed" for result in results if result.status != "skipped")
-    return 1 if has_failures else 0
-
-
 def _validate_toml_selection_args(args: argparse.Namespace, *, parser: argparse.ArgumentParser) -> None:
     for attr, flag in (("eval_index", "--eval-index"), ("start_at", "--start-at"), ("stop_after", "--stop-after")):
         value = getattr(args, attr, None)
@@ -1635,6 +1283,10 @@ def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argpa
         item.setdefault("save_results", True)
         if args.max_concurrent is None and "max_concurrent" not in item:
             item["max_concurrent"] = 1
+        if args.timeout is not None:
+            item["timeout"] = args.timeout
+        if args.rollout_max_retries is not None:
+            item["max_retries"] = args.rollout_max_retries
         prepared.append(item)
     return prepared
 
@@ -1846,31 +1498,6 @@ def _print_toml_bench_plan(eval_configs: Sequence[Any], path_plans: Sequence[Eva
     console.print(table)
 
 
-def _build_effective_args(
-    jobs: Sequence[ResolvedJob],
-) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]:
-    env_map: dict[str, dict[str, Any]] = {}
-    sampling_map: dict[str, dict[str, Any]] = {}
-    for job in jobs:
-        env_map[job.job_id] = dict(job.env_args)
-        sampling_map[job.job_id] = dict(job.sampling_args)
-    return env_map, sampling_map
-
-
-def _parse_forced_envs(values: Sequence[str] | None) -> set[str]:
-    forced: set[str] = set()
-    if not values:
-        return forced
-    for chunk in values:
-        if not chunk:
-            continue
-        for item in chunk.split(","):
-            value = item.strip()
-            if value:
-                forced.add(value.lower())
-    return forced
-
-
 def _parse_repeatable_csv(values: Sequence[str] | None) -> list[str]:
     parsed: list[str] = []
     for chunk in values or ():
@@ -1890,23 +1517,6 @@ def _option_was_provided(argv: Sequence[str], long_flag: str) -> bool:
     return False
 
 
-def _resolve_model_call_retries(model_call_retries: int | None, deprecated_toggle: bool | None) -> int:
-    if model_call_retries is not None:
-        if model_call_retries < 0:
-            raise ValueError("--model-call-retries must be >= 0.")
-        if deprecated_toggle is not None:
-            logger.warning(
-                "Ignoring deprecated --enable-additional-retries because --model-call-retries was explicitly set."
-            )
-        return model_call_retries
-
-    if deprecated_toggle is None:
-        return 0
-
-    logger.warning("Flag --enable-additional-retries is deprecated; use --model-call-retries <attempts> instead.")
-    return 3 if deprecated_toggle else 0
-
-
 def _filter_winrate_datasets(
     datasets: Sequence[tuple[str, Sequence[Path]]],
     exclude_datasets: Sequence[str],
@@ -1926,132 +1536,16 @@ def _filter_winrate_datasets(
     return filtered
 
 
-def _collect_rerun_envs(envs: Mapping[str, EnvironmentConfigSchema]) -> set[str]:
-    rerun: set[str] = set()
-    for env in envs.values():
-        if getattr(env, "rerun", False):
-            for key in (env.id, env.module, env.matrix_base_id):
-                if key:
-                    rerun.add(str(key).lower())
-    return rerun
-
-
-def _compute_forced_job_ids(
-    *,
-    planned_jobs: Sequence[ResolvedJob],
-    runnable_job_ids: set[str],
-    manifest: RunManifest | None,
-    force_all: bool,
-    forced_envs: set[str],
-) -> set[str]:
-    forced_ids: set[str] = set()
-    if force_all:
-        return {job.job_id for job in planned_jobs}
-
-    for job in planned_jobs:
-        entry = manifest.job_entry(job.job_id) if manifest is not None else None
-        env_forced = any(key in forced_envs for key in _force_keys_for_job(job, entry))
-        completed_but_runnable = bool(
-            entry is not None and entry.status == "completed" and job.job_id in runnable_job_ids
-        )
-        if env_forced or completed_but_runnable:
-            forced_ids.add(job.job_id)
-    return forced_ids
-
-
-def _force_keys_for_job(job: ResolvedJob, entry: ManifestJobEntry | None) -> set[str]:
-    keys: set[str] = {job.job_id.lower()}
-    for value in (
-        getattr(job.env, "id", None),
-        getattr(job.env, "module", None),
-        getattr(job.env, "matrix_base_id", None),
-        getattr(entry, "env_id", None),
-    ):
-        if value:
-            keys.add(str(value).lower())
-    return keys
-
-
-def _filter_jobs(jobs: Sequence[ResolvedJob], job_filters: Sequence[str] | None) -> list[ResolvedJob]:
-    if not job_filters:
-        return list(jobs)
-    filters = set(job_filters)
-    selected = [job for job in jobs if job.job_id in filters]
-    missing = filters - {job.job_id for job in selected}
-    if missing:
-        logger.warning("Unknown job ids requested: %s", ", ".join(sorted(missing)))
-    return selected
-
-
-def _coerce_optional_str(value: str | None) -> str | None:
-    if value is None or value == "":
-        return None
-    return value
-
-
-def _prompt_completed_jobs_action() -> str:
-    """Prompt user to choose what to do when all jobs are completed.
-
-    Returns:
-        "new", "rerun", "continue", or "exit"
-    """
-    console = Console()
-
-    message = "\n[bold yellow]All jobs are already completed.[/bold yellow]\n"
-    message += "What would you like to do?\n"
-    message += "  [bold cyan]n[/bold cyan] - Create a new run\n"
-    message += "  [bold cyan]r[/bold cyan] - Rerun all jobs (ignore completion status)\n"
-    message += "  [bold cyan]c[/bold cyan] - Continue without running (default)\n"
-    message += "  [bold cyan]e[/bold cyan] - Exit\n"
-
-    console.print(message)
-
-    try:
-        response = input("Choose [n/r/c/e]: ").strip().lower()
-    except (EOFError, KeyboardInterrupt):
-        print()  # New line after Ctrl+C
-        return "exit"
-
-    if response == "n" or response == "new":
-        return "new"
-    elif response == "r" or response == "rerun":
-        return "rerun"
-    elif response == "e" or response == "exit":
-        return "exit"
-    else:
-        # Default to continue for any other input (including empty/enter)
-        return "continue"
-
-
-def _log_summary(results: Sequence[JobExecutionResult], manifest: RunManifest | None = None) -> None:
-    if manifest is not None:
-        summary = manifest.summary
-        logger.info(
-            "Run complete: %d completed, %d pending, %d failed, %d skipped (total %d).",
-            summary.get("completed", 0),
-            summary.get("pending", 0),
-            summary.get("failed", 0),
-            summary.get("skipped", 0),
-            summary.get("total", 0),
-        )
-        return
-    total = len(results)
-    succeeded = sum(result.status == "succeeded" for result in results)
-    skipped = sum(result.status == "skipped" for result in results)
-    failed = sum(result.status == "failed" for result in results)
-    logger.info("Run complete: %d succeeded, %d skipped, %d failed (total %d).", succeeded, skipped, failed, total)
-
-
 def _print_general_help() -> None:
     message = dedent(
         f"""\
         Usage:
           {COMMAND} <ENV> [options]                 # Single run (ENV must be first; use ENV --help for details)
-          {COMMAND} {BENCH_COMMAND} --config CONFIG.yaml ...  # Batch run (see: {COMMAND} {BENCH_COMMAND} --help)
+          {COMMAND} {BENCH_COMMAND} --config CONFIG.toml ...  # Sequential TOML bench
           {COMMAND} {PROCESS_COMMAND} [options]               # Export raw runs to parquet (see: {COMMAND} {PROCESS_COMMAND} --help)
           {COMMAND} {WINRATE_COMMAND} [options]               # Compute win rates from processed parquet outputs
 
-        First argument must be the environment slug for single runs. Use '{COMMAND} {BENCH_COMMAND} --help' for batch mode options."""
+        First argument must be the environment slug for single runs. Use '{COMMAND} {BENCH_COMMAND} --help' for TOML bench options."""
     )
     print(message)
 
@@ -2133,72 +1627,5 @@ def _load_env_export_map(root: Path | None) -> dict[str, EnvironmentExportConfig
     return export_map
 
 
-def _print_job_plan(
-    jobs: Sequence[ResolvedJob],
-    *,
-    manifest: RunManifest | None,
-    runnable_job_ids: set[str],
-    discovered_total: int,
-    dry_run: bool,
-) -> None:
-    """Render a human-friendly summary of the jobs scheduled for execution."""
-    listed_total = len(jobs)
-    scheduled_total = sum(1 for job in jobs if job.job_id in runnable_job_ids)
-    caption_parts: list[str] = [f"{listed_total} job(s) listed"]
-    caption_parts.append(f"{scheduled_total} to {'dry-run' if dry_run else 'run'}")
-    if discovered_total != listed_total:
-        caption_parts.append(f"{discovered_total} discovered")
-    caption = " | ".join(part for part in caption_parts if part)
-
-    if not jobs:
-        logger.info("No jobs to display (%s).", caption)
-        return
-
-    def _format_label(primary: str | None, secondary: str | None) -> str:
-        if primary and secondary and primary != secondary:
-            return f"{primary} ({secondary})"
-        return primary or secondary or "-"
-
-    def _resolve_status(job_id: str, entry: ManifestJobEntry | None) -> str:
-        if job_id in runnable_job_ids:
-            return "next"
-        if entry and entry.status == "completed":
-            return "completed"
-        return "pending"
-
-    entries = {}
-    if manifest is not None:
-        entries = {entry.job_id: entry for entry in manifest.jobs if entry.job_id}
-
-    console = Console()
-    table = Table(title="Planned Jobs", caption=caption, expand=True)
-    table.add_column("#", justify="right", style="dim")
-    table.add_column("Job ID", style="bold cyan", overflow="fold")
-    table.add_column("Status", style="yellow")
-    table.add_column("Name", style="white", overflow="fold")
-    table.add_column("Model", style="magenta", overflow="fold")
-    table.add_column("Environment", style="green", overflow="fold")
-    table.add_column("Examples", justify="right")
-    table.add_column("Rollouts", justify="right")
-
-    for index, job in enumerate(jobs, start=1):
-        entry = entries.get(job.job_id)
-        model_label = _format_label(job.model.id, job.model.model)
-        env_label = _format_label(job.env.id, job.env.module)
-        status = _resolve_status(job.job_id, entry)
-        table.add_row(
-            str(index),
-            job.job_id,
-            status,
-            job.name or "-",
-            model_label,
-            env_label,
-            str(job.env.num_examples),
-            str(job.env.rollouts_per_example),
-        )
-
-    console.print(table)
-
-
 if __name__ == "__main__":  # pragma: no cover
     raise SystemExit(main())
diff --git a/tests/test_cli/test_config_loader.py b/tests/test_cli/test_config_loader.py
deleted file mode 100644
index b39e8d87..00000000
--- a/tests/test_cli/test_config_loader.py
+++ /dev/null
@@ -1,742 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-from pathlib import Path
-
-import pytest
-from pydantic import ValidationError
-
-from medarc_verifiers.cli._config_loader import ConfigFormatError, load_run_config
-from medarc_verifiers.cli._job_builder import build_jobs
-from medarc_verifiers.cli._job_executor import ExecutorSettings, execute_jobs
-
-
-@dataclass
-class _FakeParam:
-    name: str
-    required: bool = False
-    choices: tuple | None = None
-    argparse_type: type | None = None
-    is_list: bool = False
-    element_type: type | None = None
-    action: str | None = None
-    kind: str | None = None
-    supports_cli: bool = True
-
-
-def _write_yaml(path: Path, content: str) -> Path:
-    path.write_text(content)
-    return path
-
-
-def test_load_run_config_parses_basic_yaml(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        name: demo-run
-        models:
-          - id: gpt-mini
-        envs:
-          - id: medqa
-        jobs:
-          - model: gpt-mini
-            env: medqa
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert config.name == "demo-run"
-    assert len(config.models) == 1
-    assert "gpt-mini" in config.models
-    assert config.models["gpt-mini"].id == "gpt-mini"
-    assert len(config.envs) == 1
-    assert "medqa" in config.envs
-    assert config.envs["medqa"].id == "medqa"
-    assert len(config.jobs) == 1
-    assert config.jobs[0].model == "gpt-mini"
-
-
-def test_load_run_config_supports_mapped_format(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    config_path = _write_yaml(
-        tmp_path / "mapped.yaml",
-        """
-        models:
-          gpt-mini:
-            model: openai/gpt-mini
-        envs:
-          medqa:
-            num_examples: 5
-        jobs:
-          - model: gpt-mini
-            env: medqa
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert set(config.models) == {"gpt-mini"}
-    assert config.models["gpt-mini"].model == "openai/gpt-mini"
-    assert set(config.envs) == {"medqa"}
-    assert config.envs["medqa"].num_examples == 5
-
-
-def test_load_run_config_rejects_non_mapping_root(tmp_path: Path) -> None:
-    config_path = _write_yaml(
-        tmp_path / "invalid.yaml",
-        """
-        - not: a-mapping
-        """,
-    )
-
-    with pytest.raises(ConfigFormatError):
-        load_run_config(config_path)
-
-
-def test_model_headers_validation(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    config_path = _write_yaml(
-        tmp_path / "headers.yaml",
-        """
-        models:
-          - id: bad
-            headers:
-              - 123
-        envs:
-          - id: medqa
-        jobs:
-          - model: bad
-            env: medqa
-        """,
-    )
-
-    with pytest.raises(ValidationError):
-        load_run_config(config_path)
-
-
-def test_environment_num_examples_validation(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    config_path = _write_yaml(
-        tmp_path / "envs.yaml",
-        """
-        envs:
-          - id: medqa
-            num_examples: 0
-        jobs:
-          - model: gpt
-            env: medqa
-        models:
-          - id: gpt
-        """,
-    )
-
-    with pytest.raises(ValidationError):
-        load_run_config(config_path)
-
-
-def test_environment_env_args_unknown(monkeypatch, tmp_path: Path) -> None:
-    def fake_metadata(_env_id: str, cache=None):
-        return [_FakeParam("shuffle_seed"), _FakeParam("shuffle_answers")]
-
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        fake_metadata,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "unknown_env_arg.yaml",
-        """
-        envs:
-          - id: medqa
-            env_args:
-              invalid_param: true
-        models:
-          - id: gpt
-        jobs:
-          - model: gpt
-            env: medqa
-        """,
-    )
-
-    with pytest.raises(ValueError):
-        load_run_config(config_path)
-
-
-def test_environment_env_args_known(monkeypatch, tmp_path: Path) -> None:
-    def fake_metadata(_env_id: str, cache=None):
-        return [_FakeParam("shuffle_answers"), _FakeParam("shuffle_seed")]
-
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        fake_metadata,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "known_env_arg.yaml",
-        """
-        envs:
-          - id: medqa
-            env_args:
-              shuffle_answers: true
-        models:
-          - id: gpt
-        jobs:
-          - model: gpt
-            env: medqa
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert config.envs["medqa"].env_args == {"shuffle_answers": True}
-
-
-def test_env_paths_resolve_with_cli_default_root(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    configs_dir = tmp_path / "configs"
-    envs_dir = configs_dir / "envs"
-    envs_dir.mkdir(parents=True)
-    _write_yaml(
-        envs_dir / "medqa.yaml",
-        """
-        - id: medqa
-          module: medqa
-        """,
-    )
-    config_path = _write_yaml(
-        configs_dir / "jobs.yaml",
-        """
-        models:
-          - id: gpt
-        envs:
-          - medqa
-        jobs:
-          - model: gpt
-            env: medqa
-        """,
-    )
-
-    config = load_run_config(config_path, env_default_root=envs_dir)
-
-    assert "medqa" in config.envs
-
-
-def test_env_paths_use_env_config_root(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    shared_envs = tmp_path / "shared_envs"
-    shared_envs.mkdir()
-    _write_yaml(
-        shared_envs / "custom_env.yaml",
-        """
-        - id: custom_env
-          module: custom_env
-        """,
-    )
-    config_path = _write_yaml(
-        tmp_path / "jobs.yaml",
-        """
-        models:
-          - id: gpt
-        envs:
-          - custom_env
-        jobs:
-          - model: gpt
-            env: custom_env
-        """,
-    )
-
-    config = load_run_config(config_path, env_default_root=shared_envs)
-
-    assert "custom_env" in config.envs
-
-
-def test_envs_auto_discovered_from_env_config_root(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    env_root = tmp_path / "auto_envs"
-    env_root.mkdir()
-    _write_yaml(
-        env_root / "auto.yaml",
-        """
-        - id: auto_env
-          module: auto_env
-        """,
-    )
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        models:
-          - id: auto-model
-        jobs:
-          - model: auto-model
-            env: auto_env
-        """,
-    )
-
-    config = load_run_config(config_path, env_default_root=env_root)
-
-    assert sorted(config.envs) == ["auto_env"]
-
-
-def test_environment_env_args_missing_required(monkeypatch, tmp_path: Path) -> None:
-    def fake_metadata(_env_id: str, cache=None):
-        return [_FakeParam("subset", required=True)]
-
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        fake_metadata,
-    )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        fake_metadata,
-    )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda _path, cache=None: {},
-    )
-
-    async def fail_if_called(*_args, **_kwargs):  # pragma: no cover - sanity guard
-        raise AssertionError("run_evaluation should not execute when env args are invalid.")
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fail_if_called)
-
-    config_path = _write_yaml(
-        tmp_path / "missing_required.yaml",
-        """
-        envs:
-          - id: medqa
-        models:
-          - id: gpt
-        jobs:
-          - model: gpt
-            env: medqa
-        """,
-    )
-
-    config = load_run_config(config_path)
-    jobs = build_jobs(config)
-
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-
-    settings = ExecutorSettings(
-        run_id="run-1",
-        output_dir=tmp_path / "runs",
-        env_dir=env_dir,
-        endpoints_path=tmp_path / "endpoints.yaml",
-        default_api_key_var="API_KEY",
-        default_api_base_url="https://api.example",
-        dry_run=False,
-    )
-
-    results = execute_jobs(jobs, settings)
-
-    assert results[0].status == "failed"
-    assert results[0].error is not None
-    assert "Missing required environment arguments" in results[0].error
-
-
-def test_environment_env_args_type_validation(monkeypatch, tmp_path: Path) -> None:
-    def fake_metadata(_env_id: str, cache=None):
-        return [_FakeParam("shuffle_seed", argparse_type=int)]
-
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        fake_metadata,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "invalid_type.yaml",
-        """
-        envs:
-          - id: medqa
-            env_args:
-              shuffle_seed: wrong
-        models:
-          - id: gpt
-        jobs:
-          - model: gpt
-            env: medqa
-        """,
-    )
-
-    with pytest.raises(ValueError):
-        load_run_config(config_path)
-
-
-def test_matrix_expansion_generates_variants(monkeypatch, tmp_path: Path) -> None:
-    def fake_metadata(env_id: str, cache=None):  # noqa: ARG001
-        return [
-            _FakeParam("shuffle_answers"),
-            _FakeParam("shuffle_seed", argparse_type=int),
-        ]
-
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        fake_metadata,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "matrix.yaml",
-        """
-        envs:
-          - id: medqa
-            module: medqa
-            env_args:
-              shuffle_answers: true
-            matrix:
-              shuffle_seed: [1618, 9331]
-            matrix_id_format: "{base}-r{shuffle_seed}"
-        models:
-          - id: gpt
-        jobs:
-          - model: gpt
-            env: medqa
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    env_ids = list(config.envs.keys())
-    assert env_ids == ["medqa-r1618", "medqa-r9331"]
-    assert all(env.matrix is None for env in config.envs.values())
-    assert all(env.matrix_base_id == "medqa" for env in config.envs.values())
-    assert {env.env_args["shuffle_seed"] for env in config.envs.values()} == {1618, 9331}
-    assert all(env.env_args["shuffle_answers"] is True for env in config.envs.values())
-    assert all(env.module == "medqa" for env in config.envs.values())
-
-
-def test_duplicate_env_ids_from_files_expand_variants(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-    _write_yaml(
-        env_dir / "longhealth.yaml",
-        """
-        - id: longhealth
-          module: longhealth
-          matrix:
-            task: ["task1", "task2"]
-          matrix_id_format: "{base}-{task}"
-        - id: longhealth
-          module: longhealth
-          matrix:
-            task: ["task3"]
-          matrix_id_format: "{base}-{task}-alt"
-        """,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        f"""
-        envs:
-          - "{env_dir}"
-        models:
-          - id: gpt
-        jobs:
-          - model: gpt
-            env: longhealth
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert sorted(config.envs.keys()) == ["longhealth-task1", "longhealth-task2", "longhealth-task3-alt"]
-    assert all(env.matrix_base_id == "longhealth" for env in config.envs.values())
-
-
-def test_matrix_exclude_and_scalar_fields(monkeypatch, tmp_path: Path) -> None:
-    def fake_metadata(env_id: str, cache=None):  # noqa: ARG001
-        return [_FakeParam("shuffle_seed", argparse_type=int)]
-
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        fake_metadata,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "matrix_scalar.yaml",
-        """
-        envs:
-          - id: medqa
-            module: medqa
-            matrix:
-              num_examples: [10, 20]
-              shuffle_seed: [1618, 9331]
-            matrix_exclude:
-              - num_examples: 20
-                shuffle_seed: 9331
-            matrix_id_format: "{base}-n{num_examples}-r{shuffle_seed}"
-        models:
-          - id: gpt
-        jobs:
-          - model: gpt
-            env: medqa
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    env_ids = sorted(config.envs)
-    assert env_ids == ["medqa-n10-r1618", "medqa-n10-r9331", "medqa-n20-r1618"]
-
-    num_examples = {env_id: env.num_examples for env_id, env in config.envs.items()}
-    assert num_examples["medqa-n10-r1618"] == 10
-    assert num_examples["medqa-n10-r9331"] == 10
-    assert num_examples["medqa-n20-r1618"] == 20
-
-    assert all("shuffle_seed" in env.env_args for env in config.envs.values())
-    assert "medqa-n20-r9331" not in env_ids
-
-
-def test_legacy_model_params_adapter(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "legacy_model.yaml",
-        """
-        models:
-          - id: gpt
-            params:
-              model: openai/gpt
-              env_overrides:
-                medqa:
-                  shuffle_answers: true
-        envs:
-          - id: medqa
-            module: medqa
-        jobs:
-          - model: gpt
-            env: medqa
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert len(config.models) == 1
-    model_cfg = config.models["gpt"]
-    assert model_cfg.model == "openai/gpt"
-    assert model_cfg.env_overrides == {"medqa": {"shuffle_answers": True}}
-    assert model_cfg.env_args == {}
-
-
-def test_envs_can_reference_yaml_file(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    env_file = tmp_path / "envs.yaml"
-    env_file.write_text(
-        """
-        - id: included-env
-          module: included_env
-          num_examples: 3
-        """,
-    )
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        f"""
-        models:
-          gpt-mini:
-            model: openai/gpt-mini
-        envs: "{env_file.name}"
-        jobs:
-          - model: gpt-mini
-            env: included-env
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert set(config.envs) == {"included-env"}
-    assert config.envs["included-env"].num_examples == 3
-
-
-def test_envs_can_reference_directory(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-    (env_dir / "a.yaml").write_text(
-        """
-        - id: env-a
-          num_examples: 2
-        """,
-    )
-    (env_dir / "b.yaml").write_text(
-        """
-        - id: env-b
-          num_examples: 4
-        """,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        models:
-          gpt-mini:
-            model: openai/gpt-mini
-        envs: "envs"
-        jobs:
-          - model: gpt-mini
-            env: env-a
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert set(config.envs) == {"env-a", "env-b"}
-    assert config.envs["env-b"].num_examples == 4
-
-
-def test_included_file_strict_shapes(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-
-    bad_mapping = tmp_path / "bad_mapping.yaml"
-    bad_mapping.write_text(
-        """
-        env-basic:
-          id: env-basic
-        env-invalid: 1
-        """,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "config_bad_mapping.yaml",
-        f"""
-        models:
-          gpt-mini:
-            model: openai/gpt-mini
-        envs: "{bad_mapping.name}"
-        jobs:
-          - model: gpt-mini
-            env: env-basic
-        """,
-    )
-
-    with pytest.raises(ValueError) as excinfo:
-        load_run_config(config_path)
-    assert "mapping of id" in str(excinfo.value)
-
-    bad_list = tmp_path / "bad_list.yaml"
-    bad_list.write_text(
-        """
-        - id: env-basic
-        - 42
-        """,
-    )
-
-    config_path = _write_yaml(
-        tmp_path / "config_bad_list.yaml",
-        f"""
-        models:
-          gpt-mini:
-            model: openai/gpt-mini
-        envs: "{bad_list.name}"
-        jobs:
-          - model: gpt-mini
-            env: env-basic
-        """,
-    )
-
-    with pytest.raises(ValueError) as excinfo:
-        load_run_config(config_path)
-    assert "must be a mapping" in str(excinfo.value)
-
-
-def test_models_can_reference_yaml_file(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    models_file = tmp_path / "models.yaml"
-    models_file.write_text(
-        """
-        - id: model-a
-          model: openai/gpt-a
-          sampling_args:
-            max_tokens: 256
-        """,
-    )
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        f"""
-        models: "{models_file.name}"
-        envs:
-          env-a:
-            num_examples: 5
-        jobs:
-          - model: model-a
-            env: env-a
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert set(config.models) == {"model-a"}
-    assert config.models["model-a"].sampling_args["max_tokens"] == 256
-
-
-def test_jobs_can_reference_yaml_file(monkeypatch, tmp_path: Path) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-    jobs_file = tmp_path / "jobs.yaml"
-    jobs_file.write_text(
-        """
-        - model: gpt-mini
-          env: env-a
-        """,
-    )
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        f"""
-        models:
-          gpt-mini:
-            model: openai/gpt-mini
-        envs:
-          env-a:
-            num_examples: 5
-        jobs: "{jobs_file.name}"
-        """,
-    )
-
-    config = load_run_config(config_path)
-
-    assert len(config.jobs) == 1
-    assert config.jobs[0].model == "gpt-mini"
diff --git a/tests/test_cli/test_eval_builder.py b/tests/test_cli/test_eval_builder.py
deleted file mode 100644
index ab9da748..00000000
--- a/tests/test_cli/test_eval_builder.py
+++ /dev/null
@@ -1,166 +0,0 @@
-from __future__ import annotations
-
-import pytest
-
-from medarc_verifiers.cli._eval_builder import build_client_config
-from medarc_verifiers.cli._schemas import ModelConfigSchema
-from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL
-
-
-def test_build_client_config_populates_endpoint_configs_for_replicas() -> None:
-    model_cfg = ModelConfigSchema(id="alias-model", headers={"X-Test": "1"})
-    endpoints = {
-        "alias-model": [
-            {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-a.example/v1"},
-            {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-b.example/v1"},
-            {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-c.example/v1"},
-        ]
-    }
-
-    resolved_model, client_config, sampling_overrides = build_client_config(
-        model_cfg,
-        endpoints=endpoints,
-        default_api_key_var="DEFAULT_KEY",
-        default_api_key_var_explicit=False,
-        default_api_base_url="https://default.example/v1",
-        api_base_url_override=None,
-        http_max_retries_override=None,
-        timeout_override=None,
-        headers=None,
-    )
-
-    assert resolved_model == "resolved-model"
-    assert client_config.api_base_url == "https://endpoint-a.example/v1"
-    assert client_config.api_key_var == "MODEL_KEY"
-    assert sampling_overrides == {}
-    assert [entry.api_base_url for entry in client_config.endpoint_configs] == [
-        "https://endpoint-a.example/v1",
-        "https://endpoint-b.example/v1",
-        "https://endpoint-c.example/v1",
-    ]
-    assert all(entry.api_key_var == "MODEL_KEY" for entry in client_config.endpoint_configs)
-    assert all(entry.extra_headers == {"X-Test": "1"} for entry in client_config.endpoint_configs)
-
-
-def test_build_client_config_api_base_url_override_suppresses_endpoint_configs() -> None:
-    model_cfg = ModelConfigSchema(id="alias-model")
-    endpoints = {
-        "alias-model": [
-            {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-a.example/v1"},
-            {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-b.example/v1"},
-            {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-c.example/v1"},
-        ]
-    }
-
-    _, client_config, _ = build_client_config(
-        model_cfg,
-        endpoints=endpoints,
-        default_api_key_var="DEFAULT_KEY",
-        default_api_key_var_explicit=False,
-        default_api_base_url="https://default.example/v1",
-        api_base_url_override="http://127.0.0.1:8000/v1",
-        http_max_retries_override=None,
-        timeout_override=None,
-        headers=None,
-    )
-
-    assert client_config.api_base_url == "http://127.0.0.1:8000/v1"
-    assert client_config.endpoint_configs == []
-
-
-def test_build_client_config_replicas_must_share_model_and_key() -> None:
-    model_cfg = ModelConfigSchema(id="alias-model")
-    endpoints = {
-        "alias-model": [
-            {"model": "resolved-model", "key": "MODEL_KEY", "url": "https://endpoint-a.example/v1"},
-            {"model": "resolved-model", "key": "MODEL_KEY_B", "url": "https://endpoint-b.example/v1"},
-        ]
-    }
-
-    with pytest.raises(ValueError, match="must agree on 'model' and 'key'"):
-        build_client_config(
-            model_cfg,
-            endpoints=endpoints,
-            default_api_key_var="DEFAULT_KEY",
-            default_api_key_var_explicit=False,
-            default_api_base_url="https://default.example/v1",
-            api_base_url_override=None,
-            http_max_retries_override=None,
-            timeout_override=None,
-            headers=None,
-        )
-
-
-def test_build_client_config_prime_base_url_forces_prime_key_when_non_explicit() -> None:
-    model_cfg = ModelConfigSchema(model="prime-model")
-
-    _, client_config, _ = build_client_config(
-        model_cfg,
-        endpoints={},
-        default_api_key_var="OPENAI_API_KEY",
-        default_api_key_var_explicit=False,
-        default_api_base_url=PRIME_INFERENCE_URL,
-        api_base_url_override=None,
-        http_max_retries_override=None,
-        timeout_override=None,
-        headers=None,
-    )
-
-    assert client_config.api_base_url == PRIME_INFERENCE_URL
-    assert client_config.api_key_var == "PRIME_API_KEY"
-
-
-def test_build_client_config_prime_registry_keeps_endpoint_key_var() -> None:
-    model_cfg = ModelConfigSchema(model="prime-model")
-    endpoints = {
-        "prime-model": [
-            {
-                "model": "prime-model-resolved",
-                "key": "CUSTOM_KEY",
-                "url": PRIME_INFERENCE_URL,
-            }
-        ]
-    }
-
-    _, client_config, _ = build_client_config(
-        model_cfg,
-        endpoints=endpoints,
-        default_api_key_var="OPENAI_API_KEY",
-        default_api_key_var_explicit=False,
-        default_api_base_url="https://default.example/v1",
-        api_base_url_override=None,
-        http_max_retries_override=None,
-        timeout_override=None,
-        headers=None,
-    )
-
-    assert client_config.api_base_url == PRIME_INFERENCE_URL
-    assert client_config.api_key_var == "CUSTOM_KEY"
-
-
-@pytest.mark.parametrize(
-    ("model_cfg", "default_key_var", "default_key_var_explicit", "expected"),
-    [
-        (ModelConfigSchema(model="prime-model", api_key_var="MODEL_KEY"), "OPENAI_API_KEY", False, "MODEL_KEY"),
-        (ModelConfigSchema(model="prime-model"), "CUSTOM_KEY", True, "CUSTOM_KEY"),
-    ],
-)
-def test_build_client_config_prime_base_url_respects_explicit_key_var(
-    model_cfg: ModelConfigSchema,
-    default_key_var: str,
-    default_key_var_explicit: bool,
-    expected: str,
-) -> None:
-    _, client_config, _ = build_client_config(
-        model_cfg,
-        endpoints={},
-        default_api_key_var=default_key_var,
-        default_api_key_var_explicit=default_key_var_explicit,
-        default_api_base_url=PRIME_INFERENCE_URL,
-        api_base_url_override=None,
-        http_max_retries_override=None,
-        timeout_override=None,
-        headers=None,
-    )
-
-    assert client_config.api_key_var == expected
diff --git a/tests/test_cli/test_job_builder.py b/tests/test_cli/test_job_builder.py
deleted file mode 100644
index 39b0d22c..00000000
--- a/tests/test_cli/test_job_builder.py
+++ /dev/null
@@ -1,182 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-
-import pytest
-
-from medarc_verifiers.cli._config_loader import load_run_config
-from medarc_verifiers.cli._job_builder import ResolvedJob, build_jobs
-
-
-def _write_yaml(path: Path, content: str) -> Path:
-    path.write_text(content)
-    return path
-
-
-def _stub_metadata(monkeypatch) -> None:
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._config_loader.load_env_metadata",
-        lambda _env_id, cache=None: [],
-    )
-
-
-def test_build_jobs_basic(monkeypatch, tmp_path: Path) -> None:
-    _stub_metadata(monkeypatch)
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        models:
-          gpt-mini:
-            model: openai/gpt-mini
-        envs:
-          medqa:
-            num_examples: 5
-        jobs:
-          - model: gpt-mini
-            env: medqa
-        """,
-    )
-
-    run_config = load_run_config(config_path)
-    jobs = build_jobs(run_config)
-
-    assert len(jobs) == 1
-    job = jobs[0]
-    assert isinstance(job, ResolvedJob)
-    assert job.job_id == "gpt-mini-medqa"
-    assert job.env.id == "medqa"
-    assert job.env_args == {}
-    assert job.sampling_args == {}
-
-
-def test_env_args_precedence(monkeypatch, tmp_path: Path) -> None:
-    _stub_metadata(monkeypatch)
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        models:
-          gpt-mini:
-            env_args:
-              shared: model
-              model_only: 1
-            env_overrides:
-              medqa:
-                shared: override
-                override_only: true
-        envs:
-          medqa:
-            env_args:
-              shared: env
-              env_only: 2
-        jobs:
-          - model: gpt-mini
-            env: medqa
-            env_args:
-              shared: job
-              job_only: 3
-        """,
-    )
-
-    run_config = load_run_config(config_path)
-    jobs = build_jobs(run_config)
-
-    assert len(jobs) == 1
-    env_args = jobs[0].env_args
-    assert env_args["env_only"] == 2
-    assert env_args["model_only"] == 1
-    assert env_args["override_only"] is True
-    assert env_args["job_only"] == 3
-    assert env_args["shared"] == "job"
-
-
-def test_matrix_base_expansion(monkeypatch, tmp_path: Path) -> None:
-    _stub_metadata(monkeypatch)
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        models:
-          gpt-mini: {}
-        envs:
-          medqa:
-            matrix:
-              shuffle_seed: [1618, 9331]
-            matrix_id_format: "{base}-r{shuffle_seed}"
-        jobs:
-          - model: gpt-mini
-            env: medqa
-        """,
-    )
-
-    run_config = load_run_config(config_path)
-    jobs = build_jobs(run_config)
-
-    job_ids = {job.env.id for job in jobs}
-    assert job_ids == {"medqa-r1618", "medqa-r9331"}
-
-
-def test_duplicate_job_ids_get_fingerprinted(monkeypatch, tmp_path: Path) -> None:
-    _stub_metadata(monkeypatch)
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        models:
-          gpt-mini: {}
-        envs:
-          medqa: {}
-        jobs:
-          - model: gpt-mini
-            env: medqa
-            env_args:
-              variant: 1
-          - model: gpt-mini
-            env: medqa
-            env_args:
-              variant: 2
-        """,
-    )
-
-    run_config = load_run_config(config_path)
-    jobs = build_jobs(run_config)
-
-    assert len(jobs) == 2
-    job_ids = {job.job_id for job in jobs}
-    assert len(job_ids) == 2
-    assert any(job_id.startswith("gpt-mini-medqa-") for job_id in job_ids)
-
-
-def test_unknown_model_raises(monkeypatch, tmp_path: Path) -> None:
-    _stub_metadata(monkeypatch)
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        envs:
-          medqa: {}
-        jobs:
-          - model: missing
-            env: medqa
-        """,
-    )
-
-    run_config = load_run_config(config_path)
-    with pytest.raises(ValueError, match="unknown model"):
-        build_jobs(run_config)
-
-
-def test_unknown_environment_raises(monkeypatch, tmp_path: Path) -> None:
-    _stub_metadata(monkeypatch)
-    config_path = _write_yaml(
-        tmp_path / "config.yaml",
-        """
-        models:
-          gpt-mini: {}
-        envs:
-          medqa: {}
-        jobs:
-          - model: gpt-mini
-            env: missing
-        """,
-    )
-
-    run_config = load_run_config(config_path)
-    with pytest.raises(ValueError, match="unknown environment"):
-        build_jobs(run_config)
diff --git a/tests/test_cli/test_job_executor.py b/tests/test_cli/test_job_executor.py
deleted file mode 100644
index 68c35d27..00000000
--- a/tests/test_cli/test_job_executor.py
+++ /dev/null
@@ -1,680 +0,0 @@
-from __future__ import annotations
-
-import logging
-from pathlib import Path
-from types import SimpleNamespace
-
-import pytest
-
-from medarc_verifiers.cli._constants import DEFAULT_ENDPOINTS_PATH
-from medarc_verifiers.cli._job_builder import ResolvedJob
-from medarc_verifiers.cli._job_executor import (
-    ExecutorSettings,
-    JobExecutionResult,
-    _load_endpoints_for_model,
-    execute_jobs,
-)
-from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, ModelConfigSchema
-from medarc_verifiers.cli.utils.env_args import EnvParam
-
-
-def _stub_metadata(required: bool = False) -> list[EnvParam]:
-    return [
-        EnvParam(
-            name="seed",
-            cli_name="seed",
-            kind="int",
-            default=None,
-            required=required,
-            help="Seed value",
-            annotation=int,
-            argparse_type=int,
-            choices=None,
-            action=None,
-            is_list=False,
-            element_type=None,
-            unsupported_reason=None,
-        )
-    ]
-
-
-def _settings(tmp_path: Path, **overrides: object) -> ExecutorSettings:
-    base_kwargs = dict(
-        run_id="run-1",
-        output_dir=tmp_path / "runs",
-        env_dir=tmp_path / "environments",
-        endpoints_path=tmp_path / "endpoints.py",
-        endpoints_path_explicit=False,
-        default_api_key_var="DEFAULT_KEY",
-        default_api_base_url="https://api.default",
-        log_level="INFO",
-        verbose=False,
-        save_results=True,
-        save_to_hf_hub=False,
-        hf_hub_dataset_name=None,
-        max_concurrent_generation=None,
-        max_concurrent_scoring=None,
-        # New concurrency precedence: CLI (--max-concurrent) > env_cfg.max_concurrent > DEFAULT_BATCH_MAX_CONCURRENT (128)
-        # Provide a placeholder so tests can inject a CLI override via overrides (max_concurrent=VALUE).
-        max_concurrent=None,
-        timeout=None,
-        sleep=0.0,
-        dry_run=False,
-    )
-    base_kwargs.update(overrides)
-    return ExecutorSettings(**base_kwargs)
-
-
-def _stub_results(value: float = 0.5) -> SimpleNamespace:
-    metadata = SimpleNamespace(
-        path_to_save="",
-        avg_reward=value,
-        num_examples=1,
-        rollouts_per_example=1,
-        avg_metrics={"pass_rate": value},
-    )
-    return SimpleNamespace(metadata=metadata, reward=[value], metrics={"pass_rate": [value]})
-
-
-def _stub_results_metadata_only(value: float = 0.5) -> SimpleNamespace:
-    metadata = SimpleNamespace(
-        path_to_save="",
-        avg_reward=value,
-        num_examples=2,
-        rollouts_per_example=3,
-        avg_metrics={"pass_rate": value, "accuracy": value / 2},
-    )
-    return SimpleNamespace(metadata=metadata)
-
-
-def test_execute_jobs_invokes_run_evaluation(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    captured = {}
-
-    async def fake_run(config):
-        captured["config"] = config
-        return _stub_results()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda path, cache=None: {
-            "alias": [{"model": "resolved-model", "key": "MODEL_KEY", "url": "https://api.resolved"}]
-        },
-    )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        lambda env_id, cache=None: _stub_metadata(required=True),
-    )
-
-    model_cfg = ModelConfigSchema(id="alias", headers={"X-Test": "1"}, sampling_args={"temperature": 0.1})
-    env_cfg = EnvironmentConfigSchema(id="medqa", env_args={"seed": 1}, num_examples=3)
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=model_cfg,
-        env=env_cfg,
-        env_args={"seed": 1},
-        sampling_args={"temperature": 0.1},
-    )
-
-    results = execute_jobs([job], _settings(tmp_path))
-
-    assert len(results) == 1
-    result = results[0]
-    assert isinstance(result, JobExecutionResult)
-    assert result.status == "succeeded"
-    assert result.output_path == (tmp_path / "runs" / "run-1" / job.job_id)
-    assert "config" in captured
-    config = captured["config"]
-    assert config.model == "resolved-model"
-    assert Path(str(config.resume_path)) == (tmp_path / "runs" / "run-1" / job.job_id)
-    assert config.client_config.api_key_var == "MODEL_KEY"
-    assert config.client_config.api_base_url == "https://api.resolved"
-    assert config.client_config.extra_headers == {"X-Test": "1"}
-    assert config.env_args == {"seed": 1}
-    # With no CLI override and no env-level max_concurrent, falls back to DEFAULT_BATCH_MAX_CONCURRENT (128)
-    assert config.max_concurrent == 128
-
-
-def test_execute_jobs_records_failures(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    async def failing_run(config):
-        raise RuntimeError("boom")
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", failing_run)
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda path, cache=None: {},
-    )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        lambda env_id, cache=None: _stub_metadata(required=False),
-    )
-
-    model_cfg = ModelConfigSchema(id="alias")
-    env_cfg = EnvironmentConfigSchema(id="medqa", env_args={"seed": 1})
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=model_cfg,
-        env=env_cfg,
-        env_args={"seed": 1},
-        sampling_args={},
-    )
-
-    results = execute_jobs([job], _settings(tmp_path))
-
-    assert len(results) == 1
-    result = results[0]
-    assert result.status == "failed"
-    assert result.error is not None
-    assert "boom" in result.error
-    assert "alias-medqa" in result.error
-    assert "env=medqa" in result.error
-    assert result.output_path == (tmp_path / "runs" / "run-1" / job.job_id)
-
-
-def test_materialize_results_noop_logs_debug_when_source_matches_job_dir(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-    caplog: pytest.LogCaptureFixture,
-) -> None:
-    async def fake_run(config):
-        metadata = SimpleNamespace(
-            path_to_save=str(config.resume_path),
-            avg_reward=0.5,
-            num_examples=1,
-            rollouts_per_example=1,
-            avg_metrics={"pass_rate": 0.5},
-        )
-        return SimpleNamespace(metadata=metadata, reward=[0.5], metrics={"pass_rate": [0.5]})
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: [])
-
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=ModelConfigSchema(id="alias"),
-        env=EnvironmentConfigSchema(id="medqa"),
-        env_args={},
-        sampling_args={},
-    )
-
-    with caplog.at_level(logging.DEBUG):
-        results = execute_jobs([job], _settings(tmp_path, log_level="DEBUG"))
-
-    assert results[0].status == "succeeded"
-    assert "Results already in job_dir; _materialize_results no-op" in caplog.text
-
-
-def test_forced_job_archives_and_resets_existing_job_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    captured: dict[str, object] = {}
-
-    async def fake_run(config):
-        captured["resume_path"] = config.resume_path
-        metadata = SimpleNamespace(
-            path_to_save=str(config.resume_path),
-            avg_reward=0.5,
-            num_examples=1,
-            rollouts_per_example=1,
-            avg_metrics={"pass_rate": 0.5},
-        )
-        return SimpleNamespace(metadata=metadata, reward=[0.5], metrics={"pass_rate": [0.5]})
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: [])
-
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=ModelConfigSchema(id="alias"),
-        env=EnvironmentConfigSchema(id="medqa"),
-        env_args={},
-        sampling_args={},
-    )
-    run_dir = tmp_path / "runs" / "run-1"
-    job_dir = run_dir / job.job_id
-    job_dir.mkdir(parents=True, exist_ok=True)
-    (job_dir / "stale.txt").write_text("stale", encoding="utf-8")
-
-    results = execute_jobs([job], _settings(tmp_path, forced_job_ids={job.job_id}))
-
-    assert results[0].status == "succeeded"
-    assert Path(str(captured["resume_path"])) == job_dir
-    archived = sorted(run_dir.glob(f"{job.job_id}__old_*"))
-    assert len(archived) == 1
-    assert (archived[0] / "stale.txt").exists()
-    assert job_dir.exists()
-    assert not (job_dir / "stale.txt").exists()
-
-
-def test_non_forced_invalid_nonempty_job_dir_fails_prescriptively(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-) -> None:
-    async def fail_if_called(_config):
-        raise AssertionError("run_evaluation should not run when preflight fails")
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fail_if_called)
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: [])
-
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=ModelConfigSchema(id="alias"),
-        env=EnvironmentConfigSchema(id="medqa"),
-        env_args={},
-        sampling_args={},
-    )
-    job_dir = tmp_path / "runs" / "run-1" / job.job_id
-    job_dir.mkdir(parents=True, exist_ok=True)
-    (job_dir / "orphan.log").write_text("invalid state", encoding="utf-8")
-
-    results = execute_jobs([job], _settings(tmp_path))
-
-    assert len(results) == 1
-    assert results[0].status == "failed"
-    assert results[0].error is not None
-    assert "not a valid evaluation results path" in results[0].error
-    assert "--force" in results[0].error
-    assert "new run_id" in results[0].error
-
-
-def test_batch_resume_mismatch_logs_saved_and_current_values(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-    caplog: pytest.LogCaptureFixture,
-) -> None:
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=ModelConfigSchema(id="alias"),
-        env=EnvironmentConfigSchema(id="medqa", num_examples=5, rollouts_per_example=3),
-        env_args={},
-        sampling_args={},
-    )
-    job_dir = tmp_path / "runs" / "run-1" / job.job_id
-    job_dir.mkdir(parents=True, exist_ok=True)
-    (job_dir / "results.jsonl").write_text("", encoding="utf-8")
-    (job_dir / "metadata.json").write_text(
-        ('{"env_id":"saved-env","model":"saved-model","rollouts_per_example":2,"num_examples":8}'),
-        encoding="utf-8",
-    )
-
-    async def fake_run(_config):
-        raise ValueError(
-            f"Cannot resume from {job_dir}: metadata mismatch (env_id: saved='saved-env', current='medqa')"
-        )
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: [])
-
-    with caplog.at_level(logging.ERROR):
-        results = execute_jobs([job], _settings(tmp_path))
-
-    assert len(results) == 1
-    assert results[0].status == "failed"
-    assert results[0].error is not None
-    assert "incompatible prior results" in results[0].error
-    assert "Resume metadata mismatch for job 'alias-medqa'" in caplog.text
-    assert "env_id: saved='saved-env', current='medqa'" in caplog.text
-    assert "model: saved='saved-model', current='alias'" in caplog.text
-    assert "rollouts_per_example: saved=2, current=3" in caplog.text
-    assert "num_examples: saved=8, current=5 (current must be >= saved)" in caplog.text
-
-
-def test_execute_jobs_uses_metadata_averages(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    class _ManifestStub:
-        def __init__(self) -> None:
-            self.started: list[str] = []
-            self.completed: list[dict[str, object]] = []
-
-        def record_job_start(self, job_id: str) -> None:
-            self.started.append(job_id)
-
-        def record_job_completion(self, job_id: str, **kwargs: object) -> None:
-            payload = {"job_id": job_id}
-            payload.update(kwargs)
-            self.completed.append(payload)
-
-        def record_job_failure(self, job_id: str, **kwargs: object) -> None:
-            raise AssertionError(f"Job should not fail: {job_id}, {kwargs}")
-
-    async def fake_run(config):
-        return _stub_results_metadata_only(0.8)
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: [])
-
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=ModelConfigSchema(id="alias"),
-        env=EnvironmentConfigSchema(id="medqa"),
-        env_args={},
-        sampling_args={},
-    )
-    manifest = _ManifestStub()
-
-    results = execute_jobs([job], _settings(tmp_path), manifest=manifest)
-
-    assert results[0].status == "succeeded"
-    assert manifest.started == ["alias-medqa"]
-    assert len(manifest.completed) == 1
-    completed = manifest.completed[0]
-    assert completed["job_id"] == "alias-medqa"
-    assert completed["avg_reward"] == pytest.approx(0.8)
-    metrics = completed["metrics"]
-    assert isinstance(metrics, dict)
-    assert metrics["pass_rate"] == pytest.approx(0.8)
-    assert metrics["accuracy"] == pytest.approx(0.4)
-    assert completed["num_examples"] == 2
-    assert completed["rollouts_per_example"] == 3
-
-
-def test_execute_jobs_respects_dry_run(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    async def raise_if_called(*args, **kwargs):
-        raise AssertionError("run_evaluation should not be invoked during dry runs.")
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", raise_if_called)
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda path, cache=None: {},
-    )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        lambda env_id, cache=None: _stub_metadata(required=False),
-    )
-
-    model_cfg = ModelConfigSchema(id="alias")
-    env_cfg = EnvironmentConfigSchema(id="medqa")
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=model_cfg,
-        env=env_cfg,
-        env_args={},
-        sampling_args={},
-    )
-
-    results = execute_jobs([job], _settings(tmp_path, dry_run=True))
-
-    assert results[0].status == "skipped"
-    assert results[0].output_path == (tmp_path / "runs" / "run-1" / job.job_id)
-
-
-def test_executor_timeout_precedence(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    captured = {}
-
-    async def fake_run(config):
-        captured["config"] = config
-        return _stub_results()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda path, cache=None: {},
-    )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        lambda env_id, cache=None: _stub_metadata(required=False),
-    )
-
-    model_cfg = ModelConfigSchema(id="alias", timeout=5.0)
-    env_cfg = EnvironmentConfigSchema(id="medqa")
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=model_cfg,
-        env=env_cfg,
-        env_args={},
-        sampling_args={},
-    )
-
-    # CLI override should take precedence when provided.
-    execute_jobs([job], _settings(tmp_path, timeout=10.0))
-    config = captured["config"]
-    assert config.client_config.timeout == 10.0
-
-    # Model-level timeout applies when CLI flag is absent.
-    captured.clear()
-    execute_jobs([job], _settings(tmp_path))
-    config = captured["config"]
-    assert config.client_config.timeout == 5.0
-
-
-def test_cli_env_arg_overrides_yaml(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    captured = {}
-
-    async def fake_run(config):
-        captured["config"] = config
-        return _stub_results()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda path, cache=None: {},
-    )
-    metadata = [
-        EnvParam(
-            name="flag",
-            cli_name="flag",
-            kind="bool",
-            default=False,
-            required=False,
-            help="Boolean flag",
-            annotation=bool,
-            argparse_type=None,
-            choices=None,
-            action="BooleanOptionalAction",
-            is_list=False,
-            element_type=None,
-            unsupported_reason=None,
-        )
-    ]
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        lambda env_id, cache=None: metadata,
-    )
-
-    model_cfg = ModelConfigSchema(id="alias", env_args={"flag": True})
-    env_cfg = EnvironmentConfigSchema(id="medqa", env_args={"flag": False})
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=model_cfg,
-        env=env_cfg,
-        env_args={"flag": False},
-        sampling_args={},
-    )
-
-    results = execute_jobs([job], _settings(tmp_path, cli_env_args={"flag": True}))
-
-    assert results[0].status == "succeeded"
-    assert captured["config"].env_args["flag"] is True
-
-
-def test_cli_sampling_arg_overrides_yaml(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    captured = {}
-
-    async def fake_run(config):
-        captured["config"] = config
-        return _stub_results()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda path, cache=None: {},
-    )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        lambda env_id, cache=None: [],
-    )
-
-    model_cfg = ModelConfigSchema(id="alias", sampling_args={"temperature": 0.7})
-    env_cfg = EnvironmentConfigSchema(id="medqa")
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=model_cfg,
-        env=env_cfg,
-        env_args={},
-        sampling_args={"temperature": 0.5},
-    )
-
-    results = execute_jobs(
-        [job],
-        _settings(tmp_path, cli_sampling_args={"temperature": 0.2}),
-    )
-
-    assert results[0].status == "succeeded"
-    assert captured["config"].sampling_args["temperature"] == 0.2
-
-
-def test_execute_jobs_handles_keyboard_interrupt(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    async def interrupting_run(config):  # noqa: ARG001
-        raise KeyboardInterrupt
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", interrupting_run)
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda path, cache=None: {},
-    )
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        lambda env_id, cache=None: [],
-    )
-
-    model_cfg = ModelConfigSchema(id="alias")
-    env_cfg = EnvironmentConfigSchema(id="medqa")
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=model_cfg,
-        env=env_cfg,
-        env_args={},
-        sampling_args={},
-    )
-
-    results = execute_jobs([job], _settings(tmp_path))
-
-    assert len(results) == 1
-    result = results[0]
-    assert result.status == "failed"
-    assert result.error is not None
-    assert "interrupted" in result.error.lower()
-
-
-def test_job_sleep_overrides_cli(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    sleep_calls: list[float] = []
-
-    async def fake_run(config):  # noqa: ARG001
-        return _stub_results()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {})
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_env_metadata",
-        lambda env_id, cache=None: _stub_metadata(required=False),
-    )
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.sleep", lambda seconds: sleep_calls.append(seconds))
-
-    model_cfg = ModelConfigSchema(id="alias")
-    env_cfg = EnvironmentConfigSchema(id="medqa")
-
-    jobs = [
-        ResolvedJob(
-            job_id="alias-medqa-a",
-            name="alias-medqa-a",
-            model=model_cfg,
-            env=env_cfg,
-            env_args={},
-            sampling_args={},
-            sleep=1.5,
-        ),
-        ResolvedJob(
-            job_id="alias-medqa-b",
-            name="alias-medqa-b",
-            model=model_cfg,
-            env=env_cfg,
-            env_args={},
-            sampling_args={},
-            sleep=None,
-        ),
-    ]
-
-    results = execute_jobs(jobs, _settings(tmp_path, sleep=0.25))
-
-    assert all(result.status == "succeeded" for result in results)
-    assert sleep_calls == [pytest.approx(1.5)]
-
-
-def test_execute_jobs_warns_for_deprecated_eval_knobs(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-    caplog: pytest.LogCaptureFixture,
-) -> None:
-    async def fake_run(config):  # noqa: ARG001
-        return _stub_results()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda path, cache=None: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda env_id, cache=None: [])
-
-    model_cfg = ModelConfigSchema(id="alias")
-    env_cfg = EnvironmentConfigSchema(
-        id="medqa",
-        save_every=5,
-        print_results=True,
-    )
-    job = ResolvedJob(
-        job_id="alias-medqa",
-        name="alias-medqa",
-        model=model_cfg,
-        env=env_cfg,
-        env_args={},
-        sampling_args={},
-    )
-
-    with caplog.at_level(logging.WARNING):
-        results = execute_jobs(
-            [job],
-            _settings(
-                tmp_path,
-                max_concurrent_generation=2,
-                max_concurrent_scoring=3,
-            ),
-        )
-
-    assert results[0].status == "succeeded"
-    assert "Environment 'medqa' sets deprecated eval knob(s): print_results, save_every" in caplog.text
-    assert (
-        "Job 'alias-medqa' sets deprecated eval knob(s): max_concurrent_generation, max_concurrent_scoring"
-        in caplog.text
-    )
-
-
-def test_load_endpoints_for_model_missing_default_path_is_non_fatal(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-) -> None:
-    monkeypatch.chdir(tmp_path)
-    settings = _settings(tmp_path, endpoints_path=Path(DEFAULT_ENDPOINTS_PATH), endpoints_path_explicit=False)
-    model_cfg = ModelConfigSchema(id="alias")
-
-    endpoints = _load_endpoints_for_model(model_cfg, settings, cache=None)
-
-    assert endpoints == {}
-
-
-def test_load_endpoints_for_model_missing_explicit_path_raises(tmp_path: Path) -> None:
-    settings = _settings(tmp_path, endpoints_path=tmp_path / "missing.toml", endpoints_path_explicit=True)
-    model_cfg = ModelConfigSchema(id="alias")
-
-    with pytest.raises(FileNotFoundError):
-        _load_endpoints_for_model(model_cfg, settings, cache=None)
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 0d037d08..d8d0197a 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -100,71 +100,6 @@ def _write_resume_artifacts(
     )
 
 
-def test_cli_runs_configuration(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a:
-            model: alias-model
-            headers:
-              X-Test: one
-        envs:
-          medqa:
-            env_args: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured = []
-
-    async def fake_run(config):
-        captured.append(config)
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-            "--max-concurrent",
-            "5",
-        ]
-    )
-
-    assert exit_code == 0
-    assert len(captured) == 1
-    config = captured[0]
-    assert config.model == "alias-model"
-    assert config.env_dir_path == str(env_dir)
-    assert config.client_config.extra_headers == {"X-Test": "one"}
-    assert config.max_concurrent == 5
-    run_dirs = list(output_dir.iterdir())
-    assert len(run_dirs) == 1
-    assert run_dirs[0].is_dir()
-    manifest_path = run_dirs[0] / "run_manifest.json"
-    assert manifest_path.exists()
-    manifest = json.loads(manifest_path.read_text())
-    assert manifest["summary"]["completed"] == 1
-    assert manifest["jobs"][0]["status"] == "completed"
-
-
 def test_toml_bench_dry_run_expands_evals_and_ablations(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
@@ -193,10 +128,6 @@ def test_toml_bench_dry_run_expands_evals_and_ablations(
         """,
     )
 
-    def fail_execute_jobs(*_args, **_kwargs):
-        raise AssertionError("execute_jobs should not be called for TOML dry-run")
-
-    monkeypatch.setattr(main, "execute_jobs", fail_execute_jobs)
     exit_code = main.main(
         [
             "bench",
@@ -230,6 +161,27 @@ def test_repository_smoke_toml_config_dry_runs(capsys: pytest.CaptureFixture[str
     assert "runs/evals/openai-gpt-4.1-mini/medqa" in output
 
 
+def test_bench_rejects_non_toml_config(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None:
+    config_path = tmp_path / "bench.yaml"
+    _write_config(config_path, "models: {}\n")
+
+    with pytest.raises(SystemExit) as excinfo:
+        main.main(["bench", "--config", str(config_path), "--dry-run"])
+
+    assert excinfo.value.code == 2
+    err = capsys.readouterr().err
+    assert "medarc-eval bench now accepts upstream TOML configs only." in err
+
+
+def test_bench_rejects_removed_yaml_runner_flags(capsys: pytest.CaptureFixture[str]) -> None:
+    with pytest.raises(SystemExit) as excinfo:
+        main.main(["bench", "--config", "configs/eval/smoke.toml", "--restart"])
+
+    assert excinfo.value.code == 2
+    err = capsys.readouterr().err
+    assert "unrecognized arguments: --restart" in err
+
+
 def test_repository_mcq_toml_config_dry_run_shows_ablation_variants(capsys: pytest.CaptureFixture[str]) -> None:
     exit_code = main.main(["bench", "--config", "configs/eval/medarc-mcq.toml", "--dry-run", "--eval-index", "9"])
 
@@ -550,137 +502,105 @@ async def fake_run(config, on_progress=None, **_kwargs):
     assert all(item["medarc_config_fingerprint"] for item in saved_metadata)
 
 
-def test_batch_api_base_url_override_forces_endpoint(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a:
-            model: alias-model
-            api_base_url: https://config.example/v1
-        envs:
-          medqa:
-            env_args: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured = []
+def test_single_run_help_lists_env_section_and_header_option(
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    metadata = [
+        _make_env_param(
+            "difficulty",
+            kind="str",
+            default="easy",
+            choices=("easy", "hard"),
+        )
+    ]
+    _patch_single_run_env(monkeypatch, metadata)
 
-    async def fake_run(config):
-        captured.append(config)
-        return _stub_cli_result()
+    exit_code = main.main(["medqa", "--help"])
 
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._job_executor.load_endpoint_registry",
-        lambda *args, **kwargs: {
-            "alias-model": [{"model": "resolved-model", "url": "https://endpoint.example/v1", "key": "REGISTRY_KEY"}]
-        },
-    )
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
+    assert exit_code == 0
+    captured = capsys.readouterr().out
+    assert "medqa environment options:" in captured
+    assert "--header" in captured
+    assert "--header-file" not in captured
 
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
 
-    override_url = "http://127.0.0.1:8000/v1"
-    assert (
-        main.main(
-            [
-                "bench",
-                "--config",
-                str(config_path),
-                "--output-dir",
-                str(output_dir),
-                "--env-dir",
-                str(env_dir),
-                "--api-base-url",
-                override_url,
-            ]
+def test_single_run_help_orders_env_group_before_core_options(
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    metadata = [
+        _make_env_param(
+            "difficulty",
+            kind="str",
+            default="easy",
+            choices=("easy", "hard"),
         )
-        == 0
-    )
+    ]
+    _patch_single_run_env(monkeypatch, metadata)
 
-    assert len(captured) == 1
-    assert captured[0].client_config.api_base_url == override_url
+    exit_code = main.main(["medqa", "--help"])
+    assert exit_code == 0
+    captured = capsys.readouterr().out
+    env_idx = captured.index("medqa environment options:")
+    core_idx = captured.index("medarc-eval options:")
+    assert env_idx < core_idx
 
 
-def test_batch_prime_base_url_forces_prime_api_key_when_default_not_explicit(
+def test_general_help_uses_invoked_binary_name(
     monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
 ) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        f"""
-        models:
-          model-a:
-            model: alias-model
-            api_base_url: {PRIME_INFERENCE_URL}
-        envs:
-          medqa: {{}}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured = []
+    monkeypatch.setattr(sys, "argv", ["medarc-eval"])
 
-    async def fake_run(config):
-        captured.append(config)
-        return _stub_cli_result()
+    exit_code = main.main([])
 
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
+    assert exit_code == 0
+    captured = capsys.readouterr().out
+    assert "medarc-eval bench --help" in captured
 
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
 
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-        ]
+def test_single_run_missing_required_param_errors(
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    metadata = [
+        _make_env_param(
+            "threshold",
+            kind="int",
+            required=True,
+            annotation=int,
+            argparse_type=int,
+        )
+    ]
+    _patch_single_run_env(monkeypatch, metadata)
+    monkeypatch.setattr(
+        "medarc_verifiers.cli._single_run.run_evaluation",
+        lambda *args, **kwargs: pytest.fail("Should not run when args invalid."),
     )
 
-    assert exit_code == 0
-    assert len(captured) == 1
-    assert captured[0].client_config.api_key_var == "PRIME_API_KEY"
+    with pytest.raises(SystemExit) as excinfo:
+        main.main(["medqa"])
+
+    assert excinfo.value.code == 2
+    err = capsys.readouterr().err
+    assert "Missing required environment arguments: threshold" in err
 
 
-def test_batch_explicit_default_api_key_var_is_respected_for_prime_base_url(
+def test_single_run_boolean_negation_and_sampling_precedence(
     monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
 ) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        f"""
-        models:
-          model-a:
-            model: alias-model
-            api_base_url: {PRIME_INFERENCE_URL}
-        envs:
-          medqa: {{}}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
+    metadata = [
+        _make_env_param(
+            "use_think",
+            kind="bool",
+            default=True,
+            annotation=bool,
+            argparse_type=None,
+            action="BooleanOptionalAction",
+        )
+    ]
+    _patch_single_run_env(monkeypatch, metadata)
 
     captured = []
 
@@ -688,1123 +608,74 @@ async def fake_run(config):
         captured.append(config)
         return _stub_cli_result()
 
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
+    monkeypatch.setattr("medarc_verifiers.cli._single_run.run_evaluation", fake_run)
 
     exit_code = main.main(
         [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-            "--default-api-key-var",
-            "OPENAI_API_KEY",
+            "medqa",
+            "--no-use-think",
+            "--sampling-arg",
+            "max_tokens=64",
+            "--max-tokens",
+            "128",
         ]
     )
 
     assert exit_code == 0
     assert len(captured) == 1
-    assert captured[0].client_config.api_key_var == "OPENAI_API_KEY"
-
-
-def test_model_level_max_concurrent_applies(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a:
-            model: alias-model
-            max_concurrent: 7
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured = []
-
-    async def fake_run(config):
-        captured.append(config)
-        return _stub_cli_result()
+    eval_config = captured[0]
+    assert eval_config.env_args["use_think"] is False
+    assert eval_config.sampling_args["max_tokens"] == 64
 
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
 
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
+def test_single_run_headers_pass_through_to_eval_config(
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    metadata: list[EnvParam] = []
+    _patch_single_run_env(monkeypatch, metadata)
 
     exit_code = main.main(
         [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
+            "medqa",
+            "--dry-run",
+            "--header",
+            "X-Test: cli",
         ]
     )
 
     assert exit_code == 0
-    assert len(captured) == 1
-    config = captured[0]
-    assert config.max_concurrent == 7
-
-
-def test_batch_rollout_max_retries_sets_eval_config(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a:
-            model: alias-model
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured = []
-
-    async def fake_run(config):
-        captured.append(config)
-        return _stub_cli_result()
+    output = capsys.readouterr().out
+    config = json.loads(output)
+    assert config["client_config"]["extra_headers"] == {"X-Test": "cli"}
 
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
 
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
+def test_single_run_auto_adds_prime_team_header(
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    metadata: list[EnvParam] = []
+    _patch_single_run_env(monkeypatch, metadata)
+    monkeypatch.setenv("PRIME_TEAM_ID", "team-123")
 
     exit_code = main.main(
         [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-            "--rollout-max-retries",
-            "3",
+            "medqa",
+            "--dry-run",
+            "--api-base-url",
+            PRIME_INFERENCE_URL,
+            "--header",
+            "X-Test: cli",
         ]
     )
 
     assert exit_code == 0
-    assert len(captured) == 1
-    assert captured[0].max_retries == 3
-
-
-def test_batch_http_max_retries_sets_client_config(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a:
-            model: alias-model
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured = []
-
-    async def fake_run(config):
-        captured.append(config)
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-            "--http-max-retries",
-            "7",
-        ]
-    )
-
-    assert exit_code == 0
-    assert len(captured) == 1
-    assert captured[0].client_config.max_retries == 7
-
-
-def test_deprecated_enable_additional_retries_warns_and_maps_to_default_attempts(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-    caplog: pytest.LogCaptureFixture,
-) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a:
-            model: alias-model
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured_attempts: list[int] = []
-
-    def fake_patch(*, attempts=3, backoff_s=1.0, log_path="medarc_model_retry.log"):  # noqa: ARG001
-        captured_attempts.append(attempts)
-
-    async def fake_run(config):  # noqa: ARG001
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.utils.retry.patch_verifiers_model_response_retry", fake_patch)
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-
-    with caplog.at_level(logging.WARNING):
-        exit_code = main.main(
-            [
-                "bench",
-                "--config",
-                str(config_path),
-                "--output-dir",
-                str(output_dir),
-                "--env-dir",
-                str(env_dir),
-                "--enable-additional-retries",
-            ]
-        )
-
-    assert exit_code == 0
-    assert captured_attempts == [3]
-    assert "Flag --enable-additional-retries is deprecated" in caplog.text
-
-
-def test_model_call_retries_overrides_deprecated_toggle(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-    caplog: pytest.LogCaptureFixture,
-) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a:
-            model: alias-model
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured_attempts: list[int] = []
-
-    def fake_patch(*, attempts=3, backoff_s=1.0, log_path="medarc_model_retry.log"):  # noqa: ARG001
-        captured_attempts.append(attempts)
-
-    async def fake_run(config):  # noqa: ARG001
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.utils.retry.patch_verifiers_model_response_retry", fake_patch)
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-
-    with caplog.at_level(logging.WARNING):
-        exit_code = main.main(
-            [
-                "bench",
-                "--config",
-                str(config_path),
-                "--output-dir",
-                str(output_dir),
-                "--env-dir",
-                str(env_dir),
-                "--enable-additional-retries",
-                "--model-call-retries",
-                "5",
-            ]
-        )
-
-    assert exit_code == 0
-    assert captured_attempts == [5]
-    assert "Ignoring deprecated --enable-additional-retries" in caplog.text
-
-
-def test_batch_dry_run_with_model_call_retries_does_not_patch(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a:
-            model: alias-model
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    captured_attempts: list[int] = []
-
-    def fake_patch(*, attempts=3, backoff_s=1.0, log_path="medarc_model_retry.log"):  # noqa: ARG001
-        captured_attempts.append(attempts)
-
-    monkeypatch.setattr("medarc_verifiers.utils.retry.patch_verifiers_model_response_retry", fake_patch)
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-            "--dry-run",
-            "--model-call-retries",
-            "3",
-        ]
-    )
-
-    assert exit_code == 0
-    assert captured_attempts == []
-
-
-def test_env_rerun_flag_forces_completed_jobs(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        name: rerun-check
-        models:
-          model-a:
-            model: alias-model
-        envs:
-          env-a:
-            rerun: true
-        jobs:
-          - model: model-a
-            env: env-a
-        """,
-    )
-
-    captured = []
-
-    async def fake_run(config):
-        captured.append(config)
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-        ]
-    )
-    assert exit_code == 0
-    assert len(captured) == 1
-    run_dirs = list(output_dir.iterdir())
-    assert len(run_dirs) == 1
-    run_dir = run_dirs[0]
-
-    exit_code_second = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-        ]
-    )
-    assert exit_code_second == 0
-    assert len(captured) == 2
-
-    manifest_path = run_dir / "run_manifest.json"
-    manifest = json.loads(manifest_path.read_text())
-    job_entry = manifest["jobs"][0]
-    assert job_entry["status"] == "completed"
-    assert job_entry["attempt"] == 2
-    assert manifest["summary"]["completed"] == 1
-
-
-def test_on_complete_rerun_marks_completed_jobs_as_forced(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        name: rerun-on-complete
-        models:
-          model-a:
-            model: alias-model
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    async def fake_run(config):  # noqa: ARG001
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    env_dir = tmp_path / "envs"
-    env_dir.mkdir()
-
-    first_exit = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-            "--run-id",
-            "forced-rerun-test",
-        ]
-    )
-    assert first_exit == 0
-
-    captured: dict[str, Any] = {}
-
-    def fake_execute_jobs(planned_jobs, settings, **kwargs):  # noqa: ANN001, ARG001
-        captured["planned_job_ids"] = [job.job_id for job in planned_jobs]
-        captured["forced_job_ids"] = set(settings.forced_job_ids)
-        return [
-            main.JobExecutionResult(
-                job_id=planned_jobs[0].job_id,
-                status="skipped",
-                output_path=settings.output_dir / settings.run_id / planned_jobs[0].job_id,
-            )
-        ]
-
-    monkeypatch.setattr("medarc_verifiers.cli.main.execute_jobs", fake_execute_jobs)
-
-    second_exit = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-dir",
-            str(env_dir),
-            "--run-id",
-            "forced-rerun-test",
-            "--on-complete",
-            "rerun",
-        ]
-    )
-    assert second_exit == 0
-    assert captured["planned_job_ids"] == ["model-a-medqa"]
-    assert captured["forced_job_ids"] == {"model-a-medqa"}
-
-
-def test_cli_env_config_root_override(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_dir = tmp_path / "configs"
-    config_dir.mkdir()
-    config_path = config_dir / "jobs.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a: {}
-        envs:
-          - custom_env
-        jobs:
-          - model: model-a
-            env: custom_env
-        """,
-    )
-
-    shared_envs = tmp_path / "shared_envs"
-    shared_envs.mkdir()
-    (shared_envs / "custom_env.yaml").write_text(
-        """
-        - id: custom_env
-          module: custom_env
-        """,
-        encoding="utf-8",
-    )
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    async def fake_run(config):
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--env-config-root",
-            str(shared_envs),
-        ]
-    )
-
-    assert exit_code == 0
-
-
-##
-
-
-def test_regen_reuses_completed_jobs(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a: {}
-          model-b: {}
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-          - model: model-b
-            env: medqa
-        """,
-    )
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    async def first_run(config):
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", first_run)
-
-    output_dir = tmp_path / "runs_out"
-    base_run = "base-run"
-    assert (
-        main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", base_run]) == 0
-    )
-
-    base_manifest_path = output_dir / base_run / "run_manifest.json"
-    base_manifest = json.loads(base_manifest_path.read_text())
-    base_manifest["jobs"][1]["status"] = "failed"
-    base_manifest["jobs"][1]["reason"] = "boom"
-    base_manifest_path.write_text(json.dumps(base_manifest, indent=2))
-
-    calls: list[int] = []
-
-    async def regen_run(config):
-        calls.append(1)
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", regen_run)
-
-    # Restart now uses the --restart flag and performs in-place extension of the seed run.
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--restart",
-            base_run,
-        ]
-    )
-    assert exit_code == 0
-    assert len(calls) == 1
-
-    # Manifest is updated in-place under the original run id (base_run)
-    updated_manifest = json.loads((output_dir / base_run / "run_manifest.json").read_text())
-    reasons = {entry["job_id"]: entry.get("reason") for entry in updated_manifest["jobs"]}
-    assert reasons["model-a-medqa"] == "up_to_date"
-    assert updated_manifest["summary"]["completed"] == 2
-    # restart_source may remain None for in-place restarts; no assertion on legacy regen_source field.
-
-
-def test_regen_accepts_path_to_run_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    """--restart can be a direct path to a run directory, not only a run-id under output_dir."""
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a: {}
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    async def fake_run(config):
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    base_run = output_dir / "base-run"
-    # First run to create a seed manifest
-    assert (
-        main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", "base-run"]) == 0
-    )
-
-    # Now use --restart with an explicit path to the run directory
-    # Use --restart with explicit path to existing run directory; should update in place.
-    # Mock interactive prompt to avoid stdin capture when all jobs are already completed.
-    monkeypatch.setattr("medarc_verifiers.cli.main._prompt_completed_jobs_action", lambda: "continue")
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--restart",
-            str(base_run),
-        ]
-    )
-    assert exit_code == 0
-    # Ensure manifest exists after restart-in-place; legacy regen_source not asserted.
-    assert (output_dir / "base-run" / "run_manifest.json").exists()
-
-
-def test_regen_accepts_manifest_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a: {}
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    async def fake_run(config):
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fake_run)
-
-    output_dir = tmp_path / "runs_out"
-    base_run = "base-run"
-    assert (
-        main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", base_run]) == 0
-    )
-
-    manifest_path = output_dir / base_run / "run_manifest.json"
-    monkeypatch.setattr("medarc_verifiers.cli.main._prompt_completed_jobs_action", lambda: "continue")
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--restart",
-            str(manifest_path),
-        ]
-    )
-    assert exit_code == 0
-
-
-def test_invalid_run_id_rejected(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: Path, caplog: pytest.LogCaptureFixture
-) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a: {}
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    output_dir = tmp_path / "runs_out"
-    caplog.set_level(logging.ERROR)
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--run-id",
-            "../oops",
-        ]
-    )
-    assert exit_code == 1
-    assert "Invalid --run-id '../oops'" in caplog.text
-    assert "Suggested safe value: --run-id" in caplog.text
-
-    caplog.clear()
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--run-id",
-            "/tmp/elsewhere",
-        ]
-    )
-    assert exit_code == 1
-    assert "Invalid --run-id '/tmp/elsewhere'" in caplog.text
-
-
-def test_restart_run_id_rejects_traversal(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-    caplog: pytest.LogCaptureFixture,
-) -> None:
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-        models:
-          model-a: {}
-        envs:
-          medqa: {}
-        jobs:
-          - model: model-a
-            env: medqa
-        """,
-    )
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    output_dir = tmp_path / "runs_out"
-    caplog.set_level(logging.ERROR)
-    exit_code = main.main(
-        [
-            "bench",
-            "--config",
-            str(config_path),
-            "--output-dir",
-            str(output_dir),
-            "--restart",
-            "../escape",
-        ]
-    )
-    assert exit_code == 1
-    assert "Invalid --restart '../escape'" in caplog.text
-
-
-def test_auto_resume_discovery_without_run_id(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    """Auto-resume should discover a prior matching run when --run-id is omitted."""
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-models:
-  model-a: {}
-  model-b: {}
-envs:
-  medqa: {}
-jobs:
-  - model: model-a
-    env: medqa
-  - model: model-b
-    env: medqa
-        """,
-    )
-
-    # Avoid external dependencies
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    async def first_run(config):
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", first_run)
-
-    output_dir = tmp_path / "runs_out"
-    run_id = "discover-me"
-    # Create the prior run
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", run_id]) == 0
-
-    # Mark one job as failed to make the run incomplete
-    manifest_path = output_dir / run_id / "run_manifest.json"
-    manifest = json.loads(manifest_path.read_text())
-    manifest["jobs"][1]["status"] = "failed"
-    manifest["jobs"][1]["reason"] = "boom"
-    manifest_path.write_text(json.dumps(manifest, indent=2))
-
-    # Now resume without specifying --run-id; it should discover the 'discover-me' run
-    calls: list[int] = []
-
-    async def resume_run(config):
-        calls.append(1)
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", resume_run)
-
-    exit_code = main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)])
-    assert exit_code == 0
-    assert len(calls) == 1  # only the failed job should be re-run
-
-    # Verify the discovered run was updated to completion
-    manifest_after = json.loads(manifest_path.read_text())
-    assert manifest_after["summary"]["completed"] == 2
-    assert manifest_after["summary"]["failed"] == 0
-
-
-def test_no_auto_resume_forces_new_run(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    """Passing --no-auto-resume should ignore existing manifests and start a new run."""
-    config_path = tmp_path / "config.yaml"
-    _write_config(
-        config_path,
-        """
-models:
-  model-a: {}
-  model-b: {}
-envs:
-  medqa: {}
-jobs:
-  - model: model-a
-    env: medqa
-  - model: model-b
-    env: medqa
-        """,
-    )
-
-    monkeypatch.setattr("medarc_verifiers.cli._config_loader.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_env_metadata", lambda *args, **kwargs: [])
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.load_endpoint_registry", lambda *args, **kwargs: {})
-
-    async def first_run(config):
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", first_run)
-
-    output_dir = tmp_path / "runs_out"
-    run_id = "baseline-run"
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--run-id", run_id]) == 0
-
-    manifest_path = output_dir / run_id / "run_manifest.json"
-    manifest = json.loads(manifest_path.read_text())
-    manifest["jobs"][0]["status"] = "failed"
-    manifest["jobs"][0]["reason"] = "boom"
-    manifest_path.write_text(json.dumps(manifest, indent=2))
-
-    calls: list[int] = []
-
-    async def fresh_run(config):
-        calls.append(1)
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._job_executor.run_evaluation", fresh_run)
-
-    preexisting = {child.name for child in output_dir.iterdir()}
-    exit_code = main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--no-auto-resume"])
-    assert exit_code == 0
-    assert len(calls) == 2  # both jobs rerun in the fresh run
-
-    post = {child.name for child in output_dir.iterdir()}
-    new_runs = post - preexisting
-    assert run_id in post
-    assert len(new_runs) == 1
-    new_run_id = next(iter(new_runs))
-    assert new_run_id != run_id
-    assert (output_dir / new_run_id / "run_manifest.json").exists()
-
-
-def test_single_run_help_lists_env_section_and_header_file(
-    monkeypatch: pytest.MonkeyPatch,
-    capsys: pytest.CaptureFixture[str],
-) -> None:
-    metadata = [
-        _make_env_param(
-            "difficulty",
-            kind="str",
-            default="easy",
-            choices=("easy", "hard"),
-        )
-    ]
-    _patch_single_run_env(monkeypatch, metadata)
-
-    exit_code = main.main(["medqa", "--help"])
-
-    assert exit_code == 0
-    captured = capsys.readouterr().out
-    assert "medqa environment options:" in captured
-    assert "--header-file" in captured
-
-
-def test_single_run_help_orders_env_group_before_core_options(
-    monkeypatch: pytest.MonkeyPatch,
-    capsys: pytest.CaptureFixture[str],
-) -> None:
-    metadata = [
-        _make_env_param(
-            "difficulty",
-            kind="str",
-            default="easy",
-            choices=("easy", "hard"),
-        )
-    ]
-    _patch_single_run_env(monkeypatch, metadata)
-
-    exit_code = main.main(["medqa", "--help"])
-    assert exit_code == 0
-    captured = capsys.readouterr().out
-    env_idx = captured.index("medqa environment options:")
-    core_idx = captured.index("medarc-eval options:")
-    assert env_idx < core_idx
-
-
-def test_general_help_uses_invoked_binary_name(
-    monkeypatch: pytest.MonkeyPatch,
-    capsys: pytest.CaptureFixture[str],
-) -> None:
-    monkeypatch.setattr(sys, "argv", ["medarc-eval"])
-
-    exit_code = main.main([])
-
-    assert exit_code == 0
-    captured = capsys.readouterr().out
-    assert "medarc-eval bench --help" in captured
-
-
-def test_single_run_missing_required_param_errors(
-    monkeypatch: pytest.MonkeyPatch,
-    capsys: pytest.CaptureFixture[str],
-) -> None:
-    metadata = [
-        _make_env_param(
-            "threshold",
-            kind="int",
-            required=True,
-            annotation=int,
-            argparse_type=int,
-        )
-    ]
-    _patch_single_run_env(monkeypatch, metadata)
-    monkeypatch.setattr(
-        "medarc_verifiers.cli._single_run.run_evaluation",
-        lambda *args, **kwargs: pytest.fail("Should not run when args invalid."),
-    )
-
-    with pytest.raises(SystemExit) as excinfo:
-        main.main(["medqa"])
-
-    assert excinfo.value.code == 2
-    err = capsys.readouterr().err
-    assert "Missing required environment arguments: threshold" in err
-
-
-def test_single_run_boolean_negation_and_sampling_precedence(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    metadata = [
-        _make_env_param(
-            "use_think",
-            kind="bool",
-            default=True,
-            annotation=bool,
-            argparse_type=None,
-            action="BooleanOptionalAction",
-        )
-    ]
-    _patch_single_run_env(monkeypatch, metadata)
-
-    captured = []
-
-    async def fake_run(config):
-        captured.append(config)
-        return _stub_cli_result()
-
-    monkeypatch.setattr("medarc_verifiers.cli._single_run.run_evaluation", fake_run)
-
-    exit_code = main.main(
-        [
-            "medqa",
-            "--no-use-think",
-            "--sampling-arg",
-            "max_tokens=64",
-            "--max-tokens",
-            "128",
-        ]
-    )
-
-    assert exit_code == 0
-    assert len(captured) == 1
-    eval_config = captured[0]
-    assert eval_config.env_args["use_think"] is False
-    assert eval_config.sampling_args["max_tokens"] == 64
-
-
-def test_single_run_header_file_overrides_cli_headers(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-    capsys: pytest.CaptureFixture[str],
-) -> None:
-    metadata: list[EnvParam] = []
-    _patch_single_run_env(monkeypatch, metadata)
-
-    header_file = tmp_path / "headers.txt"
-    header_file.write_text("X-Test: file\n", encoding="utf-8")
-
-    exit_code = main.main(
-        [
-            "medqa",
-            "--dry-run",
-            "--header",
-            "X-Test: cli",
-            "--header-file",
-            str(header_file),
-        ]
-    )
-
-    assert exit_code == 0
-    output = capsys.readouterr().out
-    config = json.loads(output)
-    assert config["client_config"]["extra_headers"] == {"X-Test": "file"}
-
-
-def test_single_run_auto_adds_prime_team_header(
-    monkeypatch: pytest.MonkeyPatch,
-    capsys: pytest.CaptureFixture[str],
-) -> None:
-    metadata: list[EnvParam] = []
-    _patch_single_run_env(monkeypatch, metadata)
-    monkeypatch.setenv("PRIME_TEAM_ID", "team-123")
-
-    exit_code = main.main(
-        [
-            "medqa",
-            "--dry-run",
-            "--api-base-url",
-            PRIME_INFERENCE_URL,
-            "--header",
-            "X-Test: cli",
-        ]
-    )
-
-    assert exit_code == 0
-    output = capsys.readouterr().out
-    config = json.loads(output)
-    assert config["client_config"]["extra_headers"] == {
-        "X-Prime-Team-ID": "team-123",
-        "X-Test": "cli",
-    }
+    output = capsys.readouterr().out
+    config = json.loads(output)
+    assert config["client_config"]["extra_headers"] == {
+        "X-Prime-Team-ID": "team-123",
+        "X-Test": "cli",
+    }
 
 
 def test_single_run_prime_url_forces_prime_api_key_when_key_var_not_explicit(
@@ -1908,28 +779,6 @@ async def fail_if_called(*args, **kwargs):
     assert '"env_id": "medqa"' in output
 
 
-def test_single_run_dry_run_with_model_call_retries_does_not_patch(
-    monkeypatch: pytest.MonkeyPatch,
-    capsys: pytest.CaptureFixture[str],
-) -> None:
-    metadata: list[EnvParam] = []
-    _patch_single_run_env(monkeypatch, metadata)
-
-    captured_attempts: list[int] = []
-
-    def fake_patch(*, attempts=3, backoff_s=1.0, log_path="medarc_model_retry.log"):  # noqa: ARG001
-        captured_attempts.append(attempts)
-
-    monkeypatch.setattr("medarc_verifiers.utils.retry.patch_verifiers_model_response_retry", fake_patch)
-
-    exit_code = main.main(["medqa", "--dry-run", "--model-call-retries", "3"])
-
-    assert exit_code == 0
-    assert captured_attempts == []
-    output = capsys.readouterr().out
-    assert '"env_id": "medqa"' in output
-
-
 def test_single_run_retry_flags_apply_to_client_and_eval_config(
     monkeypatch: pytest.MonkeyPatch,
     capsys: pytest.CaptureFixture[str],
diff --git a/tests/test_cli/test_manifest_planner.py b/tests/test_cli/test_manifest_planner.py
deleted file mode 100644
index ce44b091..00000000
--- a/tests/test_cli/test_manifest_planner.py
+++ /dev/null
@@ -1,491 +0,0 @@
-from __future__ import annotations
-
-import logging
-from pathlib import Path
-
-import pytest
-
-from medarc_verifiers.cli._job_builder import ResolvedJob
-from medarc_verifiers.cli._manifest import RunManifest
-from medarc_verifiers.cli._manifest_planner import ManifestPlanner, _find_auto_resume_candidate
-from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, ModelConfigSchema
-
-
-def _make_job(job_id: str = "job-a", env_id: str = "env-a", model_id: str = "model-a") -> ResolvedJob:
-    env = EnvironmentConfigSchema(id=env_id, module=env_id)
-    model = ModelConfigSchema(id=model_id, model="gpt-4.1-mini")
-    return ResolvedJob(
-        job_id=job_id,
-        name=job_id,
-        model=model,
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-
-def _planner(
-    *,
-    tmp_path: Path,
-    jobs: list[ResolvedJob],
-    config_checksum: str = "abc123",
-    run_id: str | None = None,
-    restart_source: str | None = None,
-    auto_resume: bool = True,
-    persist: bool = True,
-) -> ManifestPlanner:
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    env_args_map = {job.job_id: {} for job in jobs}
-    sampling_args_map = {job.job_id: {} for job in jobs}
-    return ManifestPlanner(
-        output_dir=tmp_path / "runs",
-        run_id=run_id,
-        run_name="demo-run",
-        config_path=config_path,
-        config_checksum=config_checksum,
-        jobs=jobs,
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        restart_source=restart_source,
-        auto_resume=auto_resume,
-        persist=persist,
-    )
-
-
-def test_restart_in_place_reuses_completed_job(tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    job = _make_job()
-    env_args_map = {job.job_id: {}}
-    sampling_args_map = {job.job_id: {}}
-    run_dir = tmp_path / "runs" / "base-run"
-    manifest = RunManifest.create(
-        run_dir=run_dir,
-        run_id="base-run",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="abc123",
-        jobs=[job],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-    manifest.record_job_completion(
-        job.job_id,
-        duration_seconds=1.0,
-        results_dir=run_dir / job.job_id,
-        avg_reward=None,
-        metrics={},
-        num_examples=job.env.num_examples,
-        rollouts_per_example=job.env.rollouts_per_example,
-    )
-
-    planner = _planner(tmp_path=tmp_path, jobs=[job], restart_source="base-run")
-    plan = planner.plan(force_all=False, forced_envs=set())
-
-    assert plan.manifest.path == manifest.path
-    assert plan.runnable_job_ids == set()
-    assert plan.reused_job_ids == {job.job_id}
-
-
-def test_auto_resume_prefers_incomplete_run(tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    job = _make_job()
-    env_args_map = {job.job_id: {}}
-    sampling_args_map = {job.job_id: {}}
-    output_dir = tmp_path / "runs"
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    incomplete_dir = output_dir / "incomplete-run"
-    RunManifest.create(
-        run_dir=incomplete_dir,
-        run_id="incomplete-run",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="abc123",
-        jobs=[job],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-
-    complete_dir = output_dir / "complete-run"
-    complete_manifest = RunManifest.create(
-        run_dir=complete_dir,
-        run_id="complete-run",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="abc123",
-        jobs=[job],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-    complete_manifest.record_job_completion(
-        job.job_id,
-        duration_seconds=1.0,
-        results_dir=complete_dir / job.job_id,
-        avg_reward=None,
-        metrics={},
-        num_examples=job.env.num_examples,
-        rollouts_per_example=job.env.rollouts_per_example,
-    )
-
-    candidate = _find_auto_resume_candidate(output_dir, expected_checksum="abc123")
-    assert candidate == incomplete_dir
-
-    planner = _planner(tmp_path=tmp_path, jobs=[job], auto_resume=True)
-    plan = planner.plan(force_all=False, forced_envs=set())
-
-    assert plan.manifest.run_dir == incomplete_dir
-    assert plan.runnable_job_ids == {job.job_id}
-    assert plan.reused_job_ids == set()
-
-
-def test_auto_resume_with_checksum_mismatch_raises(tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    job = _make_job()
-    env_args_map = {job.job_id: {}}
-    sampling_args_map = {job.job_id: {}}
-    run_dir = tmp_path / "runs" / "existing"
-    RunManifest.create(
-        run_dir=run_dir,
-        run_id="existing",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="different",
-        jobs=[job],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-
-    planner = _planner(tmp_path=tmp_path, jobs=[job], run_id="existing", auto_resume=True, config_checksum="abc123")
-    with pytest.raises(
-        ValueError,
-        match=(
-            r"Run 'existing' was created from a different configuration\."
-            r".*--no-auto-resume.*--restart existing"
-        ),
-    ):
-        planner.plan(force_all=False, forced_envs=set())
-
-
-def test_auto_resume_allows_resume_tolerant_model_fields(tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    env = EnvironmentConfigSchema(id="env-a", module="env-a")
-    model = ModelConfigSchema(id="model-a", model="gpt-4.1-mini", max_concurrent=16, timeout=30.0)
-    job = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=model,
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    env_args_map = {job.job_id: {}}
-    sampling_args_map = {job.job_id: {}}
-    run_dir = tmp_path / "runs" / "existing"
-    manifest = RunManifest.create(
-        run_dir=run_dir,
-        run_id="existing",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="abc123",
-        jobs=[job],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-    manifest.record_job_completion(
-        job.job_id,
-        duration_seconds=1.0,
-        results_dir=run_dir / job.job_id,
-        avg_reward=None,
-        metrics={},
-        num_examples=job.env.num_examples,
-        rollouts_per_example=job.env.rollouts_per_example,
-    )
-
-    planner = _planner(tmp_path=tmp_path, jobs=[job], run_id="existing", auto_resume=True, config_checksum="abc123")
-    plan = planner.plan(force_all=False, forced_envs=set())
-
-    assert plan.manifest.run_dir == run_dir
-    assert plan.runnable_job_ids == set()
-
-
-def test_restart_dir_missing_manifest_raises(tmp_path: Path) -> None:
-    job = _make_job()
-    seed_dir = tmp_path / "seed-run"
-    seed_dir.mkdir(parents=True, exist_ok=True)
-    planner = _planner(tmp_path=tmp_path, jobs=[job], restart_source=str(seed_dir))
-    with pytest.raises(ValueError, match="run_manifest.json"):
-        planner.plan(force_all=False, forced_envs=set())
-
-
-def test_auto_resume_allows_provider_overrides(tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    env = EnvironmentConfigSchema(id="env-a", module="env-a")
-
-    job_seed = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=ModelConfigSchema(
-            id="model-a",
-            model="google/gemini-3-pro-preview",
-            api_base_url="https://api.pinference.ai/api/v1",
-            api_key_var="PRIME_API_KEY",
-            headers={"X-Prime-Team-ID": "team-a"},
-        ),
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    env_args_map = {job_seed.job_id: {}}
-    sampling_args_map = {job_seed.job_id: {}}
-    run_dir = tmp_path / "runs" / "existing"
-    manifest = RunManifest.create(
-        run_dir=run_dir,
-        run_id="existing",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="abc123",
-        jobs=[job_seed],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-    manifest.record_job_completion(
-        job_seed.job_id,
-        duration_seconds=1.0,
-        results_dir=run_dir / job_seed.job_id,
-        avg_reward=None,
-        metrics={},
-        num_examples=job_seed.env.num_examples,
-        rollouts_per_example=job_seed.env.rollouts_per_example,
-    )
-
-    # Same model id, but updated provider settings. These should be resume-tolerant.
-    job_current = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=ModelConfigSchema(
-            id="model-a",
-            model="google/gemini-3-pro-preview",
-            api_base_url="https://generativelanguage.googleapis.com/v1beta/openai",
-            api_key_var="GEMINI_API_KEY",
-            headers={},
-        ),
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    planner = _planner(
-        tmp_path=tmp_path,
-        jobs=[job_current],
-        run_id="existing",
-        auto_resume=True,
-        config_checksum="abc123",
-    )
-    plan = planner.plan(force_all=False, forced_envs=set())
-
-    assert plan.manifest.run_dir == run_dir
-    assert plan.runnable_job_ids == set()
-    # Auto-resume doesn't populate reused_job_ids (only restart strategies do).
-    assert plan.reused_job_ids == set()
-
-
-def test_restart_in_place_allows_extra_body_usage_override(caplog: pytest.LogCaptureFixture, tmp_path: Path) -> None:
-    caplog.set_level(logging.WARNING, logger="medarc_verifiers.cli._manifest")
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    env = EnvironmentConfigSchema(id="env-a", module="env-a")
-
-    job_seed = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=ModelConfigSchema(
-            id="model-a",
-            model="google/gemini-3-pro-preview",
-            sampling_args={"temperature": 0.2, "extra_body": {"usage": {"include": True}}},
-        ),
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    env_args_map = {job_seed.job_id: {}}
-    sampling_args_map = {job_seed.job_id: {}}
-    run_dir = tmp_path / "runs" / "base-run"
-    RunManifest.create(
-        run_dir=run_dir,
-        run_id="base-run",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="seed",
-        jobs=[job_seed],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-
-    # Same model id, but drop/alter extra_body.usage (provider-specific).
-    job_current = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=ModelConfigSchema(
-            id="model-a",
-            model="google/gemini-3-pro-preview",
-            sampling_args={"temperature": 0.2, "extra_body": {}},
-        ),
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    planner = _planner(
-        tmp_path=tmp_path,
-        jobs=[job_current],
-        restart_source=str(run_dir),
-        auto_resume=False,
-        config_checksum="current",
-    )
-    plan = planner.plan(force_all=False, forced_envs=set())
-
-    assert plan.manifest.run_dir == run_dir
-    assert any("sampling_args.extra_body changed" in record.message for record in caplog.records)
-
-
-def test_restart_in_place_allows_sampling_args_override(caplog: pytest.LogCaptureFixture, tmp_path: Path) -> None:
-    caplog.set_level(logging.WARNING, logger="medarc_verifiers.cli._manifest")
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    env = EnvironmentConfigSchema(id="env-a", module="env-a")
-
-    job_seed = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=ModelConfigSchema(
-            id="model-a",
-            model="google/gemini-3-pro-preview",
-            sampling_args={"temperature": 0.2, "top_k": 64},
-        ),
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    env_args_map = {job_seed.job_id: {}}
-    sampling_args_map = {job_seed.job_id: {}}
-    run_dir = tmp_path / "runs" / "base-run"
-    RunManifest.create(
-        run_dir=run_dir,
-        run_id="base-run",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="seed",
-        jobs=[job_seed],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-
-    # Same model id, but provider-specific sampling args changed.
-    job_current = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=ModelConfigSchema(
-            id="model-a",
-            model="google/gemini-3-pro-preview",
-            sampling_args={"temperature": 0.2},
-        ),
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    planner = _planner(
-        tmp_path=tmp_path,
-        jobs=[job_current],
-        restart_source=str(run_dir),
-        auto_resume=False,
-        config_checksum="current",
-    )
-    plan = planner.plan(force_all=False, forced_envs=set())
-
-    assert plan.manifest.run_dir == run_dir
-    assert any("sampling_args changed" in record.message for record in caplog.records)
-
-
-def test_restart_in_place_allows_model_namespace_override(tmp_path: Path) -> None:
-    config_path = tmp_path / "config.yaml"
-    config_path.write_text("config: test\n", encoding="utf-8")
-    env = EnvironmentConfigSchema(id="env-a", module="env-a")
-
-    job_seed = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=ModelConfigSchema(
-            id="model-a",
-            model="google/gemini-3-pro-preview",
-        ),
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    env_args_map = {job_seed.job_id: {}}
-    sampling_args_map = {job_seed.job_id: {}}
-    run_dir = tmp_path / "runs" / "base-run"
-    RunManifest.create(
-        run_dir=run_dir,
-        run_id="base-run",
-        run_name="demo-run",
-        config_source=config_path,
-        config_checksum="seed",
-        jobs=[job_seed],
-        env_args_map=env_args_map,
-        sampling_args_map=sampling_args_map,
-        persist=True,
-        restart_source=None,
-    )
-
-    # Same underlying model, but without the provider namespace prefix.
-    job_current = ResolvedJob(
-        job_id="job-a",
-        name="job-a",
-        model=ModelConfigSchema(
-            id="model-a",
-            model="gemini-3-pro-preview",
-        ),
-        env=env,
-        env_args={},
-        sampling_args={},
-    )
-
-    planner = _planner(
-        tmp_path=tmp_path,
-        jobs=[job_current],
-        restart_source=str(run_dir),
-        auto_resume=False,
-        config_checksum="current",
-    )
-    plan = planner.plan(force_all=False, forced_envs=set())
-
-    assert plan.manifest.run_dir == run_dir
diff --git a/tests/test_cli/test_manifest_snapshot.py b/tests/test_cli/test_manifest_snapshot.py
deleted file mode 100644
index ccddb9ee..00000000
--- a/tests/test_cli/test_manifest_snapshot.py
+++ /dev/null
@@ -1,452 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-from pathlib import Path
-from typing import Any
-
-import pytest
-
-from medarc_verifiers.cli._job_builder import ResolvedJob
-from medarc_verifiers.cli._manifest import (
-    _ENSURE_JOB_RUNTIME_STATE_FIELDS,
-    MANIFEST_FILENAME,
-    MANIFEST_VERSION,
-    ManifestJobEntry,
-    RunManifest,
-    RunManifestModel,
-    build_job_entry,
-    compute_snapshot_checksum,
-    manifest_job_signature,
-    resolved_job_signature,
-)
-from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, ModelConfigSchema
-
-SNAPSHOT_ENV_VAR = "UPDATE_CLI_MANIFEST_SNAPSHOT"
-SNAPSHOT_PATH = Path(__file__).parent / "data" / "run_manifest_snapshot.json"
-
-
-def _build_job() -> ResolvedJob:
-    model = ModelConfigSchema(
-        id="snapshot-model",
-        model="gpt-4o-mini",
-        headers={"X-Test": "one"},
-        sampling_args={"max_tokens": 256, "temperature": 0.3},
-        env_args={"split": "dev"},
-        env_overrides={"snapshot-env": {"temperature": 0.2}},
-    )
-    env = EnvironmentConfigSchema(
-        id="snapshot-env",
-        module="environments.snapshot_env",
-        num_examples=3,
-        rollouts_per_example=2,
-        max_concurrent=4,
-        independent_scoring=False,
-        state_columns=["student_answer", "score"],
-        env_args={"difficulty": "easy", "runner_seed": 99},
-    )
-    return ResolvedJob(
-        job_id="snapshot-model-snapshot-env",
-        name="snapshot-eval",
-        model=model,
-        env=env,
-        env_args={"difficulty": "easy", "runner_seed": 99, "split": "dev", "job_seed": 7},
-        sampling_args={"max_tokens": 256, "temperature": 0.3, "eval_seed": 17},
-    )
-
-
-def _normalize_manifest(payload: Any, *, base_dir: Path) -> Any:
-    base_posix = base_dir.as_posix()
-    base_native = str(base_dir)
-
-    if isinstance(payload, dict):
-        return {key: _normalize_manifest(value, base_dir=base_dir) for key, value in payload.items()}
-    if isinstance(payload, list):
-        return [_normalize_manifest(item, base_dir=base_dir) for item in payload]
-    if isinstance(payload, str):
-        return payload.replace(base_posix, "<TMP>").replace(base_native, "<TMP>")
-    return payload
-
-
-def test_run_manifest_snapshot(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    job = _build_job()
-    monkeypatch.setattr("medarc_verifiers.cli._manifest.timestamp", lambda: "2024-03-01T00:00:00Z")
-
-    run_dir = tmp_path / "snapshot-run"
-    snapshot_cfg = {
-        "models": {"snapshot-model": {"model": "gpt-4o-mini"}},
-        "envs": {"snapshot-env": {"module": "environments.snapshot_env"}},
-        "jobs": [{"model": "snapshot-model", "env": "snapshot-env"}],
-    }
-    manifest = RunManifest.create(
-        run_dir=run_dir,
-        run_id="snapshot-run",
-        run_name="Snapshot Run",
-        config_source=Path("configs/snapshot.yaml"),
-        config_checksum=compute_snapshot_checksum(snapshot_cfg),
-        jobs=[job],
-        env_args_map={job.job_id: job.env_args},
-        sampling_args_map={job.job_id: job.sampling_args},
-        persist=True,
-        restart_source="baseline-run",
-    )
-
-    manifest_path = manifest.path
-    assert manifest_path.name == MANIFEST_FILENAME
-    payload = json.loads(manifest_path.read_text(encoding="utf-8"))
-    normalized = _normalize_manifest(payload, base_dir=tmp_path)
-
-    SNAPSHOT_PATH.parent.mkdir(parents=True, exist_ok=True)
-    serialized = json.dumps(normalized, indent=2, sort_keys=True) + "\n"
-
-    if os.environ.get(SNAPSHOT_ENV_VAR):
-        SNAPSHOT_PATH.write_text(serialized, encoding="utf-8")
-
-    expected = json.loads(SNAPSHOT_PATH.read_text(encoding="utf-8"))
-    assert normalized == expected
-
-    loaded = RunManifest.load(manifest_path, persist=False)
-    assert loaded.model.config_checksum == expected["config_checksum"]
-    assert loaded.jobs[0].status == "pending"
-
-
-def test_manifest_load_upgrades_interleave_scoring(tmp_path: Path) -> None:
-    """Older manifests may store interleave_scoring in env_templates; load should upgrade it."""
-    manifest_path = tmp_path / "run_manifest.json"
-    payload = {
-        "version": 2,
-        "run_id": "demo",
-        "name": "Demo",
-        "config_source": "configs/demo.yaml",
-        "config_checksum": "abc",
-        "created_at": "2024-03-01T00:00:00Z",
-        "updated_at": "2024-03-01T00:00:00Z",
-        "models": {},
-        "env_templates": {
-            "env:template": {
-                "module": "environments.snapshot_env",
-                "num_examples": 3,
-                "rollouts_per_example": 2,
-                "interleave_scoring": False,
-            }
-        },
-        "jobs": [],
-        "summary": {},
-    }
-    manifest_path.write_text(json.dumps(payload), encoding="utf-8")
-
-    loaded = RunManifest.load(manifest_path, persist=False)
-    template = loaded.model.env_templates["env:template"]
-    assert "interleave_scoring" not in template
-    assert template["independent_scoring"] is False
-
-
-def test_manifest_serialization_prunes_nones_and_relativizes(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    job = _build_job()
-    fake_root = tmp_path / "repo"
-    fake_root.mkdir()
-    run_dir = fake_root / "runs" / "phase5"
-
-    def fake_to_project_relative(path: Path | str, *, default_base: Path | None = None) -> str:
-        resolved = Path(path).resolve()
-        base = fake_root if default_base is None else default_base
-        return resolved.relative_to(base).as_posix()
-
-    monkeypatch.setattr("medarc_verifiers.utils.pathing.project_root", lambda: fake_root)
-    monkeypatch.setattr("medarc_verifiers.utils.pathing.to_project_relative", fake_to_project_relative)
-
-    snapshot_cfg = {
-        "models": {"snapshot-model": {"model": "gpt-4o-mini"}},
-        "envs": {"snapshot-env": {"module": "environments.snapshot_env"}},
-        "jobs": [{"model": "snapshot-model", "env": "snapshot-env"}],
-    }
-    manifest = RunManifest.create(
-        run_dir=run_dir,
-        run_id="phase5",
-        run_name="Phase 5 Run",
-        config_source=fake_root / "configs" / "phase5.yaml",
-        config_checksum=compute_snapshot_checksum(snapshot_cfg),
-        jobs=[job],
-        env_args_map={job.job_id: job.env_args},
-        sampling_args_map={job.job_id: job.sampling_args},
-    )
-
-    payload = json.loads(manifest.path.read_text(encoding="utf-8"))
-    job_payload = payload["jobs"][0]
-
-    assert "results_dir" not in job_payload
-    assert "reason" not in job_payload
-    assert "avg_reward" not in job_payload
-    assert job_payload["env_args"]["job_seed"] == 7
-    assert job_payload["sampling_args"]["eval_seed"] == 17
-
-
-def test_manifest_job_signature_is_stable(tmp_path: Path) -> None:
-    job = _build_job()
-    run_dir = tmp_path / "sig-run"
-    manifest = RunManifest.create(
-        run_dir=run_dir,
-        run_id="sig-run",
-        run_name="Signature Run",
-        config_source=Path("configs/sig.yaml"),
-        config_checksum="sig",
-        jobs=[job],
-        env_args_map={job.job_id: job.env_args},
-        sampling_args_map={job.job_id: job.sampling_args},
-        persist=False,
-    )
-    entry = manifest.jobs[0]
-
-    signature = manifest_job_signature(manifest.model, entry)
-    assert signature == {
-        "model": {
-            "id": "snapshot-model",
-            "model": "gpt-4o-mini",
-            "sampling_args": {"max_tokens": 256, "temperature": 0.3},
-            "env_args": {"split": "dev"},
-            "env_overrides": {"snapshot-env": {"temperature": 0.2}},
-        },
-        "env": {
-            "module": "environments.snapshot_env",
-            "num_examples": 3,
-            "rollouts_per_example": 2,
-            "max_concurrent": 4,
-            "independent_scoring": False,
-            "state_columns": ["student_answer", "score"],
-            "print_results": False,
-            "rerun": False,
-            "id": "snapshot-env",
-            "env_args": {"difficulty": "easy", "runner_seed": 99, "split": "dev", "job_seed": 7},
-        },
-        "sampling_args": {"max_tokens": 256, "temperature": 0.3, "eval_seed": 17},
-    }
-
-
-def test_resolved_job_signature_is_stable() -> None:
-    job = _build_job()
-
-    signature = resolved_job_signature(job, env_args=job.env_args, sampling_args=job.sampling_args)
-    assert signature == {
-        "model": {
-            "id": "snapshot-model",
-            "model": "gpt-4o-mini",
-            "sampling_args": {"max_tokens": 256, "temperature": 0.3},
-            "env_args": {"split": "dev"},
-            "env_overrides": {"snapshot-env": {"temperature": 0.2}},
-        },
-        "env": {
-            "module": "environments.snapshot_env",
-            "num_examples": 3,
-            "rollouts_per_example": 2,
-            "max_concurrent": 4,
-            "independent_scoring": False,
-            "state_columns": ["student_answer", "score"],
-            "print_results": False,
-            "rerun": False,
-            "id": "snapshot-env",
-            "env_args": {"difficulty": "easy", "runner_seed": 99, "split": "dev", "job_seed": 7},
-        },
-        "sampling_args": {"max_tokens": 256, "temperature": 0.3, "eval_seed": 17},
-    }
-
-
-def test_build_job_entry_is_stable() -> None:
-    job = _build_job()
-    entry = build_job_entry(job, env_args=job.env_args, sampling_args=job.sampling_args, results_dir=None)
-    assert entry.model_dump() == {
-        "job_id": "snapshot-model-snapshot-env",
-        "env_id": "environments.snapshot_env",
-        "model_id": "snapshot-model",
-        "env_template_id": "environments.snapshot_env:6ef485576891",
-        "env_variant_id": "snapshot-env",
-        "env_args": {"difficulty": "easy", "runner_seed": 99, "split": "dev", "job_seed": 7},
-        "sampling_args": {"max_tokens": 256, "temperature": 0.3, "eval_seed": 17},
-        "status": "pending",
-        "reason": None,
-        "attempt": 0,
-        "started_at": None,
-        "ended_at": None,
-        "duration_seconds": None,
-        "results_dir": None,
-        "results_relpath": "snapshot-model-snapshot-env/results.jsonl",
-        "metadata_relpath": "snapshot-model-snapshot-env/metadata.json",
-        "row_count": None,
-        "metrics": None,
-        "avg_reward": None,
-        "num_examples": None,
-        "rollouts_per_example": None,
-    }
-
-
-def test_resolved_job_signature_ignores_resume_tolerant_fields() -> None:
-    base_job = _build_job()
-    model_variant = base_job.model.model_copy(update={"api_key_var": "ALT_KEY"})
-    variant_job = ResolvedJob(
-        job_id=base_job.job_id,
-        name=base_job.name,
-        model=model_variant,
-        env=base_job.env,
-        env_args=base_job.env_args,
-        sampling_args=base_job.sampling_args,
-        sleep=base_job.sleep,
-    )
-
-    base_sig = resolved_job_signature(base_job, env_args=base_job.env_args, sampling_args=base_job.sampling_args)
-    variant_sig = resolved_job_signature(
-        variant_job, env_args=variant_job.env_args, sampling_args=variant_job.sampling_args
-    )
-
-    assert base_sig == variant_sig
-
-
-def test_ensure_job_preserves_runtime_fields_on_update(tmp_path: Path) -> None:
-    seed_job = _build_job()
-    run_dir = tmp_path / "runtime-run"
-    manifest = RunManifest.create(
-        run_dir=run_dir,
-        run_id="runtime-run",
-        run_name="Runtime Run",
-        config_source=Path("configs/runtime.yaml"),
-        config_checksum="runtime",
-        jobs=[seed_job],
-        env_args_map={seed_job.job_id: seed_job.env_args},
-        sampling_args_map={seed_job.job_id: seed_job.sampling_args},
-        persist=False,
-    )
-    manifest.record_job_completion(
-        seed_job.job_id,
-        duration_seconds=3.5,
-        results_dir=run_dir / seed_job.job_id,
-        avg_reward=0.75,
-        metrics={"pass_rate": 0.75},
-        num_examples=12,
-        rollouts_per_example=2,
-    )
-    entry_before = manifest.job_entry(seed_job.job_id)
-    assert entry_before is not None
-    entry_before.row_count = 4
-    assert set(_ENSURE_JOB_RUNTIME_STATE_FIELDS) == {
-        "status",
-        "reason",
-        "attempt",
-        "started_at",
-        "ended_at",
-        "duration_seconds",
-        "row_count",
-        "metrics",
-        "avg_reward",
-        "num_examples",
-        "rollouts_per_example",
-    }
-    before_runtime = {
-        "status": entry_before.status,
-        "reason": entry_before.reason,
-        "attempt": entry_before.attempt,
-        "started_at": entry_before.started_at,
-        "ended_at": entry_before.ended_at,
-        "duration_seconds": entry_before.duration_seconds,
-        "row_count": entry_before.row_count,
-        "metrics": entry_before.metrics,
-        "avg_reward": entry_before.avg_reward,
-        "num_examples": entry_before.num_examples,
-        "rollouts_per_example": entry_before.rollouts_per_example,
-    }
-
-    updated_job = ResolvedJob(
-        job_id=seed_job.job_id,
-        name=seed_job.name,
-        model=seed_job.model,
-        env=seed_job.env,
-        env_args={**seed_job.env_args, "job_seed": 999},
-        sampling_args={**seed_job.sampling_args, "eval_seed": 999},
-        sleep=seed_job.sleep,
-    )
-    manifest.ensure_job(
-        updated_job,
-        env_args=updated_job.env_args,
-        sampling_args=updated_job.sampling_args,
-        results_dir=run_dir / updated_job.job_id,
-    )
-
-    entry_after = manifest.job_entry(seed_job.job_id)
-    assert entry_after is not None
-    after_runtime = {
-        "status": entry_after.status,
-        "reason": entry_after.reason,
-        "attempt": entry_after.attempt,
-        "started_at": entry_after.started_at,
-        "ended_at": entry_after.ended_at,
-        "duration_seconds": entry_after.duration_seconds,
-        "row_count": entry_after.row_count,
-        "metrics": entry_after.metrics,
-        "avg_reward": entry_after.avg_reward,
-        "num_examples": entry_after.num_examples,
-        "rollouts_per_example": entry_after.rollouts_per_example,
-    }
-
-    assert before_runtime == after_runtime
-
-
-def test_ensure_job_preserves_entry_object_identity(tmp_path: Path) -> None:
-    seed_job = _build_job()
-    run_dir = tmp_path / "identity-run"
-    manifest = RunManifest.create(
-        run_dir=run_dir,
-        run_id="identity-run",
-        run_name="Identity Run",
-        config_source=Path("configs/identity.yaml"),
-        config_checksum="identity",
-        jobs=[seed_job],
-        env_args_map={seed_job.job_id: seed_job.env_args},
-        sampling_args_map={seed_job.job_id: seed_job.sampling_args},
-        persist=False,
-    )
-    entry_before = manifest.job_entry(seed_job.job_id)
-    assert entry_before is not None
-
-    updated_job = ResolvedJob(
-        job_id=seed_job.job_id,
-        name=seed_job.name,
-        model=seed_job.model,
-        env=seed_job.env,
-        env_args={**seed_job.env_args, "job_seed": 111},
-        sampling_args={**seed_job.sampling_args, "eval_seed": 111},
-        sleep=seed_job.sleep,
-    )
-    manifest.ensure_job(
-        updated_job,
-        env_args=updated_job.env_args,
-        sampling_args=updated_job.sampling_args,
-        results_dir=run_dir / updated_job.job_id,
-    )
-    entry_after = manifest.job_entry(seed_job.job_id)
-    assert entry_after is not None
-    assert entry_before is entry_after
-    assert entry_before.env_args["job_seed"] == 111
-
-
-def test_manifest_job_signature_does_not_fallback_module_to_variant_id() -> None:
-    model = RunManifestModel(
-        version=MANIFEST_VERSION,
-        run_id="r",
-        name="n",
-        config_source="cfg.yaml",
-        config_checksum="x",
-        created_at="2024-01-01T00:00:00Z",
-        updated_at="2024-01-01T00:00:00Z",
-        models={},
-        env_templates={"template-no-module": {}},
-        jobs=[],
-        summary={},
-    )
-    entry = ManifestJobEntry(
-        job_id="job-x",
-        env_id=None,
-        model_id="missing-model",
-        env_template_id="template-no-module",
-        env_variant_id="variant-x",
-        env_args={},
-    )
-    signature = manifest_job_signature(model, entry)
-    assert "module" not in signature["env"]
-    assert signature["env"]["id"] == "variant-x"
diff --git a/tests/test_cli/test_manifest_tools.py b/tests/test_cli/test_manifest_tools.py
deleted file mode 100644
index 4274fb1e..00000000
--- a/tests/test_cli/test_manifest_tools.py
+++ /dev/null
@@ -1,151 +0,0 @@
-from __future__ import annotations
-
-import json
-from pathlib import Path
-
-from medarc_verifiers.cli._manifest_tools import validate_manifests_in_runs
-
-
-def _write_json(path: Path, payload: dict) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text(json.dumps(payload), encoding="utf-8")
-
-
-def _write_manifest(
-    run_dir: Path,
-    *,
-    num_examples: int | None = None,
-    rollouts_per_example: int | None = None,
-) -> None:
-    payload = {
-        "version": 3,
-        "run_id": "demo-run",
-        "name": "demo",
-        "config_source": "cfg.yaml",
-        "config_checksum": "x",
-        "created_at": "2024-01-01T00:00:00Z",
-        "updated_at": "2024-01-01T00:00:00Z",
-        "artifacts_root": ".",
-        "models": {},
-        "env_templates": {},
-        "jobs": [
-            {
-                "job_id": "job-1",
-                "model_id": "m",
-                "env_id": "e",
-                "env_template_id": "e:t",
-                "env_variant_id": "e",
-                "env_args": {},
-                "results_relpath": "job-1/results.jsonl",
-                "metadata_relpath": "job-1/metadata.json",
-                "status": "completed",
-                "num_examples": num_examples,
-                "rollouts_per_example": rollouts_per_example,
-            }
-        ],
-        "summary": {"total": 1, "completed": 1, "pending": 0, "failed": 0, "running": 0, "skipped": 0},
-    }
-    _write_json(run_dir / "run_manifest.json", payload)
-
-
-def test_validate_manifests_reports_broken_paths(tmp_path: Path) -> None:
-    runs_dir = tmp_path / "runs" / "raw"
-    run_dir = runs_dir / "demo-run"
-    job_dir = run_dir / "job-1"
-    _write_json(job_dir / "metadata.json", {"env_id": "demo"})
-    (job_dir / "results.jsonl").write_text('{"example_id": 1}\n', encoding="utf-8")
-
-    payload = {
-        "version": 3,
-        "run_id": "demo-run",
-        "name": "demo",
-        "config_source": "cfg.yaml",
-        "config_checksum": "x",
-        "created_at": "2024-01-01T00:00:00Z",
-        "updated_at": "2024-01-01T00:00:00Z",
-        "artifacts_root": ".",
-        "models": {},
-        "env_templates": {},
-        "jobs": [
-            {
-                "job_id": "job-1",
-                "model_id": "m",
-                "env_id": "e",
-                "env_template_id": "e:t",
-                "env_variant_id": "e",
-                "env_args": {},
-                "results_relpath": "broken/job-1/results.jsonl",
-                "status": "completed",
-            }
-        ],
-        "summary": {"total": 1, "completed": 1, "pending": 0, "failed": 0, "running": 0, "skipped": 0},
-    }
-    _write_json(run_dir / "run_manifest.json", payload)
-
-    result = validate_manifests_in_runs(runs_dir, strict=False)
-    assert result.manifests_checked == 1
-    assert result.jobs_checked == 1
-    assert any(issue.kind == "warning" and "fallback" in issue.message.lower() for issue in result.issues)
-
-
-def test_validate_manifests_accepts_partial_rollout_file(tmp_path: Path) -> None:
-    runs_dir = tmp_path / "runs" / "raw"
-    run_dir = runs_dir / "demo-run"
-    job_dir = run_dir / "job-1"
-    _write_json(job_dir / "metadata.json", {"env_id": "demo"})
-    (job_dir / "results.jsonl").write_text(
-        "\n".join(
-            [
-                '{"example_id": 1, "rollout_index": 0}',
-                '{"example_id": 2, "rollout_index": 0}',
-                '{"example_id": 1, "rollout_index": 1}',
-                '{"example_id": 2, "rollout_index": 1}',
-                '{"example_id": 1, "rollout_index": 2}',
-            ]
-        )
-        + "\n",
-        encoding="utf-8",
-    )
-    _write_manifest(run_dir, num_examples=2, rollouts_per_example=3)
-
-    result = validate_manifests_in_runs(runs_dir, strict=False)
-
-    assert result.manifests_checked == 1
-    assert result.jobs_checked == 1
-    assert result.issues == []
-
-
-def test_validate_manifests_reports_out_of_range_rollout_index(tmp_path: Path) -> None:
-    runs_dir = tmp_path / "runs" / "raw"
-    run_dir = runs_dir / "demo-run"
-    job_dir = run_dir / "job-1"
-    _write_json(job_dir / "metadata.json", {"env_id": "demo"})
-    (job_dir / "results.jsonl").write_text(
-        "\n".join(
-            [
-                '{"example_id": 1, "rollout_index": 0}',
-                '{"example_id": 2, "rollout_index": 0}',
-                '{"example_id": 1, "rollout_index": 3}',
-            ]
-        )
-        + "\n",
-        encoding="utf-8",
-    )
-    _write_manifest(run_dir, num_examples=2, rollouts_per_example=3)
-
-    result = validate_manifests_in_runs(runs_dir, strict=False)
-
-    assert any("out-of-range rollout_index" in issue.message for issue in result.issues)
-
-
-def test_validate_manifests_reports_malformed_last_jsonl_row(tmp_path: Path) -> None:
-    runs_dir = tmp_path / "runs" / "raw"
-    run_dir = runs_dir / "demo-run"
-    job_dir = run_dir / "job-1"
-    _write_json(job_dir / "metadata.json", {"env_id": "demo"})
-    (job_dir / "results.jsonl").write_text('{"example_id": 1}\n{"example_id": ', encoding="utf-8")
-    _write_manifest(run_dir, num_examples=1, rollouts_per_example=1)
-
-    result = validate_manifests_in_runs(runs_dir, strict=False)
-
-    assert any("failed to parse last JSONL row" in issue.message for issue in result.issues)
diff --git a/tests/test_cli/test_schemas.py b/tests/test_cli/test_schemas.py
index 4fe28197..abdf5f33 100644
--- a/tests/test_cli/test_schemas.py
+++ b/tests/test_cli/test_schemas.py
@@ -5,38 +5,9 @@
 from medarc_verifiers.cli._schemas import (
     EnvironmentConfigSchema,
     EnvironmentExportConfig,
-    ModelConfigSchema,
 )
 
 
-def test_model_params_merge_matches_explicit_definition() -> None:
-    explicit = ModelConfigSchema(
-        id="demo",
-        model="gpt-mini",
-        env_args={"split": "dev"},
-        env_overrides={"medqa": {"temperature": 0.2}},
-    )
-    legacy = ModelConfigSchema(
-        id="demo",
-        params={
-            "model": "gpt-mini",
-            "env_args": {"split": "dev"},
-            "env_overrides": {"medqa": {"temperature": 0.2}},
-        },
-    )
-
-    assert legacy.model_dump() == explicit.model_dump()
-
-
-def test_environment_matrix_exclude_with_unknown_key_raises() -> None:
-    with pytest.raises(ValueError, match="matrix_exclude entry references unknown keys"):
-        EnvironmentConfigSchema(
-            id="medqa",
-            matrix={"num_examples": [5]},
-            matrix_exclude=[{"unknown_key": 1}],
-        )
-
-
 def test_environment_export_config_validates_columns() -> None:
     env = EnvironmentConfigSchema(
         id="medqa",

From bd6e5d4d7ad61f52179c81e82fb33c2fac8836d4 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 20:38:44 +0000
Subject: [PATCH 12/53] Document TOML benchmark workflow

---
 README.md                             |  73 ++---
 configs/eval/README.md                |   6 +
 docs/README.md                        |   4 +-
 docs/medarc-eval-bench.md             | 375 +++++++++-----------------
 docs/medarc-eval-process.md           |  57 ++--
 docs/medarc-eval-winrate.md           |   6 +-
 docs/medarc-eval.md                   |  41 ++-
 docs/medarc-orchestrate.md            |  10 +-
 docs/medarc-verifiers-architecture.md | 315 ++++++++++------------
 9 files changed, 377 insertions(+), 510 deletions(-)

diff --git a/README.md b/README.md
index f98c7d13..2d12e925 100644
--- a/README.md
+++ b/README.md
@@ -102,12 +102,12 @@ Once your tooling is set up you can install MedARC-maintained environments direc
 
 ## medarc-eval CLI
 
-`medarc-eval` wraps the upstream `vf-eval` flow, adding environment-specific flags and batch orchestration. See [full documentation](docs/medarc-eval.md).
+`medarc-eval` wraps the upstream `verifiers` eval flow, adding environment-specific flags and a TOML bench workflow. See [full documentation](docs/medarc-eval.md).
 
 | Command | Description |
 |---------|-------------|
 | [`medarc-eval <ENV>`](docs/medarc-eval-single-run.md) | Run a single benchmark with auto-discovered environment flags |
-| [`medarc-eval bench`](docs/medarc-eval-bench.md) | Run multiple benchmarks from a YAML config with resume support |
+| [`medarc-eval bench`](docs/medarc-eval-bench.md) | Run upstream TOML eval configs with deterministic MedARC output paths |
 | [`medarc-eval process`](docs/medarc-eval-process.md) | Convert raw outputs to parquet for analysis |
 | [`medarc-eval winrate`](docs/medarc-eval-winrate.md) | Compute HELM-style win rates across models |
 
@@ -118,10 +118,10 @@ Once your tooling is set up you can install MedARC-maintained environments direc
 uv run medarc-eval medqa -m gpt-4.1-mini -n 25
 
 # Run batch evaluations from config
-uv run medarc-eval bench --config configs/job-gpt-oss-20b.yaml
+uv run medarc-eval bench --config configs/eval/smoke.toml
 
 # Process results and compute win rates
-uv run medarc-eval process
+uv run medarc-eval process --runs-dir runs/evals
 uv run medarc-eval winrate
 ```
 
@@ -145,51 +145,52 @@ uv run medarc-eval careqa --env-args '{"split": "open", "judge_model": "gpt-4o"}
 
 ## Batch Evaluations
 
-Use `medarc-eval bench` to run multiple model × environment evaluations from a config file. See [full batch mode documentation](docs/medarc-eval-bench.md).
+Use `medarc-eval bench` to run upstream `verifiers` TOML eval configs
+sequentially with deterministic MedARC output paths. See [full bench mode
+documentation](docs/medarc-eval-bench.md).
 
-```yaml
-name: gpt-oss-20b-med
-
-models:
-  gpt-oss-20b:
-    model: openai/gpt-oss-20b
-    api_base_url: http://localhost:8000/v1
-    sampling_args:
-      temperature: 1.0
-      reasoning_effort: medium
-
-jobs:
-  - model: gpt-oss-20b
-    env: [m_arc, medcalc_bench, medxpertqa]
+```toml
+model = "openai/gpt-4.1-mini"
+save_results = true
+output_dir = "runs/evals"
+
+[[eval]]
+env_id = "medqa"
+num_examples = 25
+rollouts_per_example = 1
+env_args = { shuffle_answers = true, shuffle_seed = 1618 }
 ```
 
 ```bash
 # Run the batch
-uv run medarc-eval bench --config configs/job-gpt-oss-20b.yaml
+uv run medarc-eval bench --config configs/eval/medarc-mcq.toml
 
 # Preview without executing
-uv run medarc-eval bench --config configs/job-gpt-oss-20b.yaml --dry-run
+uv run medarc-eval bench --config configs/eval/medarc-mcq.toml --dry-run
 ```
 
-Batch mode supports automatic resume, job manifests, and matrix sweeps for parameter grids. See the [batch mode documentation](docs/medarc-eval-bench.md) for config file format, resume/restart options, and advanced features.
+Bench mode resumes matching deterministic result directories and supports
+`[[ablation]]` sweeps for parameter grids. The removed YAML job/manifest runner
+is documented only in the migration notes in the [bench mode docs](docs/medarc-eval-bench.md).
 
-### Matrix Sweeps
+### Ablation Sweeps
 
-Environment configs support matrix expansion for parameter grid runs:
+Use upstream TOML ablations for parameter grid runs:
 
-```yaml
-- id: medconceptsqa
-  module: medconceptsqa
-  num_examples: -1
-  env_args:
-    shuffle_answers: true
-  matrix:
-    difficulty: [easy, medium, hard]
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-{difficulty}-s{shuffle_seed}"
+```toml
+[[ablation]]
+env_id = "medconceptsqa"
+num_examples = -1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "medium", "hard"]
+shuffle_seed = [1618, 9331]
 ```
 
-This expands into six variants (`medconceptsqa-base-easy-s1618`, …). See [batch mode docs](docs/medarc-eval-bench.md) for full details on matrix expansion, exclusions, and split config files.
+This expands into six deterministic variant directories under
+`runs/evals/<model>/medconceptsqa/`. See [bench mode docs](docs/medarc-eval-bench.md)
+for details.
 
 ## Processing and Win Rates
 
@@ -197,7 +198,7 @@ After running benchmarks, convert results to parquet and compute model compariso
 
 ```bash
 # Process raw outputs to parquet
-uv run medarc-eval process
+uv run medarc-eval process --runs-dir runs/evals
 
 # Compute HELM-style win rates
 uv run medarc-eval winrate
diff --git a/configs/eval/README.md b/configs/eval/README.md
index f65809cf..77159bbc 100644
--- a/configs/eval/README.md
+++ b/configs/eval/README.md
@@ -5,6 +5,12 @@ and `[[ablation]]` sweeps intentionally keep the upstream environment id stable;
 `medarc-eval bench` writes deterministic variant directories for differing
 `env_args` and `sampling_args`.
 
+```bash
+medarc-eval bench --config configs/eval/smoke.toml --dry-run
+medarc-eval bench --config configs/eval/medarc-all.toml
+medarc-eval process --runs-dir runs/evals --output-dir runs/processed
+```
+
 Per-environment `[tool.verifiers.eval]` defaults are read from editable installs
 where the environment `pyproject.toml` is discoverable next to the module. Wheel
 installs may ignore those defaults unless the package includes `pyproject.toml`,
diff --git a/docs/README.md b/docs/README.md
index ff72a226..a8a1b189 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,6 +1,6 @@
 # medarc-verifiers
 
-Utilities and CLI for running medical LLM benchmarks with [verifiers](https://github.com/primeintellect-ai/verifiers). Provides batch orchestration, result processing, and shared building blocks for authoring environments.
+Utilities and CLI for running medical LLM benchmarks with [verifiers](https://github.com/primeintellect-ai/verifiers). Provides TOML bench execution, result processing, and shared building blocks for authoring environments.
 
 ## Install
 
@@ -17,7 +17,7 @@ Environments are installed separately via `prime env install <owner/env>` (from
 | Command | Description |
 |---------|-------------|
 | `medarc-eval <ENV>` | Run a single benchmark; env-specific flags inferred from `load_environment()` |
-| `medarc-eval bench` | Run multiple model × environment jobs from a YAML config, with resume support |
+| `medarc-eval bench` | Run upstream TOML eval configs with deterministic MedARC output paths |
 | `medarc-eval process` | Convert raw outputs to analysis-ready parquet |
 | `medarc-eval winrate` | Compute HELM-style win rates across models |
 
diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index c32e313e..989f6699 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -1,300 +1,183 @@
-# Batch Mode
+# TOML Bench Mode
 
-Run multiple benchmarks across multiple models using a configuration file. Batch mode handles job scheduling, progress tracking, and automatic resume.
+`medarc-eval bench` runs upstream `verifiers` TOML eval configs sequentially with
+MedARC-specific deterministic output paths. It is the supported path for
+systematic local benchmark runs.
 
-Each job invokes the verifiers [`vf-eval`](https://github.com/primeintellect-ai/verifiers) evaluation loop under the hood, with configuration-driven environment and sampling arguments.
+The old MedARC YAML benchmark runner has been removed. `bench --config` now
+accepts `.toml` files only.
 
 ## Quick Start
 
 ```bash
-# Run all jobs from config
-medarc-eval bench --config configs/job-gpt-oss-20b.yaml
+# Preview the repository smoke config
+medarc-eval bench --config configs/eval/smoke.toml --dry-run
 
-# Preview what would run
-medarc-eval bench --config configs/job-gpt-oss-20b.yaml --dry-run
+# Run the MCQ production suite
+medarc-eval bench --config configs/eval/medarc-mcq.toml
 
-# Force all jobs to use a specific API endpoint
-medarc-eval bench --config configs/job-gpt-oss-20b.yaml --api-base-url http://127.0.0.1:8000/v1
+# Run the aggregate suite against a local OpenAI-compatible server
+medarc-eval bench \
+  --config configs/eval/medarc-all.toml \
+  --api-base-url http://127.0.0.1:8000/v1 \
+  --provider local \
+  --model openai/my-local-model
 ```
 
-## Writing a Config File
-
-A minimal config defines models and which benchmarks to run:
-
-```yaml
-name: gpt-oss-20b-med
-
-models:
-  gpt-oss-20b:
-    model: openai/gpt-oss-20b
-    api_base_url: http://localhost:8000/v1
-    sampling_args:
-      temperature: 1.0
-      top_p: 1.0
-      top_k: 0
-      reasoning_effort: medium
-
-jobs:
-  - model: gpt-oss-20b
-    env:
-      - m_arc
-      - medcalc_bench
-      - medxpertqa
-```
-
-This creates 3 jobs: gpt-oss-20b evaluated on m_arc, medcalc_bench, and medxpertqa.
-
-### Config Structure
-
-| Field | Description |
-|-------|-------------|
-| `name` | Human-readable run name |
-| `output_dir` | Where to save results (default: `runs/raw`) |
-| `models` | Map of model ID → model configuration |
-| `jobs` | List of model + environment combinations to run |
-
-### Model Configuration
-
-```yaml
-models:
-  gpt-oss-20b:
-    model: openai/gpt-oss-20b       # Model identifier
-    api_base_url: http://localhost:8000/v1  # API endpoint (local or remote)
-    api_key_var: OPENAI_API_KEY     # Optional: env var for API key
-    max_concurrent: 10              # Optional: parallel request limit
-    timeout: 120                    # Optional: request timeout (seconds)
-    sampling_args:
-      temperature: 1.0
-      top_p: 1.0
-      reasoning_effort: medium
-```
+Repository suite configs live in `configs/eval/`:
 
-### Runtime API Base URL Override
+| Config | Purpose |
+|--------|---------|
+| `smoke.toml` | Small smoke test used by CLI tests |
+| `medarc-mcq.toml` | Multiple-choice benchmark suite |
+| `medarc-judge.toml` | Judge/free-form benchmark suite |
+| `medarc-all.toml` | Aggregate production suite |
 
-Use `--api-base-url` to override `models.*.api_base_url` for all jobs at runtime:
+## Config Format
 
-```bash
-medarc-eval bench --config my-config.yaml --api-base-url http://127.0.0.1:8000/v1
-```
+Bench configs use upstream `verifiers` TOML semantics: top-level defaults plus
+one or more `[[eval]]` blocks. MedARC adds deterministic output planning around
+the resolved evals; it does not use YAML `models`, `envs`, or `jobs` sections.
 
-### Environment Configuration
+```toml
+model = "openai/gpt-4.1-mini"
+save_results = true
+output_dir = "runs/evals"
 
-Benchmarks are configured in `configs/envs/`. Each file defines one or more environment variants:
+[[eval]]
+env_id = "medqa"
+num_examples = 25
+rollouts_per_example = 1
+env_args = { shuffle_answers = true, shuffle_seed = 1618 }
+sampling_args = { temperature = 0.0 }
 
-```yaml
-# configs/envs/medqa.yaml
-- id: medqa
-  module: medqa
-  num_examples: -1              # -1 = all examples
-  rollouts_per_example: 1
-  env_args:
-    shuffle_answers: true
-    shuffle_seed: 1618
+[[eval]]
+env_id = "pubmedqa"
+num_examples = 25
+rollouts_per_example = 1
 ```
 
-#### Matrix Sweeps
+Per-environment defaults can also live in an environment package
+`pyproject.toml` under `[tool.verifiers.eval]`. Production suite configs keep
+explicit `num_examples` and `rollouts_per_example` values so they remain stable
+across editable and wheel installs.
 
-Run the same benchmark with different parameter combinations:
+## Ablations and Variants
 
-```yaml
-- id: medqa
-  module: medqa
-  num_examples: -1
-  env_args:
-    shuffle_answers: true
-  matrix:
-    shuffle_seed: [1618, 9331, 2718]
-  matrix_id_format: "{base}-seed{shuffle_seed}"
-```
+Use upstream `[[ablation]]` tables to sweep values. The upstream env id stays
+unchanged, and MedARC writes each differing config to a deterministic variant
+directory.
 
-This creates three variants: `medqa-seed1618`, `medqa-seed9331`, `medqa-seed2718`.
+```toml
+model = "openai/gpt-4.1-mini"
+save_results = true
+output_dir = "runs/evals"
 
-## Resume and Restart
-
-Batch mode automatically tracks job status and can resume interrupted runs.
-
-### Automatic Resume (Default)
-
-When you re-run the same config, completed jobs are skipped:
-
-```bash
-# First run - runs all jobs
-medarc-eval bench --config my-config.yaml
+[[ablation]]
+env_id = "medqa"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true }
 
-# Interrupted, re-run - skips completed jobs
-medarc-eval bench --config my-config.yaml
+[ablation.sweep.env_args]
+shuffle_seed = [1618, 9331]
 ```
 
-### Force Fresh Run
+Example output paths:
 
-```bash
-# Disable auto-resume, create new run directory
-medarc-eval bench --config my-config.yaml --no-auto-resume
-
-# Re-run everything, even completed jobs
-medarc-eval bench --config my-config.yaml --force
+```text
+runs/evals/openai-gpt-4.1-mini/medqa/env_args.shuffle_seed-1618/
+runs/evals/openai-gpt-4.1-mini/medqa/env_args.shuffle_seed-9331/
 ```
 
-### Restart from Previous Run
-
-Copy completed jobs from an old run to seed a new one:
+Non-variant evals write to `runs/evals/<model>/<env>/`.
 
-```bash
-medarc-eval bench --config updated-config.yaml --restart old-run-id
-```
+## Resume and Force
 
-### Re-run Specific Environments
+Bench writes each eval to a deterministic result directory and stores a narrow
+MedARC config fingerprint in `metadata.json`. Re-running the same TOML config
+resumes the same directory when the fingerprint matches.
 
 ```bash
-# Re-run only medqa jobs (keep other completed jobs)
-medarc-eval bench --config my-config.yaml --forced medqa
+# Resume matching deterministic outputs
+medarc-eval bench --config configs/eval/medarc-all.toml
 
-# Re-run multiple environments
-medarc-eval bench --config my-config.yaml --forced medqa,pubmedqa
+# Archive existing deterministic outputs and rerun
+medarc-eval bench --config configs/eval/medarc-all.toml --force
 ```
 
-## Common Flags
-
-### Job Selection
-
-| Flag | Description |
-|------|-------------|
-| `--config PATH` | **Required.** Path to config YAML |
-| `--endpoints-path PATH` | Endpoint registry path (default: `configs/endpoints.toml`) |
-| `--job-id ID` | Run only specific job(s) by ID (repeatable) |
-| `--dry-run` | Show plan without executing |
-
-### Output Control
-
-| Flag | Description |
-|------|-------------|
-| `--output-dir PATH` | Override output directory |
-| `--run-id ID` | Force specific run directory name |
-| `--name NAME` | Override run name in manifest |
+Fingerprint checks protect semantic benchmark identity such as `env_id`,
+`env_args`, and normalized sampling args. Operational details such as host URL,
+timeout, key variable, and concurrency do not define the benchmark identity.
 
-### Override All Jobs
+## Common Flags
 
 | Flag | Description |
 |------|-------------|
-| `--env-args JSON` | Override environment args for all jobs |
-| `--sampling-args JSON` | Override sampling args for all jobs |
-| `--max-concurrent N` | Override concurrency for all jobs |
-| `--timeout SEC` | Override timeout for all jobs |
-| `--include-usage` / `--no-include-usage` | Enable/disable usage reporting (auto-detected for Prime Inference) |
-
-### Prime Inference
-
-When using Prime Inference (`https://api.pinference.ai/api/v1`), the CLI automatically:
-- Uses `PRIME_API_KEY` for authentication (if set)
-- Adds `X-Prime-Team-ID` header from the `PRIME_TEAM_ID` env var
-- Enables usage reporting in API requests
-
-Just set the environment variables and the config stays simple:
+| `--config PATH` | Required path to an upstream TOML eval config |
+| `--dry-run` | Resolve evals and print the deterministic plan |
+| `--force` | Archive existing deterministic output and rerun |
+| `--output-dir PATH` | Override the config output directory |
+| `--env-dir PATH` | Directory containing local environments |
+| `--endpoints-path PATH` | Endpoint registry path, default `configs/endpoints.toml` |
+| `--api-base-url URL` | Override API base URL for every eval |
+| `--api-key-var NAME` | Override API key environment variable |
+| `--provider NAME` | Override upstream provider shorthand |
+| `--model MODEL` | Override model for every eval |
+| `--eval-index N` | Run one resolved eval by 1-based index |
+| `--start-at N` / `--stop-after N` | Run a contiguous 1-based eval range |
+| `--continue-on-error` | Continue after a failed eval |
+| `--env-arg KEY=VALUE` / `--env-args JSON` | Apply environment arg overrides |
+| `--sampling-arg KEY=VALUE` / `--sampling-args JSON` | Apply sampling arg overrides |
+| `--max-concurrent N` | Override max concurrency for every eval |
+| `--timeout SEC` | Override request timeout for every eval |
+| `--max-retries N` | Override upstream rollout retries for every eval |
+| `--sleep SEC` | Sleep after each eval |
+
+## Prime Inference
+
+When `--api-base-url` or a config points at Prime Inference
+(`https://api.pinference.ai/api/v1`), MedARC applies the same Prime helpers used
+by single-run mode:
+
+- `PRIME_API_KEY` is preferred when available.
+- `X-Prime-Team-ID` is added from `PRIME_TEAM_ID`.
+- Usage reporting is enabled unless `MEDARC_INCLUDE_USAGE=false` is set.
 
 ```bash
-export PRIME_API_KEY=your-api-key
-export PRIME_TEAM_ID=your-team-id
-```
-
-```yaml
-models:
-  my-model:
-    model: openai/gpt-5-nano
-    api_base_url: https://api.pinference.ai/api/v1
-```
+export PRIME_API_KEY=...
+export PRIME_TEAM_ID=...
 
-Manual configuration is only needed to override auto-detection:
-
-```yaml
-models:
-  my-model:
-    model: openai/gpt-5-nano
-    api_base_url: https://api.pinference.ai/api/v1
-    api_key_var: PRIME_API_KEY
-    headers:
-      X-Prime-Team-ID: override-team-id
-    sampling_args:
-      extra_body:
-        usage:
-          include: false  # disable usage reporting
+medarc-eval bench \
+  --config configs/eval/medarc-mcq.toml \
+  --api-base-url https://api.pinference.ai/api/v1
 ```
 
-### Endpoints Registry Migration (`endpoints.py` -> `endpoints.toml`)
+## Processing Outputs
 
-Batch mode now defaults `--endpoints-path` to `configs/endpoints.toml`.
-
-If your project still uses a Python registry, pass it explicitly:
+After a TOML bench run, process the deterministic eval outputs:
 
 ```bash
-medarc-eval bench --config my-config.yaml --endpoints-path configs/endpoints.py
-```
-
-## Output Structure
-
-```
-runs/raw/<run_id>/
-├── run_manifest.json           # Run metadata, job status, checksums
-├── <job_id>/
-│   ├── results.jsonl           # Per-example results
-│   ├── summary.json            # Aggregate metrics
-│   └── metadata.json           # Job configuration snapshot
-└── <job_id>/
-    └── ...
+medarc-eval process --runs-dir runs/evals --output-dir runs/processed
+medarc-eval winrate --processed-dir runs/processed
 ```
 
-The manifest tracks:
-- Job status (pending, running, completed, failed)
-- Configuration checksums for resume detection
-- Timing information
-- Output paths
-
-## Example Workflows
-
-### Evaluate Multiple Models on Core Benchmarks
-
-```yaml
-name: model-comparison
-
-models:
-  gpt-oss-20b:
-    model: openai/gpt-oss-20b
-    api_base_url: http://192.168.1.152:8000/v1
-    sampling_args:
-      temperature: 1.0
-      reasoning_effort: medium
-
-  gpt-oss-20b-low:
-    model: openai/gpt-oss-20b
-    api_base_url: http://192.168.1.152:8000/v1
-    sampling_args:
-      temperature: 0.7
-      reasoning_effort: low
-
-jobs:
-  - model: gpt-oss-20b
-    env: [m_arc, medcalc_bench, medxpertqa]
-  - model: gpt-oss-20b-low
-    env: [m_arc, medcalc_bench, medxpertqa]
-```
-
-### Override Parameters at Runtime
+Processing still supports legacy `runs/raw/<run_id>/run_manifest.json` outputs
+for migration, but new bench runs should use `runs/evals`.
 
-```bash
-# Lower concurrency for rate-limited API
-medarc-eval bench --config my-config.yaml --max-concurrent 5
+## Migrating from the Removed YAML Runner
 
-# Change temperature for all jobs
-medarc-eval bench --config my-config.yaml --sampling-args '{"temperature": 0.5}'
+Move old YAML `models` entries into top-level TOML defaults or explicit
+`[[eval]]` blocks. Move old `envs` and matrix variants into repeated `[[eval]]`
+blocks or upstream `[[ablation]]` sweeps.
 
-# Enable usage reporting for all jobs
-medarc-eval bench --config my-config.yaml --include-usage
-
-# Disable usage reporting (overrides auto-detection for Prime Inference)
-medarc-eval bench --config my-config.yaml --no-include-usage
-```
+Removed YAML-runner concepts no longer exist in `medarc-eval bench`:
 
-## Next Steps
+- YAML `models`, `envs`, and `jobs` schemas
+- `run_manifest.json` creation for new bench runs
+- `--run-id`, `--restart`, `--auto-resume`, `--no-auto-resume`
+- `--job-id`, `--forced`, `--on-complete`
+- custom YAML job status and manifest planning
 
-After batch runs complete:
-1. [Process results](medarc-eval-process.md) into parquet format
-2. [Compute win rates](medarc-eval-winrate.md) to compare models
+Old raw outputs remain processable through the legacy manifest reader, so
+historical runs do not need to be converted before processing.
diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md
index 1b57c7ba..e3a5caa1 100644
--- a/docs/medarc-eval-process.md
+++ b/docs/medarc-eval-process.md
@@ -5,11 +5,11 @@ Convert raw benchmark outputs into analysis-ready parquet files. This step prepa
 ## Quick Start
 
 ```bash
-# Process all completed jobs (uses defaults)
-medarc-eval process
+# Process outputs from the current TOML bench runner
+medarc-eval process --runs-dir runs/evals --output-dir runs/processed
 
-# Specify directories explicitly
-medarc-eval process --runs-dir runs/raw --output-dir runs/processed
+# Process legacy manifest outputs (default runs dir)
+medarc-eval process
 
 # Preview what would be processed
 medarc-eval process --dry-run
@@ -17,8 +17,8 @@ medarc-eval process --dry-run
 
 ## What Processing Does
 
-1. **Discovers** jobs in `runs/raw/` and filters by manifest status (default: `completed`)
-2. **Extracts** results from each job's output files
+1. **Discovers** eval outputs (for example under `runs/evals/`) or legacy manifest jobs (for example under `runs/raw/`) in the directory given by `--runs-dir`
+2. **Extracts** results from each eval output directory
 3. **Normalizes** data into a fixed output schema
 4. **Writes** parquet files organized by model and environment
 5. **Creates** an index (`env_index.json`) for downstream tools
@@ -43,7 +43,7 @@ On-disk model and env path components are slugified, so filenames may not exactl
 
 | Flag | Description | Default |
 |------|-------------|---------|
-| `--runs-dir PATH` | Directory containing raw runs | `runs/raw` |
+| `--runs-dir PATH` | Directory containing raw run outputs. Use `runs/evals` for new TOML bench runs. | `runs/raw` |
 | `--output-dir PATH` | Where to write processed files | `runs/processed` |
 | `--max-workers N` | Parallel worker processes | 4 |
 | `--dry-run` | Show what would be processed | - |
@@ -55,9 +55,15 @@ On-disk model and env path components are slugified, so filenames may not exactl
 
 ### By Completion Status
 
-By default, `medarc-eval process` only selects jobs whose manifest status is `completed`.
+For current TOML bench outputs, processing discovers valid eval result
+directories under `runs/evals` and reads their `metadata.json`.
 
-Note: successful jobs are written to `run_manifest.json` with `status: completed`.
+For legacy YAML-runner outputs, `medarc-eval process` reads
+`runs/raw/<run_id>/run_manifest.json` and only selects jobs whose manifest
+status is `completed` by default.
+
+Note: successful legacy jobs are written to `run_manifest.json` with
+`status: completed`.
 
 To override that default, pass one or more explicit status filters:
 
@@ -75,12 +81,18 @@ medarc-eval process --max-results-missing-pct 2.5
 medarc-eval process --max-results-missing-pct 100
 ```
 
-This gate uses manifest job metadata only:
+For TOML bench outputs, this gate uses `metadata.json` values for expected rows
+and the observed `results.jsonl` row count:
+
+- `expected_rows = num_examples * rollouts_per_example`
+- `observed_rows = results.jsonl row count`
+
+For legacy manifest outputs, the same gate uses manifest job fields:
 
 - `expected_rows = num_examples * rollouts_per_example`
 - `observed_rows = row_count`
 
-It is computed per selected job record and enforced only on the latest selected run for each processed model/environment output. It does not use manifest `summary.completed` / `summary.total`, and it does not fall back to older runs if the latest one is too incomplete.
+It is computed per selected job record and enforced only on the latest selected run for each processed model/environment output. For legacy manifests, it does not use manifest `summary.completed` / `summary.total`, and it does not fall back to older runs if the latest one is too incomplete.
 
 Selected records with missing `results.jsonl` fail processing immediately.
 
@@ -106,7 +118,7 @@ Store common options in a YAML file:
 
 ```yaml
 # process-config.yaml
-runs_dir: runs/raw
+runs_dir: runs/evals
 
 process:
   dir: processed
@@ -130,7 +142,7 @@ CLI flags override config values.
 
 Supported config schema for `medarc-eval process`:
 
-- Top-level `runs_dir`: raw run root.
+- Top-level `runs_dir`: raw run root. Use `runs/evals` for new TOML bench outputs and `runs/raw` for legacy manifest outputs.
 - Top-level `process:`: process-specific defaults.
 - Optional top-level `winrate:`: embedded post-process winrate step.
 - Optional top-level `hf:`: shared HF settings. For embedded winrate uploads, use `hf.winrate_dir`.
@@ -143,7 +155,7 @@ Path shortcuts:
 Example:
 
 ```yaml
-runs_dir: runs/raw
+runs_dir: runs/evals
 
 process:
   dir: processed
@@ -163,7 +175,7 @@ Sync processed datasets to/from the Hugging Face Hub:
 
 ```yaml
 # process-config.yaml
-runs_dir: runs/raw
+runs_dir: runs/evals
 process:
   dir: processed
 
@@ -225,10 +237,10 @@ This runs `medarc-eval winrate` automatically after processing completes when th
 
 ```bash
 # 1. Run benchmarks
-medarc-eval bench --config my-eval.yaml
+medarc-eval bench --config configs/eval/medarc-mcq.toml
 
 # 2. Process results
-medarc-eval process
+medarc-eval process --runs-dir runs/evals
 
 # 3. Compute win rates
 medarc-eval winrate
@@ -249,8 +261,8 @@ medarc-eval process \
 ### Incremental Updates
 
 ```bash
-# Process only new runs (default behavior)
-medarc-eval process
+# Process only new TOML bench outputs
+medarc-eval process --runs-dir runs/evals
 
 # env_index.json tracks what's already processed
 ```
@@ -280,12 +292,13 @@ When both flags are present, processing only rebuilds outputs that match both fi
 
 Check that:
 1. `--runs-dir` points to the correct location
-2. Runs have completed (check `run_manifest.json` `jobs[*].status`)
-3. Use `--status pending` or `--status running` to include non-completed jobs
+2. For TOML bench outputs, each eval directory contains `results.jsonl` and `metadata.json`
+3. For legacy manifest outputs, runs have completed (check `run_manifest.json` `jobs[*].status`)
+4. Use `--status pending` or `--status running` to include non-completed legacy jobs
 
 ### Missing data in output
 
-By default, only jobs with `completed` status are included. In addition, `--max-results-missing-pct` fails if a selected latest job record is missing more than 2.5% of its expected `results.jsonl` rows, using manifest job fields:
+By default, current TOML bench outputs are selected from valid eval directories, while legacy manifest outputs include only jobs with `completed` status. In addition, `--max-results-missing-pct` fails if a selected latest job record is missing more than 2.5% of its expected `results.jsonl` rows. TOML bench outputs use eval metadata plus the observed JSONL row count; legacy manifest outputs use manifest job fields:
 
 - `row_count`
 - `num_examples`
diff --git a/docs/medarc-eval-winrate.md b/docs/medarc-eval-winrate.md
index 47c28f92..b56e6d02 100644
--- a/docs/medarc-eval-winrate.md
+++ b/docs/medarc-eval-winrate.md
@@ -21,7 +21,7 @@ Win rate computation requires processed parquet files with an `env_index.json`:
 
 ```bash
 # If you haven't processed yet:
-medarc-eval process
+medarc-eval process --runs-dir runs/evals
 ```
 
 ## How Win Rates Work
@@ -103,7 +103,7 @@ The JSON output includes:
 
 ```yaml
 # process-config.yaml
-runs_dir: runs/raw
+runs_dir: runs/evals
 
 process:
   dir: processed
@@ -192,7 +192,7 @@ medarc-eval winrate \
 
 ```yaml
 # process-config.yaml
-runs_dir: runs/raw
+runs_dir: runs/evals
 
 process:
   dir: processed
diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md
index 395d251f..4f4d199c 100644
--- a/docs/medarc-eval.md
+++ b/docs/medarc-eval.md
@@ -2,7 +2,7 @@
 
 `medarc-eval` is a command-line tool for evaluating language models on medical benchmarks. It handles the full pipeline: running benchmarks, processing results, and computing model comparisons.
 
-> **Note:** `medarc-eval <ENV>` and `medarc-eval bench` are wrappers around the [verifiers](https://github.com/primeintellect-ai/verifiers) `vf-eval` command, adding medical-specific environments, batch orchestration, and environment-specific CLI flags inferred from each benchmark's `load_environment()` signature.
+> **Note:** `medarc-eval <ENV>` and `medarc-eval bench` are wrappers around the [verifiers](https://github.com/primeintellect-ai/verifiers) eval flow. Single-run mode adds environment-specific CLI flags inferred from each benchmark's `load_environment()` signature; bench mode runs upstream TOML eval configs sequentially with deterministic MedARC output paths.
 
 ## Quick Start
 
@@ -11,10 +11,10 @@
 medarc-eval medqa -m gpt-4.1-mini -n 25
 
 # Run a batch of benchmarks from a config file
-medarc-eval bench --config configs/job-gpt-oss-20b.yaml
+medarc-eval bench --config configs/eval/smoke.toml
 
 # Process raw results into analysis-ready parquet files
-medarc-eval process
+medarc-eval process --runs-dir runs/evals
 
 # Compute win rates across models
 medarc-eval winrate
@@ -27,7 +27,7 @@ medarc-eval winrate
    (bench or single)       (process)               (winrate)
         |                      |                        |
         v                      v                        v
-    runs/raw/           runs/processed/    runs/processed/winrate/
+    runs/evals/         runs/processed/    runs/processed/winrate/
 ```
 
 ## Commands
@@ -46,8 +46,8 @@ medarc-eval winrate
 medarc-eval medqa -m gpt-4.1-mini -n 50
 
 # Subcommands: keyword comes first
-medarc-eval bench --config configs/my-run.yaml
-medarc-eval process --runs-dir runs/raw
+medarc-eval bench --config configs/eval/medarc-mcq.toml
+medarc-eval process --runs-dir runs/evals
 medarc-eval winrate --processed-dir runs/processed
 ```
 
@@ -70,17 +70,17 @@ medarc-eval longhealth --help
 
 ### Batch Mode (`medarc-eval bench`)
 
-**Best for:** Systematic evaluation across multiple models and benchmarks.
+**Best for:** Systematic evaluation across TOML eval configs.
 
 ```bash
 # Run all jobs defined in config
-medarc-eval bench --config configs/job-gpt-oss-20b.yaml
+medarc-eval bench --config configs/eval/medarc-mcq.toml
 
 # Preview what would run without executing
-medarc-eval bench --config configs/job-gpt-oss-20b.yaml --dry-run
+medarc-eval bench --config configs/eval/medarc-mcq.toml --dry-run
 
 # Force all jobs to use a specific API endpoint
-medarc-eval bench --config configs/job-gpt-oss-20b.yaml --api-base-url http://127.0.0.1:8000/v1
+medarc-eval bench --config configs/eval/medarc-mcq.toml --api-base-url http://127.0.0.1:8000/v1 --provider local
 ```
 
 ### Processing Mode (`medarc-eval process`)
@@ -88,11 +88,11 @@ medarc-eval bench --config configs/job-gpt-oss-20b.yaml --api-base-url http://12
 **Best for:** Preparing results for analysis after batch runs complete.
 
 ```bash
-# Process all completed runs
-medarc-eval process
+# Process current TOML bench outputs
+medarc-eval process --runs-dir runs/evals
 
 # Process specific directory
-medarc-eval process --runs-dir runs/raw --output-dir runs/processed
+medarc-eval process --runs-dir runs/evals --output-dir runs/processed
 ```
 
 ### Win Rate Mode (`medarc-eval winrate`)
@@ -111,15 +111,14 @@ medarc-eval winrate --list-models
 
 ```
 runs/
-├── raw/                          # Raw benchmark outputs (from bench/single-run)
-│   └── <run_id>/
-│       ├── run_manifest.json     # Run metadata and job status
-│       └── <job_id>/             # Per-job results
+├── evals/                        # Raw TOML bench outputs
+│   └── <model>/
+│       └── <env>/
 │           ├── results.jsonl
-│           └── summary.json
+│           └── metadata.json
 ├── processed/                    # Analysis-ready parquet files (from process)
 │   ├── env_index.json            # Dataset inventory
-│   └── <env>/<model>.parquet
+│   └── <model>/<env>.parquet
 └── winrate/                      # Model comparison outputs (from winrate)
     ├── latest.json
     └── latest.csv
@@ -129,7 +128,7 @@ runs/
 
 ```bash
 medarc-eval --help              # General usage
-medarc-eval bench --help        # Batch mode options
+medarc-eval bench --help        # TOML bench options
 medarc-eval process --help      # Processing options
 medarc-eval winrate --help      # Win rate options
 medarc-eval medqa --help        # Environment-specific options
@@ -166,6 +165,6 @@ prime env install owner/environment-name@0.1.3
 ## Detailed Documentation
 
 - [Single-Run Mode](medarc-eval-single-run.md) - Run individual benchmarks with custom options
-- [Batch Mode](medarc-eval-bench.md) - Configure and run systematic evaluations
+- [TOML Bench Mode](medarc-eval-bench.md) - Configure and run systematic evaluations
 - [Processing](medarc-eval-process.md) - Prepare results for analysis
 - [Win Rates](medarc-eval-winrate.md) - Compare models across benchmarks
diff --git a/docs/medarc-orchestrate.md b/docs/medarc-orchestrate.md
index 786a82d2..ba802e3e 100644
--- a/docs/medarc-orchestrate.md
+++ b/docs/medarc-orchestrate.md
@@ -39,7 +39,7 @@ Create a plan YAML listing the job configs you want to orchestrate:
 ```yaml
 name: local-vllm
 job_configs:
-  - configs/eval/job-gpt-oss-20b.toml
+  - configs/eval/local-qwen.toml
 env_file: .env
 gpu_range: "0-3"
 port_range: "8000-8999"
@@ -50,8 +50,7 @@ rerun_failed: false
 ```
 
 Each job config should be an upstream `medarc-eval bench` TOML config with a top-level
-`model` and a top-level `orchestrate` table. Legacy YAML job configs are still loadable
-during migration, but new orchestrated runs should use TOML.
+`model` and a top-level `orchestrate` table.
 
 The `env_file` is a dotenv file that is loaded for every Docker launch. If unset and a repo-level `.env` exists,
 it is used automatically. You can also override it via `--env-file`.
@@ -130,7 +129,7 @@ runtime: pyxis
 Artifacts are written under `outputs/orchestrator/<run_id>/`:
 
 - `summary.json` aggregates task states.
-- per-task folders contain `run_manifest.json`, `serve/` logs, `bench/` outputs, and `result.json`.
+- per-task folders contain orchestrator task state, `serve/` logs, `bench/` outputs, and `result.json`.
 
 ### Runtime behavior
 
@@ -140,8 +139,7 @@ For each task, the orchestrator launches vLLM, waits for readiness, then runs:
 medarc-eval bench --config <job.toml> --api-base-url <allocated-local-url> --provider local
 ```
 
-The bench command exits naturally on completion; the orchestrator no longer passes YAML-runner flags such as
-`--on-complete` or `--restart`.
+The bench command exits naturally on completion; the orchestrator passes TOML bench flags only.
 
 Docker mode:
 
diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index d9f25cd2..66b38d96 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -4,264 +4,231 @@ This is a coding agents guide to `medarc_verifiers/`.
 
 ## What `medarc_verifiers` is
 
-`medarc_verifiers` is the repository’s Python package that wraps and extends the upstream `verifiers` evaluation framework with:
+`medarc_verifiers` wraps and extends the upstream `verifiers` evaluation
+framework with:
 
-- A unified CLI (`medarc-eval`) for running many medical benchmark environments consistently.
-- Batch orchestration with durable run manifests (resume/restart/force).
-- A processing pipeline that converts raw run artifacts into analysis-ready Parquet datasets.
+- A unified CLI (`medarc-eval`) for medical benchmark environments.
+- A TOML bench wrapper for sequential local benchmark runs with deterministic output paths.
+- A processing pipeline that converts raw eval artifacts into analysis-ready Parquet datasets.
 - HELM-style win rate computation across models from processed outputs.
-- Shared building blocks used by environments (parsers, rewards, shuffling utilities, judge helpers).
+- Shared environment utilities for parsers, rewards, shuffling, and judging.
 
-At a high level, everything funnels into a three-stage workflow:
+The current workflow is:
 
-1. **Run** evals (single or batch) → `runs/raw/<run_id>/...`
-2. **Process** raw outputs → `runs/processed/<model>/<env>.parquet` + `env_index.json`
-3. **Winrate** on processed outputs → `runs/processed/winrate/*.json` and `*.csv`
+1. **Run** evals with single-run mode or TOML bench -> `runs/evals/<model>/<env>/...`
+2. **Process** raw outputs -> `runs/processed/<model>/<env>.parquet` plus `env_index.json`
+3. **Winrate** on processed outputs -> `runs/processed/winrate/*.json` and `*.csv`
 
-## Important side effects (auto-installed patches)
+Historical YAML-runner outputs under `runs/raw/<run_id>/...` remain readable by
+`medarc-eval process`, but the YAML benchmark runner itself has been removed.
 
-Importing `medarc_verifiers` installs monkey patches into `verifiers` by default (`medarc_verifiers/__init__.py`):
+## Import Side Effects
 
-- **Judge cache namespacing**: cached judge responses are keyed by `base_url::model` so multi-judge runs don’t collide (`medarc_verifiers/judging/judge_cache_fix.py`).
+Importing `medarc_verifiers` installs monkey patches into `verifiers` by default
+(`medarc_verifiers/__init__.py`):
 
-`token_usage` is now produced by upstream `verifiers` output serialization and is flattened into explicit columns during `medarc-eval process`.
+- **Judge cache namespacing**: cached judge responses are keyed by
+  `base_url::model` so multi-judge runs do not collide
+  (`medarc_verifiers/judging/judge_cache_fix.py`).
 
-## `medarc-eval` CLI: modes and code layout
+`token_usage` is produced by upstream `verifiers` output serialization and is
+flattened into explicit columns during `medarc-eval process`.
+
+## `medarc-eval` CLI
 
 Entry point and router: `medarc_verifiers/cli/main.py`.
 
 It supports:
 
 - **Single-run mode**: `medarc-eval <ENV> ...`
-  - Special rule: the environment name must be the first token.
+  - The environment name must be the first token.
   - Implemented in `medarc_verifiers/cli/_single_run.py`.
-- **Batch mode**: `medarc-eval bench --config <yaml>`
-  - Loads config, expands job matrix, creates/updates a run manifest, then executes jobs.
-  - Implemented across:
-    - Config loading + matrix expansion: `medarc_verifiers/cli/_config_loader.py`
-    - Schemas: `medarc_verifiers/cli/_schemas.py`
-    - Job expansion: `medarc_verifiers/cli/_job_builder.py`
-    - Manifest creation + conflict detection: `medarc_verifiers/cli/_manifest.py`
-    - Resume/restart planning: `medarc_verifiers/cli/_manifest_planner.py`
-    - Execution loop: `medarc_verifiers/cli/_job_executor.py`
+- **TOML bench mode**: `medarc-eval bench --config <config.toml>`
+  - Loads upstream `verifiers` TOML eval configs, expands ablations, plans
+    deterministic output directories, validates MedARC fingerprints, then runs
+    evals sequentially.
+  - Main implementation: `medarc_verifiers/cli/main.py`
+  - Eval config adapter: `medarc_verifiers/cli/verifiers_adapter.py`
+  - Deterministic identity/path helpers: `medarc_verifiers/cli/eval_identity.py`
 - **Processing**: `medarc-eval process ...`
   - Pipeline wiring: `medarc_verifiers/cli/process/pipeline.py`
 - **Win rates**: `medarc-eval winrate ...`
-  - Runner that reads processed datasets and writes results: `medarc_verifiers/cli/winrate/runner.py`
-  - Core computations live in `medarc_verifiers/cli/winrate/api.py`.
+  - Runner: `medarc_verifiers/cli/winrate/runner.py`
+  - Core math: `medarc_verifiers/cli/winrate/api.py`
 
-Shared CLI constants (paths, command strings): `medarc_verifiers/cli/_constants.py`.
+Shared CLI constants live in `medarc_verifiers/cli/_constants.py`.
 
-### How single-run “dynamic env flags” works
+## Dynamic Env Flags
 
-Single-run mode introspects each environment’s `load_environment()` signature (and docstring) to generate argparse flags on the fly:
+Single-run mode introspects each environment's `load_environment()` signature
+and docstring to generate argparse flags dynamically:
 
-- Introspection + validation: `medarc_verifiers/cli/utils/env_args.py`
+- Introspection and validation: `medarc_verifiers/cli/utils/env_args.py`
 
-That’s why `medarc-eval longhealth --help` shows environment-specific flags even though they aren’t hardcoded. For anything too complex for flags, both single/batch support:
+That is why `medarc-eval longhealth --help` shows environment-specific flags
+even though they are not hardcoded. For anything too complex for flags,
+single-run and TOML bench both support:
 
 - `--env-args '{...json...}'`
 - `--env-arg key=value` (repeatable; smart type coercion)
 
-Override parsing helper: `medarc_verifiers/cli/utils/overrides.py`.
-
-## Config + override semantics (batch mode)
-
-Batch configs (YAML) validate into pydantic models in `medarc_verifiers/cli/_schemas.py`. After validation:
-
-- Environment matrices expand into multiple env variants (IDs can be formatted) in `medarc_verifiers/cli/_config_loader.py`.
-- Jobs expand into concrete “model × env variant” runs in `medarc_verifiers/cli/_job_builder.py`.
+Override parsing lives in `medarc_verifiers/cli/utils/overrides.py`.
 
-### `env_args` precedence
+## TOML Bench Config Semantics
 
-`env_args` are merged in layers. Think “low → high priority”:
+Bench configs use upstream `verifiers` TOML shape: top-level defaults plus one
+or more `[[eval]]` entries. Upstream `[[ablation]]` tables expand into repeated
+eval configs. MedARC adds deterministic paths and config-safe resume around the
+resolved upstream eval configs.
 
-1. Environment config `env.env_args` (from `configs/envs/*.yaml`)
-2. Model config `model.env_args`
-3. Model env-specific override `model.env_overrides[...]` (lookup tries: env id → matrix base id → module)
-4. Job-level overrides `job.env_args`
-5. CLI overrides (`--env-args` / `--env-arg`) applied later when building `EvalConfig`
+`env_args` precedence is low to high:
 
-The merge is handled by `medarc_verifiers/cli/utils/env_args.py` (with optional metadata validation).
+1. Environment package `[tool.verifiers.eval]` defaults, when discoverable
+2. TOML top-level defaults
+3. Per-`[[eval]]` values
+4. Expanded `[[ablation]]` values
+5. CLI overrides (`--env-args` / `--env-arg`)
 
-### `sampling_args` precedence and sanitation
+`sampling_args` follow the same TOML -> eval -> ablation -> CLI override model,
+then are sanitized for OpenAI-compatible clients:
 
-`sampling_args` merge from model → job → CLI, and are then sanitized for OpenAI-compatible clients:
-
-- Unknown parameters are moved under `extra_body` so they can be forwarded to compatible servers (e.g., vLLM).
+- Unknown parameters move under `extra_body` for compatible servers such as vLLM.
 - Sanitizer: `medarc_verifiers/utils/sampling_args.py`
-- Merge point: `medarc_verifiers/cli/_eval_builder.py`
+- Merge/adaptation point: `medarc_verifiers/cli/verifiers_adapter.py`
+
+The old YAML `models`, `envs`, `jobs`, matrix expansion, job builder, and
+manifest planner modules have been deleted.
 
-## Endpoints and Prime Inference integration
+## Endpoints and Prime Inference
 
 There are two related concepts:
 
-1. **Endpoint registry** (optional): resolves a model alias to an endpoint URL and key env var.
-   - Loader + cache: `medarc_verifiers/cli/utils/endpoint_utils.py`
-   - CLI default path: `configs/endpoints.toml` (TOML-first, aligned with upstream verifiers)
-   - Legacy Python registries remain usable via explicit `--endpoints-path configs/endpoints.py`.
+1. **Endpoint registry**: optional aliases for endpoint URL and key env var.
+   - Loader and cache: `medarc_verifiers/cli/utils/endpoint_utils.py`
+   - CLI default path: `configs/endpoints.toml`
 2. **Prime Inference overrides**:
-   - Adds `X-Prime-Team-ID` header (if `PRIME_TEAM_ID` is set and base URL is Prime Inference).
-   - Optionally injects `extra_body.usage.include = true` for usage reporting.
+   - Adds the `X-Prime-Team-ID` header when `PRIME_TEAM_ID` is set.
    - Selects `PRIME_API_KEY` when available for Prime Inference endpoints.
+   - Enables usage reporting unless disabled by `MEDARC_INCLUDE_USAGE=false`.
    - Implementation: `medarc_verifiers/utils/prime_inference.py`
 
 Relevant env vars:
 
-- `OPENAI_API_KEY` (default model key var)
-- `PRIME_API_KEY`, `PRIME_TEAM_ID` (Prime Inference)
-- `MEDARC_INCLUDE_USAGE` (force usage reporting true/false globally)
-
-Programmatic usage (build headers/sampling overrides for a base URL):
+- `OPENAI_API_KEY`
+- `PRIME_API_KEY`, `PRIME_TEAM_ID`
+- `MEDARC_INCLUDE_USAGE`
 
-```python
-from medarc_verifiers.utils.prime_inference import prime_inference_overrides
+## Resume and Deterministic Paths
 
-headers, sampling_overrides, api_key_var = prime_inference_overrides(base_url)
-```
+TOML bench writes eval outputs under deterministic directories:
 
-### Judge defaults and judge API keys
+- Non-variant evals: `runs/evals/<model>/<env>/`
+- Variant evals: `runs/evals/<model>/<env>/<variant_id>/`
 
-Judging defaults are centralized and provider-tuned:
+Before resuming an existing deterministic directory, bench validates the
+MedARC-specific config fingerprint in `metadata.json`. The fingerprint covers
+semantic benchmark identity such as `env_id`, `env_args`, and normalized
+sampling args. It excludes operational fields such as endpoint URL, timeout,
+API key variable, and concurrency.
 
-- `medarc_verifiers/utils/judge_helpers.py`
+`medarc_verifiers/cli/_manifest.py` now only contains the legacy manifest schema
+needed by processing to read historical `runs/raw` outputs.
 
-Key env vars:
+## Raw Outputs
 
-- `JUDGE_API_KEY` (preferred for judge calls)
-- fallback to `PRIME_API_KEY` (if judging via Prime Inference) or `OPENAI_API_KEY`.
+TOML bench outputs include:
 
-## Resume, restart, and manifests (batch mode)
-
-Batch mode writes `runs/raw/<run_id>/run_manifest.json` (manifest v3).
-
-- Manifest schema + update methods: `medarc_verifiers/cli/_manifest.py`
-- Planning which jobs to run vs reuse: `medarc_verifiers/cli/_manifest_planner.py`
+- `results.jsonl`: per-example rollouts
+- `metadata.json`: eval configuration and metrics snapshot
 
-Important concepts:
+Both single-run mode and the TOML bench code in `medarc_verifiers/cli/main.py`
+execute evals via `verifiers.utils.eval_utils.run_evaluation()`.
 
-- A **job** is a resolved combination of model + environment variant + args (plus sampling args).
-- Auto-resume tries to find the newest run matching the config checksum and skip completed jobs.
-- Restart can “seed” a new run from an old run, reusing outputs when job signatures match.
-- Conflict detection is conservative for most fields, but treats some model fields as “resume tolerant” (e.g., base URLs/timeouts) so you can move between providers without being blocked.
+## Processing Pipeline
 
-## Raw outputs (what eval produces)
+Docs: `docs/medarc-eval-process.md`.
 
-Raw outputs are expected under `runs/raw/<run_id>/<job_id>/` and include:
+Entry point: `medarc_verifiers/cli/process/pipeline.py`.
 
-- `results.jsonl`: per-example rollouts
-- `summary.json`: aggregated job metrics
-- `metadata.json`: job configuration snapshot (env/model/sampling args, etc.)
+Processing:
 
-The runner executes via `verifiers.utils.eval_utils.run_evaluation()` (called from `medarc_verifiers/cli/_single_run.py` and `medarc_verifiers/cli/_job_executor.py`).
+1. Discovers TOML bench outputs from `runs/evals` and legacy manifest outputs
+   from `runs/raw`.
+2. Normalizes metadata from `metadata.json` and, for legacy outputs, manifest
+   fields.
+3. Loads rows from `results.jsonl`, drops large prompt/completion fields, and
+   flattens `token_usage`.
+4. Aggregates rows per model and environment, preserving variant ids.
+5. Writes Parquet files plus `env_index.json` and `dataset_infos.json`.
 
-## Processing pipeline (raw → parquet)
+Important modules:
 
-Docs: `docs/medarc-eval-process.md`.
+- Discovery: `medarc_verifiers/cli/process/discovery.py`
+- Metadata normalization: `medarc_verifiers/cli/process/metadata.py`
+- Row loading: `medarc_verifiers/cli/process/rows.py`
+- Aggregation: `medarc_verifiers/cli/process/aggregate.py`
+- Writing/indexing: `medarc_verifiers/cli/process/writer.py`,
+  `medarc_verifiers/cli/process/env_index.py`
 
-Entry point: `medarc_verifiers/cli/process/pipeline.py` (via `run_process()`).
-
-### What processing does
-
-1. **Discover** job outputs from `runs/raw` by reading run manifests:
-   - `medarc_verifiers/cli/process/discovery.py`
-2. **Normalize metadata** by merging manifest fields with `metadata.json`:
-   - `medarc_verifiers/cli/process/metadata.py`
-3. **Handle rollouts**:
-   - MedARC sometimes “fakes” multiple rollouts by running the same base environment multiple times with different settings (e.g., different seeds).
-   - These fake rollouts are identified by a rollout suffix in the **manifest env id** like `env-a-rollout7` or `env-a-r7` (fallback: parse the results directory name).
-   - This suffix-derived rollout index is only used when rollouts are faked this way. Native verifiers rollouts (below) use the per-row JSONL field.
-   - `medarc_verifiers/cli/process/rollout.py`
-4. **Load rows from `results.jsonl`**:
-   - Always drops large fields (`prompt`, `completion`).
-   - Allows selecting extra per-env columns into a JSON-encoded `extras` column.
-   - If the JSONL provides a per-row `rollout_index` (native verifiers multi-rollout runs), it is treated as authoritative and preserved.
-   - If `rollout_index` is missing but the JSONL contains multiple rows per `example_id`, computes a data-driven `rollout_index` based on occurrence count.
-   - Flattens `token_usage` into explicit columns like `model_token_total`, `judge_cost`, etc.
-   - `medarc_verifiers/cli/process/rows.py`
-5. **Aggregate** rows per `(model_id, base_env_id)` and union schemas:
-   - `medarc_verifiers/cli/process/aggregate.py`
-   - When aggregating fake rollouts (manifest env ids include rollout suffixes), ensures every row has a `rollout_index` (derived from the suffix if missing) and normalizes indices to `0..K-1` within the dataset.
-   - When aggregating native verifiers rollouts (no rollout suffixes), preserves `rollout_index` values as provided by `results.jsonl` (no normalization).
-6. **Write Parquet**:
-   - Output path is `<processed_dir>/<slug(model_id)>/<slug(env_id)>.parquet`.
-   - Output columns are restricted to a fixed allowlist schema for downstream compatibility.
-   - Adds exporter metadata under a Parquet schema metadata key.
-   - Writes `env_index.json` (v2) and `dataset_infos.json` for HF datasets UX.
-   - `medarc_verifiers/cli/process/writer.py`, `medarc_verifiers/cli/process/env_index.py`
-
-### Delta processing and HF baselines
-
-Processing can use `env_index.json` to do incremental updates (delta processing). It also supports pulling/pushing processed artifacts to/from Hugging Face:
-
-- HF baseline management (download/copy policies): `medarc_verifiers/cli/process/workspace.py`
-- HF sync operations: `medarc_verifiers/cli/hf/sync.py`
-
-## Win rates (processed parquet → comparisons)
+## Win Rates
 
 Docs: `docs/medarc-eval-winrate.md`.
 
-`medarc-eval winrate` reads dataset inventory from `env_index.json`, averages rollouts per `(example_id, model_id)`, then computes pairwise model comparisons.
+`medarc-eval winrate` reads dataset inventory from `env_index.json`, averages
+rollouts per `(example_id, model_id)`, and computes pairwise model comparisons.
 
-- Dataset discovery via `env_index.json`: `medarc_verifiers/cli/winrate/runner.py`
-- Core math + weighting policies: `medarc_verifiers/cli/winrate/api.py`
-- Outputs:
-  - timestamped `winrates-<timestamp>.json` and `.csv`
-  - `latest.json` and `latest.csv`
-  - JSON shape is model-centric: top-level `models` and `datasets`
-  - CSV contains aggregate winrates plus per-dataset average rewards, not pairwise `vs_*` columns
+- Dataset discovery: `medarc_verifiers/cli/winrate/runner.py`
+- Core math and weighting policies: `medarc_verifiers/cli/winrate/api.py`
+- Outputs: timestamped `winrates-<timestamp>.json` / `.csv` plus
+  `latest.json` / `latest.csv`
 
-## Shared building blocks used by environments
+## Environment Utilities
 
-These utilities are frequently imported by environment packages under `environments/*`:
+Utilities frequently imported by environment packages under `environments/*`:
 
 - Prompts and answer format constants: `medarc_verifiers/prompts.py`
-- Parsers:
-  - XML parser (supports raw string or chat messages): `medarc_verifiers/parsers/xml_parser.py`
-  - JSON parser (field alternatives, optional pydantic schema validation, “format reward”): `medarc_verifiers/parsers/json_parser.py`
-- Rewards:
-  - Robust MCQ grading with CoT/anchored patterns + answer-text fallback: `medarc_verifiers/rewards/multiple_choice_accuracy.py`
-  - Normalize judge dimension scores (1–5 → 0–1): `medarc_verifiers/rewards/normalize_helm_reward.py`
-- MCQ shuffling with deterministic seeding and “anchor option” preservation:
-  - Skips shuffling entirely if options reference other labels (“A or B”, “Both A and C”), to avoid corrupting the question.
-  - `medarc_verifiers/utils/randomize_multiple_choice.py`
+- XML parser: `medarc_verifiers/parsers/xml_parser.py`
+- JSON parser: `medarc_verifiers/parsers/json_parser.py`
+- MCQ grading: `medarc_verifiers/rewards/multiple_choice_accuracy.py`
+- HELM reward normalization: `medarc_verifiers/rewards/normalize_helm_reward.py`
+- Deterministic MCQ shuffling: `medarc_verifiers/utils/randomize_multiple_choice.py`
+- Judge helpers: `medarc_verifiers/utils/judge_helpers.py`
 
-## Judging and multi-judge support
+## Judging and Multi-Judge Support
 
-Some environments use “LLM-as-judge” scoring. `medarc_verifiers` provides:
+Some environments use LLM-as-judge scoring. `medarc_verifiers` provides:
 
-- A safer judge call wrapper with clearer errors: `medarc_verifiers/judging/judge_core.py`
-- A `MultiJudge` that runs multiple judge models concurrently: `medarc_verifiers/judging/multi_judge.py`
-- A `verifiers`-compatible rubric wrapper: `medarc_verifiers/judging/multi_judge_rubric.py`
+- Judge call wrapper: `medarc_verifiers/judging/judge_core.py`
+- Multi-judge runner: `medarc_verifiers/judging/multi_judge.py`
+- Verifiers-compatible rubric wrapper: `medarc_verifiers/judging/multi_judge_rubric.py`
 
-## vLLM orchestrator (local Docker) – separate CLI
+## vLLM Orchestrator
 
 Docs: `docs/medarc-orchestrate.md`.
 
-This is a separate tool (`medarc-orchestrate`) for running batch configs against locally hosted vLLM containers with GPU/port scheduling across Docker or Slurm+Pyxis runtimes.
+`medarc-orchestrate` runs TOML bench configs against locally hosted vLLM
+containers with GPU/port scheduling across Docker or Slurm+Pyxis runtimes.
 
 - CLI entry: `medarc_verifiers/orchestrate/cli.py`
 - Runtime loop: `medarc_verifiers/orchestrate/run.py`
 
-It essentially:
+It:
 
-1. Launches vLLM containers
-2. Waits for readiness
-3. Runs `uv run medarc-eval bench --config ... --api-base-url <allocated>`
-4. Tracks orchestration state under `outputs/orchestrator/<run_id>/`
+1. Launches vLLM containers.
+2. Waits for readiness.
+3. Runs `uv run medarc-eval bench --config <job.toml> --api-base-url <allocated> --provider local`.
+4. Tracks orchestration state under `outputs/orchestrator/<run_id>/`.
 
-## Where to change things (quick mental index)
+## Where To Change Things
 
-- Add/adjust CLI flags or command behavior:
+- CLI flags or routing:
   - `medarc_verifiers/cli/main.py`, `medarc_verifiers/cli/_single_run.py`
-- Change config semantics (matrix, normalization, validation):
-  - `medarc_verifiers/cli/_config_loader.py`, `medarc_verifiers/cli/_schemas.py`
-- Fix resume/restart quirks:
-  - `medarc_verifiers/cli/_manifest.py`, `medarc_verifiers/cli/_manifest_planner.py`
-- Add new columns or modify processed dataset schema:
-  - extraction: `medarc_verifiers/cli/process/rows.py`
-  - allowed columns/output schema: `medarc_verifiers/cli/process/writer.py`
-- Change winrate math/output:
+- TOML bench behavior, deterministic paths, or resume fingerprints:
+  - `medarc_verifiers/cli/main.py`, `medarc_verifiers/cli/eval_identity.py`,
+    `medarc_verifiers/cli/verifiers_adapter.py`
+- Processed dataset schema:
+  - `medarc_verifiers/cli/process/rows.py`, `medarc_verifiers/cli/process/writer.py`
+- Winrate math/output:
   - `medarc_verifiers/cli/winrate/api.py`, `medarc_verifiers/cli/winrate/runner.py`
-- Adjust judging defaults/provider behaviors:
+- Judging/provider behavior:
   - `medarc_verifiers/utils/judge_helpers.py`, `medarc_verifiers/utils/prime_inference.py`

From 6d6b2fff78df8e55fd674fc1dd1ef8b7571c003d Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 28 Apr 2026 21:10:37 +0000
Subject: [PATCH 13/53] Allow namespaced orchestrate metadata

---
 docs/medarc-orchestrate.md                    | 16 ++++----
 medarc_verifiers/cli/verifiers_adapter.py     | 11 +++++-
 medarc_verifiers/orchestrate/config.py        | 39 +++++++++++++------
 tests/test_cli/test_main.py                   | 30 ++++++++++++++
 tests/test_cli/test_verifiers_adapter.py      | 24 ++++++++++++
 .../test_orchestrate_cli_validation.py        |  4 +-
 .../test_orchestrate_config.py                |  6 +--
 7 files changed, 104 insertions(+), 26 deletions(-)

diff --git a/docs/medarc-orchestrate.md b/docs/medarc-orchestrate.md
index ba802e3e..fbeaa31c 100644
--- a/docs/medarc-orchestrate.md
+++ b/docs/medarc-orchestrate.md
@@ -50,7 +50,7 @@ rerun_failed: false
 ```
 
 Each job config should be an upstream `medarc-eval bench` TOML config with a top-level
-`model` and a top-level `orchestrate` table.
+`model` and a namespaced `[medarc.orchestrate]` table.
 
 The `env_file` is a dotenv file that is loaded for every Docker launch. If unset and a repo-level `.env` exists,
 it is used automatically. You can also override it via `--env-file`.
@@ -63,30 +63,30 @@ model = "Qwen/Qwen3-30B-A3B"
 [[eval]]
 env_id = "medqa"
 
-[orchestrate.qwen-30b-a3b]
+[medarc.orchestrate.qwen-30b-a3b]
 gpus = 2
 tensor_parallel_size = 2
 
-[orchestrate.qwen-30b-a3b.serve]
+[medarc.orchestrate.qwen-30b-a3b.serve]
 max_model_len = 40960
 
-[orchestrate.vllm-container]
+[medarc.orchestrate.vllm-container]
 image = "vllm/vllm-openai:latest"
 container_port = 8000
 volumes = ["/data/huggingface:/root/.cache/huggingface"]
 ipc_mode = "host"
 
-[orchestrate.pyxis]
+[medarc.orchestrate.pyxis]
 srun_extra_args = []
 ```
 
 Config notes:
 
-- `orchestrate.vllm-container` is the preferred key.
-- `orchestrate.vllm-docker` is still accepted as a deprecated alias.
+- `medarc.orchestrate.vllm-container` is the preferred key.
+- `medarc.orchestrate.vllm-docker` is still accepted as a deprecated alias.
 - Do not set both keys in the same job config.
 - `ipc_mode` is Docker-only and is ignored in `--runtime pyxis`.
-- `orchestrate.pyxis` is Pyxis-only and is ignored in `--runtime docker`.
+- `medarc.orchestrate.pyxis` is Pyxis-only and is ignored in `--runtime docker`.
 - In Pyxis mode, Slurm allocates GPUs per `srun` step. The orchestrator only reserves localhost ports.
 
 ### CLI usage
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index 288c2e6e..8fd2ae50 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -40,6 +40,7 @@
 DEFAULT_CLIENT_TYPE = "openai_chat_completions"
 DEFAULT_PROVIDER = "prime"
 ADAPTER_TOML_FIELDS = {"debug", "header_from_state", "headers_from_state", "timeout"}
+MEDARC_TOML_METADATA_FIELD = "medarc"
 
 PROVIDER_CONFIGS: dict[str, dict[str, str]] = {
     "prime": {
@@ -100,8 +101,14 @@ class EvalConfigOverrides:
 def load_toml_eval_configs(path: str | Path, *, extra_valid_fields: set[str] | None = None) -> list[dict[str, Any]]:
     """Load upstream TOML eval configs, including ``[[ablation]]`` expansion."""
 
-    valid_fields = ADAPTER_TOML_FIELDS | (extra_valid_fields or set())
-    return load_toml_config(Path(path), extra_valid_fields=valid_fields)
+    valid_fields = ADAPTER_TOML_FIELDS | {MEDARC_TOML_METADATA_FIELD} | (extra_valid_fields or set())
+    return [_strip_medarc_metadata(raw) for raw in load_toml_config(Path(path), extra_valid_fields=valid_fields)]
+
+
+def _strip_medarc_metadata(raw: Mapping[str, Any]) -> dict[str, Any]:
+    cleaned = dict(raw)
+    cleaned.pop(MEDARC_TOML_METADATA_FIELD, None)
+    return cleaned
 
 
 def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides | None = None) -> EvalConfig:
diff --git a/medarc_verifiers/orchestrate/config.py b/medarc_verifiers/orchestrate/config.py
index 3e5f0146..67a97fc8 100644
--- a/medarc_verifiers/orchestrate/config.py
+++ b/medarc_verifiers/orchestrate/config.py
@@ -138,33 +138,32 @@ def _extract_task_model(payload: Mapping[str, Any], *, source: Path) -> tuple[st
     model_id = str(payload.get("model", "")).strip()
     if not model_id:
         raise ValueError(f"Job config {source} must define either one models entry or a top-level model.")
-    orchestrate = payload.get("orchestrate")
-    if not isinstance(orchestrate, Mapping):
-        raise ValueError(f"Job config {source} must define a top-level orchestrate mapping.")
+    orchestrate, table_name = _extract_orchestrate_root(payload, source=source)
     model_keys = [str(key) for key, value in orchestrate.items() if key not in _ORCHESTRATE_NON_MODEL_KEYS]
     if len(model_keys) != 1:
         raise ValueError(
-            f"Job config {source} must define exactly one orchestrate model settings table; found {len(model_keys)}."
+            f"Job config {source} must define exactly one {table_name} model settings table; found {len(model_keys)}."
         )
     return model_keys[0], {"model": model_id}
 
 
 def _extract_orchestrate_config(payload: Mapping[str, Any], *, model_key: str, source: Path) -> Mapping[str, Any]:
-    orchestrate = payload.get("orchestrate")
-    if not isinstance(orchestrate, Mapping):
-        raise ValueError(f"Job config {source} must define a top-level orchestrate mapping.")
+    orchestrate, table_name = _extract_orchestrate_root(payload, source=source)
     has_container = "vllm-container" in orchestrate
     has_docker = "vllm-docker" in orchestrate
     if has_container and has_docker:
-        raise ValueError(f"Job config {source} defines both orchestrate.vllm-container and orchestrate.vllm-docker.")
+        raise ValueError(f"Job config {source} defines both {table_name}.vllm-container and {table_name}.vllm-docker.")
     if not has_container and not has_docker:
-        raise ValueError(f"Job config {source} must define orchestrate.vllm-container settings.")
+        raise ValueError(f"Job config {source} must define {table_name}.vllm-container settings.")
     if model_key not in orchestrate:
-        raise ValueError(f"Job config {source} must define orchestrate.{model_key} settings.")
+        raise ValueError(f"Job config {source} must define {table_name}.{model_key} settings.")
     normalized = dict(orchestrate)
     if has_docker:
         warnings.warn(
-            (f"Job config {source} uses deprecated orchestrate.vllm-docker; rename it to orchestrate.vllm-container."),
+            (
+                f"Job config {source} uses deprecated {table_name}.vllm-docker; "
+                f"rename it to {table_name}.vllm-container."
+            ),
             DeprecationWarning,
             stacklevel=2,
         )
@@ -173,4 +172,22 @@ def _extract_orchestrate_config(payload: Mapping[str, Any], *, model_key: str, s
     return normalized
 
 
+def _extract_orchestrate_root(payload: Mapping[str, Any], *, source: Path) -> tuple[Mapping[str, Any], str]:
+    medarc = payload.get("medarc")
+    if medarc is not None:
+        if not isinstance(medarc, Mapping):
+            raise ValueError(f"Job config {source} medarc must be a mapping.")
+        medarc_orchestrate = medarc.get("orchestrate")
+        if medarc_orchestrate is not None:
+            if not isinstance(medarc_orchestrate, Mapping):
+                raise ValueError(f"Job config {source} medarc.orchestrate must be a mapping.")
+            return medarc_orchestrate, "medarc.orchestrate"
+
+    orchestrate = payload.get("orchestrate")
+    if isinstance(orchestrate, Mapping):
+        return orchestrate, "orchestrate"
+
+    raise ValueError(f"Job config {source} must define a [medarc.orchestrate] mapping.")
+
+
 __all__ = ["ConfigFormatError", "PlanConfig", "TaskSpec", "expand_tasks", "load_job_config", "load_plan"]
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index d8d0197a..8409bb2b 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -161,6 +161,36 @@ def test_repository_smoke_toml_config_dry_runs(capsys: pytest.CaptureFixture[str
     assert "runs/evals/openai-gpt-4.1-mini/medqa" in output
 
 
+def test_toml_bench_dry_run_accepts_medarc_orchestrate_metadata(
+    tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        num_examples = 1
+        rollouts_per_example = 1
+
+        [medarc.orchestrate.foo]
+        gpus = 1
+
+        [medarc.orchestrate.vllm-container]
+        image = "vllm/vllm-openai:latest"
+        """,
+    )
+
+    exit_code = main.main(["bench", "--config", str(config_path), "--dry-run"])
+
+    output = capsys.readouterr().out
+    assert exit_code == 0
+    assert "TOML Bench Dry Run" in output
+    assert "medqa" in output
+
+
 def test_bench_rejects_non_toml_config(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None:
     config_path = tmp_path / "bench.yaml"
     _write_config(config_path, "models: {}\n")
diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py
index b4caa925..99ce63f5 100644
--- a/tests/test_cli/test_verifiers_adapter.py
+++ b/tests/test_cli/test_verifiers_adapter.py
@@ -71,6 +71,30 @@ def test_load_toml_eval_configs_expands_ablation(tmp_path: Path) -> None:
     assert configs[2]["env_args"] == {"shuffle_answers": True, "shuffle_seed": 9331}
 
 
+def test_load_toml_eval_configs_strips_medarc_metadata(tmp_path: Path) -> None:
+    config_path = tmp_path / "eval.toml"
+    config_path.write_text(
+        """
+model = "openai/gpt-4.1-mini"
+
+[[eval]]
+env_id = "medqa"
+
+[medarc.orchestrate.foo]
+gpus = 1
+
+[medarc.orchestrate.vllm-container]
+image = "vllm/vllm-openai:latest"
+""".strip()
+    )
+
+    configs = load_toml_eval_configs(config_path)
+
+    assert len(configs) == 1
+    assert "medarc" not in configs[0]
+    assert configs[0]["env_id"] == "medqa"
+
+
 def test_build_eval_config_resolves_endpoint_alias_and_core_fields(tmp_path: Path) -> None:
     endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
     resume_path = tmp_path / "resume"
diff --git a/tests/test_orchestrate/test_orchestrate_cli_validation.py b/tests/test_orchestrate/test_orchestrate_cli_validation.py
index 25c9c809..26668148 100644
--- a/tests/test_orchestrate/test_orchestrate_cli_validation.py
+++ b/tests/test_orchestrate/test_orchestrate_cli_validation.py
@@ -121,10 +121,10 @@ def test_cli_dry_run_accepts_toml_job_config(tmp_path: Path, capsys: pytest.Capt
 [[eval]]
 env_id = "medqa"
 
-[orchestrate.vllm-container]
+[medarc.orchestrate.vllm-container]
 image = "fake"
 
-[orchestrate.foo]
+[medarc.orchestrate.foo]
 gpus = 1
 serve = {}
 """.lstrip(),
diff --git a/tests/test_orchestrate/test_orchestrate_config.py b/tests/test_orchestrate/test_orchestrate_config.py
index cf84c8b0..e369743f 100644
--- a/tests/test_orchestrate/test_orchestrate_config.py
+++ b/tests/test_orchestrate/test_orchestrate_config.py
@@ -72,13 +72,13 @@ def test_expand_tasks_accepts_toml_eval_config(tmp_path: Path) -> None:
 [[eval]]
 env_id = "medqa"
 
-[orchestrate.vllm-container]
+[medarc.orchestrate.vllm-container]
 image = "vllm/vllm-openai:latest"
 
-[orchestrate.foo]
+[medarc.orchestrate.foo]
 gpus = 1
 
-[orchestrate.foo.serve]
+[medarc.orchestrate.foo.serve]
 dtype = "bfloat16"
 """.lstrip(),
         encoding="utf-8",

From 167158dec2592c58cb01ee2b2844f2b11e382611 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Wed, 6 May 2026 15:14:35 +0000
Subject: [PATCH 14/53] Remove obsolete benchmark compatibility code

---
 docs/medarc-orchestrate.md                    |  4 +-
 medarc_verifiers/cli/_single_run.py           | 11 +++
 medarc_verifiers/cli/main.py                  |  1 -
 medarc_verifiers/cli/verifiers_adapter.py     | 47 +-----------
 medarc_verifiers/orchestrate/config.py        | 42 ++---------
 tests/test_cli/test_main.py                   |  2 +-
 tests/test_cli/test_verifiers_adapter.py      | 53 +-------------
 .../test_orchestrate_cli_validation.py        | 19 +++--
 .../test_orchestrate_config.py                | 71 +++++++------------
 9 files changed, 56 insertions(+), 194 deletions(-)

diff --git a/docs/medarc-orchestrate.md b/docs/medarc-orchestrate.md
index fbeaa31c..8a9accfd 100644
--- a/docs/medarc-orchestrate.md
+++ b/docs/medarc-orchestrate.md
@@ -82,9 +82,7 @@ srun_extra_args = []
 
 Config notes:
 
-- `medarc.orchestrate.vllm-container` is the preferred key.
-- `medarc.orchestrate.vllm-docker` is still accepted as a deprecated alias.
-- Do not set both keys in the same job config.
+- `medarc.orchestrate.vllm-container` is required.
 - `ipc_mode` is Docker-only and is ignored in `--runtime pyxis`.
 - `medarc.orchestrate.pyxis` is Pyxis-only and is ignored in `--runtime docker`.
 - In Pyxis mode, Slurm allocates GPUs per `srun` step. The orchestrator only reserves localhost ports.
diff --git a/medarc_verifiers/cli/_single_run.py b/medarc_verifiers/cli/_single_run.py
index 6d803f43..a3c8b8dc 100644
--- a/medarc_verifiers/cli/_single_run.py
+++ b/medarc_verifiers/cli/_single_run.py
@@ -27,6 +27,7 @@
     format_resume_mismatch_lines,
     is_resume_metadata_mismatch_error,
     load_resume_metadata_values,
+    resolve_resume_path,
 )
 from medarc_verifiers.cli.utils.shared import (
     HEADER_SEPARATOR,
@@ -176,8 +177,18 @@ def run_single_mode(argv: Sequence[str] | None = None) -> int:
 
     try:
         eval_config = build_eval_config(raw_config)
+        resume_path = resolve_resume_path(
+            resume_arg=args.resume,
+            env_id=eval_config.env_id,
+            model=eval_config.model,
+            num_examples=eval_config.num_examples,
+            rollouts_per_example=eval_config.rollouts_per_example,
+            env_dir_path=eval_config.env_dir_path,
+        )
     except ValueError as exc:
         parser.error(str(exc))
+    if resume_path is not None:
+        eval_config = eval_config.model_copy(update={"resume_path": resume_path, "save_results": True})
 
     if args.dry_run:
         print(eval_config.model_dump_json(indent=2))
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index d12b2127..df833bb5 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -439,7 +439,6 @@ def _run_batch_mode(argv: Sequence[str]) -> int:
     parser = build_batch_parser()
     args = parser.parse_args(argv)
     args.endpoints_path_explicit = _option_was_provided(argv, "--endpoints-path")
-    args.default_api_key_var_explicit = _option_was_provided(argv, "--default-api-key-var")
 
     try:
         args.cli_env_args = build_cli_override(
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index 8fd2ae50..ffbc26c8 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -24,7 +24,6 @@
 )
 from verifiers.utils.eval_utils import load_endpoints, load_toml_config, resolve_endpoints_file
 from verifiers.utils.import_utils import load_toml
-from verifiers.utils.path_utils import find_latest_incomplete_eval_results_path, is_valid_eval_results_path
 
 from medarc_verifiers.utils.prime_inference import prime_inference_overrides
 from medarc_verifiers.utils.sampling_args import sanitize_sampling_args_for_openai
@@ -134,13 +133,6 @@ def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides
     model, resolved_endpoint_id, client_config = _build_client_config(merged_raw, endpoints, endpoints_path)
 
     sampling_args = _build_sampling_args(merged_raw, client_config.api_base_url)
-    resume_path = _resolve_resume_path(
-        merged_raw,
-        env_id=env_id,
-        model=model,
-        num_examples=num_examples,
-        rollouts_per_example=rollouts_per_example,
-    )
 
     extra_env_kwargs = dict(merged_raw.get("extra_env_kwargs", {}))
     if merged_raw.get("timeout") is not None:
@@ -166,7 +158,7 @@ def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides
         verbose=merged_raw.get("verbose", False),
         state_columns=merged_raw.get("state_columns", []),
         save_results=merged_raw.get("save_results", False),
-        resume_path=resume_path,
+        resume_path=None,
         independent_scoring=merged_raw.get("independent_scoring", False),
         save_to_hf_hub=merged_raw.get("save_to_hf_hub", False),
         hf_hub_dataset_name=merged_raw.get("hf_hub_dataset_name", ""),
@@ -393,43 +385,6 @@ def _merge_sampling_args(
     return merged_sampling_args
 
 
-def _resolve_resume_path(
-    raw: Mapping[str, Any],
-    *,
-    env_id: str,
-    model: str,
-    num_examples: int,
-    rollouts_per_example: int,
-) -> Path | None:
-    resume_arg = raw.get("resume")
-    if resume_arg is None and raw.get("resume_path") is not None:
-        resume_arg = raw["resume_path"]
-
-    if isinstance(resume_arg, str):
-        resume_path = Path(resume_arg)
-        if not is_valid_eval_results_path(resume_path):
-            raise ValueError(f"Resume path {resume_path} is not a valid evaluation results path")
-        logger.info("Resuming from explicit path: %s", resume_path)
-        return resume_path
-    if resume_arg is True:
-        auto_resume_path = find_latest_incomplete_eval_results_path(
-            env_id=env_id,
-            model=model,
-            num_examples=num_examples,
-            rollouts_per_example=rollouts_per_example,
-            env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
-            output_dir=raw.get("output_dir"),
-        )
-        if auto_resume_path is not None:
-            logger.info("Auto-resuming from: %s", auto_resume_path)
-            return auto_resume_path
-        logger.info("No matching incomplete run found for --resume; starting a new run")
-        return None
-    if resume_arg in (None, False):
-        return None
-    raise ValueError(f"Invalid value for --resume: {resume_arg!r}")
-
-
 def _build_extra_headers(raw: Mapping[str, Any]) -> dict[str, str]:
     eval_headers_table: dict[str, str] = {}
     raw_headers = raw.get("headers")
diff --git a/medarc_verifiers/orchestrate/config.py b/medarc_verifiers/orchestrate/config.py
index 67a97fc8..07277681 100644
--- a/medarc_verifiers/orchestrate/config.py
+++ b/medarc_verifiers/orchestrate/config.py
@@ -6,13 +6,12 @@
 from pathlib import Path
 from typing import Any, Mapping
 import tomllib
-import warnings
 
 from omegaconf import OmegaConf
 from pydantic import BaseModel, Field, ValidationError
 
 
-_ORCHESTRATE_NON_MODEL_KEYS = {"restart", "vllm-container", "vllm-docker", "pyxis"}
+_ORCHESTRATE_NON_MODEL_KEYS = {"restart", "vllm-container", "pyxis"}
 
 
 class PlanConfig(BaseModel):
@@ -80,6 +79,8 @@ def load_plan(path: Path) -> PlanConfig:
 
 def load_job_config(path: Path) -> Mapping[str, Any]:
     resolved = path.expanduser().resolve()
+    if resolved.suffix != ".toml":
+        raise ValueError(f"Unsupported job config format: {resolved} (expected .toml)")
     return _load_mapping(resolved)
 
 
@@ -124,20 +125,9 @@ def _load_mapping(path: Path) -> Mapping[str, Any]:
 
 
 def _extract_task_model(payload: Mapping[str, Any], *, source: Path) -> tuple[str, Mapping[str, Any]]:
-    models = payload.get("models")
-    if isinstance(models, Mapping):
-        keys = list(models.keys())
-        if len(keys) != 1:
-            raise ValueError(f"Job config {source} must define exactly one model; found {len(keys)}.")
-        model_key = str(keys[0])
-        model_entry = models.get(model_key)
-        if not isinstance(model_entry, Mapping):
-            raise ValueError(f"Job config {source} models.{model_key} must be a mapping.")
-        return model_key, model_entry
-
     model_id = str(payload.get("model", "")).strip()
     if not model_id:
-        raise ValueError(f"Job config {source} must define either one models entry or a top-level model.")
+        raise ValueError(f"Job config {source} must define a top-level model.")
     orchestrate, table_name = _extract_orchestrate_root(payload, source=source)
     model_keys = [str(key) for key, value in orchestrate.items() if key not in _ORCHESTRATE_NON_MODEL_KEYS]
     if len(model_keys) != 1:
@@ -149,27 +139,11 @@ def _extract_task_model(payload: Mapping[str, Any], *, source: Path) -> tuple[st
 
 def _extract_orchestrate_config(payload: Mapping[str, Any], *, model_key: str, source: Path) -> Mapping[str, Any]:
     orchestrate, table_name = _extract_orchestrate_root(payload, source=source)
-    has_container = "vllm-container" in orchestrate
-    has_docker = "vllm-docker" in orchestrate
-    if has_container and has_docker:
-        raise ValueError(f"Job config {source} defines both {table_name}.vllm-container and {table_name}.vllm-docker.")
-    if not has_container and not has_docker:
+    if "vllm-container" not in orchestrate:
         raise ValueError(f"Job config {source} must define {table_name}.vllm-container settings.")
     if model_key not in orchestrate:
         raise ValueError(f"Job config {source} must define {table_name}.{model_key} settings.")
-    normalized = dict(orchestrate)
-    if has_docker:
-        warnings.warn(
-            (
-                f"Job config {source} uses deprecated {table_name}.vllm-docker; "
-                f"rename it to {table_name}.vllm-container."
-            ),
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        normalized["vllm-container"] = orchestrate["vllm-docker"]
-        del normalized["vllm-docker"]
-    return normalized
+    return orchestrate
 
 
 def _extract_orchestrate_root(payload: Mapping[str, Any], *, source: Path) -> tuple[Mapping[str, Any], str]:
@@ -183,10 +157,6 @@ def _extract_orchestrate_root(payload: Mapping[str, Any], *, source: Path) -> tu
                 raise ValueError(f"Job config {source} medarc.orchestrate must be a mapping.")
             return medarc_orchestrate, "medarc.orchestrate"
 
-    orchestrate = payload.get("orchestrate")
-    if isinstance(orchestrate, Mapping):
-        return orchestrate, "orchestrate"
-
     raise ValueError(f"Job config {source} must define a [medarc.orchestrate] mapping.")
 
 
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 8409bb2b..cbfa5ea7 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -875,7 +875,7 @@ def fake_find_latest_incomplete_eval_results_path(**kwargs: Any) -> Path:
         return discovered
 
     monkeypatch.setattr(
-        "medarc_verifiers.cli.verifiers_adapter.find_latest_incomplete_eval_results_path",
+        "medarc_verifiers.cli.utils.resume.find_latest_incomplete_eval_results_path",
         fake_find_latest_incomplete_eval_results_path,
     )
 
diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py
index 99ce63f5..f06dcab7 100644
--- a/tests/test_cli/test_verifiers_adapter.py
+++ b/tests/test_cli/test_verifiers_adapter.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import importlib
-import json
 from pathlib import Path
 
 import pytest
@@ -97,10 +96,6 @@ def test_load_toml_eval_configs_strips_medarc_metadata(tmp_path: Path) -> None:
 
 def test_build_eval_config_resolves_endpoint_alias_and_core_fields(tmp_path: Path) -> None:
     endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
-    resume_path = tmp_path / "resume"
-    resume_path.mkdir()
-    (resume_path / "results.jsonl").write_text("")
-    (resume_path / "metadata.json").write_text("{}")
 
     config = build_eval_config(
         {
@@ -119,7 +114,6 @@ def test_build_eval_config_resolves_endpoint_alias_and_core_fields(tmp_path: Pat
             "timeout": 45.0,
             "state_columns": ["question_id", "split"],
             "save_results": True,
-            "resume_path": str(resume_path),
             "independent_scoring": True,
             "save_to_hf_hub": True,
             "hf_hub_dataset_name": "org/dataset",
@@ -145,7 +139,7 @@ def test_build_eval_config_resolves_endpoint_alias_and_core_fields(tmp_path: Pat
     assert config.extra_env_kwargs == {"timeout_seconds": 45.0}
     assert config.state_columns == ["question_id", "split"]
     assert config.save_results is True
-    assert config.resume_path == resume_path
+    assert config.resume_path is None
     assert config.independent_scoring is True
     assert config.save_to_hf_hub is True
     assert config.hf_hub_dataset_name == "org/dataset"
@@ -265,48 +259,3 @@ def test_build_eval_config_uses_env_pyproject_defaults(tmp_path: Path, monkeypat
 
     assert config.num_examples == 11
     assert config.rollouts_per_example == 4
-
-
-def test_build_eval_config_rejects_invalid_resume_path(tmp_path: Path) -> None:
-    invalid_resume_path = tmp_path / "missing"
-
-    with pytest.raises(ValueError, match="not a valid evaluation results path"):
-        build_eval_config(
-            {
-                "env_id": "medqa",
-                "provider": "openai",
-                "model": "openai/gpt-4.1-mini",
-                "resume": str(invalid_resume_path),
-            }
-        )
-
-
-def test_build_eval_config_auto_resume_uses_upstream_path_lookup(tmp_path: Path) -> None:
-    output_dir = tmp_path / "outputs"
-    run_dir = output_dir / "evals" / "medqa--openai--gpt-4.1-mini" / "abc12345"
-    run_dir.mkdir(parents=True)
-    (run_dir / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
-    (run_dir / "metadata.json").write_text(
-        json.dumps(
-            {
-                "env_id": "medqa",
-                "model": "openai/gpt-4.1-mini",
-                "num_examples": 2,
-                "rollouts_per_example": 1,
-            }
-        )
-    )
-
-    config = build_eval_config(
-        {
-            "env_id": "medqa",
-            "provider": "openai",
-            "model": "openai/gpt-4.1-mini",
-            "num_examples": 2,
-            "rollouts_per_example": 1,
-            "output_dir": str(output_dir),
-            "resume": True,
-        }
-    )
-
-    assert config.resume_path == run_dir
diff --git a/tests/test_orchestrate/test_orchestrate_cli_validation.py b/tests/test_orchestrate/test_orchestrate_cli_validation.py
index 26668148..f190ed69 100644
--- a/tests/test_orchestrate/test_orchestrate_cli_validation.py
+++ b/tests/test_orchestrate/test_orchestrate_cli_validation.py
@@ -73,18 +73,17 @@ def test_cli_runtime_flag_parses() -> None:
 
 
 def test_cli_runtime_precedence_cli_over_plan(monkeypatch, tmp_path: Path) -> None:
-    job_cfg = tmp_path / "job.yaml"
+    job_cfg = tmp_path / "job.toml"
     job_cfg.write_text(
         """
-models:
-  foo:
-    model: Foo/Bar
-orchestrate:
-  vllm-container:
-    image: fake
-  foo:
-    gpus: 1
-    serve: {}
+model = "Foo/Bar"
+
+[medarc.orchestrate.vllm-container]
+image = "fake"
+
+[medarc.orchestrate.foo]
+gpus = 1
+serve = {}
 """.lstrip(),
         encoding="utf-8",
     )
diff --git a/tests/test_orchestrate/test_orchestrate_config.py b/tests/test_orchestrate/test_orchestrate_config.py
index e369743f..be32446f 100644
--- a/tests/test_orchestrate/test_orchestrate_config.py
+++ b/tests/test_orchestrate/test_orchestrate_config.py
@@ -1,4 +1,3 @@
-import warnings
 from pathlib import Path
 
 import pytest
@@ -9,20 +8,22 @@
 def test_plan_job_configs_resolve_relative_to_plan_file(tmp_path: Path):
     configs_dir = tmp_path / "configs"
     configs_dir.mkdir()
-    job_cfg = configs_dir / "job-foo.yaml"
+    job_cfg = configs_dir / "job-foo.toml"
     job_cfg.write_text(
         """
-models:
-  foo:
-    model: Foo/Bar
-orchestrate:
-  restart: runs/raw/example-run
-  vllm-container:
-    image: vllm/vllm-openai:latest
-  foo:
-    gpus: 1
-    serve:
-      dtype: bfloat16
+model = "Foo/Bar"
+
+[medarc.orchestrate]
+restart = "runs/raw/example-run"
+
+[medarc.orchestrate.vllm-container]
+image = "vllm/vllm-openai:latest"
+
+[medarc.orchestrate.foo]
+gpus = 1
+
+[medarc.orchestrate.foo.serve]
+dtype = "bfloat16"
 """.lstrip(),
         encoding="utf-8",
     )
@@ -31,7 +32,7 @@ def test_plan_job_configs_resolve_relative_to_plan_file(tmp_path: Path):
         """
 name: test
 job_configs:
-  - configs/job-foo.yaml
+  - configs/job-foo.toml
 gpu_range: "0-3"
 port_range: "8100-8199"
 run_id: "hello"
@@ -95,54 +96,34 @@ def test_expand_tasks_accepts_toml_eval_config(tmp_path: Path) -> None:
     assert tasks[0].orchestrate["foo"]["serve"]["dtype"] == "bfloat16"
 
 
-def test_expand_tasks_accepts_deprecated_vllm_docker_with_warning(tmp_path: Path) -> None:
+def test_expand_tasks_rejects_non_toml_job_config(tmp_path: Path) -> None:
     job_cfg = tmp_path / "job.yaml"
     job_cfg.write_text(
         """
-models:
-  foo:
-    model: Foo/Bar
-orchestrate:
-  vllm-docker:
-    image: vllm/vllm-openai:latest
-  foo:
-    gpus: 1
-    serve: {}
+model: Foo/Bar
 """.lstrip(),
         encoding="utf-8",
     )
     plan_path = tmp_path / "plan.yaml"
     plan_path.write_text(f"job_configs:\n  - {job_cfg.name}\n", encoding="utf-8")
 
-    with warnings.catch_warnings(record=True) as caught:
-        warnings.simplefilter("always")
-        tasks = expand_tasks(load_plan(plan_path))
-
-    assert tasks[0].orchestrate["vllm-container"]["image"] == "vllm/vllm-openai:latest"
-    assert "vllm-docker" not in tasks[0].orchestrate
-    assert any("deprecated orchestrate.vllm-docker" in str(item.message) for item in caught)
+    with pytest.raises(ValueError, match="Unsupported job config format"):
+        expand_tasks(load_plan(plan_path))
 
 
-def test_expand_tasks_rejects_ambiguous_container_keys(tmp_path: Path) -> None:
-    job_cfg = tmp_path / "job.yaml"
+def test_expand_tasks_rejects_missing_vllm_container(tmp_path: Path) -> None:
+    job_cfg = tmp_path / "job.toml"
     job_cfg.write_text(
         """
-models:
-  foo:
-    model: Foo/Bar
-orchestrate:
-  vllm-container:
-    image: new
-  vllm-docker:
-    image: old
-  foo:
-    gpus: 1
-    serve: {}
+model = "Foo/Bar"
+
+[medarc.orchestrate.foo]
+gpus = 1
 """.lstrip(),
         encoding="utf-8",
     )
     plan_path = tmp_path / "plan.yaml"
     plan_path.write_text(f"job_configs:\n  - {job_cfg.name}\n", encoding="utf-8")
 
-    with pytest.raises(ValueError, match="defines both orchestrate.vllm-container and orchestrate.vllm-docker"):
+    with pytest.raises(ValueError, match="must define medarc.orchestrate.vllm-container settings"):
         expand_tasks(load_plan(plan_path))

From 1f6bae26eef36512ef3492eb1e36284a8a6089fc Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 16:29:23 +0000
Subject: [PATCH 15/53] Add upstream eval config boundary

---
 medarc_verifiers/cli/_single_run.py   |  2 +-
 medarc_verifiers/cli/main.py          |  2 +-
 medarc_verifiers/cli/upstream_eval.py | 18 ++++++++++++++++++
 tests/test_cli/test_upstream_eval.py  | 24 ++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100644 medarc_verifiers/cli/upstream_eval.py
 create mode 100644 tests/test_cli/test_upstream_eval.py

diff --git a/medarc_verifiers/cli/_single_run.py b/medarc_verifiers/cli/_single_run.py
index a3c8b8dc..c2578738 100644
--- a/medarc_verifiers/cli/_single_run.py
+++ b/medarc_verifiers/cli/_single_run.py
@@ -20,7 +20,7 @@
     DEFAULT_API_KEY_VAR,
     DEFAULT_ENDPOINTS_PATH,
 )
-from medarc_verifiers.cli.verifiers_adapter import build_eval_config
+from medarc_verifiers.cli.upstream_eval import build_eval_config
 from medarc_verifiers.cli.utils.env_args import EnvParam, MissingEnvParamError, gather_env_cli_metadata, merge_env_args
 from medarc_verifiers.cli.utils.overrides import build_cli_override
 from medarc_verifiers.cli.utils.resume import (
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index df833bb5..3ef6ffed 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -46,7 +46,7 @@
     normalize_dataset_ids,
     normalize_model_ids,
 )
-from medarc_verifiers.cli.verifiers_adapter import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
+from medarc_verifiers.cli.upstream_eval import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
 from medarc_verifiers.utils.pathing import resolve_under
 from medarc_verifiers.cli.winrate import (
     WinrateConfig,
diff --git a/medarc_verifiers/cli/upstream_eval.py b/medarc_verifiers/cli/upstream_eval.py
new file mode 100644
index 00000000..7871526e
--- /dev/null
+++ b/medarc_verifiers/cli/upstream_eval.py
@@ -0,0 +1,18 @@
+"""Boundary for upstream ``verifiers`` eval configuration.
+
+``verifiers==0.1.12`` keeps full ``EvalConfig`` construction nested inside
+``verifiers.scripts.eval.main()``, so MedARC still uses a temporary adapter.
+Import eval config behavior through this module so callers do not depend on the
+adapter directly and the deletion point is isolated when upstream exposes a
+public builder.
+"""
+
+from __future__ import annotations
+
+from medarc_verifiers.cli.verifiers_adapter import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
+
+__all__ = [
+    "EvalConfigOverrides",
+    "build_eval_config",
+    "load_toml_eval_configs",
+]
diff --git a/tests/test_cli/test_upstream_eval.py b/tests/test_cli/test_upstream_eval.py
new file mode 100644
index 00000000..5b69dfd8
--- /dev/null
+++ b/tests/test_cli/test_upstream_eval.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+import verifiers.scripts.eval as upstream_eval_script
+
+from medarc_verifiers.cli import upstream_eval
+from medarc_verifiers.cli import verifiers_adapter
+
+
+def test_upstream_eval_boundary_uses_temporary_adapter_until_public_builder_exists() -> None:
+    assert not hasattr(upstream_eval_script, "build_eval_config")
+    assert upstream_eval.build_eval_config is verifiers_adapter.build_eval_config
+    assert upstream_eval.load_toml_eval_configs is verifiers_adapter.load_toml_eval_configs
+
+
+def test_temporary_adapter_provider_constants_match_upstream() -> None:
+    assert verifiers_adapter.DEFAULT_MODEL == upstream_eval_script.DEFAULT_MODEL
+    assert verifiers_adapter.DEFAULT_ENV_DIR_PATH == upstream_eval_script.DEFAULT_ENV_DIR_PATH
+    assert verifiers_adapter.DEFAULT_ENDPOINTS_PATH == upstream_eval_script.DEFAULT_ENDPOINTS_PATH
+    assert verifiers_adapter.DEFAULT_NUM_EXAMPLES == upstream_eval_script.DEFAULT_NUM_EXAMPLES
+    assert verifiers_adapter.DEFAULT_ROLLOUTS_PER_EXAMPLE == upstream_eval_script.DEFAULT_ROLLOUTS_PER_EXAMPLE
+    assert verifiers_adapter.DEFAULT_MAX_CONCURRENT == upstream_eval_script.DEFAULT_MAX_CONCURRENT
+    assert verifiers_adapter.DEFAULT_CLIENT_TYPE == upstream_eval_script.DEFAULT_CLIENT_TYPE
+    assert verifiers_adapter.DEFAULT_PROVIDER == upstream_eval_script.DEFAULT_PROVIDER
+    assert verifiers_adapter.PROVIDER_CONFIGS == upstream_eval_script.PROVIDER_CONFIGS

From 520a793dad9726d3b17c11e538bcadd8bc553841 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 16:43:34 +0000
Subject: [PATCH 16/53] Add bench output index

---
 medarc_verifiers/cli/bench_index.py     | 235 ++++++++++++++++++++++++
 medarc_verifiers/cli/eval_identity.py   |   6 +-
 medarc_verifiers/cli/main.py            | 157 +++++++++++++++-
 tests/test_cli/test_bench_index.py      |  88 +++++++++
 tests/test_cli/test_eval_identity.py    |  10 +-
 tests/test_cli/test_toml_bench_index.py | 210 +++++++++++++++++++++
 6 files changed, 686 insertions(+), 20 deletions(-)
 create mode 100644 medarc_verifiers/cli/bench_index.py
 create mode 100644 tests/test_cli/test_bench_index.py
 create mode 100644 tests/test_cli/test_toml_bench_index.py

diff --git a/medarc_verifiers/cli/bench_index.py b/medarc_verifiers/cli/bench_index.py
new file mode 100644
index 00000000..03cfa92f
--- /dev/null
+++ b/medarc_verifiers/cli/bench_index.py
@@ -0,0 +1,235 @@
+"""Bench sidecar planning and validation."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from collections import Counter
+from collections.abc import Mapping, Sequence
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from verifiers.utils.save_utils import make_serializable
+
+from medarc_verifiers.cli.eval_identity import EvalPathPlan
+
+BENCH_INDEX_FILENAME = "bench_index.json"
+BENCH_INDEX_VERSION = 1
+
+
+class BenchIndexError(ValueError):
+    """Raised when a bench sidecar is missing, stale, or internally inconsistent."""
+
+
+def build_bench_index(
+    *,
+    output_root: Path,
+    source_config: Path,
+    eval_configs: Sequence[Any],
+    path_plans: Sequence[EvalPathPlan],
+    plan_payloads: Sequence[Mapping[str, Any]],
+) -> dict[str, Any]:
+    entries = [
+        build_bench_index_entry(
+            index=index,
+            output_root=output_root,
+            config=config,
+            path_plan=path_plan,
+            plan_payload=plan_payload,
+        )
+        for index, (config, path_plan, plan_payload) in enumerate(
+            zip(eval_configs, path_plans, plan_payloads), start=1
+        )
+    ]
+    payload = {
+        "version": BENCH_INDEX_VERSION,
+        "created_at": datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z"),
+        "source_config": str(source_config),
+        "evals": entries,
+    }
+    validate_bench_index(payload, output_root=output_root, require_artifacts=False)
+    return payload
+
+
+def build_bench_index_entry(
+    *,
+    index: int,
+    output_root: Path,
+    config: Any,
+    path_plan: EvalPathPlan,
+    plan_payload: Mapping[str, Any],
+) -> dict[str, Any]:  # one index record for a single planned eval, including its plan_digest
+    identity = path_plan.identity
+    entry = {
+        "index": index,
+        "results_path": str(path_plan.results_path),
+        "env_id": identity.env_id,
+        "model": identity.model_id,
+        "variant_id": identity.variant_id,
+        "variant_payload": identity.variant_payload,
+        "env_args": dict(config.env_args or {}),  # shallow copy; a falsy value becomes {}
+        "sampling_args": dict(config.sampling_args or {}),  # shallow copy; a falsy value becomes {}
+        "num_examples": config.num_examples,
+        "rollouts_per_example": config.rollouts_per_example,
+    }
+    digest_payload = {key: value for key, value in entry.items() if key != "index"}  # "index" is not hashed
+    entry["plan_digest"] = plan_digest({**digest_payload, "output_root": str(output_root), "plan": dict(plan_payload)})
+    return entry
+
+
+def read_bench_index(path: Path) -> dict[str, Any] | None:  # parsed index, or None when the file does not exist
+    if not path.exists():
+        return None
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:  # file content is not parseable JSON
+        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME} at {path}: expected JSON object.") from exc
+    if not isinstance(payload, dict):  # parseable JSON whose top-level value is not an object
+        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME} at {path}: expected JSON object.")
+    return payload
+
+
+def write_bench_index(path: Path, payload: Mapping[str, Any]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload, default=make_serializable, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
+
+def validate_bench_index(
+    payload: Mapping[str, Any],
+    *,
+    output_root: Path,
+    require_artifacts: bool,
+) -> None:  # returns nothing; raises BenchIndexError on the first problem found
+    if payload.get("version") != BENCH_INDEX_VERSION:
+        raise BenchIndexError(f"Unsupported {BENCH_INDEX_FILENAME} version: {payload.get('version')!r}.")
+    entries = payload.get("evals")
+    if not isinstance(entries, list):
+        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: 'evals' must be a list.")
+
+    normalized_root = output_root.resolve()  # every results_path must resolve to a location under this root
+    paths: list[Path] = []  # resolved results paths, checked for duplicates after the loop
+    identities: list[tuple[str, str, str | None]] = []  # (model, env_id, variant_id) per entry
+    model_env_counts: Counter[tuple[str, str]] = Counter()  # number of entries per (model, env_id)
+    for offset, raw_entry in enumerate(entries, start=1):  # offset is the 1-based position used in messages
+        if not isinstance(raw_entry, Mapping):
+            raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: eval entry {offset} must be an object.")
+        results_path = _entry_results_path(raw_entry)
+        _require_under_root(results_path, normalized_root)
+        paths.append(results_path.resolve())
+
+        model = _required_string(raw_entry, "model", offset)
+        env_id = _required_string(raw_entry, "env_id", offset)
+        variant_id = raw_entry.get("variant_id")  # optional: None is accepted here
+        if variant_id is not None and not isinstance(variant_id, str):
+            raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: eval entry {offset} variant_id must be a string.")
+        identities.append((model, env_id, variant_id))
+        model_env_counts[(model, env_id)] += 1
+
+        if require_artifacts:  # additionally require metadata.json and results.jsonl files on disk
+            _require_artifact(results_path / "metadata.json")
+            _require_artifact(results_path / "results.jsonl")
+            _validate_metadata_identity(results_path / "metadata.json", model=model, env_id=env_id)
+
+    _raise_duplicates(paths, label="results_path")
+    for (model, env_id), count in model_env_counts.items():
+        if count > 1:  # several entries share this pair, so none of them may have a None/empty variant_id
+            missing_variant = [identity for identity in identities if identity[:2] == (model, env_id) and not identity[2]]
+            if missing_variant:
+                raise BenchIndexError(
+                    f"Duplicate bench entries for model={model!r}, env_id={env_id!r} require explicit variant_id."
+                )
+    _raise_duplicates(identities, label="(model, env_id, variant_id)")
+
+
+def find_entry_for_results_path(payload: Mapping[str, Any], results_path: Path) -> Mapping[str, Any] | None:
+    target = results_path.resolve()
+    entries = payload.get("evals")
+    if not isinstance(entries, list):
+        return None
+    for entry in entries:
+        if isinstance(entry, Mapping) and _entry_results_path(entry).resolve() == target:
+            return entry
+    return None
+
+
+def plan_digest(payload: Mapping[str, Any]) -> str:  # "sha256:<hex>" over a canonical JSON encoding of payload
+    encoded = json.dumps(_canonicalize(payload), sort_keys=True, separators=(",", ":"), default=str).encode("utf-8")
+    return "sha256:" + hashlib.sha256(encoded).hexdigest()
+
+
+def _entry_results_path(entry: Mapping[str, Any]) -> Path:
+    raw_path = entry.get("results_path")
+    if not isinstance(raw_path, str) or not raw_path:
+        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: each eval entry needs a non-empty results_path.")
+    return Path(raw_path)
+
+
+def _required_string(entry: Mapping[str, Any], key: str, offset: int) -> str:
+    value = entry.get(key)
+    if not isinstance(value, str) or not value:
+        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: eval entry {offset} needs non-empty {key}.")
+    return value
+
+
+def _require_under_root(path: Path, normalized_root: Path) -> None:
+    try:
+        path.resolve().relative_to(normalized_root)
+    except ValueError as exc:
+        raise BenchIndexError(
+            f"Invalid {BENCH_INDEX_FILENAME}: results_path {path} is outside output root {normalized_root}."
+        ) from exc
+
+
+def _require_artifact(path: Path) -> None:
+    if not path.is_file():
+        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: required artifact is missing: {path}.")
+
+
+def _validate_metadata_identity(metadata_path: Path, *, model: str, env_id: str) -> None:  # raises on mismatch
+    try:
+        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:  # metadata.json is not parseable JSON
+        raise BenchIndexError(f"Invalid metadata.json at {metadata_path}: expected JSON object.") from exc
+    if not isinstance(metadata, Mapping):
+        raise BenchIndexError(f"Invalid metadata.json at {metadata_path}: expected JSON object.")
+    for key, expected in (("model", model), ("env_id", env_id)):
+        current = metadata.get(key)
+        if current is not None and current != expected:  # keys missing from (or null in) metadata are not compared
+            raise BenchIndexError(
+                f"{BENCH_INDEX_FILENAME} identity mismatch for {metadata_path.parent}: "
+                f"{key} sidecar={expected!r} metadata={current!r}."
+            )
+
+
+def _raise_duplicates(values: Sequence[Any], *, label: str) -> None:
+    duplicates = [value for value, count in Counter(values).items() if count > 1]
+    if duplicates:
+        rendered = ", ".join(str(value) for value in duplicates)
+        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: duplicate {label}: {rendered}.")
+
+
+def _canonicalize(value: Any) -> Any:  # recursively normalise containers and paths into a stable ordering
+    if isinstance(value, Mapping):
+        return {str(key): _canonicalize(value[key]) for key in sorted(value, key=str)}  # tolerate mixed-type keys
+    if isinstance(value, list | tuple):
+        return [_canonicalize(item) for item in value]  # sequences keep their order; tuples become lists
+    if isinstance(value, set):
+        return [_canonicalize(item) for item in sorted(value, key=str)]  # sets become lists sorted by str()
+    if isinstance(value, Path):
+        return str(value)
+    return value  # any other value is returned unchanged
+
+
+__all__ = [
+    "BENCH_INDEX_FILENAME",
+    "BENCH_INDEX_VERSION",
+    "BenchIndexError",
+    "build_bench_index",
+    "build_bench_index_entry",
+    "find_entry_for_results_path",
+    "plan_digest",
+    "read_bench_index",
+    "validate_bench_index",
+    "write_bench_index",
+]
diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py
index 4e02138a..cc185ead 100644
--- a/medarc_verifiers/cli/eval_identity.py
+++ b/medarc_verifiers/cli/eval_identity.py
@@ -211,7 +211,7 @@ def normalize_semantic_sampling_args(sampling_args: Mapping[str, Any] | None) ->
         elif key in _EXCLUDED_SAMPLING_KEYS:
             continue
         else:
-            raise UnclassifiedSamplingArgError(f"Sampling arg '{key}' is not classified for resume fingerprinting.")
+            normalized[key] = _canonicalize(value)
 
     return dict(sorted(normalized.items()))
 
@@ -332,9 +332,7 @@ def _merge_extra_body_semantics(normalized: dict[str, Any], extra_body: Any) ->
         elif key in _EXCLUDED_EXTRA_BODY_KEYS or key in _EXCLUDED_SAMPLING_KEYS:
             continue
         else:
-            raise UnclassifiedSamplingArgError(
-                f"Sampling arg 'extra_body.{key}' is not classified for resume fingerprinting."
-            )
+            normalized[key] = _canonicalize(value)
 
 
 def _extract_reasoning_effort(value: Any) -> Any:
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index 3ef6ffed..0070fb0c 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -35,6 +35,14 @@
 )
 from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, EnvironmentExportConfig
 from medarc_verifiers.cli._single_run import run_single_mode
+from medarc_verifiers.cli.bench_index import (
+    BENCH_INDEX_FILENAME,
+    build_bench_index,
+    find_entry_for_results_path,
+    read_bench_index,
+    validate_bench_index,
+    write_bench_index,
+)
 from medarc_verifiers.cli.eval_identity import EvalPathPlan, generate_variant_id, plan_eval_paths
 from medarc_verifiers.cli.eval_identity import metadata_identity_fields
 from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub
@@ -1267,12 +1275,19 @@ def _run_toml_bench(args: argparse.Namespace) -> int:
     plan_inputs = [_eval_config_identity_payload(config) for config in eval_configs]
     output_root = _resolve_toml_output_root(eval_configs, args)
     path_plans = plan_eval_paths(plan_inputs, output_root=output_root)
-    eval_configs, path_plans = _select_toml_plan(eval_configs, path_plans, args)
+    eval_configs, path_plans, plan_inputs = _select_toml_plan(eval_configs, path_plans, plan_inputs, args)
+    bench_index = build_bench_index(
+        output_root=output_root,
+        source_config=config_path,
+        eval_configs=eval_configs,
+        path_plans=path_plans,
+        plan_payloads=plan_inputs,
+    )
 
     _print_toml_bench_plan(eval_configs, path_plans, dry_run=bool(args.dry_run))
     if args.dry_run:
         return 0
-    return _execute_toml_plan(eval_configs, path_plans, args)
+    return _execute_toml_plan(eval_configs, path_plans, plan_inputs, bench_index, output_root, args)
 
 
 def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]:
@@ -1307,9 +1322,10 @@ def _resolve_toml_output_root(eval_configs: Sequence[Any], args: argparse.Namesp
 def _select_toml_plan(
     eval_configs: Sequence[Any],
     path_plans: Sequence[EvalPathPlan],
+    plan_inputs: Sequence[Mapping[str, Any]],
     args: argparse.Namespace,
-) -> tuple[list[Any], list[EvalPathPlan]]:
-    indexed = list(zip(eval_configs, path_plans))
+) -> tuple[list[Any], list[EvalPathPlan], list[Mapping[str, Any]]]:
+    indexed = list(zip(eval_configs, path_plans, plan_inputs))
     if args.eval_index is not None:
         start = args.eval_index - 1
         indexed = indexed[start : start + 1]
@@ -1320,23 +1336,49 @@ def _select_toml_plan(
             indexed = indexed[: args.stop_after - (args.start_at or 1) + 1]
     if not indexed:
         raise ValueError("No TOML evals matched the requested selection.")
-    selected_configs, selected_paths = zip(*indexed)
-    return list(selected_configs), list(selected_paths)
+    selected_configs, selected_paths, selected_plan_inputs = zip(*indexed)
+    return list(selected_configs), list(selected_paths), list(selected_plan_inputs)
 
 
 def _execute_toml_plan(
-    eval_configs: Sequence[Any], path_plans: Sequence[EvalPathPlan], args: argparse.Namespace
+    eval_configs: Sequence[Any],
+    path_plans: Sequence[EvalPathPlan],
+    plan_inputs: Sequence[Mapping[str, Any]],
+    bench_index: Mapping[str, Any],
+    output_root: Path,
+    args: argparse.Namespace,
 ) -> int:
     failures = 0
-    for index, (config, path_plan) in enumerate(zip(eval_configs, path_plans), start=1):
+    bench_index_path = output_root / BENCH_INDEX_FILENAME
+    existing_bench_index = _validate_existing_bench_index(
+        bench_index_path, bench_index, output_root=output_root, force=bool(args.force)
+    )
+    effective_bench_index = _merge_bench_index(existing_bench_index, bench_index, output_root=output_root)
+    write_bench_index(bench_index_path, effective_bench_index)
+    for index, (config, path_plan, _plan_input) in enumerate(zip(eval_configs, path_plans, plan_inputs), start=1):
         metadata_fields = metadata_identity_fields(_eval_config_identity_payload(config), path_plan.identity)
         results_path = path_plan.results_path
+        sidecar_entry = find_entry_for_results_path(effective_bench_index, results_path)
+        if sidecar_entry is None:
+            raise ValueError(f"Internal bench planning error: missing {BENCH_INDEX_FILENAME} entry for {results_path}.")
         try:
-            _prepare_toml_results_dir(results_path, metadata_fields, config, force=bool(args.force))
+            _prepare_toml_results_dir(
+                results_path,
+                metadata_fields,
+                config,
+                sidecar_entry=sidecar_entry,
+                output_root=output_root,
+                force=bool(args.force),
+            )
             run_config = config.model_copy(update={"resume_path": results_path, "save_results": True})
             logger.info("Running TOML eval %d/%d: %s on %s", index, len(eval_configs), config.env_id, config.model)
             asyncio.run(_run_one_toml_eval(run_config, results_path, metadata_fields))
             _merge_metadata_fields(results_path, metadata_fields)
+            validate_bench_index(
+                {"version": 1, "evals": [dict(sidecar_entry)]},
+                output_root=output_root,
+                require_artifacts=True,
+            )
         except Exception as exc:  # noqa: BLE001
             failures += 1
             logger.exception("TOML eval %d failed: %s", index, exc)
@@ -1346,6 +1388,8 @@ def _execute_toml_plan(
             import time
 
             time.sleep(float(args.sleep))
+    if failures == 0:
+        validate_bench_index(effective_bench_index, output_root=output_root, require_artifacts=True)
     return 1 if failures else 0
 
 
@@ -1374,6 +1418,8 @@ def _prepare_toml_results_dir(
     metadata_fields: Mapping[str, Any],
     config: Any,
     *,
+    sidecar_entry: Mapping[str, Any],
+    output_root: Path,
     force: bool,
 ) -> None:
     if results_path.exists() and force:
@@ -1383,6 +1429,7 @@ def _prepare_toml_results_dir(
     results_file = results_path / "results.jsonl"
     has_existing_state = metadata_path.exists() or results_file.exists()
     if has_existing_state:
+        _validate_toml_resume_sidecar(results_path, sidecar_entry, output_root=output_root)
         _validate_toml_resume_metadata(results_path, metadata_fields)
 
     results_path.mkdir(parents=True, exist_ok=True)
@@ -1396,6 +1443,98 @@ def _prepare_toml_results_dir(
     _write_json(metadata_path, metadata)
 
 
+def _validate_existing_bench_index(
+    bench_index_path: Path,
+    planned_index: Mapping[str, Any],
+    *,
+    output_root: Path,
+    force: bool,
+) -> Mapping[str, Any] | None:
+    """Validate the saved bench index and check it against the planned evals.
+
+    Returns the saved index, or ``None`` when ``read_bench_index`` yields none. Unless
+    ``force`` is set, raises ``ValueError`` when a planned results path already holds
+    ``metadata.json``/``results.jsonl`` without a saved entry, or when the saved entry's
+    ``plan_digest`` differs from the planned one.
+    """
+    saved_index = read_bench_index(bench_index_path)
+    if saved_index is not None:
+        validate_bench_index(saved_index, output_root=output_root, require_artifacts=False)
+    if force:
+        return saved_index
+    for planned in planned_index.get("evals", []):
+        if not isinstance(planned, Mapping):
+            continue
+        results_path = Path(str(planned.get("results_path", "")))
+        saved = None if saved_index is None else find_entry_for_results_path(saved_index, results_path)
+        if saved is None:
+            if not ((results_path / "metadata.json").exists() or (results_path / "results.jsonl").exists()):
+                continue
+            if saved_index is None:
+                raise ValueError(
+                    f"Cannot reuse deterministic bench path {results_path}: {BENCH_INDEX_FILENAME} is missing. "
+                    "Use --force to archive and rerun."
+                )
+            raise ValueError(
+                f"Cannot reuse deterministic bench path {results_path}: {BENCH_INDEX_FILENAME} has no entry "
+                "for that path. Use --force to archive and rerun."
+            )
+        if saved.get("plan_digest") != planned.get("plan_digest"):
+            raise ValueError(
+                f"Cannot reuse deterministic bench path {results_path}: {BENCH_INDEX_FILENAME} plan_digest mismatch "
+                f"(saved={saved.get('plan_digest')!r}, current={planned.get('plan_digest')!r}). "
+                "Use --force to archive and rerun."
+            )
+    return saved_index
+
+
+def _merge_bench_index(
+    existing_index: Mapping[str, Any] | None,
+    planned_index: Mapping[str, Any],
+    *,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Overlay planned entries on the saved index, matching them by resolved ``results_path``.
+
+    Saved entries keep their position and are replaced in place when re-planned; planned
+    entries for new paths are appended. ``index`` is renumbered from 1 and the merged
+    index is validated (artifacts not required) before it is returned.
+    """
+    if existing_index is None:
+        return dict(planned_index)
+
+    def _keyed_entries(bench_index: Mapping[str, Any]) -> list[tuple[Path, dict[str, Any]]]:
+        return [
+            (Path(str(item["results_path"])).resolve(), dict(item))
+            for item in bench_index.get("evals", [])
+            if isinstance(item, Mapping) and item.get("results_path")
+        ]
+
+    replacements = dict(_keyed_entries(planned_index))
+    saved_entries = _keyed_entries(existing_index)
+    saved_paths = {path for path, _ in saved_entries}
+    combined = [replacements.get(path, saved) for path, saved in saved_entries]
+    combined.extend(entry for path, entry in replacements.items() if path not in saved_paths)
+    for position, entry in enumerate(combined, start=1):
+        entry["index"] = position
+    merged = {**planned_index, "evals": combined}
+    validate_bench_index(merged, output_root=output_root, require_artifacts=False)
+    return merged
+
+
+def _validate_toml_resume_sidecar(
+    results_path: Path,
+    sidecar_entry: Mapping[str, Any],
+    *,
+    output_root: Path,
+) -> None:
+    """Validate ``sidecar_entry`` on its own by wrapping it in a one-entry bench index."""
+    # NOTE: ``results_path`` is not consulted here; only the entry is checked, and
+    # artifacts are not required (``require_artifacts=False``).
+    single_entry_index = {"version": 1, "evals": [dict(sidecar_entry)]}
+    validate_bench_index(single_entry_index, output_root=output_root, require_artifacts=False)
+
+
 def _validate_toml_resume_metadata(results_path: Path, metadata_fields: Mapping[str, Any]) -> None:
     metadata_path = results_path / "metadata.json"
     if not metadata_path.exists():
diff --git a/tests/test_cli/test_bench_index.py b/tests/test_cli/test_bench_index.py
new file mode 100644
index 00000000..277a5c8a
--- /dev/null
+++ b/tests/test_cli/test_bench_index.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from medarc_verifiers.cli.bench_index import BenchIndexError, validate_bench_index
+
+
+def _write_eval(path: Path, *, model: str = "gpt-5-mini", env_id: str = "medqa") -> None:
+    path.mkdir(parents=True, exist_ok=True)  # minimal on-disk eval output: identity metadata plus one row
+    path.joinpath("metadata.json").write_text(json.dumps({"model": model, "env_id": env_id}), encoding="utf-8")
+    path.joinpath("results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n", encoding="utf-8")
+
+
+def _index_entry(path: Path, *, model: str = "gpt-5-mini", env_id: str = "medqa", variant_id: str | None = None):
+    return {  # one bench-index entry pointing at *path*; callers wrap it in {"version": 1, "evals": [...]}
+        "index": 1,
+        "results_path": str(path),
+        "model": model,
+        "env_id": env_id,
+        "variant_id": variant_id,
+        "variant_payload": None,
+        "env_args": {},
+        "sampling_args": {"unknown_provider_arg": True},  # exercised by the "accepts unknown sampling args" test
+        "num_examples": 1,
+        "rollouts_per_example": 1,
+        "plan_digest": "sha256:test",  # placeholder value, not a computed digest
+    }
+
+
+def test_validate_bench_index_accepts_unknown_sampling_args(tmp_path: Path) -> None:
+    results_path = tmp_path / "gpt-5-mini" / "medqa"
+    _write_eval(results_path)
+    # _index_entry carries "unknown_provider_arg" in sampling_args; the call below must simply not raise.
+    validate_bench_index(
+        {"version": 1, "evals": [_index_entry(results_path)]},
+        output_root=tmp_path,
+        require_artifacts=True,
+    )
+
+
+def test_validate_bench_index_rejects_stale_results_path(tmp_path: Path) -> None:
+    with pytest.raises(BenchIndexError, match="required artifact is missing"):  # nothing was written to disk
+        validate_bench_index(
+            {"version": 1, "evals": [_index_entry(tmp_path / "missing" / "medqa")]},
+            output_root=tmp_path,
+            require_artifacts=True,
+        )
+
+
+def test_validate_bench_index_rejects_duplicate_results_path(tmp_path: Path) -> None:
+    results_path = tmp_path / "gpt-5-mini" / "medqa"
+    _write_eval(results_path)
+    # The same results_path is listed twice; it must be rejected even though its artifacts exist.
+    with pytest.raises(BenchIndexError, match="duplicate results_path"):
+        validate_bench_index(
+            {"version": 1, "evals": [_index_entry(results_path), _index_entry(results_path)]},
+            output_root=tmp_path,
+            require_artifacts=True,
+        )
+
+
+def test_validate_bench_index_rejects_metadata_identity_mismatch(tmp_path: Path) -> None:
+    results_path = tmp_path / "gpt-5-mini" / "medqa"
+    _write_eval(results_path, model="other-model")
+    # metadata.json says "other-model" while the index entry (default model) says "gpt-5-mini".
+    with pytest.raises(BenchIndexError, match="identity mismatch"):
+        validate_bench_index(
+            {"version": 1, "evals": [_index_entry(results_path)]},
+            output_root=tmp_path,
+            require_artifacts=True,
+        )
+
+
+def test_validate_bench_index_rejects_duplicate_model_env_without_variant(tmp_path: Path) -> None:
+    first = tmp_path / "gpt-5-mini" / "medqa" / "first"
+    second = tmp_path / "gpt-5-mini" / "medqa" / "second"
+    _write_eval(first)
+    _write_eval(second)
+    # Two distinct paths share model and env_id, and both entries leave variant_id as None.
+    with pytest.raises(BenchIndexError, match="require explicit variant_id"):
+        validate_bench_index(
+            {"version": 1, "evals": [_index_entry(first), _index_entry(second)]},
+            output_root=tmp_path,
+            require_artifacts=True,
+        )
diff --git a/tests/test_cli/test_eval_identity.py b/tests/test_cli/test_eval_identity.py
index 4700b89e..1d863cf7 100644
--- a/tests/test_cli/test_eval_identity.py
+++ b/tests/test_cli/test_eval_identity.py
@@ -9,7 +9,6 @@
     MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
     MEDARC_VARIANT_ID_KEY,
     MEDARC_VARIANT_PAYLOAD_KEY,
-    UnclassifiedSamplingArgError,
     build_fingerprint_payload,
     config_fingerprint,
     generate_variant_id,
@@ -273,12 +272,9 @@ def test_extra_body_semantic_args_match_top_level_shape() -> None:
     )
 
 
-def test_unclassified_sampling_args_refuse_fingerprint() -> None:
-    with pytest.raises(UnclassifiedSamplingArgError, match="vendor_knob"):
-        normalize_semantic_sampling_args({"vendor_knob": True})
-
-    with pytest.raises(UnclassifiedSamplingArgError, match="extra_body.vendor_knob"):
-        normalize_semantic_sampling_args({"extra_body": {"vendor_knob": True}})
+def test_unknown_sampling_args_pass_through_fingerprint() -> None:  # unknown keys are kept as-is, not rejected
+    assert normalize_semantic_sampling_args({"vendor_knob": True}) == {"vendor_knob": True}
+    assert normalize_semantic_sampling_args({"extra_body": {"vendor_knob": True}}) == {"vendor_knob": True}
 
 
 def test_endpoint_alias_without_resolved_model_is_rejected() -> None:
diff --git a/tests/test_cli/test_toml_bench_index.py b/tests/test_cli/test_toml_bench_index.py
new file mode 100644
index 00000000..f0e74dad
--- /dev/null
+++ b/tests/test_cli/test_toml_bench_index.py
@@ -0,0 +1,210 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from textwrap import dedent
+
+import pytest
+
+from medarc_verifiers.cli import main
+
+
+def _write_config(path: Path, text: str) -> None:
+    path.write_text(dedent(text).strip(), encoding="utf-8")  # lets callers pass indented triple-quoted TOML
+
+
+def test_toml_bench_writes_bench_index(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        num_examples = 1
+        rollouts_per_example = 1
+        """,
+    )
+    # Stand-in for main.run_evaluation: writes a single row to results.jsonl under config.resume_path.
+    async def fake_run(config, **_kwargs):
+        results_path = Path(config.resume_path)
+        (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0", "reward": 1.0}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+
+    results_path = output_dir / "gpt-5-mini" / "medqa"
+    bench_index = json.loads((output_dir / "bench_index.json").read_text())
+    assert bench_index["version"] == 1
+    assert bench_index["source_config"] == str(config_path)
+    assert bench_index["evals"][0]["results_path"] == str(results_path)
+    assert bench_index["evals"][0]["plan_digest"].startswith("sha256:")
+
+
+def test_toml_bench_refuses_existing_output_without_bench_index(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    results_path = output_dir / "gpt-5-mini" / "medqa"
+    results_path.mkdir(parents=True)
+    (results_path / "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
+    # Leftover metadata.json exists but output_dir has no bench_index.json, so fake_run must never be reached.
+    async def fake_run(config, **_kwargs):  # noqa: ARG001
+        raise AssertionError("bench should fail before execution")
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+
+
+def test_toml_bench_force_archives_existing_output_without_bench_index(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    results_path = output_dir / "gpt-5-mini" / "medqa"
+    results_path.mkdir(parents=True)
+    (results_path / "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
+    (results_path / "sentinel.txt").write_text("old")
+
+    async def fake_run(config, **_kwargs):
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+    # With --force the run succeeds, sentinel.txt is gone from results_path, and a medqa__old_* sibling exists.
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--force"]) == 0
+    assert not (results_path / "sentinel.txt").exists()
+    assert list((output_dir / "gpt-5-mini").glob("medqa__old_*"))
+
+
+def test_toml_bench_refuses_existing_output_missing_from_bench_index(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    results_path = output_dir / "gpt-5-mini" / "medqa"
+    results_path.mkdir(parents=True)
+    (results_path / "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
+    (output_dir / "bench_index.json").write_text(
+        json.dumps(
+            {
+                "version": 1,
+                "evals": [
+                    {
+                        "index": 1,
+                        "results_path": str(output_dir / "other-model" / "medqa"),
+                        "model": "other-model",
+                        "env_id": "medqa",
+                        "variant_id": None,
+                        "variant_payload": None,
+                        "env_args": {},
+                        "sampling_args": {},
+                        "num_examples": 1,
+                        "rollouts_per_example": 1,
+                        "plan_digest": "sha256:old",
+                    }
+                ],
+            }
+        )
+    )
+    # The saved index only lists other-model/medqa, so the existing gpt-5-mini/medqa output has no entry.
+    async def fake_run(config, **_kwargs):  # noqa: ARG001
+        raise AssertionError("bench should fail before execution")
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+
+
+def test_toml_bench_refuses_stale_metadata_even_when_bench_index_matches(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+
+    async def fake_run(config, **_kwargs):
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+    # Overwrite only the stored fingerprint in metadata.json; bench_index.json from the first run is untouched.
+    metadata_path = output_dir / "gpt-5-mini" / "medqa" / "metadata.json"
+    metadata = json.loads(metadata_path.read_text())
+    metadata["medarc_config_fingerprint"] = "stale"
+    metadata_path.write_text(json.dumps(metadata))
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+
+
+def test_toml_bench_selected_runs_merge_bench_index(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+
+        [[eval]]
+        env_id = "pubmedqa"
+        """,
+    )
+
+    async def fake_run(config, **_kwargs):
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+    # Run each eval on its own via --eval-index; the index must end up listing both, in config order.
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--eval-index", "1"]) == 0
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--eval-index", "2"]) == 0
+
+    bench_index = json.loads((output_dir / "bench_index.json").read_text())
+    assert [entry["env_id"] for entry in bench_index["evals"]] == ["medqa", "pubmedqa"]
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0

From 8114ed176f2e844a4e26d994ad65a0757994fb44 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 16:46:45 +0000
Subject: [PATCH 17/53] Add bench index process discovery

---
 medarc_verifiers/cli/process/discovery.py | 101 ++++++++++++++++++++++
 medarc_verifiers/cli/process/metadata.py  |  10 ++-
 tests/test_cli/test_process_discovery.py  |  72 +++++++++++++++
 3 files changed, 181 insertions(+), 2 deletions(-)

diff --git a/medarc_verifiers/cli/process/discovery.py b/medarc_verifiers/cli/process/discovery.py
index 3421d0df..d6ae1c8a 100644
--- a/medarc_verifiers/cli/process/discovery.py
+++ b/medarc_verifiers/cli/process/discovery.py
@@ -11,6 +11,7 @@
 
 from pydantic import ValidationError
 
+from medarc_verifiers.cli.bench_index import BENCH_INDEX_FILENAME, read_bench_index, validate_bench_index
 from medarc_verifiers.cli.eval_identity import (
     MEDARC_CONFIG_FINGERPRINT_KEY,
     MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
@@ -370,6 +371,11 @@ def _candidate_evals_roots(runs_path: Path) -> tuple[Path, ...]:
 
 def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
     """Yield synthetic run records for upstream eval output directories."""
+    bench_index_path = evals_root / BENCH_INDEX_FILENAME
+    if bench_index_path.exists():
+        yield from _iter_bench_index_records(evals_root, bench_index_path)
+        return
+
     try:
         results_paths = sorted(evals_root.rglob(RESULTS_FILENAME))
     except OSError as exc:  # noqa: FBT003
@@ -391,6 +397,101 @@ def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
             yield record
 
 
+def _iter_bench_index_records(evals_root: Path, bench_index_path: Path) -> Iterator[RunRecord]:
+    """Validate the bench index (artifacts required) and yield a RunRecord per usable entry."""
+    index_payload = read_bench_index(bench_index_path)
+    if index_payload is None:
+        return
+    validate_bench_index(index_payload, output_root=evals_root, require_artifacts=True)
+    listed = index_payload.get("evals", [])
+    if not isinstance(listed, list):
+        return
+    config_source = _string_or_none(index_payload.get("source_config"))
+    for item in listed:
+        if isinstance(item, Mapping):
+            built = _build_bench_index_record(evals_root, bench_index_path, item, source_config=config_source)
+            if built is not None:
+                yield built
+
+
+def _build_bench_index_record(
+    evals_root: Path,
+    bench_index_path: Path,
+    entry: Mapping[str, Any],
+    *,
+    source_config: str | None,
+) -> RunRecord | None:
+    results_dir = Path(str(entry["results_path"]))
+    metadata_path = results_dir / METADATA_FILENAME
+    metadata_payload = _read_metadata_payload(metadata_path)
+    if metadata_payload is None:
+        return None
+    # model/env_id/variant come from the index entry; metadata.json only fills fields the entry leaves empty.
+    model_id = str(entry["model"])
+    env_id = str(entry["env_id"])
+    variant_id = _string_or_none(entry.get("variant_id"))
+    variant_payload = entry.get("variant_payload") if isinstance(entry.get("variant_payload"), Mapping) else None
+    updated_at = _path_timestamp(metadata_path)
+    job_run_id = "::".join(part for part in (model_id, env_id, variant_id) if part)
+    env_args = _mapping_or_empty(entry.get("env_args")) or _mapping_or_empty(metadata_payload.get("env_args"))
+    sampling_args = _mapping_or_empty(entry.get("sampling_args")) or _mapping_or_empty(
+        metadata_payload.get("sampling_args")
+    )
+    plan_digest = _string_or_none(entry.get("plan_digest"))
+    # Synthetic manifest: bench_index.json is used as the manifest path and reports one completed job of one.
+    manifest = RunManifestInfo(
+        job_run_id=job_run_id,
+        run_name=job_run_id,
+        summary_completed=1,
+        summary_total=1,
+        summary_total_known=True,
+        manifest_path=bench_index_path,
+        run_dir=evals_root,
+        created_at=updated_at,
+        updated_at=updated_at,
+        config_source=source_config,
+        config_checksum=plan_digest,
+        run_summary_path=results_dir / "summary.json",
+        models={model_id: {"sampling_args": sampling_args}},
+        env_templates={env_id: {"module": env_id}},
+    )
+    # has_results and status are hard-coded below; only metadata.json was actually read in this function.
+    return RunRecord(
+        manifest=manifest,
+        job_id=results_dir.name,
+        model_id=model_id,
+        manifest_env_id=env_id,
+        results_dir_name=results_dir.name,
+        results_dir=results_dir,
+        metadata_path=metadata_path,
+        results_path=results_dir / RESULTS_FILENAME,
+        summary_path=results_dir / "summary.json",
+        has_metadata=True,
+        has_results=True,
+        has_summary=(results_dir / "summary.json").exists(),
+        status="completed",
+        duration_seconds=None,
+        reason=None,
+        started_at=None,
+        ended_at=None,
+        avg_reward=_float_or_none(metadata_payload.get("avg_reward")),
+        num_examples=_int_or_none(entry.get("num_examples")) or _int_or_none(metadata_payload.get("num_examples")),
+        rollouts_per_example=_int_or_none(entry.get("rollouts_per_example"))
+        or _int_or_none(metadata_payload.get("rollouts_per_example")),
+        row_count=_count_results_rows(results_dir / RESULTS_FILENAME),
+        env_args=env_args,
+        sampling_args=sampling_args,
+        env_config={
+            "id": env_id,
+            "module": env_id,
+            "variant_id": variant_id,
+            "variant_payload": variant_payload,
+            "plan_digest": plan_digest,
+        },
+        model_config={"sampling_args": sampling_args},
+    )
+
+
 def _build_eval_output_record(evals_root: Path, results_dir: Path) -> RunRecord | None:
     metadata_path = results_dir / METADATA_FILENAME
     metadata_payload = _read_metadata_payload(metadata_path)
diff --git a/medarc_verifiers/cli/process/metadata.py b/medarc_verifiers/cli/process/metadata.py
index 770acda4..9429fc62 100644
--- a/medarc_verifiers/cli/process/metadata.py
+++ b/medarc_verifiers/cli/process/metadata.py
@@ -202,6 +202,8 @@ def _resolve_metadata_context(
         alt_index = extract_rollout_index(record.results_dir_name)
         if alt_index:
             rollout_index = alt_index
+    record_variant_id = _string_or_none(record.env_config.get("variant_id") if record.env_config else None)
+    record_variant_payload = _mapping_or_none(record.env_config.get("variant_payload") if record.env_config else None)
     return _ResolvedMetadataContext(
         raw_metadata=raw_metadata,
         manifest_env_id=manifest_env_id,
@@ -221,13 +223,17 @@ def _resolve_metadata_context(
             metadata_payload.rollouts_per_example if metadata_payload else None,
         ),
         variant_id=_string_or_none(
-            _raw_metadata_value(raw_metadata, MEDARC_VARIANT_ID_KEY, metadata_payload.variant_id if metadata_payload else None)
+            _raw_metadata_value(
+                raw_metadata,
+                MEDARC_VARIANT_ID_KEY,
+                (metadata_payload.variant_id if metadata_payload else None) or record_variant_id,
+            )
         ),
         variant_payload=_mapping_or_none(
             _raw_metadata_value(
                 raw_metadata,
                 MEDARC_VARIANT_PAYLOAD_KEY,
-                metadata_payload.variant_payload if metadata_payload else None,
+                (metadata_payload.variant_payload if metadata_payload else None) or record_variant_payload,
             )
         ),
         medarc_config_fingerprint=_string_or_none(
diff --git a/tests/test_cli/test_process_discovery.py b/tests/test_cli/test_process_discovery.py
index 5ff8c70d..4f8260a9 100644
--- a/tests/test_cli/test_process_discovery.py
+++ b/tests/test_cli/test_process_discovery.py
@@ -308,6 +308,78 @@ def test_discover_run_records_includes_deterministic_eval_variants(tmp_path: Pat
     assert normalized.medarc_config_fingerprint_payload == {"env_id": "medqa"}
 
 
+def test_discover_run_records_prefers_bench_index_identity(tmp_path: Path) -> None:
+    evals_root = tmp_path / "runs" / "evals"
+    eval_dir = evals_root / "gpt-5-mini" / "medqa" / "seed-1618"
+    _write_eval_output(eval_dir, {"env_id": "medqa", "model": "gpt-5-mini"})
+    _write_json(
+        evals_root / "bench_index.json",
+        {
+            "version": 1,
+            "source_config": "configs/eval/example.toml",
+            "evals": [
+                {
+                    "index": 1,
+                    "results_path": str(eval_dir),
+                    "env_id": "medqa",
+                    "model": "gpt-5-mini",
+                    "variant_id": "seed-1618",
+                    "variant_payload": {"env_args": {"shuffle_seed": 1618}},
+                    "env_args": {"shuffle_seed": 1618},
+                    "sampling_args": {"temperature": 0},
+                    "num_examples": 1,
+                    "rollouts_per_example": 1,
+                    "plan_digest": "sha256:abc",
+                }
+            ],
+        },
+    )
+
+    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+
+    assert len(records) == 1
+    record = records[0]
+    assert record.manifest.manifest_path == evals_root / "bench_index.json"
+    assert record.manifest.config_source == "configs/eval/example.toml"
+    assert record.manifest.config_checksum == "sha256:abc"
+    assert record.env_args == {"shuffle_seed": 1618}
+    normalized = load_normalized_metadata(record)
+    assert normalized.variant_id == "seed-1618"
+    assert normalized.variant_payload == {"env_args": {"shuffle_seed": 1618}}
+
+
+def test_discover_run_records_bench_index_rejects_missing_artifacts(tmp_path: Path) -> None:
+    evals_root = tmp_path / "runs" / "evals"
+    _write_json(
+        evals_root / "bench_index.json",
+        {
+            "version": 1,
+            "evals": [
+                {
+                    "index": 1,
+                    "results_path": str(evals_root / "gpt-5-mini" / "medqa"),
+                    "env_id": "medqa",
+                    "model": "gpt-5-mini",
+                    "variant_id": None,
+                    "variant_payload": None,
+                    "env_args": {},
+                    "sampling_args": {},
+                    "num_examples": 1,
+                    "rollouts_per_example": 1,
+                    "plan_digest": "sha256:abc",
+                }
+            ],
+        },
+    )
+
+    try:
+        discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+    except ValueError as exc:
+        assert "required artifact is missing" in str(exc)
+    else:
+        raise AssertionError("bench_index with missing artifacts should fail validation")
+
+
 def test_discover_run_records_includes_direct_upstream_uuid_outputs(tmp_path: Path) -> None:
     upstream_dir = tmp_path / "runs" / "evals" / "medqa--gpt-5-mini" / "016f4b4a-92a4-4a5b-a7c1-853af3318c52"
     _write_eval_output(upstream_dir)

From 4a7b94ae72a0bcba3f1e9d6561e35cebb66d29c9 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 17:01:18 +0000
Subject: [PATCH 18/53] Remove bench metadata monkey patch

---
 medarc_verifiers/cli/eval_identity.py   |  60 ++-----------
 medarc_verifiers/cli/main.py            |  87 ++++++++++---------
 tests/test_cli/test_eval_identity.py    |  18 +++-
 tests/test_cli/test_main.py             |  13 +--
 tests/test_cli/test_toml_bench_index.py | 110 ++++++++++++++++++++++++
 5 files changed, 185 insertions(+), 103 deletions(-)

diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py
index cc185ead..f2db9330 100644
--- a/medarc_verifiers/cli/eval_identity.py
+++ b/medarc_verifiers/cli/eval_identity.py
@@ -20,48 +20,6 @@
 _MAX_SEGMENT_LENGTH = 80
 _MAX_VARIANT_ID_LENGTH = 160
 
-_SEMANTIC_SAMPLING_KEYS = {
-    "frequency_penalty",
-    "logit_bias",
-    "max_completion_tokens",
-    "max_tokens",
-    "min_p",
-    "n",
-    "presence_penalty",
-    "repetition_penalty",
-    "response_format",
-    "seed",
-    "stop",
-    "temperature",
-    "tool_choice",
-    "tools",
-    "top_k",
-    "top_p",
-}
-_EXCLUDED_SAMPLING_KEYS = {
-    "api_base_url",
-    "api_key",
-    "api_key_var",
-    "base_url",
-    "extra_headers",
-    "headers",
-    "max_retries",
-    "metadata",
-    "request_timeout",
-    "stream",
-    "timeout",
-}
-_EXCLUDED_EXTRA_BODY_KEYS = {
-    "metadata",
-    "provider",
-    "usage",
-}
-
-
-class UnclassifiedSamplingArgError(ValueError):
-    """Raised when fingerprinting sees a sampling arg without a policy."""
-
-
 @dataclass(frozen=True)
 class EvalIdentity:
     """Resolved model/env identity plus optional variant metadata."""
@@ -191,7 +149,7 @@ def config_fingerprint(config: Mapping[str, Any]) -> str:
 
 
 def normalize_semantic_sampling_args(sampling_args: Mapping[str, Any] | None) -> dict[str, Any]:
-    """Normalize provider-independent generation semantics for fingerprinting."""
+    """Canonicalize generation arguments for fingerprinting."""
 
     if not sampling_args:
         return {}
@@ -206,10 +164,8 @@ def normalize_semantic_sampling_args(sampling_args: Mapping[str, Any] | None) ->
             effort = _extract_reasoning_effort(value)
             if effort is not None:
                 normalized["reasoning_effort"] = _canonicalize(effort)
-        elif key in _SEMANTIC_SAMPLING_KEYS:
-            normalized[key] = _canonicalize(value)
-        elif key in _EXCLUDED_SAMPLING_KEYS:
-            continue
+            else:
+                normalized[key] = _canonicalize(value)
         else:
             normalized[key] = _canonicalize(value)
 
@@ -320,17 +276,16 @@ def _variant_value_text(value: Any) -> str:
 
 def _merge_extra_body_semantics(normalized: dict[str, Any], extra_body: Any) -> None:
     if not isinstance(extra_body, Mapping):
-        raise UnclassifiedSamplingArgError("sampling_args.extra_body must be a mapping for resume fingerprinting.")
+        normalized["extra_body"] = _canonicalize(extra_body)
+        return
 
     for key, value in extra_body.items():
         if key == "reasoning":
             effort = _extract_reasoning_effort(value)
             if effort is not None:
                 normalized["reasoning_effort"] = _canonicalize(effort)
-        elif key in _SEMANTIC_SAMPLING_KEYS:
-            normalized[key] = _canonicalize(value)
-        elif key in _EXCLUDED_EXTRA_BODY_KEYS or key in _EXCLUDED_SAMPLING_KEYS:
-            continue
+            else:
+                normalized[key] = _canonicalize(value)
         else:
             normalized[key] = _canonicalize(value)
 
@@ -380,7 +335,6 @@ def _canonicalize(value: Any) -> Any:
     "MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY",
     "MEDARC_VARIANT_ID_KEY",
     "MEDARC_VARIANT_PAYLOAD_KEY",
-    "UnclassifiedSamplingArgError",
     "build_fingerprint_payload",
     "config_fingerprint",
     "extract_variant_payload",
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index 0070fb0c..cb786f1e 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -12,7 +12,7 @@
 from datetime import UTC, datetime
 from pathlib import Path
 from textwrap import dedent
-from typing import Any, Literal, Mapping, MutableMapping, Sequence
+from typing import Any, Literal, Mapping, Sequence
 
 import yaml
 from pydantic import ValidationError
@@ -1354,7 +1354,18 @@ def _execute_toml_plan(
         bench_index_path, bench_index, output_root=output_root, force=bool(args.force)
     )
     effective_bench_index = _merge_bench_index(existing_bench_index, bench_index, output_root=output_root)
-    write_bench_index(bench_index_path, effective_bench_index)
+    persisted_base_index = existing_bench_index
+    if args.force and existing_bench_index is not None:
+        persisted_base_index = _bench_index_without_paths(
+            existing_bench_index,
+            [path_plan.results_path for path_plan in path_plans],
+        )
+    persisted_bench_index = _merge_bench_index(
+        persisted_base_index,
+        _bench_index_with_entries(bench_index, []),
+        output_root=output_root,
+    )
+    write_bench_index(bench_index_path, persisted_bench_index)
     for index, (config, path_plan, _plan_input) in enumerate(zip(eval_configs, path_plans, plan_inputs), start=1):
         metadata_fields = metadata_identity_fields(_eval_config_identity_payload(config), path_plan.identity)
         results_path = path_plan.results_path
@@ -1372,13 +1383,19 @@ def _execute_toml_plan(
             )
             run_config = config.model_copy(update={"resume_path": results_path, "save_results": True})
             logger.info("Running TOML eval %d/%d: %s on %s", index, len(eval_configs), config.env_id, config.model)
-            asyncio.run(_run_one_toml_eval(run_config, results_path, metadata_fields))
+            asyncio.run(_run_one_toml_eval(run_config))
             _merge_metadata_fields(results_path, metadata_fields)
             validate_bench_index(
                 {"version": 1, "evals": [dict(sidecar_entry)]},
                 output_root=output_root,
                 require_artifacts=True,
             )
+            persisted_bench_index = _merge_bench_index(
+                persisted_bench_index,
+                _bench_index_with_entries(bench_index, [sidecar_entry]),
+                output_root=output_root,
+            )
+            write_bench_index(bench_index_path, persisted_bench_index)
         except Exception as exc:  # noqa: BLE001
             failures += 1
             logger.exception("TOML eval %d failed: %s", index, exc)
@@ -1389,28 +1406,12 @@ def _execute_toml_plan(
 
             time.sleep(float(args.sleep))
     if failures == 0:
-        validate_bench_index(effective_bench_index, output_root=output_root, require_artifacts=True)
+        validate_bench_index(persisted_bench_index, output_root=output_root, require_artifacts=True)
     return 1 if failures else 0
 
 
-async def _run_one_toml_eval(config: Any, results_path: Path, metadata_fields: Mapping[str, Any]) -> Any:
-    import verifiers.envs.environment as environment_module
-
-    def add_medarc_metadata(_all_outputs: Any, _new_outputs: Any, metadata: MutableMapping[str, Any]) -> None:
-        metadata.update(metadata_fields)
-
-    original_save_metadata = environment_module.save_metadata
-
-    def save_metadata_with_medarc_fields(metadata: MutableMapping[str, Any], result_path: Path) -> Any:
-        if Path(result_path) == results_path:
-            metadata.update(metadata_fields)
-        return original_save_metadata(metadata, result_path)
-
-    environment_module.save_metadata = save_metadata_with_medarc_fields
-    try:
-        return await run_evaluation(config, on_progress=add_medarc_metadata)
-    finally:
-        environment_module.save_metadata = original_save_metadata
+async def _run_one_toml_eval(config: Any) -> Any:
+    return await run_evaluation(config)
 
 
 def _prepare_toml_results_dir(
@@ -1432,15 +1433,11 @@ def _prepare_toml_results_dir(
         _validate_toml_resume_sidecar(results_path, sidecar_entry, output_root=output_root)
         _validate_toml_resume_metadata(results_path, metadata_fields)
 
-    results_path.mkdir(parents=True, exist_ok=True)
-    results_file.touch(exist_ok=True)
     if has_existing_state:
         _merge_metadata_fields(results_path, metadata_fields)
         return
 
-    metadata = _initial_toml_metadata(config)
-    metadata.update(metadata_fields)
-    _write_json(metadata_path, metadata)
+    results_path.mkdir(parents=True, exist_ok=True)
 
 
 def _validate_existing_bench_index(
@@ -1522,6 +1519,26 @@ def _merge_bench_index(
     return merged
 
 
+def _bench_index_with_entries(
+    bench_index: Mapping[str, Any], entries: Sequence[Mapping[str, Any]]
+) -> dict[str, Any]:
+    subset = dict(bench_index)
+    subset["evals"] = [dict(entry) for entry in entries]
+    return subset
+
+
+def _bench_index_without_paths(bench_index: Mapping[str, Any], paths: Sequence[Path]) -> dict[str, Any]:
+    excluded = {path.resolve() for path in paths}
+    entries = [
+        dict(entry)
+        for entry in bench_index.get("evals", [])
+        if isinstance(entry, Mapping)
+        and entry.get("results_path")
+        and Path(str(entry["results_path"])).resolve() not in excluded
+    ]
+    return _bench_index_with_entries(bench_index, entries)
+
+
 def _validate_toml_resume_sidecar(
     results_path: Path,
     sidecar_entry: Mapping[str, Any],
@@ -1552,22 +1569,6 @@ def _validate_toml_resume_metadata(results_path: Path, metadata_fields: Mapping[
         )
 
 
-def _initial_toml_metadata(config: Any) -> dict[str, Any]:
-    return {
-        "env_id": config.env_id,
-        "env_args": dict(config.env_args or {}),
-        "model": config.model,
-        "base_url": config.client_config.api_base_url,
-        "num_examples": config.num_examples,
-        "rollouts_per_example": config.rollouts_per_example,
-        "sampling_args": dict(config.sampling_args or {}),
-        "avg_reward": None,
-        "avg_metrics": {},
-        "avg_error": None,
-        "state_columns": list(config.state_columns or []),
-    }
-
-
 def _merge_metadata_fields(results_path: Path, metadata_fields: Mapping[str, Any]) -> None:
     metadata_path = results_path / "metadata.json"
     metadata = json.loads(metadata_path.read_text(encoding="utf-8")) if metadata_path.exists() else {}
diff --git a/tests/test_cli/test_eval_identity.py b/tests/test_cli/test_eval_identity.py
index 1d863cf7..210b0507 100644
--- a/tests/test_cli/test_eval_identity.py
+++ b/tests/test_cli/test_eval_identity.py
@@ -194,7 +194,6 @@ def test_fingerprint_changes_for_semantic_benchmark_changes(changed: dict[str, o
         {"max_concurrent": 1},
         {"max_retries": 5},
         {"headers": {"X-Prime-Team-ID": "team"}},
-        {"sampling_args": {"temperature": 0.2, "extra_body": {"usage": {"include": True}}}},
     ],
 )
 def test_fingerprint_ignores_provider_transport_and_runtime_changes(changed: dict[str, object]) -> None:
@@ -275,6 +274,23 @@ def test_extra_body_semantic_args_match_top_level_shape() -> None:
 def test_unknown_sampling_args_pass_through_fingerprint() -> None:
     assert normalize_semantic_sampling_args({"vendor_knob": True}) == {"vendor_knob": True}
     assert normalize_semantic_sampling_args({"extra_body": {"vendor_knob": True}}) == {"vendor_knob": True}
+    assert normalize_semantic_sampling_args({"extra_body": "provider-default"}) == {"extra_body": "provider-default"}
+
+
+def test_sampling_extra_body_arguments_are_part_of_fingerprint() -> None:
+    base = {
+        "env_id": "medqa",
+        "model": "gpt-5-mini",
+        "sampling_args": {"temperature": 0.2},
+        "num_examples": 10,
+        "rollouts_per_example": 1,
+    }
+    with_usage = {
+        **base,
+        "sampling_args": {"temperature": 0.2, "extra_body": {"usage": {"include": True}}},
+    }
+
+    assert config_fingerprint(base) != config_fingerprint(with_usage)
 
 
 def test_endpoint_alias_without_resolved_model_is_rejected() -> None:
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index cbfa5ea7..cf863b8f 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -359,6 +359,7 @@ def test_toml_bench_defaults_max_concurrent_to_one(monkeypatch: pytest.MonkeyPat
 
     async def fake_run(config, **_kwargs):
         captured.append(config.max_concurrent)
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
         return {"outputs": [], "metadata": {}}
 
     monkeypatch.setattr(main, "run_evaluation", fake_run)
@@ -469,15 +470,13 @@ async def fake_run(config, **_kwargs):
         calls += 1
         results_path = Path(config.resume_path)
         if calls == 1:
+            (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
             (results_path / "metadata.json").write_text(
                 json.dumps(
                     {
                         "avg_reward": 0.75,
                         "avg_metrics": {"accuracy": 0.75},
                         "total_tokens": 123,
-                        "medarc_config_fingerprint": json.loads((results_path / "metadata.json").read_text())[
-                            "medarc_config_fingerprint"
-                        ],
                     }
                 )
             )
@@ -495,7 +494,7 @@ async def fake_run(config, **_kwargs):
     assert metadata["medarc_config_fingerprint"]
 
 
-def test_toml_bench_injects_medarc_fields_into_upstream_metadata_saves(
+def test_toml_bench_does_not_patch_upstream_metadata_saves(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
 ) -> None:
@@ -521,6 +520,7 @@ async def fake_run(config, on_progress=None, **_kwargs):
         if on_progress is not None:
             on_progress([], [], metadata)
         environment_module.save_metadata({}, Path(config.resume_path))
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
         return {"outputs": [], "metadata": metadata}
 
     monkeypatch.setattr(environment_module, "save_metadata", fake_save_metadata)
@@ -528,8 +528,9 @@ async def fake_run(config, on_progress=None, **_kwargs):
 
     assert main.main(["bench", "--config", str(config_path), "--output-dir", str(tmp_path / "evals")]) == 0
 
-    assert saved_metadata
-    assert all(item["medarc_config_fingerprint"] for item in saved_metadata)
+    assert saved_metadata == [{}]
+    metadata = json.loads((tmp_path / "evals" / "gpt-5-mini" / "medqa" / "metadata.json").read_text())
+    assert metadata["medarc_config_fingerprint"]
 
 
 def test_single_run_help_lists_env_section_and_header_option(
diff --git a/tests/test_cli/test_toml_bench_index.py b/tests/test_cli/test_toml_bench_index.py
index f0e74dad..cc5440a6 100644
--- a/tests/test_cli/test_toml_bench_index.py
+++ b/tests/test_cli/test_toml_bench_index.py
@@ -45,6 +45,116 @@ async def fake_run(config, **_kwargs):
     assert bench_index["evals"][0]["plan_digest"].startswith("sha256:")
 
 
+def test_toml_bench_failed_eval_does_not_create_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        num_examples = 1
+        rollouts_per_example = 1
+        """,
+    )
+
+    async def fake_run(config, **_kwargs):  # noqa: ARG001
+        raise RuntimeError("upstream failure")
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+
+    results_path = output_dir / "gpt-5-mini" / "medqa"
+    bench_index = json.loads((output_dir / "bench_index.json").read_text())
+    assert bench_index["evals"] == []
+    assert not (results_path / "metadata.json").exists()
+    assert not (results_path / "results.jsonl").exists()
+
+
+def test_toml_bench_continue_on_error_omits_failed_sidecar_entry(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+
+        [[eval]]
+        env_id = "pubmedqa"
+        """,
+    )
+
+    async def fake_run(config, **_kwargs):
+        results_path = Path(config.resume_path)
+        if results_path.name == "pubmedqa":
+            raise RuntimeError("upstream failure")
+        (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert (
+        main.main(
+            [
+                "bench",
+                "--config",
+                str(config_path),
+                "--output-dir",
+                str(output_dir),
+                "--continue-on-error",
+            ]
+        )
+        == 1
+    )
+
+    bench_index = json.loads((output_dir / "bench_index.json").read_text())
+    assert [entry["env_id"] for entry in bench_index["evals"]] == ["medqa"]
+    assert (output_dir / "gpt-5-mini" / "medqa" / "results.jsonl").exists()
+    assert not (output_dir / "gpt-5-mini" / "pubmedqa" / "results.jsonl").exists()
+
+
+def test_toml_bench_force_failure_removes_archived_sidecar_entry(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+
+    async def successful_run(config, **_kwargs):
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", successful_run)
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+
+    async def failing_run(config, **_kwargs):  # noqa: ARG001
+        raise RuntimeError("upstream failure")
+
+    monkeypatch.setattr(main, "run_evaluation", failing_run)
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--force"]) == 1
+
+    bench_index = json.loads((output_dir / "bench_index.json").read_text())
+    assert bench_index["evals"] == []
+    assert list((output_dir / "gpt-5-mini").glob("medqa__old_*"))
+    assert not (output_dir / "gpt-5-mini" / "medqa" / "results.jsonl").exists()
+
+
 def test_toml_bench_refuses_existing_output_without_bench_index(
     monkeypatch: pytest.MonkeyPatch, tmp_path: Path
 ) -> None:

From 3fba5ad22348ccb9f89c9ef17d387af2c300adb2 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 17:07:35 +0000
Subject: [PATCH 19/53] Document bench sidecar workflow

---
 docs/medarc-eval-bench.md             | 28 +++++++++++++-----
 docs/medarc-eval-process.md           | 11 +++++--
 docs/medarc-eval.md                   |  1 +
 docs/medarc-verifiers-architecture.md | 42 ++++++++++++++++-----------
 4 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index 989f6699..060c8911 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -37,7 +37,8 @@ Repository suite configs live in `configs/eval/`:
 
 Bench configs use upstream `verifiers` TOML semantics: top-level defaults plus
 one or more `[[eval]]` blocks. MedARC adds deterministic output planning around
-the resolved evals; it does not use YAML `models`, `envs`, or `jobs` sections.
+the resolved evals and writes a required `bench_index.json` sidecar; it does not
+use YAML `models`, `envs`, or `jobs` sections.
 
 ```toml
 model = "openai/gpt-4.1-mini"
@@ -92,11 +93,23 @@ runs/evals/openai-gpt-4.1-mini/medqa/env_args.shuffle_seed-9331/
 
 Non-variant evals write to `runs/evals/<model>/<env>/`.
 
+## Output Sidecar
+
+Every bench output root contains `bench_index.json`. It records one entry for
+each successfully materialized eval output, including `results_path`, `model`,
+`env_id`, optional variant identity, resolved args, and a `plan_digest`.
+
+Processing prefers this sidecar over path inference for bench outputs. Failed
+evals are omitted from the sidecar until their `metadata.json` and
+`results.jsonl` exist, so `--continue-on-error` runs leave successful siblings
+processable.
+
 ## Resume and Force
 
-Bench writes each eval to a deterministic result directory and stores a narrow
-MedARC config fingerprint in `metadata.json`. Re-running the same TOML config
-resumes the same directory when the fingerprint matches.
+Bench writes each eval to a deterministic result directory. Re-running the same
+TOML config reuses the same directory only when the matching `bench_index.json`
+entry has the same `plan_digest` and existing `metadata.json` contains matching
+MedARC identity fields.
 
 ```bash
 # Resume matching deterministic outputs
@@ -106,9 +119,10 @@ medarc-eval bench --config configs/eval/medarc-all.toml
 medarc-eval bench --config configs/eval/medarc-all.toml --force
 ```
 
-Fingerprint checks protect semantic benchmark identity such as `env_id`,
-`env_args`, and normalized sampling args. Operational details such as host URL,
-timeout, key variable, and concurrency do not define the benchmark identity.
+The `plan_digest` and MedARC metadata identity payload are based on canonical
+JSON for the planned eval identity, including unknown `sampling_args`. MedARC
+does not maintain a sampling-argument allowlist for resume safety; new provider
+arguments pass through to upstream.
 
 ## Common Flags
 
diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md
index e3a5caa1..cd15bccd 100644
--- a/docs/medarc-eval-process.md
+++ b/docs/medarc-eval-process.md
@@ -18,6 +18,7 @@ medarc-eval process --dry-run
 ## What Processing Does
 
 1. **Discovers** eval outputs in `runs/evals/` and legacy manifest jobs in `runs/raw/`
+   (`bench_index.json` is preferred when present for bench outputs)
 2. **Extracts** results from each eval output directory
 3. **Normalizes** data into a fixed output schema
 4. **Writes** parquet files organized by model and environment
@@ -55,8 +56,11 @@ On-disk model and env path components are slugified, so filenames may not exactl
 
 ### By Completion Status
 
-For current TOML bench outputs, processing discovers valid eval result
-directories under `runs/evals` and reads their `metadata.json`.
+For current TOML bench outputs, processing first looks for
+`runs/evals/bench_index.json`. When present, every sidecar entry must point to
+an existing `metadata.json` and `results.jsonl`, and sidecar `model` / `env_id`
+must match metadata when those fields are present. If no sidecar exists,
+processing falls back to metadata/path inference for ad hoc upstream outputs.
 
 For legacy YAML-runner outputs, `medarc-eval process` reads
 `runs/raw/<run_id>/run_manifest.json` and only selects jobs whose manifest
@@ -82,7 +86,8 @@ medarc-eval process --max-results-missing-pct 100
 ```
 
 For TOML bench outputs, this gate uses `metadata.json` values for expected rows
-and the observed `results.jsonl` row count:
+and the observed `results.jsonl` row count (model, environment, and variant
+identity come from `bench_index.json` when the output root has one):
 
 - `expected_rows = num_examples * rollouts_per_example`
 - `observed_rows = results.jsonl row count`
diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md
index 4f4d199c..0eca9c14 100644
--- a/docs/medarc-eval.md
+++ b/docs/medarc-eval.md
@@ -112,6 +112,7 @@ medarc-eval winrate --list-models
 ```
 runs/
 ├── evals/                        # Raw TOML bench outputs
+│   ├── bench_index.json           # Bench identity and results-path sidecar
 │   └── <model>/
 │       └── <env>/
 │           ├── results.jsonl
diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index 66b38d96..f59d37f7 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -45,10 +45,10 @@ It supports:
   - Implemented in `medarc_verifiers/cli/_single_run.py`.
 - **TOML bench mode**: `medarc-eval bench --config <config.toml>`
   - Loads upstream `verifiers` TOML eval configs, expands ablations, plans
-    deterministic output directories, validates MedARC fingerprints, then runs
-    evals sequentially.
+    deterministic output directories, writes `bench_index.json`, then runs evals
+    sequentially through upstream execution.
   - Main implementation: `medarc_verifiers/cli/main.py`
-  - Eval config adapter: `medarc_verifiers/cli/verifiers_adapter.py`
+  - Upstream eval boundary: `medarc_verifiers/cli/upstream_eval.py`
   - Deterministic identity/path helpers: `medarc_verifiers/cli/eval_identity.py`
 - **Processing**: `medarc-eval process ...`
   - Pipeline wiring: `medarc_verifiers/cli/process/pipeline.py`
@@ -78,8 +78,8 @@ Override parsing lives in `medarc_verifiers/cli/utils/overrides.py`.
 
 Bench configs use upstream `verifiers` TOML shape: top-level defaults plus one
 or more `[[eval]]` entries. Upstream `[[ablation]]` tables expand into repeated
-eval configs. MedARC adds deterministic paths and config-safe resume around the
-resolved upstream eval configs.
+eval configs. MedARC adds deterministic paths and a required `bench_index.json`
+sidecar around the resolved upstream eval configs.
 
 `env_args` precedence is low to high:
 
@@ -94,7 +94,9 @@ then are sanitized for OpenAI-compatible clients:
 
 - Unknown parameters move under `extra_body` for compatible servers such as vLLM.
 - Sanitizer: `medarc_verifiers/utils/sampling_args.py`
-- Merge/adaptation point: `medarc_verifiers/cli/verifiers_adapter.py`
+- Import boundary: `medarc_verifiers/cli/upstream_eval.py`
+- Temporary merge/adaptation layer behind that boundary:
+  `medarc_verifiers/cli/verifiers_adapter.py`
 
 The old YAML `models`, `envs`, `jobs`, matrix expansion, job builder, and
 manifest planner modules have been deleted.
@@ -125,11 +127,16 @@ TOML bench writes eval outputs under deterministic directories:
 - Non-variant evals: `runs/evals/<model>/<env>/`
 - Variant evals: `runs/evals/<model>/<env>/<variant_id>/`
 
-Before resuming an existing deterministic directory, bench validates the
-MedARC-specific config fingerprint in `metadata.json`. The fingerprint covers
-semantic benchmark identity such as `env_id`, `env_args`, and normalized
-sampling args. It excludes operational fields such as endpoint URL, timeout,
-API key variable, and concurrency.
+Before reusing an existing deterministic directory, bench validates the matching
+`bench_index.json` entry and its `plan_digest`, and also checks MedARC identity
+fields in existing `metadata.json`. Both are based on the canonical planned eval
+payload, including unknown `sampling_args`, so new provider arguments pass
+through instead of hitting a MedARC allowlist.
+
+`medarc-eval bench` does not monkey-patch upstream metadata saving. For new output
+directories, MedARC identity fields are merged into `metadata.json` only after successful
+upstream execution (resumed directories re-merge them into their existing `metadata.json`
+first), and process discovery uses `bench_index.json` as the durable bench identity contract.
 
 `medarc_verifiers/cli/_manifest.py` now only contains the legacy manifest schema
 needed by processing to read historical `runs/raw` outputs.
@@ -138,6 +145,7 @@ needed by processing to read historical `runs/raw` outputs.
 
 TOML bench outputs include:
 
+- `bench_index.json`: bench identity sidecar at the output root
 - `results.jsonl`: per-example rollouts
 - `metadata.json`: eval configuration and metrics snapshot
 
@@ -152,10 +160,10 @@ Entry point: `medarc_verifiers/cli/process/pipeline.py`.
 
 Processing:
 
-1. Discovers TOML bench outputs from `runs/evals` and legacy manifest outputs
-   from `runs/raw`.
-2. Normalizes metadata from `metadata.json` and, for legacy outputs, manifest
-   fields.
+1. Discovers TOML bench outputs from `runs/evals` using `bench_index.json` when
+   present, and legacy manifest outputs from `runs/raw`.
+2. Normalizes identity from the bench sidecar plus `metadata.json`; legacy
+   outputs still use manifest fields.
 3. Loads rows from `results.jsonl`, drops large prompt/completion fields, and
    flattens `token_usage`.
 4. Aggregates rows per model and environment, preserving variant ids.
@@ -223,9 +231,9 @@ It:
 
 - CLI flags or routing:
   - `medarc_verifiers/cli/main.py`, `medarc_verifiers/cli/_single_run.py`
-- TOML bench behavior, deterministic paths, or resume fingerprints:
+- TOML bench behavior, deterministic paths, or bench sidecar identity:
   - `medarc_verifiers/cli/main.py`, `medarc_verifiers/cli/eval_identity.py`,
-    `medarc_verifiers/cli/verifiers_adapter.py`
+    `medarc_verifiers/cli/upstream_eval.py`, `medarc_verifiers/cli/verifiers_adapter.py`
 - Processed dataset schema:
   - `medarc_verifiers/cli/process/rows.py`, `medarc_verifiers/cli/process/writer.py`
 - Winrate math/output:

From fc5e666d5adc1f02d95e096a2eea8911694fa082 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 18:03:34 +0000
Subject: [PATCH 20/53] Rename eval suite configs

---
 .gitignore                                    |   7 +-
 README.md                                     |   4 +-
 configs/eval/README.md                        |   2 +-
 configs/eval/medarc-all.toml                  | 315 ------------------
 ...rc-judge.toml => medmarks-open_ended.toml} |   0
 ...medarc-mcq.toml => medmarks-verified.toml} |   0
 docs/medarc-eval-bench.md                     |  19 +-
 docs/medarc-eval-process.md                   |   2 +-
 docs/medarc-eval.md                           |   8 +-
 tests/test_cli/test_main.py                   |  32 +-
 10 files changed, 28 insertions(+), 361 deletions(-)
 delete mode 100644 configs/eval/medarc-all.toml
 rename configs/eval/{medarc-judge.toml => medmarks-open_ended.toml} (100%)
 rename configs/eval/{medarc-mcq.toml => medmarks-verified.toml} (100%)

diff --git a/.gitignore b/.gitignore
index 65d1fa7e..1ead0904 100644
--- a/.gitignore
+++ b/.gitignore
@@ -214,4 +214,9 @@ environments/healthbench/test*
 .vscode/
 pyrightconfig.json
 
-
+.claude
+.codex
+.devcontainer
+plans/
+verifiers/
+.gitmodules
\ No newline at end of file
diff --git a/README.md b/README.md
index 2d12e925..511c7f95 100644
--- a/README.md
+++ b/README.md
@@ -163,10 +163,10 @@ env_args = { shuffle_answers = true, shuffle_seed = 1618 }
 
 ```bash
 # Run the batch
-uv run medarc-eval bench --config configs/eval/medarc-mcq.toml
+uv run medarc-eval bench --config configs/eval/medmarks-verified.toml
 
 # Preview without executing
-uv run medarc-eval bench --config configs/eval/medarc-mcq.toml --dry-run
+uv run medarc-eval bench --config configs/eval/medmarks-verified.toml --dry-run
 ```
 
 Bench mode resumes matching deterministic result directories and supports
diff --git a/configs/eval/README.md b/configs/eval/README.md
index 77159bbc..b8ac3e4e 100644
--- a/configs/eval/README.md
+++ b/configs/eval/README.md
@@ -7,7 +7,7 @@ and `[[ablation]]` sweeps intentionally keep the upstream environment id stable;
 
 ```bash
 medarc-eval bench --config configs/eval/smoke.toml --dry-run
-medarc-eval bench --config configs/eval/medarc-all.toml
+medarc-eval bench --config configs/eval/medmarks-verified.toml
 medarc-eval process --runs-dir runs/evals --output-dir runs/processed
 ```
 
diff --git a/configs/eval/medarc-all.toml b/configs/eval/medarc-all.toml
deleted file mode 100644
index c8cc4318..00000000
--- a/configs/eval/medarc-all.toml
+++ /dev/null
@@ -1,315 +0,0 @@
-# Aggregate MedARC benchmark suite. This keeps suite composition in upstream
-# TOML and leaves variant path generation to the MedARC sequential bench wrapper.
-
-model = "openai/gpt-4.1-mini"
-save_results = true
-output_dir = "runs/evals"
-
-[[eval]]
-env_id = "medqa"
-num_examples = -1
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "med_mcqa"
-num_examples = -1
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "pubmedqa"
-num_examples = -1
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "mmlu_pro_health"
-num_examples = -1
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "m_arc"
-num_examples = -1
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "head_qa_v2"
-num_examples = -1
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "sctpublic"
-num_examples = -1
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "healthbench"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = "openai/gpt-5-mini", judge_base_url = "https://api.pinference.ai/api/v1", difficulty = "all" }
-
-[[eval]]
-env_id = "med_dialog"
-num_examples = 2500
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "medcasereasoning"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = "openai/gpt-5-nano", judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "medec"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "medexqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { use_judge = true, judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "medicationqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "mtsamples_procedures"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "mtsamples_replicate"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "agentclinic"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "pubhealthbench"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "reviewed" }
-
-[[eval]]
-env_id = "pubhealthbench"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "freeform", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "careqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "en" }
-
-[[eval]]
-env_id = "careqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "open", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "medcalc_bench"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { version = "1.2" }
-
-[[eval]]
-env_id = "medcalc_bench"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { version = "verified", add_python_tool = true, add_calculator_tool = true }
-
-[[eval]]
-env_id = "medagentbench"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { fhir_api_base = "http://localhost:8080/fhir/" }
-
-[[eval]]
-env_id = "medagentbenchv2"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { fhir_api_base = "http://localhost:8080/fhir/" }
-
-[[eval]]
-env_id = "meqsum"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "test", compute_auto_metrics = true }
-
-[[eval]]
-env_id = "meqsum"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "validation", compute_auto_metrics = true }
-
-[[eval]]
-env_id = "meqsum"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "test", compute_auto_metrics = false }
-
-[[ablation]]
-env_id = "medqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "med_mcqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "pubmedqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "mmlu_pro_health"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "m_arc"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "careqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "en", shuffle_answers = true }
-
-[ablation.sweep.env_args]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "medbullets"
-num_examples = -1
-rollouts_per_example = 1
-
-[ablation.sweep.env_args]
-num_options = [4, 5]
-
-[[ablation]]
-env_id = "medbullets"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-num_options = [4, 5]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "medxpertqa"
-num_examples = -1
-rollouts_per_example = 1
-
-[ablation.sweep.env_args]
-question_type = ["reasoning", "understanding"]
-
-[[ablation]]
-env_id = "medxpertqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-question_type = ["reasoning", "understanding"]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "supergpqa_medicine"
-num_examples = -1
-rollouts_per_example = 1
-
-[ablation.sweep.env_args]
-difficulty = ["easy", "hard"]
-
-[[ablation]]
-env_id = "supergpqa_medicine"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-difficulty = ["easy", "hard"]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "medconceptsqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { vocab = "icd10cm_sample" }
-
-[ablation.sweep.env_args]
-difficulty = ["easy", "medium", "hard"]
-
-[[ablation]]
-env_id = "medconceptsqa"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { vocab = "icd10cm_sample", shuffle_answers = true }
-
-[ablation.sweep.env_args]
-difficulty = ["easy", "medium", "hard"]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "pubhealthbench"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { split = "reviewed", shuffle_answers = true }
-
-[ablation.sweep.env_args]
-shuffle_seed = [1618, 9331]
-
-[[ablation]]
-env_id = "longhealth"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { doc_shuffle_seed = 2718 }
-
-[ablation.sweep.env_args]
-task = ["task1", "task2"]
-
-[[ablation]]
-env_id = "medrbench"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_agent_model = "openai/gpt-5-mini", patient_agent_base_url = "https://api.pinference.ai/api/v1" }
-
-[ablation.sweep.env_args]
-task = ["oracle", "1turn", "free_turn"]
diff --git a/configs/eval/medarc-judge.toml b/configs/eval/medmarks-open_ended.toml
similarity index 100%
rename from configs/eval/medarc-judge.toml
rename to configs/eval/medmarks-open_ended.toml
diff --git a/configs/eval/medarc-mcq.toml b/configs/eval/medmarks-verified.toml
similarity index 100%
rename from configs/eval/medarc-mcq.toml
rename to configs/eval/medmarks-verified.toml
diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index 060c8911..6c3eb35d 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -13,12 +13,12 @@ accepts `.toml` files only.
 # Preview the repository smoke config
 medarc-eval bench --config configs/eval/smoke.toml --dry-run
 
-# Run the MCQ production suite
-medarc-eval bench --config configs/eval/medarc-mcq.toml
+# Run the verified production suite
+medarc-eval bench --config configs/eval/medmarks-verified.toml
 
-# Run the aggregate suite against a local OpenAI-compatible server
+# Run the verified suite against a local OpenAI-compatible server
 medarc-eval bench \
-  --config configs/eval/medarc-all.toml \
+  --config configs/eval/medmarks-verified.toml \
   --api-base-url http://127.0.0.1:8000/v1 \
   --provider local \
   --model openai/my-local-model
@@ -29,9 +29,8 @@ Repository suite configs live in `configs/eval/`:
 | Config | Purpose |
 |--------|---------|
 | `smoke.toml` | Small smoke test used by CLI tests |
-| `medarc-mcq.toml` | Multiple-choice benchmark suite |
-| `medarc-judge.toml` | Judge/free-form benchmark suite |
-| `medarc-all.toml` | Aggregate production suite |
+| `medmarks-verified.toml` | Verified benchmark suite |
+| `medmarks-open_ended.toml` | Open-ended benchmark suite |
 
 ## Config Format
 
@@ -113,10 +112,10 @@ MedARC identity fields.
 
 ```bash
 # Resume matching deterministic outputs
-medarc-eval bench --config configs/eval/medarc-all.toml
+medarc-eval bench --config configs/eval/medmarks-verified.toml
 
 # Archive existing deterministic outputs and rerun
-medarc-eval bench --config configs/eval/medarc-all.toml --force
+medarc-eval bench --config configs/eval/medmarks-verified.toml --force
 ```
 
 The `plan_digest` and MedARC metadata identity payload are based on canonical
@@ -163,7 +162,7 @@ export PRIME_API_KEY=...
 export PRIME_TEAM_ID=...
 
 medarc-eval bench \
-  --config configs/eval/medarc-mcq.toml \
+  --config configs/eval/medmarks-verified.toml \
   --api-base-url https://api.pinference.ai/api/v1
 ```
 
diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md
index cd15bccd..c99cc3f5 100644
--- a/docs/medarc-eval-process.md
+++ b/docs/medarc-eval-process.md
@@ -242,7 +242,7 @@ This runs `medarc-eval winrate` automatically after processing completes when th
 
 ```bash
 # 1. Run benchmarks
-medarc-eval bench --config configs/eval/medarc-mcq.toml
+medarc-eval bench --config configs/eval/medmarks-verified.toml
 
 # 2. Process results
 medarc-eval process --runs-dir runs/evals
diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md
index 0eca9c14..bfbe33fc 100644
--- a/docs/medarc-eval.md
+++ b/docs/medarc-eval.md
@@ -46,7 +46,7 @@ medarc-eval winrate
 medarc-eval medqa -m gpt-4.1-mini -n 50
 
 # Subcommands: keyword comes first
-medarc-eval bench --config configs/eval/medarc-mcq.toml
+medarc-eval bench --config configs/eval/medmarks-verified.toml
 medarc-eval process --runs-dir runs/evals
 medarc-eval winrate --processed-dir runs/processed
 ```
@@ -74,13 +74,13 @@ medarc-eval longhealth --help
 
 ```bash
 # Run all jobs defined in config
-medarc-eval bench --config configs/eval/medarc-mcq.toml
+medarc-eval bench --config configs/eval/medmarks-verified.toml
 
 # Preview what would run without executing
-medarc-eval bench --config configs/eval/medarc-mcq.toml --dry-run
+medarc-eval bench --config configs/eval/medmarks-verified.toml --dry-run
 
 # Force all jobs to use a specific API endpoint
-medarc-eval bench --config configs/eval/medarc-mcq.toml --api-base-url http://127.0.0.1:8000/v1 --provider local
+medarc-eval bench --config configs/eval/medmarks-verified.toml --api-base-url http://127.0.0.1:8000/v1 --provider local
 ```
 
 ### Processing Mode (`medarc-eval process`)
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index cf863b8f..37f5a620 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -4,7 +4,7 @@
 import logging
 import sys
 from pathlib import Path
-from typing import Any, Mapping
+from typing import Any
 from types import SimpleNamespace
 
 import pytest
@@ -212,8 +212,8 @@ def test_bench_rejects_removed_yaml_runner_flags(capsys: pytest.CaptureFixture[s
     assert "unrecognized arguments: --restart" in err
 
 
-def test_repository_mcq_toml_config_dry_run_shows_ablation_variants(capsys: pytest.CaptureFixture[str]) -> None:
-    exit_code = main.main(["bench", "--config", "configs/eval/medarc-mcq.toml", "--dry-run", "--eval-index", "9"])
+def test_repository_verified_toml_config_dry_run_shows_ablation_variants(capsys: pytest.CaptureFixture[str]) -> None:
+    exit_code = main.main(["bench", "--config", "configs/eval/medmarks-verified.toml", "--dry-run", "--eval-index", "9"])
 
     output = capsys.readouterr().out
     assert exit_code == 0
@@ -222,8 +222,8 @@ def test_repository_mcq_toml_config_dry_run_shows_ablation_variants(capsys: pyte
     assert "runs/evals/openai-gpt-4.1-mini/medqa/env_args.shuffle_answers-true__env_args.shuffle_seed-1618" in output
 
 
-def test_repository_judge_toml_config_loads_expected_judge_args() -> None:
-    configs = main.load_toml_eval_configs("configs/eval/medarc-judge.toml")
+def test_repository_open_ended_toml_config_loads_expected_judge_args() -> None:
+    configs = main.load_toml_eval_configs("configs/eval/medmarks-open_ended.toml")
     healthbench = next(config for config in configs if config["env_id"] == "healthbench")
     medrbench = [config for config in configs if config["env_id"] == "medrbench"]
 
@@ -232,28 +232,6 @@ def test_repository_judge_toml_config_loads_expected_judge_args() -> None:
     assert {config["env_args"]["task"] for config in medrbench} == {"oracle", "1turn", "free_turn"}
 
 
-def test_repository_all_toml_contains_production_suite_entries() -> None:
-    def signature(config: Mapping[str, Any]) -> str:
-        return json.dumps(
-            {
-                "env_id": config["env_id"],
-                "env_args": config.get("env_args", {}),
-                "num_examples": config.get("num_examples"),
-                "rollouts_per_example": config.get("rollouts_per_example"),
-            },
-            sort_keys=True,
-        )
-
-    all_configs = {signature(config) for config in main.load_toml_eval_configs("configs/eval/medarc-all.toml")}
-    production_configs = {
-        signature(config)
-        for path in ("configs/eval/medarc-mcq.toml", "configs/eval/medarc-judge.toml")
-        for config in main.load_toml_eval_configs(path)
-    }
-
-    assert production_configs <= all_configs
-
-
 def test_toml_bench_dry_run_model_override(
     tmp_path: Path,
     capsys: pytest.CaptureFixture[str],

From fd8d6d1f8eb7b70c687838f58afcd420a53c6f1e Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 19:29:09 +0000
Subject: [PATCH 21/53] Simplify eval output identity

---
 configs/eval/medmarks-open_ended.toml     |   1 +
 configs/eval/medmarks-verified.toml       |  15 +
 docs/medarc-eval-bench.md                 |  50 +--
 docs/medarc-eval-process.md               |  19 +-
 docs/medarc-eval.md                       |   7 +-
 docs/medarc-verifiers-architecture.md     |  38 +--
 medarc_verifiers/cli/bench_index.py       | 235 --------------
 medarc_verifiers/cli/eval_identity.py     | 362 +++++++++-------------
 medarc_verifiers/cli/main.py              | 312 ++++++-------------
 medarc_verifiers/cli/process/discovery.py | 176 ++++-------
 medarc_verifiers/cli/process/metadata.py  |  11 +-
 medarc_verifiers/cli/verifiers_adapter.py |   5 +-
 tests/test_cli/test_bench_index.py        |  88 ------
 tests/test_cli/test_eval_identity.py      | 322 ++++---------------
 tests/test_cli/test_main.py               | 121 ++++++--
 tests/test_cli/test_process_discovery.py  | 100 +++---
 tests/test_cli/test_toml_bench_index.py   | 320 -------------------
 17 files changed, 599 insertions(+), 1583 deletions(-)
 delete mode 100644 medarc_verifiers/cli/bench_index.py
 delete mode 100644 tests/test_cli/test_bench_index.py
 delete mode 100644 tests/test_cli/test_toml_bench_index.py

diff --git a/configs/eval/medmarks-open_ended.toml b/configs/eval/medmarks-open_ended.toml
index b295421b..01563fe3 100644
--- a/configs/eval/medmarks-open_ended.toml
+++ b/configs/eval/medmarks-open_ended.toml
@@ -72,6 +72,7 @@ env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"
 
 [[ablation]]
 env_id = "medrbench"
+name = "{env_args.task}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_agent_model = "openai/gpt-5-mini", patient_agent_base_url = "https://api.pinference.ai/api/v1" }
diff --git a/configs/eval/medmarks-verified.toml b/configs/eval/medmarks-verified.toml
index 29cadb7d..73404bdc 100644
--- a/configs/eval/medmarks-verified.toml
+++ b/configs/eval/medmarks-verified.toml
@@ -13,6 +13,7 @@ rollouts_per_example = 1
 
 [[ablation]]
 env_id = "medqa"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -27,6 +28,7 @@ rollouts_per_example = 1
 
 [[ablation]]
 env_id = "med_mcqa"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -41,6 +43,7 @@ rollouts_per_example = 1
 
 [[ablation]]
 env_id = "pubmedqa"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -55,6 +58,7 @@ rollouts_per_example = 1
 
 [[ablation]]
 env_id = "mmlu_pro_health"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -69,6 +73,7 @@ rollouts_per_example = 1
 
 [[ablation]]
 env_id = "m_arc"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -84,6 +89,7 @@ env_args = { split = "en" }
 
 [[ablation]]
 env_id = "careqa"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { split = "en", shuffle_answers = true }
@@ -93,6 +99,7 @@ shuffle_seed = [1618, 9331]
 
 [[ablation]]
 env_id = "medbullets"
+name = "num_options-{env_args.num_options}"
 num_examples = -1
 rollouts_per_example = 1
 
@@ -101,6 +108,7 @@ num_options = [4, 5]
 
 [[ablation]]
 env_id = "medbullets"
+name = "num_options-{env_args.num_options}__shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -111,6 +119,7 @@ shuffle_seed = [1618, 9331]
 
 [[ablation]]
 env_id = "medxpertqa"
+name = "{env_args.question_type}"
 num_examples = -1
 rollouts_per_example = 1
 
@@ -119,6 +128,7 @@ question_type = ["reasoning", "understanding"]
 
 [[ablation]]
 env_id = "medxpertqa"
+name = "{env_args.question_type}__shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -129,6 +139,7 @@ shuffle_seed = [1618, 9331]
 
 [[ablation]]
 env_id = "supergpqa_medicine"
+name = "{env_args.difficulty}"
 num_examples = -1
 rollouts_per_example = 1
 
@@ -137,6 +148,7 @@ difficulty = ["easy", "hard"]
 
 [[ablation]]
 env_id = "supergpqa_medicine"
+name = "{env_args.difficulty}__shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -147,6 +159,7 @@ shuffle_seed = [1618, 9331]
 
 [[ablation]]
 env_id = "medconceptsqa"
+name = "{env_args.difficulty}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { vocab = "icd10cm_sample" }
@@ -156,6 +169,7 @@ difficulty = ["easy", "medium", "hard"]
 
 [[ablation]]
 env_id = "medconceptsqa"
+name = "{env_args.difficulty}__shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { vocab = "icd10cm_sample", shuffle_answers = true }
@@ -177,6 +191,7 @@ env_args = { split = "reviewed" }
 
 [[ablation]]
 env_id = "pubhealthbench"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { split = "reviewed", shuffle_answers = true }
diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index 6c3eb35d..7a79c529 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -36,8 +36,7 @@ Repository suite configs live in `configs/eval/`:
 
 Bench configs use upstream `verifiers` TOML semantics: top-level defaults plus
 one or more `[[eval]]` blocks. MedARC adds deterministic output planning around
-the resolved evals and writes a required `bench_index.json` sidecar; it does not
-use YAML `models`, `envs`, or `jobs` sections.
+the resolved evals; it does not use YAML `models`, `envs`, or `jobs` sections.
 
 ```toml
 model = "openai/gpt-4.1-mini"
@@ -75,6 +74,7 @@ output_dir = "runs/evals"
 
 [[ablation]]
 env_id = "medqa"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
@@ -86,42 +86,45 @@ shuffle_seed = [1618, 9331]
 Example output paths:
 
 ```text
-runs/evals/openai-gpt-4.1-mini/medqa/env_args.shuffle_seed-1618/
-runs/evals/openai-gpt-4.1-mini/medqa/env_args.shuffle_seed-9331/
+runs/evals/openai-gpt-4.1-mini/medqa/shuffle_seed-1618/
+runs/evals/openai-gpt-4.1-mini/medqa/shuffle_seed-9331/
 ```
 
-Non-variant evals write to `runs/evals/<model>/<env>/`.
+Non-variant evals use the reserved variant id `base` and write to
+`runs/evals/<model>/<env>/base/`. Duplicate `(model, env)` evals must provide
+an explicit `variant_id` or `name`. `name` may use simple templates such as
+`shuffle_seed-{env_args.shuffle_seed}` after ablation expansion.
 
-## Output Sidecar
+## MedARC Metadata
 
-Every bench output root contains `bench_index.json`. It records one entry for
-each successfully materialized eval output, including `results_path`, `model`,
-`env_id`, optional variant identity, resolved args, and a `plan_digest`.
+Upstream `metadata.json` remains a normal `verifiers` file. MedARC-specific
+identity lives in a small model-level helper:
 
-Processing prefers this sidecar over path inference for bench outputs. Failed
-evals are omitted from the sidecar until their `metadata.json` and
-`results.jsonl` exist, so `--continue-on-error` runs leave successful siblings
-processable.
+```text
+runs/evals/<model>/.medarc_eval_metadata.json
+```
+
+The helper maps model-relative results paths such as `medqa/base` to `env_id`
+and `variant_id`. Processing scans output directories first, so stale helper
+entries do not hide or create process records.
 
 ## Resume and Force
 
-Bench writes each eval to a deterministic result directory. Re-running the same
-TOML config reuses the same directory only when the matching `bench_index.json`
-entry has the same `plan_digest` and existing `metadata.json` contains matching
-MedARC identity fields.
+Bench writes each eval to a deterministic result directory. Existing output
+reuse is explicit:
 
 ```bash
-# Resume matching deterministic outputs
-medarc-eval bench --config configs/eval/medmarks-verified.toml
+# Resume an existing deterministic output using upstream resume behavior
+medarc-eval bench --config configs/eval/medmarks-verified.toml --resume
 
 # Archive existing deterministic outputs and rerun
 medarc-eval bench --config configs/eval/medmarks-verified.toml --force
 ```
 
-The `plan_digest` and MedARC metadata identity payload are based on canonical
-JSON for the planned eval identity, including unknown `sampling_args`. MedARC
-does not maintain a sampling-argument allowlist for resume safety; new provider
-arguments pass through to upstream.
+Without `--resume` or `--force`, bench fails if the deterministic output exists.
+`--resume` delegates compatibility checks to upstream `verifiers`; MedARC does
+not maintain a sampling-argument allowlist or fingerprint blocker for resume
+safety. New provider arguments pass through to upstream.
 
 ## Common Flags
 
@@ -130,6 +133,7 @@ arguments pass through to upstream.
 | `--config PATH` | Required path to an upstream TOML eval config |
 | `--dry-run` | Resolve evals and print the deterministic plan |
 | `--force` | Archive existing deterministic output and rerun |
+| `--resume` | Resume an existing deterministic output via upstream `verifiers` |
 | `--output-dir PATH` | Override the config output directory |
 | `--env-dir PATH` | Directory containing local environments |
 | `--endpoints-path PATH` | Endpoint registry path, default `configs/endpoints.toml` |
diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md
index c99cc3f5..508e1d7a 100644
--- a/docs/medarc-eval-process.md
+++ b/docs/medarc-eval-process.md
@@ -17,8 +17,8 @@ medarc-eval process --dry-run
 
 ## What Processing Does
 
-1. **Discovers** eval outputs in `runs/evals/` and legacy manifest jobs in `runs/raw/`
-   (`bench_index.json` is preferred when present for bench outputs)
+1. **Discovers** eval outputs in `runs/evals/` by scanning output directories,
+   and legacy manifest jobs in `runs/raw/`
 2. **Extracts** results from each eval output directory
 3. **Normalizes** data into a fixed output schema
 4. **Writes** parquet files organized by model and environment
@@ -56,11 +56,11 @@ On-disk model and env path components are slugified, so filenames may not exactl
 
 ### By Completion Status
 
-For current TOML bench outputs, processing first looks for
-`runs/evals/bench_index.json`. When present, every sidecar entry must point to
-an existing `metadata.json` and `results.jsonl`, and sidecar `model` / `env_id`
-must match metadata when those fields are present. If no sidecar exists,
-processing falls back to metadata/path inference for ad hoc upstream outputs.
+For current TOML bench outputs, processing scans for directories containing
+`metadata.json` and `results.jsonl`. When a model-level
+`.medarc_eval_metadata.json` helper exists, processing uses it only to enrich
+matching scanned paths with MedARC `env_id` / `variant_id`. Stale helper entries
+are ignored. Ad hoc upstream outputs fall back to metadata/path inference.
 
 For legacy YAML-runner outputs, `medarc-eval process` reads
 `runs/raw/<run_id>/run_manifest.json` and only selects jobs whose manifest
@@ -86,8 +86,9 @@ medarc-eval process --max-results-missing-pct 100
 ```
 
 For TOML bench outputs, this gate uses `metadata.json` values for expected rows
-and the observed `results.jsonl` row count. Model, environment, and variant
-identity come from `bench_index.json` when the output root has one:
+and the observed `results.jsonl` row count. Model and environment identity come
+from upstream metadata and path inference, with variant identity enriched by the
+model-level helper when present:
 
 - `expected_rows = num_examples * rollouts_per_example`
 - `observed_rows = results.jsonl row count`
diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md
index bfbe33fc..a1833324 100644
--- a/docs/medarc-eval.md
+++ b/docs/medarc-eval.md
@@ -112,11 +112,12 @@ medarc-eval winrate --list-models
 ```
 runs/
 ├── evals/                        # Raw TOML bench outputs
-│   ├── bench_index.json           # Bench identity and results-path sidecar
 │   └── <model>/
+│       ├── .medarc_eval_metadata.json
 │       └── <env>/
-│           ├── results.jsonl
-│           └── metadata.json
+│           └── <variant>/
+│               ├── results.jsonl
+│               └── metadata.json
 ├── processed/                    # Analysis-ready parquet files (from process)
 │   ├── env_index.json            # Dataset inventory
 │   └── <model>/<env>.parquet
diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index f59d37f7..d610b0c8 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -45,8 +45,8 @@ It supports:
   - Implemented in `medarc_verifiers/cli/_single_run.py`.
 - **TOML bench mode**: `medarc-eval bench --config <config.toml>`
   - Loads upstream `verifiers` TOML eval configs, expands ablations, plans
-    deterministic output directories, writes `bench_index.json`, then runs evals
-    sequentially through upstream execution.
+    deterministic output directories, then runs evals sequentially through
+    upstream execution.
   - Main implementation: `medarc_verifiers/cli/main.py`
   - Upstream eval boundary: `medarc_verifiers/cli/upstream_eval.py`
   - Deterministic identity/path helpers: `medarc_verifiers/cli/eval_identity.py`
@@ -78,8 +78,9 @@ Override parsing lives in `medarc_verifiers/cli/utils/overrides.py`.
 
 Bench configs use upstream `verifiers` TOML shape: top-level defaults plus one
 or more `[[eval]]` entries. Upstream `[[ablation]]` tables expand into repeated
-eval configs. MedARC adds deterministic paths and a required `bench_index.json`
-sidecar around the resolved upstream eval configs.
+eval configs. MedARC adds deterministic paths around the resolved upstream eval
+configs. Evals sharing a `(model, env)` pair must be distinguished by an explicit
+`variant_id` or `name`; an eval that sets neither uses the reserved id `base`.
 
 `env_args` precedence is low to high:
 
@@ -124,19 +125,18 @@ Relevant env vars:
 
 TOML bench writes eval outputs under deterministic directories:
 
-- Non-variant evals: `runs/evals/<model>/<env>/`
+- Non-variant evals: `runs/evals/<model>/<env>/base/`
 - Variant evals: `runs/evals/<model>/<env>/<variant_id>/`
 
-Before reusing an existing deterministic directory, bench validates the matching
-`bench_index.json` entry and its `plan_digest`, and also checks MedARC identity
-fields in existing `metadata.json`. Both are based on the canonical planned eval
-payload, including unknown `sampling_args`, so new provider arguments pass
-through instead of hitting a MedARC allowlist.
+Existing output reuse is explicit. Without `--resume` or `--force`, bench fails
+when the target directory already exists. `--resume` passes the deterministic
+target as upstream `EvalConfig.resume_path` and trusts upstream resume
+validation. `--force` archives the existing target and reruns.
 
-`medarc-eval bench` does not monkey-patch upstream metadata saving. MedARC
-identity fields are merged into `metadata.json` only after successful upstream
-execution, and process discovery uses `bench_index.json` as the durable bench
-identity contract.
+`medarc-eval bench` does not monkey-patch upstream metadata saving and does not
+write MedARC identity into upstream `metadata.json`. MedARC-specific identity
+lives in `runs/evals/<model>/.medarc_eval_metadata.json`, keyed by
+model-relative results paths such as `medqa/base`.
 
 `medarc_verifiers/cli/_manifest.py` now only contains the legacy manifest schema
 needed by processing to read historical `runs/raw` outputs.
@@ -145,9 +145,9 @@ needed by processing to read historical `runs/raw` outputs.
 
 TOML bench outputs include:
 
-- `bench_index.json`: bench identity sidecar at the output root
 - `results.jsonl`: per-example rollouts
 - `metadata.json`: eval configuration and metrics snapshot
+- `.medarc_eval_metadata.json`: minimal MedARC identity helper in each model directory
 
 The runner executes via `verifiers.utils.eval_utils.run_evaluation()` from
 single-run mode and the TOML bench code in `medarc_verifiers/cli/main.py`.
@@ -160,10 +160,10 @@ Entry point: `medarc_verifiers/cli/process/pipeline.py`.
 
 Processing:
 
-1. Discovers TOML bench outputs from `runs/evals` using `bench_index.json` when
-   present, and legacy manifest outputs from `runs/raw`.
-2. Normalizes identity from the bench sidecar plus `metadata.json`; legacy
-   outputs still use manifest fields.
+1. Discovers TOML bench outputs from `runs/evals` by scanning directories, and
+   legacy manifest outputs from `runs/raw`.
+2. Normalizes identity from upstream `metadata.json`, paths, and optional
+   model-level helper metadata; legacy outputs still use manifest fields.
 3. Loads rows from `results.jsonl`, drops large prompt/completion fields, and
    flattens `token_usage`.
 4. Aggregates rows per model and environment, preserving variant ids.
diff --git a/medarc_verifiers/cli/bench_index.py b/medarc_verifiers/cli/bench_index.py
deleted file mode 100644
index 03cfa92f..00000000
--- a/medarc_verifiers/cli/bench_index.py
+++ /dev/null
@@ -1,235 +0,0 @@
-"""Bench sidecar planning and validation."""
-
-from __future__ import annotations
-
-import hashlib
-import json
-from collections import Counter
-from collections.abc import Mapping, Sequence
-from datetime import UTC, datetime
-from pathlib import Path
-from typing import Any
-
-from verifiers.utils.save_utils import make_serializable
-
-from medarc_verifiers.cli.eval_identity import EvalPathPlan
-
-BENCH_INDEX_FILENAME = "bench_index.json"
-BENCH_INDEX_VERSION = 1
-
-
-class BenchIndexError(ValueError):
-    """Raised when a bench sidecar is missing, stale, or internally inconsistent."""
-
-
-def build_bench_index(
-    *,
-    output_root: Path,
-    source_config: Path,
-    eval_configs: Sequence[Any],
-    path_plans: Sequence[EvalPathPlan],
-    plan_payloads: Sequence[Mapping[str, Any]],
-) -> dict[str, Any]:
-    entries = [
-        build_bench_index_entry(
-            index=index,
-            output_root=output_root,
-            config=config,
-            path_plan=path_plan,
-            plan_payload=plan_payload,
-        )
-        for index, (config, path_plan, plan_payload) in enumerate(
-            zip(eval_configs, path_plans, plan_payloads), start=1
-        )
-    ]
-    payload = {
-        "version": BENCH_INDEX_VERSION,
-        "created_at": datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z"),
-        "source_config": str(source_config),
-        "evals": entries,
-    }
-    validate_bench_index(payload, output_root=output_root, require_artifacts=False)
-    return payload
-
-
-def build_bench_index_entry(
-    *,
-    index: int,
-    output_root: Path,
-    config: Any,
-    path_plan: EvalPathPlan,
-    plan_payload: Mapping[str, Any],
-) -> dict[str, Any]:
-    identity = path_plan.identity
-    entry = {
-        "index": index,
-        "results_path": str(path_plan.results_path),
-        "env_id": identity.env_id,
-        "model": identity.model_id,
-        "variant_id": identity.variant_id,
-        "variant_payload": identity.variant_payload,
-        "env_args": dict(config.env_args or {}),
-        "sampling_args": dict(config.sampling_args or {}),
-        "num_examples": config.num_examples,
-        "rollouts_per_example": config.rollouts_per_example,
-    }
-    digest_payload = {key: value for key, value in entry.items() if key != "index"}
-    entry["plan_digest"] = plan_digest({**digest_payload, "output_root": str(output_root), "plan": dict(plan_payload)})
-    return entry
-
-
-def read_bench_index(path: Path) -> dict[str, Any] | None:
-    if not path.exists():
-        return None
-    try:
-        payload = json.loads(path.read_text(encoding="utf-8"))
-    except json.JSONDecodeError as exc:
-        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME} at {path}: expected JSON object.") from exc
-    if not isinstance(payload, dict):
-        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME} at {path}: expected JSON object.")
-    return payload
-
-
-def write_bench_index(path: Path, payload: Mapping[str, Any]) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text(json.dumps(payload, default=make_serializable, indent=2, sort_keys=True) + "\n", encoding="utf-8")
-
-
-def validate_bench_index(
-    payload: Mapping[str, Any],
-    *,
-    output_root: Path,
-    require_artifacts: bool,
-) -> None:
-    if payload.get("version") != BENCH_INDEX_VERSION:
-        raise BenchIndexError(f"Unsupported {BENCH_INDEX_FILENAME} version: {payload.get('version')!r}.")
-    entries = payload.get("evals")
-    if not isinstance(entries, list):
-        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: 'evals' must be a list.")
-
-    normalized_root = output_root.resolve()
-    paths: list[Path] = []
-    identities: list[tuple[str, str, str | None]] = []
-    model_env_counts: Counter[tuple[str, str]] = Counter()
-    for offset, raw_entry in enumerate(entries, start=1):
-        if not isinstance(raw_entry, Mapping):
-            raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: eval entry {offset} must be an object.")
-        results_path = _entry_results_path(raw_entry)
-        _require_under_root(results_path, normalized_root)
-        paths.append(results_path.resolve())
-
-        model = _required_string(raw_entry, "model", offset)
-        env_id = _required_string(raw_entry, "env_id", offset)
-        variant_id = raw_entry.get("variant_id")
-        if variant_id is not None and not isinstance(variant_id, str):
-            raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: eval entry {offset} variant_id must be a string.")
-        identities.append((model, env_id, variant_id))
-        model_env_counts[(model, env_id)] += 1
-
-        if require_artifacts:
-            _require_artifact(results_path / "metadata.json")
-            _require_artifact(results_path / "results.jsonl")
-            _validate_metadata_identity(results_path / "metadata.json", model=model, env_id=env_id)
-
-    _raise_duplicates(paths, label="results_path")
-    for (model, env_id), count in model_env_counts.items():
-        if count > 1:
-            missing_variant = [identity for identity in identities if identity[:2] == (model, env_id) and not identity[2]]
-            if missing_variant:
-                raise BenchIndexError(
-                    f"Duplicate bench entries for model={model!r}, env_id={env_id!r} require explicit variant_id."
-                )
-    _raise_duplicates(identities, label="(model, env_id, variant_id)")
-
-
-def find_entry_for_results_path(payload: Mapping[str, Any], results_path: Path) -> Mapping[str, Any] | None:
-    target = results_path.resolve()
-    entries = payload.get("evals")
-    if not isinstance(entries, list):
-        return None
-    for entry in entries:
-        if isinstance(entry, Mapping) and _entry_results_path(entry).resolve() == target:
-            return entry
-    return None
-
-
-def plan_digest(payload: Mapping[str, Any]) -> str:
-    encoded = json.dumps(_canonicalize(payload), sort_keys=True, separators=(",", ":"), default=str).encode("utf-8")
-    return "sha256:" + hashlib.sha256(encoded).hexdigest()
-
-
-def _entry_results_path(entry: Mapping[str, Any]) -> Path:
-    raw_path = entry.get("results_path")
-    if not isinstance(raw_path, str) or not raw_path:
-        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: each eval entry needs a non-empty results_path.")
-    return Path(raw_path)
-
-
-def _required_string(entry: Mapping[str, Any], key: str, offset: int) -> str:
-    value = entry.get(key)
-    if not isinstance(value, str) or not value:
-        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: eval entry {offset} needs non-empty {key}.")
-    return value
-
-
-def _require_under_root(path: Path, normalized_root: Path) -> None:
-    try:
-        path.resolve().relative_to(normalized_root)
-    except ValueError as exc:
-        raise BenchIndexError(
-            f"Invalid {BENCH_INDEX_FILENAME}: results_path {path} is outside output root {normalized_root}."
-        ) from exc
-
-
-def _require_artifact(path: Path) -> None:
-    if not path.is_file():
-        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: required artifact is missing: {path}.")
-
-
-def _validate_metadata_identity(metadata_path: Path, *, model: str, env_id: str) -> None:
-    try:
-        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
-    except json.JSONDecodeError as exc:
-        raise BenchIndexError(f"Invalid metadata.json at {metadata_path}: expected JSON object.") from exc
-    if not isinstance(metadata, Mapping):
-        raise BenchIndexError(f"Invalid metadata.json at {metadata_path}: expected JSON object.")
-    for key, expected in (("model", model), ("env_id", env_id)):
-        current = metadata.get(key)
-        if current is not None and current != expected:
-            raise BenchIndexError(
-                f"{BENCH_INDEX_FILENAME} identity mismatch for {metadata_path.parent}: "
-                f"{key} sidecar={expected!r} metadata={current!r}."
-            )
-
-
-def _raise_duplicates(values: Sequence[Any], *, label: str) -> None:
-    duplicates = [value for value, count in Counter(values).items() if count > 1]
-    if duplicates:
-        rendered = ", ".join(str(value) for value in duplicates)
-        raise BenchIndexError(f"Invalid {BENCH_INDEX_FILENAME}: duplicate {label}: {rendered}.")
-
-
-def _canonicalize(value: Any) -> Any:
-    if isinstance(value, Mapping):
-        return {str(key): _canonicalize(value[key]) for key in sorted(value)}
-    if isinstance(value, list | tuple):
-        return [_canonicalize(item) for item in value]
-    if isinstance(value, set):
-        return [_canonicalize(item) for item in sorted(value, key=str)]
-    if isinstance(value, Path):
-        return str(value)
-    return value
-
-
-__all__ = [
-    "BENCH_INDEX_FILENAME",
-    "BENCH_INDEX_VERSION",
-    "BenchIndexError",
-    "build_bench_index",
-    "build_bench_index_entry",
-    "find_entry_for_results_path",
-    "plan_digest",
-    "read_bench_index",
-    "validate_bench_index",
-    "write_bench_index",
-]
diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py
index f2db9330..c7ca5d39 100644
--- a/medarc_verifiers/cli/eval_identity.py
+++ b/medarc_verifiers/cli/eval_identity.py
@@ -5,34 +5,31 @@
 import hashlib
 import json
 import re
-from collections import Counter, defaultdict
-from collections.abc import Mapping, Sequence
+from collections import Counter
+from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
-MEDARC_CONFIG_FINGERPRINT_KEY = "medarc_config_fingerprint"
-MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY = "medarc_config_fingerprint_payload"
 MEDARC_VARIANT_ID_KEY = "variant_id"
-MEDARC_VARIANT_PAYLOAD_KEY = "variant_payload"
+MEDARC_EVAL_METADATA_FILENAME = ".medarc_eval_metadata.json"
+BASE_VARIANT_ID = "base"
 
 _SLUG_PATTERN = re.compile(r"[^A-Za-z0-9._-]+")
 _MAX_SEGMENT_LENGTH = 80
 _MAX_VARIANT_ID_LENGTH = 160
 
+
 @dataclass(frozen=True)
 class EvalIdentity:
-    """Resolved model/env identity plus optional variant metadata."""
+    """Resolved model/env identity plus semantic variant metadata."""
 
     model_id: str
     env_id: str
-    variant_id: str | None = None
-    variant_payload: dict[str, Any] | None = None
+    variant_id: str = BASE_VARIANT_ID
 
     @property
     def dataset_id(self) -> str:
-        if self.variant_id is None:
-            return self.env_id
         return f"{self.env_id}::{self.variant_id}"
 
 
@@ -52,64 +49,15 @@ def slug_component(value: Any, *, max_length: int = _MAX_SEGMENT_LENGTH) -> str:
         slug = "value"
     if len(slug) <= max_length:
         return slug
-    digest = short_fingerprint(str(value), length=10)
+    digest = _short_text_digest(str(value), length=10)
     return f"{slug[: max_length - 11].rstrip('-._')}-{digest}"
 
 
-def plan_eval_paths(raw_configs: Sequence[Mapping[str, Any]], *, output_root: str | Path) -> list[EvalPathPlan]:
-    """Plan deterministic output paths, adding variants for colliding model/env pairs."""
-
-    keys = [(_model_id(config), _env_id(config)) for config in raw_configs]
-    counts = Counter(keys)
-    semantic_payloads = [_semantic_variant_source(config) for config in raw_configs]
-    differing_fields = _differing_fields_by_key(semantic_payloads, keys)
-
-    plans: list[EvalPathPlan] = []
-    for idx, (config, key) in enumerate(zip(raw_configs, keys)):
-        model_id, env_id = key
-        variant_payload: dict[str, Any] | None = None
-        variant_id: str | None = None
-        if counts[key] > 1:
-            variant_payload = extract_variant_payload(semantic_payloads[idx], differing_fields[key])
-            variant_id = generate_variant_id(variant_payload)
-
-        identity = EvalIdentity(
-            model_id=model_id, env_id=env_id, variant_id=variant_id, variant_payload=variant_payload
-        )
-        path = Path(output_root) / slug_component(model_id) / slug_component(env_id)
-        if variant_id is not None:
-            path = path / slug_component(variant_id, max_length=_MAX_VARIANT_ID_LENGTH)
-        plans.append(EvalPathPlan(identity=identity, results_path=path))
-
-    _ensure_unique_paths(plans)
-    return plans
-
-
-def extract_variant_payload(config: Mapping[str, Any], field_names: Sequence[str]) -> dict[str, Any]:
-    """Return the subset of config fields that distinguishes a variant."""
-
-    payload: dict[str, Any] = {}
-    for field_name in field_names:
-        if "." in field_name:
-            root, nested_key = field_name.split(".", 1)
-            value = config.get(root)
-            if isinstance(value, Mapping):
-                nested_payload = payload.setdefault(root, {})
-                if isinstance(nested_payload, dict) and nested_key in value:
-                    nested_payload[nested_key] = _canonicalize(value[nested_key])
-            else:
-                payload.setdefault(root, {})
-            continue
-        if field_name in config:
-            payload[field_name] = _canonicalize(config[field_name])
-    return payload
-
-
 def generate_variant_id(payload: Mapping[str, Any]) -> str:
-    """Generate a stable human-readable variant ID from distinguishing fields."""
+    """Generate a stable human-readable variant id for legacy export config keys."""
 
     if not payload:
-        return f"variant-{short_fingerprint(payload)}"
+        return BASE_VARIANT_ID
 
     segments: list[str] = []
     for key, value in sorted(payload.items()):
@@ -120,180 +68,132 @@ def generate_variant_id(payload: Mapping[str, Any]) -> str:
             segments.append(_variant_segment(key, value))
 
     if not segments:
-        return "baseline"
+        return BASE_VARIANT_ID
 
     variant_id = "__".join(segments)
     if len(variant_id) <= _MAX_VARIANT_ID_LENGTH and all(not segment.endswith("-hash") for segment in segments):
         return variant_id
-    return f"{variant_id[:120].rstrip('-._')}__{short_fingerprint(payload, length=12)}"
-
-
-def build_fingerprint_payload(config: Mapping[str, Any]) -> dict[str, Any]:
-    """Build the narrow semantic payload used for config-safe resume checks."""
-
-    payload: dict[str, Any] = {
-        "env_args": _canonicalize(config.get("env_args", {})),
-        "env_id": _env_id(config),
-        "model": _model_id(config),
-        "num_examples": config.get("num_examples"),
-        "rollouts_per_example": config.get("rollouts_per_example"),
-        "sampling_args": normalize_semantic_sampling_args(_sampling_args_with_top_level(config)),
-    }
-    return payload
-
+    return f"{variant_id[:120].rstrip('-._')}__{_short_json_digest(payload, length=12)}"
 
-def config_fingerprint(config: Mapping[str, Any]) -> str:
-    """Return the stable fingerprint for an eval config's benchmark identity."""
 
-    return short_fingerprint(build_fingerprint_payload(config), length=32)
-
-
-def normalize_semantic_sampling_args(sampling_args: Mapping[str, Any] | None) -> dict[str, Any]:
-    """Canonicalize generation arguments for fingerprinting."""
-
-    if not sampling_args:
-        return {}
-
-    normalized: dict[str, Any] = {}
-    for key, value in sampling_args.items():
-        if key == "extra_body":
-            _merge_extra_body_semantics(normalized, value)
-        elif key == "reasoning_effort":
-            normalized["reasoning_effort"] = _canonicalize(value)
-        elif key == "reasoning":
-            effort = _extract_reasoning_effort(value)
-            if effort is not None:
-                normalized["reasoning_effort"] = _canonicalize(effort)
-            else:
-                normalized[key] = _canonicalize(value)
-        else:
-            normalized[key] = _canonicalize(value)
-
-    return dict(sorted(normalized.items()))
-
-
-def metadata_identity_fields(config: Mapping[str, Any], identity: EvalIdentity) -> dict[str, Any]:
-    """Return MedARC metadata fields to write alongside upstream metadata."""
-
-    payload = build_fingerprint_payload(config)
-    return {
-        MEDARC_CONFIG_FINGERPRINT_KEY: short_fingerprint(payload, length=32),
-        MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY: payload,
-        MEDARC_VARIANT_ID_KEY: identity.variant_id,
-        MEDARC_VARIANT_PAYLOAD_KEY: identity.variant_payload,
-    }
+def plan_eval_paths(raw_configs: Sequence[Mapping[str, Any]], *, output_root: str | Path) -> list[EvalPathPlan]:
+    """Plan deterministic output paths for TOML bench eval configs."""
 
+    keys = [(_model_id(config), _env_id(config)) for config in raw_configs]
+    plans: list[EvalPathPlan] = []
+    for idx, (config, key) in enumerate(zip(raw_configs, keys)):
+        model_id, env_id = key
+        variant_id = _variant_id(config, index=idx + 1)
+
+        identity = EvalIdentity(model_id=model_id, env_id=env_id, variant_id=variant_id)
+        path = (
+            Path(output_root)
+            / slug_component(model_id)
+            / slug_component(env_id)
+            / slug_component(variant_id, max_length=_MAX_VARIANT_ID_LENGTH)
+        )
+        plans.append(EvalPathPlan(identity=identity, results_path=path))
 
-def short_fingerprint(value: Any, *, length: int = 12) -> str:
-    encoded = _canonical_json(value).encode("utf-8")
-    return hashlib.sha256(encoded).hexdigest()[:length]
+    _ensure_unique_identities(plans)
+    _ensure_unique_slugs(plans)
+    return plans
 
 
-def _semantic_variant_source(config: Mapping[str, Any]) -> dict[str, Any]:
-    return {
-        "env_args": _canonicalize(config.get("env_args", {})),
-        "num_examples": config.get("num_examples"),
-        "rollouts_per_example": config.get("rollouts_per_example"),
-        "sampling_args": normalize_semantic_sampling_args(_sampling_args_with_top_level(config)),
-    }
-
-
-def _sampling_args_with_top_level(config: Mapping[str, Any]) -> dict[str, Any]:
-    sampling_args = dict(config.get("sampling_args", {}) or {})
-    for key in ("max_tokens", "temperature"):
-        if key in config and key not in sampling_args:
-            sampling_args[key] = config[key]
-    return sampling_args
-
-
-def _differing_fields_by_key(
-    semantic_payloads: Sequence[Mapping[str, Any]], keys: Sequence[tuple[str, str]]
-) -> dict[tuple[str, str], list[str]]:
-    grouped: dict[tuple[str, str], list[Mapping[str, Any]]] = defaultdict(list)
-    for payload, key in zip(semantic_payloads, keys):
-        grouped[key].append(payload)
-
-    differing: dict[tuple[str, str], list[str]] = {}
-    for key, configs in grouped.items():
-        if len(configs) < 2:
-            differing[key] = []
-            continue
-        field_names = sorted(set().union(*(payload.keys() for payload in configs)))
-        differing[key] = []
-        for field_name in field_names:
-            values = [payload.get(field_name) for payload in configs]
-            if all(isinstance(value, Mapping) for value in values if value is not None):
-                nested_names = sorted(
-                    {
-                        str(nested_key)
-                        for value in values
-                        if isinstance(value, Mapping)
-                        for nested_key in value.keys()
-                    }
-                )
-                differing[key].extend(
-                    f"{field_name}.{nested_name}"
-                    for nested_name in nested_names
-                    if len(
-                        {
-                            _canonical_json(value.get(nested_name) if isinstance(value, Mapping) else None)
-                            for value in values
-                        }
+def _ensure_unique_identities(plans: Sequence[EvalPathPlan]) -> None:
+    identities = [(plan.identity.model_id, plan.identity.env_id, plan.identity.variant_id) for plan in plans]
+    duplicates = sorted(identity for identity, count in Counter(identities).items() if count > 1)
+    if duplicates:
+        rendered = ", ".join(
+            f"model={model!r}, env_id={env_id!r}, variant_id={variant_id!r}"
+            for model, env_id, variant_id in duplicates
+        )
+        raise ValueError(f"Duplicate TOML eval identity; add a distinct variant_id/name: {rendered}")
+
+
+def _ensure_unique_slugs(plans: Sequence[EvalPathPlan]) -> None:
+    _raise_slug_collisions(
+        "model",
+        ((slug_component(plan.identity.model_id), plan.identity.model_id) for plan in plans),
+    )
+
+    _raise_slug_collisions(
+        "env",
+        (
+            (
+                f"{slug_component(plan.identity.model_id)}/{slug_component(plan.identity.env_id)}",
+                f"{plan.identity.model_id}/{plan.identity.env_id}",
+            )
+            for plan in plans
+        ),
+    )
+    _raise_slug_collisions(
+        "variant",
+        (
+            (
+                "/".join(
+                    (
+                        slug_component(plan.identity.model_id),
+                        slug_component(plan.identity.env_id),
+                        slug_component(plan.identity.variant_id, max_length=_MAX_VARIANT_ID_LENGTH),
                     )
-                    > 1
-                )
-                continue
-            if len({_canonical_json(value) for value in values}) > 1:
-                differing[key].append(field_name)
-    return differing
+                ),
+                f"{plan.identity.model_id}/{plan.identity.env_id}/{plan.identity.variant_id}",
+            )
+            for plan in plans
+        ),
+    )
 
-
-def _ensure_unique_paths(plans: Sequence[EvalPathPlan]) -> None:
     paths = [plan.results_path for plan in plans]
     duplicate_paths = sorted(path for path, count in Counter(paths).items() if count > 1)
     if duplicate_paths:
         rendered = ", ".join(str(path) for path in duplicate_paths)
-        raise ValueError(f"Deterministic eval path collision after variant planning: {rendered}")
+        raise ValueError(f"Deterministic eval path collision: {rendered}")
 
 
-def _variant_segment(key: str, value: Any) -> str:
-    key_slug = slug_component(key, max_length=40)
-    value_slug = slug_component(_variant_value_text(value), max_length=80)
-    if isinstance(value, Mapping | Sequence) and not isinstance(value, str | bytes | bytearray):
-        return f"{key_slug}-{value_slug}-{short_fingerprint(value, length=8)}"
-    return f"{key_slug}-{value_slug}"
-
+def _raise_slug_collisions(label: str, pairs: Iterable[tuple[str, str]]) -> None:
+    values_by_slug: dict[str, set[str]] = {}
+    for slug, value in pairs:
+        values_by_slug.setdefault(slug, set()).add(value)
+    collisions = {slug: sorted(values) for slug, values in values_by_slug.items() if len(values) > 1}
+    if not collisions:
+        return
+    rendered = "; ".join(f"{slug}: {values}" for slug, values in sorted(collisions.items()))
+    raise ValueError(f"Deterministic eval {label} slug collision: {rendered}")
 
-def _variant_value_text(value: Any) -> str:
-    if isinstance(value, bool):
-        return str(value).lower()
-    if value is None:
-        return "none"
-    if isinstance(value, int | float | str):
-        return str(value)
-    return "hash"
 
+def _variant_id(config: Mapping[str, Any], *, index: int) -> str:
+    raw_variant = config.get("variant_id")
+    raw_name = config.get("name")
+    variant = _normalize_variant(raw_variant, config=config, field="variant_id", index=index)
+    name = _normalize_variant(raw_name, config=config, field="name", index=index)
+    if variant and name and variant != name:
+        raise ValueError(
+            f"TOML eval {index} has conflicting variant_id/name values: {variant!r} != {name!r}."
+        )
+    return variant or name or BASE_VARIANT_ID
 
-def _merge_extra_body_semantics(normalized: dict[str, Any], extra_body: Any) -> None:
-    if not isinstance(extra_body, Mapping):
-        normalized["extra_body"] = _canonicalize(extra_body)
-        return
 
-    for key, value in extra_body.items():
-        if key == "reasoning":
-            effort = _extract_reasoning_effort(value)
-            if effort is not None:
-                normalized["reasoning_effort"] = _canonicalize(effort)
+def _normalize_variant(value: Any, *, config: Mapping[str, Any], field: str, index: int) -> str | None:
+    if value is None:
+        return None
+    text = _expand_variant_template(str(value).strip(), config)
+    if not text:
+        raise ValueError(f"TOML eval {index} {field} must not be empty.")
+    return text
+
+
+def _expand_variant_template(template: str, config: Mapping[str, Any]) -> str:
+    def replace(match: re.Match[str]) -> str:
+        path = match.group(1).strip()
+        value: Any = config
+        for part in path.split("."):
+            if isinstance(value, Mapping) and part in value:
+                value = value[part]
             else:
-                normalized[key] = _canonicalize(value)
-        else:
-            normalized[key] = _canonicalize(value)
+                raise ValueError(f"Variant template references unknown field: {path}")
+        return str(value)
 
-
-def _extract_reasoning_effort(value: Any) -> Any:
-    if not isinstance(value, Mapping):
-        return None
-    return value.get("effort") or value.get("reasoning_effort")
+    return re.sub(r"\{([^{}]+)\}", replace, template).strip()
 
 
 def _model_id(config: Mapping[str, Any]) -> str:
@@ -312,8 +212,31 @@ def _env_id(config: Mapping[str, Any]) -> str:
     return str(value)
 
 
-def _canonical_json(value: Any) -> str:
-    return json.dumps(_canonicalize(value), sort_keys=True, separators=(",", ":"), default=str)
+def _short_text_digest(value: str, *, length: int) -> str:
+    return hashlib.sha256(value.encode("utf-8")).hexdigest()[:length]
+
+
+def _short_json_digest(value: Any, *, length: int) -> str:
+    encoded = json.dumps(_canonicalize(value), sort_keys=True, separators=(",", ":"), default=str).encode("utf-8")
+    return hashlib.sha256(encoded).hexdigest()[:length]
+
+
+def _variant_segment(key: str, value: Any) -> str:
+    key_slug = slug_component(key, max_length=40)
+    value_slug = slug_component(_variant_value_text(value), max_length=80)
+    if isinstance(value, Mapping | Sequence) and not isinstance(value, str | bytes | bytearray):
+        return f"{key_slug}-{value_slug}-{_short_json_digest(value, length=8)}"
+    return f"{key_slug}-{value_slug}"
+
+
+def _variant_value_text(value: Any) -> str:
+    if isinstance(value, bool):
+        return str(value).lower()
+    if value is None:
+        return "none"
+    if isinstance(value, int | float | str):
+        return str(value)
+    return "hash"
 
 
 def _canonicalize(value: Any) -> Any:
@@ -329,19 +252,12 @@ def _canonicalize(value: Any) -> Any:
 
 
 __all__ = [
+    "BASE_VARIANT_ID",
     "EvalIdentity",
     "EvalPathPlan",
-    "MEDARC_CONFIG_FINGERPRINT_KEY",
-    "MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY",
+    "MEDARC_EVAL_METADATA_FILENAME",
     "MEDARC_VARIANT_ID_KEY",
-    "MEDARC_VARIANT_PAYLOAD_KEY",
-    "build_fingerprint_payload",
-    "config_fingerprint",
-    "extract_variant_payload",
     "generate_variant_id",
-    "metadata_identity_fields",
-    "normalize_semantic_sampling_args",
     "plan_eval_paths",
-    "short_fingerprint",
     "slug_component",
 ]
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index cb786f1e..1e7dca87 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -35,16 +35,13 @@
 )
 from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, EnvironmentExportConfig
 from medarc_verifiers.cli._single_run import run_single_mode
-from medarc_verifiers.cli.bench_index import (
-    BENCH_INDEX_FILENAME,
-    build_bench_index,
-    find_entry_for_results_path,
-    read_bench_index,
-    validate_bench_index,
-    write_bench_index,
+from medarc_verifiers.cli.eval_identity import (
+    MEDARC_EVAL_METADATA_FILENAME,
+    EvalPathPlan,
+    generate_variant_id,
+    plan_eval_paths,
+    slug_component,
 )
-from medarc_verifiers.cli.eval_identity import EvalPathPlan, generate_variant_id, plan_eval_paths
-from medarc_verifiers.cli.eval_identity import metadata_identity_fields
 from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub
 from medarc_verifiers.cli.process import PROCESS_DEFAULT_STATUS_FILTER, ProcessOptions, ProcessResult, run_process
 from medarc_verifiers.cli.utils.config_io import load_mapping_file
@@ -76,6 +73,7 @@ def build_batch_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("-c", "--config", required=True, type=Path, help="Path to an upstream TOML eval config file.")
     parser.add_argument("--force", action="store_true", help="Archive existing deterministic output and rerun.")
+    parser.add_argument("--resume", action="store_true", help="Resume an existing deterministic output path.")
     parser.add_argument("--output-dir", type=Path, help="Override the output directory from the configuration.")
     parser.add_argument(
         "--env-dir",
@@ -1260,7 +1258,10 @@ def _validate_toml_selection_args(args: argparse.Namespace, *, parser: argparse.
 
 def _run_toml_bench(args: argparse.Namespace) -> int:
     config_path = Path(args.config).expanduser()
-    raw_configs = _prepare_toml_raw_configs(load_toml_eval_configs(config_path), args)
+    raw_configs = _prepare_toml_raw_configs(
+        load_toml_eval_configs(config_path, extra_valid_fields={"name", "variant_id"}),
+        args,
+    )
     overrides = EvalConfigOverrides(
         model=args.model,
         provider=args.provider,
@@ -1272,22 +1273,15 @@ def _run_toml_bench(args: argparse.Namespace) -> int:
         sampling_args=getattr(args, "cli_sampling_args", None),
     )
     eval_configs = [build_eval_config(raw, overrides=overrides) for raw in raw_configs]
-    plan_inputs = [_eval_config_identity_payload(config) for config in eval_configs]
+    plan_inputs = [_eval_config_identity_payload(config, raw) for config, raw in zip(eval_configs, raw_configs)]
     output_root = _resolve_toml_output_root(eval_configs, args)
     path_plans = plan_eval_paths(plan_inputs, output_root=output_root)
     eval_configs, path_plans, plan_inputs = _select_toml_plan(eval_configs, path_plans, plan_inputs, args)
-    bench_index = build_bench_index(
-        output_root=output_root,
-        source_config=config_path,
-        eval_configs=eval_configs,
-        path_plans=path_plans,
-        plan_payloads=plan_inputs,
-    )
 
     _print_toml_bench_plan(eval_configs, path_plans, dry_run=bool(args.dry_run))
     if args.dry_run:
         return 0
-    return _execute_toml_plan(eval_configs, path_plans, plan_inputs, bench_index, output_root, args)
+    return _execute_toml_plan(eval_configs, path_plans, output_root, args)
 
 
 def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]:
@@ -1343,59 +1337,23 @@ def _select_toml_plan(
 def _execute_toml_plan(
     eval_configs: Sequence[Any],
     path_plans: Sequence[EvalPathPlan],
-    plan_inputs: Sequence[Mapping[str, Any]],
-    bench_index: Mapping[str, Any],
     output_root: Path,
     args: argparse.Namespace,
 ) -> int:
     failures = 0
-    bench_index_path = output_root / BENCH_INDEX_FILENAME
-    existing_bench_index = _validate_existing_bench_index(
-        bench_index_path, bench_index, output_root=output_root, force=bool(args.force)
-    )
-    effective_bench_index = _merge_bench_index(existing_bench_index, bench_index, output_root=output_root)
-    persisted_base_index = existing_bench_index
-    if args.force and existing_bench_index is not None:
-        persisted_base_index = _bench_index_without_paths(
-            existing_bench_index,
-            [path_plan.results_path for path_plan in path_plans],
-        )
-    persisted_bench_index = _merge_bench_index(
-        persisted_base_index,
-        _bench_index_with_entries(bench_index, []),
-        output_root=output_root,
-    )
-    write_bench_index(bench_index_path, persisted_bench_index)
-    for index, (config, path_plan, _plan_input) in enumerate(zip(eval_configs, path_plans, plan_inputs), start=1):
-        metadata_fields = metadata_identity_fields(_eval_config_identity_payload(config), path_plan.identity)
+    _validate_model_eval_metadata_for_plan(output_root, path_plans)
+    for index, (config, path_plan) in enumerate(zip(eval_configs, path_plans), start=1):
         results_path = path_plan.results_path
-        sidecar_entry = find_entry_for_results_path(effective_bench_index, results_path)
-        if sidecar_entry is None:
-            raise ValueError(f"Internal bench planning error: missing {BENCH_INDEX_FILENAME} entry for {results_path}.")
         try:
             _prepare_toml_results_dir(
                 results_path,
-                metadata_fields,
-                config,
-                sidecar_entry=sidecar_entry,
-                output_root=output_root,
                 force=bool(args.force),
+                resume=bool(args.resume),
             )
             run_config = config.model_copy(update={"resume_path": results_path, "save_results": True})
             logger.info("Running TOML eval %d/%d: %s on %s", index, len(eval_configs), config.env_id, config.model)
             asyncio.run(_run_one_toml_eval(run_config))
-            _merge_metadata_fields(results_path, metadata_fields)
-            validate_bench_index(
-                {"version": 1, "evals": [dict(sidecar_entry)]},
-                output_root=output_root,
-                require_artifacts=True,
-            )
-            persisted_bench_index = _merge_bench_index(
-                persisted_bench_index,
-                _bench_index_with_entries(bench_index, [sidecar_entry]),
-                output_root=output_root,
-            )
-            write_bench_index(bench_index_path, persisted_bench_index)
+            _write_model_eval_metadata(output_root, path_plan)
         except Exception as exc:  # noqa: BLE001
             failures += 1
             logger.exception("TOML eval %d failed: %s", index, exc)
@@ -1405,8 +1363,6 @@ def _execute_toml_plan(
             import time
 
             time.sleep(float(args.sleep))
-    if failures == 0:
-        validate_bench_index(persisted_bench_index, output_root=output_root, require_artifacts=True)
     return 1 if failures else 0
 
 
@@ -1416,166 +1372,31 @@ async def _run_one_toml_eval(config: Any) -> Any:
 
 def _prepare_toml_results_dir(
     results_path: Path,
-    metadata_fields: Mapping[str, Any],
-    config: Any,
     *,
-    sidecar_entry: Mapping[str, Any],
-    output_root: Path,
     force: bool,
+    resume: bool,
 ) -> None:
     if results_path.exists() and force:
         _archive_existing_path(results_path)
 
     metadata_path = results_path / "metadata.json"
     results_file = results_path / "results.jsonl"
-    has_existing_state = metadata_path.exists() or results_file.exists()
-    if has_existing_state:
-        _validate_toml_resume_sidecar(results_path, sidecar_entry, output_root=output_root)
-        _validate_toml_resume_metadata(results_path, metadata_fields)
-
-    if has_existing_state:
-        _merge_metadata_fields(results_path, metadata_fields)
-        return
-
-    results_path.mkdir(parents=True, exist_ok=True)
-
-
-def _validate_existing_bench_index(
-    bench_index_path: Path,
-    planned_index: Mapping[str, Any],
-    *,
-    output_root: Path,
-    force: bool,
-) -> Mapping[str, Any] | None:
-    existing = read_bench_index(bench_index_path)
-    if existing is None:
-        if force:
-            return None
-        for entry in planned_index.get("evals", []):
-            if not isinstance(entry, Mapping):
-                continue
-            results_path = Path(str(entry.get("results_path", "")))
-            if (results_path / "metadata.json").exists() or (results_path / "results.jsonl").exists():
-                raise ValueError(
-                    f"Cannot reuse deterministic bench path {results_path}: {BENCH_INDEX_FILENAME} is missing. "
-                    "Use --force to archive and rerun."
-                )
-        return None
-    validate_bench_index(existing, output_root=output_root, require_artifacts=False)
-    if force:
-        return existing
-    for entry in planned_index.get("evals", []):
-        if not isinstance(entry, Mapping):
-            continue
-        results_path = Path(str(entry.get("results_path", "")))
-        existing_entry = find_entry_for_results_path(existing, results_path)
-        if existing_entry is None:
-            if (results_path / "metadata.json").exists() or (results_path / "results.jsonl").exists():
-                raise ValueError(
-                    f"Cannot reuse deterministic bench path {results_path}: {BENCH_INDEX_FILENAME} has no entry "
-                    "for that path. Use --force to archive and rerun."
-                )
-            continue
-        if existing_entry.get("plan_digest") != entry.get("plan_digest"):
+    if results_path.exists():
+        has_metadata = metadata_path.is_file()
+        has_results = results_file.is_file()
+        if not resume:
+            raise ValueError(
+                f"Output already exists: {results_path}. Use --resume to continue this output, "
+                "--force to archive and rerun, or add variant_id/name if this is a distinct eval."
+            )
+        if not (has_metadata and has_results):
             raise ValueError(
-                f"Cannot reuse deterministic bench path {results_path}: {BENCH_INDEX_FILENAME} plan_digest mismatch "
-                f"(saved={existing_entry.get('plan_digest')!r}, current={entry.get('plan_digest')!r}). "
+                f"Cannot resume {results_path}: metadata.json and results.jsonl are both required. "
                 "Use --force to archive and rerun."
             )
-    return existing
-
-
-def _merge_bench_index(
-    existing_index: Mapping[str, Any] | None,
-    planned_index: Mapping[str, Any],
-    *,
-    output_root: Path,
-) -> dict[str, Any]:
-    if existing_index is None:
-        return dict(planned_index)
-
-    merged_entries: list[dict[str, Any]] = []
-    planned_by_path = {
-        Path(str(entry["results_path"])).resolve(): dict(entry)
-        for entry in planned_index.get("evals", [])
-        if isinstance(entry, Mapping) and entry.get("results_path")
-    }
-    emitted_paths: set[Path] = set()
-    for entry in existing_index.get("evals", []):
-        if not isinstance(entry, Mapping) or not entry.get("results_path"):
-            continue
-        path = Path(str(entry["results_path"])).resolve()
-        merged_entries.append(planned_by_path.get(path, dict(entry)))
-        emitted_paths.add(path)
-    for path, entry in planned_by_path.items():
-        if path not in emitted_paths:
-            merged_entries.append(entry)
-    for index, entry in enumerate(merged_entries, start=1):
-        entry["index"] = index
-
-    merged = dict(planned_index)
-    merged["evals"] = merged_entries
-    validate_bench_index(merged, output_root=output_root, require_artifacts=False)
-    return merged
-
-
-def _bench_index_with_entries(
-    bench_index: Mapping[str, Any], entries: Sequence[Mapping[str, Any]]
-) -> dict[str, Any]:
-    subset = dict(bench_index)
-    subset["evals"] = [dict(entry) for entry in entries]
-    return subset
-
-
-def _bench_index_without_paths(bench_index: Mapping[str, Any], paths: Sequence[Path]) -> dict[str, Any]:
-    excluded = {path.resolve() for path in paths}
-    entries = [
-        dict(entry)
-        for entry in bench_index.get("evals", [])
-        if isinstance(entry, Mapping)
-        and entry.get("results_path")
-        and Path(str(entry["results_path"])).resolve() not in excluded
-    ]
-    return _bench_index_with_entries(bench_index, entries)
-
-
-def _validate_toml_resume_sidecar(
-    results_path: Path,
-    sidecar_entry: Mapping[str, Any],
-    *,
-    output_root: Path,
-) -> None:
-    validate_bench_index(
-        {"version": 1, "evals": [dict(sidecar_entry)]},
-        output_root=output_root,
-        require_artifacts=False,
-    )
-
-
-def _validate_toml_resume_metadata(results_path: Path, metadata_fields: Mapping[str, Any]) -> None:
-    metadata_path = results_path / "metadata.json"
-    if not metadata_path.exists():
-        raise ValueError(f"Cannot resume {results_path}: metadata.json is missing.")
-    try:
-        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
-    except json.JSONDecodeError as exc:
-        raise ValueError(f"Cannot resume {results_path}: metadata.json is invalid JSON.") from exc
-    expected = metadata_fields.get("medarc_config_fingerprint")
-    current = metadata.get("medarc_config_fingerprint") if isinstance(metadata, Mapping) else None
-    if current != expected:
-        raise ValueError(
-            f"Cannot resume {results_path}: MedARC config fingerprint mismatch "
-            f"(saved={current!r}, current={expected!r}). Use --force to archive and rerun."
-        )
-
+        return
 
-def _merge_metadata_fields(results_path: Path, metadata_fields: Mapping[str, Any]) -> None:
-    metadata_path = results_path / "metadata.json"
-    metadata = json.loads(metadata_path.read_text(encoding="utf-8")) if metadata_path.exists() else {}
-    if not isinstance(metadata, dict):
-        metadata = {}
-    metadata.update(metadata_fields)
-    _write_json(metadata_path, metadata)
+    results_path.mkdir(parents=True, exist_ok=True)
 
 
 def _archive_existing_path(path: Path) -> Path:
@@ -1589,13 +1410,72 @@ def _archive_existing_path(path: Path) -> Path:
     return candidate
 
 
-def _write_json(path: Path, payload: Mapping[str, Any]) -> None:
+def _validate_model_eval_metadata_for_plan(output_root: Path, path_plans: Sequence[EvalPathPlan]) -> None:
+    for path_plan in path_plans:
+        model_dir = output_root / slug_component(path_plan.identity.model_id)
+        helper_path = model_dir / MEDARC_EVAL_METADATA_FILENAME
+        if not helper_path.exists():
+            continue
+        try:
+            loaded = json.loads(helper_path.read_text(encoding="utf-8"))
+        except json.JSONDecodeError as exc:
+            raise ValueError(f"Cannot update {helper_path}: invalid JSON.") from exc
+        if not isinstance(loaded, dict):
+            continue
+        existing_model = loaded.get("model")
+        if existing_model and existing_model != path_plan.identity.model_id:
+            raise ValueError(
+                f"Cannot update {helper_path}: model slug is already associated with {existing_model!r}, "
+                f"not {path_plan.identity.model_id!r}."
+            )
+
+
+def _write_model_eval_metadata(output_root: Path, path_plan: EvalPathPlan) -> None:
+    model_dir = output_root / slug_component(path_plan.identity.model_id)
+    try:
+        relative_results_path = path_plan.results_path.relative_to(model_dir).as_posix()
+    except ValueError as exc:
+        raise ValueError(f"Internal bench planning error: {path_plan.results_path} is outside {model_dir}.") from exc
+
+    helper_path = model_dir / MEDARC_EVAL_METADATA_FILENAME
+    payload: dict[str, Any] = {"version": 1, "model": path_plan.identity.model_id, "outputs": {}}
+    if helper_path.exists():
+        try:
+            loaded = json.loads(helper_path.read_text(encoding="utf-8"))
+        except json.JSONDecodeError as exc:
+            raise ValueError(f"Cannot update {helper_path}: invalid JSON.") from exc
+        if isinstance(loaded, dict):
+            payload.update(loaded)
+        existing_model = payload.get("model")
+        if existing_model and existing_model != path_plan.identity.model_id:
+            raise ValueError(
+                f"Cannot update {helper_path}: model slug is already associated with {existing_model!r}, "
+                f"not {path_plan.identity.model_id!r}."
+            )
+
+    outputs = payload.get("outputs")
+    if not isinstance(outputs, dict):
+        outputs = {}
+    outputs[relative_results_path] = {
+        "env_id": path_plan.identity.env_id,
+        "variant_id": path_plan.identity.variant_id,
+        "results_path": relative_results_path,
+    }
+    payload["version"] = 1
+    payload["model"] = path_plan.identity.model_id
+    payload["outputs"] = outputs
+    _write_json_atomic(helper_path, payload)
+
+
+def _write_json_atomic(path: Path, payload: Mapping[str, Any]) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text(json.dumps(payload, default=make_serializable, sort_keys=True), encoding="utf-8")
+    tmp_path = path.with_name(f".{path.name}.tmp")
+    tmp_path.write_text(json.dumps(payload, default=make_serializable, sort_keys=True), encoding="utf-8")
+    tmp_path.replace(path)
 
 
-def _eval_config_identity_payload(config: Any) -> dict[str, Any]:
-    return {
+def _eval_config_identity_payload(config: Any, raw: Mapping[str, Any] | None = None) -> dict[str, Any]:
+    payload = {
         "env_args": dict(config.env_args or {}),
         "env_id": config.env_id,
         "model": config.model,
@@ -1603,6 +1483,12 @@ def _eval_config_identity_payload(config: Any) -> dict[str, Any]:
         "rollouts_per_example": config.rollouts_per_example,
         "sampling_args": dict(config.sampling_args or {}),
     }
+    if raw:
+        if "variant_id" in raw:
+            payload["variant_id"] = raw["variant_id"]
+        if "name" in raw:
+            payload["name"] = raw["name"]
+    return payload
 
 
 def _print_toml_bench_plan(eval_configs: Sequence[Any], path_plans: Sequence[EvalPathPlan], *, dry_run: bool) -> None:
diff --git a/medarc_verifiers/cli/process/discovery.py b/medarc_verifiers/cli/process/discovery.py
index d6ae1c8a..44ed508e 100644
--- a/medarc_verifiers/cli/process/discovery.py
+++ b/medarc_verifiers/cli/process/discovery.py
@@ -11,12 +11,9 @@
 
 from pydantic import ValidationError
 
-from medarc_verifiers.cli.bench_index import BENCH_INDEX_FILENAME, read_bench_index, validate_bench_index
 from medarc_verifiers.cli.eval_identity import (
-    MEDARC_CONFIG_FINGERPRINT_KEY,
-    MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
+    MEDARC_EVAL_METADATA_FILENAME,
     MEDARC_VARIANT_ID_KEY,
-    MEDARC_VARIANT_PAYLOAD_KEY,
 )
 from medarc_verifiers.cli._manifest import (
     MANIFEST_FILENAME,
@@ -371,20 +368,18 @@ def _candidate_evals_roots(runs_path: Path) -> tuple[Path, ...]:
 
 def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
     """Yield synthetic run records for upstream eval output directories."""
-    bench_index_path = evals_root / BENCH_INDEX_FILENAME
-    if bench_index_path.exists():
-        yield from _iter_bench_index_records(evals_root, bench_index_path)
-        return
-
     try:
         results_paths = sorted(evals_root.rglob(RESULTS_FILENAME))
     except OSError as exc:  # noqa: FBT003
         logger.warning("Failed to scan eval outputs under %s: %s", evals_root, exc)
         return
 
+    helper_entries = _load_model_helper_entries(evals_root)
     seen: set[Path] = set()
     for results_path in results_paths:
         results_dir = results_path.parent
+        if results_path.name == RESULTS_FILENAME and results_dir.name == "__pycache__":
+            continue
         key = _dedupe_key(results_dir)
         if key in seen:
             continue
@@ -392,113 +387,22 @@ def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
         metadata_path = results_dir / METADATA_FILENAME
         if not metadata_path.exists():
             continue
-        record = _build_eval_output_record(evals_root, results_dir)
-        if record is not None:
-            yield record
-
-
-def _iter_bench_index_records(evals_root: Path, bench_index_path: Path) -> Iterator[RunRecord]:
-    bench_index = read_bench_index(bench_index_path)
-    if bench_index is None:
-        return
-    validate_bench_index(bench_index, output_root=evals_root, require_artifacts=True)
-    entries = bench_index.get("evals", [])
-    if not isinstance(entries, list):
-        return
-    source_config = _string_or_none(bench_index.get("source_config"))
-    for entry in entries:
-        if not isinstance(entry, Mapping):
-            continue
-        record = _build_bench_index_record(evals_root, bench_index_path, entry, source_config=source_config)
+        record = _build_eval_output_record(evals_root, results_dir, helper_entries.get(_dedupe_key(results_dir)))
         if record is not None:
             yield record
 
 
-def _build_bench_index_record(
+def _build_eval_output_record(
     evals_root: Path,
-    bench_index_path: Path,
-    entry: Mapping[str, Any],
-    *,
-    source_config: str | None,
+    results_dir: Path,
+    helper_entry: Mapping[str, Any] | None = None,
 ) -> RunRecord | None:
-    results_dir = Path(str(entry["results_path"]))
     metadata_path = results_dir / METADATA_FILENAME
     metadata_payload = _read_metadata_payload(metadata_path)
     if metadata_payload is None:
         return None
 
-    model_id = str(entry["model"])
-    env_id = str(entry["env_id"])
-    variant_id = _string_or_none(entry.get("variant_id"))
-    variant_payload = entry.get("variant_payload") if isinstance(entry.get("variant_payload"), Mapping) else None
-    updated_at = _path_timestamp(metadata_path)
-    job_run_id = "::".join(part for part in (model_id, env_id, variant_id) if part)
-    env_args = _mapping_or_empty(entry.get("env_args")) or _mapping_or_empty(metadata_payload.get("env_args"))
-    sampling_args = _mapping_or_empty(entry.get("sampling_args")) or _mapping_or_empty(
-        metadata_payload.get("sampling_args")
-    )
-    plan_digest = _string_or_none(entry.get("plan_digest"))
-
-    manifest = RunManifestInfo(
-        job_run_id=job_run_id,
-        run_name=job_run_id,
-        summary_completed=1,
-        summary_total=1,
-        summary_total_known=True,
-        manifest_path=bench_index_path,
-        run_dir=evals_root,
-        created_at=updated_at,
-        updated_at=updated_at,
-        config_source=source_config,
-        config_checksum=plan_digest,
-        run_summary_path=results_dir / "summary.json",
-        models={model_id: {"sampling_args": sampling_args}},
-        env_templates={env_id: {"module": env_id}},
-    )
-
-    return RunRecord(
-        manifest=manifest,
-        job_id=results_dir.name,
-        model_id=model_id,
-        manifest_env_id=env_id,
-        results_dir_name=results_dir.name,
-        results_dir=results_dir,
-        metadata_path=metadata_path,
-        results_path=results_dir / RESULTS_FILENAME,
-        summary_path=results_dir / "summary.json",
-        has_metadata=True,
-        has_results=True,
-        has_summary=(results_dir / "summary.json").exists(),
-        status="completed",
-        duration_seconds=None,
-        reason=None,
-        started_at=None,
-        ended_at=None,
-        avg_reward=_float_or_none(metadata_payload.get("avg_reward")),
-        num_examples=_int_or_none(entry.get("num_examples")) or _int_or_none(metadata_payload.get("num_examples")),
-        rollouts_per_example=_int_or_none(entry.get("rollouts_per_example"))
-        or _int_or_none(metadata_payload.get("rollouts_per_example")),
-        row_count=_count_results_rows(results_dir / RESULTS_FILENAME),
-        env_args=env_args,
-        sampling_args=sampling_args,
-        env_config={
-            "id": env_id,
-            "module": env_id,
-            "variant_id": variant_id,
-            "variant_payload": variant_payload,
-            "plan_digest": plan_digest,
-        },
-        model_config={"sampling_args": sampling_args},
-    )
-
-
-def _build_eval_output_record(evals_root: Path, results_dir: Path) -> RunRecord | None:
-    metadata_path = results_dir / METADATA_FILENAME
-    metadata_payload = _read_metadata_payload(metadata_path)
-    if metadata_payload is None:
-        return None
-
-    layout = _infer_eval_output_layout(evals_root, results_dir, metadata_payload)
+    layout = _infer_eval_output_layout(evals_root, results_dir, metadata_payload, helper_entry)
     updated_at = _path_timestamp(metadata_path)
     job_run_id = layout["job_run_id"]
     job_id = layout["job_id"]
@@ -516,7 +420,7 @@ def _build_eval_output_record(evals_root: Path, results_dir: Path) -> RunRecord
         created_at=updated_at,
         updated_at=updated_at,
         config_source=None,
-        config_checksum=_string_or_none(metadata_payload.get(MEDARC_CONFIG_FINGERPRINT_KEY)),
+        config_checksum=None,
         run_summary_path=results_dir / "summary.json",
         models={model_id: {"sampling_args": _mapping_or_empty(metadata_payload.get("sampling_args"))}},
         env_templates={env_id: {"module": env_id}},
@@ -552,16 +456,18 @@ def _build_eval_output_record(evals_root: Path, results_dir: Path) -> RunRecord
         env_config={
             "id": env_id,
             "module": env_id,
-            "variant_id": metadata_payload.get(MEDARC_VARIANT_ID_KEY),
-            "variant_payload": metadata_payload.get(MEDARC_VARIANT_PAYLOAD_KEY),
-            "medarc_config_fingerprint": metadata_payload.get(MEDARC_CONFIG_FINGERPRINT_KEY),
-            "medarc_config_fingerprint_payload": metadata_payload.get(MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY),
+            "variant_id": layout.get("variant_id"),
         },
         model_config={"sampling_args": sampling_args},
     )
 
 
-def _infer_eval_output_layout(evals_root: Path, results_dir: Path, metadata_payload: Mapping[str, Any]) -> dict[str, str]:
+def _infer_eval_output_layout(
+    evals_root: Path,
+    results_dir: Path,
+    metadata_payload: Mapping[str, Any],
+    helper_entry: Mapping[str, Any] | None = None,
+) -> dict[str, str]:
     try:
         parts = results_dir.relative_to(evals_root).parts
     except ValueError:
@@ -569,16 +475,21 @@ def _infer_eval_output_layout(evals_root: Path, results_dir: Path, metadata_payl
 
     metadata_env_id = _string_or_none(metadata_payload.get("env_id"))
     metadata_model = _string_or_none(metadata_payload.get("model"))
+    helper_env_id = _string_or_none(helper_entry.get("env_id") if helper_entry else None)
+    helper_variant_id = _string_or_none(helper_entry.get("variant_id") if helper_entry else None)
     parent_name = results_dir.parent.name
     if "--" in parent_name and len(parts) >= 2:
         env_from_parent, model_from_parent = parent_name.split("--", 1)
-        env_id = metadata_env_id or env_from_parent
+        env_id = helper_env_id or metadata_env_id or env_from_parent
         model_id = metadata_model or model_from_parent
         job_run_id = results_dir.name
+        variant_id = helper_variant_id
     else:
         model_id = metadata_model or (parts[0] if len(parts) >= 1 else "unknown")
-        env_id = metadata_env_id or (parts[1] if len(parts) >= 2 else results_dir.name)
-        variant_id = _string_or_none(metadata_payload.get(MEDARC_VARIANT_ID_KEY))
+        env_id = helper_env_id or metadata_env_id or (parts[1] if len(parts) >= 2 else results_dir.name)
+        variant_id = helper_variant_id or (
+            parts[2] if len(parts) >= 3 else _string_or_none(metadata_payload.get(MEDARC_VARIANT_ID_KEY))
+        )
         job_run_id = "::".join(part for part in (model_id, env_id, variant_id) if part)
 
     return {
@@ -586,9 +497,46 @@ def _infer_eval_output_layout(evals_root: Path, results_dir: Path, metadata_payl
         "job_id": results_dir.name,
         "model_id": model_id,
         "env_id": env_id,
+        "variant_id": variant_id or "",
     }
 
 
+def _load_model_helper_entries(evals_root: Path) -> dict[Path, Mapping[str, Any]]:
+    entries: dict[Path, Mapping[str, Any]] = {}
+    try:
+        helper_paths = sorted(evals_root.glob(f"*/{MEDARC_EVAL_METADATA_FILENAME}"))
+    except OSError as exc:  # noqa: FBT003
+        logger.warning("Failed to scan eval metadata helpers under %s: %s", evals_root, exc)
+        return entries
+
+    for helper_path in helper_paths:
+        payload = _read_metadata_payload(helper_path)
+        if payload is None:
+            continue
+        raw_outputs = payload.get("outputs")
+        if not isinstance(raw_outputs, Mapping):
+            continue
+        model_dir = helper_path.parent
+        for key, raw_entry in raw_outputs.items():
+            if not isinstance(raw_entry, Mapping):
+                continue
+            raw_results_path = raw_entry.get("results_path") or key
+            if not isinstance(raw_results_path, str) or not raw_results_path:
+                continue
+            relative_results_path = Path(raw_results_path)
+            if relative_results_path.is_absolute():
+                continue
+            results_dir = (model_dir / relative_results_path).resolve()
+            try:
+                results_dir.relative_to(model_dir.resolve())
+            except ValueError:
+                continue
+            if not (results_dir / METADATA_FILENAME).exists() or not (results_dir / RESULTS_FILENAME).exists():
+                continue
+            entries[_dedupe_key(results_dir)] = raw_entry
+    return entries
+
+
 def _read_metadata_payload(path: Path) -> Mapping[str, Any] | None:
     try:
         payload = json.loads(path.read_text(encoding="utf-8"))
diff --git a/medarc_verifiers/cli/process/metadata.py b/medarc_verifiers/cli/process/metadata.py
index 9429fc62..a84d7946 100644
--- a/medarc_verifiers/cli/process/metadata.py
+++ b/medarc_verifiers/cli/process/metadata.py
@@ -11,17 +11,16 @@
 
 from pydantic import BaseModel, Field, ValidationError
 
-from medarc_verifiers.cli.eval_identity import (
-    MEDARC_CONFIG_FINGERPRINT_KEY,
-    MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
-    MEDARC_VARIANT_ID_KEY,
-    MEDARC_VARIANT_PAYLOAD_KEY,
-)
 from medarc_verifiers.cli.process.discovery import RunRecord
 from medarc_verifiers.cli.process.rollout import derive_base_env_id, extract_rollout_index
 
 logger = logging.getLogger(__name__)
 
+MEDARC_CONFIG_FINGERPRINT_KEY = "medarc_config_fingerprint"
+MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY = "medarc_config_fingerprint_payload"
+MEDARC_VARIANT_ID_KEY = "variant_id"
+MEDARC_VARIANT_PAYLOAD_KEY = "variant_payload"
+
 
 class _MetadataPayload(BaseModel):
     """Lightweight schema for metadata.json rows."""
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index ffbc26c8..8a086f3e 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -40,6 +40,7 @@
 DEFAULT_PROVIDER = "prime"
 ADAPTER_TOML_FIELDS = {"debug", "header_from_state", "headers_from_state", "timeout"}
 MEDARC_TOML_METADATA_FIELD = "medarc"
+MEDARC_TOML_IDENTITY_FIELDS = {"name", "variant_id"}
 
 PROVIDER_CONFIGS: dict[str, dict[str, str]] = {
     "prime": {
@@ -100,7 +101,9 @@ class EvalConfigOverrides:
 def load_toml_eval_configs(path: str | Path, *, extra_valid_fields: set[str] | None = None) -> list[dict[str, Any]]:
     """Load upstream TOML eval configs, including ``[[ablation]]`` expansion."""
 
-    valid_fields = ADAPTER_TOML_FIELDS | {MEDARC_TOML_METADATA_FIELD} | (extra_valid_fields or set())
+    valid_fields = ADAPTER_TOML_FIELDS | {MEDARC_TOML_METADATA_FIELD} | MEDARC_TOML_IDENTITY_FIELDS | (
+        extra_valid_fields or set()
+    )
     return [_strip_medarc_metadata(raw) for raw in load_toml_config(Path(path), extra_valid_fields=valid_fields)]
 
 
diff --git a/tests/test_cli/test_bench_index.py b/tests/test_cli/test_bench_index.py
deleted file mode 100644
index 277a5c8a..00000000
--- a/tests/test_cli/test_bench_index.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from __future__ import annotations
-
-import json
-from pathlib import Path
-
-import pytest
-
-from medarc_verifiers.cli.bench_index import BenchIndexError, validate_bench_index
-
-
-def _write_eval(path: Path, *, model: str = "gpt-5-mini", env_id: str = "medqa") -> None:
-    path.mkdir(parents=True, exist_ok=True)
-    (path / "metadata.json").write_text(json.dumps({"model": model, "env_id": env_id}), encoding="utf-8")
-    (path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n", encoding="utf-8")
-
-
-def _index_entry(path: Path, *, model: str = "gpt-5-mini", env_id: str = "medqa", variant_id: str | None = None):
-    return {
-        "index": 1,
-        "results_path": str(path),
-        "model": model,
-        "env_id": env_id,
-        "variant_id": variant_id,
-        "variant_payload": None,
-        "env_args": {},
-        "sampling_args": {"unknown_provider_arg": True},
-        "num_examples": 1,
-        "rollouts_per_example": 1,
-        "plan_digest": "sha256:test",
-    }
-
-
-def test_validate_bench_index_accepts_unknown_sampling_args(tmp_path: Path) -> None:
-    results_path = tmp_path / "gpt-5-mini" / "medqa"
-    _write_eval(results_path)
-
-    validate_bench_index(
-        {"version": 1, "evals": [_index_entry(results_path)]},
-        output_root=tmp_path,
-        require_artifacts=True,
-    )
-
-
-def test_validate_bench_index_rejects_stale_results_path(tmp_path: Path) -> None:
-    with pytest.raises(BenchIndexError, match="required artifact is missing"):
-        validate_bench_index(
-            {"version": 1, "evals": [_index_entry(tmp_path / "missing" / "medqa")]},
-            output_root=tmp_path,
-            require_artifacts=True,
-        )
-
-
-def test_validate_bench_index_rejects_duplicate_results_path(tmp_path: Path) -> None:
-    results_path = tmp_path / "gpt-5-mini" / "medqa"
-    _write_eval(results_path)
-
-    with pytest.raises(BenchIndexError, match="duplicate results_path"):
-        validate_bench_index(
-            {"version": 1, "evals": [_index_entry(results_path), _index_entry(results_path)]},
-            output_root=tmp_path,
-            require_artifacts=True,
-        )
-
-
-def test_validate_bench_index_rejects_metadata_identity_mismatch(tmp_path: Path) -> None:
-    results_path = tmp_path / "gpt-5-mini" / "medqa"
-    _write_eval(results_path, model="other-model")
-
-    with pytest.raises(BenchIndexError, match="identity mismatch"):
-        validate_bench_index(
-            {"version": 1, "evals": [_index_entry(results_path)]},
-            output_root=tmp_path,
-            require_artifacts=True,
-        )
-
-
-def test_validate_bench_index_rejects_duplicate_model_env_without_variant(tmp_path: Path) -> None:
-    first = tmp_path / "gpt-5-mini" / "medqa" / "first"
-    second = tmp_path / "gpt-5-mini" / "medqa" / "second"
-    _write_eval(first)
-    _write_eval(second)
-
-    with pytest.raises(BenchIndexError, match="require explicit variant_id"):
-        validate_bench_index(
-            {"version": 1, "evals": [_index_entry(first), _index_entry(second)]},
-            output_root=tmp_path,
-            require_artifacts=True,
-        )
diff --git a/tests/test_cli/test_eval_identity.py b/tests/test_cli/test_eval_identity.py
index 210b0507..2d0be1f4 100644
--- a/tests/test_cli/test_eval_identity.py
+++ b/tests/test_cli/test_eval_identity.py
@@ -5,21 +5,14 @@
 import pytest
 
 from medarc_verifiers.cli.eval_identity import (
-    MEDARC_CONFIG_FINGERPRINT_KEY,
-    MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY,
-    MEDARC_VARIANT_ID_KEY,
-    MEDARC_VARIANT_PAYLOAD_KEY,
-    build_fingerprint_payload,
-    config_fingerprint,
+    BASE_VARIANT_ID,
     generate_variant_id,
-    metadata_identity_fields,
-    normalize_semantic_sampling_args,
     plan_eval_paths,
     slug_component,
 )
 
 
-def test_unique_model_env_path_uses_plain_dataset_directory(tmp_path: Path) -> None:
+def test_unique_model_env_path_uses_base_variant_directory(tmp_path: Path) -> None:
     [plan] = plan_eval_paths(
         [{"model": "openai/gpt-5-mini", "env_id": "medqa"}],
         output_root=tmp_path / "runs" / "evals",
@@ -27,300 +20,103 @@ def test_unique_model_env_path_uses_plain_dataset_directory(tmp_path: Path) -> N
 
     assert plan.identity.model_id == "openai/gpt-5-mini"
     assert plan.identity.env_id == "medqa"
-    assert plan.identity.variant_id is None
-    assert plan.identity.variant_payload is None
-    assert plan.results_path == tmp_path / "runs" / "evals" / "openai-gpt-5-mini" / "medqa"
+    assert plan.identity.variant_id == BASE_VARIANT_ID
+    assert plan.results_path == tmp_path / "runs" / "evals" / "openai-gpt-5-mini" / "medqa" / "base"
 
 
-def test_duplicate_model_env_paths_use_deterministic_variants(tmp_path: Path) -> None:
-    plans = plan_eval_paths(
-        [
-            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 1618}},
-            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 9331}},
-        ],
+def test_explicit_variant_id_controls_variant_directory(tmp_path: Path) -> None:
+    [plan] = plan_eval_paths(
+        [{"model": "gpt-5-mini", "env_id": "medqa", "variant_id": "shuffle_seed-1618"}],
         output_root=tmp_path,
     )
 
-    assert [plan.identity.variant_id for plan in plans] == ["env_args.shuffle_seed-1618", "env_args.shuffle_seed-9331"]
-    assert plans[0].identity.variant_payload == {"env_args": {"shuffle_seed": 1618}}
-    assert plans[1].identity.variant_payload == {"env_args": {"shuffle_seed": 9331}}
-    assert plans[0].results_path == tmp_path / "gpt-5-mini" / "medqa" / "env_args.shuffle_seed-1618"
-    assert plans[1].results_path == tmp_path / "gpt-5-mini" / "medqa" / "env_args.shuffle_seed-9331"
+    assert plan.identity.variant_id == "shuffle_seed-1618"
+    assert plan.results_path == tmp_path / "gpt-5-mini" / "medqa" / "shuffle_seed-1618"
 
 
-def test_duplicate_model_env_baseline_gets_explicit_variant(tmp_path: Path) -> None:
-    plans = plan_eval_paths(
-        [
-            {"model": "gpt-5-mini", "env_id": "medqa"},
-            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 1618}},
-        ],
+def test_name_is_variant_id_alias(tmp_path: Path) -> None:
+    [plan] = plan_eval_paths(
+        [{"model": "gpt-5-mini", "env_id": "medqa", "name": "seed-1618"}],
         output_root=tmp_path,
     )
 
-    assert [plan.identity.variant_id for plan in plans] == ["baseline", "env_args.shuffle_seed-1618"]
-    assert plans[0].identity.variant_payload == {"env_args": {}}
-    assert plans[0].results_path == tmp_path / "gpt-5-mini" / "medqa" / "baseline"
+    assert plan.identity.variant_id == "seed-1618"
+    assert plan.results_path == tmp_path / "gpt-5-mini" / "medqa" / "seed-1618"
 
 
-def test_duplicate_model_env_variant_can_use_sampling_args(tmp_path: Path) -> None:
-    plans = plan_eval_paths(
+def test_name_can_template_expanded_env_args(tmp_path: Path) -> None:
+    [plan] = plan_eval_paths(
         [
-            {"model": "gpt-5-mini", "env_id": "medqa", "sampling_args": {"temperature": 0.0}},
-            {"model": "gpt-5-mini", "env_id": "medqa", "sampling_args": {"temperature": 0.7}},
+            {
+                "model": "gpt-5-mini",
+                "env_id": "medqa",
+                "env_args": {"shuffle_seed": 1618},
+                "name": "shuffle_seed-{env_args.shuffle_seed}",
+            }
         ],
         output_root=tmp_path,
     )
 
-    assert [plan.identity.variant_id for plan in plans] == [
-        "sampling_args.temperature-0.0",
-        "sampling_args.temperature-0.7",
-    ]
-
+    assert plan.identity.variant_id == "shuffle_seed-1618"
 
-def test_duplicate_model_env_variant_uses_only_differing_nested_keys(tmp_path: Path) -> None:
-    common_env_args = {
-        "judge_model": ["openai/gpt-5-mini", "google/gemini-3-flash-preview"],
-        "judge_base_url": "https://api.pinference.ai/api/v1",
-    }
 
-    plans = plan_eval_paths(
-        [
-            {"model": "gpt-5-mini", "env_id": "medrbench", "env_args": {**common_env_args, "task": "oracle"}},
-            {"model": "gpt-5-mini", "env_id": "medrbench", "env_args": {**common_env_args, "task": "1turn"}},
-            {"model": "gpt-5-mini", "env_id": "medrbench", "env_args": {**common_env_args, "task": "free_turn"}},
-        ],
+def test_matching_name_and_variant_id_are_allowed(tmp_path: Path) -> None:
+    [plan] = plan_eval_paths(
+        [{"model": "gpt-5-mini", "env_id": "medqa", "name": "base", "variant_id": "base"}],
         output_root=tmp_path,
     )
 
-    assert [plan.identity.variant_id for plan in plans] == [
-        "env_args.task-oracle",
-        "env_args.task-1turn",
-        "env_args.task-free_turn",
-    ]
-    assert plans[0].identity.variant_payload == {"env_args": {"task": "oracle"}}
+    assert plan.identity.variant_id == "base"
 
 
-def test_duplicate_model_env_variant_canonicalizes_sampling_args(tmp_path: Path) -> None:
-    plans = plan_eval_paths(
-        [
-            {"model": "gpt-5-mini", "env_id": "medqa", "sampling_args": {"reasoning_effort": "medium"}},
-            {
-                "model": "gpt-5-mini",
-                "env_id": "medqa",
-                "sampling_args": {"extra_body": {"reasoning": {"effort": "high"}}},
-            },
-        ],
-        output_root=tmp_path,
-    )
-
-    assert [plan.identity.variant_id for plan in plans] == [
-        "sampling_args.reasoning_effort-medium",
-        "sampling_args.reasoning_effort-high",
-    ]
+def test_conflicting_name_and_variant_id_fail(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="conflicting variant_id/name"):
+        plan_eval_paths(
+            [{"model": "gpt-5-mini", "env_id": "medqa", "name": "left", "variant_id": "right"}],
+            output_root=tmp_path,
+        )
 
 
-def test_long_nested_variant_values_use_stable_fingerprint() -> None:
-    payload = {
-        "env_args": {
-            "rubric": {
-                "criteria": ["clinically grounded", "concise", "safe"],
-                "description": "x" * 240,
-            }
-        }
-    }
-
-    variant_id = generate_variant_id(payload)
-
-    assert len(variant_id) <= 160
-    assert variant_id.endswith(generate_variant_id(payload)[-12:])
-    assert "env_args.rubric-hash" in variant_id
-
-
-def test_fingerprint_stable_across_key_ordering() -> None:
-    left = {
-        "env_id": "medqa",
-        "model": "gpt-5-mini",
-        "env_args": {"b": 2, "a": 1},
-        "sampling_args": {"top_p": 0.9, "temperature": 0.1},
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-    right = {
-        "rollouts_per_example": 1,
-        "num_examples": 10,
-        "sampling_args": {"temperature": 0.1, "top_p": 0.9},
-        "env_args": {"a": 1, "b": 2},
-        "model": "gpt-5-mini",
-        "env_id": "medqa",
-    }
-
-    assert config_fingerprint(left) == config_fingerprint(right)
-    assert build_fingerprint_payload(left) == build_fingerprint_payload(right)
-
-
-@pytest.mark.parametrize(
-    "changed",
-    [
-        {"env_args": {"shuffle_seed": 9331}},
-        {"sampling_args": {"temperature": 0.8}},
-        {"max_tokens": 1024},
-        {"num_examples": 11},
-        {"rollouts_per_example": 2},
-    ],
-)
-def test_fingerprint_changes_for_semantic_benchmark_changes(changed: dict[str, object]) -> None:
-    base = {
-        "env_id": "medqa",
-        "model": "gpt-5-mini",
-        "env_args": {"shuffle_seed": 1618},
-        "sampling_args": {"temperature": 0.2},
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-    candidate = {**base, **changed}
-
-    assert config_fingerprint(base) != config_fingerprint(candidate)
-
-
-@pytest.mark.parametrize(
-    "changed",
-    [
-        {"provider": "openai"},
-        {"api_base_url": "http://localhost:9000/v1"},
-        {"endpoint_id": "local-alias"},
-        {"api_key_var": "LOCAL_KEY"},
-        {"api_client_type": "openai_chat_completions"},
-        {"timeout": 120},
-        {"max_concurrent": 1},
-        {"max_retries": 5},
-        {"headers": {"X-Prime-Team-ID": "team"}},
-    ],
-)
-def test_fingerprint_ignores_provider_transport_and_runtime_changes(changed: dict[str, object]) -> None:
-    base = {
-        "env_id": "medqa",
-        "model": "gpt-5-mini",
-        "env_args": {"shuffle_seed": 1618},
-        "sampling_args": {"temperature": 0.2},
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-    candidate = {**base, **changed}
-
-    assert config_fingerprint(base) == config_fingerprint(candidate)
-
-
-def test_variant_planning_ignores_runtime_fields_in_identity(tmp_path: Path) -> None:
-    with pytest.raises(ValueError, match="Deterministic eval path collision"):
+def test_duplicate_model_env_requires_explicit_variant(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="Duplicate TOML eval identity"):
         plan_eval_paths(
             [
-                {"model": "gpt-5-mini", "env_id": "medqa", "max_concurrent": 1},
-                {"model": "gpt-5-mini", "env_id": "medqa", "max_concurrent": 32, "timeout": 120},
+                {"model": "gpt-5-mini", "env_id": "medqa"},
+                {"model": "gpt-5-mini", "env_id": "medqa"},
             ],
             output_root=tmp_path,
         )
 
 
-def test_reasoning_effort_shapes_fingerprint_identically() -> None:
-    native = {
-        "env_id": "medqa",
-        "model": "gpt-5-mini",
-        "sampling_args": {"reasoning_effort": "medium", "temperature": 0.2},
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-    openrouter = {
-        "env_id": "medqa",
-        "model": "gpt-5-mini",
-        "sampling_args": {"extra_body": {"reasoning": {"effort": "medium"}}, "temperature": 0.2},
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-
-    assert config_fingerprint(native) == config_fingerprint(openrouter)
-    assert build_fingerprint_payload(native)["sampling_args"] == {
-        "reasoning_effort": "medium",
-        "temperature": 0.2,
-    }
-
-
-def test_top_level_sampling_aliases_match_sampling_args_shape() -> None:
-    top_level = {
-        "env_id": "medqa",
-        "model": "gpt-5-mini",
-        "temperature": 0.2,
-        "max_tokens": 256,
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-    nested = {
-        "env_id": "medqa",
-        "model": "gpt-5-mini",
-        "sampling_args": {"temperature": 0.2, "max_tokens": 256},
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-
-    assert config_fingerprint(top_level) == config_fingerprint(nested)
-    assert build_fingerprint_payload(top_level)["sampling_args"] == {"max_tokens": 256, "temperature": 0.2}
-
-
-def test_extra_body_semantic_args_match_top_level_shape() -> None:
-    assert normalize_semantic_sampling_args({"top_k": 20}) == normalize_semantic_sampling_args(
-        {"extra_body": {"top_k": 20}}
+def test_same_variant_condition_across_models_keeps_same_variant_id(tmp_path: Path) -> None:
+    plans = plan_eval_paths(
+        [
+            {"model": "gpt-5-mini", "env_id": "medqa", "variant_id": "seed-1618"},
+            {"model": "gpt-5", "env_id": "medqa", "variant_id": "seed-1618"},
+        ],
+        output_root=tmp_path,
     )
 
+    assert [plan.identity.variant_id for plan in plans] == ["seed-1618", "seed-1618"]
+    assert plans[0].results_path == tmp_path / "gpt-5-mini" / "medqa" / "seed-1618"
+    assert plans[1].results_path == tmp_path / "gpt-5" / "medqa" / "seed-1618"
 
-def test_unknown_sampling_args_pass_through_fingerprint() -> None:
-    assert normalize_semantic_sampling_args({"vendor_knob": True}) == {"vendor_knob": True}
-    assert normalize_semantic_sampling_args({"extra_body": {"vendor_knob": True}}) == {"vendor_knob": True}
-    assert normalize_semantic_sampling_args({"extra_body": "provider-default"}) == {"extra_body": "provider-default"}
-
-
-def test_sampling_extra_body_arguments_are_part_of_fingerprint() -> None:
-    base = {
-        "env_id": "medqa",
-        "model": "gpt-5-mini",
-        "sampling_args": {"temperature": 0.2},
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-    with_usage = {
-        **base,
-        "sampling_args": {"temperature": 0.2, "extra_body": {"usage": {"include": True}}},
-    }
-
-    assert config_fingerprint(base) != config_fingerprint(with_usage)
 
+def test_slug_collisions_fail(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="model slug collision"):
+        plan_eval_paths(
+            [
+                {"model": "openai/gpt", "env_id": "medqa"},
+                {"model": "openai:gpt", "env_id": "pubmedqa"},
+            ],
+            output_root=tmp_path,
+        )
 
-def test_endpoint_alias_without_resolved_model_is_rejected() -> None:
-    with pytest.raises(ValueError, match="resolved 'model'"):
-        config_fingerprint({"endpoint_id": "gpt-alias", "env_id": "medqa"})
 
+def test_legacy_variant_generator_remains_stable_for_export_config_lookup() -> None:
+    payload = {"env_args": {"shuffle_seed": 1618}}
 
-def test_metadata_identity_fields_include_planned_keys(tmp_path: Path) -> None:
-    plan = plan_eval_paths(
-        [
-            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 1618}},
-            {"model": "gpt-5-mini", "env_id": "medqa", "env_args": {"shuffle_seed": 9331}},
-        ],
-        output_root=tmp_path,
-    )[0]
-    config = {
-        "model": "gpt-5-mini",
-        "env_id": "medqa",
-        "env_args": {"shuffle_seed": 1618},
-        "sampling_args": {"temperature": 0.2},
-        "num_examples": 10,
-        "rollouts_per_example": 1,
-    }
-
-    fields = metadata_identity_fields(config, plan.identity)
-
-    assert fields[MEDARC_CONFIG_FINGERPRINT_KEY] == config_fingerprint(config)
-    assert fields[MEDARC_CONFIG_FINGERPRINT_PAYLOAD_KEY] == build_fingerprint_payload(config)
-    assert fields[MEDARC_VARIANT_ID_KEY] == "env_args.shuffle_seed-1618"
-    assert fields[MEDARC_VARIANT_PAYLOAD_KEY] == {"env_args": {"shuffle_seed": 1618}}
+    assert generate_variant_id(payload) == "env_args.shuffle_seed-1618"
 
 
 def test_slug_component_is_path_safe_and_stable_for_long_values() -> None:
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 37f5a620..24693113 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -119,6 +119,7 @@ def test_toml_bench_dry_run_expands_evals_and_ablations(
 
         [[ablation]]
         env_id = "medqa"
+        name = "shuffle_seed-{env_args.shuffle_seed}"
         num_examples = 1
         rollouts_per_example = 1
         env_args = { shuffle_answers = true }
@@ -145,10 +146,10 @@ def test_toml_bench_dry_run_expands_evals_and_ablations(
     assert exit_code == 0
     assert "TOML Bench Dry Run" in output
     assert "3 eval(s) to dry-run" in output
-    assert "baseline" in output
-    assert "env_args.shuffle_seed-1618" in output
-    assert "env_args.shuffle_seed-9331" in output
-    assert str(tmp_path / "evals" / "gpt-5-mini" / "medqa" / "baseline") in output
+    assert "base" in output
+    assert "shuffle_seed-1618" in output
+    assert "shuffle_seed-9331" in output
+    assert str(tmp_path / "evals" / "gpt-5-mini" / "medqa" / "base") in output
 
 
 def test_repository_smoke_toml_config_dry_runs(capsys: pytest.CaptureFixture[str]) -> None:
@@ -218,8 +219,8 @@ def test_repository_verified_toml_config_dry_run_shows_ablation_variants(capsys:
     output = capsys.readouterr().out
     assert exit_code == 0
     assert "medqa" in output
-    assert "env_args.shuffle_seed-1618" in output
-    assert "runs/evals/openai-gpt-4.1-mini/medqa/env_args.shuffle_answers-true__env_args.shuffle_seed-1618" in output
+    assert "shuffle_seed-1618" in output
+    assert "runs/evals/openai-gpt-4.1-mini/medqa/shuffle_seed-1618" in output
 
 
 def test_repository_open_ended_toml_config_loads_expected_judge_args() -> None:
@@ -276,7 +277,7 @@ def test_toml_bench_dry_run_uses_toml_output_dir(
 
     assert main.main(["bench", "--config", str(config_path), "--dry-run"]) == 0
 
-    assert str(output_dir / "gpt-5-mini" / "medqa") in capsys.readouterr().out
+    assert str(output_dir / "gpt-5-mini" / "medqa" / "base") in capsys.readouterr().out
 
 
 def test_toml_bench_executes_sequentially_to_deterministic_path(
@@ -312,14 +313,26 @@ async def fake_run(config, on_progress=None, **_kwargs):
 
     exit_code = main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)])
 
-    results_path = output_dir / "gpt-5-mini" / "medqa"
+    results_path = output_dir / "gpt-5-mini" / "medqa" / "base"
     assert exit_code == 0
     assert calls == [results_path]
     assert (results_path / "results.jsonl").exists()
     metadata = json.loads((results_path / "metadata.json").read_text())
-    assert metadata["medarc_config_fingerprint"]
-    assert metadata["variant_id"] is None
-    assert metadata["variant_payload"] is None
+    assert "medarc_config_fingerprint" not in metadata
+    assert "variant_id" not in metadata
+    assert "variant_payload" not in metadata
+    helper = json.loads((output_dir / "gpt-5-mini" / ".medarc_eval_metadata.json").read_text())
+    assert helper == {
+        "model": "gpt-5-mini",
+        "outputs": {
+            "medqa/base": {
+                "env_id": "medqa",
+                "results_path": "medqa/base",
+                "variant_id": "base",
+            }
+        },
+        "version": 1,
+    }
 
 
 def test_toml_bench_defaults_max_concurrent_to_one(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
@@ -363,7 +376,7 @@ async def fake_run(config, **_kwargs):
     assert captured == [4]
 
 
-def test_toml_bench_refuses_mismatched_resume(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+def test_toml_bench_refuses_existing_output_without_resume(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
     _write_config(
@@ -384,6 +397,15 @@ async def fake_run(config, **_kwargs):
     monkeypatch.setattr(main, "run_evaluation", fake_run)
 
     assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+
+
+def test_toml_bench_resume_refuses_malformed_existing_output(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
     _write_config(
         config_path,
         """
@@ -391,11 +413,22 @@ async def fake_run(config, **_kwargs):
 
         [[eval]]
         env_id = "medqa"
-        env_args = { shuffle_seed = 9331 }
         """,
     )
+    results_path = output_dir / "gpt-5-mini" / "medqa" / "base"
+    (results_path / "metadata.json").mkdir(parents=True)
+    (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+    calls = 0
 
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+    async def fake_run(config, **_kwargs):
+        nonlocal calls
+        calls += 1
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--resume"]) == 1
+    assert calls == 0
 
 
 def test_toml_bench_force_archives_existing_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
@@ -418,17 +451,57 @@ async def fake_run(config, **_kwargs):
     monkeypatch.setattr(main, "run_evaluation", fake_run)
 
     assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
-    results_path = output_dir / "gpt-5-mini" / "medqa"
+    results_path = output_dir / "gpt-5-mini" / "medqa" / "base"
     (results_path / "sentinel.txt").write_text("old")
 
     assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--force"]) == 0
 
-    archived = list((output_dir / "gpt-5-mini").glob("medqa__old_*"))
+    archived = list((output_dir / "gpt-5-mini" / "medqa").glob("base__old_*"))
     assert len(archived) == 1
     assert (archived[0] / "sentinel.txt").read_text() == "old"
     assert not (results_path / "sentinel.txt").exists()
 
 
+def test_toml_bench_refuses_existing_model_helper_slug_collision_before_run(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt/5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    helper_path = output_dir / "gpt-5-mini" / ".medarc_eval_metadata.json"
+    helper_path.parent.mkdir(parents=True)
+    helper_path.write_text(
+        json.dumps(
+            {
+                "version": 1,
+                "model": "gpt 5-mini",
+                "outputs": {},
+            }
+        ),
+        encoding="utf-8",
+    )
+    calls = 0
+
+    async def fake_run(config, **_kwargs):
+        nonlocal calls
+        calls += 1
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+    assert calls == 0
+
+
 def test_toml_bench_resume_preserves_existing_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
@@ -463,13 +536,13 @@ async def fake_run(config, **_kwargs):
     monkeypatch.setattr(main, "run_evaluation", fake_run)
 
     assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--resume"]) == 0
 
-    metadata = json.loads((output_dir / "gpt-5-mini" / "medqa" / "metadata.json").read_text())
+    metadata = json.loads((output_dir / "gpt-5-mini" / "medqa" / "base" / "metadata.json").read_text())
     assert metadata["avg_reward"] == 0.75
     assert metadata["avg_metrics"] == {"accuracy": 0.75}
     assert metadata["total_tokens"] == 123
-    assert metadata["medarc_config_fingerprint"]
+    assert "medarc_config_fingerprint" not in metadata
 
 
 def test_toml_bench_does_not_patch_upstream_metadata_saves(
@@ -490,8 +563,10 @@ def test_toml_bench_does_not_patch_upstream_metadata_saves(
     )
     saved_metadata: list[dict[str, Any]] = []
 
-    def fake_save_metadata(metadata, _result_path):
+    def fake_save_metadata(metadata, result_path):
         saved_metadata.append(dict(metadata))
+        Path(result_path).mkdir(parents=True, exist_ok=True)
+        Path(result_path, "metadata.json").write_text(json.dumps(metadata))
 
     async def fake_run(config, on_progress=None, **_kwargs):
         metadata = {}
@@ -507,8 +582,10 @@ async def fake_run(config, on_progress=None, **_kwargs):
     assert main.main(["bench", "--config", str(config_path), "--output-dir", str(tmp_path / "evals")]) == 0
 
     assert saved_metadata == [{}]
-    metadata = json.loads((tmp_path / "evals" / "gpt-5-mini" / "medqa" / "metadata.json").read_text())
-    assert metadata["medarc_config_fingerprint"]
+    metadata = json.loads((tmp_path / "evals" / "gpt-5-mini" / "medqa" / "base" / "metadata.json").read_text())
+    assert "medarc_config_fingerprint" not in metadata
+    assert "variant_id" not in metadata
+    assert "variant_payload" not in metadata
 
 
 def test_single_run_help_lists_env_section_and_header_option(
diff --git a/tests/test_cli/test_process_discovery.py b/tests/test_cli/test_process_discovery.py
index 4f8260a9..ab001af7 100644
--- a/tests/test_cli/test_process_discovery.py
+++ b/tests/test_cli/test_process_discovery.py
@@ -271,7 +271,7 @@ def _write_eval_output(path: Path, metadata: dict | None = None) -> None:
 
 def test_discover_run_records_includes_deterministic_eval_outputs(tmp_path: Path) -> None:
     raw_dir = tmp_path / "runs" / "raw"
-    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa"
+    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa" / "base"
     _write_eval_output(eval_dir)
 
     records = discover_run_records(raw_dir, filter_status=("completed",))
@@ -284,6 +284,8 @@ def test_discover_run_records_includes_deterministic_eval_outputs(tmp_path: Path
     assert record.row_count == 1
     assert record.env_args == {"split": "test"}
     assert record.sampling_args == {"temperature": 0}
+    normalized = load_normalized_metadata(record)
+    assert normalized.variant_id == "base"
 
 
 def test_discover_run_records_includes_deterministic_eval_variants(tmp_path: Path) -> None:
@@ -308,30 +310,22 @@ def test_discover_run_records_includes_deterministic_eval_variants(tmp_path: Pat
     assert normalized.medarc_config_fingerprint_payload == {"env_id": "medqa"}
 
 
-def test_discover_run_records_prefers_bench_index_identity(tmp_path: Path) -> None:
+def test_discover_run_records_enriches_from_model_helper(tmp_path: Path) -> None:
     evals_root = tmp_path / "runs" / "evals"
     eval_dir = evals_root / "gpt-5-mini" / "medqa" / "seed-1618"
     _write_eval_output(eval_dir, {"env_id": "medqa", "model": "gpt-5-mini"})
     _write_json(
-        evals_root / "bench_index.json",
+        evals_root / "gpt-5-mini" / ".medarc_eval_metadata.json",
         {
             "version": 1,
-            "source_config": "configs/eval/example.toml",
-            "evals": [
-                {
-                    "index": 1,
-                    "results_path": str(eval_dir),
-                    "env_id": "medqa",
-                    "model": "gpt-5-mini",
+            "model": "gpt-5-mini",
+            "outputs": {
+                "medqa/seed-1618": {
+                    "env_id": "medqa-canonical",
                     "variant_id": "seed-1618",
-                    "variant_payload": {"env_args": {"shuffle_seed": 1618}},
-                    "env_args": {"shuffle_seed": 1618},
-                    "sampling_args": {"temperature": 0},
-                    "num_examples": 1,
-                    "rollouts_per_example": 1,
-                    "plan_digest": "sha256:abc",
+                    "results_path": "medqa/seed-1618",
                 }
-            ],
+            },
         },
     )
 
@@ -339,45 +333,63 @@ def test_discover_run_records_prefers_bench_index_identity(tmp_path: Path) -> No
 
     assert len(records) == 1
     record = records[0]
-    assert record.manifest.manifest_path == evals_root / "bench_index.json"
-    assert record.manifest.config_source == "configs/eval/example.toml"
-    assert record.manifest.config_checksum == "sha256:abc"
-    assert record.env_args == {"shuffle_seed": 1618}
+    assert record.manifest.manifest_path == eval_dir / "metadata.json"
+    assert record.manifest.config_source is None
+    assert record.manifest.config_checksum is None
+    assert record.manifest_env_id == "medqa-canonical"
     normalized = load_normalized_metadata(record)
     assert normalized.variant_id == "seed-1618"
-    assert normalized.variant_payload == {"env_args": {"shuffle_seed": 1618}}
+    assert normalized.variant_payload is None
 
 
-def test_discover_run_records_bench_index_rejects_missing_artifacts(tmp_path: Path) -> None:
+def test_discover_run_records_ignores_stale_model_helper_entries(tmp_path: Path) -> None:
     evals_root = tmp_path / "runs" / "evals"
     _write_json(
-        evals_root / "bench_index.json",
+        evals_root / "gpt-5-mini" / ".medarc_eval_metadata.json",
         {
             "version": 1,
-            "evals": [
-                {
-                    "index": 1,
-                    "results_path": str(evals_root / "gpt-5-mini" / "medqa"),
+            "model": "gpt-5-mini",
+            "outputs": {
+                "medqa/base": {
                     "env_id": "medqa",
-                    "model": "gpt-5-mini",
-                    "variant_id": None,
-                    "variant_payload": None,
-                    "env_args": {},
-                    "sampling_args": {},
-                    "num_examples": 1,
-                    "rollouts_per_example": 1,
-                    "plan_digest": "sha256:abc",
+                    "variant_id": "base",
+                    "results_path": "medqa/base",
+                }
+            },
+        },
+    )
+
+    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+
+    assert records == []
+
+
+def test_discover_run_records_ignores_model_helper_entries_outside_model_dir(tmp_path: Path) -> None:
+    evals_root = tmp_path / "runs" / "evals"
+    eval_dir = evals_root / "gpt-5-mini" / "medqa" / "base"
+    _write_eval_output(eval_dir, {"env_id": "medqa", "model": "gpt-5-mini"})
+    _write_json(
+        evals_root / "other-model" / ".medarc_eval_metadata.json",
+        {
+            "version": 1,
+            "model": "other-model",
+            "outputs": {
+                "../gpt-5-mini/medqa/base": {
+                    "env_id": "wrong-env",
+                    "variant_id": "wrong-variant",
+                    "results_path": "../gpt-5-mini/medqa/base",
                 }
-            ],
+            },
         },
     )
 
-    try:
-        discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
-    except ValueError as exc:
-        assert "required artifact is missing" in str(exc)
-    else:
-        raise AssertionError("bench_index with missing artifacts should fail validation")
+    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+
+    assert len(records) == 1
+    record = records[0]
+    assert record.manifest_env_id == "medqa"
+    normalized = load_normalized_metadata(record)
+    assert normalized.variant_id == "base"
 
 
 def test_discover_run_records_includes_direct_upstream_uuid_outputs(tmp_path: Path) -> None:
@@ -394,7 +406,7 @@ def test_discover_run_records_includes_direct_upstream_uuid_outputs(tmp_path: Pa
 
 
 def test_discover_run_records_deduplicates_overlapping_eval_roots(tmp_path: Path) -> None:
-    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa"
+    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa" / "base"
     _write_eval_output(eval_dir)
 
     records = discover_run_records(tmp_path / "runs", filter_status=("completed",))
diff --git a/tests/test_cli/test_toml_bench_index.py b/tests/test_cli/test_toml_bench_index.py
deleted file mode 100644
index cc5440a6..00000000
--- a/tests/test_cli/test_toml_bench_index.py
+++ /dev/null
@@ -1,320 +0,0 @@
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from textwrap import dedent
-
-import pytest
-
-from medarc_verifiers.cli import main
-
-
-def _write_config(path: Path, text: str) -> None:
-    path.write_text(dedent(text).strip(), encoding="utf-8")
-
-
-def test_toml_bench_writes_bench_index(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-        num_examples = 1
-        rollouts_per_example = 1
-        """,
-    )
-
-    async def fake_run(config, **_kwargs):
-        results_path = Path(config.resume_path)
-        (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0", "reward": 1.0}) + "\n")
-        return {"outputs": [], "metadata": {}}
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
-
-    results_path = output_dir / "gpt-5-mini" / "medqa"
-    bench_index = json.loads((output_dir / "bench_index.json").read_text())
-    assert bench_index["version"] == 1
-    assert bench_index["source_config"] == str(config_path)
-    assert bench_index["evals"][0]["results_path"] == str(results_path)
-    assert bench_index["evals"][0]["plan_digest"].startswith("sha256:")
-
-
-def test_toml_bench_failed_eval_does_not_create_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-        num_examples = 1
-        rollouts_per_example = 1
-        """,
-    )
-
-    async def fake_run(config, **_kwargs):  # noqa: ARG001
-        raise RuntimeError("upstream failure")
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
-
-    results_path = output_dir / "gpt-5-mini" / "medqa"
-    bench_index = json.loads((output_dir / "bench_index.json").read_text())
-    assert bench_index["evals"] == []
-    assert not (results_path / "metadata.json").exists()
-    assert not (results_path / "results.jsonl").exists()
-
-
-def test_toml_bench_continue_on_error_omits_failed_sidecar_entry(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
-) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-
-        [[eval]]
-        env_id = "pubmedqa"
-        """,
-    )
-
-    async def fake_run(config, **_kwargs):
-        results_path = Path(config.resume_path)
-        if results_path.name == "pubmedqa":
-            raise RuntimeError("upstream failure")
-        (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
-        return {"outputs": [], "metadata": {}}
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert (
-        main.main(
-            [
-                "bench",
-                "--config",
-                str(config_path),
-                "--output-dir",
-                str(output_dir),
-                "--continue-on-error",
-            ]
-        )
-        == 1
-    )
-
-    bench_index = json.loads((output_dir / "bench_index.json").read_text())
-    assert [entry["env_id"] for entry in bench_index["evals"]] == ["medqa"]
-    assert (output_dir / "gpt-5-mini" / "medqa" / "results.jsonl").exists()
-    assert not (output_dir / "gpt-5-mini" / "pubmedqa" / "results.jsonl").exists()
-
-
-def test_toml_bench_force_failure_removes_archived_sidecar_entry(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
-) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-        """,
-    )
-
-    async def successful_run(config, **_kwargs):
-        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
-        return {"outputs": [], "metadata": {}}
-
-    monkeypatch.setattr(main, "run_evaluation", successful_run)
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
-
-    async def failing_run(config, **_kwargs):  # noqa: ARG001
-        raise RuntimeError("upstream failure")
-
-    monkeypatch.setattr(main, "run_evaluation", failing_run)
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--force"]) == 1
-
-    bench_index = json.loads((output_dir / "bench_index.json").read_text())
-    assert bench_index["evals"] == []
-    assert list((output_dir / "gpt-5-mini").glob("medqa__old_*"))
-    assert not (output_dir / "gpt-5-mini" / "medqa" / "results.jsonl").exists()
-
-
-def test_toml_bench_refuses_existing_output_without_bench_index(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
-) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-        """,
-    )
-    results_path = output_dir / "gpt-5-mini" / "medqa"
-    results_path.mkdir(parents=True)
-    (results_path / "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
-
-    async def fake_run(config, **_kwargs):  # noqa: ARG001
-        raise AssertionError("bench should fail before execution")
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
-
-
-def test_toml_bench_force_archives_existing_output_without_bench_index(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
-) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-        """,
-    )
-    results_path = output_dir / "gpt-5-mini" / "medqa"
-    results_path.mkdir(parents=True)
-    (results_path / "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
-    (results_path / "sentinel.txt").write_text("old")
-
-    async def fake_run(config, **_kwargs):
-        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
-        return {"outputs": [], "metadata": {}}
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--force"]) == 0
-    assert not (results_path / "sentinel.txt").exists()
-    assert list((output_dir / "gpt-5-mini").glob("medqa__old_*"))
-
-
-def test_toml_bench_refuses_existing_output_missing_from_bench_index(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
-) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-        """,
-    )
-    results_path = output_dir / "gpt-5-mini" / "medqa"
-    results_path.mkdir(parents=True)
-    (results_path / "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
-    (output_dir / "bench_index.json").write_text(
-        json.dumps(
-            {
-                "version": 1,
-                "evals": [
-                    {
-                        "index": 1,
-                        "results_path": str(output_dir / "other-model" / "medqa"),
-                        "model": "other-model",
-                        "env_id": "medqa",
-                        "variant_id": None,
-                        "variant_payload": None,
-                        "env_args": {},
-                        "sampling_args": {},
-                        "num_examples": 1,
-                        "rollouts_per_example": 1,
-                        "plan_digest": "sha256:old",
-                    }
-                ],
-            }
-        )
-    )
-
-    async def fake_run(config, **_kwargs):  # noqa: ARG001
-        raise AssertionError("bench should fail before execution")
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
-
-
-def test_toml_bench_refuses_stale_metadata_even_when_bench_index_matches(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
-) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-        """,
-    )
-
-    async def fake_run(config, **_kwargs):
-        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
-        return {"outputs": [], "metadata": {}}
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
-
-    metadata_path = output_dir / "gpt-5-mini" / "medqa" / "metadata.json"
-    metadata = json.loads(metadata_path.read_text())
-    metadata["medarc_config_fingerprint"] = "stale"
-    metadata_path.write_text(json.dumps(metadata))
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
-
-
-def test_toml_bench_selected_runs_merge_bench_index(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt-5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-
-        [[eval]]
-        env_id = "pubmedqa"
-        """,
-    )
-
-    async def fake_run(config, **_kwargs):
-        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
-        return {"outputs": [], "metadata": {}}
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--eval-index", "1"]) == 0
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--eval-index", "2"]) == 0
-
-    bench_index = json.loads((output_dir / "bench_index.json").read_text())
-    assert [entry["env_id"] for entry in bench_index["evals"]] == ["medqa", "pubmedqa"]
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0

From bb975868b923d9c861274f94a7f5f652c4234734 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 19:55:34 +0000
Subject: [PATCH 22/53] Remove eval metadata helper

---
 docs/medarc-eval-bench.md                 | 19 +++---
 docs/medarc-eval-process.md               | 11 ++--
 docs/medarc-eval.md                       |  1 -
 docs/medarc-verifiers-architecture.md     | 11 ++--
 medarc_verifiers/cli/eval_identity.py     | 14 ++--
 medarc_verifiers/cli/main.py              | 73 +--------------------
 medarc_verifiers/cli/process/discovery.py | 66 +++----------------
 medarc_verifiers/cli/process/metadata.py  |  8 +--
 tests/test_cli/test_eval_identity.py      | 23 +++++++
 tests/test_cli/test_main.py               | 53 +--------------
 tests/test_cli/test_process_discovery.py  | 78 ++---------------------
 11 files changed, 64 insertions(+), 293 deletions(-)

diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index 7a79c529..f5cce689 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -95,18 +95,17 @@ Non-variant evals use the reserved variant id `base` and write to
 an explicit `variant_id` or `name`. `name` may use simple templates such as
 `shuffle_seed-{env_args.shuffle_seed}` after ablation expansion.
 
-## MedARC Metadata
+`variant_id` and `name` are path identities. They must already be path-safe:
+use only letters, numbers, `.`, `_`, and `-`. For example,
+`variant_id = "shuffle_seed-1618"` is valid, while
+`variant_id = "shuffle seed = 1618"` fails with a clear error.
 
-Upstream `metadata.json` remains a normal `verifiers` file. MedARC-specific
-identity lives in a small model-level helper:
+## Metadata
 
-```text
-runs/evals/<model>/.medarc_eval_metadata.json
-```
-
-The helper maps model-relative results paths such as `medqa/base` to `env_id`
-and `variant_id`. Processing scans output directories first, so stale helper
-entries do not hide or create process records.
+Upstream `metadata.json` remains a normal `verifiers` file. MedARC does not
+write separate bench metadata. Processing recovers exact model and environment
+identity from upstream metadata, and recovers variant identity from the
+deterministic path segment.
 
 ## Resume and Force
 
diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md
index 508e1d7a..4db96901 100644
--- a/docs/medarc-eval-process.md
+++ b/docs/medarc-eval-process.md
@@ -57,10 +57,9 @@ On-disk model and env path components are slugified, so filenames may not exactl
 ### By Completion Status
 
 For current TOML bench outputs, processing scans for directories containing
-`metadata.json` and `results.jsonl`. When a model-level
-`.medarc_eval_metadata.json` helper exists, processing uses it only to enrich
-matching scanned paths with MedARC `env_id` / `variant_id`. Stale helper entries
-are ignored. Ad hoc upstream outputs fall back to metadata/path inference.
+`metadata.json` and `results.jsonl`. Model and environment identity come from
+upstream metadata when available; variant identity comes from the deterministic
+path segment. Ad hoc upstream outputs fall back to metadata/path inference.
 
 For legacy YAML-runner outputs, `medarc-eval process` reads
 `runs/raw/<run_id>/run_manifest.json` and only selects jobs whose manifest
@@ -86,9 +85,7 @@ medarc-eval process --max-results-missing-pct 100
 ```
 
 For TOML bench outputs, this gate uses `metadata.json` values for expected rows
-and the observed `results.jsonl` row count. Model and environment identity come
-from upstream metadata and path inference, with variant identity enriched by the
-model-level helper when present:
+and the observed `results.jsonl` row count:
 
 - `expected_rows = num_examples * rollouts_per_example`
 - `observed_rows = results.jsonl row count`
diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md
index a1833324..93465f7f 100644
--- a/docs/medarc-eval.md
+++ b/docs/medarc-eval.md
@@ -113,7 +113,6 @@ medarc-eval winrate --list-models
 runs/
 ├── evals/                        # Raw TOML bench outputs
 │   └── <model>/
-│       ├── .medarc_eval_metadata.json
 │       └── <env>/
 │           └── <variant>/
 │               ├── results.jsonl
diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index d610b0c8..a09efce3 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -134,9 +134,9 @@ target as upstream `EvalConfig.resume_path` and trusts upstream resume
 validation. `--force` archives the existing target and reruns.
 
 `medarc-eval bench` does not monkey-patch upstream metadata saving and does not
-write MedARC identity into upstream `metadata.json`. MedARC-specific identity
-lives in `runs/evals/<model>/.medarc_eval_metadata.json`, keyed by
-model-relative results paths such as `medqa/base`.
+write MedARC identity into upstream `metadata.json`. Variant identity is the
+deterministic path segment, so `variant_id` / `name` values must already be
+path-safe.
 
 `medarc_verifiers/cli/_manifest.py` now only contains the legacy manifest schema
 needed by processing to read historical `runs/raw` outputs.
@@ -147,7 +147,6 @@ TOML bench outputs include:
 
 - `results.jsonl`: per-example rollouts
 - `metadata.json`: eval configuration and metrics snapshot
-- `.medarc_eval_metadata.json`: minimal model-level MedARC identity helper
 
 The runner executes via `verifiers.utils.eval_utils.run_evaluation()` from
 single-run mode and the TOML bench code in `medarc_verifiers/cli/main.py`.
@@ -162,8 +161,8 @@ Processing:
 
 1. Discovers TOML bench outputs from `runs/evals` by scanning directories, and
    legacy manifest outputs from `runs/raw`.
-2. Normalizes identity from upstream `metadata.json`, paths, and optional
-   model-level helper metadata; legacy outputs still use manifest fields.
+2. Normalizes identity from upstream `metadata.json` and paths; legacy outputs
+   still use manifest fields.
 3. Loads rows from `results.jsonl`, drops large prompt/completion fields, and
    flattens `token_usage`.
 4. Aggregates rows per model and environment, preserving variant ids.
diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py
index c7ca5d39..564c9384 100644
--- a/medarc_verifiers/cli/eval_identity.py
+++ b/medarc_verifiers/cli/eval_identity.py
@@ -12,7 +12,6 @@
 from typing import Any
 
 MEDARC_VARIANT_ID_KEY = "variant_id"
-MEDARC_EVAL_METADATA_FILENAME = ".medarc_eval_metadata.json"
 BASE_VARIANT_ID = "base"
 
 _SLUG_PATTERN = re.compile(r"[^A-Za-z0-9._-]+")
@@ -86,12 +85,7 @@ def plan_eval_paths(raw_configs: Sequence[Mapping[str, Any]], *, output_root: st
         variant_id = _variant_id(config, index=idx + 1)
 
         identity = EvalIdentity(model_id=model_id, env_id=env_id, variant_id=variant_id)
-        path = (
-            Path(output_root)
-            / slug_component(model_id)
-            / slug_component(env_id)
-            / slug_component(variant_id, max_length=_MAX_VARIANT_ID_LENGTH)
-        )
+        path = Path(output_root) / slug_component(model_id) / slug_component(env_id) / variant_id
         plans.append(EvalPathPlan(identity=identity, results_path=path))
 
     _ensure_unique_identities(plans)
@@ -179,6 +173,11 @@ def _normalize_variant(value: Any, *, config: Mapping[str, Any], field: str, ind
     text = _expand_variant_template(str(value).strip(), config)
     if not text:
         raise ValueError(f"TOML eval {index} {field} must not be empty.")
+    if slug_component(text, max_length=_MAX_VARIANT_ID_LENGTH) != text:
+        raise ValueError(
+            f'TOML eval {index} {field} {text!r} is not path-safe. '
+            'Use only letters, numbers, ".", "_", and "-", for example "shuffle_seed-1618".'
+        )
     return text
 
 
@@ -255,7 +254,6 @@ def _canonicalize(value: Any) -> Any:
     "BASE_VARIANT_ID",
     "EvalIdentity",
     "EvalPathPlan",
-    "MEDARC_EVAL_METADATA_FILENAME",
     "MEDARC_VARIANT_ID_KEY",
     "generate_variant_id",
     "plan_eval_paths",
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index 1e7dca87..54e8d798 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -4,7 +4,6 @@
 
 import argparse
 import asyncio
-import json
 import logging
 import os
 import shutil
@@ -19,7 +18,6 @@
 from rich.console import Console
 from rich.table import Table
 from verifiers.utils.eval_utils import run_evaluation
-from verifiers.utils.save_utils import make_serializable
 
 from medarc_verifiers.cli._constants import (
     BENCH_COMMAND,
@@ -36,11 +34,9 @@
 from medarc_verifiers.cli._schemas import EnvironmentConfigSchema, EnvironmentExportConfig
 from medarc_verifiers.cli._single_run import run_single_mode
 from medarc_verifiers.cli.eval_identity import (
-    MEDARC_EVAL_METADATA_FILENAME,
     EvalPathPlan,
     generate_variant_id,
     plan_eval_paths,
-    slug_component,
 )
 from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub
 from medarc_verifiers.cli.process import PROCESS_DEFAULT_STATUS_FILTER, ProcessOptions, ProcessResult, run_process
@@ -1281,7 +1277,7 @@ def _run_toml_bench(args: argparse.Namespace) -> int:
     _print_toml_bench_plan(eval_configs, path_plans, dry_run=bool(args.dry_run))
     if args.dry_run:
         return 0
-    return _execute_toml_plan(eval_configs, path_plans, output_root, args)
+    return _execute_toml_plan(eval_configs, path_plans, args)
 
 
 def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]:
@@ -1337,11 +1333,9 @@ def _select_toml_plan(
 def _execute_toml_plan(
     eval_configs: Sequence[Any],
     path_plans: Sequence[EvalPathPlan],
-    output_root: Path,
     args: argparse.Namespace,
 ) -> int:
     failures = 0
-    _validate_model_eval_metadata_for_plan(output_root, path_plans)
     for index, (config, path_plan) in enumerate(zip(eval_configs, path_plans), start=1):
         results_path = path_plan.results_path
         try:
@@ -1353,7 +1347,6 @@ def _execute_toml_plan(
             run_config = config.model_copy(update={"resume_path": results_path, "save_results": True})
             logger.info("Running TOML eval %d/%d: %s on %s", index, len(eval_configs), config.env_id, config.model)
             asyncio.run(_run_one_toml_eval(run_config))
-            _write_model_eval_metadata(output_root, path_plan)
         except Exception as exc:  # noqa: BLE001
             failures += 1
             logger.exception("TOML eval %d failed: %s", index, exc)
@@ -1410,70 +1403,6 @@ def _archive_existing_path(path: Path) -> Path:
     return candidate
 
 
-def _validate_model_eval_metadata_for_plan(output_root: Path, path_plans: Sequence[EvalPathPlan]) -> None:
-    for path_plan in path_plans:
-        model_dir = output_root / slug_component(path_plan.identity.model_id)
-        helper_path = model_dir / MEDARC_EVAL_METADATA_FILENAME
-        if not helper_path.exists():
-            continue
-        try:
-            loaded = json.loads(helper_path.read_text(encoding="utf-8"))
-        except json.JSONDecodeError as exc:
-            raise ValueError(f"Cannot update {helper_path}: invalid JSON.") from exc
-        if not isinstance(loaded, dict):
-            continue
-        existing_model = loaded.get("model")
-        if existing_model and existing_model != path_plan.identity.model_id:
-            raise ValueError(
-                f"Cannot update {helper_path}: model slug is already associated with {existing_model!r}, "
-                f"not {path_plan.identity.model_id!r}."
-            )
-
-
-def _write_model_eval_metadata(output_root: Path, path_plan: EvalPathPlan) -> None:
-    model_dir = output_root / slug_component(path_plan.identity.model_id)
-    try:
-        relative_results_path = path_plan.results_path.relative_to(model_dir).as_posix()
-    except ValueError as exc:
-        raise ValueError(f"Internal bench planning error: {path_plan.results_path} is outside {model_dir}.") from exc
-
-    helper_path = model_dir / MEDARC_EVAL_METADATA_FILENAME
-    payload: dict[str, Any] = {"version": 1, "model": path_plan.identity.model_id, "outputs": {}}
-    if helper_path.exists():
-        try:
-            loaded = json.loads(helper_path.read_text(encoding="utf-8"))
-        except json.JSONDecodeError as exc:
-            raise ValueError(f"Cannot update {helper_path}: invalid JSON.") from exc
-        if isinstance(loaded, dict):
-            payload.update(loaded)
-        existing_model = payload.get("model")
-        if existing_model and existing_model != path_plan.identity.model_id:
-            raise ValueError(
-                f"Cannot update {helper_path}: model slug is already associated with {existing_model!r}, "
-                f"not {path_plan.identity.model_id!r}."
-            )
-
-    outputs = payload.get("outputs")
-    if not isinstance(outputs, dict):
-        outputs = {}
-    outputs[relative_results_path] = {
-        "env_id": path_plan.identity.env_id,
-        "variant_id": path_plan.identity.variant_id,
-        "results_path": relative_results_path,
-    }
-    payload["version"] = 1
-    payload["model"] = path_plan.identity.model_id
-    payload["outputs"] = outputs
-    _write_json_atomic(helper_path, payload)
-
-
-def _write_json_atomic(path: Path, payload: Mapping[str, Any]) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    tmp_path = path.with_name(f".{path.name}.tmp")
-    tmp_path.write_text(json.dumps(payload, default=make_serializable, sort_keys=True), encoding="utf-8")
-    tmp_path.replace(path)
-
-
 def _eval_config_identity_payload(config: Any, raw: Mapping[str, Any] | None = None) -> dict[str, Any]:
     payload = {
         "env_args": dict(config.env_args or {}),
diff --git a/medarc_verifiers/cli/process/discovery.py b/medarc_verifiers/cli/process/discovery.py
index 44ed508e..68cba898 100644
--- a/medarc_verifiers/cli/process/discovery.py
+++ b/medarc_verifiers/cli/process/discovery.py
@@ -11,10 +11,7 @@
 
 from pydantic import ValidationError
 
-from medarc_verifiers.cli.eval_identity import (
-    MEDARC_EVAL_METADATA_FILENAME,
-    MEDARC_VARIANT_ID_KEY,
-)
+from medarc_verifiers.cli.eval_identity import MEDARC_VARIANT_ID_KEY
 from medarc_verifiers.cli._manifest import (
     MANIFEST_FILENAME,
     ManifestJobEntry,
@@ -374,7 +371,6 @@ def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
         logger.warning("Failed to scan eval outputs under %s: %s", evals_root, exc)
         return
 
-    helper_entries = _load_model_helper_entries(evals_root)
     seen: set[Path] = set()
     for results_path in results_paths:
         results_dir = results_path.parent
@@ -387,7 +383,7 @@ def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
         metadata_path = results_dir / METADATA_FILENAME
         if not metadata_path.exists():
             continue
-        record = _build_eval_output_record(evals_root, results_dir, helper_entries.get(_dedupe_key(results_dir)))
+        record = _build_eval_output_record(evals_root, results_dir)
         if record is not None:
             yield record
 
@@ -395,14 +391,13 @@ def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
 def _build_eval_output_record(
     evals_root: Path,
     results_dir: Path,
-    helper_entry: Mapping[str, Any] | None = None,
 ) -> RunRecord | None:
     metadata_path = results_dir / METADATA_FILENAME
     metadata_payload = _read_metadata_payload(metadata_path)
     if metadata_payload is None:
         return None
 
-    layout = _infer_eval_output_layout(evals_root, results_dir, metadata_payload, helper_entry)
+    layout = _infer_eval_output_layout(evals_root, results_dir, metadata_payload)
     updated_at = _path_timestamp(metadata_path)
     job_run_id = layout["job_run_id"]
     job_id = layout["job_id"]
@@ -466,7 +461,6 @@ def _infer_eval_output_layout(
     evals_root: Path,
     results_dir: Path,
     metadata_payload: Mapping[str, Any],
-    helper_entry: Mapping[str, Any] | None = None,
 ) -> dict[str, str]:
     try:
         parts = results_dir.relative_to(evals_root).parts
@@ -475,21 +469,16 @@ def _infer_eval_output_layout(
 
     metadata_env_id = _string_or_none(metadata_payload.get("env_id"))
     metadata_model = _string_or_none(metadata_payload.get("model"))
-    helper_env_id = _string_or_none(helper_entry.get("env_id") if helper_entry else None)
-    helper_variant_id = _string_or_none(helper_entry.get("variant_id") if helper_entry else None)
-    parent_name = results_dir.parent.name
-    if "--" in parent_name and len(parts) >= 2:
-        env_from_parent, model_from_parent = parent_name.split("--", 1)
-        env_id = helper_env_id or metadata_env_id or env_from_parent
+    if len(parts) == 2 and "--" in parts[0]:
+        env_from_parent, model_from_parent = parts[0].split("--", 1)
+        env_id = metadata_env_id or env_from_parent
         model_id = metadata_model or model_from_parent
         job_run_id = results_dir.name
-        variant_id = helper_variant_id
+        variant_id = None
     else:
         model_id = metadata_model or (parts[0] if len(parts) >= 1 else "unknown")
-        env_id = helper_env_id or metadata_env_id or (parts[1] if len(parts) >= 2 else results_dir.name)
-        variant_id = helper_variant_id or (
-            parts[2] if len(parts) >= 3 else _string_or_none(metadata_payload.get(MEDARC_VARIANT_ID_KEY))
-        )
+        env_id = metadata_env_id or (parts[1] if len(parts) >= 2 else results_dir.name)
+        variant_id = parts[2] if len(parts) >= 3 else _string_or_none(metadata_payload.get(MEDARC_VARIANT_ID_KEY))
         job_run_id = "::".join(part for part in (model_id, env_id, variant_id) if part)
 
     return {
@@ -500,43 +489,6 @@ def _infer_eval_output_layout(
         "variant_id": variant_id or "",
     }
 
-
-def _load_model_helper_entries(evals_root: Path) -> dict[Path, Mapping[str, Any]]:
-    entries: dict[Path, Mapping[str, Any]] = {}
-    try:
-        helper_paths = sorted(evals_root.glob(f"*/{MEDARC_EVAL_METADATA_FILENAME}"))
-    except OSError as exc:  # noqa: FBT003
-        logger.warning("Failed to scan eval metadata helpers under %s: %s", evals_root, exc)
-        return entries
-
-    for helper_path in helper_paths:
-        payload = _read_metadata_payload(helper_path)
-        if payload is None:
-            continue
-        raw_outputs = payload.get("outputs")
-        if not isinstance(raw_outputs, Mapping):
-            continue
-        model_dir = helper_path.parent
-        for key, raw_entry in raw_outputs.items():
-            if not isinstance(raw_entry, Mapping):
-                continue
-            raw_results_path = raw_entry.get("results_path") or key
-            if not isinstance(raw_results_path, str) or not raw_results_path:
-                continue
-            relative_results_path = Path(raw_results_path)
-            if relative_results_path.is_absolute():
-                continue
-            results_dir = (model_dir / relative_results_path).resolve()
-            try:
-                results_dir.relative_to(model_dir.resolve())
-            except ValueError:
-                continue
-            if not (results_dir / METADATA_FILENAME).exists() or not (results_dir / RESULTS_FILENAME).exists():
-                continue
-            entries[_dedupe_key(results_dir)] = raw_entry
-    return entries
-
-
 def _read_metadata_payload(path: Path) -> Mapping[str, Any] | None:
     try:
         payload = json.loads(path.read_text(encoding="utf-8"))
diff --git a/medarc_verifiers/cli/process/metadata.py b/medarc_verifiers/cli/process/metadata.py
index a84d7946..d971e1d5 100644
--- a/medarc_verifiers/cli/process/metadata.py
+++ b/medarc_verifiers/cli/process/metadata.py
@@ -221,13 +221,7 @@ def _resolve_metadata_context(
             record.rollouts_per_example,
             metadata_payload.rollouts_per_example if metadata_payload else None,
         ),
-        variant_id=_string_or_none(
-            _raw_metadata_value(
-                raw_metadata,
-                MEDARC_VARIANT_ID_KEY,
-                (metadata_payload.variant_id if metadata_payload else None) or record_variant_id,
-            )
-        ),
+        variant_id=record_variant_id or _string_or_none(metadata_payload.variant_id if metadata_payload else None),
         variant_payload=_mapping_or_none(
             _raw_metadata_value(
                 raw_metadata,
diff --git a/tests/test_cli/test_eval_identity.py b/tests/test_cli/test_eval_identity.py
index 2d0be1f4..6172e549 100644
--- a/tests/test_cli/test_eval_identity.py
+++ b/tests/test_cli/test_eval_identity.py
@@ -77,6 +77,29 @@ def test_conflicting_name_and_variant_id_fail(tmp_path: Path) -> None:
         )
 
 
+def test_variant_id_must_be_path_safe(tmp_path: Path) -> None:  # explicit variant_id with spaces and "="
+    with pytest.raises(ValueError, match="variant_id .* is not path-safe"):  # planning itself must raise
+        plan_eval_paths(
+            [{"model": "gpt-5-mini", "env_id": "medqa", "variant_id": "shuffle seed = 1618"}],
+            output_root=tmp_path,
+        )
+
+
+def test_name_template_result_must_be_path_safe(tmp_path: Path) -> None:  # templated name, unsafe result
+    with pytest.raises(ValueError, match="name .* is not path-safe"):  # error must mention the name field
+        plan_eval_paths(
+            [
+                {
+                    "model": "gpt-5-mini",
+                    "env_id": "medqa",
+                    "env_args": {"difficulty": "very hard"},  # value contains a space
+                    "name": "{env_args.difficulty}",  # template that references the env_args value above
+                }
+            ],
+            output_root=tmp_path,
+        )
+
+
 def test_duplicate_model_env_requires_explicit_variant(tmp_path: Path) -> None:
     with pytest.raises(ValueError, match="Duplicate TOML eval identity"):
         plan_eval_paths(
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 24693113..acbb79aa 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -321,18 +321,7 @@ async def fake_run(config, on_progress=None, **_kwargs):
     assert "medarc_config_fingerprint" not in metadata
     assert "variant_id" not in metadata
     assert "variant_payload" not in metadata
-    helper = json.loads((output_dir / "gpt-5-mini" / ".medarc_eval_metadata.json").read_text())
-    assert helper == {
-        "model": "gpt-5-mini",
-        "outputs": {
-            "medqa/base": {
-                "env_id": "medqa",
-                "results_path": "medqa/base",
-                "variant_id": "base",
-            }
-        },
-        "version": 1,
-    }
+    assert not (output_dir / "gpt-5-mini" / ".medarc_eval_metadata.json").exists()
 
 
 def test_toml_bench_defaults_max_concurrent_to_one(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
@@ -462,46 +451,6 @@ async def fake_run(config, **_kwargs):
     assert not (results_path / "sentinel.txt").exists()
 
 
-def test_toml_bench_refuses_existing_model_helper_slug_collision_before_run(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-) -> None:
-    config_path = tmp_path / "bench.toml"
-    output_dir = tmp_path / "evals"
-    _write_config(
-        config_path,
-        """
-        model = "gpt/5-mini"
-
-        [[eval]]
-        env_id = "medqa"
-        """,
-    )
-    helper_path = output_dir / "gpt-5-mini" / ".medarc_eval_metadata.json"
-    helper_path.parent.mkdir(parents=True)
-    helper_path.write_text(
-        json.dumps(
-            {
-                "version": 1,
-                "model": "gpt 5-mini",
-                "outputs": {},
-            }
-        ),
-        encoding="utf-8",
-    )
-    calls = 0
-
-    async def fake_run(config, **_kwargs):
-        nonlocal calls
-        calls += 1
-        return {"outputs": [], "metadata": {}}
-
-    monkeypatch.setattr(main, "run_evaluation", fake_run)
-
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
-    assert calls == 0
-
-
 def test_toml_bench_resume_preserves_existing_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
diff --git a/tests/test_cli/test_process_discovery.py b/tests/test_cli/test_process_discovery.py
index ab001af7..22485448 100644
--- a/tests/test_cli/test_process_discovery.py
+++ b/tests/test_cli/test_process_discovery.py
@@ -310,84 +310,16 @@ def test_discover_run_records_includes_deterministic_eval_variants(tmp_path: Pat
     assert normalized.medarc_config_fingerprint_payload == {"env_id": "medqa"}
 
 
-def test_discover_run_records_enriches_from_model_helper(tmp_path: Path) -> None:
-    evals_root = tmp_path / "runs" / "evals"
-    eval_dir = evals_root / "gpt-5-mini" / "medqa" / "seed-1618"
-    _write_eval_output(eval_dir, {"env_id": "medqa", "model": "gpt-5-mini"})
-    _write_json(
-        evals_root / "gpt-5-mini" / ".medarc_eval_metadata.json",
-        {
-            "version": 1,
-            "model": "gpt-5-mini",
-            "outputs": {
-                "medqa/seed-1618": {
-                    "env_id": "medqa-canonical",
-                    "variant_id": "seed-1618",
-                    "results_path": "medqa/seed-1618",
-                }
-            },
-        },
-    )
-
-    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
-
-    assert len(records) == 1
-    record = records[0]
-    assert record.manifest.manifest_path == eval_dir / "metadata.json"
-    assert record.manifest.config_source is None
-    assert record.manifest.config_checksum is None
-    assert record.manifest_env_id == "medqa-canonical"
-    normalized = load_normalized_metadata(record)
-    assert normalized.variant_id == "seed-1618"
-    assert normalized.variant_payload is None
-
-
-def test_discover_run_records_ignores_stale_model_helper_entries(tmp_path: Path) -> None:
-    evals_root = tmp_path / "runs" / "evals"
-    _write_json(
-        evals_root / "gpt-5-mini" / ".medarc_eval_metadata.json",
-        {
-            "version": 1,
-            "model": "gpt-5-mini",
-            "outputs": {
-                "medqa/base": {
-                    "env_id": "medqa",
-                    "variant_id": "base",
-                    "results_path": "medqa/base",
-                }
-            },
-        },
-    )
-
-    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
-
-    assert records == []
-
-
-def test_discover_run_records_ignores_model_helper_entries_outside_model_dir(tmp_path: Path) -> None:
-    evals_root = tmp_path / "runs" / "evals"
-    eval_dir = evals_root / "gpt-5-mini" / "medqa" / "base"
-    _write_eval_output(eval_dir, {"env_id": "medqa", "model": "gpt-5-mini"})
-    _write_json(
-        evals_root / "other-model" / ".medarc_eval_metadata.json",
-        {
-            "version": 1,
-            "model": "other-model",
-            "outputs": {
-                "../gpt-5-mini/medqa/base": {
-                    "env_id": "wrong-env",
-                    "variant_id": "wrong-variant",
-                    "results_path": "../gpt-5-mini/medqa/base",
-                }
-            },
-        },
-    )
+def test_discover_run_records_preserves_variant_for_env_slug_with_double_hyphen(tmp_path: Path) -> None:
+    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "foo--bar" / "base"
+    _write_eval_output(eval_dir, {"env_id": "foo--bar", "model": "gpt-5-mini"})
 
     records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
 
     assert len(records) == 1
     record = records[0]
-    assert record.manifest_env_id == "medqa"
+    assert record.model_id == "gpt-5-mini"
+    assert record.manifest_env_id == "foo--bar"
     normalized = load_normalized_metadata(record)
     assert normalized.variant_id == "base"
 

From 78d8908899d173bb6e70b103ed48d0cff6700683 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 7 May 2026 20:05:46 +0000
Subject: [PATCH 23/53] Default bench outputs to autoresume

---
 docs/medarc-eval-bench.md             | 32 +++++++++++++--------
 docs/medarc-verifiers-architecture.md |  9 +++---
 medarc_verifiers/cli/main.py          | 15 ++++------
 tests/test_cli/test_main.py           | 40 ++++++++++++++++++++++++---
 4 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index f5cce689..eb26bdd2 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -107,23 +107,33 @@ write separate bench metadata. Processing recovers exact model and environment
 identity from upstream metadata, and recovers variant identity from the
 deterministic path segment.
 
-## Resume and Force
+## Output Root, Resume, and Force
 
-Bench writes each eval to a deterministic result directory. Existing output
-reuse is explicit:
+Bench writes each eval to a deterministic result directory. If neither
+`--output-dir` nor TOML `output_dir` is set, the output root defaults to
+`runs/evals`.
+
+Existing valid outputs resume automatically. This makes Slurm retries
+idempotent for a fixed `--eval-index`:
 
 ```bash
-# Resume an existing deterministic output using upstream resume behavior
-medarc-eval bench --config configs/eval/medmarks-verified.toml --resume
+medarc-eval bench --config configs/eval/medmarks-verified.toml --eval-index "$SLURM_ARRAY_TASK_ID"
+```
 
+If the deterministic target already contains both `metadata.json` and
+`results.jsonl`, MedARC passes that path to upstream `verifiers` as
+`resume_path` and lets upstream resume. If the target exists but is malformed or
+partial, bench fails unless `--force` is set:
+
+```bash
 # Archive existing deterministic outputs and rerun
 medarc-eval bench --config configs/eval/medmarks-verified.toml --force
 ```
 
-Without `--resume` or `--force`, an existing deterministic output fails.
-`--resume` delegates compatibility checks to upstream `verifiers`; MedARC does
-not maintain a sampling-argument allowlist or fingerprint blocker for resume
-safety. New provider arguments pass through to upstream.
+`--resume` is still accepted for compatibility, but deterministic bench outputs
+resume automatically when valid artifacts exist. MedARC does not maintain a
+sampling-argument allowlist or fingerprint blocker for resume safety. New
+provider arguments pass through to upstream.
 
 ## Common Flags
 
@@ -132,8 +142,8 @@ safety. New provider arguments pass through to upstream.
 | `--config PATH` | Required path to an upstream TOML eval config |
 | `--dry-run` | Resolve evals and print the deterministic plan |
 | `--force` | Archive existing deterministic output and rerun |
-| `--resume` | Resume an existing deterministic output via upstream `verifiers` |
-| `--output-dir PATH` | Override the config output directory |
+| `--resume` | Compatibility flag; valid deterministic outputs resume automatically |
+| `--output-dir PATH` | Override the config output directory, default `runs/evals` |
 | `--env-dir PATH` | Directory containing local environments |
 | `--endpoints-path PATH` | Endpoint registry path, default `configs/endpoints.toml` |
 | `--api-base-url URL` | Override API base URL for every eval |
diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index a09efce3..dbd984a7 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -128,10 +128,11 @@ TOML bench writes eval outputs under deterministic directories:
 - Non-variant evals: `runs/evals/<model>/<env>/base/`
 - Variant evals: `runs/evals/<model>/<env>/<variant_id>/`
 
-Existing output reuse is explicit. Without `--resume` or `--force`, bench fails
-when the target directory already exists. `--resume` passes the deterministic
-target as upstream `EvalConfig.resume_path` and trusts upstream resume
-validation. `--force` archives the existing target and reruns.
+If neither `--output-dir` nor TOML `output_dir` is set, the output root
+defaults to `runs/evals`. Existing valid outputs resume automatically: bench
+passes the deterministic target as upstream `EvalConfig.resume_path` and trusts
+upstream resume validation. Partial or malformed existing targets fail unless
+`--force` is set, which archives the existing target and reruns.
 
 `medarc-eval bench` does not monkey-patch upstream metadata saving and does not
 write MedARC identity into upstream `metadata.json`. Variant identity is the
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index 54e8d798..a253d2f8 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -69,7 +69,11 @@ def build_batch_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("-c", "--config", required=True, type=Path, help="Path to an upstream TOML eval config file.")
     parser.add_argument("--force", action="store_true", help="Archive existing deterministic output and rerun.")
-    parser.add_argument("--resume", action="store_true", help="Resume an existing deterministic output path.")
+    parser.add_argument(
+        "--resume",
+        action="store_true",
+        help="Accepted for compatibility; deterministic bench outputs resume automatically when valid artifacts exist.",
+    )
     parser.add_argument("--output-dir", type=Path, help="Override the output directory from the configuration.")
     parser.add_argument(
         "--env-dir",
@@ -1342,7 +1346,6 @@ def _execute_toml_plan(
             _prepare_toml_results_dir(
                 results_path,
                 force=bool(args.force),
-                resume=bool(args.resume),
             )
             run_config = config.model_copy(update={"resume_path": results_path, "save_results": True})
             logger.info("Running TOML eval %d/%d: %s on %s", index, len(eval_configs), config.env_id, config.model)
@@ -1367,7 +1370,6 @@ def _prepare_toml_results_dir(
     results_path: Path,
     *,
     force: bool,
-    resume: bool,
 ) -> None:
     if results_path.exists() and force:
         _archive_existing_path(results_path)
@@ -1377,14 +1379,9 @@ def _prepare_toml_results_dir(
     if results_path.exists():
         has_metadata = metadata_path.is_file()
         has_results = results_file.is_file()
-        if not resume:
-            raise ValueError(
-                f"Output already exists: {results_path}. Use --resume to continue this output, "
-                "--force to archive and rerun, or add variant_id/name if this is a distinct eval."
-            )
         if not (has_metadata and has_results):
             raise ValueError(
-                f"Cannot resume {results_path}: metadata.json and results.jsonl are both required. "
+                f"Cannot use existing output {results_path}: metadata.json and results.jsonl are both required. "
                 "Use --force to archive and rerun."
             )
         return
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index acbb79aa..76fb7bdc 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -365,7 +365,34 @@ async def fake_run(config, **_kwargs):
     assert captured == [4]
 
 
-def test_toml_bench_refuses_existing_output_without_resume(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+def test_toml_bench_defaults_to_runs_evals(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    monkeypatch.chdir(tmp_path)  # run from tmp_path so the relative default output root lands inside it
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    calls: list[Path] = []  # resume_path values seen by the fake runner
+
+    async def fake_run(config, **_kwargs):  # stand-in for main.run_evaluation
+        results_path = Path(config.resume_path)
+        calls.append(results_path)
+        (results_path / "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
+        (results_path / "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path)]) == 0  # no --output-dir and no TOML output_dir
+    assert calls == [Path("runs/evals/gpt-5-mini/medqa/base")]  # relative default root plus model/env/base
+
+
+def test_toml_bench_auto_resumes_existing_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
     _write_config(
@@ -378,15 +405,20 @@ def test_toml_bench_refuses_existing_output_without_resume(monkeypatch: pytest.M
         env_args = { shuffle_seed = 1618 }
         """,
     )
+    calls = 0
 
     async def fake_run(config, **_kwargs):
+        nonlocal calls
+        calls += 1
         Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        Path(config.resume_path, "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
         return {"outputs": [], "metadata": {}}
 
     monkeypatch.setattr(main, "run_evaluation", fake_run)
 
     assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+    assert calls == 2
 
 
 def test_toml_bench_resume_refuses_malformed_existing_output(
@@ -416,7 +448,7 @@ async def fake_run(config, **_kwargs):
 
     monkeypatch.setattr(main, "run_evaluation", fake_run)
 
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--resume"]) == 1
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 1
     assert calls == 0
 
 
@@ -485,7 +517,7 @@ async def fake_run(config, **_kwargs):
     monkeypatch.setattr(main, "run_evaluation", fake_run)
 
     assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
-    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir), "--resume"]) == 0
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
 
     metadata = json.loads((output_dir / "gpt-5-mini" / "medqa" / "base" / "metadata.json").read_text())
     assert metadata["avg_reward"] == 0.75

From fd33fad175ca2b9487aeb8c3e47a2530df818ab9 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 06:05:20 +0000
Subject: [PATCH 24/53] Add legacy raw run conversion script

---
 scripts/convert_legacy_raw_runs.py            | 438 ++++++++++++++++++
 .../test_convert_legacy_raw_runs.py           | 234 ++++++++++
 2 files changed, 672 insertions(+)
 create mode 100644 scripts/convert_legacy_raw_runs.py
 create mode 100644 tests/test_scripts/test_convert_legacy_raw_runs.py

diff --git a/scripts/convert_legacy_raw_runs.py b/scripts/convert_legacy_raw_runs.py
new file mode 100644
index 00000000..30c9ff38
--- /dev/null
+++ b/scripts/convert_legacy_raw_runs.py
@@ -0,0 +1,438 @@
+"""Convert retired YAML-runner raw outputs into current eval-output directories."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Mapping, Sequence
+
+from medarc_verifiers.cli.eval_identity import BASE_VARIANT_ID, slug_component
+
+MANIFEST_FILENAME = "run_manifest.json"
+RESULTS_FILENAME = "results.jsonl"
+METADATA_FILENAME = "metadata.json"
+SUPPORTED_MANIFEST_VERSION = 3
+MAX_VARIANT_LENGTH = 160
+
+
+@dataclass(frozen=True, slots=True)
+class ConversionEntry:  # one row of the conversion report (a manifest-level or per-job outcome)
+    run_id: str
+    job_id: str | None  # None for manifest-level entries and for jobs without a usable job_id
+    status: str  # "converted", "would_convert", "skipped" or "failed" - the values ConversionReport counts
+    reason: str
+    source_results: str | None = None
+    target_dir: str | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class ConversionReport:  # immutable result of one convert_legacy_raw_runs call
+    entries: tuple[ConversionEntry, ...]
+    dry_run: bool
+
+    @property
+    def converted(self) -> int:  # number of entries with status "converted"
+        return sum(1 for entry in self.entries if entry.status == "converted")
+
+    @property
+    def would_convert(self) -> int:  # number of entries with status "would_convert"
+        return sum(1 for entry in self.entries if entry.status == "would_convert")
+
+    @property
+    def skipped(self) -> int:  # number of entries with status "skipped"
+        return sum(1 for entry in self.entries if entry.status == "skipped")
+
+    @property
+    def failed(self) -> int:  # number of entries with status "failed"
+        return sum(1 for entry in self.entries if entry.status == "failed")
+
+    def to_dict(self) -> dict[str, Any]:  # plain-dict view: dry_run flag, per-status counts, all entries
+        return {
+            "dry_run": self.dry_run,
+            "summary": {
+                "converted": self.converted,
+                "would_convert": self.would_convert,
+                "skipped": self.skipped,
+                "failed": self.failed,
+            },
+            "entries": [
+                {
+                    "run_id": entry.run_id,
+                    "job_id": entry.job_id,
+                    "status": entry.status,
+                    "reason": entry.reason,
+                    "source_results": entry.source_results,
+                    "target_dir": entry.target_dir,
+                }
+                for entry in self.entries
+            ],
+        }
+
+
+@dataclass(frozen=True, slots=True)
+class _PlannedConversion:  # a job that passed planning in _plan_job and has a resolved target directory
+    run_id: str
+    job: Mapping[str, Any]  # the raw job object from the manifest's "jobs" list
+    source_results: Path
+    source_metadata: Path | None  # None when no metadata file exists for the job
+    source_metadata_payload: Mapping[str, Any]  # empty mapping when source_metadata is None
+    target_dir: Path  # output_dir / slug(model_id) / slug(env_id) / variant_id
+    env_id: str
+    model_id: str
+    variant_id: str
+    manifest: Mapping[str, Any]  # the parsed manifest the job came from
+
+
+def convert_legacy_raw_runs(
+    *,
+    raw_dir: Path | str,
+    output_dir: Path | str,
+    dry_run: bool = True,
+) -> ConversionReport:
+    """Plan a conversion for each job in raw_dir/*/run_manifest.json and run it unless dry_run is set."""
+    raw_path = Path(raw_dir)
+    output_path = Path(output_dir)
+    entries: list[ConversionEntry] = []
+    plans: list[_PlannedConversion] = []
+
+    if not raw_path.exists():
+        return ConversionReport(
+            entries=(
+                ConversionEntry(
+                    run_id=str(raw_path),
+                    job_id=None,
+                    status="failed",
+                    reason="raw directory does not exist",
+                ),
+            ),
+            dry_run=dry_run,
+        )
+
+    for manifest_path in sorted(raw_path.glob(f"*/{MANIFEST_FILENAME}")):
+        run_dir = manifest_path.parent
+        try:
+            manifest = _read_json_object(manifest_path)
+        except ValueError as exc:
+            entries.append(
+                ConversionEntry(
+                    run_id=run_dir.name,
+                    job_id=None,
+                    status="failed",
+                    reason=str(exc),
+                )
+            )
+            continue
+
+        version = manifest.get("version")
+        if version != SUPPORTED_MANIFEST_VERSION:
+            entries.append(
+                ConversionEntry(
+                    run_id=_run_id(manifest, run_dir),
+                    job_id=None,
+                    status="failed",
+                    reason=f"unsupported manifest version {version!r}; expected {SUPPORTED_MANIFEST_VERSION}",
+                )
+            )
+            continue
+
+        jobs = manifest.get("jobs")
+        if not isinstance(jobs, list):
+            entries.append(
+                ConversionEntry(
+                    run_id=_run_id(manifest, run_dir),
+                    job_id=None,
+                    status="failed",
+                    reason="manifest jobs must be a list",
+                )
+            )
+            continue
+
+        for job in jobs:
+            if not isinstance(job, Mapping):
+                entries.append(
+                    ConversionEntry(
+                        run_id=_run_id(manifest, run_dir),
+                        job_id=None,
+                        status="skipped",
+                        reason="job entry is not an object",
+                    )
+                )
+                continue
+            planned = _plan_job(run_dir, manifest, job, output_path)
+            if isinstance(planned, ConversionEntry):
+                entries.append(planned)
+            else:
+                plans.append(planned)
+
+    entries.extend(_collision_entries(plans, existing_targets_fail=not dry_run))
+    # Plans whose target was reported as a collision or as already existing are not run.
+    failed_targets = {
+        entry.target_dir
+        for entry in entries
+        if entry.status == "failed"
+        and entry.target_dir is not None
+        and ("collision" in entry.reason or "already exists" in entry.reason)
+    }
+    runnable_plans = [plan for plan in plans if str(plan.target_dir) not in failed_targets]
+
+    for plan in runnable_plans:
+        if dry_run:
+            entries.append(_entry_for_plan(plan, status="would_convert", reason="dry run"))
+            continue
+        try:
+            _write_conversion(plan)
+        except OSError as exc:
+            entries.append(_entry_for_plan(plan, status="failed", reason=f"write failed: {exc}"))
+            continue
+        entries.append(_entry_for_plan(plan, status="converted", reason="converted"))
+
+    return ConversionReport(entries=tuple(entries), dry_run=dry_run)
+
+
+def _plan_job(
+    run_dir: Path,
+    manifest: Mapping[str, Any],
+    job: Mapping[str, Any],
+    output_dir: Path,
+) -> _PlannedConversion | ConversionEntry:  # a plan on success, otherwise a "skipped" report entry
+    run_id = _run_id(manifest, run_dir)
+    job_id = _string_or_none(job.get("job_id"))
+    if not job_id:
+        return ConversionEntry(run_id=run_id, job_id=None, status="skipped", reason="missing job_id")
+
+    status = (_string_or_none(job.get("status")) or "pending").lower()  # a missing status counts as "pending"
+    if status != "completed":  # only completed jobs are planned
+        return ConversionEntry(run_id=run_id, job_id=job_id, status="skipped", reason=f"job status is {status!r}")
+
+    model_id = _string_or_none(job.get("model_id"))
+    env_id = _string_or_none(job.get("env_id"))
+    if not model_id or not env_id:  # both are needed to build the target path
+        return ConversionEntry(run_id=run_id, job_id=job_id, status="skipped", reason="missing model_id or env_id")
+
+    variant = _resolve_variant(job, env_id)
+    if isinstance(variant, str):  # a str is the resolved variant id; a dict carries the skip reason
+        variant_id = variant
+    else:
+        return ConversionEntry(run_id=run_id, job_id=job_id, status="skipped", reason=variant["reason"])
+
+    results_path = _resolve_results_path(run_dir, manifest, job, job_id)
+    if not results_path.exists():
+        return ConversionEntry(
+            run_id=run_id,
+            job_id=job_id,
+            status="skipped",
+            reason="missing results.jsonl",
+            source_results=str(results_path),
+        )
+
+    source_metadata = _resolve_metadata_path(run_dir, manifest, job, results_path)
+    source_metadata_payload: Mapping[str, Any] = {}  # stays empty when there is no metadata file
+    if source_metadata is not None and not source_metadata.exists():  # declared metadata path is missing
+        source_metadata = None
+    if source_metadata is not None:
+        try:
+            source_metadata_payload = _read_json_object(source_metadata)
+        except ValueError as exc:  # unreadable metadata skips the job rather than failing the run
+            return ConversionEntry(
+                run_id=run_id,
+                job_id=job_id,
+                status="skipped",
+                reason=f"invalid metadata.json: {exc}",
+                source_results=str(results_path),
+            )
+
+    target_dir = output_dir / slug_component(model_id) / slug_component(env_id) / variant_id
+    return _PlannedConversion(
+        run_id=run_id,
+        job=job,
+        source_results=results_path,
+        source_metadata=source_metadata,
+        source_metadata_payload=source_metadata_payload,
+        target_dir=target_dir,
+        env_id=env_id,
+        model_id=model_id,
+        variant_id=variant_id,
+        manifest=manifest,
+    )
+
+
+def _resolve_variant(job: Mapping[str, Any], env_id: str) -> str | dict[str, str]:
+    raw = _string_or_none(job.get("env_variant_id"))
+    if raw is None or raw == env_id:  # no variant recorded, or it is just the env id
+        return BASE_VARIANT_ID
+
+    prefix_colon = f"{env_id}::"
+    prefix_slash = f"{env_id}/"
+    if raw.startswith(prefix_colon):
+        variant_id = raw[len(prefix_colon) :]  # strip the "<env_id>::" prefix
+    elif raw.startswith(prefix_slash):
+        variant_id = raw[len(prefix_slash) :]  # strip the "<env_id>/" prefix
+    else:  # any other shape is reported as a {"reason": ...} dict instead of being guessed at
+        return {"reason": f"ambiguous env_variant_id {raw!r} for env_id {env_id!r}"}
+
+    if not variant_id:
+        return {"reason": f"empty parsed variant from env_variant_id {raw!r}"}
+    if variant_id == BASE_VARIANT_ID:  # an explicit variant may not reuse the reserved base id
+        return {"reason": "variant identity conflict: source variant maps to reserved base"}
+    if "/" in variant_id or "\\" in variant_id:  # path separators would add directory levels
+        return {"reason": f"path-unsafe variant {variant_id!r}"}
+    if slug_component(variant_id, max_length=MAX_VARIANT_LENGTH) != variant_id:  # must already be a slug
+        return {"reason": f"path-unsafe variant {variant_id!r}"}
+    return variant_id
+
+
+def _resolve_results_path(
+    run_dir: Path,
+    manifest: Mapping[str, Any],
+    job: Mapping[str, Any],
+    job_id: str,
+) -> Path:
+    """Locate the job's results file; the returned path is not checked for existence."""
+    declared = _string_or_none(job.get("results_relpath")) or _string_or_none(job.get("results_dir"))
+    if not declared:
+        # Nothing recorded on the job: fall back to <run_dir>/<job_id>, which bypasses artifacts_root.
+        return run_dir / job_id / RESULTS_FILENAME
+    root = _string_or_none(manifest.get("artifacts_root")) or "."
+    located = run_dir / root / declared
+    # The recorded path may name the file itself or only its directory.
+    return located if located.name == RESULTS_FILENAME else located / RESULTS_FILENAME
+
+
+def _resolve_metadata_path(
+    run_dir: Path,
+    manifest: Mapping[str, Any],
+    job: Mapping[str, Any],
+    results_path: Path,
+) -> Path | None:
+    """Return the declared metadata path (unchecked), else an existing sibling of the results."""
+    declared = _string_or_none(job.get("metadata_relpath"))
+    if declared is not None:
+        return run_dir / (_string_or_none(manifest.get("artifacts_root")) or ".") / declared
+    sibling = results_path.parent / METADATA_FILENAME
+    return sibling if sibling.exists() else None
+
+
+def _collision_entries(
+    plans: Sequence[_PlannedConversion],
+    *,
+    existing_targets_fail: bool,
+) -> list[ConversionEntry]:
+    """Build "failed" entries for plans sharing a target and, optionally, for targets already on disk."""
+    grouped: dict[Path, list[_PlannedConversion]] = {}
+    for plan in plans:
+        grouped.setdefault(plan.target_dir, []).append(plan)
+    failures: list[ConversionEntry] = []
+    for target, claimants in sorted(grouped.items(), key=lambda pair: str(pair[0])):  # deterministic order
+        if len(claimants) > 1:
+            clash = "planned output path collision"
+            failures.extend(_entry_for_plan(plan, status="failed", reason=clash) for plan in claimants)
+        elif existing_targets_fail and target.exists():
+            failures.append(_entry_for_plan(claimants[0], status="failed", reason="target path already exists"))
+    return failures
+
+
+def _write_conversion(plan: _PlannedConversion) -> None:
+    """Create the target directory, copy the results file into it, and write the rebuilt metadata."""
+    destination = plan.target_dir
+    destination.mkdir(parents=True, exist_ok=False)  # raises FileExistsError rather than overwriting
+    shutil.copy2(plan.source_results, destination / RESULTS_FILENAME)
+    # Metadata is written last, so a failed copy never leaves a metadata.json without results.
+    encoded = json.dumps(_converted_metadata(plan), indent=2, sort_keys=True) + "\n"
+    (destination / METADATA_FILENAME).write_text(encoded, encoding="utf-8")
+
+
+def _converted_metadata(plan: _PlannedConversion) -> dict[str, Any]:
+    """Assemble metadata.json content: source metadata first, then manifest model/job fallbacks."""
+    mapping_keys = ("env_args", "sampling_args")
+    scalar_keys = ("num_examples", "rollouts_per_example", "avg_reward")
+    source = plan.source_metadata_payload or {}
+    # Values present in the source metadata are copied through verbatim and always win.
+    metadata: dict[str, Any] = {key: source[key] for key in mapping_keys + scalar_keys if key in source}
+
+    # Next preference for sampling_args: the manifest's per-model configuration.
+    model_table = plan.manifest.get("models")
+    model_config = model_table.get(plan.model_id) if isinstance(model_table, Mapping) else None
+    model_sampling = model_config.get("sampling_args") if isinstance(model_config, Mapping) else None
+    if "sampling_args" not in metadata and isinstance(model_sampling, Mapping):
+        metadata["sampling_args"] = dict(model_sampling)
+
+    # Last preference: values recorded on the job entry itself.
+    for key in mapping_keys:
+        if key not in metadata and isinstance(plan.job.get(key), Mapping):
+            metadata[key] = dict(plan.job[key])
+    for key in scalar_keys:
+        if key not in metadata and plan.job.get(key) is not None:
+            metadata[key] = plan.job[key]
+
+    metadata.setdefault("env_args", {})  # both mapping keys are always emitted
+    metadata.setdefault("sampling_args", {})
+    # Identity fields are taken from the plan, never from the source metadata.
+    metadata["env_id"] = plan.env_id
+    metadata["model"] = plan.model_id
+    return metadata
+
+
+def _entry_for_plan(plan: _PlannedConversion, *, status: str, reason: str) -> ConversionEntry:
+    """Create a report entry for ``plan`` carrying the given status and reason."""
+    fields: dict[str, Any] = {
+        "run_id": plan.run_id,
+        "job_id": _string_or_none(plan.job.get("job_id")),
+        "source_results": str(plan.source_results),  # paths are reported as strings
+        "target_dir": str(plan.target_dir),
+    }
+    return ConversionEntry(status=status, reason=reason, **fields)
+
+
+def _read_json_object(path: Path) -> dict[str, Any]:  # parse ``path`` as a UTF-8 JSON object
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:  # read failures and parse failures both surface as ValueError
+        raise ValueError(f"failed to parse {path}: {exc}") from exc
+    if not isinstance(payload, dict):  # arrays and scalars are valid JSON but are rejected here
+        raise ValueError(f"expected JSON object in {path}")
+    return payload
+
+
+def _run_id(manifest: Mapping[str, Any], run_dir: Path) -> str:  # run id for this manifest
+    return _string_or_none(manifest.get("run_id")) or run_dir.name  # blank/missing -> directory name
+
+
+def _string_or_none(value: Any) -> str | None:
+    """Return ``str(value)`` stripped of surrounding whitespace, or None when missing or blank."""
+    if value is None:
+        return None
+    return str(value).strip() or None
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser; dry-run is the default and ``--no-dry-run`` enables writes."""
+    runs_root = Path("runs")
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--raw-dir", type=Path, default=runs_root / "raw")
+    cli.add_argument("--output-dir", type=Path, default=runs_root / "evals")
+    # Both switches store into ``dry_run``; it starts as True, so --dry-run only restates the default.
+    cli.add_argument("--dry-run", action="store_true", default=True, help="plan conversion without writing files")
+    cli.add_argument(
+        "--no-dry-run", dest="dry_run", action="store_false", help="write converted eval-output directories"
+    )
+    cli.add_argument("--report-path", type=Path, help="optional JSON report path")
+    return cli
+
+
+def main(argv: Sequence[str] | None = None) -> int:  # CLI entry point; the return value is the exit status
+    args = _build_parser().parse_args(argv)
+    report = convert_legacy_raw_runs(raw_dir=args.raw_dir, output_dir=args.output_dir, dry_run=args.dry_run)
+    encoded = json.dumps(report.to_dict(), indent=2, sort_keys=True)
+    if args.report_path:  # the report file is optional
+        args.report_path.parent.mkdir(parents=True, exist_ok=True)
+        args.report_path.write_text(encoded + "\n", encoding="utf-8")
+    print(encoded)  # the report is always printed, even when it was also written to a file
+    return 1 if report.failed else 0  # non-zero when report.failed is truthy
+
+
+if __name__ == "__main__":  # only when executed as a script
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/tests/test_scripts/test_convert_legacy_raw_runs.py b/tests/test_scripts/test_convert_legacy_raw_runs.py
new file mode 100644
index 00000000..513d54fb
--- /dev/null
+++ b/tests/test_scripts/test_convert_legacy_raw_runs.py
@@ -0,0 +1,234 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from scripts.convert_legacy_raw_runs import convert_legacy_raw_runs, main
+
+
+def _write_json(path: Path, payload: dict) -> None:  # test helper: dump ``payload`` as JSON at ``path``
+    path.parent.mkdir(parents=True, exist_ok=True)  # create missing parent directories first
+    path.write_text(json.dumps(payload), encoding="utf-8")
+
+
+def _write_manifest(  # write <raw_dir>/<run_id>/run_manifest.json and return the run directory
+    raw_dir: Path,
+    *,
+    run_id: str = "run-1",
+    jobs: list[dict] | None = None,
+) -> Path:
+    run_dir = raw_dir / run_id
+    manifest = {
+        "version": 3,
+        "run_id": run_id,
+        "name": "legacy",
+        "config_source": "configs/legacy.yaml",
+        "config_checksum": "abc123",
+        "created_at": "2024-01-01T00:00:00Z",
+        "updated_at": "2024-01-01T00:01:00Z",
+        "artifacts_root": ".",  # job relpaths resolve directly under the run directory
+        "models": {"gpt/mini": {"sampling_args": {"temperature": 0.1}}},  # per-model fallback sampling_args
+        "env_templates": {},
+        "summary": {"completed": 1, "total": 1},
+        "jobs": jobs if jobs is not None else [_job()],  # default: one job built by _job()
+    }
+    _write_json(run_dir / "run_manifest.json", manifest)
+    return run_dir
+
+
+def _job(**overrides: object) -> dict:  # default manifest job entry; keyword overrides replace fields
+    payload = {
+        "job_id": "job-1",
+        "model_id": "gpt/mini",
+        "env_id": "demo/env",
+        "env_template_id": "demo-template",
+        "env_variant_id": "demo/env",  # equal to env_id, which the converter maps to the base variant
+        "env_args": {"fold": "dev"},
+        "sampling_args": {"top_p": 0.9},
+        "status": "completed",
+        "results_relpath": "job-1/results.jsonl",
+        "metadata_relpath": "job-1/metadata.json",  # NOTE: stays at job-1/ unless a test overrides it
+        "num_examples": 2,
+        "rollouts_per_example": 1,
+        "avg_reward": 0.75,
+    }
+    payload.update(overrides)
+    return payload
+
+
+def _write_artifacts(run_dir: Path, *, job_id: str = "job-1") -> None:  # metadata.json + one-row results.jsonl
+    _write_json(
+        run_dir / job_id / "metadata.json",
+        {
+            "env_args": {"fold": "metadata"},  # differs from _job()'s {"fold": "dev"}
+            "sampling_args": {"temperature": 0.2},  # differs from the manifest model's 0.1
+            "num_examples": 2,
+            "rollouts_per_example": 1,
+            "avg_reward": 0.5,  # differs from _job()'s 0.75
+        },
+    )
+    (run_dir / job_id / "results.jsonl").write_text('{"example_id":"ex-1","reward":1.0}\n', encoding="utf-8")
+
+
+def test_dry_run_lists_jobs_and_writes_nothing(tmp_path: Path) -> None:  # planning only, no writes
+    raw_dir = tmp_path / "runs" / "raw"
+    output_dir = tmp_path / "runs" / "evals"
+    run_dir = _write_manifest(raw_dir)
+    _write_artifacts(run_dir)
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir)  # dry_run is not passed
+
+    assert report.would_convert == 1
+    assert report.failed == 0
+    assert not output_dir.exists()  # planning must leave the output tree untouched
+    entry = report.entries[0]
+    assert entry.target_dir is not None
+    assert Path(entry.target_dir).parts[-3:] == ("gpt-mini", "demo-env", "base")  # separator-independent
+
+
+def test_converts_valid_manifest_job_to_processable_eval_output(tmp_path: Path) -> None:
+    raw_dir = tmp_path / "runs" / "raw"
+    output_dir = tmp_path / "runs" / "evals"
+    run_dir = _write_manifest(raw_dir)
+    _write_artifacts(run_dir)
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False)
+
+    assert report.converted == 1
+    target = output_dir / "gpt-mini" / "demo-env" / "base"
+    assert (target / "results.jsonl").read_text(encoding="utf-8")  # copied file is non-empty
+    metadata = json.loads((target / "metadata.json").read_text(encoding="utf-8"))
+    assert metadata == {
+        "avg_reward": 0.5,  # from the source metadata.json, not the manifest job's 0.75
+        "env_args": {"fold": "metadata"},  # source metadata wins over the job's {"fold": "dev"}
+        "env_id": "demo/env",  # original, un-slugged identifiers
+        "model": "gpt/mini",
+        "num_examples": 2,
+        "rollouts_per_example": 1,
+        "sampling_args": {"temperature": 0.2},  # source metadata wins over model and job values
+    }
+    assert not (target / "bench_index.json").exists()  # the converter must not emit this file
+    assert not (target / ".medarc_eval_metadata.json").exists()  # nor this one
+    assert (run_dir / "job-1" / "results.jsonl").exists()  # the source is copied, not moved
+
+
+def test_skips_missing_results(tmp_path: Path) -> None:  # manifest exists, artifacts do not
+    raw_dir = tmp_path / "runs" / "raw"
+    _write_manifest(raw_dir)  # deliberately no _write_artifacts() call
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=tmp_path / "evals", dry_run=False)
+
+    assert report.skipped == 1
+    assert report.entries[0].reason == "missing results.jsonl"
+
+
+def test_skips_non_completed_jobs(tmp_path: Path) -> None:  # artifacts exist but the job is not completed
+    raw_dir = tmp_path / "runs" / "raw"
+    run_dir = _write_manifest(raw_dir, jobs=[_job(status="failed")])
+    _write_artifacts(run_dir)
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=tmp_path / "evals", dry_run=False)
+
+    assert report.skipped == 1
+    assert "failed" in report.entries[0].reason  # the skip reason mentions the job status
+
+
+def test_target_collision_fails_without_writing(tmp_path: Path) -> None:  # destination already on disk
+    raw_dir = tmp_path / "runs" / "raw"
+    output_dir = tmp_path / "runs" / "evals"
+    run_dir = _write_manifest(raw_dir)
+    _write_artifacts(run_dir)
+    target = output_dir / "gpt-mini" / "demo-env" / "base"
+    target.mkdir(parents=True)  # pre-create the directory the default job maps to
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False)
+
+    assert report.failed == 1
+    assert "already exists" in report.entries[0].reason
+    assert not (target / "metadata.json").exists()  # nothing was written into the existing directory
+
+
+def test_report_includes_valid_jobs_when_another_job_fails(tmp_path: Path) -> None:
+    raw_dir = tmp_path / "runs" / "raw"
+    output_dir = tmp_path / "runs" / "evals"
+    run_dir = _write_manifest(
+        raw_dir,
+        jobs=[
+            _job(job_id="valid", results_relpath="valid/results.jsonl"),  # metadata_relpath stays at job-1/
+            _job(job_id="collision", results_relpath="collision/results.jsonl", env_variant_id="demo/env::seed-1"),
+        ],
+    )
+    _write_artifacts(run_dir, job_id="valid")
+    _write_artifacts(run_dir, job_id="collision")
+    (output_dir / "gpt-mini" / "demo-env" / "seed-1").mkdir(parents=True)  # blocks only the seed-1 target
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False)
+
+    assert report.failed == 1
+    assert report.converted == 1  # the failure does not prevent the other job from converting
+    by_job = {entry.job_id: entry for entry in report.entries}
+    assert by_job["collision"].status == "failed"
+    assert by_job["valid"].status == "converted"
+    assert (output_dir / "gpt-mini" / "demo-env" / "base" / "metadata.json").exists()
+
+
+def test_invalid_existing_metadata_is_skipped(tmp_path: Path) -> None:  # unparsable metadata.json
+    raw_dir = tmp_path / "runs" / "raw"
+    output_dir = tmp_path / "runs" / "evals"
+    run_dir = _write_manifest(raw_dir)
+    (run_dir / "job-1").mkdir(parents=True)
+    (run_dir / "job-1" / "metadata.json").write_text("not json", encoding="utf-8")  # not valid JSON
+    (run_dir / "job-1" / "results.jsonl").write_text('{"example_id":"ex-1"}\n', encoding="utf-8")
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False)
+
+    assert report.skipped == 1  # reported as skipped rather than failed
+    assert "invalid metadata.json" in report.entries[0].reason
+    assert not output_dir.exists()  # nothing was written
+
+
+def test_path_unsafe_or_ambiguous_variants_are_skipped(tmp_path: Path) -> None:  # three rejected variant ids
+    raw_dir = tmp_path / "runs" / "raw"
+    run_dir = _write_manifest(
+        raw_dir,
+        jobs=[
+            _job(job_id="ambiguous", env_variant_id="other-env::seed-1", results_relpath="ambiguous/results.jsonl"),
+            _job(job_id="unsafe", env_variant_id="demo/env::bad value", results_relpath="unsafe/results.jsonl"),
+            _job(job_id="base-conflict", env_variant_id="demo/env::base", results_relpath="base-conflict/results.jsonl"),
+        ],
+    )
+    for job_id in ("ambiguous", "unsafe", "base-conflict"):
+        _write_artifacts(run_dir, job_id=job_id)  # results exist, so only the variant id can cause the skip
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=tmp_path / "evals", dry_run=False)
+
+    assert report.skipped == 3
+    reasons = {entry.job_id: entry.reason for entry in report.entries}
+    assert "ambiguous env_variant_id" in reasons["ambiguous"]  # prefix is not "demo/env"
+    assert "path-unsafe variant" in reasons["unsafe"]  # presumably because of the space in "bad value"
+    assert "reserved base" in reasons["base-conflict"]  # explicit "base" suffix is rejected
+
+
+def test_parses_relative_variant_and_cli_report_path(tmp_path: Path) -> None:  # end-to-end through main()
+    raw_dir = tmp_path / "runs" / "raw"
+    output_dir = tmp_path / "runs" / "evals"
+    report_path = tmp_path / "report.json"
+    run_dir = _write_manifest(raw_dir, jobs=[_job(env_variant_id="demo/env::shuffle_seed-1618")])
+    _write_artifacts(run_dir)
+
+    exit_code = main(
+        [
+            "--raw-dir",
+            str(raw_dir),
+            "--output-dir",
+            str(output_dir),
+            "--no-dry-run",  # without this flag the CLI only plans
+            "--report-path",
+            str(report_path),
+        ]
+    )
+
+    assert exit_code == 0  # main() returns 0 when nothing failed
+    assert (output_dir / "gpt-mini" / "demo-env" / "shuffle_seed-1618" / "metadata.json").exists()
+    payload = json.loads(report_path.read_text(encoding="utf-8"))
+    assert payload["summary"]["converted"] == 1

From eb9be04a6fb4c2e4b09fa9cd3269d97d34aa3ebc Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 06:08:24 +0000
Subject: [PATCH 25/53] Convert process discovery tests to eval outputs

---
 tests/test_cli/test_process_discovery.py | 392 ++++++-----------------
 1 file changed, 104 insertions(+), 288 deletions(-)

diff --git a/tests/test_cli/test_process_discovery.py b/tests/test_cli/test_process_discovery.py
index 22485448..71eaaed4 100644
--- a/tests/test_cli/test_process_discovery.py
+++ b/tests/test_cli/test_process_discovery.py
@@ -3,7 +3,7 @@
 import json
 from pathlib import Path
 
-from medarc_verifiers.cli.process.discovery import RunManifestInfo, discover_run_records
+from medarc_verifiers.cli.process.discovery import discover_run_records
 from medarc_verifiers.cli.process.metadata import load_normalized_metadata
 
 
@@ -12,247 +12,7 @@ def _write_json(path: Path, payload: dict) -> None:
     path.write_text(json.dumps(payload), encoding="utf-8")
 
 
-def _base_manifest(
-    job_payloads: list[dict],
-    *,
-    models: dict | None = None,
-    env_templates: dict | None = None,
-) -> dict:
-    return {
-        "version": 3,
-        "run_id": "job-run-123",
-        "name": "example-run",
-        "config_source": "configs/example.yaml",
-        "config_checksum": "abc123",
-        "created_at": "2024-01-01T00:00:00Z",
-        "updated_at": "2024-01-01T00:05:00Z",
-        "artifacts_root": ".",
-        "models": models or {},
-        "env_templates": env_templates or {},
-        "jobs": job_payloads,
-        "summary": {"completed": 1},
-    }
-
-
-def _manifest_info(*, completed: int, total: int, total_known: bool) -> RunManifestInfo:
-    return RunManifestInfo(
-        job_run_id="job-run-123",
-        run_name="example-run",
-        summary_completed=completed,
-        summary_total=total,
-        summary_total_known=total_known,
-        manifest_path=Path("/tmp/run_manifest.json"),
-        run_dir=Path("/tmp/job-run-123"),
-        created_at="2024-01-01T00:00:00Z",
-        updated_at="2024-01-01T00:05:00Z",
-        config_source="configs/example.yaml",
-        config_checksum="abc123",
-        run_summary_path=Path("/tmp/run_summary.json"),
-    )
-
-
-def test_discover_run_records_basic(tmp_path: Path) -> None:
-    runs_dir = tmp_path / "runs"
-    run_dir = runs_dir / "job-run-123"
-    results_dir = run_dir / "model-env-job"
-
-    manifest_payload = _base_manifest(
-        [
-            {
-                "job_id": "model-env-job",
-                "job_name": "demo-job",
-                "model_id": "gpt-4",
-                "env_id": "demo-env-module",
-                "env_template_id": "demo-env-template",
-                "env_variant_id": "demo-env",
-                "env_args": {"fold": "dev"},
-                "results_relpath": "model-env-job/results.jsonl",
-                "metadata_relpath": "model-env-job/metadata.json",
-                "status": "completed",
-                "started_at": "2024-01-01T00:00:30Z",
-                "ended_at": "2024-01-01T00:01:00Z",
-                "avg_reward": 0.75,
-                "num_examples": 10,
-                "rollouts_per_example": 2,
-                "row_count": 20,
-            }
-        ],
-        models={"gpt-4": {"sampling_args": {"temperature": 0.2}}},
-        env_templates={"demo-env-template": {"module": "demo-env-module"}},
-    )
-    _write_json(run_dir / "run_manifest.json", manifest_payload)
-
-    _write_json(
-        run_dir / "run_summary.json",
-        {
-            "jobs": [
-                {
-                    "job_id": "model-env-job",
-                    "status": "succeeded",
-                    "duration_seconds": 12.5,
-                    "error": None,
-                }
-            ]
-        },
-    )
-
-    _write_json(results_dir / "metadata.json", {"env_id": "demo-env"})
-    (results_dir / "results.jsonl").write_text("{}", encoding="utf-8")
-    _write_json(results_dir / "summary.json", {"env_id": "demo-env"})
-
-    records = discover_run_records(runs_dir)
-    assert len(records) == 1
-    record = records[0]
-    assert record.status == "succeeded"
-    assert record.duration_seconds == 12.5
-    assert record.has_metadata is True
-    assert record.has_results is True
-    assert record.has_summary is True
-    assert record.env_args == {"fold": "dev"}
-    assert record.sampling_args == {"temperature": 0.2}
-    assert record.avg_reward == 0.75
-    assert record.row_count == 20
-    assert record.manifest.job_run_id == "job-run-123"
-
-
-def test_discover_run_records_filters_status(tmp_path: Path) -> None:
-    runs_dir = tmp_path / "runs"
-    run_dir = runs_dir / "job-run-123"
-    results_dir = run_dir / "model-env-job"
-
-    manifest_payload = _base_manifest(
-        [
-            {
-                "job_id": "model-env-job",
-                "model_id": "gpt-4",
-                "env_id": "demo-env-module",
-                "env_template_id": "demo-env-template",
-                "env_variant_id": "demo-env",
-                "env_args": {},
-                "results_relpath": "model-env-job/results.jsonl",
-            }
-        ],
-        models={"gpt-4": {"sampling_args": {}}},
-        env_templates={"demo-env-template": {"module": "demo-env-module"}},
-    )
-    _write_json(run_dir / "run_manifest.json", manifest_payload)
-    _write_json(
-        run_dir / "run_summary.json",
-        {"jobs": [{"job_id": "model-env-job", "status": "failed", "error": "boom"}]},
-    )
-    results_dir.mkdir(parents=True, exist_ok=True)
-    (results_dir / "results.jsonl").write_text("{}", encoding="utf-8")
-
-    filtered = discover_run_records(runs_dir, filter_status=("failed",))
-    assert len(filtered) == 1
-    assert filtered[0].status == "failed"
-
-    filtered_none = discover_run_records(runs_dir, filter_status=("succeeded",))
-    assert filtered_none == []
-
-
-def test_discover_run_records_missing_summary_uses_manifest_status(tmp_path: Path) -> None:
-    runs_dir = tmp_path / "runs"
-    run_dir = runs_dir / "job-run-123"
-    results_dir = run_dir / "model-env-job"
-
-    manifest_payload = _base_manifest(
-        [
-            {
-                "job_id": "model-env-job",
-                "status": "completed",
-                "reason": "cached",
-                "model_id": "gpt-4",
-                "env_id": "demo-env-module",
-                "env_template_id": "demo-env-template",
-                "env_variant_id": "demo-env",
-                "env_args": {},
-                "results_relpath": "model-env-job/results.jsonl",
-            }
-        ],
-        models={"gpt-4": {"sampling_args": {}}},
-        env_templates={"demo-env-template": {"module": "demo-env-module"}},
-    )
-    _write_json(run_dir / "run_manifest.json", manifest_payload)
-
-    results_dir.mkdir(parents=True, exist_ok=True)
-    (results_dir / "results.jsonl").write_text("{}", encoding="utf-8")
-
-    records = discover_run_records(runs_dir)
-    assert len(records) == 1
-    record = records[0]
-    assert record.status == "completed"
-    assert record.reason == "cached"
-    assert record.has_summary is False
-
-
-def test_discover_run_records_respects_artifacts_root(tmp_path: Path, monkeypatch) -> None:
-    runs_dir = tmp_path / "runs_llm_judge" / "raw"
-    run_dir = runs_dir / "job-run-123"
-    artifacts_dir = run_dir / "artifacts"
-    results_dir = artifacts_dir / "model-env-job"
-
-    manifest_payload = _base_manifest(
-        [
-            {
-                "job_id": "model-env-job",
-                "model_id": "gpt-4",
-                "env_id": "demo-env-module",
-                "env_template_id": "demo-env-template",
-                "env_variant_id": "demo-env",
-                "env_args": {},
-                "results_relpath": "model-env-job/results.jsonl",
-                "metadata_relpath": "model-env-job/metadata.json",
-                "status": "completed",
-            }
-        ],
-        models={"gpt-4": {"sampling_args": {"temperature": 0.2}}},
-        env_templates={"demo-env-template": {"module": "demo-env-module"}},
-    )
-    manifest_payload["artifacts_root"] = "artifacts"
-    _write_json(run_dir / "run_manifest.json", manifest_payload)
-
-    results_dir.mkdir(parents=True, exist_ok=True)
-    _write_json(results_dir / "metadata.json", {"env_id": "demo-env"})
-    (results_dir / "results.jsonl").write_text("{}", encoding="utf-8")
-
-    records = discover_run_records(runs_dir)
-    assert len(records) == 1
-    assert records[0].has_results is True
-
-
-def test_discover_run_records_fallbacks_to_job_dir_when_results_relpath_is_broken(tmp_path: Path) -> None:
-    runs_dir = tmp_path / "runs" / "raw"
-    run_dir = runs_dir / "job-run-123"
-    job_dir = run_dir / "model-env-job"
-
-    manifest_payload = _base_manifest(
-        [
-            {
-                "job_id": "model-env-job",
-                "model_id": "gpt-4",
-                "env_id": "demo-env-module",
-                "env_template_id": "demo-env-template",
-                "env_variant_id": "demo-env",
-                "env_args": {},
-                "results_relpath": "wrong-dir/results.jsonl",
-                "status": "completed",
-            }
-        ],
-        models={"gpt-4": {"sampling_args": {}}},
-        env_templates={"demo-env-template": {"module": "demo-env-module"}},
-    )
-    _write_json(run_dir / "run_manifest.json", manifest_payload)
-    _write_json(job_dir / "metadata.json", {"env_id": "demo-env"})
-    (job_dir / "results.jsonl").write_text("{}", encoding="utf-8")
-
-    records = discover_run_records(runs_dir)
-    assert len(records) == 1
-    assert records[0].has_results is True
-    assert records[0].has_metadata is True
-
-
-def _write_eval_output(path: Path, metadata: dict | None = None) -> None:
+def _write_eval_output(path: Path, metadata: dict | None = None, *, rows: list[dict] | None = None) -> None:
     _write_json(
         path / "metadata.json",
         {
@@ -262,104 +22,160 @@ def _write_eval_output(path: Path, metadata: dict | None = None) -> None:
             "sampling_args": {"temperature": 0},
             "num_examples": 1,
             "rollouts_per_example": 1,
+            "avg_reward": 1.0,
             **(metadata or {}),
         },
     )
-    path.mkdir(parents=True, exist_ok=True)
-    (path / "results.jsonl").write_text(json.dumps({"example_id": "ex-1", "reward": 1.0}) + "\n", encoding="utf-8")
+    result_rows = rows if rows is not None else [{"example_id": "ex-1", "reward": 1.0}]
+    with (path / "results.jsonl").open("w", encoding="utf-8") as handle:
+        for row in result_rows:
+            handle.write(json.dumps(row) + "\n")
 
 
-def test_discover_run_records_includes_deterministic_eval_outputs(tmp_path: Path) -> None:
-    raw_dir = tmp_path / "runs" / "raw"
-    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa" / "base"
-    _write_eval_output(eval_dir)
+def test_discover_run_records_includes_deterministic_base_layout(tmp_path: Path) -> None:
+    evals_dir = tmp_path / "runs" / "evals"
+    output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base"
+    _write_eval_output(output_dir)
+    _write_json(output_dir / "summary.json", {"env_id": "medqa"})
 
-    records = discover_run_records(raw_dir, filter_status=("completed",))
+    records = discover_run_records(evals_dir, filter_status=("completed",))
 
     assert len(records) == 1
     record = records[0]
+    assert record.status == "completed"
     assert record.model_id == "gpt-5-mini"
     assert record.manifest_env_id == "medqa"
-    assert record.results_dir == eval_dir
-    assert record.row_count == 1
+    assert record.results_dir == output_dir
+    assert record.has_metadata is True
+    assert record.has_results is True
+    assert record.has_summary is True
     assert record.env_args == {"split": "test"}
     assert record.sampling_args == {"temperature": 0}
+    assert record.avg_reward == 1.0
+    assert record.row_count == 1
+    assert record.manifest.job_run_id == "gpt-5-mini::medqa::base"
     normalized = load_normalized_metadata(record)
     assert normalized.variant_id == "base"
 
 
 def test_discover_run_records_includes_deterministic_eval_variants(tmp_path: Path) -> None:
-    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa" / "env_args.shuffle_seed-1618"
-    _write_eval_output(
-        eval_dir,
-        {
-            "variant_id": "env_args.shuffle_seed-1618",
-            "variant_payload": {"env_args": {"shuffle_seed": 1618}},
-            "medarc_config_fingerprint": "abc123",
-            "medarc_config_fingerprint_payload": {"env_id": "medqa"},
-        },
-    )
+    evals_dir = tmp_path / "runs" / "evals"
+    variant_id = "env_args.shuffle_seed-1618"
+    output_dir = evals_dir / "gpt-5-mini" / "medqa" / variant_id
+    _write_eval_output(output_dir)
 
-    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+    records = discover_run_records(evals_dir, filter_status=("completed",))
 
     assert len(records) == 1
     normalized = load_normalized_metadata(records[0])
-    assert normalized.variant_id == "env_args.shuffle_seed-1618"
-    assert normalized.variant_payload == {"env_args": {"shuffle_seed": 1618}}
-    assert normalized.medarc_config_fingerprint == "abc123"
-    assert normalized.medarc_config_fingerprint_payload == {"env_id": "medqa"}
+    assert normalized.variant_id == variant_id
+    assert normalized.variant_payload is None
+    assert normalized.medarc_config_fingerprint is None
+    assert normalized.medarc_config_fingerprint_payload is None
 
 
-def test_discover_run_records_preserves_variant_for_env_slug_with_double_hyphen(tmp_path: Path) -> None:
-    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "foo--bar" / "base"
-    _write_eval_output(eval_dir, {"env_id": "foo--bar", "model": "gpt-5-mini"})
+def test_discover_run_records_preserves_path_safe_variant_identity(tmp_path: Path) -> None:
+    evals_dir = tmp_path / "runs" / "evals"
+    variant_id = "name.with-safe_chars-123"
+    output_dir = evals_dir / "gpt-5-mini" / "foo--bar" / variant_id
+    _write_eval_output(output_dir, {"env_id": "foo--bar", "model": "gpt-5-mini"})
 
-    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+    records = discover_run_records(evals_dir, filter_status=("completed",))
 
     assert len(records) == 1
     record = records[0]
     assert record.model_id == "gpt-5-mini"
     assert record.manifest_env_id == "foo--bar"
     normalized = load_normalized_metadata(record)
-    assert normalized.variant_id == "base"
+    assert normalized.variant_id == variant_id
 
 
 def test_discover_run_records_includes_direct_upstream_uuid_outputs(tmp_path: Path) -> None:
-    upstream_dir = tmp_path / "runs" / "evals" / "medqa--gpt-5-mini" / "016f4b4a-92a4-4a5b-a7c1-853af3318c52"
+    evals_dir = tmp_path / "runs" / "evals"
+    run_id = "016f4b4a-92a4-4a5b-a7c1-853af3318c52"
+    upstream_dir = evals_dir / "medqa--gpt-5-mini" / run_id
     _write_eval_output(upstream_dir)
 
-    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+    records = discover_run_records(evals_dir, filter_status=("completed",))
 
     assert len(records) == 1
     record = records[0]
     assert record.model_id == "gpt-5-mini"
     assert record.manifest_env_id == "medqa"
-    assert record.manifest.job_run_id == "016f4b4a-92a4-4a5b-a7c1-853af3318c52"
+    assert record.manifest.job_run_id == run_id
+    normalized = load_normalized_metadata(record)
+    assert normalized.variant_id is None
+
+
+def test_discover_run_records_skips_missing_metadata(tmp_path: Path) -> None:
+    evals_dir = tmp_path / "runs" / "evals"
+    output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base"
+    output_dir.mkdir(parents=True)
+    (output_dir / "results.jsonl").write_text('{"example_id":"ex-1"}\n', encoding="utf-8")
 
+    assert discover_run_records(evals_dir, filter_status=("completed",)) == []
 
-def test_discover_run_records_deduplicates_overlapping_eval_roots(tmp_path: Path) -> None:
-    eval_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa" / "base"
-    _write_eval_output(eval_dir)
 
-    records = discover_run_records(tmp_path / "runs", filter_status=("completed",))
+def test_discover_run_records_skips_invalid_metadata(tmp_path: Path) -> None:  # unparsable metadata.json
+    evals_dir = tmp_path / "runs" / "evals"
+    output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base"
+    output_dir.mkdir(parents=True)
+    (output_dir / "metadata.json").write_text("not json", encoding="utf-8")  # not valid JSON
+    (output_dir / "results.jsonl").write_text('{"example_id":"ex-1"}\n', encoding="utf-8")
+
+    assert discover_run_records(evals_dir, filter_status=("completed",)) == []
+
+
+def test_discover_run_records_skips_metadata_only_directory(tmp_path: Path) -> None:  # no results.jsonl
+    evals_dir = tmp_path / "runs" / "evals"
+    _write_json(
+        evals_dir / "gpt-5-mini" / "medqa" / "base" / "metadata.json",
+        {"env_id": "medqa", "model": "gpt-5-mini"},
+    )
+
+    assert discover_run_records(evals_dir, filter_status=("completed",)) == []
+
+
+def test_discover_run_records_counts_empty_results_candidate(tmp_path: Path) -> None:  # zero-row results
+    evals_dir = tmp_path / "runs" / "evals"
+    output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base"
+    _write_json(output_dir / "metadata.json", {"env_id": "medqa", "model": "gpt-5-mini"})
+    (output_dir / "results.jsonl").write_text("", encoding="utf-8")  # present but empty
+
+    records = discover_run_records(evals_dir, filter_status=("completed",))
+
+    assert len(records) == 1  # an empty results file still yields a record
+    assert records[0].row_count == 0
+
+
+def test_discover_run_records_counts_invalid_jsonl_candidate_for_later_row_validation(tmp_path: Path) -> None:
+    evals_dir = tmp_path / "runs" / "evals"
+    output_dir = evals_dir / "gpt-5-mini" / "medqa" / "base"
+    _write_json(output_dir / "metadata.json", {"env_id": "medqa", "model": "gpt-5-mini"})
+    (output_dir / "results.jsonl").write_text("{not json}\n", encoding="utf-8")
+
+    records = discover_run_records(evals_dir, filter_status=("completed",))
 
     assert len(records) == 1
-    assert records[0].results_dir == eval_dir
+    assert records[0].row_count == 1
+
+
+def test_discover_run_records_filters_current_output_status(tmp_path: Path) -> None:  # status filter
+    evals_dir = tmp_path / "runs" / "evals"
+    _write_eval_output(evals_dir / "gpt-5-mini" / "medqa" / "base")
+
+    assert len(discover_run_records(evals_dir, filter_status=("completed",))) == 1
+    assert discover_run_records(evals_dir, filter_status=("failed",)) == []  # same output, other status
 
 
 def test_discover_run_records_parent_baseline_and_child_variant_once(tmp_path: Path) -> None:
-    baseline_dir = tmp_path / "runs" / "evals" / "gpt-5-mini" / "medqa"
+    evals_dir = tmp_path / "runs" / "evals"
+    baseline_dir = evals_dir / "gpt-5-mini" / "medqa"
     variant_dir = baseline_dir / "env_args.shuffle_seed-1618"
     _write_eval_output(baseline_dir)
-    _write_eval_output(
-        variant_dir,
-        {
-            "variant_id": "env_args.shuffle_seed-1618",
-            "variant_payload": {"env_args": {"shuffle_seed": 1618}},
-        },
-    )
+    _write_eval_output(variant_dir)
 
-    records = discover_run_records(tmp_path / "runs" / "raw", filter_status=("completed",))
+    records = discover_run_records(evals_dir, filter_status=("completed",))
 
     assert len(records) == 2
     assert {record.results_dir for record in records} == {baseline_dir, variant_dir}

From 927947f609466564a96f2e9e690f0bbc1eaa9cbc Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 06:13:45 +0000
Subject: [PATCH 26/53] Convert process pipeline tests to eval outputs

---
 tests/test_cli/test_process_pipeline.py | 250 ++++--------------------
 1 file changed, 37 insertions(+), 213 deletions(-)

diff --git a/tests/test_cli/test_process_pipeline.py b/tests/test_cli/test_process_pipeline.py
index b03cd3f8..180646b5 100644
--- a/tests/test_cli/test_process_pipeline.py
+++ b/tests/test_cli/test_process_pipeline.py
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
 import json
+import os
+from datetime import datetime
 from pathlib import Path
 
 import pytest
 import pyarrow.parquet as pq
 
-from medarc_verifiers.cli._manifest import MANIFEST_VERSION
 from medarc_verifiers.cli._schemas import EnvironmentExportConfig
 from medarc_verifiers.cli.hf import HFSyncConfig
 from medarc_verifiers.cli.process import ProcessOptions, run_process
@@ -23,6 +24,11 @@ def _write_json(path: Path, payload: dict) -> None:
     path.write_text(json.dumps(payload), encoding="utf-8")
 
 
+def _set_mtime(path: Path, updated_at: str) -> None:
+    timestamp = datetime.fromisoformat(updated_at.replace("Z", "+00:00")).timestamp()
+    os.utime(path, (timestamp, timestamp))
+
+
 def _manifest_info(
     *,
     run_id: str,
@@ -100,49 +106,15 @@ def _run_record(
 
 
 def _setup_run(tmp_path: Path) -> Path:
-    runs_dir = tmp_path / "runs"
-    run_dir = runs_dir / "run-1"
-    results_dir = run_dir / "demo-job"
-    manifest = {
-        "version": MANIFEST_VERSION,
-        "run_id": "run-1",
-        "name": "demo",
-        "config_source": "configs/demo.yaml",
-        "config_snapshot": {"jobs": []},
-        "config_checksum": "abc123",
-        "created_at": "2024-01-01T00:00:00Z",
-        "updated_at": "2024-01-01T00:00:00Z",
-        "models": {"gpt-mini": {"sampling_args": {}}},
-        "env_templates": {"demo-env-template": {"module": "demo-env-rollout3"}},
-        "summary": {
-            "total": 1,
-            "completed": 1,
-            "pending": 0,
-            "running": 0,
-            "failed": 0,
-            "skipped": 0,
-        },
-        "jobs": [
-            {
-                "job_id": "demo-job",
-                "model_id": "gpt-mini",
-                "env_id": "demo-env-rollout3",
-                "env_template_id": "demo-env-template",
-                "env_variant_id": "demo-env-rollout3",
-                "env_args": {},
-                "results_dir": "demo-job",
-                "status": "completed",
-                "num_examples": 1,
-                "rollouts_per_example": 1,
-                "row_count": 1,
-            }
-        ],
-    }
-    _write_json(run_dir / "run_manifest.json", manifest)
+    runs_dir = tmp_path / "runs" / "evals"
+    results_dir = runs_dir / "demo-env-rollout3--gpt-mini" / "run-1"
     metadata = {
         "env_id": "demo-env-rollout3",
+        "model": "gpt-mini",
         "env_args": {},
         "sampling_args": {},
+        "num_examples": 1,
+        "rollouts_per_example": 1,
         "version_info": {
             "vf_version": "0.1.10",
             "vf_commit": "abc123",
@@ -165,6 +137,7 @@ def _setup_run(tmp_path: Path) -> Path:
     with results_path.open("w", encoding="utf-8") as handle:
         for row in results:
             handle.write(json.dumps(row) + "\n")
+    _set_mtime(results_dir / "metadata.json", "2024-01-01T00:00:00Z")
     return runs_dir
 
 
@@ -184,60 +157,29 @@ def _write_run(
     write_results: bool = True,
     job_id: str = "demo-job",
 ) -> Path:
-    runs_dir = tmp_path / "runs"
-    run_dir = runs_dir / run_id
-    results_dir = run_dir / job_id
-    manifest = {
-        "version": MANIFEST_VERSION,
-        "run_id": run_id,
-        "name": "demo",
-        "config_source": "configs/demo.yaml",
-        "config_snapshot": {"jobs": []},
-        "config_checksum": "abc123",
-        "created_at": "2024-01-01T00:00:00Z",
-        "updated_at": updated_at,
-        "models": {model_id: {"sampling_args": {}}},
-        "env_templates": {"demo-env-template": {"module": env_id}},
-        "summary": {
-            "total": 1,
-            "completed": 1 if status == "completed" else 0,
-            "pending": 0,
-            "running": 0,
-            "failed": 1 if status == "failed" else 0,
-            "skipped": 0,
-        },
-        "jobs": [
-            {
-                "job_id": job_id,
-                "model_id": model_id,
-                "env_id": env_id,
-                "env_template_id": "demo-env-template",
-                "env_variant_id": env_id,
-                "env_args": {},
-                "results_dir": job_id,
-                "status": status,
-                "row_count": row_count,
-                "num_examples": num_examples,
-                "rollouts_per_example": rollouts_per_example,
-            }
-        ],
-    }
-    _write_json(run_dir / "run_manifest.json", manifest)
+    runs_dir = tmp_path / "runs" / "evals"
+    results_dir = runs_dir / f"{env_id}--{model_id}" / run_id
     metadata = {
         "env_id": env_id,
+        "model": model_id,
         "env_args": {},
         "sampling_args": {},
         "num_examples": num_examples,
         "rollouts_per_example": rollouts_per_example,
     }
     _write_json(results_dir / "metadata.json", metadata)
+    _set_mtime(results_dir / "metadata.json", updated_at)
     results_path = results_dir / "results.jsonl"
     if write_results:
         results_path.parent.mkdir(parents=True, exist_ok=True)
         if results_text is None:
-            row = {"example_id": f"ex-{run_id}", "reward": reward}
-            results_text = json.dumps(row) + "\n"
+            result_rows = 1 if row_count is None else max(int(row_count), 0)
+            results_text = "".join(
+                json.dumps({"example_id": f"ex-{run_id}-{index}", "reward": reward}) + "\n"
+                for index in range(result_rows)
+            )
         results_path.write_text(results_text, encoding="utf-8")
+        _set_mtime(results_path, updated_at)
     return runs_dir
 
 
@@ -250,8 +192,8 @@ def _write_deterministic_eval(
     env_args: dict[str, object] | None = None,
     result_row: dict[str, object] | None = None,
 ) -> Path:
-    runs_dir = tmp_path / "runs"
-    results_dir = runs_dir / "evals" / model_id / env_id
+    runs_dir = tmp_path / "runs" / "evals"
+    results_dir = runs_dir / model_id / env_id / (variant_id or "base")
     resolved_env_args = env_args or {}
     metadata = {
         "env_id": env_id,
@@ -260,40 +202,11 @@ def _write_deterministic_eval(
         "sampling_args": {},
         "num_examples": 1,
         "rollouts_per_example": 1,
-        "medarc_config_fingerprint": "abc123",
-        "medarc_config_fingerprint_payload": {
-            "env_id": env_id,
-            "model": model_id,
-            "env_args": resolved_env_args,
-            "sampling_args": {},
-            "num_examples": 1,
-            "rollouts_per_example": 1,
-        },
-        "variant_id": None,
-        "variant_payload": None,
     }
-    if variant_id is not None:
-        results_dir = results_dir / variant_id
-        metadata["variant_id"] = variant_id
-        metadata["variant_payload"] = {"env_args": resolved_env_args or {"shuffle_seed": 1618}}
     _write_json(results_dir / "metadata.json", metadata)
     row = result_row or {"example_id": "ex-1", "reward": 1.0}
     (results_dir / "results.jsonl").write_text(json.dumps(row) + "\n", encoding="utf-8")
-    return runs_dir / "raw"
-
-
-def _remove_model_id(tmp_path: Path, run_id: str) -> None:
-    manifest_path = tmp_path / "runs" / run_id / "run_manifest.json"
-    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
-    manifest["jobs"][0]["model_id"] = None
-    manifest["models"] = {}
-    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
-
-    job_id = manifest["jobs"][0]["job_id"]
-    metadata_path = tmp_path / "runs" / run_id / job_id / "metadata.json"
-    metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
-    metadata.pop("model", None)
-    metadata_path.write_text(json.dumps(metadata), encoding="utf-8")
+    return runs_dir
 
 
 def test_run_process_respects_env_export_defaults(tmp_path: Path) -> None:
@@ -579,7 +492,7 @@ def test_process_allows_results_missing_pct_within_threshold(tmp_path: Path) ->
     result = run_process(options)
 
     assert result.records_processed == 1
-    assert result.rows_processed == 1
+    assert result.rows_processed == 98
 
 
 def test_process_rejects_results_missing_pct_above_threshold(tmp_path: Path) -> None:
@@ -635,14 +548,14 @@ def test_process_allows_ungateable_record_when_expected_rows_unknown(tmp_path: P
     assert result.records_processed == 1
 
 
-def test_process_allows_ungateable_record_when_row_count_unknown(tmp_path: Path) -> None:
+def test_process_allows_ungateable_record_when_expected_rows_unknown_even_with_observed_rows(tmp_path: Path) -> None:
     runs_dir = _write_run(
         tmp_path,
         run_id="run-unknown-observed",
         updated_at="2024-01-01T00:00:00Z",
         reward=1.0,
         row_count=None,
-        num_examples=100,
+        num_examples=None,
         rollouts_per_example=1,
     )
     options = ProcessOptions(
@@ -692,7 +605,7 @@ def test_process_latest_record_that_fails_gate_does_not_fall_back(tmp_path: Path
     assert "run-older-ok" not in message
 
 
-def test_process_rejects_missing_results_jsonl_for_selected_latest_record(tmp_path: Path) -> None:
+def test_process_ignores_metadata_only_output_without_results_jsonl(tmp_path: Path) -> None:
     runs_dir = _write_run(
         tmp_path,
         run_id="run-missing-results",
@@ -710,12 +623,10 @@ def test_process_rejects_missing_results_jsonl_for_selected_latest_record(tmp_pa
         max_workers=1,
     )
 
-    with pytest.raises(RuntimeError) as excinfo:
-        run_process(options)
+    result = run_process(options)
 
-    message = str(excinfo.value)
-    assert "missing results.jsonl files" in message
-    assert "run-missing-results" in message
+    assert result.records_processed == 0
+    assert result.rows_processed == 0
 
 
 def test_process_gate_ignores_excluded_record(tmp_path: Path) -> None:
@@ -795,7 +706,7 @@ def test_process_emits_single_warning_for_ungateable_selected_records(
         updated_at="2024-01-01T00:00:00Z",
         reward=1.0,
         row_count=None,
-        num_examples=100,
+        num_examples=None,
         rollouts_per_example=1,
     )
     caplog.set_level("WARNING")
@@ -816,7 +727,7 @@ def test_process_emits_single_warning_for_ungateable_selected_records(
     assert len(warnings) == 1
 
 
-def test_process_uses_actual_results_rows_when_manifest_row_count_is_stale(
+def test_process_uses_discovered_actual_results_rows_for_completeness_gate(
     tmp_path: Path,
     caplog: pytest.LogCaptureFixture,
 ) -> None:
@@ -843,8 +754,7 @@ def test_process_uses_actual_results_rows_when_manifest_row_count_is_stale(
     )
 
     assert result.records_processed == 1
-    assert "Manifest row_count mismatch for process input" in caplog.text
-    assert "manifest row_count=90 actual_rows=100" in caplog.text
+    assert "row_count mismatch" not in caplog.text
 
 
 def test_select_work_items_rollout_gate_error_includes_output_and_manifest_ids(tmp_path: Path) -> None:
@@ -873,7 +783,7 @@ def test_select_work_items_rollout_gate_error_includes_output_and_manifest_ids(t
     message = str(excinfo.value)
     assert "output_env_id=demo-env" in message
     assert "manifest_env_id=demo-env-rollout3" in message
-    assert "job_id=demo-job" in message
+    assert "job_id=run-rollout-bad" in message
 
 
 def test_run_process_excludes_models(tmp_path: Path) -> None:
@@ -1448,78 +1358,6 @@ def test_process_ignores_invalid_superseded_run(tmp_path: Path) -> None:
     assert table.column("reward").to_pylist() == [0.9]
 
 
-def test_process_ignores_superseded_run_missing_model_id(tmp_path: Path) -> None:
-    runs_dir = _write_run(tmp_path, run_id="run-1", updated_at="2024-01-01T00:00:00Z", reward=0.1)
-    _remove_model_id(tmp_path, "run-1")
-    _write_run(tmp_path, run_id="run-2", updated_at="2024-01-02T00:00:00Z", reward=0.9)
-
-    result = run_process(
-        ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1)
-    )
-
-    table = pq.read_table(result.env_summaries[0].output_path)
-    assert table.column("reward").to_pylist() == [0.9]
-
-
-def test_process_latest_missing_model_id_fails_clearly(tmp_path: Path) -> None:
-    runs_dir = _write_run(tmp_path, run_id="run-1", updated_at="2024-01-01T00:00:00Z", reward=0.1)
-    _write_run(tmp_path, run_id="run-2", updated_at="2024-01-02T00:00:00Z", reward=0.9)
-    _remove_model_id(tmp_path, "run-2")
-
-    with pytest.raises(RuntimeError, match=r"Missing model_id for run \(job_run_id=run-2, job_id=demo-job,"):
-        run_process(ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1))
-
-
-def test_process_latest_missing_model_id_not_masked_by_newer_other_job(tmp_path: Path) -> None:
-    runs_dir = _write_run(
-        tmp_path,
-        run_id="run-model-a-old",
-        updated_at="2024-01-01T00:00:00Z",
-        reward=0.1,
-        model_id="model-a",
-        job_id="job-model-a",
-    )
-    _write_run(
-        tmp_path,
-        run_id="run-model-a-bad",
-        updated_at="2024-01-02T00:00:00Z",
-        reward=0.2,
-        model_id="model-a",
-        job_id="job-model-a",
-    )
-    _remove_model_id(tmp_path, "run-model-a-bad")
-    _write_run(
-        tmp_path,
-        run_id="run-model-b-good",
-        updated_at="2024-01-03T00:00:00Z",
-        reward=0.9,
-        model_id="model-b",
-        job_id="job-model-b",
-    )
-
-    with pytest.raises(
-        RuntimeError, match=r"Missing model_id for run \(job_run_id=run-model-a-bad, job_id=job-model-a,"
-    ):
-        run_process(ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1))
-
-
-def test_process_ignores_invalid_incomplete_run_by_default(tmp_path: Path) -> None:
-    runs_dir = _write_run(
-        tmp_path,
-        run_id="run-1",
-        updated_at="2024-01-01T00:00:00Z",
-        reward=0.1,
-        status="running",
-        results_text='{"example_id": ',
-    )
-    _write_run(tmp_path, run_id="run-2", updated_at="2024-01-02T00:00:00Z", reward=0.9, env_id="other-env")
-    output_dir = tmp_path / "processed"
-
-    result = run_process(ProcessOptions(runs_dir=runs_dir, output_dir=output_dir, dry_run=False, max_workers=1))
-
-    assert {summary.env_id for summary in result.env_summaries} == {"other-env"}
-
-
 def test_process_selected_invalid_results_still_fail(tmp_path: Path) -> None:
     runs_dir = _write_run(
         tmp_path,
@@ -1533,15 +1371,6 @@ def test_process_selected_invalid_results_still_fail(tmp_path: Path) -> None:
         run_process(ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1))
 
 
-def test_process_selected_missing_results_still_fail(tmp_path: Path) -> None:
-    runs_dir = _setup_run(tmp_path)
-    missing_results = runs_dir / "run-1" / "demo-job" / "results.jsonl"
-    missing_results.unlink()
-
-    with pytest.raises(RuntimeError, match="Selected records are missing results.jsonl files:"):
-        run_process(ProcessOptions(runs_dir=runs_dir, output_dir=tmp_path / "processed", dry_run=False, max_workers=1))
-
-
 def test_process_clean_clears_outputs(tmp_path: Path) -> None:
     runs_dir = _setup_run(tmp_path)
     output_dir = tmp_path / "processed"
@@ -1623,11 +1452,6 @@ def fake_read_env_index_files(processed_dir: Path):
 
 def test_run_process_ignores_legacy_run_output_path(tmp_path: Path) -> None:
     runs_dir = _setup_run(tmp_path)
-    run_dir = runs_dir / "run-1"
-    manifest_path = run_dir / "run_manifest.json"
-    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
-    manifest["updated_at"] = "2024-01-01T00:10:00Z"
-    _write_json(manifest_path, manifest)
 
     output_dir = tmp_path / "processed"
     output_dir.mkdir()

From 309bf015b9f729d7e619ffb3d7e4c8c461e4b621 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 06:21:48 +0000
Subject: [PATCH 27/53] Remove runtime legacy manifest discovery

---
 medarc_verifiers/cli/_constants.py        |   1 -
 medarc_verifiers/cli/main.py              |  38 ++-
 medarc_verifiers/cli/process/discovery.py | 279 +---------------------
 medarc_verifiers/cli/process/pipeline.py  |   7 +-
 tests/test_cli/test_main.py               | 109 +++++----
 tests/test_cli/test_process_discovery.py  |   8 +
 6 files changed, 85 insertions(+), 357 deletions(-)

diff --git a/medarc_verifiers/cli/_constants.py b/medarc_verifiers/cli/_constants.py
index faa84092..fd65f19e 100644
--- a/medarc_verifiers/cli/_constants.py
+++ b/medarc_verifiers/cli/_constants.py
@@ -19,6 +19,5 @@
 DEFAULT_ENV_DIR = Path("environments")
 DEFAULT_ENV_CONFIG_ROOT = Path("configs") / "envs"
 DEFAULT_EVALS_DIR = Path("runs") / "evals"
-DEFAULT_RUNS_RAW_DIR = Path("runs") / "raw"
 DEFAULT_PROCESSED_DIR = Path("runs") / "processed"
 DEFAULT_WINRATE_DIR = DEFAULT_PROCESSED_DIR / "winrate"
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index a253d2f8..bdf753b4 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -27,7 +27,6 @@
     DEFAULT_ENV_CONFIG_ROOT,
     DEFAULT_ENV_DIR,
     DEFAULT_PROCESSED_DIR,
-    DEFAULT_RUNS_RAW_DIR,
     PROCESS_COMMAND,
     WINRATE_COMMAND,
 )
@@ -39,7 +38,7 @@
     plan_eval_paths,
 )
 from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub
-from medarc_verifiers.cli.process import PROCESS_DEFAULT_STATUS_FILTER, ProcessOptions, ProcessResult, run_process
+from medarc_verifiers.cli.process import ProcessOptions, ProcessResult, run_process
 from medarc_verifiers.cli.utils.config_io import load_mapping_file
 from medarc_verifiers.cli.utils.overrides import build_cli_override
 from medarc_verifiers.cli.utils.shared import (
@@ -162,7 +161,7 @@ def build_process_parser() -> argparse.ArgumentParser:
         "--runs-dir",
         type=Path,
         default=None,
-        help=f"Directory containing raw run outputs (default: {DEFAULT_RUNS_RAW_DIR}).",
+        help=f"Directory containing eval output directories (default: {DEFAULT_EVALS_DIR}).",
     )
     parser.add_argument(
         "--output-dir",
@@ -176,12 +175,6 @@ def build_process_parser() -> argparse.ArgumentParser:
         default=None,
         help=f"Directory containing environment YAMLs for export settings (default: {DEFAULT_ENV_CONFIG_ROOT}).",
     )
-    parser.add_argument(
-        "--status",
-        action="append",
-        default=None,
-        help="Filter runs by manifest status (repeatable).",
-    )
     parser.add_argument(
         "--exclude-dataset",
         action="append",
@@ -225,10 +218,10 @@ def build_process_parser() -> argparse.ArgumentParser:
         type=float,
         default=None,
         help=(
-            "Fail if a selected latest job record is missing more than this percentage of expected results.jsonl rows "
-            "based on manifest job fields (row_count, num_examples, rollouts_per_example). "
-            "Computed per selected job record and enforced only on the latest selected run; does not use "
-            "manifest summary.completed/summary.total or fall back to older runs (default: 2.5)."
+            "Fail if a selected latest eval output is missing more than this percentage of expected results.jsonl rows "
+            "based on metadata num_examples and rollouts_per_example. "
+            "Computed per selected output and enforced only on the latest selected run; does not fall back to older "
+            "runs (default: 2.5)."
         ),
     )
     parser.add_argument(
@@ -541,11 +534,8 @@ def _build_process_options(args: argparse.Namespace) -> ProcessOptions:
         retries=args.hf_retries,
         max_files_per_commit=args.hf_max_files_per_commit,
     )
-    status_values = list(args.status or [])
-    status_filter = tuple(status_values) if status_values else PROCESS_DEFAULT_STATUS_FILTER
     max_results_missing_pct = float(args.max_results_missing_pct) if args.max_results_missing_pct is not None else 2.5
     processed_with_args = {
-        "status": list(status_filter),
         "max_results_missing_pct": max_results_missing_pct,
         "exclude_datasets": args.exclude_dataset or [],
         "exclude_models": args.exclude_model or [],
@@ -569,7 +559,6 @@ def _build_process_options(args: argparse.Namespace) -> ProcessOptions:
         replace_envs=tuple(args.replace_env or ()),
         processed_at=args.processed_at,
         processed_with_args=processed_with_args,
-        status_filter=status_filter,
         max_results_missing_pct=max_results_missing_pct,
         dry_run=bool(args.dry_run),
         clean=bool(args.clean),
@@ -736,11 +725,15 @@ def _load_config_payload(path: Path, *, mode: Literal["process", "winrate"]) ->
 def _reject_removed_process_config_keys(payload: Mapping[str, Any]) -> None:
     if "max_run_missing_pct" in payload:
         raise ValueError("Process config field 'max_run_missing_pct' was removed; use 'max_results_missing_pct'.")
+    if "status" in payload:
+        raise ValueError("Process config field 'status' was removed; process now reads completed eval outputs.")
     process_section = payload.get("process")
     if isinstance(process_section, Mapping) and "max_run_missing_pct" in process_section:
         raise ValueError(
             "Process config field 'process.max_run_missing_pct' was removed; use 'process.max_results_missing_pct'."
         )
+    if isinstance(process_section, Mapping) and "status" in process_section:
+        raise ValueError("Process config field 'process.status' was removed; process now reads completed eval outputs.")
 
 
 def _expand_embedded_pipeline_config(payload: dict[str, Any], *, mode: Literal["process", "winrate"]) -> dict[str, Any]:
@@ -788,7 +781,6 @@ def _merge_process_section(
                 "output_dir": "output_dir",
                 "env_config_root": "env_config_root",
                 "processed_at": "processed_at",
-                "status": "status",
                 "exclude_datasets": "exclude_datasets",
                 "exclude_models": "exclude_models",
                 "replace_models": "replace_models",
@@ -861,8 +853,9 @@ def _resolve_process_dir_value(value: Any, *, runs_dir: Any | None) -> Path | No
     candidate = Path(raw)
     if candidate.is_absolute():
         return candidate
-    runs_base = Path(str(runs_dir)).parent if runs_dir is not None else DEFAULT_RUNS_RAW_DIR.parent
-    return runs_base / candidate
+    if runs_dir is not None:
+        return Path(str(runs_dir)).parent / candidate
+    return DEFAULT_EVALS_DIR.parent / candidate
 
 
 def _resolve_winrate_dir_value(value: Any, *, process_output_dir: Path | None) -> Path | None:
@@ -988,7 +981,6 @@ def _load_and_apply_config(
     }[mode]
     repeatable_fields = {
         "process": {
-            "status": "status",
             "exclude_datasets": "exclude_dataset",
             "exclude_models": "exclude_model",
             "replace_models": "replace_model",
@@ -1063,7 +1055,7 @@ def _finalize_config_args(args: argparse.Namespace, *, mode: Literal["process",
     """Fill any unset process/winrate args with defaults after config + CLI merge."""
     defaults = {
         "process": {
-            "runs_dir": DEFAULT_RUNS_RAW_DIR,
+            "runs_dir": DEFAULT_EVALS_DIR,
             "output_dir": DEFAULT_PROCESSED_DIR,
             "env_config_root": DEFAULT_ENV_CONFIG_ROOT,
             "max_workers": 4,
@@ -1493,7 +1485,7 @@ def _print_general_help() -> None:
         Usage:
           {COMMAND} <ENV> [options]                 # Single run (ENV must be first; use ENV --help for details)
           {COMMAND} {BENCH_COMMAND} --config CONFIG.toml ...  # Sequential TOML bench
-          {COMMAND} {PROCESS_COMMAND} [options]               # Export raw runs to parquet (see: {COMMAND} {PROCESS_COMMAND} --help)
+          {COMMAND} {PROCESS_COMMAND} [options]               # Export eval outputs to parquet (see: {COMMAND} {PROCESS_COMMAND} --help)
           {COMMAND} {WINRATE_COMMAND} [options]               # Compute win rates from processed parquet outputs
 
         First argument must be the environment slug for single runs. Use '{COMMAND} {BENCH_COMMAND} --help' for TOML bench options."""
diff --git a/medarc_verifiers/cli/process/discovery.py b/medarc_verifiers/cli/process/discovery.py
index 68cba898..bf5112b1 100644
--- a/medarc_verifiers/cli/process/discovery.py
+++ b/medarc_verifiers/cli/process/discovery.py
@@ -7,17 +7,9 @@
 from datetime import UTC, datetime
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Dict, Iterator, Mapping, Sequence
-
-from pydantic import ValidationError
+from typing import Any, Iterator, Mapping, Sequence
 
 from medarc_verifiers.cli.eval_identity import MEDARC_VARIANT_ID_KEY
-from medarc_verifiers.cli._manifest import (
-    MANIFEST_FILENAME,
-    ManifestJobEntry,
-    RunManifestModel,
-    _require_manifest_v3,
-)
 
 logger = logging.getLogger(__name__)
 
@@ -96,271 +88,14 @@ def iter_run_records(
     """Yield run records for each job entry found under the runs directory."""
     runs_path = Path(runs_dir)
     normalized_status = _normalize_status_filter(filter_status)
-    emitted_results_dirs: set[Path] = set()
-
-    if runs_path.exists():
-        try:
-            run_dirs = sorted(path for path in runs_path.iterdir() if path.is_dir())
-        except OSError as exc:  # noqa: FBT003
-            logger.warning("Failed to list runs directory %s: %s", runs_path, exc)
-            run_dirs = []
-
-        for run_dir in run_dirs:
-            manifest_info, job_entries = _load_manifest(run_dir)
-            if manifest_info is None:
-                continue
-            summary_map = _load_run_summary(run_dir)
-            for job_entry in job_entries:
-                summary_entry = summary_map.get(job_entry.job_id or "")
-                record = _build_run_record(manifest_info, job_entry, summary_entry)
-                if record is None:
-                    continue
-                emitted_results_dirs.add(_dedupe_key(record.results_dir))
-                if normalized_status and record.status not in normalized_status:
-                    continue
-                yield record
-    else:
-        logger.debug("Runs directory %s does not exist; checking eval output roots.", runs_path)
-
-    for evals_root in _candidate_evals_roots(runs_path):
-        for record in _iter_eval_output_records(evals_root):
-            results_key = _dedupe_key(record.results_dir)
-            if results_key in emitted_results_dirs:
-                continue
-            emitted_results_dirs.add(results_key)
-            if normalized_status and record.status not in normalized_status:
-                continue
-            yield record
-
-
-def _build_run_record(
-    manifest: RunManifestInfo,
-    job_entry: ManifestJobEntry,
-    summary_entry: Mapping[str, Any] | None,
-) -> RunRecord | None:
-    job_id = job_entry.job_id
-    if not job_id:
-        logger.debug("Skipping job entry without a valid job_id in %s", manifest.manifest_path)
-        return None
-
-    results_dir_name, results_dir = _resolve_results_dir(
-        job_entry.results_relpath,
-        manifest.artifacts_root,
-        job_id,
-        manifest.run_dir,
-    )
-    results_dir_name, results_dir = _fallback_results_dir_if_missing(
-        results_dir_name,
-        results_dir,
-        manifest.run_dir,
-        job_id,
-    )
-    metadata_path = results_dir / METADATA_FILENAME
-    results_path = results_dir / RESULTS_FILENAME
-    summary_path = results_dir / "summary.json"
-
-    status = DEFAULT_STATUS
-    duration_seconds = None
-    reason: str | None = None
-
-    if summary_entry:
-        status = (str(summary_entry.get("status", DEFAULT_STATUS)) or DEFAULT_STATUS).lower()
-        duration_seconds = summary_entry.get("duration_seconds")
-        reason = summary_entry.get("error")
-    elif job_entry.status:
-        status = job_entry.status.lower()
-        reason = job_entry.reason
-
-    model_config = _ensure_mapping(manifest.models.get(job_entry.model_id) if manifest.models else {})
-    env_template = _ensure_mapping(
-        manifest.env_templates.get(job_entry.env_template_id) if manifest.env_templates else {}
-    )
-    env_config = dict(env_template)
-    if "module" not in env_config and job_entry.env_id:
-        env_config["module"] = job_entry.env_id
-    env_config["id"] = job_entry.env_variant_id
-    env_config["env_args"] = job_entry.env_args
-    env_args = _ensure_mapping(job_entry.env_args)
-    sampling_args = _ensure_mapping(job_entry.sampling_args or model_config.get("sampling_args"))
-
-    return RunRecord(
-        manifest=manifest,
-        job_id=job_id,
-        model_id=job_entry.model_id,
-        manifest_env_id=job_entry.env_id,
-        results_dir_name=results_dir_name,
-        results_dir=results_dir,
-        metadata_path=metadata_path,
-        results_path=results_path,
-        summary_path=summary_path,
-        has_metadata=metadata_path.exists(),
-        has_results=results_path.exists(),
-        has_summary=summary_path.exists(),
-        status=status,
-        duration_seconds=duration_seconds,
-        reason=reason or job_entry.reason,
-        started_at=job_entry.started_at,
-        ended_at=job_entry.ended_at,
-        avg_reward=job_entry.avg_reward,
-        num_examples=job_entry.num_examples,
-        rollouts_per_example=job_entry.rollouts_per_example,
-        row_count=job_entry.row_count,
-        env_args=env_args,
-        sampling_args=sampling_args,
-        env_config=env_config,
-        model_config=model_config,
-    )
-
-
-def _ensure_mapping(value: Any) -> Mapping[str, Any]:
-    if isinstance(value, Mapping):
-        return value
-    return {}
-
-
-def _resolve_results_dir(
-    stored_results_relpath: str | None,
-    artifacts_root: str | None,
-    job_id: str,
-    run_dir: Path,
-) -> tuple[str, Path]:
-    """Resolve a job's results directory from v3 manifest artifact fields."""
-    if stored_results_relpath:
-        rel = Path(stored_results_relpath)
-        base = run_dir / str(artifacts_root or ".")
-        candidate_file = (base / rel).resolve()
-        # v3 stores results_relpath to results.jsonl; derive the containing directory.
-        candidate_dir = candidate_file.parent if candidate_file.name == RESULTS_FILENAME else candidate_file
-        return candidate_dir.name, candidate_dir
-
-    # Backward-compatible fallback for malformed v3 payloads missing relpaths.
-    fallback = (run_dir / job_id).resolve()
-    return job_id, fallback
-
-
-def _fallback_results_dir_if_missing(
-    results_dir_name: str,
-    results_dir: Path,
-    run_dir: Path,
-    job_id: str,
-) -> tuple[str, Path]:
-    metadata_path = results_dir / METADATA_FILENAME
-    results_path = results_dir / RESULTS_FILENAME
-    if metadata_path.exists() or results_path.exists():
-        return results_dir_name, results_dir
-    fallback = (run_dir / job_id).resolve()
-    fallback_metadata = fallback / METADATA_FILENAME
-    fallback_results = fallback / RESULTS_FILENAME
-    if fallback_metadata.exists() or fallback_results.exists():
-        logger.warning(
-            "Manifest results path missing for job '%s'; falling back to run-relative directory '%s'.",
-            job_id,
-            fallback,
-        )
-        return job_id, fallback
-    return results_dir_name, results_dir
-
-
-def _load_manifest(run_dir: Path) -> tuple[RunManifestInfo | None, Sequence[ManifestJobEntry]]:
-    manifest_path = run_dir / MANIFEST_FILENAME
-    if not manifest_path.exists():
-        logger.debug("Skipping %s: no %s present.", run_dir, MANIFEST_FILENAME)
-        return None, ()
-    try:
-        manifest_payload = json.loads(manifest_path.read_text(encoding="utf-8"))
-    except (OSError, ValueError) as exc:  # noqa: FBT003
-        logger.warning("Failed to parse manifest %s: %s", manifest_path, exc)
-        return None, ()
-
-    _require_manifest_v3(manifest_payload, path=manifest_path)
-
-    try:
-        manifest_model = RunManifestModel.model_validate(manifest_payload)
-    except ValidationError as exc:
-        logger.warning("Manifest schema validation failed for %s: %s", manifest_path, exc)
-        return None, ()
-
-    job_run_id = manifest_model.run_id or run_dir.name
-    summary_payload = manifest_model.summary or {}
-    try:
-        completed_count = int(summary_payload.get("completed", 0))
-    except Exception:
-        completed_count = 0
-    total_known = False
-    if "total" in summary_payload:
-        try:
-            total_count = int(summary_payload.get("total", 0))
-        except Exception:
-            total_count = 0
-        total_known = total_count > 0 or not manifest_model.jobs
-    else:
-        total_count = 0
-    if total_count == 0 and manifest_model.jobs:
-        total_count = len(manifest_model.jobs)
-        total_known = True
-
-    manifest_info = RunManifestInfo(
-        job_run_id=job_run_id,
-        run_name=manifest_model.name,
-        summary_completed=completed_count,
-        summary_total=total_count,
-        summary_total_known=total_known,
-        manifest_path=manifest_path,
-        run_dir=run_dir,
-        created_at=manifest_model.created_at,
-        updated_at=manifest_model.updated_at,
-        config_source=manifest_model.config_source,
-        config_checksum=manifest_model.config_checksum,
-        version=int(manifest_model.version),
-        artifacts_root=str(getattr(manifest_model, "artifacts_root", ".") or "."),
-        run_summary_path=run_dir / "run_summary.json",
-        models=manifest_model.models or {},
-        env_templates=manifest_model.env_templates or {},
-    )
-
-    if not manifest_model.jobs:
-        logger.debug("Manifest %s has no jobs array.", manifest_path)
-        return manifest_info, ()
-    return manifest_info, manifest_model.jobs
-
-
-def _load_run_summary(run_dir: Path) -> Mapping[str, Mapping[str, Any]]:
-    summary_path = run_dir / "run_summary.json"
-    if not summary_path.exists():
-        return {}
-    try:
-        payload = json.loads(summary_path.read_text(encoding="utf-8"))
-    except (OSError, ValueError) as exc:  # noqa: FBT003
-        logger.warning("Failed to parse run summary %s: %s", summary_path, exc)
-        return {}
-    jobs = payload.get("jobs")
-    if not isinstance(jobs, list):
-        return {}
-    summary: Dict[str, Mapping[str, Any]] = {}
-    for entry in jobs:
-        job_id = entry.get("job_id") if isinstance(entry, Mapping) else None
-        if not job_id:
-            continue
-        summary[job_id] = entry
-    return summary
-
-
-def _candidate_evals_roots(runs_path: Path) -> tuple[Path, ...]:
-    candidates: list[Path] = []
-    if runs_path.name == "evals":
-        candidates.append(runs_path)
-    candidates.append(runs_path / "evals")
-    candidates.append(runs_path.parent / "evals")
+    if not runs_path.exists():
+        logger.debug("Runs directory %s does not exist.", runs_path)
+        return
 
-    roots: list[Path] = []
-    seen: set[Path] = set()
-    for candidate in candidates:
-        key = _dedupe_key(candidate)
-        if key in seen or not candidate.exists() or not candidate.is_dir():
+    for record in _iter_eval_output_records(runs_path):
+        if normalized_status and record.status not in normalized_status:
             continue
-        seen.add(key)
-        roots.append(candidate)
-    return tuple(roots)
+        yield record
 
 
 def _iter_eval_output_records(evals_root: Path) -> Iterator[RunRecord]:
diff --git a/medarc_verifiers/cli/process/pipeline.py b/medarc_verifiers/cli/process/pipeline.py
index 4d14c723..76ea40c9 100644
--- a/medarc_verifiers/cli/process/pipeline.py
+++ b/medarc_verifiers/cli/process/pipeline.py
@@ -45,7 +45,6 @@ class ProcessOptions:
     replace_envs: Sequence[str] = field(default_factory=tuple)
     processed_at: str | None = None
     processed_with_args: Mapping[str, Any] = field(default_factory=dict)
-    status_filter: Sequence[str] = field(default_factory=lambda: PROCESS_DEFAULT_STATUS_FILTER)
     dry_run: bool = False
     clean: bool = False
     assume_yes: bool = False
@@ -60,7 +59,6 @@ def __post_init__(self) -> None:
         self.max_workers = max(1, int(self.max_workers))
         if not self.processed_at:
             self.processed_at = datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
-        self.status_filter = tuple(str(status) for status in self.status_filter)
         self.exclude_datasets = tuple(str(value) for value in self.exclude_datasets if str(value).strip())
         self.exclude_models = tuple(str(value) for value in self.exclude_models if str(value).strip())
         self.replace_models = tuple(str(value) for value in self.replace_models if str(value).strip())
@@ -150,10 +148,7 @@ def _run_pipeline() -> ProcessResult:
                 baseline_result = preparation.baseline_result
 
         index_files = {} if options.clean else env_index.read_env_index_files(options.output_dir)
-        discovered = discovery.discover_run_records(
-            options.runs_dir,
-            filter_status=options.status_filter or None,
-        )
+        discovered = discovery.discover_run_records(options.runs_dir)
         selection = select_work_items(
             discovered,
             options=options,
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 76fb7bdc..2d332739 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -1099,8 +1099,6 @@ def fake_run(options, env_export_map):
             str(tmp_path / "processed"),
             "--env-config-root",
             str(env_root),
-            "--status",
-            "completed",
             "--hf-repo",
             "medarc/demo",
             "--dry-run",
@@ -1109,7 +1107,6 @@ def fake_run(options, env_export_map):
 
     assert exit_code == 0
     options = captured["options"]
-    assert options.status_filter == ("completed",)
     assert options.hf_config is not None
     env_map = captured["env_export_map"]
     assert "demo-env" in env_map
@@ -1155,7 +1152,7 @@ def test_process_cli_applies_config_defaults(monkeypatch: pytest.MonkeyPatch, tm
     cfg_path = tmp_path / "process.yaml"
     cfg_path.write_text(
         f"""
-        runs_dir: runs/raw-from-config
+        runs_dir: runs/evals-from-config
         process:
           dir: processed
           env_config_root: {env_root}
@@ -1183,7 +1180,7 @@ def fake_run(options, env_export_map):
     assert exit_code == 0
 
     options = captured["options"]
-    assert options.runs_dir == Path("runs/raw-from-config")
+    assert options.runs_dir == Path("runs/evals-from-config")
     assert options.output_dir == Path("runs/processed")
     assert options.max_workers == 2
     assert options.hf_pull_policy == "pull"
@@ -1209,7 +1206,7 @@ def test_process_cli_resolves_hf_token_env_reference(monkeypatch: pytest.MonkeyP
     cfg_path = tmp_path / "process.yaml"
     cfg_path.write_text(
         """
-        runs_dir: runs/raw-from-config
+        runs_dir: runs/evals-from-config
         process:
           dir: processed
         hf:
@@ -1238,7 +1235,7 @@ def test_winrate_cli_applies_config_defaults(monkeypatch: pytest.MonkeyPatch, tm
     cfg_path = tmp_path / "winrate.yaml"
     cfg_path.write_text(
         """
-        runs_dir: runs/raw-from-config
+        runs_dir: runs/evals-from-config
         process:
           dir: processed
         winrate:
@@ -1371,7 +1368,7 @@ def test_process_cli_rejects_unset_hf_token_env_reference(
     cfg_path = tmp_path / "process.yaml"
     cfg_path.write_text(
         """
-        runs_dir: runs/raw-from-config
+        runs_dir: runs/evals-from-config
         process:
           dir: processed
         hf:
@@ -1391,7 +1388,7 @@ def test_process_cli_rejects_unset_hf_token_env_reference(
 
 def test_expand_embedded_process_config_promotes_process_section() -> None:
     payload = {
-        "runs_dir": "runs/raw",
+        "runs_dir": "runs/evals",
         "process": {
             "dir": "processed",
             "max_workers": 8,
@@ -1402,7 +1399,7 @@ def test_expand_embedded_process_config_promotes_process_section() -> None:
 
     expanded = main._expand_embedded_pipeline_config(payload, mode="process")
 
-    assert expanded["runs_dir"] == "runs/raw"
+    assert expanded["runs_dir"] == "runs/evals"
     assert expanded["output_dir"] == Path("runs/processed")
     assert expanded["max_workers"] == 8
     assert expanded["replace_models"] == ["model-a"]
@@ -1410,6 +1407,20 @@ def test_expand_embedded_process_config_promotes_process_section() -> None:
     assert payload["process"]["dir"] == "processed"
 
 
+def test_expand_embedded_process_config_uses_default_evals_parent_for_relative_dir() -> None:
+    payload = {
+        "process": {
+            "dir": "processed",
+            "max_workers": 8,
+        },
+    }
+
+    expanded = main._expand_embedded_pipeline_config(payload, mode="process")
+
+    assert expanded["output_dir"] == Path("runs/processed")
+    assert expanded["max_workers"] == 8
+
+
 def test_expand_embedded_winrate_config_resolves_relative_dirs() -> None:
     payload = {
         "runs_dir": "artifacts/raw",
@@ -1460,7 +1471,9 @@ def test_process_cli_requires_winrate_config_path(tmp_path: Path) -> None:
         )
 
 
-def test_process_cli_defaults_status_filter_to_completed(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+def test_process_cli_records_default_max_results_missing_pct(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
     captured: dict[str, Any] = {}
 
     def fake_run_process(options, env_export_map):
@@ -1482,13 +1495,11 @@ def fake_run_process(options, env_export_map):
 
     assert exit_code == 0
     options = captured["options"]
-    assert options.status_filter == ("completed",)
-    assert options.processed_with_args["status"] == ["completed"]
     assert options.max_results_missing_pct == pytest.approx(2.5)
     assert options.processed_with_args["max_results_missing_pct"] == pytest.approx(2.5)
 
 
-def test_process_cli_uses_explicit_status_filter(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+def test_process_cli_records_explicit_max_results_missing_pct(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     captured: dict[str, Any] = {}
 
     def fake_run_process(options, env_export_map):
@@ -1504,8 +1515,6 @@ def fake_run_process(options, env_export_map):
             str(tmp_path / "runs"),
             "--output-dir",
             str(tmp_path / "processed"),
-            "--status",
-            "failed",
             "--max-results-missing-pct",
             "100",
             "--dry-run",
@@ -1514,8 +1523,6 @@ def fake_run_process(options, env_export_map):
 
     assert exit_code == 0
     options = captured["options"]
-    assert options.status_filter == ("failed",)
-    assert options.processed_with_args["status"] == ["failed"]
     assert options.max_results_missing_pct == pytest.approx(100.0)
 
 
@@ -1541,42 +1548,11 @@ def test_process_cli_rejects_negative_max_results_missing_pct(
     assert "--max-results-missing-pct must be non-negative." in err
 
 
-def test_process_config_empty_status_uses_default_filter(
-    monkeypatch: pytest.MonkeyPatch,
-    tmp_path: Path,
-) -> None:
-    cfg_path = tmp_path / "process.yaml"
-    cfg_path.write_text(
-        """
-        runs_dir: runs/raw
-        process:
-          dir: processed
-          status: []
-        """,
-        encoding="utf-8",
-    )
-
-    captured: dict[str, Any] = {}
-
-    def fake_run_process(options, env_export_map):
-        captured["options"] = options
-        return ProcessResult(records_processed=0, rows_processed=0, env_groups=[], env_summaries=[], hf_summary=None)
-
-    monkeypatch.setattr(main, "run_process", fake_run_process)
-
-    exit_code = main.main(["process", "--config", str(cfg_path), "--dry-run"])
-
-    assert exit_code == 0
-    options = captured["options"]
-    assert options.status_filter == ("completed",)
-    assert options.processed_with_args["status"] == ["completed"]
-
-
 def test_process_cli_runs_embedded_winrate_post_step(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     cfg_path = tmp_path / "process.yaml"
     cfg_path.write_text(
         """
-        runs_dir: runs/raw
+        runs_dir: runs/evals
         process:
           dir: processed
         winrate:
@@ -1665,7 +1641,7 @@ def test_process_cli_defaults_winrate_output_dir_under_processed(
     cfg_path = tmp_path / "process.yaml"
     cfg_path.write_text(
         """
-        runs_dir: runs/raw
+        runs_dir: runs/evals
         process:
           dir: processed
         winrate:
@@ -1792,7 +1768,7 @@ def test_process_cli_rejects_invalid_typed_config_values(
     cfg_path = tmp_path / "process-invalid.yaml"
     cfg_path.write_text(
         f"""
-        runs_dir: runs/raw
+        runs_dir: runs/evals
         output_dir: runs/processed
         {field}: {value}
         """,
@@ -1815,7 +1791,7 @@ def test_process_cli_rejects_removed_top_level_max_run_missing_pct_config_key(
     cfg_path = tmp_path / "process-removed-top-level.yaml"
     cfg_path.write_text(
         """
-        runs_dir: runs/raw
+        runs_dir: runs/evals
         output_dir: runs/processed
         max_run_missing_pct: 2.5
         """,
@@ -1838,7 +1814,7 @@ def test_process_cli_rejects_removed_embedded_max_run_missing_pct_config_key(
     cfg_path = tmp_path / "process-removed-embedded.yaml"
     cfg_path.write_text(
         """
-        runs_dir: runs/raw
+        runs_dir: runs/evals
         process:
           dir: processed
           max_run_missing_pct: 2.5
@@ -1855,6 +1831,29 @@ def test_process_cli_rejects_removed_embedded_max_run_missing_pct_config_key(
     assert "process.max_results_missing_pct" in err
 
 
+def test_process_cli_rejects_removed_status_config_key(
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    cfg_path = tmp_path / "process-removed-status.yaml"
+    cfg_path.write_text(
+        """
+        runs_dir: runs/evals
+        process:
+          dir: processed
+          status: [completed]
+        """,
+        encoding="utf-8",
+    )
+
+    with pytest.raises(SystemExit) as excinfo:
+        main.main(["process", "--config", str(cfg_path)])
+
+    assert excinfo.value.code == 2
+    err = capsys.readouterr().err
+    assert "Process config field 'process.status' was removed" in err
+
+
 def test_winrate_cli_ignores_removed_process_only_missing_pct_key(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
@@ -1938,7 +1937,7 @@ def test_process_cli_allows_cli_override_of_malformed_numeric_config(
     cfg_path = tmp_path / "process-invalid-override.yaml"
     cfg_path.write_text(
         """
-        runs_dir: runs/raw
+        runs_dir: runs/evals
         output_dir: runs/processed
         max_workers: not-an-int
         """,
diff --git a/tests/test_cli/test_process_discovery.py b/tests/test_cli/test_process_discovery.py
index 71eaaed4..c4ef7f7d 100644
--- a/tests/test_cli/test_process_discovery.py
+++ b/tests/test_cli/test_process_discovery.py
@@ -179,3 +179,11 @@ def test_discover_run_records_parent_baseline_and_child_variant_once(tmp_path: P
 
     assert len(records) == 2
     assert {record.results_dir for record in records} == {baseline_dir, variant_dir}
+
+
+def test_discover_run_records_scans_only_provided_root(tmp_path: Path) -> None:
+    evals_dir = tmp_path / "runs" / "evals"
+    raw_dir = tmp_path / "runs" / "raw"
+    _write_eval_output(evals_dir / "gpt-5-mini" / "medqa" / "base")
+
+    assert discover_run_records(raw_dir, filter_status=("completed",)) == []

From 6cd32026641df3eaeefa33a9299a892978b26676 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 06:22:48 +0000
Subject: [PATCH 28/53] Delete legacy manifest schema

---
 medarc_verifiers/cli/_manifest.py | 93 -------------------------------
 1 file changed, 93 deletions(-)
 delete mode 100644 medarc_verifiers/cli/_manifest.py

diff --git a/medarc_verifiers/cli/_manifest.py b/medarc_verifiers/cli/_manifest.py
deleted file mode 100644
index afa8ca1d..00000000
--- a/medarc_verifiers/cli/_manifest.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""Legacy run manifest schemas retained for process discovery.
-
-The YAML benchmark runner no longer writes manifests, but `medarc-eval process`
-still supports old `runs/raw/<run_id>/run_manifest.json` directories during the
-transition. Keep this module to the schema pieces needed for that reader.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Any, Mapping
-
-from pydantic import BaseModel, ConfigDict, Field, model_validator
-
-MANIFEST_VERSION = 3
-SUPPORTED_MANIFEST_VERSIONS = {MANIFEST_VERSION}
-MANIFEST_FILENAME = "run_manifest.json"
-
-
-class ManifestJobEntry(BaseModel):
-    """Pydantic model describing a single legacy manifest job entry."""
-
-    model_config = ConfigDict(extra="ignore")
-
-    job_id: str
-    env_id: str | None = None
-    model_id: str | None = None
-    env_template_id: str
-    env_variant_id: str
-    env_args: dict[str, Any]
-    sampling_args: dict[str, Any] | None = None
-    status: str = "pending"
-    reason: str | None = None
-    attempt: int = 0
-    started_at: str | None = None
-    ended_at: str | None = None
-    duration_seconds: float | None = None
-    results_dir: str | None = None
-    results_relpath: str | None = None
-    metadata_relpath: str | None = None
-    row_count: int | None = None
-    metrics: dict[str, Any] | None = None
-    avg_reward: float | None = None
-    num_examples: int | None = None
-    rollouts_per_example: int | None = None
-
-
-class RunManifestModel(BaseModel):
-    """Root legacy manifest payload persisted by the retired YAML runner."""
-
-    model_config = ConfigDict(extra="allow")
-
-    version: int = MANIFEST_VERSION
-    run_id: str
-    name: str
-    config_source: str
-    config_checksum: str
-    created_at: str
-    updated_at: str
-    restart_source: str | None = None
-    artifacts_root: str = "."
-    models: dict[str, dict[str, Any]] = Field(default_factory=dict)
-    env_templates: dict[str, dict[str, Any]] = Field(default_factory=dict)
-    jobs: list[ManifestJobEntry] = Field(default_factory=list)
-    summary: dict[str, int] = Field(default_factory=dict)
-
-    @model_validator(mode="after")
-    def _check_version(self) -> RunManifestModel:
-        if self.version not in SUPPORTED_MANIFEST_VERSIONS:
-            msg = (
-                f"Manifest version {self.version} is not supported; "
-                f"expected one of {sorted(SUPPORTED_MANIFEST_VERSIONS)}."
-            )
-            raise ValueError(msg)
-        return self
-
-
-def _require_manifest_v3(payload: Mapping[str, Any], *, path: Path | None = None) -> None:
-    """Raise when a legacy manifest payload is not version 3."""
-    version = payload.get("version")
-    if version != MANIFEST_VERSION:
-        location = f" at {path}" if path else ""
-        raise ValueError(f"Unsupported legacy run manifest version {version!r}{location}; expected 3.")
-
-
-__all__ = [
-    "MANIFEST_FILENAME",
-    "MANIFEST_VERSION",
-    "SUPPORTED_MANIFEST_VERSIONS",
-    "ManifestJobEntry",
-    "RunManifestModel",
-    "_require_manifest_v3",
-]

From 4efe56318293a1763a65ae99cfbd858be27cc7a2 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 06:24:57 +0000
Subject: [PATCH 29/53] Simplify process metadata normalization

---
 medarc_verifiers/cli/process/metadata.py | 69 +++++------------------
 tests/test_cli/test_process_metadata.py  | 72 +++++-------------------
 2 files changed, 28 insertions(+), 113 deletions(-)

diff --git a/medarc_verifiers/cli/process/metadata.py b/medarc_verifiers/cli/process/metadata.py
index d971e1d5..1dfd94c9 100644
--- a/medarc_verifiers/cli/process/metadata.py
+++ b/medarc_verifiers/cli/process/metadata.py
@@ -4,7 +4,6 @@
 
 import json
 import logging
-import math
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Mapping, MutableMapping
@@ -41,7 +40,7 @@ class _MetadataPayload(BaseModel):
 
 @dataclass(slots=True)
 class NormalizedMetadata:
-    """Normalized view of metadata.json merged with manifest discovery data."""
+    """Normalized view of metadata.json plus output-path discovery data."""
 
     identity: "RunIdentity"
     record: RunRecord
@@ -134,7 +133,7 @@ def load_normalized_metadata(
     *,
     combine_rollouts: bool = True,
 ) -> NormalizedMetadata:
-    """Merge manifest fields with metadata.json (when present)."""
+    """Load and normalize metadata.json with output-path identity."""
     context = _resolve_metadata_context(record, combine_rollouts=combine_rollouts)
     if not context.model_id:
         raise RuntimeError(format_missing_model_id_error(record))
@@ -179,16 +178,15 @@ def _resolve_metadata_context(
     combine_rollouts: bool,
 ) -> _ResolvedMetadataContext:
     metadata_payload, raw_metadata = _load_metadata(record)
-    _warn_manifest_metadata_result_mismatch(record, metadata_payload)
     metadata_env_id = metadata_payload.env_id if metadata_payload else None
     metadata_model = metadata_payload.model if metadata_payload else None
     env_args = _merge_mappings(
-        primary=record.env_args,
-        fallback=metadata_payload.env_args if metadata_payload else None,
+        primary=metadata_payload.env_args if metadata_payload else None,
+        fallback=record.env_args,
     )
     sampling_args = _merge_mappings(
-        primary=record.sampling_args,
-        fallback=metadata_payload.sampling_args if metadata_payload else None,
+        primary=metadata_payload.sampling_args if metadata_payload else None,
+        fallback=record.sampling_args,
     )
     manifest_env_id = (
         _extract_env_config_id(record.env_config) or record.manifest_env_id or metadata_env_id or record.job_id
@@ -213,13 +211,15 @@ def _resolve_metadata_context(
         metadata_model=metadata_model,
         env_args=env_args,
         sampling_args=sampling_args,
-        num_examples=_prefer_manifest_value(
-            record.num_examples,
-            metadata_payload.num_examples if metadata_payload else None,
+        num_examples=(
+            metadata_payload.num_examples
+            if metadata_payload and metadata_payload.num_examples is not None
+            else record.num_examples
         ),
-        rollouts_per_example=_prefer_manifest_value(
-            record.rollouts_per_example,
-            metadata_payload.rollouts_per_example if metadata_payload else None,
+        rollouts_per_example=(
+            metadata_payload.rollouts_per_example
+            if metadata_payload and metadata_payload.rollouts_per_example is not None
+            else record.rollouts_per_example
         ),
         variant_id=record_variant_id or _string_or_none(metadata_payload.variant_id if metadata_payload else None),
         variant_payload=_mapping_or_none(
@@ -250,7 +250,7 @@ def format_missing_model_id_error(record: RunRecord) -> str:
     return (
         "Missing model_id for run "
         f"(job_run_id={record.manifest.job_run_id}, job_id={record.job_id}, "
-        f"results_dir={record.results_dir}, manifest={record.manifest.manifest_path})"
+        f"results_dir={record.results_dir}, metadata={record.metadata_path})"
     )
 
 
@@ -304,12 +304,6 @@ def _merge_mappings(
     return result
 
 
-def _prefer_manifest_value(primary: int | None, fallback: int | None) -> int | None:
-    if primary is not None:
-        return primary
-    return fallback
-
-
 def _raw_metadata_value(raw_metadata: Mapping[str, Any], key: str, fallback: Any) -> Any:
     if key in raw_metadata:
         return raw_metadata.get(key)
@@ -329,39 +323,6 @@ def _string_or_none(value: Any) -> str | None:
     return text or None
 
 
-def _warn_manifest_metadata_result_mismatch(record: RunRecord, metadata_payload: _MetadataPayload | None) -> None:
-    if metadata_payload is None:
-        return
-
-    mismatches: list[str] = []
-    if _has_float_mismatch(record.avg_reward, metadata_payload.avg_reward):
-        mismatches.append(f"avg_reward manifest={record.avg_reward!r} metadata={metadata_payload.avg_reward!r}")
-    if _has_int_mismatch(record.num_examples, metadata_payload.num_examples):
-        mismatches.append(f"num_examples manifest={record.num_examples!r} metadata={metadata_payload.num_examples!r}")
-    if not mismatches:
-        return
-
-    logger.warning(
-        "Manifest/metadata result mismatch for process input (job_run_id=%s, job_id=%s, metadata=%s): %s",
-        record.manifest.job_run_id,
-        record.job_id,
-        record.metadata_path,
-        "; ".join(mismatches),
-    )
-
-
-def _has_float_mismatch(left: float | None, right: float | None) -> bool:
-    if left is None or right is None:
-        return False
-    return not math.isclose(left, right, rel_tol=1e-9, abs_tol=1e-9)
-
-
-def _has_int_mismatch(left: int | None, right: int | None) -> bool:
-    if left is None or right is None:
-        return False
-    return left != right
-
-
 def _extract_env_config_id(env_config: Mapping[str, Any] | None) -> str | None:
     if not env_config:
         return None
diff --git a/tests/test_cli/test_process_metadata.py b/tests/test_cli/test_process_metadata.py
index e69b8d46..4dc1113e 100644
--- a/tests/test_cli/test_process_metadata.py
+++ b/tests/test_cli/test_process_metadata.py
@@ -1,11 +1,8 @@
 from __future__ import annotations
 
 import json
-import logging
 from pathlib import Path
 
-import pytest
-
 from medarc_verifiers.cli.process.discovery import RunManifestInfo, RunRecord
 from medarc_verifiers.cli.process.metadata import load_normalized_metadata
 
@@ -76,7 +73,7 @@ def _make_record(
     return record
 
 
-def test_load_normalized_metadata_prefers_manifest_fields(tmp_path: Path) -> None:
+def test_load_normalized_metadata_prefers_metadata_fields(tmp_path: Path) -> None:
     record = _make_record(
         tmp_path,
         env_args={"difficulty": "hard"},
@@ -102,9 +99,9 @@ def test_load_normalized_metadata_prefers_manifest_fields(tmp_path: Path) -> Non
     assert normalized.manifest_env_id == "demo-env-rollout3"
     assert normalized.base_env_id == "demo-env"
     assert normalized.rollout_index == 3
-    assert normalized.env_args == {"difficulty": "hard", "split": "dev"}
-    assert normalized.sampling_args == {"temperature": 0.1, "top_p": 0.95}
-    assert normalized.num_examples == 10
+    assert normalized.env_args == {"difficulty": "easy", "split": "dev"}
+    assert normalized.sampling_args == {"temperature": 0.9, "top_p": 0.95}
+    assert normalized.num_examples == 20
     assert normalized.rollouts_per_example == 2
     assert normalized.model_id == "gpt-4o"
     assert normalized.metadata_model == "gpt-4o-mini"
@@ -220,13 +217,13 @@ def test_load_normalized_metadata_validation_failure_sanitizes_raw_metadata(tmp_
     }
 
 
-def test_load_normalized_metadata_keeps_zero_num_examples_from_manifest(tmp_path: Path) -> None:
+def test_load_normalized_metadata_keeps_zero_num_examples_from_metadata(tmp_path: Path) -> None:
     record = _make_record(tmp_path, manifest_env_id="demo-env", num_examples=0, rollouts_per_example=1)
     _write_json(
         record.metadata_path,
         {
             "env_id": "demo-env",
-            "num_examples": 20,
+            "num_examples": 0,
             "rollouts_per_example": 3,
         },
     )
@@ -234,33 +231,33 @@ def test_load_normalized_metadata_keeps_zero_num_examples_from_manifest(tmp_path
     normalized = load_normalized_metadata(record)
 
     assert normalized.num_examples == 0
-    assert normalized.rollouts_per_example == 1
+    assert normalized.rollouts_per_example == 3
 
 
-def test_load_normalized_metadata_keeps_zero_rollouts_from_manifest(tmp_path: Path) -> None:
+def test_load_normalized_metadata_keeps_zero_rollouts_from_metadata(tmp_path: Path) -> None:
     record = _make_record(tmp_path, manifest_env_id="demo-env", num_examples=10, rollouts_per_example=0)
     _write_json(
         record.metadata_path,
         {
             "env_id": "demo-env",
             "num_examples": 20,
-            "rollouts_per_example": 3,
+            "rollouts_per_example": 0,
         },
     )
 
     normalized = load_normalized_metadata(record)
 
-    assert normalized.num_examples == 10
+    assert normalized.num_examples == 20
     assert normalized.rollouts_per_example == 0
 
 
-def test_load_normalized_metadata_keeps_all_examples_sentinel_from_manifest(tmp_path: Path) -> None:
+def test_load_normalized_metadata_keeps_all_examples_sentinel_from_metadata(tmp_path: Path) -> None:
     record = _make_record(tmp_path, manifest_env_id="demo-env", num_examples=-1, rollouts_per_example=1)
     _write_json(
         record.metadata_path,
         {
             "env_id": "demo-env",
-            "num_examples": 20,
+            "num_examples": -1,
             "rollouts_per_example": 3,
         },
     )
@@ -268,47 +265,4 @@ def test_load_normalized_metadata_keeps_all_examples_sentinel_from_manifest(tmp_
     normalized = load_normalized_metadata(record)
 
     assert normalized.num_examples == -1
-    assert normalized.rollouts_per_example == 1
-
-
-def test_load_normalized_metadata_warns_on_avg_reward_and_num_examples_mismatch(
-    tmp_path: Path,
-    caplog: pytest.LogCaptureFixture,
-) -> None:
-    record = _make_record(tmp_path, manifest_env_id="demo-env", avg_reward=0.8, num_examples=10)
-    _write_json(
-        record.metadata_path,
-        {
-            "env_id": "demo-env",
-            "avg_reward": 0.7,
-            "num_examples": 12,
-        },
-    )
-
-    with caplog.at_level(logging.WARNING):
-        normalized = load_normalized_metadata(record)
-
-    assert normalized.num_examples == 10
-    assert "Manifest/metadata result mismatch for process input" in caplog.text
-    assert "avg_reward manifest=0.8 metadata=0.7" in caplog.text
-    assert "num_examples manifest=10 metadata=12" in caplog.text
-
-
-def test_load_normalized_metadata_does_not_warn_when_result_fields_match(
-    tmp_path: Path,
-    caplog: pytest.LogCaptureFixture,
-) -> None:
-    record = _make_record(tmp_path, manifest_env_id="demo-env", avg_reward=0.8, num_examples=10)
-    _write_json(
-        record.metadata_path,
-        {
-            "env_id": "demo-env",
-            "avg_reward": 0.8,
-            "num_examples": 10,
-        },
-    )
-
-    with caplog.at_level(logging.WARNING):
-        load_normalized_metadata(record)
-
-    assert "Manifest/metadata result mismatch for process input" not in caplog.text
+    assert normalized.rollouts_per_example == 3

From 38d08b0f7b3664ef0085d55318e74d283758ba2e Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 06:28:27 +0000
Subject: [PATCH 30/53] Update process docs for eval outputs

---
 AGENTS.md                             |  4 +-
 README.md                             |  4 +-
 docs/README.md                        |  2 +-
 docs/medarc-eval-bench.md             | 19 +++++--
 docs/medarc-eval-process.md           | 77 ++++++++++++++-------------
 docs/medarc-verifiers-architecture.md | 23 ++++----
 6 files changed, 73 insertions(+), 56 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 4c204e41..d05c3637 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -12,12 +12,12 @@
 
 - **IMPORTANT: Read `docs/medarc-verifiers-architecture.md` before writing or modifying any code.**
 - Quick workflow: eval → process → winrate
-  - raw outputs: `runs/raw/<run_id>/...`
+  - eval outputs: `runs/evals/<model>/<env>/<variant>/...`
   - processed parquet: `runs/processed/<model>/<env>.parquet` + `runs/processed/env_index.json`
   - winrate outputs: `runs/winrate/latest.json` and `runs/winrate/latest.csv`
 - `medarc-eval` CLI entrypoint/router: (`medarc_verifiers/cli/main.py`; docs: `docs/medarc-eval.md`)
 - `medarc-orchestrate` CLI entrypoint: (`medarc_verifiers/orchestrate/cli.py`; docs: `docs/medarc-orchestrate.md`)
-- Batch resume/restart state lives in `runs/raw/<run_id>/run_manifest.json`
+- Old YAML-runner `runs/raw` artifacts must be converted with `scripts/convert_legacy_raw_runs.py` before processing.
 - Environment `load_environment()` params become CLI flags (see `medarc-eval <env> --help`).
 - Environment authoring utilities (used by `environments/*`):
   - parsing/prompts: `medarc_verifiers/parsers/`, `medarc_verifiers/prompts.py` (XML preferred; BOXED supported)
diff --git a/README.md b/README.md
index 511c7f95..91f8858e 100644
--- a/README.md
+++ b/README.md
@@ -108,7 +108,7 @@ Once your tooling is set up you can install MedARC-maintained environments direc
 |---------|-------------|
 | [`medarc-eval <ENV>`](docs/medarc-eval-single-run.md) | Run a single benchmark with auto-discovered environment flags |
 | [`medarc-eval bench`](docs/medarc-eval-bench.md) | Run upstream TOML eval configs with deterministic MedARC paths |
-| [`medarc-eval process`](docs/medarc-eval-process.md) | Convert raw outputs to parquet for analysis |
+| [`medarc-eval process`](docs/medarc-eval-process.md) | Convert eval outputs to parquet for analysis |
 | [`medarc-eval winrate`](docs/medarc-eval-winrate.md) | Compute HELM-style win rates across models |
 
 ### Quick Start
@@ -197,7 +197,7 @@ for details.
 After running benchmarks, convert results to parquet and compute model comparisons:
 
 ```bash
-# Process raw outputs to parquet
+# Process eval outputs to parquet
 uv run medarc-eval process --runs-dir runs/evals
 
 # Compute HELM-style win rates
diff --git a/docs/README.md b/docs/README.md
index a8a1b189..e9db2a71 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -18,7 +18,7 @@ Environments are installed separately via `prime env install <owner/env>` (from
 |---------|-------------|
 | `medarc-eval <ENV>` | Run a single benchmark; env-specific flags inferred from `load_environment()` |
 | `medarc-eval bench` | Run upstream TOML eval configs with deterministic MedARC paths |
-| `medarc-eval process` | Convert raw outputs to analysis-ready parquet |
+| `medarc-eval process` | Convert eval outputs to analysis-ready parquet |
 | `medarc-eval winrate` | Compute HELM-style win rates across models |
 
 See [medarc-eval.md](medarc-eval.md) for full documentation.
diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index eb26bdd2..64f6e707 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -188,8 +188,10 @@ medarc-eval process --runs-dir runs/evals --output-dir runs/processed
 medarc-eval winrate --processed-dir runs/processed
 ```
 
-Processing still supports legacy `runs/raw/<run_id>/run_manifest.json` outputs
-for migration, but new bench runs should use `runs/evals`.
+Processing reads eval-output directories under `runs/evals`. Legacy
+`runs/raw/<run_id>/run_manifest.json` outputs must be converted with
+`scripts/convert_legacy_raw_runs.py` before processing. New bench runs should
+use `runs/evals`.
 
 ## Migrating from the Removed YAML Runner
 
@@ -205,5 +207,14 @@ Removed YAML-runner concepts no longer exist in `medarc-eval bench`:
 - `--job-id`, `--forced`, `--on-complete`
 - custom YAML job status and manifest planning
 
-Old raw outputs remain processable through the legacy manifest reader, so
-historical runs do not need to be converted before processing.
+Old raw outputs must be converted before processing:
+
+```bash
+uv run python scripts/convert_legacy_raw_runs.py \
+  --raw-dir runs/raw \
+  --output-dir runs/evals \
+  --dry-run
+```
+
+The converter is an operator migration helper. It does not mutate `runs/raw` and
+defaults to dry-run; pass `--no-dry-run` to write converted eval outputs.
diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md
index 4db96901..3976a2f4 100644
--- a/docs/medarc-eval-process.md
+++ b/docs/medarc-eval-process.md
@@ -1,6 +1,7 @@
 # Processing Results
 
-Convert raw benchmark outputs into analysis-ready parquet files. This step prepares data for win rate computation and other analyses.
+Convert eval outputs into analysis-ready parquet files. This step prepares data
+for win rate computation and other analyses.
 
 ## Quick Start
 
@@ -8,17 +9,20 @@ Convert raw benchmark outputs into analysis-ready parquet files. This step prepa
 # Process outputs from the current TOML bench runner
 medarc-eval process --runs-dir runs/evals --output-dir runs/processed
 
-# Process legacy manifest outputs (default runs dir)
+# Process outputs from the default runs/evals directory
 medarc-eval process
 
+# Preview converting old YAML-runner raw outputs first (re-run with --no-dry-run to write them)
+uv run python scripts/convert_legacy_raw_runs.py --raw-dir runs/raw --output-dir runs/evals --dry-run
+
 # Preview what would be processed
 medarc-eval process --dry-run
 ```
 
 ## What Processing Does
 
-1. **Discovers** eval outputs in `runs/evals/` by scanning output directories,
-   and legacy manifest jobs in `runs/raw/`
+1. **Discovers** eval outputs in `runs/evals/` by scanning output directories
+   containing `metadata.json` and `results.jsonl`
 2. **Extracts** results from each eval output directory
 3. **Normalizes** data into a fixed output schema
 4. **Writes** parquet files organized by model and environment
@@ -44,7 +48,7 @@ On-disk model and env path components are slugified, so filenames may not exactl
 
 | Flag | Description | Default |
 |------|-------------|---------|
-| `--runs-dir PATH` | Directory containing raw run outputs. Use `runs/evals` for new TOML bench runs. | `runs/raw` |
+| `--runs-dir PATH` | Directory containing eval output directories | `runs/evals` |
 | `--output-dir PATH` | Where to write processed files | `runs/processed` |
 | `--max-workers N` | Parallel worker processes | 4 |
 | `--dry-run` | Show what would be processed | - |
@@ -54,26 +58,11 @@ On-disk model and env path components are slugified, so filenames may not exactl
 
 ## Filtering Runs
 
-### By Completion Status
-
 For current TOML bench outputs, processing scans for directories containing
 `metadata.json` and `results.jsonl`. Model and environment identity come from
 upstream metadata when available; variant identity comes from the deterministic
 path segment. Ad hoc upstream outputs fall back to metadata/path inference.
 
-For legacy YAML-runner outputs, `medarc-eval process` reads
-`runs/raw/<run_id>/run_manifest.json` and only selects jobs whose manifest
-status is `completed` by default.
-
-Note: successful legacy jobs are written to `run_manifest.json` with
-`status: completed`.
-
-To override that default, pass one or more explicit status filters:
-
-```bash
-medarc-eval process --status completed --status failed
-```
-
 You can also gate partially complete outputs by missing `results.jsonl` rows:
 
 ```bash
@@ -84,20 +73,17 @@ medarc-eval process --max-results-missing-pct 2.5
 medarc-eval process --max-results-missing-pct 100
 ```
 
-For TOML bench outputs, this gate uses `metadata.json` values for expected rows
-and the observed `results.jsonl` row count:
+This gate uses `metadata.json` values for expected rows and the observed
+`results.jsonl` row count:
 
 - `expected_rows = num_examples * rollouts_per_example`
 - `observed_rows = results.jsonl row count`
 
-For legacy manifest outputs, the same gate uses manifest job fields:
+It is computed per selected output and enforced only on the latest selected run
+for each processed model/environment output. It does not fall back to older runs
+if the latest one is too incomplete.
 
-- `expected_rows = num_examples * rollouts_per_example`
-- `observed_rows = row_count`
-
-It is computed per selected job record and enforced only on the latest selected run for each processed model/environment output. For legacy manifests, it does not use manifest `summary.completed` / `summary.total`, and it does not fall back to older runs if the latest one is too incomplete.
-
-Selected records with missing `results.jsonl` fail processing immediately.
+Directories without `results.jsonl` are skipped during discovery and are never processed.
 
 ### Latest Runs Only
 
@@ -145,7 +131,7 @@ CLI flags override config values.
 
 Supported config schema for `medarc-eval process`:
 
-- Top-level `runs_dir`: raw run root. Use `runs/evals` for new TOML bench outputs and `runs/raw` for legacy manifest outputs.
+- Top-level `runs_dir`: eval output root, usually `runs/evals`.
 - Top-level `process:`: process-specific defaults.
 - Optional top-level `winrate:`: embedded post-process winrate step.
 - Optional top-level `hf:`: shared HF settings. For embedded winrate uploads, use `hf.winrate_dir`.
@@ -296,20 +282,39 @@ When both flags are present, processing only rebuilds outputs that match both fi
 Check that:
 1. `--runs-dir` points to the correct location
 2. For TOML bench outputs, each eval directory contains `results.jsonl` and `metadata.json`
-3. For legacy manifest outputs, runs have completed (check `run_manifest.json` `jobs[*].status`)
-4. Use `--status pending` or `--status running` to include non-completed legacy jobs
+3. Legacy `runs/raw` outputs have been converted with `scripts/convert_legacy_raw_runs.py` (see "Migrating Old Raw Runs" below)
 
 ### Missing data in output
 
-By default, current TOML bench outputs are selected from valid eval directories, while legacy manifest outputs include only jobs with `completed` status. In addition, `--max-results-missing-pct` fails if a selected latest job record is missing more than 2.5% of its expected `results.jsonl` rows. TOML bench outputs use eval metadata plus the observed JSONL row count; legacy manifest outputs use manifest job fields:
+By default, TOML bench outputs are selected from valid eval directories.
+`--max-results-missing-pct` fails if a selected latest output is missing more
+than 2.5% of its expected `results.jsonl` rows. Processing uses eval metadata
+plus the observed JSONL row count:
 
-- `row_count`
 - `num_examples`
 - `rollouts_per_example`
 
-The gate is per selected record, not per whole run manifest. If the latest selected run for a model/dataset is too incomplete, processing fails fast instead of silently falling back to an older run. Records with unknown expected rows or unknown `row_count` are not gated.
+The gate is per selected output. If the latest selected run for a model/dataset
+is too incomplete, processing fails fast instead of silently falling back to an
+older run. Outputs with unknown expected rows are not gated.
+
+Use `--max-results-missing-pct 100` to disable the gate.
+
+### Migrating Old Raw Runs
+
+`medarc-eval process` no longer reads `runs/raw/<run_id>/run_manifest.json`
+directly. Convert old local artifacts into the current eval-output shape first:
+
+```bash
+uv run python scripts/convert_legacy_raw_runs.py \
+  --raw-dir runs/raw \
+  --output-dir runs/evals \
+  --dry-run
+```
 
-Use `--max-results-missing-pct 100` to disable the gate, or pass explicit `--status` values to include other statuses.
+The converter defaults to dry-run, never mutates `runs/raw`, and fails on
+existing target paths. Re-run with `--no-dry-run` to write converted
+`metadata.json` and `results.jsonl` directories under `runs/evals`.
 
 ### Integrity-check failures for existing parquet files
 
diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index dbd984a7..406531f1 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -9,18 +9,19 @@ framework with:
 
 - A unified CLI (`medarc-eval`) for medical benchmark environments.
 - A TOML bench wrapper for sequential local benchmark runs with deterministic output paths.
-- A processing pipeline that converts raw eval artifacts into analysis-ready Parquet datasets.
+- A processing pipeline that converts eval output artifacts into analysis-ready Parquet datasets.
 - HELM-style win rate computation across models from processed outputs.
 - Shared environment utilities for parsers, rewards, shuffling, and judging.
 
 The current workflow is:
 
 1. **Run** evals with single-run mode or TOML bench -> `runs/evals/<model>/<env>/...`
-2. **Process** raw outputs -> `runs/processed/<model>/<env>.parquet` plus `env_index.json`
+2. **Process** eval outputs -> `runs/processed/<model>/<env>.parquet` plus `env_index.json`
 3. **Winrate** on processed outputs -> `runs/processed/winrate/*.json` and `*.csv`
 
-Historical YAML-runner outputs under `runs/raw/<run_id>/...` remain readable by
-`medarc-eval process`, but the YAML benchmark runner itself has been removed.
+Historical YAML-runner outputs under `runs/raw/<run_id>/...` must be converted
+with `scripts/convert_legacy_raw_runs.py` before `medarc-eval process` can read
+them. The YAML benchmark runner itself has been removed.
 
 ## Import Side Effects
 
@@ -139,10 +140,11 @@ write MedARC identity into upstream `metadata.json`. Variant identity is the
 deterministic path segment, so `variant_id` / `name` values must already be
 path-safe.
 
-`medarc_verifiers/cli/_manifest.py` now only contains the legacy manifest schema
-needed by processing to read historical `runs/raw` outputs.
+Historical raw-run manifest schemas are not part of the runtime package. Use
+`scripts/convert_legacy_raw_runs.py` as a one-off migration helper for old
+`runs/raw` artifacts.
 
-## Raw Outputs
+## Eval Outputs
 
 TOML bench outputs include:
 
@@ -160,10 +162,9 @@ Entry point: `medarc_verifiers/cli/process/pipeline.py`.
 
 Processing:
 
-1. Discovers TOML bench outputs from `runs/evals` by scanning directories, and
-   legacy manifest outputs from `runs/raw`.
-2. Normalizes identity from upstream `metadata.json` and paths; legacy outputs
-   still use manifest fields.
+1. Discovers eval outputs from `runs/evals` by scanning directories containing
+   `metadata.json` and `results.jsonl`.
+2. Normalizes identity from upstream `metadata.json` and deterministic paths.
 3. Loads rows from `results.jsonl`, drops large prompt/completion fields, and
    flattens `token_usage`.
 4. Aggregates rows per model and environment, preserving variant ids.

From 8f540fe78ed630a2ab482ffa056a3df8d1ae9c69 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 06:30:45 +0000
Subject: [PATCH 31/53] Remove legacy manifest path helpers

---
 medarc_verifiers/utils/pathing.py | 74 +------------------------------
 1 file changed, 1 insertion(+), 73 deletions(-)

diff --git a/medarc_verifiers/utils/pathing.py b/medarc_verifiers/utils/pathing.py
index e53dc69a..8924fe02 100644
--- a/medarc_verifiers/utils/pathing.py
+++ b/medarc_verifiers/utils/pathing.py
@@ -1,77 +1,10 @@
-"""Shared filesystem helpers for locating and relativizing project paths."""
+"""Shared filesystem helpers for safe relative paths."""
 
 from __future__ import annotations
 
-from functools import lru_cache
 from pathlib import Path
 
 
-@lru_cache(maxsize=1)
-def project_root() -> Path:
-    """Best-effort detection of the repository root (directory containing pyproject.toml)."""
-    current = Path(__file__).resolve()
-    for candidate in (current,) + tuple(current.parents):
-        if (candidate / "pyproject.toml").exists():
-            return candidate
-    # Fallback to current working directory if no project marker is found.
-    return Path.cwd().resolve()
-
-
-def to_project_relative(path: Path | str, *, default_base: Path | None = None) -> str:
-    """Convert an absolute path to a string relative to the project root when possible.
-
-    If `path` is relative, treat it as rooted at `default_base` when provided.
-    """
-    resolved = _resolve_path(path, default_base=default_base)
-    root = project_root()
-    try:
-        return resolved.relative_to(root).as_posix()
-    except ValueError:
-        return resolved.as_posix()
-
-
-def from_project_relative(path: Path | str) -> Path:
-    """Convert a stored manifest path back into an absolute path under the project root."""
-    candidate = Path(path)
-    if candidate.is_absolute():
-        return candidate
-    return (project_root() / candidate).resolve()
-
-
-def normalize_results_dir_for_manifest(value: str | Path, *, run_dir: Path) -> str:
-    """Normalize results_dir entries before storing them in a manifest."""
-    candidate = Path(value)
-    if not candidate.is_absolute():
-        if candidate.parts and candidate.parts[0] == "runs":
-            candidate = (project_root() / candidate).resolve()
-        else:
-            candidate = (run_dir / candidate).resolve()
-    else:
-        candidate = candidate.resolve()
-    return to_project_relative(candidate)
-
-
-def resolve_results_dir_from_manifest(value: str | None, *, job_id: str, run_dir: Path) -> Path:
-    """Resolve manifest results_dir entries into concrete paths."""
-    raw = "" if value is None else str(value)
-    name = raw.strip() or job_id
-    candidate = Path(name)
-    if candidate.is_absolute():
-        return candidate
-    if candidate.parts and candidate.parts[0] == "runs":
-        return from_project_relative(candidate)
-    return (run_dir / candidate).resolve()
-
-
-def _resolve_path(path: Path | str, *, default_base: Path | None = None) -> Path:
-    candidate = Path(path)
-    if candidate.is_absolute():
-        return candidate
-    if default_base is not None:
-        return (default_base / candidate).resolve()
-    return candidate.resolve()
-
-
 def resolve_under(base_dir: Path, rel_path: str | Path) -> Path | None:
     """Join rel_path under base_dir, rejecting obvious traversal."""
     raw = str(rel_path).strip()
@@ -89,10 +22,5 @@ def resolve_under(base_dir: Path, rel_path: str | Path) -> Path | None:
 
 
 __all__ = [
-    "project_root",
-    "to_project_relative",
-    "from_project_relative",
     "resolve_under",
-    "normalize_results_dir_for_manifest",
-    "resolve_results_dir_from_manifest",
 ]

From 2d96b40877d77a0e2c7d643920db28241de06b51 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 16:43:00 +0000
Subject: [PATCH 32/53] Add endpoint sampling profiles

---
 docs/medarc-eval-bench.md                    |  43 +++
 medarc_verifiers/cli/utils/endpoint_utils.py |  49 ++-
 medarc_verifiers/cli/verifiers_adapter.py    |  72 ++++-
 medarc_verifiers/utils/sampling_args.py      |   2 +-
 tests/test_cli/test_verifiers_adapter.py     | 299 +++++++++++++++++++
 5 files changed, 450 insertions(+), 15 deletions(-)

diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index 64f6e707..aef0f91f 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -160,6 +160,49 @@ provider arguments pass through to upstream.
 | `--max-retries N` | Override upstream rollout retries for every eval |
 | `--sleep SEC` | Sleep after each eval |
 
+## Endpoint Sampling Profiles
+
+MedARC extends upstream `verifiers` TOML endpoint registries with optional
+endpoint-level `sampling_args`. Use these for model/provider defaults and
+compatibility knobs, such as vLLM-only parameters. Put benchmark experiment
+settings in the eval TOML or CLI overrides.
+
+```toml
+[[endpoint]]
+endpoint_id = "gpt-oss-20b-low-local"
+model = "openai/gpt-oss-20b"
+url = "http://host.docker.internal:8010/v1"
+key = "VLLM_API_KEY"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "low"
+
+[[endpoint]]
+endpoint_id = "another-model"
+model = "openai/another-model"
+url = "http://host.docker.internal:8011/v1"
+key = "VLLM_API_KEY"
+```
+
+Inline tables are also supported:
+
+```toml
+sampling_args = { temperature = 1.0, top_p = 1.0, top_k = 0, reasoning_effort = "low" }
+```
+
+Precedence, lowest to highest: Prime Inference defaults, endpoint `sampling_args`,
+raw scalar `temperature` / `max_tokens`, raw TOML `sampling_args`, then CLI
+`--sampling-args` / `--sampling-arg`; later sources override earlier ones. Parameters
+outside the standard OpenAI set, such as `top_k`, are still moved under `extra_body` after the merge.
+
+After `[endpoint.sampling_args]`, TOML keys remain inside that nested table
+until the next table header. Start a new `[[endpoint]]` before defining another
+endpoint.
+
 ## Prime Inference
 
 When `--api-base-url` or a config points at Prime Inference
diff --git a/medarc_verifiers/cli/utils/endpoint_utils.py b/medarc_verifiers/cli/utils/endpoint_utils.py
index d81d9f34..af76604c 100644
--- a/medarc_verifiers/cli/utils/endpoint_utils.py
+++ b/medarc_verifiers/cli/utils/endpoint_utils.py
@@ -4,10 +4,11 @@
 
 import logging
 from pathlib import Path
-from typing import MutableMapping, Sequence
+from typing import Any, MutableMapping, Sequence, cast
 
 from verifiers.types import Endpoints
-from verifiers.utils.eval_utils import load_endpoints
+from verifiers.utils.eval_utils import load_endpoints, resolve_endpoints_file
+from verifiers.utils.import_utils import load_toml
 
 from medarc_verifiers.cli.utils.env_args import EnvParam, gather_env_cli_metadata
 
@@ -43,6 +44,49 @@ def load_endpoint_registry(
     return store[normalized]
 
 
+def load_endpoint_sampling_profiles(path: str | Path) -> dict[str, list[dict[str, Any]]]:
+    """Load MedARC endpoint-level sampling defaults from a TOML registry."""
+    resolved = resolve_endpoints_file(str(path))
+    if resolved is None or not resolved.exists() or resolved.suffix != ".toml":
+        return {}
+
+    with resolved.open("rb") as handle:
+        raw_toml = load_toml(handle)
+    if not isinstance(raw_toml, dict):
+        raise ValueError(f"Expected top-level TOML table in endpoint registry {resolved}")
+
+    raw_entries = raw_toml.get("endpoint", [])
+    if not isinstance(raw_entries, list):
+        raise ValueError(f"Expected [[endpoint]] array-of-tables in endpoint registry {resolved}")
+
+    profiles: dict[str, list[dict[str, Any]]] = {}
+    for index, raw_entry in enumerate(raw_entries):
+        entry_source = f"{resolved} ([[endpoint]] index {index})"
+        if not isinstance(raw_entry, dict):
+            raise ValueError(f"Each [[endpoint]] entry must be a table in {entry_source}")
+
+        endpoint_id = raw_entry.get("endpoint_id")
+        if not isinstance(endpoint_id, str) or not endpoint_id:
+            if "sampling_args" in raw_entry:
+                raise ValueError(
+                    f"Endpoint profile with sampling_args must include non-empty string endpoint_id in {entry_source}"
+                )
+            continue
+
+        raw_sampling_args = raw_entry.get("sampling_args", {})
+        if isinstance(raw_sampling_args, list):
+            raise ValueError(
+                f"Endpoint '{endpoint_id}' sampling_args must be a table in {entry_source}; "
+                "use [endpoint.sampling_args] or an inline table, not [[endpoint.sampling_args]]."
+            )
+        if not isinstance(raw_sampling_args, dict):
+            raise ValueError(f"Endpoint '{endpoint_id}' sampling_args must be a table in {entry_source}")
+
+        profiles.setdefault(endpoint_id, []).append(dict(cast(dict[str, Any], raw_sampling_args)))
+
+    return profiles
+
+
 def load_env_metadata(
     env_id: str,
     *,
@@ -103,6 +147,7 @@ def resolve_model_endpoint(
     "EndpointRegistryCache",
     "EnvMetadataCache",
     "load_endpoint_registry",
+    "load_endpoint_sampling_profiles",
     "load_env_metadata",
     "resolve_model_endpoint",
 ]
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index 8a086f3e..2e4d1d03 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -25,6 +25,7 @@
 from verifiers.utils.eval_utils import load_endpoints, load_toml_config, resolve_endpoints_file
 from verifiers.utils.import_utils import load_toml
 
+from medarc_verifiers.cli.utils.endpoint_utils import load_endpoint_sampling_profiles
 from medarc_verifiers.utils.prime_inference import prime_inference_overrides
 from medarc_verifiers.utils.sampling_args import sanitize_sampling_args_for_openai
 
@@ -101,8 +102,8 @@ class EvalConfigOverrides:
 def load_toml_eval_configs(path: str | Path, *, extra_valid_fields: set[str] | None = None) -> list[dict[str, Any]]:
     """Load upstream TOML eval configs, including ``[[ablation]]`` expansion."""
 
-    valid_fields = ADAPTER_TOML_FIELDS | {MEDARC_TOML_METADATA_FIELD} | MEDARC_TOML_IDENTITY_FIELDS | (
-        extra_valid_fields or set()
+    valid_fields = (
+        ADAPTER_TOML_FIELDS | {MEDARC_TOML_METADATA_FIELD} | MEDARC_TOML_IDENTITY_FIELDS | (extra_valid_fields or set())
     )
     return [_strip_medarc_metadata(raw) for raw in load_toml_config(Path(path), extra_valid_fields=valid_fields)]
 
@@ -135,7 +136,15 @@ def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides
     endpoints = load_endpoints(endpoints_path)
     model, resolved_endpoint_id, client_config = _build_client_config(merged_raw, endpoints, endpoints_path)
 
-    sampling_args = _build_sampling_args(merged_raw, client_config.api_base_url)
+    endpoint_sampling_profiles = load_endpoint_sampling_profiles(endpoints_path)
+    endpoint_sampling_args = _resolve_endpoint_sampling_args(endpoint_sampling_profiles, resolved_endpoint_id)
+    cli_sampling_args = overrides.sampling_args if overrides is not None else None
+    sampling_args = _build_sampling_args(
+        merged_raw,
+        client_config.api_base_url,
+        endpoint_sampling_args=endpoint_sampling_args,
+        cli_sampling_args=cli_sampling_args,
+    )
 
     extra_env_kwargs = dict(merged_raw.get("extra_env_kwargs", {}))
     if merged_raw.get("timeout") is not None:
@@ -234,9 +243,6 @@ def _apply_overrides(raw: dict[str, Any], overrides: EvalConfigOverrides | None)
 
     if overrides.env_args:
         raw["env_args"] = {**dict(raw.get("env_args", {})), **dict(overrides.env_args)}
-    if overrides.sampling_args:
-        raw["sampling_args"] = {**dict(raw.get("sampling_args", {})), **dict(overrides.sampling_args)}
-
     return raw
 
 
@@ -359,15 +365,49 @@ def _build_client_config(
     return cast(str, model), resolved_endpoint_id, client_config
 
 
-def _build_sampling_args(raw: Mapping[str, Any], api_base_url: str) -> dict[str, Any]:
-    sampling_args = _merge_sampling_args(
-        raw.get("sampling_args"),
+def _resolve_endpoint_sampling_args(
+    endpoint_sampling_profiles: Mapping[str, list[dict[str, Any]]], endpoint_id: str | None
+) -> dict[str, Any]:
+    if endpoint_id is None:
+        return {}
+
+    profiles = endpoint_sampling_profiles.get(endpoint_id, [])
+    if not profiles:
+        return {}
+
+    first = profiles[0]
+    for profile in profiles[1:]:
+        if profile != first:
+            raise ValueError(
+                f"Endpoint alias '{endpoint_id}' has conflicting sampling_args across replica entries. "
+                "Use identical sampling_args for every replica or omit them from every replica."
+            )
+    return dict(first)
+
+
+def _build_sampling_args(
+    raw: Mapping[str, Any],
+    api_base_url: str,
+    *,
+    endpoint_sampling_args: Mapping[str, Any] | None = None,
+    cli_sampling_args: Mapping[str, Any] | None = None,
+) -> dict[str, Any]:
+    _, prime_sampling_overrides = prime_inference_overrides(api_base_url)
+    endpoint_sampling = _validate_sampling_mapping(endpoint_sampling_args, "endpoint sampling_args")
+    include_none_max_tokens = raw.get("include_none_max_tokens", True) and (
+        "max_tokens" in raw or "max_tokens" not in endpoint_sampling
+    )
+    scalar_sampling_args = _merge_sampling_args(
+        None,
         max_tokens=raw.get("max_tokens"),
         temperature=raw.get("temperature"),
-        include_none_max_tokens=raw.get("include_none_max_tokens", True),
+        include_none_max_tokens=include_none_max_tokens,
     )
-    _, prime_sampling_overrides = prime_inference_overrides(api_base_url)
-    return sanitize_sampling_args_for_openai(_deep_merge(prime_sampling_overrides, sampling_args))
+    merged = _deep_merge(prime_sampling_overrides, endpoint_sampling)
+    merged = _deep_merge(merged, scalar_sampling_args)
+    merged = _deep_merge(merged, _validate_sampling_mapping(raw.get("sampling_args"), "sampling_args"))
+    merged = _deep_merge(merged, _validate_sampling_mapping(cli_sampling_args, "CLI sampling_args"))
+    return sanitize_sampling_args_for_openai(merged)
 
 
 def _merge_sampling_args(
@@ -450,6 +490,14 @@ def _deep_merge(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[st
     return merged
 
 
+def _validate_sampling_mapping(value: object, label: str) -> dict[str, Any]:
+    if value is None:
+        return {}
+    if not isinstance(value, Mapping):
+        raise ValueError(f"{label} must be a dict")
+    return dict(cast(Mapping[str, Any], value))
+
+
 def _validate_header_mapping(value: object) -> dict[str, str]:
     if not isinstance(value, dict):
         raise ValueError("headers must be a dict")
diff --git a/medarc_verifiers/utils/sampling_args.py b/medarc_verifiers/utils/sampling_args.py
index 58ae2aed..4019d35a 100644
--- a/medarc_verifiers/utils/sampling_args.py
+++ b/medarc_verifiers/utils/sampling_args.py
@@ -33,7 +33,7 @@ def sanitize_sampling_args_for_openai(sampling_args: Mapping[str, Any] | None) -
         elif isinstance(existing, Mapping):
             merged = dict(existing)
             for key, value in extras.items():
-                merged.setdefault(key, value)
+                merged[key] = value
             filtered["extra_body"] = merged
         else:
             filtered["extra_body"] = {"_passthrough_extra_body": existing, **extras}
diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py
index f06dcab7..605e4401 100644
--- a/tests/test_cli/test_verifiers_adapter.py
+++ b/tests/test_cli/test_verifiers_adapter.py
@@ -5,6 +5,7 @@
 
 import pytest
 
+from medarc_verifiers.cli.utils.endpoint_utils import load_endpoint_sampling_profiles
 from medarc_verifiers.cli.verifiers_adapter import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
 from medarc_verifiers.utils.prime_inference import PRIME_INFERENCE_URL
 
@@ -37,6 +38,31 @@ def _write_endpoints(path: Path) -> Path:
     return path
 
 
+def test_load_endpoint_sampling_profiles_parses_nested_table(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "gpt-oss-20b-low-local"
+model = "openai/gpt-oss-20b"
+url = "http://host.docker.internal:8010/v1"
+key = "VLLM_API_KEY"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "low"
+""".strip()
+    )
+
+    profiles = load_endpoint_sampling_profiles(endpoints_path)
+
+    assert profiles == {
+        "gpt-oss-20b-low-local": [{"temperature": 1.0, "top_p": 1.0, "top_k": 0, "reasoning_effort": "low"}]
+    }
+
+
 def test_load_toml_eval_configs_expands_ablation(tmp_path: Path) -> None:
     config_path = tmp_path / "eval.toml"
     endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
@@ -179,6 +205,279 @@ def test_build_eval_config_supports_endpoint_replicas(tmp_path: Path) -> None:
     ]
 
 
+def test_build_eval_config_uses_endpoint_sampling_defaults(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "gpt-oss"
+model = "openai/gpt-oss-20b"
+url = "http://localhost:8010/v1"
+key = "VLLM_API_KEY"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "low"
+""".strip()
+    )
+
+    config = build_eval_config({"env_id": "medqa", "model": "gpt-oss", "endpoints_path": str(endpoints_path)})
+
+    assert config.model == "openai/gpt-oss-20b"
+    assert config.sampling_args["temperature"] == 1.0
+    assert config.sampling_args["top_p"] == 1.0
+    assert config.sampling_args["reasoning_effort"] == "low"
+    assert config.sampling_args["extra_body"]["top_k"] == 0
+
+
+def test_build_eval_config_sampling_precedence_endpoint_raw_and_cli(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "profiled"
+model = "openai/profiled"
+url = "https://profiled.example/v1"
+key = "PROFILED_KEY"
+sampling_args = { temperature = 1.0, top_p = 0.5 }
+""".strip()
+    )
+
+    toml_config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "profiled",
+            "endpoints_path": str(endpoints_path),
+            "temperature": 0.2,
+            "sampling_args": {"temperature": 0.7},
+        }
+    )
+    cli_config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "profiled",
+            "endpoints_path": str(endpoints_path),
+            "temperature": 0.2,
+            "sampling_args": {"temperature": 0.7},
+        },
+        overrides=EvalConfigOverrides(sampling_args={"temperature": 0.8}),
+    )
+
+    assert toml_config.sampling_args["temperature"] == 0.7
+    assert toml_config.sampling_args["top_p"] == 0.5
+    assert cli_config.sampling_args["temperature"] == 0.8
+
+
+def test_build_eval_config_scalar_temperature_overrides_endpoint_default(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "profiled"
+model = "openai/profiled"
+url = "https://profiled.example/v1"
+key = "PROFILED_KEY"
+sampling_args = { temperature = 1.0 }
+""".strip()
+    )
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "profiled",
+            "endpoints_path": str(endpoints_path),
+            "temperature": 0.2,
+        }
+    )
+
+    assert config.sampling_args["temperature"] == 0.2
+
+
+def test_build_eval_config_deep_merges_sampling_extra_body(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.delenv("MEDARC_INCLUDE_USAGE", raising=False)
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        f"""
+[[endpoint]]
+endpoint_id = "prime-profiled"
+model = "openai/profiled"
+url = "{PRIME_INFERENCE_URL}"
+key = "PRIME_API_KEY"
+sampling_args = {{ top_k = 0 }}
+""".strip()
+    )
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "prime-profiled",
+            "endpoints_path": str(endpoints_path),
+            "sampling_args": {"extra_body": {"guided_choice": ["A", "B"]}},
+        }
+    )
+
+    assert config.sampling_args["extra_body"] == {
+        "usage": {"include": True},
+        "guided_choice": ["A", "B"],
+        "top_k": 0,
+    }
+
+
+def test_build_eval_config_direct_unknown_sampling_arg_overrides_extra_body_key(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "profiled"
+model = "openai/profiled"
+url = "https://profiled.example/v1"
+key = "PROFILED_KEY"
+sampling_args = { extra_body = { top_k = 1 } }
+""".strip()
+    )
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "profiled",
+            "endpoints_path": str(endpoints_path),
+        },
+        overrides=EvalConfigOverrides(sampling_args={"top_k": 3}),
+    )
+
+    assert config.sampling_args["extra_body"]["top_k"] == 3
+
+
+def test_build_eval_config_endpoint_replica_sampling_profiles_must_match(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "replica-profiled"
+model = "openai/profiled"
+url = "https://replica-a.example/v1"
+key = "REPLICA_A"
+sampling_args = { temperature = 1.0 }
+
+[[endpoint]]
+endpoint_id = "replica-profiled"
+model = "openai/profiled"
+url = "https://replica-b.example/v1"
+key = "REPLICA_B"
+sampling_args = { temperature = 1.0 }
+""".strip()
+    )
+
+    config = build_eval_config(
+        {"env_id": "medqa", "endpoint_id": "replica-profiled", "endpoints_path": str(endpoints_path)}
+    )
+
+    assert config.sampling_args["temperature"] == 1.0
+
+
+@pytest.mark.parametrize(
+    "second_sampling",
+    [
+        "sampling_args = { temperature = 0.5 }",
+        "",
+    ],
+)
+def test_build_eval_config_rejects_conflicting_replica_sampling_profiles(tmp_path: Path, second_sampling: str) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        f"""
+[[endpoint]]
+endpoint_id = "replica-profiled"
+model = "openai/profiled"
+url = "https://replica-a.example/v1"
+key = "REPLICA_A"
+sampling_args = {{ temperature = 1.0 }}
+
+[[endpoint]]
+endpoint_id = "replica-profiled"
+model = "openai/profiled"
+url = "https://replica-b.example/v1"
+key = "REPLICA_B"
+{second_sampling}
+""".strip()
+    )
+
+    with pytest.raises(ValueError, match="conflicting sampling_args"):
+        build_eval_config({"env_id": "medqa", "endpoint_id": "replica-profiled", "endpoints_path": str(endpoints_path)})
+
+
+@pytest.mark.parametrize(
+    "sampling_toml",
+    [
+        'sampling_args = "bad"',
+        "[[endpoint.sampling_args]]\ntemperature = 1.0",
+    ],
+)
+def test_load_endpoint_sampling_profiles_rejects_invalid_sampling_args(tmp_path: Path, sampling_toml: str) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        f"""
+[[endpoint]]
+endpoint_id = "bad-profile"
+model = "openai/bad"
+url = "https://bad.example/v1"
+key = "BAD_KEY"
+{sampling_toml}
+""".strip()
+    )
+
+    with pytest.raises(ValueError, match="sampling_args must be a table"):
+        load_endpoint_sampling_profiles(endpoints_path)
+
+
+def test_load_endpoint_sampling_profiles_ignores_python_registry(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.py"
+    endpoints_path.write_text(
+        """
+ENDPOINTS = {
+    "profiled": {
+        "model": "openai/profiled",
+        "url": "https://profiled.example/v1",
+        "key": "PROFILED_KEY",
+    }
+}
+""".strip()
+    )
+
+    assert load_endpoint_sampling_profiles(endpoints_path) == {}
+
+
+def test_build_eval_config_already_expanded_ablation_sampling_args_override_endpoint(
+    tmp_path: Path,
+) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "profiled"
+model = "openai/profiled"
+url = "https://profiled.example/v1"
+key = "PROFILED_KEY"
+sampling_args = { temperature = 1.0, top_p = 0.9 }
+""".strip()
+    )
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "profiled",
+            "endpoints_path": str(endpoints_path),
+            "name": "temp-0.3",
+            "sampling_args": {"temperature": 0.3},
+        }
+    )
+
+    assert config.sampling_args["temperature"] == 0.3
+    assert config.sampling_args["top_p"] == 0.9
+
+
 def test_build_eval_config_provider_and_cli_overrides_precede_toml(tmp_path: Path) -> None:
     endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
 

From e560ed6d5da1601b21c771178c04bb05b60694a1 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 16:46:45 +0000
Subject: [PATCH 33/53] Preserve sampling extra_body precedence

---
 medarc_verifiers/cli/verifiers_adapter.py | 15 +++++++++-----
 tests/test_cli/test_verifiers_adapter.py  | 25 +++++++++++++++++++++++
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index 2e4d1d03..442a93f8 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -403,11 +403,16 @@ def _build_sampling_args(
         temperature=raw.get("temperature"),
         include_none_max_tokens=include_none_max_tokens,
     )
-    merged = _deep_merge(prime_sampling_overrides, endpoint_sampling)
-    merged = _deep_merge(merged, scalar_sampling_args)
-    merged = _deep_merge(merged, _validate_sampling_mapping(raw.get("sampling_args"), "sampling_args"))
-    merged = _deep_merge(merged, _validate_sampling_mapping(cli_sampling_args, "CLI sampling_args"))
-    return sanitize_sampling_args_for_openai(merged)
+    merged = sanitize_sampling_args_for_openai(prime_sampling_overrides)
+    merged = _deep_merge(merged, sanitize_sampling_args_for_openai(endpoint_sampling))
+    merged = _deep_merge(merged, sanitize_sampling_args_for_openai(scalar_sampling_args))
+    merged = _deep_merge(
+        merged, sanitize_sampling_args_for_openai(_validate_sampling_mapping(raw.get("sampling_args"), "sampling_args"))
+    )
+    merged = _deep_merge(
+        merged, sanitize_sampling_args_for_openai(_validate_sampling_mapping(cli_sampling_args, "CLI sampling_args"))
+    )
+    return merged
 
 
 def _merge_sampling_args(
diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py
index 605e4401..6bfc2923 100644
--- a/tests/test_cli/test_verifiers_adapter.py
+++ b/tests/test_cli/test_verifiers_adapter.py
@@ -350,6 +350,31 @@ def test_build_eval_config_direct_unknown_sampling_arg_overrides_extra_body_key(
     assert config.sampling_args["extra_body"]["top_k"] == 3
 
 
+def test_build_eval_config_extra_body_key_overrides_lower_precedence_direct_unknown_arg(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "profiled"
+model = "openai/profiled"
+url = "https://profiled.example/v1"
+key = "PROFILED_KEY"
+sampling_args = { top_k = 0 }
+""".strip()
+    )
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "profiled",
+            "endpoints_path": str(endpoints_path),
+            "sampling_args": {"extra_body": {"top_k": 5}},
+        }
+    )
+
+    assert config.sampling_args["extra_body"]["top_k"] == 5
+
+
 def test_build_eval_config_endpoint_replica_sampling_profiles_must_match(tmp_path: Path) -> None:
     endpoints_path = tmp_path / "endpoints.toml"
     endpoints_path.write_text(

From dbf2927acb58f5f321fbc8ed3c3a6f3e974944c0 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 17:25:08 +0000
Subject: [PATCH 34/53] Upgrade verifiers and improve legacy conversion CLI

---
 medarc_verifiers/cli/upstream_eval.py        |  2 +-
 medarc_verifiers/cli/utils/endpoint_utils.py |  7 +-
 medarc_verifiers/cli/verifiers_adapter.py    | 52 +++++++-------
 pyproject.toml                               |  2 +-
 scripts/convert_legacy_raw_runs.py           | 72 +++++++++++++++-----
 tests/test_cli/test_verifiers_adapter.py     |  2 +-
 6 files changed, 91 insertions(+), 46 deletions(-)

diff --git a/medarc_verifiers/cli/upstream_eval.py b/medarc_verifiers/cli/upstream_eval.py
index 7871526e..dc50fb8c 100644
--- a/medarc_verifiers/cli/upstream_eval.py
+++ b/medarc_verifiers/cli/upstream_eval.py
@@ -1,6 +1,6 @@
 """Boundary for upstream ``verifiers`` eval configuration.
 
-``verifiers==0.1.12`` keeps full ``EvalConfig`` construction nested inside
+``verifiers==0.1.14`` keeps full ``EvalConfig`` construction nested inside
 ``verifiers.scripts.eval.main()``, so MedARC still uses a temporary adapter.
 Import eval config behavior through this module so callers do not depend on the
 adapter directly and the deletion point is isolated when upstream exposes a
diff --git a/medarc_verifiers/cli/utils/endpoint_utils.py b/medarc_verifiers/cli/utils/endpoint_utils.py
index af76604c..54d0797e 100644
--- a/medarc_verifiers/cli/utils/endpoint_utils.py
+++ b/medarc_verifiers/cli/utils/endpoint_utils.py
@@ -46,7 +46,12 @@ def load_endpoint_registry(
 
 def load_endpoint_sampling_profiles(path: str | Path) -> dict[str, list[dict[str, Any]]]:
     """Load MedARC endpoint-level sampling defaults from a TOML registry."""
-    resolved = resolve_endpoints_file(str(path))
+    try:
+        resolved = resolve_endpoints_file(str(path))
+    except ValueError:
+        if Path(path).suffix == ".py":
+            return {}
+        raise
     if resolved is None or not resolved.exists() or resolved.suffix != ".toml":
         return {}
 
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index 442a93f8..3b50163e 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -1,6 +1,6 @@
 """Small adapter for upstream ``verifiers`` eval configuration.
 
-Upstream ``verifiers`` owns TOML loading and eval execution, but in 0.1.12 the
+Upstream ``verifiers`` owns TOML loading and eval execution, but in 0.1.14 the
 ``EvalConfig`` builder lives inside ``verifiers.scripts.eval.main()`` and cannot
 be imported directly. Keep this module deliberately narrow until upstream exposes
 a public builder.
@@ -150,31 +150,31 @@ def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides
     if merged_raw.get("timeout") is not None:
         extra_env_kwargs["timeout_seconds"] = merged_raw["timeout"]
 
-    return EvalConfig(
-        env_id=env_id,
-        env_args=merged_raw.get("env_args", {}),
-        env_dir_path=merged_raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
-        output_dir=merged_raw.get("output_dir"),
-        extra_env_kwargs=extra_env_kwargs,
-        endpoint_id=resolved_endpoint_id,
-        model=model,
-        client_config=client_config,
-        sampling_args=sampling_args,
-        num_examples=num_examples,
-        rollouts_per_example=rollouts_per_example,
-        max_concurrent=merged_raw.get("max_concurrent", DEFAULT_MAX_CONCURRENT),
-        max_retries=merged_raw.get("max_retries", 0),
-        num_workers=merged_raw.get("num_workers", "auto"),
-        disable_env_server=merged_raw.get("disable_env_server", False),
-        debug=merged_raw.get("debug", False),
-        verbose=merged_raw.get("verbose", False),
-        state_columns=merged_raw.get("state_columns", []),
-        save_results=merged_raw.get("save_results", False),
-        resume_path=None,
-        independent_scoring=merged_raw.get("independent_scoring", False),
-        save_to_hf_hub=merged_raw.get("save_to_hf_hub", False),
-        hf_hub_dataset_name=merged_raw.get("hf_hub_dataset_name", ""),
-    )
+    eval_config_kwargs: dict[str, Any] = {
+        "env_id": env_id,
+        "env_args": merged_raw.get("env_args", {}),
+        "env_dir_path": merged_raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
+        "output_dir": merged_raw.get("output_dir"),
+        "extra_env_kwargs": extra_env_kwargs,
+        "endpoint_id": resolved_endpoint_id,
+        "model": model,
+        "client_config": client_config,
+        "sampling_args": sampling_args,
+        "num_examples": num_examples,
+        "rollouts_per_example": rollouts_per_example,
+        "max_concurrent": merged_raw.get("max_concurrent", DEFAULT_MAX_CONCURRENT),
+        "max_retries": merged_raw.get("max_retries", 0),
+        "num_workers": merged_raw.get("num_workers", "auto"),
+        "verbose": merged_raw.get("verbose", False),
+        "disable_env_server": merged_raw.get("disable_env_server", False),
+        "state_columns": merged_raw.get("state_columns", []),
+        "save_results": merged_raw.get("save_results", False),
+        "resume_path": None,
+        "independent_scoring": merged_raw.get("independent_scoring", False),
+        "save_to_hf_hub": merged_raw.get("save_to_hf_hub", False),
+        "hf_hub_dataset_name": merged_raw.get("hf_hub_dataset_name", ""),
+    }
+    return EvalConfig(**eval_config_kwargs)
 
 
 def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
diff --git a/pyproject.toml b/pyproject.toml
index a1cbd737..3de22497 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ requires-python = ">=3.11"
 license = { file = "LICENSE" }
 dependencies = [
     "prime>=0.3.35",
-    "verifiers>=0.1.12,<0.2",
+    "verifiers>=0.1.14,<0.2",
     "pyyaml>=6.0.1",
     "docstring-parser>=0.17.0",
     "pylatexenc>=2.10",
diff --git a/scripts/convert_legacy_raw_runs.py b/scripts/convert_legacy_raw_runs.py
index 30c9ff38..9db9d934 100644
--- a/scripts/convert_legacy_raw_runs.py
+++ b/scripts/convert_legacy_raw_runs.py
@@ -408,30 +408,70 @@ def _string_or_none(value: Any) -> str | None:
     return text or None
 
 
+def _run_conversion_cli(*, raw_dir: Path, output_dir: Path, dry_run: bool, report_path: Path | None) -> int:
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=dry_run)
+    encoded = json.dumps(report.to_dict(), indent=2, sort_keys=True)
+    if report_path:
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        report_path.write_text(encoded + "\n", encoding="utf-8")
+    print(encoded)
+    return 1 if report.failed else 0
+
+
+class _HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
+    pass
+
+
 def _build_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--raw-dir", type=Path, default=Path("runs") / "raw")
-    parser.add_argument("--output-dir", type=Path, default=Path("runs") / "evals")
-    parser.add_argument("--dry-run", action="store_true", default=True, help="plan conversion without writing files")
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=_HelpFormatter,
+        epilog="""
+Examples:
+  python scripts/convert_legacy_raw_runs.py
+      Preview conversion from runs/raw to runs/evals.
+
+  python scripts/convert_legacy_raw_runs.py --no-dry-run --report-path report.json
+      Write converted eval-output directories and save the JSON report.
+
+  python scripts/convert_legacy_raw_runs.py --raw-dir old/runs/raw --output-dir runs/evals
+      Preview conversion from a custom legacy raw-run directory.
+""",
+    )
+    parser.add_argument(
+        "--raw-dir",
+        type=Path,
+        default=Path("runs") / "raw",
+        help="legacy raw-run root directory containing */run_manifest.json files",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("runs") / "evals",
+        help="converted eval-output root directory",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="plan conversion without writing files; use --no-dry-run to write converted outputs",
+    )
     parser.add_argument(
-        "--no-dry-run",
-        dest="dry_run",
-        action="store_false",
-        help="write converted eval-output directories",
+        "--report-path",
+        type=Path,
+        help="optional path for a JSON copy of the conversion report",
     )
-    parser.add_argument("--report-path", type=Path, help="optional JSON report path")
     return parser
 
 
 def main(argv: Sequence[str] | None = None) -> int:
     args = _build_parser().parse_args(argv)
-    report = convert_legacy_raw_runs(raw_dir=args.raw_dir, output_dir=args.output_dir, dry_run=args.dry_run)
-    encoded = json.dumps(report.to_dict(), indent=2, sort_keys=True)
-    if args.report_path:
-        args.report_path.parent.mkdir(parents=True, exist_ok=True)
-        args.report_path.write_text(encoded + "\n", encoding="utf-8")
-    print(encoded)
-    return 1 if report.failed else 0
+    return _run_conversion_cli(
+        raw_dir=args.raw_dir,
+        output_dir=args.output_dir,
+        dry_run=args.dry_run,
+        report_path=args.report_path,
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py
index 6bfc2923..1615e4eb 100644
--- a/tests/test_cli/test_verifiers_adapter.py
+++ b/tests/test_cli/test_verifiers_adapter.py
@@ -161,7 +161,7 @@ def test_build_eval_config_resolves_endpoint_alias_and_core_fields(tmp_path: Pat
     assert config.max_concurrent == 4
     assert config.max_retries == 3
     assert config.num_workers == 2
-    assert config.debug is True
+    assert "debug" not in type(config).model_fields
     assert config.extra_env_kwargs == {"timeout_seconds": 45.0}
     assert config.state_columns == ["question_id", "split"]
     assert config.save_results is True

From 2e232143462f3b4b7f86d5286d4c2a3ae7fd1c54 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 8 May 2026 18:28:15 +0000
Subject: [PATCH 35/53] documentation pass

---
 README.md               | 274 ++++++++++++++--------------------------
 docs/README.md          |   4 +
 docs/developer-guide.md | 204 ++++++++++++++++++++++++++++++
 3 files changed, 306 insertions(+), 176 deletions(-)
 create mode 100644 docs/developer-guide.md

diff --git a/README.md b/README.md
index 91f8858e..22f08a45 100644
--- a/README.md
+++ b/README.md
@@ -1,207 +1,129 @@
-# MedARC Medical Language Model Environments
+# Medmarks
 
-This repository is used to build verifiers environments and tools for the MedARC medical language model project.
+[![Website](https://img.shields.io/badge/website-medmarks.ai-0f766e)](https://medmarks.ai)
+[![arXiv](https://img.shields.io/badge/arXiv-2605.01417-b31b1b.svg)](https://arxiv.org/abs/2605.01417)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![Python](https://img.shields.io/badge/python-3.11%2B-blue.svg)](pyproject.toml)
 
-It also contains the medarc-verifiers package, which provides additional tools for creating verifiers environments.
+Open-source LLM benchmark suite for medical tasks.
 
-## Getting Started with Verifiers Environments
+[medmarks.ai](https://medmarks.ai) | [arXiv:2605.01417](https://arxiv.org/abs/2605.01417)
 
-The steps below guide you through creating a new environment package under `environments/[my-new-env]`, installing it locally, testing it with Verifiers tooling, and optionally publishing it through Prime Intellect's Environments Hub.
+Medmarks is a comprehensive benchmark suite for evaluating medical capabilities in large language models. It includes 30 open-source benchmarks spanning question answering, information extraction, consumer health questions, clinical reasoning, EHR interactions, medical calculations, and open-ended medical tasks.
 
-### 1. Prerequisites
-- Python 3.11 or 3.12
-- [`uv`](https://docs.astral.sh/uv/) for dependency management
-- The [`prime` CLI](https://github.com/PrimeIntellect-ai/prime-cli) for scaffolding and publishing
-- An OpenAI-compatible API key (export it as `OPENAI_API_KEY`) or OpenAI compatible model for testing the environment with `vf-eval`
+This repository contains the runnable benchmark environments, evaluation configs, result processing tools, and win-rate analysis pipeline used for Medmarks. It also contains the [`medarc_verifiers` Python library](docs/README.md), which provides the shared CLI, parsers, rewards, judging utilities, and orchestration helpers used by the benchmark environments.
 
-### 2. Setup
+## Benchmark Suite
 
-Create and activate a virtual environment, then install the required tooling:
+Medmarks is organized into three practical subsets:
 
-```bash
-uv venv --python 3.12
-source .venv/bin/activate
-uv tool install prime
-uv pip install verifiers
-```
-
-After this setup the `prime env`, `vf-install`, and `vf-eval` commands will be available (or runnable via `uv run <command>`).
-
-### 3. Create a New Environment
-Always place new Verifiers packages inside `environments/my-new-env`. The Prime CLI ensures this by default:
-
-```bash
-# from the repository root
-prime env init my-new-env
-```
+| Subset | Description |
+|--------|-------------|
+| Medmarks-V | Verifiable tasks, including multiple-choice QA and other tasks with deterministic or programmatic grading |
+| Medmarks-OE | Open-ended tasks evaluated with LLM-as-a-Judge |
+| Medmarks-T | Experimental training-capable environments with train/test splits for post-training and RL experiments |
 
-The template produces:
-```
-environments/my_new_env/
-├── my_new_env.py
-├── pyproject.toml
-└── README.md
-```
+The benchmark suite is implemented as [verifiers](https://github.com/primeintellect-ai/verifiers) environments under [`environments/`](environments/). The main runnable suite configs are:
 
-Edit `my_new_env.py` to configure datasets, parsers, and rubrics, and update the package metadata in `pyproject.toml` (name, version, dependencies, tags, etc.).
+| Config | Purpose |
+|--------|---------|
+| [`configs/eval/medmarks-verified.toml`](configs/eval/medmarks-verified.toml) | Medmarks-V suite |
+| [`configs/eval/medmarks-open_ended.toml`](configs/eval/medmarks-open_ended.toml) | Medmarks-OE suite |
+| [`configs/eval/smoke.toml`](configs/eval/smoke.toml) | Small sanity-check run |
 
-If the `prime env init` command doesn't add it, you'll want to add the following prime env metadata so prime/verifiers knows where the environment is in a flat repo:
+## Quick Start
 
-```toml
-[tool.prime.environment]
-loader = "my_new_env:load_environment"
-display_name = "My New Env"
-visibility = "PUBLIC"
+```bash
+uv venv --python 3.12
+source .venv/bin/activate
+uv sync
 ```
 
-### 4. Install the Environment for Local Development
-Install your new environment in editable mode so changes are picked up immediately:
+Run a single benchmark:
 
 ```bash
-vf-install my-new-env
-# equivalent to:
-# uv pip install -e ./environments/my_new_env
+uv run medarc-eval medqa -m openai/gpt-4.1-mini -n 25
 ```
 
-You can now import it from Python or let Verifiers discover it with `verifiers.load_environment("my-new-env")`.
-
-### 5. Smoke-Test with `vf-eval`
-Run a small batch of rollouts to confirm the environment behaves as expected. Set `OPENAI_API_KEY` (or whichever OpenAI client compatible credentials you plan to use) before invoking the CLI.
+Run a Medmarks suite config:
 
 ```bash
-export OPENAI_API_KEY=sk-...
-vf-eval my-new-env -m gpt-4.1-mini -n 5 -s
+uv run medarc-eval bench --config configs/eval/medmarks-verified.toml
 ```
 
-A few useful arguments:
-
-- -m selects the inference model
-- -n controls dataset size
-- -s saves results locally.
-
-Use vf-eval -h for the full set of options (rollouts per example, max concurrency, etc.)
-
-During development you can iterate quickly by tweaking prompts, parser logic, or reward functions, reinstalling with `vf-install` if dependencies change, and rerunning `vf-eval` to view the results.
-
-After running with `-s`, inspect saved runs with `vf-tui`, which provides a terminal UI for browsing prompts, completions, and rewards under the generated `outputs/evals` folders.
-
-## Using an Existing MedARC Environment
-
-Once your tooling is set up you can install MedARC-maintained environments directly from the Prime Environments Hub (for example [`medarc/medcasereasoning`](https://app.primeintellect.ai/dashboard/environments/medarc/medcasereasoning) or [`medarc/metamedqa`](https://app.primeintellect.ai/dashboard/environments/medarc/metamedqa)).
-
-- **Install from the Hub.** Run `prime env install medarc/medcasereasoning` to pull the latest published version (add `@version` to pin a release).
-- **Run an evaluation.** Execute `vf-eval medcasereasoning -m gpt-4.1-mini -n 10 -s` to generate a small batch of rollouts.
-- **Load programmatically.** Environments installed via the Hub are importable like any other Verifiers module:
-
-  ```python
-  import verifiers as vf
-
-  env = vf.load_environment("medcasereasoning", split="validation")
-  results = env.evaluate(model_client, "gpt-4.1-mini", num_examples=5)
-  ```
-
-## medarc-eval CLI
-
-`medarc-eval` wraps the upstream `verifiers` eval flow, adding environment-specific flags and a TOML bench workflow. See [full documentation](docs/medarc-eval.md).
-
-| Command | Description |
-|---------|-------------|
-| [`medarc-eval <ENV>`](docs/medarc-eval-single-run.md) | Run a single benchmark with auto-discovered environment flags |
-| [`medarc-eval bench`](docs/medarc-eval-bench.md) | Run upstream TOML eval configs with deterministic MedARC paths |
-| [`medarc-eval process`](docs/medarc-eval-process.md) | Convert eval outputs to parquet for analysis |
-| [`medarc-eval winrate`](docs/medarc-eval-winrate.md) | Compute HELM-style win rates across models |
-
-### Quick Start
+Process outputs and compute win rates:
 
 ```bash
-# Run a single benchmark
-uv run medarc-eval medqa -m gpt-4.1-mini -n 25
-
-# Run batch evaluations from config
-uv run medarc-eval bench --config configs/eval/smoke.toml
-
-# Process results and compute win rates
 uv run medarc-eval process --runs-dir runs/evals
 uv run medarc-eval winrate
 ```
 
-### Environment-Specific Flags
-
-Each environment's `load_environment()` parameters become CLI flags automatically:
-
-```bash
-# Discover available flags
-uv run medarc-eval longhealth --help
-
-# Use environment-specific options
-uv run medarc-eval longhealth --task task1 --shuffle-answers -m gpt-4.1-mini -n 10
-```
-
-For complex arguments (dicts, nested structures), use `--env-args`:
-
-```bash
-uv run medarc-eval careqa --env-args '{"split": "open", "judge_model": "gpt-4o"}'
-```
-
-## Batch Evaluations
-
-Use `medarc-eval bench` to run upstream `verifiers` TOML eval configs
-sequentially with deterministic MedARC output paths. See [full bench mode
-documentation](docs/medarc-eval-bench.md).
-
-```toml
-model = "openai/gpt-4.1-mini"
-save_results = true
-output_dir = "runs/evals"
-
-[[eval]]
-env_id = "medqa"
-num_examples = 25
-rollouts_per_example = 1
-env_args = { shuffle_answers = true, shuffle_seed = 1618 }
+Evaluation outputs are written under `runs/evals/`, processed parquet files under `runs/processed/`, and win-rate summaries under `runs/processed/winrate/`.
+
+## Documentation
+
+| Page | Description |
+|------|-------------|
+| [`docs/developer-guide.md`](docs/developer-guide.md) | Developer setup, environment authoring, and local workflow |
+| [`docs/medarc-eval.md`](docs/medarc-eval.md) | Full `medarc-eval` CLI documentation |
+| [`docs/medarc-eval-bench.md`](docs/medarc-eval-bench.md) | TOML benchmark suite execution |
+| [`docs/medarc-eval-process.md`](docs/medarc-eval-process.md) | Processing eval outputs into parquet |
+| [`docs/medarc-eval-winrate.md`](docs/medarc-eval-winrate.md) | HELM-style win-rate computation |
+| [`docs/medarc-orchestrate.md`](docs/medarc-orchestrate.md) | Running local vLLM benchmark jobs with Docker or Slurm/Pyxis |
+
+## Datasets
+
+`--` indicates no dedicated training split. `Not specified` means we found no explicit dataset license in the dataset source. Evaluated counts reflect the effective Medmarks evaluation split or configured subset; MedDialog is intentionally capped at the first 2,500 examples.
+
+| Dataset | Description | License / terms | # Evaluated | # Training |
+|---------|-------------|-----------------|------------:|-----------:|
+| **Medmarks-V (Verifiable)** | | | | |
+| CareQA | Healthcare QA exam questions with multiple-choice reasoning questions, English subset. | Apache-2.0 | 5,621 | -- |
+| HEAD-QA v2 | Extended healthcare questions spanning 10 years of Spanish professional exams, English subset. | MIT | 12,751 | -- |
+| LongHealth | Long-context synthetic patient cases with information extraction and sorting tasks, task1 and task2 splits. | Apache-2.0 | 1,200 | -- |
+| M-ARC | Long-tail medical questions designed to test model resistance to inflexible clinical reasoning patterns. | Apache-2.0 | 100 | -- |
+| Med-HALT | Clinical reasoning hallucination detection via false-confidence tests and "none of the above" recognition. | Apache-2.0 | 22,152 | -- |
+| MedCalc-Bench | Clinical calculator questions evaluating medical computation and formula application skills. | CC-BY-SA-4.0 | 1,100 | 10,538 |
+| MedConceptsQA | Multiple-choice questions on medical coding systems (e.g., ICD-9 and ICD-10); only ICD-10-CM subsamples are evaluated. | Not specified | 6,000 | -- |
+| Medbullets | USMLE Step 2 and Step 3 style clinical reasoning questions sourced from social media. | Not specified | 308 | -- |
+| MedHallu | Medical hallucination detection benchmark with four domain-specific error categories derived from the PubMedQA dataset. | MIT | 2,000 | -- |
+| MedMCQA | Multiple-choice questions from Indian medical entrance exams across 21 medical subjects. | Apache-2.0 | 4,183 | 182,822 |
+| MedQA | Multiple-choice questions from USMLE medical licensing exams. | CC-BY-4.0 | 1,273 | 10,178 |
+| MedXpertQA | High-difficulty MCQ questions with ~10 options across 17 specialties to evaluate expert-level medical knowledge, text subset. | MIT | 2,450 | -- |
+| MetaMedQA | Questions testing model's awareness and recognition of unanswerable medical queries using uncertainty options. | CC-BY-4.0 | 1,373 | -- |
+| MMLU-Pro-Health | Health subset of MMLU-Pro benchmark featuring general health-related questions with up to 10 answer options per question. | MIT | 818 | -- |
+| PubHealthBench | Multiple-choice questions derived from UK government public health guidance documents, reviewed subset. | CC-BY-4.0 | 760 | -- |
+| PubMedQA | Yes/no/maybe question answering requiring reasoning over biomedical research abstracts, labeled subset. | MIT | 500 | 211,269 |
+| SCTPublic | Script Concordance Tests evaluating clinical reasoning under diagnostic uncertainty. | MIT | 174 | -- |
+| SuperGPQA-Med | Graduate-level questions spanning 6 medical fields, easy and hard difficulty subsets. | ODC-BY | 1,126 | -- |
+| **Medmarks-OE (Open-Ended)** | | | | |
+| ACI-Bench | Clinical dialogue transcripts paired with corresponding structured clinical notes. | CC-BY-4.0 | 210 | 114 |
+| AgentClinic | Multimodal multi-agent OSCE-style clinical dialogues for interactive diagnostic reasoning evaluation. | MIT | 214 | -- |
+| CareQA Open | Healthcare QA exam questions with open-ended reasoning questions, English subset. | Apache-2.0 | 2,769 | -- |
+| HealthBench | Multi-turn healthcare conversations evaluated using physician-written scoring rubrics. | MIT | 5,000 | -- |
+| MedAgentBench v2 | Agentic electronic health record tasks requiring FHIR API interactions. | Not specified; V1 MIT | 600 | -- |
+| MedCaseReasoning | Diagnostic QA with clinician-authored reasoning traces from clinical case reports. | MIT | 500 | 13,092 |
+| MedDialog | Large-scale patient-doctor conversations for medical dialogue generation and understanding; Medmarks evaluates a small subsample. | Not specified | 2,500 | 205,973 |
+| MedExQA | Questions with dual expert explanations across 5 underrepresented medical specialties. | CC-BY-NC-SA-4.0 | 940 | -- |
+| MedicationQA | Consumer-style medication questions with expert-validated answers from MedlinePlus. | CC-BY-4.0 | 690 | -- |
+| MEDEC | Medical dataset for clinical error detection, extraction, and correction in synthetic medical notes. | CC-BY-4.0 | 597 | 2,189 |
+| MedR-Bench | Clinical reasoning benchmark with step-by-step diagnostic and treatment planning traces on rare disease cases. | CC-BY-SA-4.0 | 1,453 | -- |
+| MTSamples | Transcribed medical operative notes and reports evaluating models on procedural summaries and clinically appropriate treatment plans. | Not specified | 559 | -- |
+
+## Citation
+
+```bibtex
+@misc{warner2026medmarkscomprehensiveopensourcellm,
+      title={Medmarks: A Comprehensive Open-Source LLM Benchmark Suite for Medical Tasks},
+      author={Benjamin Warner and Ratna Sagari Grandhi and Max Kieffer and Aymane Ouraq and Saurav Panigrahi and Geetu Ambwani and Kunal Bagga and Nikhil Khandekar and Arya Hariharan and Nishant Mishra and Manish Ram and Shamus Sim Zi Yang and Ahmed Essouaied and Adepoju Jeremiah Moyondafoluwa and Robert Scholz and Bofeng Huang and Molly Beavers and Srishti Gureja and Anish Mahishi and Sameed Khan and Maxime Griot and Hunar Batra and Jean-Benoit Delbrouck and Siddhant Bharadwaj and Ronald Clark and Ashish Vashist and Anas Zafar and Leema Krishna Murali and Harsh Deshpande and Ameen Patel and William Brown and Johannes Hagemann and Connor Lane and Paul Steven Scotti and Tanishq Mathew Abraham},
+      year={2026},
+      eprint={2605.01417},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2605.01417},
+}
 ```
 
-```bash
-# Run the batch
-uv run medarc-eval bench --config configs/eval/medmarks-verified.toml
-
-# Preview without executing
-uv run medarc-eval bench --config configs/eval/medmarks-verified.toml --dry-run
-```
-
-Bench mode resumes matching deterministic result directories and supports
-`[[ablation]]` sweeps for parameter grids. The removed YAML job/manifest runner
-is documented only in the migration notes in the [bench mode docs](docs/medarc-eval-bench.md).
-
-### Ablation Sweeps
-
-Use upstream TOML ablations for parameter grid runs:
-
-```toml
-[[ablation]]
-env_id = "medconceptsqa"
-num_examples = -1
-env_args = { shuffle_answers = true }
-
-[ablation.sweep.env_args]
-difficulty = ["easy", "medium", "hard"]
-shuffle_seed = [1618, 9331]
-```
-
-This expands into deterministic variant directories under
-`runs/evals/<model>/medconceptsqa/`. See [bench mode docs](docs/medarc-eval-bench.md)
-for details.
-
-## Processing and Win Rates
-
-After running benchmarks, convert results to parquet and compute model comparisons:
-
-```bash
-# Process eval outputs to parquet
-uv run medarc-eval process --runs-dir runs/evals
-
-# Compute HELM-style win rates
-uv run medarc-eval winrate
-```
+## License
 
-See [processing documentation](docs/medarc-eval-process.md) and [win rate documentation](docs/medarc-eval-winrate.md) for configuration options, HuggingFace integration, and output formats.
+Medmarks code in this repository is released under the [MIT License](LICENSE). Individual benchmark datasets may have their own licenses or terms of use; consult the corresponding dataset sources and environment documentation before redistribution or commercial use.
diff --git a/docs/README.md b/docs/README.md
index e9db2a71..64445a16 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -22,3 +22,7 @@ Environments are installed separately via `prime env install <owner/env>` (from
 | `medarc-eval winrate` | Compute HELM-style win rates across models |
 
 See [medarc-eval.md](medarc-eval.md) for full documentation.
+
+## Developer workflow
+
+See [developer-guide.md](developer-guide.md) for local setup, environment authoring, and development workflow notes.
diff --git a/docs/developer-guide.md b/docs/developer-guide.md
new file mode 100644
index 00000000..ee41a853
--- /dev/null
+++ b/docs/developer-guide.md
@@ -0,0 +1,204 @@
+# Getting Started
+
+This guide covers the developer workflow for Medmarks benchmark environments and the `medarc-verifiers` tooling in this repository.
+
+## Getting Started with Verifiers Environments
+
+The steps below guide you through creating a new environment package under `environments/[my-new-env]`, installing it locally, testing it with Verifiers tooling, and optionally publishing it through Prime Intellect's Environments Hub.
+
+### 1. Prerequisites
+
+- Python 3.11 or 3.12
+- [`uv`](https://docs.astral.sh/uv/) for dependency management
+- The [`prime` CLI](https://github.com/PrimeIntellect-ai/prime-cli) for scaffolding and publishing
+- An OpenAI-compatible API key, exported as `OPENAI_API_KEY`, or another OpenAI-compatible model endpoint for testing environments with `vf-eval`
+
+### 2. Setup
+
+Create and activate a virtual environment, then install the required tooling:
+
+```bash
+uv venv --python 3.12
+source .venv/bin/activate
+uv sync
+uv tool install prime
+```
+
+After this setup the `prime env`, `vf-install`, `vf-eval`, and `medarc-eval` commands will be available, or runnable via `uv run <command>`.
+
+### 3. Create a New Environment
+
+Always place new Verifiers packages under `environments/`, for example `environments/my_new_env`. The Prime CLI ensures this by default:
+
+```bash
+# from the repository root
+prime env init my-new-env
+```
+
+The template produces:
+
+```text
+environments/my_new_env/
+|-- my_new_env.py
+|-- pyproject.toml
+`-- README.md
+```
+
+Edit `my_new_env.py` to configure datasets, parsers, and rubrics, and update the package metadata in `pyproject.toml` with the package name, version, dependencies, tags, and related fields.
+
+If the `prime env init` command does not add it, add the following Prime environment metadata so Prime and Verifiers know where the environment lives in a flat repo:
+
+```toml
+[tool.prime.environment]
+loader = "my_new_env:load_environment"
+display_name = "My New Env"
+visibility = "PUBLIC"
+```
+
+### 4. Install the Environment for Local Development
+
+Install your new environment in editable mode so changes are picked up immediately:
+
+```bash
+vf-install my-new-env
+# equivalent to:
+# uv pip install -e ./environments/my_new_env
+```
+
+You can now import it from Python or let Verifiers discover it with `verifiers.load_environment("my-new-env")`.
+
+### 5. Smoke-Test with `vf-eval`
+
+Run a small batch of rollouts to confirm the environment behaves as expected. Set `OPENAI_API_KEY`, or whichever OpenAI-compatible credentials you plan to use, before invoking the CLI.
+
+```bash
+export OPENAI_API_KEY=sk-...
+vf-eval my-new-env -m gpt-4.1-mini -n 5 -s
+```
+
+A few useful arguments:
+
+- `-m` selects the inference model.
+- `-n` sets the number of dataset examples to evaluate.
+- `-s` saves results locally.
+
+Use `vf-eval -h` for the full set of options, including rollouts per example and max concurrency.
+
+During development you can iterate quickly by tweaking prompts, parser logic, or reward functions, reinstalling with `vf-install` if dependencies change, and rerunning `vf-eval` to view the results.
+
+After running with `-s`, inspect saved runs with `vf-tui`, which provides a terminal UI for browsing prompts, completions, and rewards under the generated `outputs/evals` folders.
+
+## Using an Existing Medmarks Environment
+
+Once your tooling is set up you can install MedARC-maintained environments directly from the Prime Environments Hub, for example [`medarc/medcasereasoning`](https://app.primeintellect.ai/dashboard/environments/medarc/medcasereasoning) or [`medarc/metamedqa`](https://app.primeintellect.ai/dashboard/environments/medarc/metamedqa).
+
+- Install from the Hub: run `prime env install medarc/medcasereasoning` to pull the latest published version. Add `@version` to pin a release.
+- Run an evaluation: execute `vf-eval medcasereasoning -m gpt-4.1-mini -n 10 -s` to generate a small batch of rollouts.
+- Load programmatically:
+
+```python
+import verifiers as vf
+
+env = vf.load_environment("medcasereasoning", split="validation")
+results = env.evaluate(model_client, "gpt-4.1-mini", num_examples=5)
+```
+
+## medarc-eval CLI
+
+`medarc-eval` wraps the upstream `verifiers` eval flow, adding environment-specific flags and a TOML bench workflow. See the [full documentation](medarc-eval.md).
+
+| Command | Description |
+|---------|-------------|
+| [`medarc-eval <ENV>`](medarc-eval-single-run.md) | Run a single benchmark with auto-discovered environment flags |
+| [`medarc-eval bench`](medarc-eval-bench.md) | Run upstream TOML eval configs with deterministic MedARC paths |
+| [`medarc-eval process`](medarc-eval-process.md) | Convert eval outputs to parquet for analysis |
+| [`medarc-eval winrate`](medarc-eval-winrate.md) | Compute HELM-style win rates across models |
+
+### Quick Start
+
+```bash
+# Run a single benchmark
+uv run medarc-eval medqa -m gpt-4.1-mini -n 25
+
+# Run batch evaluations from config
+uv run medarc-eval bench --config configs/eval/smoke.toml
+
+# Process results and compute win rates
+uv run medarc-eval process --runs-dir runs/evals
+uv run medarc-eval winrate
+```
+
+### Environment-Specific Flags
+
+Each environment's `load_environment()` parameters become CLI flags automatically:
+
+```bash
+# Discover available flags
+uv run medarc-eval longhealth --help
+
+# Use environment-specific options
+uv run medarc-eval longhealth --task task1 --shuffle-answers -m gpt-4.1-mini -n 10
+```
+
+For complex arguments such as dicts and nested structures, use `--env-args`:
+
+```bash
+uv run medarc-eval careqa --env-args '{"split": "open", "judge_model": "gpt-4o"}'
+```
+
+## Batch Evaluations
+
+Use `medarc-eval bench` to run upstream `verifiers` TOML eval configs sequentially with deterministic MedARC output paths. See the [bench mode documentation](medarc-eval-bench.md).
+
+```toml
+model = "openai/gpt-4.1-mini"
+save_results = true
+output_dir = "runs/evals"
+
+[[eval]]
+env_id = "medqa"
+num_examples = 25
+rollouts_per_example = 1
+env_args = { shuffle_answers = true, shuffle_seed = 1618 }
+```
+
+```bash
+# Run the batch
+uv run medarc-eval bench --config configs/eval/medmarks-verified.toml
+
+# Preview without executing
+uv run medarc-eval bench --config configs/eval/medmarks-verified.toml --dry-run
+```
+
+Bench mode resumes matching deterministic result directories and supports `[[ablation]]` sweeps for parameter grids. The removed YAML job/manifest runner is documented only in the migration notes in the [bench mode docs](medarc-eval-bench.md).
+
+### Ablation Sweeps
+
+Use upstream TOML ablations for parameter grid runs:
+
+```toml
+[[ablation]]
+env_id = "medconceptsqa"
+num_examples = -1
+env_args = { shuffle_answers = true }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "medium", "hard"]
+shuffle_seed = [1618, 9331]
+```
+
+This expands into deterministic variant directories under `runs/evals/<model>/medconceptsqa/`. See the [bench mode docs](medarc-eval-bench.md) for details.
+
+## Processing and Win Rates
+
+After running benchmarks, convert results to parquet and compute model comparisons:
+
+```bash
+# Process eval outputs to parquet
+uv run medarc-eval process --runs-dir runs/evals
+
+# Compute HELM-style win rates
+uv run medarc-eval winrate
+```
+
+See the [processing documentation](medarc-eval-process.md) and [win rate documentation](medarc-eval-winrate.md) for configuration options, Hugging Face integration, and output formats.

From 0cdb63df9f0f06071422aba068b789c03d090f66 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Sat, 9 May 2026 15:12:16 +0000
Subject: [PATCH 36/53] format benchmark configs

---
 configs/eval/medmarks-open_ended.toml |  51 +++++---
 configs/eval/medmarks-verified.toml   | 179 +++++++++++++++++++-------
 2 files changed, 162 insertions(+), 68 deletions(-)

diff --git a/configs/eval/medmarks-open_ended.toml b/configs/eval/medmarks-open_ended.toml
index 01563fe3..fc0e5327 100644
--- a/configs/eval/medmarks-open_ended.toml
+++ b/configs/eval/medmarks-open_ended.toml
@@ -1,14 +1,27 @@
 # MedARC judge- and free-form-heavy benchmark suite.
 
-model = "openai/gpt-4.1-mini"
 save_results = true
 output_dir = "runs/evals"
 
 [[eval]]
-env_id = "healthbench"
+env_id = "agentclinic"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { judge_model = "openai/gpt-5-mini", judge_base_url = "https://api.pinference.ai/api/v1", difficulty = "all" }
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "agentclinic"
+name = "rollout_1"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" }
+
+[[eval]]
+env_id = "agentclinic"
+name = "rollout_2"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" }
 
 [[eval]]
 env_id = "careqa"
@@ -16,6 +29,12 @@ num_examples = -1
 rollouts_per_example = 1
 env_args = { split = "open", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
 
+[[eval]]
+env_id = "healthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = "openai/gpt-5-mini", judge_base_url = "https://api.pinference.ai/api/v1", difficulty = "all" }
+
 [[eval]]
 env_id = "med_dialog"
 num_examples = 2500
@@ -46,6 +65,16 @@ num_examples = -1
 rollouts_per_example = 1
 env_args = { judge_model = ["openai/gpt-5-mini", "x-ai/grok-4.1-fast"], judge_base_url = "https://api.pinference.ai/api/v1" }
 
+[[ablation]]
+env_id = "medrbench"
+name = "{env_args.task}"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_agent_model = "openai/gpt-5-mini", patient_agent_base_url = "https://api.pinference.ai/api/v1" }
+
+[ablation.sweep.env_args]
+task = ["oracle", "1turn", "free_turn"]
+
 [[eval]]
 env_id = "mtsamples_procedures"
 num_examples = -1
@@ -63,19 +92,3 @@ env_id = "pubhealthbench"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { split = "freeform", judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1" }
-
-[[eval]]
-env_id = "agentclinic"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_model = "openai/gpt-5-mini", patient_base_url = "https://api.pinference.ai/api/v1", measurement_model = "openai/gpt-5-mini", measurement_base_url = "https://api.pinference.ai/api/v1" }
-
-[[ablation]]
-env_id = "medrbench"
-name = "{env_args.task}"
-num_examples = -1
-rollouts_per_example = 1
-env_args = { judge_model = ["openai/gpt-5-mini", "google/gemini-3-flash-preview"], judge_base_url = "https://api.pinference.ai/api/v1", patient_agent_model = "openai/gpt-5-mini", patient_agent_base_url = "https://api.pinference.ai/api/v1" }
-
-[ablation.sweep.env_args]
-task = ["oracle", "1turn", "free_turn"]
diff --git a/configs/eval/medmarks-verified.toml b/configs/eval/medmarks-verified.toml
index 73404bdc..709782a6 100644
--- a/configs/eval/medmarks-verified.toml
+++ b/configs/eval/medmarks-verified.toml
@@ -2,47 +2,67 @@
 # Ablations become deterministic variant directories such as
 # runs/evals/<model>/<env>/env_args.shuffle_seed-1618/.
 
-model = "openai/gpt-4.1-mini"
 save_results = true
 output_dir = "runs/evals"
 
 [[eval]]
-env_id = "medqa"
+env_id = "careqa"
 num_examples = -1
 rollouts_per_example = 1
+env_args = { split = "en" }
 
 [[ablation]]
-env_id = "medqa"
+env_id = "careqa"
 name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { shuffle_answers = true }
+env_args = { split = "en", shuffle_answers = true }
 
 [ablation.sweep.env_args]
 shuffle_seed = [1618, 9331]
 
 [[eval]]
-env_id = "med_mcqa"
+env_id = "head_qa_v2"
 num_examples = -1
 rollouts_per_example = 1
 
 [[ablation]]
-env_id = "med_mcqa"
-name = "shuffle_seed-{env_args.shuffle_seed}"
+env_id = "longhealth"
+name = "{env_args.task}"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { shuffle_answers = true }
+env_args = { doc_shuffle_seed = 2718 }
 
 [ablation.sweep.env_args]
-shuffle_seed = [1618, 9331]
+task = ["task1", "task2"]
+
+[[ablation]]
+env_id = "longhealth"
+name = "{env_args.task}__shuffle_seed-1618"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true, shuffle_seed = 1618, doc_shuffle_seed = 1618 }
+
+[ablation.sweep.env_args]
+task = ["task1", "task2"]
+
+[[ablation]]
+env_id = "longhealth"
+name = "{env_args.task}__shuffle_seed-9331"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { shuffle_answers = true, shuffle_seed = 9331, doc_shuffle_seed = 9331 }
+
+[ablation.sweep.env_args]
+task = ["task1", "task2"]
 
 [[eval]]
-env_id = "pubmedqa"
+env_id = "m_arc"
 num_examples = -1
 rollouts_per_example = 1
 
 [[ablation]]
-env_id = "pubmedqa"
+env_id = "m_arc"
 name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
@@ -51,13 +71,22 @@ env_args = { shuffle_answers = true }
 [ablation.sweep.env_args]
 shuffle_seed = [1618, 9331]
 
+[[ablation]]
+env_id = "med_halt"
+name = "{env_args.question_type}"
+num_examples = -1
+rollouts_per_example = 1
+
+[ablation.sweep.env_args]
+question_type = ["reasoning_fct", "reasoning_nota"]
+
 [[eval]]
-env_id = "mmlu_pro_health"
+env_id = "med_mcqa"
 num_examples = -1
 rollouts_per_example = 1
 
 [[ablation]]
-env_id = "mmlu_pro_health"
+env_id = "med_mcqa"
 name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
@@ -66,55 +95,83 @@ env_args = { shuffle_answers = true }
 [ablation.sweep.env_args]
 shuffle_seed = [1618, 9331]
 
-[[eval]]
-env_id = "m_arc"
+[[ablation]]
+env_id = "medbullets"
+name = "num_options-{env_args.num_options}"
 num_examples = -1
 rollouts_per_example = 1
 
+[ablation.sweep.env_args]
+num_options = [4, 5]
+
 [[ablation]]
-env_id = "m_arc"
-name = "shuffle_seed-{env_args.shuffle_seed}"
+env_id = "medbullets"
+name = "num_options-{env_args.num_options}__shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
 
 [ablation.sweep.env_args]
+num_options = [4, 5]
 shuffle_seed = [1618, 9331]
 
 [[eval]]
-env_id = "careqa"
+env_id = "medcalc_bench"
+name = "version-1.2"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { split = "en" }
+env_args = { version = "1.2" }
+
+[[eval]]
+env_id = "medcalc_bench"
+name = "tools"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { version = "verified", add_python_tool = true, add_calculator_tool = true }
 
 [[ablation]]
-env_id = "careqa"
-name = "shuffle_seed-{env_args.shuffle_seed}"
+env_id = "medconceptsqa"
+name = "{env_args.difficulty}"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { split = "en", shuffle_answers = true }
+env_args = { vocab = "icd10cm_sample" }
 
 [ablation.sweep.env_args]
+difficulty = ["easy", "medium", "hard"]
+
+[[ablation]]
+env_id = "medconceptsqa"
+name = "{env_args.difficulty}__shuffle_seed-{env_args.shuffle_seed}"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { vocab = "icd10cm_sample", shuffle_answers = true }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "medium", "hard"]
 shuffle_seed = [1618, 9331]
 
 [[ablation]]
-env_id = "medbullets"
-name = "num_options-{env_args.num_options}"
+env_id = "medhallu"
+name = "{env_args.difficulty}"
 num_examples = -1
 rollouts_per_example = 1
 
 [ablation.sweep.env_args]
-num_options = [4, 5]
+difficulty = ["easy", "medium", "hard"]
+
+[[eval]]
+env_id = "medqa"
+num_examples = -1
+rollouts_per_example = 1
 
 [[ablation]]
-env_id = "medbullets"
-name = "num_options-{env_args.num_options}__shuffle_seed-{env_args.shuffle_seed}"
+env_id = "medqa"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
 
 [ablation.sweep.env_args]
-num_options = [4, 5]
 shuffle_seed = [1618, 9331]
 
 [[ablation]]
@@ -137,64 +194,88 @@ env_args = { shuffle_answers = true }
 question_type = ["reasoning", "understanding"]
 shuffle_seed = [1618, 9331]
 
+[[eval]]
+env_id = "metamedqa"
+num_examples = -1
+rollouts_per_example = 1
+
 [[ablation]]
-env_id = "supergpqa_medicine"
-name = "{env_args.difficulty}"
+env_id = "metamedqa"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
+env_args = { shuffle_answers = true }
 
 [ablation.sweep.env_args]
-difficulty = ["easy", "hard"]
+shuffle_seed = [1618, 9331]
+
+[[eval]]
+env_id = "mmlu_pro_health"
+num_examples = -1
+rollouts_per_example = 1
 
 [[ablation]]
-env_id = "supergpqa_medicine"
-name = "{env_args.difficulty}__shuffle_seed-{env_args.shuffle_seed}"
+env_id = "mmlu_pro_health"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
 env_args = { shuffle_answers = true }
 
 [ablation.sweep.env_args]
-difficulty = ["easy", "hard"]
 shuffle_seed = [1618, 9331]
 
+[[eval]]
+env_id = "pubhealthbench"
+num_examples = -1
+rollouts_per_example = 1
+env_args = { split = "reviewed" }
+
 [[ablation]]
-env_id = "medconceptsqa"
-name = "{env_args.difficulty}"
+env_id = "pubhealthbench"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { vocab = "icd10cm_sample" }
+env_args = { split = "reviewed", shuffle_answers = true }
 
 [ablation.sweep.env_args]
-difficulty = ["easy", "medium", "hard"]
+shuffle_seed = [1618, 9331]
+
+[[eval]]
+env_id = "pubmedqa"
+num_examples = -1
+rollouts_per_example = 1
 
 [[ablation]]
-env_id = "medconceptsqa"
-name = "{env_args.difficulty}__shuffle_seed-{env_args.shuffle_seed}"
+env_id = "pubmedqa"
+name = "shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { vocab = "icd10cm_sample", shuffle_answers = true }
+env_args = { shuffle_answers = true }
 
 [ablation.sweep.env_args]
-difficulty = ["easy", "medium", "hard"]
 shuffle_seed = [1618, 9331]
 
 [[eval]]
-env_id = "head_qa_v2"
+env_id = "sctpublic"
 num_examples = -1
 rollouts_per_example = 1
 
-[[eval]]
-env_id = "pubhealthbench"
+[[ablation]]
+env_id = "supergpqa_medicine"
+name = "{env_args.difficulty}"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { split = "reviewed" }
+
+[ablation.sweep.env_args]
+difficulty = ["easy", "hard"]
 
 [[ablation]]
-env_id = "pubhealthbench"
-name = "shuffle_seed-{env_args.shuffle_seed}"
+env_id = "supergpqa_medicine"
+name = "{env_args.difficulty}__shuffle_seed-{env_args.shuffle_seed}"
 num_examples = -1
 rollouts_per_example = 1
-env_args = { split = "reviewed", shuffle_answers = true }
+env_args = { shuffle_answers = true }
 
 [ablation.sweep.env_args]
+difficulty = ["easy", "hard"]
 shuffle_seed = [1618, 9331]

From 0de303646dc48ccc5b3c4a9b81516b93855fe70b Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Mon, 11 May 2026 23:03:30 +0000
Subject: [PATCH 37/53] Add Medmarks endpoint aliases and smoke config

---
 README.md                                 |  40 +-
 configs/eval/README.md                    |  34 +-
 configs/eval/gpt-oss-local-endpoints.toml |  12 +
 configs/eval/gpt-oss-local-smoke.toml     |  26 +
 configs/eval/medmarks-endpoints.toml      | 942 ++++++++++++++++++++++
 configs/eval/medmarks-smoke.toml          | 105 +++
 configs/eval/smoke.toml                   |   8 -
 docs/developer-guide.md                   |   2 +-
 docs/medarc-eval-bench.md                 |   6 +-
 docs/medarc-eval.md                       |   2 +-
 medarc_verifiers/cli/verifiers_adapter.py |  95 ++-
 tests/test_cli/test_main.py               |   9 +-
 tests/test_cli/test_verifiers_adapter.py  |  33 +
 13 files changed, 1289 insertions(+), 25 deletions(-)
 create mode 100644 configs/eval/gpt-oss-local-endpoints.toml
 create mode 100644 configs/eval/gpt-oss-local-smoke.toml
 create mode 100644 configs/eval/medmarks-endpoints.toml
 create mode 100644 configs/eval/medmarks-smoke.toml
 delete mode 100644 configs/eval/smoke.toml

diff --git a/README.md b/README.md
index 22f08a45..343de6f2 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,8 @@ The benchmark suite is implemented as [verifiers](https://github.com/primeintell
 |--------|---------|
 | [`configs/eval/medmarks-verified.toml`](configs/eval/medmarks-verified.toml) | Medmarks-V suite |
 | [`configs/eval/medmarks-open_ended.toml`](configs/eval/medmarks-open_ended.toml) | Medmarks-OE suite |
-| [`configs/eval/smoke.toml`](configs/eval/smoke.toml) | Small sanity-check run |
+| [`configs/eval/medmarks-endpoints.toml`](configs/eval/medmarks-endpoints.toml) | Portable model aliases and sampling defaults for Medmarks runs |
+| [`configs/eval/medmarks-smoke.toml`](configs/eval/medmarks-smoke.toml) | Small Medmarks-V sanity-check run |
 
 ## Quick Start
 
@@ -51,6 +52,43 @@ Run a Medmarks suite config:
 uv run medarc-eval bench --config configs/eval/medmarks-verified.toml
 ```
 
+Run a Medmarks suite with one of the published model aliases:
+
+```bash
+uv run medarc-eval bench \
+  --config configs/eval/medmarks-verified.toml \
+  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  -m gpt-oss-20b-low \
+  --api-base-url https://api.pinference.ai/api/v1 \
+  --api-key-var PRIME_API_KEY
+```
+
+[`configs/eval/medmarks-endpoints.toml`](configs/eval/medmarks-endpoints.toml) is an alias registry, not a deployment config. It maps names such as `gpt-oss-20b-low` or `medgemma-27b-text` to provider model IDs, client types, and model-specific sampling defaults. It intentionally omits `url`, `key`, and `max_concurrent`; supply those with `--provider` or with `--api-base-url` and `--api-key-var` for your deployment. The gpt-oss aliases use the Verifiers `openai_responses` client type.
+
+Preview the resolved jobs before running:
+
+```bash
+uv run medarc-eval bench \
+  --config configs/eval/medmarks-verified.toml \
+  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  -m gpt-oss-20b-low \
+  --api-base-url https://api.pinference.ai/api/v1 \
+  --api-key-var PRIME_API_KEY \
+  --dry-run
+```
+
+Preview the same alias against a local vLLM server exposing an OpenAI-compatible API (drop `--dry-run` to execute the run):
+
+```bash
+VLLM_API_KEY=local-key uv run medarc-eval bench \
+  --config configs/eval/medmarks-verified.toml \
+  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  -m gpt-oss-20b-low \
+  --api-base-url http://127.0.0.1:8000/v1 \
+  --api-key-var VLLM_API_KEY \
+  --dry-run
+```
+
 Process outputs and compute win rates:
 
 ```bash
diff --git a/configs/eval/README.md b/configs/eval/README.md
index b8ac3e4e..9087fb80 100644
--- a/configs/eval/README.md
+++ b/configs/eval/README.md
@@ -6,11 +6,43 @@ and `[[ablation]]` sweeps intentionally keep the upstream environment id stable;
 `env_args` and `sampling_args`.
 
 ```bash
-medarc-eval bench --config configs/eval/smoke.toml --dry-run
+medarc-eval bench --config configs/eval/medmarks-smoke.toml --dry-run
 medarc-eval bench --config configs/eval/medmarks-verified.toml
 medarc-eval process --runs-dir runs/evals --output-dir runs/processed
 ```
 
+Use `medmarks-endpoints.toml` when you want one of the Medmarks model aliases
+and its sampling defaults:
+
+```bash
+medarc-eval bench \
+  --config configs/eval/medmarks-verified.toml \
+  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  -m gpt-oss-20b-low \
+  --api-base-url https://api.pinference.ai/api/v1 \
+  --api-key-var PRIME_API_KEY \
+  --dry-run
+```
+
+`medmarks-endpoints.toml` is a portable alias registry. It maps endpoint IDs to
+model IDs, client types, and sampling defaults, but intentionally omits `url`,
+`key`, and `max_concurrent` because those are deployment-specific. Supply those
+settings with `--provider` or with `--api-base-url` and `--api-key-var`.
+The gpt-oss aliases use the Verifiers `openai_responses` client type.
+
+For a local vLLM server exposing an OpenAI-compatible API, keep using the same
+alias registry and override only the deployment settings:
+
+```bash
+VLLM_API_KEY=local-key medarc-eval bench \
+  --config configs/eval/medmarks-verified.toml \
+  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  -m gpt-oss-20b-low \
+  --api-base-url http://127.0.0.1:8000/v1 \
+  --api-key-var VLLM_API_KEY \
+  --dry-run
+```
+
 Per-environment `[tool.verifiers.eval]` defaults are read from editable installs
 where the environment `pyproject.toml` is discoverable next to the module. Wheel
 installs may ignore those defaults unless the package includes `pyproject.toml`,
diff --git a/configs/eval/gpt-oss-local-endpoints.toml b/configs/eval/gpt-oss-local-endpoints.toml
new file mode 100644
index 00000000..4c38d658
--- /dev/null
+++ b/configs/eval/gpt-oss-local-endpoints.toml
@@ -0,0 +1,12 @@
+[[endpoint]]
+endpoint_id = "gpt-oss-20b-low-local"
+model = "openai/gpt-oss-20b"
+url = "http://host.docker.internal:8010/v1"
+key = "VLLM_API_KEY"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "low"
diff --git a/configs/eval/gpt-oss-local-smoke.toml b/configs/eval/gpt-oss-local-smoke.toml
new file mode 100644
index 00000000..7b9764c3
--- /dev/null
+++ b/configs/eval/gpt-oss-local-smoke.toml
@@ -0,0 +1,26 @@
+endpoints_path = "gpt-oss-local-endpoints.toml"
+variant_id = "gpt-oss-20b-low-local"
+save_results = true
+output_dir = "runs/evals"
+max_concurrent = 4
+max_retries = 1
+
+[[eval]]
+env_id = "medqa"
+num_examples = 25
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "pubmedqa"
+num_examples = 25
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "m-arc"
+num_examples = 25
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "medhallu"
+num_examples = 25
+rollouts_per_example = 1
diff --git a/configs/eval/medmarks-endpoints.toml b/configs/eval/medmarks-endpoints.toml
new file mode 100644
index 00000000..be95de32
--- /dev/null
+++ b/configs/eval/medmarks-endpoints.toml
@@ -0,0 +1,942 @@
+# Medmarks model alias registry ported from configs/old model sections.
+# URL, key, max_concurrent, orchestration, and job matrix settings are intentionally omitted.
+# Supply deployment-specific API settings with --provider or --api-base-url/--api-key-var.
+
+[[endpoint]]
+endpoint_id = "afm-4-5b"
+model = "arcee-ai/AFM-4.5B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.5
+top_p = 0.95
+top_k = 50
+
+
+[[endpoint]]
+endpoint_id = "ai21-jamba2-mini"
+model = "ai21labs/AI21-Jamba2-Mini"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 1.0
+
+
+[[endpoint]]
+endpoint_id = "antangelmed"
+model = "MedAIBase/AntAngelMed"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+
+
+[[endpoint]]
+endpoint_id = "baichuan-m2"
+model = "baichuan-inc/Baichuan-M2-32B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+
+
+[[endpoint]]
+endpoint_id = "baichuan-m3"
+model = "baichuan-inc/Baichuan-M3-235B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+
+
+[[endpoint]]
+endpoint_id = "dasd-30b-a3b"
+model = "Alibaba-Apsara/DASD-30B-A3B-Thinking-Preview"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+
+
+[[endpoint]]
+endpoint_id = "dasd-4b-thinking"
+model = "Alibaba-Apsara/DASD-4B-Thinking"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+
+
+[[endpoint]]
+endpoint_id = "deepseek-v3.2-speciale"
+model = "deepseek/deepseek-v3.2-speciale"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1
+top_p = 0.95
+
+[endpoint.sampling_args.extra_body]
+
+[endpoint.sampling_args.extra_body.usage]
+include = true
+
+
+[[endpoint]]
+endpoint_id = "fiercefalcon"
+model = "fiercefalcon"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+reasoning_effort = "low"
+temperature = 1
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "gemini-3-pro-preview"
+model = "gemini-3-pro-preview"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1
+
+
+[[endpoint]]
+endpoint_id = "gemma-3-12b-it"
+model = "google/gemma-3-12b-it"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+top_k = 60
+
+
+[[endpoint]]
+endpoint_id = "gemma-3-27b-it"
+model = "google/gemma-3-27b-it"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+top_k = 60
+
+
+[[endpoint]]
+endpoint_id = "gemma-3-4b-it"
+model = "google/gemma-3-4b-it"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+top_k = 60
+
+
+[[endpoint]]
+endpoint_id = "glm-4.5-air"
+model = "zai-org/GLM-4.5-Air"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "glm-4.7-fp8"
+model = "zai-org/GLM-4.7-FP8"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "glm-4_7-flash"
+model = "zai-org/GLM-4.7-Flash"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+
+
+# Medmarks 1.0 GPT runs predated Verifiers Responses API support.
+# These aliases now use openai_responses to better match OpenAI GPT model behavior.
+[[endpoint]]
+endpoint_id = "gpt-5-nano"
+model = "openai/gpt-5-nano"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+reasoning_effort = "medium"
+
+[endpoint.sampling_args.extra_body]
+
+[endpoint.sampling_args.extra_body.usage]
+include = true
+
+
+[[endpoint]]
+endpoint_id = "gpt-5_2"
+model = "openai/gpt-5.2"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+reasoning_effort = "medium"
+
+[endpoint.sampling_args.extra_body]
+
+[endpoint.sampling_args.extra_body.usage]
+include = true
+
+
+[[endpoint]]
+endpoint_id = "gpt-5_2-high"
+model = "openai/gpt-5.2"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+reasoning_effort = "high"
+
+[endpoint.sampling_args.extra_body]
+
+[endpoint.sampling_args.extra_body.usage]
+include = true
+
+
+# Medmarks 1.0 GPT-OSS runs predated Verifiers Responses API support.
+# These aliases now use openai_responses to better match gpt-oss model behavior.
+[[endpoint]]
+endpoint_id = "gpt-oss-120b"
+model = "openai/gpt-oss-120b"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "medium"
+
+
+[[endpoint]]
+endpoint_id = "gpt-oss-120b-high"
+model = "openai/gpt-oss-120b"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "high"
+
+
+[[endpoint]]
+endpoint_id = "gpt-oss-120b-low"
+model = "openai/gpt-oss-120b"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "low"
+
+
+[[endpoint]]
+endpoint_id = "gpt-oss-20b"
+model = "openai/gpt-oss-20b"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "medium"
+
+
+[[endpoint]]
+endpoint_id = "gpt-oss-20b-high"
+model = "openai/gpt-oss-20b"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "high"
+
+
+[[endpoint]]
+endpoint_id = "gpt-oss-20b-low"
+model = "openai/gpt-oss-20b"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+top_k = 0
+reasoning_effort = "low"
+
+
+# Medmarks 1.0 GPT runs predated Verifiers Responses API support.
+# These aliases now use openai_responses to better match OpenAI GPT model behavior.
+[[endpoint]]
+endpoint_id = "gpt_5_1"
+model = "openai/gpt-5.1"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+reasoning_effort = "medium"
+
+[endpoint.sampling_args.extra_body]
+
+[endpoint.sampling_args.extra_body.usage]
+include = true
+
+
+[[endpoint]]
+endpoint_id = "gpt_5_mini"
+model = "openai/gpt-5-mini"
+api_client_type = "openai_responses"
+
+[endpoint.sampling_args]
+reasoning_effort = "low"
+
+[endpoint.sampling_args.extra_body]
+
+[endpoint.sampling_args.extra_body.usage]
+include = true
+
+
+[[endpoint]]
+endpoint_id = "granite-4-0-h-small"
+model = "ibm-granite/granite-4.0-h-small"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.0
+top_p = 1.0
+top_k = 0
+
+
+[[endpoint]]
+endpoint_id = "granite-4-0-h-tiny"
+model = "ibm-granite/granite-4.0-h-tiny"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.0
+top_p = 1.0
+top_k = 0
+
+
+[[endpoint]]
+endpoint_id = "grok-4"
+model = "x-ai/grok-4"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1
+top_p = 0.95
+
+[endpoint.sampling_args.extra_body]
+
+[endpoint.sampling_args.extra_body.usage]
+include = true
+
+
+[[endpoint]]
+endpoint_id = "hermes-4-14b"
+model = "NousResearch/Hermes-4-14B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+
+
+[[endpoint]]
+endpoint_id = "hermes-4-70b"
+model = "NousResearch/Hermes-4-70B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+
+
+[[endpoint]]
+endpoint_id = "intellect3"
+model = "PrimeIntellect/INTELLECT-3"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "kimi-k2-thinking"
+model = "moonshotai/Kimi-K2-Thinking"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "kimi-linear"
+model = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "ling-2-flash"
+model = "inclusionAI/Ling-flash-2.0"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.8
+
+
+[[endpoint]]
+endpoint_id = "llama-3-70b-instruct"
+model = "meta-llama/Llama-3.3-70B-Instruct"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.95
+top_k = 0
+
+
+[[endpoint]]
+endpoint_id = "llama-3-8b-instruct"
+model = "meta-llama/Llama-3.1-8B-Instruct"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.95
+top_k = 0
+
+
+[[endpoint]]
+endpoint_id = "magistral-small"
+model = "mistralai/Magistral-Small-2509"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "medgemma-27b-text"
+model = "google/medgemma-27b-text-it"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.0
+top_p = 1.0
+top_k = 0
+
+
+[[endpoint]]
+endpoint_id = "medgemma-4b-1_5-it"
+model = "google/medgemma-1.5-4b-it"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.0
+top_p = 0.95
+top_k = 64
+
+
+[[endpoint]]
+endpoint_id = "medgemma-4b-it"
+model = "google/medgemma-4b-it"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.0
+top_p = 1.0
+top_k = 0
+
+
+[[endpoint]]
+endpoint_id = "mediphi"
+model = "microsoft/MediPhi-Instruct"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.0
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "minimax-m2"
+model = "MiniMaxAI/MiniMax-M2"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+top_k = 40
+
+
+[[endpoint]]
+endpoint_id = "minimax-m2.1"
+model = "MiniMaxAI/MiniMax-M2.1"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+top_k = 40
+
+
+[[endpoint]]
+endpoint_id = "ministral3-14b-instruct"
+model = "mistralai/Ministral-3-14B-Instruct-2512"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.1
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "ministral3-14b-reason"
+model = "mistralai/Ministral-3-14B-Reasoning-2512"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "ministral3-3b-instruct"
+model = "mistralai/Ministral-3-3B-Instruct-2512"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.1
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "ministral3-3b-reason"
+model = "mistralai/Ministral-3-3B-Reasoning-2512"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "ministral3-8b-instruct"
+model = "mistralai/Ministral-3-8B-Instruct-2512"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.1
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "ministral3-8b-reason"
+model = "mistralai/Ministral-3-8B-Reasoning-2512"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "mirothinker-235b-a22b"
+model = "miromind-ai/MiroThinker-v1.5-235B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "mirothinker-30b-a3b"
+model = "miromind-ai/MiroThinker-v1.5-30B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "nemotron-nano-v2"
+model = "nvidia/NVIDIA-Nemotron-Nano-12B-v2"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "nemotron-nano-v3"
+model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 1.0
+
+
+[[endpoint]]
+endpoint_id = "olmo-3-32b-think"
+model = "allenai/Olmo-3-32B-Think"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "olmo-3-7b-instruct"
+model = "allenai/Olmo-3-7B-Instruct"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "olmo-3-7b-think"
+model = "allenai/Olmo-3-7B-Think"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "olmo-3_1-32b-instruct"
+model = "allenai/Olmo-3.1-32B-Instruct"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "olmo-3_1-32b-think"
+model = "allenai/Olmo-3.1-32B-Think"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "phi-4-reasoning"
+model = "microsoft/Phi-4-reasoning"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.8
+top_p = 0.95
+top_k = 50
+
+
+[[endpoint]]
+endpoint_id = "phi-4-reasoning-plus"
+model = "microsoft/Phi-4-reasoning-plus"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.8
+top_p = 0.95
+top_k = 50
+
+
+[[endpoint]]
+endpoint_id = "qwen-235b-a22b-thinking"
+model = "Qwen/Qwen3-235B-A22B-Thinking-2507"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-3-14b-thinking"
+model = "Qwen/Qwen3-14B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-3-4b-thinking"
+model = "Qwen/Qwen3-4B-Thinking-2507"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-3-8b-thinking"
+model = "Qwen/Qwen3-8B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-30b-a3b-instruct"
+model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.8
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-30b-a3b-instruct-awq-4bit"
+model = "cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.8
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-30b-a3b-instruct-awq-8bit"
+model = "cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-8bit"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.8
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-30b-a3b-instruct-fp8"
+model = "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.8
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-30b-a3b-thinking"
+model = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-30b-a3b-thinking-awq-4bit"
+model = "cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-30b-a3b-thinking-awq-8bit"
+model = "cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-8bit"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-30b-a3b-thinking-fp8"
+model = "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-next-80b-a3b-instruct"
+model = "Qwen/Qwen3-Next-80B-A3B-Instruct"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.8
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen-next-80b-a3b-thinking"
+model = "Qwen/Qwen3-Next-80B-A3B-Thinking"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+# Qwen3-VL temperature guidance varies across official sources and examples;
+# 1.0, 0.8, and 0.6 are all referenced. This preserves the legacy run config.
+[[endpoint]]
+endpoint_id = "qwen-vl-30b-a3b-thinking"
+model = "Qwen/Qwen3-VL-30B-A3B-Thinking"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 1.0
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+
+[[endpoint]]
+endpoint_id = "qwen2_5-32b-instruct"
+model = "Qwen/Qwen2.5-32B-Instruct"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.7
+top_p = 0.8
+top_k = 20
+
+
+[[endpoint]]
+endpoint_id = "qwen3-max"
+model = "qwen/qwen3-max"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+top_k = 20
+min_p = 0
+
+[endpoint.sampling_args.extra_body]
+
+[endpoint.sampling_args.extra_body.usage]
+include = true
+
+
+[[endpoint]]
+endpoint_id = "smollm3-3b"
+model = "HuggingFaceTB/SmolLM3-3B"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.6
+top_p = 0.95
+
+
+[[endpoint]]
+endpoint_id = "sonnet-4_5"
+model = "claude-sonnet-4-5-20250929"
+api_client_type = "anthropic_messages"
+
+[endpoint.sampling_args]
+temperature = 0.7
+
+
+[[endpoint]]
+endpoint_id = "trinity-mini"
+model = "arcee-ai/Trinity-Mini"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.15
+top_k = 50
+top_p = 0.75
+min_p = 0.06
+
+
+[[endpoint]]
+endpoint_id = "trinity-nano-preview"
+model = "arcee-ai/Trinity-Nano-Preview"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+temperature = 0.5
+top_k = 50
+top_p = 0.95
diff --git a/configs/eval/medmarks-smoke.toml b/configs/eval/medmarks-smoke.toml
new file mode 100644
index 00000000..24e0d047
--- /dev/null
+++ b/configs/eval/medmarks-smoke.toml
@@ -0,0 +1,105 @@
+# Small Medmarks-V smoke run.
+# Runs 10 examples from each verified environment without ablations.
+
+save_results = true
+output_dir = "runs/smoke"
+
+[[eval]]
+env_id = "careqa"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { split = "en" }
+
+[[eval]]
+env_id = "head_qa_v2"
+num_examples = 10
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "longhealth"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { task = "task1", doc_shuffle_seed = 2718 }
+
+[[eval]]
+env_id = "m_arc"
+num_examples = 10
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "med_halt"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { question_type = "reasoning_fct" }
+
+[[eval]]
+env_id = "med_mcqa"
+num_examples = 10
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "medbullets"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { num_options = 4 }
+
+[[eval]]
+env_id = "medcalc_bench"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { version = "1.2" }
+
+[[eval]]
+env_id = "medconceptsqa"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { vocab = "icd10cm_sample", difficulty = "easy" }
+
+[[eval]]
+env_id = "medhallu"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { difficulty = "easy" }
+
+[[eval]]
+env_id = "medqa"
+num_examples = 10
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "medxpertqa"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { question_type = "reasoning" }
+
+[[eval]]
+env_id = "metamedqa"
+num_examples = 10
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "mmlu_pro_health"
+num_examples = 10
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "pubhealthbench"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { split = "reviewed" }
+
+[[eval]]
+env_id = "pubmedqa"
+num_examples = 10
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "sctpublic"
+num_examples = 10
+rollouts_per_example = 1
+
+[[eval]]
+env_id = "supergpqa_medicine"
+num_examples = 10
+rollouts_per_example = 1
+env_args = { difficulty = "easy" }
diff --git a/configs/eval/smoke.toml b/configs/eval/smoke.toml
deleted file mode 100644
index 550245de..00000000
--- a/configs/eval/smoke.toml
+++ /dev/null
@@ -1,8 +0,0 @@
-model = "openai/gpt-4.1-mini"
-save_results = true
-output_dir = "runs/evals"
-
-[[eval]]
-env_id = "medqa"
-num_examples = 1
-rollouts_per_example = 1
diff --git a/docs/developer-guide.md b/docs/developer-guide.md
index ee41a853..00bfd0ed 100644
--- a/docs/developer-guide.md
+++ b/docs/developer-guide.md
@@ -121,7 +121,7 @@ results = env.evaluate(model_client, "gpt-4.1-mini", num_examples=5)
 uv run medarc-eval medqa -m gpt-4.1-mini -n 25
 
 # Run batch evaluations from config
-uv run medarc-eval bench --config configs/eval/smoke.toml
+uv run medarc-eval bench --config configs/eval/medmarks-smoke.toml
 
 # Process results and compute win rates
 uv run medarc-eval process --runs-dir runs/evals
diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index aef0f91f..6f93c574 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -11,7 +11,7 @@ accepts `.toml` files only.
 
 ```bash
 # Preview the repository smoke config
-medarc-eval bench --config configs/eval/smoke.toml --dry-run
+medarc-eval bench --config configs/eval/medmarks-smoke.toml --dry-run
 
 # Run the verified production suite
 medarc-eval bench --config configs/eval/medmarks-verified.toml
@@ -28,7 +28,7 @@ Repository suite configs live in `configs/eval/`:
 
 | Config | Purpose |
 |--------|---------|
-| `smoke.toml` | Small smoke test used by CLI tests |
+| `medmarks-smoke.toml` | Small Medmarks-V smoke test used by CLI tests |
 | `medmarks-verified.toml` | Verified benchmark suite |
 | `medmarks-open_ended.toml` | Open-ended benchmark suite |
 
@@ -173,7 +173,7 @@ endpoint_id = "gpt-oss-20b-low-local"
 model = "openai/gpt-oss-20b"
 url = "http://host.docker.internal:8010/v1"
 key = "VLLM_API_KEY"
-api_client_type = "openai_chat_completions"
+api_client_type = "openai_responses"
 
 [endpoint.sampling_args]
 temperature = 1.0
diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md
index 93465f7f..b5b725f5 100644
--- a/docs/medarc-eval.md
+++ b/docs/medarc-eval.md
@@ -11,7 +11,7 @@
 medarc-eval medqa -m gpt-4.1-mini -n 25
 
 # Run a batch of benchmarks from a config file
-medarc-eval bench --config configs/eval/smoke.toml
+medarc-eval bench --config configs/eval/medmarks-smoke.toml
 
 # Process raw results into analysis-ready parquet files
 medarc-eval process --runs-dir runs/evals
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index 3b50163e..e3b417f0 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -22,7 +22,7 @@
     EndpointClientConfig,
     EvalConfig,
 )
-from verifiers.utils.eval_utils import load_endpoints, load_toml_config, resolve_endpoints_file
+from verifiers.utils.eval_utils import load_toml_config, resolve_endpoints_file
 from verifiers.utils.import_utils import load_toml
 
 from medarc_verifiers.cli.utils.endpoint_utils import load_endpoint_sampling_profiles
@@ -114,6 +114,82 @@ def _strip_medarc_metadata(raw: Mapping[str, Any]) -> dict[str, Any]:
     return cleaned
 
 
+def _load_endpoint_registry(endpoints_path: str) -> dict[str, list[Endpoint]]:
+    """Load endpoint aliases, allowing model-only entries for portable alias registries."""
+    endpoints_file = resolve_endpoints_file(endpoints_path)
+    if endpoints_file is None or not endpoints_file.exists():
+        return {}
+    if endpoints_file.suffix != ".toml":
+        raise ValueError(f"Unsupported endpoints file extension '{endpoints_file.suffix}' at {endpoints_file}")
+
+    with endpoints_file.open("rb") as handle:
+        raw_toml = load_toml(handle)
+    if not isinstance(raw_toml, dict):
+        raise ValueError(f"Expected top-level TOML table in endpoint registry {endpoints_file}")
+
+    raw_entries = raw_toml.get("endpoint", [])
+    if not isinstance(raw_entries, list):
+        raise ValueError(f"Expected [[endpoint]] array-of-tables in endpoint registry {endpoints_file}")
+
+    endpoints: dict[str, list[Endpoint]] = {}
+    for index, raw_entry in enumerate(raw_entries):
+        entry_source = f"{endpoints_file} ([[endpoint]] index {index})"
+        if not isinstance(raw_entry, dict):
+            raise ValueError(f"Each [[endpoint]] entry must be a table in {entry_source}")
+
+        endpoint_id = raw_entry.get("endpoint_id")
+        if not isinstance(endpoint_id, str) or not endpoint_id:
+            raise ValueError(f"Each [[endpoint]] entry must include non-empty string 'endpoint_id' in {entry_source}")
+
+        model = raw_entry.get("model")
+        if not isinstance(model, str) or not model:
+            raise ValueError(f"Endpoint '{endpoint_id}' must include non-empty string 'model' in {entry_source}")
+
+        url = raw_entry.get("url")
+        api_base_url = raw_entry.get("api_base_url")
+        if url is not None and api_base_url is not None and url != api_base_url:
+            raise ValueError(f"Conflicting values for 'url' and 'api_base_url' in {entry_source}")
+        resolved_url = url if url is not None else api_base_url
+        if resolved_url is not None and not isinstance(resolved_url, str):
+            raise ValueError(f"Endpoint '{endpoint_id}' url/api_base_url must be a string in {entry_source}")
+
+        key = raw_entry.get("key")
+        api_key_var = raw_entry.get("api_key_var")
+        if key is not None and api_key_var is not None and key != api_key_var:
+            raise ValueError(f"Conflicting values for 'key' and 'api_key_var' in {entry_source}")
+        resolved_key = key if key is not None else api_key_var
+        if resolved_key is not None and not isinstance(resolved_key, str):
+            raise ValueError(f"Endpoint '{endpoint_id}' key/api_key_var must be a string in {entry_source}")
+
+        short_client_type = raw_entry.get("type")
+        long_client_type = raw_entry.get("api_client_type")
+        if short_client_type is not None and long_client_type is not None and short_client_type != long_client_type:
+            raise ValueError(f"Conflicting values for 'type' and 'api_client_type' in {entry_source}")
+        client_type = short_client_type if short_client_type is not None else long_client_type
+        if client_type is not None and not isinstance(client_type, str):
+            raise ValueError(f"Endpoint '{endpoint_id}' api_client_type/type must be a string in {entry_source}")
+
+        endpoint: Endpoint = {"model": model}
+        if resolved_url is not None:
+            endpoint["url"] = resolved_url
+        if resolved_key is not None:
+            endpoint["key"] = resolved_key
+        if client_type is not None:
+            endpoint["api_client_type"] = cast(ClientType, client_type)
+
+        raw_headers = raw_entry.get("headers")
+        raw_extra_headers = raw_entry.get("extra_headers")
+        if raw_headers is not None and raw_extra_headers is not None:
+            raise ValueError(f"Use only one of 'headers' or 'extra_headers' in {entry_source}, not both")
+        headers = raw_headers if raw_headers is not None else raw_extra_headers
+        if headers is not None:
+            endpoint["extra_headers"] = _validate_header_mapping(headers)
+
+        endpoints.setdefault(endpoint_id, []).append(endpoint)
+
+    return endpoints
+
+
 def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides | None = None) -> EvalConfig:
     """Build an upstream ``EvalConfig`` from one loaded TOML/CLI eval mapping."""
 
@@ -133,7 +209,7 @@ def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides
     )
 
     endpoints_path = str(merged_raw.get("endpoints_path", DEFAULT_ENDPOINTS_PATH))
-    endpoints = load_endpoints(endpoints_path)
+    endpoints = _load_endpoint_registry(endpoints_path)
     model, resolved_endpoint_id, client_config = _build_client_config(merged_raw, endpoints, endpoints_path)
 
     endpoint_sampling_profiles = load_endpoint_sampling_profiles(endpoints_path)
@@ -286,10 +362,11 @@ def _build_client_config(
         endpoint_group = list(endpoints[endpoint_lookup_id])
         resolved_endpoint_id = cast(str, endpoint_lookup_id)
         endpoint = endpoint_group[0]
+        provider_cfg = PROVIDER_CONFIGS[raw_provider or DEFAULT_PROVIDER]
 
-        api_key_var = endpoint["key"]
-        api_base_url = endpoint["url"]
-        client_type = endpoint.get("api_client_type", DEFAULT_CLIENT_TYPE)
+        api_key_var = endpoint.get("key") or raw.get("default_api_key_var", provider_cfg["key"])
+        api_base_url = endpoint.get("url") or raw.get("default_api_base_url", provider_cfg["url"])
+        client_type = endpoint.get("api_client_type", provider_cfg.get("client_type", DEFAULT_CLIENT_TYPE))
 
         endpoint_models = {entry["model"] for entry in endpoint_group}
         if len(endpoint_models) > 1:
@@ -338,7 +415,13 @@ def _build_client_config(
     merged_headers = {**prime_headers, **registry_headers_base, **eval_headers_merged}
 
     endpoint_configs: list[EndpointClientConfig] = []
-    if endpoint_group is not None and not api_base_url_override and raw_provider is None and len(endpoint_group) > 1:
+    if (
+        endpoint_group is not None
+        and not api_base_url_override
+        and raw_provider is None
+        and len(endpoint_group) > 1
+        and all("url" in endpoint and "key" in endpoint for endpoint in endpoint_group)
+    ):
         endpoint_configs = [
             EndpointClientConfig(
                 api_key_var=api_key_var if api_key_override else endpoint["key"],
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 2d332739..036d282e 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -153,13 +153,14 @@ def test_toml_bench_dry_run_expands_evals_and_ablations(
 
 
 def test_repository_smoke_toml_config_dry_runs(capsys: pytest.CaptureFixture[str]) -> None:
-    exit_code = main.main(["bench", "--config", "configs/eval/smoke.toml", "--dry-run"])
+    exit_code = main.main(["bench", "--config", "configs/eval/medmarks-smoke.toml", "--dry-run"])
 
     output = capsys.readouterr().out
     assert exit_code == 0
     assert "TOML Bench Dry Run" in output
+    assert "18 eval(s) to dry-run" in output
     assert "medqa" in output
-    assert "runs/evals/openai-gpt-4.1-mini/medqa" in output
+    assert "runs/smoke/openai-gpt-4.1-mini/medqa" in output
 
 
 def test_toml_bench_dry_run_accepts_medarc_orchestrate_metadata(
@@ -206,7 +207,7 @@ def test_bench_rejects_non_toml_config(tmp_path: Path, capsys: pytest.CaptureFix
 
 def test_bench_rejects_removed_yaml_runner_flags(capsys: pytest.CaptureFixture[str]) -> None:
     with pytest.raises(SystemExit) as excinfo:
-        main.main(["bench", "--config", "configs/eval/smoke.toml", "--restart"])
+        main.main(["bench", "--config", "configs/eval/medmarks-smoke.toml", "--restart"])
 
     assert excinfo.value.code == 2
     err = capsys.readouterr().err
@@ -214,7 +215,7 @@ def test_bench_rejects_removed_yaml_runner_flags(capsys: pytest.CaptureFixture[s
 
 
 def test_repository_verified_toml_config_dry_run_shows_ablation_variants(capsys: pytest.CaptureFixture[str]) -> None:
-    exit_code = main.main(["bench", "--config", "configs/eval/medmarks-verified.toml", "--dry-run", "--eval-index", "9"])
+    exit_code = main.main(["bench", "--config", "configs/eval/medmarks-verified.toml", "--dry-run", "--eval-index", "45"])
 
     output = capsys.readouterr().out
     assert exit_code == 0
diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py
index 1615e4eb..ef32e67a 100644
--- a/tests/test_cli/test_verifiers_adapter.py
+++ b/tests/test_cli/test_verifiers_adapter.py
@@ -179,6 +179,37 @@ def test_build_eval_config_resolves_endpoint_alias_and_core_fields(tmp_path: Pat
     }
 
 
+def test_build_eval_config_supports_model_only_endpoint_alias(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "portable-alias"
+model = "org/resolved"
+
+[endpoint.sampling_args]
+temperature = 0.4
+top_p = 0.9
+""".strip()
+    )
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "model": "portable-alias",
+            "endpoints_path": str(endpoints_path),
+            "api_base_url": "https://deployment.example/v1",
+            "api_key_var": "DEPLOYMENT_KEY",
+        }
+    )
+
+    assert config.model == "org/resolved"
+    assert config.client_config.api_base_url == "https://deployment.example/v1"
+    assert config.client_config.api_key_var == "DEPLOYMENT_KEY"
+    assert config.sampling_args["temperature"] == 0.4
+    assert config.sampling_args["top_p"] == 0.9
+
+
 def test_build_eval_config_supports_endpoint_replicas(tmp_path: Path) -> None:
     endpoints_path = _write_endpoints(tmp_path / "endpoints.toml")
 
@@ -214,6 +245,7 @@ def test_build_eval_config_uses_endpoint_sampling_defaults(tmp_path: Path) -> No
 model = "openai/gpt-oss-20b"
 url = "http://localhost:8010/v1"
 key = "VLLM_API_KEY"
+api_client_type = "openai_responses"
 
 [endpoint.sampling_args]
 temperature = 1.0
@@ -226,6 +258,7 @@ def test_build_eval_config_uses_endpoint_sampling_defaults(tmp_path: Path) -> No
     config = build_eval_config({"env_id": "medqa", "model": "gpt-oss", "endpoints_path": str(endpoints_path)})
 
     assert config.model == "openai/gpt-oss-20b"
+    assert config.client_config.client_type == "openai_responses"
     assert config.sampling_args["temperature"] == 1.0
     assert config.sampling_args["top_p"] == 1.0
     assert config.sampling_args["reasoning_effort"] == "low"

From cb1e6b7a0b6373ba20d27579ab7f6411ee904c3a Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Mon, 11 May 2026 22:48:08 +0000
Subject: [PATCH 38/53] Implement subprocess env lifecycle

---
 docs/medarc-eval-bench.md                 |  37 ++++
 medarc_verifiers/cli/bench_child.py       | 112 +++++++++++
 medarc_verifiers/cli/env_lifecycle.py     | 150 +++++++++++++++
 medarc_verifiers/cli/main.py              | 216 +++++++++++++++++++++-
 medarc_verifiers/cli/upstream_eval.py     |   8 +-
 medarc_verifiers/cli/verifiers_adapter.py |  27 +++
 tests/test_cli/test_bench_child.py        | 101 ++++++++++
 tests/test_cli/test_env_lifecycle.py      |  86 +++++++++
 tests/test_cli/test_main.py               |  83 +++++++++
 9 files changed, 812 insertions(+), 8 deletions(-)
 create mode 100644 medarc_verifiers/cli/bench_child.py
 create mode 100644 medarc_verifiers/cli/env_lifecycle.py
 create mode 100644 tests/test_cli/test_bench_child.py
 create mode 100644 tests/test_cli/test_env_lifecycle.py

diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index 6f93c574..b72163b2 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -16,6 +16,9 @@ medarc-eval bench --config configs/eval/medmarks-smoke.toml --dry-run
 # Run the verified production suite
 medarc-eval bench --config configs/eval/medmarks-verified.toml
 
+# Run selected evals while installing missing local env packages as needed
+medarc-eval bench --config configs/eval/medmarks-verified.toml --install-envs
+
 # Run the verified suite against a local OpenAI-compatible server
 medarc-eval bench \
   --config configs/eval/medmarks-verified.toml \
@@ -61,6 +64,39 @@ Per-environment defaults can also live in an environment package
 explicit `num_examples` and `rollouts_per_example` values so they remain stable
 across editable and wheel installs.
 
+## Local Environment Install Lifecycle
+
+By default, TOML bench expects environment packages to already be importable in
+the active Python environment. Pass `--install-envs` when you run
+repository-local environments from `--env-dir` and want bench to install
+missing packages only for the selected evals:
+
+```bash
+medarc-eval bench \
+  --config configs/eval/medmarks-verified.toml \
+  --install-envs \
+  --eval-index "$SLURM_ARRAY_TASK_ID"
+```
+
+With `--install-envs`, the parent process loads and expands the TOML config,
+applies `--eval-index` / `--start-at` / `--stop-after`, plans deterministic
+output paths, and spawns one child subprocess per selected eval. Each child
+installs its missing environment package into the shared venv, builds the
+upstream `EvalConfig` after install, runs upstream evaluation with the
+parent-planned `resume_path`, and uninstalls only packages it installed before
+exiting.
+
+This clears Python import state between evals but still mutates the shared venv
+while each child runs. Transitive dependencies are not uninstalled, and
+concurrent `--install-envs` runs against the same venv (for example, SLURM array
+tasks sharing one venv) are unsupported. If the parent is interrupted, editable
+install metadata may be left behind; clean it up manually with `uv pip uninstall`.
+
+`--install-envs --dry-run` does not install packages or spawn child processes.
+Dry-run identity is therefore based on TOML and CLI values only; environment
+package `[tool.verifiers.eval]` defaults are resolved only during the real child
+run after install.
+
 ## Ablations and Variants
 
 Use upstream `[[ablation]]` tables to sweep values. The upstream env id stays
@@ -145,6 +181,7 @@ provider arguments pass through to upstream.
 | `--resume` | Compatibility flag; valid deterministic outputs resume automatically |
 | `--output-dir PATH` | Override the config output directory, default `runs/evals` |
 | `--env-dir PATH` | Directory containing local environments |
+| `--install-envs` | Run selected evals in per-eval subprocesses that install missing local env packages into the shared venv and clean up child-installed packages |
 | `--endpoints-path PATH` | Endpoint registry path, default `configs/endpoints.toml` |
 | `--api-base-url URL` | Override API base URL for every eval |
 | `--api-key-var NAME` | Override API key environment variable |
diff --git a/medarc_verifiers/cli/bench_child.py b/medarc_verifiers/cli/bench_child.py
new file mode 100644
index 00000000..e672fe5a
--- /dev/null
+++ b/medarc_verifiers/cli/bench_child.py
@@ -0,0 +1,112 @@
+"""Private subprocess runner for one TOML bench eval with env lifecycle."""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import sys
+import traceback
+from pathlib import Path
+from typing import Any
+
+from verifiers.utils.eval_utils import run_evaluation
+
+from medarc_verifiers.cli.env_lifecycle import (
+    EnvInstallState,
+    ensure_installed,
+    resolve_env_package,
+    uninstall_if_child_installed,
+)
+from medarc_verifiers.cli.upstream_eval import EvalConfigOverrides, build_eval_config
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Run one TOML bench eval child payload.")
+    parser.add_argument("payload", type=Path)
+    args = parser.parse_args(argv)
+    payload = json.loads(args.payload.read_text(encoding="utf-8"))
+    status = _run_payload(payload)
+    status_path = Path(payload["status_path"])
+    status_path.parent.mkdir(parents=True, exist_ok=True)
+    status_path.write_text(json.dumps(status, sort_keys=True), encoding="utf-8")
+    return int(status["exit_code"])
+
+
+def _run_payload(payload: dict[str, Any]) -> dict[str, Any]:
+    installed_state: EnvInstallState | None = None
+    eval_failed = False
+    cleanup_failed = False
+    status: dict[str, Any] = {
+        "env_id": payload.get("expected_env_id"),
+        "model": payload.get("expected_model"),
+        "installed_by_child": False,
+        "eval_ok": False,
+        "cleanup_ok": True,
+        "primary_error": None,
+        "cleanup_error": None,
+        "exit_code": 1,
+        "exit_reason": "not_started",
+    }
+
+    try:
+        ref = resolve_env_package(payload["raw_config"]["env_id"], payload["env_dir"])
+        installed_state = ensure_installed(ref)
+        status["installed_by_child"] = installed_state.installed_by_child
+
+        config = build_eval_config(payload["raw_config"], overrides=_overrides_from_payload(payload["overrides"]))
+        planned_resume_path = Path(payload["resume_path"])
+        if config.env_id != payload["expected_env_id"]:
+            raise ValueError(
+                f"Child resolved env_id {config.env_id!r}, expected {payload['expected_env_id']!r}."
+            )
+        if config.model != payload["expected_model"]:
+            raise ValueError(
+                f"Child resolved model {config.model!r}, expected {payload['expected_model']!r}."
+            )
+        config = config.model_copy(update={"resume_path": planned_resume_path, "save_results": True})
+        asyncio.run(run_evaluation(config))
+        status["eval_ok"] = True
+        status["exit_code"] = 0
+        status["exit_reason"] = "success"
+    except Exception as exc:  # noqa: BLE001
+        eval_failed = True
+        status["primary_error"] = _format_exception(exc)
+        status["exit_reason"] = "eval_failed"
+    finally:
+        try:
+            if installed_state is not None:
+                uninstall_if_child_installed(installed_state)
+        except Exception as exc:  # noqa: BLE001
+            cleanup_failed = True
+            status["cleanup_ok"] = False
+            status["cleanup_error"] = _format_exception(exc)
+
+    if cleanup_failed and not eval_failed:
+        status["exit_code"] = 1
+        status["exit_reason"] = "cleanup_failed"
+    elif eval_failed:
+        status["exit_code"] = 1
+    return status
+
+
+def _overrides_from_payload(payload: dict[str, Any]) -> EvalConfigOverrides:
+    return EvalConfigOverrides(
+        model=payload.get("model"),
+        provider=payload.get("provider"),
+        api_base_url=payload.get("api_base_url"),
+        api_key_var=payload.get("api_key_var"),
+        api_client_type=payload.get("api_client_type"),
+        endpoints_path=payload.get("endpoints_path"),
+        max_concurrent=payload.get("max_concurrent"),
+        env_args=payload.get("env_args"),
+        sampling_args=payload.get("sampling_args"),
+    )
+
+
+def _format_exception(exc: BaseException) -> str:
+    return "".join(traceback.format_exception_only(type(exc), exc)).strip()
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
diff --git a/medarc_verifiers/cli/env_lifecycle.py b/medarc_verifiers/cli/env_lifecycle.py
new file mode 100644
index 00000000..2a2ad3fe
--- /dev/null
+++ b/medarc_verifiers/cli/env_lifecycle.py
@@ -0,0 +1,150 @@
+"""Local environment package lifecycle helpers for TOML bench subprocesses."""
+
+from __future__ import annotations
+
+import importlib
+import importlib.metadata
+import importlib.util
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from verifiers.utils.import_utils import load_toml
+
+
+@dataclass(frozen=True)
+class EnvPackageRef:
+    env_id: str
+    module_name: str
+    project_name: str
+    env_path: Path
+    loader: str | None = None
+
+
+@dataclass(frozen=True)
+class EnvInstallState:
+    ref: EnvPackageRef
+    installed_by_child: bool
+    distribution_preexisting: bool
+    module_preexisting: bool
+
+
+def upstream_module_name(env_id: str) -> str:
+    return env_id.replace("-", "_").split("/")[-1]
+
+
+def resolve_env_package(env_id: str, env_dir: str | Path) -> EnvPackageRef:
+    module_name = upstream_module_name(env_id)
+    env_root = Path(env_dir).expanduser() / module_name
+    pyproject_path = env_root / "pyproject.toml"
+    if not env_root.exists():
+        raise FileNotFoundError(
+            f"Environment {env_id!r} is not installed and no local package was found at {env_root}. "
+            "Install it manually or pass --env-dir."
+        )
+    if not pyproject_path.is_file():
+        raise FileNotFoundError(
+            f"Environment {env_id!r} local package at {env_root} is missing pyproject.toml."
+        )
+
+    with pyproject_path.open("rb") as handle:
+        pyproject_data: dict[str, Any] = load_toml(handle)
+
+    project_name = pyproject_data.get("project", {}).get("name")
+    if not isinstance(project_name, str) or not project_name:
+        raise ValueError(f"Environment {env_id!r} pyproject.toml must define [project].name.")
+
+    loader = pyproject_data.get("tool", {}).get("prime", {}).get("environment", {}).get("loader")
+    if loader is not None and not isinstance(loader, str):
+        loader = None
+
+    return EnvPackageRef(
+        env_id=env_id,
+        module_name=module_name,
+        project_name=project_name,
+        env_path=env_root,
+        loader=loader,
+    )
+
+
+def inspect_install_state(ref: EnvPackageRef) -> EnvInstallState:
+    distribution_preexisting = _distribution_exists(ref.project_name)
+    module_preexisting = _module_importable(ref.module_name)
+
+    if distribution_preexisting and not module_preexisting:
+        loader_note = f" Loader metadata is {ref.loader!r}." if ref.loader else ""
+        raise ModuleNotFoundError(
+            f"Distribution {ref.project_name!r} is installed, but upstream module "
+            f"{ref.module_name!r} is not importable.{loader_note}"
+        )
+
+    return EnvInstallState(
+        ref=ref,
+        installed_by_child=False,
+        distribution_preexisting=distribution_preexisting,
+        module_preexisting=module_preexisting,
+    )
+
+
+def ensure_installed(ref: EnvPackageRef) -> EnvInstallState:
+    """Editable-install ``ref`` with uv unless already present; undo and raise if its module stays unimportable."""
+    import site  # function-local: only needed to re-scan .pth files after the install
+
+    state = inspect_install_state(ref)
+    if state.distribution_preexisting or state.module_preexisting:
+        return state
+    subprocess.run(
+        ["uv", "pip", "install", "--python", sys.executable, "-e", str(ref.env_path)],
+        check=True,
+    )
+    # .pth files are only read at interpreter startup, so re-scan site-packages for the new editable entry.
+    for site_dir in site.getsitepackages():
+        site.addsitedir(site_dir)
+    importlib.invalidate_caches()
+    if not _module_importable(ref.module_name):
+        subprocess.run(
+            ["uv", "pip", "uninstall", "--python", sys.executable, "-y", ref.project_name],
+            check=False,
+        )
+        loader_note = f" Loader metadata is {ref.loader!r}." if ref.loader else ""
+        raise ModuleNotFoundError(
+            f"Installed {ref.project_name!r} from {ref.env_path}, but upstream module "
+            f"{ref.module_name!r} is still not importable.{loader_note}"
+        )
+    return EnvInstallState(ref=ref, installed_by_child=True, distribution_preexisting=False, module_preexisting=False)
+
+
+def uninstall_if_child_installed(state: EnvInstallState) -> None:
+    if not state.installed_by_child:
+        return
+    subprocess.run(
+        ["uv", "pip", "uninstall", "--python", sys.executable, "-y", state.ref.project_name],
+        check=True,
+    )
+    importlib.invalidate_caches()
+    sys.modules.pop(state.ref.module_name, None)
+
+
+def _distribution_exists(project_name: str) -> bool:
+    try:
+        importlib.metadata.distribution(project_name)
+    except importlib.metadata.PackageNotFoundError:
+        return False
+    return True
+
+
+def _module_importable(module_name: str) -> bool:
+    return importlib.util.find_spec(module_name) is not None
+
+
+__all__ = [
+    "EnvInstallState",
+    "EnvPackageRef",
+    "ensure_installed",
+    "inspect_install_state",
+    "resolve_env_package",
+    "uninstall_if_child_installed",
+    "upstream_module_name",
+]
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index bdf753b4..b3dd9c25 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -4,13 +4,17 @@
 
 import argparse
 import asyncio
+import json
 import logging
 import os
 import shutil
+import subprocess
 import sys
+import tempfile
 from datetime import UTC, datetime
 from pathlib import Path
 from textwrap import dedent
+from types import SimpleNamespace
 from typing import Any, Literal, Mapping, Sequence
 
 import yaml
@@ -46,7 +50,12 @@
     normalize_dataset_ids,
     normalize_model_ids,
 )
-from medarc_verifiers.cli.upstream_eval import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
+from medarc_verifiers.cli.upstream_eval import (
+    EvalConfigOverrides,
+    build_eval_config,
+    build_eval_identity_payload,
+    load_toml_eval_configs,
+)
 from medarc_verifiers.utils.pathing import resolve_under
 from medarc_verifiers.cli.winrate import (
     WinrateConfig,
@@ -80,6 +89,11 @@ def build_batch_parser() -> argparse.ArgumentParser:
         default=DEFAULT_ENV_DIR,
         help="Directory containing environments (default: %(default)s).",
     )
+    parser.add_argument(
+        "--install-envs",
+        action="store_true",
+        help="Install missing local env packages for selected TOML evals in per-eval subprocesses.",
+    )
     parser.add_argument(
         "--endpoints-path",
         type=Path,
@@ -1254,6 +1268,23 @@ def _run_toml_bench(args: argparse.Namespace) -> int:
         load_toml_eval_configs(config_path, extra_valid_fields={"name", "variant_id"}),
         args,
     )
+    if args.install_envs:
+        return _run_toml_bench_with_env_lifecycle(raw_configs, args)
+
+    overrides = _toml_eval_overrides(args)
+    eval_configs = [build_eval_config(raw, overrides=overrides) for raw in raw_configs]
+    plan_inputs = [_eval_config_identity_payload(config, raw) for config, raw in zip(eval_configs, raw_configs)]
+    output_root = _resolve_toml_output_root(eval_configs, args)
+    path_plans = plan_eval_paths(plan_inputs, output_root=output_root)
+    eval_configs, path_plans, plan_inputs = _select_toml_plan(eval_configs, path_plans, plan_inputs, args)
+
+    _print_toml_bench_plan(eval_configs, path_plans, dry_run=bool(args.dry_run))
+    if args.dry_run:
+        return 0
+    return _execute_toml_plan(eval_configs, path_plans, args)
+
+
+def _toml_eval_overrides(args: argparse.Namespace) -> EvalConfigOverrides:
     overrides = EvalConfigOverrides(
         model=args.model,
         provider=args.provider,
@@ -1264,16 +1295,25 @@ def _run_toml_bench(args: argparse.Namespace) -> int:
         env_args=getattr(args, "cli_env_args", None),
         sampling_args=getattr(args, "cli_sampling_args", None),
     )
-    eval_configs = [build_eval_config(raw, overrides=overrides) for raw in raw_configs]
-    plan_inputs = [_eval_config_identity_payload(config, raw) for config, raw in zip(eval_configs, raw_configs)]
-    output_root = _resolve_toml_output_root(eval_configs, args)
+    return overrides
+
+
+def _run_toml_bench_with_env_lifecycle(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> int:
+    selected_raw = _select_toml_raw_configs(raw_configs, args)
+    overrides = _toml_eval_overrides(args)
+    plan_inputs = [build_eval_identity_payload(raw, overrides=overrides) for raw in selected_raw]
+    output_root = _resolve_toml_output_root_from_raw(selected_raw, args)
     path_plans = plan_eval_paths(plan_inputs, output_root=output_root)
-    eval_configs, path_plans, plan_inputs = _select_toml_plan(eval_configs, path_plans, plan_inputs, args)
+    display_configs = _display_configs_from_plan_inputs(plan_inputs)
 
-    _print_toml_bench_plan(eval_configs, path_plans, dry_run=bool(args.dry_run))
+    _print_toml_bench_plan(display_configs, path_plans, dry_run=bool(args.dry_run))
     if args.dry_run:
+        Console().print(
+            "[yellow]Note:[/yellow] --install-envs dry run does not install packages; "
+            "environment package defaults are not resolved unless present in TOML or CLI overrides."
+        )
         return 0
-    return _execute_toml_plan(eval_configs, path_plans, args)
+    return _execute_toml_lifecycle_plan(selected_raw, plan_inputs, path_plans, overrides, args)
 
 
 def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]:
@@ -1281,16 +1321,61 @@ def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argpa
     for raw in raw_configs:
         item = dict(raw)
         item.setdefault("save_results", True)
+        item.setdefault("env_dir_path", str(args.env_dir))
         if args.max_concurrent is None and "max_concurrent" not in item:
             item["max_concurrent"] = 1
         if args.timeout is not None:
             item["timeout"] = args.timeout
         if args.rollout_max_retries is not None:
             item["max_retries"] = args.rollout_max_retries
+        if args.verbose:
+            item["verbose"] = True
         prepared.append(item)
     return prepared
 
 
+def _select_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]:
+    """Slice raw configs by 1-based --eval-index or --start-at/--stop-after; raise ValueError if none remain."""
+    indexed = list(raw_configs)
+    if args.eval_index is not None:
+        indexed = indexed[args.eval_index - 1 : args.eval_index]
+    else:
+        if args.start_at is not None:
+            indexed = indexed[args.start_at - 1 :]
+        if args.stop_after is not None:
+            indexed = indexed[: max(args.stop_after - (args.start_at or 1) + 1, 0)]  # negative stop must not wrap
+    if not indexed:
+        raise ValueError("No TOML evals matched the requested selection.")
+    return indexed
+
+
+def _resolve_toml_output_root_from_raw(raw_configs: Sequence[Mapping[str, Any]], args: argparse.Namespace) -> Path:
+    if args.output_dir:
+        return Path(args.output_dir).expanduser()
+
+    configured_roots = {str(config["output_dir"]) for config in raw_configs if config.get("output_dir")}
+    if len(configured_roots) > 1:
+        raise ValueError(
+            "TOML bench deterministic output supports one output_dir per run; use a single global output_dir."
+        )
+    if configured_roots:
+        return Path(configured_roots.pop()).expanduser()
+    return DEFAULT_EVALS_DIR
+
+
+def _display_configs_from_plan_inputs(plan_inputs: Sequence[Mapping[str, Any]]) -> list[Any]:
+    return [
+        SimpleNamespace(
+            model=str(payload["model"]),
+            env_id=str(payload["env_id"]),
+            num_examples=payload.get("num_examples", "-"),
+            rollouts_per_example=payload.get("rollouts_per_example", "-"),
+            max_concurrent=payload.get("max_concurrent", "-"),
+        )
+        for payload in plan_inputs
+    ]
+
+
 def _resolve_toml_output_root(eval_configs: Sequence[Any], args: argparse.Namespace) -> Path:
     if args.output_dir:
         return Path(args.output_dir).expanduser()
@@ -1354,6 +1439,123 @@ def _execute_toml_plan(
     return 1 if failures else 0
 
 
+def _execute_toml_lifecycle_plan(
+    raw_configs: Sequence[Mapping[str, Any]],
+    plan_inputs: Sequence[Mapping[str, Any]],
+    path_plans: Sequence[EvalPathPlan],
+    overrides: EvalConfigOverrides,
+    args: argparse.Namespace,
+) -> int:
+    failures = 0
+    with tempfile.TemporaryDirectory(prefix="medarc-bench-env-") as temp_dir:
+        temp_root = Path(temp_dir)
+        for index, (raw, plan_input, path_plan) in enumerate(zip(raw_configs, plan_inputs, path_plans), start=1):
+            try:
+                _prepare_toml_results_dir(path_plan.results_path, force=bool(args.force))
+                payload_path = temp_root / f"eval-{index}.json"
+                status_path = temp_root / f"eval-{index}-status.json"
+                payload = _bench_child_payload(
+                    raw,
+                    plan_input,
+                    path_plan,
+                    overrides,
+                    args,
+                    status_path=status_path,
+                )
+                payload_path.write_text(json.dumps(payload, sort_keys=True, default=str), encoding="utf-8")
+                logger.info(
+                    "Running TOML eval %d/%d in subprocess: %s on %s",
+                    index,
+                    len(raw_configs),
+                    plan_input["env_id"],
+                    plan_input["model"],
+                )
+                completed = subprocess.run(
+                    [sys.executable, "-m", "medarc_verifiers.cli.bench_child", str(payload_path)],
+                    check=False,
+                )
+                status = _load_child_status(status_path)
+                if completed.returncode != 0 or int(status.get("exit_code", 1)) != 0:
+                    failures += 1
+                    logger.error(
+                        "TOML eval %d failed in subprocess: %s",
+                        index,
+                        status.get("primary_error") or status.get("cleanup_error") or status.get("exit_reason"),
+                    )
+                    if not args.continue_on_error:
+                        return 1
+            except Exception as exc:  # noqa: BLE001
+                failures += 1
+                logger.exception("TOML eval %d failed: %s", index, exc)
+                if not args.continue_on_error:
+                    return 1
+            if args.sleep and index < len(raw_configs):
+                import time
+
+                time.sleep(float(args.sleep))
+    return 1 if failures else 0
+
+
+def _bench_child_payload(
+    raw: Mapping[str, Any],
+    plan_input: Mapping[str, Any],
+    path_plan: EvalPathPlan,
+    overrides: EvalConfigOverrides,
+    args: argparse.Namespace,
+    *,
+    status_path: Path,
+) -> dict[str, Any]:
+    return {
+        "raw_config": _jsonable_mapping(raw),
+        "overrides": _jsonable_mapping(_overrides_payload(overrides)),
+        "env_dir": str(Path(args.env_dir).expanduser()),
+        "resume_path": str(path_plan.results_path),
+        "status_path": str(status_path),
+        "expected_env_id": str(plan_input["env_id"]),
+        "expected_model": str(plan_input["model"]),
+    }
+
+
+def _overrides_payload(overrides: EvalConfigOverrides) -> dict[str, Any]:
+    return {
+        "model": overrides.model,
+        "provider": overrides.provider,
+        "api_base_url": overrides.api_base_url,
+        "api_key_var": overrides.api_key_var,
+        "api_client_type": overrides.api_client_type,
+        "endpoints_path": str(overrides.endpoints_path) if overrides.endpoints_path is not None else None,
+        "max_concurrent": overrides.max_concurrent,
+        "env_args": dict(overrides.env_args or {}),
+        "sampling_args": dict(overrides.sampling_args or {}),
+    }
+
+
+def _jsonable_mapping(value: Mapping[str, Any]) -> dict[str, Any]:
+    result: dict[str, Any] = {}
+    for key, item in value.items():
+        if isinstance(item, Path):
+            result[key] = str(item)
+        elif isinstance(item, Mapping):
+            result[key] = _jsonable_mapping(item)
+        elif isinstance(item, list):
+            result[key] = [str(element) if isinstance(element, Path) else element for element in item]
+        else:
+            result[key] = item
+    return result
+
+
+def _load_child_status(status_path: Path) -> dict[str, Any]:
+    if not status_path.is_file():
+        raise RuntimeError(f"Bench child did not write status file: {status_path}")
+    try:
+        status = json.loads(status_path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise RuntimeError(f"Bench child wrote malformed status file {status_path}: {exc}") from exc
+    if not isinstance(status, dict):
+        raise RuntimeError(f"Bench child wrote non-object status file: {status_path}")
+    return status
+
+
 async def _run_one_toml_eval(config: Any) -> Any:
     return await run_evaluation(config)
 
diff --git a/medarc_verifiers/cli/upstream_eval.py b/medarc_verifiers/cli/upstream_eval.py
index dc50fb8c..eaf7e5c6 100644
--- a/medarc_verifiers/cli/upstream_eval.py
+++ b/medarc_verifiers/cli/upstream_eval.py
@@ -9,10 +9,16 @@
 
 from __future__ import annotations
 
-from medarc_verifiers.cli.verifiers_adapter import EvalConfigOverrides, build_eval_config, load_toml_eval_configs
+from medarc_verifiers.cli.verifiers_adapter import (
+    EvalConfigOverrides,
+    build_eval_config,
+    build_eval_identity_payload,
+    load_toml_eval_configs,
+)
 
 __all__ = [
     "EvalConfigOverrides",
     "build_eval_config",
+    "build_eval_identity_payload",
     "load_toml_eval_configs",
 ]
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index e3b417f0..1049ff62 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -253,6 +253,32 @@ def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides
     return EvalConfig(**eval_config_kwargs)
 
 
+def build_eval_identity_payload(
+    raw: Mapping[str, Any], *, overrides: EvalConfigOverrides | None = None
+) -> dict[str, Any]:
+    """Resolve TOML eval identity without importing the environment package."""
+
+    merged_raw = _apply_overrides(dict(raw), overrides)
+    endpoints_path = str(merged_raw.get("endpoints_path", DEFAULT_ENDPOINTS_PATH))
+    endpoints = load_endpoints(endpoints_path)
+    model, _resolved_endpoint_id, _client_config = _build_client_config(merged_raw, endpoints, endpoints_path)
+
+    payload = {
+        "env_args": dict(merged_raw.get("env_args", {})),
+        "env_id": merged_raw["env_id"],
+        "model": model,
+        "num_examples": merged_raw.get("num_examples", DEFAULT_NUM_EXAMPLES),
+        "rollouts_per_example": merged_raw.get("rollouts_per_example", DEFAULT_ROLLOUTS_PER_EXAMPLE),
+        "max_concurrent": merged_raw.get("max_concurrent", DEFAULT_MAX_CONCURRENT),
+        "sampling_args": dict(merged_raw.get("sampling_args", {})),
+    }
+    if "variant_id" in raw:
+        payload["variant_id"] = raw["variant_id"]
+    if "name" in raw:
+        payload["name"] = raw["name"]
+    return payload
+
+
 def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
     """Read ``[tool.verifiers.eval]`` defaults from an installed env package."""
 
@@ -606,6 +632,7 @@ def _validate_header_mapping(value: object) -> dict[str, str]:
     "DEFAULT_ROLLOUTS_PER_EXAMPLE",
     "EvalConfigOverrides",
     "build_eval_config",
+    "build_eval_identity_payload",
     "get_env_eval_defaults",
     "load_toml_eval_configs",
 ]
diff --git a/tests/test_cli/test_bench_child.py b/tests/test_cli/test_bench_child.py
new file mode 100644
index 00000000..0cd052a2
--- /dev/null
+++ b/tests/test_cli/test_bench_child.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+
+from medarc_verifiers.cli import bench_child
+from medarc_verifiers.cli.env_lifecycle import EnvInstallState, EnvPackageRef
+
+
+def _payload(tmp_path: Path) -> dict:
+    return {
+        "raw_config": {"env_id": "medqa", "model": "parent-model"},
+        "overrides": {},
+        "env_dir": str(tmp_path / "envs"),
+        "resume_path": str(tmp_path / "runs" / "evals" / "parent-model" / "medqa" / "base"),
+        "status_path": str(tmp_path / "status.json"),
+        "expected_env_id": "medqa",
+        "expected_model": "parent-model",
+    }
+
+
+def _state(installed_by_child: bool) -> EnvInstallState:
+    ref = EnvPackageRef("medqa", "medqa", "medqa", Path("envs/medqa"), None)
+    return EnvInstallState(ref, installed_by_child, False, False)
+
+
+def test_child_installs_builds_runs_and_cleans_up(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    calls: list[str] = []
+    config = SimpleNamespace(
+        env_id="medqa",
+        model="parent-model",
+        model_copy=lambda update: SimpleNamespace(env_id="medqa", model="parent-model", **update),
+    )
+
+    monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: object())
+    monkeypatch.setattr(bench_child, "ensure_installed", lambda ref: calls.append("install") or _state(True))
+    monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: calls.append("build") or config)
+    monkeypatch.setattr(bench_child, "uninstall_if_child_installed", lambda state: calls.append("cleanup"))
+
+    async def fake_run_evaluation(run_config):
+        calls.append(f"run:{run_config.resume_path}")
+
+    monkeypatch.setattr(bench_child, "run_evaluation", fake_run_evaluation)
+
+    status = bench_child._run_payload(_payload(tmp_path))
+
+    assert status["exit_code"] == 0
+    assert status["installed_by_child"] is True
+    assert calls == ["install", "build", f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}", "cleanup"]
+
+
+def test_child_install_failure_does_not_build_or_cleanup(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    calls: list[str] = []
+
+    monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: object())
+
+    def fail_install(ref):
+        calls.append("install")
+        raise RuntimeError("install failed")
+
+    monkeypatch.setattr(bench_child, "ensure_installed", fail_install)
+    monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: calls.append("build"))
+    monkeypatch.setattr(bench_child, "uninstall_if_child_installed", lambda state: calls.append("cleanup"))
+
+    status = bench_child._run_payload(_payload(tmp_path))
+
+    assert status["exit_code"] == 1
+    assert status["exit_reason"] == "eval_failed"
+    assert "install failed" in status["primary_error"]
+    assert calls == ["install"]
+
+
+def test_child_cleanup_failure_after_success_is_fatal(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config = SimpleNamespace(
+        env_id="medqa",
+        model="parent-model",
+        model_copy=lambda update: SimpleNamespace(env_id="medqa", model="parent-model", **update),
+    )
+
+    monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: object())
+    monkeypatch.setattr(bench_child, "ensure_installed", lambda ref: _state(True))
+    monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: config)
+
+    async def fake_run_evaluation(run_config):
+        return None
+
+    monkeypatch.setattr(bench_child, "run_evaluation", fake_run_evaluation)
+
+    def fail_cleanup(state):
+        raise RuntimeError("cleanup failed")
+
+    monkeypatch.setattr(bench_child, "uninstall_if_child_installed", fail_cleanup)
+
+    status = bench_child._run_payload(_payload(tmp_path))
+
+    assert status["eval_ok"] is True
+    assert status["cleanup_ok"] is False
+    assert status["exit_code"] == 1
+    assert status["exit_reason"] == "cleanup_failed"
diff --git a/tests/test_cli/test_env_lifecycle.py b/tests/test_cli/test_env_lifecycle.py
new file mode 100644
index 00000000..5e37ffcb
--- /dev/null
+++ b/tests/test_cli/test_env_lifecycle.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+
+from medarc_verifiers.cli import env_lifecycle
+
+
+def _write_env(root: Path, folder: str, *, project_name: str, loader: str | None = None) -> Path:
+    """Create ``root/folder`` holding a minimal pyproject.toml and return the new directory."""
+    env_dir = root / folder
+    env_dir.mkdir(parents=True)
+
+    # The [tool.prime.environment] table is written only when a loader is supplied.
+    loader_table = "" if loader is None else f'\n[tool.prime.environment]\nloader = "{loader}"\n'
+    pyproject_text = f'[project]\nname = "{project_name}"\n{loader_table}'
+
+    (env_dir / "pyproject.toml").write_text(pyproject_text, encoding="utf-8")
+    return env_dir
+
+
+def test_resolve_env_package_uses_upstream_module_and_project_name(tmp_path: Path) -> None:
+    env_path = _write_env(tmp_path, "head_qa_v2", project_name="head-qa-v2", loader="other:load_environment")
+    # Resolve an owner-prefixed env id against the directory written above.
+    ref = env_lifecycle.resolve_env_package("owner/head-qa-v2", tmp_path)
+
+    assert ref.env_id == "owner/head-qa-v2"
+    assert ref.module_name == "head_qa_v2"
+    assert ref.project_name == "head-qa-v2"
+    assert ref.env_path == env_path
+    assert ref.loader == "other:load_environment"
+
+
+def test_resolve_env_package_errors_for_missing_pyproject(tmp_path: Path) -> None:
+    (tmp_path / "medqa").mkdir()
+    # The env directory exists but contains no pyproject.toml.
+    with pytest.raises(FileNotFoundError, match="missing pyproject.toml"):
+        env_lifecycle.resolve_env_package("medqa", tmp_path)
+
+
+def test_inspect_install_state_rejects_installed_distribution_without_module(monkeypatch: pytest.MonkeyPatch) -> None:
+    ref = env_lifecycle.EnvPackageRef("medqa", "medqa", "medqa", Path("envs/medqa"), None)
+    monkeypatch.setattr(env_lifecycle, "_distribution_exists", lambda name: True)
+    monkeypatch.setattr(env_lifecycle, "_module_importable", lambda name: False)
+    # Stubbed state: the distribution is reported as present while its module is not importable.
+    with pytest.raises(ModuleNotFoundError, match="upstream module 'medqa' is not importable"):
+        env_lifecycle.inspect_install_state(ref)
+
+
+def test_ensure_installed_installs_missing_package(monkeypatch: pytest.MonkeyPatch) -> None:
+    ref = env_lifecycle.EnvPackageRef("medqa", "medqa", "medqa", Path("envs/medqa"), None)
+    calls: list[list[str]] = []
+    importable = [False, True]  # answers handed out, in order, by the _module_importable stub below
+    # Stub every external effect: distribution lookup, import probe, cache invalidation and subprocess.
+    monkeypatch.setattr(env_lifecycle, "_distribution_exists", lambda name: False)
+    monkeypatch.setattr(env_lifecycle, "_module_importable", lambda name: importable.pop(0))
+    monkeypatch.setattr(env_lifecycle.importlib, "invalidate_caches", lambda: None)
+    monkeypatch.setattr(
+        env_lifecycle.subprocess,
+        "run",
+        lambda cmd, check: calls.append(cmd) or SimpleNamespace(returncode=0),
+    )
+
+    state = env_lifecycle.ensure_installed(ref)
+    # Only the command prefix is pinned; the remaining uv arguments are not checked here.
+    assert state.installed_by_child is True
+    assert calls[0][:4] == ["uv", "pip", "install", "--python"]
+
+
+def test_uninstall_only_child_installed_packages(monkeypatch: pytest.MonkeyPatch) -> None:
+    ref = env_lifecycle.EnvPackageRef("medqa", "medqa", "medqa", Path("envs/medqa"), None)
+    state = env_lifecycle.EnvInstallState(ref, True, False, False)
+    calls: list[list[str]] = []
+    # NOTE(review): the EnvInstallState flags are positional; presumably the first is installed_by_child - confirm.
+    monkeypatch.setattr(env_lifecycle.importlib, "invalidate_caches", lambda: None)
+    monkeypatch.setattr(
+        env_lifecycle.subprocess,
+        "run",
+        lambda cmd, check: calls.append(cmd) or SimpleNamespace(returncode=0),
+    )
+
+    env_lifecycle.uninstall_if_child_installed(state)
+    # Only the command prefix is pinned; the remaining uv arguments are not checked here.
+    assert calls[0][:4] == ["uv", "pip", "uninstall", "--python"]
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 036d282e..9b980f28 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -259,6 +259,89 @@ def test_toml_bench_dry_run_model_override(
     assert "config-model" not in output
 
 
+def test_toml_bench_install_envs_dry_run_does_not_build_configs_or_spawn(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "missing-env"
+        num_examples = 1
+        rollouts_per_example = 1
+        """,
+    )
+    monkeypatch.setattr(main, "build_eval_config", lambda raw, overrides: pytest.fail("parent built EvalConfig"))
+    monkeypatch.setattr(main.subprocess, "run", lambda *args, **kwargs: pytest.fail("parent spawned child"))
+    # The stubs above fail the test if the parent builds an EvalConfig or spawns a subprocess.
+    exit_code = main.main(["bench", "--config", str(config_path), "--install-envs", "--dry-run"])
+
+    output = capsys.readouterr().out
+    assert exit_code == 0
+    assert "missing-env" in output
+    assert "does not install packages" in output
+
+
+def test_toml_bench_install_envs_executes_selected_child_payload(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "bad-unselected"
+        num_examples = 1
+        rollouts_per_example = 1
+
+        [[eval]]
+        env_id = "selected-env"
+        num_examples = 2
+        rollouts_per_example = 1
+        """,
+    )
+    payloads: list[dict[str, Any]] = []
+    # Stand-in for subprocess.run: records the payload file it is given and writes a passing status file.
+    def fake_run(cmd, check=False):
+        payload = json.loads(Path(cmd[-1]).read_text(encoding="utf-8"))
+        payloads.append(payload)
+        Path(payload["status_path"]).write_text(json.dumps({"exit_code": 0}), encoding="utf-8")
+        return SimpleNamespace(returncode=0)
+
+    monkeypatch.setattr(main.subprocess, "run", fake_run)
+
+    exit_code = main.main(
+        [
+            "bench",
+            "--config",
+            str(config_path),
+            "--install-envs",
+            "--eval-index",
+            "2",
+            "--output-dir",
+            str(output_dir),
+        ]
+    )
+    # --eval-index 2 selects the second [[eval]] entry; exactly one payload may reach the child.
+    assert exit_code == 0
+    assert len(payloads) == 1
+    payload = payloads[0]
+    assert payload["raw_config"]["env_id"] == "selected-env"
+    assert payload["expected_env_id"] == "selected-env"
+    assert payload["expected_model"] == "gpt-5-mini"
+    assert payload["resume_path"] == str(output_dir / "gpt-5-mini" / "selected-env" / "base")
+    assert (output_dir / "gpt-5-mini" / "selected-env" / "base").is_dir()
+
+
 def test_toml_bench_dry_run_uses_toml_output_dir(
     tmp_path: Path,
     capsys: pytest.CaptureFixture[str],

From d54b2f0d165450895c2e5b60012e2d0994be23b4 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 15:38:47 +0000
Subject: [PATCH 39/53] Isolate missing bench env installs

---
 medarc_verifiers/cli/bench_child.py  |  11 +-
 medarc_verifiers/cli/isolated_env.py | 143 +++++++++++++
 medarc_verifiers/cli/main.py         | 308 ++++++++++++++-------------
 tests/test_cli/test_bench_child.py   |  55 +++++
 tests/test_cli/test_isolated_env.py  | 117 ++++++++++
 tests/test_cli/test_main.py          | 295 +++++++++++++++++++++++--
 6 files changed, 763 insertions(+), 166 deletions(-)
 create mode 100644 medarc_verifiers/cli/isolated_env.py
 create mode 100644 tests/test_cli/test_isolated_env.py

diff --git a/medarc_verifiers/cli/bench_child.py b/medarc_verifiers/cli/bench_child.py
index e672fe5a..feecb9b7 100644
--- a/medarc_verifiers/cli/bench_child.py
+++ b/medarc_verifiers/cli/bench_child.py
@@ -50,9 +50,12 @@ def _run_payload(payload: dict[str, Any]) -> dict[str, Any]:
     }
 
     try:
-        ref = resolve_env_package(payload["raw_config"]["env_id"], payload["env_dir"])
-        installed_state = ensure_installed(ref)
-        status["installed_by_child"] = installed_state.installed_by_child
+        if payload.get("env_preinstalled", False):
+            status["installed_by_child"] = False
+        else:
+            ref = resolve_env_package(payload["raw_config"]["env_id"], payload["env_dir"])
+            installed_state = ensure_installed(ref)
+            status["installed_by_child"] = installed_state.installed_by_child
 
         config = build_eval_config(payload["raw_config"], overrides=_overrides_from_payload(payload["overrides"]))
         planned_resume_path = Path(payload["resume_path"])
@@ -75,7 +78,7 @@ def _run_payload(payload: dict[str, Any]) -> dict[str, Any]:
         status["exit_reason"] = "eval_failed"
     finally:
         try:
-            if installed_state is not None:
+            if installed_state is not None and payload.get("cleanup_env_package", True):
                 uninstall_if_child_installed(installed_state)
         except Exception as exc:  # noqa: BLE001
             cleanup_failed = True
diff --git a/medarc_verifiers/cli/isolated_env.py b/medarc_verifiers/cli/isolated_env.py
new file mode 100644
index 00000000..e6ce0326
--- /dev/null
+++ b/medarc_verifiers/cli/isolated_env.py
@@ -0,0 +1,143 @@
+"""Temporary virtual environment helpers for isolated TOML bench evals."""
+
+from __future__ import annotations
+
+import json
+import shutil
+import subprocess
+import sys
+import tempfile
+from contextlib import contextmanager
+from dataclasses import dataclass
+from importlib import metadata
+from pathlib import Path
+from typing import Iterator
+from urllib.parse import unquote, urlparse
+
+
+class IsolatedEnvError(RuntimeError):
+    """Raised when an isolated bench venv cannot be prepared (uv missing or failing, unusable install metadata)."""
+
+
+@dataclass(frozen=True)
+class MedarcInstallSpec:  # how the running medarc-verifiers distribution was installed
+    editable: bool  # True when direct_url.json marks the install as editable
+    version: str  # installed distribution version; used to pin non-editable installs
+    checkout_root: Path | None = None  # resolved local checkout directory; set only for editable installs
+
+
+def venv_python_path(venv_path: Path) -> Path:
+    """Return ``bin/python`` under *venv_path* when it exists, otherwise ``Scripts/python.exe`` (unchecked)."""
+    unix_python = venv_path / "bin" / "python"
+    windows_python = venv_path / "Scripts" / "python.exe"
+    return unix_python if unix_python.exists() else windows_python
+
+
+def current_medarc_install_spec() -> MedarcInstallSpec:
+    """Describe how medarc-verifiers is installed (editable checkout or pinned version)."""
+    try:
+        dist = metadata.distribution("medarc-verifiers")
+    except metadata.PackageNotFoundError as exc:
+        raise IsolatedEnvError("Cannot auto-install isolated envs because medarc-verifiers is not installed.") from exc
+    direct_url_text = dist.read_text("direct_url.json")
+    if direct_url_text:
+        try:
+            direct_url = json.loads(direct_url_text)
+        except json.JSONDecodeError as exc:
+            raise IsolatedEnvError("Installed medarc-verifiers has malformed direct_url.json metadata.") from exc
+        if direct_url.get("dir_info", {}).get("editable"):
+            url = direct_url.get("url")
+            parsed = urlparse(url) if isinstance(url, str) else None
+            if parsed is None or parsed.scheme != "file":
+                raise IsolatedEnvError("Editable medarc-verifiers install does not point at a local file:// checkout.")
+            drive_slash = sys.platform == "win32" and parsed.path[2:3] == ":"  # "/C:/repo" -> "C:/repo"
+            checkout_root = Path(unquote(parsed.path)[int(drive_slash) :]).expanduser().resolve()
+            _validate_editable_checkout(checkout_root)
+            return MedarcInstallSpec(editable=True, version=dist.version, checkout_root=checkout_root)
+    return MedarcInstallSpec(editable=False, version=dist.version)
+
+
+@contextmanager
+def temporary_bench_venv(repo_root: Path | None = None) -> Iterator[Path]:  # yields the venv's Python executable
+    temp_root = Path(tempfile.mkdtemp(prefix="medarc-bench-venv-"))
+    try:
+        python_executable = _create_venv(temp_root)
+        install_medarc_into_venv(python_executable, repo_root=repo_root)
+        yield python_executable
+    finally:
+        shutil.rmtree(temp_root, ignore_errors=True)  # best-effort cleanup; removal failures are ignored
+
+
+def install_medarc_into_venv(python_executable: Path, *, repo_root: Path | None = None) -> None:
+    """Install medarc-verifiers into the venv the same way it is installed here (editable or pinned)."""
+    install_spec = current_medarc_install_spec()
+    uv_install = ["uv", "pip", "install", "--python", str(python_executable)]
+    if not install_spec.editable:
+        pinned = f"medarc-verifiers=={install_spec.version}"
+        try:
+            _run_uv([*uv_install, pinned], f"install {pinned} into isolated venv")
+        except IsolatedEnvError as exc:
+            raise IsolatedEnvError(
+                f"Could not resolve {pinned} for isolated auto-install. Run from an editable checkout, "
+                "or preinstall environment packages and pass --no-auto-install."
+            ) from exc
+        return
+    # An explicit repo_root wins over the checkout recorded in the install metadata.
+    source_root = repo_root or install_spec.checkout_root
+    if source_root is None:
+        raise IsolatedEnvError("Editable medarc-verifiers checkout path could not be resolved.")
+    _validate_editable_checkout(source_root)
+    _run_uv([*uv_install, "-e", str(source_root)], "install editable medarc-verifiers into isolated venv")
+
+
+def install_env_package(python_executable: Path, env_path: Path) -> None:
+    """Editable-install the environment package at *env_path* using *python_executable* as uv's target."""
+    uv_command = ["uv", "pip", "install", "--python", str(python_executable), "-e", str(env_path)]
+    action = f"install environment package {env_path} into isolated venv"
+    _run_uv(uv_command, action)
+
+
+def _create_venv(venv_path: Path) -> Path:
+    """Create a uv venv at *venv_path* based on ``sys.executable`` and return its Python executable."""
+    _run_uv(["uv", "venv", "--python", sys.executable, str(venv_path)], "create isolated bench venv")
+    if (interpreter := venv_python_path(venv_path)).exists():
+        return interpreter
+    raise IsolatedEnvError(f"uv created {venv_path}, but no Python executable was found in it.")
+
+
+def _validate_editable_checkout(checkout_root: Path) -> None:
+    """Raise IsolatedEnvError unless *checkout_root* holds pyproject.toml and a medarc_verifiers/ directory."""
+    has_layout = (checkout_root / "pyproject.toml").is_file() and (checkout_root / "medarc_verifiers").is_dir()
+    if not has_layout:
+        expected = "expected pyproject.toml and medarc_verifiers/."
+        raise IsolatedEnvError(f"Editable medarc-verifiers checkout at {checkout_root} is invalid; {expected}")
+
+
+def _run_uv(command: list[str], action: str) -> None:
+    """Run a uv *command*; raise IsolatedEnvError naming *action* if uv is missing or exits non-zero."""
+    try:
+        result = subprocess.run(command, check=False, capture_output=True, text=True)
+    except FileNotFoundError as exc:
+        raise IsolatedEnvError(f"Cannot {action}: uv is not installed or not on PATH.") from exc
+    if result.returncode == 0:
+        return
+    detail = "\n".join(filter(None, map(_tail, (result.stderr, result.stdout))))  # stderr first, blanks dropped
+    raise IsolatedEnvError(f"Failed to {action} with exit code {result.returncode}.\n{detail}".rstrip())
+
+
+def _tail(text: str, *, lines: int = 20) -> str:
+    """Return the last *lines* lines of *text* after stripping surrounding whitespace ("" when blank)."""
+    # splitlines() on an empty string yields no lines, so blank input joins to "".
+    trimmed_lines = text.strip().splitlines()
+    return "\n".join(trimmed_lines[-lines:])
+
+
+__all__ = [
+    "IsolatedEnvError",
+    "MedarcInstallSpec",
+    "current_medarc_install_spec",
+    "install_env_package",
+    "install_medarc_into_venv",
+    "temporary_bench_venv",
+    "venv_python_path",
+]
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index b3dd9c25..3af62656 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -4,6 +4,7 @@
 
 import argparse
 import asyncio
+import importlib.util
 import json
 import logging
 import os
@@ -42,6 +43,8 @@
     plan_eval_paths,
 )
 from medarc_verifiers.cli.hf import HFSyncConfig, sync_files_to_hub
+from medarc_verifiers.cli.env_lifecycle import EnvPackageRef, resolve_env_package, upstream_module_name
+from medarc_verifiers.cli.isolated_env import install_env_package, temporary_bench_venv
 from medarc_verifiers.cli.process import ProcessOptions, ProcessResult, run_process
 from medarc_verifiers.cli.utils.config_io import load_mapping_file
 from medarc_verifiers.cli.utils.overrides import build_cli_override
@@ -89,10 +92,19 @@ def build_batch_parser() -> argparse.ArgumentParser:
         default=DEFAULT_ENV_DIR,
         help="Directory containing environments (default: %(default)s).",
     )
-    parser.add_argument(
-        "--install-envs",
+    auto_install_group = parser.add_mutually_exclusive_group()
+    auto_install_group.add_argument(
+        "--auto-install",
+        dest="auto_install",
         action="store_true",
-        help="Install missing local env packages for selected TOML evals in per-eval subprocesses.",
+        default=True,
+        help="Auto-install missing local env packages in isolated temporary venvs (default).",
+    )
+    auto_install_group.add_argument(
+        "--no-auto-install",
+        dest="auto_install",
+        action="store_false",
+        help="Require selected environment packages to already be importable in the active Python environment.",
     )
     parser.add_argument(
         "--endpoints-path",
@@ -1268,20 +1280,17 @@ def _run_toml_bench(args: argparse.Namespace) -> int:
         load_toml_eval_configs(config_path, extra_valid_fields={"name", "variant_id"}),
         args,
     )
-    if args.install_envs:
-        return _run_toml_bench_with_env_lifecycle(raw_configs, args)
-
-    overrides = _toml_eval_overrides(args)
-    eval_configs = [build_eval_config(raw, overrides=overrides) for raw in raw_configs]
-    plan_inputs = [_eval_config_identity_payload(config, raw) for config, raw in zip(eval_configs, raw_configs)]
-    output_root = _resolve_toml_output_root(eval_configs, args)
-    path_plans = plan_eval_paths(plan_inputs, output_root=output_root)
-    eval_configs, path_plans, plan_inputs = _select_toml_plan(eval_configs, path_plans, plan_inputs, args)
-
-    _print_toml_bench_plan(eval_configs, path_plans, dry_run=bool(args.dry_run))
+    selected_raw, plan_inputs, path_plans, overrides = _plan_selected_toml_raw_configs(raw_configs, args)
+    display_configs = _display_configs_from_plan_inputs(plan_inputs)
+    missing_envs = _missing_selected_env_refs(plan_inputs, args)
+    _print_toml_bench_plan(display_configs, path_plans, dry_run=bool(args.dry_run))
+    if missing_envs and args.auto_install:
+        _print_auto_install_warning(missing_envs, dry_run=bool(args.dry_run))
     if args.dry_run:
         return 0
-    return _execute_toml_plan(eval_configs, path_plans, args)
+    if missing_envs and not args.auto_install:
+        raise RuntimeError(_missing_envs_error(missing_envs))
+    return _execute_selected_toml_plan(selected_raw, plan_inputs, path_plans, overrides, args, missing_envs=missing_envs)
 
 
 def _toml_eval_overrides(args: argparse.Namespace) -> EvalConfigOverrides:
@@ -1298,22 +1307,56 @@ def _toml_eval_overrides(args: argparse.Namespace) -> EvalConfigOverrides:
     return overrides
 
 
-def _run_toml_bench_with_env_lifecycle(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> int:
+def _plan_selected_toml_raw_configs(
+    raw_configs: Sequence[dict[str, Any]],
+    args: argparse.Namespace,
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[EvalPathPlan], EvalConfigOverrides]:
     selected_raw = _select_toml_raw_configs(raw_configs, args)
     overrides = _toml_eval_overrides(args)
     plan_inputs = [build_eval_identity_payload(raw, overrides=overrides) for raw in selected_raw]
     output_root = _resolve_toml_output_root_from_raw(selected_raw, args)
     path_plans = plan_eval_paths(plan_inputs, output_root=output_root)
-    display_configs = _display_configs_from_plan_inputs(plan_inputs)
+    return selected_raw, plan_inputs, path_plans, overrides
 
-    _print_toml_bench_plan(display_configs, path_plans, dry_run=bool(args.dry_run))
-    if args.dry_run:
-        Console().print(
-            "[yellow]Note:[/yellow] --install-envs dry run does not install packages; "
-            "environment package defaults are not resolved unless present in TOML or CLI overrides."
-        )
-        return 0
-    return _execute_toml_lifecycle_plan(selected_raw, plan_inputs, path_plans, overrides, args)
+
+def _missing_selected_env_refs(
+    plan_inputs: Sequence[Mapping[str, Any]],
+    args: argparse.Namespace,
+) -> dict[str, EnvPackageRef]:
+    """Map each selected env_id whose upstream module is not importable to its resolved local package."""
+    unresolved: dict[str, EnvPackageRef] = {}
+    for env_id in (str(entry["env_id"]) for entry in plan_inputs):
+        already_known = env_id in unresolved
+        # The importability probe runs for every entry, including repeats of an env_id.
+        if not _module_importable(upstream_module_name(env_id)) and not already_known:
+            unresolved[env_id] = resolve_env_package(env_id, args.env_dir)
+    return unresolved
+
+
+def _print_auto_install_warning(missing_envs: Mapping[str, EnvPackageRef], *, dry_run: bool) -> None:
+    verb = "would auto-install" if dry_run else "will auto-install"  # dry runs only describe the install
+    console = Console(stderr=True)  # the warning is printed to stderr
+    console.print(
+        f"[yellow]Warning:[/yellow] {len(missing_envs)} selected environment package(s) are not installed "
+        "in the active Python environment."
+    )
+    console.print(f"MedARC {verb} missing local envs in isolated temporary venvs for this run.")
+    console.print("Preinstall envs with vf-install or pass --no-auto-install to require installed packages.")
+    for env_id, ref in missing_envs.items():  # one bullet per missing env with its local package path
+        console.print(f"  - {env_id}: {ref.env_path}")
+
+
+def _missing_envs_error(missing_envs: Mapping[str, EnvPackageRef]) -> str:
+    """Build the multi-line error text listing every env package that is not importable."""
+    header = "Selected environment packages are not importable and --no-auto-install was passed:"
+    footer = "Preinstall envs with vf-install or rerun without --no-auto-install."
+    bullets = (f"- {env_id}: {ref.env_path}" for env_id, ref in missing_envs.items())
+    # One bullet per missing env, framed by the explanation and the remediation hint.
+    return "\n".join([header, *bullets, footer])
+
+
+def _module_importable(module_name: str) -> bool:  # True when find_spec locates a spec for module_name
+    return importlib.util.find_spec(module_name) is not None
 
 
 def _prepare_toml_raw_configs(raw_configs: Sequence[dict[str, Any]], args: argparse.Namespace) -> list[dict[str, Any]]:
@@ -1376,124 +1419,101 @@ def _display_configs_from_plan_inputs(plan_inputs: Sequence[Mapping[str, Any]])
     ]
 
 
-def _resolve_toml_output_root(eval_configs: Sequence[Any], args: argparse.Namespace) -> Path:
-    if args.output_dir:
-        return Path(args.output_dir).expanduser()
-
-    configured_roots = {str(config.output_dir) for config in eval_configs if config.output_dir}
-    if len(configured_roots) > 1:
-        raise ValueError(
-            "TOML bench deterministic output supports one output_dir per run; use a single global output_dir."
-        )
-    if configured_roots:
-        return Path(configured_roots.pop()).expanduser()
-    return DEFAULT_EVALS_DIR
-
-
-def _select_toml_plan(
-    eval_configs: Sequence[Any],
-    path_plans: Sequence[EvalPathPlan],
+def _execute_selected_toml_plan(
+    raw_configs: Sequence[Mapping[str, Any]],
     plan_inputs: Sequence[Mapping[str, Any]],
-    args: argparse.Namespace,
-) -> tuple[list[Any], list[EvalPathPlan], list[Mapping[str, Any]]]:
-    indexed = list(zip(eval_configs, path_plans, plan_inputs))
-    if args.eval_index is not None:
-        start = args.eval_index - 1
-        indexed = indexed[start : start + 1]
-    else:
-        if args.start_at is not None:
-            indexed = indexed[args.start_at - 1 :]
-        if args.stop_after is not None:
-            indexed = indexed[: args.stop_after - (args.start_at or 1) + 1]
-    if not indexed:
-        raise ValueError("No TOML evals matched the requested selection.")
-    selected_configs, selected_paths, selected_plan_inputs = zip(*indexed)
-    return list(selected_configs), list(selected_paths), list(selected_plan_inputs)
-
-
-def _execute_toml_plan(
-    eval_configs: Sequence[Any],
     path_plans: Sequence[EvalPathPlan],
+    overrides: EvalConfigOverrides,
     args: argparse.Namespace,
+    *,
+    missing_envs: Mapping[str, EnvPackageRef],
 ) -> int:
     failures = 0
-    for index, (config, path_plan) in enumerate(zip(eval_configs, path_plans), start=1):
-        results_path = path_plan.results_path
+    for index, (raw, plan_input, path_plan) in enumerate(zip(raw_configs, plan_inputs, path_plans), start=1):
         try:
-            _prepare_toml_results_dir(
-                results_path,
-                force=bool(args.force),
-            )
-            run_config = config.model_copy(update={"resume_path": results_path, "save_results": True})
-            logger.info("Running TOML eval %d/%d: %s on %s", index, len(eval_configs), config.env_id, config.model)
-            asyncio.run(_run_one_toml_eval(run_config))
+            env_id = str(plan_input["env_id"])
+            if env_id in missing_envs:
+                _execute_isolated_toml_eval(
+                    raw,
+                    plan_input,
+                    path_plan,
+                    overrides,
+                    args,
+                    index=index,
+                    total=len(raw_configs),
+                    env_ref=missing_envs[env_id],
+                )
+            else:
+                config = build_eval_config(raw, overrides=overrides)
+                if config.env_id != plan_input["env_id"]:
+                    raise ValueError(f"Resolved env_id {config.env_id!r}, expected {plan_input['env_id']!r}.")
+                if config.model != plan_input["model"]:
+                    raise ValueError(f"Resolved model {config.model!r}, expected {plan_input['model']!r}.")
+                _prepare_toml_results_dir(path_plan.results_path, force=bool(args.force))
+                run_config = config.model_copy(update={"resume_path": path_plan.results_path, "save_results": True})
+                logger.info("Running TOML eval %d/%d: %s on %s", index, len(raw_configs), config.env_id, config.model)
+                asyncio.run(_run_one_toml_eval(run_config))
         except Exception as exc:  # noqa: BLE001
             failures += 1
             logger.exception("TOML eval %d failed: %s", index, exc)
             if not args.continue_on_error:
                 return 1
-        if args.sleep and index < len(eval_configs):
+        if args.sleep and index < len(raw_configs):
             import time
 
             time.sleep(float(args.sleep))
     return 1 if failures else 0
 
 
-def _execute_toml_lifecycle_plan(
-    raw_configs: Sequence[Mapping[str, Any]],
-    plan_inputs: Sequence[Mapping[str, Any]],
-    path_plans: Sequence[EvalPathPlan],
+def _execute_isolated_toml_eval(
+    raw: Mapping[str, Any],
+    plan_input: Mapping[str, Any],
+    path_plan: EvalPathPlan,
     overrides: EvalConfigOverrides,
     args: argparse.Namespace,
-) -> int:
-    failures = 0
-    with tempfile.TemporaryDirectory(prefix="medarc-bench-env-") as temp_dir:
+    *,
+    index: int,
+    total: int,
+    env_ref: EnvPackageRef,
+) -> None:
+    with tempfile.TemporaryDirectory(prefix="medarc-bench-child-") as temp_dir:
         temp_root = Path(temp_dir)
-        for index, (raw, plan_input, path_plan) in enumerate(zip(raw_configs, plan_inputs, path_plans), start=1):
-            try:
-                _prepare_toml_results_dir(path_plan.results_path, force=bool(args.force))
-                payload_path = temp_root / f"eval-{index}.json"
-                status_path = temp_root / f"eval-{index}-status.json"
-                payload = _bench_child_payload(
-                    raw,
-                    plan_input,
-                    path_plan,
-                    overrides,
-                    args,
-                    status_path=status_path,
-                )
-                payload_path.write_text(json.dumps(payload, sort_keys=True, default=str), encoding="utf-8")
-                logger.info(
-                    "Running TOML eval %d/%d in subprocess: %s on %s",
-                    index,
-                    len(raw_configs),
-                    plan_input["env_id"],
-                    plan_input["model"],
-                )
-                completed = subprocess.run(
-                    [sys.executable, "-m", "medarc_verifiers.cli.bench_child", str(payload_path)],
-                    check=False,
-                )
-                status = _load_child_status(status_path)
-                if completed.returncode != 0 or int(status.get("exit_code", 1)) != 0:
-                    failures += 1
-                    logger.error(
-                        "TOML eval %d failed in subprocess: %s",
-                        index,
-                        status.get("primary_error") or status.get("cleanup_error") or status.get("exit_reason"),
-                    )
-                    if not args.continue_on_error:
-                        return 1
-            except Exception as exc:  # noqa: BLE001
-                failures += 1
-                logger.exception("TOML eval %d failed: %s", index, exc)
-                if not args.continue_on_error:
-                    return 1
-            if args.sleep and index < len(raw_configs):
-                import time
-
-                time.sleep(float(args.sleep))
-    return 1 if failures else 0
+        payload_path = temp_root / f"eval-{index}.json"
+        status_path = temp_root / f"eval-{index}-status.json"
+        with temporary_bench_venv() as child_python:
+            install_env_package(child_python, env_ref.env_path)
+            _prepare_toml_results_dir(path_plan.results_path, force=bool(args.force))
+            payload = _bench_child_payload(
+                raw,
+                plan_input,
+                path_plan,
+                overrides,
+                args,
+                status_path=status_path,
+                cleanup_env_package=False,
+                env_preinstalled=True,
+            )
+            payload_path.write_text(json.dumps(payload, sort_keys=True, default=str), encoding="utf-8")
+            logger.info(
+                "Running TOML eval %d/%d in isolated venv: %s on %s",
+                index,
+                total,
+                plan_input["env_id"],
+                plan_input["model"],
+            )
+            completed = subprocess.run(
+                [str(child_python), "-m", "medarc_verifiers.cli.bench_child", str(payload_path)],
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+            status = _load_child_status(status_path, completed=completed)
+            if completed.returncode != 0 or int(status.get("exit_code", 1)) != 0:
+                detail = status.get("primary_error") or status.get("cleanup_error") or status.get("exit_reason")
+                output_tail = _completed_process_tail(completed)
+                if output_tail:
+                    detail = f"{detail}\n{output_tail}" if detail else output_tail
+                raise RuntimeError(f"Bench child failed: {detail}")
 
 
 def _bench_child_payload(
@@ -1504,6 +1524,8 @@ def _bench_child_payload(
     args: argparse.Namespace,
     *,
     status_path: Path,
+    cleanup_env_package: bool = True,
+    env_preinstalled: bool = False,
 ) -> dict[str, Any]:
     return {
         "raw_config": _jsonable_mapping(raw),
@@ -1513,6 +1535,8 @@ def _bench_child_payload(
         "status_path": str(status_path),
         "expected_env_id": str(plan_input["env_id"]),
         "expected_model": str(plan_input["model"]),
+        "cleanup_env_package": cleanup_env_package,
+        "env_preinstalled": env_preinstalled,
     }
 
 
@@ -1544,18 +1568,33 @@ def _jsonable_mapping(value: Mapping[str, Any]) -> dict[str, Any]:
     return result
 
 
-def _load_child_status(status_path: Path) -> dict[str, Any]:
+def _load_child_status(status_path: Path, *, completed: subprocess.CompletedProcess[str] | None = None) -> dict[str, Any]:
     if not status_path.is_file():
-        raise RuntimeError(f"Bench child did not write status file: {status_path}")
+        tail = _completed_process_tail(completed) if completed is not None else ""
+        detail = f"\n{tail}" if tail else ""
+        raise RuntimeError(f"Bench child did not write status file: {status_path}{detail}")
     try:
         status = json.loads(status_path.read_text(encoding="utf-8"))
     except json.JSONDecodeError as exc:
-        raise RuntimeError(f"Bench child wrote malformed status file {status_path}: {exc}") from exc
+        tail = _completed_process_tail(completed) if completed is not None else ""
+        detail = f"\n{tail}" if tail else ""
+        raise RuntimeError(f"Bench child wrote malformed status file {status_path}: {exc}{detail}") from exc
     if not isinstance(status, dict):
         raise RuntimeError(f"Bench child wrote non-object status file: {status_path}")
     return status
 
 
+def _completed_process_tail(completed: subprocess.CompletedProcess[str] | None, *, lines: int = 20) -> str:
+    """Format the last *lines* lines of the process's stderr and stdout, labelled, skipping blank streams."""
+    if completed is None:
+        return ""
+    streams = {"stderr": (completed.stderr or "").strip(), "stdout": (completed.stdout or "").strip()}
+    sections = [
+        f"{label} tail:\n" + "\n".join(body.splitlines()[-lines:]) for label, body in streams.items() if body
+    ]
+    return "\n".join(sections)
+
+
 async def _run_one_toml_eval(config: Any) -> Any:
     return await run_evaluation(config)
 
@@ -1594,23 +1633,6 @@ def _archive_existing_path(path: Path) -> Path:
     return candidate
 
 
-def _eval_config_identity_payload(config: Any, raw: Mapping[str, Any] | None = None) -> dict[str, Any]:
-    payload = {
-        "env_args": dict(config.env_args or {}),
-        "env_id": config.env_id,
-        "model": config.model,
-        "num_examples": config.num_examples,
-        "rollouts_per_example": config.rollouts_per_example,
-        "sampling_args": dict(config.sampling_args or {}),
-    }
-    if raw:
-        if "variant_id" in raw:
-            payload["variant_id"] = raw["variant_id"]
-        if "name" in raw:
-            payload["name"] = raw["name"]
-    return payload
-
-
 def _print_toml_bench_plan(eval_configs: Sequence[Any], path_plans: Sequence[EvalPathPlan], *, dry_run: bool) -> None:
     console = Console(width=240)
     action = "dry-run" if dry_run else "run"
diff --git a/tests/test_cli/test_bench_child.py b/tests/test_cli/test_bench_child.py
index 0cd052a2..733bafed 100644
--- a/tests/test_cli/test_bench_child.py
+++ b/tests/test_cli/test_bench_child.py
@@ -51,6 +51,61 @@ async def fake_run_evaluation(run_config):
     assert calls == ["install", "build", f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}", "cleanup"]
 
 
+def test_child_cleanup_env_package_false_skips_uninstall(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    calls: list[str] = []
+    config = SimpleNamespace(
+        env_id="medqa",
+        model="parent-model",
+        model_copy=lambda update: SimpleNamespace(env_id="medqa", model="parent-model", **update),
+    )
+
+    monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: object())
+    monkeypatch.setattr(bench_child, "ensure_installed", lambda ref: calls.append("install") or _state(True))
+    monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: calls.append("build") or config)
+    monkeypatch.setattr(bench_child, "uninstall_if_child_installed", lambda state: calls.append("cleanup"))
+
+    async def fake_run_evaluation(run_config):
+        calls.append(f"run:{run_config.resume_path}")
+
+    monkeypatch.setattr(bench_child, "run_evaluation", fake_run_evaluation)
+
+    payload = _payload(tmp_path)
+    payload["cleanup_env_package"] = False
+    status = bench_child._run_payload(payload)
+
+    assert status["exit_code"] == 0
+    assert status["installed_by_child"] is True
+    assert calls == ["install", "build", f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}"]
+
+
+def test_child_env_preinstalled_skips_install_and_cleanup(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    calls: list[str] = []
+    config = SimpleNamespace(
+        env_id="medqa",
+        model="parent-model",
+        model_copy=lambda update: SimpleNamespace(env_id="medqa", model="parent-model", **update),
+    )
+
+    monkeypatch.setattr(bench_child, "resolve_env_package", lambda env_id, env_dir: calls.append("resolve"))
+    monkeypatch.setattr(bench_child, "ensure_installed", lambda ref: calls.append("install") or _state(True))
+    monkeypatch.setattr(bench_child, "build_eval_config", lambda raw, overrides: calls.append("build") or config)
+    monkeypatch.setattr(bench_child, "uninstall_if_child_installed", lambda state: calls.append("cleanup"))
+
+    async def fake_run_evaluation(run_config):
+        calls.append(f"run:{run_config.resume_path}")
+
+    monkeypatch.setattr(bench_child, "run_evaluation", fake_run_evaluation)
+
+    payload = _payload(tmp_path)
+    payload["env_preinstalled"] = True
+    payload["cleanup_env_package"] = False
+    status = bench_child._run_payload(payload)
+
+    assert status["exit_code"] == 0
+    assert status["installed_by_child"] is False
+    assert calls == ["build", f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}"]
+
+
 def test_child_install_failure_does_not_build_or_cleanup(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     calls: list[str] = []
 
diff --git a/tests/test_cli/test_isolated_env.py b/tests/test_cli/test_isolated_env.py
new file mode 100644
index 00000000..9f9bbb7e
--- /dev/null
+++ b/tests/test_cli/test_isolated_env.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+
+from medarc_verifiers.cli import isolated_env
+
+
+class FakeDistribution:
+    def __init__(self, *, version: str = "1.2.3", direct_url: dict | None = None) -> None:
+        self.version = version
+        self._direct_url = direct_url
+
+    def read_text(self, name: str) -> str | None:
+        if name != "direct_url.json" or self._direct_url is None:
+            return None
+        return json.dumps(self._direct_url)
+
+
+def test_current_medarc_install_spec_detects_editable_checkout(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    (tmp_path / "pyproject.toml").write_text("[project]\nname = 'medarc-verifiers'\n", encoding="utf-8")
+    (tmp_path / "medarc_verifiers").mkdir()
+    direct_url = {"url": tmp_path.as_uri(), "dir_info": {"editable": True}}
+
+    monkeypatch.setattr(isolated_env.metadata, "distribution", lambda name: FakeDistribution(direct_url=direct_url))
+
+    spec = isolated_env.current_medarc_install_spec()
+
+    assert spec.editable is True
+    assert spec.checkout_root == tmp_path
+
+
+def test_current_medarc_install_spec_rejects_invalid_editable_checkout(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    direct_url = {"url": tmp_path.as_uri(), "dir_info": {"editable": True}}
+    monkeypatch.setattr(isolated_env.metadata, "distribution", lambda name: FakeDistribution(direct_url=direct_url))
+
+    with pytest.raises(isolated_env.IsolatedEnvError, match="invalid"):
+        isolated_env.current_medarc_install_spec()
+
+
+def test_install_medarc_non_editable_uses_pinned_version(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    commands: list[list[str]] = []
+
+    monkeypatch.setattr(isolated_env, "current_medarc_install_spec", lambda: isolated_env.MedarcInstallSpec(False, "9.8.7"))
+    monkeypatch.setattr(isolated_env, "_run_uv", lambda command, action: commands.append(command))
+
+    isolated_env.install_medarc_into_venv(tmp_path / "python")
+
+    assert commands == [["uv", "pip", "install", "--python", str(tmp_path / "python"), "medarc-verifiers==9.8.7"]]
+
+
+def test_install_medarc_non_editable_resolution_failure_is_actionable(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    monkeypatch.setattr(isolated_env, "current_medarc_install_spec", lambda: isolated_env.MedarcInstallSpec(False, "9.8.7"))
+
+    def fail(command: list[str], action: str) -> None:
+        raise isolated_env.IsolatedEnvError("resolver failed")
+
+    monkeypatch.setattr(isolated_env, "_run_uv", fail)
+
+    with pytest.raises(isolated_env.IsolatedEnvError, match="preinstall environment packages"):
+        isolated_env.install_medarc_into_venv(tmp_path / "python")
+
+
+def test_temporary_bench_venv_cleans_up(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    venv_path = tmp_path / "venv"
+    python_path = venv_path / "bin" / "python"
+    created: list[Path] = []
+
+    def fake_mkdtemp(prefix: str) -> str:
+        venv_path.mkdir(parents=True)
+        python_path.parent.mkdir(parents=True)
+        python_path.write_text("", encoding="utf-8")
+        return str(venv_path)
+
+    monkeypatch.setattr(isolated_env.tempfile, "mkdtemp", fake_mkdtemp)
+    monkeypatch.setattr(isolated_env, "_create_venv", lambda path: created.append(path) or python_path)
+    monkeypatch.setattr(isolated_env, "install_medarc_into_venv", lambda python, repo_root=None: None)
+
+    with isolated_env.temporary_bench_venv() as python:
+        assert python == python_path
+        assert venv_path.exists()
+
+    assert created == [venv_path]
+    assert not venv_path.exists()
+
+
+def test_run_uv_reports_missing_uv(monkeypatch: pytest.MonkeyPatch) -> None:
+    def missing_uv(*args, **kwargs):
+        raise FileNotFoundError("uv")
+
+    monkeypatch.setattr(isolated_env.subprocess, "run", missing_uv)
+
+    with pytest.raises(isolated_env.IsolatedEnvError, match="uv is not installed"):
+        isolated_env._run_uv(["uv", "venv"], "create venv")
+
+
+def test_run_uv_reports_failing_command(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        isolated_env.subprocess,
+        "run",
+        lambda *args, **kwargs: SimpleNamespace(returncode=2, stderr="bad\nerror", stdout=""),
+    )
+
+    with pytest.raises(isolated_env.IsolatedEnvError, match="error"):
+        isolated_env._run_uv(["uv", "venv"], "create venv")
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 9b980f28..308c64b2 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -259,7 +259,7 @@ def test_toml_bench_dry_run_model_override(
     assert "config-model" not in output
 
 
-def test_toml_bench_install_envs_dry_run_does_not_build_configs_or_spawn(
+def test_toml_bench_auto_install_defaults_true_and_dry_run_does_not_build_configs_or_spawn(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
     capsys: pytest.CaptureFixture[str],
@@ -276,18 +276,47 @@ def test_toml_bench_install_envs_dry_run_does_not_build_configs_or_spawn(
         rollouts_per_example = 1
         """,
     )
+    env_dir = tmp_path / "envs"
+    env_pkg = env_dir / "missing_env"
+    env_pkg.mkdir(parents=True)
+    (env_pkg / "pyproject.toml").write_text('[project]\nname = "missing-env"\n')
     monkeypatch.setattr(main, "build_eval_config", lambda raw, overrides: pytest.fail("parent built EvalConfig"))
     monkeypatch.setattr(main.subprocess, "run", lambda *args, **kwargs: pytest.fail("parent spawned child"))
 
-    exit_code = main.main(["bench", "--config", str(config_path), "--install-envs", "--dry-run"])
+    parser = main.build_batch_parser()
+    parsed = parser.parse_args(["--config", str(config_path)])
+    assert parsed.auto_install is True
+    parsed_explicit = parser.parse_args(["--config", str(config_path), "--auto-install"])
+    assert parsed_explicit.auto_install is True
+    parsed_disabled = parser.parse_args(["--config", str(config_path), "--no-auto-install"])
+    assert parsed_disabled.auto_install is False
 
-    output = capsys.readouterr().out
+    exit_code = main.main(["bench", "--config", str(config_path), "--env-dir", str(env_dir), "--dry-run"])
+
+    captured = capsys.readouterr()
+    output = captured.out
     assert exit_code == 0
     assert "missing-env" in output
-    assert "does not install packages" in output
+    assert "would auto-install" in captured.err
 
 
-def test_toml_bench_install_envs_executes_selected_child_payload(
+def test_toml_bench_rejects_old_install_envs_flag(tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+
+    with pytest.raises(SystemExit):
+        main.build_batch_parser().parse_args(["--config", str(config_path), "--install-envs"])
+
+
+def test_toml_bench_no_auto_install_plans_selected_raw_before_building_config(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
 ) -> None:
@@ -309,22 +338,37 @@ def test_toml_bench_install_envs_executes_selected_child_payload(
         rollouts_per_example = 1
         """,
     )
-    payloads: list[dict[str, Any]] = []
+    built_envs: list[str] = []
+    calls: list[Path] = []
 
-    def fake_run(cmd, check=False):
-        payload = json.loads(Path(cmd[-1]).read_text(encoding="utf-8"))
-        payloads.append(payload)
-        Path(payload["status_path"]).write_text(json.dumps({"exit_code": 0}), encoding="utf-8")
-        return SimpleNamespace(returncode=0)
+    def fake_build(raw: dict[str, Any], *, overrides: Any) -> SimpleNamespace:
+        built_envs.append(raw["env_id"])
+        return SimpleNamespace(
+            env_id=raw["env_id"],
+            model=raw.get("model", "gpt-5-mini"),
+            model_copy=lambda update: SimpleNamespace(
+                env_id=raw["env_id"],
+                model=raw.get("model", "gpt-5-mini"),
+                **update,
+            ),
+        )
 
-    monkeypatch.setattr(main.subprocess, "run", fake_run)
+    async def fake_run(config, **_kwargs):
+        calls.append(Path(config.resume_path))
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        Path(config.resume_path, "metadata.json").write_text(json.dumps({"env_id": config.env_id, "model": config.model}))
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "build_eval_config", fake_build)
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+    monkeypatch.setattr(main, "_module_importable", lambda module_name: module_name == "selected_env")
 
     exit_code = main.main(
         [
             "bench",
             "--config",
             str(config_path),
-            "--install-envs",
+            "--no-auto-install",
             "--eval-index",
             "2",
             "--output-dir",
@@ -333,15 +377,228 @@ def fake_run(cmd, check=False):
     )
 
     assert exit_code == 0
-    assert len(payloads) == 1
-    payload = payloads[0]
-    assert payload["raw_config"]["env_id"] == "selected-env"
-    assert payload["expected_env_id"] == "selected-env"
-    assert payload["expected_model"] == "gpt-5-mini"
-    assert payload["resume_path"] == str(output_dir / "gpt-5-mini" / "selected-env" / "base")
+    assert built_envs == ["selected-env"]
+    assert calls == [output_dir / "gpt-5-mini" / "selected-env" / "base"]
     assert (output_dir / "gpt-5-mini" / "selected-env" / "base").is_dir()
 
 
+def test_toml_bench_mixed_missing_env_routes_only_missing_to_isolated_child(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    env_dir = tmp_path / "envs"
+    missing_pkg = env_dir / "missing_env"
+    missing_pkg.mkdir(parents=True)
+    (missing_pkg / "pyproject.toml").write_text('[project]\nname = "missing-env"\n', encoding="utf-8")
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "installed-env"
+        num_examples = 1
+        rollouts_per_example = 1
+
+        [[eval]]
+        env_id = "missing-env"
+        num_examples = 1
+        rollouts_per_example = 1
+        """,
+    )
+    parent_runs: list[str] = []
+    child_commands: list[list[str]] = []
+    installed_paths: list[Path] = []
+
+    def fake_build(raw: dict[str, Any], *, overrides: Any) -> SimpleNamespace:
+        return SimpleNamespace(
+            env_id=raw["env_id"],
+            model=raw.get("model", "gpt-5-mini"),
+            model_copy=lambda update: SimpleNamespace(
+                env_id=raw["env_id"],
+                model=raw.get("model", "gpt-5-mini"),
+                **update,
+            ),
+        )
+
+    async def fake_run(config, **_kwargs):
+        parent_runs.append(config.env_id)
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        Path(config.resume_path, "metadata.json").write_text(json.dumps({"env_id": config.env_id, "model": config.model}))
+
+    class FakeVenv:
+        def __enter__(self) -> Path:
+            return tmp_path / "fake-venv" / "bin" / "python"
+
+        def __exit__(self, exc_type, exc, tb) -> None:
+            return None
+
+    def fake_subprocess_run(cmd, check=False, capture_output=False, text=False):
+        child_commands.append([str(part) for part in cmd])
+        payload = json.loads(Path(cmd[-1]).read_text(encoding="utf-8"))
+        assert payload["cleanup_env_package"] is False
+        assert payload["env_preinstalled"] is True
+        Path(payload["status_path"]).write_text(json.dumps({"exit_code": 0}), encoding="utf-8")
+        return SimpleNamespace(returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(main, "_module_importable", lambda module_name: module_name == "installed_env")
+    monkeypatch.setattr(main, "build_eval_config", fake_build)
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+    monkeypatch.setattr(main, "temporary_bench_venv", lambda: FakeVenv())
+    monkeypatch.setattr(main, "install_env_package", lambda python, env_path: installed_paths.append(Path(env_path)))
+    monkeypatch.setattr(main.subprocess, "run", fake_subprocess_run)
+
+    exit_code = main.main(
+        [
+            "bench",
+            "--config",
+            str(config_path),
+            "--env-dir",
+            str(env_dir),
+            "--output-dir",
+            str(output_dir),
+        ]
+    )
+
+    assert exit_code == 0
+    assert parent_runs == ["installed-env"]
+    assert installed_paths == [missing_pkg]
+    assert len(child_commands) == 1
+    assert child_commands[0][0] == str(tmp_path / "fake-venv" / "bin" / "python")
+
+
+def test_toml_bench_no_auto_install_missing_env_does_not_force_archive_or_execute(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    env_dir = tmp_path / "envs"
+    env_pkg = env_dir / "missing_env"
+    env_pkg.mkdir(parents=True)
+    (env_pkg / "pyproject.toml").write_text('[project]\nname = "missing-env"\n', encoding="utf-8")
+    results_path = output_dir / "gpt-5-mini" / "missing-env" / "base"
+    _write_resume_artifacts(results_path, env_id="missing-env", model="gpt-5-mini")
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "missing-env"
+        num_examples = 1
+        rollouts_per_example = 1
+        """,
+    )
+
+    monkeypatch.setattr(main, "_module_importable", lambda module_name: False)
+    monkeypatch.setattr(main, "build_eval_config", lambda raw, overrides: pytest.fail("built EvalConfig"))
+    monkeypatch.setattr(main.subprocess, "run", lambda *args, **kwargs: pytest.fail("spawned child"))
+
+    exit_code = main.main(
+        [
+            "bench",
+            "--config",
+            str(config_path),
+            "--env-dir",
+            str(env_dir),
+            "--output-dir",
+            str(output_dir),
+            "--no-auto-install",
+            "--force",
+        ]
+    )
+
+    assert exit_code == 1
+    assert (results_path / "metadata.json").is_file()
+    assert not list(results_path.parent.glob("base__old_*"))
+
+
+def test_toml_bench_isolated_setup_failure_does_not_force_archive(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    env_dir = tmp_path / "envs"
+    env_pkg = env_dir / "missing_env"
+    env_pkg.mkdir(parents=True)
+    (env_pkg / "pyproject.toml").write_text('[project]\nname = "missing-env"\n', encoding="utf-8")
+    results_path = output_dir / "gpt-5-mini" / "missing-env" / "base"
+    _write_resume_artifacts(results_path, env_id="missing-env", model="gpt-5-mini")
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "missing-env"
+        num_examples = 1
+        rollouts_per_example = 1
+        """,
+    )
+
+    class FakeVenv:
+        def __enter__(self) -> Path:
+            return tmp_path / "fake-python"
+
+        def __exit__(self, exc_type, exc, tb) -> None:
+            return None
+
+    def fail_install(python: Path, env_path: Path) -> None:
+        raise RuntimeError("env install failed")
+
+    monkeypatch.setattr(main, "_module_importable", lambda module_name: False)
+    monkeypatch.setattr(main, "temporary_bench_venv", lambda: FakeVenv())
+    monkeypatch.setattr(main, "install_env_package", fail_install)
+    monkeypatch.setattr(main.subprocess, "run", lambda *args, **kwargs: pytest.fail("spawned child"))
+
+    exit_code = main.main(
+        [
+            "bench",
+            "--config",
+            str(config_path),
+            "--env-dir",
+            str(env_dir),
+            "--output-dir",
+            str(output_dir),
+            "--force",
+        ]
+    )
+
+    assert exit_code == 1
+    assert (results_path / "metadata.json").is_file()
+    assert not list(results_path.parent.glob("base__old_*"))
+
+
+def test_toml_bench_dry_run_display_ignores_env_package_defaults(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    config_path = tmp_path / "bench.toml"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    monkeypatch.setattr(main, "build_eval_config", lambda raw, overrides: pytest.fail("parent built EvalConfig"))
+
+    exit_code = main.main(["bench", "--config", str(config_path), "--dry-run"])
+
+    output = capsys.readouterr().out
+    assert exit_code == 0
+    assert "medqa" in output
+    assert "runs/evals/gpt-5-mini/medqa/base" in output
+    assert "1000" not in output
+
+
 def test_toml_bench_dry_run_uses_toml_output_dir(
     tmp_path: Path,
     capsys: pytest.CaptureFixture[str],

From 5ff2f65fa7ee13eee2e5b996367f2f0c515f72ec Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 15:42:27 +0000
Subject: [PATCH 40/53] Document isolated bench auto install

---
 docs/medarc-eval-bench.md             | 70 ++++++++++++++++-----------
 docs/medarc-verifiers-architecture.md | 27 +++++++++--
 tests/test_cli/test_isolated_env.py   | 19 ++++++++
 3 files changed, 84 insertions(+), 32 deletions(-)

diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index b72163b2..7fff0d62 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -16,8 +16,8 @@ medarc-eval bench --config configs/eval/medmarks-smoke.toml --dry-run
 # Run the verified production suite
 medarc-eval bench --config configs/eval/medmarks-verified.toml
 
-# Run selected evals while installing missing local env packages as needed
-medarc-eval bench --config configs/eval/medmarks-verified.toml --install-envs
+# Require all selected env packages to already be installed
+medarc-eval bench --config configs/eval/medmarks-verified.toml --no-auto-install
 
 # Run the verified suite against a local OpenAI-compatible server
 medarc-eval bench \
@@ -39,7 +39,8 @@ Repository suite configs live in `configs/eval/`:
 
 Bench configs use upstream `verifiers` TOML semantics: top-level defaults plus
 one or more `[[eval]]` blocks. MedARC adds deterministic output planning around
-the resolved evals; it does not use YAML `models`, `envs`, or `jobs` sections.
+selected raw eval configs; it does not use YAML `models`, `envs`, or `jobs`
+sections.
 
 ```toml
 model = "openai/gpt-4.1-mini"
@@ -66,36 +67,51 @@ across editable and wheel installs.
 
 ## Local Environment Install Lifecycle
 
-By default, TOML bench expects environment packages to already be importable in
-the active Python environment. Pass `--install-envs` when running repository
-local environments from `--env-dir` and you want bench to install missing
-packages only for the selected evals:
+By default, TOML bench auto-installs selected local environment packages that
+are not already importable in the active Python environment. Auto-install only
+applies to missing local packages resolved from `--env-dir`; selected envs that
+are already importable keep the normal in-process execution path.
 
 ```bash
 medarc-eval bench \
   --config configs/eval/medmarks-verified.toml \
-  --install-envs \
   --eval-index "$SLURM_ARRAY_TASK_ID"
 ```
 
-With `--install-envs`, the parent process loads and expands the TOML config,
-applies `--eval-index` / `--start-at` / `--stop-after`, plans deterministic
-output paths, and spawns one child subprocess per selected eval. Each child
-installs its missing environment package into the shared venv, builds the
-upstream `EvalConfig` after install, runs upstream evaluation with the
-parent-planned `resume_path`, and uninstalls only packages it installed before
-exiting.
-
-This clears Python import state between evals, but it still mutates the shared
-venv while each child is running. It does not uninstall transitive dependencies,
-and concurrent `--install-envs` bench runs against the same venv are
-unsupported. If the parent is interrupted, editable install metadata may be left
-behind and should be cleaned up manually with `uv pip uninstall`.
-
-`--install-envs --dry-run` does not install packages or spawn child processes.
-Dry-run identity is therefore based on TOML and CLI values only; environment
-package `[tool.verifiers.eval]` defaults are resolved only during the real child
-run after install.
+When a selected env package is missing, bench prints a warning to stderr and
+runs that eval in an isolated temporary venv. The parent process loads and
+expands the TOML config, applies `--eval-index` / `--start-at` / `--stop-after`,
+plans deterministic output paths from raw TOML and CLI values, creates a temp
+venv, installs MedARC into it, installs the target env package into it, runs the
+bench child with the parent-planned `resume_path`, and deletes the temp venv.
+
+If the active `medarc-verifiers` install is editable, isolated mode installs
+that same checkout editable into the temp venv. If the active install is not
+editable, isolated mode installs `medarc-verifiers==<current-version>` and
+requires that package/version to be resolvable by the normal package resolver.
+If resolution fails, run from an editable checkout or preinstall env packages
+and pass `--no-auto-install`.
+
+For faster, strict local iteration, preinstall environments and opt out of auto-install:
+
+```bash
+vf-install medqa
+vf-install pubmedqa
+medarc-eval bench --config configs/eval/medmarks-verified.toml --no-auto-install
+```
+
+`--dry-run` does not create venvs, install packages, or spawn child processes.
+If selected env packages are missing, the dry run reports on stderr that they
+would be auto-installed. Dry-run identity and deterministic paths are based on
+TOML and CLI values only; environment package `[tool.verifiers.eval]` defaults
+are execution-time defaults and do not affect dry-run display or path planning.
+
+With isolated mode, auto-install no longer mutates shared Python package
+metadata, but this is not full filesystem or side-effect isolation. Concurrent
+runs can still collide if they target the same deterministic output directory
+without unique selections, output roots, or variants. Hugging Face caches, judge
+caches, cwd-relative artifacts, temp files created by environment code, and
+network/API side effects can also remain shared.
 
 ## Ablations and Variants
 
@@ -181,7 +197,7 @@ provider arguments pass through to upstream.
 | `--resume` | Compatibility flag; valid deterministic outputs resume automatically |
 | `--output-dir PATH` | Override the config output directory, default `runs/evals` |
 | `--env-dir PATH` | Directory containing local environments |
-| `--install-envs` | Run selected evals in per-eval subprocesses that install missing local env packages into the shared venv and clean up child-installed packages |
+| `--auto-install` / `--no-auto-install` | Auto-install missing local env packages in isolated temp venvs (default) or require selected envs to be preinstalled |
 | `--endpoints-path PATH` | Endpoint registry path, default `configs/endpoints.toml` |
 | `--api-base-url URL` | Override API base URL for every eval |
 | `--api-key-var NAME` | Override API key environment variable |
diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index 406531f1..0b5fa757 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -46,9 +46,14 @@ It supports:
   - Implemented in `medarc_verifiers/cli/_single_run.py`.
 - **TOML bench mode**: `medarc-eval bench --config <config.toml>`
   - Loads upstream `verifiers` TOML eval configs, expands ablations, plans
-    deterministic output directories, then runs evals sequentially through
-    upstream execution.
+    deterministic output directories from selected raw configs, then runs evals
+    sequentially through upstream execution.
+  - Missing selected local environment packages are auto-installed by default
+    in isolated temporary venvs. Importable envs stay on the in-process path.
+    `--no-auto-install` requires selected envs to already be importable.
   - Main implementation: `medarc_verifiers/cli/main.py`
+  - Isolated auto-install helper: `medarc_verifiers/cli/isolated_env.py`
+  - Isolated child runner: `medarc_verifiers/cli/bench_child.py`
   - Upstream eval boundary: `medarc_verifiers/cli/upstream_eval.py`
   - Deterministic identity/path helpers: `medarc_verifiers/cli/eval_identity.py`
 - **Processing**: `medarc-eval process ...`
@@ -79,9 +84,10 @@ Override parsing lives in `medarc_verifiers/cli/utils/overrides.py`.
 
 Bench configs use upstream `verifiers` TOML shape: top-level defaults plus one
 or more `[[eval]]` entries. Upstream `[[ablation]]` tables expand into repeated
-eval configs. MedARC adds deterministic paths around the resolved upstream eval
-configs. Duplicate `(model, env)` outputs must use explicit `variant_id` or
-`name` identity; the reserved default variant id is `base`.
+eval configs. MedARC adds deterministic paths around selected raw eval configs
+before importing env packages. Duplicate `(model, env)` outputs must use
+explicit `variant_id` or `name` identity; the reserved default variant id is
+`base`.
 
 `env_args` precedence is low to high:
 
@@ -91,6 +97,10 @@ configs. Duplicate `(model, env)` outputs must use explicit `variant_id` or
 4. Expanded `[[ablation]]` values
 5. CLI overrides (`--env-args` / `--env-arg`)
 
+Environment package `[tool.verifiers.eval]` defaults are execution-time
+defaults. They do not affect deterministic path planning or dry-run display,
+because bench plans from TOML and CLI values before importing env packages.
+
 `sampling_args` follow the same TOML -> eval -> ablation -> CLI override model,
 then are sanitized for OpenAI-compatible clients:
 
@@ -135,6 +145,13 @@ passes the deterministic target as upstream `EvalConfig.resume_path` and trusts
 upstream resume validation. Partial or malformed existing targets fail unless
 `--force` archives the existing target and reruns.
 
+For missing local envs, auto-install creates a temporary venv, mirrors the
+current `medarc-verifiers` install into that venv, installs the target env
+package, and only then prepares or archives the deterministic output directory.
+An editable MedARC install is mirrored by installing the same checkout, found
+via package metadata, as editable in the temp venv. Non-editable installs use
+`medarc-verifiers==<current-version>` and require that distribution to resolve.
+
 `medarc-eval bench` does not monkey-patch upstream metadata saving and does not
 write MedARC identity into upstream `metadata.json`. Variant identity is the
 deterministic path segment, so `variant_id` / `name` values must already be
diff --git a/tests/test_cli/test_isolated_env.py b/tests/test_cli/test_isolated_env.py
index 9f9bbb7e..77065c09 100644
--- a/tests/test_cli/test_isolated_env.py
+++ b/tests/test_cli/test_isolated_env.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import json
+import os
+import subprocess
 from pathlib import Path
 from types import SimpleNamespace
 
@@ -115,3 +117,20 @@ def test_run_uv_reports_failing_command(monkeypatch: pytest.MonkeyPatch) -> None
 
     with pytest.raises(isolated_env.IsolatedEnvError, match="error"):
         isolated_env._run_uv(["uv", "venv"], "create venv")
+
+
+@pytest.mark.skipif(
+    os.environ.get("MEDARC_RUN_ISOLATED_ENV_SMOKE") != "1",
+    reason="set MEDARC_RUN_ISOLATED_ENV_SMOKE=1 to run the real uv isolated-env smoke",
+)
+def test_temporary_bench_venv_real_helper_imports_bench_child() -> None:
+    with isolated_env.temporary_bench_venv() as python:
+        completed = subprocess.run(
+            [str(python), "-m", "medarc_verifiers.cli.bench_child", "--help"],
+            check=False,
+            capture_output=True,
+            text=True,
+        )
+
+    assert completed.returncode == 0
+    assert "Run one TOML bench eval child payload" in completed.stdout

From 652e0845b9289942577dc0b04fdd5eb6dba5195a Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 15:56:20 +0000
Subject: [PATCH 41/53] Reuse empty bench output dirs

---
 medarc_verifiers/cli/main.py |  2 ++
 tests/test_cli/test_main.py  | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index 3af62656..bbe0818f 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -1610,6 +1610,8 @@ def _prepare_toml_results_dir(
     metadata_path = results_path / "metadata.json"
     results_file = results_path / "results.jsonl"
     if results_path.exists():
+        if results_path.is_dir() and not any(results_path.iterdir()):
+            return
         has_metadata = metadata_path.is_file()
         has_results = results_file.is_file()
         if not (has_metadata and has_results):
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 308c64b2..b4421b81 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -793,6 +793,37 @@ async def fake_run(config, **_kwargs):
     assert calls == 0
 
 
+def test_toml_bench_reuses_empty_existing_output_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    config_path = tmp_path / "bench.toml"
+    output_dir = tmp_path / "evals"
+    _write_config(
+        config_path,
+        """
+        model = "gpt-5-mini"
+
+        [[eval]]
+        env_id = "medqa"
+        """,
+    )
+    results_path = output_dir / "gpt-5-mini" / "medqa" / "base"
+    results_path.mkdir(parents=True)
+    calls = 0
+
+    async def fake_run(config, **_kwargs):
+        nonlocal calls
+        calls += 1
+        assert Path(config.resume_path) == results_path
+        Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
+        Path(config.resume_path, "metadata.json").write_text(json.dumps({"env_id": "medqa", "model": "gpt-5-mini"}))
+        return {"outputs": [], "metadata": {}}
+
+    monkeypatch.setattr(main, "run_evaluation", fake_run)
+
+    assert main.main(["bench", "--config", str(config_path), "--output-dir", str(output_dir)]) == 0
+    assert calls == 1
+    assert (results_path / "metadata.json").is_file()
+
+
 def test_toml_bench_force_archives_existing_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"

From 6862224c99e10fc582ce0eacb1d0a1f8cf0513d0 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 16:32:30 +0000
Subject: [PATCH 42/53] Avoid reserved env task keys

---
 environments/aci_bench/aci_bench/aci_bench.py |  2 +-
 .../agentclinic/agentclinic/agentclinic.py    |  2 +-
 environments/careqa/careqa.py                 |  2 +-
 environments/longhealth/README.md             |  2 +-
 environments/longhealth/longhealth.py         |  6 +-
 environments/m_arc/m_arc.py                   |  1 +
 .../med_dialog/med_dialog/med_dialog.py       |  1 +
 .../medagentbenchv2/medagentbenchv2/env.py    |  6 +-
 environments/medbullets/medbullets.py         |  1 +
 .../medcalc_bench/medcalc_bench.py            |  2 +-
 .../medcasereasoning/medcasereasoning.py      |  6 +-
 environments/medexqa/medexqa.py               |  1 +
 environments/medrbench/medrbench/medrbench.py |  7 +-
 environments/medredqa/medredqa.py             |  6 +-
 environments/medxpertqa/medxpertqa.py         |  3 +-
 .../mmlu_pro_health/mmlu_pro_health.py        |  1 +
 environments/pubmedqa/pubmedqa.py             |  2 +-
 .../supergpqa_medicine/supergpqa_medicine.py  |  1 +
 tests/test_environments/test_longhealth.py    | 96 +++++++++++++++++++
 19 files changed, 125 insertions(+), 23 deletions(-)
 create mode 100644 tests/test_environments/test_longhealth.py

diff --git a/environments/aci_bench/aci_bench/aci_bench.py b/environments/aci_bench/aci_bench/aci_bench.py
index 4f7346db..842e8f66 100644
--- a/environments/aci_bench/aci_bench/aci_bench.py
+++ b/environments/aci_bench/aci_bench/aci_bench.py
@@ -29,11 +29,11 @@ def _to_vf_format(dataset: Dataset) -> Dataset:
         lambda row: {
             "question": prompt.format(conversation=row["dialogue"]),
             "answer": row["note"],
-            "task": "aci-bench",
             "info": {
                 "conversation": row["dialogue"],
                 "reference_response": row["note"],
                 "transcript_version": row["transcript_version"],
+                "aci_bench_task": "aci-bench",
             },
         }
     )
diff --git a/environments/agentclinic/agentclinic/agentclinic.py b/environments/agentclinic/agentclinic/agentclinic.py
index ca4a4a6a..e9899327 100644
--- a/environments/agentclinic/agentclinic/agentclinic.py
+++ b/environments/agentclinic/agentclinic/agentclinic.py
@@ -423,12 +423,12 @@ def load_environment(
             "reference_response": scenario.diagnosis_information(),
             "case_id": i,
             "dataset_type": dataset_type,
+            "agentclinic_task": f"agentclinic-{dataset_type}",
         }
         records.append(
             {
                 "question": question,
                 "answer": scenario.diagnosis_information(),
-                "task": f"agentclinic-{dataset_type}",
                 "info": info,
             }
         )
diff --git a/environments/careqa/careqa.py b/environments/careqa/careqa.py
index fe5c833c..d2488ebb 100644
--- a/environments/careqa/careqa.py
+++ b/environments/careqa/careqa.py
@@ -230,10 +230,10 @@ def _load_open_ended_environment(
     def _map(ex):
         info = {}
         info["question"] = ex["question"].strip()
+        info["careqa_task"] = "careqa_open"
         return {
             "question": ex["question"].strip(),
             "answer": ex.get("answer_explanation", ex.get("answer", "")),
-            "task": "careqa_open",
             "info": info,
         }
 
diff --git a/environments/longhealth/README.md b/environments/longhealth/README.md
index 96095872..d033659f 100644
--- a/environments/longhealth/README.md
+++ b/environments/longhealth/README.md
@@ -63,7 +63,7 @@ medarc-eval longhealth -m "openai/gpt-5-mini" -n 10 -s --task all --doc-shuffle-
 | Metric | Meaning |
 | ------ | ------- |
 | `reward` | Exact match accuracy (1.0 if correct letter, 0.0 otherwise) |
-| `info.task` | Which sub-task: `task1`, `task2_negation`, or `task2_identification` |
+| `info.longhealth_task` | Which sub-task: `task1`, `task2_negation`, or `task2_identification` |
 | `info.has_answer_docs` | Whether answer-containing documents were included |
 | `info.num_docs` | Number of documents in the context |
 
diff --git a/environments/longhealth/longhealth.py b/environments/longhealth/longhealth.py
index 173b9bc6..ab188896 100644
--- a/environments/longhealth/longhealth.py
+++ b/environments/longhealth/longhealth.py
@@ -276,7 +276,7 @@ def _prepare_task1_data(
             info = {
                 "patient_id": patient_id,
                 "question_no": question.get("No"),
-                "task": "task1",
+                "longhealth_task": "task1",
                 "correct_answer_text": correct_answer,
                 "num_docs": len(selected_docs),
                 "has_answer_docs": len(answer_docs) > 0,
@@ -401,7 +401,7 @@ def _prepare_task2_data(
             info_neg = {
                 "patient_id": patient_id,
                 "question_no": question.get("No"),
-                "task": "task2_negation",
+                "longhealth_task": "task2_negation",
                 "correct_answer_text": "Question cannot be answered with provided documents",
                 "num_docs": len(selected_docs_neg),
                 "has_answer_docs": False,
@@ -444,7 +444,7 @@ def _prepare_task2_data(
             info_ident = {
                 "patient_id": patient_id,
                 "question_no": question.get("No"),
-                "task": "task2_identification",
+                "longhealth_task": "task2_identification",
                 "correct_answer_text": correct_answer,
                 "num_docs": len(selected_docs_ident),
                 "has_answer_docs": True,
diff --git a/environments/m_arc/m_arc.py b/environments/m_arc/m_arc.py
index 595bf01e..43064afb 100644
--- a/environments/m_arc/m_arc.py
+++ b/environments/m_arc/m_arc.py
@@ -118,6 +118,7 @@ def _format_row(row: dict, idx: int) -> dict:
 
         # question and answer have been moved to top-level, so remove them here
         info = dict(row)
+        info.pop("task", None)
 
         # update shuffled answer choices in the info dict
         if shuffle_answers:
diff --git a/environments/med_dialog/med_dialog/med_dialog.py b/environments/med_dialog/med_dialog/med_dialog.py
index 23cab690..a805dccf 100644
--- a/environments/med_dialog/med_dialog/med_dialog.py
+++ b/environments/med_dialog/med_dialog/med_dialog.py
@@ -56,6 +56,7 @@ def _format_row(row: dict[str, Any], *, subset: str = subset) -> dict[str, Any]:
             response = str(row.get("tgt", ""))
 
             info = dict(row)
+            info.pop("task", None)
             info["conversation"] = prompt
             info["reference_response"] = response
             info["subset"] = subset
diff --git a/environments/medagentbenchv2/medagentbenchv2/env.py b/environments/medagentbenchv2/medagentbenchv2/env.py
index a181f1d9..cd26d6e5 100644
--- a/environments/medagentbenchv2/medagentbenchv2/env.py
+++ b/environments/medagentbenchv2/medagentbenchv2/env.py
@@ -362,12 +362,14 @@ def _map(task: dict[str, Any]) -> dict[str, Any]:
                 "content": _build_user_message(task["instruction"], task.get("context")),
             },
         ]
+        info = dict(task)
+        info.pop("task", None)
+        info["medagentbench_task"] = "medagentbenchv2"
         return {
             "id": task["id"],
             "prompt": prompt,
-            "info": dict(task),
+            "info": info,
             "answer": "",
-            "task": "medagentbenchv2",
         }
 
     eval_dataset = Dataset.from_list([_map(task) for task in tasks])
diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py
index f0d3431e..fb183b3d 100644
--- a/environments/medbullets/medbullets.py
+++ b/environments/medbullets/medbullets.py
@@ -60,6 +60,7 @@ def _format_row(row: dict) -> dict:
 
         # question and answer have been moved to top-level, so remove them here
         info = dict(row)
+        info.pop("task", None)
 
         # update shuffled answer choices in the info dict
         if shuffle_answers:
diff --git a/environments/medcalc_bench/medcalc_bench/medcalc_bench.py b/environments/medcalc_bench/medcalc_bench/medcalc_bench.py
index b4f0090d..2ecbab62 100644
--- a/environments/medcalc_bench/medcalc_bench/medcalc_bench.py
+++ b/environments/medcalc_bench/medcalc_bench/medcalc_bench.py
@@ -363,12 +363,12 @@ def _map(row: dict):
                 answer_format=answer_format,
             ),
             "answer": row["Ground Truth Answer"],
-            "task": "medcalc_bench",
             "info": {
                 "calc_id": row["Calculator ID"],
                 "ground_truth": row["Ground Truth Answer"],
                 "lower_bound": row["Lower Limit"],
                 "upper_bound": row["Upper Limit"],
+                "medcalc_task": "medcalc_bench",
             },
         }
 
diff --git a/environments/medcasereasoning/medcasereasoning.py b/environments/medcasereasoning/medcasereasoning.py
index 87afc915..e386a0e1 100644
--- a/environments/medcasereasoning/medcasereasoning.py
+++ b/environments/medcasereasoning/medcasereasoning.py
@@ -59,8 +59,7 @@ def load_environment(
         lambda x: {
             "question": QUESTION_TEMPLATE.format(question=x["case_prompt"]),
             "answer": x["final_diagnosis"],
-            "task": "medcasereasoning",
-            "info": {"case_prompt": x["case_prompt"]},
+            "info": {"case_prompt": x["case_prompt"], "medcasereasoning_task": "medcasereasoning"},
         }
     )
 
@@ -68,8 +67,7 @@ def load_environment(
         lambda x: {
             "question": QUESTION_TEMPLATE.format(question=x["case_prompt"]),
             "answer": x["final_diagnosis"],
-            "task": "medcasereasoning",
-            "info": {"case_prompt": x["case_prompt"]},
+            "info": {"case_prompt": x["case_prompt"], "medcasereasoning_task": "medcasereasoning"},
         }
     )
 
diff --git a/environments/medexqa/medexqa.py b/environments/medexqa/medexqa.py
index aa866392..abde2acc 100644
--- a/environments/medexqa/medexqa.py
+++ b/environments/medexqa/medexqa.py
@@ -175,6 +175,7 @@ def _format_row(row: dict, idx: int | None = None) -> dict:
 
         # Keep original data in info
         info = dict(row)
+        info.pop("task", None)
         info["answer_text"] = answer_text
         info["answer"] = answer_letter
         info["question"] = question
diff --git a/environments/medrbench/medrbench/medrbench.py b/environments/medrbench/medrbench/medrbench.py
index d729a9a6..74ca5b96 100644
--- a/environments/medrbench/medrbench/medrbench.py
+++ b/environments/medrbench/medrbench/medrbench.py
@@ -105,11 +105,11 @@ def _to_vf_format_diagnosis(data: dict[str, Any], rare_disease_only: bool = Fals
         else:
             question = MULTI_TURN_FIRST_TURN_PROMPT.format(case=case_without_tests)
 
+        medrbench_task = f"medrbench-diagnosis-{task.value}"
         records.append(
             {
                 "question": question,
                 "answer": diagnosis_results,
-                "task": f"medrbench-diagnosis-{task.value}",
                 "info": {
                     "pmc_id": pmc_id,
                     "case_summary": case_summary,
@@ -118,6 +118,7 @@ def _to_vf_format_diagnosis(data: dict[str, Any], rare_disease_only: bool = Fals
                     "differential_diagnosis": differential_diagnosis,
                     "reference_response": diagnosis_results,
                     "task_type": "medrbench-diagnosis",
+                    "medrbench_task": medrbench_task,
                     "body_category": case.get("body_category", []),
                     "disorder_category": case.get("disorder_category", []),
                     "checked_rare_disease": case.get("checked_rare_disease", []),
@@ -149,13 +150,13 @@ def _to_vf_format_treatment(data: dict[str, Any], rare_disease_only: bool = Fals
             {
                 "question": question,
                 "answer": treatment_plan_results,
-                "task": "medrbench-treatment",
                 "info": {
                     "pmc_id": pmc_id,
                     "case_summary": case_summary,
                     "treatment_planning_analysis": treatment_planning_analysis,
                     "reference_response": treatment_plan_results,
                     "task_type": "medrbench-treatment",
+                    "medrbench_task": "medrbench-treatment",
                     "body_category": case.get("body_category", []),
                     "disorder_category": case.get("disorder_category", []),
                     "checked_rare_disease": case.get("checked_rare_disease", []),
@@ -497,7 +498,7 @@ async def judge_rubric_reward(completion: Messages, info: Info, state: State, **
         gold_response = str(info.get("reference_response") or "")
         extracted_answer = parser.parse_answer(completion) or ""
 
-        task_name = str(state.get("task") or info.get("task_type") or "medrbench-diagnosis")
+        task_name = str(info.get("medrbench_task") or info.get("task_type") or "medrbench-diagnosis")
         if task_name.startswith("medrbench-diagnosis-free_turn"):
             info.setdefault("turns_used", _turn_count(state))
 
diff --git a/environments/medredqa/medredqa.py b/environments/medredqa/medredqa.py
index 50810dab..1d8d616c 100644
--- a/environments/medredqa/medredqa.py
+++ b/environments/medredqa/medredqa.py
@@ -47,8 +47,7 @@ def load_environment(
         lambda x: {
             "question": x["title"] + "\n" + x["body"] if x["title"] else x["body"],
             "answer": x["response"],
-            "task": "medredqa",
-            "info": {"judge_response": "Pending.."},
+            "info": {"judge_response": "Pending..", "medredqa_task": "medredqa"},
         }
     )
 
@@ -56,8 +55,7 @@ def load_environment(
         lambda x: {
             "question": x["title"] + "\n" + x["body"] if x["title"] else x["body"],
             "answer": x["response"],
-            "task": "medredqa",
-            "info": {"judge_response": "Pending.."},
+            "info": {"judge_response": "Pending..", "medredqa_task": "medredqa"},
         }
     )
 
diff --git a/environments/medxpertqa/medxpertqa.py b/environments/medxpertqa/medxpertqa.py
index 2ef3334e..a581c641 100644
--- a/environments/medxpertqa/medxpertqa.py
+++ b/environments/medxpertqa/medxpertqa.py
@@ -79,6 +79,8 @@ def _map(example: dict) -> dict:
         answer_text = options.get(answer_letter)
 
         info = dict(example)
+        info.pop("task", None)
+        info["medxpertqa_question_type"] = question_type.value
         if shuffle_answers:
             info["options"] = options
             info["label"] = answer_letter
@@ -88,7 +90,6 @@ def _map(example: dict) -> dict:
             "question": _format_question_with_options(example.get("question", ""), options),
             "answer": answer_letter if answer_letter else "",
             "info": info,
-            "task": question_type.value,
         }
 
     # Disable the Datasets cache when shuffling answers
diff --git a/environments/mmlu_pro_health/mmlu_pro_health.py b/environments/mmlu_pro_health/mmlu_pro_health.py
index 6ad0feb1..626a5c94 100644
--- a/environments/mmlu_pro_health/mmlu_pro_health.py
+++ b/environments/mmlu_pro_health/mmlu_pro_health.py
@@ -114,6 +114,7 @@ def _format_row(row: dict, idx: int) -> dict:
 
         # question and answer have been moved to top-level, so remove them here
         info = dict(row)
+        info.pop("task", None)
 
         # update shuffled answer choices in the info dict
         if shuffle_answers:
diff --git a/environments/pubmedqa/pubmedqa.py b/environments/pubmedqa/pubmedqa.py
index dc733550..a5ee4ab9 100644
--- a/environments/pubmedqa/pubmedqa.py
+++ b/environments/pubmedqa/pubmedqa.py
@@ -74,6 +74,7 @@ def map_row_to_mcq_prompt(
     # required fields: question (for the prompt), and answer (for the scoring)
     info = {
         "answer_text": options.get(correct_answer_letter, final_decision),
+        "pubmedqa_task": "pubmedqa",
     }
     if shuffle_answers:
         info["options"] = options
@@ -81,7 +82,6 @@ def map_row_to_mcq_prompt(
     return {
         "question": complete_prompt,
         "answer": correct_answer_letter,
-        "task": "pubmedqa",
         "info": info,
     }
 
diff --git a/environments/supergpqa_medicine/supergpqa_medicine.py b/environments/supergpqa_medicine/supergpqa_medicine.py
index 404a6220..ac0fa5a5 100644
--- a/environments/supergpqa_medicine/supergpqa_medicine.py
+++ b/environments/supergpqa_medicine/supergpqa_medicine.py
@@ -202,6 +202,7 @@ def _format_row(row: dict, idx: int) -> dict:
 
         # question and answer have been moved to top-level, so remove them here
         info = dict(row)
+        info.pop("task", None)
 
         # update shuffled answer choices in the info dict
         if shuffle_answers:
diff --git a/tests/test_environments/test_longhealth.py b/tests/test_environments/test_longhealth.py
new file mode 100644
index 00000000..00da300b
--- /dev/null
+++ b/tests/test_environments/test_longhealth.py
@@ -0,0 +1,96 @@
+import ast
+import importlib.util
+from pathlib import Path
+
+from verifiers.types import flatten_task_input
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+def _load_longhealth_module():
+    module_path = REPO_ROOT / "environments" / "longhealth" / "longhealth.py"
+    spec = importlib.util.spec_from_file_location("longhealth_local", module_path)
+    assert spec is not None
+    assert spec.loader is not None
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def test_environment_code_does_not_emit_reserved_task_key() -> None:
+    offenders = []
+    for path in (REPO_ROOT / "environments").rglob("*.py"):
+        tree = ast.parse(path.read_text(), filename=str(path))
+        for node in ast.walk(tree):
+            if isinstance(node, ast.Dict):
+                for key in node.keys:
+                    if isinstance(key, ast.Constant) and key.value == "task":
+                        offenders.append(f"{path.relative_to(REPO_ROOT)}:{key.lineno}")
+            if isinstance(node, ast.Subscript) and isinstance(node.ctx, ast.Store):
+                if isinstance(node.slice, ast.Constant) and node.slice.value == "task":
+                    offenders.append(f"{path.relative_to(REPO_ROOT)}:{node.lineno}")
+
+    assert offenders == []
+
+
+def test_copied_info_payloads_drop_reserved_task_key() -> None:
+    offenders = []
+    for path in (REPO_ROOT / "environments").rglob("*.py"):
+        tree = ast.parse(path.read_text(), filename=str(path))
+        for fn in [node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]:
+            copies_raw_payload = False
+            drops_task = False
+            for node in ast.walk(fn):
+                if isinstance(node, ast.Assign):
+                    for target in node.targets:
+                        if not (isinstance(target, ast.Name) and target.id == "info"):
+                            continue
+                        value = node.value
+                        if (
+                            isinstance(value, ast.Call)
+                            and isinstance(value.func, ast.Name)
+                            and value.func.id == "dict"
+                            and value.args
+                        ):
+                            copies_raw_payload = True
+                if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
+                    if not (isinstance(node.func.value, ast.Name) and node.func.value.id == "info"):
+                        continue
+                    if node.func.attr == "pop" and node.args:
+                        arg = node.args[0]
+                        if isinstance(arg, ast.Constant) and arg.value == "task":
+                            drops_task = True
+            if copies_raw_payload and not drops_task:
+                offenders.append(f"{path.relative_to(REPO_ROOT)}:{fn.lineno}:{fn.name}")
+
+    assert offenders == []
+
+
+def test_longhealth_task1_metadata_does_not_use_verifiers_task_key() -> None:
+    module = _load_longhealth_module()
+
+    env = module.load_environment(task="task1", max_examples=3, shuffle_docs=False)
+
+    seen_tasks = set()
+    for row in env.eval_dataset:
+        info = row["info"]
+        assert "task" not in info
+        seen_tasks.add(info["longhealth_task"])
+        assert flatten_task_input(row)["info"]["longhealth_task"] == info["longhealth_task"]
+
+    assert seen_tasks == {"task1"}
+
+
+def test_longhealth_task2_metadata_does_not_use_verifiers_task_key() -> None:
+    module = _load_longhealth_module()
+
+    env = module.load_environment(task="task2", max_examples=2, shuffle_docs=False)
+
+    seen_tasks = set()
+    for row in env.eval_dataset:
+        info = row["info"]
+        assert "task" not in info
+        seen_tasks.add(info["longhealth_task"])
+        assert flatten_task_input(row)["info"]["longhealth_task"] == info["longhealth_task"]
+
+    assert seen_tasks == {"task2_negation", "task2_identification"}

From 073c0236db5043b7e236d1bd9a34d12df69d4471 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 17:38:16 +0000
Subject: [PATCH 43/53] Add client-aware sampling sanitizers

---
 docs/medarc-verifiers-architecture.md     |   8 +-
 medarc_verifiers/cli/verifiers_adapter.py |  37 ++-
 medarc_verifiers/utils/__init__.py        |   3 +-
 medarc_verifiers/utils/sampling_args.py   | 298 +++++++++++++++++-----
 tests/test_cli/test_verifiers_adapter.py  |  97 +++++++
 tests/test_sampling_args.py               | 168 ++++++++++++
 6 files changed, 535 insertions(+), 76 deletions(-)
 create mode 100644 tests/test_sampling_args.py

diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index 0b5fa757..2de10790 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -102,9 +102,15 @@ defaults. They do not affect deterministic path planning or dry-run display,
 because bench plans from TOML and CLI values before importing env packages.
 
 `sampling_args` follow the same TOML -> eval -> ablation -> CLI override model,
-then are sanitized for OpenAI-compatible clients:
+then are sanitized once for the resolved Verifiers client type:
 
 - Unknown parameters move under `extra_body` for compatible servers such as vLLM.
+- OpenAI Chat Completions keeps `reasoning_effort` as a top-level request field.
+- OpenAI Responses maps `reasoning_effort` to `reasoning = {"effort": ...}`.
+- Anthropic Messages uses adaptive thinking only:
+  `thinking = {"type": "adaptive"}` plus
+  `output_config = {"effort": ...}`. Manual `budget_tokens` thinking configs
+  are rejected before execution.
 - Sanitizer: `medarc_verifiers/utils/sampling_args.py`
 - Import boundary: `medarc_verifiers/cli/upstream_eval.py`
 - Temporary merge/adaptation adapter behind that boundary:
diff --git a/medarc_verifiers/cli/verifiers_adapter.py b/medarc_verifiers/cli/verifiers_adapter.py
index 1049ff62..81651a42 100644
--- a/medarc_verifiers/cli/verifiers_adapter.py
+++ b/medarc_verifiers/cli/verifiers_adapter.py
@@ -27,7 +27,7 @@
 
 from medarc_verifiers.cli.utils.endpoint_utils import load_endpoint_sampling_profiles
 from medarc_verifiers.utils.prime_inference import prime_inference_overrides
-from medarc_verifiers.utils.sampling_args import sanitize_sampling_args_for_openai
+from medarc_verifiers.utils.sampling_args import sanitize_sampling_args
 
 logger = logging.getLogger(__name__)
 
@@ -218,6 +218,7 @@ def build_eval_config(raw: Mapping[str, Any], *, overrides: EvalConfigOverrides
     sampling_args = _build_sampling_args(
         merged_raw,
         client_config.api_base_url,
+        client_type=client_config.client_type,
         endpoint_sampling_args=endpoint_sampling_args,
         cli_sampling_args=cli_sampling_args,
     )
@@ -260,7 +261,7 @@ def build_eval_identity_payload(
 
     merged_raw = _apply_overrides(dict(raw), overrides)
     endpoints_path = str(merged_raw.get("endpoints_path", DEFAULT_ENDPOINTS_PATH))
-    endpoints = load_endpoints(endpoints_path)
+    endpoints = _load_endpoint_registry(endpoints_path)
     model, _resolved_endpoint_id, _client_config = _build_client_config(merged_raw, endpoints, endpoints_path)
 
     payload = {
@@ -498,6 +499,7 @@ def _build_sampling_args(
     raw: Mapping[str, Any],
     api_base_url: str,
     *,
+    client_type: str,
     endpoint_sampling_args: Mapping[str, Any] | None = None,
     cli_sampling_args: Mapping[str, Any] | None = None,
 ) -> dict[str, Any]:
@@ -512,16 +514,12 @@ def _build_sampling_args(
         temperature=raw.get("temperature"),
         include_none_max_tokens=include_none_max_tokens,
     )
-    merged = sanitize_sampling_args_for_openai(prime_sampling_overrides)
-    merged = _deep_merge(merged, sanitize_sampling_args_for_openai(endpoint_sampling))
-    merged = _deep_merge(merged, sanitize_sampling_args_for_openai(scalar_sampling_args))
-    merged = _deep_merge(
-        merged, sanitize_sampling_args_for_openai(_validate_sampling_mapping(raw.get("sampling_args"), "sampling_args"))
-    )
-    merged = _deep_merge(
-        merged, sanitize_sampling_args_for_openai(_validate_sampling_mapping(cli_sampling_args, "CLI sampling_args"))
-    )
-    return merged
+    merged = _merge_sampling_layer({}, prime_sampling_overrides)
+    merged = _merge_sampling_layer(merged, endpoint_sampling)
+    merged = _merge_sampling_layer(merged, scalar_sampling_args)
+    merged = _merge_sampling_layer(merged, _validate_sampling_mapping(raw.get("sampling_args"), "sampling_args"))
+    merged = _merge_sampling_layer(merged, _validate_sampling_mapping(cli_sampling_args, "CLI sampling_args"))
+    return sanitize_sampling_args(merged, client_type=client_type)
 
 
 def _merge_sampling_args(
@@ -604,6 +602,21 @@ def _deep_merge(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[st
     return merged
 
 
+def _merge_sampling_layer(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]:
+    merged = dict(base)
+    if "reasoning" in override:
+        merged.pop("reasoning_effort", None)
+    if "reasoning_effort" in override:
+        merged.pop("reasoning", None)
+    direct_override_keys = set(override) - {"extra_body"}
+    if direct_override_keys and isinstance(merged.get("extra_body"), Mapping):
+        extra_body = dict(cast(Mapping[str, Any], merged["extra_body"]))
+        for key in direct_override_keys:
+            extra_body.pop(key, None)
+        merged["extra_body"] = extra_body
+    return _deep_merge(merged, override)
+
+
 def _validate_sampling_mapping(value: object, label: str) -> dict[str, Any]:
     if value is None:
         return {}
diff --git a/medarc_verifiers/utils/__init__.py b/medarc_verifiers/utils/__init__.py
index 776aaf86..c7ccc1c9 100644
--- a/medarc_verifiers/utils/__init__.py
+++ b/medarc_verifiers/utils/__init__.py
@@ -5,7 +5,7 @@
     randomize_multiple_choice_hf_map,
     randomize_multiple_choice_row,
 )
-from .sampling_args import sanitize_sampling_args_for_openai
+from .sampling_args import sanitize_sampling_args, sanitize_sampling_args_for_openai
 
 __all__ = [
     "download_file",
@@ -15,5 +15,6 @@
     "randomize_multiple_choice_row",
     "default_judge_api_key",
     "judge_sampling_args_and_headers",
+    "sanitize_sampling_args",
     "sanitize_sampling_args_for_openai",
 ]
diff --git a/medarc_verifiers/utils/sampling_args.py b/medarc_verifiers/utils/sampling_args.py
index 4019d35a..ed0d321b 100644
--- a/medarc_verifiers/utils/sampling_args.py
+++ b/medarc_verifiers/utils/sampling_args.py
@@ -1,91 +1,265 @@
+from __future__ import annotations
+
 import inspect
 from collections.abc import Mapping
 from functools import lru_cache
 from typing import Any
 
+_OPENAI_REASONING_KEYS = {"reasoning", "reasoning_effort", "thinking", "output_config"}
+_ANTHROPIC_EFFORT_VALUES = {"low", "medium", "high"}
+_OPENAI_CHAT_FALLBACK_TOP_LEVEL_KEYS = {
+    "temperature",
+    "top_p",
+    "max_tokens",
+    "max_completion_tokens",
+    "n",
+    "stop",
+    "presence_penalty",
+    "frequency_penalty",
+    "logit_bias",
+    "seed",
+    "response_format",
+    "tool_choice",
+    "tools",
+    "stream",
+    "extra_body",
+    "reasoning_effort",
+}
+_OPENAI_RESPONSES_FALLBACK_TOP_LEVEL_KEYS = {
+    "temperature",
+    "top_p",
+    "max_tokens",
+    "max_completion_tokens",
+    "max_output_tokens",
+    "n",
+    "stop",
+    "stream",
+    "extra_body",
+    "reasoning",
+    "tools",
+    "tool_choice",
+}
+_OPENAI_CHAT_VERIFIERS_WRAPPER_KEYS = {"max_tokens"}
+_OPENAI_RESPONSES_VERIFIERS_WRAPPER_KEYS = {"max_tokens", "max_completion_tokens", "n", "stop", "modalities"}
+_OPENAI_COMPLETIONS_FALLBACK_TOP_LEVEL_KEYS = {
+    "temperature",
+    "top_p",
+    "max_tokens",
+    "n",
+    "stop",
+    "presence_penalty",
+    "frequency_penalty",
+    "logit_bias",
+    "seed",
+    "stream",
+    "extra_body",
+}
 
-def sanitize_sampling_args_for_openai(sampling_args: Mapping[str, Any] | None) -> dict[str, Any]:
-    """Return sampling args split into OpenAI-recognized kwargs and extra_body.
 
-    Any parameters not recognized by the OpenAI Chat Completions API are moved under
-    the `extra_body` key so they can be forwarded to compatible servers (e.g., vLLM/Qwen).
-    """
+def sanitize_sampling_args(
+    sampling_args: Mapping[str, Any] | None,
+    *,
+    client_type: str,
+) -> dict[str, Any]:
+    """Return sampling args in the request shape expected by the resolved client."""
     if not sampling_args:
         return {}
 
-    allowed_keys = _get_openai_allowed_param_names()
+    if client_type in {"openai_chat_completions", "openai_chat_completions_token"}:
+        return _sanitize_openai_chat(sampling_args)
+    if client_type == "openai_responses":
+        return _sanitize_openai_responses(sampling_args)
+    if client_type == "openai_completions":
+        return _sanitize_openai_completions(sampling_args)
+    if client_type == "anthropic_messages":
+        return _sanitize_anthropic_messages(sampling_args)
+    if client_type in {"renderer", "nemorl_chat_completions"}:
+        return _drop_none(sampling_args)
+    return _drop_none(sampling_args)
+
+
+def sanitize_sampling_args_for_openai(sampling_args: Mapping[str, Any] | None) -> dict[str, Any]:
+    """Compatibility wrapper for existing OpenAI Chat Completions call sites."""
+    return sanitize_sampling_args(sampling_args, client_type="openai_chat_completions")
+
+
+def _sanitize_openai_chat(sampling_args: Mapping[str, Any]) -> dict[str, Any]:
+    cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"})
+    return _move_compatible_extras_to_extra_body(cleaned, allowed_top_level_keys=_get_openai_chat_allowed_param_names())
+
+
+def _sanitize_openai_responses(sampling_args: Mapping[str, Any]) -> dict[str, Any]:
+    cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"})
+    reasoning_effort = cleaned.pop("reasoning_effort", None)
+    if reasoning_effort is not None:
+        existing_reasoning = cleaned.get("reasoning")
+        if existing_reasoning is None:
+            cleaned["reasoning"] = {"effort": reasoning_effort}
+        elif isinstance(existing_reasoning, Mapping):
+            reasoning = dict(existing_reasoning)
+            reasoning.setdefault("effort", reasoning_effort)
+            cleaned["reasoning"] = reasoning
+        else:
+            raise ValueError("sampling_args.reasoning must be a dict when used with openai_responses")
+    return _move_compatible_extras_to_extra_body(
+        cleaned, allowed_top_level_keys=_get_openai_responses_allowed_param_names()
+    )
+
+
+def _sanitize_openai_completions(sampling_args: Mapping[str, Any]) -> dict[str, Any]:
+    cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"})
+    for key in _OPENAI_REASONING_KEYS:
+        cleaned.pop(key, None)
+    return _move_compatible_extras_to_extra_body(
+        cleaned, allowed_top_level_keys=_get_openai_completions_allowed_param_names()
+    )
+
 
+def _sanitize_anthropic_messages(sampling_args: Mapping[str, Any]) -> dict[str, Any]:
+    cleaned = _drop_none(sampling_args)
+    reasoning_effort = cleaned.pop("reasoning_effort", None)
+    cleaned.pop("reasoning", None)
+    cleaned.pop("effort", None)
+    cleaned.pop("extra_body", None)
+
+    thinking = cleaned.get("thinking")
+    if thinking is not None:
+        if not isinstance(thinking, Mapping):
+            raise ValueError("sampling_args.thinking must be a dict when used with anthropic_messages")
+        thinking_dict = dict(thinking)
+        if thinking_dict.get("type") != "adaptive":
+            raise ValueError("anthropic_messages only supports adaptive thinking configs")
+        if "budget_tokens" in thinking_dict:
+            raise ValueError("anthropic_messages does not support manual budget_tokens thinking configs")
+        thinking_dict.pop("effort", None)
+        cleaned["thinking"] = thinking_dict
+
+    if reasoning_effort is not None:
+        effort = _validate_anthropic_effort(reasoning_effort)
+        cleaned["thinking"] = {"type": "adaptive"}
+        output_config = cleaned.get("output_config")
+        if output_config is None:
+            cleaned["output_config"] = {"effort": effort}
+        elif isinstance(output_config, Mapping):
+            cleaned["output_config"] = {**dict(output_config), "effort": effort}
+        else:
+            raise ValueError("sampling_args.output_config must be a dict when used with anthropic_messages")
+    elif "output_config" in cleaned:
+        output_config = cleaned["output_config"]
+        if not isinstance(output_config, Mapping):
+            raise ValueError("sampling_args.output_config must be a dict when used with anthropic_messages")
+        output_config_dict = dict(output_config)
+        if "effort" in output_config_dict:
+            output_config_dict["effort"] = _validate_anthropic_effort(output_config_dict["effort"])
+        cleaned["output_config"] = output_config_dict
+
+    return cleaned
+
+
+def _validate_anthropic_effort(value: Any) -> str:
+    if not isinstance(value, str) or value not in _ANTHROPIC_EFFORT_VALUES:
+        raise ValueError(
+            "anthropic_messages reasoning effort must be one of: "
+            f"{', '.join(sorted(_ANTHROPIC_EFFORT_VALUES))}"
+        )
+    return value
+
+
+def _move_compatible_extras_to_extra_body(
+    sampling_args: Mapping[str, Any],
+    *,
+    allowed_top_level_keys: set[str],
+) -> dict[str, Any]:
     filtered: dict[str, Any] = {}
     extras: dict[str, Any] = {}
     for key, value in sampling_args.items():
-        if key in allowed_keys:
+        if key in allowed_top_level_keys:
             filtered[key] = value
         else:
             extras[key] = value
 
-    if extras:
-        # OpenAI python client forwards unknown params via `extra_body`.
-        # If the caller already supplied an `extra_body` (e.g., to request `usage.include`),
-        # merge rather than overwrite it.
-        existing = filtered.get("extra_body")
-        if existing is None:
-            filtered["extra_body"] = extras
-        elif isinstance(existing, Mapping):
-            merged = dict(existing)
-            for key, value in extras.items():
-                merged[key] = value
-            filtered["extra_body"] = merged
-        else:
-            filtered["extra_body"] = {"_passthrough_extra_body": existing, **extras}
+    if not extras:
+        return filtered
+
+    existing = filtered.get("extra_body")
+    if existing is None:
+        filtered["extra_body"] = extras
+    elif isinstance(existing, Mapping):
+        filtered["extra_body"] = _deep_merge(extras, existing)
+    else:
+        filtered["extra_body"] = {"_passthrough_extra_body": existing, **extras}
     return filtered
 
 
 @lru_cache(maxsize=1)
-def _get_openai_allowed_param_names() -> set[str]:
-    """Infer allowed kwargs for OpenAI create() by inspecting client signatures.
+def _get_openai_chat_allowed_param_names() -> set[str]:
+    try:
+        from openai.resources.chat.completions import AsyncCompletions as ChatAsyncCompletions  # type: ignore
+    except Exception:
+        return set(_OPENAI_CHAT_FALLBACK_TOP_LEVEL_KEYS)
+
+    allowed = _param_names(ChatAsyncCompletions.create) or set(_OPENAI_CHAT_FALLBACK_TOP_LEVEL_KEYS)
+    allowed.add("extra_body")
+    allowed.add("reasoning_effort")
+    allowed.update(_OPENAI_CHAT_VERIFIERS_WRAPPER_KEYS)
+    return allowed
 
-    We union parameter names from:
-      - openai.resources.chat.completions.AsyncCompletions.create
-      - openai.resources.completions.AsyncCompletions.create
 
-    On failure, return a conservative fallback. Always include 'extra_body'.
-    """
+@lru_cache(maxsize=1)
+def _get_openai_responses_allowed_param_names() -> set[str]:
+    try:
+        from openai.resources.responses import AsyncResponses  # type: ignore
+    except Exception:
+        return set(_OPENAI_RESPONSES_FALLBACK_TOP_LEVEL_KEYS)
+
+    allowed = _param_names(AsyncResponses.create) or set(_OPENAI_RESPONSES_FALLBACK_TOP_LEVEL_KEYS)
+    allowed.add("extra_body")
+    allowed.add("reasoning")
+    allowed.update(_OPENAI_RESPONSES_VERIFIERS_WRAPPER_KEYS)
+    return allowed
+
+
+@lru_cache(maxsize=1)
+def _get_openai_completions_allowed_param_names() -> set[str]:
     try:
-        from openai.resources.chat.completions import AsyncCompletions as ChatAsyncCompletions  # type: ignore
         from openai.resources.completions import AsyncCompletions as TextAsyncCompletions  # type: ignore
     except Exception:
-        return {
-            "temperature",
-            "top_p",
-            "max_tokens",
-            "max_completion_tokens",
-            "n",
-            "stop",
-            "presence_penalty",
-            "frequency_penalty",
-            "logit_bias",
-            "seed",
-            "response_format",
-            "tool_choice",
-            "tools",
-            "stream",
-            "extra_body",
-        }
-
-    def _param_names(callable_obj: Any) -> set[str]:
-        try:
-            sig = inspect.signature(callable_obj)
-        except Exception:
-            return set()
-        names: set[str] = set()
-        for name, param in sig.parameters.items():
-            if name == "self":
-                continue
-            if param.kind == inspect.Parameter.VAR_POSITIONAL:
-                continue
-            names.add(name)
-        return names
-
-    allowed = _param_names(ChatAsyncCompletions.create) | _param_names(TextAsyncCompletions.create)
+        return set(_OPENAI_COMPLETIONS_FALLBACK_TOP_LEVEL_KEYS)
+
+    allowed = _param_names(TextAsyncCompletions.create) or set(_OPENAI_COMPLETIONS_FALLBACK_TOP_LEVEL_KEYS)
     allowed.add("extra_body")
     return allowed
+
+
+def _param_names(callable_obj: Any) -> set[str]:
+    try:
+        sig = inspect.signature(callable_obj)
+    except Exception:
+        return set()
+    names: set[str] = set()
+    for name, param in sig.parameters.items():
+        if name == "self":
+            continue
+        if param.kind == inspect.Parameter.VAR_POSITIONAL:
+            continue
+        names.add(name)
+    return names
+
+
+def _drop_none(
+    sampling_args: Mapping[str, Any],
+    *,
+    preserve_none_keys: set[str] | None = None,
+) -> dict[str, Any]:
+    preserve_none_keys = preserve_none_keys or set()
+    return {key: value for key, value in sampling_args.items() if value is not None or key in preserve_none_keys}
+
+
+def _deep_merge(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]:
+    merged = dict(base)
+    for key, value in override.items():
+        if isinstance(value, Mapping) and isinstance(merged.get(key), Mapping):
+            merged[key] = _deep_merge(merged[key], value)
+        else:
+            merged[key] = value
+    return merged
diff --git a/tests/test_cli/test_verifiers_adapter.py b/tests/test_cli/test_verifiers_adapter.py
index ef32e67a..c0d179fe 100644
--- a/tests/test_cli/test_verifiers_adapter.py
+++ b/tests/test_cli/test_verifiers_adapter.py
@@ -261,7 +261,33 @@ def test_build_eval_config_uses_endpoint_sampling_defaults(tmp_path: Path) -> No
     assert config.client_config.client_type == "openai_responses"
     assert config.sampling_args["temperature"] == 1.0
     assert config.sampling_args["top_p"] == 1.0
+    assert "reasoning_effort" not in config.sampling_args
+    assert config.sampling_args["reasoning"] == {"effort": "low"}
+    assert config.sampling_args["extra_body"]["top_k"] == 0
+
+
+def test_build_eval_config_chat_client_keeps_reasoning_effort(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "gpt-oss-chat"
+model = "openai/gpt-oss-20b"
+url = "http://localhost:8010/v1"
+key = "VLLM_API_KEY"
+api_client_type = "openai_chat_completions"
+
+[endpoint.sampling_args]
+top_k = 0
+reasoning_effort = "low"
+""".strip()
+    )
+
+    config = build_eval_config({"env_id": "medqa", "model": "gpt-oss-chat", "endpoints_path": str(endpoints_path)})
+
+    assert config.client_config.client_type == "openai_chat_completions"
     assert config.sampling_args["reasoning_effort"] == "low"
+    assert "reasoning" not in config.sampling_args
     assert config.sampling_args["extra_body"]["top_k"] == 0
 
 
@@ -303,6 +329,52 @@ def test_build_eval_config_sampling_precedence_endpoint_raw_and_cli(tmp_path: Pa
     assert cli_config.sampling_args["temperature"] == 0.8
 
 
+def test_build_eval_config_responses_cli_reasoning_overrides_endpoint_reasoning_effort(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "profiled"
+model = "openai/profiled"
+url = "https://profiled.example/v1"
+key = "PROFILED_KEY"
+api_client_type = "openai_responses"
+sampling_args = { reasoning_effort = "low" }
+""".strip()
+    )
+
+    config = build_eval_config(
+        {"env_id": "medqa", "endpoint_id": "profiled", "endpoints_path": str(endpoints_path)},
+        overrides=EvalConfigOverrides(sampling_args={"reasoning": {"effort": "high"}}),
+    )
+
+    assert config.sampling_args["reasoning"] == {"effort": "high"}
+    assert "reasoning_effort" not in config.sampling_args
+
+
+def test_build_eval_config_responses_cli_reasoning_effort_overrides_endpoint_reasoning(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "profiled"
+model = "openai/profiled"
+url = "https://profiled.example/v1"
+key = "PROFILED_KEY"
+api_client_type = "openai_responses"
+sampling_args = { reasoning = { effort = "low" } }
+""".strip()
+    )
+
+    config = build_eval_config(
+        {"env_id": "medqa", "endpoint_id": "profiled", "endpoints_path": str(endpoints_path)},
+        overrides=EvalConfigOverrides(sampling_args={"reasoning_effort": "high"}),
+    )
+
+    assert config.sampling_args["reasoning"] == {"effort": "high"}
+    assert "reasoning_effort" not in config.sampling_args
+
+
 def test_build_eval_config_scalar_temperature_overrides_endpoint_default(tmp_path: Path) -> None:
     endpoints_path = tmp_path / "endpoints.toml"
     endpoints_path.write_text(
@@ -383,6 +455,31 @@ def test_build_eval_config_direct_unknown_sampling_arg_overrides_extra_body_key(
     assert config.sampling_args["extra_body"]["top_k"] == 3
 
 
+def test_build_eval_config_direct_unknown_sampling_arg_overrides_extra_body_key_for_any_extra(tmp_path: Path) -> None:
+    endpoints_path = tmp_path / "endpoints.toml"
+    endpoints_path.write_text(
+        """
+[[endpoint]]
+endpoint_id = "profiled"
+model = "openai/profiled"
+url = "https://profiled.example/v1"
+key = "PROFILED_KEY"
+sampling_args = { extra_body = { repetition_penalty = 1.1 } }
+""".strip()
+    )
+
+    config = build_eval_config(
+        {
+            "env_id": "medqa",
+            "endpoint_id": "profiled",
+            "endpoints_path": str(endpoints_path),
+        },
+        overrides=EvalConfigOverrides(sampling_args={"repetition_penalty": 1.2}),
+    )
+
+    assert config.sampling_args["extra_body"]["repetition_penalty"] == 1.2
+
+
 def test_build_eval_config_extra_body_key_overrides_lower_precedence_direct_unknown_arg(tmp_path: Path) -> None:
     endpoints_path = tmp_path / "endpoints.toml"
     endpoints_path.write_text(
diff --git a/tests/test_sampling_args.py b/tests/test_sampling_args.py
new file mode 100644
index 00000000..f02878e0
--- /dev/null
+++ b/tests/test_sampling_args.py
@@ -0,0 +1,168 @@
+from __future__ import annotations
+
+import pytest
+
+from medarc_verifiers.utils.sampling_args import sanitize_sampling_args, sanitize_sampling_args_for_openai
+
+
+def test_openai_chat_keeps_reasoning_effort_and_moves_extras() -> None:
+    result = sanitize_sampling_args(
+        {
+            "reasoning_effort": "low",
+            "top_k": 20,
+            "min_p": 0.1,
+            "repetition_penalty": 1.1,
+            "extra_body": {"usage": {"include": True}},
+        },
+        client_type="openai_chat_completions",
+    )
+
+    assert result["reasoning_effort"] == "low"
+    assert result["extra_body"] == {
+        "usage": {"include": True},
+        "top_k": 20,
+        "min_p": 0.1,
+        "repetition_penalty": 1.1,
+    }
+
+
+def test_openai_chat_token_uses_chat_shape() -> None:
+    result = sanitize_sampling_args(
+        {"reasoning_effort": "medium", "top_k": 8},
+        client_type="openai_chat_completions_token",
+    )
+
+    assert result["reasoning_effort"] == "medium"
+    assert result["extra_body"] == {"top_k": 8}
+
+
+def test_compatibility_wrapper_uses_chat_shape() -> None:
+    result = sanitize_sampling_args_for_openai({"reasoning_effort": "low", "top_k": 1})
+
+    assert result["reasoning_effort"] == "low"
+    assert result["extra_body"] == {"top_k": 1}
+
+
+def test_openai_responses_converts_reasoning_effort() -> None:
+    result = sanitize_sampling_args(
+        {"reasoning_effort": "low", "top_k": 20, "max_tokens": 128, "stop": ["END"]},
+        client_type="openai_responses",
+    )
+
+    assert "reasoning_effort" not in result
+    assert result["reasoning"] == {"effort": "low"}
+    assert result["max_tokens"] == 128
+    assert result["stop"] == ["END"]
+    assert result["extra_body"] == {"top_k": 20}
+
+
+def test_openai_responses_preserves_explicit_reasoning_effort() -> None:
+    result = sanitize_sampling_args(
+        {"reasoning": {"effort": "high", "summary": "auto"}, "reasoning_effort": "low"},
+        client_type="openai_responses",
+    )
+
+    assert result["reasoning"] == {"effort": "high", "summary": "auto"}
+
+
+def test_openai_completions_removes_reasoning_and_moves_extras() -> None:
+    result = sanitize_sampling_args(
+        {"prompt": "x", "reasoning_effort": "low", "reasoning": {"effort": "low"}, "top_k": 20},
+        client_type="openai_completions",
+    )
+
+    assert "reasoning_effort" not in result
+    assert "reasoning" not in result
+    assert result["extra_body"] == {"top_k": 20}
+
+
+@pytest.mark.parametrize("client_type", ["renderer", "nemorl_chat_completions"])
+def test_passthrough_clients_only_drop_none(client_type: str) -> None:
+    result = sanitize_sampling_args(
+        {"reasoning_effort": "low", "top_k": 20, "temperature": None},
+        client_type=client_type,
+    )
+
+    assert result == {"reasoning_effort": "low", "top_k": 20}
+
+
+def test_anthropic_preserves_adaptive_thinking() -> None:
+    result = sanitize_sampling_args(
+        {"thinking": {"type": "adaptive"}, "output_config": {"effort": "medium"}, "top_k": 10},
+        client_type="anthropic_messages",
+    )
+
+    assert result["thinking"] == {"type": "adaptive"}
+    assert result["output_config"] == {"effort": "medium"}
+    assert result["top_k"] == 10
+
+
+def test_anthropic_maps_reasoning_effort_to_adaptive_output_config() -> None:
+    result = sanitize_sampling_args({"reasoning_effort": "high"}, client_type="anthropic_messages")
+
+    assert result["thinking"] == {"type": "adaptive"}
+    assert result["output_config"] == {"effort": "high"}
+    assert "reasoning_effort" not in result
+    assert "effort" not in result
+
+
+def test_anthropic_does_not_put_effort_inside_thinking() -> None:
+    result = sanitize_sampling_args(
+        {"thinking": {"type": "adaptive", "effort": "low"}, "reasoning_effort": "medium"},
+        client_type="anthropic_messages",
+    )
+
+    assert result["thinking"] == {"type": "adaptive"}
+    assert result["output_config"] == {"effort": "medium"}
+
+
+@pytest.mark.parametrize(
+    "sampling_args",
+    [
+        {"thinking": {"type": "enabled", "budget_tokens": 4096}},
+        {"thinking": {"type": "adaptive", "budget_tokens": 4096}},
+    ],
+)
+def test_anthropic_rejects_manual_budget_thinking(sampling_args: dict[str, object]) -> None:
+    with pytest.raises(ValueError, match="thinking"):
+        sanitize_sampling_args(sampling_args, client_type="anthropic_messages")
+
+
+def test_anthropic_validates_effort_values() -> None:
+    with pytest.raises(ValueError, match="reasoning effort"):
+        sanitize_sampling_args({"reasoning_effort": "extreme"}, client_type="anthropic_messages")
+
+
+def test_anthropic_rejects_xhigh_without_model_context() -> None:
+    with pytest.raises(ValueError, match="reasoning effort"):
+        sanitize_sampling_args({"reasoning_effort": "xhigh"}, client_type="anthropic_messages")
+
+
+@pytest.mark.asyncio
+async def test_openai_responses_client_receives_nested_reasoning() -> None:
+    from verifiers.clients.openai_responses_client import OpenAIResponsesClient
+
+    class Responses:
+        def __init__(self) -> None:
+            self.kwargs: dict[str, object] | None = None
+
+        async def create(self, **kwargs):
+            self.kwargs = kwargs
+            return object()
+
+    class Client:
+        def __init__(self) -> None:
+            self.responses = Responses()
+
+        async def close(self) -> None:
+            pass
+
+    raw_client = Client()
+    client = OpenAIResponsesClient(raw_client)
+    sampling_args = sanitize_sampling_args({"reasoning_effort": "low"}, client_type="openai_responses")
+
+    await client.get_native_response([], "model", sampling_args)
+
+    assert raw_client.responses.kwargs is not None
+    assert "reasoning_effort" not in raw_client.responses.kwargs
+    assert raw_client.responses.kwargs["reasoning"] == {"effort": "low"}

From bf57af9443dccccc5763300a4819b69b88af5835 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 18:06:36 +0000
Subject: [PATCH 44/53] Derive sampling args from SDKs

---
 medarc_verifiers/utils/sampling_args.py | 151 +++++++++++-------------
 tests/test_sampling_args.py             |  69 ++++++++++-
 2 files changed, 133 insertions(+), 87 deletions(-)

diff --git a/medarc_verifiers/utils/sampling_args.py b/medarc_verifiers/utils/sampling_args.py
index ed0d321b..e7adb8dd 100644
--- a/medarc_verifiers/utils/sampling_args.py
+++ b/medarc_verifiers/utils/sampling_args.py
@@ -3,57 +3,10 @@
 import inspect
 from collections.abc import Mapping
 from functools import lru_cache
-from typing import Any
+from typing import Any, Literal, get_args, get_origin
 
 _OPENAI_REASONING_KEYS = {"reasoning", "reasoning_effort", "thinking", "output_config"}
-_ANTHROPIC_EFFORT_VALUES = {"low", "medium", "high"}
-_OPENAI_CHAT_FALLBACK_TOP_LEVEL_KEYS = {
-    "temperature",
-    "top_p",
-    "max_tokens",
-    "max_completion_tokens",
-    "n",
-    "stop",
-    "presence_penalty",
-    "frequency_penalty",
-    "logit_bias",
-    "seed",
-    "response_format",
-    "tool_choice",
-    "tools",
-    "stream",
-    "extra_body",
-    "reasoning_effort",
-}
-_OPENAI_RESPONSES_FALLBACK_TOP_LEVEL_KEYS = {
-    "temperature",
-    "top_p",
-    "max_tokens",
-    "max_completion_tokens",
-    "max_output_tokens",
-    "n",
-    "stop",
-    "stream",
-    "extra_body",
-    "reasoning",
-    "tools",
-    "tool_choice",
-}
-_OPENAI_CHAT_VERIFIERS_WRAPPER_KEYS = {"max_tokens"}
-_OPENAI_RESPONSES_VERIFIERS_WRAPPER_KEYS = {"max_tokens", "max_completion_tokens", "n", "stop", "modalities"}
-_OPENAI_COMPLETIONS_FALLBACK_TOP_LEVEL_KEYS = {
-    "temperature",
-    "top_p",
-    "max_tokens",
-    "n",
-    "stop",
-    "presence_penalty",
-    "frequency_penalty",
-    "logit_bias",
-    "seed",
-    "stream",
-    "extra_body",
-}
+_FRAMEWORK_REQUEST_KEYS = {"model", "messages", "input", "prompt", "tools", "system", "extra_headers"}
 
 
 def sanitize_sampling_args(
@@ -85,11 +38,14 @@ def sanitize_sampling_args_for_openai(sampling_args: Mapping[str, Any] | None) -
 
 def _sanitize_openai_chat(sampling_args: Mapping[str, Any]) -> dict[str, Any]:
     cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"})
+    _drop_framework_request_keys(cleaned)
     return _move_compatible_extras_to_extra_body(cleaned, allowed_top_level_keys=_get_openai_chat_allowed_param_names())
 
 
 def _sanitize_openai_responses(sampling_args: Mapping[str, Any]) -> dict[str, Any]:
     cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"})
+    _normalize_openai_responses_sampling_args(cleaned)
+    _drop_framework_request_keys(cleaned)
     reasoning_effort = cleaned.pop("reasoning_effort", None)
     if reasoning_effort is not None:
         existing_reasoning = cleaned.get("reasoning")
@@ -106,8 +62,27 @@ def _sanitize_openai_responses(sampling_args: Mapping[str, Any]) -> dict[str, An
     )
 
 
+def _normalize_openai_responses_sampling_args(sampling_args: dict[str, Any]) -> None:
+    n = sampling_args.pop("n", None)
+    if n not in (None, 1):
+        raise ValueError("Responses API client only supports n=1")
+
+    max_tokens = sampling_args.pop("max_tokens", None)
+    max_completion_tokens = sampling_args.pop("max_completion_tokens", None)
+    if "max_output_tokens" not in sampling_args:
+        output_tokens = max_tokens if max_tokens is not None else max_completion_tokens
+        if output_tokens is not None:
+            sampling_args["max_output_tokens"] = output_tokens
+
+    if sampling_args.get("stop") is not None:
+        raise ValueError("Responses API client does not support stop sequences")
+    sampling_args.pop("stop", None)
+    sampling_args.pop("modalities", None)
+
+
 def _sanitize_openai_completions(sampling_args: Mapping[str, Any]) -> dict[str, Any]:
     cleaned = _drop_none(sampling_args, preserve_none_keys={"max_tokens"})
+    _drop_framework_request_keys(cleaned)
     for key in _OPENAI_REASONING_KEYS:
         cleaned.pop(key, None)
     return _move_compatible_extras_to_extra_body(
@@ -117,6 +92,7 @@ def _sanitize_openai_completions(sampling_args: Mapping[str, Any]) -> dict[str,
 
 def _sanitize_anthropic_messages(sampling_args: Mapping[str, Any]) -> dict[str, Any]:
     cleaned = _drop_none(sampling_args)
+    _drop_framework_request_keys(cleaned)
     reasoning_effort = cleaned.pop("reasoning_effort", None)
     cleaned.pop("reasoning", None)
     cleaned.pop("effort", None)
@@ -153,18 +129,25 @@ def _sanitize_anthropic_messages(sampling_args: Mapping[str, Any]) -> dict[str,
             output_config_dict["effort"] = _validate_anthropic_effort(output_config_dict["effort"])
         cleaned["output_config"] = output_config_dict
 
-    return cleaned
+    allowed_keys = _get_anthropic_allowed_param_names()
+    return {key: value for key, value in cleaned.items() if key in allowed_keys}
 
 
 def _validate_anthropic_effort(value: Any) -> str:
-    if not isinstance(value, str) or value not in _ANTHROPIC_EFFORT_VALUES:
+    effort_values = _get_anthropic_effort_values()
+    if not isinstance(value, str) or value not in effort_values:
         raise ValueError(
             "anthropic_messages reasoning effort must be one of: "
-            f"{', '.join(sorted(_ANTHROPIC_EFFORT_VALUES))}"
+            f"{', '.join(sorted(effort_values))}"
         )
     return value
 
 
+def _drop_framework_request_keys(sampling_args: dict[str, Any]) -> None:
+    for key in _FRAMEWORK_REQUEST_KEYS:
+        sampling_args.pop(key, None)
+
+
 def _move_compatible_extras_to_extra_body(
     sampling_args: Mapping[str, Any],
     *,
@@ -191,51 +174,57 @@ def _move_compatible_extras_to_extra_body(
     return filtered
 
 
+@lru_cache(maxsize=1)
+def _get_anthropic_effort_values() -> set[str]:
+    from anthropic.types import OutputConfigParam
+    from typing import get_type_hints
+
+    effort_type = get_type_hints(OutputConfigParam)["effort"]
+    return _literal_string_values(effort_type)
+
+
+def _literal_string_values(type_hint: Any) -> set[str]:
+    values: set[str] = set()
+    origin = get_origin(type_hint)
+    if origin is None:
+        return values
+    if origin is Literal:
+        return {value for value in get_args(type_hint) if isinstance(value, str)}
+    for arg in get_args(type_hint):
+        values.update(_literal_string_values(arg))
+    return values
+
+
 @lru_cache(maxsize=1)
 def _get_openai_chat_allowed_param_names() -> set[str]:
-    try:
-        from openai.resources.chat.completions import AsyncCompletions as ChatAsyncCompletions  # type: ignore
-    except Exception:
-        return set(_OPENAI_CHAT_FALLBACK_TOP_LEVEL_KEYS)
+    from openai.resources.chat.completions import AsyncCompletions as ChatAsyncCompletions  # type: ignore
 
-    allowed = _param_names(ChatAsyncCompletions.create) or set(_OPENAI_CHAT_FALLBACK_TOP_LEVEL_KEYS)
-    allowed.add("extra_body")
-    allowed.add("reasoning_effort")
-    allowed.update(_OPENAI_CHAT_VERIFIERS_WRAPPER_KEYS)
-    return allowed
+    return _param_names(ChatAsyncCompletions.create)
 
 
 @lru_cache(maxsize=1)
 def _get_openai_responses_allowed_param_names() -> set[str]:
-    try:
-        from openai.resources.responses import AsyncResponses  # type: ignore
-    except Exception:
-        return set(_OPENAI_RESPONSES_FALLBACK_TOP_LEVEL_KEYS)
+    from openai.resources.responses import AsyncResponses  # type: ignore
 
-    allowed = _param_names(AsyncResponses.create) or set(_OPENAI_RESPONSES_FALLBACK_TOP_LEVEL_KEYS)
-    allowed.add("extra_body")
-    allowed.add("reasoning")
-    allowed.update(_OPENAI_RESPONSES_VERIFIERS_WRAPPER_KEYS)
-    return allowed
+    return _param_names(AsyncResponses.create)
 
 
 @lru_cache(maxsize=1)
 def _get_openai_completions_allowed_param_names() -> set[str]:
-    try:
-        from openai.resources.completions import AsyncCompletions as TextAsyncCompletions  # type: ignore
-    except Exception:
-        return set(_OPENAI_COMPLETIONS_FALLBACK_TOP_LEVEL_KEYS)
+    from openai.resources.completions import AsyncCompletions as TextAsyncCompletions  # type: ignore
+
+    return _param_names(TextAsyncCompletions.create)
+
+
+@lru_cache(maxsize=1)
+def _get_anthropic_allowed_param_names() -> set[str]:
+    from anthropic.resources.messages import AsyncMessages
 
-    allowed = _param_names(TextAsyncCompletions.create) or set(_OPENAI_COMPLETIONS_FALLBACK_TOP_LEVEL_KEYS)
-    allowed.add("extra_body")
-    return allowed
+    return _param_names(AsyncMessages.create)
 
 
 def _param_names(callable_obj: Any) -> set[str]:
-    try:
-        sig = inspect.signature(callable_obj)
-    except Exception:
-        return set()
+    sig = inspect.signature(callable_obj)
     names: set[str] = set()
     for name, param in sig.parameters.items():
         if name == "self":
diff --git a/tests/test_sampling_args.py b/tests/test_sampling_args.py
index f02878e0..8971ae8b 100644
--- a/tests/test_sampling_args.py
+++ b/tests/test_sampling_args.py
@@ -43,19 +43,36 @@ def test_compatibility_wrapper_uses_chat_shape() -> None:
     assert result["extra_body"] == {"top_k": 1}
 
 
+def test_openai_chat_drops_framework_owned_request_keys() -> None:
+    result = sanitize_sampling_args(
+        {"model": "override", "messages": [], "tools": [], "extra_headers": {"x": "y"}, "top_k": 1},
+        client_type="openai_chat_completions",
+    )
+
+    assert "model" not in result
+    assert "messages" not in result
+    assert "tools" not in result
+    assert "extra_headers" not in result
+    assert result["extra_body"] == {"top_k": 1}
+
+
 def test_openai_responses_converts_reasoning_effort() -> None:
     result = sanitize_sampling_args(
-        {"reasoning_effort": "low", "top_k": 20, "max_tokens": 128, "stop": ["END"]},
+        {"reasoning_effort": "low", "top_k": 20, "max_tokens": 128},
         client_type="openai_responses",
     )
 
     assert "reasoning_effort" not in result
     assert result["reasoning"] == {"effort": "low"}
-    assert result["max_tokens"] == 128
-    assert result["stop"] == ["END"]
+    assert result["max_output_tokens"] == 128
     assert result["extra_body"] == {"top_k": 20}
 
 
+def test_openai_responses_rejects_stop_sequences() -> None:
+    with pytest.raises(ValueError, match="does not support stop sequences"):
+        sanitize_sampling_args({"stop": ["END"]}, client_type="openai_responses")
+
+
 def test_openai_responses_preserves_explicit_reasoning_effort() -> None:
     result = sanitize_sampling_args(
         {"reasoning": {"effort": "high", "summary": "auto"}, "reasoning_effort": "low"},
@@ -65,6 +82,20 @@ def test_openai_responses_preserves_explicit_reasoning_effort() -> None:
     assert result["reasoning"] == {"effort": "high", "summary": "auto"}
 
 
+def test_openai_responses_drops_framework_owned_request_keys() -> None:
+    result = sanitize_sampling_args(
+        {"model": "override", "input": "x", "prompt": "y", "tools": [], "extra_headers": {"x": "y"}, "top_k": 1},
+        client_type="openai_responses",
+    )
+
+    assert "model" not in result
+    assert "input" not in result
+    assert "prompt" not in result
+    assert "tools" not in result
+    assert "extra_headers" not in result
+    assert result["extra_body"] == {"top_k": 1}
+
+
 def test_openai_completions_removes_reasoning_and_moves_extras() -> None:
     result = sanitize_sampling_args(
         {"prompt": "x", "reasoning_effort": "low", "reasoning": {"effort": "low"}, "top_k": 20},
@@ -73,6 +104,7 @@ def test_openai_completions_removes_reasoning_and_moves_extras() -> None:
 
     assert "reasoning_effort" not in result
     assert "reasoning" not in result
+    assert "prompt" not in result
     assert result["extra_body"] == {"top_k": 20}
 
 
@@ -106,6 +138,28 @@ def test_anthropic_maps_reasoning_effort_to_adaptive_output_config() -> None:
     assert "effort" not in result
 
 
+def test_anthropic_drops_framework_owned_request_keys() -> None:
+    result = sanitize_sampling_args(
+        {
+            "model": "override",
+            "messages": [],
+            "system": "override",
+            "tools": [],
+            "extra_headers": {"x": "y"},
+            "reasoning_effort": "low",
+        },
+        client_type="anthropic_messages",
+    )
+
+    assert "model" not in result
+    assert "messages" not in result
+    assert "system" not in result
+    assert "tools" not in result
+    assert "extra_headers" not in result
+    assert result["thinking"] == {"type": "adaptive"}
+    assert result["output_config"] == {"effort": "low"}
+
+
 def test_anthropic_does_not_put_effort_inside_thinking() -> None:
     result = sanitize_sampling_args(
         {"thinking": {"type": "adaptive", "effort": "low"}, "reasoning_effort": "medium"},
@@ -133,9 +187,12 @@ def test_anthropic_validates_effort_values() -> None:
         sanitize_sampling_args({"reasoning_effort": "extreme"}, client_type="anthropic_messages")
 
 
-def test_anthropic_rejects_xhigh_without_model_context() -> None:
-    with pytest.raises(ValueError, match="reasoning effort"):
-        sanitize_sampling_args({"reasoning_effort": "xhigh"}, client_type="anthropic_messages")
+@pytest.mark.parametrize("effort", ["xhigh", "max"])
+def test_anthropic_accepts_sdk_documented_effort_values(effort: str) -> None:
+    result = sanitize_sampling_args({"reasoning_effort": effort}, client_type="anthropic_messages")
+
+    assert result["thinking"] == {"type": "adaptive"}
+    assert result["output_config"] == {"effort": effort}
 
 
 @pytest.mark.asyncio

From 074cba9e11d5b24f733a7be612b4db62284a0a7c Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 19:04:12 +0000
Subject: [PATCH 45/53] Update legacy conversion output shape

---
 scripts/convert_legacy_raw_runs.py            | 268 +++++++++++++++++-
 .../test_convert_legacy_raw_runs.py           |  65 ++++-
 2 files changed, 324 insertions(+), 9 deletions(-)

diff --git a/scripts/convert_legacy_raw_runs.py b/scripts/convert_legacy_raw_runs.py
index 9db9d934..82bc1609 100644
--- a/scripts/convert_legacy_raw_runs.py
+++ b/scripts/convert_legacy_raw_runs.py
@@ -4,7 +4,6 @@
 
 import argparse
 import json
-import shutil
 import sys
 from dataclasses import dataclass
 from pathlib import Path
@@ -185,7 +184,7 @@ def convert_legacy_raw_runs(
             continue
         try:
             _write_conversion(plan)
-        except OSError as exc:
+        except (OSError, ValueError) as exc:
             entries.append(_entry_for_plan(plan, status="failed", reason=f"write failed: {exc}"))
             continue
         entries.append(_entry_for_plan(plan, status="converted", reason="converted"))
@@ -265,12 +264,18 @@ def _resolve_variant(job: Mapping[str, Any], env_id: str) -> str | dict[str, str
     if raw is None or raw == env_id:
         return BASE_VARIANT_ID
 
+    split_variant = _resolve_split_variant(job, env_id, raw)
+    if split_variant is not None:
+        return split_variant
+
     prefix_colon = f"{env_id}::"
     prefix_slash = f"{env_id}/"
     if raw.startswith(prefix_colon):
         variant_id = raw[len(prefix_colon) :]
     elif raw.startswith(prefix_slash):
         variant_id = raw[len(prefix_slash) :]
+    elif raw.startswith(f"{env_id}-") or raw.startswith(f"{env_id}_"):
+        variant_id = raw[len(env_id) + 1 :]
     else:
         return {"reason": f"ambiguous env_variant_id {raw!r} for env_id {env_id!r}"}
 
@@ -285,6 +290,22 @@ def _resolve_variant(job: Mapping[str, Any], env_id: str) -> str | dict[str, str
     return variant_id
 
 
+def _resolve_split_variant(job: Mapping[str, Any], env_id: str, raw: str) -> str | None:
+    env_args = job.get("env_args")
+    split = _string_or_none(env_args.get("split")) if isinstance(env_args, Mapping) else None
+    if split != "en":
+        return None
+
+    for delimiter in ("_", "-"):
+        split_prefix = f"{env_id}{delimiter}{split}"
+        if raw == split_prefix:
+            return BASE_VARIANT_ID
+        rollout_prefix = f"{split_prefix}-"
+        if raw.startswith(rollout_prefix):
+            return raw[len(rollout_prefix) :]
+    return None
+
+
 def _resolve_results_path(
     run_dir: Path,
     manifest: Mapping[str, Any],
@@ -337,21 +358,221 @@ def _collision_entries(
 
 def _write_conversion(plan: _PlannedConversion) -> None:
     plan.target_dir.mkdir(parents=True, exist_ok=False)
-    shutil.copy2(plan.source_results, plan.target_dir / RESULTS_FILENAME)
-    metadata = _converted_metadata(plan)
+    row_stats = _write_converted_results(plan)
+    metadata = _converted_metadata(plan, row_stats=row_stats)
     (plan.target_dir / METADATA_FILENAME).write_text(
         json.dumps(metadata, indent=2, sort_keys=True) + "\n",
         encoding="utf-8",
     )
 
 
-def _converted_metadata(plan: _PlannedConversion) -> dict[str, Any]:
+def _write_converted_results(plan: _PlannedConversion) -> dict[str, Any]:
+    stats = _RowStats()
+    with (
+        plan.source_results.open("r", encoding="utf-8") as source,
+        (plan.target_dir / RESULTS_FILENAME).open("w", encoding="utf-8") as target,
+    ):
+        for line_number, line in enumerate(source, start=1):
+            if not line.strip():
+                continue
+            try:
+                payload = json.loads(line)
+            except ValueError as exc:
+                raise ValueError(f"invalid JSON in {plan.source_results} line {line_number}: {exc}") from exc
+            if not isinstance(payload, Mapping):
+                raise ValueError(f"expected JSON object in {plan.source_results} line {line_number}")
+            converted = _converted_result_row(payload)
+            stats.add(converted)
+            target.write(json.dumps(converted, sort_keys=True) + "\n")
+    return stats.to_metadata()
+
+
+def _converted_result_row(payload: Mapping[str, Any]) -> dict[str, Any]:
+    converted = dict(payload)
+    converted["timing"] = _converted_timing(payload)
+    converted.pop("generation_ms", None)
+    converted.pop("scoring_ms", None)
+    converted.pop("total_ms", None)
+
+    converted["is_completed"] = bool(payload.get("is_completed", payload.get("error") is None))
+    converted["is_truncated"] = bool(payload.get("is_truncated", False))
+    converted["stop_condition"] = payload.get("stop_condition", "max_turns_reached")
+    converted["metrics"] = _converted_metrics(payload)
+    converted["tool_defs"] = payload.get("tool_defs", [])
+
+    usage = _converted_token_usage(payload.get("token_usage"))
+    if usage is not None:
+        converted["token_usage"] = usage
+    else:
+        converted.pop("token_usage", None)
+
+    return converted
+
+
+def _converted_timing(payload: Mapping[str, Any]) -> dict[str, Any]:
+    timing = payload.get("timing")
+    if isinstance(timing, Mapping):
+        return dict(timing)
+
+    generation = _milliseconds_to_seconds(payload.get("generation_ms"))
+    scoring = _milliseconds_to_seconds(payload.get("scoring_ms"))
+    total = _milliseconds_to_seconds(payload.get("total_ms"))
+    return {
+        "setup": {"duration": 0.0, "spans": []},
+        "generation": {"duration": generation, "start": 0.0, "end": generation},
+        "scoring": {"duration": scoring, "start": generation, "end": generation + scoring},
+        "model": {"duration": generation, "spans": [{"duration": generation, "start": 0.0, "end": generation}]},
+        "env": {"duration": 0.0, "spans": []},
+        "total": total,
+        "overhead": max(0.0, total - generation - scoring),
+    }
+
+
+def _converted_metrics(payload: Mapping[str, Any]) -> dict[str, float]:
+    metrics = payload.get("metrics")
+    if isinstance(metrics, Mapping):
+        return {str(key): float(value) for key, value in metrics.items() if _float_or_none(value) is not None}
+
+    converted: dict[str, float] = {}
+    for key in ("accuracy", "num_turns"):
+        value = _float_or_none(payload.get(key))
+        if value is not None:
+            converted[key] = value
+    return converted
+
+
+def _converted_token_usage(value: Any) -> dict[str, float] | None:
+    if not isinstance(value, Mapping):
+        return None
+    if "input_tokens" in value or "output_tokens" in value:
+        input_tokens = _float_or_none(value.get("input_tokens")) or 0.0
+        output_tokens = _float_or_none(value.get("output_tokens")) or 0.0
+        usage = {"input_tokens": input_tokens, "output_tokens": output_tokens}
+        final_input = _float_or_none(value.get("final_input_tokens"))
+        final_output = _float_or_none(value.get("final_output_tokens"))
+        if final_input is not None and final_output is not None:
+            usage["final_input_tokens"] = final_input
+            usage["final_output_tokens"] = final_output
+        return usage
+
+    model_usage = value.get("model")
+    if not isinstance(model_usage, Mapping):
+        return None
+    input_tokens = _float_or_none(model_usage.get("prompt")) or 0.0
+    output_tokens = _float_or_none(model_usage.get("completion")) or 0.0
+    return {
+        "input_tokens": input_tokens,
+        "output_tokens": output_tokens,
+        "final_input_tokens": input_tokens,
+        "final_output_tokens": output_tokens,
+    }
+
+
+@dataclass(slots=True)
+class _RowStats:
+    count: int = 0
+    reward_total: float = 0.0
+    reward_count: int = 0
+    error_count: int = 0
+    metric_totals: dict[str, float] | None = None
+    metric_counts: dict[str, int] | None = None
+    input_tokens_total: float = 0.0
+    output_tokens_total: float = 0.0
+    usage_count: int = 0
+    final_input_tokens_total: float = 0.0
+    final_output_tokens_total: float = 0.0
+    final_usage_count: int = 0
+
+    def add(self, row: Mapping[str, Any]) -> None:
+        self.count += 1
+        reward = _float_or_none(row.get("reward"))
+        if reward is not None:
+            self.reward_total += reward
+            self.reward_count += 1
+        if row.get("error") is not None:
+            self.error_count += 1
+
+        metrics = row.get("metrics")
+        if isinstance(metrics, Mapping):
+            if self.metric_totals is None:
+                self.metric_totals = {}
+                self.metric_counts = {}
+            assert self.metric_counts is not None
+            for key, value in metrics.items():
+                numeric = _float_or_none(value)
+                if numeric is None:
+                    continue
+                metric_key = str(key)
+                self.metric_totals[metric_key] = self.metric_totals.get(metric_key, 0.0) + numeric
+                self.metric_counts[metric_key] = self.metric_counts.get(metric_key, 0) + 1
+
+        usage = row.get("token_usage")
+        if isinstance(usage, Mapping):
+            input_tokens = _float_or_none(usage.get("input_tokens"))
+            output_tokens = _float_or_none(usage.get("output_tokens"))
+            if input_tokens is not None or output_tokens is not None:
+                self.input_tokens_total += input_tokens or 0.0
+                self.output_tokens_total += output_tokens or 0.0
+                self.usage_count += 1
+            final_input = _float_or_none(usage.get("final_input_tokens"))
+            final_output = _float_or_none(usage.get("final_output_tokens"))
+            if final_input is not None and final_output is not None:
+                self.final_input_tokens_total += final_input
+                self.final_output_tokens_total += final_output
+                self.final_usage_count += 1
+
+    def to_metadata(self) -> dict[str, Any]:
+        avg_metrics: dict[str, float] = {}
+        if self.metric_totals and self.metric_counts:
+            avg_metrics = {
+                key: total / self.metric_counts[key]
+                for key, total in sorted(self.metric_totals.items())
+                if self.metric_counts.get(key)
+            }
+
+        usage: dict[str, float] | None = None
+        if self.usage_count:
+            usage = {
+                "input_tokens": self.input_tokens_total / self.usage_count,
+                "output_tokens": self.output_tokens_total / self.usage_count,
+            }
+            if self.final_usage_count:
+                usage["final_input_tokens"] = self.final_input_tokens_total / self.final_usage_count
+                usage["final_output_tokens"] = self.final_output_tokens_total / self.final_usage_count
+
+        return {
+            "row_count": self.count,
+            "avg_reward": self.reward_total / self.reward_count if self.reward_count else 0.0,
+            "avg_error": self.error_count / self.count if self.count else 0.0,
+            "avg_metrics": avg_metrics,
+            "usage": usage,
+        }
+
+
+def _converted_metadata(plan: _PlannedConversion, *, row_stats: Mapping[str, Any]) -> dict[str, Any]:
     metadata: dict[str, Any] = {}
     if plan.source_metadata_payload:
         source = plan.source_metadata_payload
-        for key in ("env_args", "sampling_args", "num_examples", "rollouts_per_example", "avg_reward"):
+        for key in (
+            "env_args",
+            "sampling_args",
+            "num_examples",
+            "rollouts_per_example",
+            "avg_reward",
+            "avg_metrics",
+            "avg_error",
+            "base_url",
+            "state_columns",
+            "tools",
+            "usage",
+            "version_info",
+        ):
             if key in source:
                 metadata[key] = source[key]
+        if "time" in source:
+            metadata["time"] = source["time"]
+        elif "time_ms" in source:
+            metadata["time"] = _milliseconds_to_seconds(source["time_ms"])
 
     model_table = plan.manifest.get("models")
     model_config = model_table.get(plan.model_id) if isinstance(model_table, Mapping) else None
@@ -371,11 +592,46 @@ def _converted_metadata(plan: _PlannedConversion) -> dict[str, Any]:
 
     metadata.setdefault("env_args", {})
     metadata.setdefault("sampling_args", {})
+    metadata.setdefault("base_url", "")
+    metadata.setdefault("time", plan.job.get("duration_seconds", 0.0))
+    metadata["avg_reward"] = row_stats.get("avg_reward", metadata.get("avg_reward", 0.0))
+    metadata["avg_metrics"] = row_stats.get("avg_metrics") or metadata.get("avg_metrics") or _job_metrics(plan.job)
+    metadata["avg_error"] = row_stats.get("avg_error", metadata.get("avg_error", 0.0))
+    metadata.setdefault("pass_at_k", {})
+    metadata.setdefault("pass_all_k", {})
+    metadata.setdefault("pass_threshold", 0.5)
+    metadata["usage"] = row_stats.get("usage") or metadata.get("usage")  # key always present; None = no row usage
+    metadata.setdefault("version_info", {})
+    metadata.setdefault("state_columns", [])
+    metadata.setdefault("tools", None)
     metadata["env_id"] = plan.env_id
     metadata["model"] = plan.model_id
+    metadata["num_examples"] = int(metadata.get("num_examples") or row_stats.get("row_count") or 0)
+    metadata["rollouts_per_example"] = int(metadata.get("rollouts_per_example") or 1)
     return metadata
 
 
+def _job_metrics(job: Mapping[str, Any]) -> dict[str, float]:
+    metrics = job.get("metrics")
+    if not isinstance(metrics, Mapping):
+        return {}
+    return {str(key): float(value) for key, value in metrics.items() if _float_or_none(value) is not None}
+
+
+def _milliseconds_to_seconds(value: Any) -> float:
+    numeric = _float_or_none(value)
+    return 0.0 if numeric is None else numeric / 1000.0
+
+
+def _float_or_none(value: Any) -> float | None:
+    if value is None or isinstance(value, bool):  # bool is an int subclass; never treat it as numeric
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):  # OverflowError: int too large to convert to float
+        return None
+
+
 def _entry_for_plan(plan: _PlannedConversion, *, status: str, reason: str) -> ConversionEntry:
     return ConversionEntry(
         run_id=plan.run_id,
diff --git a/tests/test_scripts/test_convert_legacy_raw_runs.py b/tests/test_scripts/test_convert_legacy_raw_runs.py
index 513d54fb..8748f685 100644
--- a/tests/test_scripts/test_convert_legacy_raw_runs.py
+++ b/tests/test_scripts/test_convert_legacy_raw_runs.py
@@ -96,16 +96,33 @@ def test_converts_valid_manifest_job_to_processable_eval_output(tmp_path: Path)
 
     assert report.converted == 1
     target = output_dir / "gpt-mini" / "demo-env" / "base"
-    assert (target / "results.jsonl").read_text(encoding="utf-8")
+    row = json.loads((target / "results.jsonl").read_text(encoding="utf-8"))
+    assert row["is_completed"] is True
+    assert row["is_truncated"] is False
+    assert row["metrics"] == {}
+    assert row["stop_condition"] == "max_turns_reached"
+    assert row["timing"]["total"] == 0.0
+    assert row["tool_defs"] == []
     metadata = json.loads((target / "metadata.json").read_text(encoding="utf-8"))
     assert metadata == {
-        "avg_reward": 0.5,
+        "avg_error": 0.0,
+        "avg_metrics": {},
+        "avg_reward": 1.0,
+        "base_url": "",
         "env_args": {"fold": "metadata"},
         "env_id": "demo/env",
         "model": "gpt/mini",
         "num_examples": 2,
+        "pass_all_k": {},
+        "pass_at_k": {},
+        "pass_threshold": 0.5,
         "rollouts_per_example": 1,
         "sampling_args": {"temperature": 0.2},
+        "state_columns": [],
+        "time": 0.0,
+        "tools": None,
+        "usage": None,
+        "version_info": {},
     }
     assert not (target / "bench_index.json").exists()
     assert not (target / ".medarc_eval_metadata.json").exists()
@@ -194,7 +211,9 @@ def test_path_unsafe_or_ambiguous_variants_are_skipped(tmp_path: Path) -> None:
         jobs=[
             _job(job_id="ambiguous", env_variant_id="other-env::seed-1", results_relpath="ambiguous/results.jsonl"),
             _job(job_id="unsafe", env_variant_id="demo/env::bad value", results_relpath="unsafe/results.jsonl"),
-            _job(job_id="base-conflict", env_variant_id="demo/env::base", results_relpath="base-conflict/results.jsonl"),
+            _job(
+                job_id="base-conflict", env_variant_id="demo/env::base", results_relpath="base-conflict/results.jsonl"
+            ),
         ],
     )
     for job_id in ("ambiguous", "unsafe", "base-conflict"):
@@ -232,3 +251,43 @@ def test_parses_relative_variant_and_cli_report_path(tmp_path: Path) -> None:
     assert (output_dir / "gpt-mini" / "demo-env" / "shuffle_seed-1618" / "metadata.json").exists()
     payload = json.loads(report_path.read_text(encoding="utf-8"))
     assert payload["summary"]["converted"] == 1
+
+
+def test_parses_legacy_delimited_env_variant_ids(tmp_path: Path) -> None:
+    raw_dir = tmp_path / "runs" / "raw"
+    output_dir = tmp_path / "runs" / "evals"
+    run_dir = _write_manifest(
+        raw_dir,
+        jobs=[
+            _job(
+                job_id="longhealth-task1",
+                env_id="longhealth",
+                env_variant_id="longhealth-task1",
+                results_relpath="longhealth-task1/results.jsonl",
+            ),
+            _job(
+                job_id="careqa-en",
+                env_id="careqa",
+                env_variant_id="careqa_en",
+                env_args={"split": "en"},
+                results_relpath="careqa-en/results.jsonl",
+            ),
+            _job(
+                job_id="pubhealthbench-reviewed",
+                env_id="pubhealthbench",
+                env_variant_id="pubhealthbench_reviewed",
+                env_args={"split": "reviewed"},
+                results_relpath="pubhealthbench-reviewed/results.jsonl",
+            ),
+        ],
+    )
+    _write_artifacts(run_dir, job_id="longhealth-task1")
+    _write_artifacts(run_dir, job_id="careqa-en")
+    _write_artifacts(run_dir, job_id="pubhealthbench-reviewed")
+
+    report = convert_legacy_raw_runs(raw_dir=raw_dir, output_dir=output_dir, dry_run=False)
+
+    assert report.converted == 3
+    assert (output_dir / "gpt-mini" / "longhealth" / "task1" / "metadata.json").exists()
+    assert (output_dir / "gpt-mini" / "careqa" / "base" / "metadata.json").exists()
+    assert (output_dir / "gpt-mini" / "pubhealthbench" / "reviewed" / "metadata.json").exists()

From 5d66040aec7c9b173a59f138d7f84325ec492751 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 19:28:13 +0000
Subject: [PATCH 46/53] Update docs for TOML bench behavior

---
 AGENTS.md                             |  6 +++---
 README.md                             |  2 +-
 docs/README.md                        |  2 +-
 docs/medarc-eval-bench.md             |  8 +++++++-
 docs/medarc-eval-process.md           |  5 +++++
 docs/medarc-eval.md                   | 18 ++++++++++--------
 docs/medarc-verifiers-architecture.md |  6 ++++--
 7 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index d05c3637..0b58132e 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -4,7 +4,7 @@
 
 - `medarc_verifiers/`: Core Python package (CLI entrypoints, parsers, rewards, orchestration utilities).
 - `environments/<env>/`: Individual Verifiers environments (each is a small Python package with `<env>.py` and its own `pyproject.toml`).
-- `configs/`: YAML configs for `medarc-eval bench` (job matrices, env configs, judge configs).
+- `configs/`: TOML configs for `medarc-eval bench`, endpoint registries, and environment/judge configs.
 - `docs/`: Usage docs for `medarc-eval` and related workflows.
 - `tests/`: `pytest` suite.
 
@@ -14,7 +14,7 @@
 - Quick workflow: eval → process → winrate
   - eval outputs: `runs/evals/<model>/<env>/<variant>/...`
   - processed parquet: `runs/processed/<model>/<env>.parquet` + `runs/processed/env_index.json`
-  - winrate outputs: `runs/winrate/latest.json` and `runs/winrate/latest.csv`
+  - winrate outputs: `runs/processed/winrate/latest.json` and `runs/processed/winrate/latest.csv`
 - `medarc-eval` CLI entrypoint/router: (`medarc_verifiers/cli/main.py`; docs: `docs/medarc-eval.md`)
 - `medarc-orchestrate` CLI entrypoint: (`medarc_verifiers/orchestrate/cli.py`; docs: `docs/medarc-orchestrate.md`)
 - Old YAML-runner `runs/raw` artifacts must be converted with `scripts/convert_legacy_raw_runs.py` before processing.
@@ -32,7 +32,7 @@
 - `uv pip install -e .`: Install `medarc-verifiers` in editable mode.
 - `vf-install <env>`: Install an environment from `environments/<env>/` in editable mode.
 - `uv run medarc-eval <ENV> -m <MODEL> -n 5`: Run a small evaluation.
-- `uv run medarc-eval bench --config configs/job.yaml`: Run a batch evaluation from a YAML config.
+- `uv run medarc-eval bench --config configs/eval/medmarks-smoke.toml`: Run a batch evaluation from a TOML config.
 - `uv run pytest tests/`: Run the full test suite.
 - `uv run ruff check medarc_verifiers/ && uv run ruff format medarc_verifiers/`: Lint/format.
 
diff --git a/README.md b/README.md
index 343de6f2..1bcfdd19 100644
--- a/README.md
+++ b/README.md
@@ -102,7 +102,7 @@ Evaluation outputs are written under `runs/evals/`, processed parquet files unde
 
 | Page | Description |
 |------|-------------|
-| [`docs/getting-started.md`](docs/getting-started.md) | Developer setup, environment authoring, and local workflow |
+| [`docs/developer-guide.md`](docs/developer-guide.md) | Developer setup, environment authoring, and local workflow |
 | [`docs/medarc-eval.md`](docs/medarc-eval.md) | Full `medarc-eval` CLI documentation |
 | [`docs/medarc-eval-bench.md`](docs/medarc-eval-bench.md) | TOML benchmark suite execution |
 | [`docs/medarc-eval-process.md`](docs/medarc-eval-process.md) | Processing eval outputs into parquet |
diff --git a/docs/README.md b/docs/README.md
index 64445a16..dee7b530 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -25,4 +25,4 @@ See [medarc-eval.md](medarc-eval.md) for full documentation.
 
 ## Developer workflow
 
-See [getting-started.md](getting-started.md) for local setup, environment authoring, and development workflow notes.
+See [developer-guide.md](developer-guide.md) for local setup, environment authoring, and development workflow notes.
diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index 7fff0d62..5b95fd2a 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -72,6 +72,12 @@ are not already importable in the active Python environment. Auto-install only
 applies to missing local packages resolved from `--env-dir`; selected envs that
 are already importable keep the normal in-process execution path.
 
+`--env-dir` defaults to `environments/`. When auto-install is needed, bench
+creates a system temporary directory with a `medarc-bench-venv-` prefix, creates
+a venv inside it with `uv venv`, installs the selected local env package into
+that venv in editable mode, runs one eval through the private bench child, and
+then removes the temporary venv.
+
 ```bash
 medarc-eval bench \
   --config configs/eval/medmarks-verified.toml \
@@ -196,7 +202,7 @@ provider arguments pass through to upstream.
 | `--force` | Archive existing deterministic output and rerun |
 | `--resume` | Compatibility flag; valid deterministic outputs resume automatically |
 | `--output-dir PATH` | Override the config output directory, default `runs/evals` |
-| `--env-dir PATH` | Directory containing local environments |
+| `--env-dir PATH` | Directory containing local environments, default `environments` |
 | `--auto-install` / `--no-auto-install` | Auto-install missing local env packages in isolated temp venvs (default) or require selected envs to be preinstalled |
 | `--endpoints-path PATH` | Endpoint registry path, default `configs/endpoints.toml` |
 | `--api-base-url URL` | Override API base URL for every eval |
diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md
index 3976a2f4..565b94b1 100644
--- a/docs/medarc-eval-process.md
+++ b/docs/medarc-eval-process.md
@@ -55,6 +55,10 @@ On-disk model and env path components are slugified, so filenames may not exactl
 | `--yes` | Skip confirmation prompts | - |
 | `--exclude-dataset NAME` | Skip processing specific datasets/env ids (repeatable) | - |
 | `--exclude-model MODEL` | Skip processing specific model ids (repeatable) | - |
+| `--replace-env NAME` | Rebuild existing processed outputs for specific env ids (repeatable) | - |
+| `--replace-model MODEL` | Rebuild existing processed outputs for specific model ids (repeatable) | - |
+| `--max-results-missing-pct N` | Fail latest selected outputs missing more than this percentage of expected rows | 2.5 |
+| `--winrate PATH` | Run winrate after processing with the provided config file | - |
 
 ## Filtering Runs
 
@@ -135,6 +139,7 @@ Supported config schema for `medarc-eval process`:
 - Top-level `process:`: process-specific defaults.
 - Optional top-level `winrate:`: embedded post-process winrate step.
 - Optional top-level `hf:`: shared HF settings. For embedded winrate uploads, use `hf.winrate_dir`.
+- Removed process config keys are rejected: use `max_results_missing_pct` instead of `max_run_missing_pct`; status filtering is no longer supported for current eval outputs.
 
 Path shortcuts:
 
diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md
index b5b725f5..062fd336 100644
--- a/docs/medarc-eval.md
+++ b/docs/medarc-eval.md
@@ -13,7 +13,7 @@ medarc-eval medqa -m gpt-4.1-mini -n 25
 # Run a batch of benchmarks from a config file
 medarc-eval bench --config configs/eval/medmarks-smoke.toml
 
-# Process raw results into analysis-ready parquet files
+# Process eval outputs into analysis-ready parquet files
 medarc-eval process --runs-dir runs/evals
 
 # Compute win rates across models
@@ -36,7 +36,7 @@ medarc-eval winrate
 |---------|---------|
 | `medarc-eval <ENV>` | Run a single benchmark interactively |
 | `medarc-eval bench` | Run multiple benchmarks from a config file |
-| `medarc-eval process` | Convert raw results to parquet for analysis |
+| `medarc-eval process` | Convert eval outputs to parquet for analysis |
 | `medarc-eval winrate` | Compute model comparisons from processed data |
 
 ## Command Structure
@@ -117,12 +117,14 @@ runs/
 │           └── <variant>/
 │               ├── results.jsonl
 │               └── metadata.json
-├── processed/                    # Analysis-ready parquet files (from process)
-│   ├── env_index.json            # Dataset inventory
-│   └── <model>/<env>.parquet
-└── winrate/                      # Model comparison outputs (from winrate)
-    ├── latest.json
-    └── latest.csv
+└── processed/                    # Analysis-ready parquet files (from process)
+    ├── env_index.json            # Dataset inventory
+    ├── <model>/<env>.parquet
+    └── winrate/                  # Model comparison outputs (from winrate)
+        ├── winrates-<timestamp>.json
+        ├── winrates-<timestamp>.csv
+        ├── latest.json
+        └── latest.csv
 ```
 
 ## Getting Help
diff --git a/docs/medarc-verifiers-architecture.md b/docs/medarc-verifiers-architecture.md
index 2de10790..41b8b005 100644
--- a/docs/medarc-verifiers-architecture.md
+++ b/docs/medarc-verifiers-architecture.md
@@ -49,8 +49,10 @@ It supports:
     deterministic output directories from selected raw configs, then runs evals
     sequentially through upstream execution.
   - Missing selected local environment packages are auto-installed by default
-    in isolated temporary venvs. Importable envs stay on the in-process path.
-    `--no-auto-install` requires selected envs to already be importable.
+    from `--env-dir` (default `environments`) in isolated system temporary
+    venvs with a `medarc-bench-venv-` prefix. Importable envs stay on the
+    in-process path. `--no-auto-install` requires selected envs to already be
+    importable.
   - Main implementation: `medarc_verifiers/cli/main.py`
   - Isolated auto-install helper: `medarc_verifiers/cli/isolated_env.py`
   - Isolated child runner: `medarc_verifiers/cli/bench_child.py`

From c42877663b983f7616f993d446fb5b629138e75a Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 21:40:07 +0000
Subject: [PATCH 47/53] add strict answer matching option

---
 .../rewards/multiple_choice_accuracy.py       |  6 +++-
 tests/test_mcq_accuracy.py                    | 28 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/medarc_verifiers/rewards/multiple_choice_accuracy.py b/medarc_verifiers/rewards/multiple_choice_accuracy.py
index cdee4780..0c86c92f 100644
--- a/medarc_verifiers/rewards/multiple_choice_accuracy.py
+++ b/medarc_verifiers/rewards/multiple_choice_accuracy.py
@@ -768,10 +768,11 @@ def multiple_choice_accuracy(
     answer_text: str,
     prefix: Optional[str] = None,
     accept_answer_text: bool = True,
+    strict: bool = False,
     strip_tex: bool = True,
     return_details: bool = False,
 ) -> bool | MCQAccuracyResult:
-    """Grade an MCQ answer using short-mode scans and tail-authoritative long-mode scans."""
+    """Grade an MCQ answer using exact matching or permissive MCQ extraction heuristics."""
 
     if not llm_answer:
         return _result(False, "none", None, None, return_details)
@@ -834,6 +835,9 @@ def multiple_choice_accuracy(
             return_details,
         )
 
+    if strict:
+        return _result(False, "none", None, None, return_details)
+
     is_long = len(structural_text) > LONG_RESPONSE_THRESHOLD_CHARS
     terminal_region = structural_text[-TERMINAL_WINDOW_CHARS:] if is_long else structural_text
     strong_tail_region = terminal_region[-STRONG_TAIL_WINDOW_CHARS:] if is_long else structural_text
diff --git a/tests/test_mcq_accuracy.py b/tests/test_mcq_accuracy.py
index 5365ddc4..77cc9da6 100644
--- a/tests/test_mcq_accuracy.py
+++ b/tests/test_mcq_accuracy.py
@@ -162,6 +162,34 @@ def test_answer_text_in_sentence():
     )
 
 
+@pytest.mark.parametrize(
+    ("response", "answer_letter", "answer_text"),
+    [
+        (" c ", "C", "Correct option"),
+        ("(2)", "2", "Second option"),
+        ("  Chemotherapy and radiation.  ", "C", "chemotherapy and radiation"),
+        ("B. Video-capsule endoscopy", "B", "Video-capsule endoscopy"),
+        ("**(3)** Third option", "3", "Third option"),
+    ],
+)
+def test_strict_accepts_only_exact_option_text_or_both(response: str, answer_letter: str, answer_text: str):
+    assert multiple_choice_accuracy(response, answer_letter=answer_letter, answer_text=answer_text, strict=True)
+
+
+@pytest.mark.parametrize(
+    ("response", "answer_letter", "answer_text"),
+    [
+        ("Final answer: C", "C", "Correct option"),
+        ("I think it's C", "C", "Correct option"),
+        ("Based on the symptoms, acute myocardial infarction is most likely.", "B", "acute myocardial infarction"),
+        ("The answer is all of the above.", "D", "All of the above"),
+    ],
+)
+def test_strict_rejects_permissive_heuristic_matches(response: str, answer_letter: str, answer_text: str):
+    assert multiple_choice_accuracy(response, answer_letter=answer_letter, answer_text=answer_text)
+    assert not multiple_choice_accuracy(response, answer_letter=answer_letter, answer_text=answer_text, strict=True)
+
+
 @pytest.mark.parametrize("response", ["All of the above", "The answer is all of the above."])
 def test_answer_text_all_of_the_above_is_not_rejected(response: str):
     assert multiple_choice_accuracy(response, answer_letter="D", answer_text="All of the above")

From 02efacfad9ffe36de8125fbf620a8de58f7c115b Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 21:43:49 +0000
Subject: [PATCH 48/53] Move Medmarks configs to top-level configs

---
 AGENTS.md                                   |  2 +-
 README.md                                   | 24 +++++++++----------
 configs/{eval => }/README.md                | 12 +++++-----
 configs/eval/gpt-oss-local-endpoints.toml   | 12 ----------
 configs/eval/gpt-oss-local-smoke.toml       | 26 ---------------------
 configs/{eval => }/medmarks-endpoints.toml  |  0
 configs/{eval => }/medmarks-open_ended.toml |  0
 configs/{eval => }/medmarks-smoke.toml      |  0
 configs/{eval => }/medmarks-verified.toml   |  0
 docs/developer-guide.md                     |  6 ++---
 docs/medarc-eval-bench.md                   | 20 ++++++++--------
 docs/medarc-eval-process.md                 |  2 +-
 docs/medarc-eval.md                         | 10 ++++----
 tests/test_cli/test_main.py                 |  8 +++----
 14 files changed, 42 insertions(+), 80 deletions(-)
 rename configs/{eval => }/README.md (82%)
 delete mode 100644 configs/eval/gpt-oss-local-endpoints.toml
 delete mode 100644 configs/eval/gpt-oss-local-smoke.toml
 rename configs/{eval => }/medmarks-endpoints.toml (100%)
 rename configs/{eval => }/medmarks-open_ended.toml (100%)
 rename configs/{eval => }/medmarks-smoke.toml (100%)
 rename configs/{eval => }/medmarks-verified.toml (100%)

diff --git a/AGENTS.md b/AGENTS.md
index 0b58132e..a32bbc6c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -32,7 +32,7 @@
 - `uv pip install -e .`: Install `medarc-verifiers` in editable mode.
 - `vf-install <env>`: Install an environment from `environments/<env>/` in editable mode.
 - `uv run medarc-eval <ENV> -m <MODEL> -n 5`: Run a small evaluation.
-- `uv run medarc-eval bench --config configs/eval/medmarks-smoke.toml`: Run a batch evaluation from a TOML config.
+- `uv run medarc-eval bench --config configs/medmarks-smoke.toml`: Run a batch evaluation from a TOML config.
 - `uv run pytest tests/`: Run the full test suite.
 - `uv run ruff check medarc_verifiers/ && uv run ruff format medarc_verifiers/`: Lint/format.
 
diff --git a/README.md b/README.md
index 1bcfdd19..2a66fbda 100644
--- a/README.md
+++ b/README.md
@@ -27,10 +27,10 @@ The benchmark suite is implemented as [verifiers](https://github.com/primeintell
 
 | Config | Purpose |
 |--------|---------|
-| [`configs/eval/medmarks-verified.toml`](configs/eval/medmarks-verified.toml) | Medmarks-V suite |
-| [`configs/eval/medmarks-open_ended.toml`](configs/eval/medmarks-open_ended.toml) | Medmarks-OE suite |
-| [`configs/eval/medmarks-endpoints.toml`](configs/eval/medmarks-endpoints.toml) | Portable model aliases and sampling defaults for Medmarks runs |
-| [`configs/eval/medmarks-smoke.toml`](configs/eval/medmarks-smoke.toml) | Small Medmarks-V sanity-check run |
+| [`configs/medmarks-verified.toml`](configs/medmarks-verified.toml) | Medmarks-V suite |
+| [`configs/medmarks-open_ended.toml`](configs/medmarks-open_ended.toml) | Medmarks-OE suite |
+| [`configs/medmarks-endpoints.toml`](configs/medmarks-endpoints.toml) | Portable model aliases and sampling defaults for Medmarks runs |
+| [`configs/medmarks-smoke.toml`](configs/medmarks-smoke.toml) | Small Medmarks-V sanity-check run |
 
 ## Quick Start
 
@@ -49,28 +49,28 @@ uv run medarc-eval medqa -m openai/gpt-4.1-mini -n 25
 Run a Medmarks suite config:
 
 ```bash
-uv run medarc-eval bench --config configs/eval/medmarks-verified.toml
+uv run medarc-eval bench --config configs/medmarks-verified.toml
 ```
 
 Run a Medmarks suite with one of the published model aliases:
 
 ```bash
 uv run medarc-eval bench \
-  --config configs/eval/medmarks-verified.toml \
-  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  --config configs/medmarks-verified.toml \
+  --endpoints-path configs/medmarks-endpoints.toml \
   -m gpt-oss-20b-low \
   --api-base-url https://api.pinference.ai/api/v1 \
   --api-key-var PRIME_API_KEY
 ```
 
-[`configs/eval/medmarks-endpoints.toml`](configs/eval/medmarks-endpoints.toml) is an alias registry, not a deployment config. It maps names such as `gpt-oss-20b-low` or `medgemma-27b-text` to provider model IDs, client types, and model-specific sampling defaults. It intentionally omits `url`, `key`, and `max_concurrent`; supply those with `--provider` or with `--api-base-url` and `--api-key-var` for your deployment. The gpt-oss aliases use the Verifiers `openai_responses` client type.
+[`configs/medmarks-endpoints.toml`](configs/medmarks-endpoints.toml) is an alias registry, not a deployment config. It maps names such as `gpt-oss-20b-low` or `medgemma-27b-text` to provider model IDs, client types, and model-specific sampling defaults. It intentionally omits `url`, `key`, and `max_concurrent`; supply those with `--provider` or with `--api-base-url` and `--api-key-var` for your deployment. The gpt-oss aliases use the Verifiers `openai_responses` client type.
 
 Preview the resolved jobs before running:
 
 ```bash
 uv run medarc-eval bench \
-  --config configs/eval/medmarks-verified.toml \
-  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  --config configs/medmarks-verified.toml \
+  --endpoints-path configs/medmarks-endpoints.toml \
   -m gpt-oss-20b-low \
   --api-base-url https://api.pinference.ai/api/v1 \
   --api-key-var PRIME_API_KEY \
@@ -81,8 +81,8 @@ Run the same alias against a local vLLM server exposing an OpenAI-compatible API
 
 ```bash
 VLLM_API_KEY=local-key uv run medarc-eval bench \
-  --config configs/eval/medmarks-verified.toml \
-  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  --config configs/medmarks-verified.toml \
+  --endpoints-path configs/medmarks-endpoints.toml \
   -m gpt-oss-20b-low \
   --api-base-url http://127.0.0.1:8000/v1 \
   --api-key-var VLLM_API_KEY \
diff --git a/configs/eval/README.md b/configs/README.md
similarity index 82%
rename from configs/eval/README.md
rename to configs/README.md
index 9087fb80..6d91fc41 100644
--- a/configs/eval/README.md
+++ b/configs/README.md
@@ -6,8 +6,8 @@ and `[[ablation]]` sweeps intentionally keep the upstream environment id stable;
 `env_args` and `sampling_args`.
 
 ```bash
-medarc-eval bench --config configs/eval/medmarks-smoke.toml --dry-run
-medarc-eval bench --config configs/eval/medmarks-verified.toml
+medarc-eval bench --config configs/medmarks-smoke.toml --dry-run
+medarc-eval bench --config configs/medmarks-verified.toml
 medarc-eval process --runs-dir runs/evals --output-dir runs/processed
 ```
 
@@ -16,8 +16,8 @@ and its sampling defaults:
 
 ```bash
 medarc-eval bench \
-  --config configs/eval/medmarks-verified.toml \
-  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  --config configs/medmarks-verified.toml \
+  --endpoints-path configs/medmarks-endpoints.toml \
   -m gpt-oss-20b-low \
   --api-base-url https://api.pinference.ai/api/v1 \
   --api-key-var PRIME_API_KEY \
@@ -35,8 +35,8 @@ alias registry and override only the deployment settings:
 
 ```bash
 VLLM_API_KEY=local-key medarc-eval bench \
-  --config configs/eval/medmarks-verified.toml \
-  --endpoints-path configs/eval/medmarks-endpoints.toml \
+  --config configs/medmarks-verified.toml \
+  --endpoints-path configs/medmarks-endpoints.toml \
   -m gpt-oss-20b-low \
   --api-base-url http://127.0.0.1:8000/v1 \
   --api-key-var VLLM_API_KEY \
diff --git a/configs/eval/gpt-oss-local-endpoints.toml b/configs/eval/gpt-oss-local-endpoints.toml
deleted file mode 100644
index 4c38d658..00000000
--- a/configs/eval/gpt-oss-local-endpoints.toml
+++ /dev/null
@@ -1,12 +0,0 @@
-[[endpoint]]
-endpoint_id = "gpt-oss-20b-low-local"
-model = "openai/gpt-oss-20b"
-url = "http://host.docker.internal:8010/v1"
-key = "VLLM_API_KEY"
-api_client_type = "openai_responses"
-
-[endpoint.sampling_args]
-temperature = 1.0
-top_p = 1.0
-top_k = 0
-reasoning_effort = "low"
diff --git a/configs/eval/gpt-oss-local-smoke.toml b/configs/eval/gpt-oss-local-smoke.toml
deleted file mode 100644
index 7b9764c3..00000000
--- a/configs/eval/gpt-oss-local-smoke.toml
+++ /dev/null
@@ -1,26 +0,0 @@
-endpoints_path = "gpt-oss-local-endpoints.toml"
-variant_id = "gpt-oss-20b-low-local"
-save_results = true
-output_dir = "runs/evals"
-max_concurrent = 4
-max_retries = 1
-
-[[eval]]
-env_id = "medqa"
-num_examples = 25
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "pubmedqa"
-num_examples = 25
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "m-arc"
-num_examples = 25
-rollouts_per_example = 1
-
-[[eval]]
-env_id = "medhallu"
-num_examples = 25
-rollouts_per_example = 1
diff --git a/configs/eval/medmarks-endpoints.toml b/configs/medmarks-endpoints.toml
similarity index 100%
rename from configs/eval/medmarks-endpoints.toml
rename to configs/medmarks-endpoints.toml
diff --git a/configs/eval/medmarks-open_ended.toml b/configs/medmarks-open_ended.toml
similarity index 100%
rename from configs/eval/medmarks-open_ended.toml
rename to configs/medmarks-open_ended.toml
diff --git a/configs/eval/medmarks-smoke.toml b/configs/medmarks-smoke.toml
similarity index 100%
rename from configs/eval/medmarks-smoke.toml
rename to configs/medmarks-smoke.toml
diff --git a/configs/eval/medmarks-verified.toml b/configs/medmarks-verified.toml
similarity index 100%
rename from configs/eval/medmarks-verified.toml
rename to configs/medmarks-verified.toml
diff --git a/docs/developer-guide.md b/docs/developer-guide.md
index 00bfd0ed..311e78c4 100644
--- a/docs/developer-guide.md
+++ b/docs/developer-guide.md
@@ -121,7 +121,7 @@ results = env.evaluate(model_client, "gpt-4.1-mini", num_examples=5)
 uv run medarc-eval medqa -m gpt-4.1-mini -n 25
 
 # Run batch evaluations from config
-uv run medarc-eval bench --config configs/eval/medmarks-smoke.toml
+uv run medarc-eval bench --config configs/medmarks-smoke.toml
 
 # Process results and compute win rates
 uv run medarc-eval process --runs-dir runs/evals
@@ -164,10 +164,10 @@ env_args = { shuffle_answers = true, shuffle_seed = 1618 }
 
 ```bash
 # Run the batch
-uv run medarc-eval bench --config configs/eval/medmarks-verified.toml
+uv run medarc-eval bench --config configs/medmarks-verified.toml
 
 # Preview without executing
-uv run medarc-eval bench --config configs/eval/medmarks-verified.toml --dry-run
+uv run medarc-eval bench --config configs/medmarks-verified.toml --dry-run
 ```
 
 Bench mode resumes matching deterministic result directories and supports `[[ablation]]` sweeps for parameter grids. The removed YAML job/manifest runner is documented only in the migration notes in the [bench mode docs](medarc-eval-bench.md).
diff --git a/docs/medarc-eval-bench.md b/docs/medarc-eval-bench.md
index 5b95fd2a..ed0a6444 100644
--- a/docs/medarc-eval-bench.md
+++ b/docs/medarc-eval-bench.md
@@ -11,23 +11,23 @@ accepts `.toml` files only.
 
 ```bash
 # Preview the repository smoke config
-medarc-eval bench --config configs/eval/medmarks-smoke.toml --dry-run
+medarc-eval bench --config configs/medmarks-smoke.toml --dry-run
 
 # Run the verified production suite
-medarc-eval bench --config configs/eval/medmarks-verified.toml
+medarc-eval bench --config configs/medmarks-verified.toml
 
 # Require all selected env packages to already be installed
-medarc-eval bench --config configs/eval/medmarks-verified.toml --no-auto-install
+medarc-eval bench --config configs/medmarks-verified.toml --no-auto-install
 
 # Run the verified suite against a local OpenAI-compatible server
 medarc-eval bench \
-  --config configs/eval/medmarks-verified.toml \
+  --config configs/medmarks-verified.toml \
   --api-base-url http://127.0.0.1:8000/v1 \
   --provider local \
   --model openai/my-local-model
 ```
 
-Repository suite configs live in `configs/eval/`:
+Repository suite configs live in `configs/`:
 
 | Config | Purpose |
 |--------|---------|
@@ -80,7 +80,7 @@ removes the temporary venv.
 
 ```bash
 medarc-eval bench \
-  --config configs/eval/medmarks-verified.toml \
+  --config configs/medmarks-verified.toml \
   --eval-index "$SLURM_ARRAY_TASK_ID"
 ```
 
@@ -103,7 +103,7 @@ For faster strict local iteration, preinstall environments and opt out:
 ```bash
 vf-install medqa
 vf-install pubmedqa
-medarc-eval bench --config configs/eval/medmarks-verified.toml --no-auto-install
+medarc-eval bench --config configs/medmarks-verified.toml --no-auto-install
 ```
 
 `--dry-run` does not create venvs, install packages, or spawn child processes.
@@ -175,7 +175,7 @@ Existing valid outputs resume automatically. This makes Slurm retries
 idempotent for a fixed `--eval-index`:
 
 ```bash
-medarc-eval bench --config configs/eval/medmarks-verified.toml --eval-index "$SLURM_ARRAY_TASK_ID"
+medarc-eval bench --config configs/medmarks-verified.toml --eval-index "$SLURM_ARRAY_TASK_ID"
 ```
 
 If the deterministic target already contains both `metadata.json` and
@@ -185,7 +185,7 @@ partial, bench fails unless `--force` is set:
 
 ```bash
 # Archive existing deterministic outputs and rerun
-medarc-eval bench --config configs/eval/medmarks-verified.toml --force
+medarc-eval bench --config configs/medmarks-verified.toml --force
 ```
 
 `--resume` is still accepted for compatibility, but deterministic bench outputs
@@ -277,7 +277,7 @@ export PRIME_API_KEY=...
 export PRIME_TEAM_ID=...
 
 medarc-eval bench \
-  --config configs/eval/medmarks-verified.toml \
+  --config configs/medmarks-verified.toml \
   --api-base-url https://api.pinference.ai/api/v1
 ```
 
diff --git a/docs/medarc-eval-process.md b/docs/medarc-eval-process.md
index 565b94b1..5a4b6ffe 100644
--- a/docs/medarc-eval-process.md
+++ b/docs/medarc-eval-process.md
@@ -231,7 +231,7 @@ This runs `medarc-eval winrate` automatically after processing completes when th
 
 ```bash
 # 1. Run benchmarks
-medarc-eval bench --config configs/eval/medmarks-verified.toml
+medarc-eval bench --config configs/medmarks-verified.toml
 
 # 2. Process results
 medarc-eval process --runs-dir runs/evals
diff --git a/docs/medarc-eval.md b/docs/medarc-eval.md
index 062fd336..844b8027 100644
--- a/docs/medarc-eval.md
+++ b/docs/medarc-eval.md
@@ -11,7 +11,7 @@
 medarc-eval medqa -m gpt-4.1-mini -n 25
 
 # Run a batch of benchmarks from a config file
-medarc-eval bench --config configs/eval/medmarks-smoke.toml
+medarc-eval bench --config configs/medmarks-smoke.toml
 
 # Process eval outputs into analysis-ready parquet files
 medarc-eval process --runs-dir runs/evals
@@ -46,7 +46,7 @@ medarc-eval winrate
 medarc-eval medqa -m gpt-4.1-mini -n 50
 
 # Subcommands: keyword comes first
-medarc-eval bench --config configs/eval/medmarks-verified.toml
+medarc-eval bench --config configs/medmarks-verified.toml
 medarc-eval process --runs-dir runs/evals
 medarc-eval winrate --processed-dir runs/processed
 ```
@@ -74,13 +74,13 @@ medarc-eval longhealth --help
 
 ```bash
 # Run all jobs defined in config
-medarc-eval bench --config configs/eval/medmarks-verified.toml
+medarc-eval bench --config configs/medmarks-verified.toml
 
 # Preview what would run without executing
-medarc-eval bench --config configs/eval/medmarks-verified.toml --dry-run
+medarc-eval bench --config configs/medmarks-verified.toml --dry-run
 
 # Force all jobs to use a specific API endpoint
-medarc-eval bench --config configs/eval/medmarks-verified.toml --api-base-url http://127.0.0.1:8000/v1 --provider local
+medarc-eval bench --config configs/medmarks-verified.toml --api-base-url http://127.0.0.1:8000/v1 --provider local
 ```
 
 ### Processing Mode (`medarc-eval process`)
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index b4421b81..a87104c5 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -153,7 +153,7 @@ def test_toml_bench_dry_run_expands_evals_and_ablations(
 
 
 def test_repository_smoke_toml_config_dry_runs(capsys: pytest.CaptureFixture[str]) -> None:
-    exit_code = main.main(["bench", "--config", "configs/eval/medmarks-smoke.toml", "--dry-run"])
+    exit_code = main.main(["bench", "--config", "configs/medmarks-smoke.toml", "--dry-run"])
 
     output = capsys.readouterr().out
     assert exit_code == 0
@@ -207,7 +207,7 @@ def test_bench_rejects_non_toml_config(tmp_path: Path, capsys: pytest.CaptureFix
 
 def test_bench_rejects_removed_yaml_runner_flags(capsys: pytest.CaptureFixture[str]) -> None:
     with pytest.raises(SystemExit) as excinfo:
-        main.main(["bench", "--config", "configs/eval/medmarks-smoke.toml", "--restart"])
+        main.main(["bench", "--config", "configs/medmarks-smoke.toml", "--restart"])
 
     assert excinfo.value.code == 2
     err = capsys.readouterr().err
@@ -215,7 +215,7 @@ def test_bench_rejects_removed_yaml_runner_flags(capsys: pytest.CaptureFixture[s
 
 
 def test_repository_verified_toml_config_dry_run_shows_ablation_variants(capsys: pytest.CaptureFixture[str]) -> None:
-    exit_code = main.main(["bench", "--config", "configs/eval/medmarks-verified.toml", "--dry-run", "--eval-index", "45"])
+    exit_code = main.main(["bench", "--config", "configs/medmarks-verified.toml", "--dry-run", "--eval-index", "45"])
 
     output = capsys.readouterr().out
     assert exit_code == 0
@@ -225,7 +225,7 @@ def test_repository_verified_toml_config_dry_run_shows_ablation_variants(capsys:
 
 
 def test_repository_open_ended_toml_config_loads_expected_judge_args() -> None:
-    configs = main.load_toml_eval_configs("configs/eval/medmarks-open_ended.toml")
+    configs = main.load_toml_eval_configs("configs/medmarks-open_ended.toml")
     healthbench = next(config for config in configs if config["env_id"] == "healthbench")
     medrbench = [config for config in configs if config["env_id"] == "medrbench"]
 

From db2186e63e0341ce3a0390036a4d574192eb86f3 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 21:46:48 +0000
Subject: [PATCH 49/53] Fix mtsamples data downloading

---
 .../mtsamples_procedures.py                   | 36 +++++++++++--------
 .../mtsamples_replicate.py                    | 36 +++++++++++--------
 2 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/environments/mtsamples_procedures/mtsamples_procedures/mtsamples_procedures.py b/environments/mtsamples_procedures/mtsamples_procedures/mtsamples_procedures.py
index 68d04d58..50111719 100644
--- a/environments/mtsamples_procedures/mtsamples_procedures/mtsamples_procedures.py
+++ b/environments/mtsamples_procedures/mtsamples_procedures/mtsamples_procedures.py
@@ -1,5 +1,6 @@
 import json
 import os
+import shutil
 from pathlib import Path
 from typing import Any
 from urllib.parse import quote
@@ -74,35 +75,36 @@ def _download_txt_files(cache_path: Path) -> list[Path]:
     txt_dir = cache_path / "txt_files"
     txt_dir.mkdir(parents=True, exist_ok=True)
 
-    existing_files = list(txt_dir.glob("*.txt"))
-    if len(existing_files) > 0:
-        return existing_files
-
     files_json = download_file(API_URL, cache_path / "files.json")
     files_data = json.loads(files_json.read_text(encoding="utf-8"))
+    expected_names = sorted(file_info["name"] for file_info in files_data if file_info["name"].endswith(".txt"))
 
-    downloaded_files = []
-    for file_info in files_data:
-        if file_info["name"].endswith(".txt"):
-            encoded_name = quote(file_info["name"])
-            file_url = f"{BASE_URL}/{encoded_name}"
-            dest_path = txt_dir / file_info["name"]
+    txt_files = []
+    for name in expected_names:
+        encoded_name = quote(name)
+        file_url = f"{BASE_URL}/{encoded_name}"
+        dest_path = txt_dir / name
 
+        if not dest_path.exists():
             download_file(file_url, dest_path)
-            downloaded_files.append(dest_path)
+        txt_files.append(dest_path)
 
-    return downloaded_files
+    return txt_files
 
 
 def _load_dataset(cache_dir: Path | str | None = None) -> Dataset:
     cache_path = _resolve_cache_dir(cache_dir)
     cache_path.mkdir(parents=True, exist_ok=True)
 
+    txt_files = _download_txt_files(cache_path)
     dataset_cache = cache_path / "dataset"
+    metadata_path = dataset_cache / "medarc_cache_metadata.json"
     if dataset_cache.exists():
-        return Dataset.load_from_disk(str(dataset_cache))
-
-    txt_files = _download_txt_files(cache_path)
+        if metadata_path.exists():
+            metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+            if metadata.get("source_files") == len(txt_files):
+                return Dataset.load_from_disk(str(dataset_cache))
+        shutil.rmtree(dataset_cache)
 
     examples = []
 
@@ -145,6 +147,10 @@ def _load_dataset(cache_dir: Path | str | None = None) -> Dataset:
     dataset = Dataset.from_list(examples)
 
     dataset.save_to_disk(str(dataset_cache))
+    metadata_path.write_text(
+        json.dumps({"source_files": len(txt_files), "examples": len(dataset)}, indent=2),
+        encoding="utf-8",
+    )
 
     return dataset
 
diff --git a/environments/mtsamples_replicate/mtsamples_replicate/mtsamples_replicate.py b/environments/mtsamples_replicate/mtsamples_replicate/mtsamples_replicate.py
index a1d7384e..3642d306 100644
--- a/environments/mtsamples_replicate/mtsamples_replicate/mtsamples_replicate.py
+++ b/environments/mtsamples_replicate/mtsamples_replicate/mtsamples_replicate.py
@@ -1,5 +1,6 @@
 import json
 import os
+import shutil
 from pathlib import Path
 from typing import Any
 from urllib.parse import quote
@@ -81,35 +82,36 @@ def _download_txt_files(cache_path: Path) -> list[Path]:
     txt_dir = cache_path / "txt_files"
     txt_dir.mkdir(parents=True, exist_ok=True)
 
-    existing_files = list(txt_dir.glob("*.txt"))
-    if len(existing_files) > 0:
-        return existing_files
-
     files_json = download_file(API_URL, cache_path / "files.json")
     files_data = json.loads(files_json.read_text(encoding="utf-8"))
+    expected_names = sorted(file_info["name"] for file_info in files_data if file_info["name"].endswith(".txt"))
 
-    downloaded_files = []
-    for file_info in files_data:
-        if file_info["name"].endswith(".txt"):
-            encoded_name = quote(file_info["name"])
-            file_url = f"{BASE_URL}/{encoded_name}"
-            dest_path = txt_dir / file_info["name"]
+    txt_files = []
+    for name in expected_names:
+        encoded_name = quote(name)
+        file_url = f"{BASE_URL}/{encoded_name}"
+        dest_path = txt_dir / name
 
+        if not dest_path.exists():
             download_file(file_url, dest_path)
-            downloaded_files.append(dest_path)
+        txt_files.append(dest_path)
 
-    return downloaded_files
+    return txt_files
 
 
 def _load_dataset(cache_dir: Path | str | None = None) -> Dataset:
     cache_path = _resolve_cache_dir(cache_dir)
     cache_path.mkdir(parents=True, exist_ok=True)
 
+    txt_files = _download_txt_files(cache_path)
     dataset_cache = cache_path / "dataset"
+    metadata_path = dataset_cache / "medarc_cache_metadata.json"
     if dataset_cache.exists():
-        return Dataset.load_from_disk(str(dataset_cache))
-
-    txt_files = _download_txt_files(cache_path)
+        if metadata_path.exists():
+            metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+            if metadata.get("source_files") == len(txt_files):
+                return Dataset.load_from_disk(str(dataset_cache))
+        shutil.rmtree(dataset_cache)
 
     examples = []
 
@@ -152,6 +154,10 @@ def _load_dataset(cache_dir: Path | str | None = None) -> Dataset:
     dataset = Dataset.from_list(examples)
 
     dataset.save_to_disk(str(dataset_cache))
+    metadata_path.write_text(
+        json.dumps({"source_files": len(txt_files), "examples": len(dataset)}, indent=2),
+        encoding="utf-8",
+    )
 
     return dataset
 

From 0ecdc4912e5f81d33301a85c023b014d167d5f2a Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 21:56:07 +0000
Subject: [PATCH 50/53] remove old configs

---
 configs/envs/agentclinic.yaml          | 41 -------------------------
 configs/envs/careqa_en.yaml            | 17 -----------
 configs/envs/careqa_open.yaml          | 11 -------
 configs/envs/head_qa_v2.yaml           |  4 ---
 configs/envs/healthbench.yaml          |  9 ------
 configs/envs/longhealth.yaml           | 42 --------------------------
 configs/envs/m_arc.yaml                | 14 ---------
 configs/envs/med_dialog.yaml           | 10 ------
 configs/envs/med_halt.yaml             |  8 -----
 configs/envs/med_mcqa.yaml             | 16 ----------
 configs/envs/medagentbench.yaml        |  6 ----
 configs/envs/medagentbenchv2.yaml      |  6 ----
 configs/envs/medbullets.yaml           | 20 ------------
 configs/envs/medcalc_bench.yaml        | 21 -------------
 configs/envs/medcasereasoning.yaml     |  7 -----
 configs/envs/medconceptsqa_sample.yaml | 23 --------------
 configs/envs/medec.yaml                |  9 ------
 configs/envs/medexqa.yaml              | 11 -------
 configs/envs/medhallu.yaml             |  8 -----
 configs/envs/medicationqa.yaml         | 10 ------
 configs/envs/medqa.yaml                | 16 ----------
 configs/envs/medrbench.yaml            | 14 ---------
 configs/envs/medxpertqa.yaml           | 32 --------------------
 configs/envs/meqsum.yaml               | 20 ------------
 configs/envs/metamedqa.yaml            | 16 ----------
 configs/envs/mmlu_pro_health.yaml      | 16 ----------
 configs/envs/mtsamples.yaml            | 19 ------------
 configs/envs/pubhealthbench_free.yaml  | 11 -------
 configs/envs/pubhealthbench_mcq.yaml   | 18 -----------
 configs/envs/pubmedqa.yaml             | 16 ----------
 configs/envs/sctpublic.yaml            |  5 ---
 configs/envs/supergpqa_medicine.yaml   | 32 --------------------
 32 files changed, 508 deletions(-)
 delete mode 100644 configs/envs/agentclinic.yaml
 delete mode 100644 configs/envs/careqa_en.yaml
 delete mode 100644 configs/envs/careqa_open.yaml
 delete mode 100644 configs/envs/head_qa_v2.yaml
 delete mode 100644 configs/envs/healthbench.yaml
 delete mode 100644 configs/envs/longhealth.yaml
 delete mode 100644 configs/envs/m_arc.yaml
 delete mode 100644 configs/envs/med_dialog.yaml
 delete mode 100644 configs/envs/med_halt.yaml
 delete mode 100644 configs/envs/med_mcqa.yaml
 delete mode 100644 configs/envs/medagentbench.yaml
 delete mode 100644 configs/envs/medagentbenchv2.yaml
 delete mode 100644 configs/envs/medbullets.yaml
 delete mode 100644 configs/envs/medcalc_bench.yaml
 delete mode 100644 configs/envs/medcasereasoning.yaml
 delete mode 100644 configs/envs/medconceptsqa_sample.yaml
 delete mode 100644 configs/envs/medec.yaml
 delete mode 100644 configs/envs/medexqa.yaml
 delete mode 100644 configs/envs/medhallu.yaml
 delete mode 100644 configs/envs/medicationqa.yaml
 delete mode 100644 configs/envs/medqa.yaml
 delete mode 100644 configs/envs/medrbench.yaml
 delete mode 100644 configs/envs/medxpertqa.yaml
 delete mode 100644 configs/envs/meqsum.yaml
 delete mode 100644 configs/envs/metamedqa.yaml
 delete mode 100644 configs/envs/mmlu_pro_health.yaml
 delete mode 100644 configs/envs/mtsamples.yaml
 delete mode 100644 configs/envs/pubhealthbench_free.yaml
 delete mode 100644 configs/envs/pubhealthbench_mcq.yaml
 delete mode 100644 configs/envs/pubmedqa.yaml
 delete mode 100644 configs/envs/sctpublic.yaml
 delete mode 100644 configs/envs/supergpqa_medicine.yaml

diff --git a/configs/envs/agentclinic.yaml b/configs/envs/agentclinic.yaml
deleted file mode 100644
index a152a7f7..00000000
--- a/configs/envs/agentclinic.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-- id: agentclinic
-  module: agentclinic
-  num_examples: -1
-  verbose: false
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - google/gemini-3-flash-preview
-    judge_base_url: https://api.pinference.ai/api/v1
-    patient_model: openai/gpt-5-mini
-    patient_base_url: https://api.pinference.ai/api/v1
-    measurement_model: openai/gpt-5-mini
-    measurement_base_url: https://api.pinference.ai/api/v1
-
-- id: agentclinic_rollout_1
-  module: agentclinic
-  num_examples: -1
-  verbose: false
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - google/gemini-3-flash-preview
-    judge_base_url: https://api.pinference.ai/api/v1
-    patient_model: openai/gpt-5-mini
-    patient_base_url: https://api.pinference.ai/api/v1
-    measurement_model: openai/gpt-5-mini
-    measurement_base_url: https://api.pinference.ai/api/v1
-
-- id: agentclinic_rollout_2
-  module: agentclinic
-  num_examples: -1
-  verbose: false
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - google/gemini-3-flash-preview
-    judge_base_url: https://api.pinference.ai/api/v1
-    patient_model: openai/gpt-5-mini
-    patient_base_url: https://api.pinference.ai/api/v1
-    measurement_model: openai/gpt-5-mini
-    measurement_base_url: https://api.pinference.ai/api/v1
\ No newline at end of file
diff --git a/configs/envs/careqa_en.yaml b/configs/envs/careqa_en.yaml
deleted file mode 100644
index c39f52da..00000000
--- a/configs/envs/careqa_en.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-- id: careqa_en
-  module: careqa
-  num_examples: -1
-  verbose: false
-  env_args:
-    split: en
-
-- id: careqa_en
-  module: careqa
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-    split: en
-  matrix:
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-rollout{shuffle_seed}"
\ No newline at end of file
diff --git a/configs/envs/careqa_open.yaml b/configs/envs/careqa_open.yaml
deleted file mode 100644
index 623259a6..00000000
--- a/configs/envs/careqa_open.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-- id: careqa_open
-  module: careqa
-  num_examples: -1
-  verbose: false
-  rerun: true
-  env_args:
-    split: open
-    judge_model:
-      - openai/gpt-5-mini
-      - google/gemini-3-flash-preview
-    judge_base_url: https://api.pinference.ai/api/v1
\ No newline at end of file
diff --git a/configs/envs/head_qa_v2.yaml b/configs/envs/head_qa_v2.yaml
deleted file mode 100644
index 8daa0191..00000000
--- a/configs/envs/head_qa_v2.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-- id: head_qa_v2
-  module: head_qa_v2
-  num_examples: -1
-  verbose: false
\ No newline at end of file
diff --git a/configs/envs/healthbench.yaml b/configs/envs/healthbench.yaml
deleted file mode 100644
index e6ea873a..00000000
--- a/configs/envs/healthbench.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-- id: healthbench
-  module: healthbench
-  num_examples: -1
-  verbose: false
-  rerun: true
-  env_args:
-    judge_model: openai/gpt-5-mini
-    judge_base_url: https://api.pinference.ai/api/v1
-    difficulty: all
\ No newline at end of file
diff --git a/configs/envs/longhealth.yaml b/configs/envs/longhealth.yaml
deleted file mode 100644
index 2f85f1a7..00000000
--- a/configs/envs/longhealth.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-# Base variants (no answer shuffling)
-- id: longhealth
-  module: longhealth
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    doc_shuffle_seed: 2718
-  matrix:
-    task: [task1, task2]
-  matrix_id_format: "{base}-{task}"
-  max_concurrent: 64
-
-# Shuffled variants with different seeds
-- id: longhealth
-  module: longhealth
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-    shuffle_seed: 1618
-    doc_shuffle_seed: 1618
-  matrix:
-    task: [task1, task2]
-  matrix_id_format: "{base}-{task}-rollout1618"
-  max_concurrent: 64
-
-# Shuffled variants with different seeds
-- id: longhealth
-  module: longhealth
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-    shuffle_seed: 9331
-    doc_shuffle_seed: 9331
-  matrix:
-    task: [task1, task2]
-  matrix_id_format: "{base}-{task}-rollout9331"
-  max_concurrent: 64
diff --git a/configs/envs/m_arc.yaml b/configs/envs/m_arc.yaml
deleted file mode 100644
index 0c8bb132..00000000
--- a/configs/envs/m_arc.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-- id: m_arc
-  module: m_arc
-  num_examples: -1
-  verbose: false
-
-- id: m_arc
-  module: m_arc
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-  matrix:
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-rollout{shuffle_seed}"
diff --git a/configs/envs/med_dialog.yaml b/configs/envs/med_dialog.yaml
deleted file mode 100644
index de7626db..00000000
--- a/configs/envs/med_dialog.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-- id: med_dialog
-  module: med_dialog
-  num_examples: 2500
-  verbose: false
-  rerun: true
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - x-ai/grok-4.1-fast
-    judge_base_url: https://api.pinference.ai/api/v1
\ No newline at end of file
diff --git a/configs/envs/med_halt.yaml b/configs/envs/med_halt.yaml
deleted file mode 100644
index 3853e922..00000000
--- a/configs/envs/med_halt.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-- id: med_halt
-  module: med_halt
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  matrix:
-    question_type: [reasoning_fct, reasoning_nota]
-  matrix_id_format: "{base}-{question_type}"
\ No newline at end of file
diff --git a/configs/envs/med_mcqa.yaml b/configs/envs/med_mcqa.yaml
deleted file mode 100644
index 06ccfd02..00000000
--- a/configs/envs/med_mcqa.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-- id: med_mcqa
-  module: med_mcqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-
-- id: med_mcqa
-  module: med_mcqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-  matrix:
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-rollout{shuffle_seed}"
diff --git a/configs/envs/medagentbench.yaml b/configs/envs/medagentbench.yaml
deleted file mode 100644
index 4b8578e0..00000000
--- a/configs/envs/medagentbench.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-- id: medagentbench
-  module: medagentbench
-  num_examples: -1
-  verbose: false
-  env_args:
-    fhir_api_base: http://localhost:8080/fhir/
\ No newline at end of file
diff --git a/configs/envs/medagentbenchv2.yaml b/configs/envs/medagentbenchv2.yaml
deleted file mode 100644
index 450c2a3f..00000000
--- a/configs/envs/medagentbenchv2.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-- id: medagentbenchv2
-  module: medagentbenchv2
-  num_examples: -1
-  verbose: false
-  env_args:
-    fhir_api_base: http://localhost:8080/fhir/
\ No newline at end of file
diff --git a/configs/envs/medbullets.yaml b/configs/envs/medbullets.yaml
deleted file mode 100644
index 56442158..00000000
--- a/configs/envs/medbullets.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-- id: medbullets
-  module: medbullets
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  matrix:
-    num_options: [4, 5]
-  matrix_id_format: "{base}-op{num_options}"
-
-- id: medbullets
-  module: medbullets
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-  matrix:
-    num_options: [4, 5]
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-op{num_options}-rollout{shuffle_seed}"
diff --git a/configs/envs/medcalc_bench.yaml b/configs/envs/medcalc_bench.yaml
deleted file mode 100644
index 487a95a0..00000000
--- a/configs/envs/medcalc_bench.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-- id: medcalc_bench
-  module: medcalc_bench
-  rollouts_per_example: 1
-  verbose: false
-  num_examples: -1
-  env_args:
-    version: "1.2"
-
-- id: medcalc_bench_tools
-  module: medcalc_bench
-  rollouts_per_example: 1
-  verbose: false
-  num_examples: -1
-  env_args:
-    version: "verified"
-    add_python_tool: true
-    add_calculator_tool: true
-
-  export:
-    extra_columns: [lower_bound, upper_bound]
-    answer_column: ground_truth
\ No newline at end of file
diff --git a/configs/envs/medcasereasoning.yaml b/configs/envs/medcasereasoning.yaml
deleted file mode 100644
index d0c300ce..00000000
--- a/configs/envs/medcasereasoning.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-- id: medcasereasoning
-  module: medcasereasoning
-  num_examples: -1
-  verbose: false
-  env_args:
-    judge_model: openai/gpt-5-nano
-    judge_base_url: https://api.pinference.ai/api/v1
\ No newline at end of file
diff --git a/configs/envs/medconceptsqa_sample.yaml b/configs/envs/medconceptsqa_sample.yaml
deleted file mode 100644
index a1e09a92..00000000
--- a/configs/envs/medconceptsqa_sample.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-- id: medconceptsqa_sample
-  module: medconceptsqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    vocab: icd10cm_sample
-  matrix:
-    difficulty: [easy, medium, hard]
-  matrix_id_format: "{base}-{difficulty}"
-
-- id: medconceptsqa_sample
-  module: medconceptsqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    vocab: icd10cm_sample
-    shuffle_answers: true
-  matrix:
-    difficulty: [easy, medium, hard]
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-{difficulty}-rollout{shuffle_seed}"
diff --git a/configs/envs/medec.yaml b/configs/envs/medec.yaml
deleted file mode 100644
index 3d2a5f03..00000000
--- a/configs/envs/medec.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-- id: medec
-  module: medec
-  num_examples: -1
-  verbose: false
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - google/gemini-3-flash-preview
-    judge_base_url: https://api.pinference.ai/api/v1
\ No newline at end of file
diff --git a/configs/envs/medexqa.yaml b/configs/envs/medexqa.yaml
deleted file mode 100644
index 4317830e..00000000
--- a/configs/envs/medexqa.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-- id: medexqa
-  module: medexqa
-  num_examples: -1
-  verbose: false
-  rerun: true
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - google/gemini-3-flash-preview
-    judge_base_url: https://api.pinference.ai/api/v1
-    use_judge: true
\ No newline at end of file
diff --git a/configs/envs/medhallu.yaml b/configs/envs/medhallu.yaml
deleted file mode 100644
index bb313c1e..00000000
--- a/configs/envs/medhallu.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-- id: medhallu
-  module: medhallu
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  matrix:
-    difficulty: [easy, medium, hard]
-  matrix_id_format: "{base}-{difficulty}"
diff --git a/configs/envs/medicationqa.yaml b/configs/envs/medicationqa.yaml
deleted file mode 100644
index 3f3d12cf..00000000
--- a/configs/envs/medicationqa.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-- id: medicationqa
-  module: medicationqa
-  num_examples: -1
-  verbose: false
-  rerun: true
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - x-ai/grok-4.1-fast
-    judge_base_url: https://api.pinference.ai/api/v1
\ No newline at end of file
diff --git a/configs/envs/medqa.yaml b/configs/envs/medqa.yaml
deleted file mode 100644
index a0ff9abc..00000000
--- a/configs/envs/medqa.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-- id: medqa
-  module: medqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-
-- id: medqa
-  module: medqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-  matrix:
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-rollout{shuffle_seed}"
diff --git a/configs/envs/medrbench.yaml b/configs/envs/medrbench.yaml
deleted file mode 100644
index 597852ae..00000000
--- a/configs/envs/medrbench.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-- id: medrbench
-  module: medrbench
-  num_examples: -1
-  verbose: false
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - google/gemini-3-flash-preview
-    judge_base_url: https://api.pinference.ai/api/v1
-    patient_agent_model: openai/gpt-5-mini
-    patient_agent_base_url: https://api.pinference.ai/api/v1
-  matrix:
-    task: [oracle, 1turn, free_turn]
-  matrix_id_format: "{base}-{task}"
\ No newline at end of file
diff --git a/configs/envs/medxpertqa.yaml b/configs/envs/medxpertqa.yaml
deleted file mode 100644
index 595c1b4f..00000000
--- a/configs/envs/medxpertqa.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-- id: medxpertqa
-  module: medxpertqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  matrix:
-    question_type: [reasoning, understanding]
-  matrix_id_format: "{base}-{question_type}"
-
-- id: medxpertqa
-  module: medxpertqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-    shuffle_seed: 1618
-  matrix:
-    question_type: [reasoning, understanding]
-  matrix_id_format: "{base}-{question_type}-rollout1618"
-
-- id: medxpertqa
-  module: medxpertqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-    shuffle_seed: 9331
-  matrix:
-    question_type: [reasoning, understanding]
-  matrix_id_format: "{base}-{question_type}-rollout9331"
\ No newline at end of file
diff --git a/configs/envs/meqsum.yaml b/configs/envs/meqsum.yaml
deleted file mode 100644
index 83e0cd24..00000000
--- a/configs/envs/meqsum.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-# MeQSum - Consumer Health Question Summarization
-# Dataset: medarc/MeQSum-patient-consumer-health-questions
-
-meqsum:
-  env_name: meqsum
-  env_args:
-    split: test
-    compute_auto_metrics: true
-
-meqsum_val:
-  env_name: meqsum
-  env_args:
-    split: validation
-    compute_auto_metrics: true
-
-meqsum_fast:
-  env_name: meqsum
-  env_args:
-    split: test
-    compute_auto_metrics: false
diff --git a/configs/envs/metamedqa.yaml b/configs/envs/metamedqa.yaml
deleted file mode 100644
index ae12c4ea..00000000
--- a/configs/envs/metamedqa.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-- id: metamedqa
-  module: metamedqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-
-- id: metamedqa
-  module: metamedqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-  matrix:
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-rollout{shuffle_seed}"
diff --git a/configs/envs/mmlu_pro_health.yaml b/configs/envs/mmlu_pro_health.yaml
deleted file mode 100644
index 28b7576e..00000000
--- a/configs/envs/mmlu_pro_health.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-- id: mmlu_pro_health
-  module: mmlu_pro_health
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-
-- id: mmlu_pro_health
-  module: mmlu_pro_health
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-  matrix:
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-rollout{shuffle_seed}"
diff --git a/configs/envs/mtsamples.yaml b/configs/envs/mtsamples.yaml
deleted file mode 100644
index d709d3d6..00000000
--- a/configs/envs/mtsamples.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-- id: mtsamples_procedures
-  module: mtsamples_procedures
-  verbose: false
-  num_examples: -1
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - x-ai/grok-4.1-fast
-    judge_base_url: https://api.pinference.ai/api/v1
-
-- id: mtsamples_replicate
-  module: mtsamples_replicate
-  verbose: false
-  num_examples: -1
-  env_args:
-    judge_model:
-      - openai/gpt-5-mini
-      - x-ai/grok-4.1-fast
-    judge_base_url: https://api.pinference.ai/api/v1
\ No newline at end of file
diff --git a/configs/envs/pubhealthbench_free.yaml b/configs/envs/pubhealthbench_free.yaml
deleted file mode 100644
index b3fcea38..00000000
--- a/configs/envs/pubhealthbench_free.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# Full test set - single unshuffled run
-- id: pubhealthbench_freeform
-  module: pubhealthbench
-  num_examples: -1
-  verbose: false
-  env_args:
-    split: freeform
-    judge_model:
-      - openai/gpt-5-mini
-      - google/gemini-3-flash-preview
-    judge_base_url: https://api.pinference.ai/api/v1
\ No newline at end of file
diff --git a/configs/envs/pubhealthbench_mcq.yaml b/configs/envs/pubhealthbench_mcq.yaml
deleted file mode 100644
index f7347e3a..00000000
--- a/configs/envs/pubhealthbench_mcq.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-# Reviewed set - shuffled with matrix
-- id: pubhealthbench_reviewed
-  module: pubhealthbench
-  num_examples: -1
-  verbose: false
-  env_args:
-    split: reviewed
-
-- id: pubhealthbench_reviewed
-  module: pubhealthbench
-  num_examples: -1
-  verbose: false
-  env_args:
-    split: reviewed
-    shuffle_answers: true
-  matrix:
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-rollout{shuffle_seed}"
diff --git a/configs/envs/pubmedqa.yaml b/configs/envs/pubmedqa.yaml
deleted file mode 100644
index d71c01e1..00000000
--- a/configs/envs/pubmedqa.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-- id: pubmedqa
-  module: pubmedqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-
-- id: pubmedqa
-  module: pubmedqa
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-  matrix:
-    shuffle_seed: [1618, 9331]
-  matrix_id_format: "{base}-rollout{shuffle_seed}"
diff --git a/configs/envs/sctpublic.yaml b/configs/envs/sctpublic.yaml
deleted file mode 100644
index 533ab0d2..00000000
--- a/configs/envs/sctpublic.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-- id: sctpublic
-  module: sctpublic
-  num_examples: -1
-  rollouts_per_example: 1
-  verbose: false
\ No newline at end of file
diff --git a/configs/envs/supergpqa_medicine.yaml b/configs/envs/supergpqa_medicine.yaml
deleted file mode 100644
index ba1e922a..00000000
--- a/configs/envs/supergpqa_medicine.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-- id: supergpqa_medicine
-  module: supergpqa_medicine
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  matrix:
-    difficulty: [easy, hard]
-  matrix_id_format: "{base}-{difficulty}"
-
-- id: supergpqa_medicine
-  module: supergpqa_medicine
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-    shuffle_seed: 1618
-  matrix:
-    difficulty: [easy, hard]
-  matrix_id_format: "{base}-{difficulty}-rollout1618"
-
-- id: supergpqa_medicine
-  module: supergpqa_medicine
-  rollouts_per_example: 1
-  num_examples: -1
-  verbose: false
-  env_args:
-    shuffle_answers: true
-    shuffle_seed: 9331
-  matrix:
-    difficulty: [easy, hard]
-  matrix_id_format: "{base}-{difficulty}-rollout9331"

From 1a0b228d085246aecfa8c7e60c75205195a0fa8a Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 21:57:49 +0000
Subject: [PATCH 51/53] update version for release

---
 medarc_verifiers/__init__.py | 2 +-
 pyproject.toml               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/medarc_verifiers/__init__.py b/medarc_verifiers/__init__.py
index 7bf106ce..be21dcdf 100644
--- a/medarc_verifiers/__init__.py
+++ b/medarc_verifiers/__init__.py
@@ -1,6 +1,6 @@
 import logging
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 
 # Always install judge cache namespacing.
 try:
diff --git a/pyproject.toml b/pyproject.toml
index 3de22497..808603d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "medarc-verifiers"
-version = "0.1.0"
+version = "0.2.0"
 description = "A collection of MedARC utilities and tools for Prime Intellect's verifiers package"
 readme = "docs/README.md"
 requires-python = ">=3.11"

From 7af9b4cefe9a29cc03167732a5e5bf6b56587b0e Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 22:02:39 +0000
Subject: [PATCH 52/53] ruff format

---
 medarc_verifiers/cli/bench_child.py       |  8 ++------
 medarc_verifiers/cli/env_lifecycle.py     |  4 +---
 medarc_verifiers/cli/eval_identity.py     |  9 +++------
 medarc_verifiers/cli/main.py              |  8 ++++++--
 medarc_verifiers/cli/process/discovery.py |  1 +
 medarc_verifiers/utils/sampling_args.py   |  5 +----
 tests/test_cli/test_bench_child.py        |  7 ++++++-
 tests/test_cli/test_isolated_env.py       |  8 ++++++--
 tests/test_cli/test_main.py               | 16 ++++++++++------
 tests/test_cli/test_process_pipeline.py   |  7 ++++---
 10 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/medarc_verifiers/cli/bench_child.py b/medarc_verifiers/cli/bench_child.py
index feecb9b7..54225465 100644
--- a/medarc_verifiers/cli/bench_child.py
+++ b/medarc_verifiers/cli/bench_child.py
@@ -60,13 +60,9 @@ def _run_payload(payload: dict[str, Any]) -> dict[str, Any]:
         config = build_eval_config(payload["raw_config"], overrides=_overrides_from_payload(payload["overrides"]))
         planned_resume_path = Path(payload["resume_path"])
         if config.env_id != payload["expected_env_id"]:
-            raise ValueError(
-                f"Child resolved env_id {config.env_id!r}, expected {payload['expected_env_id']!r}."
-            )
+            raise ValueError(f"Child resolved env_id {config.env_id!r}, expected {payload['expected_env_id']!r}.")
         if config.model != payload["expected_model"]:
-            raise ValueError(
-                f"Child resolved model {config.model!r}, expected {payload['expected_model']!r}."
-            )
+            raise ValueError(f"Child resolved model {config.model!r}, expected {payload['expected_model']!r}.")
         config = config.model_copy(update={"resume_path": planned_resume_path, "save_results": True})
         asyncio.run(run_evaluation(config))
         status["eval_ok"] = True
diff --git a/medarc_verifiers/cli/env_lifecycle.py b/medarc_verifiers/cli/env_lifecycle.py
index 2a2ad3fe..bd76356c 100644
--- a/medarc_verifiers/cli/env_lifecycle.py
+++ b/medarc_verifiers/cli/env_lifecycle.py
@@ -45,9 +45,7 @@ def resolve_env_package(env_id: str, env_dir: str | Path) -> EnvPackageRef:
             "Install it manually or pass --env-dir."
         )
     if not pyproject_path.is_file():
-        raise FileNotFoundError(
-            f"Environment {env_id!r} local package at {env_root} is missing pyproject.toml."
-        )
+        raise FileNotFoundError(f"Environment {env_id!r} local package at {env_root} is missing pyproject.toml.")
 
     with pyproject_path.open("rb") as handle:
         pyproject_data: dict[str, Any] = load_toml(handle)
diff --git a/medarc_verifiers/cli/eval_identity.py b/medarc_verifiers/cli/eval_identity.py
index 564c9384..55eddcb7 100644
--- a/medarc_verifiers/cli/eval_identity.py
+++ b/medarc_verifiers/cli/eval_identity.py
@@ -98,8 +98,7 @@ def _ensure_unique_identities(plans: Sequence[EvalPathPlan]) -> None:
     duplicates = sorted(identity for identity, count in Counter(identities).items() if count > 1)
     if duplicates:
         rendered = ", ".join(
-            f"model={model!r}, env_id={env_id!r}, variant_id={variant_id!r}"
-            for model, env_id, variant_id in duplicates
+            f"model={model!r}, env_id={env_id!r}, variant_id={variant_id!r}" for model, env_id, variant_id in duplicates
         )
         raise ValueError(f"Duplicate TOML eval identity; add a distinct variant_id/name: {rendered}")
 
@@ -161,9 +160,7 @@ def _variant_id(config: Mapping[str, Any], *, index: int) -> str:
     variant = _normalize_variant(raw_variant, config=config, field="variant_id", index=index)
     name = _normalize_variant(raw_name, config=config, field="name", index=index)
     if variant and name and variant != name:
-        raise ValueError(
-            f"TOML eval {index} has conflicting variant_id/name values: {variant!r} != {name!r}."
-        )
+        raise ValueError(f"TOML eval {index} has conflicting variant_id/name values: {variant!r} != {name!r}.")
     return variant or name or BASE_VARIANT_ID
 
 
@@ -175,7 +172,7 @@ def _normalize_variant(value: Any, *, config: Mapping[str, Any], field: str, ind
         raise ValueError(f"TOML eval {index} {field} must not be empty.")
     if slug_component(text, max_length=_MAX_VARIANT_ID_LENGTH) != text:
         raise ValueError(
-            f'TOML eval {index} {field} {text!r} is not path-safe. '
+            f"TOML eval {index} {field} {text!r} is not path-safe. "
             'Use only letters, numbers, ".", "_", and "-", for example "shuffle_seed-1618".'
         )
     return text
diff --git a/medarc_verifiers/cli/main.py b/medarc_verifiers/cli/main.py
index bbe0818f..744c8342 100644
--- a/medarc_verifiers/cli/main.py
+++ b/medarc_verifiers/cli/main.py
@@ -1290,7 +1290,9 @@ def _run_toml_bench(args: argparse.Namespace) -> int:
         return 0
     if missing_envs and not args.auto_install:
         raise RuntimeError(_missing_envs_error(missing_envs))
-    return _execute_selected_toml_plan(selected_raw, plan_inputs, path_plans, overrides, args, missing_envs=missing_envs)
+    return _execute_selected_toml_plan(
+        selected_raw, plan_inputs, path_plans, overrides, args, missing_envs=missing_envs
+    )
 
 
 def _toml_eval_overrides(args: argparse.Namespace) -> EvalConfigOverrides:
@@ -1568,7 +1570,9 @@ def _jsonable_mapping(value: Mapping[str, Any]) -> dict[str, Any]:
     return result
 
 
-def _load_child_status(status_path: Path, *, completed: subprocess.CompletedProcess[str] | None = None) -> dict[str, Any]:
+def _load_child_status(
+    status_path: Path, *, completed: subprocess.CompletedProcess[str] | None = None
+) -> dict[str, Any]:
     if not status_path.is_file():
         tail = _completed_process_tail(completed) if completed is not None else ""
         detail = f"\n{tail}" if tail else ""
diff --git a/medarc_verifiers/cli/process/discovery.py b/medarc_verifiers/cli/process/discovery.py
index bf5112b1..a38eb1b7 100644
--- a/medarc_verifiers/cli/process/discovery.py
+++ b/medarc_verifiers/cli/process/discovery.py
@@ -224,6 +224,7 @@ def _infer_eval_output_layout(
         "variant_id": variant_id or "",
     }
 
+
 def _read_metadata_payload(path: Path) -> Mapping[str, Any] | None:
     try:
         payload = json.loads(path.read_text(encoding="utf-8"))
diff --git a/medarc_verifiers/utils/sampling_args.py b/medarc_verifiers/utils/sampling_args.py
index e7adb8dd..ccff5ce7 100644
--- a/medarc_verifiers/utils/sampling_args.py
+++ b/medarc_verifiers/utils/sampling_args.py
@@ -136,10 +136,7 @@ def _sanitize_anthropic_messages(sampling_args: Mapping[str, Any]) -> dict[str,
 def _validate_anthropic_effort(value: Any) -> str:
     effort_values = _get_anthropic_effort_values()
     if not isinstance(value, str) or value not in effort_values:
-        raise ValueError(
-            "anthropic_messages reasoning effort must be one of: "
-            f"{', '.join(sorted(effort_values))}"
-        )
+        raise ValueError(f"anthropic_messages reasoning effort must be one of: {', '.join(sorted(effort_values))}")
     return value
 
 
diff --git a/tests/test_cli/test_bench_child.py b/tests/test_cli/test_bench_child.py
index 733bafed..943c9d41 100644
--- a/tests/test_cli/test_bench_child.py
+++ b/tests/test_cli/test_bench_child.py
@@ -48,7 +48,12 @@ async def fake_run_evaluation(run_config):
 
     assert status["exit_code"] == 0
     assert status["installed_by_child"] is True
-    assert calls == ["install", "build", f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}", "cleanup"]
+    assert calls == [
+        "install",
+        "build",
+        f"run:{tmp_path / 'runs' / 'evals' / 'parent-model' / 'medqa' / 'base'}",
+        "cleanup",
+    ]
 
 
 def test_child_cleanup_env_package_false_skips_uninstall(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
diff --git a/tests/test_cli/test_isolated_env.py b/tests/test_cli/test_isolated_env.py
index 77065c09..1c11b1eb 100644
--- a/tests/test_cli/test_isolated_env.py
+++ b/tests/test_cli/test_isolated_env.py
@@ -52,7 +52,9 @@ def test_current_medarc_install_spec_rejects_invalid_editable_checkout(
 def test_install_medarc_non_editable_uses_pinned_version(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     commands: list[list[str]] = []
 
-    monkeypatch.setattr(isolated_env, "current_medarc_install_spec", lambda: isolated_env.MedarcInstallSpec(False, "9.8.7"))
+    monkeypatch.setattr(
+        isolated_env, "current_medarc_install_spec", lambda: isolated_env.MedarcInstallSpec(False, "9.8.7")
+    )
     monkeypatch.setattr(isolated_env, "_run_uv", lambda command, action: commands.append(command))
 
     isolated_env.install_medarc_into_venv(tmp_path / "python")
@@ -64,7 +66,9 @@ def test_install_medarc_non_editable_resolution_failure_is_actionable(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
 ) -> None:
-    monkeypatch.setattr(isolated_env, "current_medarc_install_spec", lambda: isolated_env.MedarcInstallSpec(False, "9.8.7"))
+    monkeypatch.setattr(
+        isolated_env, "current_medarc_install_spec", lambda: isolated_env.MedarcInstallSpec(False, "9.8.7")
+    )
 
     def fail(command: list[str], action: str) -> None:
         raise isolated_env.IsolatedEnvError("resolver failed")
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index a87104c5..c2aec114 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -356,7 +356,9 @@ def fake_build(raw: dict[str, Any], *, overrides: Any) -> SimpleNamespace:
     async def fake_run(config, **_kwargs):
         calls.append(Path(config.resume_path))
         Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
-        Path(config.resume_path, "metadata.json").write_text(json.dumps({"env_id": config.env_id, "model": config.model}))
+        Path(config.resume_path, "metadata.json").write_text(
+            json.dumps({"env_id": config.env_id, "model": config.model})
+        )
         return {"outputs": [], "metadata": {}}
 
     monkeypatch.setattr(main, "build_eval_config", fake_build)
@@ -426,7 +428,9 @@ def fake_build(raw: dict[str, Any], *, overrides: Any) -> SimpleNamespace:
     async def fake_run(config, **_kwargs):
         parent_runs.append(config.env_id)
         Path(config.resume_path, "results.jsonl").write_text(json.dumps({"example_id": "0"}) + "\n")
-        Path(config.resume_path, "metadata.json").write_text(json.dumps({"env_id": config.env_id, "model": config.model}))
+        Path(config.resume_path, "metadata.json").write_text(
+            json.dumps({"env_id": config.env_id, "model": config.model})
+        )
 
     class FakeVenv:
         def __enter__(self) -> Path:
@@ -1504,7 +1508,9 @@ def test_load_env_export_map_adds_module_variant_keys(tmp_path: Path) -> None:
 
     env_map = main._load_env_export_map(env_root)
 
-    variant_key = "medcalc_bench::env_args.add_calculator_tool-true__env_args.add_python_tool-true__env_args.version-verified"
+    variant_key = (
+        "medcalc_bench::env_args.add_calculator_tool-true__env_args.add_python_tool-true__env_args.version-verified"
+    )
     assert "medcalc_bench_tools" in env_map
     assert variant_key in env_map
     assert env_map[variant_key].answer_column == "ground_truth"
@@ -1843,9 +1849,7 @@ def test_process_cli_requires_winrate_config_path(tmp_path: Path) -> None:
         )
 
 
-def test_process_cli_records_default_max_results_missing_pct(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
-) -> None:
+def test_process_cli_records_default_max_results_missing_pct(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
     captured: dict[str, Any] = {}
 
     def fake_run_process(options, env_export_map):
diff --git a/tests/test_cli/test_process_pipeline.py b/tests/test_cli/test_process_pipeline.py
index 180646b5..926da9a5 100644
--- a/tests/test_cli/test_process_pipeline.py
+++ b/tests/test_cli/test_process_pipeline.py
@@ -276,9 +276,10 @@ def test_run_process_preserves_deterministic_eval_variants(tmp_path: Path) -> No
     ]
     index_payload = json.loads((output_dir / "env_index.json").read_text(encoding="utf-8"))
     assert sorted(index_payload["files"]) == rel_paths
-    assert {
-        entry["variant_id"] for entry in index_payload["files"].values()
-    } == {"env_args.shuffle_seed-1618", "env_args.shuffle_seed-9331"}
+    assert {entry["variant_id"] for entry in index_payload["files"].values()} == {
+        "env_args.shuffle_seed-1618",
+        "env_args.shuffle_seed-9331",
+    }
 
 
 def test_run_process_excludes_specific_deterministic_eval_variant(tmp_path: Path) -> None:

From 37cf5f201b22c3f632bcd120677f441473036d28 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Tue, 12 May 2026 22:17:36 +0000
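Subject: [PATCH 53/53] small bug fixes

- supergpqa_medicine: turn the prompt templates into raw strings so the
  LaTeX commands in the five-shot examples (\frac, \text, \times, \right)
  are no longer interpreted as \f, \t and \r escape sequences; \\boxed
  becomes \boxed so the rendered prompt text is unchanged.
- winrate: only wrap the dataset loop in a rich progress bar when
  _get_console() returns a console whose is_terminal flag is set, and
  render the bar on that console.
- tests: stub main._missing_selected_env_refs to return {} in the TOML
  bench tests.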
---
 .../supergpqa_medicine/supergpqa_medicine.py   | 18 +++++++++---------
 medarc_verifiers/cli/winrate/api.py            | 12 +++++++-----
 tests/test_cli/test_main.py                    | 13 +++++++++++++
 3 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/environments/supergpqa_medicine/supergpqa_medicine.py b/environments/supergpqa_medicine/supergpqa_medicine.py
index ac0fa5a5..b7b55cbb 100644
--- a/environments/supergpqa_medicine/supergpqa_medicine.py
+++ b/environments/supergpqa_medicine/supergpqa_medicine.py
@@ -12,14 +12,14 @@
 
 disable_progress_bar()  # suppress datasets mapping progress bar
 
-ZERO_SHOT_PROMPT_TEMPLATE = """
-Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: \\boxed{{$LETTER}}' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.
+ZERO_SHOT_PROMPT_TEMPLATE = r"""
+Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: \boxed{{$LETTER}}' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.
 
 {}
 """.strip()
 
-FIVE_SHOT_PROMPT_TEMPLATE = """
-Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: \\boxed{{$LETTER}}' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.
+FIVE_SHOT_PROMPT_TEMPLATE = r"""
+Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: \boxed{{$LETTER}}' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.
 
 Question:
 A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is
@@ -35,7 +35,7 @@
 J) 20
 
 Answer: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4.
-Answer: \\boxed{{H}}.
+Answer: \boxed{{H}}.
 
 Question:
 Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?
@@ -54,7 +54,7 @@
 \[
 \frac{{\left(\frac{{50 \text{{ cm}}}}{{2}}\right)^2}}{{\left(\frac{{5 \text{{ mm}}}}{{2}}\right)^2}} = \frac{{\left(\frac{{50 \text{{ cm}}}}{{0.1 \text{{ cm}}}}\right)^2}}{{\left(\frac{{5 \text{{ mm}}}}{{0.1 \text{{ cm}}}}\right)^2}} = \frac{{500^2}}{{5^2}} = 10000.
 \]
-Answer: \\boxed{{E}}.
+Answer: \boxed{{E}}.
 
 Question:
 Where do most short-period comets come from and how do we know?
@@ -67,7 +67,7 @@
 G) The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the asteroid belt.
 
 Answer: Let's think step by step. Most short-period comets originate from the Kuiper belt. This is deduced from the observation that these comets tend to follow orbits that lie in the plane of the solar system, similar to the distribution of objects in the Kuiper belt itself. Thus, the alignment of these cometary orbits with the ecliptic plane points to their Kuiper belt origin.
-Answer: \\boxed{{A}}.
+Answer: \boxed{{A}}.
 
 Question:
 Colors in a soap bubble result from light
@@ -83,7 +83,7 @@
 J) transmission
 
 Answer: Let's think step by step. The colorful patterns observed in a soap bubble are caused by the phenomenon of light interference. This occurs when light waves bounce between the two surfaces of the soap film, combining constructively or destructively based on their phase differences and the varying thickness of the film. These interactions result in vibrant color patterns due to variations in the intensity of different wavelengths of light.
-Answer: \\boxed{{E}}.
+Answer: \boxed{{E}}.
 
 Question:
 A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven?
@@ -103,7 +103,7 @@
 \text{{Power}} = \text{{Voltage}} \times \text{{Current}} = 120 \, \text{{V}} \times 2 \, \text{{A}} = 240 \, \text{{W}}.
 \]
 Therefore, the microwave oven uses energy at a rate of 240 watts.
-Answer: \\boxed{{A}}.
+Answer: \boxed{{A}}.
 
 Question:
 {}
diff --git a/medarc_verifiers/cli/winrate/api.py b/medarc_verifiers/cli/winrate/api.py
index 1f6b2e65..5ae37e69 100644
--- a/medarc_verifiers/cli/winrate/api.py
+++ b/medarc_verifiers/cli/winrate/api.py
@@ -315,12 +315,14 @@ def compute_winrates(
     seen_model_case_map: dict[str, str] = {}
 
     dataset_iter: Iterable[tuple[str, Path | str]] = datasets
-    try:
-        from rich.progress import track
+    console = _get_console()
+    if console is not None and getattr(console, "is_terminal", False):
+        try:
+            from rich.progress import track
 
-        dataset_iter = track(datasets, description="Computing win rates", transient=True)
-    except Exception:
-        dataset_iter = datasets
+            dataset_iter = track(datasets, description="Computing win rates", transient=True, console=console)
+        except Exception:
+            dataset_iter = datasets
 
     for dataset_name, parquet_path in dataset_iter:
         stats, models_present, missingness = _process_dataset(
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index c2aec114..fbb929f8 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -38,6 +38,10 @@ def _patch_single_run_metadata_only(monkeypatch: pytest.MonkeyPatch, metadata: l
     )
 
 
+def _patch_toml_bench_envs_installed(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(main, "_missing_selected_env_refs", lambda plan_inputs, args: {})
+
+
 def _make_env_param(
     name: str,
     *,
@@ -629,6 +633,7 @@ def test_toml_bench_executes_sequentially_to_deterministic_path(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
 ) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
     _write_config(
@@ -670,6 +675,7 @@ async def fake_run(config, on_progress=None, **_kwargs):
 
 
 def test_toml_bench_defaults_max_concurrent_to_one(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     config_path = tmp_path / "bench.toml"
     _write_config(
         config_path,
@@ -711,6 +717,7 @@ async def fake_run(config, **_kwargs):
 
 
 def test_toml_bench_defaults_to_runs_evals(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     monkeypatch.chdir(tmp_path)
     config_path = tmp_path / "bench.toml"
     _write_config(
@@ -738,6 +745,7 @@ async def fake_run(config, **_kwargs):
 
 
 def test_toml_bench_auto_resumes_existing_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
     _write_config(
@@ -770,6 +778,7 @@ def test_toml_bench_resume_refuses_malformed_existing_output(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
 ) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
     _write_config(
@@ -798,6 +807,7 @@ async def fake_run(config, **_kwargs):
 
 
 def test_toml_bench_reuses_empty_existing_output_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
     _write_config(
@@ -829,6 +839,7 @@ async def fake_run(config, **_kwargs):
 
 
 def test_toml_bench_force_archives_existing_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
     _write_config(
@@ -860,6 +871,7 @@ async def fake_run(config, **_kwargs):
 
 
 def test_toml_bench_resume_preserves_existing_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     config_path = tmp_path / "bench.toml"
     output_dir = tmp_path / "evals"
     _write_config(
@@ -906,6 +918,7 @@ def test_toml_bench_does_not_patch_upstream_metadata_saves(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
 ) -> None:
+    _patch_toml_bench_envs_installed(monkeypatch)
     import verifiers.envs.environment as environment_module
 
     config_path = tmp_path / "bench.toml"