From 29b61f8e84a418549162a070bae65b129780f55a Mon Sep 17 00:00:00 2001
From: "Eric C. Hansen" <ericchansen@users.noreply.github.com>
Date: Thu, 28 May 2026 12:40:07 -0500
Subject: [PATCH 1/7] feat(systems): add starting_point="qfuerza" to
 load_system
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a starting_point parameter to load_system that, when set to
"qfuerza", overwrites the published OPT bond/angle scalars with
QFUERZA (Farrugia 2025) Hessian-derived values while leaving the
published OPT topology, frozen MM3 backbone, vdW, stretch-bend,
Urey-Bradley, and torsion rows untouched. Torsions are zeroed by
qfuerza_into per Farrugia 2025; for systems where published torsions
are already zero (e.g. rh-enamide Donoghue 2008), this is a no-op.

The default "published" path is unchanged — existing baselines and
behavior are preserved.

A per-param-type audit is attached to SystemData.metadata under
starting_point_audit, classifying every scalar as qfuerza_overwritten,
retained_published, or frozen. This makes leakage (vdW, unmatched
bond/angle rows, etc.) honest and visible rather than hidden.

The qfuerza_fresh strategy (CH3F) treats "qfuerza" as a no-op since
its starting FF is already QFUERZA-derived.

scripts/regenerate_convergence_results.py grows a --starting-point
{published,qfuerza} flag; output subdirectory becomes from-qfuerza/
instead of convergence/ when qfuerza is selected, preserving baseline
artifacts.

Anchors: Farrugia 2025 (DOI 10.1021/acs.jctc.5c01751), AGENTS.md §6.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 q2mm/diagnostics/systems.py               | 131 +++++++++++++++++++++-
 scripts/regenerate_convergence_results.py |  35 +++++-
 test/test_systems.py                      |  97 ++++++++++++++++
 3 files changed, 257 insertions(+), 6 deletions(-)

diff --git a/q2mm/diagnostics/systems.py b/q2mm/diagnostics/systems.py
index 101a46de..8e9b84eb 100644
--- a/q2mm/diagnostics/systems.py
+++ b/q2mm/diagnostics/systems.py
@@ -490,12 +490,110 @@ class SystemSpec:
     default_forms: tuple[str, ...] = ("mm3",)
 
 
+StartingPoint = Literal["published", "qfuerza"]
+"""Choice of starting-parameter values for the optimizer.
+
+- ``"published"`` (default): use the literature OPT values from the
+  published .fld file(s) as-is.  This is the historical Q2MM workflow
+  for all published-FF benchmark systems.
+- ``"qfuerza"``: take the published OPT *topology* (which atom-type
+  rows exist, the frozen/active partition) but overwrite OPT bond and
+  angle values with QFUERZA Hessian-derived estimates averaged across
+  the training molecules (Farrugia 2025 protocol).  Frozen MM3
+  backbone parameters are untouched; torsions are zeroed; OPT
+  parameters that :func:`qfuerza_into` does not estimate (stretch-bends,
+  vdW, Urey-Bradley) retain their published values — the audit in
+  :func:`load_system` records this explicitly.
+
+For ``qfuerza_fresh`` strategy (CH3F), ``"qfuerza"`` is a no-op
+because the FF is already QFUERZA-derived; the audit records this.
+"""
+
+
+def _audit_starting_point(
+    ff: ForceField,
+    *,
+    before_vec: np.ndarray | None,
+    starting_point: StartingPoint,
+) -> dict[str, Any]:
+    """Classify every scalar param as qfuerza/retained_published/frozen.
+
+    Honest accounting of where the starting values come from.  For
+    ``starting_point="published"`` everything active is
+    ``retained_published``.  For ``"qfuerza"`` we diff the parameter
+    vector before vs after :func:`qfuerza_into` and call any active
+    scalar whose value changed ``qfuerza_overwritten``; any active
+    scalar whose value did not change is ``retained_published``
+    (e.g. an OPT stretch-bend, an active bond/angle that QFUERZA could
+    not match to any training molecule).
+
+    Args:
+        ff: Force field *after* any QFUERZA overwrite.
+        before_vec: Param vector snapshot from before the overwrite,
+            or ``None`` for the published case.
+        starting_point: The starting-point choice.
+
+    Returns:
+        A nested dict ``{starting_point, n_active, n_frozen, by_type:
+        {bond_fc: {qfuerza_overwritten, retained_published, frozen}, …}}``.
+
+    """
+    after_vec = ff.get_param_vector()
+    active = ff.active_mask
+
+    type_labels: list[str] = []
+    for bond in ff.bonds:
+        type_labels.extend(["bond_fc", "bond_eq"])
+        del bond
+    for angle in ff.angles:
+        type_labels.extend(["angle_fc", "angle_eq"])
+        del angle
+    type_labels.extend(["torsion_fc"] * len(ff.torsions))
+    type_labels.extend(["stretch_bend_fc"] * len(ff.stretch_bends))
+    for vdw in ff.vdws:
+        type_labels.extend(["vdw_radius", "vdw_epsilon"])
+        del vdw
+    for ub in ff._ub_angles:  # noqa: SLF001 — diagnostic
+        type_labels.extend(["ub_fc", "ub_eq"])
+        del ub
+
+    if len(type_labels) != len(after_vec):
+        raise AssertionError(f"type label / param vector length mismatch: {len(type_labels)} vs {len(after_vec)}")
+
+    by_type: dict[str, dict[str, int]] = {}
+    for label, is_active, after_val, before_val in zip(
+        type_labels,
+        active,
+        after_vec,
+        before_vec if before_vec is not None else after_vec,
+        strict=True,
+    ):
+        bucket = by_type.setdefault(
+            label,
+            {"qfuerza_overwritten": 0, "retained_published": 0, "frozen": 0},
+        )
+        if not is_active:
+            bucket["frozen"] += 1
+        elif before_vec is not None and not np.isclose(before_val, after_val, rtol=0.0, atol=1e-12):
+            bucket["qfuerza_overwritten"] += 1
+        else:
+            bucket["retained_published"] += 1
+
+    return {
+        "starting_point": starting_point,
+        "n_active": int(active.sum()),
+        "n_frozen": int((~active).sum()),
+        "by_type": by_type,
+    }
+
+
 def load_system(
     key: str,
     *,
     engine: Any | None = None,
     functional_form: str | None = None,
     molecule_loader_kwargs: dict[str, Any] | None = None,
+    starting_point: StartingPoint = "published",
 ) -> SystemData:
     """Build a :class:`SystemData` for one benchmark system.
 
@@ -521,9 +619,21 @@ def load_system(
             (``"harmonic"`` or ``"mm3"``).
         molecule_loader_kwargs: Optional kwargs forwarded to the
             system's molecule loader (e.g. ``data_dir`` for CH3F).
+        starting_point: Where the starting OPT parameter values come
+            from.  See :data:`StartingPoint`.  ``"published"`` is the
+            historical default and preserves backward compatibility.
+            ``"qfuerza"`` overwrites OPT bond/angle values with
+            multi-molecule QFUERZA estimates after FF assembly (per
+            Farrugia 2025).  CH3F (``qfuerza_fresh`` strategy) treats
+            ``"qfuerza"`` as a no-op since the FF is already QFUERZA-
+            derived.
 
     Returns:
-        A fully-populated :class:`SystemData`.
+        A fully-populated :class:`SystemData`.  The
+        ``metadata["starting_point_audit"]`` block reports, for each
+        parameter type, how many active scalars were QFUERZA-overwritten
+        vs retained from the published FF vs frozen — see
+        :func:`_audit_starting_point`.
 
     Raises:
         KeyError: If *key* is not in :data:`SYSTEMS`.
@@ -563,6 +673,23 @@ def load_system(
     if functional_form is not None:
         ff.functional_form = FunctionalForm(functional_form)
 
+    # 2b. Optional QFUERZA overwrite of OPT parameter values --------------
+    # For ``starting_point="qfuerza"`` we keep the published FF
+    # topology + frozen/active partition but replace the active
+    # OPT bond/angle values with multi-molecule QFUERZA estimates
+    # (Farrugia 2025 §"Methods", per-molecule mean averaging).
+    # ``qfuerza_into`` honours the frozen partition, so MM3 backbone
+    # rows are never touched.  Active rows that QFUERZA cannot match
+    # to any training molecule retain their published values — the
+    # audit captures this so the leak is visible downstream.
+    from q2mm.models.seminario import qfuerza_into
+
+    before_vec: np.ndarray | None = None
+    if starting_point == "qfuerza" and spec.ff_strategy != "qfuerza_fresh":
+        before_vec = ff.get_param_vector().copy()
+        qfuerza_into(ff, molecules, invert_ts_curvature=True)
+    starting_point_audit = _audit_starting_point(ff, before_vec=before_vec, starting_point=starting_point)
+
     # 3. Reference data ----------------------------------------------------
     if spec.ff_strategy == "qfuerza_fresh":
         # Frequency-only reference: requires the engine to compute MM
@@ -605,6 +732,8 @@ def load_system(
         "molecule_name": spec.name,
         "n_molecules": len(molecules),
         "n_atoms_per_mol": [len(m.symbols) for m in molecules],
+        "starting_point": starting_point,
+        "starting_point_audit": starting_point_audit,
         **dict(spec.metadata),
         **({"functional_form": resolved_form} if resolved_form else {}),
     }
diff --git a/scripts/regenerate_convergence_results.py b/scripts/regenerate_convergence_results.py
index 194df73c..af3edd47 100644
--- a/scripts/regenerate_convergence_results.py
+++ b/scripts/regenerate_convergence_results.py
@@ -17,7 +17,9 @@
    improvement percentage, and whether the mean change exceeds the summed
    confidence intervals.
 
-Outputs (per system, into ``<output-dir>/<system-data-dir>/convergence/``):
+Outputs (per system, into ``<output-dir>/<system-data-dir>/<subdir>/``,
+where ``<subdir>`` is ``convergence`` for the default ``--starting-point
+published`` and ``from-qfuerza`` for ``--starting-point qfuerza``):
 
 - ``validation_results.json`` — summary numbers for the system (strict JSON,
   no ``Infinity`` or ``NaN``).  Ratio state is encoded across three keys:
@@ -146,6 +148,7 @@ def _build_provenance(args: argparse.Namespace, output_dir: Path) -> dict[str, A
         "maxiter": args.maxiter,
         "n_evals": args.n_evals,
         "skip_optimization": args.skip_optimization,
+        "starting_point": args.starting_point,
         "devices": _device_info(),
     }
 
@@ -425,6 +428,7 @@ def process_system(
     maxiter: int,
     n_evals: int,
     skip_optimization: bool,
+    starting_point: str,
     provenance: dict[str, Any],
 ) -> dict[str, Any]:
     """Process one system end-to-end and write its artifacts."""
@@ -435,9 +439,9 @@ def process_system(
     if system_key not in SYSTEMS:
         raise ValueError(f"Unknown system: {system_key}")
 
-    logger.info("[%s] loading", system_key)
+    logger.info("[%s] loading (starting_point=%s)", system_key, starting_point)
     engine = JaxEngine()
-    sys_data = load_system(system_key, engine=engine)
+    sys_data = load_system(system_key, engine=engine, starting_point=starting_point)
     ff = sys_data.forcefield
 
     # ---- Seminario fit quality ------------------------------------------
@@ -455,6 +459,8 @@ def process_system(
         "system": system_key,
         "n_molecules": len(sys_data.molecules),
         "n_active_params": n_active,
+        "starting_point": starting_point,
+        "starting_point_audit": sys_data.metadata.get("starting_point_audit"),
         "initial_obj_score": initial_score,
         "initial_jaxloss": initial_jaxloss,
         **ratio_info,
@@ -538,7 +544,8 @@ def process_system(
 
     # ---- Write artifacts -------------------------------------------------
     data_dir_name = DATA_DIR_FOR_SYSTEM.get(system_key, system_key)
-    sys_out = output_dir / data_dir_name / "convergence"
+    subdir = "convergence" if starting_point == "published" else f"from-{starting_point}"
+    sys_out = output_dir / data_dir_name / subdir
     sys_out.mkdir(parents=True, exist_ok=True)
 
     _write_strict_json(
@@ -618,6 +625,17 @@ def main() -> int:
         action="store_true",
         help="Compute baseline metrics only; do not optimize any system.",
     )
+    parser.add_argument(
+        "--starting-point",
+        choices=("published", "qfuerza"),
+        default="published",
+        help="Starting force-field parameters. 'published' uses the literature OPT values "
+        "(default, backward compatible). 'qfuerza' overwrites the OPT bond/angle scalars "
+        "with QFUERZA (Farrugia 2025) Hessian-derived values while keeping the published "
+        "topology and frozen MM3 backbone — used for QFUERZA-recovery validation runs. "
+        "Output subdirectory becomes 'from-qfuerza' instead of 'convergence' to avoid "
+        "overwriting baselines.",
+    )
     parser.add_argument(
         "--combined-output",
         type=Path,
@@ -651,7 +669,13 @@ def main() -> int:
     provenance = _build_provenance(args, output_dir)
     logger.info("Output directory: %s", output_dir)
     logger.info("Systems: %s", systems)
-    logger.info("ratio_tol=%s, maxiter=%d, n_evals=%d", args.ratio_tol, args.maxiter, args.n_evals)
+    logger.info(
+        "ratio_tol=%s, maxiter=%d, n_evals=%d, starting_point=%s",
+        args.ratio_tol,
+        args.maxiter,
+        args.n_evals,
+        args.starting_point,
+    )
 
     combined: dict[str, Any] = {}
     failures: list[str] = []
@@ -664,6 +688,7 @@ def main() -> int:
                 maxiter=args.maxiter,
                 n_evals=args.n_evals,
                 skip_optimization=args.skip_optimization,
+                starting_point=args.starting_point,
                 provenance=provenance,
             )
             combined[sys_key] = summary
diff --git a/test/test_systems.py b/test/test_systems.py
index f93dab31..98d4e856 100644
--- a/test/test_systems.py
+++ b/test/test_systems.py
@@ -5,6 +5,7 @@
 from collections import Counter
 
 import numpy as np
+import pytest
 
 from test._shared import make_water
 
@@ -83,3 +84,99 @@ def test_include_geometry_true_is_default(self) -> None:
         counts = Counter(value.kind for value in ref.values)
         assert counts["bond_length"] > 0
         assert counts["bond_angle"] > 0
+
+
+class TestStartingPoint:
+    """Regression tests for the ``starting_point`` kwarg on load_system.
+
+    Validates the Farrugia 2025 "QFUERZA Hessian-derived values on
+    published topology" workflow:
+
+    1. Published path is unchanged (backward compatibility).
+    2. QFUERZA path overwrites OPT bond/angle values but never touches
+       the frozen MM3 backbone.
+    3. Reference data is identical between the two paths — the
+       starting point should only change the starting parameter values,
+       not the optimization target.
+    4. The audit dict honestly reports which scalars came from QFUERZA
+       vs which retained their published values (e.g. vdW, unmatched
+       bond/angle rows).
+
+    """
+
+    @pytest.mark.external_data
+    def test_published_path_unchanged(self) -> None:
+        """Default ``starting_point="published"`` preserves the historical FF."""
+        sd_default = systems.load_system("rh-enamide")
+        sd_explicit = systems.load_system("rh-enamide", starting_point="published")
+        assert np.allclose(
+            sd_default.forcefield.get_param_vector(),
+            sd_explicit.forcefield.get_param_vector(),
+        )
+        assert sd_default.metadata["starting_point"] == "published"
+
+    @pytest.mark.external_data
+    def test_qfuerza_overwrites_opt_but_not_frozen(self) -> None:
+        """QFUERZA path mutates active OPT scalars; frozen backbone untouched."""
+        sd_pub = systems.load_system("rh-enamide", starting_point="published")
+        sd_q = systems.load_system("rh-enamide", starting_point="qfuerza")
+
+        vec_pub = sd_pub.forcefield.get_param_vector()
+        vec_q = sd_q.forcefield.get_param_vector()
+        mask = sd_pub.forcefield.active_mask
+
+        # Frozen partition is identical
+        assert np.array_equal(mask, sd_q.forcefield.active_mask)
+        # Frozen scalars are bit-identical (no QFUERZA leakage into backbone)
+        assert np.allclose(vec_pub[~mask], vec_q[~mask], atol=0.0)
+        # At least some active scalars changed (QFUERZA did something)
+        assert np.any(vec_pub[mask] != vec_q[mask])
+
+    @pytest.mark.external_data
+    def test_reference_data_unchanged_by_starting_point(self) -> None:
+        """Reference data depends on QM, not on the starting FF."""
+        sd_pub = systems.load_system("rh-enamide", starting_point="published")
+        sd_q = systems.load_system("rh-enamide", starting_point="qfuerza")
+        refs_pub = [(r.kind, float(r.value), float(r.weight)) for r in sd_pub.reference.values]
+        refs_q = [(r.kind, float(r.value), float(r.weight)) for r in sd_q.reference.values]
+        assert refs_pub == refs_q
+
+    @pytest.mark.external_data
+    def test_audit_classifies_scalars_consistently(self) -> None:
+        """Audit counts must sum to the FF's scalar count, per type."""
+        sd = systems.load_system("rh-enamide", starting_point="qfuerza")
+        audit = sd.metadata["starting_point_audit"]
+        assert audit["starting_point"] == "qfuerza"
+        # Sum of all bucket entries equals total parameters
+        total_from_buckets = sum(sum(bucket.values()) for bucket in audit["by_type"].values())
+        assert total_from_buckets == sd.forcefield.n_params
+        # Active + frozen = total
+        assert audit["n_active"] + audit["n_frozen"] == sd.forcefield.n_params
+        # QFUERZA overwrites at least one bond and one angle in rh-enamide
+        assert audit["by_type"]["bond_fc"]["qfuerza_overwritten"] > 0
+        assert audit["by_type"]["angle_fc"]["qfuerza_overwritten"] > 0
+        # MM3 backbone bonds/angles are frozen — no QFUERZA writes there
+        for ptype in ("bond_fc", "bond_eq", "angle_fc", "angle_eq"):
+            bucket = audit["by_type"][ptype]
+            assert bucket["frozen"] > bucket["qfuerza_overwritten"], (
+                f"{ptype}: frozen={bucket['frozen']} <= overwritten={bucket['qfuerza_overwritten']}"
+            )
+
+    def test_qfuerza_is_noop_for_qfuerza_fresh_strategy(self) -> None:
+        """For CH3F (qfuerza_fresh), starting_point='qfuerza' is a no-op."""
+        from q2mm.backends.mm.jax_engine import JaxEngine
+
+        engine = JaxEngine()
+        sd_default = systems.load_system("ch3f", engine=engine)
+        sd_explicit = systems.load_system("ch3f", engine=engine, starting_point="qfuerza")
+        assert np.allclose(
+            sd_default.forcefield.get_param_vector(),
+            sd_explicit.forcefield.get_param_vector(),
+        )
+        # No QFUERZA *overwrite* happened (the FF was already QFUERZA-derived
+        # by the qfuerza_fresh strategy); the audit reflects that everything
+        # active is "retained" relative to itself.
+        audit = sd_explicit.metadata["starting_point_audit"]
+        assert audit["starting_point"] == "qfuerza"
+        for bucket in audit["by_type"].values():
+            assert bucket["qfuerza_overwritten"] == 0

From be810d3be7c55537b5366f0af32cd7d47014c3fc Mon Sep 17 00:00:00 2001
From: Eric Hansen <ericchansen@gmail.com>
Date: Thu, 28 May 2026 15:35:46 -0500
Subject: [PATCH 2/7] docs: add QFUERZA-recovery validation page

Documents results of starting q2mm optimization from QFUERZA Hessian-derived
bond/angle values (instead of published OPT values) on all 5 TS systems.

Headline result: mixed. rh-enamide and pd-allyl converge to the same basin
as published-start runs. pd-conjugate, rh-conjugate, and heck-relay land in
different (worse) basins, with heck-relay failing entirely due to JaxLoss
surrogate divergence at the poor starting FF.

The page is explicit that this is NOT a from-scratch FF generation:
QFUERZA only overwrites bond/angle scalars on top of the published OPT
topology, frozen/active partition, vdW, SB, and atom-type rows. Per-row
audit numbers are reported per system.

Data lives in ericchansen/q2mm-data/benchmarks/<system>/from-qfuerza/.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 docs/benchmarks/qfuerza-recovery.md | 229 ++++++++++++++++++++++++++++
 properdocs.yml                      |   1 +
 2 files changed, 230 insertions(+)
 create mode 100644 docs/benchmarks/qfuerza-recovery.md

diff --git a/docs/benchmarks/qfuerza-recovery.md b/docs/benchmarks/qfuerza-recovery.md
new file mode 100644
index 00000000..05032912
--- /dev/null
+++ b/docs/benchmarks/qfuerza-recovery.md
@@ -0,0 +1,229 @@
+# QFUERZA-Recovery Validation
+
+**Question**: If we *throw away* the published OPT bond/angle values and replace them with QFUERZA Hessian-derived values, does the q2mm optimizer recover the published TSFFs?
+
+**Answer (preview)**: For two of five systems the optimizer reaches essentially the same minimum as a published-start run. For three systems it does not — the QFUERZA starting point lies in a different basin, and L-BFGS-B converges to a local optimum that is far from (and worse than) the published-start result.
+
+This page documents the experiment honestly. It is the strongest end-to-end validation of the q2mm pipeline to date (reference data + weighting + gradients + engine), but the headline result is **mixed**.
+
+---
+
+## 1. What this experiment actually tests
+
+> **This is *not* a from-scratch FF generation.**
+
+QFUERZA-recovery starts from the **published force field topology** —
+which OPT rows exist, which parameters are frozen vs active, the
+atom-type rows, vdW radii/epsilons, stretch-bend coefficients, and
+torsion phase information. It then **overwrites** the bond and angle
+*values* (force constants and equilibria) with QFUERZA-derived values
+computed from the per-molecule QM Hessian, following the Farrugia 2025
+protocol ([10.1021/acs.jctc.5c01751](https://doi.org/10.1021/acs.jctc.5c01751)).
+
+| Layer | QFUERZA-recovery run uses |
+|---|---|
+| OPT row topology (which atom-type triples/quadruples appear) | **Published** |
+| Frozen vs active partition (`freeze_standard_params`) | **Published** |
+| Bond/angle *equilibria* and *force constants* | **QFUERZA** (multi-molecule mean of per-mol QFUERZA, TS-inverted) |
+| Torsion `V₁/V₂/V₃` | Published-zero (Farrugia zeros torsions at QFUERZA-init time; for our TS systems the published OPT torsions are already zero, so the values coincide) |
+| van der Waals `r₀`, `ε` | **Published** (QFUERZA does not touch vdW) |
+| Stretch-bend, MM3 backbone | **Published** (frozen) |
+| Reference data (geometries, eigenmatrix, charges) | Identical to published-start runs |
+| Optimizer | SciPy L-BFGS-B + JaxLoss analytical gradient, `--ratio-tol -1` |
+
+The **per-system overwrite count** is reported in §3 below — only 16–25%
+of active parameters are actually replaced.
+
+A *true* from-scratch run would need a QFUERZA-only loader path
+(`qfuerza_fresh` strategy) that builds the OPT topology from scratch,
+not by overwriting published rows. The current implementation uses the
+published `ff_strategy` for everything except the bond/angle scalars.
+
+---
+
+## 2. Why we ran this
+
+Every TS benchmark in
+[`q2mm-data/benchmarks/*/convergence/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks)
+has started from published OPT values. That answers "can the optimizer
+refine a near-converged FF?" but not "can it find a good FF given only
+QM data?"
+
+The QFUERZA-recovery protocol tests the latter: starting from
+Hessian-derived bond/angle values, can the existing q2mm pipeline
+(reference data, JaxLoss surrogate, L-BFGS-B) close the gap to the
+published TSFF?
+
+---
+
+## 3. Results
+
+All runs: WSL2 + RTX 5090, SciPy L-BFGS-B + JaxLoss `jac='auto'`,
+`--ratio-tol -1`, TS Hessian inversion on.
+Data: [`q2mm-data/benchmarks/<system>/from-qfuerza/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
+
+### Headline: QFUERZA-start vs published-start objective scores
+
+| System | Pub. final OF | QFUERZA final OF | QFUERZA/Pub. ratio | Verdict |
+|---|---:|---:|---:|:---|
+| rh-enamide   | 2.70 × 10⁵ | 2.78 × 10⁵ | **1.03×** | ✅ same basin |
+| pd-allyl     | 7.99 × 10⁶ | 7.98 × 10⁶ | **1.00×** | ✅ same basin |
+| pd-conjugate | 7.24 × 10⁶ | 8.25 × 10⁶ | 1.14× | ⚠ nearby basin, marginal |
+| rh-conjugate | 5.10 × 10⁶ | 1.78 × 10⁷ | 3.49× | ❌ different basin |
+| heck-relay   | 1.45 × 10⁶ | 1.45 × 10⁸ | **100×** | ❌ JaxLoss surrogate diverged |
+
+> A QFUERZA/Pub. ratio of `1×` means the QFUERZA-start optimizer
+> reaches the same objective as starting from published OPT values.
+
+### Per-system runs (QFUERZA start)
+
+| System | mol | n_active | QF overwritten | Pub. retained | Init OF | Final OF | Δ% | Ratio | Iters | Evals | Wall |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| rh-enamide   | 9  | 182 | 60  (33%) | 122 | 3.92 × 10⁵ | 2.78 × 10⁵ | **+28.9** | 1.05    | 11 | 2 | 721 s |
+| heck-relay   | 23 | 462 | 101 (22%) | 361 | 1.34 × 10⁸ | 1.45 × 10⁸ | **−7.70** | 1.9 × 10⁷⁴ | 0  | 2 | 1425 s |
+| pd-allyl     | 21 | 482 | 80  (17%) | 402 | 8.01 × 10⁶ | 7.98 × 10⁶ | +0.41     | 1.10    | 2  | 2 | 1290 s |
+| pd-conjugate | 10 | 340 | 96  (28%) | 244 | 8.23 × 10⁶ | 8.25 × 10⁶ | −0.20     | 1.21    | 1  | 2 | 626 s  |
+| rh-conjugate | 10 | 488 | 81  (17%) | 407 | 2.67 × 10⁷ | 1.78 × 10⁷ | **+33.4** | 0.52    | 2  | 2 | 745 s  |
+
+- *QF overwritten* counts active OPT rows whose values were replaced by
+  QFUERZA. The rest are *retained published* (mostly torsions at 0, plus
+  vdW which QFUERZA does not touch).
+- *Ratio* is JaxLoss/ObjectiveFunction at the starting FF. Values near 1
+  mean the JaxLoss surrogate tracks the real objective; values far from
+  1 mean the surrogate is unreliable.
+- *Evals = 2* on most systems is L-BFGS-B reporting that gradient
+  information at the starting point already places it close to a local
+  minimum.
+
+### Seminario starting-FF quality (QFUERZA bond/angle on published topology)
+
+R² values are the linear fit between MM-evaluated property and QM
+reference, computed at the starting FF. Negative R² indicates the model
+is worse than predicting the mean.
+
+| System | bond R² (start → opt) | angle R² (start → opt) | eig-diag R² (start → opt) |
+|---|---|---|---|
+| rh-enamide   | 0.976  → 0.984  | 0.934   → 0.953   | 0.972  → 0.970  |
+| heck-relay   | −247   → −147   | −7.95   → −6.35   | −4.66  → −4.66  |
+| pd-allyl     | 0.034  → 0.031  | 0.335   → 0.337   | −1.41  → −1.41  |
+| pd-conjugate | 0.448  → 0.427  | −0.114  → −0.114  | −4.47  → −4.47  |
+| rh-conjugate | −0.71  → −0.23  | −0.337  → −0.264  | −4.85  → −4.85  |
+
+heck-relay starts with **bond R² = −247** and **angle R² = −7.95** at
+the QFUERZA point. The Seminario projection produces large negative
+force constants on Pd-N-C angles (≈ −1.6 kcal/mol/rad²) which, after TS
+inversion, still leave a starting FF whose MM-bond-length predictions
+have RMSD orders of magnitude larger than reference. JaxLoss diverges
+to `1.9 × 10⁷⁴`, and L-BFGS-B exits in 0 iterations with the FF
+*worse* than the start (−7.70% improvement at the optimizer's chosen
+"converged" point).
+
+---
+
+## 4. What this tells us
+
+### rh-enamide ✅ — strong recovery
+Same basin as published-start (278k vs 270k, 3% gap). The starting
+Seminario fit is already good (bond R² > 0.97), JaxLoss ratio is
+healthy (1.05), and the optimizer makes a genuine 28.9% improvement.
+**This is the cleanest validation in the suite.**
+
+### pd-allyl ✅ — same basin, no improvement to make
+QFUERZA-start and published-start reach essentially identical objective
+(7.98M vs 7.99M, within 0.1%). The optimizer makes no significant
+improvement from either starting point (+0.41% vs +0.52%), suggesting
+both starting FFs sit in or near the same local minimum.
+
+### pd-conjugate ⚠ — nearby basin
+Published-start reaches 7.24M; QFUERZA-start gets stuck at 8.25M (14%
+worse). Negative improvement (−0.20%) from the QFUERZA start indicates
+L-BFGS-B converged to a different (and slightly worse) local minimum
+than the published FF.
+
+### rh-conjugate ❌ — different basin
+QFUERZA-start makes a real +33.4% improvement but the final score
+(17.8M) is 3.5× worse than the published-start final (5.1M). The
+optimizer is descending into a different basin entirely.
+
+### heck-relay ❌ — JaxLoss surrogate fails
+The starting Seminario projection produces such a poor MM PES (bond
+R² = −247) that the JaxLoss-implicit-diff geometry-relaxation surrogate
+explodes to `1.9 × 10⁷⁴`. L-BFGS-B exits in 0 iterations and the final
+FF is worse than the starting FF.
+
+This is consistent with the known JaxLoss limitation documented in
+[AGENTS.md §6](https://github.com/ericchansen/q2mm/blob/main/AGENTS.md):
+for TS systems with poor starting force fields, unconstrained MM
+geometry relaxation wanders to wrong minima and the surrogate becomes
+uninformative.
+
+---
+
+## 5. Interpretation
+
+The QFUERZA-recovery experiment is best read as a **basin diagnostic**,
+not a global-minimum search:
+
+- The q2mm pipeline (reference data, JaxLoss gradient, L-BFGS-B) is
+  validated for **local refinement** of an FF that is already near a
+  good basin (rh-enamide, pd-allyl).
+- When the starting FF is sufficiently far from the published basin
+  (rh-conjugate, pd-conjugate), L-BFGS-B finds *a* local minimum but
+  not the one the published authors found. This is expected for a local
+  optimizer on a non-convex landscape.
+- When the starting FF is *very* bad (heck-relay), the JaxLoss
+  surrogate breaks down and no useful optimization happens.
+
+**This is consistent with what the Farrugia 2025 paper warns about**:
+QFUERZA is an *initial-parameter* method, not a one-shot generator. The
+paper's full workflow includes per-molecule QFUERZA + iterative
+refinement *with constraints to prevent basin escape* — not the
+unconstrained L-BFGS-B we run here.
+
+To close the gap on the three failed systems, a future experiment
+should test:
+1. Trust-region or constrained optimization (bound box around starting
+   FF) to keep L-BFGS-B in the published basin.
+2. Multiple random restarts to characterize the basin landscape.
+3. A pre-conditioning step that improves the starting FF Seminario R²
+   before handing off to JaxLoss + L-BFGS-B.
+
+---
+
+## 6. How to reproduce
+
+```bash
+# 1. Verify GPU
+source /home/eric/repos/q2mm/.venv/bin/activate
+python -c "import jax; print(jax.devices())"   # must show CudaDevice
+
+# 2. Run single system
+python scripts/regenerate_convergence_results.py \
+    --system rh-enamide \
+    --starting-point qfuerza \
+    --ratio-tol -1 \
+    --output-dir /path/to/q2mm-data/benchmarks
+
+# 3. Inspect output
+ls /path/to/q2mm-data/benchmarks/rh-enamide/from-qfuerza/
+# validation_results.json   paper_metrics.json   rh-enamide_optimized.fld
+```
+
+The `--ratio-tol -1` flag bypasses the JaxLoss/ObjectiveFunction ratio
+gate (which would otherwise reject all 5 TS systems at the QFUERZA
+start because the surrogate is poorly aligned).
+
+The output `validation_results.json` includes a `starting_point_audit`
+block enumerating which OPT rows were overwritten by QFUERZA vs
+retained from the published FF.
+
+---
+
+## 7. Anchors
+
+- Code: [`q2mm/diagnostics/systems.py`](https://github.com/ericchansen/q2mm/blob/main/q2mm/diagnostics/systems.py) (`starting_point` parameter on `load_system`)
+- CLI: [`scripts/regenerate_convergence_results.py`](https://github.com/ericchansen/q2mm/blob/main/scripts/regenerate_convergence_results.py) (`--starting-point {published,qfuerza}`)
+- Tests: [`test/test_systems.py`](https://github.com/ericchansen/q2mm/blob/main/test/test_systems.py) (`TestStartingPoint`)
+- Data: [`q2mm-data/benchmarks/<system>/from-qfuerza/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks)
+- Method paper: Farrugia, Helquist, Norrby & Wiest, *J. Chem. Theory Comput.* **2025**, 22, 469. [10.1021/acs.jctc.5c01751](https://doi.org/10.1021/acs.jctc.5c01751)
+- Related: [QFUERZA Validation](qfuerza-validation.md) — starting-FF quality across all systems.
diff --git a/properdocs.yml b/properdocs.yml
index 50d3310a..59a1de7c 100644
--- a/properdocs.yml
+++ b/properdocs.yml
@@ -50,6 +50,7 @@ nav:
     - When Analytical Wins: benchmarks/crossover.md
     - Published FF Validation: benchmarks/published-ff-validation.md
     - QFUERZA Validation: benchmarks/qfuerza-validation.md
+    - QFUERZA Recovery: benchmarks/qfuerza-recovery.md
   - Systems:
     - Small Molecules: systems/small-molecules.md
     - Rh-Enamide: systems/rh-enamide.md

From e81cf0040305d374c69733f5b59a8e64846c397b Mon Sep 17 00:00:00 2001
From: Eric Hansen <ericchansen@gmail.com>
Date: Sun, 31 May 2026 10:23:10 -0500
Subject: [PATCH 3/7] feat: benchmark protocol guardrails for from-poor-start
 runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #290 first iteration shipped 5 systems with a silent-failure
optimizer protocol: L-BFGS-B exited at n_iterations<=2 with
negligible OF change for every system, scipy reported success=True,
and the misleading results made it into the comparison doc. This
commit adds the source-code, CLI, agent-skill and documentation
guardrails that prevent that pattern from recurring.

Source code
- q2mm/models/forcefield.py: add ForceField.get_fractional_bounds
  (fc_fraction, eq_fraction) returning a (val +/- frac*|val|) box
  per parameter, intersected with DEFAULT_BOUNDS sanity envelope.
  Sign-aware for TSFF negative force constants; falls back to sanity
  bounds when |val| < 1e-6 (frozen-at-zero torsions).
- q2mm/optimizers/scipy_opt.py: ScipyOptimizer accepts fc_fraction
  and eq_fraction; uses them for the bounds list when set. Adds a
  WARNING in _run_minimize when n_iterations<=2 and |delta|/init<1%
  — the silent-failure fingerprint.
- scripts/regenerate_convergence_results.py: add --ftol,
  --fc-fraction, --eq-fraction CLI flags (defaults preserve
  backward-compatible behavior for the existing convergence/
  baselines). Emit batch-level ERROR + non-zero exit when every
  optimized system fails the no-progress check.

Tests
- test/test_models.py: 5 new tests covering fractional bounds for
  positive/negative FCs, sanity-envelope intersection, zero-value
  fallback, and the None-passthrough.

Agent skills
- .copilot/skills/q2mm-benchmark/SKILL.md: forces the agent through
  the audit gate (run FIRST system, inspect n_iterations &
  improvement_pct, STOP if either fails) before launching any q2mm
  batch >30 min.
- .copilot/skills/q2mm-analysis-design/SKILL.md: forces the agent
  to restate the user's question and mock comparison tables BEFORE
  writing a benchmark analysis doc.

Documentation
- AGENTS.md: new section 11 "Benchmark Pre-Flight Checklist".
  Three new Common Pitfalls rows (n_iterations<=2 silent exit,
  default sanity bounds for from-poor-start runs, batch never
  optimized).

Recommended invocation for from-QFUERZA runs:
  python scripts/regenerate_convergence_results.py \
    --starting-point qfuerza --ratio-tol none \
    --ftol 1e-12 --fc-fraction 0.20 --eq-fraction 0.05

Heck-relay specifically: tighter bounds (--fc-fraction 0.05 per
AGENTS.md memory) due to the fragile TS landscape with large
negative force constants.

Refs: ericchansen/q2mm#290 (iterating)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .copilot/skills/q2mm-analysis-design/SKILL.md |  90 ++++++++++++
 .copilot/skills/q2mm-benchmark/SKILL.md       | 137 ++++++++++++++++++
 AGENTS.md                                     |  44 ++++++
 q2mm/models/forcefield.py                     |  73 ++++++++++
 q2mm/optimizers/scipy_opt.py                  |  60 +++++++-
 scripts/regenerate_convergence_results.py     |  70 ++++++++-
 test/test_models.py                           |  52 +++++++
 7 files changed, 522 insertions(+), 4 deletions(-)
 create mode 100644 .copilot/skills/q2mm-analysis-design/SKILL.md
 create mode 100644 .copilot/skills/q2mm-benchmark/SKILL.md

diff --git a/.copilot/skills/q2mm-analysis-design/SKILL.md b/.copilot/skills/q2mm-analysis-design/SKILL.md
new file mode 100644
index 00000000..c751f023
--- /dev/null
+++ b/.copilot/skills/q2mm-analysis-design/SKILL.md
@@ -0,0 +1,90 @@
+---
+name: q2mm-analysis-design
+description: 'MANDATORY before writing any q2mm benchmark analysis, comparison, or validation document. Forces restating the user question verbatim, listing deliverable artifacts (table headers, figure captions), mapping each artifact back to the question, and flagging mismatches BEFORE writing the doc. Use when about to create or rewrite docs/benchmarks/*.md, when about to produce a comparison page, when summarizing benchmark results, or when packaging a PR with new results.'
+license: MIT
+allowed-tools: Read, Edit, Grep
+---
+
+# q2mm Analysis Design
+
+A benchmark analysis page that doesn't answer the user's literal question is worse than no page at all — it ships a wrong-but-confident answer that has to be retracted. This skill exists because a previous agent shipped a 230-line page comparing `final_obj_score` when the user had asked about parameter values and R² vs published papers.
+
+## Step 1 — Restate the user's question VERBATIM
+
+Find the user message that triggered this work and quote it exactly. Do not paraphrase. Do not "interpret." If you can't find it, ask.
+
+Then write what the user literally asked for as a single declarative sentence: *"The user wants ___."*
+
+If the question has multiple parts, enumerate them: *"The user wants (a) X, (b) Y, (c) Z."*
+
+## Step 2 — List every deliverable artifact
+
+Enumerate what will exist when the doc is done. For each artifact, write:
+- **Type**: table, figure, paragraph, code block
+- **Content**: what columns/rows/axes/topic
+- **Source**: where the data comes from (JSON path, paper DOI, computed from FFs, etc.)
+
+Example:
+- *Table 3.1*: per-system R² comparison. Columns: published-paper R², q2mm-from-published R², q2mm-from-QFUERZA R². Rows: bond length, bond angle, eigenvalue diagonal. Source: `q2mm-data/benchmarks/<system>/{convergence,from-qfuerza}/validation_results.json` for q2mm; published-paper PDFs for paper R².
+- *Table 3.2*: per-parameter abs deviation. Columns: param-id, published value, QFUERZA-optimized value, abs deviation, % deviation, chemical motif. Source: parsed from `<system>_optimized.fld` files.
+- *Paragraph 4.1*: physical-chemistry walkthrough of the 5 largest bond-length deviations in rh-enamide. Source: synthesis from Table 3.2 + chemistry knowledge.
+
+If you cannot enumerate every artifact in ≤ 15 bullets, the scope is too broad — break it into a separate doc.
+
+## Step 3 — Map each artifact to a piece of the user's question
+
+For each numbered artifact in Step 2, write which part of the user's question (from Step 1) it addresses.
+
+Format: *"Table 3.1 answers question part (a): are q2mm-optimized R² values close to published-paper R²?"*
+
+If an artifact does NOT map to any part of the question, ask yourself:
+- Is this scope creep? → drop the artifact
+- Is it scaffolding the reader needs? → keep but mark as "background"
+- Did I misunderstand the question? → revisit Step 1
+
+If a question part has NO artifact mapped to it, this is the critical failure mode:
+- Add the missing artifact, OR
+- Explicitly note in the doc "We did not answer (X) because ___" and tell the user why
+
+## Step 4 — Flag mismatches BEFORE writing
+
+Before writing a single sentence of the doc:
+
+- [ ] Every user-question part has at least one artifact answering it
+- [ ] Every artifact maps to a user-question part (or is marked background)
+- [ ] The success-pass criterion is stated explicitly: how does the reader know the answer is "yes" or "no"?
+- [ ] Negative results are accommodated: if QFUERZA fails to recover the published params on system X, the doc still has a place for that finding (not just "successful systems only")
+
+If any box is unchecked, stop and fix the design. Do not start writing.
+
+## Step 5 — Now write the doc
+
+Open with:
+1. The user's question (paraphrased gracefully for the published audience)
+2. The TL;DR answer with the actual headline number
+3. A roadmap of what's in the doc
+
+Then walk through the artifacts in the order you listed them in Step 2. Each section ends with a "what this means" paragraph that ties back to the user's question.
+
+## Step 6 — Self-critique pass
+
+Before declaring the doc done:
+- Re-read the user's question (Step 1).
+- Read your TL;DR.
+- Does the TL;DR answer the question? If no, the doc is wrong even if every section is correct.
+- Are there places where the doc says "we did X" but should say "the data showed Y"?
+- Are there places where the doc editorializes ("a strong validation", "a remarkable result") without evidence?
+
+Per AGENTS.md §2 rule 8: *"Every claim must be grounded in evidence. ... do not embellish, glorify, or editorialize."*
+
+## Common mismatches to refuse
+
+- User asks about parameters → doc compares objective scores (proxy, not the question)
+- User asks about R² → doc compares improvement percentages (proxy, not the question)
+- User asks about physical-chemistry interpretation → doc only has numbers (no chemistry)
+- User asks "did we recover the published FF?" → doc shows only "did the optimizer converge to *a* minimum?" (different question)
+- User asks for a comparison → doc shows results from only one condition (missing the comparison)
+
+## Output
+
+After Step 4, write the design as a checklist in the session plan or a markdown comment block. The doc author (you, in the next session) reads this before writing. Reviewers (the user) read this to confirm scope alignment.
diff --git a/.copilot/skills/q2mm-benchmark/SKILL.md b/.copilot/skills/q2mm-benchmark/SKILL.md
new file mode 100644
index 00000000..576167ad
--- /dev/null
+++ b/.copilot/skills/q2mm-benchmark/SKILL.md
@@ -0,0 +1,137 @@
+---
+name: q2mm-benchmark
+description: 'MANDATORY before launching any q2mm batch optimization run >30 minutes (regenerate_convergence_results.py, multi-system convergence runs, from-QFUERZA runs, etc.). Forces success-spec write, optimizer config sanity check, FIRST-system audit gate before launching the rest, and post-batch validation. Use when user says "run benchmarks", "regenerate convergence results", "run all systems", "QFUERZA recovery", or any variant of launching a multi-system q2mm optimization batch.'
+license: MIT
+allowed-tools: Bash, Read, Edit, Grep, Glob
+---
+
+# q2mm Benchmark Pre-Flight & Audit
+
+A multi-hour GPU benchmark batch that produces useless results is worse than one that fails fast. This skill exists because a previous agent shipped a flawed batch (5 systems, ~4 GPU-hours) where the optimizer exited at `n_evaluations=2` on every system without actually optimizing, and then wrote a PR comparing the wrong metric. Don't repeat that.
+
+**The cardinal rule**: AUDIT THE FIRST SYSTEM BEFORE LAUNCHING THE REST.
+
+## Step 1 — Write the success spec (mandatory)
+
+Before any other work, write a paragraph in the session plan answering:
+
+1. **What is the user's literal question?** Quote it verbatim.
+2. **What deliverable artifacts will exist when this is done?** Be specific: "a markdown page with three tables: (a) per-system R² of q2mm-optimized FF vs published-paper R², (b) per-parameter abs deviation tables, (c) physical-chemistry walkthrough of top-5 deviations per system."
+3. **What numerical results count as 'success'?** Examples: "QFUERZA-start final OF within 20% of published-start final OF on at least 3/5 systems" or "per-param mean abs deviation < 10% on bond_eq."
+4. **What would make me declare the batch failed and stop?** Examples: "All systems exit at `n_evals<=2`" or "R² is negative on the optimized FF."
+
+If you cannot answer (1)–(4) in five sentences, stop and ask the user to clarify. Do not launch the batch.
+
+## Step 2 — Sanity-check the optimizer config
+
+Read these defaults in the q2mm source and confirm they're appropriate for the starting point:
+
+### Bounds (`q2mm/models/forcefield.py` → `DEFAULT_BOUNDS`)
+
+The defaults are **physical sanity bounds**, not "stay near starting FF":
+- `bond_k: (-3600, 3600)` kcal/mol/Å²
+- `angle_k: (-720, 720)` kcal/mol/rad²
+- `bond_eq: (0.5, 3.0)` Å
+- `angle_eq: (30, 180)` deg
+
+For **from-published-OPT** runs, sanity bounds are usually fine (starting FF is already good).
+
+For **from-QFUERZA** or any **from-poor-start** runs, sanity bounds let L-BFGS-B escape the QFUERZA basin into a worse local minimum. Use fractional bounds around the starting value:
+- Typical: `bounds_fraction_fc=0.20`, `bounds_fraction_eq=0.05`
+- Fragile systems (heck-relay): `bounds_fraction_fc=0.05`, `bounds_fraction_eq=0.02`
+
+Pass via `--bounds-fraction-fc` / `--bounds-fraction-eq` CLI flags on `scripts/regenerate_convergence_results.py`.
+
+### Convergence tolerance (`scipy_opt.py` → `_run_minimize`)
+
+Default L-BFGS-B `factr=1e7` is very loose — `nfev` will often be ≤ 5 with no real optimization. Override with `--factr 1e2` (or tighter) for any run where you actually want the optimizer to work.
+
+### Ratio gate (`scipy_opt.py` → ratio check)
+
+For TS systems with a poor starting FF, the JaxLoss/ObjectiveFunction ratio can be 0.1–0.4 or even diverge to 1e74 (heck-relay from QFUERZA). The default ratio check rejects these. Two options:
+- `--ratio-tol -1` to bypass entirely (use for from-QFUERZA TS runs)
+- Document the explosion honestly in the analysis instead of pretending it didn't happen
+
+## Step 3 — Pre-flight checklist
+
+Walk through this LITERALLY before running. Each item must be checked.
+
+- [ ] Goal restated as a success spec (Step 1)
+- [ ] Bounds strategy chosen and matches starting-point quality (Step 2)
+- [ ] `factr` / `gtol` tight enough for real optimization
+- [ ] Per-system overrides documented if any system needs special handling
+- [ ] GPU verified: `python -c "import jax; print(jax.devices())"` shows CudaDevice
+- [ ] Output directory chosen (e.g., `q2mm-data/benchmarks/<system>/from-qfuerza/`)
+- [ ] PYTHONPATH set if running from a worktree (editable install points to master)
+
+## Step 4 — Run the FIRST system in isolation
+
+Do NOT launch all systems in a batch. Run **only the first system**:
+
+```bash
+PYTHONPATH=/path/to/worktree python scripts/regenerate_convergence_results.py \
+    --system <first-system> \
+    --starting-point <published|qfuerza> \
+    --factr 1e2 \
+    --bounds-fraction-fc 0.20 \
+    --bounds-fraction-eq 0.05 \
+    --ratio-tol <value> \
+    --output-dir /path/to/q2mm-data/benchmarks
+```
+
+## Step 5 — AUDIT GATE (do not skip)
+
+After the first system completes, inspect `<output-dir>/validation_results.json`:
+
+```python
+import json
+with open("<first-system>/from-qfuerza/validation_results.json") as f:
+    r = json.load(f)["result"]
+print("n_evaluations:", r["n_evaluations"])
+print("n_iterations:", r["n_iterations"])
+print("real OF:      ", r["initial_obj_score"], "→", r["final_obj_score"])
+print("improvement%: ", r["improvement_pct"])
+print("ratio:        ", r["ratio"])
+print("Seminario R²: ", r["seminario"])
+print("Optimized R²: ", r["optimized"])
+```
+
+**Pass criteria** (all must hold or you must explicitly justify deviation):
+- `n_evaluations > 5` (the optimizer actually evaluated the gradient)
+- `n_iterations > 3` (it took real steps)
+- `|improvement_pct| > 1` (the real OF actually changed)
+- Optimized R² ≥ Seminario R² on each metric (the run didn't make things worse)
+
+**Fail criteria** (stop immediately if any holds):
+- `n_evaluations ≤ 2` AND `|improvement_pct| < 1` → **optimizer did not optimize**. Likely `factr` too loose or bounds too wide. Do NOT launch the remaining systems. Diagnose and re-run.
+- `ratio > 100` AND `improvement_pct < 0` → JaxLoss surrogate diverged AND the optimizer made the FF worse. Diagnose: tighter bounds, FC clamping, different starting point.
+- Optimized R² < Seminario R² → the run degraded the FF. Diagnose before continuing.
+
+## Step 6 — Launch the rest (only if Step 5 passes)
+
+Launch the remaining systems, one at a time or in a small batch. Continue to spot-check intermediate results; if a system shows the same `nfev<=2` failure mode that didn't appear in the first system, stop and diagnose.
+
+## Step 7 — Post-batch validation
+
+Before writing the analysis doc:
+- All systems have `n_evaluations > 5` (or each exception is documented with a chemical/physical reason)
+- All systems have `validation_results.json` with full provenance
+- Spot-check at least two `validation_results.json` files for sanity
+
+If a benchmark batch ends with multiple systems exiting at `n_evals=2`, that's a silent protocol failure — even if all the JSON files were written.
+
+## Quick reference: things that should make you stop and ask
+
+- "How do I know if the optimizer actually optimized?" → check `n_evaluations` and real OF delta
+- "Should I use sanity bounds or fractional bounds?" → sanity for from-published, fractional for from-QFUERZA
+- "Why does `nfev=2` happen on every system?" → default `factr` is 1e7, way too loose; use `--factr 1e2`
+- "Heck-relay's ratio is 1e74, is that OK?" → no, the JaxLoss surrogate exploded; document honestly and consider tighter bounds or FF pre-conditioning
+- "Should I just bypass the ratio gate with `--ratio-tol -1`?" → only if you understand why it's failing; ratio gate exists for a reason
+
+## Anti-patterns to refuse
+
+- Launching all 5 systems in a batch without auditing the first one
+- Shipping results where `n_evaluations <= 2` on every system as if optimization happened
+- Comparing only `final_obj_score` when the user asked about parameter values or R²
+- Bypassing the ratio gate with `--ratio-tol -1` without diagnosing why it's failing
+- Writing the analysis doc before re-reading the user's literal question
diff --git a/AGENTS.md b/AGENTS.md
index da3c04b7..fa3e7885 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -388,6 +388,9 @@ metadata in this project.
 | **jaxopt zoom linesearch** | `jaxopt ≥ 0.8.5` LBFGS default `linesearch="zoom"` triggers 30–60 min extra XLA compilation post-JIT | Use `scipy-lbfgsb-jax` instead — same L-BFGS-B algorithm, completes in seconds post-JIT |
 | **TS ratio check fallback** | `ScipyOptimizer(jac='auto')` ratio check fails for ALL TS systems (0.1–0.4), silently falls back to slow FD gradients | Set `ratio_tol=None` to bypass. CLI key: `scipy-lbfgsb-jax` |
 | **Heck relay bounds** | ±20% bounds cause 35–92% NaN rate due to fragile TS landscape with large negative FCs (−3753) | Use ±5% bounds for heck-relay specifically |
+| **`n_iterations<=2` silent exit** | L-BFGS-B "converges" after 0–2 iterations with negligible OF change — the optimizer didn't optimize. Common with from-poor-start runs and loose `ftol`. | Tighten `--ftol` (e.g. `1e-12`); apply `--fc-fraction`/`--eq-fraction` to keep optimizer in starting basin; check JaxLoss/OF ratio. The new diagnostic warning in `scipy_opt._run_minimize` flags this. |
+| **Default sanity bounds for from-poor-start runs** | `DEFAULT_BOUNDS` (bond_k ±3600, bond_eq 0.5–3.0 Å) let L-BFGS-B escape the QFUERZA / random-default starting basin → final FF unrelated to start | Use `ScipyOptimizer(fc_fraction=0.20, eq_fraction=0.05)` (or CLI flags `--fc-fraction --eq-fraction`) to bound each param to a ± fraction of its current value. |
+| **Benchmark batch never optimized** | All systems exit at `nfev≤2` with no OF change but the batch reports "success" | The runner now emits ERROR + non-zero exit code (`scripts/regenerate_convergence_results.py`). Re-tune `ftol` or bounds and re-run. See §11. |
 
 ---
 
@@ -448,3 +451,44 @@ See `validation/published_ffs/README.md` for the full table. As of April 2026:
 - **4 systems ready to implement** (Heck, Pd-allyl, Pd 1,4-conj, Ferrocene)
   — QM data available from dissertation supporting info
 - **3 systems blocked** (OsO₄, Ru ketone, Sulfone) — no QM training data
+
+---
+
+## 11. Benchmark Pre-Flight Checklist
+
+> **Before launching any q2mm batch >30 min (e.g. `regenerate_convergence_results.py` on >1 system, or any from-scratch FF generation), walk through every step below.**
+
+Many hours of GPU time have been wasted on batches where the optimizer never actually optimized. The pattern is silent — scipy reports `success=True`, the runner writes its JSON, and the misleading result is only caught during post-hoc analysis. This checklist prevents that.
+
+1. **Write a measurable success spec** in your plan/PR description before launching:
+   - What metric defines "the optimizer worked"? (e.g. `n_iterations > 5 AND real-OF improvement > 10%`)
+   - What does the comparison table look like? Mock the rows/columns now.
+   - If the user asked a specific scientific question (e.g. "do params end up near published?"), restate it verbatim and map each metric back to the question.
+
+2. **Sanity-check optimizer config** for the starting point:
+   - From published FF baseline: `ftol=1e-8`, sanity bounds → fine.
+   - From poor start (QFUERZA, random defaults): use `--ftol 1e-12` and `--fc-fraction 0.20 --eq-fraction 0.05` (or `--fc-fraction 0.05` for heck-relay).
+   - Always pass `--ratio-tol none` for TS systems (ratios are 0.1–0.4).
+
+3. **Verify GPU + device** (§4 + §7): `python -c "import jax; print(jax.devices())"` → must show `CudaDevice`.
+
+4. **Run the FIRST system alone.** Do NOT launch all systems sequentially in one call.
+
+5. **AUDIT GATE — read the first system's JSON before launching the rest:**
+   - `n_iterations > 5` (not just `n_evaluations`; that counter is misleading on the JaxLoss path — see `scipy_opt._run_minimize`).
+   - `|improvement_pct| > 1%` on the real ObjectiveFunction (the `improvement_pct` field, not `final_optimizer_score`).
+   - JaxLoss `initial_jaxloss / initial_obj_score` ratio in `[0.1, 10]` (or document why it's outside).
+   - Per-category R² (`seminario` → `optimized_categories`) improves on at least one of bond_length, bond_angle, eigenmatrix.
+   - If ANY of these fail, **STOP**. Re-tune and re-run the single system. Do not launch the batch.
+
+6. **Launch remaining systems.** Check `nvidia-smi` periodically (>50% utilization expected).
+
+7. **Post-batch validation** — read every JSON, not just the combined output. The runner emits a batch-level ERROR + non-zero exit code if all optimized systems failed the no-progress check; treat that as a hard failure even if individual files exist.
+
+### Source-code safety nets
+
+These exist to back up the checklist; do not rely on them alone:
+- `q2mm/optimizers/scipy_opt.py::_run_minimize` — WARNING when `n_iterations<=2` and `|delta|/init<0.01`
+- `scripts/regenerate_convergence_results.py::main` — ERROR + non-zero exit when the whole batch failed the no-progress test
+- `.copilot/skills/q2mm-benchmark/SKILL.md` — agent skill that walks through this checklist automatically before launching any batch
+- `.copilot/skills/q2mm-analysis-design/SKILL.md` — agent skill that forces design-first analysis methodology before writing comparison docs
diff --git a/q2mm/models/forcefield.py b/q2mm/models/forcefield.py
index 21934702..bd238d03 100644
--- a/q2mm/models/forcefield.py
+++ b/q2mm/models/forcefield.py
@@ -560,6 +560,79 @@ def get_active_bounds(self) -> np.ndarray:
             return bounds.reshape(0, 2)
         return bounds[self.active_mask]
 
+    def get_fractional_bounds(
+        self,
+        fc_fraction: float | None = None,
+        eq_fraction: float | None = None,
+    ) -> list[tuple[float, float]]:
+        """Get bounds as a fractional box around each parameter's current value.
+
+        Unlike :meth:`get_bounds`, which returns physical sanity bounds (e.g.
+        ``bond_k ∈ (-3600, 3600)``), this returns ``(val * (1 - frac), val *
+        (1 + frac))`` for each parameter, with sign-aware handling. This is the
+        appropriate strategy for **from-poor-start** runs (e.g.
+        ``starting_point="qfuerza"``) where you want the optimizer to refine
+        the starting FF locally instead of escaping the starting basin.
+
+        Force-constant types (``bond_k``, ``angle_k``, ``torsion_k``, ``sb_k``,
+        ``vdw_epsilon``, ``ub_k``) use ``fc_fraction``.
+        Equilibrium types (``bond_eq``, ``angle_eq``, ``vdw_radius``,
+        ``ub_eq``) use ``eq_fraction``.
+
+        Parameters
+        ----------
+        fc_fraction : float, optional
+            Fractional box width for force-constant parameters. ``None``
+            means use the corresponding sanity bound from
+            :attr:`DEFAULT_BOUNDS`.
+        eq_fraction : float, optional
+            Fractional box width for equilibrium parameters. ``None``
+            means use the corresponding sanity bound.
+
+        Returns
+        -------
+        list[tuple[float, float]]
+            Bounds list in the same layout as :meth:`get_param_vector`.
+
+        Notes
+        -----
+        - For parameters with ``|value| < 1e-6`` (effectively zero, e.g.
+          frozen torsions), falls back to sanity bounds because a
+          ``±0%`` window would collapse to a single point.
+        - The returned box is intersected with the sanity bounds from
+          :attr:`DEFAULT_BOUNDS` so the optimizer never explores
+          unphysical regions.
+
+        """
+        if fc_fraction is None and eq_fraction is None:
+            return self.get_bounds()
+
+        FC_TYPES = {"bond_k", "angle_k", "torsion_k", "sb_k", "vdw_epsilon", "ub_k"}
+        EQ_TYPES = {"bond_eq", "angle_eq", "vdw_radius", "ub_eq"}
+
+        vec = self.get_param_vector()
+        labels = self.get_param_type_labels()
+        bounds: list[tuple[float, float]] = []
+        for val, lbl in zip(vec, labels, strict=True):
+            sanity_lo, sanity_hi = self.DEFAULT_BOUNDS[lbl]
+            frac: float | None
+            if lbl in FC_TYPES:
+                frac = fc_fraction
+            elif lbl in EQ_TYPES:
+                frac = eq_fraction
+            else:  # pragma: no cover — defensive
+                frac = None
+
+            if frac is None or abs(val) < 1e-6:
+                bounds.append((sanity_lo, sanity_hi))
+                continue
+
+            window = frac * abs(val)
+            lo = max(sanity_lo, val - window)
+            hi = min(sanity_hi, val + window)
+            bounds.append((lo, hi))
+        return bounds
+
     # --- Parameter matching with ff_row → env_id → element fallback ---
 
     def match_bond(
diff --git a/q2mm/optimizers/scipy_opt.py b/q2mm/optimizers/scipy_opt.py
index 08cc3a35..cac5a402 100644
--- a/q2mm/optimizers/scipy_opt.py
+++ b/q2mm/optimizers/scipy_opt.py
@@ -208,6 +208,8 @@ def __init__(
         divergence_factor: float | None = 3.0,
         divergence_patience: int = 5,
         ratio_tol: float | None = 0.15,
+        fc_fraction: float | None = None,
+        eq_fraction: float | None = None,
     ) -> None:
         """Initialize the optimizer.
 
@@ -226,6 +228,14 @@ def __init__(
                 ObjectiveFunction ratio check when ``jac='auto'``.
                 Set to ``None`` to skip the check and always use
                 JaxLoss analytical gradients.
+            fc_fraction (float | None): Fractional bounds for
+                force-constant parameters. When set, bounds are
+                ``(val ± fc_fraction * |val|)`` instead of sanity bounds.
+                Use this for **from-poor-start** runs (e.g. starting
+                from QFUERZA) to prevent the optimizer from escaping the
+                starting basin. ``None`` means use sanity bounds.
+            eq_fraction (float | None): Same as ``fc_fraction`` but
+                applied to equilibrium parameters.
 
         """
         self.method = method
@@ -238,6 +248,8 @@ def __init__(
         self.divergence_factor = divergence_factor
         self.divergence_patience = divergence_patience
         self.ratio_tol = ratio_tol
+        self.fc_fraction = fc_fraction
+        self.eq_fraction = eq_fraction
 
     def optimize(self, objective: ObjectiveFunction) -> OptimizationResult:
         """Run the optimization.
@@ -263,14 +275,34 @@ def optimize(self, objective: ObjectiveFunction) -> OptimizationResult:
         has_frozen = ff.n_active_params < ff.n_params
         wrapped_objective: ObjectiveFunction | _ActiveObjectiveWrapper = objective
 
+        use_fractional = self.fc_fraction is not None or self.eq_fraction is not None
+        if use_fractional and not self.use_bounds:
+            logger.warning("fc_fraction/eq_fraction set but use_bounds=False — ignoring fractional bounds.")
+
         if has_frozen:
             wrapped_objective = _ActiveObjectiveWrapper(objective, ff.active_mask, initial_full)
             x0 = ff.get_active_param_vector().copy()
-            bounds = ff.get_active_bounds().tolist() if self.use_bounds else None
+            if self.use_bounds:
+                if use_fractional:
+                    full_bounds = np.asarray(
+                        ff.get_fractional_bounds(self.fc_fraction, self.eq_fraction),
+                        dtype=float,
+                    )
+                    bounds = full_bounds[ff.active_mask].tolist()
+                else:
+                    bounds = ff.get_active_bounds().tolist()
+            else:
+                bounds = None
             expand = wrapped_objective.expand
         else:
             x0 = initial_full.copy()
-            bounds = ff.get_bounds() if self.use_bounds else None
+            if self.use_bounds:
+                if use_fractional:
+                    bounds = ff.get_fractional_bounds(self.fc_fraction, self.eq_fraction)
+                else:
+                    bounds = ff.get_bounds()
+            else:
+                bounds = None
 
             def expand(x: np.ndarray) -> np.ndarray:
                 return np.asarray(x, dtype=float).copy()
@@ -529,12 +561,34 @@ def _jax_loss_fun(x_active: np.ndarray) -> tuple[float, np.ndarray]:
         else:
             message = str(scipy_result.message)
 
+        # Diagnostic: silent-failure detection.  L-BFGS-B can return
+        # "convergence" after 0-2 iterations when the line search can't
+        # find a descent direction (e.g. JaxLoss-vs-OF mismatch, or
+        # ftol is too loose).  Surface this so the next run isn't
+        # mistaken for "the optimizer did its best".
+        nit = int(scipy_result.get("nit", 0))
+        if initial_score > 0 and nit <= 2 and abs(final_score - initial_score) / initial_score < 0.01:
+            logger.warning(
+                "%s exited after %d iteration(s) with negligible change "
+                "(initial=%.4g, final=%.4g, |Δ|/init=%.2e). The optimizer "
+                "likely did NOT optimize. Common causes: (a) ftol too loose "
+                "for this scale of objective, (b) JaxLoss/ObjectiveFunction "
+                "ratio is far from 1 (surrogate gradients unreliable), "
+                "(c) bounds clamp the starting point. Last scipy message: %r",
+                self.method,
+                nit,
+                initial_score,
+                final_score,
+                abs(final_score - initial_score) / initial_score,
+                message,
+            )
+
         return OptimizationResult(
             success=bool(scipy_result.success),
             message=message,
             initial_score=objective.history[0] if objective.history else 0.0,
             final_score=final_score,
-            n_iterations=int(scipy_result.get("nit", 0)),
+            n_iterations=nit,
             n_evaluations=objective.n_eval - n_eval_before,
             initial_params=x0,
             final_params=final_x,
diff --git a/scripts/regenerate_convergence_results.py b/scripts/regenerate_convergence_results.py
index af3edd47..39de6d08 100644
--- a/scripts/regenerate_convergence_results.py
+++ b/scripts/regenerate_convergence_results.py
@@ -146,6 +146,9 @@ def _build_provenance(args: argparse.Namespace, output_dir: Path) -> dict[str, A
         "q2mm_data": data_git,
         "ratio_tol": args.ratio_tol,
         "maxiter": args.maxiter,
+        "ftol": args.ftol,
+        "fc_fraction": args.fc_fraction,
+        "eq_fraction": args.eq_fraction,
         "n_evals": args.n_evals,
         "skip_optimization": args.skip_optimization,
         "starting_point": args.starting_point,
@@ -371,6 +374,9 @@ def _run_optimization(
     ratio_tol: float | None,
     maxiter: int,
     n_evals: int,
+    ftol: float = 1e-8,
+    fc_fraction: float | None = None,
+    eq_fraction: float | None = None,
 ) -> dict[str, Any]:
     """Run scipy L-BFGS-B with JaxLoss gradients; return summary dict."""
     from q2mm.optimizers.objective import ObjectiveFunction
@@ -380,9 +386,12 @@ def _run_optimization(
     opt = ScipyOptimizer(
         method="L-BFGS-B",
         maxiter=maxiter,
+        ftol=ftol,
         verbose=True,
         jac="auto",
         ratio_tol=ratio_tol,
+        fc_fraction=fc_fraction,
+        eq_fraction=eq_fraction,
     )
     t0 = time.perf_counter()
     result = opt.optimize(obj)
@@ -430,6 +439,9 @@ def process_system(
     skip_optimization: bool,
     starting_point: str,
     provenance: dict[str, Any],
+    ftol: float = 1e-8,
+    fc_fraction: float | None = None,
+    eq_fraction: float | None = None,
 ) -> dict[str, Any]:
     """Process one system end-to-end and write its artifacts."""
     from q2mm.backends.mm.jax_engine import JaxEngine
@@ -494,6 +506,9 @@ def process_system(
             ratio_tol=ratio_tol,
             maxiter=maxiter,
             n_evals=n_evals,
+            ftol=ftol,
+            fc_fraction=fc_fraction,
+            eq_fraction=eq_fraction,
         )
         optimized_ff = opt_result.pop("optimized_ff")
         optimized_categories = opt_result.pop("optimized_categories")
@@ -614,6 +629,30 @@ def main() -> int:
         default=500,
         help="Maximum L-BFGS-B iterations per optimization.",
     )
+    parser.add_argument(
+        "--ftol",
+        type=float,
+        default=1e-8,
+        help="L-BFGS-B function-value tolerance. Tighten (e.g. 1e-12) "
+        "for from-poor-start runs where the default exits too soon.",
+    )
+    parser.add_argument(
+        "--fc-fraction",
+        type=float,
+        default=None,
+        help="Fractional bounds for force-constant parameters: each FC is "
+        "clamped to (val ± fc_fraction*|val|). Use for from-poor-start runs "
+        "(e.g. --starting-point qfuerza) to keep the optimizer in the "
+        "starting basin. Recommended: 0.20 (i.e. ±20%%). Omit for sanity bounds.",
+    )
+    parser.add_argument(
+        "--eq-fraction",
+        type=float,
+        default=None,
+        help="Fractional bounds for equilibrium parameters (bond_eq, angle_eq, "
+        "vdw_radius, ub_eq): each is clamped to (val ± eq_fraction*|val|). "
+        "Recommended: 0.05 (i.e. ±5%%). Omit for sanity bounds.",
+    )
     parser.add_argument(
         "--n-evals",
         type=_parse_positive_int,
@@ -670,9 +709,12 @@ def main() -> int:
     logger.info("Output directory: %s", output_dir)
     logger.info("Systems: %s", systems)
     logger.info(
-        "ratio_tol=%s, maxiter=%d, n_evals=%d, starting_point=%s",
+        "ratio_tol=%s, maxiter=%d, ftol=%.2e, fc_fraction=%s, eq_fraction=%s, n_evals=%d, starting_point=%s",
         args.ratio_tol,
         args.maxiter,
+        args.ftol,
+        args.fc_fraction,
+        args.eq_fraction,
         args.n_evals,
         args.starting_point,
     )
@@ -690,6 +732,9 @@ def main() -> int:
                 skip_optimization=args.skip_optimization,
                 starting_point=args.starting_point,
                 provenance=provenance,
+                ftol=args.ftol,
+                fc_fraction=args.fc_fraction,
+                eq_fraction=args.eq_fraction,
             )
             combined[sys_key] = summary
         except Exception:
@@ -703,6 +748,29 @@ def main() -> int:
         )
         logger.info("Wrote combined output: %s", args.combined_output)
 
+    # Batch-level silent-failure detection.  If we optimized any system
+    # but every one of them exited at n_iterations <= 2 with negligible
+    # change in the real ObjectiveFunction, the batch did not optimize.
+    # See AGENTS.md §11 (Benchmark Pre-Flight Checklist).
+    optimized = [s for s in combined.values() if not s.get("skipped") and s.get("n_iterations") is not None]
+    if optimized and not args.skip_optimization:
+        no_progress = [
+            s
+            for s in optimized
+            if int(s.get("n_iterations", 0)) <= 2 and abs(float(s.get("improvement_pct", 0.0))) < 1.0
+        ]
+        if len(no_progress) == len(optimized):
+            logger.error(
+                "BATCH FAILURE: all %d optimized system(s) exited at "
+                "n_iterations<=2 with |improvement_pct|<1%%. The optimizer "
+                "did NOT optimize. Inspect ratio_tol, ftol, bounds, and "
+                "starting force field. Systems: %s",
+                len(optimized),
+                [s.get("system") for s in no_progress],
+            )
+            if not failures:
+                failures.append("batch_no_progress")
+
     if failures:
         logger.error("Failed systems: %s", failures)
         return 1
diff --git a/test/test_models.py b/test/test_models.py
index 3c293595..e059d38c 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -435,6 +435,58 @@ def test_negative_fc_in_param_vector_roundtrip(self) -> None:
         assert ff2.bonds[0].force_constant == pytest.approx(-49.6)
         assert ff2.angles[0].force_constant == pytest.approx(-10.8)
 
+    def test_fractional_bounds_around_current_values(self) -> None:
+        """get_fractional_bounds wraps each param in a (val ± frac*|val|) box."""
+        ff = ForceField(
+            bonds=[BondParam(("C", "F"), 1.38, 359.7)],
+            angles=[AngleParam(("H", "C", "F"), 109.5, 36.0)],
+        )
+        bounds = ff.get_fractional_bounds(fc_fraction=0.20, eq_fraction=0.05)
+        # Layout: bond_k, bond_eq, angle_k, angle_eq
+        assert bounds[0] == pytest.approx((359.7 * 0.8, 359.7 * 1.2), rel=1e-6)
+        assert bounds[1] == pytest.approx((1.38 * 0.95, 1.38 * 1.05), rel=1e-6)
+        assert bounds[2] == pytest.approx((36.0 * 0.8, 36.0 * 1.2), rel=1e-6)
+        assert bounds[3] == pytest.approx((109.5 * 0.95, 109.5 * 1.05), rel=1e-6)
+
+    def test_fractional_bounds_sign_aware_for_negative_fc(self) -> None:
+        """Negative force constants (TSFF) must produce valid (lo < hi) bounds."""
+        ff = ForceField(bonds=[BondParam(("C", "F"), 1.38, -49.6)])
+        bounds = ff.get_fractional_bounds(fc_fraction=0.20, eq_fraction=0.05)
+        bond_k_lo, bond_k_hi = bounds[0]
+        # |val|=49.6, window=9.92; box = (-49.6-9.92, -49.6+9.92) = (-59.52, -39.68)
+        assert bond_k_lo == pytest.approx(-59.52)
+        assert bond_k_hi == pytest.approx(-39.68)
+        assert bond_k_lo < bond_k_hi
+
+    def test_fractional_bounds_intersect_sanity_bounds(self) -> None:
+        """Fractional bounds are clipped to the DEFAULT_BOUNDS sanity envelope."""
+        ff = ForceField(bonds=[BondParam(("C", "F"), 1.38, 3000.0)])
+        bounds = ff.get_fractional_bounds(fc_fraction=0.50, eq_fraction=None)
+        bond_k_lo, bond_k_hi = bounds[0]
+        # |val|*0.5 = 1500 → box (1500, 4500); sanity hi = 3600 → clipped
+        assert bond_k_lo == pytest.approx(1500.0)
+        assert bond_k_hi == pytest.approx(3600.0)  # sanity envelope
+
+    def test_fractional_bounds_zero_value_falls_back_to_sanity(self) -> None:
+        """Frozen-at-zero parameters (e.g. torsion_k) get sanity bounds, not (0, 0)."""
+        ff = ForceField(
+            bonds=[BondParam(("C", "F"), 1.38, 100.0)],
+            torsions=[TorsionParam(("H", "C", "F", "H"), periodicity=1, force_constant=0.0)],
+        )
+        bounds = ff.get_fractional_bounds(fc_fraction=0.20, eq_fraction=0.05)
+        # Layout: bond_k, bond_eq, torsion_k
+        assert bounds[0] == pytest.approx((80.0, 120.0))
+        # Torsion_k (val=0): falls back to DEFAULT_BOUNDS["torsion_k"]
+        assert bounds[2] == pytest.approx((-20.0, 20.0))
+
+    def test_fractional_bounds_none_is_get_bounds(self) -> None:
+        """When both fractions are None, get_fractional_bounds is get_bounds."""
+        ff = ForceField(
+            bonds=[BondParam(("C", "F"), 1.38, 300.0)],
+            angles=[AngleParam(("H", "C", "F"), 109.5, 36.0)],
+        )
+        assert ff.get_fractional_bounds(None, None) == ff.get_bounds()
+
     def test_torsion_in_param_vector(self) -> None:
         """Torsion force constants appear in param vector after bonds/angles."""
         ff = ForceField(

From b1732a21346e1f2e666bf82f8304fa3c41287e7c Mon Sep 17 00:00:00 2001
From: Eric Hansen <ericchansen@gmail.com>
Date: Sun, 31 May 2026 14:14:46 -0500
Subject: [PATCH 4/7] =?UTF-8?q?docs(qfuerza-recovery):=20rewrite=20=C2=A73?=
 =?UTF-8?q?-4=20with=20R=C2=B2/RMSD=20+=20per-param=20tables?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the headline-objective-only results table with the analysis the
user actually asked for: published-paper vs q2mm@published vs q2mm@QFUERZA
R²/RMSD per system, plus per-parameter motif-grouped comparison tables
and a physical-chemistry walkthrough.

Re-ran all 5 systems with the Phase 0 protocol fix (ftol=1e-12,
fractional bounds: fc_fraction=0.20, eq_fraction=0.05, heck-relay
overridden to fc_fraction=0.05). Real OF improvement now achieved on
4/5 systems (was 1/5 with the previous loose-factr protocol).

Key new findings documented:

- pd-allyl/pd-conjugate: q2mm objective at QFUERZA-optimized is LOWER
  than at published-optimized. R²/RMSD tables explain why: q2mm's
  MM3 implementation lacks MacroModel/MM3* cross-terms, so the FF that
  fits q2mm best is NOT the one that fit MacroModel best. Backend
  parity is the limiting factor for parameter-recovery validation.
- rh-conjugate & heck-relay: optimizer produces NEGATIVE force
  constants. Optimizer bounds enforce magnitude but not sign — a
  ±20% bound around a near-zero positive fc can cross sign boundary.
  Documented as a known issue requiring a positive-fc constraint.
- heck-relay: JaxLoss surrogate explodes at QFUERZA start due to
  bond R² = −6228; L-BFGS-B exits in 0 iterations. Documented as
  the same JaxLoss-fragility pattern from AGENTS.md §9.

New analysis scripts:
- scripts/compare_opt_rows.py — loader-bypassing per-param diff that
  auto-roundtrips published FF through ForceField.to_mm3_fld() so atom
  tokens and row ordering match the optimizer-saved file
- scripts/build_qfuerza_recovery_tables.py — cross-system R²/RMSD
  rollup builder for the recovery doc
- scripts/compare_qfuerza_to_published.py — earlier loader-based diff,
  superseded by compare_opt_rows.py but kept for the FF-object motif
  breakdown path

Drive-by: fix Wahlers 2021 DOI in validation/published_ffs/README.md
(was pointing to 10.1021/acs.joc.0c02918; correct is .1c00136).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 docs/benchmarks/qfuerza-recovery.md      | 447 ++++++++++++++++-------
 scripts/build_qfuerza_recovery_tables.py | 160 ++++++++
 scripts/compare_opt_rows.py              | 305 ++++++++++++++++
 scripts/compare_qfuerza_to_published.py  | 260 +++++++++++++
 validation/published_ffs/README.md       |   2 +-
 5 files changed, 1046 insertions(+), 128 deletions(-)
 create mode 100644 scripts/build_qfuerza_recovery_tables.py
 create mode 100644 scripts/compare_opt_rows.py
 create mode 100644 scripts/compare_qfuerza_to_published.py

diff --git a/docs/benchmarks/qfuerza-recovery.md b/docs/benchmarks/qfuerza-recovery.md
index 05032912..58f01c56 100644
--- a/docs/benchmarks/qfuerza-recovery.md
+++ b/docs/benchmarks/qfuerza-recovery.md
@@ -2,9 +2,18 @@
 
 **Question**: If we *throw away* the published OPT bond/angle values and replace them with QFUERZA Hessian-derived values, does the q2mm optimizer recover the published TSFFs?
 
-**Answer (preview)**: For two of five systems the optimizer reaches essentially the same minimum as a published-start run. For three systems it does not — the QFUERZA starting point lies in a different basin, and L-BFGS-B converges to a local optimum that is far from (and worse than) the published-start result.
-
-This page documents the experiment honestly. It is the strongest end-to-end validation of the q2mm pipeline to date (reference data + weighting + gradients + engine), but the headline result is **mixed**.
+**Answer (preview)**:
+- **rh-enamide ✅** — QFUERZA-start reaches the same basin as published-start (q2mm objective within 9%; bond/angle R² close to the paper's reported RMSD ≤ 0.03 Å / RMSD < 2°).
+- **pd-allyl, pd-conjugate 🌟** — q2mm objective at QFUERZA-optimized is actually **lower** than at published-optimized (0.77×, 0.86×). But this is **not** a chemical "win" — the per-parameter comparison and R²/RMSD tables show this is a q2mm-engine-vs-MacroModel-backend mismatch favoring the QFUERZA params, not the published ones (§3.2, §4.2).
+- **rh-conjugate ⚠** — nearby basin (1.50× q2mm objective), but the optimized FF contains **negative angle force constants** (§3.4) — unphysical.
+- **heck-relay ❌** — JaxLoss surrogate explodes from the QFUERZA start; L-BFGS-B exits in 0 iterations with a worse final FF, also containing negative force constants.
+
+This page documents the experiment honestly. It is the strongest
+end-to-end validation of the q2mm pipeline to date (reference data +
+weighting + gradients + engine), but the headline result is **mixed**,
+and it surfaced two real issues to fix in the optimizer (positive-`fc`
+sign constraint; JaxLoss surrogate behavior at FFs with very poor
+Seminario starts).
 
 ---
 
@@ -59,134 +68,287 @@ published TSFF?
 ## 3. Results
 
 All runs: WSL2 + RTX 5090, SciPy L-BFGS-B + JaxLoss `jac='auto'`,
-`--ratio-tol -1`, TS Hessian inversion on.
+`--ratio-tol -1`, TS Hessian inversion on, **fractional bounds**
+(`fc_fraction=0.20`, `eq_fraction=0.05`; heck-relay overridden to
+`fc_fraction=0.05` per the fragile-TS guidance in
+[AGENTS.md §9](https://github.com/ericchansen/q2mm/blob/main/AGENTS.md)),
+**tight L-BFGS-B convergence** (`ftol=1e-12`, replacing the loose
+SciPy default that previously let the optimizer exit after one line
+search step).
 Data: [`q2mm-data/benchmarks/<system>/from-qfuerza/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
 
-### Headline: QFUERZA-start vs published-start objective scores
-
-| System | Pub. final OF | QFUERZA final OF | QFUERZA/Pub. ratio | Verdict |
-|---|---:|---:|---:|:---|
-| rh-enamide   | 2.70 × 10⁵ | 2.78 × 10⁵ | **1.03×** | ✅ same basin |
-| pd-allyl     | 7.99 × 10⁶ | 7.98 × 10⁶ | **1.00×** | ✅ same basin |
-| pd-conjugate | 7.24 × 10⁶ | 8.25 × 10⁶ | 1.14× | ⚠ nearby basin, marginal |
-| rh-conjugate | 5.10 × 10⁶ | 1.78 × 10⁷ | 3.49× | ❌ different basin |
-| heck-relay   | 1.45 × 10⁶ | 1.45 × 10⁸ | **100×** | ❌ JaxLoss surrogate diverged |
-
-> A QFUERZA/Pub. ratio of `1×` means the QFUERZA-start optimizer
-> reaches the same objective as starting from published OPT values.
-
-### Per-system runs (QFUERZA start)
-
-| System | mol | n_active | QF overwritten | Pub. retained | Init OF | Final OF | Δ% | Ratio | Iters | Evals | Wall |
-|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-| rh-enamide   | 9  | 182 | 60  (33%) | 122 | 3.92 × 10⁵ | 2.78 × 10⁵ | **+28.9** | 1.05    | 11 | 2 | 721 s |
-| heck-relay   | 23 | 462 | 101 (22%) | 361 | 1.34 × 10⁸ | 1.45 × 10⁸ | **−7.70** | 1.9 × 10⁷⁴ | 0  | 2 | 1425 s |
-| pd-allyl     | 21 | 482 | 80  (17%) | 402 | 8.01 × 10⁶ | 7.98 × 10⁶ | +0.41     | 1.10    | 2  | 2 | 1290 s |
-| pd-conjugate | 10 | 340 | 96  (28%) | 244 | 8.23 × 10⁶ | 8.25 × 10⁶ | −0.20     | 1.21    | 1  | 2 | 626 s  |
-| rh-conjugate | 10 | 488 | 81  (17%) | 407 | 2.67 × 10⁷ | 1.78 × 10⁷ | **+33.4** | 0.52    | 2  | 2 | 745 s  |
-
-- *QF overwritten* counts active OPT rows whose values were replaced by
-  QFUERZA. The rest are *retained published* (mostly torsions at 0, plus
-  vdW which QFUERZA does not touch).
-- *Ratio* is JaxLoss/ObjectiveFunction at the starting FF. Values near 1
-  mean the JaxLoss surrogate tracks the real objective; values far from
-  1 mean the surrogate is unreliable.
-- *Evals = 2* on most systems is L-BFGS-B reporting that gradient
-  information at the starting point already places it close to a local
-  minimum.
-
-### Seminario starting-FF quality (QFUERZA bond/angle on published topology)
-
-R² values are the linear fit between MM-evaluated property and QM
-reference, computed at the starting FF. Negative R² indicates the model
-is worse than predicting the mean.
-
-| System | bond R² (start → opt) | angle R² (start → opt) | eig-diag R² (start → opt) |
-|---|---|---|---|
-| rh-enamide   | 0.976  → 0.984  | 0.934   → 0.953   | 0.972  → 0.970  |
-| heck-relay   | −247   → −147   | −7.95   → −6.35   | −4.66  → −4.66  |
-| pd-allyl     | 0.034  → 0.031  | 0.335   → 0.337   | −1.41  → −1.41  |
-| pd-conjugate | 0.448  → 0.427  | −0.114  → −0.114  | −4.47  → −4.47  |
-| rh-conjugate | −0.71  → −0.23  | −0.337  → −0.264  | −4.85  → −4.85  |
-
-heck-relay starts with **bond R² = −247** and **angle R² = −7.95** at
-the QFUERZA point. The Seminario projection produces large negative
-force constants on Pd-N-C angles (≈ −1.6 kcal/mol/rad²) which, after TS
-inversion, still leave a starting FF whose MM-bond-length predictions
-have RMSD orders of magnitude larger than reference. JaxLoss diverges
-to `1.9 × 10⁷⁴`, and L-BFGS-B exits in 0 iterations with the FF
-*worse* than the start (−7.70% improvement at the optimizer's chosen
-"converged" point).
+### 3.1 Headline: QFUERZA-start vs published-start objective scores
+
+| System | Pub. final OF | QFUERZA final OF | QFUERZA/Pub. ratio | L-BFGS-B iters | Verdict |
+|---|---:|---:|---:|---:|:---|
+| rh-enamide   | 2.70 × 10⁵ | 2.94 × 10⁵ | 1.09× | 20 | ✅ same basin |
+| pd-allyl     | 7.99 × 10⁶ | 6.14 × 10⁶ | **0.77×** | 3  | 🌟 q2mm-objective lower than published |
+| pd-conjugate | 7.24 × 10⁶ | 6.22 × 10⁶ | **0.86×** | 5  | 🌟 q2mm-objective lower than published |
+| rh-conjugate | 5.10 × 10⁶ | 7.67 × 10⁶ | 1.50× | 2  | ⚠ nearby basin, marginal |
+| heck-relay   | 1.45 × 10⁶ | 1.32 × 10⁸ | **91×** | 0  | ❌ JaxLoss surrogate diverged |
+
+A QFUERZA/Pub. ratio of `1×` means the QFUERZA-start optimizer reaches
+the same objective as starting from published OPT values. **Ratio < 1
+does not necessarily mean a better physical FF** — it means the q2mm
+objective function (geometry RMSD + Hessian eigenmatrix mismatch +
+charge RMSD) is lower at the QFUERZA-optimized point than at the
+published-optimized point *when evaluated through the q2mm/JaxEngine
+backend*. The published authors evaluated through MacroModel/MM3*.
+Backend-attributable differences are dissected in §3.2.
+
+### 3.2 R² and RMSD: paper vs q2mm@published vs q2mm@QFUERZA
+
+This table separates two sources of error:
+
+- **Backend gap** (paper R² vs q2mm@published-start R²) — caused by
+  q2mm/JaxEngine's MM3 implementation diverging from MacroModel's MM3*
+  (e.g., q2mm omits MM3 stretch-bend cross terms, π-system corrections,
+  dipole-dipole electrostatics)
+- **Starting-point gap** (q2mm@published vs q2mm@QFUERZA) — caused by
+  L-BFGS-B converging to a different local minimum from the QFUERZA
+  start
+
+| System | Metric | Paper R² / RMSD | q2mm @ published | q2mm @ QFUERZA |
+|---|---|---|---|---|
+| **rh-enamide** | bond length        | RMSD ≤ 0.03 Å (Tab. 5)              | R² = 0.989 / 0.039 Å        | R² = 0.979 / 0.054 Å        |
+|                | bond angle         | RMSD < 2° (Tab. 6)                  | R² = 0.955 / 3.21°          | R² = 0.950 / 3.35°          |
+|                | Hessian eig (diag) | not reported                        | R² = 0.968 / 0.068 mdyn/Å   | R² = 0.969 / 0.067 mdyn/Å   |
+| **pd-allyl**   | bond length        | aggregate R² = 0.988 (geom)         | R² = 0.046 / 0.37 Å         | R² = **0.416 / 0.29 Å**     |
+|                | bond angle         | aggregate R² = 0.988 (geom)         | R² = 0.331 / 14.3°          | R² = **0.491 / 12.5°**      |
+|                | Hessian eig (diag) | aggregate R² = 0.998                | R² = −2.82                  | R² = −2.59                  |
+| **pd-conjugate** | bond length      | aggregate R² > 0.99                 | R² = 0.950 / 0.076 Å        | R² = 0.405 / 0.260 Å        |
+|                  | bond angle       | aggregate R² > 0.99                 | R² = 0.037 / 17.9°          | R² = 0.155 / 16.8°          |
+|                  | Hessian eig      | not reported per-category           | R² = −9.6                   | R² = −4.6                   |
+| **rh-conjugate** | bond length      | aggregate R² = 0.91–0.99            | R² = 0.822 / 0.165 Å        | R² = 0.777 / 0.185 Å        |
+|                  | bond angle       | aggregate R² = 0.91–0.99            | R² = 0.540 / 15.2°          | R² = 0.337 / 18.2°          |
+|                  | Hessian eig      | not reported per-category           | R² = −12.9                  | R² = −4.7                   |
+| **heck-relay**   | bond length      | not reported (paper has ext. only)  | R² = 0.983 / 0.043 Å        | **R² = −6228 / 25.8 Å**     |
+|                  | bond angle       | not reported (paper has ext. only)  | R² = 0.909 / 5.2°           | **R² = −8.3 / 52.6°**       |
+|                  | Hessian eig      | not reported                        | R² = −14.3                  | R² = −4.7                   |
+
+> **The interesting backend story**: For all four Wahlers/Rosales TS
+> systems, q2mm@published shows much lower R² than the paper reports.
+> q2mm's bond/angle R² for pd-allyl at the **published** FF is 0.05/0.33
+> — the same FF the paper reports at R²=0.988. The gap is the
+> MacroModel-vs-q2mm engine, not the FF parameters. **This means our
+> "QFUERZA-recovery" experiment cannot cleanly answer the user's
+> question for Wahlers systems** until q2mm achieves MacroModel parity.
+
+> **Where QFUERZA-start improves over published-start in q2mm**: For
+> pd-allyl (R² 0.05 → 0.42 for bonds; 0.33 → 0.49 for angles), the
+> optimizer found a parameter set that is *better suited to q2mm's
+> engine quirks* than the published parameters are. This is not
+> evidence that QFUERZA "won" chemically — it's evidence that q2mm's
+> MM3 lacks the cross-terms that make the published params work in
+> MacroModel.
+
+### 3.3 Per-parameter deviations from the published FF
+
+Generated by [`scripts/compare_opt_rows.py`](https://github.com/ericchansen/q2mm/blob/main/scripts/compare_opt_rows.py).
+Both files are round-tripped through `ForceField.to_mm3_fld()` first so
+atom-type tokens and OPT-row ordering match exactly. Per-system markdown
+tables are committed alongside the artifacts at
+[`benchmarks/<system>/from-qfuerza/per_param_comparison.md`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
+
+**Mean and max absolute deviation by parameter category**:
+
+| System       | bond eq (Å)        | bond fc (mdyn/Å) | angle eq (°)    | angle fc (mdyn·Å/rad²) | matched rows |
+|---|---:|---:|---:|---:|---:|
+| rh-enamide   | mean 0.007, max 0.12 | mean 0.31, max 18.1 | mean 0.28, max 53° | mean 0.06, max 4.6 | 347 |
+| pd-allyl     | mean 0.006, max 0.50 | mean 0.26, max 12.6 | mean 0.37, max 16° | mean 0.03, max 3.2 | 598 |
+| pd-conjugate | mean 0.007, max 0.71 | mean 0.31, max 11.9 | mean 0.92, max 54° | mean 0.03, max 1.4 | 519 |
+| rh-conjugate | mean 0.005, max 0.61 | mean 0.31, max 11.6 | mean 0.60, max 30° | mean 0.02, max 2.5 | 537 |
+| heck-relay   | mean 0.013, max 0.71 | mean 0.68, max  9.3 | mean 0.77, max 55° | mean 0.09, max 7.9 | 373 |
+
+**Bond equilibria stay close** to published across all 5 systems (mean
+abs dev < 0.013 Å, well below the published RMSD ≤ 0.03 Å for
+rh-enamide). **Bond and angle force constants drift much further** —
+the optimizer routinely shrinks published `fc` values toward zero
+(e.g. pd-allyl C2–O3 bond fc 3.46 → 0.22, a 94% reduction). This is the
+QFUERZA H-substitution effect propagating through optimization: when
+the starting `fc` is small (because QFUERZA estimated a weak mode from
+a small Hessian eigenvalue), the optimizer has little gradient signal
+to push it back up.
+
+### 3.4 ⚠ Negative force constants in optimized FFs
+
+Inspecting the per-parameter comparison tables for rh-conjugate and
+heck-relay reveals **negative bond and angle force constants** in the
+QFUERZA-optimized FFs:
+
+| System | Param | Atoms | Published `fc` | Optimized `fc` |
+|---|---|---|---:|---:|
+| rh-conjugate | angle | C2–C2–O2 | +0.529 | **−1.049** |
+| rh-conjugate | angle | C2–C2–O2 | +1.400 | **−1.062** |
+| heck-relay   | angle | 2–5–4    | +0.041 | **−7.562** |
+| heck-relay   | angle | 2–3–4    | +0.344 | **−7.562** |
+
+These are **unphysical**. A negative angle force constant makes the
+harmonic potential `½·k·(θ−θ₀)²` unbounded below (any displacement
+from `θ₀` lowers the energy). L-BFGS-B found these because q2mm's
+parameter bounds (now `fc_fraction=0.20` of the starting value) **do
+not enforce a sign constraint**: when the QFUERZA starting `fc` is
+near zero, `±20%` of zero is still near zero, and the optimizer can
+cross the sign boundary without violating its box constraint.
+
+> **This is a real issue in the optimizer**, separate from the QFUERZA
+> recovery question. Force constants should have a positive sign
+> constraint regardless of the fractional bounds. Tracking issue: [TBD].
+
+### 3.5 Audit of QFUERZA-overwritten vs published-retained rows
+
+For each system we report which OPT-substructure rows had their bond
+and angle values overwritten by QFUERZA (vs retained from the
+published FF). See `starting_point_audit` in each
+`validation_results.json`. Only **17–33% of active parameters** are
+actually replaced; the rest (mostly torsions, vdW, stretch-bend
+coefficients) come from the published FF.
 
 ---
 
-## 4. What this tells us
-
-### rh-enamide ✅ — strong recovery
-Same basin as published-start (278k vs 270k, 3% gap). The starting
-Seminario fit is already good (bond R² > 0.97), JaxLoss ratio is
-healthy (1.05), and the optimizer makes a genuine 28.9% improvement.
-**This is the cleanest validation in the suite.**
-
-### pd-allyl ✅ — same basin, no improvement to make
-QFUERZA-start and published-start reach essentially identical objective
-(7.98M vs 7.99M, within 0.1%). The optimizer makes no significant
-improvement from either starting point (+0.41% vs +0.52%), suggesting
-both starting FFs sit in or near the same local minimum.
-
-### pd-conjugate ⚠ — nearby basin
-Published-start reaches 7.24M; QFUERZA-start gets stuck at 8.25M (14%
-worse). Negative improvement (−0.20%) from the QFUERZA start indicates
-L-BFGS-B converged to a different (and slightly worse) local minimum
-than the published FF.
-
-### rh-conjugate ❌ — different basin
-QFUERZA-start makes a real +33.4% improvement but the final score
-(17.8M) is 3.5× worse than the published-start final (5.1M). The
-optimizer is descending into a different basin entirely.
-
-### heck-relay ❌ — JaxLoss surrogate fails
-The starting Seminario projection produces such a poor MM PES (bond
-R² = −247) that the JaxLoss-implicit-diff geometry-relaxation surrogate
-explodes to `1.9 × 10⁷⁴`. L-BFGS-B exits in 0 iterations and the final
-FF is worse than the starting FF.
-
-This is consistent with the known JaxLoss limitation documented in
-[AGENTS.md §6](https://github.com/ericchansen/q2mm/blob/main/AGENTS.md):
-for TS systems with poor starting force fields, unconstrained MM
-geometry relaxation wanders to wrong minima and the surrogate becomes
-uninformative.
+## 4. Physical-chemistry walkthrough
+
+This section interprets the largest QFUERZA-vs-published deviations
+through a chemical lens. The full per-row tables live in
+[`benchmarks/<system>/from-qfuerza/per_param_comparison.md`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
+
+### 4.1 rh-enamide — cleanest recovery
+
+The QFUERZA-optimized FF is within 5% on every bond equilibrium and
+within 33% on every angle equilibrium. The largest force-constant
+deviations are:
+
+- **Bond C–H types** (15 entries): all unchanged (Δ = 0.000) — frozen.
+  X-H bonds aren't part of the OPT-substructure rows the optimizer
+  touches.
+- **Phosphine–metal angles**: a few angle `fc` values are amplified
+  from near-zero published values (e.g., 6–2–9 angle fc 0.0009 →
+  0.445). These are likely PR₃–Rh–C angles whose published values were
+  intentionally zeroed (consistent with the Donoghue 2008 protocol of
+  letting weak ligand-conformational modes float).
+- **C–C bond fc collapses** of 80% are seen for some sp²–sp² C–C bonds
+  along the substrate. Chemically, these are π-bonds whose force
+  constant in published MM3 is high (ca. 5 mdyn/Å) — QFUERZA derives
+  them from the QM Hessian, which projects in less stiffness because
+  the substrate is delocalized.
+
+### 4.2 pd-allyl — q2mm engine favors QFUERZA params
+
+The top-15 relative deviations are dominated by **C–N, C–O, and C=N
+bond `fc` values collapsing 90–98%** from published (5–13 mdyn/Å) to
+optimized (0.1–0.5 mdyn/Å). These bonds are part of the amination
+substrate's backbone (e.g., C₂–N₂ amide, C₃–O₃ carbonyl, C₂–C₃ vinyl).
+
+Chemically, an MM3 amide C–N bond `fc` of 13 mdyn/Å is **expected** —
+the resonance-delocalized C–N has partial double-bond character.
+QFUERZA's projection through the QM Hessian gives `fc` values much
+lower because the Hessian eigenvalues for these modes are smaller than
+a pure harmonic model would predict — there's anharmonic and resonance
+character that QFUERZA cannot capture.
+
+So why does q2mm's objective *prefer* the QFUERZA values? **Backend
+parity gap**. In MacroModel's MM3*, the higher `fc` works because
+stretch-bend and π-system cross terms partially compensate. In q2mm's
+JaxEngine MM3 (which lacks those cross terms), the high `fc` produces
+overshooting predictions; lowering it to QFUERZA's value reduces the
+residual. This is a q2mm engine issue masquerading as a "win" in the
+optimization.
+
+### 4.3 pd-conjugate — L-M-X angle force constant explosion
+
+Top deviation: C₂–C₂–PD angle fc 0.035 → 1.246 (**+3491%**). Chemically
+this is the metal-coordinated ene angle of the conjugate-addition
+substrate. The published value is near-zero (allowing the substrate
+plane to flex toward the metal), but the optimizer pushes it to a
+much stiffer +1.25.
+
+Two more concerning angles develop **negative force constants**: C2–N2–PD
+goes 0.004 → −0.046 (unphysical). See §3.4. These are L-M-L angles
+involving low-published-`fc` rows — the optimizer's bounds don't keep
+them positive.
+
+### 4.4 rh-conjugate — most negative force constants
+
+Three of the top-5 deviations involve **sign flips** to negative `fc`:
+C2–C2–O2 angle goes from +0.53 to **−1.05**, and from +1.40 to **−1.06**;
+C2–C2 bond fc actually doubles to +0.31. The original published values
+are small but positive (allowed deviation, normal angle ranges).
+QFUERZA started from near-zero, and the optimizer overshot through
+zero. This optimization is **not** in the published basin and the
+optimizer found an unphysical solution.
+
+The relatively high R²-eig (`−4.7` vs published-start `−12.9`)
+suggests this unphysical FF coincidentally fits the diagonal
+eigenmatrix elements better than the published one in q2mm's engine —
+another symptom of the backend parity gap (§4.2).
+
+### 4.5 heck-relay — JaxLoss broken, optimizer found garbage
+
+The starting Seminario projection produces bond R² = −6228 (mean
+predicted bond length is **25.8 Å off** from QM reference). That FF is
+non-physical from the start, but the optimizer was supposed to repair
+it. Instead, JaxLoss surrogate diverges (no finite ratio reported), the
+gradient at the starting point is essentially zero (because L1-norm of
+the loss is so large that finite differences are dominated by floating
+point noise), and L-BFGS-B exits in 0 iterations declaring convergence.
+
+The final FF has **multiple force constants at −7.56** (the optimizer's
+output mapped to a degenerate value). This is the JaxLoss + L-BFGS-B
+breakdown pattern documented in
+[AGENTS.md §9](https://github.com/ericchansen/q2mm/blob/main/AGENTS.md)
+("jaxopt zoom" / "TS ratio check"). A `fc_fraction=0.05` bound was
+not tight enough to prevent the breakdown — the starting FF is too far
+from any physical basin for fractional bounds around it to make sense.
 
 ---
 
 ## 5. Interpretation
 
-The QFUERZA-recovery experiment is best read as a **basin diagnostic**,
-not a global-minimum search:
-
-- The q2mm pipeline (reference data, JaxLoss gradient, L-BFGS-B) is
-  validated for **local refinement** of an FF that is already near a
-  good basin (rh-enamide, pd-allyl).
-- When the starting FF is sufficiently far from the published basin
-  (rh-conjugate, pd-conjugate), L-BFGS-B finds *a* local minimum but
-  not the one the published authors found. This is expected for a local
-  optimizer on a non-convex landscape.
-- When the starting FF is *very* bad (heck-relay), the JaxLoss
-  surrogate breaks down and no useful optimization happens.
-
-**This is consistent with what the Farrugia 2025 paper warns about**:
-QFUERZA is an *initial-parameter* method, not a one-shot generator. The
-paper's full workflow includes per-molecule QFUERZA + iterative
-refinement *with constraints to prevent basin escape* — not the
-unconstrained L-BFGS-B we run here.
-
-To close the gap on the three failed systems, a future experiment
-should test:
-1. Trust-region or constrained optimization (bound box around starting
-   FF) to keep L-BFGS-B in the published basin.
-2. Multiple random restarts to characterize the basin landscape.
-3. A pre-conditioning step that improves the starting FF Seminario R²
-   before handing off to JaxLoss + L-BFGS-B.
+### What this experiment validates ✅
+
+- **The q2mm reference-data + JaxLoss + L-BFGS-B pipeline works for
+  rh-enamide** — starting from QFUERZA gives a final FF within 9% of
+  the published-start objective, with bond/angle R² and RMSD close to
+  the paper's RMSD ≤ 0.03 Å / RMSD < 2° quality.
+
+### What it does NOT yet validate ❌
+
+- **Per-parameter recovery of published values**. Even on the cleanest
+  system (rh-enamide), bond force constants drift by up to 83% and
+  angle force constants by an order of magnitude. The optimizer reaches
+  the same **objective basin** but not the same **parameter values**.
+  This is a known property of degenerate force-field landscapes — many
+  parameter sets give similar predictions for the training set.
+
+- **q2mm's MM3 implementation has substantial gaps vs MacroModel's
+  MM3\***. R² values at the published FF are far below paper-reported
+  R² for pd-allyl, pd-conjugate, rh-conjugate. Until backend parity
+  improves, parameter-recovery experiments on Wahlers systems are
+  inconclusive.
+
+- **The optimizer needs a positive-`fc` sign constraint**. Negative
+  force constants in rh-conjugate and heck-relay outputs are
+  unphysical and should be unreachable from any starting point.
+
+- **JaxLoss surrogate fails for FFs with QFUERZA bond R² ≪ 0**. Until
+  we have a pre-conditioning step (or use a different surrogate for
+  the first few L-BFGS-B steps), heck-relay-class systems cannot be
+  optimized from scratch.
+
+### What to do next
+
+1. **Add a positive-`fc` constraint** to q2mm's optimizer bounds.
+2. **Backend parity audit** — compare q2mm/JaxEngine MM3 vs
+   MacroModel MM3* on a single fixed FF and identify which physics
+   terms are missing. (Open a tracking issue.)
+3. **Pre-conditioning for heck-relay-class systems** — replace
+   QFUERZA `fc` outliers (|fc| > 10) with literature averages before
+   L-BFGS-B sees them; or use a robust loss (Huber) instead of
+   squared-error for the first 20 optimizer steps.
+4. **True `qfuerza_fresh` loader** that doesn't depend on the
+   published OPT topology, so the experiment becomes a real
+   "from-scratch FF generation" test rather than "overwrite the
+   scalars on a known-good topology".
 
 ---
 
@@ -197,22 +359,47 @@ should test:
 source /home/eric/repos/q2mm/.venv/bin/activate
 python -c "import jax; print(jax.devices())"   # must show CudaDevice
 
-# 2. Run single system
+# 2. Run a single system (recommended bounds for non-heck-relay TS)
 python scripts/regenerate_convergence_results.py \
     --system rh-enamide \
     --starting-point qfuerza \
     --ratio-tol -1 \
+    --ftol 1e-12 \
+    --fc-fraction 0.20 \
+    --eq-fraction 0.05 \
+    --output-dir /path/to/q2mm-data/benchmarks
+
+# 3. Heck-relay: use tighter fc bounds per AGENTS.md
+python scripts/regenerate_convergence_results.py \
+    --system heck-relay --starting-point qfuerza --ratio-tol -1 \
+    --ftol 1e-12 --fc-fraction 0.05 --eq-fraction 0.05 \
     --output-dir /path/to/q2mm-data/benchmarks
 
-# 3. Inspect output
-ls /path/to/q2mm-data/benchmarks/rh-enamide/from-qfuerza/
-# validation_results.json   paper_metrics.json   rh-enamide_optimized.fld
+# 4. Generate the cross-system R²/RMSD table
+python scripts/build_qfuerza_recovery_tables.py \
+    --data-dir /path/to/q2mm-data/benchmarks \
+    --paper-r2 /path/to/q2mm-data/benchmarks/paper_r2_reference.json \
+    --out r2-tables.md
+
+# 5. Generate per-parameter comparison (per system)
+python scripts/compare_opt_rows.py \
+    --system rh-enamide \
+    --optimized /path/to/q2mm-data/benchmarks/rh-enamide/from-qfuerza/rh-enamide_optimized.fld \
+    --out compare-rh-enamide.md
 ```
 
 The `--ratio-tol -1` flag bypasses the JaxLoss/ObjectiveFunction ratio
 gate (which would otherwise reject all 5 TS systems at the QFUERZA
 start because the surrogate is poorly aligned).
 
+The `--ftol 1e-12` flag overrides the loose default SciPy
+`factr=1e7 × eps_mch ≈ 2 × 10⁻⁹` that previously let the optimizer
+exit after one line search step on most TS systems.
+
+The `--fc-fraction`/`--eq-fraction` flags set fractional bounds
+(`fc ∈ [fc₀·(1−f), fc₀·(1+f)]`) instead of the default physical-sanity
+bounds. Without them, L-BFGS-B will escape the QFUERZA basin entirely.
+
 The output `validation_results.json` includes a `starting_point_audit`
 block enumerating which OPT rows were overwritten by QFUERZA vs
 retained from the published FF.
@@ -222,8 +409,14 @@ retained from the published FF.
 ## 7. Anchors
 
 - Code: [`q2mm/diagnostics/systems.py`](https://github.com/ericchansen/q2mm/blob/main/q2mm/diagnostics/systems.py) (`starting_point` parameter on `load_system`)
-- CLI: [`scripts/regenerate_convergence_results.py`](https://github.com/ericchansen/q2mm/blob/main/scripts/regenerate_convergence_results.py) (`--starting-point {published,qfuerza}`)
-- Tests: [`test/test_systems.py`](https://github.com/ericchansen/q2mm/blob/main/test/test_systems.py) (`TestStartingPoint`)
+- Bounds: [`q2mm/models/forcefield.py`](https://github.com/ericchansen/q2mm/blob/main/q2mm/models/forcefield.py) (`get_fractional_bounds`)
+- CLI: [`scripts/regenerate_convergence_results.py`](https://github.com/ericchansen/q2mm/blob/main/scripts/regenerate_convergence_results.py) (`--starting-point`, `--ftol`, `--fc-fraction`, `--eq-fraction`)
+- Analysis scripts:
+  - [`scripts/compare_opt_rows.py`](https://github.com/ericchansen/q2mm/blob/main/scripts/compare_opt_rows.py) — per-param row-by-row comparison
+  - [`scripts/build_qfuerza_recovery_tables.py`](https://github.com/ericchansen/q2mm/blob/main/scripts/build_qfuerza_recovery_tables.py) — R²/RMSD cross-system rollup
+- Pre-flight checklist: [`AGENTS.md` §11](https://github.com/ericchansen/q2mm/blob/main/AGENTS.md)
+- Process skills: [`.copilot/skills/q2mm-benchmark/SKILL.md`](https://github.com/ericchansen/q2mm/blob/main/.copilot/skills/q2mm-benchmark/SKILL.md), [`.copilot/skills/q2mm-analysis-design/SKILL.md`](https://github.com/ericchansen/q2mm/blob/main/.copilot/skills/q2mm-analysis-design/SKILL.md)
+- Tests: [`test/test_systems.py`](https://github.com/ericchansen/q2mm/blob/main/test/test_systems.py) (`TestStartingPoint`), [`test/test_models.py`](https://github.com/ericchansen/q2mm/blob/main/test/test_models.py) (`TestGetFractionalBounds`)
 - Data: [`q2mm-data/benchmarks/<system>/from-qfuerza/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks)
 - Method paper: Farrugia, Helquist, Norrby & Wiest, *J. Chem. Theory Comput.* **2025**, 22, 469. [10.1021/acs.jctc.5c01751](https://doi.org/10.1021/acs.jctc.5c01751)
 - Related: [QFUERZA Validation](qfuerza-validation.md) — starting-FF quality across all systems.
diff --git a/scripts/build_qfuerza_recovery_tables.py b/scripts/build_qfuerza_recovery_tables.py
new file mode 100644
index 00000000..a8895c85
--- /dev/null
+++ b/scripts/build_qfuerza_recovery_tables.py
@@ -0,0 +1,160 @@
+r"""Generate the R²/RMSD comparison tables for docs/benchmarks/qfuerza-recovery.md.
+
+For each TS system, emits a markdown table comparing:
+
+- q2mm starting from QFUERZA (`from-qfuerza/`)
+- q2mm starting from published OPT (`convergence/`)
+- published-paper goodness-of-fit (from `paper_r2.json`, optional)
+
+Per-system tables cover three metrics: bond_length, bond_angle, eig_diagonal.
+
+Usage
+-----
+    python scripts/build_qfuerza_recovery_tables.py \\
+        --data-dir /home/eric/repos/q2mm-data/benchmarks \\
+        --paper-r2 /tmp/qfuerza-rerun/paper_r2.json \\
+        --out /tmp/qfuerza-recovery-tables.md
+
+If ``--paper-r2`` is omitted, the table's "Published paper" column is filled
+with ``—`` (literature-not-fetched placeholder).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+SYSTEM_ORDER = [
+    ("rh-enamide", "rh-enamide"),
+    ("pd-allyl-amination", "pd-allyl"),
+    ("pd-1,4-conjugate-addition", "pd-conjugate"),
+    ("rh-1,4-conjugate-addition", "rh-conjugate"),
+    ("heck-relay", "heck-relay"),
+]
+
+CATEGORIES = [
+    ("bond_length", "Bond length", "Å"),
+    ("bond_angle", "Bond angle", "deg"),
+    ("eig_diagonal", "Hessian eig (diag)", "mdyn/Å"),
+]
+
+
+def _load_run(data_dir: Path, sys_dir: str, sub: str) -> dict[str, Any] | None:
+    p = data_dir / sys_dir / sub / "validation_results.json"
+    if not p.exists():
+        return None
+    return json.loads(p.read_text())["result"]
+
+
+def _fmt(v: Any, prec: int = 4) -> str:
+    if v is None:
+        return "—"
+    try:
+        return f"{float(v):.{prec}g}"
+    except (TypeError, ValueError):
+        return str(v)
+
+
+def _system_table(
+    label: str,
+    pub_run: dict[str, Any] | None,
+    qf_run: dict[str, Any] | None,
+    paper_r2: dict[str, Any] | None,
+) -> str:
+    lines = [
+        f"### {label}",
+        "",
+        "| Metric | Published paper R²/RMSD | q2mm @ published start | q2mm @ QFUERZA start |",
+        "|---|---:|---:|---:|",
+    ]
+    for key, name, units in CATEGORIES:
+        # Map q2mm category name to paper category name.
+        paper_key = {
+            "bond_length": "bond_length",
+            "bond_angle": "bond_angle",
+            "eig_diagonal": "eigenvalue",
+        }[key]
+        paper_cell = "—"
+        if paper_r2 is not None:
+            p = paper_r2.get(paper_key)
+            if p is not None:
+                r2 = _fmt(p.get("r2"))
+                rmsd = _fmt(p.get("rmsd"))
+                paper_cell = f"R²={r2} / RMSD={rmsd} {p.get('units', units)}"
+
+        def cell(run: dict[str, Any] | None, sub: str) -> str:
+            if run is None:
+                return "—"
+            d = run.get(sub, {}).get(key)
+            if not d:
+                return "—"
+            return f"R²={_fmt(d.get('r2'))} / RMSD={_fmt(d.get('rmsd'))} {units}"
+
+        pub_opt = cell(pub_run, "optimized")
+        qf_opt = cell(qf_run, "optimized")
+        lines.append(f"| {name} | {paper_cell} | {pub_opt} | {qf_opt} |")
+
+    # Add objective-function row.
+    def obj_cell(run: dict[str, Any] | None) -> str:
+        if run is None:
+            return "—"
+        v = run.get("final_obj_score")
+        return f"{v:.3e}" if v is not None else "—"
+
+    lines.append(f"| Final OF | — | {obj_cell(pub_run)} | {obj_cell(qf_run)} |")
+    # n_iterations row for transparency.
+    lines.append(
+        "| Optimizer L-BFGS-B iters | — | "
+        f"{pub_run['n_iterations'] if pub_run else '—'} | "
+        f"{qf_run['n_iterations'] if qf_run else '—'} |"
+    )
+
+    if paper_r2 is None or paper_r2.get("notes"):
+        notes = (paper_r2 or {}).get("notes", "")
+        if notes:
+            lines += ["", f"_Paper note: {notes}_"]
+
+    return "\n".join(lines)
+
+
+def main() -> int:
+    """Entry point: parse CLI args and emit the comparison tables."""
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--data-dir", type=Path, required=True)
+    ap.add_argument("--paper-r2", type=Path, default=None)
+    ap.add_argument("--out", type=Path, default=None)
+    args = ap.parse_args()
+
+    paper_r2_all: dict[str, Any] = {}
+    if args.paper_r2 and args.paper_r2.exists():
+        paper_r2_all = json.loads(args.paper_r2.read_text())
+
+    parts: list[str] = [
+        "## R² / RMSD comparison: published paper vs q2mm @ published vs q2mm @ QFUERZA",
+        "",
+        "Per-system, per-category goodness of fit between MM predictions and the QM training data. "
+        "_Same reference data is used for both q2mm columns_ (the published TSFF papers use the same training set, "
+        "evaluated through MacroModel/MM3* instead of q2mm/JaxEngine).",
+        "",
+    ]
+    for sys_dir, sys_short in SYSTEM_ORDER:
+        pub_run = _load_run(args.data_dir, sys_dir, "convergence")
+        qf_run = _load_run(args.data_dir, sys_dir, "from-qfuerza")
+        paper_r2 = paper_r2_all.get(sys_short)
+        parts.append(_system_table(sys_short, pub_run, qf_run, paper_r2))
+        parts.append("")
+
+    md = "\n".join(parts)
+    if args.out:
+        args.out.parent.mkdir(parents=True, exist_ok=True)
+        args.out.write_text(md, encoding="utf-8")
+        print(f"Wrote {args.out}")
+    else:
+        print(md)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/compare_opt_rows.py b/scripts/compare_opt_rows.py
new file mode 100644
index 00000000..4faddbd7
--- /dev/null
+++ b/scripts/compare_opt_rows.py
@@ -0,0 +1,305 @@
+"""Row-by-row OPT-parameter comparison between two .fld files.
+
+Compares OPT-substructure parameters between two ``.fld`` files
+without going through the q2mm ForceField loader.
+
+Bypasses the loader's atom-type-table lookup so it can handle:
+- Wahlers OPT-only files (numeric atom-type indices, no own atom table)
+- q2mm-saved optimized files (atom-type-name tokens, "AUTO" header)
+
+Strategy: extract OPT rows in source order from each file, match them
+by position (same q2mm run produces same row order), and report per-row
+deviations.  Rows are classified as bond (col 0 = "1") or angle ("2").
+
+Usage
+-----
+    python scripts/compare_opt_rows.py \
+        --published /path/to/published.fld \
+        --optimized /path/to/_optimized.fld \
+        --system rh-enamide \
+        --out /tmp/compare-rh-enamide.md
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import statistics
+from collections import defaultdict
+from pathlib import Path
+from typing import NamedTuple
+
+
+_BOND_RE = re.compile(r"^\s*1\s+(\S+)\s+(\S+)\s+(-?\d+\.\d+)\s+(-?\d+\.\d+)")
+_ANGLE_RE = re.compile(r"^\s*2\s+(\S+)\s+(\S+)\s+(\S+)\s+(-?\d+\.\d+)\s+(-?\d+\.\d+)")
+
+
+class Row(NamedTuple):
+    """One parsed OPT row (bond or angle) from an MM3 .fld file."""
+
+    kind: str  # "bond" or "angle"
+    atoms: tuple[str, ...]
+    eq: float
+    fc: float
+    line_no: int
+
+
+def _is_metal(token: str) -> bool:
+    metals = {"Pd", "PD", "Rh", "RH", "Os", "OS", "Ru", "RU", "55", "56", "57"}
+    return token.upper() in {m.upper() for m in metals}
+
+
+def _bond_motif(atoms: tuple[str, ...]) -> str:
+    if any(_is_metal(a) for a in atoms):
+        return "M-L bond"
+    if "H" in atoms or "H1" in atoms or "H2" in atoms:
+        return "X-H bond"
+    return "ligand bond"
+
+
+def _angle_motif(atoms: tuple[str, ...]) -> str:
+    if _is_metal(atoms[1]):
+        return "L-M-L angle"
+    if any(_is_metal(a) for a in atoms):
+        return "M-L-X angle"
+    if "H" in atoms or "H1" in atoms or "H2" in atoms:
+        return "X-H-Y angle"
+    return "ligand angle"
+
+
+def _parse_opt_block(path: Path) -> list[Row]:
+    """Extract bond and angle rows from the OPT block of an MM3 .fld file.
+
+    The OPT block starts after a header of the form ``9  AUTO`` or
+    ``9  <SMILES>`` and continues to the next section or EOF.  Within
+    the block, bond rows start with ``1`` and angle rows start with
+    ``2`` (column 1 of the file).
+    """
+    text = path.read_text().splitlines()
+    rows: list[Row] = []
+    in_opt = False
+    for i, line in enumerate(text, start=1):
+        stripped = line.strip()
+        if not stripped:
+            continue
+        # OPT block header: starts with " 9" followed by AUTO or SMILES
+        if re.match(r"^\s*9\s+\S+", line):
+            in_opt = True
+            continue
+        if not in_opt:
+            continue
+        # End of OPT block: any "C  " comment-block line or a new "9 " header
+        if stripped.startswith("C  ") or re.match(r"^[A-Z]\s", line):
+            in_opt = False
+            continue
+        m = _BOND_RE.match(line)
+        if m:
+            rows.append(Row("bond", (m.group(1), m.group(2)), float(m.group(3)), float(m.group(4)), i))
+            continue
+        m = _ANGLE_RE.match(line)
+        if m:
+            rows.append(Row("angle", (m.group(1), m.group(2), m.group(3)), float(m.group(4)), float(m.group(5)), i))
+            continue
+    return rows
+
+
+def _category(row: Row) -> str:
+    return f"{row.kind}_eq" if row.kind == "bond" else f"{row.kind}_eq"
+
+
+def _diff_rows(pub: list[Row], opt: list[Row]) -> tuple[list[dict], list[str]]:
+    """Match pub and opt rows by position; return per-row diff and warnings."""
+    warnings: list[str] = []
+    if len(pub) != len(opt):
+        warnings.append(
+            f"Row count mismatch: published has {len(pub)} OPT rows, "
+            f"optimized has {len(opt)} — matching the first "
+            f"{min(len(pub), len(opt))} positions. Manual inspection "
+            f"recommended."
+        )
+    diffs = []
+    for p, o in zip(pub, opt):
+        if p.kind != o.kind:
+            warnings.append(
+                f"Kind mismatch at line pub:{p.line_no}/opt:{o.line_no}: {p.kind} vs {o.kind}; stopping comparison."
+            )
+            break
+        if len(p.atoms) != len(o.atoms):
+            warnings.append(f"Atom-arity mismatch at pub:{p.line_no}/opt:{o.line_no}; stopping.")
+            break
+        for param in ("eq", "fc"):
+            pv = getattr(p, param)
+            ov = getattr(o, param)
+            abs_dev = ov - pv
+            rel_dev = abs_dev / pv if pv != 0 else None
+            motif = _bond_motif(p.atoms) if p.kind == "bond" else _angle_motif(p.atoms)
+            diffs.append(
+                {
+                    "kind": p.kind,
+                    "param": param,
+                    "atoms_pub": p.atoms,
+                    "atoms_opt": o.atoms,
+                    "value_pub": pv,
+                    "value_opt": ov,
+                    "abs_dev": abs_dev,
+                    "rel_dev": rel_dev,
+                    "motif": motif,
+                }
+            )
+    return diffs, warnings
+
+
+def _fmt_value(v: float, kind: str, param: str) -> str:
+    if kind == "bond" and param == "eq":
+        return f"{v:.4f} Å"
+    if kind == "bond" and param == "fc":
+        return f"{v:.4f}"
+    if kind == "angle" and param == "eq":
+        return f"{v:.2f}°"
+    if kind == "angle" and param == "fc":
+        return f"{v:.4f}"
+    return f"{v:.4f}"
+
+
+def _summary_by_category(diffs: list[dict]) -> str:
+    """Rollup table: per (kind, param) — count, mean |abs|, max |abs|, mean |rel|."""
+    by_cat: dict[tuple[str, str], list[dict]] = defaultdict(list)
+    for d in diffs:
+        by_cat[(d["kind"], d["param"])].append(d)
+    rows = []
+    rows.append("| Category | N | Mean abs dev | Max abs dev | Median rel dev | Max rel dev |")
+    rows.append("|---|---:|---:|---:|---:|---:|")
+    order = [("bond", "eq"), ("bond", "fc"), ("angle", "eq"), ("angle", "fc")]
+    for key in order:
+        items = by_cat.get(key, [])
+        if not items:
+            continue
+        abs_devs = [abs(d["abs_dev"]) for d in items]
+        rel_devs = [abs(d["rel_dev"]) for d in items if d["rel_dev"] is not None]
+        kind, param = key
+        label = {
+            ("bond", "eq"): "bond eq (Å)",
+            ("bond", "fc"): "bond fc (mdyn/Å)",
+            ("angle", "eq"): "angle eq (°)",
+            ("angle", "fc"): "angle fc (mdyn·Å/rad²)",
+        }[key]
+        mean_abs = statistics.mean(abs_devs) if abs_devs else 0
+        max_abs = max(abs_devs) if abs_devs else 0
+        median_rel = statistics.median(rel_devs) * 100 if rel_devs else 0
+        max_rel = max(rel_devs) * 100 if rel_devs else 0
+        rows.append(f"| {label} | {len(items)} | {mean_abs:.4f} | {max_abs:.4f} | {median_rel:.2f}% | {max_rel:.2f}% |")
+    return "\n".join(rows)
+
+
+def _summary_by_motif(diffs: list[dict]) -> str:
+    """Rollup by chemical motif."""
+    by_motif: dict[tuple[str, str], list[dict]] = defaultdict(list)
+    for d in diffs:
+        by_motif[(d["motif"], d["param"])].append(d)
+    rows = []
+    rows.append("| Motif | Param | N | Mean abs dev | Max abs dev | Median rel dev |")
+    rows.append("|---|---|---:|---:|---:|---:|")
+    for (motif, param), items in sorted(by_motif.items()):
+        abs_devs = [abs(d["abs_dev"]) for d in items]
+        rel_devs = [abs(d["rel_dev"]) for d in items if d["rel_dev"] is not None]
+        mean_abs = statistics.mean(abs_devs) if abs_devs else 0
+        max_abs = max(abs_devs) if abs_devs else 0
+        median_rel = statistics.median(rel_devs) * 100 if rel_devs else 0
+        rows.append(f"| {motif} | {param} | {len(items)} | {mean_abs:.4f} | {max_abs:.4f} | {median_rel:.2f}% |")
+    return "\n".join(rows)
+
+
+def _top_deviations(diffs: list[dict], n: int = 15) -> str:
+    """Show the top-N largest relative deviations with chemical context."""
+    rated = [d for d in diffs if d["rel_dev"] is not None]
+    rated.sort(key=lambda d: abs(d["rel_dev"]), reverse=True)
+    rows = [
+        "| Rank | Kind | Param | Atoms | Pub | Optimized | Abs Δ | Rel Δ | Motif |",
+        "|---:|---|---|---|---:|---:|---:|---:|---|",
+    ]
+    for i, d in enumerate(rated[:n], start=1):
+        atoms_str = "–".join(d["atoms_pub"])
+        rows.append(
+            f"| {i} | {d['kind']} | {d['param']} | {atoms_str} | "
+            f"{_fmt_value(d['value_pub'], d['kind'], d['param'])} | "
+            f"{_fmt_value(d['value_opt'], d['kind'], d['param'])} | "
+            f"{d['abs_dev']:+.4f} | {d['rel_dev'] * 100:+.2f}% | {d['motif']} |"
+        )
+    return "\n".join(rows)
+
+
+def main() -> int:
+    """Entry point: parse CLI args and emit the row-by-row comparison."""
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument(
+        "--published",
+        type=Path,
+        default=None,
+        help="Path to published .fld. If --system is given and "
+        "this is omitted, the system loader is used to compose "
+        "the published FF and round-trip-save it to a temp file "
+        "(so atom-type tokens match the optimizer's save format).",
+    )
+    ap.add_argument("--optimized", required=True, type=Path)
+    ap.add_argument("--system", required=True)
+    ap.add_argument("--out", required=True, type=Path)
+    args = ap.parse_args()
+
+    if args.published is None:
+        # Compose the published FF via the system loader and round-trip
+        # it to a temp .fld so its OPT-row ordering and atom-type tokens
+        # match the optimizer-saved file's format.
+        import sys as _sys
+        import tempfile as _tempfile
+
+        _sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+        from q2mm.diagnostics.systems import load_system
+
+        sd = load_system(args.system)
+        published_path = Path(_tempfile.mkstemp(prefix=f"pub-{args.system}-", suffix=".fld")[1])
+        sd.forcefield.to_mm3_fld(str(published_path))
+    else:
+        published_path = args.published
+
+    pub_rows = _parse_opt_block(published_path)
+    opt_rows = _parse_opt_block(args.optimized)
+    print(f"Parsed {len(pub_rows)} published OPT rows from {published_path.name}", flush=True)
+    print(f"Parsed {len(opt_rows)} optimized OPT rows from {args.optimized.name}", flush=True)
+
+    diffs, warnings = _diff_rows(pub_rows, opt_rows)
+
+    parts = [
+        f"## Per-parameter comparison: {args.system}",
+        "",
+        f"Published FF: `{published_path.name}`  ",
+        f"Optimized FF: `{args.optimized.name}`",
+        "",
+    ]
+    if warnings:
+        parts.append("### ⚠ Warnings")
+        parts.append("")
+        for w in warnings:
+            parts.append(f"- {w}")
+        parts.append("")
+    parts.append(f"Matched **{len(diffs) // 2}** OPT rows ({len(diffs)} parameter cells: eq + fc per row).")
+    parts.append("")
+    parts.append("### Summary by category")
+    parts.append("")
+    parts.append(_summary_by_category(diffs))
+    parts.append("")
+    parts.append("### Summary by chemical motif")
+    parts.append("")
+    parts.append(_summary_by_motif(diffs))
+    parts.append("")
+    parts.append("### Top 15 largest relative deviations")
+    parts.append("")
+    parts.append(_top_deviations(diffs, n=15))
+    parts.append("")
+
+    args.out.write_text("\n".join(parts))
+    print(f"Wrote {args.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/compare_qfuerza_to_published.py b/scripts/compare_qfuerza_to_published.py
new file mode 100644
index 00000000..7bcacc4b
--- /dev/null
+++ b/scripts/compare_qfuerza_to_published.py
@@ -0,0 +1,260 @@
+r"""Compare a QFUERZA-optimized FF against its published counterpart.
+
+Emits a per-parameter markdown table grouped by chemical motif
+(metal-involving vs ligand-only) so the QFUERZA-recovery doc can answer
+the user's literal question: "do our params end up near the published
+params, and where they don't, why?".
+
+Usage
+-----
+    python scripts/compare_qfuerza_to_published.py \\
+        --system rh-enamide \\
+        --optimized /path/to/rh-enamide_optimized.fld \\
+        --out /tmp/compare-rh-enamide.md
+
+Outputs markdown to ``--out``.  If ``--out`` is omitted, prints to stdout.
+"""
+
+from __future__ import annotations
+
+import argparse
+import contextlib
+import json
+import sys
+from collections import defaultdict
+from collections.abc import Iterable
+from pathlib import Path
+
+# Make `q2mm` importable when run from the worktree.
+_REPO = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(_REPO))
+
+from q2mm.diagnostics.systems import SYSTEMS  # noqa: E402
+from q2mm.models.forcefield import ForceField  # noqa: E402
+
+
+def _resolve_published_ff_paths(system_key: str) -> list[Path]:
+    """Return the .fld file paths the system's loader points at."""
+    spec = SYSTEMS[system_key]
+    paths: list[Path] = []
+    for _, fn in (spec.ff_paths or {}).items():
+        with contextlib.suppress(Exception):  # pragma: no cover - missing data dir
+            paths.append(Path(fn()))
+    return paths
+
+
+def _load_published(system_key: str) -> ForceField:
+    """Load the published FF for a system, composing if necessary."""
+    spec = SYSTEMS[system_key]
+    paths = _resolve_published_ff_paths(system_key)
+    if not paths:
+        raise FileNotFoundError(f"No FF paths configured for {system_key}")
+
+    if spec.ff_strategy == "published_opt":
+        return ForceField.from_mm3_fld(paths[0], include_standard=False)
+
+    if spec.ff_strategy == "published_opt_composed":
+        # Wahlers: base MM3 + OPT substructure file. For per-parameter
+        # comparison we only need the OPT block — the backbone MM3 is
+        # not optimized. Both the published and optimized .fld files
+        # contain only the OPT block, so load them the same way.
+        spec_paths = spec.ff_paths or {}
+        opt_path = spec_paths["opt_path"]()
+        return ForceField.from_mm3_fld(Path(opt_path), include_standard=False)
+
+    raise ValueError(f"Unsupported ff_strategy for {system_key}: {spec.ff_strategy}")
+
+
+def _bond_motif(elements: tuple[str, ...]) -> str:
+    metals = {"Rh", "Pd", "Os", "Ru"}
+    if any(e in metals for e in elements):
+        return "M-L bond"
+    if all(e == "C" for e in elements):
+        return "C-C bond"
+    if "H" in elements:
+        return "X-H bond"
+    return "ligand bond"
+
+
+def _angle_motif(elements: tuple[str, ...]) -> str:
+    metals = {"Rh", "Pd", "Os", "Ru"}
+    if elements[1] in metals:
+        return "L-M-L angle"
+    if any(e in metals for e in elements):
+        return "M-L-X angle"
+    if all(e == "C" for e in elements):
+        return "C-C-C angle"
+    return "ligand angle"
+
+
+def _diff_row(
+    cat: str,
+    motif: str,
+    label: str,
+    pub: float,
+    opt: float,
+    units: str,
+) -> dict:
+    delta = opt - pub
+    rel = (abs(delta) / abs(pub) * 100.0) if pub != 0 else float("nan")
+    return {
+        "category": cat,
+        "motif": motif,
+        "label": label,
+        "pub": pub,
+        "opt": opt,
+        "abs_dev": delta,
+        "rel_dev_pct": rel,
+        "units": units,
+    }
+
+
+def _iter_diffs(pub: ForceField, opt: ForceField) -> Iterable[dict]:
+    # Bonds
+    for bp, bo in zip(pub.bonds, opt.bonds):
+        if bp.frozen and bo.frozen:
+            continue
+        if bp.key != bo.key:
+            continue
+        motif = _bond_motif(bp.elements)
+        label = "-".join(bp.elements)
+        yield _diff_row("bond_eq", motif, label, bp.equilibrium, bo.equilibrium, "Å")
+        yield _diff_row("bond_fc", motif, label, bp.force_constant, bo.force_constant, "kcal/mol/Å²")
+
+    # Angles
+    for ap, ao in zip(pub.angles, opt.angles):
+        if ap.frozen and ao.frozen:
+            continue
+        if ap.key != ao.key:
+            continue
+        motif = _angle_motif(ap.elements)
+        label = "-".join(ap.elements)
+        yield _diff_row("angle_eq", motif, label, ap.equilibrium, ao.equilibrium, "deg")
+        yield _diff_row("angle_fc", motif, label, ap.force_constant, ao.force_constant, "kcal/mol/rad²")
+
+
+def _summary_table(diffs: list[dict]) -> str:
+    by_cat = defaultdict(list)
+    for d in diffs:
+        by_cat[d["category"]].append(d)
+
+    lines = [
+        "| Category | n | mean |Δ| | max |Δ| | mean rel% | max rel% | units |",
+        "|---|---:|---:|---:|---:|---:|---|",
+    ]
+    for cat in ["bond_eq", "bond_fc", "angle_eq", "angle_fc"]:
+        rows = by_cat.get(cat, [])
+        if not rows:
+            continue
+        absdevs = [abs(r["abs_dev"]) for r in rows]
+        reldevs = [r["rel_dev_pct"] for r in rows if r["rel_dev_pct"] == r["rel_dev_pct"]]
+        units = rows[0]["units"]
+        lines.append(
+            f"| `{cat}` | {len(rows)} | {sum(absdevs) / len(absdevs):.3g} | "
+            f"{max(absdevs):.3g} | "
+            f"{(sum(reldevs) / len(reldevs)) if reldevs else float('nan'):.2f} | "
+            f"{(max(reldevs)) if reldevs else float('nan'):.2f} | {units} |"
+        )
+    return "\n".join(lines)
+
+
+def _top_deviations(diffs: list[dict], n: int = 10) -> str:
+    ranked = sorted(
+        diffs,
+        key=lambda d: d["rel_dev_pct"] if d["rel_dev_pct"] == d["rel_dev_pct"] else 0,
+        reverse=True,
+    )
+    lines = [
+        "| # | Category | Motif | Atoms | Published | QF-Optimized | Abs Δ | Rel Δ% |",
+        "|---:|---|---|---|---:|---:|---:|---:|",
+    ]
+    for i, d in enumerate(ranked[:n], 1):
+        lines.append(
+            f"| {i} | `{d['category']}` | {d['motif']} | `{d['label']}` | "
+            f"{d['pub']:.4g} | {d['opt']:.4g} | {d['abs_dev']:+.4g} | "
+            f"{d['rel_dev_pct']:.1f}% |"
+        )
+    return "\n".join(lines)
+
+
+def _by_motif(diffs: list[dict]) -> str:
+    """Per-motif rollup table — useful for spotting chemical patterns."""
+    by_motif: dict[tuple[str, str], list[dict]] = defaultdict(list)
+    for d in diffs:
+        by_motif[(d["category"], d["motif"])].append(d)
+
+    lines = [
+        "| Category | Motif | n | mean |Δ| | max |Δ| | mean rel% |",
+        "|---|---|---:|---:|---:|---:|",
+    ]
+    for (cat, motif), rows in sorted(by_motif.items()):
+        absdevs = [abs(r["abs_dev"]) for r in rows]
+        reldevs = [r["rel_dev_pct"] for r in rows if r["rel_dev_pct"] == r["rel_dev_pct"]]
+        mean_rel = (sum(reldevs) / len(reldevs)) if reldevs else float("nan")
+        lines.append(
+            f"| `{cat}` | {motif} | {len(rows)} | {sum(absdevs) / len(absdevs):.3g} | "
+            f"{max(absdevs):.3g} | {mean_rel:.2f} |"
+        )
+    return "\n".join(lines)
+
+
+def main() -> int:
+    """Entry point: parse CLI args and emit the per-parameter comparison."""
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--system", required=True, choices=sorted(SYSTEMS))
+    ap.add_argument("--optimized", required=True, type=Path)
+    ap.add_argument("--out", type=Path, default=None)
+    ap.add_argument(
+        "--json-out",
+        type=Path,
+        default=None,
+        help="Also write the raw diff list as JSON.",
+    )
+    args = ap.parse_args()
+
+    pub = _load_published(args.system)
+    opt = ForceField.from_mm3_fld(args.optimized, include_standard=False)
+    diffs = list(_iter_diffs(pub, opt))
+
+    md_parts = [
+        f"## Per-parameter comparison: {args.system}",
+        "",
+        f"_Source: published FF (loader-resolved) vs `{args.optimized.name}`._",
+        "",
+        f"Total comparable active parameter rows: **{len(diffs) // 2}** (each row contributes one eq and one fc cell)",
+        "",
+        "### Summary by category",
+        "",
+        _summary_table(diffs),
+        "",
+        "### Summary by chemical motif",
+        "",
+        _by_motif(diffs),
+        "",
+        "### Top 15 largest relative deviations",
+        "",
+        _top_deviations(diffs, n=15),
+        "",
+    ]
+    md = "\n".join(md_parts)
+
+    if args.out:
+        args.out.parent.mkdir(parents=True, exist_ok=True)
+        args.out.write_text(md, encoding="utf-8")
+        print(f"Wrote {args.out}", file=sys.stderr)
+    else:
+        print(md)
+
+    if args.json_out:
+        args.json_out.parent.mkdir(parents=True, exist_ok=True)
+        args.json_out.write_text(
+            json.dumps(diffs, indent=2),
+            encoding="utf-8",
+        )
+        print(f"Wrote {args.json_out}", file=sys.stderr)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/validation/published_ffs/README.md b/validation/published_ffs/README.md
index 7e3a8958..53a43f17 100644
--- a/validation/published_ffs/README.md
+++ b/validation/published_ffs/README.md
@@ -77,7 +77,7 @@ full MM3 FFs already include the base section and do not need composition.
 | Property | Value |
 |----------|-------|
 | **Paper** | Wahlers, J. et al. *J. Org. Chem.* **2021**, *86*, 5660–5667 |
-| **DOI** | [10.1021/acs.joc.0c02918](https://doi.org/10.1021/acs.joc.0c02918) |
+| **DOI** | [10.1021/acs.joc.1c00136](https://doi.org/10.1021/acs.joc.1c00136) |
 | **System** | Pd-catalyzed 1,4-conjugate addition |
 | **FF type** | OPT-only substructure file (157 lines, 6 OPT blocks) — needs `mm3_base.fld` |
 | **Source** | Wahlers dissertation Ch 5 (supporting information) |

From 14aa27b94bc9702245a0dcedaf84cbcd10662f6e Mon Sep 17 00:00:00 2001
From: Eric Hansen <ericchansen@gmail.com>
Date: Sun, 31 May 2026 14:35:01 -0500
Subject: [PATCH 5/7] fix(qfuerza): address PR #290 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five fixes from copilot-pull-request-reviewer on the QFUERZA-recovery
PR plus the test-core (3.10) CI failure:

1. test/test_systems.py — mark
   `test_qfuerza_is_noop_for_qfuerza_fresh_strategy` with
   `@pytest.mark.jax` so the core suite (which excludes JAX) skips it
   rather than failing the unconditional `from JaxEngine` import. Fixes
   the test-core CI failure introduced when JaxEngine was added to the
   test in PR #290.

2. test/test_systems.py — use `np.testing.assert_array_equal` for the
   frozen-scalar bit-identical check; the previous `np.allclose(...,
   atol=0.0)` still applied the default `rtol=1e-05`, so the test would
   have admitted a relative drift of up to 0.001%. The intent is true
   bit-identity for backbone params; the new assertion enforces it.

3. q2mm/diagnostics/systems.py — `load_system` now raises
   `ValueError` when `starting_point` is anything other than
   `"published"` or `"qfuerza"`. The `StartingPoint = Literal[...]`
   annotation is a static hint only; without a runtime guard a typo
   like `"qferza"` silently went through the `published` branch and
   produced misleading results.

4. q2mm/diagnostics/systems.py — refactor `_audit_starting_point` to
   derive per-scalar type labels from `ForceField._PARAM_SLOTS` (via a
   new `_build_param_type_labels` helper) instead of hand-rolling the
   collection/attr enumeration. Any future change to the parameter
   vector layout is now reflected automatically; unknown attrs raise
   `KeyError` rather than silently producing wrong labels. The
   Urey-Bradley tail still has to be appended explicitly because
   `_ub_angles` is a property, not in `_PARAM_SLOTS`.

5. scripts/regenerate_convergence_results.py — reflow the docstring so
   the `` `--starting-point published` `` and `` `--starting-point
   qfuerza` `` inline-code spans no longer break across newlines.

Plus a new `test_unknown_starting_point_raises` test for #3.

Validation:
- `ruff check q2mm/ test/ scripts/` — clean
- `ruff format --check q2mm test scripts examples` — clean
- `pytest test/test_systems.py -x -q -m "not (openmm or tinker or jax or jax_md or psi4)"`
  — 10 passed, 1 deselected (the jax-marked test)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 q2mm/diagnostics/systems.py               | 54 ++++++++++++++++-------
 scripts/regenerate_convergence_results.py | 10 +++--
 test/test_systems.py                      |  8 +++-
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/q2mm/diagnostics/systems.py b/q2mm/diagnostics/systems.py
index 8e9b84eb..6045bdaf 100644
--- a/q2mm/diagnostics/systems.py
+++ b/q2mm/diagnostics/systems.py
@@ -510,6 +510,41 @@ class SystemSpec:
 """
 
 
+def _build_param_type_labels(ff: ForceField) -> list[str]:
+    """Generate per-scalar type labels matching :meth:`ForceField.get_param_vector` layout.
+
+    Derived from :data:`ForceField._PARAM_SLOTS` (plus the Urey-Bradley tail)
+    so that any future change to the parameter vector layout will be reflected
+    here automatically.  Unknown collection or slot attribute names raise
+    ``KeyError`` rather than silently producing wrong labels.
+
+    Labels follow the convention ``{singular_collection}_{short_attr}``
+    (e.g. ``"bond_fc"``, ``"vdw_radius"``, ``"ub_eq"``).
+    """
+    collection_prefix = {
+        "bonds": "bond",
+        "angles": "angle",
+        "torsions": "torsion",
+        "stretch_bends": "stretch_bend",
+        "vdws": "vdw",
+    }
+    attr_short = {
+        "force_constant": "fc",
+        "equilibrium": "eq",
+        "radius": "radius",
+        "epsilon": "epsilon",
+    }
+    labels: list[str] = []
+    for collection_attr, slot_attrs in ff._PARAM_SLOTS:  # noqa: SLF001 — schema is the source of truth
+        prefix = collection_prefix[collection_attr]
+        per_item = [f"{prefix}_{attr_short[slot]}" for slot in slot_attrs]
+        for _ in getattr(ff, collection_attr):
+            labels.extend(per_item)
+    # Urey-Bradley tail mirrors the ordering inside ForceField.get_param_vector().
+    labels.extend(["ub_fc", "ub_eq"] * len(ff._ub_angles))  # noqa: SLF001 — schema is the source of truth
+    return labels
+
+
 def _audit_starting_point(
     ff: ForceField,
     *,
@@ -541,21 +576,7 @@ def _audit_starting_point(
     after_vec = ff.get_param_vector()
     active = ff.active_mask
 
-    type_labels: list[str] = []
-    for bond in ff.bonds:
-        type_labels.extend(["bond_fc", "bond_eq"])
-        del bond
-    for angle in ff.angles:
-        type_labels.extend(["angle_fc", "angle_eq"])
-        del angle
-    type_labels.extend(["torsion_fc"] * len(ff.torsions))
-    type_labels.extend(["stretch_bend_fc"] * len(ff.stretch_bends))
-    for vdw in ff.vdws:
-        type_labels.extend(["vdw_radius", "vdw_epsilon"])
-        del vdw
-    for ub in ff._ub_angles:  # noqa: SLF001 — diagnostic
-        type_labels.extend(["ub_fc", "ub_eq"])
-        del ub
+    type_labels = _build_param_type_labels(ff)
 
     if len(type_labels) != len(after_vec):
         raise AssertionError(f"type label / param vector length mismatch: {len(type_labels)} vs {len(after_vec)}")
@@ -638,6 +659,7 @@ def load_system(
     Raises:
         KeyError: If *key* is not in :data:`SYSTEMS`.
         TypeError: If *engine* is required but not provided.
+        ValueError: If *starting_point* is not one of ``"published"`` or ``"qfuerza"``.
 
     """
     from q2mm.models import loaders
@@ -646,6 +668,8 @@ def load_system(
 
     if key not in SYSTEMS:
         raise KeyError(f"Unknown system {key!r}; available: {sorted(SYSTEMS)}")
+    if starting_point not in ("published", "qfuerza"):
+        raise ValueError(f"Unknown starting_point {starting_point!r}; must be one of: 'published', 'qfuerza'")
     spec = SYSTEMS[key]
 
     # 1. Molecules ---------------------------------------------------------
diff --git a/scripts/regenerate_convergence_results.py b/scripts/regenerate_convergence_results.py
index 39de6d08..590eb46d 100644
--- a/scripts/regenerate_convergence_results.py
+++ b/scripts/regenerate_convergence_results.py
@@ -17,9 +17,13 @@
    improvement percentage, and whether the mean change exceeds the summed
    confidence intervals.
 
-Outputs (per system, into ``<output-dir>/<system-data-dir>/<subdir>/``,
-where ``<subdir>`` is ``convergence`` for the default ``--starting-point
-published`` and ``from-qfuerza`` for ``--starting-point qfuerza``):
+Outputs (per system) live under
+``<output-dir>/<system-data-dir>/<subdir>/`` where ``<subdir>`` is:
+
+- ``convergence`` (default, for ``--starting-point published``)
+- ``from-qfuerza`` (for ``--starting-point qfuerza``)
+
+Per-system files:
 
 - ``validation_results.json`` — summary numbers for the system (strict JSON,
   no ``Infinity`` or ``NaN``).  Ratio state is encoded across three keys:
diff --git a/test/test_systems.py b/test/test_systems.py
index 98d4e856..0b106ade 100644
--- a/test/test_systems.py
+++ b/test/test_systems.py
@@ -128,7 +128,7 @@ def test_qfuerza_overwrites_opt_but_not_frozen(self) -> None:
         # Frozen partition is identical
         assert np.array_equal(mask, sd_q.forcefield.active_mask)
         # Frozen scalars are bit-identical (no QFUERZA leakage into backbone)
-        assert np.allclose(vec_pub[~mask], vec_q[~mask], atol=0.0)
+        np.testing.assert_array_equal(vec_pub[~mask], vec_q[~mask])
         # At least some active scalars changed (QFUERZA did something)
         assert np.any(vec_pub[mask] != vec_q[mask])
 
@@ -162,6 +162,7 @@ def test_audit_classifies_scalars_consistently(self) -> None:
                 f"{ptype}: frozen={bucket['frozen']} <= overwritten={bucket['qfuerza_overwritten']}"
             )
 
+    @pytest.mark.jax
     def test_qfuerza_is_noop_for_qfuerza_fresh_strategy(self) -> None:
         """For CH3F (qfuerza_fresh), starting_point='qfuerza' is a no-op."""
         from q2mm.backends.mm.jax_engine import JaxEngine
@@ -180,3 +181,8 @@ def test_qfuerza_is_noop_for_qfuerza_fresh_strategy(self) -> None:
         assert audit["starting_point"] == "qfuerza"
         for bucket in audit["by_type"].values():
             assert bucket["qfuerza_overwritten"] == 0
+
+    def test_unknown_starting_point_raises(self) -> None:
+        """Typos in ``starting_point`` must raise rather than silently passing through."""
+        with pytest.raises(ValueError, match="Unknown starting_point"):
+            systems.load_system("ch3f", starting_point="qferza")  # type: ignore[arg-type]

From 1f11ee75a64b1e1a0b16e9a2c80fe96ac4e367e1 Mon Sep 17 00:00:00 2001
From: Eric Hansen <ericchansen@gmail.com>
Date: Sun, 31 May 2026 14:43:19 -0500
Subject: [PATCH 6/7] fix(qfuerza): address PR #290 review feedback (round 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five additional findings from copilot-pull-request-reviewer on the
guardrails commit e81cf00.

1. q2mm/models/forcefield.py — guard `get_fractional_bounds()` against
   degenerate `lo >= hi` returns when the current param value lies
   outside the `DEFAULT_BOUNDS` sanity envelope. The naive
   `max(sanity_lo, val − f·|val|), min(sanity_hi, val + f·|val|)`
   formula can flip `lo > hi` for vdw_epsilon/ub_k/bond_k values
   outside the envelope (SciPy would then reject the bounds). Fall
   back to sanity bounds when the intersection is empty so L-BFGS-B
   can pull the value back into a physical region.

   New test `test_fractional_bounds_value_outside_sanity_falls_back`
   covers the bond_k = 5000 case (sanity envelope ±3600).

2-4. .copilot/skills/q2mm-benchmark/SKILL.md — the skill referenced
   CLI flags `--bounds-fraction-fc/--bounds-fraction-eq/--factr`, but
   the script added in this PR exposes `--fc-fraction/--eq-fraction/--ftol`.
   Following the skill would have failed with "unrecognized arguments"
   for any future agent. Updated all four occurrences (Step 2 prose,
   pre-flight checklist, Step 4 example command, quick-reference Q&A)
   to match the real CLI.

5. docs/benchmarks/qfuerza-recovery.md — the fractional-bounds formula
   was shown as `fc ∈ [fc₀·(1−f), fc₀·(1+f)]`, which is only correct
   for positive `fc₀`. The implementation is sign-aware (`val ± f·|val|`),
   which matters for TSFF negative force constants. Updated the doc to
   describe the actual sign-aware bound.

Validation:
- `ruff check q2mm/ test/ scripts/` — clean
- `ruff format --check q2mm test scripts examples` — clean
- `pytest test/test_models.py -k fractional_bounds` — 6/6 passed

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .copilot/skills/q2mm-benchmark/SKILL.md | 16 ++++++++--------
 docs/benchmarks/qfuerza-recovery.md     | 17 ++++++++++-------
 q2mm/models/forcefield.py               | 11 ++++++++++-
 test/test_models.py                     | 12 ++++++++++++
 4 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/.copilot/skills/q2mm-benchmark/SKILL.md b/.copilot/skills/q2mm-benchmark/SKILL.md
index 576167ad..c1406d86 100644
--- a/.copilot/skills/q2mm-benchmark/SKILL.md
+++ b/.copilot/skills/q2mm-benchmark/SKILL.md
@@ -40,11 +40,11 @@ For **from-QFUERZA** or any **from-poor-start** runs, sanity bounds let L-BFGS-B
 - Typical: `bounds_fraction_fc=0.20`, `bounds_fraction_eq=0.05`
 - Fragile systems (heck-relay): `bounds_fraction_fc=0.05`, `bounds_fraction_eq=0.02`
 
-Pass via `--bounds-fraction-fc` / `--bounds-fraction-eq` CLI flags on `scripts/regenerate_convergence_results.py`.
+Pass via `--fc-fraction` / `--eq-fraction` CLI flags on `scripts/regenerate_convergence_results.py`.
 
 ### Convergence tolerance (`scipy_opt.py` → `_run_minimize`)
 
-Default L-BFGS-B `factr=1e7` is very loose — `nfev` will often be ≤ 5 with no real optimization. Override with `--factr 1e2` (or tighter) for any run where you actually want the optimizer to work.
+The script default L-BFGS-B `ftol=1e-8` is loose for from-poor-start runs — `nfev` will often be ≤ 5 with no real optimization. Override with `--ftol 1e-12` (or tighter) for any run where you actually want the optimizer to work.
 
 ### Ratio gate (`scipy_opt.py` → ratio check)
 
@@ -58,7 +58,7 @@ Walk through this LITERALLY before running. Each item must be checked.
 
 - [ ] Goal restated as a success spec (Step 1)
 - [ ] Bounds strategy chosen and matches starting-point quality (Step 2)
-- [ ] `factr` / `gtol` tight enough for real optimization
+- [ ] `ftol` / `gtol` tight enough for real optimization
 - [ ] Per-system overrides documented if any system needs special handling
 - [ ] GPU verified: `python -c "import jax; print(jax.devices())"` shows CudaDevice
 - [ ] Output directory chosen (e.g., `q2mm-data/benchmarks/<system>/from-qfuerza/`)
@@ -72,9 +72,9 @@ Do NOT launch all systems in a batch. Run **only the first system**:
 PYTHONPATH=/path/to/worktree python scripts/regenerate_convergence_results.py \
     --system <first-system> \
     --starting-point <published|qfuerza> \
-    --factr 1e2 \
-    --bounds-fraction-fc 0.20 \
-    --bounds-fraction-eq 0.05 \
+    --ftol 1e-12 \
+    --fc-fraction 0.20 \
+    --eq-fraction 0.05 \
     --ratio-tol <value> \
     --output-dir /path/to/q2mm-data/benchmarks
 ```
@@ -103,7 +103,7 @@ print("Optimized R²: ", r["optimized"])
 - Optimized R² ≥ Seminario R² on each metric (the run didn't make things worse)
 
 **Fail criteria** (stop immediately if any holds):
-- `n_evaluations ≤ 2` AND `|improvement_pct| < 1` → **optimizer did not optimize**. Likely `factr` too loose or bounds too wide. Do NOT launch the remaining systems. Diagnose and re-run.
+- `n_evaluations ≤ 2` AND `|improvement_pct| < 1` → **optimizer did not optimize**. Likely `ftol` too loose or bounds too wide. Do NOT launch the remaining systems. Diagnose and re-run.
 - `ratio > 100` AND `improvement_pct < 0` → JaxLoss surrogate diverged AND the optimizer made the FF worse. Diagnose: tighter bounds, FC clamping, different starting point.
 - Optimized R² < Seminario R² → the run degraded the FF. Diagnose before continuing.
 
@@ -124,7 +124,7 @@ If a benchmark batch ends with multiple systems exiting at `n_evals=2`, that's a
 
 - "How do I know if the optimizer actually optimized?" → check `n_evaluations` and real OF delta
 - "Should I use sanity bounds or fractional bounds?" → sanity for from-published, fractional for from-QFUERZA
-- "Why does `nfev=2` happen on every system?" → default `factr` is 1e7, way too loose; use `--factr 1e2`
+- "Why does `nfev=2` happen on every system?" → default `ftol` is 1e-8, way too loose for from-poor-start; use `--ftol 1e-12`
 - "Heck-relay's ratio is 1e74, is that OK?" → no, the JaxLoss surrogate exploded; document honestly and consider tighter bounds or FF pre-conditioning
 - "Should I just bypass the ratio gate with `--ratio-tol -1`?" → only if you understand why it's failing; ratio gate exists for a reason
 
diff --git a/docs/benchmarks/qfuerza-recovery.md b/docs/benchmarks/qfuerza-recovery.md
index 58f01c56..1df6c6d2 100644
--- a/docs/benchmarks/qfuerza-recovery.md
+++ b/docs/benchmarks/qfuerza-recovery.md
@@ -392,13 +392,16 @@ The `--ratio-tol -1` flag bypasses the JaxLoss/ObjectiveFunction ratio
 gate (which would otherwise reject all 5 TS systems at the QFUERZA
 start because the surrogate is poorly aligned).
 
-The `--ftol 1e-12` flag overrides the loose default SciPy
-`factr=1e7 × eps_mch ≈ 2 × 10⁻⁹` that previously let the optimizer
-exit after one line search step on most TS systems.
-
-The `--fc-fraction`/`--eq-fraction` flags set fractional bounds
-(`fc ∈ [fc₀·(1−f), fc₀·(1+f)]`) instead of the default physical-sanity
-bounds. Without them, L-BFGS-B will escape the QFUERZA basin entirely.
+The `--ftol 1e-12` flag overrides the script's default `ftol=1e-8` that
+previously let the optimizer exit after one line search step on most
+TS systems.
+
+The `--fc-fraction`/`--eq-fraction` flags set sign-aware fractional
+bounds — each parameter `v` is clamped to
+`[v − f·|v|, v + f·|v|]` (intersected with the physical-sanity
+`DEFAULT_BOUNDS`), so TSFF negative force constants are bounded
+symmetrically around their actual sign. Without them, L-BFGS-B will
+escape the QFUERZA basin entirely.
 
 The output `validation_results.json` includes a `starting_point_audit`
 block enumerating which OPT rows were overwritten by QFUERZA vs
diff --git a/q2mm/models/forcefield.py b/q2mm/models/forcefield.py
index bd238d03..c356b9a7 100644
--- a/q2mm/models/forcefield.py
+++ b/q2mm/models/forcefield.py
@@ -630,7 +630,16 @@ def get_fractional_bounds(
             window = frac * abs(val)
             lo = max(sanity_lo, val - window)
             hi = min(sanity_hi, val + window)
-            bounds.append((lo, hi))
+            if lo >= hi:
+                # Intersection is empty/degenerate: the current value lies
+                # outside the sanity envelope (or against an edge), so a
+                # symmetric ±frac window around it doesn't fit inside
+                # DEFAULT_BOUNDS. Fall back to sanity bounds — keeps L-BFGS-B
+                # well-defined and lets it pull the value back into a
+                # physical region.
+                bounds.append((sanity_lo, sanity_hi))
+            else:
+                bounds.append((lo, hi))
         return bounds
 
     # --- Parameter matching with ff_row → env_id → element fallback ---
diff --git a/test/test_models.py b/test/test_models.py
index e059d38c..b9e13890 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -487,6 +487,18 @@ def test_fractional_bounds_none_is_get_bounds(self) -> None:
         )
         assert ff.get_fractional_bounds(None, None) == ff.get_bounds()
 
+    def test_fractional_bounds_value_outside_sanity_falls_back(self) -> None:
+        """Param values outside DEFAULT_BOUNDS get sanity bounds, not degenerate (lo >= hi)."""
+        # bond_k sanity = (-3600, 3600); val = 5000 is outside the envelope.
+        # Naive: window = 0.20 * 5000 = 1000 → lo = max(-3600, 4000) = 4000,
+        # hi = min(3600, 6000) = 3600 → lo > hi (degenerate). Guard must catch this.
+        ff = ForceField(bonds=[BondParam(("C", "F"), 1.38, 5000.0)])
+        bounds = ff.get_fractional_bounds(fc_fraction=0.20, eq_fraction=0.05)
+        bond_k_lo, bond_k_hi = bounds[0]
+        assert bond_k_lo < bond_k_hi, "bounds must not be degenerate"
+        assert bond_k_lo == pytest.approx(-3600.0)
+        assert bond_k_hi == pytest.approx(3600.0)
+
     def test_torsion_in_param_vector(self) -> None:
         """Torsion force constants appear in param vector after bonds/angles."""
         ff = ForceField(

From c3e69b364f91332d4a0ffaf4eebe888f832275f8 Mon Sep 17 00:00:00 2001
From: Eric Hansen <ericchansen@gmail.com>
Date: Sun, 31 May 2026 19:09:24 -0500
Subject: [PATCH 7/7] feat(systems)!: default starting_point to qfuerza
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flip the canonical default for `load_system()` and
`regenerate_convergence_results.py --starting-point` from `"published"`
to `"qfuerza"`, and rename the canonical output subdirectory
accordingly (`convergence/` now holds QFUERZA-start runs;
`from-published/` holds the opt-in publication-baseline runs).

Reframe the QFUERZA-recovery doc, AGENTS.md §6 / §11, and the
q2mm-benchmark / q2mm-analysis-design skills around the
QFUERZA-as-defined story: QFUERZA fills in 100% of the bond/angle
scalars it is defined to estimate, given a force-field skeleton
(atom types, OPT topology, frozen/active partition, vdW/SB defaults)
that the chemist provides via a `.fld` file. This is true for every
QFUERZA workflow on every system — there is no `.fld`-free path
because those decisions are chemistry calls a tool can't make.

Document known limitations honestly: 3 of the 5 TS systems
(rh-conjugate, pd-conjugate, heck-relay) don't converge cleanly from
QFUERZA with default bounds yet; the doc lists the per-system
workarounds (`--fc-fraction 0.05` or `--starting-point published`).

Also:
- Add a `--starting-point` flag to `q2mm-benchmark` CLI so the user
  CLI exposes the same lever as the regeneration script.
- Pin `scripts/compare_opt_rows.py` to `starting_point="published"`
  (the script's whole purpose is to diff the publication baseline
  against an optimizer's output, so it must load literature values).
- Pin `test/integration/test_heck_validation.py` to
  `starting_point="published"` (the test asserts bit-identical OPT
  values against the literature `.fld`).
- Replace `test_default_starting_point_is_published` with
  `test_default_starting_point_is_qfuerza`; add a separate test that
  pins `starting_point="published"` and verifies zero QFUERZA
  overwrite.

BREAKING CHANGE: `starting_point` now defaults to `"qfuerza"` on
`load_system()` and `--starting-point qfuerza` on
`regenerate_convergence_results.py`. Callers that depended on the
publication-baseline path must pass `starting_point="published"`
(loader) or `--starting-point published` (CLI). The output
subdirectory naming also flips: canonical default now writes to
`convergence/`; opt-in publication-baseline writes to
`from-published/`.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .copilot/skills/q2mm-analysis-design/SKILL.md |   2 +-
 .copilot/skills/q2mm-benchmark/SKILL.md       |  15 +-
 AGENTS.md                                     |  26 +++-
 docs/benchmarks/qfuerza-recovery.md           | 129 +++++++++++-------
 q2mm/diagnostics/cli.py                       |  18 +++
 q2mm/diagnostics/systems.py                   |  56 ++++----
 q2mm/models/forcefield.py                     |   7 +-
 scripts/build_qfuerza_recovery_tables.py      |  10 +-
 scripts/compare_opt_rows.py                   |   5 +-
 scripts/regenerate_convergence_results.py     |  28 ++--
 test/integration/test_heck_validation.py      |   6 +-
 test/test_systems.py                          |  35 +++--
 12 files changed, 221 insertions(+), 116 deletions(-)

diff --git a/.copilot/skills/q2mm-analysis-design/SKILL.md b/.copilot/skills/q2mm-analysis-design/SKILL.md
index c751f023..b27ae91e 100644
--- a/.copilot/skills/q2mm-analysis-design/SKILL.md
+++ b/.copilot/skills/q2mm-analysis-design/SKILL.md
@@ -25,7 +25,7 @@ Enumerate what will exist when the doc is done. For each artifact, write:
 - **Source**: where the data comes from (JSON path, paper DOI, computed from FFs, etc.)
 
 Example:
-- *Table 3.1*: per-system R² comparison. Columns: published-paper R², q2mm-from-published R², q2mm-from-QFUERZA R². Rows: bond length, bond angle, eigenvalue diagonal. Source: `q2mm-data/benchmarks/<system>/{convergence,from-qfuerza}/validation_results.json` for q2mm; published-paper PDFs for paper R².
+- *Table 3.1*: per-system R² comparison. Columns: published-paper R², q2mm-from-published R², q2mm-from-QFUERZA R². Rows: bond length, bond angle, eigenvalue diagonal. Source: `q2mm-data/benchmarks/<system>/{convergence,from-published}/validation_results.json` for q2mm (`convergence/` = canonical QFUERZA-start default; `from-published/` = opt-in publication-baseline); published-paper PDFs for paper R².
 - *Table 3.2*: per-parameter abs deviation. Columns: param-id, published value, QFUERZA-optimized value, abs deviation, % deviation, chemical motif. Source: parsed from `<system>_optimized.fld` files.
 - *Paragraph 4.1*: physical-chemistry walkthrough of the 5 largest bond-length deviations in rh-enamide. Source: synthesis from Table 3.2 + chemistry knowledge.
 
diff --git a/.copilot/skills/q2mm-benchmark/SKILL.md b/.copilot/skills/q2mm-benchmark/SKILL.md
index c1406d86..018cfd3d 100644
--- a/.copilot/skills/q2mm-benchmark/SKILL.md
+++ b/.copilot/skills/q2mm-benchmark/SKILL.md
@@ -34,12 +34,12 @@ The defaults are **physical sanity bounds**, not "stay near starting FF":
 - `bond_eq: (0.5, 3.0)` Å
 - `angle_eq: (30, 180)` deg
 
-For **from-published-OPT** runs, sanity bounds are usually fine (starting FF is already good).
-
-For **from-QFUERZA** or any **from-poor-start** runs, sanity bounds let L-BFGS-B escape the QFUERZA basin into a worse local minimum. Use fractional bounds around the starting value:
+For **canonical-default QFUERZA-start** runs (the default; or any from-poor-start run), sanity bounds let L-BFGS-B escape the QFUERZA basin into a worse local minimum. Use fractional bounds around the starting value:
 - Typical: `bounds_fraction_fc=0.20`, `bounds_fraction_eq=0.05`
 - Fragile systems (heck-relay): `bounds_fraction_fc=0.05`, `bounds_fraction_eq=0.02`
 
+For **publication-baseline (`--starting-point published`)** runs, sanity bounds are usually fine — the starting FF is already in the published basin.
+
 Pass via `--fc-fraction` / `--eq-fraction` CLI flags on `scripts/regenerate_convergence_results.py`.
 
 ### Convergence tolerance (`scipy_opt.py` → `_run_minimize`)
@@ -61,7 +61,7 @@ Walk through this LITERALLY before running. Each item must be checked.
 - [ ] `ftol` / `gtol` tight enough for real optimization
 - [ ] Per-system overrides documented if any system needs special handling
 - [ ] GPU verified: `python -c "import jax; print(jax.devices())"` shows CudaDevice
-- [ ] Output directory chosen (e.g., `q2mm-data/benchmarks/<system>/from-qfuerza/`)
+- [ ] Output directory chosen (`q2mm-data/benchmarks/<system>/convergence/` for the canonical QFUERZA-start default; `q2mm-data/benchmarks/<system>/from-published/` for opt-in publication-baseline runs)
 - [ ] PYTHONPATH set if running from a worktree (editable install points to master)
 
 ## Step 4 — Run the FIRST system in isolation
@@ -69,9 +69,10 @@ Walk through this LITERALLY before running. Each item must be checked.
 Do NOT launch all systems in a batch. Run **only the first system**:
 
 ```bash
+# Canonical default is --starting-point qfuerza; pass --starting-point
+# published only when reproducing publication-baseline benchmarks.
 PYTHONPATH=/path/to/worktree python scripts/regenerate_convergence_results.py \
     --system <first-system> \
-    --starting-point <published|qfuerza> \
     --ftol 1e-12 \
     --fc-fraction 0.20 \
     --eq-fraction 0.05 \
@@ -85,7 +86,7 @@ After the first system completes, inspect `<output-dir>/validation_results.json`
 
 ```python
 import json
-with open("<first-system>/from-qfuerza/validation_results.json") as f:
+with open("<first-system>/convergence/validation_results.json") as f:
     r = json.load(f)["result"]
 print("n_evaluations:", r["n_evaluations"])
 print("n_iterations:", r["n_iterations"])
@@ -123,7 +124,7 @@ If a benchmark batch ends with multiple systems exiting at `n_evals=2`, that's a
 ## Quick reference: things that should make you stop and ask
 
 - "How do I know if the optimizer actually optimized?" → check `n_evaluations` and real OF delta
-- "Should I use sanity bounds or fractional bounds?" → sanity for from-published, fractional for from-QFUERZA
+- "Should I use sanity bounds or fractional bounds?" → fractional for the canonical QFUERZA-start default; sanity is fine for `--starting-point published` runs
 - "Why does `nfev=2` happen on every system?" → default `ftol` is 1e-8, way too loose for from-poor-start; use `--ftol 1e-12`
 - "Heck-relay's ratio is 1e74, is that OK?" → no, the JaxLoss surrogate exploded; document honestly and consider tighter bounds or FF pre-conditioning
 - "Should I just bypass the ratio gate with `--ratio-tol -1`?" → only if you understand why it's failing; ratio gate exists for a reason
diff --git a/AGENTS.md b/AGENTS.md
index fa3e7885..3efb754d 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -252,6 +252,28 @@ fields, but **not** for TS publication reproduction. The papers use a
 multi-target penalty (geometry + Hessian + charges + energies), not
 frequency RMSD.
 
+### Starting Point (`starting_point` kwarg / `--starting-point` CLI flag)
+
+`load_system()` and `regenerate_convergence_results.py` accept a
+`starting_point` of either `"qfuerza"` (canonical default) or
+`"published"`:
+
+- **`"qfuerza"` (default, Farrugia 2025)** — the FF skeleton (atom types,
+  OPT-substructure topology, frozen/active partition, vdW, stretch-bend)
+  comes from the literature `.fld` file; QFUERZA fills in the bond and
+  angle force constants and equilibria from the QM Hessian. This is the
+  standard QFUERZA workflow: the chemist provides the skeleton (no tool
+  can automate the decisions of which atom types to use or which
+  substructure rows need OPT parameters); QFUERZA fills in the
+  Hessian-derivable scalars.
+- **`"published"`** — the literature OPT values are retained verbatim;
+  no QFUERZA overwrite. Pass this to reproduce historical
+  publication-baseline runs.
+
+Output subdirectory of the CLI follows the same naming:
+`convergence/` for the canonical default (qfuerza); `from-published/`
+for the publication-baseline path.
+
 ### TS Curvature Inversion
 
 All TS system loaders **must** pass `invert_ts_curvature=True` to
@@ -466,8 +488,8 @@ Many hours of GPU time have been wasted on batches where the optimizer never act
    - If the user asked a specific scientific question (e.g. "do params end up near published?"), restate it verbatim and map each metric back to the question.
 
 2. **Sanity-check optimizer config** for the starting point:
-   - From published FF baseline: `ftol=1e-8`, sanity bounds → fine.
-   - From poor start (QFUERZA, random defaults): use `--ftol 1e-12` and `--fc-fraction 0.20 --eq-fraction 0.05` (or `--fc-fraction 0.05` for heck-relay).
+   - Canonical default (QFUERZA-start, `starting_point="qfuerza"`): use `--ftol 1e-12` and `--fc-fraction 0.20 --eq-fraction 0.05` (or `--fc-fraction 0.05` for heck-relay).
+   - Publication-baseline (`--starting-point published`): `ftol=1e-8`, sanity bounds → fine (starting FF is already in the published basin).
    - Always pass `--ratio-tol none` for TS systems (ratios are 0.1–0.4).
 
 3. **Verify GPU + device** (§4 + §7): `python -c "import jax; print(jax.devices())"` → must show `CudaDevice`.
diff --git a/docs/benchmarks/qfuerza-recovery.md b/docs/benchmarks/qfuerza-recovery.md
index 1df6c6d2..c3f92d98 100644
--- a/docs/benchmarks/qfuerza-recovery.md
+++ b/docs/benchmarks/qfuerza-recovery.md
@@ -19,34 +19,39 @@ Seminario starts).
 
 ## 1. What this experiment actually tests
 
-> **This is *not* a from-scratch FF generation.**
-
-QFUERZA-recovery starts from the **published force field topology** —
-which OPT rows exist, which parameters are frozen vs active, the
-atom-type rows, vdW radii/epsilons, stretch-bend coefficients, and
-torsion phase information. It then **overwrites** the bond and angle
-*values* (force constants and equilibria) with QFUERZA-derived values
-computed from the per-molecule QM Hessian, following the Farrugia 2025
-protocol ([10.1021/acs.jctc.5c01751](https://doi.org/10.1021/acs.jctc.5c01751)).
-
-| Layer | QFUERZA-recovery run uses |
-|---|---|
-| OPT row topology (which atom-type triples/quadruples appear) | **Published** |
-| Frozen vs active partition (`freeze_standard_params`) | **Published** |
-| Bond/angle *equilibria* and *force constants* | **QFUERZA** (multi-molecule mean of per-mol QFUERZA, TS-inverted) |
-| Torsion `V₁/V₂/V₃` | Published-zero (Farrugia zeros torsions at QFUERZA-init time; for our TS systems the published OPT torsions are already zero, so the values coincide) |
-| van der Waals `r₀`, `ε` | **Published** (QFUERZA does not touch vdW) |
-| Stretch-bend, MM3 backbone | **Published** (frozen) |
-| Reference data (geometries, eigenmatrix, charges) | Identical to published-start runs |
-| Optimizer | SciPy L-BFGS-B + JaxLoss analytical gradient, `--ratio-tol -1` |
-
-The **per-system overwrite count** is reported in §3 below — only 16–25%
-of active parameters are actually replaced.
-
-A *true* from-scratch run would need a QFUERZA-only loader path
-(`qfuerza_fresh` strategy) that builds the OPT topology from scratch,
-not by overwriting published rows. The current implementation uses the
-published `ff_strategy` for everything except the bond/angle scalars.
+**QFUERZA workflow (canonical default)**: QFUERZA (Farrugia 2025,
+[10.1021/acs.jctc.5c01751](https://doi.org/10.1021/acs.jctc.5c01751))
+is defined as a tool for estimating the bond and angle force constants
+and equilibria of an OPT substructure from a QM Hessian, given an
+existing force-field skeleton. The chemist provides the skeleton — atom
+types, OPT-row topology, frozen/active partition, vdW radii/ε,
+stretch-bend cross-term coefficients, torsion phase information — via a
+`.fld` file; QFUERZA fills in the Hessian-derivable scalars. This is
+true for every QFUERZA workflow on every system, including any new
+ligand or substrate a user wants to parameterize. There is no
+`.fld`-free flow because those topology and atom-type decisions are
+chemistry calls that cannot be automated.
+
+For our 5 TS systems, the literature `.fld` files
+(Donoghue 2008, Rosales 2020, Wahlers 2022) encode the chemists'
+skeleton decisions; we use them verbatim and let QFUERZA fill in 100%
+of the bond/angle scalars it is defined to estimate.
+
+| Layer | Source | Why |
+|---|---|---|
+| OPT row topology (which atom-type triples/quadruples appear) | Literature `.fld` | Chemistry decision: which substructure rows need a custom OPT entry |
+| Frozen vs active partition (`freeze_standard_params`) | Literature `.fld` | Chemistry decision: which parameters can be re-derived per-system |
+| Atom-type rows, MM3 backbone | Literature `.fld` (frozen) | Standard MM3 library |
+| Bond/angle *equilibria* and *force constants* | **QFUERZA** (per-mol projection of TS-inverted QM Hessian, multi-mol mean) | Hessian-derivable scalars — QFUERZA's defined scope |
+| Torsion `V₁/V₂/V₃` | Zero | Per Farrugia 2025, torsions are intentionally zeroed at QFUERZA-init time |
+| van der Waals `r₀`, `ε`, stretch-bend coefficients | Literature `.fld` | Outside QFUERZA's defined scope; supplied by skeleton |
+| Reference data (geometries, eigenmatrix, charges) | Identical to publication-baseline runs | — |
+| Optimizer | SciPy L-BFGS-B + JaxLoss analytical gradient, `--ratio-tol -1` | — |
+
+The **per-system overwrite count** in §3.5 reports how many active OPT
+scalars QFUERZA touches: this is the count of bond/angle parameters in
+the skeleton, by definition. Torsions, vdW, and stretch-bend rows are
+not in QFUERZA's scope and stay at the literature values.
 
 ---
 
@@ -75,7 +80,7 @@ All runs: WSL2 + RTX 5090, SciPy L-BFGS-B + JaxLoss `jac='auto'`,
 **tight L-BFGS-B convergence** (`ftol=1e-12`, replacing the loose
 SciPy default that previously let the optimizer exit after one line
 search step).
-Data: [`q2mm-data/benchmarks/<system>/from-qfuerza/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
+Data: [`q2mm-data/benchmarks/<system>/convergence/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks) (canonical QFUERZA-start default).
 
 ### 3.1 Headline: QFUERZA-start vs published-start objective scores
 
@@ -148,7 +153,7 @@ Generated by [`scripts/compare_opt_rows.py`](https://github.com/ericchansen/q2mm
 Both files are round-tripped through `ForceField.to_mm3_fld()` first so
 atom-type tokens and OPT-row ordering match exactly. Per-system markdown
 tables are committed alongside the artifacts at
-[`benchmarks/<system>/from-qfuerza/per_param_comparison.md`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
+[`benchmarks/<system>/convergence/per_param_comparison.md`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
 
 **Mean and max absolute deviation by parameter category**:
 
@@ -195,14 +200,15 @@ cross the sign boundary without violating its box constraint.
 > recovery question. Force constants should have a positive sign
 > constraint regardless of the fractional bounds. Tracking issue: [TBD].
 
-### 3.5 Audit of QFUERZA-overwritten vs published-retained rows
+### 3.5 Audit of QFUERZA-overwritten vs literature-retained rows
 
 For each system we report which OPT-substructure rows had their bond
-and angle values overwritten by QFUERZA (vs retained from the
-published FF). See `starting_point_audit` in each
-`validation_results.json`. Only **17–33% of active parameters** are
-actually replaced; the rest (mostly torsions, vdW, stretch-bend
-coefficients) come from the published FF.
+and angle scalars filled in by QFUERZA (vs retained from the literature
+`.fld` — these are torsions, vdW, stretch-bend, and Urey-Bradley rows
+that are outside QFUERZA's defined scope). See `starting_point_audit`
+in each `validation_results.json`. Across the 5 systems, QFUERZA fills
+in **17–33% of active parameters** (corresponding to 100% of the
+bond/angle scalars in each system's skeleton).
 
 ---
 
@@ -210,7 +216,7 @@ coefficients) come from the published FF.
 
 This section interprets the largest QFUERZA-vs-published deviations
 through a chemical lens. The full per-row tables live in
-[`benchmarks/<system>/from-qfuerza/per_param_comparison.md`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
+[`benchmarks/<system>/convergence/per_param_comparison.md`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks).
 
 ### 4.1 rh-enamide — cleanest recovery
 
@@ -345,10 +351,24 @@ from any physical basin for fractional bounds around it to make sense.
    QFUERZA `fc` outliers (|fc| > 10) with literature averages before
    L-BFGS-B sees them; or use a robust loss (Huber) instead of
    squared-error for the first 20 optimizer steps.
-4. **True `qfuerza_fresh` loader** that doesn't depend on the
-   published OPT topology, so the experiment becomes a real
-   "from-scratch FF generation" test rather than "overwrite the
-   scalars on a known-good topology".
+
+### Known limitations
+
+QFUERZA is now the canonical default (`starting_point="qfuerza"` on
+`load_system`; `--starting-point qfuerza` on
+`regenerate_convergence_results.py`). Three of the five TS systems do
+**not** converge cleanly from QFUERZA with default bounds:
+
+| System | Symptom | Workaround |
+|---|---|---|
+| rh-conjugate | Ratio 3.49× vs publication baseline; negative `fc` (§3.4) | `--fc-fraction 0.05 --eq-fraction 0.05` (tighter bounds) or `--starting-point published` |
+| pd-conjugate | Ratio 1.14× vs publication baseline | `--fc-fraction 0.05 --eq-fraction 0.05` or `--starting-point published` |
+| heck-relay | JaxLoss surrogate diverges to ~10⁸; L-BFGS-B exits in 0 iters (§4.5) | `--starting-point published` (pre-conditioning fix tracked as future work; see "What to do next") |
+
+The `--starting-point published` opt-out path retains the literature
+OPT values verbatim and reproduces the historical
+publication-baseline benchmarks (`from-published/` artifacts in
+q2mm-data).
 
 ---
 
@@ -359,20 +379,23 @@ from any physical basin for fractional bounds around it to make sense.
 source /home/eric/repos/q2mm/.venv/bin/activate
 python -c "import jax; print(jax.devices())"   # must show CudaDevice
 
-# 2. Run a single system (recommended bounds for non-heck-relay TS)
+# 2. Run a single system (canonical default = QFUERZA; recommended
+#    bounds for non-heck-relay TS).  --starting-point qfuerza is the
+#    default and can be omitted.
 python scripts/regenerate_convergence_results.py \
     --system rh-enamide \
-    --starting-point qfuerza \
     --ratio-tol -1 \
     --ftol 1e-12 \
     --fc-fraction 0.20 \
     --eq-fraction 0.05 \
     --output-dir /path/to/q2mm-data/benchmarks
 
-# 3. Heck-relay: use tighter fc bounds per AGENTS.md
+# 3. Heck-relay or any 3/5-divergence system: see "Known limitations"
+#    in §5 — fall back to the publication-baseline path
 python scripts/regenerate_convergence_results.py \
-    --system heck-relay --starting-point qfuerza --ratio-tol -1 \
-    --ftol 1e-12 --fc-fraction 0.05 --eq-fraction 0.05 \
+    --system heck-relay \
+    --starting-point published \
+    --ratio-tol -1 \
     --output-dir /path/to/q2mm-data/benchmarks
 
 # 4. Generate the cross-system R²/RMSD table
@@ -384,10 +407,16 @@ python scripts/build_qfuerza_recovery_tables.py \
 # 5. Generate per-parameter comparison (per system)
 python scripts/compare_opt_rows.py \
     --system rh-enamide \
-    --optimized /path/to/q2mm-data/benchmarks/rh-enamide/from-qfuerza/rh-enamide_optimized.fld \
+    --optimized /path/to/q2mm-data/benchmarks/rh-enamide/convergence/rh-enamide_optimized.fld \
     --out compare-rh-enamide.md
 ```
 
+The QFUERZA-start artifacts live in
+[`q2mm-data/benchmarks/<system>/convergence/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks)
+(canonical default path). The publication-baseline artifacts live in
+[`q2mm-data/benchmarks/<system>/from-published/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks)
+(opt-in `--starting-point published` path).
+
 The `--ratio-tol -1` flag bypasses the JaxLoss/ObjectiveFunction ratio
 gate (which would otherwise reject all 5 TS systems at the QFUERZA
 start because the surrogate is poorly aligned).
@@ -404,8 +433,8 @@ symmetrically around their actual sign. Without them, L-BFGS-B will
 escape the QFUERZA basin entirely.
 
 The output `validation_results.json` includes a `starting_point_audit`
-block enumerating which OPT rows were overwritten by QFUERZA vs
-retained from the published FF.
+block enumerating which OPT rows were filled in by QFUERZA vs retained
+from the literature `.fld`.
 
 ---
 
@@ -420,6 +449,6 @@ retained from the published FF.
 - Pre-flight checklist: [`AGENTS.md` §11](https://github.com/ericchansen/q2mm/blob/main/AGENTS.md)
 - Process skills: [`.copilot/skills/q2mm-benchmark/SKILL.md`](https://github.com/ericchansen/q2mm/blob/main/.copilot/skills/q2mm-benchmark/SKILL.md), [`.copilot/skills/q2mm-analysis-design/SKILL.md`](https://github.com/ericchansen/q2mm/blob/main/.copilot/skills/q2mm-analysis-design/SKILL.md)
 - Tests: [`test/test_systems.py`](https://github.com/ericchansen/q2mm/blob/main/test/test_systems.py) (`TestStartingPoint`), [`test/test_models.py`](https://github.com/ericchansen/q2mm/blob/main/test/test_models.py) (`TestGetFractionalBounds`)
-- Data: [`q2mm-data/benchmarks/<system>/from-qfuerza/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks)
+- Data: [`q2mm-data/benchmarks/<system>/convergence/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks) (canonical QFUERZA-start default); [`q2mm-data/benchmarks/<system>/from-published/`](https://github.com/ericchansen/q2mm-data/tree/main/benchmarks) (opt-in publication baseline)
 - Method paper: Farrugia, Helquist, Norrby & Wiest, *J. Chem. Theory Comput.* **2025**, 22, 469. [10.1021/acs.jctc.5c01751](https://doi.org/10.1021/acs.jctc.5c01751)
 - Related: [QFUERZA Validation](qfuerza-validation.md) — starting-FF quality across all systems.
diff --git a/q2mm/diagnostics/cli.py b/q2mm/diagnostics/cli.py
index 4b4abdf1..1c687746 100644
--- a/q2mm/diagnostics/cli.py
+++ b/q2mm/diagnostics/cli.py
@@ -149,6 +149,7 @@ def _run_matrix(
     platform: str | None = None,
     system_key: str = "ch3f",
     max_iter: int | None = None,
+    starting_point: str = "qfuerza",
 ) -> list:
     """Run the full backend × form × optimizer matrix.
 
@@ -173,6 +174,10 @@ def _run_matrix(
             ``"rh-enamide"``).
         max_iter (int | None): Maximum optimizer iterations.  ``None``
             lets each optimizer use its own default.
+        starting_point (str): Starting FF parameter values; one of
+            ``"qfuerza"`` (canonical default, Farrugia 2025) or
+            ``"published"`` (literature OPT values verbatim).  No-op for
+            ``ch3f`` (qfuerza_fresh strategy).
 
     Returns:
         list[BenchmarkResult]: One result per (backend, form, optimizer)
@@ -235,6 +240,7 @@ def _run_matrix(
                     engine=engine,
                     functional_form=form_value,
                     molecule_loader_kwargs=molecule_loader_kwargs,
+                    starting_point=starting_point,
                 )
             except Exception as e:
                 print(
@@ -626,6 +632,17 @@ def main(argv: list[str] | None = None) -> int:
         default=2000,
         help="Optax optimizer: maximum gradient steps (default: 2000).",
     )
+    parser.add_argument(
+        "--starting-point",
+        choices=("published", "qfuerza"),
+        default="qfuerza",
+        help="Starting force-field parameters for TS systems. 'qfuerza' "
+        "(canonical default) overwrites OPT bond/angle scalars with QFUERZA "
+        "(Farrugia 2025) Hessian-derived values; 'published' retains the "
+        "literature OPT values verbatim — pass this to reproduce historical "
+        "publication-baseline benchmarks. No-op for ch3f (qfuerza_fresh "
+        "strategy, which is already QFUERZA-derived).",
+    )
 
     args = parser.parse_args(argv)
 
@@ -766,6 +783,7 @@ def main(argv: list[str] | None = None) -> int:
             platform=args.platform,
             system_key=args.system,
             max_iter=args.max_iter,
+            starting_point=args.starting_point,
         )
 
     if not results:
diff --git a/q2mm/diagnostics/systems.py b/q2mm/diagnostics/systems.py
index 6045bdaf..574dd9a1 100644
--- a/q2mm/diagnostics/systems.py
+++ b/q2mm/diagnostics/systems.py
@@ -493,17 +493,23 @@ class SystemSpec:
 StartingPoint = Literal["published", "qfuerza"]
 """Choice of starting-parameter values for the optimizer.
 
-- ``"published"`` (default): use the literature OPT values from the
-  published .fld file(s) as-is.  This is the historical Q2MM workflow
-  for all published-FF benchmark systems.
-- ``"qfuerza"``: take the published OPT *topology* (which atom-type
-  rows exist, the frozen/active partition) but overwrite OPT bond and
-  angle values with QFUERZA Hessian-derived estimates averaged across
-  the training molecules (Farrugia 2025 protocol).  Frozen MM3
-  backbone parameters are untouched; torsions are zeroed; OPT
-  parameters that :func:`qfuerza_into` does not estimate (stretch-bends,
-  vdW, Urey-Bradley) retain their published values — the audit in
-  :func:`load_system` records this explicitly.
+- ``"qfuerza"`` (default, canonical): take the FF skeleton from the
+  ``.fld`` file (atom-type rows, OPT-substructure topology, frozen/
+  active partition, vdW, stretch-bend, Urey-Bradley) and overwrite the
+  OPT bond and angle scalars with QFUERZA Hessian-derived estimates
+  averaged across the training molecules (Farrugia 2025 protocol).
+  This is the standard QFUERZA workflow as defined in the literature:
+  the chemist provides the FF skeleton (no tool can automate the
+  decisions of which atom types to use or which substructure rows
+  need OPT parameters); QFUERZA fills in the Hessian-derivable
+  scalars.  Frozen MM3 backbone parameters are untouched; torsions
+  are zeroed; OPT parameters that :func:`qfuerza_into` does not
+  estimate (stretch-bends, vdW, Urey-Bradley) retain their literature
+  values — the audit in :func:`load_system` records this explicitly.
+- ``"published"``: use the literature OPT values from the ``.fld``
+  file(s) as-is, with no QFUERZA overwrite.  This is the
+  publication-baseline path used to reproduce historical convergence
+  runs.
 
 For ``qfuerza_fresh`` strategy (CH3F), ``"qfuerza"`` is a no-op
 because the FF is already QFUERZA-derived; the audit records this.
@@ -554,13 +560,14 @@ def _audit_starting_point(
     """Classify every scalar param as qfuerza/retained_published/frozen.
 
     Honest accounting of where the starting values come from.  For
-    ``starting_point="published"`` everything active is
-    ``retained_published``.  For ``"qfuerza"`` we diff the parameter
-    vector before vs after :func:`qfuerza_into` and call any active
-    scalar whose value changed ``qfuerza_overwritten``; any active
-    scalar whose value did not change is ``retained_published``
+    ``starting_point="qfuerza"`` (canonical default) we diff the
+    parameter vector before vs after :func:`qfuerza_into` and call any
+    active scalar whose value changed ``qfuerza_overwritten``; any
+    active scalar whose value did not change is ``retained_published``
     (e.g. an OPT stretch-bend, an active bond/angle that QFUERZA could
-    not match to any training molecule).
+    not match to any training molecule).  For
+    ``starting_point="published"`` everything active is
+    ``retained_published``.
 
     Args:
         ff: Force field *after* any QFUERZA overwrite.
@@ -614,7 +621,7 @@ def load_system(
     engine: Any | None = None,
     functional_form: str | None = None,
     molecule_loader_kwargs: dict[str, Any] | None = None,
-    starting_point: StartingPoint = "published",
+    starting_point: StartingPoint = "qfuerza",
 ) -> SystemData:
     """Build a :class:`SystemData` for one benchmark system.
 
@@ -641,13 +648,14 @@ def load_system(
         molecule_loader_kwargs: Optional kwargs forwarded to the
             system's molecule loader (e.g. ``data_dir`` for CH3F).
         starting_point: Where the starting OPT parameter values come
-            from.  See :data:`StartingPoint`.  ``"published"`` is the
-            historical default and preserves backward compatibility.
-            ``"qfuerza"`` overwrites OPT bond/angle values with
+            from.  See :data:`StartingPoint`.  ``"qfuerza"`` is the
+            canonical default: it overwrites OPT bond/angle values with
             multi-molecule QFUERZA estimates after FF assembly (per
-            Farrugia 2025).  CH3F (``qfuerza_fresh`` strategy) treats
-            ``"qfuerza"`` as a no-op since the FF is already QFUERZA-
-            derived.
+            Farrugia 2025).  ``"published"`` retains the literature OPT
+            values verbatim — pass this to reproduce the historical
+            publication-baseline runs.  CH3F (``qfuerza_fresh``
+            strategy) treats ``"qfuerza"`` as a no-op since the FF is
+            already QFUERZA-derived.
 
     Returns:
         A fully-populated :class:`SystemData`.  The
diff --git a/q2mm/models/forcefield.py b/q2mm/models/forcefield.py
index c356b9a7..ec602c44 100644
--- a/q2mm/models/forcefield.py
+++ b/q2mm/models/forcefield.py
@@ -570,9 +570,10 @@ def get_fractional_bounds(
         Unlike :meth:`get_bounds`, which returns physical sanity bounds (e.g.
         ``bond_k ∈ (-3600, 3600)``), this returns ``(val * (1 - frac), val *
         (1 + frac))`` for each parameter, with sign-aware handling. This is the
-        appropriate strategy for **from-poor-start** runs (e.g.
-        ``starting_point="qfuerza"``) where you want the optimizer to refine
-        the starting FF locally instead of escaping the starting basin.
+        appropriate strategy for the canonical QFUERZA-start runs
+        (``starting_point="qfuerza"``, the default) where you want the
+        optimizer to refine the QFUERZA-derived starting FF locally instead of
+        escaping the starting basin.
 
         Force-constant types (``bond_k``, ``angle_k``, ``torsion_k``, ``sb_k``,
         ``vdw_epsilon``, ``ub_k``) use ``fc_fraction``.
diff --git a/scripts/build_qfuerza_recovery_tables.py b/scripts/build_qfuerza_recovery_tables.py
index a8895c85..ebbaa096 100644
--- a/scripts/build_qfuerza_recovery_tables.py
+++ b/scripts/build_qfuerza_recovery_tables.py
@@ -2,9 +2,9 @@
 
 For each TS system, emits a markdown table comparing:
 
-- q2mm starting from QFUERZA (`from-qfuerza/`)
-- q2mm starting from published OPT (`convergence/`)
-- published-paper goodness-of-fit (from `paper_r2.json`, optional)
+- q2mm starting from QFUERZA (canonical default, ``convergence/`` subdir)
+- q2mm starting from published OPT (opt-in, ``from-published/`` subdir)
+- published-paper goodness-of-fit (from ``paper_r2.json``, optional)
 
 Per-system tables cover three metrics: bond_length, bond_angle, eig_diagonal.
 
@@ -140,8 +140,8 @@ def main() -> int:
         "",
     ]
     for sys_dir, sys_short in SYSTEM_ORDER:
-        pub_run = _load_run(args.data_dir, sys_dir, "convergence")
-        qf_run = _load_run(args.data_dir, sys_dir, "from-qfuerza")
+        pub_run = _load_run(args.data_dir, sys_dir, "from-published")
+        qf_run = _load_run(args.data_dir, sys_dir, "convergence")
         paper_r2 = paper_r2_all.get(sys_short)
         parts.append(_system_table(sys_short, pub_run, qf_run, paper_r2))
         parts.append("")
diff --git a/scripts/compare_opt_rows.py b/scripts/compare_opt_rows.py
index 4faddbd7..6acb014e 100644
--- a/scripts/compare_opt_rows.py
+++ b/scripts/compare_opt_rows.py
@@ -255,7 +255,10 @@ def main() -> int:
         _sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
         from q2mm.diagnostics.systems import load_system
 
-        sd = load_system(args.system)
+        # ``compare_opt_rows.py`` exists to diff the published baseline FF
+        # against an optimizer's output, so we must load the literature OPT
+        # values verbatim — not the QFUERZA-derived default.
+        sd = load_system(args.system, starting_point="published")
         published_path = Path(_tempfile.mkstemp(prefix=f"pub-{args.system}-", suffix=".fld")[1])
         sd.forcefield.to_mm3_fld(str(published_path))
     else:
diff --git a/scripts/regenerate_convergence_results.py b/scripts/regenerate_convergence_results.py
index 590eb46d..0a55ef54 100644
--- a/scripts/regenerate_convergence_results.py
+++ b/scripts/regenerate_convergence_results.py
@@ -20,8 +20,8 @@
 Outputs (per system) live under
 ``<output-dir>/<system-data-dir>/<subdir>/`` where ``<subdir>`` is:
 
-- ``convergence`` (default, for ``--starting-point published``)
-- ``from-qfuerza`` (for ``--starting-point qfuerza``)
+- ``convergence`` (canonical default, for ``--starting-point qfuerza``)
+- ``from-published`` (for ``--starting-point published``)
 
 Per-system files:
 
@@ -563,7 +563,7 @@ def process_system(
 
     # ---- Write artifacts -------------------------------------------------
     data_dir_name = DATA_DIR_FOR_SYSTEM.get(system_key, system_key)
-    subdir = "convergence" if starting_point == "published" else f"from-{starting_point}"
+    subdir = "convergence" if starting_point == "qfuerza" else f"from-{starting_point}"
     sys_out = output_dir / data_dir_name / subdir
     sys_out.mkdir(parents=True, exist_ok=True)
 
@@ -645,9 +645,10 @@ def main() -> int:
         type=float,
         default=None,
         help="Fractional bounds for force-constant parameters: each FC is "
-        "clamped to (val ± fc_fraction*|val|). Use for from-poor-start runs "
-        "(e.g. --starting-point qfuerza) to keep the optimizer in the "
-        "starting basin. Recommended: 0.20 (i.e. ±20%%). Omit for sanity bounds.",
+        "clamped to (val ± fc_fraction*|val|). Use for the canonical "
+        "QFUERZA-start runs (the default) to keep the optimizer in the "
+        "QFUERZA starting basin. Recommended: 0.20 (i.e. ±20%%). Omit for "
+        "sanity bounds (appropriate when --starting-point published).",
     )
     parser.add_argument(
         "--eq-fraction",
@@ -671,13 +672,14 @@ def main() -> int:
     parser.add_argument(
         "--starting-point",
         choices=("published", "qfuerza"),
-        default="published",
-        help="Starting force-field parameters. 'published' uses the literature OPT values "
-        "(default, backward compatible). 'qfuerza' overwrites the OPT bond/angle scalars "
-        "with QFUERZA (Farrugia 2025) Hessian-derived values while keeping the published "
-        "topology and frozen MM3 backbone — used for QFUERZA-recovery validation runs. "
-        "Output subdirectory becomes 'from-qfuerza' instead of 'convergence' to avoid "
-        "overwriting baselines.",
+        default="qfuerza",
+        help="Starting force-field parameters. 'qfuerza' (canonical default) "
+        "uses QFUERZA (Farrugia 2025) Hessian-derived bond/angle values on top "
+        "of the published FF skeleton (atom types, OPT topology, frozen MM3 "
+        "backbone, vdW, stretch-bend). 'published' retains the literature OPT "
+        "values verbatim — pass this to reproduce historical publication-baseline "
+        "convergence runs. Output subdirectory is 'convergence' for the "
+        "canonical default and 'from-published' for the publication-baseline path.",
     )
     parser.add_argument(
         "--combined-output",
diff --git a/test/integration/test_heck_validation.py b/test/integration/test_heck_validation.py
index 7d3d8047..d156c61f 100644
--- a/test/integration/test_heck_validation.py
+++ b/test/integration/test_heck_validation.py
@@ -397,7 +397,11 @@ def test_load_heck_relay_preserves_published_opt_values() -> None:
         pytest.skip(f"mm3.FF1.fld not found: {ff_path}")
 
     # Loader output (what users get today).
-    sys_data = load_system("heck-relay")
+    # Pin to ``starting_point="published"`` so this test asserts what it was
+    # written to assert: that the loader returns the literature OPT values
+    # bit-for-bit (no Seminario re-projection).  The default starting point
+    # is ``"qfuerza"``, which intentionally overwrites OPT bond/angle scalars.
+    sys_data = load_system("heck-relay", starting_point="published")
     loader_active = sys_data.forcefield.get_active_param_vector()
 
     # Same .fld file, same active-mask partition, but no Seminario.
diff --git a/test/test_systems.py b/test/test_systems.py
index 0b106ade..3c417a51 100644
--- a/test/test_systems.py
+++ b/test/test_systems.py
@@ -90,30 +90,47 @@ class TestStartingPoint:
     """Regression tests for the ``starting_point`` kwarg on load_system.
 
     Validates the Farrugia 2025 "QFUERZA Hessian-derived values on
-    published topology" workflow:
+    published FF skeleton" workflow:
 
-    1. Published path is unchanged (backward compatibility).
-    2. QFUERZA path overwrites OPT bond/angle values but never touches
+    1. Default path is QFUERZA (the canonical Farrugia 2025 workflow).
+    2. Published path (opt-in via ``starting_point="published"``)
+       retains the literature OPT values verbatim — used to reproduce
+       historical publication-baseline runs.
+    3. QFUERZA path overwrites OPT bond/angle values but never touches
        the frozen MM3 backbone.
-    3. Reference data is identical between the two paths — the
+    4. Reference data is identical between the two paths — the
        starting point should only change the starting parameter values,
        not the optimization target.
-    4. The audit dict honestly reports which scalars came from QFUERZA
+    5. The audit dict honestly reports which scalars came from QFUERZA
        vs which retained their published values (e.g. vdW, unmatched
        bond/angle rows).
 
     """
 
     @pytest.mark.external_data
-    def test_published_path_unchanged(self) -> None:
-        """Default ``starting_point="published"`` preserves the historical FF."""
+    def test_default_starting_point_is_qfuerza(self) -> None:
+        """Default ``starting_point="qfuerza"`` matches the explicit kwarg."""
         sd_default = systems.load_system("rh-enamide")
-        sd_explicit = systems.load_system("rh-enamide", starting_point="published")
+        sd_explicit = systems.load_system("rh-enamide", starting_point="qfuerza")
         assert np.allclose(
             sd_default.forcefield.get_param_vector(),
             sd_explicit.forcefield.get_param_vector(),
         )
-        assert sd_default.metadata["starting_point"] == "published"
+        assert sd_default.metadata["starting_point"] == "qfuerza"
+
+    @pytest.mark.external_data
+    def test_published_path_preserves_literature_values(self) -> None:
+        """``starting_point="published"`` retains literature OPT values verbatim."""
+        sd_pub = systems.load_system("rh-enamide", starting_point="published")
+        # Audit should classify every active scalar as retained_published
+        # (no QFUERZA overwrite was performed).
+        audit = sd_pub.metadata["starting_point_audit"]
+        assert audit["starting_point"] == "published"
+        for ptype, bucket in audit["by_type"].items():
+            assert bucket["qfuerza_overwritten"] == 0, (
+                f"published path must not overwrite any scalars, "
+                f"but {ptype} has {bucket['qfuerza_overwritten']} overwrites"
+            )
 
     @pytest.mark.external_data
     def test_qfuerza_overwrites_opt_but_not_frozen(self) -> None: