From 23436e2c758c173ebb4ca495383dde2d428362b9 Mon Sep 17 00:00:00 2001
From: Bill Hlavacek <hlavacek@lanl.gov>
Date: Mon, 11 May 2026 13:51:30 -0600
Subject: [PATCH 1/4] fix: preserve seeded trajectories with connected updates

---
 src/NFcore/moleculeType.cpp                   | 57 +++++++++----------
 src/NFcore/reactionClass.cpp                  | 47 ++++++++++++++-
 src/NFcore/templateMolecule.cpp               | 22 +++++--
 .../transformations/transformationSet.cpp     | 30 ++++------
 validate/validate.py                          | 20 +++++++
 5 files changed, 117 insertions(+), 59 deletions(-)
diff --git a/src/NFcore/moleculeType.cpp b/src/NFcore/moleculeType.cpp
index af59eae3..8d4a6fa0 100644
--- a/src/NFcore/moleculeType.cpp
+++ b/src/NFcore/moleculeType.cpp
@@ -572,35 +572,33 @@ void MoleculeType::updateRxnMembership(Molecule * m)
 
 void MoleculeType::updateConnectedRxnMembership(Molecule * m, ReactionClass * firedReaction)
 {
-	// Replace the iteration over all reactions for the MoleculeType in
-	// MoleculeType::updateRxnMembership by only the
-	// connectedReactions for the fired Reaction. This is a much smaller loop
-	// and skips moleculetypes that are not the TemplateMolecule of the reactant
-	// in the connected reaction right away.
-	// Arvind Rasi Subramaniam
-	//
-	for (int r=0; r<firedReaction->getNumConnectedRxns(); r++) {
-		rxn = firedReaction->getconnectedRxn(r);
-		for (int pos=0; pos<rxn->getNumOfReactants(); pos++) {
-			if (rxn->getMoleculeTypeOfReactantTemplate(pos) != this) continue;
-			double oldA = rxn->get_a();
-			double oldAwithTotal = rxn->update_a();
-			rxn->tryToAdd(m, pos);
-				double newA = rxn->update_a();
-				this->system->update_A_tot(rxn,oldA,newA);
-			// Used for debugging to see which reaction rates changed
-			// upon updating molecule membership
-			// Arvind Rasi Subramaniam Nov 21, 2018
-			if (!this->system->getTrackConnected()) continue;
-			if (oldAwithTotal != newA) {
-				this->system->getConnectedRxnFileStream() <<
-				this->system->getGlobalEventCounter() << "\t" <<
-				firedReaction->getName() << "\t" <<
-						m->getMoleculeTypeName() << "\t" <<
-						m->getUniqueID() << "\t" <<
-						rxn->getName() << "\t" <<
-						oldAwithTotal << "\t" << newA << endl;
-			}
+	// Preserve the MoleculeType's native reaction order so the connectivity path
+	// mutates reactant containers in the same sequence as a full membership
+	// refresh, while still using the precomputed connectivity matrix.
+	for (unsigned int r=0; r<reactions.size(); r++) {
+		rxn = reactions.at(r);
+		if (!this->system->areReactionsConnected(
+				firedReaction->getRxnId(), rxn->getRxnId())) {
+			continue;
+		}
+		int pos = reactionPositions.at(r);
+		double oldA = rxn->get_a();
+		double oldAwithTotal = rxn->update_a();
+		rxn->tryToAdd(m, pos);
+		double newA = rxn->update_a();
+		this->system->update_A_tot(rxn,oldA,newA);
+		// Used for debugging to see which reaction rates changed
+		// upon updating molecule membership
+		// Arvind Rasi Subramaniam Nov 21, 2018
+		if (!this->system->getTrackConnected()) continue;
+		if (oldAwithTotal != newA) {
+			this->system->getConnectedRxnFileStream() <<
+			this->system->getGlobalEventCounter() << "\t" <<
+			firedReaction->getName() << "\t" <<
+					m->getMoleculeTypeName() << "\t" <<
+					m->getUniqueID() << "\t" <<
+					rxn->getName() << "\t" <<
+					oldAwithTotal << "\t" << newA << endl;
 		}
   	}
 }
@@ -828,4 +826,3 @@ void MoleculeType::printDetails() const
 // }
 
 
-
diff --git a/src/NFcore/reactionClass.cpp b/src/NFcore/reactionClass.cpp
index 6cc92f0b..e0e0ac75 100755
--- a/src/NFcore/reactionClass.cpp
+++ b/src/NFcore/reactionClass.cpp
@@ -273,7 +273,19 @@ void ReactionClass::appendConnectedRxn(ReactionClass * rxn) {
 bool ReactionClass::isReactionConnected(ReactionClass * rxn) {
 	// First check if any of the operations share MoleculeType and components with
 	// one of the reactant templates of rxn.
-	return this->transformationSet->checkConnection(rxn);
+	if (this->transformationSet->checkConnection(rxn)) return true;
+
+	// Full membership refresh revisits every explicit reactant/product template in
+	// the fired rule, not only templates that carry direct transformations. Treat
+	// any compatible explicit template as connected so the fast path preserves the
+	// same reachable update set.
+	for (unsigned int i=0; i<allReactantTemplates.size(); i++) {
+		if (rxn->isTemplateCompatible(allReactantTemplates[i])) return true;
+	}
+	for (unsigned int i=0; i<allProductTemplates.size(); i++) {
+		if (rxn->isTemplateCompatible(allProductTemplates[i])) return true;
+	}
+	return false;
 }
 
 ReactionClass::~ReactionClass()
@@ -491,6 +503,30 @@ string ReactionClass::fire(double random_A_number, bool track) {
 	// Add newly created molecules to the list of products
 	this->transformationSet->getListOfAddedMolecules(mappingSet,products,traversalLimit);
 
+	// Track molecules that were explicitly mapped by this firing. If any live
+	// product was reached only through bonded-neighborhood traversal, every product
+	// uses the full updater to keep the non-connectivity membership mutation order.
+	std::unordered_set<Molecule*> directProductSet;
+	for (unsigned int msIndex=0; msIndex<n_mappingsets; msIndex++) {
+		MappingSet *ms = mappingSet[msIndex];
+		if (ms==0) continue;
+		for (unsigned int mapIndex=0; mapIndex<ms->getNumOfMappings(); mapIndex++) {
+			Mapping *mapping = ms->get(mapIndex);
+			if (mapping==0) continue;
+			Molecule *directMol = mapping->getMolecule();
+			if (directMol!=0) directProductSet.insert(directMol);
+		}
+	}
+	bool hasIndirectProducts = false;
+	for (molIter = products.begin(); molIter != products.end(); molIter++) {
+		Molecule *mol = *molIter;
+		if (!mol->isAlive()) continue;
+		if (directProductSet.find(mol)==directProductSet.end()) {
+			hasIndirectProducts = true;
+			break;
+		}
+	}
+
 	// if complex bookkeeping is on, find all product complexes
 	// (this is useful for updating Species Observables and TypeII functions, so keep the info handy).
 	// NOTE: this is a brute force approach: check complex of each molecule. there may be a more
@@ -558,8 +594,13 @@ string ReactionClass::fire(double random_A_number, bool track) {
 		//Update this molcule's reaction membership
 		//  NOTE: as a side-effect, DORreactions that depend on molecule-scoped local functions
 		//   (typeI relationship) will be updated as long as UTL is set appropriately.
-		if ( mol->isAlive() )
-			mol->updateRxnMembership(this, useConnectivity);
+		if ( mol->isAlive() ) {
+			bool useConnectedUpdate =
+				useConnectivity &&
+				!hasIndirectProducts &&
+				directProductSet.find(mol)!=directProductSet.end();
+			mol->updateRxnMembership(this, useConnectedUpdate);
+		}
 	}
 
 	// update complex-scoped local functions for typeII dependencies
diff --git a/src/NFcore/templateMolecule.cpp b/src/NFcore/templateMolecule.cpp
index 89a433c8..5283351c 100644
--- a/src/NFcore/templateMolecule.cpp
+++ b/src/NFcore/templateMolecule.cpp
@@ -2148,22 +2148,32 @@ bool TemplateMolecule::checkSymmetryAroundBond(TemplateMolecule *tm1, TemplateMo
 
 bool TemplateMolecule::isMoleculeTypeAndComponentPresent(MoleculeType * mt, int cIndex) {
 	if (this->getMoleculeType() != mt) return false;
+
+	auto matchesComponent = [mt, cIndex](int templateComponent) {
+		if (templateComponent == cIndex) return true;
+		if (!mt->isEquivalentComponent(templateComponent) ||
+				!mt->isEquivalentComponent(cIndex)) {
+			return false;
+		}
+		return mt->getEquivalenceClassNumber(templateComponent) ==
+				mt->getEquivalenceClassNumber(cIndex);
+	};
 	
 	// First make a joint vector of components specified in the TemplateMolecule
 	for(int i=0; i<n_emptyComps; i++) {
-		if (this->emptyComps[i] == cIndex) return true;
+		if (matchesComponent(this->emptyComps[i])) return true;
 	}
 	for(int i=0; i<n_occupiedComps; i++) {
-		if (this->occupiedComps[i] == cIndex) return true;
+		if (matchesComponent(this->occupiedComps[i])) return true;
 	}
 	for(int i=0; i<n_bonds; i++) {
-		if (this->bondComp[i] == cIndex) return true;
+		if (matchesComponent(this->bondComp[i])) return true;
 	}
 	for(int i=0; i<n_compStateConstraint; i++) {
-		if (this->compStateConstraint_Comp[i] == cIndex) return true;
+		if (matchesComponent(this->compStateConstraint_Comp[i])) return true;
 	}
 	for(int i=0; i<n_compStateExclusion; i++) {
-		if (this->compStateExclusion_Comp[i] == cIndex) return true;
+		if (matchesComponent(this->compStateExclusion_Comp[i])) return true;
 	}
 	for(int c=0; c<n_symComps; c++) {
 		int *molEqComp; int n_molEqComp=0;
@@ -2174,4 +2184,4 @@ bool TemplateMolecule::isMoleculeTypeAndComponentPresent(MoleculeType * mt, int
 	}
 	
 	return false;
-}
\ No newline at end of file
+}
diff --git a/src/NFreactions/transformations/transformationSet.cpp b/src/NFreactions/transformations/transformationSet.cpp
index 7ceb2127..36ffe0f9 100644
--- a/src/NFreactions/transformations/transformationSet.cpp
+++ b/src/NFreactions/transformations/transformationSet.cpp
@@ -1022,26 +1022,20 @@ void TransformationSet::finalize()
 
 bool TransformationSet::checkConnection(ReactionClass * rxn) {
 	TemplateMolecule * t1;
-	MoleculeType * mt1;
 	Transformation * transfn;
-	int c1;
 	for(unsigned int r=0; r<n_reactants; r++) {
 		for (unsigned int i=0; i<transformations[r].size(); i++) {
 			transfn = transformations[r].at(i);
 			t1 = transfn->getTemplateMolecule();
 			if (!t1) continue;
-			mt1 = t1->getMoleculeType();
 			// AS2023 - if this is not a removal, track connections, removal
 			// doesn't give any reaction connections, so skip that
 			if (transfn->getType()!=(int)TransformationFactory::REMOVE) {
-				c1 = transfn->getComponentIndex();
-				// If the moleculetype or component is not present in the other reaction,
-				// it is not connected
-				if (!rxn->areMoleculeTypeAndComponentPresent(mt1, c1)) continue;
-
-				// If the TemplateMolecule is 'incompatible' with any of the reactants
-				// or products, then the reaction is not connected
-				if (!rxn->isTemplateCompatible(t1)) continue;
+				bool isCompatible = rxn->isTemplateCompatible(t1);
+				if (!isCompatible) continue;
+				// Compatibility is now the only check: a full membership refresh
+				// removes/re-adds compatible mappings even when the changed component is
+				// outside the target pattern, which can change ReactantList/ReactantTree ordering.
 				// Both checks passed for one op so return true
 				return true;
 			} else {
@@ -1058,15 +1052,11 @@ bool TransformationSet::checkConnection(ReactionClass * rxn) {
 			if (!t1) continue;
 			t1 = t1->getMappedPartner();
 			if (!t1) continue;
-			mt1 = t1->getMoleculeType();
-			c1 = transfn->getComponentIndex();
-			// If the moleculetype or component is present in the other reaction,
-			// it is not connected
-			if (!rxn->areMoleculeTypeAndComponentPresent(mt1, c1)) continue;
-
-			// If the TemplateMolecule is 'incompatible' with any of the reactants
-			// or products, then the reaction is not connected
-			if (!rxn->isTemplateCompatible(t1)) continue;
+			bool isCompatible = rxn->isTemplateCompatible(t1);
+			if (!isCompatible) continue;
+			// See note above: compatibility alone is enough to require a
+			// connected update if the fast path is to preserve full-update
+			// membership ordering.
 			// Both checks passed for one op so return true
 			return true;
 		}
diff --git a/validate/validate.py b/validate/validate.py
index 47c9b06e..f25464ef 100644
--- a/validate/validate.py
+++ b/validate/validate.py
@@ -5,6 +5,7 @@
 import re
 import fnmatch
 import sys
+import tempfile
 import bionetgen
 
 nIterations=15
@@ -202,6 +203,25 @@ def _run_nfsim(self, outputDirectory, fileNumber, runOptions):
             expect_success=True,
         )
 
+    def test_connectivity_preserves_seeded_tlbr_trajectory(self):
+        xmlPath = os.path.join(nfsimPrePath, 'test', 'tlbr', 'tlbr.xml')
+        with tempfile.TemporaryDirectory() as tmpdir:
+            offPath = os.path.join(tmpdir, 'tlbr_off.gdat')
+            onPath = os.path.join(tmpdir, 'tlbr_on.gdat')
+
+            self._run_nfsim_xml(xmlPath, offPath, '-sim 1 -oSteps 100 -seed 1')
+            self._run_nfsim_xml(xmlPath, onPath, '-sim 1 -oSteps 100 -seed 1 -connect')
+
+            offHeaders, offData = self._load_gdat(offPath)
+            onHeaders, onData = self._load_gdat(onPath)
+
+            self.assertEqual(offHeaders, onHeaders, 'Connectivity regression changed TLBR output columns')
+            self.assertEqual(offData.shape, onData.shape, 'Connectivity regression changed TLBR output shape')
+            self.assertTrue(
+                np.array_equal(offData, onData),
+                'Connectivity regression changed the same-seed TLBR trajectory'
+            )
+
     def test_issue48_ring_unbinding_requires_disconnection(self):
         outputDirectory = mfolder
         fileNumber = '37'

From 124f8b7d7faa36a6ea0d4804420898f790d6bf7b Mon Sep 17 00:00:00 2001
From: Bill Hlavacek <hlavacek@lanl.gov>
Date: Mon, 11 May 2026 14:44:48 -0600
Subject: [PATCH 2/4] fix: narrow connectivity reachability for add-only paths

---
 src/NFcore/reactionClass.cpp    | 15 +++++++++------
 src/NFcore/templateMolecule.cpp | 20 +++++---------------
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/src/NFcore/reactionClass.cpp b/src/NFcore/reactionClass.cpp
index e0e0ac75..b5ed4a0a 100755
--- a/src/NFcore/reactionClass.cpp
+++ b/src/NFcore/reactionClass.cpp
@@ -275,15 +275,18 @@ bool ReactionClass::isReactionConnected(ReactionClass * rxn) {
 	// one of the reactant templates of rxn.
 	if (this->transformationSet->checkConnection(rxn)) return true;
 
-	// Full membership refresh revisits every explicit reactant/product template in
-	// the fired rule, not only templates that carry direct transformations. Treat
-	// any compatible explicit template as connected so the fast path preserves the
-	// same reachable update set.
+	// Full membership refresh revisits every explicit reactant template in the
+	// fired rule, not only templates that carry direct transformations.
 	for (unsigned int i=0; i<allReactantTemplates.size(); i++) {
 		if (rxn->isTemplateCompatible(allReactantTemplates[i])) return true;
 	}
-	for (unsigned int i=0; i<allProductTemplates.size(); i++) {
-		if (rxn->isTemplateCompatible(allProductTemplates[i])) return true;
+
+	// Product templates can also create new compatible mappings. Skip them for
+	// pure-synthesis rules (no reactants), where they over-connect add-only paths.
+	if (n_reactants > 0) {
+		for (unsigned int i=0; i<allProductTemplates.size(); i++) {
+			if (rxn->isTemplateCompatible(allProductTemplates[i])) return true;
+		}
 	}
 	return false;
 }
diff --git a/src/NFcore/templateMolecule.cpp b/src/NFcore/templateMolecule.cpp
index 5283351c..6307f950 100644
--- a/src/NFcore/templateMolecule.cpp
+++ b/src/NFcore/templateMolecule.cpp
@@ -2148,32 +2148,22 @@ bool TemplateMolecule::checkSymmetryAroundBond(TemplateMolecule *tm1, TemplateMo
 
 bool TemplateMolecule::isMoleculeTypeAndComponentPresent(MoleculeType * mt, int cIndex) {
 	if (this->getMoleculeType() != mt) return false;
-
-	auto matchesComponent = [mt, cIndex](int templateComponent) {
-		if (templateComponent == cIndex) return true;
-		if (!mt->isEquivalentComponent(templateComponent) ||
-				!mt->isEquivalentComponent(cIndex)) {
-			return false;
-		}
-		return mt->getEquivalenceClassNumber(templateComponent) ==
-				mt->getEquivalenceClassNumber(cIndex);
-	};
 	
 	// First make a joint vector of components specified in the TemplateMolecule
 	for(int i=0; i<n_emptyComps; i++) {
-		if (matchesComponent(this->emptyComps[i])) return true;
+		if (this->emptyComps[i] == cIndex) return true;
 	}
 	for(int i=0; i<n_occupiedComps; i++) {
-		if (matchesComponent(this->occupiedComps[i])) return true;
+		if (this->occupiedComps[i] == cIndex) return true;
 	}
 	for(int i=0; i<n_bonds; i++) {
-		if (matchesComponent(this->bondComp[i])) return true;
+		if (this->bondComp[i] == cIndex) return true;
 	}
 	for(int i=0; i<n_compStateConstraint; i++) {
-		if (matchesComponent(this->compStateConstraint_Comp[i])) return true;
+		if (this->compStateConstraint_Comp[i] == cIndex) return true;
 	}
 	for(int i=0; i<n_compStateExclusion; i++) {
-		if (matchesComponent(this->compStateExclusion_Comp[i])) return true;
+		if (this->compStateExclusion_Comp[i] == cIndex) return true;
 	}
 	for(int c=0; c<n_symComps; c++) {
 		int *molEqComp; int n_molEqComp=0;

From 69befcd3777fcfc39cf46875deba200aeec2054e Mon Sep 17 00:00:00 2001
From: Bill Hlavacek <hlavacek@lanl.gov>
Date: Mon, 11 May 2026 15:44:25 -0600
Subject: [PATCH 3/4] Add RuleMonkey validation harness and reports

---
 .../basicmodels_smoke_standard.md             |  41 ++++
 .../corpus_smoke_connect.md                   |  44 ++++
 .../corpus_smoke_standard.md                  |  44 ++++
 .../feature_coverage_connect_subset.md        |  34 +++
 .../feature_coverage_standard_subset.md       |  34 +++
 .../tutorial_example_connect.md               |  37 +++
 .../tutorial_example_standard.md              |  37 +++
 validate/rulemonkey_nfsim_driver.py           | 217 ++++++++++++++++++
 8 files changed, 488 insertions(+)
 create mode 100644 validate/results/rulemonkey_harness/basicmodels_smoke_standard.md
 create mode 100644 validate/results/rulemonkey_harness/corpus_smoke_connect.md
 create mode 100644 validate/results/rulemonkey_harness/corpus_smoke_standard.md
 create mode 100644 validate/results/rulemonkey_harness/feature_coverage_connect_subset.md
 create mode 100644 validate/results/rulemonkey_harness/feature_coverage_standard_subset.md
 create mode 100644 validate/results/rulemonkey_harness/tutorial_example_connect.md
 create mode 100644 validate/results/rulemonkey_harness/tutorial_example_standard.md
 create mode 100755 validate/rulemonkey_nfsim_driver.py

diff --git a/validate/results/rulemonkey_harness/basicmodels_smoke_standard.md b/validate/results/rulemonkey_harness/basicmodels_smoke_standard.md
new file mode 100644
index 00000000..0a77bdb4
--- /dev/null
+++ b/validate/results/rulemonkey_harness/basicmodels_smoke_standard.md
@@ -0,0 +1,41 @@
+# RuleMonkey Benchmark Report
+
+**Date:** 2026-05-11
+**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
+**Reps per model:** 10
+**NFsim reference:** 100-rep ensemble
+
+## Correctness
+
+- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
+- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
+- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
+- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
+- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
+
+## Efficiency
+
+- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
+- **rm_s**: RM mean wall time (10-rep average).
+- **ev/s**: SSA events per wall-second (throughput).
+- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
+
+## Results
+
+| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
+|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
+| r01 | — | 1.9 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 10.22 | 12.66 | 5.00 | 2.00 | Xp_free | FAIL |
+| r05 | — | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 7.83 | 8.51 | 5.00 | 1.86 | Complex | FAIL |
+| r20 | — | 2.1 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 10.22 | 12.66 | 5.00 | 2.00 | Xp_free | FAIL |
+| r22 | — | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 6.87 | 10.28 | 5.00 | 1.00 | rib_elong | FAIL |
+| r32 | — | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 6.22 | 9.56 | 5.00 | 2.00 | Mintra | FAIL |
+
+## Summary
+
+| Metric | Count |
+|--------|------:|
+| PASS | 0 |
+| FAIL | 5 |
+| TIMEOUT | 0 |
+| SKIP | 0 |
+| **Total** | **5** |
diff --git a/validate/results/rulemonkey_harness/corpus_smoke_connect.md b/validate/results/rulemonkey_harness/corpus_smoke_connect.md
new file mode 100644
index 00000000..25157067
--- /dev/null
+++ b/validate/results/rulemonkey_harness/corpus_smoke_connect.md
@@ -0,0 +1,44 @@
+# RuleMonkey Benchmark Report
+
+**Date:** 2026-05-11
+**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
+**Reps per model:** 2
+**NFsim reference:** 100-rep ensemble
+
+## Correctness
+
+- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
+- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
+- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
+- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
+- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
+
+## Efficiency
+
+- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
+- **rm_s**: RM mean wall time (2-rep average).
+- **ev/s**: SSA events per wall-second (throughput).
+- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
+
+## Results
+
+| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
+|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
+| Tutorial_Example | 0.0 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 5.54 | 65.07 | 17.67 | 2.24 | A_phos_1 | FAIL |
+| A_plus_A | 0.0 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 217.39 | 2675.74 | 14.52 | 2.00 | AA_1 | FAIL |
+| nfsim_ring_closure_polymer | 0.1 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 1.16 | 9.24 | 12.82 | 1.00 | Linear_Dimers | PASS |
+| ANx_noActivity | 0.2 | 0.4 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 4.00 | 11.86 | 15.18 | 2.00 | RD_R | PASS |
+| isingspin_localfcn | 0.3 | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 2.79 | 8.46 | 16.64 | 2.00 | M_spUp | PASS |
+| BLBR | 0.9 | 0.4 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 95021.47 | 3014.83 | 19.16 | 31.66 | R1 | FAIL |
+| e1 | 3.0 | 2.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 5.25 | 16.73 | 18.30 | 2.00 | Efree | PASS |
+| fceri_ji | 18.9 | 4.0 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 4.35 | 4.43 | 18.27 | 3.40 | RecMon_1 | PASS |
+
+## Summary
+
+| Metric | Count |
+|--------|------:|
+| PASS | 5 |
+| FAIL | 3 |
+| TIMEOUT | 0 |
+| SKIP | 0 |
+| **Total** | **8** |
diff --git a/validate/results/rulemonkey_harness/corpus_smoke_standard.md b/validate/results/rulemonkey_harness/corpus_smoke_standard.md
new file mode 100644
index 00000000..8b36fa83
--- /dev/null
+++ b/validate/results/rulemonkey_harness/corpus_smoke_standard.md
@@ -0,0 +1,44 @@
+# RuleMonkey Benchmark Report
+
+**Date:** 2026-05-11
+**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
+**Reps per model:** 2
+**NFsim reference:** 100-rep ensemble
+
+## Correctness
+
+- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
+- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
+- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
+- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
+- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
+
+## Efficiency
+
+- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
+- **rm_s**: RM mean wall time (2-rep average).
+- **ev/s**: SSA events per wall-second (throughput).
+- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
+
+## Results
+
+| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
+|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
+| Tutorial_Example | 0.0 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 3.69 | 4.04 | 17.67 | 2.00 | R_dim_1 | PASS |
+| A_plus_A | 0.0 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 217.39 | 2675.74 | 14.52 | 2.00 | AA_1 | FAIL |
+| nfsim_ring_closure_polymer | 0.1 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 1.16 | 9.24 | 12.82 | 1.00 | Linear_Dimers | PASS |
+| ANx_noActivity | 0.2 | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 4.00 | 11.86 | 15.18 | 2.00 | RD_R | PASS |
+| isingspin_localfcn | 0.3 | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 2.79 | 8.46 | 16.64 | 2.00 | M_spUp | PASS |
+| BLBR | 0.9 | 0.4 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 95021.47 | 3014.83 | 19.16 | 31.66 | R1 | FAIL |
+| e1 | 3.0 | 2.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 5.25 | 16.73 | 18.30 | 2.00 | Efree | PASS |
+| fceri_ji | 18.9 | 5.7 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 4.35 | 4.43 | 18.27 | 3.40 | RecMon_1 | PASS |
+
+## Summary
+
+| Metric | Count |
+|--------|------:|
+| PASS | 6 |
+| FAIL | 2 |
+| TIMEOUT | 0 |
+| SKIP | 0 |
+| **Total** | **8** |
diff --git a/validate/results/rulemonkey_harness/feature_coverage_connect_subset.md b/validate/results/rulemonkey_harness/feature_coverage_connect_subset.md
new file mode 100644
index 00000000..0f88998d
--- /dev/null
+++ b/validate/results/rulemonkey_harness/feature_coverage_connect_subset.md
@@ -0,0 +1,34 @@
+# Feature Coverage Benchmark Report
+
+Generated: 2026-05-11 15:28:57
+
+**Summary: 1 PASS / 2 FAIL / 0 SKIP**
+
+## Feature Coverage
+
+| Model | Tier | Features | RM vs NFsim | RM vs ODE | Verdict |
+|-------|------|----------|-------------|-----------|--------|
+| combo_synth_degrade_equilibrium | combinations | Feature combination: synthesis + degradation + bin | FAIL | - | **FAIL** |
+| ft_delete_molecules | base | Feature: DeleteMolecules keyword on degradation ru | PASS | - | PASS |
+| ft_multi_product | base | Feature: rule producing multiple new molecules (on | FAIL | - | **FAIL** |
+
+## Detailed Results
+
+### combo_synth_degrade_equilibrium
+- Tier: combinations
+- RM reps: 5, wall time: 0.293s
+- vs NFsim: max_z=17.33 (AB), tz_max=17.22 — **FAIL**
+- **Overall: FAIL**
+
+### ft_delete_molecules
+- Tier: base
+- RM reps: 5, wall time: 0.288s
+- vs NFsim: max_z=3.31 (B_free), tz_max=4.83 — **PASS**
+- **Overall: PASS**
+
+### ft_multi_product
+- Tier: base
+- RM reps: 5, wall time: 0.265s
+- vs NFsim: max_z=25.97 (BC), tz_max=2329.70 — **FAIL**
+- **Overall: FAIL**
+
diff --git a/validate/results/rulemonkey_harness/feature_coverage_standard_subset.md b/validate/results/rulemonkey_harness/feature_coverage_standard_subset.md
new file mode 100644
index 00000000..27756938
--- /dev/null
+++ b/validate/results/rulemonkey_harness/feature_coverage_standard_subset.md
@@ -0,0 +1,34 @@
+# Feature Coverage Benchmark Report
+
+Generated: 2026-05-11 15:29:02
+
+**Summary: 1 PASS / 2 FAIL / 0 SKIP**
+
+## Feature Coverage
+
+| Model | Tier | Features | RM vs NFsim | RM vs ODE | Verdict |
+|-------|------|----------|-------------|-----------|--------|
+| combo_synth_degrade_equilibrium | combinations | Feature combination: synthesis + degradation + bin | FAIL | - | **FAIL** |
+| ft_delete_molecules | base | Feature: DeleteMolecules keyword on degradation ru | PASS | - | PASS |
+| ft_multi_product | base | Feature: rule producing multiple new molecules (on | FAIL | - | **FAIL** |
+
+## Detailed Results
+
+### combo_synth_degrade_equilibrium
+- Tier: combinations
+- RM reps: 5, wall time: 0.318s
+- vs NFsim: max_z=9.79 (A_free), tz_max=5.94 — **FAIL**
+- **Overall: FAIL**
+
+### ft_delete_molecules
+- Tier: base
+- RM reps: 5, wall time: 0.284s
+- vs NFsim: max_z=3.62 (A_total), tz_max=4.82 — **PASS**
+- **Overall: PASS**
+
+### ft_multi_product
+- Tier: base
+- RM reps: 5, wall time: 0.285s
+- vs NFsim: max_z=7.42 (BC), tz_max=516.40 — **FAIL**
+- **Overall: FAIL**
+
diff --git a/validate/results/rulemonkey_harness/tutorial_example_connect.md b/validate/results/rulemonkey_harness/tutorial_example_connect.md
new file mode 100644
index 00000000..4b5809ab
--- /dev/null
+++ b/validate/results/rulemonkey_harness/tutorial_example_connect.md
@@ -0,0 +1,37 @@
+# RuleMonkey Benchmark Report
+
+**Date:** 2026-05-11
+**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
+**Reps per model:** 10
+**NFsim reference:** 100-rep ensemble
+
+## Correctness
+
+- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
+- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
+- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
+- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
+- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
+
+## Efficiency
+
+- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
+- **rm_s**: RM mean wall time (10-rep average).
+- **ev/s**: SSA events per wall-second (throughput).
+- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
+
+## Results
+
+| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
+|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
+| Tutorial_Example | 0.0 | 0.5 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 12.40 | 65.07 | 5.00 | 2.00 | A_phos_1 | FAIL |
+
+## Summary
+
+| Metric | Count |
+|--------|------:|
+| PASS | 0 |
+| FAIL | 1 |
+| TIMEOUT | 0 |
+| SKIP | 0 |
+| **Total** | **1** |
diff --git a/validate/results/rulemonkey_harness/tutorial_example_standard.md b/validate/results/rulemonkey_harness/tutorial_example_standard.md
new file mode 100644
index 00000000..8e3a20d4
--- /dev/null
+++ b/validate/results/rulemonkey_harness/tutorial_example_standard.md
@@ -0,0 +1,37 @@
+# RuleMonkey Benchmark Report
+
+**Date:** 2026-05-11
+**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
+**Reps per model:** 10
+**NFsim reference:** 100-rep ensemble
+
+## Correctness
+
+- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
+- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
+- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
+- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
+- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
+
+## Efficiency
+
+- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
+- **rm_s**: RM mean wall time (10-rep average).
+- **ev/s**: SSA events per wall-second (throughput).
+- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
+
+## Results
+
+| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
+|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
+| Tutorial_Example | 0.0 | 0.5 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 7.52 | 9.14 | 5.00 | 2.00 | R_phos_1 | FAIL |
+
+## Summary
+
+| Metric | Count |
+|--------|------:|
+| PASS | 0 |
+| FAIL | 1 |
+| TIMEOUT | 0 |
+| SKIP | 0 |
+| **Total** | **1** |
diff --git a/validate/rulemonkey_nfsim_driver.py b/validate/rulemonkey_nfsim_driver.py
new file mode 100755
index 00000000..5aa343da
--- /dev/null
+++ b/validate/rulemonkey_nfsim_driver.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""Adapt NFsim CLI to RuleMonkey's rm_driver interface.
+
+This lets RuleMonkey's Python harnesses validate an NFsim binary directly
+against the vendored NFsim reference ensembles.
+
+Expected rm_driver CLI:
+    rulemonkey_nfsim_driver.py <model.xml> <t_end> <n_steps> <seed> [rm_flags...]
+
+Useful environment variables:
+    NFSIM_BIN          Path to NFsim executable (default: <repo>/build/NFsim)
+    NFSIM_SIM_PARAMS   Path to RuleMonkey sim_params.tsv for model-specific flags
+    NFSIM_EXTRA_FLAGS  Extra NFsim CLI flags, e.g. "-connect"
+"""
+
+from __future__ import annotations
+
+import csv
+import os
+import shlex
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+DEFAULT_NFSIM_BIN = REPO_ROOT / "build" / "NFsim"
+
+
+def _usage() -> str:
+    return (
+        "Usage: rulemonkey_nfsim_driver.py <model.xml> <t_end> <n_steps> <seed> "
+        "[rm_flags...]"
+    )
+
+
+def _strip_header_hash(fieldnames: list[str] | None) -> list[str] | None:
+    if fieldnames and fieldnames[0].startswith("#"):
+        fieldnames = list(fieldnames)
+        fieldnames[0] = fieldnames[0].lstrip("#")
+    return fieldnames
+
+
+def _load_model_flags(sim_params_path: Path | None, model_name: str) -> list[str]:
+    if sim_params_path is None or not sim_params_path.exists():
+        return []
+
+    with sim_params_path.open(newline="") as f:
+        reader = csv.DictReader(f, delimiter="\t")
+        reader.fieldnames = _strip_header_hash(reader.fieldnames)
+        for row in reader:
+            model = (row.get("model") or "").strip()
+            if not model or model.startswith("#"):
+                continue
+            if model != model_name:
+                continue
+            raw_flags = (row.get("nfsim_flags") or "").strip()
+            return _normalize_nfsim_flags(shlex.split(raw_flags))
+    return []
+
+
+def _normalize_nfsim_flags(tokens: list[str]) -> list[str]:
+    """Drop flags that the harness already supplies explicitly.
+
+    Keep model-specific behavior flags such as -cb, -bscb, -gml, and -utl.
+    """
+
+    drop_with_value = {
+        "-xml",
+        "-o",
+        "-sim",
+        "-eq",
+        "-oSteps",
+        "-oTimes",
+        "-seed",
+        "-ss",
+        "-rxnlog",
+        "-logbuffer",
+        "-maxcputime",
+    }
+
+    keep_with_value = {
+        "-gml",
+        "-utl",
+    }
+
+    out: list[str] = []
+    i = 0
+    while i < len(tokens):
+        tok = tokens[i]
+        if tok in keep_with_value:
+            if i + 1 < len(tokens):
+                out.extend([tok, tokens[i + 1]])
+            i += 2
+            continue
+        if tok in drop_with_value:
+            i += 2
+            continue
+        out.append(tok)
+        i += 1
+    return out
+
+
+def _dedupe_flags(tokens: list[str]) -> list[str]:
+    """Keep first occurrence of standalone flags; last occurrence of value flags."""
+
+    value_flags = {"-gml", "-utl"}
+    standalone_seen: set[str] = set()
+    value_map: dict[str, str] = {}
+    ordered_values: list[str] = []
+    out: list[str] = []
+
+    i = 0
+    while i < len(tokens):
+        tok = tokens[i]
+        if tok in value_flags:
+            if i + 1 < len(tokens):
+                if tok not in value_map:
+                    ordered_values.append(tok)
+                value_map[tok] = tokens[i + 1]
+            i += 2
+            continue
+        if tok not in standalone_seen:
+            standalone_seen.add(tok)
+            out.append(tok)
+        i += 1
+
+    for tok in ordered_values:
+        out.extend([tok, value_map[tok]])
+    return out
+
+
+def _normalize_gdat_text(raw_text: str) -> str:
+    """Rewrite NFsim gdat text as clean tab-separated output."""
+
+    out_lines: list[str] = []
+    for raw_line in raw_text.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        if line.startswith("#"):
+            parts = line.lstrip("#").strip().split()
+            out_lines.append("#" + "\t".join(parts))
+            continue
+        parts = line.split()
+        out_lines.append("\t".join(parts))
+    return "\n".join(out_lines) + ("\n" if out_lines else "")
+
+
+def main() -> int:
+    if len(sys.argv) < 5:
+        print(_usage(), file=sys.stderr)
+        return 2
+
+    xml_path = Path(sys.argv[1]).resolve()
+    t_end = sys.argv[2]
+    n_steps = sys.argv[3]
+    seed = sys.argv[4]
+    passthrough_flags = sys.argv[5:]
+
+    nfsim_bin = Path(os.environ.get("NFSIM_BIN", str(DEFAULT_NFSIM_BIN))).resolve()
+    sim_params_env = os.environ.get("NFSIM_SIM_PARAMS")
+    sim_params_path = Path(sim_params_env).resolve() if sim_params_env else None
+    extra_flags = shlex.split(os.environ.get("NFSIM_EXTRA_FLAGS", ""))
+
+    if not nfsim_bin.exists():
+        print(f"NFsim binary not found: {nfsim_bin}", file=sys.stderr)
+        return 2
+    if not xml_path.exists():
+        print(f"XML not found: {xml_path}", file=sys.stderr)
+        return 2
+
+    model_name = xml_path.stem
+    model_flags = _load_model_flags(sim_params_path, model_name)
+    merged_flags = _dedupe_flags(model_flags + passthrough_flags + extra_flags)
+
+    with tempfile.TemporaryDirectory(prefix=f"rm_nfsim_{model_name}_") as td_raw:
+        out_gdat = Path(td_raw) / f"{model_name}.gdat"
+        cmd = [
+            str(nfsim_bin),
+            "-xml",
+            str(xml_path),
+            "-sim",
+            str(t_end),
+            "-oSteps",
+            str(n_steps),
+            "-seed",
+            str(seed),
+            *merged_flags,
+            "-o",
+            str(out_gdat),
+        ]
+        result = subprocess.run(
+            cmd,
+            cwd=str(xml_path.parent),
+            capture_output=True,
+            text=True,
+        )
+
+        if result.returncode != 0 or not out_gdat.exists():
+            print("NFsim driver wrapper failed.", file=sys.stderr)
+            print(f"Command: {' '.join(shlex.quote(x) for x in cmd)}", file=sys.stderr)
+            if result.stdout.strip():
+                print("--- stdout ---", file=sys.stderr)
+                print(result.stdout.strip(), file=sys.stderr)
+            if result.stderr.strip():
+                print("--- stderr ---", file=sys.stderr)
+                print(result.stderr.strip(), file=sys.stderr)
+            return 1
+
+        sys.stdout.write(_normalize_gdat_text(out_gdat.read_text()))
+        return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 841c27ef015528ef120b016e19b0698dbd12544a Mon Sep 17 00:00:00 2001
From: Bill Hlavacek <hlavacek@lanl.gov>
Date: Tue, 12 May 2026 11:33:53 -0600
Subject: [PATCH 4/4] test: tighten connectivity regression coverage

---
 .../basicmodels_smoke_standard.md             |  41 ----
 .../corpus_smoke_connect.md                   |  44 ----
 .../corpus_smoke_standard.md                  |  44 ----
 .../feature_coverage_connect_subset.md        |  34 ---
 .../feature_coverage_standard_subset.md       |  34 ---
 .../tutorial_example_connect.md               |  37 ---
 .../tutorial_example_standard.md              |  37 ---
 validate/rulemonkey_nfsim_driver.py           | 217 ------------------
 validate/validate.py                          |  43 +++-
 9 files changed, 33 insertions(+), 498 deletions(-)
 delete mode 100644 validate/results/rulemonkey_harness/basicmodels_smoke_standard.md
 delete mode 100644 validate/results/rulemonkey_harness/corpus_smoke_connect.md
 delete mode 100644 validate/results/rulemonkey_harness/corpus_smoke_standard.md
 delete mode 100644 validate/results/rulemonkey_harness/feature_coverage_connect_subset.md
 delete mode 100644 validate/results/rulemonkey_harness/feature_coverage_standard_subset.md
 delete mode 100644 validate/results/rulemonkey_harness/tutorial_example_connect.md
 delete mode 100644 validate/results/rulemonkey_harness/tutorial_example_standard.md
 delete mode 100755 validate/rulemonkey_nfsim_driver.py

diff --git a/validate/results/rulemonkey_harness/basicmodels_smoke_standard.md b/validate/results/rulemonkey_harness/basicmodels_smoke_standard.md
deleted file mode 100644
index 0a77bdb4..00000000
--- a/validate/results/rulemonkey_harness/basicmodels_smoke_standard.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# RuleMonkey Benchmark Report
-
-**Date:** 2026-05-11
-**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
-**Reps per model:** 10
-**NFsim reference:** 100-rep ensemble
-
-## Correctness
-
-- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
-- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
-- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
-- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
-- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
-
-## Efficiency
-
-- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
-- **rm_s**: RM mean wall time (10-rep average).
-- **ev/s**: SSA events per wall-second (throughput).
-- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
-
-## Results
-
-| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
-|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
-| r01 | — | 1.9 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 10.22 | 12.66 | 5.00 | 2.00 | Xp_free | FAIL |
-| r05 | — | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 7.83 | 8.51 | 5.00 | 1.86 | Complex | FAIL |
-| r20 | — | 2.1 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 10.22 | 12.66 | 5.00 | 2.00 | Xp_free | FAIL |
-| r22 | — | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 6.87 | 10.28 | 5.00 | 1.00 | rib_elong | FAIL |
-| r32 | — | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 6.22 | 9.56 | 5.00 | 2.00 | Mintra | FAIL |
-
-## Summary
-
-| Metric | Count |
-|--------|------:|
-| PASS | 0 |
-| FAIL | 5 |
-| TIMEOUT | 0 |
-| SKIP | 0 |
-| **Total** | **5** |
diff --git a/validate/results/rulemonkey_harness/corpus_smoke_connect.md b/validate/results/rulemonkey_harness/corpus_smoke_connect.md
deleted file mode 100644
index 25157067..00000000
--- a/validate/results/rulemonkey_harness/corpus_smoke_connect.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# RuleMonkey Benchmark Report
-
-**Date:** 2026-05-11
-**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
-**Reps per model:** 2
-**NFsim reference:** 100-rep ensemble
-
-## Correctness
-
-- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
-- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
-- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
-- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
-- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
-
-## Efficiency
-
-- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
-- **rm_s**: RM mean wall time (2-rep average).
-- **ev/s**: SSA events per wall-second (throughput).
-- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
-
-## Results
-
-| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
-|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
-| Tutorial_Example | 0.0 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 5.54 | 65.07 | 17.67 | 2.24 | A_phos_1 | FAIL |
-| A_plus_A | 0.0 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 217.39 | 2675.74 | 14.52 | 2.00 | AA_1 | FAIL |
-| nfsim_ring_closure_polymer | 0.1 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 1.16 | 9.24 | 12.82 | 1.00 | Linear_Dimers | PASS |
-| ANx_noActivity | 0.2 | 0.4 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 4.00 | 11.86 | 15.18 | 2.00 | RD_R | PASS |
-| isingspin_localfcn | 0.3 | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 2.79 | 8.46 | 16.64 | 2.00 | M_spUp | PASS |
-| BLBR | 0.9 | 0.4 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 95021.47 | 3014.83 | 19.16 | 31.66 | R1 | FAIL |
-| e1 | 3.0 | 2.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 5.25 | 16.73 | 18.30 | 2.00 | Efree | PASS |
-| fceri_ji | 18.9 | 4.0 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 4.35 | 4.43 | 18.27 | 3.40 | RecMon_1 | PASS |
-
-## Summary
-
-| Metric | Count |
-|--------|------:|
-| PASS | 5 |
-| FAIL | 3 |
-| TIMEOUT | 0 |
-| SKIP | 0 |
-| **Total** | **8** |
diff --git a/validate/results/rulemonkey_harness/corpus_smoke_standard.md b/validate/results/rulemonkey_harness/corpus_smoke_standard.md
deleted file mode 100644
index 8b36fa83..00000000
--- a/validate/results/rulemonkey_harness/corpus_smoke_standard.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# RuleMonkey Benchmark Report
-
-**Date:** 2026-05-11
-**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
-**Reps per model:** 2
-**NFsim reference:** 100-rep ensemble
-
-## Correctness
-
-- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
-- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
-- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
-- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
-- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
-
-## Efficiency
-
-- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
-- **rm_s**: RM mean wall time (2-rep average).
-- **ev/s**: SSA events per wall-second (throughput).
-- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
-
-## Results
-
-| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
-|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
-| Tutorial_Example | 0.0 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 3.69 | 4.04 | 17.67 | 2.00 | R_dim_1 | PASS |
-| A_plus_A | 0.0 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 217.39 | 2675.74 | 14.52 | 2.00 | AA_1 | FAIL |
-| nfsim_ring_closure_polymer | 0.1 | 0.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 1.16 | 9.24 | 12.82 | 1.00 | Linear_Dimers | PASS |
-| ANx_noActivity | 0.2 | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 4.00 | 11.86 | 15.18 | 2.00 | RD_R | PASS |
-| isingspin_localfcn | 0.3 | 0.3 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 2.79 | 8.46 | 16.64 | 2.00 | M_spUp | PASS |
-| BLBR | 0.9 | 0.4 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 95021.47 | 3014.83 | 19.16 | 31.66 | R1 | FAIL |
-| e1 | 3.0 | 2.2 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 5.25 | 16.73 | 18.30 | 2.00 | Efree | PASS |
-| fceri_ji | 18.9 | 5.7 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 4.35 | 4.43 | 18.27 | 3.40 | RecMon_1 | PASS |
-
-## Summary
-
-| Metric | Count |
-|--------|------:|
-| PASS | 6 |
-| FAIL | 2 |
-| TIMEOUT | 0 |
-| SKIP | 0 |
-| **Total** | **8** |
diff --git a/validate/results/rulemonkey_harness/feature_coverage_connect_subset.md b/validate/results/rulemonkey_harness/feature_coverage_connect_subset.md
deleted file mode 100644
index 0f88998d..00000000
--- a/validate/results/rulemonkey_harness/feature_coverage_connect_subset.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# Feature Coverage Benchmark Report
-
-Generated: 2026-05-11 15:28:57
-
-**Summary: 1 PASS / 2 FAIL / 0 SKIP**
-
-## Feature Coverage
-
-| Model | Tier | Features | RM vs NFsim | RM vs ODE | Verdict |
-|-------|------|----------|-------------|-----------|--------|
-| combo_synth_degrade_equilibrium | combinations | Feature combination: synthesis + degradation + bin | FAIL | - | **FAIL** |
-| ft_delete_molecules | base | Feature: DeleteMolecules keyword on degradation ru | PASS | - | PASS |
-| ft_multi_product | base | Feature: rule producing multiple new molecules (on | FAIL | - | **FAIL** |
-
-## Detailed Results
-
-### combo_synth_degrade_equilibrium
-- Tier: combinations
-- RM reps: 5, wall time: 0.293s
-- vs NFsim: max_z=17.33 (AB), tz_max=17.22 — **FAIL**
-- **Overall: FAIL**
-
-### ft_delete_molecules
-- Tier: base
-- RM reps: 5, wall time: 0.288s
-- vs NFsim: max_z=3.31 (B_free), tz_max=4.83 — **PASS**
-- **Overall: PASS**
-
-### ft_multi_product
-- Tier: base
-- RM reps: 5, wall time: 0.265s
-- vs NFsim: max_z=25.97 (BC), tz_max=2329.70 — **FAIL**
-- **Overall: FAIL**
-
diff --git a/validate/results/rulemonkey_harness/feature_coverage_standard_subset.md b/validate/results/rulemonkey_harness/feature_coverage_standard_subset.md
deleted file mode 100644
index 27756938..00000000
--- a/validate/results/rulemonkey_harness/feature_coverage_standard_subset.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# Feature Coverage Benchmark Report
-
-Generated: 2026-05-11 15:29:02
-
-**Summary: 1 PASS / 2 FAIL / 0 SKIP**
-
-## Feature Coverage
-
-| Model | Tier | Features | RM vs NFsim | RM vs ODE | Verdict |
-|-------|------|----------|-------------|-----------|--------|
-| combo_synth_degrade_equilibrium | combinations | Feature combination: synthesis + degradation + bin | FAIL | - | **FAIL** |
-| ft_delete_molecules | base | Feature: DeleteMolecules keyword on degradation ru | PASS | - | PASS |
-| ft_multi_product | base | Feature: rule producing multiple new molecules (on | FAIL | - | **FAIL** |
-
-## Detailed Results
-
-### combo_synth_degrade_equilibrium
-- Tier: combinations
-- RM reps: 5, wall time: 0.318s
-- vs NFsim: max_z=9.79 (A_free), tz_max=5.94 — **FAIL**
-- **Overall: FAIL**
-
-### ft_delete_molecules
-- Tier: base
-- RM reps: 5, wall time: 0.284s
-- vs NFsim: max_z=3.62 (A_total), tz_max=4.82 — **PASS**
-- **Overall: PASS**
-
-### ft_multi_product
-- Tier: base
-- RM reps: 5, wall time: 0.285s
-- vs NFsim: max_z=7.42 (BC), tz_max=516.40 — **FAIL**
-- **Overall: FAIL**
-
diff --git a/validate/results/rulemonkey_harness/tutorial_example_connect.md b/validate/results/rulemonkey_harness/tutorial_example_connect.md
deleted file mode 100644
index 4b5809ab..00000000
--- a/validate/results/rulemonkey_harness/tutorial_example_connect.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# RuleMonkey Benchmark Report
-
-**Date:** 2026-05-11
-**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
-**Reps per model:** 10
-**NFsim reference:** 100-rep ensemble
-
-## Correctness
-
-- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
-- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
-- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
-- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
-- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
-
-## Efficiency
-
-- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
-- **rm_s**: RM mean wall time (10-rep average).
-- **ev/s**: SSA events per wall-second (throughput).
-- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
-
-## Results
-
-| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
-|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
-| Tutorial_Example | 0.0 | 0.5 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 12.40 | 65.07 | 5.00 | 2.00 | A_phos_1 | FAIL |
-
-## Summary
-
-| Metric | Count |
-|--------|------:|
-| PASS | 0 |
-| FAIL | 1 |
-| TIMEOUT | 0 |
-| SKIP | 0 |
-| **Total** | **1** |
diff --git a/validate/results/rulemonkey_harness/tutorial_example_standard.md b/validate/results/rulemonkey_harness/tutorial_example_standard.md
deleted file mode 100644
index 8e3a20d4..00000000
--- a/validate/results/rulemonkey_harness/tutorial_example_standard.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# RuleMonkey Benchmark Report
-
-**Date:** 2026-05-11
-**Commit:** `6d7f240 docs(readme,quickstart): document the cancellation hook for embedders`
-**Reps per model:** 10
-**NFsim reference:** 100-rep ensemble
-
-## Correctness
-
-- **screen**: max |RM_mean - NFsim_mean| / (NFsim_std / sqrt(n_reps)) over all (time, obs) pairs. Fast early-warning metric. Flags values ≥ 5.0 as "suspicious" but does not determine verdict. Historical benchmarks used this as the pass/fail criterion; for models with many rare-event Size_N observables it is statistically unreliable.
-- **tz_max**: max over observables of the z-score of the per-rep trapezoidal time integral, computed against precomputed NFsim stats in `ensemble/{model}.tint.tsv`. Collapses each 1001-point trajectory to one number per observable per rep, eliminating single-time-point coincidences.
-- **T**: per-model verdict threshold = max(5.0, 1.2 × tz_p99), where tz_p99 is the 99th percentile of `tz_max` from self-splitting the NFsim replicates at n=10 (see `tests/reference/nfsim/noise_floor.tsv`; provenance and regen recipe in the same directory's `PROVENANCE.md`). Adaptive to each model's intrinsic rare-event noise floor.
-- **std_ratio**: max(RM_std / NFsim_std) across observables with nontrivial variance. Diagnostic for variance consistency; not part of verdict.
-- **verdict**: PASS if `tz_max < T`, FAIL otherwise. Degenerate-observable mismatches (both stds zero, values differ) fail unconditionally.
-
-## Efficiency
-
-- **nfsim_s**: NFsim mean wall time (100-rep, from reference campaign).
-- **rm_s**: RM mean wall time (10-rep average).
-- **ev/s**: SSA events per wall-second (throughput).
-- **sel/fire/obs/upd**: Phase breakdown as % of RM engine time.
-
-## Results
-
-| model | nfsim_s | rm_s | events | ev/s | sel% | fire% | obs% | upd% | screen | tz_max | T | std_ratio | worst_obs | verdict |
-|-------|--------:|-----:|-------:|-----:|-----:|------:|-----:|-----:|-------:|-------:|----:|----------:|-----------|---------|
-| Tutorial_Example | 0.0 | 0.5 | — | — | 0.0 | 0.0 | 0.0 | 0.0 | 7.52 | 9.14 | 5.00 | 2.00 | R_phos_1 | FAIL |
-
-## Summary
-
-| Metric | Count |
-|--------|------:|
-| PASS | 0 |
-| FAIL | 1 |
-| TIMEOUT | 0 |
-| SKIP | 0 |
-| **Total** | **1** |
diff --git a/validate/rulemonkey_nfsim_driver.py b/validate/rulemonkey_nfsim_driver.py
deleted file mode 100755
index 5aa343da..00000000
--- a/validate/rulemonkey_nfsim_driver.py
+++ /dev/null
@@ -1,217 +0,0 @@
-#!/usr/bin/env python3
-"""Adapt NFsim CLI to RuleMonkey's rm_driver interface.
-
-This lets RuleMonkey's Python harnesses validate an NFsim binary directly
-against the vendored NFsim reference ensembles.
-
-Expected rm_driver CLI:
-    rulemonkey_nfsim_driver.py <model.xml> <t_end> <n_steps> <seed> [rm_flags...]
-
-Useful environment variables:
-    NFSIM_BIN          Path to NFsim executable (default: <repo>/build/NFsim)
-    NFSIM_SIM_PARAMS   Path to RuleMonkey sim_params.tsv for model-specific flags
-    NFSIM_EXTRA_FLAGS  Extra NFsim CLI flags, e.g. "-connect"
-"""
-
-from __future__ import annotations
-
-import csv
-import os
-import shlex
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-
-
-REPO_ROOT = Path(__file__).resolve().parents[1]
-DEFAULT_NFSIM_BIN = REPO_ROOT / "build" / "NFsim"
-
-
-def _usage() -> str:
-    return (
-        "Usage: rulemonkey_nfsim_driver.py <model.xml> <t_end> <n_steps> <seed> "
-        "[rm_flags...]"
-    )
-
-
-def _strip_header_hash(fieldnames: list[str] | None) -> list[str] | None:
-    if fieldnames and fieldnames[0].startswith("#"):
-        fieldnames = list(fieldnames)
-        fieldnames[0] = fieldnames[0].lstrip("#")
-    return fieldnames
-
-
-def _load_model_flags(sim_params_path: Path | None, model_name: str) -> list[str]:
-    if sim_params_path is None or not sim_params_path.exists():
-        return []
-
-    with sim_params_path.open(newline="") as f:
-        reader = csv.DictReader(f, delimiter="\t")
-        reader.fieldnames = _strip_header_hash(reader.fieldnames)
-        for row in reader:
-            model = (row.get("model") or "").strip()
-            if not model or model.startswith("#"):
-                continue
-            if model != model_name:
-                continue
-            raw_flags = (row.get("nfsim_flags") or "").strip()
-            return _normalize_nfsim_flags(shlex.split(raw_flags))
-    return []
-
-
-def _normalize_nfsim_flags(tokens: list[str]) -> list[str]:
-    """Drop flags that the harness already supplies explicitly.
-
-    Keep model-specific behavior flags such as -cb, -bscb, -gml, and -utl.
-    """
-
-    drop_with_value = {
-        "-xml",
-        "-o",
-        "-sim",
-        "-eq",
-        "-oSteps",
-        "-oTimes",
-        "-seed",
-        "-ss",
-        "-rxnlog",
-        "-logbuffer",
-        "-maxcputime",
-    }
-
-    keep_with_value = {
-        "-gml",
-        "-utl",
-    }
-
-    out: list[str] = []
-    i = 0
-    while i < len(tokens):
-        tok = tokens[i]
-        if tok in keep_with_value:
-            if i + 1 < len(tokens):
-                out.extend([tok, tokens[i + 1]])
-            i += 2
-            continue
-        if tok in drop_with_value:
-            i += 2
-            continue
-        out.append(tok)
-        i += 1
-    return out
-
-
-def _dedupe_flags(tokens: list[str]) -> list[str]:
-    """Keep first occurrence of standalone flags; last occurrence of value flags."""
-
-    value_flags = {"-gml", "-utl"}
-    standalone_seen: set[str] = set()
-    value_map: dict[str, str] = {}
-    ordered_values: list[str] = []
-    out: list[str] = []
-
-    i = 0
-    while i < len(tokens):
-        tok = tokens[i]
-        if tok in value_flags:
-            if i + 1 < len(tokens):
-                if tok not in value_map:
-                    ordered_values.append(tok)
-                value_map[tok] = tokens[i + 1]
-            i += 2
-            continue
-        if tok not in standalone_seen:
-            standalone_seen.add(tok)
-            out.append(tok)
-        i += 1
-
-    for tok in ordered_values:
-        out.extend([tok, value_map[tok]])
-    return out
-
-
-def _normalize_gdat_text(raw_text: str) -> str:
-    """Rewrite NFsim gdat text as clean tab-separated output."""
-
-    out_lines: list[str] = []
-    for raw_line in raw_text.splitlines():
-        line = raw_line.strip()
-        if not line:
-            continue
-        if line.startswith("#"):
-            parts = line.lstrip("#").strip().split()
-            out_lines.append("#" + "\t".join(parts))
-            continue
-        parts = line.split()
-        out_lines.append("\t".join(parts))
-    return "\n".join(out_lines) + ("\n" if out_lines else "")
-
-
-def main() -> int:
-    if len(sys.argv) < 5:
-        print(_usage(), file=sys.stderr)
-        return 2
-
-    xml_path = Path(sys.argv[1]).resolve()
-    t_end = sys.argv[2]
-    n_steps = sys.argv[3]
-    seed = sys.argv[4]
-    passthrough_flags = sys.argv[5:]
-
-    nfsim_bin = Path(os.environ.get("NFSIM_BIN", str(DEFAULT_NFSIM_BIN))).resolve()
-    sim_params_env = os.environ.get("NFSIM_SIM_PARAMS")
-    sim_params_path = Path(sim_params_env).resolve() if sim_params_env else None
-    extra_flags = shlex.split(os.environ.get("NFSIM_EXTRA_FLAGS", ""))
-
-    if not nfsim_bin.exists():
-        print(f"NFsim binary not found: {nfsim_bin}", file=sys.stderr)
-        return 2
-    if not xml_path.exists():
-        print(f"XML not found: {xml_path}", file=sys.stderr)
-        return 2
-
-    model_name = xml_path.stem
-    model_flags = _load_model_flags(sim_params_path, model_name)
-    merged_flags = _dedupe_flags(model_flags + passthrough_flags + extra_flags)
-
-    with tempfile.TemporaryDirectory(prefix=f"rm_nfsim_{model_name}_") as td_raw:
-        out_gdat = Path(td_raw) / f"{model_name}.gdat"
-        cmd = [
-            str(nfsim_bin),
-            "-xml",
-            str(xml_path),
-            "-sim",
-            str(t_end),
-            "-oSteps",
-            str(n_steps),
-            "-seed",
-            str(seed),
-            *merged_flags,
-            "-o",
-            str(out_gdat),
-        ]
-        result = subprocess.run(
-            cmd,
-            cwd=str(xml_path.parent),
-            capture_output=True,
-            text=True,
-        )
-
-        if result.returncode != 0 or not out_gdat.exists():
-            print("NFsim driver wrapper failed.", file=sys.stderr)
-            print(f"Command: {' '.join(shlex.quote(x) for x in cmd)}", file=sys.stderr)
-            if result.stdout.strip():
-                print("--- stdout ---", file=sys.stderr)
-                print(result.stdout.strip(), file=sys.stderr)
-            if result.stderr.strip():
-                print("--- stderr ---", file=sys.stderr)
-                print(result.stderr.strip(), file=sys.stderr)
-            return 1
-
-        sys.stdout.write(_normalize_gdat_text(out_gdat.read_text()))
-        return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/validate/validate.py b/validate/validate.py
index f25464ef..40ab5fad 100644
--- a/validate/validate.py
+++ b/validate/validate.py
@@ -6,7 +6,10 @@
 import fnmatch
 import sys
 import tempfile
-import bionetgen
+try:
+    import bionetgen
+except ImportError:
+    bionetgen = None
 
 nIterations=15
 nfsimPrePath='..'
@@ -172,6 +175,9 @@ def _bng_generate(self, outputDirectory, fileNumber):
             # already generated, no need to rerun BNG
             return
 
+        if bionetgen is None:
+            self.fail('bionetgen Python package is required to generate XML fixtures')
+
         bngFileName = os.path.join(outputDirectory, 'v{0}.bngl'.format(fileNumber))
         bionetgen.run(bngFileName, out=outputDirectory, suppress=True)
 
@@ -203,25 +209,42 @@ def _run_nfsim(self, outputDirectory, fileNumber, runOptions):
             expect_success=True,
         )
 
-    def test_connectivity_preserves_seeded_tlbr_trajectory(self):
-        xmlPath = os.path.join(nfsimPrePath, 'test', 'tlbr', 'tlbr.xml')
+    def _assert_same_seed_connectivity_parity(self, xmlPath, runOptions, label):
         with tempfile.TemporaryDirectory() as tmpdir:
-            offPath = os.path.join(tmpdir, 'tlbr_off.gdat')
-            onPath = os.path.join(tmpdir, 'tlbr_on.gdat')
+            offPath = os.path.join(tmpdir, 'off.gdat')
+            onPath = os.path.join(tmpdir, 'on.gdat')
 
-            self._run_nfsim_xml(xmlPath, offPath, '-sim 1 -oSteps 100 -seed 1')
-            self._run_nfsim_xml(xmlPath, onPath, '-sim 1 -oSteps 100 -seed 1 -connect')
+            connectOptions = f'{runOptions} -connect'.strip()
+            self._run_nfsim_xml(xmlPath, offPath, runOptions)
+            self._run_nfsim_xml(xmlPath, onPath, connectOptions)
 
             offHeaders, offData = self._load_gdat(offPath)
             onHeaders, onData = self._load_gdat(onPath)
 
-            self.assertEqual(offHeaders, onHeaders, 'Connectivity regression changed TLBR output columns')
-            self.assertEqual(offData.shape, onData.shape, 'Connectivity regression changed TLBR output shape')
+            self.assertEqual(offHeaders, onHeaders, f'Connectivity regression changed {label} output columns')
+            self.assertEqual(offData.shape, onData.shape, f'Connectivity regression changed {label} output shape')
             self.assertTrue(
                 np.array_equal(offData, onData),
-                'Connectivity regression changed the same-seed TLBR trajectory'
+                f'Connectivity regression changed the same-seed {label} trajectory'
             )
 
+    def test_connectivity_preserves_seeded_tlbr_trajectory(self):
+        self._assert_same_seed_connectivity_parity(
+            os.path.join(nfsimPrePath, 'test', 'tlbr', 'tlbr.xml'),
+            '-sim 1 -oSteps 100 -seed 1',
+            'TLBR'
+        )
+
+    def test_connectivity_preserves_seeded_local_function_trajectory(self):
+        # testSuite/t3 exercises local-function membership updates on a much
+        # smaller model than AN_chemotaxis while still reproducing the
+        # master-vs-connect divergence fixed by this branch.
+        self._assert_same_seed_connectivity_parity(
+            os.path.join(nfsimPrePath, 'test', 'testSuite', 't3.xml'),
+            '-sim 1 -oSteps 20 -seed 1',
+            'testSuite t3'
+        )
+
     def test_issue48_ring_unbinding_requires_disconnection(self):
         outputDirectory = mfolder
         fileNumber = '37'