From d302dd7b16a87ace881cc8377588d89f6ce774cd Mon Sep 17 00:00:00 2001 From: blocksifrdev Date: Sat, 20 Jun 2026 10:00:24 -0400 Subject: [PATCH 1/3] docs: add index authority receipts for IFC evidence Signed-off-by: blocksifrdev --- .../caif-ifc-index-authority.md | 49 +++++++++++ docs/INDEX_AUTHORITY_RECEIPTS.md | 48 +++++++++++ .../trec-covid-sign-rq2.index-authority.json | 82 +++++++++++++++++++ tools/verify_index_authority.py | 74 +++++++++++++++++ 4 files changed, 253 insertions(+) create mode 100644 .github/pull_request_bodies/caif-ifc-index-authority.md create mode 100644 docs/INDEX_AUTHORITY_RECEIPTS.md create mode 100644 examples/caif/trec-covid-sign-rq2.index-authority.json create mode 100755 tools/verify_index_authority.py diff --git a/.github/pull_request_bodies/caif-ifc-index-authority.md b/.github/pull_request_bodies/caif-ifc-index-authority.md new file mode 100644 index 0000000..ac2b595 --- /dev/null +++ b/.github/pull_request_bodies/caif-ifc-index-authority.md @@ -0,0 +1,49 @@ +## Summary + +Adds an optional CAIF-style Index Authority Receipt for ordvec benchmark evidence. + +The goal is to make ordvec's index-first retrieval evidence machine-readable: quality delta, bytes/vector, latency regime, benchmark scope, limitations, fallback conditions, and a deterministic receipt hash. + +## Why + +ordvec already has a strong index-first compute story: compressed ordinal/sign retrieval can preserve retrieval quality under stated benchmark scopes while reducing storage and latency. + +This PR adds a small evidence packet and verifier so downstream systems can answer: + +> Is this compressed/index-first retrieval path evidence-supported before dense compute for this stated workload scope? + +## What this includes + +- `docs/INDEX_AUTHORITY_RECEIPTS.md` +- `examples/caif/trec-covid-sign-rq2.index-authority.json` +- `tools/verify_index_authority.py` + +## What this does not do + +- Does not change Rust code +- Does not change `Cargo.toml` +- Does not add runtime dependencies +- Does not add CI requirements +- Does not claim new benchmark results +- Does not add signing, key management, or deployment trust policy + +## Verification + + python3 tools/verify_index_authority.py examples/caif/trec-covid-sign-rq2.index-authority.json + +Expected output includes: + + decision: ALLOW_INDEX_FIRST + quality_within_bootstrap_noise: true + storage_reduction: 10.6667x + single_query_speedup: 105.6604x + +## Scope + +The example uses existing public README benchmark values and preserves the stated limitations around dataset, encoder, corpus size, batch/threading regime, HNSW comparison, and larger-corpus claims. + +## Framing + +Benchmarks should not only report performance. + +They should authorize compute paths within a defined evidence envelope. diff --git a/docs/INDEX_AUTHORITY_RECEIPTS.md b/docs/INDEX_AUTHORITY_RECEIPTS.md new file mode 100644 index 0000000..9b745e8 --- /dev/null +++ b/docs/INDEX_AUTHORITY_RECEIPTS.md @@ -0,0 +1,48 @@ +# Index Authority Receipts for ordvec + +Index Authority Receipts are CAIF-style evidence packets for ordvec benchmark results. + +They make index-first retrieval evidence machine-readable. + +Instead of only asking whether a retrieval mode is faster, a receipt asks whether the benchmark evidence supports using a compressed/index-first retrieval path within a stated workload scope. + +## IFC + +Index-First Compute means a cheaper index representation is evaluated before more expensive dense compute. + +For ordvec, IFC can include RankQuant compressed scan, Bitmap candidate generation, SignBitmap candidate generation, or SignBitmap to RankQuant rerank. + +## CAIF + +Compute Authority Index Format describes whether a compute path is justified under a stated evidence envelope. + +A receipt records baseline mode, candidate mode, quality delta, storage reduction, latency profile, scope, limitations, fallback conditions, and a deterministic receipt hash. + +## Verify + +Run: + + python3 tools/verify_index_authority.py examples/caif/trec-covid-sign-rq2.index-authority.json + +Expected output: + + decision: ALLOW_INDEX_FIRST + mode: sign_to_rq2 + baseline: flat_exact + quality_within_bootstrap_noise: true + storage_reduction: 10.6667x + single_query_speedup: 105.6604x + +## Non-goals + +This does not change Rust code, Cargo.toml, CI, runtime behavior, signing, key management, or deployment trust policy. + +It does not create new benchmark claims. + +It preserves the stated benchmark scope and limitations. + +## Principle + +Benchmarks should not only report performance. + +They should authorize compute paths within a defined evidence envelope. diff --git a/examples/caif/trec-covid-sign-rq2.index-authority.json b/examples/caif/trec-covid-sign-rq2.index-authority.json new file mode 100644 index 0000000..a4ff5e1 --- /dev/null +++ b/examples/caif/trec-covid-sign-rq2.index-authority.json @@ -0,0 +1,82 @@ +{ + "schema": "ordvec.index_authority.v0.1", + "subject": { + "project": "ordvec", + "mode": "sign_to_rq2", + "version": "0.5.0" + }, + "baseline": { + "mode": "flat_exact", + "bytes_per_vector": 4096 + }, + "ifc": { + "enabled": true, + "compute_path": [ + "sign_bitmap_candidate_generation", + "rankquant_b2_rerank" + ], + "training_required": false, + "fit_required": false, + "graph_required": false, + "float_corpus_required_for_reported_path": false + }, + "evidence": { + "dataset": "trec-covid", + "dataset_family": "BEIR", + "encoder": "Harrier-Q8 1024-d", + "corpus_size": 171332, + "metric": "nDCG@10", + "baseline_score": 0.7574, + "candidate_score": 0.7638, + "delta_vs_baseline": 0.0064, + "within_bootstrap_noise": true, + "evidence_source": "repository README benchmark table" + }, + "economics": { + "candidate_bytes_per_vector": 384, + "storage_reduction_x": 10.6667, + "single_query_latency_ms": { + "baseline": 56.0, + "candidate": 0.53 + }, + "single_query_speedup_x": 105.6604 + }, + "decision": { + "recommended": "ALLOW_INDEX_FIRST", + "policy": { + "min_storage_reduction_x": 8.0, + "min_single_query_speedup_x": 10.0, + "require_quality_within_bootstrap_noise": true, + "require_scope": true, + "require_limitations": true + }, + "fallback": [ + "Use dense flat or ANN comparison when dataset, encoder, scale, or serving regime falls outside the stated evidence scope.", + "Require HNSW comparison for highly parallel threaded serving claims.", + "Require checked-in artifacts before extending the claim to larger corpora or alternate encoders." + ] + }, + "scope": { + "claim_status": "public_repository_evidence", + "applies_to": [ + "BEIR trec-covid", + "Harrier-Q8 1024-d embeddings", + "171332 document public benchmark run", + "single-query latency comparison against exact flat" + ], + "does_not_claim": [ + "million-scale HNSW crossover", + "GPU bandwidth claims", + "alternate-encoder generalization", + "all serving regimes", + "dominance over HNSW in highly parallel threaded throughput" + ] + }, + "limitations": [ + "The compressed scan remains O(n), with a lower constant than dense flat.", + "HNSW wins the committed highly parallel threaded view.", + "The claim is scoped to the stated dataset, encoder, corpus size, and benchmark artifact.", + "Larger-corpus and alternate-encoder claims require checked-in run artifacts.", + "This receipt does not sign artifacts or manage deployment trust policy." + ] +} diff --git a/tools/verify_index_authority.py b/tools/verify_index_authority.py new file mode 100755 index 0000000..bc5dc3c --- /dev/null +++ b/tools/verify_index_authority.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +import argparse +import hashlib +import json +import sys +from pathlib import Path + +def die(msg, code=2): + print("ERROR:", msg, file=sys.stderr) + raise SystemExit(code) + +def sha(obj): + b = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode() + return "sha256:" + hashlib.sha256(b).hexdigest() + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("receipt", type=Path) + args = ap.parse_args() + + try: + r = json.loads(args.receipt.read_text()) + except Exception as e: + die(f"cannot read receipt: {e}") + + for k in ["schema","subject","baseline","ifc","evidence","economics","decision","scope","limitations"]: + if k not in r: + die(f"missing field {k}") + + if r["schema"] != "ordvec.index_authority.v0.1": + die("bad schema") + + e = r["evidence"] + econ = r["economics"] + base = r["baseline"] + policy = r["decision"]["policy"] + + expected_delta = e["candidate_score"] - e["baseline_score"] + if abs(e["delta_vs_baseline"] - expected_delta) > 0.0001: + die("delta_vs_baseline mismatch") + + expected_storage = base["bytes_per_vector"] / econ["candidate_bytes_per_vector"] + if abs(econ["storage_reduction_x"] - expected_storage) > 0.02: + die("storage_reduction_x mismatch") + + expected_speedup = econ["single_query_latency_ms"]["baseline"] / econ["single_query_latency_ms"]["candidate"] + if abs(econ["single_query_speedup_x"] - expected_speedup) > 0.02: + die("single_query_speedup_x mismatch") + + decision = "ALLOW_INDEX_FIRST" + if policy["require_quality_within_bootstrap_noise"] and not e["within_bootstrap_noise"]: + decision = "REQUIRE_DENSE_FALLBACK" + if econ["storage_reduction_x"] < policy["min_storage_reduction_x"]: + decision = "REQUIRE_DENSE_FALLBACK" + if econ["single_query_speedup_x"] < policy["min_single_query_speedup_x"]: + decision = "REQUIRE_DENSE_FALLBACK" + if policy["require_scope"] and (not r["scope"]["applies_to"] or not r["scope"]["does_not_claim"]): + decision = "DENY_UNSCOPED_CLAIM" + if policy["require_limitations"] and not r["limitations"]: + decision = "DENY_UNSCOPED_CLAIM" + + print(f"decision: {decision}") + print(f"mode: {r['subject']['mode']}") + print(f"baseline: {base['mode']}") + print(f"quality_within_bootstrap_noise: {str(e['within_bootstrap_noise']).lower()}") + print(f"storage_reduction: {econ['storage_reduction_x']}x") + print(f"single_query_speedup: {econ['single_query_speedup_x']}x") + print(f"receipt_hash: {sha(r)}") + + if decision != r["decision"]["recommended"]: + die(f"declared decision {r['decision']['recommended']} does not match computed decision {decision}", 3) + +if __name__ == "__main__": + main() From e501704594927afeb47f3818468d0146ac1ec69e Mon Sep 17 00:00:00 2001 From: blocksifrdev Date: Sun, 21 Jun 2026 13:45:26 -0400 Subject: [PATCH 2/3] fix: make index authority verifier policy-owned --- .gitignore | 3 + policies/index-authority.default-policy.json | 9 + tests/verify_index_authority_test.py | 45 ++++ tools/verify_index_authority.py | 208 ++++++++++++++++--- 4 files changed, 233 insertions(+), 32 deletions(-) create mode 100644 policies/index-authority.default-policy.json create mode 100644 tests/verify_index_authority_test.py diff --git a/.gitignore b/.gitignore index c19eaad..9255deb 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,6 @@ venv/ /.cache/ordvec-beir/ /results/beir/* !/results/beir/.gitkeep + +# CortexTrace local runtime artifacts +.cortextrace/ diff --git a/policies/index-authority.default-policy.json b/policies/index-authority.default-policy.json new file mode 100644 index 0000000..310ec52 --- /dev/null +++ b/policies/index-authority.default-policy.json @@ -0,0 +1,9 @@ +{ + "schema": "ordvec.index_authority.verifier_policy.v0.1", + "min_storage_reduction_x": 4.0, + "min_single_query_speedup_x": 1.25, + "max_quality_delta_loss": 0.02, + "require_scope": true, + "require_limitations": true, + "require_hnsw_comparison_for_parallel_claims": true +} diff --git a/tests/verify_index_authority_test.py b/tests/verify_index_authority_test.py new file mode 100644 index 0000000..758edf3 --- /dev/null +++ b/tests/verify_index_authority_test.py @@ -0,0 +1,45 @@ +import json +import subprocess +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +VERIFY = ROOT / "tools" / "verify_index_authority.py" +RECEIPT = ROOT / "examples" / "caif" / "trec-covid-sign-rq2.index-authority.json" +POLICY = ROOT / "policies" / "index-authority.default-policy.json" + +def run_verify(path): + return subprocess.run( + [sys.executable, str(VERIFY), str(path), "--policy", str(POLICY)], + cwd=ROOT, + text=True, + capture_output=True, + ) + +def test_valid_receipt_passes(): + result = run_verify(RECEIPT) + assert result.returncode == 0, result.stderr + result.stdout + +def test_missing_required_field_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data.pop("evidence") + bad = tmp_path / "missing-evidence.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + +def test_metric_tampering_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["economics"]["storage_reduction_x"] = 999 + bad = tmp_path / "tampered.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + +def test_decision_mismatch_exit_code_3(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["decision"]["recommended"] = "DENY_UNSCOPED_CLAIM" + bad = tmp_path / "decision-mismatch.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode == 3 diff --git a/tools/verify_index_authority.py b/tools/verify_index_authority.py index bc5dc3c..3915938 100755 --- a/tools/verify_index_authority.py +++ b/tools/verify_index_authority.py @@ -5,70 +5,214 @@ import sys from pathlib import Path +RECEIPT_SCHEMA = "ordvec.index_authority.v0.1" +POLICY_SCHEMA = "ordvec.index_authority.verifier_policy.v0.1" + +VALID_DECISIONS = { + "ALLOW_INDEX_FIRST", + "REQUIRE_DENSE_FALLBACK", + "REQUIRE_HNSW_COMPARISON", + "DENY_UNSCOPED_CLAIM", +} + +REQUIRED_TOP_LEVEL = [ + "schema", + "subject", + "baseline", + "ifc", + "evidence", + "economics", + "decision", + "scope", + "limitations", +] + + def die(msg, code=2): - print("ERROR:", msg, file=sys.stderr) - raise SystemExit(code) + print(f"error: {msg}", file=sys.stderr) + sys.exit(code) + + +def load_json(path: Path, label: str): + try: + return json.loads(path.read_text()) + except Exception as e: + die(f"cannot read {label}: {e}") + def sha(obj): b = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode() return "sha256:" + hashlib.sha256(b).hexdigest() + +def require_keys(obj, keys, label): + if not isinstance(obj, dict): + die(f"{label} must be an object") + + missing = [k for k in keys if k not in obj] + if missing: + die(f"{label} missing required field(s): {', '.join(missing)}") + + +def require_number(obj, key, label): + value = obj.get(key) + if not isinstance(value, (int, float)) or isinstance(value, bool): + die(f"{label}.{key} must be a number") + return float(value) + + +def require_list(obj, key, label): + value = obj.get(key) + if not isinstance(value, list): + die(f"{label}.{key} must be a list") + return value + + def main(): ap = argparse.ArgumentParser() ap.add_argument("receipt", type=Path) + ap.add_argument( + "--policy", + type=Path, + default=Path("policies/index-authority.default-policy.json"), + help="Verifier-owned acceptance policy. Receipt policy fields are ignored.", + ) args = ap.parse_args() - try: - r = json.loads(args.receipt.read_text()) - except Exception as e: - die(f"cannot read receipt: {e}") + r = load_json(args.receipt, "receipt") + policy = load_json(args.policy, "policy") + + require_keys(r, REQUIRED_TOP_LEVEL, "receipt") + + if r["schema"] != RECEIPT_SCHEMA: + die(f"bad receipt schema: {r['schema']}") - for k in ["schema","subject","baseline","ifc","evidence","economics","decision","scope","limitations"]: - if k not in r: - die(f"missing field {k}") + if policy.get("schema") != POLICY_SCHEMA: + die(f"bad policy schema: {policy.get('schema')}") - if r["schema"] != "ordvec.index_authority.v0.1": - die("bad schema") + require_keys( + policy, + [ + "min_storage_reduction_x", + "min_single_query_speedup_x", + "max_quality_delta_loss", + "require_scope", + "require_limitations", + "require_hnsw_comparison_for_parallel_claims", + ], + "policy", + ) e = r["evidence"] econ = r["economics"] base = r["baseline"] - policy = r["decision"]["policy"] + decision_obj = r["decision"] + scope = r["scope"] + limitations = r["limitations"] - expected_delta = e["candidate_score"] - e["baseline_score"] - if abs(e["delta_vs_baseline"] - expected_delta) > 0.0001: + require_keys(e, ["candidate_score", "baseline_score", "delta_vs_baseline", "within_bootstrap_noise"], "evidence") + require_keys(base, ["mode", "bytes_per_vector"], "baseline") + require_keys( + econ, + ["candidate_bytes_per_vector", "storage_reduction_x", "single_query_latency_ms", "single_query_speedup_x"], + "economics", + ) + require_keys(econ["single_query_latency_ms"], ["baseline", "candidate"], "economics.single_query_latency_ms") + require_keys(decision_obj, ["recommended"], "decision") + require_keys(scope, ["applies_to", "does_not_claim"], "scope") + + recommended = decision_obj["recommended"] + if recommended not in VALID_DECISIONS: + die(f"invalid recommended decision: {recommended}") + + candidate_score = require_number(e, "candidate_score", "evidence") + baseline_score = require_number(e, "baseline_score", "evidence") + declared_delta = require_number(e, "delta_vs_baseline", "evidence") + + baseline_bytes = require_number(base, "bytes_per_vector", "baseline") + candidate_bytes = require_number(econ, "candidate_bytes_per_vector", "economics") + declared_storage = require_number(econ, "storage_reduction_x", "economics") + + latency = econ["single_query_latency_ms"] + baseline_latency = require_number(latency, "baseline", "economics.single_query_latency_ms") + candidate_latency = require_number(latency, "candidate", "economics.single_query_latency_ms") + declared_speedup = require_number(econ, "single_query_speedup_x", "economics") + + if baseline_bytes <= 0 or candidate_bytes <= 0: + die("bytes_per_vector values must be positive") + if baseline_latency <= 0 or candidate_latency <= 0: + die("latency values must be positive") + + expected_delta = candidate_score - baseline_score + if abs(declared_delta - expected_delta) > 0.0001: die("delta_vs_baseline mismatch") - expected_storage = base["bytes_per_vector"] / econ["candidate_bytes_per_vector"] - if abs(econ["storage_reduction_x"] - expected_storage) > 0.02: + expected_storage = baseline_bytes / candidate_bytes + if abs(declared_storage - expected_storage) > 0.02: die("storage_reduction_x mismatch") - expected_speedup = econ["single_query_latency_ms"]["baseline"] / econ["single_query_latency_ms"]["candidate"] - if abs(econ["single_query_speedup_x"] - expected_speedup) > 0.02: + expected_speedup = baseline_latency / candidate_latency + if abs(declared_speedup - expected_speedup) > 0.02: die("single_query_speedup_x mismatch") + applies_to = require_list(scope, "applies_to", "scope") + does_not_claim = require_list(scope, "does_not_claim", "scope") + + if not isinstance(limitations, list): + die("limitations must be a list") + decision = "ALLOW_INDEX_FIRST" - if policy["require_quality_within_bootstrap_noise"] and not e["within_bootstrap_noise"]: - decision = "REQUIRE_DENSE_FALLBACK" - if econ["storage_reduction_x"] < policy["min_storage_reduction_x"]: - decision = "REQUIRE_DENSE_FALLBACK" - if econ["single_query_speedup_x"] < policy["min_single_query_speedup_x"]: - decision = "REQUIRE_DENSE_FALLBACK" - if policy["require_scope"] and (not r["scope"]["applies_to"] or not r["scope"]["does_not_claim"]): + + scope_missing = not applies_to or not does_not_claim + limitations_missing = not limitations + + quality_loss = baseline_score - candidate_score + quality_too_low = quality_loss > float(policy["max_quality_delta_loss"]) + outside_bootstrap_noise = e["within_bootstrap_noise"] is not True + + economics_too_weak = ( + declared_storage < float(policy["min_storage_reduction_x"]) + or declared_speedup < float(policy["min_single_query_speedup_x"]) + ) + + claims_text = " ".join(str(x).lower() for x in applies_to) + claims_parallel_or_production = any( + marker in claims_text + for marker in ["parallel", "threaded", "production", "prod", "serving", "online"] + ) + + has_hnsw_comparison = ( + e.get("compared_against_hnsw") is True + or isinstance(e.get("hnsw_comparison"), dict) + ) + + if policy["require_scope"] and scope_missing: decision = "DENY_UNSCOPED_CLAIM" - if policy["require_limitations"] and not r["limitations"]: + elif policy["require_limitations"] and limitations_missing: decision = "DENY_UNSCOPED_CLAIM" + elif quality_too_low or outside_bootstrap_noise or economics_too_weak: + decision = "REQUIRE_DENSE_FALLBACK" + elif ( + policy["require_hnsw_comparison_for_parallel_claims"] + and claims_parallel_or_production + and not has_hnsw_comparison + ): + decision = "REQUIRE_HNSW_COMPARISON" print(f"decision: {decision}") - print(f"mode: {r['subject']['mode']}") - print(f"baseline: {base['mode']}") + print(f"mode: {r['subject'].get('mode')}") + print(f"baseline: {base.get('mode')}") print(f"quality_within_bootstrap_noise: {str(e['within_bootstrap_noise']).lower()}") - print(f"storage_reduction: {econ['storage_reduction_x']}x") - print(f"single_query_speedup: {econ['single_query_speedup_x']}x") + print(f"storage_reduction: {declared_storage}x") + print(f"single_query_speedup: {declared_speedup}x") print(f"receipt_hash: {sha(r)}") + print(f"policy_hash: {sha(policy)}") + + if decision != recommended: + die(f"decision mismatch: receipt recommends {recommended}, verifier computed {decision}", code=3) + + print("verified: true") - if decision != r["decision"]["recommended"]: - die(f"declared decision {r['decision']['recommended']} does not match computed decision {decision}", 3) if __name__ == "__main__": main() From 5a6c18c60bbd5de96cf718c1178f7caa6398602b Mon Sep 17 00:00:00 2001 From: blocksifrdev Date: Sun, 21 Jun 2026 13:59:33 -0400 Subject: [PATCH 3/3] fix: harden index authority verifier validation --- tests/verify_index_authority_test.py | 70 +++++++++++++++++ tools/verify_index_authority.py | 108 ++++++++++++++++++++++++--- 2 files changed, 166 insertions(+), 12 deletions(-) diff --git a/tests/verify_index_authority_test.py b/tests/verify_index_authority_test.py index 758edf3..3fede9b 100644 --- a/tests/verify_index_authority_test.py +++ b/tests/verify_index_authority_test.py @@ -43,3 +43,73 @@ def test_decision_mismatch_exit_code_3(tmp_path): bad.write_text(json.dumps(data)) result = run_verify(bad) assert result.returncode == 3 + +def test_ifc_disabled_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["ifc"]["enabled"] = False + bad = tmp_path / "ifc-disabled.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + assert "ifc.enabled must be true" in result.stderr + +def test_ifc_empty_compute_path_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["ifc"]["compute_path"] = "" + bad = tmp_path / "ifc-empty-path.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + assert "ifc.compute_path" in result.stderr + +def test_nan_metrics_rejected(tmp_path): + bad = tmp_path / "nan.json" + text = RECEIPT.read_text().replace('"storage_reduction_x":', '"storage_reduction_x": NaN, "old_storage_reduction_x":', 1) + bad.write_text(text) + result = run_verify(bad) + assert result.returncode != 0 + assert "non-finite" in result.stderr + +def test_blank_scope_entries_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["scope"]["applies_to"] = [""] + data["scope"]["does_not_claim"] = [" "] + data["limitations"] = [""] + bad = tmp_path / "blank-scope.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + +def test_significant_quality_improvement_allowed(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["evidence"]["candidate_score"] = data["evidence"]["baseline_score"] + 0.05 + data["evidence"]["delta_vs_baseline"] = 0.05 + data["evidence"]["within_bootstrap_noise"] = False + data["decision"]["recommended"] = "ALLOW_INDEX_FIRST" + bad = tmp_path / "quality-improvement.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode == 0, result.stderr + result.stdout + +def test_parallel_claim_requires_concrete_hnsw_evidence(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["scope"]["applies_to"] = ["highly parallel threaded serving"] + data["evidence"]["compared_against_hnsw"] = True + data["evidence"]["hnsw_comparison"] = {} + data["decision"]["recommended"] = "ALLOW_INDEX_FIRST" + bad = tmp_path / "empty-hnsw.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode == 3 + assert "REQUIRE_HNSW_COMPARISON" in result.stderr + result.stdout + +def test_single_query_production_does_not_require_hnsw(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["scope"]["applies_to"] = ["single-query production serving"] + data["evidence"].pop("hnsw_comparison", None) + data["evidence"].pop("compared_against_hnsw", None) + data["decision"]["recommended"] = "ALLOW_INDEX_FIRST" + bad = tmp_path / "single-query-prod.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode == 0, result.stderr + result.stdout diff --git a/tools/verify_index_authority.py b/tools/verify_index_authority.py index 3915938..54729a3 100755 --- a/tools/verify_index_authority.py +++ b/tools/verify_index_authority.py @@ -2,6 +2,7 @@ import argparse import hashlib import json +import math import sys from pathlib import Path @@ -33,9 +34,13 @@ def die(msg, code=2): sys.exit(code) +def reject_json_constant(value): + raise ValueError(f"non-finite JSON number is not allowed: {value}") + + def load_json(path: Path, label: str): try: - return json.loads(path.read_text()) + return json.loads(path.read_text(), parse_constant=reject_json_constant) except Exception as e: die(f"cannot read {label}: {e}") @@ -58,7 +63,10 @@ def require_number(obj, key, label): value = obj.get(key) if not isinstance(value, (int, float)) or isinstance(value, bool): die(f"{label}.{key} must be a number") - return float(value) + value = float(value) + if not math.isfinite(value): + die(f"{label}.{key} must be finite") + return value def require_list(obj, key, label): @@ -68,6 +76,70 @@ def require_list(obj, key, label): return value +def require_nonempty_string_list(obj, key, label): + value = require_list(obj, key, label) + cleaned = [] + for i, item in enumerate(value): + if not isinstance(item, str) or not item.strip(): + die(f"{label}.{key}[{i}] must be a non-empty string") + cleaned.append(item.strip()) + if not cleaned: + die(f"{label}.{key} must contain at least one non-empty string") + return cleaned + + +def require_ifc_enabled(ifc): + if not isinstance(ifc, dict): + die("ifc must be an object") + if ifc.get("enabled") is not True: + die("ifc.enabled must be true for an index authority receipt") + + compute_path = ifc.get("compute_path") + if isinstance(compute_path, str): + if not compute_path.strip(): + die("ifc.compute_path must be non-empty") + elif isinstance(compute_path, list): + if not compute_path or any(not isinstance(x, str) or not x.strip() for x in compute_path): + die("ifc.compute_path must contain non-empty string entries") + else: + die("ifc.compute_path must be a non-empty string or list of strings") + + +def has_concrete_hnsw_comparison(evidence): + h = evidence.get("hnsw_comparison") + if not isinstance(h, dict) or not h: + return False + + artifact = h.get("artifact") or h.get("artifact_ref") or h.get("evidence_ref") or h.get("receipt_ref") + has_artifact = isinstance(artifact, str) and bool(artifact.strip()) + + metric_pairs = [ + ("baseline_latency_ms", "candidate_latency_ms"), + ("baseline_qps", "candidate_qps"), + ("baseline_recall", "candidate_recall"), + ("baseline_score", "candidate_score"), + ] + + has_metric_pair = any( + isinstance(h.get(a), (int, float)) + and isinstance(h.get(b), (int, float)) + and math.isfinite(float(h.get(a))) + and math.isfinite(float(h.get(b))) + for a, b in metric_pairs + ) + + nested_latency = h.get("single_query_latency_ms") + has_nested_latency = ( + isinstance(nested_latency, dict) + and isinstance(nested_latency.get("baseline"), (int, float)) + and isinstance(nested_latency.get("candidate"), (int, float)) + and math.isfinite(float(nested_latency.get("baseline"))) + and math.isfinite(float(nested_latency.get("candidate"))) + ) + + return has_artifact and (has_metric_pair or has_nested_latency) + + def main(): ap = argparse.ArgumentParser() ap.add_argument("receipt", type=Path) @@ -106,10 +178,13 @@ def main(): e = r["evidence"] econ = r["economics"] base = r["baseline"] + ifc = r["ifc"] decision_obj = r["decision"] scope = r["scope"] limitations = r["limitations"] + require_ifc_enabled(ifc) + require_keys(e, ["candidate_score", "baseline_score", "delta_vs_baseline", "within_bootstrap_noise"], "evidence") require_keys(base, ["mode", "bytes_per_vector"], "baseline") require_keys( @@ -155,20 +230,23 @@ def main(): if abs(declared_speedup - expected_speedup) > 0.02: die("single_query_speedup_x mismatch") - applies_to = require_list(scope, "applies_to", "scope") - does_not_claim = require_list(scope, "does_not_claim", "scope") + applies_to = require_nonempty_string_list(scope, "applies_to", "scope") + does_not_claim = require_nonempty_string_list(scope, "does_not_claim", "scope") if not isinstance(limitations, list): die("limitations must be a list") + for i, item in enumerate(limitations): + if not isinstance(item, str) or not item.strip(): + die(f"limitations[{i}] must be a non-empty string") decision = "ALLOW_INDEX_FIRST" scope_missing = not applies_to or not does_not_claim limitations_missing = not limitations - quality_loss = baseline_score - candidate_score - quality_too_low = quality_loss > float(policy["max_quality_delta_loss"]) + quality_loss = max(0.0, baseline_score - candidate_score) outside_bootstrap_noise = e["within_bootstrap_noise"] is not True + quality_too_low = quality_loss > float(policy["max_quality_delta_loss"]) and outside_bootstrap_noise economics_too_weak = ( declared_storage < float(policy["min_storage_reduction_x"]) @@ -178,19 +256,25 @@ def main(): claims_text = " ".join(str(x).lower() for x in applies_to) claims_parallel_or_production = any( marker in claims_text - for marker in ["parallel", "threaded", "production", "prod", "serving", "online"] + for marker in [ + "parallel", + "threaded", + "multi-thread", + "multithread", + "concurrent", + "throughput", + "high-qps", + "high qps" + ] ) - has_hnsw_comparison = ( - e.get("compared_against_hnsw") is True - or isinstance(e.get("hnsw_comparison"), dict) - ) + has_hnsw_comparison = has_concrete_hnsw_comparison(e) if policy["require_scope"] and scope_missing: decision = "DENY_UNSCOPED_CLAIM" elif policy["require_limitations"] and limitations_missing: decision = "DENY_UNSCOPED_CLAIM" - elif quality_too_low or outside_bootstrap_noise or economics_too_weak: + elif quality_too_low or economics_too_weak: decision = "REQUIRE_DENSE_FALLBACK" elif ( policy["require_hnsw_comparison_for_parallel_claims"]