diff --git a/.github/pull_request_bodies/caif-ifc-index-authority.md b/.github/pull_request_bodies/caif-ifc-index-authority.md new file mode 100644 index 0000000..ac2b595 --- /dev/null +++ b/.github/pull_request_bodies/caif-ifc-index-authority.md @@ -0,0 +1,49 @@ +## Summary + +Adds an optional CAIF-style Index Authority Receipt for ordvec benchmark evidence. + +The goal is to make ordvec's index-first retrieval evidence machine-readable: quality delta, bytes/vector, latency regime, benchmark scope, limitations, fallback conditions, and a deterministic receipt hash. + +## Why + +ordvec already has a strong index-first compute story: compressed ordinal/sign retrieval can preserve retrieval quality under stated benchmark scopes while reducing storage and latency. + +This PR adds a small evidence packet and verifier so downstream systems can answer: + +> Is this compressed/index-first retrieval path evidence-supported before dense compute for this stated workload scope? + +## What this includes + +- `docs/INDEX_AUTHORITY_RECEIPTS.md` +- `examples/caif/trec-covid-sign-rq2.index-authority.json` +- `tools/verify_index_authority.py`, together with its default verifier policy `policies/index-authority.default-policy.json` and its tests in `tests/verify_index_authority_test.py` + +## What this does not do + +- Does not change Rust code +- Does not change `Cargo.toml` +- Does not add runtime dependencies +- Does not add CI requirements +- Does not claim new benchmark results +- Does not add signing, key management, or deployment trust policy + +## Verification + + python3 tools/verify_index_authority.py examples/caif/trec-covid-sign-rq2.index-authority.json + +Expected output includes: + + decision: ALLOW_INDEX_FIRST + quality_within_bootstrap_noise: true + storage_reduction: 10.6667x + single_query_speedup: 105.6604x + +## Scope + +The example uses existing public README benchmark values and preserves the stated limitations around dataset, encoder, corpus size, batch/threading regime, HNSW comparison, and larger-corpus claims. + +## Framing + +Benchmarks should not only report performance. 
+ +They should authorize compute paths within a defined evidence envelope. diff --git a/.gitignore b/.gitignore index c19eaad..9255deb 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,6 @@ venv/ /.cache/ordvec-beir/ /results/beir/* !/results/beir/.gitkeep + +# CortexTrace local runtime artifacts +.cortextrace/ diff --git a/docs/INDEX_AUTHORITY_RECEIPTS.md b/docs/INDEX_AUTHORITY_RECEIPTS.md new file mode 100644 index 0000000..9b745e8 --- /dev/null +++ b/docs/INDEX_AUTHORITY_RECEIPTS.md @@ -0,0 +1,48 @@ +# Index Authority Receipts for ordvec + +Index Authority Receipts are CAIF-style evidence packets for ordvec benchmark results. + +They make index-first retrieval evidence machine-readable. + +Instead of only asking whether a retrieval mode is faster, a receipt asks whether the benchmark evidence supports using a compressed/index-first retrieval path within a stated workload scope. + +## IFC + +Index-First Compute means a cheaper index representation is evaluated before more expensive dense compute. + +For ordvec, IFC can include RankQuant compressed scan, Bitmap candidate generation, SignBitmap candidate generation, or SignBitmap to RankQuant rerank. + +## CAIF + +Compute Authority Index Format describes whether a compute path is justified under a stated evidence envelope. + +A receipt records baseline mode, candidate mode, quality delta, storage reduction, latency profile, scope, limitations, and fallback conditions; the verifier prints a deterministic SHA-256 hash of the receipt and of the policy it was checked against. + +## Verify + +Run from the repository root: + + python3 tools/verify_index_authority.py examples/caif/trec-covid-sign-rq2.index-authority.json + +Expected output includes: + + decision: ALLOW_INDEX_FIRST + mode: sign_to_rq2 + baseline: flat_exact + quality_within_bootstrap_noise: true + storage_reduction: 10.6667x + single_query_speedup: 105.6604x + +## Non-goals + +This does not change Rust code, Cargo.toml, CI, runtime behavior, signing, key management, or deployment trust policy. + +It does not create new benchmark claims. 
+ +It preserves the stated benchmark scope and limitations. + +## Principle + +Benchmarks should not only report performance. + +They should authorize compute paths within a defined evidence envelope. diff --git a/examples/caif/trec-covid-sign-rq2.index-authority.json b/examples/caif/trec-covid-sign-rq2.index-authority.json new file mode 100644 index 0000000..a4ff5e1 --- /dev/null +++ b/examples/caif/trec-covid-sign-rq2.index-authority.json @@ -0,0 +1,82 @@ +{ + "schema": "ordvec.index_authority.v0.1", + "subject": { + "project": "ordvec", + "mode": "sign_to_rq2", + "version": "0.5.0" + }, + "baseline": { + "mode": "flat_exact", + "bytes_per_vector": 4096 + }, + "ifc": { + "enabled": true, + "compute_path": [ + "sign_bitmap_candidate_generation", + "rankquant_b2_rerank" + ], + "training_required": false, + "fit_required": false, + "graph_required": false, + "float_corpus_required_for_reported_path": false + }, + "evidence": { + "dataset": "trec-covid", + "dataset_family": "BEIR", + "encoder": "Harrier-Q8 1024-d", + "corpus_size": 171332, + "metric": "nDCG@10", + "baseline_score": 0.7574, + "candidate_score": 0.7638, + "delta_vs_baseline": 0.0064, + "within_bootstrap_noise": true, + "evidence_source": "repository README benchmark table" + }, + "economics": { + "candidate_bytes_per_vector": 384, + "storage_reduction_x": 10.6667, + "single_query_latency_ms": { + "baseline": 56.0, + "candidate": 0.53 + }, + "single_query_speedup_x": 105.6604 + }, + "decision": { + "recommended": "ALLOW_INDEX_FIRST", + "policy": { + "min_storage_reduction_x": 8.0, + "min_single_query_speedup_x": 10.0, + "require_quality_within_bootstrap_noise": true, + "require_scope": true, + "require_limitations": true + }, + "fallback": [ + "Use dense flat or ANN comparison when dataset, encoder, scale, or serving regime falls outside the stated evidence scope.", + "Require HNSW comparison for highly parallel threaded serving claims.", + "Require checked-in artifacts before extending the claim to 
larger corpora or alternate encoders." + ] + }, + "scope": { + "claim_status": "public_repository_evidence", + "applies_to": [ + "BEIR trec-covid", + "Harrier-Q8 1024-d embeddings", + "171332 document public benchmark run", + "single-query latency comparison against exact flat" + ], + "does_not_claim": [ + "million-scale HNSW crossover", + "GPU bandwidth claims", + "alternate-encoder generalization", + "all serving regimes", + "dominance over HNSW in highly parallel threaded throughput" + ] + }, + "limitations": [ + "The compressed scan remains O(n), with a lower constant than dense flat.", + "HNSW wins the committed highly parallel threaded view.", + "The claim is scoped to the stated dataset, encoder, corpus size, and benchmark artifact.", + "Larger-corpus and alternate-encoder claims require checked-in run artifacts.", + "This receipt does not sign artifacts or manage deployment trust policy." + ] +} diff --git a/policies/index-authority.default-policy.json b/policies/index-authority.default-policy.json new file mode 100644 index 0000000..310ec52 --- /dev/null +++ b/policies/index-authority.default-policy.json @@ -0,0 +1,9 @@ +{ + "schema": "ordvec.index_authority.verifier_policy.v0.1", + "min_storage_reduction_x": 4.0, + "min_single_query_speedup_x": 1.25, + "max_quality_delta_loss": 0.02, + "require_scope": true, + "require_limitations": true, + "require_hnsw_comparison_for_parallel_claims": true +} diff --git a/tests/verify_index_authority_test.py b/tests/verify_index_authority_test.py new file mode 100644 index 0000000..3fede9b --- /dev/null +++ b/tests/verify_index_authority_test.py @@ -0,0 +1,115 @@ +import json +import subprocess +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +VERIFY = ROOT / "tools" / "verify_index_authority.py" +RECEIPT = ROOT / "examples" / "caif" / "trec-covid-sign-rq2.index-authority.json" +POLICY = ROOT / "policies" / "index-authority.default-policy.json" + +def run_verify(path): + return 
subprocess.run( + [sys.executable, str(VERIFY), str(path), "--policy", str(POLICY)], + cwd=ROOT, + text=True, + capture_output=True, + ) + +def test_valid_receipt_passes(): + result = run_verify(RECEIPT) + assert result.returncode == 0, result.stderr + result.stdout + +def test_missing_required_field_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data.pop("evidence") + bad = tmp_path / "missing-evidence.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + +def test_metric_tampering_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["economics"]["storage_reduction_x"] = 999 + bad = tmp_path / "tampered.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + +def test_decision_mismatch_exit_code_3(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["decision"]["recommended"] = "DENY_UNSCOPED_CLAIM" + bad = tmp_path / "decision-mismatch.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode == 3 + +def test_ifc_disabled_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["ifc"]["enabled"] = False + bad = tmp_path / "ifc-disabled.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + assert "ifc.enabled must be true" in result.stderr + +def test_ifc_empty_compute_path_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["ifc"]["compute_path"] = "" + bad = tmp_path / "ifc-empty-path.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + assert "ifc.compute_path" in result.stderr + +def test_nan_metrics_rejected(tmp_path): + bad = tmp_path / "nan.json" + text = RECEIPT.read_text().replace('"storage_reduction_x":', '"storage_reduction_x": NaN, "old_storage_reduction_x":', 1) + bad.write_text(text) + result = run_verify(bad) + assert result.returncode != 0 + assert "non-finite" 
in result.stderr + +def test_blank_scope_entries_rejected(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["scope"]["applies_to"] = [""] + data["scope"]["does_not_claim"] = [" "] + data["limitations"] = [""] + bad = tmp_path / "blank-scope.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode != 0 + +def test_significant_quality_improvement_allowed(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["evidence"]["candidate_score"] = data["evidence"]["baseline_score"] + 0.05 + data["evidence"]["delta_vs_baseline"] = 0.05 + data["evidence"]["within_bootstrap_noise"] = False + data["decision"]["recommended"] = "ALLOW_INDEX_FIRST" + bad = tmp_path / "quality-improvement.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode == 0, result.stderr + result.stdout + +def test_parallel_claim_requires_concrete_hnsw_evidence(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["scope"]["applies_to"] = ["highly parallel threaded serving"] + data["evidence"]["compared_against_hnsw"] = True + data["evidence"]["hnsw_comparison"] = {} + data["decision"]["recommended"] = "ALLOW_INDEX_FIRST" + bad = tmp_path / "empty-hnsw.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode == 3 + assert "REQUIRE_HNSW_COMPARISON" in result.stderr + result.stdout + +def test_single_query_production_does_not_require_hnsw(tmp_path): + data = json.loads(RECEIPT.read_text()) + data["scope"]["applies_to"] = ["single-query production serving"] + data["evidence"].pop("hnsw_comparison", None) + data["evidence"].pop("compared_against_hnsw", None) + data["decision"]["recommended"] = "ALLOW_INDEX_FIRST" + bad = tmp_path / "single-query-prod.json" + bad.write_text(json.dumps(data)) + result = run_verify(bad) + assert result.returncode == 0, result.stderr + result.stdout diff --git a/tools/verify_index_authority.py b/tools/verify_index_authority.py new file mode 100755 
#!/usr/bin/env python3
"""Verify an ordvec Index Authority Receipt (tools/verify_index_authority.py).

The verifier loads a receipt and a verifier-owned policy, re-derives the
metrics the receipt declares (quality delta, storage reduction, single-query
speedup), computes its own decision from the policy, and compares that
decision with the one the receipt recommends.  Policy fields embedded in the
receipt itself are ignored.

Exit codes:
    0  the receipt is well-formed and its recommended decision matches
    2  unreadable/malformed receipt or policy, or a declared metric mismatch
    3  the receipt's recommended decision differs from the computed one
"""
import argparse
import hashlib
import json
import math
import sys
from pathlib import Path

RECEIPT_SCHEMA = "ordvec.index_authority.v0.1"
POLICY_SCHEMA = "ordvec.index_authority.verifier_policy.v0.1"

# Resolved from this script's location (<repo>/tools/..) so the default policy
# is found regardless of the caller's working directory.
DEFAULT_POLICY_PATH = (
    Path(__file__).resolve().parent.parent / "policies" / "index-authority.default-policy.json"
)

VALID_DECISIONS = {
    "ALLOW_INDEX_FIRST",
    "REQUIRE_DENSE_FALLBACK",
    "REQUIRE_HNSW_COMPARISON",
    "DENY_UNSCOPED_CLAIM",
}

REQUIRED_TOP_LEVEL = [
    "schema",
    "subject",
    "baseline",
    "ifc",
    "evidence",
    "economics",
    "decision",
    "scope",
    "limitations",
]

# Policy fields, split by the type each one must have.
POLICY_NUMBER_FIELDS = (
    "min_storage_reduction_x",
    "min_single_query_speedup_x",
    "max_quality_delta_loss",
)
POLICY_FLAG_FIELDS = (
    "require_scope",
    "require_limitations",
    "require_hnsw_comparison_for_parallel_claims",
)

# Substrings in scope.applies_to that mark a parallel/throughput serving claim.
PARALLEL_CLAIM_MARKERS = (
    "parallel",
    "threaded",
    "multi-thread",
    "multithread",
    "concurrent",
    "throughput",
    "high-qps",
    "high qps",
)

# Keys under evidence.hnsw_comparison that may point at a comparison artifact.
HNSW_ARTIFACT_KEYS = ("artifact", "artifact_ref", "evidence_ref", "receipt_ref")

# (baseline, candidate) key pairs that count as a concrete HNSW measurement.
HNSW_METRIC_PAIRS = (
    ("baseline_latency_ms", "candidate_latency_ms"),
    ("baseline_qps", "candidate_qps"),
    ("baseline_recall", "candidate_recall"),
    ("baseline_score", "candidate_score"),
)


def die(msg, code=2):
    """Print ``error: <msg>`` to stderr and exit the process with ``code``."""
    print(f"error: {msg}", file=sys.stderr)
    sys.exit(code)


def reject_json_constant(value):
    """``parse_constant`` hook: refuse the NaN / Infinity / -Infinity literals."""
    raise ValueError(f"non-finite JSON number is not allowed: {value}")


def parse_finite_float(text):
    """``parse_float`` hook: refuse literals such as ``1e999`` that overflow to inf.

    ``parse_constant`` only sees the NaN/Infinity spellings, so an overflowing
    decimal literal would otherwise load as ``inf`` without complaint.
    """
    value = float(text)
    if not math.isfinite(value):
        raise ValueError(f"non-finite JSON number is not allowed: {text}")
    return value


def reject_duplicate_keys(pairs):
    """``object_pairs_hook``: build a dict, refusing repeated member names.

    The default decoder keeps the last duplicate, which would let a document
    carry two values for one field while only one of them is checked and hashed.
    """
    obj = {}
    for key, value in pairs:
        if key in obj:
            raise ValueError(f"duplicate JSON key is not allowed: {key}")
        obj[key] = value
    return obj


def load_json(path: Path, label: str):
    """Read ``path`` as strict UTF-8 JSON; exit with code 2 on any failure.

    ``label`` ("receipt" or "policy") is only used in the error message.
    """
    try:
        return json.loads(
            path.read_text(encoding="utf-8"),
            parse_constant=reject_json_constant,
            parse_float=parse_finite_float,
            object_pairs_hook=reject_duplicate_keys,
        )
    except Exception as e:
        die(f"cannot read {label}: {e}")


def sha(obj):
    """Return ``sha256:<hex>`` over a key-sorted, whitespace-free JSON dump."""
    # allow_nan=False keeps the hashed text valid JSON even if a caller passes
    # an object that did not come through load_json.
    b = json.dumps(obj, sort_keys=True, separators=(",", ":"), allow_nan=False).encode()
    return "sha256:" + hashlib.sha256(b).hexdigest()


def require_keys(obj, keys, label):
    """Exit unless ``obj`` is a dict that contains every name in ``keys``."""
    if not isinstance(obj, dict):
        die(f"{label} must be an object")

    missing = [k for k in keys if k not in obj]
    if missing:
        die(f"{label} missing required field(s): {', '.join(missing)}")


def require_number(obj, key, label):
    """Return ``obj[key]`` as a finite float, exiting if it is not one.

    Booleans are rejected even though ``bool`` subclasses ``int``.
    """
    value = obj.get(key)
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        die(f"{label}.{key} must be a number")
    try:
        value = float(value)
    except OverflowError:
        # JSON integers are unbounded in Python; one too large for a float
        # must be reported, not raised as a traceback.
        die(f"{label}.{key} must be finite")
    if not math.isfinite(value):
        die(f"{label}.{key} must be finite")
    return value


def require_bool(obj, key, label):
    """Return ``obj[key]``, exiting unless it is exactly ``True`` or ``False``."""
    value = obj.get(key)
    if not isinstance(value, bool):
        die(f"{label}.{key} must be true or false")
    return value


def require_list(obj, key, label):
    """Return ``obj[key]``, exiting unless it is a list."""
    value = obj.get(key)
    if not isinstance(value, list):
        die(f"{label}.{key} must be a list")
    return value


def _require_string_entries(value, path):
    """Return the stripped items of ``value``; exit on a non-string or blank item."""
    cleaned = []
    for i, item in enumerate(value):
        if not isinstance(item, str) or not item.strip():
            die(f"{path}[{i}] must be a non-empty string")
        cleaned.append(item.strip())
    return cleaned


def require_nonempty_string_list(obj, key, label, allow_empty=False):
    """Return ``obj[key]`` as a list of stripped, non-blank strings.

    Exits when the value is not a list or any entry is blank or not a string.
    An empty list is also rejected unless ``allow_empty`` is true, in which
    case ``[]`` is returned so the caller can decide what emptiness means.
    """
    cleaned = _require_string_entries(require_list(obj, key, label), f"{label}.{key}")
    if not cleaned and not allow_empty:
        die(f"{label}.{key} must contain at least one non-empty string")
    return cleaned


def require_ifc_enabled(ifc):
    """Exit unless ``ifc.enabled`` is true and ``ifc.compute_path`` is usable.

    ``compute_path`` may be a non-blank string or a non-empty list of
    non-blank strings.
    """
    if not isinstance(ifc, dict):
        die("ifc must be an object")
    if ifc.get("enabled") is not True:
        die("ifc.enabled must be true for an index authority receipt")

    compute_path = ifc.get("compute_path")
    if isinstance(compute_path, str):
        if not compute_path.strip():
            die("ifc.compute_path must be non-empty")
    elif isinstance(compute_path, list):
        if not compute_path or any(not isinstance(x, str) or not x.strip() for x in compute_path):
            die("ifc.compute_path must contain non-empty string entries")
    else:
        die("ifc.compute_path must be a non-empty string or list of strings")


def _is_finite_number(value):
    """True for a finite int/float; False for bools, other types and overflow."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    try:
        return math.isfinite(float(value))
    except OverflowError:
        return False


def has_concrete_hnsw_comparison(evidence):
    """Report whether ``evidence.hnsw_comparison`` is backed by real data.

    That requires a non-blank artifact reference under any of
    HNSW_ARTIFACT_KEYS, plus either one complete baseline/candidate pair from
    HNSW_METRIC_PAIRS or a nested ``single_query_latency_ms`` object with
    finite ``baseline`` and ``candidate`` values.
    """
    if not isinstance(evidence, dict):
        return False
    h = evidence.get("hnsw_comparison")
    if not isinstance(h, dict) or not h:
        return False

    # Any one usable reference is enough; a malformed value under an earlier
    # key does not mask a valid one under a later key.
    has_artifact = any(
        isinstance(h.get(key), str) and bool(h.get(key).strip())
        for key in HNSW_ARTIFACT_KEYS
    )

    has_metric_pair = any(
        _is_finite_number(h.get(a)) and _is_finite_number(h.get(b))
        for a, b in HNSW_METRIC_PAIRS
    )

    nested_latency = h.get("single_query_latency_ms")
    has_nested_latency = (
        isinstance(nested_latency, dict)
        and _is_finite_number(nested_latency.get("baseline"))
        and _is_finite_number(nested_latency.get("candidate"))
    )

    return has_artifact and (has_metric_pair or has_nested_latency)


def validate_policy(policy):
    """Exit unless ``policy`` is a complete, correctly-typed verifier policy."""
    # Check the container type first so a non-object policy yields an error
    # message, not an AttributeError from .get().
    if not isinstance(policy, dict):
        die("policy must be an object")
    if policy.get("schema") != POLICY_SCHEMA:
        die(f"bad policy schema: {policy.get('schema')}")

    require_keys(policy, list(POLICY_NUMBER_FIELDS + POLICY_FLAG_FIELDS), "policy")
    for key in POLICY_NUMBER_FIELDS:
        require_number(policy, key, "policy")
    for key in POLICY_FLAG_FIELDS:
        # A string such as "false" is truthy, so insist on real booleans.
        require_bool(policy, key, "policy")


def evaluate_receipt(r, policy):
    """Validate receipt ``r`` against ``policy`` and compute the decision.

    Exits with code 2 on any structural problem or declared-metric mismatch.
    Otherwise returns a dict with ``decision`` (computed here),
    ``recommended`` (claimed by the receipt), ``mode``, ``baseline_mode``,
    ``within_bootstrap_noise``, ``storage_reduction_x`` and
    ``single_query_speedup_x``.
    """
    require_keys(r, REQUIRED_TOP_LEVEL, "receipt")

    if r["schema"] != RECEIPT_SCHEMA:
        die(f"bad receipt schema: {r['schema']}")

    validate_policy(policy)

    e = r["evidence"]
    econ = r["economics"]
    base = r["baseline"]
    decision_obj = r["decision"]
    scope = r["scope"]
    limitations = r["limitations"]

    require_ifc_enabled(r["ifc"])

    require_keys(e, ["candidate_score", "baseline_score", "delta_vs_baseline", "within_bootstrap_noise"], "evidence")
    require_keys(base, ["mode", "bytes_per_vector"], "baseline")
    require_keys(
        econ,
        ["candidate_bytes_per_vector", "storage_reduction_x", "single_query_latency_ms", "single_query_speedup_x"],
        "economics",
    )
    require_keys(econ["single_query_latency_ms"], ["baseline", "candidate"], "economics.single_query_latency_ms")
    require_keys(decision_obj, ["recommended"], "decision")
    require_keys(scope, ["applies_to", "does_not_claim"], "scope")
    # subject.mode is optional, but subject must be an object to read it from.
    require_keys(r["subject"], [], "subject")

    recommended = decision_obj["recommended"]
    # The isinstance guard keeps unhashable values (lists, objects) from
    # raising TypeError in the set membership test.
    if not isinstance(recommended, str) or recommended not in VALID_DECISIONS:
        die(f"invalid recommended decision: {recommended}")

    candidate_score = require_number(e, "candidate_score", "evidence")
    baseline_score = require_number(e, "baseline_score", "evidence")
    declared_delta = require_number(e, "delta_vs_baseline", "evidence")
    within_noise = require_bool(e, "within_bootstrap_noise", "evidence")

    baseline_bytes = require_number(base, "bytes_per_vector", "baseline")
    candidate_bytes = require_number(econ, "candidate_bytes_per_vector", "economics")
    declared_storage = require_number(econ, "storage_reduction_x", "economics")

    latency = econ["single_query_latency_ms"]
    baseline_latency = require_number(latency, "baseline", "economics.single_query_latency_ms")
    candidate_latency = require_number(latency, "candidate", "economics.single_query_latency_ms")
    declared_speedup = require_number(econ, "single_query_speedup_x", "economics")

    if baseline_bytes <= 0 or candidate_bytes <= 0:
        die("bytes_per_vector values must be positive")
    if baseline_latency <= 0 or candidate_latency <= 0:
        die("latency values must be positive")

    # Derived figures must agree with the raw measurements they summarize.
    if abs(declared_delta - (candidate_score - baseline_score)) > 0.0001:
        die("delta_vs_baseline mismatch")
    if abs(declared_storage - baseline_bytes / candidate_bytes) > 0.02:
        die("storage_reduction_x mismatch")
    if abs(declared_speedup - baseline_latency / candidate_latency) > 0.02:
        die("single_query_speedup_x mismatch")

    # Blank or non-string entries are malformed, but an empty list is a policy
    # matter: it must reach the decision logic so that require_scope and
    # require_limitations actually control the outcome.
    applies_to = require_nonempty_string_list(scope, "applies_to", "scope", allow_empty=True)
    does_not_claim = require_nonempty_string_list(scope, "does_not_claim", "scope", allow_empty=True)

    if not isinstance(limitations, list):
        die("limitations must be a list")
    _require_string_entries(limitations, "limitations")

    scope_missing = not applies_to or not does_not_claim
    limitations_missing = not limitations

    # Only a loss counts against the candidate, and only when the receipt does
    # not place the difference inside bootstrap noise.
    quality_loss = max(0.0, baseline_score - candidate_score)
    quality_too_low = quality_loss > float(policy["max_quality_delta_loss"]) and not within_noise

    economics_too_weak = (
        declared_storage < float(policy["min_storage_reduction_x"])
        or declared_speedup < float(policy["min_single_query_speedup_x"])
    )

    claims_text = " ".join(x.lower() for x in applies_to)
    claims_parallel_serving = any(marker in claims_text for marker in PARALLEL_CLAIM_MARKERS)

    decision = "ALLOW_INDEX_FIRST"
    if policy["require_scope"] and scope_missing:
        decision = "DENY_UNSCOPED_CLAIM"
    elif policy["require_limitations"] and limitations_missing:
        decision = "DENY_UNSCOPED_CLAIM"
    elif quality_too_low or economics_too_weak:
        decision = "REQUIRE_DENSE_FALLBACK"
    elif (
        policy["require_hnsw_comparison_for_parallel_claims"]
        and claims_parallel_serving
        and not has_concrete_hnsw_comparison(e)
    ):
        decision = "REQUIRE_HNSW_COMPARISON"

    return {
        "decision": decision,
        "recommended": recommended,
        "mode": r["subject"].get("mode"),
        "baseline_mode": base.get("mode"),
        "within_bootstrap_noise": within_noise,
        "storage_reduction_x": declared_storage,
        "single_query_speedup_x": declared_speedup,
    }


def main():
    """CLI entry point: verify one receipt file and print the verdict."""
    ap = argparse.ArgumentParser()
    ap.add_argument("receipt", type=Path)
    ap.add_argument(
        "--policy",
        type=Path,
        default=DEFAULT_POLICY_PATH,
        help="Verifier-owned acceptance policy. Receipt policy fields are ignored.",
    )
    args = ap.parse_args()

    r = load_json(args.receipt, "receipt")
    policy = load_json(args.policy, "policy")

    result = evaluate_receipt(r, policy)

    print(f"decision: {result['decision']}")
    print(f"mode: {result['mode']}")
    print(f"baseline: {result['baseline_mode']}")
    print(f"quality_within_bootstrap_noise: {str(result['within_bootstrap_noise']).lower()}")
    print(f"storage_reduction: {result['storage_reduction_x']}x")
    print(f"single_query_speedup: {result['single_query_speedup_x']}x")
    print(f"receipt_hash: {sha(r)}")
    print(f"policy_hash: {sha(policy)}")

    if result["decision"] != result["recommended"]:
        die(
            f"decision mismatch: receipt recommends {result['recommended']}, "
            f"verifier computed {result['decision']}",
            code=3,
        )

    print("verified: true")


if __name__ == "__main__":
    main()