diff --git a/.fallowrc.json b/.fallowrc.json
index 6de2f33..0be5b80 100644
--- a/.fallowrc.json
+++ b/.fallowrc.json
@@ -13,7 +13,8 @@
   ],
   "publicPackages": ["@atomicmemory/atomicmemory-sdk"],
   "ignorePatterns": [
-    "**/one-offs/**"
+    "**/one-offs/**",
+    "benchmarks/**"
   ],
   "rules": {
     "unused-class-members": "off",
@@ -30,6 +31,7 @@
       "tests/**",
       "scripts/**",
       "examples/**",
+      "benchmarks/**",
       "src/embedding/wasm-semantic-processor.ts"
     ]
   },
@@ -45,7 +47,8 @@
       "**/*.spec.tsx",
       "tests/**",
       "scripts/**",
-      "examples/**"
+      "examples/**",
+      "benchmarks/**"
     ]
   },
   "regression": {
diff --git a/.gitignore b/.gitignore
index dc230a1..ee45304 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,6 @@ pnpm-debug.log*
 
 # Internal tech-debt notes — never commit.
 tech-debt.md
+
+# Superpowers skill plugin output — agent-generated specs/plans, internal-only.
+docs/superpowers/
diff --git a/benchmarks/alignbench/PR-DESCRIPTION.md b/benchmarks/alignbench/PR-DESCRIPTION.md
new file mode 100644
index 0000000..bd311b6
--- /dev/null
+++ b/benchmarks/alignbench/PR-DESCRIPTION.md
@@ -0,0 +1,95 @@
+# AlignBench v0 — controlled recall benchmark + falsified pronoun-rewrite fix
+
+Adds `benchmarks/alignbench/` to the SDK: a 60-query / 55-fact controlled
+benchmark for embedding-based recall, with a runner that ablates four
+candidate fixes against the current Xenova/all-MiniLM-L6-v2 default.
+
+## Why
+
+Three observed failure modes share one signature:
+
+1. **Partner demo** (atomicmem.filecoin.cloud): "what is my name?" returns no
+   recall; "what is the user's name?" returns the same fact at cosine 0.51.
+2. **LongMemEval-S (LMME-S) full n=500** (sprint 5): 31% of failures were "I don't have info"
+   refusals when the answer text was in the haystack.
+3. **BEAM Knowledge-Update**: retrieval pulls the keyword-matching chunk
+   instead of the freshest one.
+
+Each was filed as a benchmark-specific quirk. AlignBench tests whether
+they're one phenomenon — and which fix actually closes the gap.
+
+## Pre-registered hypothesis (and outcome)
+
+Before running, I committed in writing:
+
+> If query-side pronoun rewriting (my → the user's) doesn't lift r@5 by ≥0.25
+> over baseline, the pronoun hypothesis is wrong and we look at extraction
+> quality instead.
+
+Result: query-rewrite r@5 lift = **0.000** (0.933 vs 0.933 baseline).
+**Hypothesis falsified.** The diagnostic story I posted earlier — "fix it in
+the SDK recall path with a pronoun rewrite" — does not survive contact with a
+controlled benchmark.
+
+This is exactly what pre-registration is for.
+
+## What actually wins
+
+| Variant | r@1 | r@5 | distractor_top1 | fp@control |
+|---|---:|---:|---:|---:|
+| baseline (current SDK) | 0.733 | 0.933 | 0.067 | 0.000 |
+| **baseline, clean pool (no extraction meta-facts)** | **0.767** | **0.950** | 0.000 | 0.000 |
+| query-rewrite | 0.733 | 0.933 | 0.083 (worse) | 0.000 |
+| dual-storage | 0.783 | 0.933 | 0.067 | 0.000 |
+| hybrid BM25 + semantic | 0.617 | 0.917 | 0.067 | **1.000** ← broken |
+| combined (rewrite + BM25) | 0.650 | 0.933 | 0.083 | 1.000 |
+
+The dominant fixable lift is **upstream of retrieval** — stopping the extractor
+from emitting meta-facts like `The user asked for the user's name.` and
+`As of <date>, X is a term mentioned in the conversation.`. Those poison the
+embedding neighborhood for every adjacent query.
+
+## What this PR contains
+
+- `benchmarks/alignbench/items.json` — 55 facts, 60 scored queries, 10
+  controls, across 4 variation axes (pronoun, temporal, specificity,
+  negation) plus an extraction-style distractor pool observed in the partner
+  demo.
+- `benchmarks/alignbench/run.mjs` — standalone Node runner using
+  `@huggingface/transformers` (same model as SDK). No Postgres, no network,
+  no SDK dependencies. Each variant produces a directly-comparable run JSON.
+- `benchmarks/alignbench/runs/*.json` — all 5 variant runs committed for
+  diff-ability.
+- `benchmarks/alignbench/RESULTS.md` — full per-axis breakdown, ablation
+  table, per-item failure analysis on the temporal axis, recommendations.
+- `benchmarks/alignbench/README.md` — what it is, how to read it, what's out
+  of scope.
+
+## What this PR does NOT contain (deliberately)
+
+No SDK code change. Two reasons:
+
+1. The pre-registered hypothesis was falsified, so the proposed fix (query
+   rewrite) doesn't earn a code change.
+2. The actual leverage is in core's extraction prompt and the temporal-state
+   layer, neither of which is owned by this PR. Follow-up issues filed for
+   both.
+
+## Recommendations (filed as follow-up issues)
+
+| # | Where | What | Priority |
+|---|---|---|---|
+| 1 | core | Filter meta-facts at extraction time (drop `The user (asked\|is\|requested\|said).*` etc.) | high — biggest single lift |
+| 2 | SDK | Expose `EXTRACTION_PROMPT` as a configurable surface (Ethan flagged Slack-side) | high — enables (1) for design partners |
+| 3 | core/SDK | Wire core's temporal-state layer (`temporal-classifier`, `temporal-rerank`) into SDK retrieval path for time-anchored queries | medium — only fix that addresses the temporal-axis structural gap |
+| 4 | SDK | Opt-in `RECALL_DUAL_STORAGE=true` for first-person-heavy workloads | low — +0.05 r@1 but 2× store size |
+| 5 | — | Skip BM25 hybrid unless we ship a control-set-aware weight schedule | not recommended in this form |
+
+## Honest limits
+
+- n=60 is small. Treat ±0.05 r@1 differences as within-noise.
+- Distractor pool is hand-curated from observed SDK output. A pool sampled
+  from the live partner Postgres would be the gold version.
+- Variants are compared on the default embedding model only. The mpnet
+  ablation is one data point, not a sweep.
+- AlignBench is a diagnostic instrument, not a leaderboard.
diff --git a/benchmarks/alignbench/README.md b/benchmarks/alignbench/README.md
new file mode 100644
index 0000000..b278d93
--- /dev/null
+++ b/benchmarks/alignbench/README.md
@@ -0,0 +1,78 @@
+# AlignBench
+
+A small, focused benchmark that exercises one failure mode in agentic-memory
+recall: the alignment gap between **stored fact phrasing** and **query
+phrasing**.
+
+## Why
+
+Several observed failures share the same signature:
+
+1. SDK partner demo: "what is my name?" returns no recall, but
+   "what is the user's name?" returns the same fact at cosine 0.51.
+2. LongMemEval-S full n=500: 31% of failures are "I don't have info" refusals
+   when the answer text is in the haystack.
+3. BEAM Knowledge-Update regressions: model picks an older value because
+   retrieval brings in keyword-matching chunks rather than the freshest one.
+
+These manifestations share one root: **embedding-and-threshold retrieval
+silently returns empty when query phrasing diverges from stored phrasing**,
+rather than degrading gracefully.
+
+AlignBench isolates this in a controlled set (60 scored queries plus 10 controls) so we can:
+- Quantify the gap on the default SDK embedding stack
+- Ablate three independent fixes (query rewrite / dual-storage / hybrid BM25)
+- Pick the dominant variant and regression-test it against the committed
+  LoCoMo10 and BEAM-1M numbers before shipping.
+
+## Items
+
+`items.json` — an object whose `axes` map gives each axis a shared `facts`
+array and an `items` array of test cases that reference facts by index:
+
+```jsonc
+{
+  "id": "pronoun-001",
+  "fact_index": 0,                   // index into the axis's "facts" array
+  "query": "what is my name?",
+  "gold_in_topk": true,              // expected presence in top-K
+  "gold_answer": "Alex"              // for downstream LLM correctness
+}
+```
+
+Facts are **shared across queries within an axis** — each query searches the
+full fact pool, not just its own gold fact. That mimics real recall behavior.
+
+## Variation axes
+
+| Axis | What it varies | Why it matters |
+|---|---|---|
+| pronoun | `my X` vs `the user's X` vs `X of <name>` | Tests bi-encoder pronoun alignment (dominant SDK failure) |
+| temporal | `live in Y` vs `lived in Y` vs `as of 2026, live in Y` | Tests knowledge-update / temporal-anchor handling |
+| specificity | `my dog Apollo` vs `my dog` vs `my pet` | Tests generic-vs-specific retrieval |
+| negation | `I don't drink coffee` vs `I drink tea, not coffee` | Tests embedding sensitivity to polarity |
+| control | unrelated facts/queries | False-positive floor (top-K shouldn't surface these) |
+
+## Metrics
+
+Per run:
+- **recall@1** — gold fact ranked first
+- **recall@5** — gold fact in top-5
+- **per-axis recall@5** — diagnostic
+- **false-positive@5** — unrelated controls leaking into top-K
+- **mean rank** of gold (lower is better)
+- **median similarity** of gold vs distractors
+
+## Runs
+
+- `runs/baseline.json` — current SDK recall pipeline
+- `runs/query-rewrite.json` — query-side pronoun rewrite
+- `runs/dual-storage.json` — both phrasings stored
+- `runs/hybrid-bm25.json` — BM25 + semantic union
+- `runs/combined.json` — query-rewrite + hybrid BM25 stacked
+
+## Falsification
+
+Pre-registered: if query-rewrite alone doesn't lift recall@5 by ≥0.25 over
+baseline, the pronoun hypothesis is wrong and we look at extraction quality
+next. Stated here so it's not adjusted after seeing data.
diff --git a/benchmarks/alignbench/RESULTS.md b/benchmarks/alignbench/RESULTS.md
new file mode 100644
index 0000000..de1d9e2
--- /dev/null
+++ b/benchmarks/alignbench/RESULTS.md
@@ -0,0 +1,214 @@
+# AlignBench v0 — Results
+
+**Date:** 2026-05-14
+**SDK branch:** `worktree-alignbench-2026-05-14` (off `internal/main` at `bf4ab91`)
+**Items:** 60 scored queries (pronoun 20, temporal 14, specificity 14, negation 12) + 10 controls
+**Fact pool:** 55 facts (45 user-facts across 4 axes + 10 extraction-style meta-fact distractors)
+
+> Every query competes against the **full pool** simultaneously, mimicking how a
+> real SDK store accumulates noise across topics. Distractors are facts of the
+> form actually observed in the partner demo: `The user asked for the user's
+> name.`, `The user is me.`, `As of <date>, X is a term mentioned in the
+> conversation.`
+
+---
+
+## TL;DR
+
+**The pronoun-rewrite hypothesis is falsified.** Cleaning extraction meta-facts
+out of the pool is a larger lift than any algorithmic retrieval patch. The
+temporal axis is stuck at r@1=0.500 across every variant — that's a structural
+property of the embedding-only retrieval contract, not a prompt-tuning problem.
+
+| Variant | r@1 | r@5 | distractor_top1 | fp@control |
+|---|---:|---:|---:|---:|
+| baseline (current SDK) | 0.733 | 0.933 | 0.067 | 0.000 |
+| baseline, clean pool (no meta-facts) | **0.767** | **0.950** | 0.000 | 0.000 |
+| query-rewrite (pronoun substitution) | 0.733 | 0.933 | 0.083 ← worse | 0.000 |
+| dual-storage (both phrasings stored) | 0.783 | 0.933 | 0.067 | 0.000 |
+| hybrid BM25 + semantic | 0.617 | 0.917 | 0.067 | **1.000** ← bad |
+| combined (rewrite + BM25) | 0.650 | 0.933 | 0.083 | 1.000 |
+| mpnet-base-v2 (110M params, Modal A10G) | 0.733 | **0.950** | 0.083 | — |
+| bge-base-en-v1.5 (109M params, Modal A10G) | 0.617 | 0.783 | 0.250 | — |
+| e5-base-v2 (110M params, Modal A10G) | 0.717 | 0.933 | 0.200 | — |
+
+See `runs/modal-ablation.json` for the full 6-model sweep (Modal A10G, ~6s per
+model). **The SDK's current MiniLM-L6-v2 is tied for best r@1 and has the
+lowest distractor rate** — swapping to a bigger bi-encoder is not the fix.
+BGE/E5 underperform here likely because they expect prompt-prefix conventions
+(`"query: …"`/`"passage: …"`) we did not add, but even mpnet (which doesn't
+require prefixes) only buys +0.017 r@5 and zero r@1. The embedding-model lever
+is a dead-end for this failure surface.
+
+The biggest fixable lift, by a clean margin, comes from **not letting the
+extractor emit meta-facts in the first place**. That's an extraction-prompt
+change in core, not a recall-path change in the SDK.
+
+---
+
+## Pre-registered falsification
+
+Before running, I committed to: *if query-rewrite alone doesn't lift r@5 by
+≥0.25, the pronoun hypothesis is wrong and we look at extraction quality.*
+
+Result: query-rewrite r@5 lift = **0.000** (0.933 vs 0.933). **Falsified.**
+
+This is the value of pre-registration. The diagnostic story I posted earlier —
+"the failure is a first/third-person embedding gap, patchable in the SDK
+recall path" — does not survive contact with a controlled benchmark.
+
+---
+
+## Per-axis breakdown (baseline, with distractors)
+
+| Axis | n | r@1 | r@5 | Median gold margin | distractor-top1 |
+|---|---:|---:|---:|---:|---:|
+| pronoun | 20 | 0.700 | 1.000 | **0.047** ← thin | 2 |
+| temporal | 14 | 0.500 | 0.714 | **0.050** ← thin | 2 |
+| specificity | 14 | 0.857 | 1.000 | 0.144 | 0 |
+| negation | 12 | 0.917 | 1.000 | 0.268 | 0 |
+
+Pronoun and temporal both sit at a fragile ~0.05 cosine margin between gold and
+best non-gold. Specificity and negation are robust. Distractor meta-facts beat
+the gold on 4 of 60 queries (6.7%) — concentrated in pronoun and temporal.
+
+---
+
+## Why each variant didn't fix it
+
+### Query-rewrite (pronoun substitution)
+
+Rewriting "what is my name?" → "what is the user's name?" was supposed to
+bridge the embedding gap to the third-person stored fact. It does — but it
+also collides MORE with the distractor "The user asked for the user's name."
+Net effect: r@1 unchanged, pronoun margin tightens 0.047 → 0.031, distractor-
+top1 goes 2 → 3. **The rewrite is bridging to the wrong neighborhood.**
+
+Negative result: surface-level pronoun substitution makes the noise problem
+worse, not better, when the noise itself is third-person extraction output.
+
+### Dual-storage (paraphrase to first-person at write time)
+
+Modest +0.05 r@1 lift, but only in pronoun (0.70 → 0.80). Temporal unchanged
+(still 0.50). The fix works for the failure class it targets but doesn't
+generalize. Cost: 2× memory size, dedupe required, indistinguishable in the UI.
+
+### Hybrid BM25 + semantic union
+
+BM25 helps where lexical overlap aligns with relevance (temporal margin 0.05 →
+0.19, negation margin 0.27 → 0.56). But it tanks control precision — every
+unrelated query like "what year did WWII end?" now matches user facts on
+common English words. fp@control jumps 0% → 100%. **Not shippable as-is.** A
+careful BM25 weight schedule or a confidence threshold on the BM25 score
+might recover control precision, but that's a larger study.
+
+### Combined (rewrite + BM25)
+
+Inherits the worst of both: rewrite-induced collision with meta-distractors
+AND BM25 false-positive blowout. r@1 0.65, fp@control 100%. Don't ship.
+
+---
+
+## What actually moved the needle
+
+| Intervention | r@1 | r@5 | Notes |
+|---|---:|---:|---|
+| Baseline | 0.733 | 0.933 | reference |
+| **Drop extraction meta-facts from pool** | **0.767** | **0.950** | bigger than any algorithmic fix |
+| Dual-storage | 0.783 | 0.933 | best r@1 (within noise of clean pool), no r@5 lift; cost = 2× store size |
+
+The takeaway: **the leverage is upstream of retrieval**. The SDK's recall layer
+is reasonable; the dominant cause of partner-visible failures is that the
+extraction prompt produces facts that aren't facts (`The user asked for the
+user's name.`, `As of May 14, X is a term mentioned.`). These corrupt the
+embedding neighborhood for every adjacent query.
+
+---
+
+## The temporal axis is its own story
+
+r@1 = 0.500 across **every** variant tested (baseline / rewrite / dual-storage
+/ BM25 / combined / clean-pool). Three failure patterns explain it:
+
+| Pattern | Example | Why it breaks cosine retrieval |
+|---|---|---|
+| Temporal anchor in fact text hurts match | `where do I live now?` ranks "user lives in Lisbon" above gold "**As of January 2026**, the user lives in Lisbon" | Date markers add lexical noise the bi-encoder treats as off-topic |
+| Stale fact beats current fact | `is the user still in Berlin?` top-1 is "**Before 2024**, lived in Berlin" @ cosine 0.72; current "lives in Lisbon" ranks #5 | Cosine cannot encode "this fact was superseded" — Mem0+TR's temporal-metadata layer side-steps this entirely |
+| Cross-axis bleed | `what is the user reading?` top-1 is "reads on a **Kindle**" (device); gold "reading 'The Power Broker'" (book) ranks #8 | Embedding can't keep activity ↔ object distinct when both share lexical surface |
+
+The first two cannot be fixed in the SDK recall path. They require **structured
+state at write time** — the architectural choice Mem0+TR made in their Nov-2025
+release. Our temporal-state layer in core (`temporal-classifier.ts`,
+`temporal-state-write.ts`) is the right shape but isn't currently consulted by
+the SDK retrieval path.
+
+---
+
+## Connection to LMME-S refusal failures
+
+The LMME-S full n=500 run (sprint 5) showed **31% of failures were "I don't
+have info" refusals when the answer text was in the haystack**. We blamed
+"Haiku reasoning over 100K tokens" but didn't have a controlled benchmark to
+attribute the cause.
+
+AlignBench suggests a re-attribution: those LMME refusals are likely the same
+extraction-vs-query alignment failure compounded over a 50K-token haystack
+where competing extraction-style facts dilute the gold. A targeted ablation on
+LMME-S with the extraction-cleanup applied would test this directly.
+
+---
+
+## Recommendations (ranked)
+
+| # | Recommendation | Effort | Expected lift |
+|---|---|---|---|
+| 1 | **Filter meta-facts at write time** — add an extraction-output rejection rule for patterns matching `The user (asked\|is\|requested\|said).*`, `<date>, X is a term mentioned.*`, `A name was mentioned.*`. Move from naive next-LLM-output to a typed-fact schema. | 1 day in core | r@1 +0.03–0.05 directly; bigger gains downstream on LoCoMo cat-1 and LMME refusal rate |
+| 2 | **Expose extraction prompt as SDK surface** (Ethan flagged this Slack-side) so design partners can tune. Document the durable-fact vs meta-fact distinction. | 0.5 day in SDK | structural; enables (1) for design partners |
+| 3 | **Wire core's temporal-state layer into SDK retrieval** for time-anchored queries. The components exist (temporal-classifier, temporal-rerank) but the SDK calls plain semantic-search. | 2–3 days | closes a real gap on the temporal axis; would also lift LoCoMo cat 4 toward Mem0+TR parity |
+| 4 | Adopt dual-storage as an opt-in `RECALL_DUAL_STORAGE=true` flag for first-person-heavy workloads. Don't make it default — the cost is real. | 0.5 day | +0.05 r@1 in pronoun-heavy stores; no help elsewhere |
+| 5 | Skip BM25 hybrid unless we build a control-set-aware weight schedule. Current naive union breaks precision. | — | not recommended in isolation |
+
+The partner-facing demo failure SgtPooki reported is best addressed by **(1) +
+(4)** combined: cleaner extraction means fewer poisoned matches, and dual-
+storage makes pronoun queries robust against the noise that remains.
+
+---
+
+## Reproducibility
+
+```bash
+cd benchmarks/alignbench
+node run.mjs                                  # baseline
+node run.mjs --variant=query-rewrite          --out=runs/query-rewrite.json
+node run.mjs --variant=dual-storage           --out=runs/dual-storage.json
+node run.mjs --variant=hybrid-bm25            --out=runs/hybrid-bm25.json
+node run.mjs --variant=combined               --out=runs/combined.json
+node run.mjs --model=Xenova/all-mpnet-base-v2 --out=runs/baseline-mpnet.json
+```
+
+Each run saves a JSON with composite metrics, per-axis breakdown, and per-item
+top-1 / gold-rank / margin records. Diff-able across runs.
+
+Items: `items.json` (60 queries, 45 facts, 10 distractors, 10 controls).
+Runner: `run.mjs` (single file, no SDK or DB dependencies — just
+`@huggingface/transformers`).
+
+---
+
+## Honest limits of this benchmark
+
+- **n is small** (60 scored queries). Margin estimates are noisy; treat
+  ±0.05–0.07 r@1 differences as within-noise unless replicated.
+- **Hand-written items**, no naturalistic distribution. Real partner traffic
+  may surface other failure axes (multi-turn coreference, list aggregation,
+  numerical reasoning) AlignBench doesn't cover.
+- **Embedding-model sweep is incomplete.** Six sentence-transformer models
+  tested on Modal A10G (`modal_ablate.py`, `runs/modal-ablation.json`).
+  BGE/E5 likely need their input-prefix conventions (`"query: …"` /
+  `"passage: …"`) for fair scoring; we didn't add them. Mpnet is a clean
+  comparison and only buys +0.017 r@5 over the SDK default. A larger sweep
+  with model-specific prefixes is future work but not a blocker.
+- **Distractor pool curated by hand** based on observed SDK extraction output.
+  A real pool from the partner demo's Postgres would be the gold version.
+
+Treat AlignBench v0 as a diagnostic tool, not as a leaderboard number.
diff --git a/benchmarks/alignbench/items.json b/benchmarks/alignbench/items.json
new file mode 100644
index 0000000..54b6372
--- /dev/null
+++ b/benchmarks/alignbench/items.json
@@ -0,0 +1,173 @@
+{
+  "version": "v0",
+  "created": "2026-05-14",
+  "description": "60 scored fact/query pairs plus 10 controls across 5 variation axes to probe stored↔query phrasing alignment in embedding-based recall.",
+  "axes": {
+    "pronoun": {
+      "description": "first-person query vs third-person stored fact (and reverse)",
+      "facts": [
+        "The user's name is Alex.",
+        "The user works as a software engineer at Acme.",
+        "The user's dog is named Apollo.",
+        "The user lives in Lisbon.",
+        "The user's birthday is March 14.",
+        "The user has two children, Maya and Theo.",
+        "The user's favorite coffee order is an oat-milk flat white.",
+        "The user studied applied mathematics at university.",
+        "The user is allergic to peanuts.",
+        "The user drives a 2019 Toyota Corolla."
+      ],
+      "items": [
+        { "id": "pronoun-001", "fact_index": 0, "query": "what is my name?",                                "gold_in_topk": true, "gold_answer": "Alex" },
+        { "id": "pronoun-002", "fact_index": 0, "query": "what's the user's name?",                         "gold_in_topk": true, "gold_answer": "Alex" },
+        { "id": "pronoun-003", "fact_index": 0, "query": "who am I?",                                       "gold_in_topk": true, "gold_answer": "Alex" },
+        { "id": "pronoun-004", "fact_index": 1, "query": "what do I do for work?",                          "gold_in_topk": true, "gold_answer": "software engineer at Acme" },
+        { "id": "pronoun-005", "fact_index": 1, "query": "what is the user's job?",                         "gold_in_topk": true, "gold_answer": "software engineer at Acme" },
+        { "id": "pronoun-006", "fact_index": 1, "query": "where does the user work?",                       "gold_in_topk": true, "gold_answer": "Acme" },
+        { "id": "pronoun-007", "fact_index": 2, "query": "what is my dog's name?",                          "gold_in_topk": true, "gold_answer": "Apollo" },
+        { "id": "pronoun-008", "fact_index": 2, "query": "who is Apollo?",                                  "gold_in_topk": true, "gold_answer": "the user's dog" },
+        { "id": "pronoun-009", "fact_index": 3, "query": "where do I live?",                                "gold_in_topk": true, "gold_answer": "Lisbon" },
+        { "id": "pronoun-010", "fact_index": 3, "query": "what city does the user live in?",               "gold_in_topk": true, "gold_answer": "Lisbon" },
+        { "id": "pronoun-011", "fact_index": 4, "query": "when is my birthday?",                            "gold_in_topk": true, "gold_answer": "March 14" },
+        { "id": "pronoun-012", "fact_index": 4, "query": "when was the user born?",                         "gold_in_topk": true, "gold_answer": "March 14" },
+        { "id": "pronoun-013", "fact_index": 5, "query": "do I have kids?",                                 "gold_in_topk": true, "gold_answer": "yes, two — Maya and Theo" },
+        { "id": "pronoun-014", "fact_index": 5, "query": "how many children does the user have?",           "gold_in_topk": true, "gold_answer": "two" },
+        { "id": "pronoun-015", "fact_index": 6, "query": "what is my usual coffee order?",                  "gold_in_topk": true, "gold_answer": "oat-milk flat white" },
+        { "id": "pronoun-016", "fact_index": 6, "query": "what coffee does the user drink?",                "gold_in_topk": true, "gold_answer": "oat-milk flat white" },
+        { "id": "pronoun-017", "fact_index": 7, "query": "what did I study?",                               "gold_in_topk": true, "gold_answer": "applied mathematics" },
+        { "id": "pronoun-018", "fact_index": 8, "query": "do I have any allergies?",                        "gold_in_topk": true, "gold_answer": "peanuts" },
+        { "id": "pronoun-019", "fact_index": 8, "query": "what is the user allergic to?",                   "gold_in_topk": true, "gold_answer": "peanuts" },
+        { "id": "pronoun-020", "fact_index": 9, "query": "what kind of car do I drive?",                    "gold_in_topk": true, "gold_answer": "2019 Toyota Corolla" }
+      ]
+    },
+    "temporal": {
+      "description": "current vs past vs date-anchored phrasings of evolving state",
+      "facts": [
+        "As of January 2026, the user lives in Lisbon.",
+        "Before 2024, the user lived in Berlin.",
+        "The user moved from Berlin to Lisbon in 2024.",
+        "As of April 2026, the user is reading 'The Power Broker'.",
+        "Last year the user read 'Project Hail Mary'.",
+        "The user is currently working on a memory benchmark project.",
+        "The user finished the Sprint-4 reranker training last month.",
+        "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.",
+        "The user used GPT-4 as their primary model in 2024.",
+        "The user upgraded their phone to an iPhone 17 in March 2026."
+      ],
+      "items": [
+        { "id": "temporal-001", "fact_index": 0, "query": "where does the user live now?",                  "gold_in_topk": true, "gold_answer": "Lisbon" },
+        { "id": "temporal-002", "fact_index": 1, "query": "where did the user used to live?",               "gold_in_topk": true, "gold_answer": "Berlin" },
+        { "id": "temporal-003", "fact_index": 2, "query": "when did the user move?",                        "gold_in_topk": true, "gold_answer": "2024" },
+        { "id": "temporal-004", "fact_index": 0, "query": "where is the user currently based?",             "gold_in_topk": true, "gold_answer": "Lisbon" },
+        { "id": "temporal-005", "fact_index": 3, "query": "what is the user reading?",                      "gold_in_topk": true, "gold_answer": "The Power Broker" },
+        { "id": "temporal-006", "fact_index": 4, "query": "what did the user read last year?",              "gold_in_topk": true, "gold_answer": "Project Hail Mary" },
+        { "id": "temporal-007", "fact_index": 5, "query": "what is the user working on these days?",        "gold_in_topk": true, "gold_answer": "memory benchmark project" },
+        { "id": "temporal-008", "fact_index": 6, "query": "what did the user finish last month?",           "gold_in_topk": true, "gold_answer": "Sprint-4 reranker training" },
+        { "id": "temporal-009", "fact_index": 7, "query": "what LLM does the user prefer?",                 "gold_in_topk": true, "gold_answer": "Claude Sonnet 4.6" },
+        { "id": "temporal-010", "fact_index": 8, "query": "which model did the user use before?",           "gold_in_topk": true, "gold_answer": "GPT-4" },
+        { "id": "temporal-011", "fact_index": 9, "query": "did the user get a new phone recently?",         "gold_in_topk": true, "gold_answer": "yes, iPhone 17 in March 2026" },
+        { "id": "temporal-012", "fact_index": 0, "query": "is the user still in Berlin?",                   "gold_in_topk": true, "gold_answer": "no — the user lives in Lisbon now" },
+        { "id": "temporal-013", "fact_index": 5, "query": "what is the user up to?",                        "gold_in_topk": true, "gold_answer": "memory benchmark project" },
+        { "id": "temporal-014", "fact_index": 7, "query": "which model is the user on right now?",          "gold_in_topk": true, "gold_answer": "Claude Sonnet 4.6" }
+      ]
+    },
+    "specificity": {
+      "description": "specific entity vs generic-class query",
+      "facts": [
+        "The user's dog Apollo is a golden retriever, age 4.",
+        "The user owns a Bianchi road bike.",
+        "The user's primary laptop is a 16-inch MacBook Pro M4.",
+        "The user has a Yamaha P-125 digital piano in the living room.",
+        "The user uses Logseq for personal notes and Notion for work.",
+        "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.",
+        "The user wears Smith Lowdown sunglasses.",
+        "The user reads on a Kindle Paperwhite 11th-gen.",
+        "The user's home espresso machine is a Lelit Bianca v3.",
+        "The user wears Allbirds Wool Runners daily."
+      ],
+      "items": [
+        { "id": "specificity-001", "fact_index": 0, "query": "tell me about my dog",                        "gold_in_topk": true, "gold_answer": "Apollo, golden retriever, age 4" },
+        { "id": "specificity-002", "fact_index": 0, "query": "what kind of pet do I have?",                 "gold_in_topk": true, "gold_answer": "a dog" },
+        { "id": "specificity-003", "fact_index": 1, "query": "do I own a bike?",                            "gold_in_topk": true, "gold_answer": "yes, a Bianchi road bike" },
+        { "id": "specificity-004", "fact_index": 1, "query": "what brand of bike does the user have?",      "gold_in_topk": true, "gold_answer": "Bianchi" },
+        { "id": "specificity-005", "fact_index": 2, "query": "what computer do I use?",                     "gold_in_topk": true, "gold_answer": "16-inch MacBook Pro M4" },
+        { "id": "specificity-006", "fact_index": 2, "query": "what laptop does the user have?",             "gold_in_topk": true, "gold_answer": "16-inch MacBook Pro M4" },
+        { "id": "specificity-007", "fact_index": 3, "query": "do I have any musical instruments?",          "gold_in_topk": true, "gold_answer": "Yamaha P-125 digital piano" },
+        { "id": "specificity-008", "fact_index": 4, "query": "which note-taking app do I use?",             "gold_in_topk": true, "gold_answer": "Logseq for personal, Notion for work" },
+        { "id": "specificity-009", "fact_index": 5, "query": "where do I like to eat in Lisbon?",           "gold_in_topk": true, "gold_answer": "Cervejaria Ramiro" },
+        { "id": "specificity-010", "fact_index": 6, "query": "what brand sunglasses does the user wear?",   "gold_in_topk": true, "gold_answer": "Smith Lowdown" },
+        { "id": "specificity-011", "fact_index": 7, "query": "do I read on a Kindle?",                      "gold_in_topk": true, "gold_answer": "yes, Paperwhite 11th-gen" },
+        { "id": "specificity-012", "fact_index": 8, "query": "what espresso machine does the user own?",    "gold_in_topk": true, "gold_answer": "Lelit Bianca v3" },
+        { "id": "specificity-013", "fact_index": 9, "query": "what shoes do I wear?",                       "gold_in_topk": true, "gold_answer": "Allbirds Wool Runners" },
+        { "id": "specificity-014", "fact_index": 9, "query": "what brand are the user's everyday shoes?",   "gold_in_topk": true, "gold_answer": "Allbirds" }
+      ]
+    },
+    "negation": {
+      "description": "polarity sensitivity — facts encoding what the user does NOT do/like",
+      "facts": [
+        "The user does not drink coffee. They prefer tea.",
+        "The user is not vegetarian, but avoids red meat.",
+        "The user does not use Twitter; they use Bluesky and Mastodon.",
+        "The user does not own a car; they bike or use public transit.",
+        "The user is not on LinkedIn anymore.",
+        "The user dislikes cilantro intensely.",
+        "The user has never been to Asia.",
+        "The user does not enjoy horror movies.",
+        "The user does not eat shellfish.",
+        "The user is not currently learning any new languages."
+      ],
+      "items": [
+        { "id": "negation-001", "fact_index": 0, "query": "does the user drink coffee?",                    "gold_in_topk": true, "gold_answer": "no, the user prefers tea" },
+        { "id": "negation-002", "fact_index": 0, "query": "what does the user drink in the morning?",       "gold_in_topk": true, "gold_answer": "tea (not coffee)" },
+        { "id": "negation-003", "fact_index": 1, "query": "am I vegetarian?",                               "gold_in_topk": true, "gold_answer": "no, but avoids red meat" },
+        { "id": "negation-004", "fact_index": 2, "query": "is the user on Twitter?",                        "gold_in_topk": true, "gold_answer": "no, uses Bluesky and Mastodon" },
+        { "id": "negation-005", "fact_index": 2, "query": "which social networks does the user use?",       "gold_in_topk": true, "gold_answer": "Bluesky and Mastodon" },
+        { "id": "negation-006", "fact_index": 3, "query": "does the user own a car?",                       "gold_in_topk": true, "gold_answer": "no, bikes or public transit" },
+        { "id": "negation-007", "fact_index": 4, "query": "is the user active on LinkedIn?",                "gold_in_topk": true, "gold_answer": "no, not anymore" },
+        { "id": "negation-008", "fact_index": 5, "query": "any foods the user hates?",                      "gold_in_topk": true, "gold_answer": "cilantro" },
+        { "id": "negation-009", "fact_index": 6, "query": "has the user traveled to Asia?",                 "gold_in_topk": true, "gold_answer": "no, never" },
+        { "id": "negation-010", "fact_index": 7, "query": "does the user like horror movies?",              "gold_in_topk": true, "gold_answer": "no" },
+        { "id": "negation-011", "fact_index": 8, "query": "can the user eat shrimp?",                       "gold_in_topk": true, "gold_answer": "no — does not eat shellfish" },
+        { "id": "negation-012", "fact_index": 9, "query": "is the user learning a new language?",           "gold_in_topk": true, "gold_answer": "no, not currently" }
+      ]
+    },
+    "distractors": {
+      "description": "Extraction-style meta-facts that pollute real SDK stores. Observed verbatim or near-verbatim in the partner demo (e.g. 'The user asked for the user's name', 'The user is me', 'As of <date>, X is a term mentioned in the conversation'). These should NEVER be a top-1 match for any user-fact query, but in the real failure they outranked the gold.",
+      "facts": [
+        "The user asked for the user's name.",
+        "The user is me.",
+        "The user is asking a question.",
+        "The user requested information.",
+        "The user said something.",
+        "As of May 14, 2026, Apollo is a term mentioned in the conversation.",
+        "As of May 14, 2026, the user is a term mentioned in the conversation.",
+        "A name was mentioned in the conversation.",
+        "The conversation involves the user.",
+        "The user has started a conversation."
+      ],
+      "items": []
+    },
+    "control": {
+      "description": "queries that should NOT match the user-fact pool — measures false positives",
+      "facts": [
+        "The user's name is Alex.",
+        "The user lives in Lisbon.",
+        "The user has a dog named Apollo.",
+        "The user works at Acme as a software engineer.",
+        "The user's favorite restaurant is Cervejaria Ramiro."
+      ],
+      "items": [
+        { "id": "control-001", "query": "what is the airspeed velocity of an unladen swallow?",             "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-002", "query": "who is the current president of France?",                          "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-003", "query": "what is the capital of Mongolia?",                                 "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-004", "query": "how does photosynthesis work?",                                    "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-005", "query": "translate 'goodnight' to Japanese",                                "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-006", "query": "what year did World War II end?",                                  "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-007", "query": "explain entropy in thermodynamics",                                "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-008", "query": "best way to debug a segfault in C",                                "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-009", "query": "what's the weather going to be tomorrow?",                         "gold_in_topk": false, "gold_answer": null },
+        { "id": "control-010", "query": "give me a recipe for tiramisu",                                    "gold_in_topk": false, "gold_answer": null }
+      ]
+    }
+  }
+}
diff --git a/benchmarks/alignbench/modal_ablate.py b/benchmarks/alignbench/modal_ablate.py
new file mode 100644
index 0000000..b496fd7
--- /dev/null
+++ b/benchmarks/alignbench/modal_ablate.py
@@ -0,0 +1,193 @@
+"""
+AlignBench embedding-model ablation on Modal.
+
+Runs the AlignBench items.json against multiple sentence-transformer models,
+one A10G GPU container per model, returning per-model per-axis recall@1 /
+recall@5 and the median gold margin. Resolves the local-CPU stall on mpnet and
+gives real signal on whether a stronger embedding model closes the temporal /
+pronoun gaps the bi-encoder MiniLM (SDK default) struggles with.
+
+Outputs runs/modal-ablation.json — one entry per model.
+
+Usage:
+    modal run modal_ablate.py
+    (writes to runs/modal-ablation.json in this folder)
+"""
+
+import json
+import pathlib
+import modal
+
+# Name under which the Modal app below is registered.
+APP_NAME = "alignbench-embed-ablate"
+
+# Container image: Debian slim with Python 3.11 and pinned dependencies.
+# NOTE(review): rank-bm25 is installed here but never imported in this
+# module — confirm it is still needed before keeping it in the image.
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "sentence-transformers==3.2.1",
+        "torch==2.5.1",
+        "rank-bm25==0.2.2",
+        "numpy<2",
+    )
+)
+
+# Modal app created with the image above.
+app = modal.App(APP_NAME, image=image)
+
+# Embedding models to ablate; main() maps run_model over this list and the
+# results keep this order's model names in their "model" field.
+MODELS = [
+    "sentence-transformers/all-MiniLM-L6-v2",  # SDK default, ~22M params
+    "sentence-transformers/all-mpnet-base-v2",  # 110M params
+    "BAAI/bge-small-en-v1.5",  # 33M params
+    "BAAI/bge-base-en-v1.5",  # 109M params
+    "intfloat/e5-small-v2",  # 33M params
+    "intfloat/e5-base-v2",  # 110M params
+]
+
+
+def _cosine(a, b):
+    import numpy as np
+    a = np.asarray(a, dtype="float32")
+    b = np.asarray(b, dtype="float32")
+    na = np.linalg.norm(a) * np.linalg.norm(b)
+    return float(np.dot(a, b) / na) if na > 0 else 0.0
+
+
+def _build_pool(manifest):
+    pool = []
+    for axis_name, body in manifest["axes"].items():
+        is_distractor = axis_name == "distractors"
+        for i, fact in enumerate(body["facts"]):
+            pool.append(
+                {"text": fact, "globalKey": f"{axis_name}#{i}",
+                 "axis": axis_name, "factIndex": i, "isDistractor": is_distractor}
+            )
+    return pool
+
+
+def _score_one_model(model_name: str, manifest: dict) -> dict:
+    """Embed all facts, score all queries, return per-axis + composite metrics."""
+    from sentence_transformers import SentenceTransformer
+    import time
+
+    t0 = time.time()
+    model = SentenceTransformer(model_name)
+    pool = _build_pool(manifest)
+    fact_vecs = model.encode([e["text"] for e in pool], normalize_embeddings=True)
+
+    per_axis = []
+    composite_top1 = composite_top5 = composite_n = 0
+    composite_distractor = 0
+
+    for axis_name, body in manifest["axes"].items():
+        if not body["items"]:
+            continue
+        hit1 = hit5 = 0
+        distractor_top1 = 0
+        margins = []
+        ranks = []
+
+        for item in body["items"]:
+            q_vec = model.encode([item["query"]], normalize_embeddings=True)[0]
+            scores = [_cosine(q_vec, fv) for fv in fact_vecs]
+            ranked = sorted(
+                ((s, pool[i]) for i, s in enumerate(scores)),
+                key=lambda x: -x[0],
+            )
+            # dedupe by globalKey (best rank wins)
+            seen = set()
+            dedup = []
+            for s, entry in ranked:
+                if entry["globalKey"] in seen:
+                    continue
+                seen.add(entry["globalKey"])
+                dedup.append((s, entry))
+            top5 = dedup[:5]
+
+            gold_rank = None
+            gold_score = None
+            if item.get("gold_in_topk") and item.get("fact_index") is not None:
+                gold_key = f"{axis_name}#{item['fact_index']}"
+                for idx, (s, entry) in enumerate(dedup):
+                    if entry["globalKey"] == gold_key:
+                        gold_rank = idx + 1
+                        gold_score = s
+                        break
+
+            if gold_rank == 1:
+                hit1 += 1
+            if gold_rank is not None and gold_rank <= 5:
+                hit5 += 1
+            if gold_rank is not None:
+                ranks.append(gold_rank)
+
+            if item.get("gold_in_topk") and top5 and top5[0][1]["isDistractor"]:
+                distractor_top1 += 1
+
+            if gold_score is not None:
+                best_non_gold = next(
+                    (s for s, e in dedup if e["globalKey"] != f"{axis_name}#{item['fact_index']}"),
+                    None,
+                )
+                if best_non_gold is not None:
+                    margins.append(gold_score - best_non_gold)
+
+        n = len(body["items"])
+        margins.sort()
+        per_axis.append({
+            "axis": axis_name,
+            "n": n,
+            "recall_at_1": hit1 / n if n else None,
+            "recall_at_5": hit5 / n if n else None,
+            "mean_gold_rank": (sum(ranks) / len(ranks)) if ranks else None,
+            "median_gold_margin": margins[len(margins) // 2] if margins else None,
+            "distractor_at_top1": distractor_top1,
+        })
+        if axis_name != "control":
+            composite_top1 += hit1
+            composite_top5 += hit5
+            composite_n += n
+            composite_distractor += distractor_top1
+
+    wall = time.time() - t0
+    return {
+        "model": model_name,
+        "wall_seconds": round(wall, 1),
+        "composite": {
+            "recall_at_1": composite_top1 / composite_n if composite_n else None,
+            "recall_at_5": composite_top5 / composite_n if composite_n else None,
+            "distractor_top1_rate": composite_distractor / composite_n if composite_n else None,
+            "n": composite_n,
+        },
+        "per_axis": per_axis,
+    }
+
+
+@app.function(gpu="A10G", timeout=1200)
+def run_model(model_name: str, manifest_json: str) -> dict:
+    """Remote: run one model end-to-end and return its result dict."""
+    manifest = json.loads(manifest_json)
+    return _score_one_model(model_name, manifest)
+
+
+@app.local_entrypoint()
+def main():
+    here = pathlib.Path(__file__).parent
+    manifest_json = (here / "items.json").read_text()
+
+    # Fan out across models — Modal autoscales containers, one per model.
+    results = list(run_model.map(MODELS, kwargs={"manifest_json": manifest_json}))
+
+    out_path = here / "runs" / "modal-ablation.json"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps({"models": results}, indent=2))
+
+    print("\n=== AlignBench embedding ablation (Modal A10G) ===\n")
+    print(f"{'model':<48} {'r@1':>6} {'r@5':>6} {'distr':>6} {'wall_s':>7}")
+    for r in results:
+        c = r["composite"]
+        print(
+            f"{r['model']:<48} "
+            f"{c['recall_at_1']:.3f}  "
+            f"{c['recall_at_5']:.3f}  "
+            f"{c['distractor_top1_rate']:.3f}  "
+            f"{r['wall_seconds']:>7.1f}"
+        )
+    print(f"\nsaved → {out_path}")
diff --git a/benchmarks/alignbench/modal_demo_stress.py b/benchmarks/alignbench/modal_demo_stress.py
new file mode 100644
index 0000000..e3a0e99
--- /dev/null
+++ b/benchmarks/alignbench/modal_demo_stress.py
@@ -0,0 +1,345 @@
+"""
+Demo-class synthetic stress test on Modal.
+
+This is the closest reproduction we can run of the actual partner-demo
+failure shape *without* deploying the full core+Postgres stack:
+
+  1. Take 30 short hand-written conversations (one user statement plus an
+     assistant acknowledgement each), each paired with a recall question.
+  2. Run the REAL production extraction LLM (Anthropic Haiku, same model
+     and same EXTRACTION_PROMPT the engine uses) on each conversation.
+  3. Score each conversation twice: with the raw extracted facts and with
+     the meta-fact filter applied. The filter matches what core does after
+     the alignbench-meta-fact-filter-2026-05-14 branch ships.
+  4. Embed every surviving fact + the recall query with the production
+     SDK embedding model (Xenova/all-MiniLM-L6-v2 via
+     sentence-transformers).
+  5. Score cosine similarity, rank facts, check whether the gold fact
+     ranks #1, count how many meta-facts ranked above it.
+
+Output: runs/demo-stress.json with per-conversation results, summary
+deltas, and concrete failure examples.
+
+This reproduces the cosine-margin-too-thin pattern that the partner-demo
+screenshots showed, on synthetic data so we can iterate safely.
+
+Why on Modal rather than local: parallel extraction calls hit Anthropic
+rate limits faster than a single laptop can absorb, and Modal also lets
+us run sentence-transformers on a beefy CPU container without local
+ONNX init stalls.
+
+Usage:
+    modal run modal_demo_stress.py
+"""
+
+import json
+import os
+import pathlib
+import modal
+
+APP_NAME = "alignbench-demo-stress"
+
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "sentence-transformers==3.2.1",
+        "torch==2.5.1",
+        "numpy<2",
+        "anthropic==0.40.0",
+        "httpx>=0.27",
+    )
+    .add_local_file(__file__, "/root/modal_demo_stress.py")
+)
+
+# Reuse the meta-fact patterns the SDK + core both ship.
+# Importing across the local-vs-Modal boundary is awkward; the patterns are short.
+META_FACT_PATTERNS = [
+    r"^\s*the user (asked|requested|said|is asking|is me)\b",
+    r"^\s*as of [^,]+,\s+.+\s+is a term mentioned in the conversation\.?$",
+    r"^\s*a name was mentioned\b",
+    r"^\s*the conversation involves the user\b",
+    r"^\s*the user has started a conversation\b",
+]
+
+
+# Compact production extraction prompt — abbreviated to keep the call cheap
+# while preserving the rule that drives meta-fact emission in real production.
+# Mirrors src/services/extraction.ts EXTRACTION_PROMPT structure but trimmed
+# to the rules that matter for this stress test (we are not testing the
+# entity/keyword fields, only what gets emitted as a statement).
+EXTRACTION_PROMPT = """You are a memory extraction system. Your only output is a JSON object. You never produce conversational replies. You never continue the dialogue. You read the transcript and emit facts as JSON.
+
+Extract discrete, self-contained facts from the conversation transcript below. Each fact should be useful if retrieved months later in a completely different conversation.
+
+RULES:
+- Each fact must be a single, atomic statement.
+- Include enough context to be understood in isolation.
+- Replace pronouns with specific names/references.
+- Length is NOT a reason to skip a fact. A single user sentence containing a named entity (person, place, profession, possession, preference, allergy, hobby) IS extractable. "I'm Alex" → one fact. "I live in Lisbon" → one fact. "My dog is named Apollo" → one fact.
+- Skip pleasantries, filler, acknowledgments, and meta-observations about the conversation itself.
+- NEVER extract meta-facts of the form "the user asked X", "a term was mentioned", "the conversation involves the user". These describe the chat, not the user.
+- Rate importance 0.0-1.0.
+
+Your output MUST be a single raw JSON object, no markdown fences, no preamble, no continuation of the conversation:
+{"memories": [{"statement": "...", "importance": 0.7}]}
+
+If no extractable facts: {"memories": []}"""
+
+
+# 30 conversations: each has user-asserted facts + a recall question + the
+# gold fact text we expect retrieval to surface. Crafted to mirror the
+# partner-demo failure surface: short, casual, personal, multi-fact.
+CONVERSATIONS = [
+    {"id": "name-001", "turns": ["My name is Alex.", "Got it."], "query": "what is my name?", "gold": "name is Alex"},
+    {"id": "name-002", "turns": ["I go by Sam.", "OK Sam."], "query": "what's my name?", "gold": "go by Sam"},
+    {"id": "name-003", "turns": ["You can call me Riley.", "Hi Riley."], "query": "what should you call me?", "gold": "call me Riley"},
+    {"id": "name-004", "turns": ["I'm Jordan.", "Nice to meet you."], "query": "who am I?", "gold": "Jordan"},
+    {"id": "pet-001", "turns": ["I have a golden retriever named Apollo.", "How sweet."], "query": "what is my dog's name?", "gold": "Apollo"},
+    {"id": "pet-002", "turns": ["My cat Luna sleeps on my keyboard.", "Classic cat."], "query": "what's my cat's name?", "gold": "Luna"},
+    {"id": "pet-003", "turns": ["I just adopted a beagle puppy. Her name is Penny.", "Congrats!"], "query": "what kind of dog do I have?", "gold": "beagle"},
+    {"id": "job-001", "turns": ["I work as a software engineer at a startup.", "Cool field."], "query": "what do I do for work?", "gold": "software engineer"},
+    {"id": "job-002", "turns": ["I'm a high school chemistry teacher.", "That's important work."], "query": "what is my profession?", "gold": "chemistry teacher"},
+    {"id": "job-003", "turns": ["I freelance as a graphic designer.", "Nice."], "query": "what's my job?", "gold": "graphic designer"},
+    {"id": "city-001", "turns": ["I live in Lisbon now.", "Beautiful city."], "query": "where do I live?", "gold": "Lisbon"},
+    {"id": "city-002", "turns": ["I just moved to Berlin last month.", "Welcome to Berlin."], "query": "what city am I in?", "gold": "Berlin"},
+    {"id": "city-003", "turns": ["I'm based in Toronto.", "Cold this time of year."], "query": "where am I located?", "gold": "Toronto"},
+    {"id": "food-001", "turns": ["I'm vegetarian.", "Got it."], "query": "do I eat meat?", "gold": "vegetarian"},
+    {"id": "food-002", "turns": ["I'm severely allergic to peanuts.", "Noted, will avoid."], "query": "do I have any allergies?", "gold": "peanut"},
+    {"id": "food-003", "turns": ["I don't drink coffee — only tea.", "Tea is great too."], "query": "what do I drink in the morning?", "gold": "tea"},
+    {"id": "hobby-001", "turns": ["I play classical piano.", "Lovely hobby."], "query": "what instrument do I play?", "gold": "piano"},
+    {"id": "hobby-002", "turns": ["My main sport is rock climbing.", "Cool."], "query": "what sport do I do?", "gold": "rock climbing"},
+    {"id": "hobby-003", "turns": ["I've been knitting for about ten years.", "Impressive."], "query": "what's a hobby I have?", "gold": "knitting"},
+    {"id": "family-001", "turns": ["I have two kids, Maya and Theo.", "What ages?"], "query": "how many children do I have?", "gold": "two"},
+    {"id": "family-002", "turns": ["My partner's name is Casey.", "Nice."], "query": "who is my partner?", "gold": "Casey"},
+    {"id": "family-003", "turns": ["My mom lives in Vancouver.", "Far from you?"], "query": "where does my mom live?", "gold": "Vancouver"},
+    {"id": "vehicle-001", "turns": ["I drive a blue Subaru Outback.", "Reliable car."], "query": "what kind of car do I have?", "gold": "Subaru"},
+    {"id": "vehicle-002", "turns": ["I don't own a car. I bike everywhere.", "Healthy lifestyle."], "query": "do I have a car?", "gold": "does not own"},
+    {"id": "edu-001", "turns": ["I studied applied mathematics in college.", "Tough major."], "query": "what was my major?", "gold": "applied mathematics"},
+    {"id": "edu-002", "turns": ["I got my MBA from UCLA two years ago.", "Congrats."], "query": "where did I get my MBA?", "gold": "UCLA"},
+    {"id": "tech-001", "turns": ["My main laptop is a 16-inch MacBook Pro.", "Solid machine."], "query": "what computer do I use?", "gold": "MacBook"},
+    {"id": "tech-002", "turns": ["I prefer Neovim over VS Code.", "Editor preferences are personal."], "query": "what editor do I use?", "gold": "Neovim"},
+    {"id": "music-001", "turns": ["I've been getting into bluegrass lately.", "Fun genre."], "query": "what music am I into these days?", "gold": "bluegrass"},
+    {"id": "music-002", "turns": ["My all-time favorite band is Radiohead.", "Great band."], "query": "what's my favorite band?", "gold": "Radiohead"},
+]
+
+app = modal.App(APP_NAME, image=image)
+
+
+def _is_meta_fact(text: str, patterns: list[str]) -> bool:
+    import re
+    if not isinstance(text, str) or len(text) == 0:
+        return False
+    for p in patterns:
+        if re.search(p, text, flags=re.IGNORECASE):
+            return True
+    return False
+
+
+def _cosine(a, b) -> float:
+    import numpy as np
+    a = np.asarray(a, dtype="float32")
+    b = np.asarray(b, dtype="float32")
+    n = np.linalg.norm(a) * np.linalg.norm(b)
+    return float(np.dot(a, b) / n) if n > 0 else 0.0
+
+
+@app.function(timeout=600)
+def extract_facts(conversation_turns: list[str], anthropic_key: str) -> list[dict]:
+    """Call Anthropic Haiku with the production EXTRACTION_PROMPT shape."""
+    from anthropic import Anthropic
+
+    client = Anthropic(api_key=anthropic_key)
+    convo_text = "\n".join(f"User: {t}" if i % 2 == 0 else f"Assistant: {t}" for i, t in enumerate(conversation_turns))
+    # Force JSON-only output via assistant-role prefill of "{". Anthropic
+    # then resumes generation INSIDE the JSON object, eliminating the
+    # chat-continuation failure mode we observed empirically. The prefilled
+    # "{" is added back to the parsed text.
+    msg = client.messages.create(
+        model="claude-haiku-4-5",
+        max_tokens=600,
+        temperature=0,
+        system=EXTRACTION_PROMPT,
+        messages=[
+            {"role": "user", "content": f"Conversation:\n{convo_text}"},
+            {"role": "assistant", "content": "{"},
+        ],
+    )
+    generated = "".join(block.text for block in msg.content if hasattr(block, "text"))
+    text = "{" + generated
+    # Robust JSON extraction: strip markdown fences, then find the JSON object.
+    cleaned = text.strip()
+    if cleaned.startswith("```"):
+        cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
+        if cleaned.endswith("```"):
+            cleaned = cleaned.rsplit("```", 1)[0]
+    cleaned = cleaned.strip()
+    # If LLM added preamble like "Here are the facts:", find the first { and last }.
+    if not cleaned.startswith("{"):
+        start = cleaned.find("{")
+        end = cleaned.rfind("}")
+        if start >= 0 and end > start:
+            cleaned = cleaned[start : end + 1]
+    try:
+        parsed = json.loads(cleaned)
+        return parsed.get("memories", []) if isinstance(parsed, dict) else []
+    except json.JSONDecodeError as e:
+        # Log to Modal stderr so we can diagnose in the run output.
+        import sys
+        sys.stderr.write(f"[extract] JSON parse failed: {e}; raw text first 200 chars: {text[:200]!r}\n")
+        return []
+
+
+@app.function(timeout=1800)
+def score_all_conversations(conversations: list[dict], anthropic_key: str) -> dict:
+    """Run extraction + embedding + scoring for every conversation, both with and without the filter."""
+    from sentence_transformers import SentenceTransformer
+
+    print(f"[score] loading embedding model...", flush=True)
+    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+    # First: extract facts for every conversation (parallel via .map below would be cleaner,
+    # but Modal nested-function calls add complexity; sequential is fine for n=30).
+    print(f"[score] extracting facts for {len(conversations)} conversations...", flush=True)
+    rows = []
+    for conv in conversations:
+        try:
+            facts = extract_facts.remote(conv["turns"], anthropic_key)
+        except Exception as e:
+            print(f"  {conv['id']} EXTRACTION FAIL: {e}", flush=True)
+            facts = []
+        statements = [
+            (f.get("statement") or "").strip()
+            for f in facts
+            if isinstance(f, dict) and isinstance(f.get("statement"), str)
+        ]
+        statements = [s for s in statements if s]
+        meta_mask = [_is_meta_fact(s, META_FACT_PATTERNS) for s in statements]
+        rows.append({
+            "id": conv["id"],
+            "turns": conv["turns"],
+            "query": conv["query"],
+            "gold": conv["gold"],
+            "facts": statements,
+            "meta_mask": meta_mask,
+            "n_facts": len(statements),
+            "n_meta": sum(meta_mask),
+        })
+        print(f"  {conv['id']}: {len(statements)} facts ({sum(meta_mask)} meta)", flush=True)
+
+    # Second: embed everything and score retrieval, twice (with/without filter).
+    print(f"[score] embedding + scoring...", flush=True)
+    summary = {"baseline": {}, "filtered": {}, "n": len(rows)}
+    for mode in ("baseline", "filtered"):
+        hits_at_1 = 0
+        hits_at_5 = 0
+        gold_present = 0
+        meta_top1 = 0
+        per_item = []
+        for row in rows:
+            facts = row["facts"]
+            mask = row["meta_mask"]
+            if mode == "filtered":
+                facts_eff = [s for s, m in zip(facts, mask) if not m]
+                meta_eff = [False] * len(facts_eff)
+            else:
+                facts_eff = facts
+                meta_eff = mask
+
+            if not facts_eff:
+                per_item.append({
+                    "id": row["id"], "gold_rank": None, "gold_score": None,
+                    "top1_is_meta": False, "top1_text": None,
+                })
+                continue
+
+            q_vec = embedder.encode([row["query"]], normalize_embeddings=True)[0]
+            f_vecs = embedder.encode(facts_eff, normalize_embeddings=True)
+            scores = [_cosine(q_vec, fv) for fv in f_vecs]
+            ranked = sorted(
+                ((s, i) for i, s in enumerate(scores)),
+                key=lambda x: -x[0],
+            )
+
+            # Match gold by (a) substring fast path (case-insensitive, also handles
+            # short stems by stripping trailing punctuation), or (b) cosine similarity
+            # >= 0.65 against the gold tag if substring fails. The cosine fallback
+            # recovers cases like gold='go by Sam' matching 'goes by the name Sam'.
+            gold_token = row["gold"].lower().rstrip(".,!?;:")
+            gold_vec = embedder.encode([row["gold"]], normalize_embeddings=True)[0]
+            gold_rank = None
+            gold_score = None
+            for rank, (s, idx) in enumerate(ranked, start=1):
+                fact_lower = facts_eff[idx].lower()
+                if gold_token in fact_lower:
+                    gold_rank = rank
+                    gold_score = s
+                    break
+                # cosine fallback for stem-mismatch + semantic-paraphrase substring failures
+                # ("don't own" matches "does not own", "go by" matches "goes by")
+                if _cosine(gold_vec, f_vecs[idx]) >= 0.55:
+                    gold_rank = rank
+                    gold_score = s
+                    break
+
+            top1_idx = ranked[0][1]
+            top1_text = facts_eff[top1_idx]
+            top1_is_meta = meta_eff[top1_idx]
+
+            if gold_rank is not None:
+                gold_present += 1
+                if gold_rank == 1:
+                    hits_at_1 += 1
+                if gold_rank <= 5:
+                    hits_at_5 += 1
+            if top1_is_meta:
+                meta_top1 += 1
+
+            per_item.append({
+                "id": row["id"],
+                "gold_rank": gold_rank,
+                "gold_score": gold_score,
+                "top1_score": ranked[0][0],
+                "top1_text": top1_text,
+                "top1_is_meta": top1_is_meta,
+            })
+
+        summary[mode] = {
+            "recall_at_1": hits_at_1 / len(rows),
+            "recall_at_5": hits_at_5 / len(rows),
+            "gold_present_rate": gold_present / len(rows),
+            "meta_at_top1": meta_top1,
+            "per_item": per_item,
+        }
+
+    summary["rows"] = rows
+    return summary
+
+
+@app.local_entrypoint()
+def main():
+    anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "")
+    if not anthropic_key:
+        raise RuntimeError("ANTHROPIC_API_KEY env var must be set locally before `modal run`")
+    result = score_all_conversations.remote(CONVERSATIONS, anthropic_key)
+
+    here = pathlib.Path(__file__).parent
+    out_path = here / "runs" / "demo-stress.json"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(result, indent=2))
+
+    print("\n" + "=" * 60)
+    print("Demo-class stress test results")
+    print("=" * 60)
+    n = result["n"]
+    for mode in ("baseline", "filtered"):
+        s = result[mode]
+        print(
+            f"\n{mode:8}  r@1={s['recall_at_1']:.3f}  "
+            f"r@5={s['recall_at_5']:.3f}  "
+            f"gold_present={s['gold_present_rate']:.3f}  "
+            f"meta_top1={s['meta_at_top1']}/{n}"
+        )
+    delta_r1 = result["filtered"]["recall_at_1"] - result["baseline"]["recall_at_1"]
+    delta_meta = result["baseline"]["meta_at_top1"] - result["filtered"]["meta_at_top1"]
+    print(f"\nfilter delta:  r@1 {delta_r1:+.3f}  meta_top1 {delta_meta:+d}")
+    print(f"\nsaved -> {out_path}")
diff --git a/benchmarks/alignbench/run.mjs b/benchmarks/alignbench/run.mjs
new file mode 100644
index 0000000..9687442
--- /dev/null
+++ b/benchmarks/alignbench/run.mjs
@@ -0,0 +1,306 @@
+#!/usr/bin/env node
+/**
+ * AlignBench runner — standalone, no SDK/Postgres/network required.
+ *
+ * Embeds every fact in each axis, embeds every query, scores cosine similarity,
+ * reports recall@1 / recall@5 / mean-gold-rank / false-positive@5 per axis,
+ * and writes a single run JSON.
+ *
+ * Variants are switched via flags; the underlying scoring is identical so
+ * results are directly comparable across runs.
+ *
+ *   node run.mjs                                  # baseline (current SDK stack)
+ *   node run.mjs --variant=query-rewrite          # rewrite pronouns in query
+ *   node run.mjs --variant=dual-storage           # store fact in both forms
+ *   node run.mjs --variant=hybrid-bm25            # BM25 + semantic union
+ *   node run.mjs --variant=combined               # query-rewrite + dual-storage + hybrid-bm25
+ *
+ *   node run.mjs --out=runs/baseline.json --model=Xenova/all-MiniLM-L6-v2
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { pipeline } from '@huggingface/transformers';
+
+// -- CLI --
+const args = Object.fromEntries(
+  process.argv.slice(2).map(a => {
+    const [k, v] = a.replace(/^--/, '').split('=');
+    return [k, v ?? true];
+  })
+);
+const VARIANT = args.variant ?? 'baseline';
+const MODEL = args.model ?? 'Xenova/all-MiniLM-L6-v2';
+const TOPK = Number(args.topk ?? 5);
+const HERE = path.dirname(fileURLToPath(import.meta.url));
+const OUT = args.out ?? path.join(HERE, 'runs', `${VARIANT}.json`);
+
+// -- Load manifest --
+const manifest = JSON.parse(fs.readFileSync(path.join(HERE, 'items.json'), 'utf8'));
+
+// -- Variant: query rewrite --
+// Deterministic pronoun substitution to bridge first-person → third-person.
+// Order matters: longer phrases first so we don't double-substitute.
+const PRONOUN_RULES = [
+  [/\bmy\b/gi, "the user's"],
+  [/\bme\b/gi, 'the user'],
+  [/\bI am\b/gi, 'the user is'],
+  [/\bI'm\b/gi, 'the user is'],
+  [/\bI've\b/gi, 'the user has'],
+  [/\bI'd\b/gi, 'the user would'],
+  [/\bI'll\b/gi, 'the user will'],
+  [/\bI\b/gi, 'the user'],
+  [/\bmyself\b/gi, 'the user'],
+];
+function rewriteQueryPronouns(q) {
+  let out = q;
+  for (const [re, repl] of PRONOUN_RULES) out = out.replace(re, repl);
+  return out;
+}
+
+// -- Variant: dual storage --
+// For each fact, also produce a first-person paraphrase. Both are stored;
+// retrieval picks whichever scores higher.
+const STORE_RULES = [
+  [/\bThe user's\b/g, 'My'],
+  [/\bthe user's\b/g, 'my'],
+  [/\bThe user\b/g, 'I'],
+  [/\bthe user\b/g, 'I'],
+];
+function paraphraseFirstPerson(fact) {
+  let out = fact;
+  for (const [re, repl] of STORE_RULES) out = out.replace(re, repl);
+  // tidy common verb agreements after subject rewrite (avoid worst surface mismatches)
+  out = out.replace(/\bI is\b/g, 'I am').replace(/\bI has\b/g, 'I have').replace(/\bI does\b/g, 'I do');
+  return out;
+}
+
+// -- Hybrid BM25 implementation (tiny, just for this benchmark) --
+// tokenize(s): lowercase `s` and return its maximal [a-z0-9] runs, in order.
+// Everything else (punctuation, whitespace, non-ASCII letters) acts as a separator.
+function tokenize(s) { return s.toLowerCase().match(/[a-z0-9]+/g) || []; }
+function bm25Scores(queryTokens, docsTokens, k1 = 1.5, b = 0.75) {
+  const N = docsTokens.length;
+  const avgDL = docsTokens.reduce((a, d) => a + d.length, 0) / Math.max(1, N);
+  const df = new Map();
+  for (const doc of docsTokens) {
+    for (const t of new Set(doc)) df.set(t, (df.get(t) ?? 0) + 1);
+  }
+  const idf = (t) => Math.log(1 + (N - (df.get(t) ?? 0) + 0.5) / ((df.get(t) ?? 0) + 0.5));
+  return docsTokens.map((doc) => {
+    const tf = new Map();
+    for (const t of doc) tf.set(t, (tf.get(t) ?? 0) + 1);
+    const dl = doc.length;
+    let score = 0;
+    for (const qt of new Set(queryTokens)) {
+      const f = tf.get(qt) ?? 0;
+      if (f === 0) continue;
+      score += idf(qt) * ((f * (k1 + 1)) / (f + k1 * (1 - b + b * (dl / avgDL))));
+    }
+    return score;
+  });
+}
+function minmaxNormalize(arr) {
+  let lo = Infinity, hi = -Infinity;
+  for (const v of arr) { if (v < lo) lo = v; if (v > hi) hi = v; }
+  const span = hi - lo;
+  if (span <= 0) return arr.map(() => 0);
+  return arr.map((v) => (v - lo) / span);
+}
+
+// -- Cosine --
+function cosine(a, b) {
+  let dot = 0, na = 0, nb = 0;
+  for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
+  const m = Math.sqrt(na * nb);
+  return m === 0 ? 0 : dot / m;
+}
+
+// -- Embedder --
+async function loadEmbedder(model) {
+  console.log(`[load] ${model}`);
+  const fn = await pipeline('feature-extraction', model);
+  return async (text) => {
+    const out = await fn(text, { pooling: 'mean', normalize: true });
+    return Array.from(out.data);
+  };
+}
+
+// -- Build the global fact pool used for ALL queries --
+// Real SDK stores are mixed: facts from all topics, plus extraction-style meta-facts
+// that pollute the embedding space. The realistic test is "can the right user-fact
+// outrank the distractors when they all live in the same store".
+function buildGlobalPool(manifest, variant) {
+  // {axisName: [{text, globalKey}]}. globalKey is the canonical id used for gold matching.
+  const entries = []; // [{ text, globalKey, axis, factIndex, isDistractor }]
+  for (const [axisName, body] of Object.entries(manifest.axes)) {
+    const isDistractor = axisName === 'distractors';
+    for (let i = 0; i < body.facts.length; i++) {
+      const globalKey = `${axisName}#${i}`;
+      entries.push({ text: body.facts[i], globalKey, axis: axisName, factIndex: i, isDistractor });
+      if (variant === 'dual-storage' || variant === 'combined') {
+        const para = paraphraseFirstPerson(body.facts[i]);
+        if (para !== body.facts[i]) {
+          entries.push({ text: para, globalKey, axis: axisName, factIndex: i, isDistractor });
+        }
+      }
+    }
+  }
+  return entries;
+}
+
+async function scoreAxis(axisName, axisBody, embed, pool, factVecs, factTokens) {
+  const perItem = [];
+  let hitAt1 = 0, hitAt5 = 0;
+  let goldRankSum = 0, goldRankN = 0;
+  let distractorTop1 = 0;
+  let fpAt5 = 0;
+  const marginSamples = []; // gold_score - best_non_gold_score
+
+  for (const item of axisBody.items) {
+    const origQ = item.query;
+    let effQ = origQ;
+    if (VARIANT === 'query-rewrite' || VARIANT === 'combined') {
+      effQ = rewriteQueryPronouns(origQ);
+    }
+    const qVec = await embed(effQ);
+    const semScores = factVecs.map((fv) => cosine(qVec, fv));
+    let scores = semScores;
+    if (VARIANT === 'hybrid-bm25' || VARIANT === 'combined') {
+      const qTokens = tokenize(effQ);
+      const bm = bm25Scores(qTokens, factTokens);
+      const semN = minmaxNormalize(semScores);
+      const bmN = minmaxNormalize(bm);
+      scores = semN.map((s, i) => 0.6 * s + 0.4 * bmN[i]);
+    }
+
+    // Rank, collapse to globalKey (dual-storage duplicates same key)
+    const ranked = scores
+      .map((s, i) => ({ s, entry: pool[i] }))
+      .sort((a, b) => b.s - a.s);
+    const seen = new Set();
+    const dedup = [];
+    for (const r of ranked) {
+      if (seen.has(r.entry.globalKey)) continue;
+      seen.add(r.entry.globalKey);
+      dedup.push(r);
+    }
+    const topK = dedup.slice(0, TOPK);
+
+    // Gold match: same axis + same factIndex
+    let goldRank = null;
+    let goldScore = null;
+    const goldKey = (item.fact_index != null) ? `${axisName}#${item.fact_index}` : null;
+    if (item.gold_in_topk && goldKey) {
+      const idx = dedup.findIndex((r) => r.entry.globalKey === goldKey);
+      if (idx >= 0) { goldRank = idx + 1; goldScore = dedup[idx].s; }
+    }
+
+    // Margin: gold vs best non-gold
+    if (goldScore !== null) {
+      const bestNonGold = dedup.find((r) => r.entry.globalKey !== goldKey);
+      if (bestNonGold) marginSamples.push(goldScore - bestNonGold.s);
+    }
+
+    const hit1 = goldRank === 1;
+    const hit5 = goldRank !== null && goldRank <= TOPK;
+    if (hit1) hitAt1++;
+    if (hit5) hitAt5++;
+    if (goldRank !== null) { goldRankSum += goldRank; goldRankN++; }
+
+    // Distractor-pollution metric: how often a meta-fact ranks top-1
+    if (item.gold_in_topk && topK[0]?.entry.isDistractor) distractorTop1++;
+
+    let fp = false;
+    if (item.gold_in_topk === false) {
+      fp = topK.length > 0 && topK[0].s > 0.5;
+      if (fp) fpAt5++;
+    }
+
+    perItem.push({
+      id: item.id,
+      query: origQ,
+      effective_query: effQ,
+      gold_in_topk: item.gold_in_topk ?? false,
+      gold_global_key: goldKey,
+      gold_rank: goldRank,
+      gold_score: goldScore,
+      top1_text: topK[0]?.entry.text ?? null,
+      top1_score: topK[0]?.s ?? null,
+      top1_is_distractor: topK[0]?.entry.isDistractor ?? false,
+      false_positive: fp,
+    });
+  }
+
+  const n = axisBody.items.length;
+  return {
+    axis: axisName,
+    n,
+    pool_size: pool.length,
+    recall_at_1: n > 0 ? hitAt1 / n : null,
+    recall_at_5: n > 0 ? hitAt5 / n : null,
+    mean_gold_rank: goldRankN > 0 ? goldRankSum / goldRankN : null,
+    distractor_at_top1: distractorTop1,
+    false_positive_count: fpAt5,
+    median_gold_margin: marginSamples.length > 0
+      ? marginSamples.sort((a, b) => a - b)[Math.floor(marginSamples.length / 2)]
+      : null,
+    items: perItem,
+  };
+}
+
+// -- Main --
+async function main() {
+  const t0 = Date.now();
+  const embed = await loadEmbedder(MODEL);
+
+  // Build the SHARED global pool — every query competes against the full set,
+  // including extraction-style distractor meta-facts.
+  const pool = buildGlobalPool(manifest, VARIANT);
+  console.log(`[pool] ${pool.length} entries (variant=${VARIANT})`);
+  const factVecs = [];
+  for (const e of pool) factVecs.push(await embed(e.text));
+  const factTokens = pool.map((e) => tokenize(e.text));
+
+  const results = [];
+  for (const [axisName, body] of Object.entries(manifest.axes)) {
+    if (body.items.length === 0) continue; // skip distractor section (facts only)
+    process.stdout.write(`[axis] ${axisName.padEnd(13)} ... `);
+    const r = await scoreAxis(axisName, body, embed, pool, factVecs, factTokens);
+    process.stdout.write(`r@5=${r.recall_at_5?.toFixed(3) ?? 'n/a'}  r@1=${r.recall_at_1?.toFixed(3) ?? 'n/a'}  margin=${r.median_gold_margin?.toFixed(3) ?? 'n/a'}  distractor_top1=${r.distractor_at_top1}\n`);
+    results.push(r);
+  }
+
+  // Composite (excluding control)
+  const scoredAxes = results.filter((r) => r.axis !== 'control');
+  const totalN = scoredAxes.reduce((a, r) => a + r.n, 0);
+  const composite = {
+    recall_at_1: scoredAxes.reduce((a, r) => a + r.recall_at_1 * r.n, 0) / totalN,
+    recall_at_5: scoredAxes.reduce((a, r) => a + r.recall_at_5 * r.n, 0) / totalN,
+    distractor_top1_rate: scoredAxes.reduce((a, r) => a + r.distractor_at_top1, 0) / totalN,
+    n: totalN,
+  };
+  const controlAxis = results.find((r) => r.axis === 'control');
+  const fpRate = controlAxis ? controlAxis.false_positive_count / controlAxis.n : null;
+
+  const out = {
+    variant: VARIANT,
+    model: MODEL,
+    topk: TOPK,
+    wall_seconds: ((Date.now() - t0) / 1000).toFixed(1),
+    composite,
+    false_positive_rate: fpRate,
+    per_axis: results,
+  };
+
+  fs.mkdirSync(path.dirname(OUT), { recursive: true });
+  fs.writeFileSync(OUT, JSON.stringify(out, null, 2));
+
+  console.log('');
+  console.log(`composite  r@1 = ${composite.recall_at_1.toFixed(3)}   r@5 = ${composite.recall_at_5.toFixed(3)}`);
+  console.log(`distractor_top1_rate = ${composite.distractor_top1_rate.toFixed(3)}    fp@control = ${fpRate?.toFixed(3) ?? 'n/a'}`);
+  console.log(`saved → ${OUT}`);
+}
+
+main().catch((e) => { console.error(e); process.exit(1); });
diff --git a/benchmarks/alignbench/runs/baseline-no-distractors.json b/benchmarks/alignbench/runs/baseline-no-distractors.json
new file mode 100644
index 0000000..00d7092
--- /dev/null
+++ b/benchmarks/alignbench/runs/baseline-no-distractors.json
@@ -0,0 +1,990 @@
+{
+  "variant": "baseline",
+  "model": "Xenova/all-MiniLM-L6-v2",
+  "topk": 5,
+  "wall_seconds": "0.4",
+  "composite": {
+    "recall_at_1": 0.7666666666666667,
+    "recall_at_5": 0.95,
+    "distractor_top1_rate": 0,
+    "n": 60
+  },
+  "false_positive_rate": 0,
+  "per_axis": [
+    {
+      "axis": "pronoun",
+      "n": 20,
+      "pool_size": 45,
+      "recall_at_1": 0.8,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.2,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.05112698480692013,
+      "items": [
+        {
+          "id": "pronoun-001",
+          "query": "what is my name?",
+          "effective_query": "what is my name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 1,
+          "gold_score": 0.3960303973923612,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.3960303973923612,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-002",
+          "query": "what's the user's name?",
+          "effective_query": "what's the user's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 1,
+          "gold_score": 0.7294812723464028,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.7294812723464028,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-003",
+          "query": "who am I?",
+          "effective_query": "who am I?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 1,
+          "gold_score": 0.2824821997084312,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.2824821997084312,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-004",
+          "query": "what do I do for work?",
+          "effective_query": "what do I do for work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 2,
+          "gold_score": 0.18618347998079776,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.19567938172618415,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-005",
+          "query": "what is the user's job?",
+          "effective_query": "what is the user's job?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 2,
+          "gold_score": 0.5754363621337681,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.5767198521038163,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-006",
+          "query": "where does the user work?",
+          "effective_query": "where does the user work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 1,
+          "gold_score": 0.5436570101621021,
+          "top1_text": "The user works as a software engineer at Acme.",
+          "top1_score": 0.5436570101621021,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-007",
+          "query": "what is my dog's name?",
+          "effective_query": "what is my dog's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.5711816859366411,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.5711816859366411,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-008",
+          "query": "who is Apollo?",
+          "effective_query": "who is Apollo?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.7072701654355713,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.7072701654355713,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-009",
+          "query": "where do I live?",
+          "effective_query": "where do I live?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 2,
+          "gold_score": 0.29566315429472595,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.2960941749345428,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-010",
+          "query": "what city does the user live in?",
+          "effective_query": "what city does the user live in?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.6400263303955707,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.6400263303955707,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-011",
+          "query": "when is my birthday?",
+          "effective_query": "when is my birthday?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.6279255850758502,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.6279255850758502,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-012",
+          "query": "when was the user born?",
+          "effective_query": "when was the user born?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.7251037734927323,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.7251037734927323,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-013",
+          "query": "do I have kids?",
+          "effective_query": "do I have kids?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.2345547444698841,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.2345547444698841,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-014",
+          "query": "how many children does the user have?",
+          "effective_query": "how many children does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.5766094762746933,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.5766094762746933,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-015",
+          "query": "what is my usual coffee order?",
+          "effective_query": "what is my usual coffee order?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 1,
+          "gold_score": 0.6001185695455631,
+          "top1_text": "The user's favorite coffee order is an oat-milk flat white.",
+          "top1_score": 0.6001185695455631,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-016",
+          "query": "what coffee does the user drink?",
+          "effective_query": "what coffee does the user drink?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 2,
+          "gold_score": 0.5813657391451515,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.7197601756319,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-017",
+          "query": "what did I study?",
+          "effective_query": "what did I study?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#7",
+          "gold_rank": 1,
+          "gold_score": 0.5746472233195816,
+          "top1_text": "The user studied applied mathematics at university.",
+          "top1_score": 0.5746472233195816,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-018",
+          "query": "do I have any allergies?",
+          "effective_query": "do I have any allergies?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.389079047513309,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.389079047513309,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-019",
+          "query": "what is the user allergic to?",
+          "effective_query": "what is the user allergic to?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.7261582549960168,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.7261582549960168,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-020",
+          "query": "what kind of car do I drive?",
+          "effective_query": "what kind of car do I drive?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#9",
+          "gold_rank": 1,
+          "gold_score": 0.39618091590057963,
+          "top1_text": "The user drives a 2019 Toyota Corolla.",
+          "top1_score": 0.39618091590057963,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "temporal",
+      "n": 14,
+      "pool_size": 45,
+      "recall_at_1": 0.5,
+      "recall_at_5": 0.7857142857142857,
+      "mean_gold_rank": 3.857142857142857,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.049944619619024966,
+      "items": [
+        {
+          "id": "temporal-001",
+          "query": "where does the user live now?",
+          "effective_query": "where does the user live now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 3,
+          "gold_score": 0.550366425409688,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.5504016420681116,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-002",
+          "query": "where did the user used to live?",
+          "effective_query": "where did the user used to live?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#1",
+          "gold_rank": 1,
+          "gold_score": 0.5490901457590983,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.5490901457590983,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-003",
+          "query": "when did the user move?",
+          "effective_query": "when did the user move?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#2",
+          "gold_rank": 1,
+          "gold_score": 0.5076513080537272,
+          "top1_text": "The user moved from Berlin to Lisbon in 2024.",
+          "top1_score": 0.5076513080537272,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-004",
+          "query": "where is the user currently based?",
+          "effective_query": "where is the user currently based?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 8,
+          "gold_score": 0.42833906054622706,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.4608892635234409,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-005",
+          "query": "what is the user reading?",
+          "effective_query": "what is the user reading?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#3",
+          "gold_rank": 2,
+          "gold_score": 0.4600491260020972,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 0.5634334413727431,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-006",
+          "query": "what did the user read last year?",
+          "effective_query": "what did the user read last year?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#4",
+          "gold_rank": 1,
+          "gold_score": 0.5028289729480597,
+          "top1_text": "Last year the user read 'Project Hail Mary'.",
+          "top1_score": 0.5028289729480597,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-007",
+          "query": "what is the user working on these days?",
+          "effective_query": "what is the user working on these days?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 3,
+          "gold_score": 0.4513265913655406,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.48543171981436045,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-008",
+          "query": "what did the user finish last month?",
+          "effective_query": "what did the user finish last month?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#6",
+          "gold_rank": 1,
+          "gold_score": 0.5544356167423142,
+          "top1_text": "The user finished the Sprint-4 reranker training last month.",
+          "top1_score": 0.5544356167423142,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-009",
+          "query": "what LLM does the user prefer?",
+          "effective_query": "what LLM does the user prefer?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 1,
+          "gold_score": 0.6226655286181774,
+          "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.",
+          "top1_score": 0.6226655286181774,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-010",
+          "query": "which model did the user use before?",
+          "effective_query": "which model did the user use before?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#8",
+          "gold_rank": 1,
+          "gold_score": 0.5205286761987613,
+          "top1_text": "The user used GPT-4 as their primary model in 2024.",
+          "top1_score": 0.5205286761987613,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-011",
+          "query": "did the user get a new phone recently?",
+          "effective_query": "did the user get a new phone recently?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#9",
+          "gold_rank": 1,
+          "gold_score": 0.6113316689987077,
+          "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.",
+          "top1_score": 0.6113316689987077,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-012",
+          "query": "is the user still in Berlin?",
+          "effective_query": "is the user still in Berlin?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 5,
+          "gold_score": 0.4089445689504921,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.7214467722165934,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-013",
+          "query": "what is the user up to?",
+          "effective_query": "what is the user up to?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 11,
+          "gold_score": 0.2672680179526975,
+          "top1_text": "The user works as a software engineer at Acme.",
+          "top1_score": 0.3454437295411476,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-014",
+          "query": "which model is the user on right now?",
+          "effective_query": "which model is the user on right now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 15,
+          "gold_score": 0.31088477125594804,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.40861487066233426,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "specificity",
+      "n": 14,
+      "pool_size": 45,
+      "recall_at_1": 0.8571428571428571,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.2142857142857142,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.14383529034315978,
+      "items": [
+        {
+          "id": "specificity-001",
+          "query": "tell me about my dog",
+          "effective_query": "tell me about my dog",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 3,
+          "gold_score": 0.41588713324638765,
+          "top1_text": "The user has a dog named Apollo.",
+          "top1_score": 0.4531912105925747,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-002",
+          "query": "what kind of pet do I have?",
+          "effective_query": "what kind of pet do I have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 2,
+          "gold_score": 0.3644336367800417,
+          "top1_text": "The user has a dog named Apollo.",
+          "top1_score": 0.4007600036283448,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-003",
+          "query": "do I own a bike?",
+          "effective_query": "do I own a bike?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.5234729437047541,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.5234729437047541,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-004",
+          "query": "what brand of bike does the user have?",
+          "effective_query": "what brand of bike does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.6329297203833848,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.6329297203833848,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-005",
+          "query": "what computer do I use?",
+          "effective_query": "what computer do I use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.2943695662572405,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.2943695662572405,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-006",
+          "query": "what laptop does the user have?",
+          "effective_query": "what laptop does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.6450468635073591,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.6450468635073591,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-007",
+          "query": "do I have any musical instruments?",
+          "effective_query": "do I have any musical instruments?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#3",
+          "gold_rank": 1,
+          "gold_score": 0.28698273062526614,
+          "top1_text": "The user has a Yamaha P-125 digital piano in the living room.",
+          "top1_score": 0.28698273062526614,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-008",
+          "query": "which note-taking app do I use?",
+          "effective_query": "which note-taking app do I use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#4",
+          "gold_rank": 1,
+          "gold_score": 0.38230106432962924,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.38230106432962924,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-009",
+          "query": "where do I like to eat in Lisbon?",
+          "effective_query": "where do I like to eat in Lisbon?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#5",
+          "gold_rank": 1,
+          "gold_score": 0.695091049183982,
+          "top1_text": "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.",
+          "top1_score": 0.695091049183982,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-010",
+          "query": "what brand sunglasses does the user wear?",
+          "effective_query": "what brand sunglasses does the user wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#6",
+          "gold_rank": 1,
+          "gold_score": 0.6512271965052092,
+          "top1_text": "The user wears Smith Lowdown sunglasses.",
+          "top1_score": 0.6512271965052092,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-011",
+          "query": "do I read on a Kindle?",
+          "effective_query": "do I read on a Kindle?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#7",
+          "gold_rank": 1,
+          "gold_score": 0.6508408387843772,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 0.6508408387843772,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-012",
+          "query": "what espresso machine does the user own?",
+          "effective_query": "what espresso machine does the user own?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#8",
+          "gold_rank": 1,
+          "gold_score": 0.7366961226077487,
+          "top1_text": "The user's home espresso machine is a Lelit Bianca v3.",
+          "top1_score": 0.7366961226077487,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-013",
+          "query": "what shoes do I wear?",
+          "effective_query": "what shoes do I wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.25921156505496634,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.25921156505496634,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-014",
+          "query": "what brand are the user's everyday shoes?",
+          "effective_query": "what brand are the user's everyday shoes?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.4229939734066537,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.4229939734066537,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "negation",
+      "n": 12,
+      "pool_size": 45,
+      "recall_at_1": 0.9166666666666666,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.0833333333333333,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.2862424829289613,
+      "items": [
+        {
+          "id": "negation-001",
+          "query": "does the user drink coffee?",
+          "effective_query": "does the user drink coffee?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 0.7905096599561829,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.7905096599561829,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-002",
+          "query": "what does the user drink in the morning?",
+          "effective_query": "what does the user drink in the morning?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 0.5285974920275435,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.5285974920275435,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-003",
+          "query": "am I vegetarian?",
+          "effective_query": "am I vegetarian?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#1",
+          "gold_rank": 1,
+          "gold_score": 0.6191490333752342,
+          "top1_text": "The user is not vegetarian, but avoids red meat.",
+          "top1_score": 0.6191490333752342,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-004",
+          "query": "is the user on Twitter?",
+          "effective_query": "is the user on Twitter?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.5874304481012608,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.5874304481012608,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-005",
+          "query": "which social networks does the user use?",
+          "effective_query": "which social networks does the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.5067702908301415,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.5067702908301415,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-006",
+          "query": "does the user own a car?",
+          "effective_query": "does the user own a car?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#3",
+          "gold_rank": 1,
+          "gold_score": 0.7488288864422115,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 0.7488288864422115,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-007",
+          "query": "is the user active on LinkedIn?",
+          "effective_query": "is the user active on LinkedIn?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#4",
+          "gold_rank": 1,
+          "gold_score": 0.7880239246717338,
+          "top1_text": "The user is not on LinkedIn anymore.",
+          "top1_score": 0.7880239246717338,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-008",
+          "query": "any foods the user hates?",
+          "effective_query": "any foods the user hates?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#5",
+          "gold_rank": 2,
+          "gold_score": 0.4222244002602081,
+          "top1_text": "The user is not vegetarian, but avoids red meat.",
+          "top1_score": 0.4321470878666278,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-009",
+          "query": "has the user traveled to Asia?",
+          "effective_query": "has the user traveled to Asia?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#6",
+          "gold_rank": 1,
+          "gold_score": 0.6919597377462416,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 0.6919597377462416,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-010",
+          "query": "does the user like horror movies?",
+          "effective_query": "does the user like horror movies?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#7",
+          "gold_rank": 1,
+          "gold_score": 0.7303882056719367,
+          "top1_text": "The user does not enjoy horror movies.",
+          "top1_score": 0.7303882056719367,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-011",
+          "query": "can the user eat shrimp?",
+          "effective_query": "can the user eat shrimp?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#8",
+          "gold_rank": 1,
+          "gold_score": 0.7044731236020111,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.7044731236020111,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-012",
+          "query": "is the user learning a new language?",
+          "effective_query": "is the user learning a new language?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#9",
+          "gold_rank": 1,
+          "gold_score": 0.6522323599643379,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 0.6522323599643379,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "control",
+      "n": 10,
+      "pool_size": 45,
+      "recall_at_1": 0,
+      "recall_at_5": 0,
+      "mean_gold_rank": null,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": null,
+      "items": [
+        {
+          "id": "control-001",
+          "query": "what is the airspeed velocity of an unladen swallow?",
+          "effective_query": "what is the airspeed velocity of an unladen swallow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.25082327505676677,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-002",
+          "query": "who is the current president of France?",
+          "effective_query": "who is the current president of France?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.2564335064669229,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-003",
+          "query": "what is the capital of Mongolia?",
+          "effective_query": "what is the capital of Mongolia?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 0.27119226736778723,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-004",
+          "query": "how does photosynthesis work?",
+          "effective_query": "how does photosynthesis work?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.13794111637514647,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-005",
+          "query": "translate 'goodnight' to Japanese",
+          "effective_query": "translate 'goodnight' to Japanese",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.",
+          "top1_score": 0.1911340870733358,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-006",
+          "query": "what year did World War II end?",
+          "effective_query": "what year did World War II end?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.2900090433169317,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-007",
+          "query": "explain entropy in thermodynamics",
+          "effective_query": "explain entropy in thermodynamics",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.18512903871088907,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-008",
+          "query": "best way to debug a segfault in C",
+          "effective_query": "best way to debug a segfault in C",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.23013625373127655,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-009",
+          "query": "what's the weather going to be tomorrow?",
+          "effective_query": "what's the weather going to be tomorrow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.1484365589730125,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-010",
+          "query": "give me a recipe for tiramisu",
+          "effective_query": "give me a recipe for tiramisu",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.",
+          "top1_score": 0.3050188550209472,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/baseline.json b/benchmarks/alignbench/runs/baseline.json
new file mode 100644
index 0000000..e5023dd
--- /dev/null
+++ b/benchmarks/alignbench/runs/baseline.json
@@ -0,0 +1,990 @@
+{
+  "variant": "baseline",
+  "model": "Xenova/all-MiniLM-L6-v2",
+  "topk": 5,
+  "wall_seconds": "0.3",
+  "composite": {
+    "recall_at_1": 0.7333333333333333,
+    "recall_at_5": 0.9333333333333333,
+    "distractor_top1_rate": 0.06666666666666667,
+    "n": 60
+  },
+  "false_positive_rate": 0,
+  "per_axis": [
+    {
+      "axis": "pronoun",
+      "n": 20,
+      "pool_size": 55,
+      "recall_at_1": 0.7,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.3,
+      "distractor_at_top1": 2,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.04656683064609568,
+      "items": [
+        {
+          "id": "pronoun-001",
+          "query": "what is my name?",
+          "effective_query": "what is my name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 1,
+          "gold_score": 0.3960303973923612,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.3960303973923612,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-002",
+          "query": "what's the user's name?",
+          "effective_query": "what's the user's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.7294812723464028,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.7771254660839599,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-003",
+          "query": "who am I?",
+          "effective_query": "who am I?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.2824821997084312,
+          "top1_text": "The user is me.",
+          "top1_score": 0.3964962994980233,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-004",
+          "query": "what do I do for work?",
+          "effective_query": "what do I do for work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 2,
+          "gold_score": 0.18618347998079776,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.19567938172618415,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-005",
+          "query": "what is the user's job?",
+          "effective_query": "what is the user's job?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 2,
+          "gold_score": 0.5754363621337681,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.5767198521038163,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-006",
+          "query": "where does the user work?",
+          "effective_query": "where does the user work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 1,
+          "gold_score": 0.5436570101621021,
+          "top1_text": "The user works as a software engineer at Acme.",
+          "top1_score": 0.5436570101621021,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-007",
+          "query": "what is my dog's name?",
+          "effective_query": "what is my dog's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.5711816859366411,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.5711816859366411,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-008",
+          "query": "who is Apollo?",
+          "effective_query": "who is Apollo?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.7072701654355713,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.7072701654355713,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-009",
+          "query": "where do I live?",
+          "effective_query": "where do I live?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 2,
+          "gold_score": 0.29566315429472595,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.2960941749345428,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-010",
+          "query": "what city does the user live in?",
+          "effective_query": "what city does the user live in?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.6400263303955707,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.6400263303955707,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-011",
+          "query": "when is my birthday?",
+          "effective_query": "when is my birthday?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.6279255850758502,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.6279255850758502,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-012",
+          "query": "when was the user born?",
+          "effective_query": "when was the user born?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.7251037734927323,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.7251037734927323,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-013",
+          "query": "do I have kids?",
+          "effective_query": "do I have kids?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.2345547444698841,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.2345547444698841,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-014",
+          "query": "how many children does the user have?",
+          "effective_query": "how many children does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.5766094762746933,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.5766094762746933,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-015",
+          "query": "what is my usual coffee order?",
+          "effective_query": "what is my usual coffee order?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 1,
+          "gold_score": 0.6001185695455631,
+          "top1_text": "The user's favorite coffee order is an oat-milk flat white.",
+          "top1_score": 0.6001185695455631,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-016",
+          "query": "what coffee does the user drink?",
+          "effective_query": "what coffee does the user drink?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 2,
+          "gold_score": 0.5813657391451515,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.7197601756319,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-017",
+          "query": "what did I study?",
+          "effective_query": "what did I study?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#7",
+          "gold_rank": 1,
+          "gold_score": 0.5746472233195816,
+          "top1_text": "The user studied applied mathematics at university.",
+          "top1_score": 0.5746472233195816,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-018",
+          "query": "do I have any allergies?",
+          "effective_query": "do I have any allergies?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.389079047513309,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.389079047513309,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-019",
+          "query": "what is the user allergic to?",
+          "effective_query": "what is the user allergic to?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.7261582549960168,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.7261582549960168,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-020",
+          "query": "what kind of car do I drive?",
+          "effective_query": "what kind of car do I drive?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#9",
+          "gold_rank": 1,
+          "gold_score": 0.39618091590057963,
+          "top1_text": "The user drives a 2019 Toyota Corolla.",
+          "top1_score": 0.39618091590057963,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "temporal",
+      "n": 14,
+      "pool_size": 55,
+      "recall_at_1": 0.5,
+      "recall_at_5": 0.7142857142857143,
+      "mean_gold_rank": 5.5,
+      "distractor_at_top1": 2,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.049944619619024966,
+      "items": [
+        {
+          "id": "temporal-001",
+          "query": "where does the user live now?",
+          "effective_query": "where does the user live now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 3,
+          "gold_score": 0.550366425409688,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.5504016420681116,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-002",
+          "query": "where did the user used to live?",
+          "effective_query": "where did the user used to live?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#1",
+          "gold_rank": 1,
+          "gold_score": 0.5490901457590983,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.5490901457590983,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-003",
+          "query": "when did the user move?",
+          "effective_query": "when did the user move?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#2",
+          "gold_rank": 1,
+          "gold_score": 0.5076513080537272,
+          "top1_text": "The user moved from Berlin to Lisbon in 2024.",
+          "top1_score": 0.5076513080537272,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-004",
+          "query": "where is the user currently based?",
+          "effective_query": "where is the user currently based?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 11,
+          "gold_score": 0.42833906054622706,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.4962051712532369,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-005",
+          "query": "what is the user reading?",
+          "effective_query": "what is the user reading?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#3",
+          "gold_rank": 8,
+          "gold_score": 0.4600491260020972,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 0.5634334413727431,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-006",
+          "query": "what did the user read last year?",
+          "effective_query": "what did the user read last year?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#4",
+          "gold_rank": 1,
+          "gold_score": 0.5028289729480597,
+          "top1_text": "Last year the user read 'Project Hail Mary'.",
+          "top1_score": 0.5028289729480597,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-007",
+          "query": "what is the user working on these days?",
+          "effective_query": "what is the user working on these days?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 4,
+          "gold_score": 0.4513265913655406,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.48543171981436045,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-008",
+          "query": "what did the user finish last month?",
+          "effective_query": "what did the user finish last month?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#6",
+          "gold_rank": 1,
+          "gold_score": 0.5544356167423142,
+          "top1_text": "The user finished the Sprint-4 reranker training last month.",
+          "top1_score": 0.5544356167423142,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-009",
+          "query": "what LLM does the user prefer?",
+          "effective_query": "what LLM does the user prefer?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 1,
+          "gold_score": 0.6226655286181774,
+          "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.",
+          "top1_score": 0.6226655286181774,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-010",
+          "query": "which model did the user use before?",
+          "effective_query": "which model did the user use before?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#8",
+          "gold_rank": 1,
+          "gold_score": 0.5205286761987613,
+          "top1_text": "The user used GPT-4 as their primary model in 2024.",
+          "top1_score": 0.5205286761987613,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-011",
+          "query": "did the user get a new phone recently?",
+          "effective_query": "did the user get a new phone recently?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#9",
+          "gold_rank": 1,
+          "gold_score": 0.6113316689987077,
+          "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.",
+          "top1_score": 0.6113316689987077,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-012",
+          "query": "is the user still in Berlin?",
+          "effective_query": "is the user still in Berlin?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 5,
+          "gold_score": 0.4089445689504921,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.7214467722165934,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-013",
+          "query": "what is the user up to?",
+          "effective_query": "what is the user up to?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 19,
+          "gold_score": 0.2672680179526975,
+          "top1_text": "The user is asking a question.",
+          "top1_score": 0.5409891051249414,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-014",
+          "query": "which model is the user on right now?",
+          "effective_query": "which model is the user on right now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 20,
+          "gold_score": 0.31088477125594804,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.40861487066233426,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "specificity",
+      "n": 14,
+      "pool_size": 55,
+      "recall_at_1": 0.8571428571428571,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.2142857142857142,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.14383529034315978,
+      "items": [
+        {
+          "id": "specificity-001",
+          "query": "tell me about my dog",
+          "effective_query": "tell me about my dog",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 3,
+          "gold_score": 0.41588713324638765,
+          "top1_text": "The user has a dog named Apollo.",
+          "top1_score": 0.4531912105925747,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-002",
+          "query": "what kind of pet do I have?",
+          "effective_query": "what kind of pet do I have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 2,
+          "gold_score": 0.3644336367800417,
+          "top1_text": "The user has a dog named Apollo.",
+          "top1_score": 0.4007600036283448,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-003",
+          "query": "do I own a bike?",
+          "effective_query": "do I own a bike?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.5234729437047541,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.5234729437047541,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-004",
+          "query": "what brand of bike does the user have?",
+          "effective_query": "what brand of bike does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.6329297203833848,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.6329297203833848,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-005",
+          "query": "what computer do I use?",
+          "effective_query": "what computer do I use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.2943695662572405,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.2943695662572405,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-006",
+          "query": "what laptop does the user have?",
+          "effective_query": "what laptop does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.6450468635073591,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.6450468635073591,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-007",
+          "query": "do I have any musical instruments?",
+          "effective_query": "do I have any musical instruments?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#3",
+          "gold_rank": 1,
+          "gold_score": 0.28698273062526614,
+          "top1_text": "The user has a Yamaha P-125 digital piano in the living room.",
+          "top1_score": 0.28698273062526614,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-008",
+          "query": "which note-taking app do I use?",
+          "effective_query": "which note-taking app do I use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#4",
+          "gold_rank": 1,
+          "gold_score": 0.38230106432962924,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.38230106432962924,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-009",
+          "query": "where do I like to eat in Lisbon?",
+          "effective_query": "where do I like to eat in Lisbon?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#5",
+          "gold_rank": 1,
+          "gold_score": 0.695091049183982,
+          "top1_text": "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.",
+          "top1_score": 0.695091049183982,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-010",
+          "query": "what brand sunglasses does the user wear?",
+          "effective_query": "what brand sunglasses does the user wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#6",
+          "gold_rank": 1,
+          "gold_score": 0.6512271965052092,
+          "top1_text": "The user wears Smith Lowdown sunglasses.",
+          "top1_score": 0.6512271965052092,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-011",
+          "query": "do I read on a Kindle?",
+          "effective_query": "do I read on a Kindle?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#7",
+          "gold_rank": 1,
+          "gold_score": 0.6508408387843772,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 0.6508408387843772,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-012",
+          "query": "what espresso machine does the user own?",
+          "effective_query": "what espresso machine does the user own?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#8",
+          "gold_rank": 1,
+          "gold_score": 0.7366961226077487,
+          "top1_text": "The user's home espresso machine is a Lelit Bianca v3.",
+          "top1_score": 0.7366961226077487,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-013",
+          "query": "what shoes do I wear?",
+          "effective_query": "what shoes do I wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.25921156505496634,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.25921156505496634,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-014",
+          "query": "what brand are the user's everyday shoes?",
+          "effective_query": "what brand are the user's everyday shoes?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.4229939734066537,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.4229939734066537,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "negation",
+      "n": 12,
+      "pool_size": 55,
+      "recall_at_1": 0.9166666666666666,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.0833333333333333,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.26762270427014134,
+      "items": [
+        {
+          "id": "negation-001",
+          "query": "does the user drink coffee?",
+          "effective_query": "does the user drink coffee?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 0.7905096599561829,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.7905096599561829,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-002",
+          "query": "what does the user drink in the morning?",
+          "effective_query": "what does the user drink in the morning?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 0.5285974920275435,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.5285974920275435,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-003",
+          "query": "am I vegetarian?",
+          "effective_query": "am I vegetarian?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#1",
+          "gold_rank": 1,
+          "gold_score": 0.6191490333752342,
+          "top1_text": "The user is not vegetarian, but avoids red meat.",
+          "top1_score": 0.6191490333752342,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-004",
+          "query": "is the user on Twitter?",
+          "effective_query": "is the user on Twitter?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.5874304481012608,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.5874304481012608,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-005",
+          "query": "which social networks does the user use?",
+          "effective_query": "which social networks does the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.5067702908301415,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.5067702908301415,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-006",
+          "query": "does the user own a car?",
+          "effective_query": "does the user own a car?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#3",
+          "gold_rank": 1,
+          "gold_score": 0.7488288864422115,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 0.7488288864422115,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-007",
+          "query": "is the user active on LinkedIn?",
+          "effective_query": "is the user active on LinkedIn?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#4",
+          "gold_rank": 1,
+          "gold_score": 0.7880239246717338,
+          "top1_text": "The user is not on LinkedIn anymore.",
+          "top1_score": 0.7880239246717338,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-008",
+          "query": "any foods the user hates?",
+          "effective_query": "any foods the user hates?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#5",
+          "gold_rank": 2,
+          "gold_score": 0.4222244002602081,
+          "top1_text": "The user is not vegetarian, but avoids red meat.",
+          "top1_score": 0.4321470878666278,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-009",
+          "query": "has the user traveled to Asia?",
+          "effective_query": "has the user traveled to Asia?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#6",
+          "gold_rank": 1,
+          "gold_score": 0.6919597377462416,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 0.6919597377462416,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-010",
+          "query": "does the user like horror movies?",
+          "effective_query": "does the user like horror movies?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#7",
+          "gold_rank": 1,
+          "gold_score": 0.7303882056719367,
+          "top1_text": "The user does not enjoy horror movies.",
+          "top1_score": 0.7303882056719367,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-011",
+          "query": "can the user eat shrimp?",
+          "effective_query": "can the user eat shrimp?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#8",
+          "gold_rank": 1,
+          "gold_score": 0.7044731236020111,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.7044731236020111,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-012",
+          "query": "is the user learning a new language?",
+          "effective_query": "is the user learning a new language?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#9",
+          "gold_rank": 1,
+          "gold_score": 0.6522323599643379,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 0.6522323599643379,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "control",
+      "n": 10,
+      "pool_size": 55,
+      "recall_at_1": 0,
+      "recall_at_5": 0,
+      "mean_gold_rank": null,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": null,
+      "items": [
+        {
+          "id": "control-001",
+          "query": "what is the airspeed velocity of an unladen swallow?",
+          "effective_query": "what is the airspeed velocity of an unladen swallow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.25082327505676677,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-002",
+          "query": "who is the current president of France?",
+          "effective_query": "who is the current president of France?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.2564335064669229,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-003",
+          "query": "what is the capital of Mongolia?",
+          "effective_query": "what is the capital of Mongolia?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 0.27119226736778723,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-004",
+          "query": "how does photosynthesis work?",
+          "effective_query": "how does photosynthesis work?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.13794111637514647,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-005",
+          "query": "translate 'goodnight' to Japanese",
+          "effective_query": "translate 'goodnight' to Japanese",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.",
+          "top1_score": 0.1911340870733358,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-006",
+          "query": "what year did World War II end?",
+          "effective_query": "what year did World War II end?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.2900090433169317,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-007",
+          "query": "explain entropy in thermodynamics",
+          "effective_query": "explain entropy in thermodynamics",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.18512903871088907,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-008",
+          "query": "best way to debug a segfault in C",
+          "effective_query": "best way to debug a segfault in C",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.23013625373127655,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-009",
+          "query": "what's the weather going to be tomorrow?",
+          "effective_query": "what's the weather going to be tomorrow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of May 14, 2026, Apollo is a term mentioned in the conversation.",
+          "top1_score": 0.18279764320601347,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "control-010",
+          "query": "give me a recipe for tiramisu",
+          "effective_query": "give me a recipe for tiramisu",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.",
+          "top1_score": 0.3050188550209472,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/combined.json b/benchmarks/alignbench/runs/combined.json
new file mode 100644
index 0000000..f9acb6b
--- /dev/null
+++ b/benchmarks/alignbench/runs/combined.json
@@ -0,0 +1,990 @@
+{
+  "variant": "combined",
+  "model": "Xenova/all-MiniLM-L6-v2",
+  "topk": 5,
+  "wall_seconds": "0.6",
+  "composite": {
+    "recall_at_1": 0.65,
+    "recall_at_5": 0.9333333333333333,
+    "distractor_top1_rate": 0.08333333333333333,
+    "n": 60
+  },
+  "false_positive_rate": 1,
+  "per_axis": [
+    {
+      "axis": "pronoun",
+      "n": 20,
+      "pool_size": 108,
+      "recall_at_1": 0.65,
+      "recall_at_5": 0.95,
+      "mean_gold_rank": 2.1,
+      "distractor_at_top1": 2,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.057432213194370973,
+      "items": [
+        {
+          "id": "pronoun-001",
+          "query": "what is my name?",
+          "effective_query": "what is the user's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 1,
+          "gold_score": 0.9531048072303069,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.9531048072303069,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-002",
+          "query": "what's the user's name?",
+          "effective_query": "what's the user's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.9659583575722968,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.9867746931367039,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-003",
+          "query": "who am I?",
+          "effective_query": "who am the user?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 4,
+          "gold_score": 0.6302996998867183,
+          "top1_text": "I am me.",
+          "top1_score": 0.811597275977491,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-004",
+          "query": "what do I do for work?",
+          "effective_query": "what do the user do for work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 4,
+          "gold_score": 0.6525810032024324,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.9315629118324019,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-005",
+          "query": "what is the user's job?",
+          "effective_query": "what is the user's job?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 10,
+          "gold_score": 0.6920306628772953,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.8452295596626733,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-006",
+          "query": "where does the user work?",
+          "effective_query": "where does the user work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 5,
+          "gold_score": 0.7016151842914757,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.8553753574007144,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-007",
+          "query": "what is my dog's name?",
+          "effective_query": "what is the user's dog's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.9830121625541873,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.9830121625541873,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-008",
+          "query": "who is Apollo?",
+          "effective_query": "who is Apollo?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.9783254585435431,
+          "top1_text": "My dog is named Apollo.",
+          "top1_score": 0.9783254585435431,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-009",
+          "query": "where do I live?",
+          "effective_query": "where do the user live?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.7741606960366889,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.7741606960366889,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-010",
+          "query": "what city does the user live in?",
+          "effective_query": "what city does the user live in?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.8871245980434483,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.8871245980434483,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-011",
+          "query": "when is my birthday?",
+          "effective_query": "when is the user's birthday?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-012",
+          "query": "when was the user born?",
+          "effective_query": "when was the user born?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.7067061203590985,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.7067061203590985,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-013",
+          "query": "do I have kids?",
+          "effective_query": "do the user have kids?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.7745183790985406,
+          "top1_text": "I have two children, Maya and Theo.",
+          "top1_score": 0.7745183790985406,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-014",
+          "query": "how many children does the user have?",
+          "effective_query": "how many children does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.8830521647230488,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.8830521647230488,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-015",
+          "query": "what is my usual coffee order?",
+          "effective_query": "what is the user's usual coffee order?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's favorite coffee order is an oat-milk flat white.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-016",
+          "query": "what coffee does the user drink?",
+          "effective_query": "what coffee does the user drink?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 2,
+          "gold_score": 0.6219843435521409,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-017",
+          "query": "what did I study?",
+          "effective_query": "what did the user study?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#7",
+          "gold_rank": 1,
+          "gold_score": 0.89575,
+          "top1_text": "The user studied applied mathematics at university.",
+          "top1_score": 0.89575,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-018",
+          "query": "do I have any allergies?",
+          "effective_query": "do the user have any allergies?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.7223693332633677,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.7223693332633677,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-019",
+          "query": "what is the user allergic to?",
+          "effective_query": "what is the user allergic to?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-020",
+          "query": "what kind of car do I drive?",
+          "effective_query": "what kind of car do the user drive?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#9",
+          "gold_rank": 2,
+          "gold_score": 0.7002924800490933,
+          "top1_text": "I do not own a car; they bike or use public transit.",
+          "top1_score": 0.9137331184077017,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "temporal",
+      "n": 14,
+      "pool_size": 108,
+      "recall_at_1": 0.5,
+      "recall_at_5": 0.7857142857142857,
+      "mean_gold_rank": 5.285714285714286,
+      "distractor_at_top1": 3,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.11447343705680679,
+      "items": [
+        {
+          "id": "temporal-001",
+          "query": "where does the user live now?",
+          "effective_query": "where does the user live now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 3,
+          "gold_score": 0.7015808512434873,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.729722742382345,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-002",
+          "query": "where did the user used to live?",
+          "effective_query": "where did the user used to live?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#1",
+          "gold_rank": 4,
+          "gold_score": 0.7268414774245511,
+          "top1_text": "The user moved from Berlin to Lisbon in 2024.",
+          "top1_score": 0.8198598847083589,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-003",
+          "query": "when did the user move?",
+          "effective_query": "when did the user move?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#2",
+          "gold_rank": 2,
+          "gold_score": 0.8637093178778421,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.9309845833792503,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-004",
+          "query": "where is the user currently based?",
+          "effective_query": "where is the user currently based?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 16,
+          "gold_score": 0.6237210510001825,
+          "top1_text": "The user is me.",
+          "top1_score": 0.8091389415487087,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-005",
+          "query": "what is the user reading?",
+          "effective_query": "what is the user reading?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#3",
+          "gold_rank": 1,
+          "gold_score": 0.8963167330268929,
+          "top1_text": "As of April 2026, the user is reading 'The Power Broker'.",
+          "top1_score": 0.8963167330268929,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-006",
+          "query": "what did the user read last year?",
+          "effective_query": "what did the user read last year?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#4",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "Last year the user read 'Project Hail Mary'.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-007",
+          "query": "what is the user working on these days?",
+          "effective_query": "what is the user working on these days?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 1,
+          "gold_score": 0.9636797079360075,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.9636797079360075,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-008",
+          "query": "what did the user finish last month?",
+          "effective_query": "what did the user finish last month?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#6",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user finished the Sprint-4 reranker training last month.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-009",
+          "query": "what LLM does the user prefer?",
+          "effective_query": "what LLM does the user prefer?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 1,
+          "gold_score": 0.7981511497774132,
+          "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.",
+          "top1_score": 0.7981511497774132,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-010",
+          "query": "which model did the user use before?",
+          "effective_query": "which model did the user use before?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#8",
+          "gold_rank": 1,
+          "gold_score": 0.9218020917135962,
+          "top1_text": "The user used GPT-4 as their primary model in 2024.",
+          "top1_score": 0.9218020917135962,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-011",
+          "query": "did the user get a new phone recently?",
+          "effective_query": "did the user get a new phone recently?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#9",
+          "gold_rank": 1,
+          "gold_score": 0.944084372003835,
+          "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.",
+          "top1_score": 0.944084372003835,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-012",
+          "query": "is the user still in Berlin?",
+          "effective_query": "is the user still in Berlin?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 5,
+          "gold_score": 0.5384570728328757,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-013",
+          "query": "what is the user up to?",
+          "effective_query": "what is the user up to?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 19,
+          "gold_score": 0.4820469206032369,
+          "top1_text": "The user is asking a question.",
+          "top1_score": 0.7983877869260091,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-014",
+          "query": "which model is the user on right now?",
+          "effective_query": "which model is the user on right now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 18,
+          "gold_score": 0.6134604774711914,
+          "top1_text": "The user is not on LinkedIn anymore.",
+          "top1_score": 0.881723970958755,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "specificity",
+      "n": 14,
+      "pool_size": 108,
+      "recall_at_1": 0.5,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.7142857142857142,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.03548021534901058,
+      "items": [
+        {
+          "id": "specificity-001",
+          "query": "tell me about my dog",
+          "effective_query": "tell the user about the user's dog",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 3,
+          "gold_score": 0.7743385636424491,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.9703829855046132,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-002",
+          "query": "what kind of pet do I have?",
+          "effective_query": "what kind of pet do the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 3,
+          "gold_score": 0.6345967882412202,
+          "top1_text": "I have a dog named Apollo.",
+          "top1_score": 0.883261556964638,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-003",
+          "query": "do I own a bike?",
+          "effective_query": "do the user own a bike?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 2,
+          "gold_score": 0.8662753596268623,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 0.8890501788742958,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-004",
+          "query": "what brand of bike does the user have?",
+          "effective_query": "what brand of bike does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.9277061278857603,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.9277061278857603,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-005",
+          "query": "what computer do I use?",
+          "effective_query": "what computer do the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 2,
+          "gold_score": 0.6612878214300802,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.7106184333803608,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-006",
+          "query": "what laptop does the user have?",
+          "effective_query": "what laptop does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.9411685874536975,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.9411685874536975,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-007",
+          "query": "do I have any musical instruments?",
+          "effective_query": "do the user have any musical instruments?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#3",
+          "gold_rank": 2,
+          "gold_score": 0.7324865742655697,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 0.8134921163906099,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-008",
+          "query": "which note-taking app do I use?",
+          "effective_query": "which note-taking app do the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#4",
+          "gold_rank": 2,
+          "gold_score": 0.6547377436617509,
+          "top1_text": "I do not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.712198423325952,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-009",
+          "query": "where do I like to eat in Lisbon?",
+          "effective_query": "where do the user like to eat in Lisbon?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#5",
+          "gold_rank": 3,
+          "gold_score": 0.8041812823837996,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.8406389607939662,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-010",
+          "query": "what brand sunglasses does the user wear?",
+          "effective_query": "what brand sunglasses does the user wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#6",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user wears Smith Lowdown sunglasses.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-011",
+          "query": "do I read on a Kindle?",
+          "effective_query": "do the user read on a Kindle?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#7",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-012",
+          "query": "what espresso machine does the user own?",
+          "effective_query": "what espresso machine does the user own?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#8",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's home espresso machine is a Lelit Bianca v3.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-013",
+          "query": "what shoes do I wear?",
+          "effective_query": "what shoes do the user wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.7416206214516285,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.7416206214516285,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-014",
+          "query": "what brand are the user's everyday shoes?",
+          "effective_query": "what brand are the user's everyday shoes?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.7357707060378677,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.7357707060378677,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "negation",
+      "n": 12,
+      "pool_size": 108,
+      "recall_at_1": 1,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.49515934695105424,
+      "items": [
+        {
+          "id": "negation-001",
+          "query": "does the user drink coffee?",
+          "effective_query": "does the user drink coffee?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-002",
+          "query": "what does the user drink in the morning?",
+          "effective_query": "what does the user drink in the morning?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-003",
+          "query": "am I vegetarian?",
+          "effective_query": "am the user vegetarian?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#1",
+          "gold_rank": 1,
+          "gold_score": 0.94940384673181,
+          "top1_text": "I am not vegetarian, but avoids red meat.",
+          "top1_score": 0.94940384673181,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-004",
+          "query": "is the user on Twitter?",
+          "effective_query": "is the user on Twitter?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.9064307475058953,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.9064307475058953,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-005",
+          "query": "which social networks does the user use?",
+          "effective_query": "which social networks does the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-006",
+          "query": "does the user own a car?",
+          "effective_query": "does the user own a car?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#3",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-007",
+          "query": "is the user active on LinkedIn?",
+          "effective_query": "is the user active on LinkedIn?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#4",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user is not on LinkedIn anymore.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-008",
+          "query": "any foods the user hates?",
+          "effective_query": "any foods the user hates?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#5",
+          "gold_rank": 1,
+          "gold_score": 0.6856226645347278,
+          "top1_text": "The user dislikes cilantro intensely.",
+          "top1_score": 0.6856226645347278,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-009",
+          "query": "has the user traveled to Asia?",
+          "effective_query": "has the user traveled to Asia?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#6",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-010",
+          "query": "does the user like horror movies?",
+          "effective_query": "does the user like horror movies?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#7",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not enjoy horror movies.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-011",
+          "query": "can the user eat shrimp?",
+          "effective_query": "can the user eat shrimp?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#8",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-012",
+          "query": "is the user learning a new language?",
+          "effective_query": "is the user learning a new language?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#9",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "control",
+      "n": 10,
+      "pool_size": 108,
+      "recall_at_1": 0,
+      "recall_at_5": 0,
+      "mean_gold_rank": null,
+      "distractor_at_top1": 0,
+      "false_positive_count": 10,
+      "median_gold_margin": null,
+      "items": [
+        {
+          "id": "control-001",
+          "query": "what is the airspeed velocity of an unladen swallow?",
+          "effective_query": "what is the airspeed velocity of an unladen swallow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user's favorite coffee order is an oat-milk flat white.",
+          "top1_score": 0.7462706361415277,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-002",
+          "query": "who is the current president of France?",
+          "effective_query": "who is the current president of France?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.8963307525203645,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-003",
+          "query": "what is the capital of Mongolia?",
+          "effective_query": "what is the capital of Mongolia?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of April 2026, the user is reading 'The Power Broker'.",
+          "top1_score": 0.7321019562425386,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-004",
+          "query": "how does photosynthesis work?",
+          "effective_query": "how does photosynthesis work?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.8448230497006028,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-005",
+          "query": "translate 'goodnight' to Japanese",
+          "effective_query": "translate 'goodnight' to Japanese",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "I have never been to Asia.",
+          "top1_score": 0.7858603613747184,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-006",
+          "query": "what year did World War II end?",
+          "effective_query": "what year did World War II end?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "Last year I read 'Project Hail Mary'.",
+          "top1_score": 0.6729541267175014,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-007",
+          "query": "explain entropy in thermodynamics",
+          "effective_query": "explain entropy in thermodynamics",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "I lives in Lisbon.",
+          "top1_score": 0.7556669023961569,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-008",
+          "query": "best way to debug a segfault in C",
+          "effective_query": "best way to debug a segfault in C",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "I am currently working on a memory benchmark project.",
+          "top1_score": 0.7108768572279539,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-009",
+          "query": "what's the weather going to be tomorrow?",
+          "effective_query": "what's the weather going to be tomorrow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.",
+          "top1_score": 0.6556307580300376,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-010",
+          "query": "give me a recipe for tiramisu",
+          "effective_query": "give the user a recipe for tiramisu",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.7954885232893061,
+          "top1_is_distractor": true,
+          "false_positive": true
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/demo-stress-2026-05-15.json b/benchmarks/alignbench/runs/demo-stress-2026-05-15.json
new file mode 100644
index 0000000..3923877
--- /dev/null
+++ b/benchmarks/alignbench/runs/demo-stress-2026-05-15.json
@@ -0,0 +1,945 @@
+{
+  "baseline": {
+    "recall_at_1": 0.5666666666666667,
+    "recall_at_5": 0.5666666666666667,
+    "gold_present_rate": 0.5666666666666667,
+    "meta_at_top1": 0,
+    "per_item": [
+      {
+        "id": "name-001",
+        "gold_rank": 1,
+        "gold_score": 0.3960303068161011,
+        "top1_score": 0.3960303068161011,
+        "top1_text": "The user's name is Alex.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_score": 0.340599000453949,
+        "top1_text": "The user goes by the name Sam.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "name-004",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "pet-001",
+        "gold_rank": 1,
+        "gold_score": 0.4510817527770996,
+        "top1_score": 0.4510817527770996,
+        "top1_text": "User has a golden retriever named Apollo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-002",
+        "gold_rank": 1,
+        "gold_score": 0.5039844512939453,
+        "top1_score": 0.5039844512939453,
+        "top1_text": "The user has a cat named Luna that sleeps on the user's keyboard.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "job-001",
+        "gold_rank": 1,
+        "gold_score": 0.19333378970623016,
+        "top1_score": 0.19333378970623016,
+        "top1_text": "User works as a software engineer at a startup",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "job-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "city-001",
+        "gold_rank": 1,
+        "gold_score": 0.295663058757782,
+        "top1_score": 0.295663058757782,
+        "top1_text": "The user lives in Lisbon.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "city-003",
+        "gold_rank": 1,
+        "gold_score": 0.3179514408111572,
+        "top1_score": 0.3179514408111572,
+        "top1_text": "User is based in Toronto",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-001",
+        "gold_rank": 1,
+        "gold_score": 0.4597603678703308,
+        "top1_score": 0.4597603678703308,
+        "top1_text": "The user is vegetarian.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-002",
+        "gold_rank": 1,
+        "gold_score": 0.41524738073349,
+        "top1_score": 0.41524738073349,
+        "top1_text": "User is severely allergic to peanuts",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-003",
+        "gold_rank": 1,
+        "gold_score": 0.3149101436138153,
+        "top1_score": 0.3149101436138153,
+        "top1_text": "User does not drink coffee and only drinks tea.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "hobby-002",
+        "gold_rank": 1,
+        "gold_score": 0.387075811624527,
+        "top1_score": 0.387075811624527,
+        "top1_text": "The user's main sport is rock climbing.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "family-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "family-002",
+        "gold_rank": 1,
+        "gold_score": 0.5129045844078064,
+        "top1_score": 0.5129045844078064,
+        "top1_text": "The user's partner's name is Casey.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-003",
+        "gold_rank": 1,
+        "gold_score": 0.6258488893508911,
+        "top1_score": 0.6258488893508911,
+        "top1_text": "The user's mom lives in Vancouver.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-001",
+        "gold_rank": 1,
+        "gold_score": 0.35032275319099426,
+        "top1_score": 0.35032275319099426,
+        "top1_text": "User drives a blue Subaru Outback.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_score": 0.383465051651001,
+        "top1_text": "User does not own a car and bikes everywhere for transportation.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-001",
+        "gold_rank": 1,
+        "gold_score": 0.43980997800827026,
+        "top1_score": 0.43980997800827026,
+        "top1_text": "User studied applied mathematics in college.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-002",
+        "gold_rank": 1,
+        "gold_score": 0.6381034255027771,
+        "top1_score": 0.6381034255027771,
+        "top1_text": "User obtained an MBA from UCLA two years ago",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "tech-002",
+        "gold_rank": 1,
+        "gold_score": 0.4922914206981659,
+        "top1_score": 0.4922914206981659,
+        "top1_text": "User prefers Neovim over VS Code as their text editor.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "music-002",
+        "gold_rank": 1,
+        "gold_score": 0.5527176260948181,
+        "top1_score": 0.5527176260948181,
+        "top1_text": "The user's all-time favorite band is Radiohead.",
+        "top1_is_meta": false
+      }
+    ]
+  },
+  "filtered": {
+    "recall_at_1": 0.5666666666666667,
+    "recall_at_5": 0.5666666666666667,
+    "gold_present_rate": 0.5666666666666667,
+    "meta_at_top1": 0,
+    "per_item": [
+      {
+        "id": "name-001",
+        "gold_rank": 1,
+        "gold_score": 0.3960303068161011,
+        "top1_score": 0.3960303068161011,
+        "top1_text": "The user's name is Alex.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_score": 0.340599000453949,
+        "top1_text": "The user goes by the name Sam.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "name-004",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "pet-001",
+        "gold_rank": 1,
+        "gold_score": 0.4510817527770996,
+        "top1_score": 0.4510817527770996,
+        "top1_text": "User has a golden retriever named Apollo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-002",
+        "gold_rank": 1,
+        "gold_score": 0.5039844512939453,
+        "top1_score": 0.5039844512939453,
+        "top1_text": "The user has a cat named Luna that sleeps on the user's keyboard.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "job-001",
+        "gold_rank": 1,
+        "gold_score": 0.19333378970623016,
+        "top1_score": 0.19333378970623016,
+        "top1_text": "User works as a software engineer at a startup",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "job-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "city-001",
+        "gold_rank": 1,
+        "gold_score": 0.295663058757782,
+        "top1_score": 0.295663058757782,
+        "top1_text": "The user lives in Lisbon.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "city-003",
+        "gold_rank": 1,
+        "gold_score": 0.3179514408111572,
+        "top1_score": 0.3179514408111572,
+        "top1_text": "User is based in Toronto",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-001",
+        "gold_rank": 1,
+        "gold_score": 0.4597603678703308,
+        "top1_score": 0.4597603678703308,
+        "top1_text": "The user is vegetarian.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-002",
+        "gold_rank": 1,
+        "gold_score": 0.41524738073349,
+        "top1_score": 0.41524738073349,
+        "top1_text": "User is severely allergic to peanuts",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-003",
+        "gold_rank": 1,
+        "gold_score": 0.3149101436138153,
+        "top1_score": 0.3149101436138153,
+        "top1_text": "User does not drink coffee and only drinks tea.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "hobby-002",
+        "gold_rank": 1,
+        "gold_score": 0.387075811624527,
+        "top1_score": 0.387075811624527,
+        "top1_text": "The user's main sport is rock climbing.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "family-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "family-002",
+        "gold_rank": 1,
+        "gold_score": 0.5129045844078064,
+        "top1_score": 0.5129045844078064,
+        "top1_text": "The user's partner's name is Casey.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-003",
+        "gold_rank": 1,
+        "gold_score": 0.6258488893508911,
+        "top1_score": 0.6258488893508911,
+        "top1_text": "The user's mom lives in Vancouver.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-001",
+        "gold_rank": 1,
+        "gold_score": 0.35032275319099426,
+        "top1_score": 0.35032275319099426,
+        "top1_text": "User drives a blue Subaru Outback.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_score": 0.383465051651001,
+        "top1_text": "User does not own a car and bikes everywhere for transportation.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-001",
+        "gold_rank": 1,
+        "gold_score": 0.43980997800827026,
+        "top1_score": 0.43980997800827026,
+        "top1_text": "User studied applied mathematics in college.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-002",
+        "gold_rank": 1,
+        "gold_score": 0.6381034255027771,
+        "top1_score": 0.6381034255027771,
+        "top1_text": "User obtained an MBA from UCLA two years ago",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "tech-002",
+        "gold_rank": 1,
+        "gold_score": 0.4922914206981659,
+        "top1_score": 0.4922914206981659,
+        "top1_text": "User prefers Neovim over VS Code as their text editor.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "music-002",
+        "gold_rank": 1,
+        "gold_score": 0.5527176260948181,
+        "top1_score": 0.5527176260948181,
+        "top1_text": "The user's all-time favorite band is Radiohead.",
+        "top1_is_meta": false
+      }
+    ]
+  },
+  "n": 30,
+  "rows": [
+    {
+      "id": "name-001",
+      "turns": [
+        "My name is Alex.",
+        "Got it."
+      ],
+      "query": "what is my name?",
+      "gold": "name is Alex",
+      "facts": [
+        "The user's name is Alex."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-002",
+      "turns": [
+        "I go by Sam.",
+        "OK Sam."
+      ],
+      "query": "what's my name?",
+      "gold": "go by Sam",
+      "facts": [
+        "The user goes by the name Sam."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-003",
+      "turns": [
+        "You can call me Riley.",
+        "Hi Riley."
+      ],
+      "query": "what should you call me?",
+      "gold": "call me Riley",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "name-004",
+      "turns": [
+        "I'm Jordan.",
+        "Nice to meet you."
+      ],
+      "query": "who am I?",
+      "gold": "Jordan",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-001",
+      "turns": [
+        "I have a golden retriever named Apollo.",
+        "How sweet."
+      ],
+      "query": "what is my dog's name?",
+      "gold": "Apollo",
+      "facts": [
+        "User has a golden retriever named Apollo."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-002",
+      "turns": [
+        "My cat Luna sleeps on my keyboard.",
+        "Classic cat."
+      ],
+      "query": "what's my cat's name?",
+      "gold": "Luna",
+      "facts": [
+        "The user has a cat named Luna that sleeps on the user's keyboard."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-003",
+      "turns": [
+        "I just adopted a beagle puppy. Her name is Penny.",
+        "Congrats!"
+      ],
+      "query": "what kind of dog do I have?",
+      "gold": "beagle",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "job-001",
+      "turns": [
+        "I work as a software engineer at a startup.",
+        "Cool field."
+      ],
+      "query": "what do I do for work?",
+      "gold": "software engineer",
+      "facts": [
+        "User works as a software engineer at a startup"
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "job-002",
+      "turns": [
+        "I'm a high school chemistry teacher.",
+        "That's important work."
+      ],
+      "query": "what is my profession?",
+      "gold": "chemistry teacher",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "job-003",
+      "turns": [
+        "I freelance as a graphic designer.",
+        "Nice."
+      ],
+      "query": "what's my job?",
+      "gold": "graphic designer",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "city-001",
+      "turns": [
+        "I live in Lisbon now.",
+        "Beautiful city."
+      ],
+      "query": "where do I live?",
+      "gold": "Lisbon",
+      "facts": [
+        "The user lives in Lisbon."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-002",
+      "turns": [
+        "I just moved to Berlin last month.",
+        "Welcome to Berlin."
+      ],
+      "query": "what city am I in?",
+      "gold": "Berlin",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "city-003",
+      "turns": [
+        "I'm based in Toronto.",
+        "Cold this time of year."
+      ],
+      "query": "where am I located?",
+      "gold": "Toronto",
+      "facts": [
+        "User is based in Toronto"
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-001",
+      "turns": [
+        "I'm vegetarian.",
+        "Got it."
+      ],
+      "query": "do I eat meat?",
+      "gold": "vegetarian",
+      "facts": [
+        "The user is vegetarian."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-002",
+      "turns": [
+        "I'm severely allergic to peanuts.",
+        "Noted, will avoid."
+      ],
+      "query": "do I have any allergies?",
+      "gold": "peanut",
+      "facts": [
+        "User is severely allergic to peanuts"
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-003",
+      "turns": [
+        "I don't drink coffee \u2014 only tea.",
+        "Tea is great too."
+      ],
+      "query": "what do I drink in the morning?",
+      "gold": "tea",
+      "facts": [
+        "User does not drink coffee and only drinks tea."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-001",
+      "turns": [
+        "I play classical piano.",
+        "Lovely hobby."
+      ],
+      "query": "what instrument do I play?",
+      "gold": "piano",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-002",
+      "turns": [
+        "My main sport is rock climbing.",
+        "Cool."
+      ],
+      "query": "what sport do I do?",
+      "gold": "rock climbing",
+      "facts": [
+        "The user's main sport is rock climbing."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-003",
+      "turns": [
+        "I've been knitting for about ten years.",
+        "Impressive."
+      ],
+      "query": "what's a hobby I have?",
+      "gold": "knitting",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "family-001",
+      "turns": [
+        "I have two kids, Maya and Theo.",
+        "What ages?"
+      ],
+      "query": "how many children do I have?",
+      "gold": "two",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "family-002",
+      "turns": [
+        "My partner's name is Casey.",
+        "Nice."
+      ],
+      "query": "who is my partner?",
+      "gold": "Casey",
+      "facts": [
+        "The user's partner's name is Casey."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-003",
+      "turns": [
+        "My mom lives in Vancouver.",
+        "Far from you?"
+      ],
+      "query": "where does my mom live?",
+      "gold": "Vancouver",
+      "facts": [
+        "The user's mom lives in Vancouver."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "vehicle-001",
+      "turns": [
+        "I drive a blue Subaru Outback.",
+        "Reliable car."
+      ],
+      "query": "what kind of car do I have?",
+      "gold": "Subaru",
+      "facts": [
+        "User drives a blue Subaru Outback."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "vehicle-002",
+      "turns": [
+        "I don't own a car. I bike everywhere.",
+        "Healthy lifestyle."
+      ],
+      "query": "do I have a car?",
+      "gold": "don't own",
+      "facts": [
+        "User does not own a car and bikes everywhere for transportation."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "edu-001",
+      "turns": [
+        "I studied applied mathematics in college.",
+        "Tough major."
+      ],
+      "query": "what was my major?",
+      "gold": "applied mathematics",
+      "facts": [
+        "User studied applied mathematics in college."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "edu-002",
+      "turns": [
+        "I got my MBA from UCLA two years ago.",
+        "Congrats."
+      ],
+      "query": "where did I get my MBA?",
+      "gold": "UCLA",
+      "facts": [
+        "User obtained an MBA from UCLA two years ago"
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "tech-001",
+      "turns": [
+        "My main laptop is a 16-inch MacBook Pro.",
+        "Solid machine."
+      ],
+      "query": "what computer do I use?",
+      "gold": "MacBook",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "tech-002",
+      "turns": [
+        "I prefer Neovim over VS Code.",
+        "Editor preferences are personal."
+      ],
+      "query": "what editor do I use?",
+      "gold": "Neovim",
+      "facts": [
+        "User prefers Neovim over VS Code as their text editor."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "music-001",
+      "turns": [
+        "I've been getting into bluegrass lately.",
+        "Fun genre."
+      ],
+      "query": "what music am I into these days?",
+      "gold": "bluegrass",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "music-002",
+      "turns": [
+        "My all-time favorite band is Radiohead.",
+        "Great band."
+      ],
+      "query": "what's my favorite band?",
+      "gold": "Radiohead",
+      "facts": [
+        "The user's all-time favorite band is Radiohead."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/demo-stress-v2-prompt-tuned-2026-05-15.json b/benchmarks/alignbench/runs/demo-stress-v2-prompt-tuned-2026-05-15.json
new file mode 100644
index 0000000..83af4e4
--- /dev/null
+++ b/benchmarks/alignbench/runs/demo-stress-v2-prompt-tuned-2026-05-15.json
@@ -0,0 +1,969 @@
+{
+  "baseline": {
+    "recall_at_1": 0.7333333333333333,
+    "recall_at_5": 0.7333333333333333,
+    "gold_present_rate": 0.7333333333333333,
+    "meta_at_top1": 0,
+    "per_item": [
+      {
+        "id": "name-001",
+        "gold_rank": 1,
+        "gold_score": 0.3960303068161011,
+        "top1_score": 0.3960303068161011,
+        "top1_text": "The user's name is Alex.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-002",
+        "gold_rank": 1,
+        "gold_score": 0.24057143926620483,
+        "top1_score": 0.24057143926620483,
+        "top1_text": "The user goes by Sam.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "name-004",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "pet-001",
+        "gold_rank": 1,
+        "gold_score": 0.46046364307403564,
+        "top1_score": 0.46046364307403564,
+        "top1_text": "The user has a golden retriever named Apollo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "pet-003",
+        "gold_rank": 1,
+        "gold_score": 0.4039832651615143,
+        "top1_score": 0.4039832651615143,
+        "top1_text": "The user adopted a beagle puppy named Penny.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-001",
+        "gold_rank": 1,
+        "gold_score": 0.18614771962165833,
+        "top1_score": 0.18614771962165833,
+        "top1_text": "The user works as a software engineer at a startup.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-002",
+        "gold_rank": 1,
+        "gold_score": 0.2045474350452423,
+        "top1_score": 0.2045474350452423,
+        "top1_text": "The user is a high school chemistry teacher.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-003",
+        "gold_rank": 1,
+        "gold_score": 0.3352947533130646,
+        "top1_score": 0.3352947533130646,
+        "top1_text": "The user freelances as a graphic designer.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-001",
+        "gold_rank": 1,
+        "gold_score": 0.2956629991531372,
+        "top1_score": 0.2956629991531372,
+        "top1_text": "The user lives in Lisbon.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "city-003",
+        "gold_rank": 1,
+        "gold_score": 0.33762556314468384,
+        "top1_score": 0.33762556314468384,
+        "top1_text": "The user is based in Toronto.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-001",
+        "gold_rank": 1,
+        "gold_score": 0.459760457277298,
+        "top1_score": 0.459760457277298,
+        "top1_text": "The user is vegetarian.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-002",
+        "gold_rank": 1,
+        "gold_score": 0.38580864667892456,
+        "top1_score": 0.38580864667892456,
+        "top1_text": "The user is severely allergic to peanuts.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-003",
+        "gold_rank": 1,
+        "gold_score": 0.30525362491607666,
+        "top1_score": 0.30525362491607666,
+        "top1_text": "The user does not drink coffee and only drinks tea.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-001",
+        "gold_rank": 1,
+        "gold_score": 0.38078930974006653,
+        "top1_score": 0.38078930974006653,
+        "top1_text": "The user plays classical piano.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-002",
+        "gold_rank": 1,
+        "gold_score": 0.38707590103149414,
+        "top1_score": 0.38707590103149414,
+        "top1_text": "The user's main sport is rock climbing.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "family-001",
+        "gold_rank": 1,
+        "gold_score": 0.28777992725372314,
+        "top1_score": 0.28777992725372314,
+        "top1_text": "The user has two children named Maya and Theo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-002",
+        "gold_rank": 1,
+        "gold_score": 0.5129046440124512,
+        "top1_score": 0.5129046440124512,
+        "top1_text": "The user's partner's name is Casey.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-003",
+        "gold_rank": 1,
+        "gold_score": 0.6085690259933472,
+        "top1_score": 0.6085690259933472,
+        "top1_text": "The user's mother lives in Vancouver.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-001",
+        "gold_rank": 1,
+        "gold_score": 0.33847203850746155,
+        "top1_score": 0.33847203850746155,
+        "top1_text": "The user drives a blue Subaru Outback.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_score": 0.410287082195282,
+        "top1_text": "The user does not own a car and bikes everywhere.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-001",
+        "gold_rank": 1,
+        "gold_score": 0.45382001996040344,
+        "top1_score": 0.45382001996040344,
+        "top1_text": "The user studied applied mathematics in college.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "tech-001",
+        "gold_rank": 1,
+        "gold_score": 0.30372458696365356,
+        "top1_score": 0.30372458696365356,
+        "top1_text": "The user's main laptop is a 16-inch MacBook Pro.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-002",
+        "gold_rank": 1,
+        "gold_score": 0.3180965483188629,
+        "top1_score": 0.3180965483188629,
+        "top1_text": "The user prefers Neovim over VS Code.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "music-002",
+        "gold_rank": 1,
+        "gold_score": 0.5527178049087524,
+        "top1_score": 0.5527178049087524,
+        "top1_text": "The user's all-time favorite band is Radiohead.",
+        "top1_is_meta": false
+      }
+    ]
+  },
+  "filtered": {
+    "recall_at_1": 0.7333333333333333,
+    "recall_at_5": 0.7333333333333333,
+    "gold_present_rate": 0.7333333333333333,
+    "meta_at_top1": 0,
+    "per_item": [
+      {
+        "id": "name-001",
+        "gold_rank": 1,
+        "gold_score": 0.3960303068161011,
+        "top1_score": 0.3960303068161011,
+        "top1_text": "The user's name is Alex.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-002",
+        "gold_rank": 1,
+        "gold_score": 0.24057143926620483,
+        "top1_score": 0.24057143926620483,
+        "top1_text": "The user goes by Sam.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "name-004",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "pet-001",
+        "gold_rank": 1,
+        "gold_score": 0.46046364307403564,
+        "top1_score": 0.46046364307403564,
+        "top1_text": "The user has a golden retriever named Apollo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "pet-003",
+        "gold_rank": 1,
+        "gold_score": 0.4039832651615143,
+        "top1_score": 0.4039832651615143,
+        "top1_text": "The user adopted a beagle puppy named Penny.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-001",
+        "gold_rank": 1,
+        "gold_score": 0.18614771962165833,
+        "top1_score": 0.18614771962165833,
+        "top1_text": "The user works as a software engineer at a startup.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-002",
+        "gold_rank": 1,
+        "gold_score": 0.2045474350452423,
+        "top1_score": 0.2045474350452423,
+        "top1_text": "The user is a high school chemistry teacher.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-003",
+        "gold_rank": 1,
+        "gold_score": 0.3352947533130646,
+        "top1_score": 0.3352947533130646,
+        "top1_text": "The user freelances as a graphic designer.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-001",
+        "gold_rank": 1,
+        "gold_score": 0.2956629991531372,
+        "top1_score": 0.2956629991531372,
+        "top1_text": "The user lives in Lisbon.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "city-003",
+        "gold_rank": 1,
+        "gold_score": 0.33762556314468384,
+        "top1_score": 0.33762556314468384,
+        "top1_text": "The user is based in Toronto.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-001",
+        "gold_rank": 1,
+        "gold_score": 0.459760457277298,
+        "top1_score": 0.459760457277298,
+        "top1_text": "The user is vegetarian.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-002",
+        "gold_rank": 1,
+        "gold_score": 0.38580864667892456,
+        "top1_score": 0.38580864667892456,
+        "top1_text": "The user is severely allergic to peanuts.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-003",
+        "gold_rank": 1,
+        "gold_score": 0.30525362491607666,
+        "top1_score": 0.30525362491607666,
+        "top1_text": "The user does not drink coffee and only drinks tea.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-001",
+        "gold_rank": 1,
+        "gold_score": 0.38078930974006653,
+        "top1_score": 0.38078930974006653,
+        "top1_text": "The user plays classical piano.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-002",
+        "gold_rank": 1,
+        "gold_score": 0.38707590103149414,
+        "top1_score": 0.38707590103149414,
+        "top1_text": "The user's main sport is rock climbing.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-003",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "family-001",
+        "gold_rank": 1,
+        "gold_score": 0.28777992725372314,
+        "top1_score": 0.28777992725372314,
+        "top1_text": "The user has two children named Maya and Theo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-002",
+        "gold_rank": 1,
+        "gold_score": 0.5129046440124512,
+        "top1_score": 0.5129046440124512,
+        "top1_text": "The user's partner's name is Casey.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-003",
+        "gold_rank": 1,
+        "gold_score": 0.6085690259933472,
+        "top1_score": 0.6085690259933472,
+        "top1_text": "The user's mother lives in Vancouver.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-001",
+        "gold_rank": 1,
+        "gold_score": 0.33847203850746155,
+        "top1_score": 0.33847203850746155,
+        "top1_text": "The user drives a blue Subaru Outback.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_score": 0.410287082195282,
+        "top1_text": "The user does not own a car and bikes everywhere.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-001",
+        "gold_rank": 1,
+        "gold_score": 0.45382001996040344,
+        "top1_score": 0.45382001996040344,
+        "top1_text": "The user studied applied mathematics in college.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "tech-001",
+        "gold_rank": 1,
+        "gold_score": 0.30372458696365356,
+        "top1_score": 0.30372458696365356,
+        "top1_text": "The user's main laptop is a 16-inch MacBook Pro.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-002",
+        "gold_rank": 1,
+        "gold_score": 0.3180965483188629,
+        "top1_score": 0.3180965483188629,
+        "top1_text": "The user prefers Neovim over VS Code.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-001",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_is_meta": false,
+        "top1_text": null
+      },
+      {
+        "id": "music-002",
+        "gold_rank": 1,
+        "gold_score": 0.5527178049087524,
+        "top1_score": 0.5527178049087524,
+        "top1_text": "The user's all-time favorite band is Radiohead.",
+        "top1_is_meta": false
+      }
+    ]
+  },
+  "n": 30,
+  "rows": [
+    {
+      "id": "name-001",
+      "turns": [
+        "My name is Alex.",
+        "Got it."
+      ],
+      "query": "what is my name?",
+      "gold": "name is Alex",
+      "facts": [
+        "The user's name is Alex."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-002",
+      "turns": [
+        "I go by Sam.",
+        "OK Sam."
+      ],
+      "query": "what's my name?",
+      "gold": "go by Sam",
+      "facts": [
+        "The user goes by Sam."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-003",
+      "turns": [
+        "You can call me Riley.",
+        "Hi Riley."
+      ],
+      "query": "what should you call me?",
+      "gold": "call me Riley",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "name-004",
+      "turns": [
+        "I'm Jordan.",
+        "Nice to meet you."
+      ],
+      "query": "who am I?",
+      "gold": "Jordan",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-001",
+      "turns": [
+        "I have a golden retriever named Apollo.",
+        "How sweet."
+      ],
+      "query": "what is my dog's name?",
+      "gold": "Apollo",
+      "facts": [
+        "The user has a golden retriever named Apollo."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-002",
+      "turns": [
+        "My cat Luna sleeps on my keyboard.",
+        "Classic cat."
+      ],
+      "query": "what's my cat's name?",
+      "gold": "Luna",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-003",
+      "turns": [
+        "I just adopted a beagle puppy. Her name is Penny.",
+        "Congrats!"
+      ],
+      "query": "what kind of dog do I have?",
+      "gold": "beagle",
+      "facts": [
+        "The user adopted a beagle puppy named Penny."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "job-001",
+      "turns": [
+        "I work as a software engineer at a startup.",
+        "Cool field."
+      ],
+      "query": "what do I do for work?",
+      "gold": "software engineer",
+      "facts": [
+        "The user works as a software engineer at a startup."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "job-002",
+      "turns": [
+        "I'm a high school chemistry teacher.",
+        "That's important work."
+      ],
+      "query": "what is my profession?",
+      "gold": "chemistry teacher",
+      "facts": [
+        "The user is a high school chemistry teacher."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "job-003",
+      "turns": [
+        "I freelance as a graphic designer.",
+        "Nice."
+      ],
+      "query": "what's my job?",
+      "gold": "graphic designer",
+      "facts": [
+        "The user freelances as a graphic designer."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-001",
+      "turns": [
+        "I live in Lisbon now.",
+        "Beautiful city."
+      ],
+      "query": "where do I live?",
+      "gold": "Lisbon",
+      "facts": [
+        "The user lives in Lisbon."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-002",
+      "turns": [
+        "I just moved to Berlin last month.",
+        "Welcome to Berlin."
+      ],
+      "query": "what city am I in?",
+      "gold": "Berlin",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "city-003",
+      "turns": [
+        "I'm based in Toronto.",
+        "Cold this time of year."
+      ],
+      "query": "where am I located?",
+      "gold": "Toronto",
+      "facts": [
+        "The user is based in Toronto."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-001",
+      "turns": [
+        "I'm vegetarian.",
+        "Got it."
+      ],
+      "query": "do I eat meat?",
+      "gold": "vegetarian",
+      "facts": [
+        "The user is vegetarian."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-002",
+      "turns": [
+        "I'm severely allergic to peanuts.",
+        "Noted, will avoid."
+      ],
+      "query": "do I have any allergies?",
+      "gold": "peanut",
+      "facts": [
+        "The user is severely allergic to peanuts."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-003",
+      "turns": [
+        "I don't drink coffee \u2014 only tea.",
+        "Tea is great too."
+      ],
+      "query": "what do I drink in the morning?",
+      "gold": "tea",
+      "facts": [
+        "The user does not drink coffee and only drinks tea."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-001",
+      "turns": [
+        "I play classical piano.",
+        "Lovely hobby."
+      ],
+      "query": "what instrument do I play?",
+      "gold": "piano",
+      "facts": [
+        "The user plays classical piano."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-002",
+      "turns": [
+        "My main sport is rock climbing.",
+        "Cool."
+      ],
+      "query": "what sport do I do?",
+      "gold": "rock climbing",
+      "facts": [
+        "The user's main sport is rock climbing."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-003",
+      "turns": [
+        "I've been knitting for about ten years.",
+        "Impressive."
+      ],
+      "query": "what's a hobby I have?",
+      "gold": "knitting",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "family-001",
+      "turns": [
+        "I have two kids, Maya and Theo.",
+        "What ages?"
+      ],
+      "query": "how many children do I have?",
+      "gold": "two",
+      "facts": [
+        "The user has two children named Maya and Theo."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-002",
+      "turns": [
+        "My partner's name is Casey.",
+        "Nice."
+      ],
+      "query": "who is my partner?",
+      "gold": "Casey",
+      "facts": [
+        "The user's partner's name is Casey."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-003",
+      "turns": [
+        "My mom lives in Vancouver.",
+        "Far from you?"
+      ],
+      "query": "where does my mom live?",
+      "gold": "Vancouver",
+      "facts": [
+        "The user's mother lives in Vancouver."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "vehicle-001",
+      "turns": [
+        "I drive a blue Subaru Outback.",
+        "Reliable car."
+      ],
+      "query": "what kind of car do I have?",
+      "gold": "Subaru",
+      "facts": [
+        "The user drives a blue Subaru Outback."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "vehicle-002",
+      "turns": [
+        "I don't own a car. I bike everywhere.",
+        "Healthy lifestyle."
+      ],
+      "query": "do I have a car?",
+      "gold": "don't own",
+      "facts": [
+        "The user does not own a car and bikes everywhere."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "edu-001",
+      "turns": [
+        "I studied applied mathematics in college.",
+        "Tough major."
+      ],
+      "query": "what was my major?",
+      "gold": "applied mathematics",
+      "facts": [
+        "The user studied applied mathematics in college."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "edu-002",
+      "turns": [
+        "I got my MBA from UCLA two years ago.",
+        "Congrats."
+      ],
+      "query": "where did I get my MBA?",
+      "gold": "UCLA",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "tech-001",
+      "turns": [
+        "My main laptop is a 16-inch MacBook Pro.",
+        "Solid machine."
+      ],
+      "query": "what computer do I use?",
+      "gold": "MacBook",
+      "facts": [
+        "The user's main laptop is a 16-inch MacBook Pro."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "tech-002",
+      "turns": [
+        "I prefer Neovim over VS Code.",
+        "Editor preferences are personal."
+      ],
+      "query": "what editor do I use?",
+      "gold": "Neovim",
+      "facts": [
+        "The user prefers Neovim over VS Code."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "music-001",
+      "turns": [
+        "I've been getting into bluegrass lately.",
+        "Fun genre."
+      ],
+      "query": "what music am I into these days?",
+      "gold": "bluegrass",
+      "facts": [],
+      "meta_mask": [],
+      "n_facts": 0,
+      "n_meta": 0
+    },
+    {
+      "id": "music-002",
+      "turns": [
+        "My all-time favorite band is Radiohead.",
+        "Great band."
+      ],
+      "query": "what's my favorite band?",
+      "gold": "Radiohead",
+      "facts": [
+        "The user's all-time favorite band is Radiohead."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/demo-stress-v3-prompt+prefill-2026-05-15.json b/benchmarks/alignbench/runs/demo-stress-v3-prompt+prefill-2026-05-15.json
new file mode 100644
index 0000000..7b41c9e
--- /dev/null
+++ b/benchmarks/alignbench/runs/demo-stress-v3-prompt+prefill-2026-05-15.json
@@ -0,0 +1,1017 @@
+{
+  "baseline": {
+    "recall_at_1": 0.9666666666666667,
+    "recall_at_5": 0.9666666666666667,
+    "gold_present_rate": 0.9666666666666667,
+    "meta_at_top1": 0,
+    "per_item": [
+      {
+        "id": "name-001",
+        "gold_rank": 1,
+        "gold_score": 0.3960303068161011,
+        "top1_score": 0.3960303068161011,
+        "top1_text": "The user's name is Alex.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-002",
+        "gold_rank": 1,
+        "gold_score": 0.33899739384651184,
+        "top1_score": 0.33899739384651184,
+        "top1_text": "Sam goes by the name Sam.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-003",
+        "gold_rank": 1,
+        "gold_score": 0.17434966564178467,
+        "top1_score": 0.17434966564178467,
+        "top1_text": "The user's name is Riley.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-004",
+        "gold_rank": 1,
+        "gold_score": 0.2914789915084839,
+        "top1_score": 0.2914789915084839,
+        "top1_text": "The user's name is Jordan.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-001",
+        "gold_rank": 1,
+        "gold_score": 0.46046364307403564,
+        "top1_score": 0.46046364307403564,
+        "top1_text": "The user has a golden retriever named Apollo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-002",
+        "gold_rank": 1,
+        "gold_score": 0.5874672532081604,
+        "top1_score": 0.5874672532081604,
+        "top1_text": "The user has a cat named Luna.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-003",
+        "gold_rank": 1,
+        "gold_score": 0.42153388261795044,
+        "top1_score": 0.42153388261795044,
+        "top1_text": "The user recently adopted a beagle puppy named Penny.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-001",
+        "gold_rank": 1,
+        "gold_score": 0.18614771962165833,
+        "top1_score": 0.18614771962165833,
+        "top1_text": "The user works as a software engineer at a startup.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-002",
+        "gold_rank": 1,
+        "gold_score": 0.2045474350452423,
+        "top1_score": 0.2045474350452423,
+        "top1_text": "The user is a high school chemistry teacher.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-003",
+        "gold_rank": 1,
+        "gold_score": 0.3352947533130646,
+        "top1_score": 0.3352947533130646,
+        "top1_text": "The user freelances as a graphic designer.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-001",
+        "gold_rank": 1,
+        "gold_score": 0.2956629991531372,
+        "top1_score": 0.2956629991531372,
+        "top1_text": "The user lives in Lisbon.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-002",
+        "gold_rank": 1,
+        "gold_score": 0.22115547955036163,
+        "top1_score": 0.22115547955036163,
+        "top1_text": "The user moved to Berlin last month.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-003",
+        "gold_rank": 1,
+        "gold_score": 0.33762556314468384,
+        "top1_score": 0.33762556314468384,
+        "top1_text": "The user is based in Toronto.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-001",
+        "gold_rank": 1,
+        "gold_score": 0.459760457277298,
+        "top1_score": 0.459760457277298,
+        "top1_text": "The user is vegetarian.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-002",
+        "gold_rank": 1,
+        "gold_score": 0.38580864667892456,
+        "top1_score": 0.38580864667892456,
+        "top1_text": "The user is severely allergic to peanuts.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-003",
+        "gold_rank": 1,
+        "gold_score": 0.30525362491607666,
+        "top1_score": 0.30525362491607666,
+        "top1_text": "The user does not drink coffee and only drinks tea.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-001",
+        "gold_rank": 1,
+        "gold_score": 0.38078930974006653,
+        "top1_score": 0.38078930974006653,
+        "top1_text": "The user plays classical piano.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-002",
+        "gold_rank": 1,
+        "gold_score": 0.38707590103149414,
+        "top1_score": 0.38707590103149414,
+        "top1_text": "The user's main sport is rock climbing.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-003",
+        "gold_rank": 1,
+        "gold_score": 0.264183908700943,
+        "top1_score": 0.264183908700943,
+        "top1_text": "The user has been knitting for about ten years.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-001",
+        "gold_rank": 1,
+        "gold_score": 0.28777992725372314,
+        "top1_score": 0.28777992725372314,
+        "top1_text": "The user has two children named Maya and Theo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-002",
+        "gold_rank": 1,
+        "gold_score": 0.5129046440124512,
+        "top1_score": 0.5129046440124512,
+        "top1_text": "The user's partner's name is Casey.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-003",
+        "gold_rank": 1,
+        "gold_score": 0.6258491277694702,
+        "top1_score": 0.6258491277694702,
+        "top1_text": "The user's mom lives in Vancouver.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-001",
+        "gold_rank": 1,
+        "gold_score": 0.33847203850746155,
+        "top1_score": 0.33847203850746155,
+        "top1_text": "The user drives a blue Subaru Outback.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_score": 0.4807737469673157,
+        "top1_text": "The user does not own a car.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-001",
+        "gold_rank": 1,
+        "gold_score": 0.45382001996040344,
+        "top1_score": 0.45382001996040344,
+        "top1_text": "The user studied applied mathematics in college.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-002",
+        "gold_rank": 1,
+        "gold_score": 0.6818706393241882,
+        "top1_score": 0.6818706393241882,
+        "top1_text": "The user obtained an MBA from UCLA two years ago.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-001",
+        "gold_rank": 1,
+        "gold_score": 0.30372458696365356,
+        "top1_score": 0.30372458696365356,
+        "top1_text": "The user's main laptop is a 16-inch MacBook Pro.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-002",
+        "gold_rank": 1,
+        "gold_score": 0.3180965483188629,
+        "top1_score": 0.3180965483188629,
+        "top1_text": "The user prefers Neovim over VS Code.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-001",
+        "gold_rank": 1,
+        "gold_score": 0.33930984139442444,
+        "top1_score": 0.33930984139442444,
+        "top1_text": "The user has been getting into bluegrass music recently.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-002",
+        "gold_rank": 1,
+        "gold_score": 0.5527178049087524,
+        "top1_score": 0.5527178049087524,
+        "top1_text": "The user's all-time favorite band is Radiohead.",
+        "top1_is_meta": false
+      }
+    ]
+  },
+  "filtered": {
+    "recall_at_1": 0.9666666666666667,
+    "recall_at_5": 0.9666666666666667,
+    "gold_present_rate": 0.9666666666666667,
+    "meta_at_top1": 0,
+    "per_item": [
+      {
+        "id": "name-001",
+        "gold_rank": 1,
+        "gold_score": 0.3960303068161011,
+        "top1_score": 0.3960303068161011,
+        "top1_text": "The user's name is Alex.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-002",
+        "gold_rank": 1,
+        "gold_score": 0.33899739384651184,
+        "top1_score": 0.33899739384651184,
+        "top1_text": "Sam goes by the name Sam.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-003",
+        "gold_rank": 1,
+        "gold_score": 0.17434966564178467,
+        "top1_score": 0.17434966564178467,
+        "top1_text": "The user's name is Riley.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-004",
+        "gold_rank": 1,
+        "gold_score": 0.2914789915084839,
+        "top1_score": 0.2914789915084839,
+        "top1_text": "The user's name is Jordan.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-001",
+        "gold_rank": 1,
+        "gold_score": 0.46046364307403564,
+        "top1_score": 0.46046364307403564,
+        "top1_text": "The user has a golden retriever named Apollo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-002",
+        "gold_rank": 1,
+        "gold_score": 0.5874672532081604,
+        "top1_score": 0.5874672532081604,
+        "top1_text": "The user has a cat named Luna.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-003",
+        "gold_rank": 1,
+        "gold_score": 0.42153388261795044,
+        "top1_score": 0.42153388261795044,
+        "top1_text": "The user recently adopted a beagle puppy named Penny.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-001",
+        "gold_rank": 1,
+        "gold_score": 0.18614771962165833,
+        "top1_score": 0.18614771962165833,
+        "top1_text": "The user works as a software engineer at a startup.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-002",
+        "gold_rank": 1,
+        "gold_score": 0.2045474350452423,
+        "top1_score": 0.2045474350452423,
+        "top1_text": "The user is a high school chemistry teacher.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-003",
+        "gold_rank": 1,
+        "gold_score": 0.3352947533130646,
+        "top1_score": 0.3352947533130646,
+        "top1_text": "The user freelances as a graphic designer.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-001",
+        "gold_rank": 1,
+        "gold_score": 0.2956629991531372,
+        "top1_score": 0.2956629991531372,
+        "top1_text": "The user lives in Lisbon.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-002",
+        "gold_rank": 1,
+        "gold_score": 0.22115547955036163,
+        "top1_score": 0.22115547955036163,
+        "top1_text": "The user moved to Berlin last month.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-003",
+        "gold_rank": 1,
+        "gold_score": 0.33762556314468384,
+        "top1_score": 0.33762556314468384,
+        "top1_text": "The user is based in Toronto.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-001",
+        "gold_rank": 1,
+        "gold_score": 0.459760457277298,
+        "top1_score": 0.459760457277298,
+        "top1_text": "The user is vegetarian.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-002",
+        "gold_rank": 1,
+        "gold_score": 0.38580864667892456,
+        "top1_score": 0.38580864667892456,
+        "top1_text": "The user is severely allergic to peanuts.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-003",
+        "gold_rank": 1,
+        "gold_score": 0.30525362491607666,
+        "top1_score": 0.30525362491607666,
+        "top1_text": "The user does not drink coffee and only drinks tea.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-001",
+        "gold_rank": 1,
+        "gold_score": 0.38078930974006653,
+        "top1_score": 0.38078930974006653,
+        "top1_text": "The user plays classical piano.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-002",
+        "gold_rank": 1,
+        "gold_score": 0.38707590103149414,
+        "top1_score": 0.38707590103149414,
+        "top1_text": "The user's main sport is rock climbing.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-003",
+        "gold_rank": 1,
+        "gold_score": 0.264183908700943,
+        "top1_score": 0.264183908700943,
+        "top1_text": "The user has been knitting for about ten years.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-001",
+        "gold_rank": 1,
+        "gold_score": 0.28777992725372314,
+        "top1_score": 0.28777992725372314,
+        "top1_text": "The user has two children named Maya and Theo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-002",
+        "gold_rank": 1,
+        "gold_score": 0.5129046440124512,
+        "top1_score": 0.5129046440124512,
+        "top1_text": "The user's partner's name is Casey.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-003",
+        "gold_rank": 1,
+        "gold_score": 0.6258491277694702,
+        "top1_score": 0.6258491277694702,
+        "top1_text": "The user's mom lives in Vancouver.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-001",
+        "gold_rank": 1,
+        "gold_score": 0.33847203850746155,
+        "top1_score": 0.33847203850746155,
+        "top1_text": "The user drives a blue Subaru Outback.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-002",
+        "gold_rank": null,
+        "gold_score": null,
+        "top1_score": 0.4807737469673157,
+        "top1_text": "The user does not own a car.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-001",
+        "gold_rank": 1,
+        "gold_score": 0.45382001996040344,
+        "top1_score": 0.45382001996040344,
+        "top1_text": "The user studied applied mathematics in college.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-002",
+        "gold_rank": 1,
+        "gold_score": 0.6818706393241882,
+        "top1_score": 0.6818706393241882,
+        "top1_text": "The user obtained an MBA from UCLA two years ago.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-001",
+        "gold_rank": 1,
+        "gold_score": 0.30372458696365356,
+        "top1_score": 0.30372458696365356,
+        "top1_text": "The user's main laptop is a 16-inch MacBook Pro.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-002",
+        "gold_rank": 1,
+        "gold_score": 0.3180965483188629,
+        "top1_score": 0.3180965483188629,
+        "top1_text": "The user prefers Neovim over VS Code.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-001",
+        "gold_rank": 1,
+        "gold_score": 0.33930984139442444,
+        "top1_score": 0.33930984139442444,
+        "top1_text": "The user has been getting into bluegrass music recently.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-002",
+        "gold_rank": 1,
+        "gold_score": 0.5527178049087524,
+        "top1_score": 0.5527178049087524,
+        "top1_text": "The user's all-time favorite band is Radiohead.",
+        "top1_is_meta": false
+      }
+    ]
+  },
+  "n": 30,
+  "rows": [
+    {
+      "id": "name-001",
+      "turns": [
+        "My name is Alex.",
+        "Got it."
+      ],
+      "query": "what is my name?",
+      "gold": "name is Alex",
+      "facts": [
+        "The user's name is Alex."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-002",
+      "turns": [
+        "I go by Sam.",
+        "OK Sam."
+      ],
+      "query": "what's my name?",
+      "gold": "go by Sam",
+      "facts": [
+        "Sam goes by the name Sam."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-003",
+      "turns": [
+        "You can call me Riley.",
+        "Hi Riley."
+      ],
+      "query": "what should you call me?",
+      "gold": "call me Riley",
+      "facts": [
+        "The user's name is Riley."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-004",
+      "turns": [
+        "I'm Jordan.",
+        "Nice to meet you."
+      ],
+      "query": "who am I?",
+      "gold": "Jordan",
+      "facts": [
+        "The user's name is Jordan."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-001",
+      "turns": [
+        "I have a golden retriever named Apollo.",
+        "How sweet."
+      ],
+      "query": "what is my dog's name?",
+      "gold": "Apollo",
+      "facts": [
+        "The user has a golden retriever named Apollo."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-002",
+      "turns": [
+        "My cat Luna sleeps on my keyboard.",
+        "Classic cat."
+      ],
+      "query": "what's my cat's name?",
+      "gold": "Luna",
+      "facts": [
+        "The user has a cat named Luna.",
+        "Luna sleeps on the user's keyboard."
+      ],
+      "meta_mask": [
+        false,
+        false
+      ],
+      "n_facts": 2,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-003",
+      "turns": [
+        "I just adopted a beagle puppy. Her name is Penny.",
+        "Congrats!"
+      ],
+      "query": "what kind of dog do I have?",
+      "gold": "beagle",
+      "facts": [
+        "The user recently adopted a beagle puppy named Penny.",
+        "The user's beagle puppy Penny is female."
+      ],
+      "meta_mask": [
+        false,
+        false
+      ],
+      "n_facts": 2,
+      "n_meta": 0
+    },
+    {
+      "id": "job-001",
+      "turns": [
+        "I work as a software engineer at a startup.",
+        "Cool field."
+      ],
+      "query": "what do I do for work?",
+      "gold": "software engineer",
+      "facts": [
+        "The user works as a software engineer at a startup."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "job-002",
+      "turns": [
+        "I'm a high school chemistry teacher.",
+        "That's important work."
+      ],
+      "query": "what is my profession?",
+      "gold": "chemistry teacher",
+      "facts": [
+        "The user is a high school chemistry teacher."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "job-003",
+      "turns": [
+        "I freelance as a graphic designer.",
+        "Nice."
+      ],
+      "query": "what's my job?",
+      "gold": "graphic designer",
+      "facts": [
+        "The user freelances as a graphic designer."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-001",
+      "turns": [
+        "I live in Lisbon now.",
+        "Beautiful city."
+      ],
+      "query": "where do I live?",
+      "gold": "Lisbon",
+      "facts": [
+        "The user lives in Lisbon."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-002",
+      "turns": [
+        "I just moved to Berlin last month.",
+        "Welcome to Berlin."
+      ],
+      "query": "what city am I in?",
+      "gold": "Berlin",
+      "facts": [
+        "The user moved to Berlin last month."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-003",
+      "turns": [
+        "I'm based in Toronto.",
+        "Cold this time of year."
+      ],
+      "query": "where am I located?",
+      "gold": "Toronto",
+      "facts": [
+        "The user is based in Toronto."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-001",
+      "turns": [
+        "I'm vegetarian.",
+        "Got it."
+      ],
+      "query": "do I eat meat?",
+      "gold": "vegetarian",
+      "facts": [
+        "The user is vegetarian."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-002",
+      "turns": [
+        "I'm severely allergic to peanuts.",
+        "Noted, will avoid."
+      ],
+      "query": "do I have any allergies?",
+      "gold": "peanut",
+      "facts": [
+        "The user is severely allergic to peanuts."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-003",
+      "turns": [
+        "I don't drink coffee \u2014 only tea.",
+        "Tea is great too."
+      ],
+      "query": "what do I drink in the morning?",
+      "gold": "tea",
+      "facts": [
+        "The user does not drink coffee and only drinks tea."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-001",
+      "turns": [
+        "I play classical piano.",
+        "Lovely hobby."
+      ],
+      "query": "what instrument do I play?",
+      "gold": "piano",
+      "facts": [
+        "The user plays classical piano."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-002",
+      "turns": [
+        "My main sport is rock climbing.",
+        "Cool."
+      ],
+      "query": "what sport do I do?",
+      "gold": "rock climbing",
+      "facts": [
+        "The user's main sport is rock climbing."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-003",
+      "turns": [
+        "I've been knitting for about ten years.",
+        "Impressive."
+      ],
+      "query": "what's a hobby I have?",
+      "gold": "knitting",
+      "facts": [
+        "The user has been knitting for about ten years."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-001",
+      "turns": [
+        "I have two kids, Maya and Theo.",
+        "What ages?"
+      ],
+      "query": "how many children do I have?",
+      "gold": "two",
+      "facts": [
+        "The user has two children named Maya and Theo."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-002",
+      "turns": [
+        "My partner's name is Casey.",
+        "Nice."
+      ],
+      "query": "who is my partner?",
+      "gold": "Casey",
+      "facts": [
+        "The user's partner's name is Casey."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-003",
+      "turns": [
+        "My mom lives in Vancouver.",
+        "Far from you?"
+      ],
+      "query": "where does my mom live?",
+      "gold": "Vancouver",
+      "facts": [
+        "The user's mom lives in Vancouver."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "vehicle-001",
+      "turns": [
+        "I drive a blue Subaru Outback.",
+        "Reliable car."
+      ],
+      "query": "what kind of car do I have?",
+      "gold": "Subaru",
+      "facts": [
+        "The user drives a blue Subaru Outback."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "vehicle-002",
+      "turns": [
+        "I don't own a car. I bike everywhere.",
+        "Healthy lifestyle."
+      ],
+      "query": "do I have a car?",
+      "gold": "don't own",
+      "facts": [
+        "The user does not own a car.",
+        "The user bikes everywhere for transportation."
+      ],
+      "meta_mask": [
+        false,
+        false
+      ],
+      "n_facts": 2,
+      "n_meta": 0
+    },
+    {
+      "id": "edu-001",
+      "turns": [
+        "I studied applied mathematics in college.",
+        "Tough major."
+      ],
+      "query": "what was my major?",
+      "gold": "applied mathematics",
+      "facts": [
+        "The user studied applied mathematics in college."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "edu-002",
+      "turns": [
+        "I got my MBA from UCLA two years ago.",
+        "Congrats."
+      ],
+      "query": "where did I get my MBA?",
+      "gold": "UCLA",
+      "facts": [
+        "The user obtained an MBA from UCLA two years ago."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "tech-001",
+      "turns": [
+        "My main laptop is a 16-inch MacBook Pro.",
+        "Solid machine."
+      ],
+      "query": "what computer do I use?",
+      "gold": "MacBook",
+      "facts": [
+        "The user's main laptop is a 16-inch MacBook Pro."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "tech-002",
+      "turns": [
+        "I prefer Neovim over VS Code.",
+        "Editor preferences are personal."
+      ],
+      "query": "what editor do I use?",
+      "gold": "Neovim",
+      "facts": [
+        "The user prefers Neovim over VS Code."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "music-001",
+      "turns": [
+        "I've been getting into bluegrass lately.",
+        "Fun genre."
+      ],
+      "query": "what music am I into these days?",
+      "gold": "bluegrass",
+      "facts": [
+        "The user has been getting into bluegrass music recently."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "music-002",
+      "turns": [
+        "My all-time favorite band is Radiohead.",
+        "Great band."
+      ],
+      "query": "what's my favorite band?",
+      "gold": "Radiohead",
+      "facts": [
+        "The user's all-time favorite band is Radiohead."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/demo-stress-v4-final-2026-05-15.json b/benchmarks/alignbench/runs/demo-stress-v4-final-2026-05-15.json
new file mode 100644
index 0000000..a9fc648
--- /dev/null
+++ b/benchmarks/alignbench/runs/demo-stress-v4-final-2026-05-15.json
@@ -0,0 +1,1017 @@
+{
+  "baseline": {
+    "recall_at_1": 1.0,
+    "recall_at_5": 1.0,
+    "gold_present_rate": 1.0,
+    "meta_at_top1": 0,
+    "per_item": [
+      {
+        "id": "name-001",
+        "gold_rank": 1,
+        "gold_score": 0.3960301876068115,
+        "top1_score": 0.3960301876068115,
+        "top1_text": "The user's name is Alex.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-002",
+        "gold_rank": 1,
+        "gold_score": 0.33899739384651184,
+        "top1_score": 0.33899739384651184,
+        "top1_text": "Sam goes by the name Sam.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-003",
+        "gold_rank": 1,
+        "gold_score": 0.1743495762348175,
+        "top1_score": 0.1743495762348175,
+        "top1_text": "The user's name is Riley.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-004",
+        "gold_rank": 1,
+        "gold_score": 0.29147881269454956,
+        "top1_score": 0.29147881269454956,
+        "top1_text": "The user's name is Jordan.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-001",
+        "gold_rank": 1,
+        "gold_score": 0.46046364307403564,
+        "top1_score": 0.46046364307403564,
+        "top1_text": "The user has a golden retriever named Apollo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-002",
+        "gold_rank": 1,
+        "gold_score": 0.5874672532081604,
+        "top1_score": 0.5874672532081604,
+        "top1_text": "The user has a cat named Luna.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-003",
+        "gold_rank": 1,
+        "gold_score": 0.43342381715774536,
+        "top1_score": 0.43342381715774536,
+        "top1_text": "The user's beagle puppy is female.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-001",
+        "gold_rank": 1,
+        "gold_score": 0.18614771962165833,
+        "top1_score": 0.18614771962165833,
+        "top1_text": "The user works as a software engineer at a startup.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-002",
+        "gold_rank": 1,
+        "gold_score": 0.2045476883649826,
+        "top1_score": 0.2045476883649826,
+        "top1_text": "The user is a high school chemistry teacher.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-003",
+        "gold_rank": 1,
+        "gold_score": 0.33529481291770935,
+        "top1_score": 0.33529481291770935,
+        "top1_text": "The user freelances as a graphic designer.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-001",
+        "gold_rank": 1,
+        "gold_score": 0.29566308856010437,
+        "top1_score": 0.29566308856010437,
+        "top1_text": "The user lives in Lisbon.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-002",
+        "gold_rank": 1,
+        "gold_score": 0.22115540504455566,
+        "top1_score": 0.22115540504455566,
+        "top1_text": "The user moved to Berlin last month.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-003",
+        "gold_rank": 1,
+        "gold_score": 0.3376256227493286,
+        "top1_score": 0.3376256227493286,
+        "top1_text": "The user is based in Toronto.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-001",
+        "gold_rank": 1,
+        "gold_score": 0.45976030826568604,
+        "top1_score": 0.45976030826568604,
+        "top1_text": "The user is vegetarian.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-002",
+        "gold_rank": 1,
+        "gold_score": 0.3858085870742798,
+        "top1_score": 0.3858085870742798,
+        "top1_text": "The user is severely allergic to peanuts.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-003",
+        "gold_rank": 1,
+        "gold_score": 0.30525368452072144,
+        "top1_score": 0.30525368452072144,
+        "top1_text": "The user does not drink coffee and only drinks tea.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-001",
+        "gold_rank": 1,
+        "gold_score": 0.3807893395423889,
+        "top1_score": 0.3807893395423889,
+        "top1_text": "The user plays classical piano.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-002",
+        "gold_rank": 1,
+        "gold_score": 0.3870759606361389,
+        "top1_score": 0.3870759606361389,
+        "top1_text": "The user's main sport is rock climbing.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-003",
+        "gold_rank": 1,
+        "gold_score": 0.26418396830558777,
+        "top1_score": 0.26418396830558777,
+        "top1_text": "The user has been knitting for about ten years.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-001",
+        "gold_rank": 1,
+        "gold_score": 0.28778010606765747,
+        "top1_score": 0.28778010606765747,
+        "top1_text": "The user has two children named Maya and Theo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-002",
+        "gold_rank": 1,
+        "gold_score": 0.5129045248031616,
+        "top1_score": 0.5129045248031616,
+        "top1_text": "The user's partner's name is Casey.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-003",
+        "gold_rank": 1,
+        "gold_score": 0.6258488893508911,
+        "top1_score": 0.6258488893508911,
+        "top1_text": "The user's mom lives in Vancouver.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-001",
+        "gold_rank": 1,
+        "gold_score": 0.33847203850746155,
+        "top1_score": 0.33847203850746155,
+        "top1_text": "The user drives a blue Subaru Outback.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-002",
+        "gold_rank": 1,
+        "gold_score": 0.48077383637428284,
+        "top1_score": 0.48077383637428284,
+        "top1_text": "The user does not own a car.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-001",
+        "gold_rank": 1,
+        "gold_score": 0.4538198709487915,
+        "top1_score": 0.4538198709487915,
+        "top1_text": "The user studied applied mathematics in college.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-002",
+        "gold_rank": 1,
+        "gold_score": 0.6818708181381226,
+        "top1_score": 0.6818708181381226,
+        "top1_text": "The user obtained an MBA from UCLA two years ago.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-001",
+        "gold_rank": 1,
+        "gold_score": 0.3037244975566864,
+        "top1_score": 0.3037244975566864,
+        "top1_text": "The user's main laptop is a 16-inch MacBook Pro.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-002",
+        "gold_rank": 1,
+        "gold_score": 0.3180965185165405,
+        "top1_score": 0.3180965185165405,
+        "top1_text": "The user prefers Neovim over VS Code.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-001",
+        "gold_rank": 1,
+        "gold_score": 0.33930978178977966,
+        "top1_score": 0.33930978178977966,
+        "top1_text": "The user has been getting into bluegrass music recently.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-002",
+        "gold_rank": 1,
+        "gold_score": 0.5527176260948181,
+        "top1_score": 0.5527176260948181,
+        "top1_text": "The user's all-time favorite band is Radiohead.",
+        "top1_is_meta": false
+      }
+    ]
+  },
+  "filtered": {
+    "recall_at_1": 1.0,
+    "recall_at_5": 1.0,
+    "gold_present_rate": 1.0,
+    "meta_at_top1": 0,
+    "per_item": [
+      {
+        "id": "name-001",
+        "gold_rank": 1,
+        "gold_score": 0.3960301876068115,
+        "top1_score": 0.3960301876068115,
+        "top1_text": "The user's name is Alex.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-002",
+        "gold_rank": 1,
+        "gold_score": 0.33899739384651184,
+        "top1_score": 0.33899739384651184,
+        "top1_text": "Sam goes by the name Sam.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-003",
+        "gold_rank": 1,
+        "gold_score": 0.1743495762348175,
+        "top1_score": 0.1743495762348175,
+        "top1_text": "The user's name is Riley.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "name-004",
+        "gold_rank": 1,
+        "gold_score": 0.29147881269454956,
+        "top1_score": 0.29147881269454956,
+        "top1_text": "The user's name is Jordan.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-001",
+        "gold_rank": 1,
+        "gold_score": 0.46046364307403564,
+        "top1_score": 0.46046364307403564,
+        "top1_text": "The user has a golden retriever named Apollo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-002",
+        "gold_rank": 1,
+        "gold_score": 0.5874672532081604,
+        "top1_score": 0.5874672532081604,
+        "top1_text": "The user has a cat named Luna.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "pet-003",
+        "gold_rank": 1,
+        "gold_score": 0.43342381715774536,
+        "top1_score": 0.43342381715774536,
+        "top1_text": "The user's beagle puppy is female.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-001",
+        "gold_rank": 1,
+        "gold_score": 0.18614771962165833,
+        "top1_score": 0.18614771962165833,
+        "top1_text": "The user works as a software engineer at a startup.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-002",
+        "gold_rank": 1,
+        "gold_score": 0.2045476883649826,
+        "top1_score": 0.2045476883649826,
+        "top1_text": "The user is a high school chemistry teacher.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "job-003",
+        "gold_rank": 1,
+        "gold_score": 0.33529481291770935,
+        "top1_score": 0.33529481291770935,
+        "top1_text": "The user freelances as a graphic designer.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-001",
+        "gold_rank": 1,
+        "gold_score": 0.29566308856010437,
+        "top1_score": 0.29566308856010437,
+        "top1_text": "The user lives in Lisbon.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-002",
+        "gold_rank": 1,
+        "gold_score": 0.22115540504455566,
+        "top1_score": 0.22115540504455566,
+        "top1_text": "The user moved to Berlin last month.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "city-003",
+        "gold_rank": 1,
+        "gold_score": 0.3376256227493286,
+        "top1_score": 0.3376256227493286,
+        "top1_text": "The user is based in Toronto.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-001",
+        "gold_rank": 1,
+        "gold_score": 0.45976030826568604,
+        "top1_score": 0.45976030826568604,
+        "top1_text": "The user is vegetarian.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-002",
+        "gold_rank": 1,
+        "gold_score": 0.3858085870742798,
+        "top1_score": 0.3858085870742798,
+        "top1_text": "The user is severely allergic to peanuts.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "food-003",
+        "gold_rank": 1,
+        "gold_score": 0.30525368452072144,
+        "top1_score": 0.30525368452072144,
+        "top1_text": "The user does not drink coffee and only drinks tea.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-001",
+        "gold_rank": 1,
+        "gold_score": 0.3807893395423889,
+        "top1_score": 0.3807893395423889,
+        "top1_text": "The user plays classical piano.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-002",
+        "gold_rank": 1,
+        "gold_score": 0.3870759606361389,
+        "top1_score": 0.3870759606361389,
+        "top1_text": "The user's main sport is rock climbing.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "hobby-003",
+        "gold_rank": 1,
+        "gold_score": 0.26418396830558777,
+        "top1_score": 0.26418396830558777,
+        "top1_text": "The user has been knitting for about ten years.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-001",
+        "gold_rank": 1,
+        "gold_score": 0.28778010606765747,
+        "top1_score": 0.28778010606765747,
+        "top1_text": "The user has two children named Maya and Theo.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-002",
+        "gold_rank": 1,
+        "gold_score": 0.5129045248031616,
+        "top1_score": 0.5129045248031616,
+        "top1_text": "The user's partner's name is Casey.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "family-003",
+        "gold_rank": 1,
+        "gold_score": 0.6258488893508911,
+        "top1_score": 0.6258488893508911,
+        "top1_text": "The user's mom lives in Vancouver.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-001",
+        "gold_rank": 1,
+        "gold_score": 0.33847203850746155,
+        "top1_score": 0.33847203850746155,
+        "top1_text": "The user drives a blue Subaru Outback.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "vehicle-002",
+        "gold_rank": 1,
+        "gold_score": 0.48077383637428284,
+        "top1_score": 0.48077383637428284,
+        "top1_text": "The user does not own a car.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-001",
+        "gold_rank": 1,
+        "gold_score": 0.4538198709487915,
+        "top1_score": 0.4538198709487915,
+        "top1_text": "The user studied applied mathematics in college.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "edu-002",
+        "gold_rank": 1,
+        "gold_score": 0.6818708181381226,
+        "top1_score": 0.6818708181381226,
+        "top1_text": "The user obtained an MBA from UCLA two years ago.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-001",
+        "gold_rank": 1,
+        "gold_score": 0.3037244975566864,
+        "top1_score": 0.3037244975566864,
+        "top1_text": "The user's main laptop is a 16-inch MacBook Pro.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "tech-002",
+        "gold_rank": 1,
+        "gold_score": 0.3180965185165405,
+        "top1_score": 0.3180965185165405,
+        "top1_text": "The user prefers Neovim over VS Code.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-001",
+        "gold_rank": 1,
+        "gold_score": 0.33930978178977966,
+        "top1_score": 0.33930978178977966,
+        "top1_text": "The user has been getting into bluegrass music recently.",
+        "top1_is_meta": false
+      },
+      {
+        "id": "music-002",
+        "gold_rank": 1,
+        "gold_score": 0.5527176260948181,
+        "top1_score": 0.5527176260948181,
+        "top1_text": "The user's all-time favorite band is Radiohead.",
+        "top1_is_meta": false
+      }
+    ]
+  },
+  "n": 30,
+  "rows": [
+    {
+      "id": "name-001",
+      "turns": [
+        "My name is Alex.",
+        "Got it."
+      ],
+      "query": "what is my name?",
+      "gold": "name is Alex",
+      "facts": [
+        "The user's name is Alex."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-002",
+      "turns": [
+        "I go by Sam.",
+        "OK Sam."
+      ],
+      "query": "what's my name?",
+      "gold": "go by Sam",
+      "facts": [
+        "Sam goes by the name Sam."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-003",
+      "turns": [
+        "You can call me Riley.",
+        "Hi Riley."
+      ],
+      "query": "what should you call me?",
+      "gold": "call me Riley",
+      "facts": [
+        "The user's name is Riley."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "name-004",
+      "turns": [
+        "I'm Jordan.",
+        "Nice to meet you."
+      ],
+      "query": "who am I?",
+      "gold": "Jordan",
+      "facts": [
+        "The user's name is Jordan."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-001",
+      "turns": [
+        "I have a golden retriever named Apollo.",
+        "How sweet."
+      ],
+      "query": "what is my dog's name?",
+      "gold": "Apollo",
+      "facts": [
+        "The user has a golden retriever named Apollo."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-002",
+      "turns": [
+        "My cat Luna sleeps on my keyboard.",
+        "Classic cat."
+      ],
+      "query": "what's my cat's name?",
+      "gold": "Luna",
+      "facts": [
+        "The user has a cat named Luna.",
+        "Luna sleeps on the user's keyboard."
+      ],
+      "meta_mask": [
+        false,
+        false
+      ],
+      "n_facts": 2,
+      "n_meta": 0
+    },
+    {
+      "id": "pet-003",
+      "turns": [
+        "I just adopted a beagle puppy. Her name is Penny.",
+        "Congrats!"
+      ],
+      "query": "what kind of dog do I have?",
+      "gold": "beagle",
+      "facts": [
+        "The user recently adopted a beagle puppy named Penny.",
+        "The user's beagle puppy is female."
+      ],
+      "meta_mask": [
+        false,
+        false
+      ],
+      "n_facts": 2,
+      "n_meta": 0
+    },
+    {
+      "id": "job-001",
+      "turns": [
+        "I work as a software engineer at a startup.",
+        "Cool field."
+      ],
+      "query": "what do I do for work?",
+      "gold": "software engineer",
+      "facts": [
+        "The user works as a software engineer at a startup."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "job-002",
+      "turns": [
+        "I'm a high school chemistry teacher.",
+        "That's important work."
+      ],
+      "query": "what is my profession?",
+      "gold": "chemistry teacher",
+      "facts": [
+        "The user is a high school chemistry teacher."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "job-003",
+      "turns": [
+        "I freelance as a graphic designer.",
+        "Nice."
+      ],
+      "query": "what's my job?",
+      "gold": "graphic designer",
+      "facts": [
+        "The user freelances as a graphic designer."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-001",
+      "turns": [
+        "I live in Lisbon now.",
+        "Beautiful city."
+      ],
+      "query": "where do I live?",
+      "gold": "Lisbon",
+      "facts": [
+        "The user lives in Lisbon."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-002",
+      "turns": [
+        "I just moved to Berlin last month.",
+        "Welcome to Berlin."
+      ],
+      "query": "what city am I in?",
+      "gold": "Berlin",
+      "facts": [
+        "The user moved to Berlin last month."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "city-003",
+      "turns": [
+        "I'm based in Toronto.",
+        "Cold this time of year."
+      ],
+      "query": "where am I located?",
+      "gold": "Toronto",
+      "facts": [
+        "The user is based in Toronto."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-001",
+      "turns": [
+        "I'm vegetarian.",
+        "Got it."
+      ],
+      "query": "do I eat meat?",
+      "gold": "vegetarian",
+      "facts": [
+        "The user is vegetarian."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-002",
+      "turns": [
+        "I'm severely allergic to peanuts.",
+        "Noted, will avoid."
+      ],
+      "query": "do I have any allergies?",
+      "gold": "peanut",
+      "facts": [
+        "The user is severely allergic to peanuts."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "food-003",
+      "turns": [
+        "I don't drink coffee \u2014 only tea.",
+        "Tea is great too."
+      ],
+      "query": "what do I drink in the morning?",
+      "gold": "tea",
+      "facts": [
+        "The user does not drink coffee and only drinks tea."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-001",
+      "turns": [
+        "I play classical piano.",
+        "Lovely hobby."
+      ],
+      "query": "what instrument do I play?",
+      "gold": "piano",
+      "facts": [
+        "The user plays classical piano."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-002",
+      "turns": [
+        "My main sport is rock climbing.",
+        "Cool."
+      ],
+      "query": "what sport do I do?",
+      "gold": "rock climbing",
+      "facts": [
+        "The user's main sport is rock climbing."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "hobby-003",
+      "turns": [
+        "I've been knitting for about ten years.",
+        "Impressive."
+      ],
+      "query": "what's a hobby I have?",
+      "gold": "knitting",
+      "facts": [
+        "The user has been knitting for about ten years."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-001",
+      "turns": [
+        "I have two kids, Maya and Theo.",
+        "What ages?"
+      ],
+      "query": "how many children do I have?",
+      "gold": "two",
+      "facts": [
+        "The user has two children named Maya and Theo."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-002",
+      "turns": [
+        "My partner's name is Casey.",
+        "Nice."
+      ],
+      "query": "who is my partner?",
+      "gold": "Casey",
+      "facts": [
+        "The user's partner's name is Casey."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "family-003",
+      "turns": [
+        "My mom lives in Vancouver.",
+        "Far from you?"
+      ],
+      "query": "where does my mom live?",
+      "gold": "Vancouver",
+      "facts": [
+        "The user's mom lives in Vancouver."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "vehicle-001",
+      "turns": [
+        "I drive a blue Subaru Outback.",
+        "Reliable car."
+      ],
+      "query": "what kind of car do I have?",
+      "gold": "Subaru",
+      "facts": [
+        "The user drives a blue Subaru Outback."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "vehicle-002",
+      "turns": [
+        "I don't own a car. I bike everywhere.",
+        "Healthy lifestyle."
+      ],
+      "query": "do I have a car?",
+      "gold": "does not own",
+      "facts": [
+        "The user does not own a car.",
+        "The user bikes everywhere for transportation."
+      ],
+      "meta_mask": [
+        false,
+        false
+      ],
+      "n_facts": 2,
+      "n_meta": 0
+    },
+    {
+      "id": "edu-001",
+      "turns": [
+        "I studied applied mathematics in college.",
+        "Tough major."
+      ],
+      "query": "what was my major?",
+      "gold": "applied mathematics",
+      "facts": [
+        "The user studied applied mathematics in college."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "edu-002",
+      "turns": [
+        "I got my MBA from UCLA two years ago.",
+        "Congrats."
+      ],
+      "query": "where did I get my MBA?",
+      "gold": "UCLA",
+      "facts": [
+        "The user obtained an MBA from UCLA two years ago."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "tech-001",
+      "turns": [
+        "My main laptop is a 16-inch MacBook Pro.",
+        "Solid machine."
+      ],
+      "query": "what computer do I use?",
+      "gold": "MacBook",
+      "facts": [
+        "The user's main laptop is a 16-inch MacBook Pro."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "tech-002",
+      "turns": [
+        "I prefer Neovim over VS Code.",
+        "Editor preferences are personal."
+      ],
+      "query": "what editor do I use?",
+      "gold": "Neovim",
+      "facts": [
+        "The user prefers Neovim over VS Code."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "music-001",
+      "turns": [
+        "I've been getting into bluegrass lately.",
+        "Fun genre."
+      ],
+      "query": "what music am I into these days?",
+      "gold": "bluegrass",
+      "facts": [
+        "The user has been getting into bluegrass music recently."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    },
+    {
+      "id": "music-002",
+      "turns": [
+        "My all-time favorite band is Radiohead.",
+        "Great band."
+      ],
+      "query": "what's my favorite band?",
+      "gold": "Radiohead",
+      "facts": [
+        "The user's all-time favorite band is Radiohead."
+      ],
+      "meta_mask": [
+        false
+      ],
+      "n_facts": 1,
+      "n_meta": 0
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/dual-storage.json b/benchmarks/alignbench/runs/dual-storage.json
new file mode 100644
index 0000000..ba852cd
--- /dev/null
+++ b/benchmarks/alignbench/runs/dual-storage.json
@@ -0,0 +1,990 @@
+{
+  "variant": "dual-storage",
+  "model": "Xenova/all-MiniLM-L6-v2",
+  "topk": 5,
+  "wall_seconds": "0.5",
+  "composite": {
+    "recall_at_1": 0.7833333333333333,
+    "recall_at_5": 0.9333333333333333,
+    "distractor_top1_rate": 0.06666666666666667,
+    "n": 60
+  },
+  "false_positive_rate": 0,
+  "per_axis": [
+    {
+      "axis": "pronoun",
+      "n": 20,
+      "pool_size": 108,
+      "recall_at_1": 0.8,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.2,
+      "distractor_at_top1": 2,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.027444863118702423,
+      "items": [
+        {
+          "id": "pronoun-001",
+          "query": "what is my name?",
+          "effective_query": "what is my name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 1,
+          "gold_score": 0.6464170828870618,
+          "top1_text": "My name is Alex.",
+          "top1_score": 0.6464170828870618,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-002",
+          "query": "what's the user's name?",
+          "effective_query": "what's the user's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.7294812723464028,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.7771254660839599,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-003",
+          "query": "who am I?",
+          "effective_query": "who am I?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.3799511962658216,
+          "top1_text": "I am me.",
+          "top1_score": 0.5210136429765079,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-004",
+          "query": "what do I do for work?",
+          "effective_query": "what do I do for work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 1,
+          "gold_score": 0.4066653080358072,
+          "top1_text": "I works as a software engineer at Acme.",
+          "top1_score": 0.4066653080358072,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-005",
+          "query": "what is the user's job?",
+          "effective_query": "what is the user's job?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 2,
+          "gold_score": 0.5754363621337681,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.5767198521038163,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-006",
+          "query": "where does the user work?",
+          "effective_query": "where does the user work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 1,
+          "gold_score": 0.5436570101621021,
+          "top1_text": "The user works as a software engineer at Acme.",
+          "top1_score": 0.5436570101621021,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-007",
+          "query": "what is my dog's name?",
+          "effective_query": "what is my dog's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.7021426325090143,
+          "top1_text": "My dog is named Apollo.",
+          "top1_score": 0.7021426325090143,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-008",
+          "query": "who is Apollo?",
+          "effective_query": "who is Apollo?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.7072701654355713,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.7072701654355713,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-009",
+          "query": "where do I live?",
+          "effective_query": "where do I live?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.46363626173102523,
+          "top1_text": "I lives in Lisbon.",
+          "top1_score": 0.46363626173102523,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-010",
+          "query": "what city does the user live in?",
+          "effective_query": "what city does the user live in?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.6400263303955707,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.6400263303955707,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-011",
+          "query": "when is my birthday?",
+          "effective_query": "when is my birthday?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.7775242455582675,
+          "top1_text": "My birthday is March 14.",
+          "top1_score": 0.7775242455582675,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-012",
+          "query": "when was the user born?",
+          "effective_query": "when was the user born?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.7251037734927323,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.7251037734927323,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-013",
+          "query": "do I have kids?",
+          "effective_query": "do I have kids?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.38989854641830624,
+          "top1_text": "I have two children, Maya and Theo.",
+          "top1_score": 0.38989854641830624,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-014",
+          "query": "how many children does the user have?",
+          "effective_query": "how many children does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.5766094762746933,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.5766094762746933,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-015",
+          "query": "what is my usual coffee order?",
+          "effective_query": "what is my usual coffee order?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 1,
+          "gold_score": 0.6405250896286532,
+          "top1_text": "My favorite coffee order is an oat-milk flat white.",
+          "top1_score": 0.6405250896286532,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-016",
+          "query": "what coffee does the user drink?",
+          "effective_query": "what coffee does the user drink?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 2,
+          "gold_score": 0.5813657391451515,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.7197601756319,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-017",
+          "query": "what did I study?",
+          "effective_query": "what did I study?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#7",
+          "gold_rank": 1,
+          "gold_score": 0.6096865792083074,
+          "top1_text": "I studied applied mathematics at university.",
+          "top1_score": 0.6096865792083074,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-018",
+          "query": "do I have any allergies?",
+          "effective_query": "do I have any allergies?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.545551579531866,
+          "top1_text": "I am allergic to peanuts.",
+          "top1_score": 0.545551579531866,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-019",
+          "query": "what is the user allergic to?",
+          "effective_query": "what is the user allergic to?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.7261582549960168,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.7261582549960168,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-020",
+          "query": "what kind of car do I drive?",
+          "effective_query": "what kind of car do I drive?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#9",
+          "gold_rank": 1,
+          "gold_score": 0.5645994319440694,
+          "top1_text": "I drives a 2019 Toyota Corolla.",
+          "top1_score": 0.5645994319440694,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "temporal",
+      "n": 14,
+      "pool_size": 108,
+      "recall_at_1": 0.5,
+      "recall_at_5": 0.7142857142857143,
+      "mean_gold_rank": 5.5,
+      "distractor_at_top1": 2,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.049944619619024966,
+      "items": [
+        {
+          "id": "temporal-001",
+          "query": "where does the user live now?",
+          "effective_query": "where does the user live now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 3,
+          "gold_score": 0.550366425409688,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.5504016420681116,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-002",
+          "query": "where did the user used to live?",
+          "effective_query": "where did the user used to live?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#1",
+          "gold_rank": 1,
+          "gold_score": 0.5490901457590983,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.5490901457590983,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-003",
+          "query": "when did the user move?",
+          "effective_query": "when did the user move?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#2",
+          "gold_rank": 1,
+          "gold_score": 0.5076513080537272,
+          "top1_text": "The user moved from Berlin to Lisbon in 2024.",
+          "top1_score": 0.5076513080537272,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-004",
+          "query": "where is the user currently based?",
+          "effective_query": "where is the user currently based?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 11,
+          "gold_score": 0.42833906054622706,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.4962051712532369,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-005",
+          "query": "what is the user reading?",
+          "effective_query": "what is the user reading?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#3",
+          "gold_rank": 8,
+          "gold_score": 0.4600491260020972,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 0.5634334413727431,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-006",
+          "query": "what did the user read last year?",
+          "effective_query": "what did the user read last year?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#4",
+          "gold_rank": 1,
+          "gold_score": 0.5028289729480597,
+          "top1_text": "Last year the user read 'Project Hail Mary'.",
+          "top1_score": 0.5028289729480597,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-007",
+          "query": "what is the user working on these days?",
+          "effective_query": "what is the user working on these days?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 4,
+          "gold_score": 0.4513265913655406,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.48543171981436045,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-008",
+          "query": "what did the user finish last month?",
+          "effective_query": "what did the user finish last month?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#6",
+          "gold_rank": 1,
+          "gold_score": 0.5544356167423142,
+          "top1_text": "The user finished the Sprint-4 reranker training last month.",
+          "top1_score": 0.5544356167423142,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-009",
+          "query": "what LLM does the user prefer?",
+          "effective_query": "what LLM does the user prefer?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 1,
+          "gold_score": 0.6226655286181774,
+          "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.",
+          "top1_score": 0.6226655286181774,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-010",
+          "query": "which model did the user use before?",
+          "effective_query": "which model did the user use before?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#8",
+          "gold_rank": 1,
+          "gold_score": 0.5205286761987613,
+          "top1_text": "The user used GPT-4 as their primary model in 2024.",
+          "top1_score": 0.5205286761987613,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-011",
+          "query": "did the user get a new phone recently?",
+          "effective_query": "did the user get a new phone recently?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#9",
+          "gold_rank": 1,
+          "gold_score": 0.6113316689987077,
+          "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.",
+          "top1_score": 0.6113316689987077,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-012",
+          "query": "is the user still in Berlin?",
+          "effective_query": "is the user still in Berlin?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 5,
+          "gold_score": 0.4089445689504921,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.7214467722165934,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-013",
+          "query": "what is the user up to?",
+          "effective_query": "what is the user up to?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 19,
+          "gold_score": 0.2672680179526975,
+          "top1_text": "The user is asking a question.",
+          "top1_score": 0.5409891051249414,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-014",
+          "query": "which model is the user on right now?",
+          "effective_query": "which model is the user on right now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 20,
+          "gold_score": 0.31088477125594804,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.40861487066233426,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "specificity",
+      "n": 14,
+      "pool_size": 108,
+      "recall_at_1": 0.8571428571428571,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.2142857142857142,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.15120813892175938,
+      "items": [
+        {
+          "id": "specificity-001",
+          "query": "tell me about my dog",
+          "effective_query": "tell me about my dog",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 3,
+          "gold_score": 0.4727109277650852,
+          "top1_text": "I have a dog named Apollo.",
+          "top1_score": 0.5132527569891849,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-002",
+          "query": "what kind of pet do I have?",
+          "effective_query": "what kind of pet do I have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 2,
+          "gold_score": 0.43959063553360933,
+          "top1_text": "I have a dog named Apollo.",
+          "top1_score": 0.5181634785804181,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-003",
+          "query": "do I own a bike?",
+          "effective_query": "do I own a bike?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.6758443024030343,
+          "top1_text": "I owns a Bianchi road bike.",
+          "top1_score": 0.6758443024030343,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-004",
+          "query": "what brand of bike does the user have?",
+          "effective_query": "what brand of bike does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.6329297203833848,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.6329297203833848,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-005",
+          "query": "what computer do I use?",
+          "effective_query": "what computer do I use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.33136554535868123,
+          "top1_text": "My primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.33136554535868123,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-006",
+          "query": "what laptop does the user have?",
+          "effective_query": "what laptop does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.6450468635073591,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.6450468635073591,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-007",
+          "query": "do I have any musical instruments?",
+          "effective_query": "do I have any musical instruments?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#3",
+          "gold_rank": 1,
+          "gold_score": 0.3865674783833039,
+          "top1_text": "I have a Yamaha P-125 digital piano in the living room.",
+          "top1_score": 0.3865674783833039,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-008",
+          "query": "which note-taking app do I use?",
+          "effective_query": "which note-taking app do I use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#4",
+          "gold_rank": 1,
+          "gold_score": 0.45506558306415273,
+          "top1_text": "I uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.45506558306415273,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-009",
+          "query": "where do I like to eat in Lisbon?",
+          "effective_query": "where do I like to eat in Lisbon?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#5",
+          "gold_rank": 1,
+          "gold_score": 0.7771199647073028,
+          "top1_text": "My favorite restaurant in Lisbon is Cervejaria Ramiro.",
+          "top1_score": 0.7771199647073028,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-010",
+          "query": "what brand sunglasses does the user wear?",
+          "effective_query": "what brand sunglasses does the user wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#6",
+          "gold_rank": 1,
+          "gold_score": 0.6512271965052092,
+          "top1_text": "The user wears Smith Lowdown sunglasses.",
+          "top1_score": 0.6512271965052092,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-011",
+          "query": "do I read on a Kindle?",
+          "effective_query": "do I read on a Kindle?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#7",
+          "gold_rank": 1,
+          "gold_score": 0.7110555603582231,
+          "top1_text": "I reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 0.7110555603582231,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-012",
+          "query": "what espresso machine does the user own?",
+          "effective_query": "what espresso machine does the user own?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#8",
+          "gold_rank": 1,
+          "gold_score": 0.7366961226077487,
+          "top1_text": "The user's home espresso machine is a Lelit Bianca v3.",
+          "top1_score": 0.7366961226077487,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-013",
+          "query": "what shoes do I wear?",
+          "effective_query": "what shoes do I wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.42414870117196674,
+          "top1_text": "I wears Allbirds Wool Runners daily.",
+          "top1_score": 0.42414870117196674,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-014",
+          "query": "what brand are the user's everyday shoes?",
+          "effective_query": "what brand are the user's everyday shoes?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.4238019274298934,
+          "top1_text": "I wears Allbirds Wool Runners daily.",
+          "top1_score": 0.4238019274298934,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "negation",
+      "n": 12,
+      "pool_size": 108,
+      "recall_at_1": 1,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.26762270427014134,
+      "items": [
+        {
+          "id": "negation-001",
+          "query": "does the user drink coffee?",
+          "effective_query": "does the user drink coffee?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 0.7905096599561829,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.7905096599561829,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-002",
+          "query": "what does the user drink in the morning?",
+          "effective_query": "what does the user drink in the morning?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 0.5285974920275435,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.5285974920275435,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-003",
+          "query": "am I vegetarian?",
+          "effective_query": "am I vegetarian?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#1",
+          "gold_rank": 1,
+          "gold_score": 0.7051238708423307,
+          "top1_text": "I am not vegetarian, but avoids red meat.",
+          "top1_score": 0.7051238708423307,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-004",
+          "query": "is the user on Twitter?",
+          "effective_query": "is the user on Twitter?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.5874304481012608,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.5874304481012608,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-005",
+          "query": "which social networks does the user use?",
+          "effective_query": "which social networks does the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.5067702908301415,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.5067702908301415,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-006",
+          "query": "does the user own a car?",
+          "effective_query": "does the user own a car?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#3",
+          "gold_rank": 1,
+          "gold_score": 0.7488288864422115,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 0.7488288864422115,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-007",
+          "query": "is the user active on LinkedIn?",
+          "effective_query": "is the user active on LinkedIn?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#4",
+          "gold_rank": 1,
+          "gold_score": 0.7880239246717338,
+          "top1_text": "The user is not on LinkedIn anymore.",
+          "top1_score": 0.7880239246717338,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-008",
+          "query": "any foods the user hates?",
+          "effective_query": "any foods the user hates?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#5",
+          "gold_rank": 1,
+          "gold_score": 0.46360991866022544,
+          "top1_text": "I dislikes cilantro intensely.",
+          "top1_score": 0.46360991866022544,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-009",
+          "query": "has the user traveled to Asia?",
+          "effective_query": "has the user traveled to Asia?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#6",
+          "gold_rank": 1,
+          "gold_score": 0.6919597377462416,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 0.6919597377462416,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-010",
+          "query": "does the user like horror movies?",
+          "effective_query": "does the user like horror movies?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#7",
+          "gold_rank": 1,
+          "gold_score": 0.7303882056719367,
+          "top1_text": "The user does not enjoy horror movies.",
+          "top1_score": 0.7303882056719367,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-011",
+          "query": "can the user eat shrimp?",
+          "effective_query": "can the user eat shrimp?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#8",
+          "gold_rank": 1,
+          "gold_score": 0.7044731236020111,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.7044731236020111,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-012",
+          "query": "is the user learning a new language?",
+          "effective_query": "is the user learning a new language?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#9",
+          "gold_rank": 1,
+          "gold_score": 0.6522323599643379,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 0.6522323599643379,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "control",
+      "n": 10,
+      "pool_size": 108,
+      "recall_at_1": 0,
+      "recall_at_5": 0,
+      "mean_gold_rank": null,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": null,
+      "items": [
+        {
+          "id": "control-001",
+          "query": "what is the airspeed velocity of an unladen swallow?",
+          "effective_query": "what is the airspeed velocity of an unladen swallow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "I am asking a question.",
+          "top1_score": 0.25122827069301523,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "control-002",
+          "query": "who is the current president of France?",
+          "effective_query": "who is the current president of France?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.2564335064669229,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-003",
+          "query": "what is the capital of Mongolia?",
+          "effective_query": "what is the capital of Mongolia?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "I have never been to Asia.",
+          "top1_score": 0.3106507170301111,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-004",
+          "query": "how does photosynthesis work?",
+          "effective_query": "how does photosynthesis work?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "I am asking a question.",
+          "top1_score": 0.15843567725842153,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "control-005",
+          "query": "translate 'goodnight' to Japanese",
+          "effective_query": "translate 'goodnight' to Japanese",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "I am not currently learning any new languages.",
+          "top1_score": 0.2331788870062421,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-006",
+          "query": "what year did World War II end?",
+          "effective_query": "what year did World War II end?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "Before 2024, I lived in Berlin.",
+          "top1_score": 0.32252117603205516,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-007",
+          "query": "explain entropy in thermodynamics",
+          "effective_query": "explain entropy in thermodynamics",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.18512903871088907,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-008",
+          "query": "best way to debug a segfault in C",
+          "effective_query": "best way to debug a segfault in C",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "I am currently working on a memory benchmark project.",
+          "top1_score": 0.24122260587509706,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-009",
+          "query": "what's the weather going to be tomorrow?",
+          "effective_query": "what's the weather going to be tomorrow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of May 14, 2026, I am a term mentioned in the conversation.",
+          "top1_score": 0.25613897573144834,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "control-010",
+          "query": "give me a recipe for tiramisu",
+          "effective_query": "give me a recipe for tiramisu",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "My favorite restaurant is Cervejaria Ramiro.",
+          "top1_score": 0.32863639833819,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/hybrid-bm25.json b/benchmarks/alignbench/runs/hybrid-bm25.json
new file mode 100644
index 0000000..f199f6a
--- /dev/null
+++ b/benchmarks/alignbench/runs/hybrid-bm25.json
@@ -0,0 +1,990 @@
+{
+  "variant": "hybrid-bm25",
+  "model": "Xenova/all-MiniLM-L6-v2",
+  "topk": 5,
+  "wall_seconds": "0.3",
+  "composite": {
+    "recall_at_1": 0.6166666666666667,
+    "recall_at_5": 0.9166666666666666,
+    "distractor_top1_rate": 0.06666666666666667,
+    "n": 60
+  },
+  "false_positive_rate": 1,
+  "per_axis": [
+    {
+      "axis": "pronoun",
+      "n": 20,
+      "pool_size": 55,
+      "recall_at_1": 0.6,
+      "recall_at_5": 0.95,
+      "mean_gold_rank": 1.95,
+      "distractor_at_top1": 2,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.03374568959009672,
+      "items": [
+        {
+          "id": "pronoun-001",
+          "query": "what is my name?",
+          "effective_query": "what is my name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-002",
+          "query": "what's the user's name?",
+          "effective_query": "what's the user's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.9576851044161381,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.958092359862655,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-003",
+          "query": "who am I?",
+          "effective_query": "who am I?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.4284196416000849,
+          "top1_text": "The user is me.",
+          "top1_score": 0.6,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-004",
+          "query": "what do I do for work?",
+          "effective_query": "what do I do for work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 3,
+          "gold_score": 0.5811840395869337,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.854461597488777,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-005",
+          "query": "what is the user's job?",
+          "effective_query": "what is the user's job?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 10,
+          "gold_score": 0.6045280628239001,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.7953675650047598,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-006",
+          "query": "where does the user work?",
+          "effective_query": "where does the user work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 4,
+          "gold_score": 0.6056863546267723,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.8849990304585116,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-007",
+          "query": "what is my dog's name?",
+          "effective_query": "what is my dog's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.9958263380031263,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.9958263380031263,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-008",
+          "query": "who is Apollo?",
+          "effective_query": "who is Apollo?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-009",
+          "query": "where do I live?",
+          "effective_query": "where do I live?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 2,
+          "gold_score": 0.5993887537891871,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.6,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-010",
+          "query": "what city does the user live in?",
+          "effective_query": "what city does the user live in?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.877017959407961,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.877017959407961,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-011",
+          "query": "when is my birthday?",
+          "effective_query": "when is my birthday?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-012",
+          "query": "when was the user born?",
+          "effective_query": "when was the user born?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.6052989966186167,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.6052989966186167,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-013",
+          "query": "do I have kids?",
+          "effective_query": "do I have kids?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.6,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.6,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-014",
+          "query": "how many children does the user have?",
+          "effective_query": "how many children does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-015",
+          "query": "what is my usual coffee order?",
+          "effective_query": "what is my usual coffee order?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's favorite coffee order is an oat-milk flat white.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-016",
+          "query": "what coffee does the user drink?",
+          "effective_query": "what coffee does the user drink?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 2,
+          "gold_score": 0.6025728313819994,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-017",
+          "query": "what did I study?",
+          "effective_query": "what did I study?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#7",
+          "gold_rank": 1,
+          "gold_score": 0.6,
+          "top1_text": "The user studied applied mathematics at university.",
+          "top1_score": 0.6,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-018",
+          "query": "do I have any allergies?",
+          "effective_query": "do I have any allergies?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.6,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.6,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-019",
+          "query": "what is the user allergic to?",
+          "effective_query": "what is the user allergic to?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-020",
+          "query": "what kind of car do I drive?",
+          "effective_query": "what kind of car do I drive?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#9",
+          "gold_rank": 2,
+          "gold_score": 0.6,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 0.9330272288700376,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "temporal",
+      "n": 14,
+      "pool_size": 55,
+      "recall_at_1": 0.5,
+      "recall_at_5": 0.7857142857142857,
+      "mean_gold_rank": 5.142857142857143,
+      "distractor_at_top1": 2,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.18738929012250227,
+      "items": [
+        {
+          "id": "temporal-001",
+          "query": "where does the user live now?",
+          "effective_query": "where does the user live now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 4,
+          "gold_score": 0.6066879474966196,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.6348007612409563,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-002",
+          "query": "where did the user used to live?",
+          "effective_query": "where did the user used to live?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#1",
+          "gold_rank": 4,
+          "gold_score": 0.606449574798833,
+          "top1_text": "The user moved from Berlin to Lisbon in 2024.",
+          "top1_score": 0.754314259838859,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-003",
+          "query": "when did the user move?",
+          "effective_query": "when did the user move?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#2",
+          "gold_rank": 2,
+          "gold_score": 0.8537709758404959,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.9047816753185527,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-004",
+          "query": "where is the user currently based?",
+          "effective_query": "where is the user currently based?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 15,
+          "gold_score": 0.5204413965610561,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 0.7399852806383946,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-005",
+          "query": "what is the user reading?",
+          "effective_query": "what is the user reading?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#3",
+          "gold_rank": 1,
+          "gold_score": 0.8769262331140877,
+          "top1_text": "As of April 2026, the user is reading 'The Power Broker'.",
+          "top1_score": 0.8769262331140877,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-006",
+          "query": "what did the user read last year?",
+          "effective_query": "what did the user read last year?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#4",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "Last year the user read 'Project Hail Mary'.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-007",
+          "query": "what is the user working on these days?",
+          "effective_query": "what is the user working on these days?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 1,
+          "gold_score": 0.9501075843236121,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.9501075843236121,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-008",
+          "query": "what did the user finish last month?",
+          "effective_query": "what did the user finish last month?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#6",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user finished the Sprint-4 reranker training last month.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-009",
+          "query": "what LLM does the user prefer?",
+          "effective_query": "what LLM does the user prefer?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 1,
+          "gold_score": 0.7943511233809831,
+          "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.",
+          "top1_score": 0.7943511233809831,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-010",
+          "query": "which model did the user use before?",
+          "effective_query": "which model did the user use before?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#8",
+          "gold_rank": 1,
+          "gold_score": 0.9153791248018733,
+          "top1_text": "The user used GPT-4 as their primary model in 2024.",
+          "top1_score": 0.9153791248018733,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-011",
+          "query": "did the user get a new phone recently?",
+          "effective_query": "did the user get a new phone recently?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#9",
+          "gold_rank": 1,
+          "gold_score": 0.9454096158904209,
+          "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.",
+          "top1_score": 0.9454096158904209,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-012",
+          "query": "is the user still in Berlin?",
+          "effective_query": "is the user still in Berlin?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 5,
+          "gold_score": 0.454434056359795,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-013",
+          "query": "what is the user up to?",
+          "effective_query": "what is the user up to?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 18,
+          "gold_score": 0.3782959535726875,
+          "top1_text": "The user is asking a question.",
+          "top1_score": 0.7138787297189697,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-014",
+          "query": "which model is the user on right now?",
+          "effective_query": "which model is the user on right now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 17,
+          "gold_score": 0.5029854069075197,
+          "top1_text": "The user used GPT-4 as their primary model in 2024.",
+          "top1_score": 0.8573596067886893,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "specificity",
+      "n": 14,
+      "pool_size": 55,
+      "recall_at_1": 0.5,
+      "recall_at_5": 0.9285714285714286,
+      "mean_gold_rank": 1.8571428571428572,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.009640204109421346,
+      "items": [
+        {
+          "id": "specificity-001",
+          "query": "tell me about my dog",
+          "effective_query": "tell me about my dog",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 3,
+          "gold_score": 0.7584346404038529,
+          "top1_text": "The user has a dog named Apollo.",
+          "top1_score": 0.8530735681861806,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-002",
+          "query": "what kind of pet do I have?",
+          "effective_query": "what kind of pet do I have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 2,
+          "gold_score": 0.5555778772988805,
+          "top1_text": "The user has a dog named Apollo.",
+          "top1_score": 0.6,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-003",
+          "query": "do I own a bike?",
+          "effective_query": "do I own a bike?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 2,
+          "gold_score": 0.8918705240259431,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 0.9056616692307067,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-004",
+          "query": "what brand of bike does the user have?",
+          "effective_query": "what brand of bike does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.9108344961613679,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.9108344961613679,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-005",
+          "query": "what computer do I use?",
+          "effective_query": "what computer do I use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 2,
+          "gold_score": 0.6,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.7243317748899458,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-006",
+          "query": "what laptop does the user have?",
+          "effective_query": "what laptop does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-007",
+          "query": "do I have any musical instruments?",
+          "effective_query": "do I have any musical instruments?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#3",
+          "gold_rank": 2,
+          "gold_score": 0.6,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 0.7662387492993549,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-008",
+          "query": "which note-taking app do I use?",
+          "effective_query": "which note-taking app do I use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#4",
+          "gold_rank": 2,
+          "gold_score": 0.6,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.6979807773255753,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-009",
+          "query": "where do I like to eat in Lisbon?",
+          "effective_query": "where do I like to eat in Lisbon?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#5",
+          "gold_rank": 1,
+          "gold_score": 0.8290638598887166,
+          "top1_text": "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.",
+          "top1_score": 0.8290638598887166,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-010",
+          "query": "what brand sunglasses does the user wear?",
+          "effective_query": "what brand sunglasses does the user wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#6",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user wears Smith Lowdown sunglasses.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-011",
+          "query": "do I read on a Kindle?",
+          "effective_query": "do I read on a Kindle?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#7",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-012",
+          "query": "what espresso machine does the user own?",
+          "effective_query": "what espresso machine does the user own?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#8",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user's home espresso machine is a Lelit Bianca v3.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-013",
+          "query": "what shoes do I wear?",
+          "effective_query": "what shoes do I wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.6,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.6,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-014",
+          "query": "what brand are the user's everyday shoes?",
+          "effective_query": "what brand are the user's everyday shoes?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 6,
+          "gold_score": 0.6116802638773472,
+          "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.",
+          "top1_score": 0.7130016116548469,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "negation",
+      "n": 12,
+      "pool_size": 55,
+      "recall_at_1": 0.9166666666666666,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.1666666666666667,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.5607048274399469,
+      "items": [
+        {
+          "id": "negation-001",
+          "query": "does the user drink coffee?",
+          "effective_query": "does the user drink coffee?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-002",
+          "query": "what does the user drink in the morning?",
+          "effective_query": "what does the user drink in the morning?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-003",
+          "query": "am I vegetarian?",
+          "effective_query": "am I vegetarian?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#1",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user is not vegetarian, but avoids red meat.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-004",
+          "query": "is the user on Twitter?",
+          "effective_query": "is the user on Twitter?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.9150475533063944,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.9150475533063944,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-005",
+          "query": "which social networks does the user use?",
+          "effective_query": "which social networks does the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-006",
+          "query": "does the user own a car?",
+          "effective_query": "does the user own a car?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#3",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-007",
+          "query": "is the user active on LinkedIn?",
+          "effective_query": "is the user active on LinkedIn?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#4",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user is not on LinkedIn anymore.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-008",
+          "query": "any foods the user hates?",
+          "effective_query": "any foods the user hates?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#5",
+          "gold_rank": 3,
+          "gold_score": 0.5952157764050423,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 0.6185955328000405,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-009",
+          "query": "has the user traveled to Asia?",
+          "effective_query": "has the user traveled to Asia?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#6",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-010",
+          "query": "does the user like horror movies?",
+          "effective_query": "does the user like horror movies?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#7",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not enjoy horror movies.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-011",
+          "query": "can the user eat shrimp?",
+          "effective_query": "can the user eat shrimp?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#8",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-012",
+          "query": "is the user learning a new language?",
+          "effective_query": "is the user learning a new language?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#9",
+          "gold_rank": 1,
+          "gold_score": 1,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 1,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "control",
+      "n": 10,
+      "pool_size": 55,
+      "recall_at_1": 0,
+      "recall_at_5": 0,
+      "mean_gold_rank": null,
+      "distractor_at_top1": 0,
+      "false_positive_count": 10,
+      "median_gold_margin": null,
+      "items": [
+        {
+          "id": "control-001",
+          "query": "what is the airspeed velocity of an unladen swallow?",
+          "effective_query": "what is the airspeed velocity of an unladen swallow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user's favorite coffee order is an oat-milk flat white.",
+          "top1_score": 0.7351279172813607,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-002",
+          "query": "who is the current president of France?",
+          "effective_query": "who is the current president of France?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.9123350671733821,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-003",
+          "query": "what is the capital of Mongolia?",
+          "effective_query": "what is the capital of Mongolia?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of May 14, 2026, Apollo is a term mentioned in the conversation.",
+          "top1_score": 0.7795522818480163,
+          "top1_is_distractor": true,
+          "false_positive": true
+        },
+        {
+          "id": "control-004",
+          "query": "how does photosynthesis work?",
+          "effective_query": "how does photosynthesis work?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.8130935993675803,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-005",
+          "query": "translate 'goodnight' to Japanese",
+          "effective_query": "translate 'goodnight' to Japanese",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 0.8046049367670489,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-006",
+          "query": "what year did World War II end?",
+          "effective_query": "what year did World War II end?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "Last year the user read 'Project Hail Mary'.",
+          "top1_score": 0.7019848467597081,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-007",
+          "query": "explain entropy in thermodynamics",
+          "effective_query": "explain entropy in thermodynamics",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.7312421987766862,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-008",
+          "query": "best way to debug a segfault in C",
+          "effective_query": "best way to debug a segfault in C",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.7099948998710806,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-009",
+          "query": "what's the weather going to be tomorrow?",
+          "effective_query": "what's the weather going to be tomorrow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.",
+          "top1_score": 0.7680416836559597,
+          "top1_is_distractor": false,
+          "false_positive": true
+        },
+        {
+          "id": "control-010",
+          "query": "give me a recipe for tiramisu",
+          "effective_query": "give me a recipe for tiramisu",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user is me.",
+          "top1_score": 0.6231932527713353,
+          "top1_is_distractor": true,
+          "false_positive": true
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/modal-ablation.json b/benchmarks/alignbench/runs/modal-ablation.json
new file mode 100644
index 0000000..e5f7fb9
--- /dev/null
+++ b/benchmarks/alignbench/runs/modal-ablation.json
@@ -0,0 +1,346 @@
+{
+  "models": [
+    {
+      "model": "sentence-transformers/all-MiniLM-L6-v2",
+      "wall_seconds": 5.9,
+      "composite": {
+        "recall_at_1": 0.7333333333333333,
+        "recall_at_5": 0.9333333333333333,
+        "distractor_top1_rate": 0.06666666666666667,
+        "n": 60
+      },
+      "per_axis": [
+        {
+          "axis": "pronoun",
+          "n": 20,
+          "recall_at_1": 0.7,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.3,
+          "median_gold_margin": 0.04656702280044556,
+          "distractor_at_top1": 2
+        },
+        {
+          "axis": "temporal",
+          "n": 14,
+          "recall_at_1": 0.5,
+          "recall_at_5": 0.7142857142857143,
+          "mean_gold_rank": 5.5,
+          "median_gold_margin": 0.04994499683380127,
+          "distractor_at_top1": 2
+        },
+        {
+          "axis": "specificity",
+          "n": 14,
+          "recall_at_1": 0.8571428571428571,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.2142857142857142,
+          "median_gold_margin": 0.14383524656295776,
+          "distractor_at_top1": 0
+        },
+        {
+          "axis": "negation",
+          "n": 12,
+          "recall_at_1": 0.9166666666666666,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.0833333333333333,
+          "median_gold_margin": 0.2676225006580353,
+          "distractor_at_top1": 0
+        },
+        {
+          "axis": "control",
+          "n": 10,
+          "recall_at_1": 0.0,
+          "recall_at_5": 0.0,
+          "mean_gold_rank": null,
+          "median_gold_margin": null,
+          "distractor_at_top1": 0
+        }
+      ]
+    },
+    {
+      "model": "sentence-transformers/all-mpnet-base-v2",
+      "wall_seconds": 6.8,
+      "composite": {
+        "recall_at_1": 0.7333333333333333,
+        "recall_at_5": 0.95,
+        "distractor_top1_rate": 0.08333333333333333,
+        "n": 60
+      },
+      "per_axis": [
+        {
+          "axis": "pronoun",
+          "n": 20,
+          "recall_at_1": 0.55,
+          "recall_at_5": 0.95,
+          "mean_gold_rank": 1.8,
+          "median_gold_margin": 0.0059460848569869995,
+          "distractor_at_top1": 3
+        },
+        {
+          "axis": "temporal",
+          "n": 14,
+          "recall_at_1": 0.5714285714285714,
+          "recall_at_5": 0.8571428571428571,
+          "mean_gold_rank": 6.5,
+          "median_gold_margin": 0.054799675941467285,
+          "distractor_at_top1": 2
+        },
+        {
+          "axis": "specificity",
+          "n": 14,
+          "recall_at_1": 0.9285714285714286,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.0714285714285714,
+          "median_gold_margin": 0.12878745794296265,
+          "distractor_at_top1": 0
+        },
+        {
+          "axis": "negation",
+          "n": 12,
+          "recall_at_1": 1.0,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.0,
+          "median_gold_margin": 0.20404121279716492,
+          "distractor_at_top1": 0
+        },
+        {
+          "axis": "control",
+          "n": 10,
+          "recall_at_1": 0.0,
+          "recall_at_5": 0.0,
+          "mean_gold_rank": null,
+          "median_gold_margin": null,
+          "distractor_at_top1": 0
+        }
+      ]
+    },
+    {
+      "model": "BAAI/bge-small-en-v1.5",
+      "wall_seconds": 5.9,
+      "composite": {
+        "recall_at_1": 0.5333333333333333,
+        "recall_at_5": 0.8,
+        "distractor_top1_rate": 0.36666666666666664,
+        "n": 60
+      },
+      "per_axis": [
+        {
+          "axis": "pronoun",
+          "n": 20,
+          "recall_at_1": 0.5,
+          "recall_at_5": 0.8,
+          "mean_gold_rank": 3.6,
+          "median_gold_margin": 0.0,
+          "distractor_at_top1": 8
+        },
+        {
+          "axis": "temporal",
+          "n": 14,
+          "recall_at_1": 0.21428571428571427,
+          "recall_at_5": 0.42857142857142855,
+          "mean_gold_rank": 9.714285714285714,
+          "median_gold_margin": -0.042997002601623535,
+          "distractor_at_top1": 10
+        },
+        {
+          "axis": "specificity",
+          "n": 14,
+          "recall_at_1": 0.7142857142857143,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.6428571428571428,
+          "median_gold_margin": 0.07060164213180542,
+          "distractor_at_top1": 2
+        },
+        {
+          "axis": "negation",
+          "n": 12,
+          "recall_at_1": 0.75,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.5,
+          "median_gold_margin": 0.07531774044036865,
+          "distractor_at_top1": 2
+        },
+        {
+          "axis": "control",
+          "n": 10,
+          "recall_at_1": 0.0,
+          "recall_at_5": 0.0,
+          "mean_gold_rank": null,
+          "median_gold_margin": null,
+          "distractor_at_top1": 0
+        }
+      ]
+    },
+    {
+      "model": "BAAI/bge-base-en-v1.5",
+      "wall_seconds": 6.1,
+      "composite": {
+        "recall_at_1": 0.6166666666666667,
+        "recall_at_5": 0.7833333333333333,
+        "distractor_top1_rate": 0.25,
+        "n": 60
+      },
+      "per_axis": [
+        {
+          "axis": "pronoun",
+          "n": 20,
+          "recall_at_1": 0.7,
+          "recall_at_5": 0.9,
+          "mean_gold_rank": 2.35,
+          "median_gold_margin": 0.0298384428024292,
+          "distractor_at_top1": 5
+        },
+        {
+          "axis": "temporal",
+          "n": 14,
+          "recall_at_1": 0.2857142857142857,
+          "recall_at_5": 0.42857142857142855,
+          "mean_gold_rank": 10.357142857142858,
+          "median_gold_margin": -0.050244808197021484,
+          "distractor_at_top1": 7
+        },
+        {
+          "axis": "specificity",
+          "n": 14,
+          "recall_at_1": 0.7142857142857143,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.5,
+          "median_gold_margin": 0.07876402139663696,
+          "distractor_at_top1": 2
+        },
+        {
+          "axis": "negation",
+          "n": 12,
+          "recall_at_1": 0.75,
+          "recall_at_5": 0.75,
+          "mean_gold_rank": 3.5,
+          "median_gold_margin": 0.03547412157058716,
+          "distractor_at_top1": 1
+        },
+        {
+          "axis": "control",
+          "n": 10,
+          "recall_at_1": 0.0,
+          "recall_at_5": 0.0,
+          "mean_gold_rank": null,
+          "median_gold_margin": null,
+          "distractor_at_top1": 0
+        }
+      ]
+    },
+    {
+      "model": "intfloat/e5-small-v2",
+      "wall_seconds": 6.4,
+      "composite": {
+        "recall_at_1": 0.45,
+        "recall_at_5": 0.7166666666666667,
+        "distractor_top1_rate": 0.4166666666666667,
+        "n": 60
+      },
+      "per_axis": [
+        {
+          "axis": "pronoun",
+          "n": 20,
+          "recall_at_1": 0.35,
+          "recall_at_5": 0.85,
+          "mean_gold_rank": 3.6,
+          "median_gold_margin": -0.008539855480194092,
+          "distractor_at_top1": 10
+        },
+        {
+          "axis": "temporal",
+          "n": 14,
+          "recall_at_1": 0.35714285714285715,
+          "recall_at_5": 0.42857142857142855,
+          "mean_gold_rank": 9.785714285714286,
+          "median_gold_margin": -0.02019256353378296,
+          "distractor_at_top1": 7
+        },
+        {
+          "axis": "specificity",
+          "n": 14,
+          "recall_at_1": 0.6428571428571429,
+          "recall_at_5": 0.7142857142857143,
+          "mean_gold_rank": 3.2857142857142856,
+          "median_gold_margin": 0.011939942836761475,
+          "distractor_at_top1": 3
+        },
+        {
+          "axis": "negation",
+          "n": 12,
+          "recall_at_1": 0.5,
+          "recall_at_5": 0.8333333333333334,
+          "mean_gold_rank": 3.0,
+          "median_gold_margin": 0.0028389692306518555,
+          "distractor_at_top1": 5
+        },
+        {
+          "axis": "control",
+          "n": 10,
+          "recall_at_1": 0.0,
+          "recall_at_5": 0.0,
+          "mean_gold_rank": null,
+          "median_gold_margin": null,
+          "distractor_at_top1": 0
+        }
+      ]
+    },
+    {
+      "model": "intfloat/e5-base-v2",
+      "wall_seconds": 6.6,
+      "composite": {
+        "recall_at_1": 0.7166666666666667,
+        "recall_at_5": 0.9333333333333333,
+        "distractor_top1_rate": 0.2,
+        "n": 60
+      },
+      "per_axis": [
+        {
+          "axis": "pronoun",
+          "n": 20,
+          "recall_at_1": 0.65,
+          "recall_at_5": 0.95,
+          "mean_gold_rank": 1.95,
+          "median_gold_margin": 0.0074617862701416016,
+          "distractor_at_top1": 6
+        },
+        {
+          "axis": "temporal",
+          "n": 14,
+          "recall_at_1": 0.7142857142857143,
+          "recall_at_5": 0.7857142857142857,
+          "mean_gold_rank": 6.214285714285714,
+          "median_gold_margin": 0.007385969161987305,
+          "distractor_at_top1": 3
+        },
+        {
+          "axis": "specificity",
+          "n": 14,
+          "recall_at_1": 0.7142857142857143,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.5714285714285714,
+          "median_gold_margin": 0.02679300308227539,
+          "distractor_at_top1": 2
+        },
+        {
+          "axis": "negation",
+          "n": 12,
+          "recall_at_1": 0.8333333333333334,
+          "recall_at_5": 1.0,
+          "mean_gold_rank": 1.3333333333333333,
+          "median_gold_margin": 0.03661489486694336,
+          "distractor_at_top1": 1
+        },
+        {
+          "axis": "control",
+          "n": 10,
+          "recall_at_1": 0.0,
+          "recall_at_5": 0.0,
+          "mean_gold_rank": null,
+          "median_gold_margin": null,
+          "distractor_at_top1": 0
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/alignbench/runs/query-rewrite.json b/benchmarks/alignbench/runs/query-rewrite.json
new file mode 100644
index 0000000..3958fdc
--- /dev/null
+++ b/benchmarks/alignbench/runs/query-rewrite.json
@@ -0,0 +1,990 @@
+{
+  "variant": "query-rewrite",
+  "model": "Xenova/all-MiniLM-L6-v2",
+  "topk": 5,
+  "wall_seconds": "0.4",
+  "composite": {
+    "recall_at_1": 0.7333333333333333,
+    "recall_at_5": 0.9333333333333333,
+    "distractor_top1_rate": 0.08333333333333333,
+    "n": 60
+  },
+  "false_positive_rate": 0,
+  "per_axis": [
+    {
+      "axis": "pronoun",
+      "n": 20,
+      "pool_size": 55,
+      "recall_at_1": 0.7,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.35,
+      "distractor_at_top1": 3,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.030952307744915752,
+      "items": [
+        {
+          "id": "pronoun-001",
+          "query": "what is my name?",
+          "effective_query": "what is the user's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.7334653859058853,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.8007663012810273,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-002",
+          "query": "what's the user's name?",
+          "effective_query": "what's the user's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 2,
+          "gold_score": 0.7294812723464028,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.7771254660839599,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-003",
+          "query": "who am I?",
+          "effective_query": "who am the user?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#0",
+          "gold_rank": 3,
+          "gold_score": 0.5786921118974316,
+          "top1_text": "The user is me.",
+          "top1_score": 0.7631024695500505,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-004",
+          "query": "what do I do for work?",
+          "effective_query": "what do the user do for work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 2,
+          "gold_score": 0.5617654510890236,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.5678380216459898,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-005",
+          "query": "what is the user's job?",
+          "effective_query": "what is the user's job?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 2,
+          "gold_score": 0.5754363621337681,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.5767198521038163,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-006",
+          "query": "where does the user work?",
+          "effective_query": "where does the user work?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#1",
+          "gold_rank": 1,
+          "gold_score": 0.5436570101621021,
+          "top1_text": "The user works as a software engineer at Acme.",
+          "top1_score": 0.5436570101621021,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-007",
+          "query": "what is my dog's name?",
+          "effective_query": "what is the user's dog's name?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.7274535409399159,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.7274535409399159,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-008",
+          "query": "who is Apollo?",
+          "effective_query": "who is Apollo?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#2",
+          "gold_rank": 1,
+          "gold_score": 0.7072701654355713,
+          "top1_text": "The user's dog is named Apollo.",
+          "top1_score": 0.7072701654355713,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-009",
+          "query": "where do I live?",
+          "effective_query": "where do the user live?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.6020680792400349,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.6020680792400349,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-010",
+          "query": "what city does the user live in?",
+          "effective_query": "what city does the user live in?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#3",
+          "gold_rank": 1,
+          "gold_score": 0.6400263303955707,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.6400263303955707,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-011",
+          "query": "when is my birthday?",
+          "effective_query": "when is the user's birthday?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.8738209257715547,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.8738209257715547,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-012",
+          "query": "when was the user born?",
+          "effective_query": "when was the user born?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#4",
+          "gold_rank": 1,
+          "gold_score": 0.7251037734927323,
+          "top1_text": "The user's birthday is March 14.",
+          "top1_score": 0.7251037734927323,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-013",
+          "query": "do I have kids?",
+          "effective_query": "do the user have kids?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.5676560796225887,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.5676560796225887,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-014",
+          "query": "how many children does the user have?",
+          "effective_query": "how many children does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#5",
+          "gold_rank": 1,
+          "gold_score": 0.5766094762746933,
+          "top1_text": "The user has two children, Maya and Theo.",
+          "top1_score": 0.5766094762746933,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-015",
+          "query": "what is my usual coffee order?",
+          "effective_query": "what is the user's usual coffee order?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 1,
+          "gold_score": 0.6470547548911678,
+          "top1_text": "The user's favorite coffee order is an oat-milk flat white.",
+          "top1_score": 0.6470547548911678,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-016",
+          "query": "what coffee does the user drink?",
+          "effective_query": "what coffee does the user drink?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#6",
+          "gold_rank": 2,
+          "gold_score": 0.5813657391451515,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.7197601756319,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-017",
+          "query": "what did I study?",
+          "effective_query": "what did the user study?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#7",
+          "gold_rank": 1,
+          "gold_score": 0.6274053315379636,
+          "top1_text": "The user studied applied mathematics at university.",
+          "top1_score": 0.6274053315379636,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-018",
+          "query": "do I have any allergies?",
+          "effective_query": "do the user have any allergies?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.6119512366841727,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.6119512366841727,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-019",
+          "query": "what is the user allergic to?",
+          "effective_query": "what is the user allergic to?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#8",
+          "gold_rank": 1,
+          "gold_score": 0.7261582549960168,
+          "top1_text": "The user is allergic to peanuts.",
+          "top1_score": 0.7261582549960168,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "pronoun-020",
+          "query": "what kind of car do I drive?",
+          "effective_query": "what kind of car do the user drive?",
+          "gold_in_topk": true,
+          "gold_global_key": "pronoun#9",
+          "gold_rank": 1,
+          "gold_score": 0.6161446193769499,
+          "top1_text": "The user drives a 2019 Toyota Corolla.",
+          "top1_score": 0.6161446193769499,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "temporal",
+      "n": 14,
+      "pool_size": 55,
+      "recall_at_1": 0.5,
+      "recall_at_5": 0.7142857142857143,
+      "mean_gold_rank": 5.5,
+      "distractor_at_top1": 2,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.049944619619024966,
+      "items": [
+        {
+          "id": "temporal-001",
+          "query": "where does the user live now?",
+          "effective_query": "where does the user live now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 3,
+          "gold_score": 0.550366425409688,
+          "top1_text": "The user lives in Lisbon.",
+          "top1_score": 0.5504016420681116,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-002",
+          "query": "where did the user used to live?",
+          "effective_query": "where did the user used to live?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#1",
+          "gold_rank": 1,
+          "gold_score": 0.5490901457590983,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.5490901457590983,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-003",
+          "query": "when did the user move?",
+          "effective_query": "when did the user move?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#2",
+          "gold_rank": 1,
+          "gold_score": 0.5076513080537272,
+          "top1_text": "The user moved from Berlin to Lisbon in 2024.",
+          "top1_score": 0.5076513080537272,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-004",
+          "query": "where is the user currently based?",
+          "effective_query": "where is the user currently based?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 11,
+          "gold_score": 0.42833906054622706,
+          "top1_text": "The user asked for the user's name.",
+          "top1_score": 0.4962051712532369,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-005",
+          "query": "what is the user reading?",
+          "effective_query": "what is the user reading?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#3",
+          "gold_rank": 8,
+          "gold_score": 0.4600491260020972,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 0.5634334413727431,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-006",
+          "query": "what did the user read last year?",
+          "effective_query": "what did the user read last year?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#4",
+          "gold_rank": 1,
+          "gold_score": 0.5028289729480597,
+          "top1_text": "Last year the user read 'Project Hail Mary'.",
+          "top1_score": 0.5028289729480597,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-007",
+          "query": "what is the user working on these days?",
+          "effective_query": "what is the user working on these days?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 4,
+          "gold_score": 0.4513265913655406,
+          "top1_text": "The user works at Acme as a software engineer.",
+          "top1_score": 0.48543171981436045,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-008",
+          "query": "what did the user finish last month?",
+          "effective_query": "what did the user finish last month?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#6",
+          "gold_rank": 1,
+          "gold_score": 0.5544356167423142,
+          "top1_text": "The user finished the Sprint-4 reranker training last month.",
+          "top1_score": 0.5544356167423142,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-009",
+          "query": "what LLM does the user prefer?",
+          "effective_query": "what LLM does the user prefer?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 1,
+          "gold_score": 0.6226655286181774,
+          "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.",
+          "top1_score": 0.6226655286181774,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-010",
+          "query": "which model did the user use before?",
+          "effective_query": "which model did the user use before?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#8",
+          "gold_rank": 1,
+          "gold_score": 0.5205286761987613,
+          "top1_text": "The user used GPT-4 as their primary model in 2024.",
+          "top1_score": 0.5205286761987613,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-011",
+          "query": "did the user get a new phone recently?",
+          "effective_query": "did the user get a new phone recently?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#9",
+          "gold_rank": 1,
+          "gold_score": 0.6113316689987077,
+          "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.",
+          "top1_score": 0.6113316689987077,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-012",
+          "query": "is the user still in Berlin?",
+          "effective_query": "is the user still in Berlin?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#0",
+          "gold_rank": 5,
+          "gold_score": 0.4089445689504921,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.7214467722165934,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-013",
+          "query": "what is the user up to?",
+          "effective_query": "what is the user up to?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#5",
+          "gold_rank": 19,
+          "gold_score": 0.2672680179526975,
+          "top1_text": "The user is asking a question.",
+          "top1_score": 0.5409891051249414,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "temporal-014",
+          "query": "which model is the user on right now?",
+          "effective_query": "which model is the user on right now?",
+          "gold_in_topk": true,
+          "gold_global_key": "temporal#7",
+          "gold_rank": 20,
+          "gold_score": 0.31088477125594804,
+          "top1_text": "The user's name is Alex.",
+          "top1_score": 0.40861487066233426,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "specificity",
+      "n": 14,
+      "pool_size": 55,
+      "recall_at_1": 0.8571428571428571,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.3571428571428572,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.14383529034315978,
+      "items": [
+        {
+          "id": "specificity-001",
+          "query": "tell me about my dog",
+          "effective_query": "tell the user about the user's dog",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 4,
+          "gold_score": 0.4351817244869386,
+          "top1_text": "The user has a dog named Apollo.",
+          "top1_score": 0.6044216178749056,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-002",
+          "query": "what kind of pet do I have?",
+          "effective_query": "what kind of pet do the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#0",
+          "gold_rank": 3,
+          "gold_score": 0.5000743267639344,
+          "top1_text": "The user has a dog named Apollo.",
+          "top1_score": 0.594187253255591,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-003",
+          "query": "do I own a bike?",
+          "effective_query": "do the user own a bike?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.6923054137966281,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.6923054137966281,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-004",
+          "query": "what brand of bike does the user have?",
+          "effective_query": "what brand of bike does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#1",
+          "gold_rank": 1,
+          "gold_score": 0.6329297203833848,
+          "top1_text": "The user owns a Bianchi road bike.",
+          "top1_score": 0.6329297203833848,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-005",
+          "query": "what computer do I use?",
+          "effective_query": "what computer do the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.5008318094173164,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.5008318094173164,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-006",
+          "query": "what laptop does the user have?",
+          "effective_query": "what laptop does the user have?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#2",
+          "gold_rank": 1,
+          "gold_score": 0.6450468635073591,
+          "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.",
+          "top1_score": 0.6450468635073591,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-007",
+          "query": "do I have any musical instruments?",
+          "effective_query": "do the user have any musical instruments?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#3",
+          "gold_rank": 1,
+          "gold_score": 0.48813840204057846,
+          "top1_text": "The user has a Yamaha P-125 digital piano in the living room.",
+          "top1_score": 0.48813840204057846,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-008",
+          "query": "which note-taking app do I use?",
+          "effective_query": "which note-taking app do the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#4",
+          "gold_rank": 1,
+          "gold_score": 0.4744915847013103,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.4744915847013103,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-009",
+          "query": "where do I like to eat in Lisbon?",
+          "effective_query": "where do the user like to eat in Lisbon?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#5",
+          "gold_rank": 1,
+          "gold_score": 0.7219279253072203,
+          "top1_text": "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.",
+          "top1_score": 0.7219279253072203,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-010",
+          "query": "what brand sunglasses does the user wear?",
+          "effective_query": "what brand sunglasses does the user wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#6",
+          "gold_rank": 1,
+          "gold_score": 0.6512271965052092,
+          "top1_text": "The user wears Smith Lowdown sunglasses.",
+          "top1_score": 0.6512271965052092,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-011",
+          "query": "do I read on a Kindle?",
+          "effective_query": "do the user read on a Kindle?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#7",
+          "gold_rank": 1,
+          "gold_score": 0.7268333697934469,
+          "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.",
+          "top1_score": 0.7268333697934469,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-012",
+          "query": "what espresso machine does the user own?",
+          "effective_query": "what espresso machine does the user own?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#8",
+          "gold_rank": 1,
+          "gold_score": 0.7366961226077487,
+          "top1_text": "The user's home espresso machine is a Lelit Bianca v3.",
+          "top1_score": 0.7366961226077487,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-013",
+          "query": "what shoes do I wear?",
+          "effective_query": "what shoes do the user wear?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.44632515589821203,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.44632515589821203,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "specificity-014",
+          "query": "what brand are the user's everyday shoes?",
+          "effective_query": "what brand are the user's everyday shoes?",
+          "gold_in_topk": true,
+          "gold_global_key": "specificity#9",
+          "gold_rank": 1,
+          "gold_score": 0.4229939734066537,
+          "top1_text": "The user wears Allbirds Wool Runners daily.",
+          "top1_score": 0.4229939734066537,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "negation",
+      "n": 12,
+      "pool_size": 55,
+      "recall_at_1": 0.9166666666666666,
+      "recall_at_5": 1,
+      "mean_gold_rank": 1.0833333333333333,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": 0.26762270427014134,
+      "items": [
+        {
+          "id": "negation-001",
+          "query": "does the user drink coffee?",
+          "effective_query": "does the user drink coffee?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 0.7905096599561829,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.7905096599561829,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-002",
+          "query": "what does the user drink in the morning?",
+          "effective_query": "what does the user drink in the morning?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#0",
+          "gold_rank": 1,
+          "gold_score": 0.5285974920275435,
+          "top1_text": "The user does not drink coffee. They prefer tea.",
+          "top1_score": 0.5285974920275435,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-003",
+          "query": "am I vegetarian?",
+          "effective_query": "am the user vegetarian?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#1",
+          "gold_rank": 1,
+          "gold_score": 0.754238802786161,
+          "top1_text": "The user is not vegetarian, but avoids red meat.",
+          "top1_score": 0.754238802786161,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-004",
+          "query": "is the user on Twitter?",
+          "effective_query": "is the user on Twitter?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.5874304481012608,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.5874304481012608,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-005",
+          "query": "which social networks does the user use?",
+          "effective_query": "which social networks does the user use?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#2",
+          "gold_rank": 1,
+          "gold_score": 0.5067702908301415,
+          "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.",
+          "top1_score": 0.5067702908301415,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-006",
+          "query": "does the user own a car?",
+          "effective_query": "does the user own a car?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#3",
+          "gold_rank": 1,
+          "gold_score": 0.7488288864422115,
+          "top1_text": "The user does not own a car; they bike or use public transit.",
+          "top1_score": 0.7488288864422115,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-007",
+          "query": "is the user active on LinkedIn?",
+          "effective_query": "is the user active on LinkedIn?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#4",
+          "gold_rank": 1,
+          "gold_score": 0.7880239246717338,
+          "top1_text": "The user is not on LinkedIn anymore.",
+          "top1_score": 0.7880239246717338,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-008",
+          "query": "any foods the user hates?",
+          "effective_query": "any foods the user hates?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#5",
+          "gold_rank": 2,
+          "gold_score": 0.4222244002602081,
+          "top1_text": "The user is not vegetarian, but avoids red meat.",
+          "top1_score": 0.4321470878666278,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-009",
+          "query": "has the user traveled to Asia?",
+          "effective_query": "has the user traveled to Asia?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#6",
+          "gold_rank": 1,
+          "gold_score": 0.6919597377462416,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 0.6919597377462416,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-010",
+          "query": "does the user like horror movies?",
+          "effective_query": "does the user like horror movies?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#7",
+          "gold_rank": 1,
+          "gold_score": 0.7303882056719367,
+          "top1_text": "The user does not enjoy horror movies.",
+          "top1_score": 0.7303882056719367,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-011",
+          "query": "can the user eat shrimp?",
+          "effective_query": "can the user eat shrimp?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#8",
+          "gold_rank": 1,
+          "gold_score": 0.7044731236020111,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.7044731236020111,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "negation-012",
+          "query": "is the user learning a new language?",
+          "effective_query": "is the user learning a new language?",
+          "gold_in_topk": true,
+          "gold_global_key": "negation#9",
+          "gold_rank": 1,
+          "gold_score": 0.6522323599643379,
+          "top1_text": "The user is not currently learning any new languages.",
+          "top1_score": 0.6522323599643379,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    },
+    {
+      "axis": "control",
+      "n": 10,
+      "pool_size": 55,
+      "recall_at_1": 0,
+      "recall_at_5": 0,
+      "mean_gold_rank": null,
+      "distractor_at_top1": 0,
+      "false_positive_count": 0,
+      "median_gold_margin": null,
+      "items": [
+        {
+          "id": "control-001",
+          "query": "what is the airspeed velocity of an unladen swallow?",
+          "effective_query": "what is the airspeed velocity of an unladen swallow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user does not eat shellfish.",
+          "top1_score": 0.25082327505676677,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-002",
+          "query": "who is the current president of France?",
+          "effective_query": "who is the current president of France?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of January 2026, the user lives in Lisbon.",
+          "top1_score": 0.2564335064669229,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-003",
+          "query": "what is the capital of Mongolia?",
+          "effective_query": "what is the capital of Mongolia?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user has never been to Asia.",
+          "top1_score": 0.27119226736778723,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-004",
+          "query": "how does photosynthesis work?",
+          "effective_query": "how does photosynthesis work?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.13794111637514647,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-005",
+          "query": "translate 'goodnight' to Japanese",
+          "effective_query": "translate 'goodnight' to Japanese",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.",
+          "top1_score": 0.1911340870733358,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-006",
+          "query": "what year did World War II end?",
+          "effective_query": "what year did World War II end?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "Before 2024, the user lived in Berlin.",
+          "top1_score": 0.2900090433169317,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-007",
+          "query": "explain entropy in thermodynamics",
+          "effective_query": "explain entropy in thermodynamics",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user uses Logseq for personal notes and Notion for work.",
+          "top1_score": 0.18512903871088907,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-008",
+          "query": "best way to debug a segfault in C",
+          "effective_query": "best way to debug a segfault in C",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user is currently working on a memory benchmark project.",
+          "top1_score": 0.23013625373127655,
+          "top1_is_distractor": false,
+          "false_positive": false
+        },
+        {
+          "id": "control-009",
+          "query": "what's the weather going to be tomorrow?",
+          "effective_query": "what's the weather going to be tomorrow?",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "As of May 14, 2026, Apollo is a term mentioned in the conversation.",
+          "top1_score": 0.18279764320601347,
+          "top1_is_distractor": true,
+          "false_positive": false
+        },
+        {
+          "id": "control-010",
+          "query": "give me a recipe for tiramisu",
+          "effective_query": "give the user a recipe for tiramisu",
+          "gold_in_topk": false,
+          "gold_global_key": null,
+          "gold_rank": null,
+          "gold_score": null,
+          "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.",
+          "top1_score": 0.35106130753511783,
+          "top1_is_distractor": false,
+          "false_positive": false
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/src/memory/__tests__/atomicmemory-provider.test.ts b/src/memory/__tests__/atomicmemory-provider.test.ts
index 72332eb..96dd30a 100644
--- a/src/memory/__tests__/atomicmemory-provider.test.ts
+++ b/src/memory/__tests__/atomicmemory-provider.test.ts
@@ -99,6 +99,33 @@ describe('ingest', () => {
     const body = JSON.parse(mockFetch.mock.calls[0][1].body);
     expect(body.conversation).toBe('user: Hi\nassistant: Hello');
   });
+
+  it('maps scope.thread to session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(
+      jsonResponse({
+        episode_id: 'e3',
+        facts_extracted: 1,
+        memories_stored: 1,
+        memories_updated: 0,
+        memories_deleted: 0,
+        memories_skipped: 0,
+        stored_memory_ids: ['m3'],
+        updated_memory_ids: [],
+        links_created: 0,
+        composites_created: 0,
+      })
+    );
+
+    await provider.ingest({
+      mode: 'text',
+      content: 'Hello thread',
+      scope: { user: 'u1', thread: 'thread-1' },
+    });
+
+    const body = JSON.parse(mockFetch.mock.calls[0][1].body);
+    expect(body.session_id).toBe('thread-1');
+  });
 });
 
 // ---------------------------------------------------------------------------
@@ -140,6 +167,58 @@ describe('search', () => {
     expect(page.results[0].relevance).toBe(0.84);
     expect(page.results[0].memory.id).toBe('s1');
   });
+
+  it('maps scope.thread to search session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(jsonResponse({ memories: [], count: 0 }));
+
+    await provider.search({
+      query: 'test',
+      scope: { user: 'u1', thread: 'thread-1' },
+    });
+
+    const body = JSON.parse(mockFetch.mock.calls[0][1].body);
+    expect(body.session_id).toBe('thread-1');
+  });
+
+  it('rejects thread-scoped search rows without matching session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(jsonResponse({
+      memories: [{ id: 's1', content: 'wrong thread' }],
+      count: 1,
+    }));
+
+    await expect(provider.search({
+      query: 'test',
+      scope: { user: 'u1', thread: 'thread-1' },
+    })).rejects.toThrow(/session_id/);
+  });
+
+  it('rejects thread-scoped search rows with mismatched session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(jsonResponse({
+      memories: [{ id: 's1', content: 'wrong thread', session_id: 'thread-2' }],
+      count: 1,
+    }));
+
+    await expect(provider.search({
+      query: 'test',
+      scope: { user: 'u1', thread: 'thread-1' },
+    })).rejects.toThrow(/session_id/);
+  });
+
+  it('rejects namespace-scoped search rows with mismatched namespace', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(jsonResponse({
+      memories: [{ id: 's1', content: 'wrong namespace', namespace: 'other' }],
+      count: 1,
+    }));
+
+    await expect(provider.search({
+      query: 'test',
+      scope: { user: 'u1', namespace: 'expected' },
+    })).rejects.toThrow(/namespace/);
+  });
 });
 
 // ---------------------------------------------------------------------------
@@ -205,7 +284,12 @@ describe('list', () => {
     const provider = createProvider();
     mockFetch.mockResolvedValueOnce(
       jsonResponse({
-        memories: [{ id: 'l1', content: 'item' }],
+        memories: [{
+          id: 'l1',
+          content: 'item',
+          namespace: 'project-a',
+          session_id: 'thread-a',
+        }],
         count: 1,
       })
     );
@@ -216,6 +300,52 @@ describe('list', () => {
     expect(url).toBe(`${API_URL}/v1/memories/list?user_id=u1&limit=10&offset=0`);
     expect(page.memories).toHaveLength(1);
     expect(page.memories[0].id).toBe('l1');
+    expect(page.memories[0].scope).toEqual({
+      user: 'u1',
+      namespace: 'project-a',
+      thread: 'thread-a',
+    });
+  });
+
+  it('maps scope.thread to list session_id query param', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(jsonResponse({ memories: [], count: 0 }));
+
+    await provider.list({
+      scope: { user: 'u1', thread: 'thread-1' },
+      limit: 10,
+    });
+
+    const [url] = mockFetch.mock.calls[0];
+    expect(url).toBe(
+      `${API_URL}/v1/memories/list?user_id=u1&limit=10&offset=0&session_id=thread-1`,
+    );
+  });
+
+  it('rejects thread-scoped list rows without matching session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(jsonResponse({
+      memories: [{ id: 'l1', content: 'missing session' }],
+      count: 1,
+    }));
+
+    await expect(provider.list({
+      scope: { user: 'u1', thread: 'thread-1' },
+      limit: 10,
+    })).rejects.toThrow(/session_id/);
+  });
+
+  it('rejects thread-scoped list rows with mismatched session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(jsonResponse({
+      memories: [{ id: 'l1', content: 'wrong session', session_id: 'thread-2' }],
+      count: 1,
+    }));
+
+    await expect(provider.list({
+      scope: { user: 'u1', thread: 'thread-1' },
+      limit: 10,
+    })).rejects.toThrow(/session_id/);
   });
 
   it('returns cursor when results fill the limit', async () => {
@@ -304,6 +434,45 @@ describe('package', () => {
     expect(pkg.budgetConstrained).toBe(false);
   });
 
+  it('maps scope.thread to package session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(
+      jsonResponse({
+        memories: [],
+        injection_text: '',
+        estimated_context_tokens: 0,
+        budget_constrained: false,
+      })
+    );
+
+    await provider.package({
+      query: 'what did I say',
+      scope: { user: 'u1', thread: 'thread-1' },
+    });
+
+    const body = JSON.parse(mockFetch.mock.calls[0][1].body);
+    expect(body.session_id).toBe('thread-1');
+  });
+
+  it('rejects thread-scoped package rows without matching session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(
+      jsonResponse({
+        memories: [{ id: 'p1', content: 'wrong thread', score: 0.9 }],
+        injection_text: 'wrong thread',
+        estimated_context_tokens: 2,
+        budget_constrained: false,
+      })
+    );
+
+    await expect(
+      provider.package({
+        query: 'what did I say',
+        scope: { user: 'u1', thread: 'thread-1' },
+      })
+    ).rejects.toThrow(/session_id/);
+  });
+
   it('propagates budget_constrained=true from the backend', async () => {
     const provider = createProvider();
     mockFetch.mockResolvedValueOnce(
@@ -349,6 +518,27 @@ describe('package', () => {
   });
 });
 
+// ---------------------------------------------------------------------------
+// searchAsOf() — TemporalSearch
+// ---------------------------------------------------------------------------
+
+describe('searchAsOf', () => {
+  it('maps scope.thread to temporal search session_id', async () => {
+    const provider = createProvider();
+    mockFetch.mockResolvedValueOnce(jsonResponse({ memories: [] }));
+
+    await provider.searchAsOf({
+      query: 'what did I say',
+      scope: { user: 'u1', thread: 'thread-1' },
+      asOf: new Date('2026-05-16T12:00:00.000Z'),
+    });
+
+    const body = JSON.parse(mockFetch.mock.calls[0][1].body);
+    expect(body.session_id).toBe('thread-1');
+    expect(body.as_of).toBe('2026-05-16T12:00:00.000Z');
+  });
+});
+
 // ---------------------------------------------------------------------------
 // Scope validation
 // ---------------------------------------------------------------------------
diff --git a/src/memory/__tests__/meta-fact-filter.test.ts b/src/memory/__tests__/meta-fact-filter.test.ts
new file mode 100644
index 0000000..4d2fd54
--- /dev/null
+++ b/src/memory/__tests__/meta-fact-filter.test.ts
@@ -0,0 +1,210 @@
+/**
+ * @file MetaFactFilter unit tests
+ *
+ * Covers the three public surfaces of meta-fact-filter:
+ *   - DEFAULT_META_FACT_PATTERNS / isMetaFact: pattern matching
+ *   - resolveMetaFactPatterns: replace vs extend modes
+ *   - filterMetaFacts: end-to-end drop with onDrop telemetry
+ *
+ * Item shapes deliberately mirror what the SDK uses (SearchResult.memory.content)
+ * to keep the integration risk at the call site low.
+ */
+
+import { describe, it, expect, vi } from 'vitest';
+import {
+  DEFAULT_META_FACT_PATTERNS,
+  filterMetaFacts,
+  isMetaFact,
+  resolveMetaFactPatterns,
+  type MetaFactFilterConfig,
+} from '../meta-fact-filter';
+
+describe('isMetaFact', () => {
+  it.each([
+    "The user asked for the user's name.",
+    "The user is asking a question.",
+    'The user is me.',
+    'The user requested information.',
+    'The user said something.',
+    'As of May 14, 2026, Apollo is a term mentioned in the conversation.',
+    'As of January 2026, the user is a term mentioned in the conversation.',
+    'A name was mentioned in the conversation.',
+    'The conversation involves the user.',
+    'The user has started a conversation.',
+  ])('matches the partner-demo meta-fact shape: "%s"', (content) => {
+    expect(isMetaFact(content)).toBe(true);
+  });
+
+  it.each([
+    "User's name is SgtPooki",
+    'The user lives in Lisbon.',
+    "The user's dog is named Apollo.",
+    'The user prefers oat-milk flat whites.',
+    'As of January 2026, the user lives in Lisbon.', // temporal anchor on a real fact, not a meta-fact
+  ])('does not match a durable user fact: "%s"', (content) => {
+    expect(isMetaFact(content)).toBe(false);
+  });
+
+  it('is case-insensitive on the leading "The user"', () => {
+    expect(isMetaFact('THE USER ASKED FOR THE USER\'S NAME.')).toBe(true);
+    expect(isMetaFact('the user is me.')).toBe(true);
+  });
+
+  it.each([null, undefined, 42, {}, [], ''])(
+    'returns false on non-string / empty input (%s)',
+    (input) => {
+      expect(isMetaFact(input as unknown)).toBe(false);
+    },
+  );
+
+  it('uses caller-supplied patterns instead of defaults when provided', () => {
+    const custom = [/^transcript: /i];
+    expect(isMetaFact('transcript: hello', custom)).toBe(true);
+    // Defaults would NOT match the string above; below, a default match is no longer honoured.
+    expect(isMetaFact("The user is me.", custom)).toBe(false);
+  });
+});
+
+describe('resolveMetaFactPatterns', () => {
+  it('returns the default set when patterns is omitted', () => {
+    const config: MetaFactFilterConfig = { enabled: true };
+    expect(resolveMetaFactPatterns(config)).toBe(DEFAULT_META_FACT_PATTERNS);
+  });
+
+  it("'replace' mode (default) returns only the caller's patterns", () => {
+    const config: MetaFactFilterConfig = {
+      enabled: true,
+      patterns: [/^foo$/],
+    };
+    const resolved = resolveMetaFactPatterns(config);
+    expect(resolved).toHaveLength(1);
+    expect(resolved[0]).toEqual(/^foo$/);
+  });
+
+  it("'extend' mode unions caller patterns with defaults", () => {
+    const config: MetaFactFilterConfig = {
+      enabled: true,
+      patterns: [/^foo$/],
+      mode: 'extend',
+    };
+    const resolved = resolveMetaFactPatterns(config);
+    expect(resolved.length).toBe(DEFAULT_META_FACT_PATTERNS.length + 1);
+    expect(resolved[0]).toEqual(/^foo$/);
+  });
+});
+
+describe('filterMetaFacts', () => {
+  interface FakeResult {
+    memory: { content: string };
+    score: number;
+  }
+  const items: FakeResult[] = [
+    { memory: { content: "User's name is SgtPooki" }, score: 0.51 },
+    { memory: { content: "The user asked for the user's name." }, score: 0.40 },
+    { memory: { content: 'The user is me.' }, score: 0.35 },
+    { memory: { content: 'The user lives in Lisbon.' }, score: 0.32 },
+  ];
+
+  it('is a no-op when filter is disabled', () => {
+    const out = filterMetaFacts(items, (r) => r.memory.content, {
+      enabled: false,
+    });
+    expect(out).toEqual(items);
+    expect(out).not.toBe(items); // returns a copy
+  });
+
+  it('drops items whose content matches the default patterns', () => {
+    const out = filterMetaFacts(items, (r) => r.memory.content, {
+      enabled: true,
+    });
+    expect(out).toHaveLength(2);
+    expect(out.map((r) => r.memory.content)).toEqual([
+      "User's name is SgtPooki",
+      'The user lives in Lisbon.',
+    ]);
+  });
+
+  it('preserves original order of kept items', () => {
+    const ordered: FakeResult[] = [
+      { memory: { content: 'real-1' }, score: 1 },
+      { memory: { content: 'The user is me.' }, score: 0.9 },
+      { memory: { content: 'real-2' }, score: 0.8 },
+      { memory: { content: 'The user asked for the user\'s name.' }, score: 0.7 },
+      { memory: { content: 'real-3' }, score: 0.6 },
+    ];
+    const out = filterMetaFacts(ordered, (r) => r.memory.content, {
+      enabled: true,
+    });
+    expect(out.map((r) => r.memory.content)).toEqual(['real-1', 'real-2', 'real-3']);
+  });
+
+  it('invokes onDrop once per dropped item with pattern index', () => {
+    const dropped: Array<{ content: string; index: number }> = [];
+    filterMetaFacts(items, (r) => r.memory.content, {
+      enabled: true,
+      onDrop: (content, index) => dropped.push({ content, index }),
+    });
+    expect(dropped).toHaveLength(2);
+    expect(dropped[0].content).toBe("The user asked for the user's name.");
+    expect(dropped[1].content).toBe('The user is me.');
+    // Both match pattern index 0 (the first DEFAULT pattern) — which is the
+    // catch-all "The user (asked|requested|said|is asking|is me)" rule.
+    expect(dropped[0].index).toBe(0);
+    expect(dropped[1].index).toBe(0);
+  });
+
+  it('swallows onDrop exceptions so filtering never breaks recall', () => {
+    const out = filterMetaFacts(items, (r) => r.memory.content, {
+      enabled: true,
+      onDrop: () => {
+        throw new Error('telemetry blew up');
+      },
+    });
+    expect(out).toHaveLength(2);
+  });
+
+  it('honours custom patterns in replace mode', () => {
+    const out = filterMetaFacts(items, (r) => r.memory.content, {
+      enabled: true,
+      patterns: [/^User's name/],
+    });
+    // Custom pattern drops "User's name is SgtPooki" but lets meta-facts through.
+    expect(out.map((r) => r.memory.content)).toEqual([
+      "The user asked for the user's name.",
+      'The user is me.',
+      'The user lives in Lisbon.',
+    ]);
+  });
+
+  it('honours custom patterns in extend mode (union with defaults)', () => {
+    const out = filterMetaFacts(items, (r) => r.memory.content, {
+      enabled: true,
+      patterns: [/^User's name/],
+      mode: 'extend',
+    });
+    // Both the custom rule AND the defaults fire.
+    expect(out.map((r) => r.memory.content)).toEqual([
+      'The user lives in Lisbon.',
+    ]);
+  });
+
+  it('handles non-string content gracefully without dropping the item', () => {
+    const weird = [
+      ...items,
+      { memory: { content: null as unknown as string }, score: 0.1 },
+    ];
+    const out = filterMetaFacts(weird, (r) => r.memory.content, {
+      enabled: true,
+    });
+    // Real facts + the null-content item survive; meta-facts dropped.
+    expect(out).toHaveLength(3);
+  });
+
+  it('returns the original list when the resolved pattern set is empty', () => {
+    const out = filterMetaFacts(items, (r) => r.memory.content, {
+      enabled: true,
+      patterns: [],
+    });
+    expect(out).toEqual(items);
+  });
+});
diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/list.mapped.json b/src/memory/atomicmemory-provider/__tests__/fixtures/list.mapped.json
index 915a0f9..d0f4fe7 100644
--- a/src/memory/atomicmemory-provider/__tests__/fixtures/list.mapped.json
+++ b/src/memory/atomicmemory-provider/__tests__/fixtures/list.mapped.json
@@ -3,7 +3,8 @@
     "id": "FIXTURE-MEM-2",
     "content": "user's library card expires in March 2027.",
     "scope": {
-      "user": "fixture-capture"
+      "user": "fixture-capture",
+      "namespace": "site/fixture/quick/ingest"
     },
     "createdAt": "2026-04-24T10:00:00.000Z",
     "provenance": {
@@ -19,7 +20,8 @@
     "id": "FIXTURE-MEM-1",
     "content": "User prefers aisle seats on flights longer than four hours.",
     "scope": {
-      "user": "fixture-capture"
+      "user": "fixture-capture",
+      "namespace": "site/fixture/full/ingest"
     },
     "createdAt": "2026-04-24T10:00:00.000Z",
     "provenance": {
diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.mapped.json b/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.mapped.json
index 482546f..896549c 100644
--- a/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.mapped.json
+++ b/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.mapped.json
@@ -4,7 +4,8 @@
       "id": "FIXTURE-MEM-2",
       "content": "user's library card expires in March 2027.",
       "scope": {
-        "user": "fixture-capture"
+        "user": "fixture-capture",
+        "thread": "fixture-fast-thread-quick"
       },
       "createdAt": "2026-04-24T10:00:00.000Z",
       "provenance": {
@@ -24,7 +25,8 @@
       "id": "FIXTURE-MEM-1",
       "content": "User prefers aisle seats on flights longer than four hours.",
       "scope": {
-        "user": "fixture-capture"
+        "user": "fixture-capture",
+        "thread": "fixture-fast-thread-full"
       },
       "createdAt": "2026-04-24T10:00:00.000Z",
       "provenance": {
diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.raw.json b/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.raw.json
index 4af990c..ca6fe3b 100644
--- a/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.raw.json
+++ b/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.raw.json
@@ -16,6 +16,7 @@
       "relevance": 0.14691642279927353,
       "importance": 0.6,
       "source_site": "fixture-quick-ingest",
+      "session_id": "fixture-fast-thread-quick",
       "created_at": "2026-04-24T10:00:00.000Z"
     },
     {
@@ -28,6 +29,7 @@
       "relevance": 0.6055988308430426,
       "importance": 0.6,
       "source_site": "fixture-full-ingest",
+      "session_id": "fixture-fast-thread-full",
       "created_at": "2026-04-24T10:00:00.000Z"
     }
   ],
diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/search.mapped.json b/src/memory/atomicmemory-provider/__tests__/fixtures/search.mapped.json
index 6763ec7..f17f6a5 100644
--- a/src/memory/atomicmemory-provider/__tests__/fixtures/search.mapped.json
+++ b/src/memory/atomicmemory-provider/__tests__/fixtures/search.mapped.json
@@ -4,7 +4,8 @@
       "id": "FIXTURE-MEM-2",
       "content": "user's library card expires in March 2027.",
       "scope": {
-        "user": "fixture-capture"
+        "user": "fixture-capture",
+        "thread": "fixture-thread-quick"
       },
       "createdAt": "2026-04-24T10:00:00.000Z",
       "provenance": {
@@ -24,7 +25,8 @@
       "id": "FIXTURE-MEM-1",
       "content": "User prefers aisle seats on flights longer than four hours.",
       "scope": {
-        "user": "fixture-capture"
+        "user": "fixture-capture",
+        "thread": "fixture-thread-full"
       },
       "createdAt": "2026-04-24T10:00:00.000Z",
       "provenance": {
diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/search.raw.json b/src/memory/atomicmemory-provider/__tests__/fixtures/search.raw.json
index 4614899..e025027 100644
--- a/src/memory/atomicmemory-provider/__tests__/fixtures/search.raw.json
+++ b/src/memory/atomicmemory-provider/__tests__/fixtures/search.raw.json
@@ -16,6 +16,7 @@
       "relevance": 0.14691642279927353,
       "importance": 0.6,
       "source_site": "fixture-quick-ingest",
+      "session_id": "fixture-thread-quick",
       "created_at": "2026-04-24T10:00:00.000Z"
     },
     {
@@ -28,6 +29,7 @@
       "relevance": 0.6055988308430426,
       "importance": 0.6,
       "source_site": "fixture-full-ingest",
+      "session_id": "fixture-thread-full",
       "created_at": "2026-04-24T10:00:00.000Z"
     }
   ],
diff --git a/src/memory/atomicmemory-provider/__tests__/namespace-base-routes.test.ts b/src/memory/atomicmemory-provider/__tests__/namespace-base-routes.test.ts
index 33d6941..14afa45 100644
--- a/src/memory/atomicmemory-provider/__tests__/namespace-base-routes.test.ts
+++ b/src/memory/atomicmemory-provider/__tests__/namespace-base-routes.test.ts
@@ -127,6 +127,20 @@ describe('atomicmemory.ingestFull', () => {
     expect(body.agent_scope).toBeUndefined();
   });
 
+  it('forwards thread scope as session_id on ingest', async () => {
+    mockFetch.mockResolvedValueOnce(
+      jsonResponse({ episode_id:'e1', facts_extracted:0, memories_stored:0, memories_updated:0, memories_deleted:0, memories_skipped:0, stored_memory_ids: [], updated_memory_ids: [], links_created:0, composites_created:0 }),
+    );
+    const handle = createHandle();
+
+    await handle.ingestFull(
+      { conversation: 'x', sourceSite: 's' },
+      { ...USER_SCOPE, thread: 'thread-1' },
+    );
+
+    expect(capturedCall(mockFetch).body?.session_id).toBe('thread-1');
+  });
+
   it('forwards visibility on workspace scope', async () => {
     mockFetch.mockResolvedValueOnce(
       jsonResponse({ episode_id:'e1', facts_extracted:0, memories_stored:0, memories_updated:0, memories_deleted:0, memories_skipped:0, stored_memory_ids: [], updated_memory_ids: [], links_created:0, composites_created:0 }),
@@ -232,6 +246,45 @@ describe('atomicmemory.search', () => {
     expect(result.citations).toEqual(['m1', 'm2']);
     expect(result.observability).toBeDefined();
   });
+
+  it('forwards thread scope and maps returned session_id', async () => {
+    mockFetch.mockResolvedValueOnce(
+      jsonResponse({
+        count: 1,
+        retrieval_mode: 'flat',
+        memories: [{ id: 'm1', content: 'a', session_id: 'thread-1' }],
+      }),
+    );
+
+    const handle = createHandle();
+    const result = await handle.search(
+      { query: 'q' },
+      { ...USER_SCOPE, thread: 'thread-1' },
+    );
+
+    const call = capturedCall(mockFetch);
+    expect(call.body?.session_id).toBe('thread-1');
+    expect(result.results[0].memory.scope).toEqual({
+      ...USER_SCOPE,
+      thread: 'thread-1',
+    });
+  });
+
+  it('rejects thread-scoped rows without matching session_id', async () => {
+    mockFetch.mockResolvedValueOnce(
+      jsonResponse({
+        count: 1,
+        retrieval_mode: 'flat',
+        memories: [{ id: 'm1', content: 'a' }],
+      }),
+    );
+
+    const handle = createHandle();
+    await expect(
+      handle.search({ query: 'q' }, { ...USER_SCOPE, thread: 'thread-1' }),
+    ).rejects.toThrow(/session_id/);
+  });
+
 });
 
 describe('atomicmemory.searchFast', () => {
@@ -299,6 +352,33 @@ describe('atomicmemory.list', () => {
     expect(call.url).toContain('workspace_id=ws1');
     expect(call.url).toContain('agent_id=a1');
   });
+
+  it('forwards thread scope and maps returned session_id', async () => {
+    mockFetch.mockResolvedValueOnce(
+      jsonResponse({
+        memories: [{ id: 'm1', content: 'a', session_id: 'thread-1' }],
+        count: 1,
+      }),
+    );
+    const handle = createHandle();
+    const page = await handle.list({ ...WORKSPACE_SCOPE, thread: 'thread-1' });
+    const call = capturedCall(mockFetch);
+    expect(call.url).toContain('session_id=thread-1');
+    expect(page.memories[0].scope).toEqual({
+      ...WORKSPACE_SCOPE,
+      thread: 'thread-1',
+    });
+  });
+
+  it('rejects thread-scoped list rows without matching session_id', async () => {
+    mockFetch.mockResolvedValueOnce(
+      jsonResponse({ memories: [{ id: 'm1', content: 'a' }], count: 1 }),
+    );
+    const handle = createHandle();
+    await expect(
+      handle.list({ ...USER_SCOPE, thread: 'thread-1' }),
+    ).rejects.toThrow(/session_id/);
+  });
 });
 
 describe('atomicmemory.get', () => {
diff --git a/src/memory/atomicmemory-provider/atomicmemory-provider.ts b/src/memory/atomicmemory-provider/atomicmemory-provider.ts
index 7ebf1ab..79ce10b 100644
--- a/src/memory/atomicmemory-provider/atomicmemory-provider.ts
+++ b/src/memory/atomicmemory-provider/atomicmemory-provider.ts
@@ -48,6 +48,10 @@ import {
 } from './mappers';
 import type { AtomicMemoryHandle } from './handle';
 import { createAtomicMemoryHandle } from './handle-impl';
+import {
+  filterMetaFacts,
+  type MetaFactFilterConfig,
+} from '../meta-fact-filter';
 
 export class AtomicMemoryProvider
   extends BaseMemoryProvider
@@ -60,6 +64,12 @@ export class AtomicMemoryProvider
    * Empty string disables prefixing (legacy deployments only).
    */
   private readonly apiPrefix: string;
+  /**
+   * Opt-in post-retrieval meta-fact filter. `undefined` (default) means
+   * filtering is off. See `MetaFactFilterConfig` and
+   * `benchmarks/alignbench/RESULTS.md` for motivation.
+   */
+  private readonly metaFactFilter?: MetaFactFilterConfig;
 
   constructor(config: AtomicMemoryProviderConfig) {
     super();
@@ -71,6 +81,24 @@ export class AtomicMemoryProvider
     this.apiPrefix = normalizeApiVersion(
       config.apiVersion ?? DEFAULT_API_VERSION,
     );
+    this.metaFactFilter = config.metaFactFilter;
+  }
+
+  /**
+   * Drop meta-fact entries from a SearchResult list when the filter is enabled.
+   *
+   * Called once per search-style endpoint (regular search, temporal search,
+   * package) so meta-facts never reach the caller. No-op when
+   * `this.metaFactFilter` is `undefined` or `enabled: false` — matches the
+   * pre-filter behaviour byte-for-byte.
+   */
+  private applyMetaFactFilter(results: SearchResult[]): SearchResult[] {
+    if (!this.metaFactFilter || !this.metaFactFilter.enabled) return results;
+    return filterMetaFacts(
+      results,
+      (r) => r.memory.content,
+      this.metaFactFilter,
+    );
   }
 
   /** Prepend the configured API-version prefix to a route path. */
@@ -91,6 +119,7 @@ export class AtomicMemoryProvider
       source_site: input.provenance?.source ?? 'sdk',
       source_url: input.provenance?.sourceUrl ?? '',
     };
+    if (input.scope.thread) body.session_id = input.scope.thread;
     if (isVerbatim) body.skip_extraction = true;
     // Forward caller-supplied metadata to the wire ONLY on the
     // verbatim path. Core honors `metadata` only on
@@ -140,6 +169,7 @@ export class AtomicMemoryProvider
       limit: request.limit,
       threshold: request.threshold,
       namespace_scope: request.scope.namespace,
+      session_id: request.scope.thread,
     };
 
     const raw = await fetchJson<{ memories: any[]; count: number }>(
@@ -149,8 +179,8 @@ export class AtomicMemoryProvider
     );
 
     return {
-      results: raw.memories.map((m: any) =>
-        toSearchResult(m, request.scope)
+      results: this.applyMetaFactFilter(
+        raw.memories.map((m: any) => toSearchResult(m, request.scope)),
       ),
     };
   }
@@ -185,7 +215,7 @@ export class AtomicMemoryProvider
       count: number;
     }>(
       this.http,
-      this.route(`/memories/list?user_id=${encodeURIComponent(request.scope.user ?? '')}&limit=${limit}&offset=${offset}`)
+      this.route(buildListPath(request.scope, limit, offset))
     );
 
     const nextOffset = offset + raw.memories.length;
@@ -294,6 +324,7 @@ export class AtomicMemoryProvider
         limit: request.limit,
         threshold: request.threshold,
         namespace_scope: request.scope.namespace,
+        session_id: request.scope.thread,
         retrieval_mode: mapPackageFormat(request.format),
         token_budget: request.tokenBudget,
         skip_repair: true,
@@ -315,8 +346,8 @@ export class AtomicMemoryProvider
         );
       }
 
-      const results: SearchResult[] = raw.memories.map((m: any) =>
-        toSearchResult(m, request.scope)
+      const results: SearchResult[] = this.applyMetaFactFilter(
+        raw.memories.map((m: any) => toSearchResult(m, request.scope)),
       );
 
       return {
@@ -342,6 +373,7 @@ export class AtomicMemoryProvider
           threshold: request.threshold,
           as_of: request.asOf.toISOString(),
           namespace_scope: request.scope.namespace,
+          session_id: request.scope.thread,
         };
 
         const raw = await fetchJson<{
@@ -352,8 +384,8 @@ export class AtomicMemoryProvider
         });
 
         return {
-          results: raw.memories.map((m: any) =>
-            toSearchResult(m, request.scope)
+          results: this.applyMetaFactFilter(
+            raw.memories.map((m: any) => toSearchResult(m, request.scope)),
           ),
         };
       }
@@ -396,6 +428,16 @@ export class AtomicMemoryProvider
 // Helpers
 // ---------------------------------------------------------------------------
 
+// Path + query string for the list route; `session_id` only when thread is set.
+function buildListPath(scope: Scope, limit: number, offset: number): string {
+  const query = new URLSearchParams();
+  query.set('user_id', scope.user ?? '');
+  query.set('limit', String(limit));
+  query.set('offset', String(offset));
+  if (scope.thread) query.set('session_id', scope.thread);
+  return '/memories/list?' + query.toString();
+}
+
 function ingestInputToConversation(input: IngestInput): string {
   switch (input.mode) {
     case 'text':
diff --git a/src/memory/atomicmemory-provider/handle-impl.ts b/src/memory/atomicmemory-provider/handle-impl.ts
index 0651b1b..71e2604 100644
--- a/src/memory/atomicmemory-provider/handle-impl.ts
+++ b/src/memory/atomicmemory-provider/handle-impl.ts
@@ -72,6 +72,7 @@ import {
   scopeToFields,
   scopeToQueryParams,
   stripAgentScope,
+  stripReadFilters,
 } from './scope-mapper';
 
 export function createAtomicMemoryHandle(
@@ -114,7 +115,7 @@ export function createAtomicMemoryHandle(
       );
       // Echo back the scope WITHOUT agentScope: core didn't apply that
       // filter on expand, so returned memories must not claim otherwise.
-      const echoedScope = stripAgentScope(scope);
+      const echoedScope = stripReadFilters(scope);
       return raw.memories.map((m) => toAtomicMemoryMemory(m, echoedScope));
     },
     async list(scope, options) {
@@ -126,7 +127,7 @@ export function createAtomicMemoryHandle(
       // SDK so the mismatch surfaces at the call site.
       assertListOptionsScopeCompat(scope, options);
 
-      const params = scopeToQueryParams(scope);
+      const params = scopeToQueryParams(scope, { includeThread: true });
       if (options?.limit !== undefined) params.set('limit', String(options.limit));
       if (options?.offset !== undefined) params.set('offset', String(options.offset));
       if (options?.sourceSite) params.set('source_site', options.sourceSite);
@@ -150,19 +151,21 @@ export function createAtomicMemoryHandle(
       };
     },
     async get(id, scope) {
-      // agent_scope deliberately omitted — core's /:id GET drops it.
-      const params = scopeToQueryParams(scope);
+      // agent_scope/thread deliberately omitted — core's /:id GET is id-keyed
+      // and does not apply those read filters. The returned scope reflects the
+      // persisted row, not the caller's unapplied filter.
+      const params = scopeToQueryParams(stripReadFilters(scope));
       const raw = await fetchJsonOrNull<unknown>(
         http,
         route(`/memories/${encodeURIComponent(id)}?${params.toString()}`),
       );
       if (!raw) return null;
-      // Echoed scope drops agentScope — see expand() note above.
-      return toAtomicMemoryMemory(raw, stripAgentScope(scope));
+      // Echoed scope drops unapplied filters — see expand() note above.
+      return toAtomicMemoryMemory(raw, stripReadFilters(scope));
     },
     async delete(id, scope) {
-      // agent_scope deliberately omitted — core's /:id DELETE drops it.
-      const params = scopeToQueryParams(scope);
+      // agent_scope/thread deliberately omitted — core's /:id DELETE is id-keyed.
+      const params = scopeToQueryParams(stripReadFilters(scope));
       try {
         await fetchVoid(
           http,
@@ -229,7 +232,7 @@ async function postIngest(
   assertScopeAllowsVisibility(scope, input.visibility);
 
   const body: Record<string, unknown> = {
-    ...scopeToFields(scope),
+    ...scopeToFields(scope, { includeThread: true }),
     conversation: input.conversation,
     source_site: input.sourceSite,
     source_url: input.sourceUrl ?? '',
@@ -290,7 +293,10 @@ async function postSearch(
   scope: MemoryScope,
 ): Promise<AtomicMemorySearchResultPage> {
   // agent_scope is honored ONLY on search routes — opt in here.
-  const scopeFields = scopeToFields(scope, { includeAgentScope: true });
+  const scopeFields = scopeToFields(scope, {
+    includeAgentScope: true,
+    includeThread: true,
+  });
   const body: Record<string, unknown> = {
     ...scopeFields,
     query: request.query,
@@ -333,6 +339,7 @@ interface RawMemoryResponse {
   created_at?: string;
   updated_at?: string;
   metadata?: Record<string, unknown>;
+  session_id?: string | null;
 }
 
 interface RawSearchResponse {
@@ -381,7 +388,7 @@ function toAtomicMemoryMemory(
   const result: AtomicMemoryMemory = {
     id: r.id,
     content: r.content ?? '',
-    scope,
+    scope: buildMemoryScope(r, scope),
     createdAt: r.created_at ? new Date(r.created_at) : new Date(),
   };
   if (r.updated_at) result.updatedAt = new Date(r.updated_at);
@@ -394,6 +401,26 @@ function toAtomicMemoryMemory(
   return result;
 }
 
+// Echoed scope for a backend row. Thread is checked only when non-empty:
+// `scopeToFields` omits empty threads, so no `session_id` filter was sent.
+function buildMemoryScope(
+  raw: RawMemoryResponse,
+  requestedScope: MemoryScope,
+): MemoryScope {
+  const sessionId = raw.session_id;
+  if (requestedScope.thread && !sessionId) {
+    throw new Error(
+      'atomicmemory-provider: backend response missing required `session_id` for thread-scoped request',
+    );
+  }
+  if (requestedScope.thread && sessionId !== requestedScope.thread) {
+    throw new Error(
+      'atomicmemory-provider: backend response `session_id` did not match requested thread scope',
+    );
+  }
+  return sessionId ? { ...requestedScope, thread: sessionId } : requestedScope;
+}
+
 function toAtomicMemorySearchResult(
   raw: RawMemoryResponse,
   scope: MemoryScope,
diff --git a/src/memory/atomicmemory-provider/handle.ts b/src/memory/atomicmemory-provider/handle.ts
index 43a9f6d..82b8059 100644
--- a/src/memory/atomicmemory-provider/handle.ts
+++ b/src/memory/atomicmemory-provider/handle.ts
@@ -49,12 +49,13 @@ export type AgentScope =
  * at `atomicmemory-core/src/services/memory-service-types.ts:142-144`.
  */
 export type MemoryScope =
-  | { kind: 'user'; userId: string }
+  | { kind: 'user'; userId: string; thread?: string }
   | {
       kind: 'workspace';
       userId: string;
       workspaceId: string;
       agentId: string;
+      thread?: string;
       agentScope?: AgentScope;
     };
 
diff --git a/src/memory/atomicmemory-provider/mappers.ts b/src/memory/atomicmemory-provider/mappers.ts
index 00d6c1b..e7d8689 100644
--- a/src/memory/atomicmemory-provider/mappers.ts
+++ b/src/memory/atomicmemory-provider/mappers.ts
@@ -30,6 +30,8 @@ interface RawMemory {
   source_url?: string;
   /** Present on list responses; dropped from search responses today. */
   episode_id?: string;
+  namespace?: string;
+  session_id?: string | null;
   created_at?: string;
 }
 
@@ -72,13 +74,38 @@ export function toMemory(raw: RawMemory, scope: Scope): Memory {
   return {
     id: raw.id,
     content: raw.content,
-    scope,
+    scope: buildScope(raw, scope),
     createdAt: raw.created_at ? new Date(raw.created_at) : new Date(),
     provenance: buildProvenance(raw),
     metadata: buildMetadata(raw),
   };
 }
 
+// Echoed scope: the request scope overlaid with the row's own namespace and
+// session. Empty threads are not validated; ingest/list never send them.
+function buildScope(raw: RawMemory, scope: Scope): Scope {
+  if (scope.namespace !== undefined && raw.namespace && raw.namespace !== scope.namespace) {
+    throw new Error(
+      'atomicmemory-provider: backend response `namespace` did not match requested namespace scope',
+    );
+  }
+  if (scope.thread && !raw.session_id) {
+    throw new Error(
+      'atomicmemory-provider: backend response missing required `session_id` for thread-scoped request',
+    );
+  }
+  if (scope.thread && raw.session_id !== scope.thread) {
+    throw new Error(
+      'atomicmemory-provider: backend response `session_id` did not match requested thread scope',
+    );
+  }
+  return {
+    ...scope,
+    ...(raw.namespace ? { namespace: raw.namespace } : {}),
+    ...(raw.session_id ? { thread: raw.session_id } : {}),
+  };
+}
+
 /**
  * Both `source_site` and `source_url` are SDK-side `provenance`
  * fields. Returns `undefined` when neither is present so we don't
diff --git a/src/memory/atomicmemory-provider/scope-mapper.ts b/src/memory/atomicmemory-provider/scope-mapper.ts
index 77d11b3..42778ef 100644
--- a/src/memory/atomicmemory-provider/scope-mapper.ts
+++ b/src/memory/atomicmemory-provider/scope-mapper.ts
@@ -23,6 +23,7 @@ interface ScopeFields {
   workspace_id?: string;
   agent_id?: string;
   agent_scope?: AgentScope;
+  session_id?: string;
 }
 
 interface ScopeSerializeOptions {
@@ -36,6 +37,12 @@ interface ScopeSerializeOptions {
    * Defaults to `false`. Search route bindings opt in explicitly.
    */
   includeAgentScope?: boolean;
+  /**
+   * Emit `session_id` on the wire. Core honors this on ingest, search, and
+   * list. Routes such as get/delete/expand do not filter by session, so they
+   * must not send or echo it.
+   */
+  includeThread?: boolean;
 }
 
 export function scopeToFields(
@@ -43,7 +50,11 @@ export function scopeToFields(
   options: ScopeSerializeOptions = {},
 ): ScopeFields {
   if (scope.kind === 'user') {
-    return { user_id: scope.userId };
+    const fields: ScopeFields = { user_id: scope.userId };
+    if (options.includeThread && scope.thread) {
+      fields.session_id = scope.thread;
+    }
+    return fields;
   }
   const fields: ScopeFields = {
     user_id: scope.userId,
@@ -53,6 +64,9 @@ export function scopeToFields(
   if (options.includeAgentScope && scope.agentScope !== undefined) {
     fields.agent_scope = scope.agentScope;
   }
+  if (options.includeThread && scope.thread) {
+    fields.session_id = scope.thread;
+  }
   return fields;
 }
 
@@ -83,6 +97,7 @@ export function scopeToQueryParams(
       params.set('agent_scope', fields.agent_scope);
     }
   }
+  if (fields.session_id) params.set('session_id', fields.session_id);
   return params;
 }
 
@@ -107,7 +122,7 @@ export function assertScopeAllowsVisibility(
 
 /**
  * Strip `agentScope` from a `MemoryScope` for routes that do NOT honor
- * agent_scope on the backend (expand / list / get / delete). Used to
+ * agent_scope on the backend. Used to
  * echo scope back on returned memories honestly — so a caller who
  * passed `{ agentScope: 'self' }` does not receive memories whose
  * `.scope.agentScope` field implies the filter was applied when it
@@ -122,6 +137,24 @@ export function stripAgentScope(scope: MemoryScope): MemoryScope {
     userId: scope.userId,
     workspaceId: scope.workspaceId,
     agentId: scope.agentId,
+    ...(scope.thread !== undefined ? { thread: scope.thread } : {}),
   };
   return stripped;
 }
+
+/**
+ * Reduce a scope to its identity fields — `kind` and `userId`, plus
+ * `workspaceId` and `agentId` for workspace scopes — discarding `thread` and
+ * `agentScope`. Intended for echoing scope on memories returned by routes
+ * that did not apply those filters (expand / get / delete).
+ */
+export function stripReadFilters(scope: MemoryScope): MemoryScope {
+  switch (scope.kind) {
+    case 'user':
+      return { kind: scope.kind, userId: scope.userId };
+    default: {
+      const { kind, userId, workspaceId, agentId } = scope;
+      return { kind, userId, workspaceId, agentId };
+    }
+  }
+}
diff --git a/src/memory/atomicmemory-provider/types.ts b/src/memory/atomicmemory-provider/types.ts
index c31a2ae..2e66254 100644
--- a/src/memory/atomicmemory-provider/types.ts
+++ b/src/memory/atomicmemory-provider/types.ts
@@ -2,6 +2,8 @@
  * @file AtomicMemory Provider Configuration
  */
 
+import type { MetaFactFilterConfig } from '../meta-fact-filter';
+
 export interface AtomicMemoryProviderConfig {
   /** Base URL of the atomicmemory-core instance, e.g. `http://localhost:3050`. */
   apiUrl: string;
@@ -23,6 +25,22 @@ export interface AtomicMemoryProviderConfig {
    * that never versioned their mount).
    */
   apiVersion?: string;
+  /**
+   * Opt-in post-retrieval filter that drops extraction-style meta-facts
+   * (e.g. "The user asked for the user's name.", "As of <date>, X is a term
+   * mentioned in the conversation.") before they reach the caller.
+   *
+   * Empirically motivated by `benchmarks/alignbench/RESULTS.md`: meta-facts
+   * are the dominant cause of partner-visible recall failures, outranking
+   * real user facts at thin cosine margins. Filtering them post-hoc gives
+   * cleaner search results today while a durable upstream extraction-prompt
+   * fix rolls out in core.
+   *
+   * When omitted, the filter is OFF and behaviour is unchanged. Set
+   * `{ enabled: true }` to activate with the built-in pattern set. Custom
+   * `patterns` replace the built-ins unless `mode: 'extend'` is also set.
+   */
+  metaFactFilter?: MetaFactFilterConfig;
 }
 
 /** Default timeout for AtomicMemory provider HTTP requests (ms). */
diff --git a/src/memory/index.ts b/src/memory/index.ts
index c494f85..effc382 100644
--- a/src/memory/index.ts
+++ b/src/memory/index.ts
@@ -14,3 +14,10 @@ export * from './registration';
 export * from './atomicmemory-provider';
 export * from './mem0-provider';
 export * from './hindsight-provider';
+export {
+  filterMetaFacts,
+  isMetaFact,
+  resolveMetaFactPatterns,
+  DEFAULT_META_FACT_PATTERNS,
+  type MetaFactFilterConfig,
+} from './meta-fact-filter';
diff --git a/src/memory/meta-fact-filter.ts b/src/memory/meta-fact-filter.ts
new file mode 100644
index 0000000..b62948e
--- /dev/null
+++ b/src/memory/meta-fact-filter.ts
@@ -0,0 +1,164 @@
+/**
+ * @file MetaFactFilter
+ *
+ * Post-retrieval filter that drops "meta-facts" — extraction artifacts that
+ * describe the conversation itself rather than recording a durable fact about
+ * the user.
+ *
+ * Empirically motivated by AlignBench v0 (benchmarks/alignbench/RESULTS.md):
+ * when extraction-style meta-facts ("The user asked for the user's name.",
+ * "As of <date>, X is a term mentioned in the conversation.") sit in the
+ * recall pool alongside real user facts, they often outrank the real fact
+ * for pronoun and temporal queries — at thin cosine margins (~0.05). The
+ * pre-registered "fix the query side" hypothesis was falsified; the dominant
+ * fixable lift came from removing meta-facts from the pool.
+ *
+ * Long-term, core should not emit these facts at extraction time. This
+ * SDK-side filter is the safety net so apps consuming the SDK today see
+ * cleaner recall results without waiting on a core release.
+ *
+ * Default patterns target the verbatim shapes observed in the partner demo
+ * (atomicmem.filecoin.cloud). Apps can extend or replace them via
+ * `MetaFactFilterConfig.patterns`.
+ *
+ * This filter is intentionally:
+ *   - pure (no I/O, no LLM calls — deterministic regex application);
+ *   - opt-in (off unless explicitly enabled in provider config);
+ *   - case-insensitive by default (custom patterns keep their own flags);
+ *   - extensible (`mode: 'extend'` adds patterns on top of the defaults).
+ */
+
+/**
+ * Built-in patterns observed in real partner demos. Each is a case-insensitive
+ * regex; a match on the memory's content drops it from the result set.
+ * Whitespace before or after the content never prevents a match.
+ *
+ * Patterns capture the three meta-fact families that AlignBench's distractor
+ * pool was built from:
+ *   1. "The user asked/requested/said …" — meta-facts about user actions in
+ *      the conversation, not about the user.
+ *   2. "As of <date>, X is a term mentioned in the conversation." — vacuous
+ *      acknowledgements of vocabulary, not durable facts.
+ *   3. "A name was mentioned." / "The conversation involves the user." —
+ *      observations about the chat session, not about the user.
+ */
+export const DEFAULT_META_FACT_PATTERNS: readonly RegExp[] = Object.freeze([
+  /^\s*the user (asked|requested|said|is asking|is me)\b/i,
+  /^\s*as of [^,]+,\s+.+\s+is a term mentioned in the conversation\.?\s*$/i,
+  /^\s*a name was mentioned\b/i,
+  /^\s*the conversation involves the user\b/i,
+  /^\s*the user has started a conversation\b/i,
+]);
+
+export interface MetaFactFilterConfig {
+  /**
+   * Master switch. When `false`, `filterMetaFacts` drops nothing and every
+   * result passes through.
+   *
+   * Apps explicitly opt in by setting `true`. We do not infer this from
+   * environment variables in the SDK to keep behaviour deterministic across
+   * Node / browser / Workers runtimes.
+   */
+  enabled: boolean;
+
+  /**
+   * Patterns to match against `memory.content`. When omitted, the built-in
+   * `DEFAULT_META_FACT_PATTERNS` are used and `mode` has no effect.
+   *
+   * When set, only these patterns apply unless `mode` is `'extend'`, in which
+   * case they are tried first, followed by the built-in defaults. Each regex
+   * keeps its own flags; nothing forces case-insensitivity on custom ones.
+   */
+  patterns?: readonly RegExp[];
+
+  /**
+   * How `patterns` interacts with `DEFAULT_META_FACT_PATTERNS`. Defaults to
+   * `'replace'` (the provided list fully replaces defaults). `'extend'` is
+   * the union — useful when an app wants to add its own meta-fact shapes
+   * without losing the SDK's baseline coverage.
+   */
+  mode?: 'replace' | 'extend';
+
+  /**
+   * Optional callback invoked once per dropped result, e.g. for telemetry or
+   * tests. Receives the content and the index of the first matching pattern in
+   * the resolved list. Exceptions it throws are swallowed to protect recall.
+   */
+  onDrop?: (content: string, patternIndex: number) => void;
+}
+
+/**
+ * Compute the pattern list a config actually applies.
+ *
+ * No `patterns` → the built-in defaults. With `patterns`, `mode: 'extend'`
+ * yields the caller's patterns followed by the defaults; any other mode
+ * (including unset) yields the caller's patterns alone. Has no side effects.
+ */
+export function resolveMetaFactPatterns(
+  config: MetaFactFilterConfig,
+): readonly RegExp[] {
+  const { patterns: custom, mode } = config;
+  if (!custom) return DEFAULT_META_FACT_PATTERNS;
+  return mode === 'extend'
+    ? [...custom, ...DEFAULT_META_FACT_PATTERNS]
+    : custom;
+}
+
+/**
+ * Return `true` when `content` is a non-empty string matching any `patterns`.
+ *
+ * Non-string and empty input return `false`. `lastIndex` is rewound on `g` /
+ * `y` patterns first, so repeated calls with one regex give the same answer.
+ */
+export function isMetaFact(
+  content: unknown,
+  patterns: readonly RegExp[] = DEFAULT_META_FACT_PATTERNS,
+): boolean {
+  if (typeof content !== 'string' || content.length === 0) return false;
+  return patterns.some((p) => {
+    if (p.global || p.sticky) p.lastIndex = 0;
+    return p.test(content);
+  });
+}
+
+/**
+ * Return the items whose `getContent(item)` matches none of the active
+ * meta-fact patterns, as a new array in the original order.
+ *
+ * Non-string or empty content is never dropped. `lastIndex` is reset before
+ * each test so caller-supplied `g` / `y` (stateful) patterns match every item
+ * deterministically instead of alternating between hit and miss.
+ *
+ * @param getContent - Extracts the text to test from an item.
+ * @param config - Filter settings; when `enabled` is false nothing is dropped.
+ */
+export function filterMetaFacts<T>(
+  items: readonly T[],
+  getContent: (item: T) => unknown,
+  config: MetaFactFilterConfig,
+): T[] {
+  if (!config.enabled) return [...items];
+  const patterns = resolveMetaFactPatterns(config);
+  if (patterns.length === 0) return [...items];
+  const kept: T[] = [];
+  for (const item of items) {
+    const content = getContent(item);
+    const matchedIndex =
+      typeof content === 'string' && content.length > 0
+        ? patterns.findIndex((p) => {
+            if (p.global || p.sticky) p.lastIndex = 0;
+            return p.test(content);
+          })
+        : -1;
+    if (matchedIndex < 0) {
+      kept.push(item);
+      continue;
+    }
+    try {
+      config.onDrop?.(content as string, matchedIndex);
+    } catch {
+      // Swallow — a telemetry hook must never break recall.
+    }
+  }
+  return kept;
+}