diff --git a/.fallowrc.json b/.fallowrc.json index 6de2f33..0be5b80 100644 --- a/.fallowrc.json +++ b/.fallowrc.json @@ -13,7 +13,8 @@ ], "publicPackages": ["@atomicmemory/atomicmemory-sdk"], "ignorePatterns": [ - "**/one-offs/**" + "**/one-offs/**", + "benchmarks/**" ], "rules": { "unused-class-members": "off", @@ -30,6 +31,7 @@ "tests/**", "scripts/**", "examples/**", + "benchmarks/**", "src/embedding/wasm-semantic-processor.ts" ] }, @@ -45,7 +47,8 @@ "**/*.spec.tsx", "tests/**", "scripts/**", - "examples/**" + "examples/**", + "benchmarks/**" ] }, "regression": { diff --git a/.gitignore b/.gitignore index dc230a1..ee45304 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,6 @@ pnpm-debug.log* # Internal tech-debt notes — never commit. tech-debt.md + +# Superpowers skill plugin output — agent-generated specs/plans, internal-only. +docs/superpowers/ diff --git a/benchmarks/alignbench/PR-DESCRIPTION.md b/benchmarks/alignbench/PR-DESCRIPTION.md new file mode 100644 index 0000000..bd311b6 --- /dev/null +++ b/benchmarks/alignbench/PR-DESCRIPTION.md @@ -0,0 +1,95 @@ +# AlignBench v0 — controlled recall benchmark + falsified pronoun-rewrite fix + +Adds `benchmarks/alignbench/` to the SDK: a 60-query / 55-fact controlled +benchmark for embedding-based recall, with a runner that ablates four +candidate fixes against the current Xenova/all-MiniLM-L6-v2 default. + +## Why + +Three observed failure modes share one signature: + +1. **Partner demo** (atomicmem.filecoin.cloud): "what is my name?" returns no + recall; "what is the user's name?" returns the same fact at cosine 0.51. +2. **LMME-S full n=500** (sprint 5): 31% of failures were "I don't have info" + refusals when the answer text was in the haystack. +3. **BEAM Knowledge-Update**: retrieval pulls the keyword-matching chunk + instead of the freshest one. + +Each was filed as a benchmark-specific quirk. AlignBench tests whether +they're one phenomenon — and which fix actually closes the gap. 
+ +## Pre-registered hypothesis (and outcome) + +Before running, I committed in writing: + +> If query-side pronoun rewriting (my → the user's) doesn't lift r@5 by ≥0.25 +> over baseline, the pronoun hypothesis is wrong and we look at extraction +> quality instead. + +Result: query-rewrite r@5 lift = **0.000** (0.933 vs 0.933 baseline). +**Hypothesis falsified.** The diagnostic story I posted earlier — "fix it in +the SDK recall path with a pronoun rewrite" — does not survive contact with a +controlled benchmark. + +This is exactly what pre-registration is for. + +## What actually wins + +| Variant | r@1 | r@5 | distractor_top1 | fp@control | +|---|---:|---:|---:|---:| +| baseline (current SDK) | 0.733 | 0.933 | 0.067 | 0.000 | +| **baseline, clean pool (no extraction meta-facts)** | **0.767** | **0.950** | 0.000 | 0.000 | +| query-rewrite | 0.733 | 0.933 | 0.083 (worse) | 0.000 | +| dual-storage | 0.783 | 0.933 | 0.067 | 0.000 | +| hybrid BM25 + semantic | 0.617 | 0.917 | 0.067 | **1.000** ← broken | +| combined (rewrite + BM25) | 0.650 | 0.933 | 0.083 | 1.000 | + +The dominant fixable lift is **upstream of retrieval** — stopping the extractor +from emitting meta-facts like `The user asked for the user's name.` and +`As of , X is a term mentioned in the conversation.`. Those poison the +embedding neighborhood for every adjacent query. + +## What this PR contains + +- `benchmarks/alignbench/items.json` — 55 facts, 60 scored queries, 10 + controls, across 4 variation axes (pronoun, temporal, specificity, + negation) plus an extraction-style distractor pool observed in the partner + demo. +- `benchmarks/alignbench/run.mjs` — standalone Node runner using + `@huggingface/transformers` (same model as SDK). No Postgres, no network, + no SDK dependencies. Each variant produces a directly-comparable run JSON. +- `benchmarks/alignbench/runs/*.json` — all 5 variant runs committed for + diff-ability. 
+- `benchmarks/alignbench/RESULTS.md` — full per-axis breakdown, ablation + table, per-item failure analysis on the temporal axis, recommendations. +- `benchmarks/alignbench/README.md` — what it is, how to read it, what's out + of scope. + +## What this PR does NOT contain (deliberately) + +No SDK code change. Two reasons: + +1. The pre-registered hypothesis was falsified, so the proposed fix (query + rewrite) doesn't earn a code change. +2. The actual leverage is in core's extraction prompt and the temporal-state + layer, neither of which is owned by this PR. Follow-up issues filed for + both. + +## Recommendations (filed as follow-up issues) + +| # | Where | What | Priority | +|---|---|---|---| +| 1 | core | Filter meta-facts at extraction time (drop `The user (asked\|is\|requested\|said).*` etc.) | high — biggest single lift | +| 2 | SDK | Expose `EXTRACTION_PROMPT` as a configurable surface (Ethan flagged Slack-side) | high — enables (1) for design partners | +| 3 | core/SDK | Wire core's temporal-state layer (`temporal-classifier`, `temporal-rerank`) into SDK retrieval path for time-anchored queries | medium — only fix that addresses the temporal-axis structural gap | +| 4 | SDK | Opt-in `RECALL_DUAL_STORAGE=true` for first-person-heavy workloads | low — +0.05 r@1 but 2× store size | +| 5 | — | Skip BM25 hybrid unless we ship a control-set-aware weight schedule | not recommended in this form | + +## Honest limits + +- n=60 is small. Treat ±0.05 r@1 differences as within-noise. +- Distractor pool is hand-curated from observed SDK output. A pool sampled + from the live partner Postgres would be the gold version. +- Single embedding model tested in default. The mpnet ablation is one data + point, not a sweep. +- AlignBench is a diagnostic instrument, not a leaderboard. 
diff --git a/benchmarks/alignbench/README.md b/benchmarks/alignbench/README.md new file mode 100644 index 0000000..b278d93 --- /dev/null +++ b/benchmarks/alignbench/README.md @@ -0,0 +1,78 @@ +# AlignBench + +A small, focused benchmark that exercises one failure mode in agentic-memory +recall: the alignment gap between **stored fact phrasing** and **query +phrasing**. + +## Why + +Several observed failures share the same signature: + +1. SDK partner demo: "what is my name?" returns no recall, but + "what is the user's name?" returns the same fact at cosine 0.51. +2. LongMemEval-S full n=500: 31% of failures are "I don't have info" refusals + when the answer text is in the haystack. +3. BEAM Knowledge-Update regressions: model picks an older value because + retrieval brings in keyword-matching chunks rather than the freshest one. + +These manifestations share one root: **embedding-and-threshold retrieval +silently returns empty when query phrasing diverges from stored phrasing**, +rather than degrading gracefully. + +AlignBench isolates this in a controlled set (60 scored queries plus 10 controls over a 55-fact pool) so we can: +- Quantify the gap on the default SDK embedding stack +- Ablate three independent fixes (query rewrite / dual-storage / hybrid BM25) +- Pick the dominant variant and regression-test against committed LoCoMo10 and + BEAM-1M numbers before shipping. + +## Items + +`items.json` — test cases grouped by axis; each item points at its gold fact via `fact_index` into that axis's `facts` array. Flattened, each case looks like: + +```json +{ + "id": "pronoun-001", + "axis": "pronoun", // pronoun | temporal | specificity | negation | control + "fact": "The user's name is Alex.", + "query": "what is my name?", + "gold_in_topk": true, // expected presence in top-K + "gold_answer": "Alex" // for downstream LLM correctness +} +``` + +Facts are **shared across queries within an axis** — each query searches the +full fact pool, not just its own gold fact. That mimics real recall behavior.
+ +## Variation axes + +| Axis | What it varies | Why it matters | +|---|---|---| +| pronoun | `my X` vs `the user's X` vs `X of <name>` | Tests bi-encoder pronoun alignment (dominant SDK failure) | +| temporal | `live in Y` vs `lived in Y` vs `as of 2026, live in Y` | Tests knowledge-update / temporal-anchor handling | +| specificity | `my dog Apollo` vs `my dog` vs `my pet` | Tests generic-vs-specific retrieval | +| negation | `I don't drink coffee` vs `I drink tea, not coffee` | Tests embedding sensitivity to polarity | +| control | unrelated facts/queries | False-positive floor (top-K shouldn't surface these) | + +## Metrics + +Per run: +- **recall@1** — gold fact ranked first +- **recall@5** — gold fact in top-5 +- **per-axis recall@5** — diagnostic +- **false-positive@5** — unrelated controls leaking into top-K +- **mean rank** of gold (lower is better) +- **median similarity** of gold vs distractors + +## Runs + +- `runs/baseline.json` — current SDK recall pipeline +- `runs/query-rewrite.json` — query-side pronoun rewrite +- `runs/dual-storage.json` — both phrasings stored +- `runs/hybrid-bm25.json` — BM25 + semantic union +- `runs/combined.json` — query-rewrite and hybrid BM25 stacked +## Falsification + +Pre-registered: if query-rewrite alone doesn't lift recall@5 by ≥0.25 over +baseline, the pronoun hypothesis is wrong and we look at extraction quality +next. Stated here so it's not adjusted after seeing data.
diff --git a/benchmarks/alignbench/RESULTS.md b/benchmarks/alignbench/RESULTS.md new file mode 100644 index 0000000..de1d9e2 --- /dev/null +++ b/benchmarks/alignbench/RESULTS.md @@ -0,0 +1,214 @@ +# AlignBench v0 — Results + +**Date:** 2026-05-14 +**SDK branch:** `worktree-alignbench-2026-05-14` (off `internal/main` at `bf4ab91`) +**Items:** 60 scored queries (pronoun 20, temporal 14, specificity 14, negation 12) + 10 controls +**Fact pool:** 55 facts (45 user-facts: 40 across the 4 axes + 5 control-set facts; plus 10 extraction-style meta-fact distractors) + +> Every query competes against the **full pool** simultaneously, mimicking how a +> real SDK store accumulates noise across topics. Distractors are facts of the +> form actually observed in the partner demo: `The user asked for the user's +> name.`, `The user is me.`, `As of <date>, X is a term mentioned in the +> conversation.` + +--- + +## TL;DR + +**The pronoun-rewrite hypothesis is falsified.** Cleaning extraction meta-facts +out of the pool is a larger lift than any algorithmic retrieval patch. The +temporal axis is stuck at r@1=0.500 across every variant — that's a structural +property of the embedding-only retrieval contract, not a prompt-tuning problem.
+ +| Variant | r@1 | r@5 | distractor_top1 | fp@control | +|---|---:|---:|---:|---:| +| baseline (current SDK) | 0.733 | 0.933 | 0.067 | 0.000 | +| baseline, clean pool (no meta-facts) | **0.767** | **0.950** | 0.000 | 0.000 | +| query-rewrite (pronoun substitution) | 0.733 | 0.933 | 0.083 ← worse | 0.000 | +| dual-storage (both phrasings stored) | 0.783 | 0.933 | 0.067 | 0.000 | +| hybrid BM25 + semantic | 0.617 | 0.917 | 0.067 | **1.000** ← bad | +| combined (rewrite + BM25) | 0.650 | 0.933 | 0.083 | 1.000 | +| mpnet-base-v2 (110M params, Modal A10G) | 0.733 | **0.950** | 0.083 | — | +| bge-base-en-v1.5 (109M params, Modal A10G) | 0.617 | 0.783 | 0.250 | — | +| e5-base-v2 (110M params, Modal A10G) | 0.717 | 0.933 | 0.200 | — | + +See `runs/modal-ablation.json` for the full 6-model sweep (Modal A10G, ~6s per +model). **The SDK's current MiniLM-L6-v2 is tied for best r@1 and has the +lowest distractor rate** — swapping to a bigger bi-encoder is not the fix. +BGE/E5 underperform here likely because they expect prompt-prefix conventions +(`"query: …"`/`"passage: …"`) we did not add, but even mpnet (which doesn't +require prefixes) only buys +0.017 r@5 and zero r@1. The embedding-model lever +is a dead-end for this failure surface. + +The biggest fixable lift, by a clean margin, comes from **not letting the +extractor emit meta-facts in the first place**. That's an extraction-prompt +change in core, not a recall-path change in the SDK. + +--- + +## Pre-registered falsification + +Before running, I committed to: *if query-rewrite alone doesn't lift r@5 by +≥0.25, the pronoun hypothesis is wrong and we look at extraction quality.* + +Result: query-rewrite r@5 lift = **0.000** (0.933 vs 0.933). **Falsified.** + +This is the value of pre-registration. The diagnostic story I posted earlier — +"the failure is a first/third-person embedding gap, patchable in the SDK +recall path" — does not survive contact with a controlled benchmark. 
+ +--- + +## Per-axis breakdown (baseline, with distractors) + +| Axis | n | r@1 | r@5 | Median gold margin | distractor-top1 | +|---|---:|---:|---:|---:|---:| +| pronoun | 20 | 0.700 | 1.000 | **0.047** ← thin | 2 | +| temporal | 14 | 0.500 | 0.714 | **0.050** ← thin | 2 | +| specificity | 14 | 0.857 | 1.000 | 0.144 | 0 | +| negation | 12 | 0.917 | 1.000 | 0.268 | 0 | + +Pronoun and temporal both sit at a fragile ~0.05 cosine margin between gold and +best non-gold. Specificity and negation are robust. Distractor meta-facts beat +the gold on 4 of 60 queries (6.7%) — concentrated in pronoun and temporal. + +--- + +## Why each variant didn't fix it + +### Query-rewrite (pronoun substitution) + +Rewriting "what is my name?" → "what is the user's name?" was supposed to +bridge the embedding gap to the third-person stored fact. It does — but it +also collides MORE with the distractor "The user asked for the user's name." +Net effect: r@1 unchanged, pronoun margin tightens 0.047 → 0.031, distractor- +top1 goes 2 → 3. **The rewrite is bridging to the wrong neighborhood.** + +Negative result: surface-level pronoun substitution makes the noise problem +worse, not better, when the noise itself is third-person extraction output. + +### Dual-storage (paraphrase to first-person at write time) + +Modest +0.05 r@1 lift, but only in pronoun (0.70 → 0.80). Temporal unchanged +(still 0.50). The fix works for the failure class it targets but doesn't +generalize. Cost: 2× memory size, dedupe required, indistinguishable in the UI. + +### Hybrid BM25 + semantic union + +BM25 helps where lexical overlap aligns with relevance (temporal margin 0.05 → +0.19, negation margin 0.27 → 0.56). But it tanks control precision — every +unrelated query like "what year did WWII end?" now matches user facts on +common English words. fp@control jumps 0% → 100%. **Not shippable as-is.** A +careful BM25 weight schedule or a confidence threshold on the BM25 score +might recover, but that's a larger study. 
+ +### Combined (rewrite + BM25) + +Inherits the worst of both: rewrite-induced collision with meta-distractors +AND BM25 false-positive blowout. r@1 0.65, fp@control 100%. Don't ship. + +--- + +## What actually moved the needle + +| Intervention | r@1 | r@5 | Notes | +|---|---:|---:|---| +| Baseline | 0.733 | 0.933 | reference | +| **Drop extraction meta-facts from pool** | **0.767** | **0.950** | bigger than any algorithmic fix | +| Dual-storage | 0.783 | 0.933 | tied for second; cost = 2× store size | + +The takeaway: **the leverage is upstream of retrieval**. The SDK's recall layer +is reasonable; the dominant cause of partner-visible failures is that the +extraction prompt produces facts that aren't facts (`The user asked for the +user's name.`, `As of May 14, X is a term mentioned.`). These corrupt the +embedding neighborhood for every adjacent query. + +--- + +## The temporal axis is its own story + +r@1 = 0.500 across **every** variant tested (baseline / rewrite / dual-storage +/ BM25 / combined / clean-pool). Three failure patterns explain it: + +| Pattern | Example | Why it breaks cosine retrieval | +|---|---|---| +| Temporal anchor in fact text hurts match | `where do I live now?` ranks "user lives in Lisbon" above gold "**As of January 2026**, the user lives in Lisbon" | Date markers add lexical noise the bi-encoder treats as off-topic | +| Stale fact beats current fact | `is the user still in Berlin?` top-1 is "**Before 2024**, lived in Berlin" @ cosine 0.72; current "lives in Lisbon" ranks #5 | Cosine cannot encode "this fact was superseded" — Mem0+TR's temporal-metadata layer side-steps this entirely | +| Cross-axis bleed | `what is the user reading?` top-1 is "reads on a **Kindle**" (device); gold "reading 'The Power Broker'" (book) ranks #8 | Embedding can't keep activity ↔ object distinct when both share lexical surface | + +The first two cannot be fixed in the SDK recall path. 
They require **structured +state at write time** — the architectural choice Mem0+TR made in their Nov-2025 +release. Our temporal-state layer in core (`temporal-classifier.ts`, +`temporal-state-write.ts`) is the right shape but isn't currently consulted by +the SDK retrieval path. + +--- + +## Connection to LMME-S refusal failures + +The LMME-S full n=500 run (sprint 5) showed **31% of failures were "I don't +have info" refusals when the answer text was in the haystack**. We blamed +"Haiku reasoning over 100K tokens" but didn't have a controlled benchmark to +attribute the cause. + +AlignBench suggests a re-attribution: those LMME refusals are likely the same +extraction-vs-query alignment failure compounded over a 50K-token haystack +where competing extraction-style facts dilute the gold. A targeted ablation on +LMME-S with the extraction-cleanup applied would test this directly. + +--- + +## Recommendations (ranked) + +| # | Recommendation | Effort | Expected lift | +|---|---|---|---| +| 1 | **Filter meta-facts at write time** — add an extraction-output rejection rule for patterns matching `The user (asked\|is\|requested\|said).*`, `As of <date>, X is a term mentioned.*`, `A name was mentioned.*`. Move from naive next-LLM-output to a typed-fact schema. | 1 day in core | r@1 +0.03–0.05 directly; bigger gains downstream on LoCoMo cat-1 and LMME refusal rate | +| 2 | **Expose extraction prompt as SDK surface** (Ethan flagged this Slack-side) so design partners can tune. Document the durable-fact vs meta-fact distinction. | 0.5 day in SDK | structural; enables (1) for design partners | +| 3 | **Wire core's temporal-state layer into SDK retrieval** for time-anchored queries. The components exist (temporal-classifier, temporal-rerank) but the SDK calls plain semantic-search. | 2–3 days | closes a real gap on the temporal axis; would also lift LoCoMo cat 4 toward Mem0+TR parity | +| 4 | Adopt dual-storage as an opt-in `RECALL_DUAL_STORAGE=true` flag for first-person-heavy workloads.
Don't make it default — the cost is real. | 0.5 day | +0.05 r@1 in pronoun-heavy stores; no help elsewhere | +| 5 | Skip BM25 hybrid unless we build a control-set-aware weight schedule. Current naive union breaks precision. | — | not recommended in isolation | + +The partner-facing demo failure SgtPooki reported is best addressed by **(1) + +(4)** combined: cleaner extraction means fewer poisoned matches, and dual- +storage makes pronoun queries robust against the noise that remains. + +--- + +## Reproducibility + +```bash +cd benchmarks/alignbench +node run.mjs # baseline +node run.mjs --variant=query-rewrite --out=runs/query-rewrite.json +node run.mjs --variant=dual-storage --out=runs/dual-storage.json +node run.mjs --variant=hybrid-bm25 --out=runs/hybrid-bm25.json +node run.mjs --variant=combined --out=runs/combined.json +node run.mjs --model=Xenova/all-mpnet-base-v2 --out=runs/baseline-mpnet.json +``` + +Each run saves a JSON with composite metrics, per-axis breakdown, and per-item +top-1 / gold-rank / margin records. Diff-able across runs. + +Items: `items.json` (60 queries, 45 facts, 10 distractors, 10 controls). +Runner: `run.mjs` (single file, no SDK or DB dependencies — just +`@huggingface/transformers`). + +--- + +## Honest limits of this benchmark + +- **n is small** (60 scored queries). Margin estimates are noisy; treat + ±0.05–0.07 r@1 differences as within-noise unless replicated. +- **Hand-written items**, no naturalistic distribution. Real partner traffic + may surface other failure axes (multi-turn coreference, list aggregation, + numerical reasoning) AlignBench doesn't cover. +- **Embedding-model sweep is incomplete.** Six sentence-transformer models + tested on Modal A10G (`modal_ablate.py`, `runs/modal-ablation.json`). + BGE/E5 likely need their input-prefix conventions (`"query: …"` / + `"passage: …"`) for fair scoring; we didn't add them. Mpnet is a clean + comparison and only buys +0.017 r@5 over the SDK default. 
A larger sweep +with model-specific prefixes is future work but not a blocker. +- **Distractor pool curated by hand** based on observed SDK extraction output. + A real pool from the partner demo's Postgres would be the gold version. + +Treat AlignBench v0 as a diagnostic tool, not as a leaderboard number. diff --git a/benchmarks/alignbench/items.json b/benchmarks/alignbench/items.json new file mode 100644 index 0000000..54b6372 --- /dev/null +++ b/benchmarks/alignbench/items.json @@ -0,0 +1,173 @@ +{ + "version": "v0", + "created": "2026-05-14", + "description": "60 scored queries and 10 control queries over a 55-fact pool (4 variation axes plus distractor and control sets) to probe stored↔query phrasing alignment in embedding-based recall.", + "axes": { + "pronoun": { + "description": "first-person query vs third-person stored fact (and reverse)", + "facts": [ + "The user's name is Alex.", + "The user works as a software engineer at Acme.", + "The user's dog is named Apollo.", + "The user lives in Lisbon.", + "The user's birthday is March 14.", + "The user has two children, Maya and Theo.", + "The user's favorite coffee order is an oat-milk flat white.", + "The user studied applied mathematics at university.", + "The user is allergic to peanuts.", + "The user drives a 2019 Toyota Corolla."
+ ], + "items": [ + { "id": "pronoun-001", "fact_index": 0, "query": "what is my name?", "gold_in_topk": true, "gold_answer": "Alex" }, + { "id": "pronoun-002", "fact_index": 0, "query": "what's the user's name?", "gold_in_topk": true, "gold_answer": "Alex" }, + { "id": "pronoun-003", "fact_index": 0, "query": "who am I?", "gold_in_topk": true, "gold_answer": "Alex" }, + { "id": "pronoun-004", "fact_index": 1, "query": "what do I do for work?", "gold_in_topk": true, "gold_answer": "software engineer at Acme" }, + { "id": "pronoun-005", "fact_index": 1, "query": "what is the user's job?", "gold_in_topk": true, "gold_answer": "software engineer at Acme" }, + { "id": "pronoun-006", "fact_index": 1, "query": "where does the user work?", "gold_in_topk": true, "gold_answer": "Acme" }, + { "id": "pronoun-007", "fact_index": 2, "query": "what is my dog's name?", "gold_in_topk": true, "gold_answer": "Apollo" }, + { "id": "pronoun-008", "fact_index": 2, "query": "who is Apollo?", "gold_in_topk": true, "gold_answer": "the user's dog" }, + { "id": "pronoun-009", "fact_index": 3, "query": "where do I live?", "gold_in_topk": true, "gold_answer": "Lisbon" }, + { "id": "pronoun-010", "fact_index": 3, "query": "what city does the user live in?", "gold_in_topk": true, "gold_answer": "Lisbon" }, + { "id": "pronoun-011", "fact_index": 4, "query": "when is my birthday?", "gold_in_topk": true, "gold_answer": "March 14" }, + { "id": "pronoun-012", "fact_index": 4, "query": "when was the user born?", "gold_in_topk": true, "gold_answer": "March 14" }, + { "id": "pronoun-013", "fact_index": 5, "query": "do I have kids?", "gold_in_topk": true, "gold_answer": "yes, two — Maya and Theo" }, + { "id": "pronoun-014", "fact_index": 5, "query": "how many children does the user have?", "gold_in_topk": true, "gold_answer": "two" }, + { "id": "pronoun-015", "fact_index": 6, "query": "what is my usual coffee order?", "gold_in_topk": true, "gold_answer": "oat-milk flat white" }, + { "id": "pronoun-016", 
"fact_index": 6, "query": "what coffee does the user drink?", "gold_in_topk": true, "gold_answer": "oat-milk flat white" }, + { "id": "pronoun-017", "fact_index": 7, "query": "what did I study?", "gold_in_topk": true, "gold_answer": "applied mathematics" }, + { "id": "pronoun-018", "fact_index": 8, "query": "do I have any allergies?", "gold_in_topk": true, "gold_answer": "peanuts" }, + { "id": "pronoun-019", "fact_index": 8, "query": "what is the user allergic to?", "gold_in_topk": true, "gold_answer": "peanuts" }, + { "id": "pronoun-020", "fact_index": 9, "query": "what kind of car do I drive?", "gold_in_topk": true, "gold_answer": "2019 Toyota Corolla" } + ] + }, + "temporal": { + "description": "current vs past vs date-anchored phrasings of evolving state", + "facts": [ + "As of January 2026, the user lives in Lisbon.", + "Before 2024, the user lived in Berlin.", + "The user moved from Berlin to Lisbon in 2024.", + "As of April 2026, the user is reading 'The Power Broker'.", + "Last year the user read 'Project Hail Mary'.", + "The user is currently working on a memory benchmark project.", + "The user finished the Sprint-4 reranker training last month.", + "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.", + "The user used GPT-4 as their primary model in 2024.", + "The user upgraded their phone to an iPhone 17 in March 2026." 
+ ], + "items": [ + { "id": "temporal-001", "fact_index": 0, "query": "where does the user live now?", "gold_in_topk": true, "gold_answer": "Lisbon" }, + { "id": "temporal-002", "fact_index": 1, "query": "where did the user used to live?", "gold_in_topk": true, "gold_answer": "Berlin" }, + { "id": "temporal-003", "fact_index": 2, "query": "when did the user move?", "gold_in_topk": true, "gold_answer": "2024" }, + { "id": "temporal-004", "fact_index": 0, "query": "where is the user currently based?", "gold_in_topk": true, "gold_answer": "Lisbon" }, + { "id": "temporal-005", "fact_index": 3, "query": "what is the user reading?", "gold_in_topk": true, "gold_answer": "The Power Broker" }, + { "id": "temporal-006", "fact_index": 4, "query": "what did the user read last year?", "gold_in_topk": true, "gold_answer": "Project Hail Mary" }, + { "id": "temporal-007", "fact_index": 5, "query": "what is the user working on these days?", "gold_in_topk": true, "gold_answer": "memory benchmark project" }, + { "id": "temporal-008", "fact_index": 6, "query": "what did the user finish last month?", "gold_in_topk": true, "gold_answer": "Sprint-4 reranker training" }, + { "id": "temporal-009", "fact_index": 7, "query": "what LLM does the user prefer?", "gold_in_topk": true, "gold_answer": "Claude Sonnet 4.6" }, + { "id": "temporal-010", "fact_index": 8, "query": "which model did the user use before?", "gold_in_topk": true, "gold_answer": "GPT-4" }, + { "id": "temporal-011", "fact_index": 9, "query": "did the user get a new phone recently?", "gold_in_topk": true, "gold_answer": "yes, iPhone 17 in March 2026" }, + { "id": "temporal-012", "fact_index": 0, "query": "is the user still in Berlin?", "gold_in_topk": true, "gold_answer": "no — the user lives in Lisbon now" }, + { "id": "temporal-013", "fact_index": 5, "query": "what is the user up to?", "gold_in_topk": true, "gold_answer": "memory benchmark project" }, + { "id": "temporal-014", "fact_index": 7, "query": "which model is the user 
on right now?", "gold_in_topk": true, "gold_answer": "Claude Sonnet 4.6" } + ] + }, + "specificity": { + "description": "specific entity vs generic-class query", + "facts": [ + "The user's dog Apollo is a golden retriever, age 4.", + "The user owns a Bianchi road bike.", + "The user's primary laptop is a 16-inch MacBook Pro M4.", + "The user has a Yamaha P-125 digital piano in the living room.", + "The user uses Logseq for personal notes and Notion for work.", + "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.", + "The user wears Smith Lowdown sunglasses.", + "The user reads on a Kindle Paperwhite 11th-gen.", + "The user's home espresso machine is a Lelit Bianca v3.", + "The user wears Allbirds Wool Runners daily." + ], + "items": [ + { "id": "specificity-001", "fact_index": 0, "query": "tell me about my dog", "gold_in_topk": true, "gold_answer": "Apollo, golden retriever, age 4" }, + { "id": "specificity-002", "fact_index": 0, "query": "what kind of pet do I have?", "gold_in_topk": true, "gold_answer": "a dog" }, + { "id": "specificity-003", "fact_index": 1, "query": "do I own a bike?", "gold_in_topk": true, "gold_answer": "yes, a Bianchi road bike" }, + { "id": "specificity-004", "fact_index": 1, "query": "what brand of bike does the user have?", "gold_in_topk": true, "gold_answer": "Bianchi" }, + { "id": "specificity-005", "fact_index": 2, "query": "what computer do I use?", "gold_in_topk": true, "gold_answer": "16-inch MacBook Pro M4" }, + { "id": "specificity-006", "fact_index": 2, "query": "what laptop does the user have?", "gold_in_topk": true, "gold_answer": "16-inch MacBook Pro M4" }, + { "id": "specificity-007", "fact_index": 3, "query": "do I have any musical instruments?", "gold_in_topk": true, "gold_answer": "Yamaha P-125 digital piano" }, + { "id": "specificity-008", "fact_index": 4, "query": "which note-taking app do I use?", "gold_in_topk": true, "gold_answer": "Logseq for personal, Notion for work" }, + { "id": "specificity-009", 
"fact_index": 5, "query": "where do I like to eat in Lisbon?", "gold_in_topk": true, "gold_answer": "Cervejaria Ramiro" }, + { "id": "specificity-010", "fact_index": 6, "query": "what brand sunglasses does the user wear?", "gold_in_topk": true, "gold_answer": "Smith Lowdown" }, + { "id": "specificity-011", "fact_index": 7, "query": "do I read on a Kindle?", "gold_in_topk": true, "gold_answer": "yes, Paperwhite 11th-gen" }, + { "id": "specificity-012", "fact_index": 8, "query": "what espresso machine does the user own?", "gold_in_topk": true, "gold_answer": "Lelit Bianca v3" }, + { "id": "specificity-013", "fact_index": 9, "query": "what shoes do I wear?", "gold_in_topk": true, "gold_answer": "Allbirds Wool Runners" }, + { "id": "specificity-014", "fact_index": 9, "query": "what brand are the user's everyday shoes?", "gold_in_topk": true, "gold_answer": "Allbirds" } + ] + }, + "negation": { + "description": "polarity sensitivity — facts encoding what the user does NOT do/like", + "facts": [ + "The user does not drink coffee. They prefer tea.", + "The user is not vegetarian, but avoids red meat.", + "The user does not use Twitter; they use Bluesky and Mastodon.", + "The user does not own a car; they bike or use public transit.", + "The user is not on LinkedIn anymore.", + "The user dislikes cilantro intensely.", + "The user has never been to Asia.", + "The user does not enjoy horror movies.", + "The user does not eat shellfish.", + "The user is not currently learning any new languages." 
+ ], + "items": [ + { "id": "negation-001", "fact_index": 0, "query": "does the user drink coffee?", "gold_in_topk": true, "gold_answer": "no, the user prefers tea" }, + { "id": "negation-002", "fact_index": 0, "query": "what does the user drink in the morning?", "gold_in_topk": true, "gold_answer": "tea (not coffee)" }, + { "id": "negation-003", "fact_index": 1, "query": "am I vegetarian?", "gold_in_topk": true, "gold_answer": "no, but avoids red meat" }, + { "id": "negation-004", "fact_index": 2, "query": "is the user on Twitter?", "gold_in_topk": true, "gold_answer": "no, uses Bluesky and Mastodon" }, + { "id": "negation-005", "fact_index": 2, "query": "which social networks does the user use?", "gold_in_topk": true, "gold_answer": "Bluesky and Mastodon" }, + { "id": "negation-006", "fact_index": 3, "query": "does the user own a car?", "gold_in_topk": true, "gold_answer": "no, bikes or public transit" }, + { "id": "negation-007", "fact_index": 4, "query": "is the user active on LinkedIn?", "gold_in_topk": true, "gold_answer": "no, not anymore" }, + { "id": "negation-008", "fact_index": 5, "query": "any foods the user hates?", "gold_in_topk": true, "gold_answer": "cilantro" }, + { "id": "negation-009", "fact_index": 6, "query": "has the user traveled to Asia?", "gold_in_topk": true, "gold_answer": "no, never" }, + { "id": "negation-010", "fact_index": 7, "query": "does the user like horror movies?", "gold_in_topk": true, "gold_answer": "no" }, + { "id": "negation-011", "fact_index": 8, "query": "can the user eat shrimp?", "gold_in_topk": true, "gold_answer": "no — does not eat shellfish" }, + { "id": "negation-012", "fact_index": 9, "query": "is the user learning a new language?", "gold_in_topk": true, "gold_answer": "no, not currently" } + ] + }, + "distractors": { + "description": "Extraction-style meta-facts that pollute real SDK stores. Observed verbatim or near-verbatim in the partner demo (e.g. 
'The user asked for the user's name', 'The user is me', 'As of <date>, X is a term mentioned in the conversation'). These should NEVER be a top-1 match for any user-fact query, but in the real failure they outranked the gold.", + "facts": [ + "The user asked for the user's name.", + "The user is me.", + "The user is asking a question.", + "The user requested information.", + "The user said something.", + "As of May 14, 2026, Apollo is a term mentioned in the conversation.", + "As of May 14, 2026, the user is a term mentioned in the conversation.", + "A name was mentioned in the conversation.", + "The conversation involves the user.", + "The user has started a conversation." + ], + "items": [] + }, + "control": { + "description": "queries that should NOT match the user-fact pool — measures false positives", + "facts": [ + "The user's name is Alex.", + "The user lives in Lisbon.", + "The user has a dog named Apollo.", + "The user works at Acme as a software engineer.", + "The user's favorite restaurant is Cervejaria Ramiro." 
+ ], + "items": [ + { "id": "control-001", "query": "what is the airspeed velocity of an unladen swallow?", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-002", "query": "who is the current president of France?", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-003", "query": "what is the capital of Mongolia?", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-004", "query": "how does photosynthesis work?", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-005", "query": "translate 'goodnight' to Japanese", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-006", "query": "what year did World War II end?", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-007", "query": "explain entropy in thermodynamics", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-008", "query": "best way to debug a segfault in C", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-009", "query": "what's the weather going to be tomorrow?", "gold_in_topk": false, "gold_answer": null }, + { "id": "control-010", "query": "give me a recipe for tiramisu", "gold_in_topk": false, "gold_answer": null } + ] + } + } +} diff --git a/benchmarks/alignbench/modal_ablate.py b/benchmarks/alignbench/modal_ablate.py new file mode 100644 index 0000000..b496fd7 --- /dev/null +++ b/benchmarks/alignbench/modal_ablate.py @@ -0,0 +1,193 @@ +""" +AlignBench embedding-model ablation on Modal. + +Runs the AlignBench items.json against multiple sentence-transformer models +on A10G GPU containers (one per model), returning per-model per-axis recall@1 / recall@5 +and median gold margins. Resolves the local-CPU stall on mpnet and gives +real signal on whether a stronger embedding model closes the temporal / +pronoun gaps the bi-encoder MiniLM (SDK default) struggles with. + +Outputs runs/modal-ablation.json — one entry per model. 
+ +Usage: + modal run modal_ablate.py + (writes to runs/modal-ablation.json in this folder) +""" + +import json +import pathlib +import modal + +APP_NAME = "alignbench-embed-ablate" + +image = ( + modal.Image.debian_slim(python_version="3.11") + .pip_install( + "sentence-transformers==3.2.1", + "torch==2.5.1", + "rank-bm25==0.2.2", + "numpy<2", + ) +) + +app = modal.App(APP_NAME, image=image) + +MODELS = [ + "sentence-transformers/all-MiniLM-L6-v2", # SDK default, ~22M params + "sentence-transformers/all-mpnet-base-v2", # 110M params + "BAAI/bge-small-en-v1.5", # 33M params + "BAAI/bge-base-en-v1.5", # 109M params + "intfloat/e5-small-v2", # 33M params + "intfloat/e5-base-v2", # 110M params +] + + +def _cosine(a, b): + import numpy as np + a = np.asarray(a, dtype="float32") + b = np.asarray(b, dtype="float32") + na = np.linalg.norm(a) * np.linalg.norm(b) + return float(np.dot(a, b) / na) if na > 0 else 0.0 + + +def _build_pool(manifest): + pool = [] + for axis_name, body in manifest["axes"].items(): + is_distractor = axis_name == "distractors" + for i, fact in enumerate(body["facts"]): + pool.append( + {"text": fact, "globalKey": f"{axis_name}#{i}", + "axis": axis_name, "factIndex": i, "isDistractor": is_distractor} + ) + return pool + + +def _score_one_model(model_name: str, manifest: dict) -> dict: + """Embed all facts, score all queries, return per-axis + composite metrics.""" + from sentence_transformers import SentenceTransformer + import time + + t0 = time.time() + model = SentenceTransformer(model_name) + pool = _build_pool(manifest) + fact_vecs = model.encode([e["text"] for e in pool], normalize_embeddings=True) + + per_axis = [] + composite_top1 = composite_top5 = composite_n = 0 + composite_distractor = 0 + + for axis_name, body in manifest["axes"].items(): + if not body["items"]: + continue + hit1 = hit5 = 0 + distractor_top1 = 0 + margins = [] + ranks = [] + + for item in body["items"]: + q_vec = model.encode([item["query"]], 
normalize_embeddings=True)[0] + scores = [_cosine(q_vec, fv) for fv in fact_vecs] + ranked = sorted( + ((s, pool[i]) for i, s in enumerate(scores)), + key=lambda x: -x[0], + ) + # dedupe by globalKey (best rank wins) + seen = set() + dedup = [] + for s, entry in ranked: + if entry["globalKey"] in seen: + continue + seen.add(entry["globalKey"]) + dedup.append((s, entry)) + top5 = dedup[:5] + + gold_rank = None + gold_score = None + if item.get("gold_in_topk") and item.get("fact_index") is not None: + gold_key = f"{axis_name}#{item['fact_index']}" + for idx, (s, entry) in enumerate(dedup): + if entry["globalKey"] == gold_key: + gold_rank = idx + 1 + gold_score = s + break + + if gold_rank == 1: + hit1 += 1 + if gold_rank is not None and gold_rank <= 5: + hit5 += 1 + if gold_rank is not None: + ranks.append(gold_rank) + + if item.get("gold_in_topk") and top5 and top5[0][1]["isDistractor"]: + distractor_top1 += 1 + + if gold_score is not None: + best_non_gold = next( + (s for s, e in dedup if e["globalKey"] != f"{axis_name}#{item['fact_index']}"), + None, + ) + if best_non_gold is not None: + margins.append(gold_score - best_non_gold) + + n = len(body["items"]) + margins.sort() + per_axis.append({ + "axis": axis_name, + "n": n, + "recall_at_1": hit1 / n if n else None, + "recall_at_5": hit5 / n if n else None, + "mean_gold_rank": (sum(ranks) / len(ranks)) if ranks else None, + "median_gold_margin": margins[len(margins) // 2] if margins else None, + "distractor_at_top1": distractor_top1, + }) + if axis_name != "control": + composite_top1 += hit1 + composite_top5 += hit5 + composite_n += n + composite_distractor += distractor_top1 + + wall = time.time() - t0 + return { + "model": model_name, + "wall_seconds": round(wall, 1), + "composite": { + "recall_at_1": composite_top1 / composite_n if composite_n else None, + "recall_at_5": composite_top5 / composite_n if composite_n else None, + "distractor_top1_rate": composite_distractor / composite_n if composite_n else None, + 
"n": composite_n, + }, + "per_axis": per_axis, + } + + +@app.function(gpu="A10G", timeout=1200) +def run_model(model_name: str, manifest_json: str) -> dict: + """Remote: run one model end-to-end and return its result dict.""" + manifest = json.loads(manifest_json) + return _score_one_model(model_name, manifest) + + +@app.local_entrypoint() +def main(): + here = pathlib.Path(__file__).parent + manifest_json = (here / "items.json").read_text() + + # Fan out across models — Modal autoscales containers, one per model. + results = list(run_model.map(MODELS, kwargs={"manifest_json": manifest_json})) + + out_path = here / "runs" / "modal-ablation.json" + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps({"models": results}, indent=2)) + + print("\n=== AlignBench embedding ablation (Modal A10G) ===\n") + print(f"{'model':<48} {'r@1':>6} {'r@5':>6} {'distr':>6} {'wall_s':>7}") + for r in results: + c = r["composite"] + print( + f"{r['model']:<48} " + f"{c['recall_at_1']:.3f} " + f"{c['recall_at_5']:.3f} " + f"{c['distractor_top1_rate']:.3f} " + f"{r['wall_seconds']:>7.1f}" + ) + print(f"\nsaved → {out_path}") diff --git a/benchmarks/alignbench/modal_demo_stress.py b/benchmarks/alignbench/modal_demo_stress.py new file mode 100644 index 0000000..e3a0e99 --- /dev/null +++ b/benchmarks/alignbench/modal_demo_stress.py @@ -0,0 +1,345 @@ +""" +Demo-class synthetic stress test on Modal. + +This is the closest reproduction we can run of the actual partner-demo +failure shape *without* deploying the full core+Postgres stack: + + 1. Generate 30 short multi-turn conversations (3-4 turns each) where the + user states 2-3 personal facts and then asks a recall question. + 2. Run the REAL production extraction LLM (Anthropic Haiku, same model + and same EXTRACTION_PROMPT the engine uses) on each conversation. + 3. Apply the meta-fact filter post-extraction to half the runs; leave + the other half raw. 
This matches what the engine does in core after + the alignbench-meta-fact-filter-2026-05-14 branch ships. + 4. Embed every surviving fact + the recall query with the production + SDK embedding model (Xenova/all-MiniLM-L6-v2 via + sentence-transformers). + 5. Score cosine similarity, rank facts, check whether the gold fact + ranks #1, count how many meta-facts ranked above it. + +Output: runs/demo-stress.json with per-conversation results, summary +deltas, and concrete failure examples. + +This reproduces the cosine-margin-too-thin pattern that the partner-demo +screenshots showed, on synthetic data so we can iterate safely. + +Why on Modal rather than local: parallel extraction calls hit Anthropic +rate limits faster than a single laptop can absorb, and Modal also lets +us run sentence-transformers on a beefy CPU container without local +ONNX init stalls. + +Usage: + modal run modal_demo_stress.py +""" + +import json +import os +import pathlib +import modal + +APP_NAME = "alignbench-demo-stress" + +image = ( + modal.Image.debian_slim(python_version="3.11") + .pip_install( + "sentence-transformers==3.2.1", + "torch==2.5.1", + "numpy<2", + "anthropic==0.40.0", + "httpx>=0.27", + ) + .add_local_file(__file__, "/root/modal_demo_stress.py") +) + +# Reuse the meta-fact patterns the SDK + core both ship. +# Importing across the local-vs-Modal boundary is awkward; the patterns are short. +META_FACT_PATTERNS = [ + r"^\s*the user (asked|requested|said|is asking|is me)\b", + r"^\s*as of [^,]+,\s+.+\s+is a term mentioned in the conversation\.?$", + r"^\s*a name was mentioned\b", + r"^\s*the conversation involves the user\b", + r"^\s*the user has started a conversation\b", +] + + +# Compact production extraction prompt — abbreviated to keep the call cheap +# while preserving the rule that drives meta-fact emission in real production. 
+# Mirrors src/services/extraction.ts EXTRACTION_PROMPT structure but trimmed +# to the rules that matter for this stress test (we are not testing the +# entity/keyword fields, only what gets emitted as a statement). +EXTRACTION_PROMPT = """You are a memory extraction system. Your only output is a JSON object. You never produce conversational replies. You never continue the dialogue. You read the transcript and emit facts as JSON. + +Extract discrete, self-contained facts from the conversation transcript below. Each fact should be useful if retrieved months later in a completely different conversation. + +RULES: +- Each fact must be a single, atomic statement. +- Include enough context to be understood in isolation. +- Replace pronouns with specific names/references. +- Length is NOT a reason to skip a fact. A single user sentence containing a named entity (person, place, profession, possession, preference, allergy, hobby) IS extractable. "I'm Alex" → one fact. "I live in Lisbon" → one fact. "My dog is named Apollo" → one fact. +- Skip pleasantries, filler, acknowledgments, and meta-observations about the conversation itself. +- NEVER extract meta-facts of the form "the user asked X", "a term was mentioned", "the conversation involves the user". These describe the chat, not the user. +- Rate importance 0.0-1.0. + +Your output MUST be a single raw JSON object, no markdown fences, no preamble, no continuation of the conversation: +{"memories": [{"statement": "...", "importance": 0.7}]} + +If no extractable facts: {"memories": []}""" + + +# 30 conversations: each has user-asserted facts + a recall question + the +# gold fact text we expect retrieval to surface. Crafted to mirror the +# partner-demo failure surface: short, casual, personal, multi-fact. 
+CONVERSATIONS = [ + {"id": "name-001", "turns": ["My name is Alex.", "Got it."], "query": "what is my name?", "gold": "name is Alex"}, + {"id": "name-002", "turns": ["I go by Sam.", "OK Sam."], "query": "what's my name?", "gold": "go by Sam"}, + {"id": "name-003", "turns": ["You can call me Riley.", "Hi Riley."], "query": "what should you call me?", "gold": "call me Riley"}, + {"id": "name-004", "turns": ["I'm Jordan.", "Nice to meet you."], "query": "who am I?", "gold": "Jordan"}, + {"id": "pet-001", "turns": ["I have a golden retriever named Apollo.", "How sweet."], "query": "what is my dog's name?", "gold": "Apollo"}, + {"id": "pet-002", "turns": ["My cat Luna sleeps on my keyboard.", "Classic cat."], "query": "what's my cat's name?", "gold": "Luna"}, + {"id": "pet-003", "turns": ["I just adopted a beagle puppy. Her name is Penny.", "Congrats!"], "query": "what kind of dog do I have?", "gold": "beagle"}, + {"id": "job-001", "turns": ["I work as a software engineer at a startup.", "Cool field."], "query": "what do I do for work?", "gold": "software engineer"}, + {"id": "job-002", "turns": ["I'm a high school chemistry teacher.", "That's important work."], "query": "what is my profession?", "gold": "chemistry teacher"}, + {"id": "job-003", "turns": ["I freelance as a graphic designer.", "Nice."], "query": "what's my job?", "gold": "graphic designer"}, + {"id": "city-001", "turns": ["I live in Lisbon now.", "Beautiful city."], "query": "where do I live?", "gold": "Lisbon"}, + {"id": "city-002", "turns": ["I just moved to Berlin last month.", "Welcome to Berlin."], "query": "what city am I in?", "gold": "Berlin"}, + {"id": "city-003", "turns": ["I'm based in Toronto.", "Cold this time of year."], "query": "where am I located?", "gold": "Toronto"}, + {"id": "food-001", "turns": ["I'm vegetarian.", "Got it."], "query": "do I eat meat?", "gold": "vegetarian"}, + {"id": "food-002", "turns": ["I'm severely allergic to peanuts.", "Noted, will avoid."], "query": "do I 
have any allergies?", "gold": "peanut"}, + {"id": "food-003", "turns": ["I don't drink coffee — only tea.", "Tea is great too."], "query": "what do I drink in the morning?", "gold": "tea"}, + {"id": "hobby-001", "turns": ["I play classical piano.", "Lovely hobby."], "query": "what instrument do I play?", "gold": "piano"}, + {"id": "hobby-002", "turns": ["My main sport is rock climbing.", "Cool."], "query": "what sport do I do?", "gold": "rock climbing"}, + {"id": "hobby-003", "turns": ["I've been knitting for about ten years.", "Impressive."], "query": "what's a hobby I have?", "gold": "knitting"}, + {"id": "family-001", "turns": ["I have two kids, Maya and Theo.", "What ages?"], "query": "how many children do I have?", "gold": "two"}, + {"id": "family-002", "turns": ["My partner's name is Casey.", "Nice."], "query": "who is my partner?", "gold": "Casey"}, + {"id": "family-003", "turns": ["My mom lives in Vancouver.", "Far from you?"], "query": "where does my mom live?", "gold": "Vancouver"}, + {"id": "vehicle-001", "turns": ["I drive a blue Subaru Outback.", "Reliable car."], "query": "what kind of car do I have?", "gold": "Subaru"}, + {"id": "vehicle-002", "turns": ["I don't own a car. 
I bike everywhere.", "Healthy lifestyle."], "query": "do I have a car?", "gold": "does not own"}, + {"id": "edu-001", "turns": ["I studied applied mathematics in college.", "Tough major."], "query": "what was my major?", "gold": "applied mathematics"}, + {"id": "edu-002", "turns": ["I got my MBA from UCLA two years ago.", "Congrats."], "query": "where did I get my MBA?", "gold": "UCLA"}, + {"id": "tech-001", "turns": ["My main laptop is a 16-inch MacBook Pro.", "Solid machine."], "query": "what computer do I use?", "gold": "MacBook"}, + {"id": "tech-002", "turns": ["I prefer Neovim over VS Code.", "Editor preferences are personal."], "query": "what editor do I use?", "gold": "Neovim"}, + {"id": "music-001", "turns": ["I've been getting into bluegrass lately.", "Fun genre."], "query": "what music am I into these days?", "gold": "bluegrass"}, + {"id": "music-002", "turns": ["My all-time favorite band is Radiohead.", "Great band."], "query": "what's my favorite band?", "gold": "Radiohead"}, +] + +app = modal.App(APP_NAME, image=image) + + +def _is_meta_fact(text: str, patterns: list[str]) -> bool: + import re + if not isinstance(text, str) or len(text) == 0: + return False + for p in patterns: + if re.search(p, text, flags=re.IGNORECASE): + return True + return False + + +def _cosine(a, b) -> float: + import numpy as np + a = np.asarray(a, dtype="float32") + b = np.asarray(b, dtype="float32") + n = np.linalg.norm(a) * np.linalg.norm(b) + return float(np.dot(a, b) / n) if n > 0 else 0.0 + + +@app.function(timeout=600) +def extract_facts(conversation_turns: list[str], anthropic_key: str) -> list[dict]: + """Call Anthropic Haiku with the production EXTRACTION_PROMPT shape.""" + from anthropic import Anthropic + + client = Anthropic(api_key=anthropic_key) + convo_text = "\n".join(f"User: {t}" if i % 2 == 0 else f"Assistant: {t}" for i, t in enumerate(conversation_turns)) + # Force JSON-only output via assistant-role prefill of "{". 
Anthropic + # then resumes generation INSIDE the JSON object, eliminating the + # chat-continuation failure mode we observed empirically. The prefilled + # "{" is added back to the parsed text. + msg = client.messages.create( + model="claude-haiku-4-5", + max_tokens=600, + temperature=0, + system=EXTRACTION_PROMPT, + messages=[ + {"role": "user", "content": f"Conversation:\n{convo_text}"}, + {"role": "assistant", "content": "{"}, + ], + ) + generated = "".join(block.text for block in msg.content if hasattr(block, "text")) + text = "{" + generated + # Robust JSON extraction: strip markdown fences, then find the JSON object. + cleaned = text.strip() + if cleaned.startswith("```"): + cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else "" + if cleaned.endswith("```"): + cleaned = cleaned.rsplit("```", 1)[0] + cleaned = cleaned.strip() + # If LLM added preamble like "Here are the facts:", find the first { and last }. + if not cleaned.startswith("{"): + start = cleaned.find("{") + end = cleaned.rfind("}") + if start >= 0 and end > start: + cleaned = cleaned[start : end + 1] + try: + parsed = json.loads(cleaned) + return parsed.get("memories", []) if isinstance(parsed, dict) else [] + except json.JSONDecodeError as e: + # Log to Modal stderr so we can diagnose in the run output. 
+ import sys + sys.stderr.write(f"[extract] JSON parse failed: {e}; raw text first 200 chars: {text[:200]!r}\n") + return [] + + +@app.function(timeout=1800) +def score_all_conversations(conversations: list[dict], anthropic_key: str) -> dict: + """Run extraction + embedding + scoring for every conversation, both with and without the filter.""" + from sentence_transformers import SentenceTransformer + + print(f"[score] loading embedding model...", flush=True) + embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") + + # First: extract facts for every conversation (parallel via .map below would be cleaner, + # but Modal nested-function calls add complexity; sequential is fine for n=30). + print(f"[score] extracting facts for {len(conversations)} conversations...", flush=True) + rows = [] + for conv in conversations: + try: + facts = extract_facts.remote(conv["turns"], anthropic_key) + except Exception as e: + print(f" {conv['id']} EXTRACTION FAIL: {e}", flush=True) + facts = [] + statements = [ + (f.get("statement") or "").strip() + for f in facts + if isinstance(f, dict) and isinstance(f.get("statement"), str) + ] + statements = [s for s in statements if s] + meta_mask = [_is_meta_fact(s, META_FACT_PATTERNS) for s in statements] + rows.append({ + "id": conv["id"], + "turns": conv["turns"], + "query": conv["query"], + "gold": conv["gold"], + "facts": statements, + "meta_mask": meta_mask, + "n_facts": len(statements), + "n_meta": sum(meta_mask), + }) + print(f" {conv['id']}: {len(statements)} facts ({sum(meta_mask)} meta)", flush=True) + + # Second: embed everything and score retrieval, twice (with/without filter). 
+ print(f"[score] embedding + scoring...", flush=True) + summary = {"baseline": {}, "filtered": {}, "n": len(rows)} + for mode in ("baseline", "filtered"): + hits_at_1 = 0 + hits_at_5 = 0 + gold_present = 0 + meta_top1 = 0 + per_item = [] + for row in rows: + facts = row["facts"] + mask = row["meta_mask"] + if mode == "filtered": + facts_eff = [s for s, m in zip(facts, mask) if not m] + meta_eff = [False] * len(facts_eff) + else: + facts_eff = facts + meta_eff = mask + + if not facts_eff: + per_item.append({ + "id": row["id"], "gold_rank": None, "gold_score": None, + "top1_is_meta": False, "top1_text": None, + }) + continue + + q_vec = embedder.encode([row["query"]], normalize_embeddings=True)[0] + f_vecs = embedder.encode(facts_eff, normalize_embeddings=True) + scores = [_cosine(q_vec, fv) for fv in f_vecs] + ranked = sorted( + ((s, i) for i, s in enumerate(scores)), + key=lambda x: -x[0], + ) + + # Match gold by (a) substring fast path (case-insensitive, also handles + # short stems by stripping trailing punctuation), or (b) cosine similarity + # >= 0.55 against the gold tag if substring fails. The cosine fallback + # recovers cases like gold='go by Sam' matching 'goes by the name Sam'. 
+ gold_token = row["gold"].lower().rstrip(".,!?;:") + gold_vec = embedder.encode([row["gold"]], normalize_embeddings=True)[0] + gold_rank = None + gold_score = None + for rank, (s, idx) in enumerate(ranked, start=1): + fact_lower = facts_eff[idx].lower() + if gold_token in fact_lower: + gold_rank = rank + gold_score = s + break + # cosine fallback for stem-mismatch + semantic-paraphrase substring failures + # ("don't own" matches "does not own", "go by" matches "goes by") + if _cosine(gold_vec, f_vecs[idx]) >= 0.55: + gold_rank = rank + gold_score = s + break + + top1_idx = ranked[0][1] + top1_text = facts_eff[top1_idx] + top1_is_meta = meta_eff[top1_idx] + + if gold_rank is not None: + gold_present += 1 + if gold_rank == 1: + hits_at_1 += 1 + if gold_rank <= 5: + hits_at_5 += 1 + if top1_is_meta: + meta_top1 += 1 + + per_item.append({ + "id": row["id"], + "gold_rank": gold_rank, + "gold_score": gold_score, + "top1_score": ranked[0][0], + "top1_text": top1_text, + "top1_is_meta": top1_is_meta, + }) + + summary[mode] = { + "recall_at_1": hits_at_1 / len(rows), + "recall_at_5": hits_at_5 / len(rows), + "gold_present_rate": gold_present / len(rows), + "meta_at_top1": meta_top1, + "per_item": per_item, + } + + summary["rows"] = rows + return summary + + +@app.local_entrypoint() +def main(): + anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "") + if not anthropic_key: + raise RuntimeError("ANTHROPIC_API_KEY env var must be set locally before `modal run`") + result = score_all_conversations.remote(CONVERSATIONS, anthropic_key) + + here = pathlib.Path(__file__).parent + out_path = here / "runs" / "demo-stress.json" + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(result, indent=2)) + + print("\n" + "=" * 60) + print("Demo-class stress test results") + print("=" * 60) + n = result["n"] + for mode in ("baseline", "filtered"): + s = result[mode] + print( + f"\n{mode:8} r@1={s['recall_at_1']:.3f} " + f"r@5={s['recall_at_5']:.3f} " + 
f"gold_present={s['gold_present_rate']:.3f} " + f"meta_top1={s['meta_at_top1']}/{n}" + ) + delta_r1 = result["filtered"]["recall_at_1"] - result["baseline"]["recall_at_1"] + delta_meta = result["baseline"]["meta_at_top1"] - result["filtered"]["meta_at_top1"] + print(f"\nfilter delta: r@1 {delta_r1:+.3f} meta_top1 {delta_meta:+d}") + print(f"\nsaved -> {out_path}") diff --git a/benchmarks/alignbench/run.mjs b/benchmarks/alignbench/run.mjs new file mode 100644 index 0000000..9687442 --- /dev/null +++ b/benchmarks/alignbench/run.mjs @@ -0,0 +1,306 @@ +#!/usr/bin/env node +/** + * AlignBench runner — standalone, no SDK/Postgres/network required. + * + * Embeds every fact in each axis, embeds every query, scores cosine similarity, + * reports recall@1 / recall@5 / mean-gold-rank / false-positive@5 per axis, + * and writes a single run JSON. + * + * Variants are switched via flags; the underlying scoring is identical so + * results are directly comparable across runs. + * + * node run.mjs # baseline (current SDK stack) + * node run.mjs --variant=query-rewrite # rewrite pronouns in query + * node run.mjs --variant=dual-storage # store fact in both forms + * node run.mjs --variant=hybrid-bm25 # BM25 + semantic union + * node run.mjs --variant=combined # query-rewrite + hybrid-bm25 + * + * node run.mjs --out=runs/baseline.json --model=Xenova/all-MiniLM-L6-v2 + */ + +import fs from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { pipeline } from '@huggingface/transformers'; + +// -- CLI -- +const args = Object.fromEntries( + process.argv.slice(2).map(a => { + const [k, v] = a.replace(/^--/, '').split('='); + return [k, v ?? true]; + }) +); +const VARIANT = args.variant ?? 'baseline'; +const MODEL = args.model ?? 'Xenova/all-MiniLM-L6-v2'; +const TOPK = Number(args.topk ?? 5); +const HERE = path.dirname(fileURLToPath(import.meta.url)); +const OUT = args.out ?? 
path.join(HERE, 'runs', `${VARIANT}.json`); + +// -- Load manifest -- +const manifest = JSON.parse(fs.readFileSync(path.join(HERE, 'items.json'), 'utf8')); + +// -- Variant: query rewrite -- +// Deterministic pronoun substitution to bridge first-person → third-person. +// Order matters: longer phrases first so we don't double-substitute. +const PRONOUN_RULES = [ + [/\bmy\b/gi, "the user's"], + [/\bme\b/gi, 'the user'], + [/\bI am\b/gi, 'the user is'], + [/\bI'm\b/gi, 'the user is'], + [/\bI've\b/gi, 'the user has'], + [/\bI'd\b/gi, 'the user would'], + [/\bI'll\b/gi, 'the user will'], + [/\bI\b/gi, 'the user'], + [/\bmyself\b/gi, 'the user'], +]; +function rewriteQueryPronouns(q) { + let out = q; + for (const [re, repl] of PRONOUN_RULES) out = out.replace(re, repl); + return out; +} + +// -- Variant: dual storage -- +// For each fact, also produce a first-person paraphrase. Both are stored; +// retrieval picks whichever scores higher. +const STORE_RULES = [ + [/\bThe user's\b/g, 'My'], + [/\bthe user's\b/g, 'my'], + [/\bThe user\b/g, 'I'], + [/\bthe user\b/g, 'I'], +]; +function paraphraseFirstPerson(fact) { + let out = fact; + for (const [re, repl] of STORE_RULES) out = out.replace(re, repl); + // tidy common verb agreements after subject rewrite (avoid worst surface mismatches) + out = out.replace(/\bI is\b/g, 'I am').replace(/\bI has\b/g, 'I have').replace(/\bI does\b/g, 'I do'); + return out; +} + +// -- Hybrid BM25 implementation (tiny, just for this benchmark) -- +function tokenize(s) { + return s.toLowerCase().match(/[a-z0-9]+/g) ?? []; +} +function bm25Scores(queryTokens, docsTokens, k1 = 1.5, b = 0.75) { + const N = docsTokens.length; + const avgDL = docsTokens.reduce((a, d) => a + d.length, 0) / Math.max(1, N); + const df = new Map(); + for (const doc of docsTokens) { + for (const t of new Set(doc)) df.set(t, (df.get(t) ?? 0) + 1); + } + const idf = (t) => Math.log(1 + (N - (df.get(t) ?? 0) + 0.5) / ((df.get(t) ?? 
0) + 0.5)); + return docsTokens.map((doc) => { + const tf = new Map(); + for (const t of doc) tf.set(t, (tf.get(t) ?? 0) + 1); + const dl = doc.length; + let score = 0; + for (const qt of new Set(queryTokens)) { + const f = tf.get(qt) ?? 0; + if (f === 0) continue; + score += idf(qt) * ((f * (k1 + 1)) / (f + k1 * (1 - b + b * (dl / avgDL)))); + } + return score; + }); +} +function minmaxNormalize(arr) { + let lo = Infinity, hi = -Infinity; + for (const v of arr) { if (v < lo) lo = v; if (v > hi) hi = v; } + const span = hi - lo; + if (span <= 0) return arr.map(() => 0); + return arr.map((v) => (v - lo) / span); +} + +// -- Cosine -- +function cosine(a, b) { + let dot = 0, na = 0, nb = 0; + for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; } + const m = Math.sqrt(na * nb); + return m === 0 ? 0 : dot / m; +} + +// -- Embedder -- +async function loadEmbedder(model) { + console.log(`[load] ${model}`); + const fn = await pipeline('feature-extraction', model); + return async (text) => { + const out = await fn(text, { pooling: 'mean', normalize: true }); + return Array.from(out.data); + }; +} + +// -- Build the global fact pool used for ALL queries -- +// Real SDK stores are mixed: facts from all topics, plus extraction-style meta-facts +// that pollute the embedding space. The realistic test is "can the right user-fact +// outrank the distractors when they all live in the same store". +function buildGlobalPool(manifest, variant) { + // {axisName: [{text, globalKey}]}. globalKey is the canonical id used for gold matching. 
/**
 * Score every query of one axis against the shared fact pool.
 *
 * For each item the query is (optionally) rewritten, embedded, and compared to
 * every pool entry by cosine similarity; the hybrid variants blend that with a
 * BM25 score. Entries are then ranked, collapsed to one row per `globalKey`
 * (dual-storage stores paraphrases under the same key), and checked against
 * the gold fact `${axisName}#${item.fact_index}`.
 *
 * @param axisName   Axis key from the manifest; also the prefix of gold keys.
 * @param axisBody   Manifest entry for the axis; only `axisBody.items` is read.
 * @param embed      Async function turning a text into an embedding.
 * @param pool       Pool entries, index-aligned with `factVecs` / `factTokens`.
 * @param factVecs   One embedding per pool entry.
 * @param factTokens One token list per pool entry (used by the BM25 variants).
 * @returns Per-axis summary (recall@1, recall@5, mean gold rank, distractor
 *          and false-positive counts, median gold margin) plus per-item rows.
 */
async function scoreAxis(axisName, axisBody, embed, pool, factVecs, factTokens) {
  // Top-1 score above which a control (gold_in_topk === false) query is
  // counted as a false positive.
  const fpThreshold = 0.5;
  const rewritesQuery = VARIANT === 'query-rewrite' || VARIANT === 'combined';
  const usesBm25 = VARIANT === 'hybrid-bm25' || VARIANT === 'combined';

  const perItem = [];
  let hitAt1 = 0;
  let hitAt5 = 0;
  let goldRankSum = 0;
  let goldRankN = 0;
  let distractorTop1 = 0;
  let falsePositives = 0;
  const marginSamples = []; // gold_score - best_non_gold_score

  for (const item of axisBody.items) {
    const origQ = item.query;
    const effQ = rewritesQuery ? rewriteQueryPronouns(origQ) : origQ;

    const qVec = await embed(effQ);
    const semScores = factVecs.map((fv) => cosine(qVec, fv));
    let scores = semScores;
    if (usesBm25) {
      const bm = bm25Scores(tokenize(effQ), factTokens);
      const semN = minmaxNormalize(semScores);
      const bmN = minmaxNormalize(bm);
      scores = semN.map((s, i) => 0.6 * s + 0.4 * bmN[i]);
    }

    // Rank, then keep only the best-scoring entry per globalKey
    // (dual-storage duplicates share a key).
    const ranked = scores
      .map((s, i) => ({ s, entry: pool[i] }))
      .sort((a, b) => b.s - a.s);
    const seen = new Set();
    const dedup = [];
    for (const r of ranked) {
      if (seen.has(r.entry.globalKey)) continue;
      seen.add(r.entry.globalKey);
      dedup.push(r);
    }
    const topK = dedup.slice(0, TOPK);

    // Gold match: same axis + same factIndex.
    let goldRank = null;
    let goldScore = null;
    const goldKey = (item.fact_index != null) ? `${axisName}#${item.fact_index}` : null;
    if (item.gold_in_topk && goldKey) {
      const idx = dedup.findIndex((r) => r.entry.globalKey === goldKey);
      if (idx >= 0) {
        goldRank = idx + 1;
        goldScore = dedup[idx].s;
      }
    }

    // Margin: gold vs best non-gold (negative when something outranks gold).
    if (goldScore !== null) {
      const bestNonGold = dedup.find((r) => r.entry.globalKey !== goldKey);
      if (bestNonGold) marginSamples.push(goldScore - bestNonGold.s);
    }

    const hit1 = goldRank === 1;
    const hit5 = goldRank !== null && goldRank <= TOPK;
    if (hit1) hitAt1++;
    if (hit5) hitAt5++;
    if (goldRank !== null) {
      goldRankSum += goldRank;
      goldRankN++;
    }

    // Distractor-pollution metric: how often a meta-fact ranks top-1.
    if (item.gold_in_topk && topK[0]?.entry.isDistractor) distractorTop1++;

    // NOTE(review): for the hybrid variants `scores` are built from
    // minmaxNormalize output. If that maps the best candidate to 1, the top
    // blended score is always >= 0.6 and every control query trips this
    // threshold regardless of relevance — confirm whether fp@control should
    // be computed on the raw cosine instead.
    let fp = false;
    if (item.gold_in_topk === false) {
      fp = topK.length > 0 && topK[0].s > fpThreshold;
      if (fp) falsePositives++;
    }

    perItem.push({
      id: item.id,
      query: origQ,
      effective_query: effQ,
      gold_in_topk: item.gold_in_topk ?? false,
      gold_global_key: goldKey,
      gold_rank: goldRank,
      gold_score: goldScore,
      top1_text: topK[0]?.entry.text ?? null,
      top1_score: topK[0]?.s ?? null,
      top1_is_distractor: topK[0]?.entry.isDistractor ?? false,
      false_positive: fp,
    });
  }

  // True median on a sorted copy: the middle element for odd counts, the mean
  // of the two middle elements for even counts (previously the upper-middle
  // element was reported, and the sample array was sorted in place).
  let medianGoldMargin = null;
  if (marginSamples.length > 0) {
    const sortedMargins = [...marginSamples].sort((a, b) => a - b);
    const mid = Math.floor(sortedMargins.length / 2);
    medianGoldMargin = sortedMargins.length % 2 === 1
      ? sortedMargins[mid]
      : (sortedMargins[mid - 1] + sortedMargins[mid]) / 2;
  }

  const n = axisBody.items.length;
  return {
    axis: axisName,
    n,
    pool_size: pool.length,
    recall_at_1: n > 0 ? hitAt1 / n : null,
    recall_at_5: n > 0 ? hitAt5 / n : null,
    mean_gold_rank: goldRankN > 0 ? goldRankSum / goldRankN : null,
    distractor_at_top1: distractorTop1,
    false_positive_count: falsePositives,
    median_gold_margin: medianGoldMargin,
    items: perItem,
  };
}

// -- Main --
/**
 * Run the benchmark for the configured VARIANT: embed the shared pool, score
 * every axis that has queries, aggregate a composite over the non-control
 * axes, and write the run as JSON to OUT.
 */
async function main() {
  const t0 = Date.now();
  const embed = await loadEmbedder(MODEL);

  // Build the SHARED global pool — every query competes against the full set,
  // including extraction-style distractor meta-facts.
  const pool = buildGlobalPool(manifest, VARIANT);
  console.log(`[pool] ${pool.length} entries (variant=${VARIANT})`);
  const factVecs = [];
  for (const e of pool) factVecs.push(await embed(e.text));
  const factTokens = pool.map((e) => tokenize(e.text));

  const results = [];
  for (const [axisName, body] of Object.entries(manifest.axes)) {
    // Skip facts-only sections (e.g. distractors), whether `items` is empty
    // or absent altogether.
    if (!body.items?.length) continue;
    process.stdout.write(`[axis] ${axisName.padEnd(13)} ... `);
    const r = await scoreAxis(axisName, body, embed, pool, factVecs, factTokens);
    process.stdout.write(`r@5=${r.recall_at_5?.toFixed(3) ?? 'n/a'} r@1=${r.recall_at_1?.toFixed(3) ?? 'n/a'} margin=${r.median_gold_margin?.toFixed(3) ?? 'n/a'} distractor_top1=${r.distractor_at_top1}\n`);
    results.push(r);
  }

  // Composite (excluding control), weighted by the number of queries per axis.
  // With no scored axes the rates are reported as null instead of NaN.
  const scoredAxes = results.filter((r) => r.axis !== 'control');
  const totalN = scoredAxes.reduce((a, r) => a + r.n, 0);
  const hasScored = totalN > 0;
  const composite = {
    recall_at_1: hasScored
      ? scoredAxes.reduce((a, r) => a + r.recall_at_1 * r.n, 0) / totalN
      : null,
    recall_at_5: hasScored
      ? scoredAxes.reduce((a, r) => a + r.recall_at_5 * r.n, 0) / totalN
      : null,
    distractor_top1_rate: hasScored
      ? scoredAxes.reduce((a, r) => a + r.distractor_at_top1, 0) / totalN
      : null,
    n: totalN,
  };
  const controlAxis = results.find((r) => r.axis === 'control');
  const fpRate = controlAxis ? controlAxis.false_positive_count / controlAxis.n : null;

  const out = {
    variant: VARIANT,
    model: MODEL,
    topk: TOPK,
    // Kept as a one-decimal string to match the already-committed run files.
    wall_seconds: ((Date.now() - t0) / 1000).toFixed(1),
    composite,
    false_positive_rate: fpRate,
    per_axis: results,
  };

  fs.mkdirSync(path.dirname(OUT), { recursive: true });
  fs.writeFileSync(OUT, JSON.stringify(out, null, 2));

  console.log('');
  console.log(`composite r@1 = ${composite.recall_at_1?.toFixed(3) ?? 'n/a'} r@5 = ${composite.recall_at_5?.toFixed(3) ?? 'n/a'}`);
  console.log(`distractor_top1_rate = ${composite.distractor_top1_rate?.toFixed(3) ?? 'n/a'} fp@control = ${fpRate?.toFixed(3) ?? 'n/a'}`);
  console.log(`saved → ${OUT}`);
}

main().catch((e) => { console.error(e); process.exit(1); });
"gold_global_key": "pronoun#0", + "gold_rank": 1, + "gold_score": 0.7294812723464028, + "top1_text": "The user's name is Alex.", + "top1_score": 0.7294812723464028, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-003", + "query": "who am I?", + "effective_query": "who am I?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 1, + "gold_score": 0.2824821997084312, + "top1_text": "The user's name is Alex.", + "top1_score": 0.2824821997084312, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-004", + "query": "what do I do for work?", + "effective_query": "what do I do for work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 2, + "gold_score": 0.18618347998079776, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.19567938172618415, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-005", + "query": "what is the user's job?", + "effective_query": "what is the user's job?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 2, + "gold_score": 0.5754363621337681, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.5767198521038163, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-006", + "query": "where does the user work?", + "effective_query": "where does the user work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 1, + "gold_score": 0.5436570101621021, + "top1_text": "The user works as a software engineer at Acme.", + "top1_score": 0.5436570101621021, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-007", + "query": "what is my dog's name?", + "effective_query": "what is my dog's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.5711816859366411, + "top1_text": "The user's dog is named 
Apollo.", + "top1_score": 0.5711816859366411, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-008", + "query": "who is Apollo?", + "effective_query": "who is Apollo?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.7072701654355713, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 0.7072701654355713, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-009", + "query": "where do I live?", + "effective_query": "where do I live?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 2, + "gold_score": 0.29566315429472595, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.2960941749345428, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-010", + "query": "what city does the user live in?", + "effective_query": "what city does the user live in?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 1, + "gold_score": 0.6400263303955707, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.6400263303955707, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-011", + "query": "when is my birthday?", + "effective_query": "when is my birthday?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.6279255850758502, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.6279255850758502, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-012", + "query": "when was the user born?", + "effective_query": "when was the user born?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.7251037734927323, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.7251037734927323, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-013", + 
"query": "do I have kids?", + "effective_query": "do I have kids?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.2345547444698841, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 0.2345547444698841, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-014", + "query": "how many children does the user have?", + "effective_query": "how many children does the user have?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.5766094762746933, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 0.5766094762746933, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-015", + "query": "what is my usual coffee order?", + "effective_query": "what is my usual coffee order?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 1, + "gold_score": 0.6001185695455631, + "top1_text": "The user's favorite coffee order is an oat-milk flat white.", + "top1_score": 0.6001185695455631, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-016", + "query": "what coffee does the user drink?", + "effective_query": "what coffee does the user drink?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 2, + "gold_score": 0.5813657391451515, + "top1_text": "The user does not drink coffee. 
They prefer tea.", + "top1_score": 0.7197601756319, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-017", + "query": "what did I study?", + "effective_query": "what did I study?", + "gold_in_topk": true, + "gold_global_key": "pronoun#7", + "gold_rank": 1, + "gold_score": 0.5746472233195816, + "top1_text": "The user studied applied mathematics at university.", + "top1_score": 0.5746472233195816, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-018", + "query": "do I have any allergies?", + "effective_query": "do I have any allergies?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.389079047513309, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.389079047513309, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-019", + "query": "what is the user allergic to?", + "effective_query": "what is the user allergic to?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.7261582549960168, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.7261582549960168, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-020", + "query": "what kind of car do I drive?", + "effective_query": "what kind of car do I drive?", + "gold_in_topk": true, + "gold_global_key": "pronoun#9", + "gold_rank": 1, + "gold_score": 0.39618091590057963, + "top1_text": "The user drives a 2019 Toyota Corolla.", + "top1_score": 0.39618091590057963, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "temporal", + "n": 14, + "pool_size": 45, + "recall_at_1": 0.5, + "recall_at_5": 0.7857142857142857, + "mean_gold_rank": 3.857142857142857, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.049944619619024966, + "items": [ + { + "id": "temporal-001", + "query": "where does the user live now?", + 
"effective_query": "where does the user live now?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 3, + "gold_score": 0.550366425409688, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.5504016420681116, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-002", + "query": "where did the user used to live?", + "effective_query": "where did the user used to live?", + "gold_in_topk": true, + "gold_global_key": "temporal#1", + "gold_rank": 1, + "gold_score": 0.5490901457590983, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.5490901457590983, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-003", + "query": "when did the user move?", + "effective_query": "when did the user move?", + "gold_in_topk": true, + "gold_global_key": "temporal#2", + "gold_rank": 1, + "gold_score": 0.5076513080537272, + "top1_text": "The user moved from Berlin to Lisbon in 2024.", + "top1_score": 0.5076513080537272, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-004", + "query": "where is the user currently based?", + "effective_query": "where is the user currently based?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 8, + "gold_score": 0.42833906054622706, + "top1_text": "The user's name is Alex.", + "top1_score": 0.4608892635234409, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-005", + "query": "what is the user reading?", + "effective_query": "what is the user reading?", + "gold_in_topk": true, + "gold_global_key": "temporal#3", + "gold_rank": 2, + "gold_score": 0.4600491260020972, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 0.5634334413727431, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-006", + "query": "what did the user read last year?", + "effective_query": "what did the user 
read last year?", + "gold_in_topk": true, + "gold_global_key": "temporal#4", + "gold_rank": 1, + "gold_score": 0.5028289729480597, + "top1_text": "Last year the user read 'Project Hail Mary'.", + "top1_score": 0.5028289729480597, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-007", + "query": "what is the user working on these days?", + "effective_query": "what is the user working on these days?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 3, + "gold_score": 0.4513265913655406, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.48543171981436045, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-008", + "query": "what did the user finish last month?", + "effective_query": "what did the user finish last month?", + "gold_in_topk": true, + "gold_global_key": "temporal#6", + "gold_rank": 1, + "gold_score": 0.5544356167423142, + "top1_text": "The user finished the Sprint-4 reranker training last month.", + "top1_score": 0.5544356167423142, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-009", + "query": "what LLM does the user prefer?", + "effective_query": "what LLM does the user prefer?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 1, + "gold_score": 0.6226655286181774, + "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.", + "top1_score": 0.6226655286181774, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-010", + "query": "which model did the user use before?", + "effective_query": "which model did the user use before?", + "gold_in_topk": true, + "gold_global_key": "temporal#8", + "gold_rank": 1, + "gold_score": 0.5205286761987613, + "top1_text": "The user used GPT-4 as their primary model in 2024.", + "top1_score": 0.5205286761987613, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": 
"temporal-011", + "query": "did the user get a new phone recently?", + "effective_query": "did the user get a new phone recently?", + "gold_in_topk": true, + "gold_global_key": "temporal#9", + "gold_rank": 1, + "gold_score": 0.6113316689987077, + "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.", + "top1_score": 0.6113316689987077, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-012", + "query": "is the user still in Berlin?", + "effective_query": "is the user still in Berlin?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 5, + "gold_score": 0.4089445689504921, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.7214467722165934, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-013", + "query": "what is the user up to?", + "effective_query": "what is the user up to?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 11, + "gold_score": 0.2672680179526975, + "top1_text": "The user works as a software engineer at Acme.", + "top1_score": 0.3454437295411476, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-014", + "query": "which model is the user on right now?", + "effective_query": "which model is the user on right now?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 15, + "gold_score": 0.31088477125594804, + "top1_text": "The user's name is Alex.", + "top1_score": 0.40861487066233426, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "specificity", + "n": 14, + "pool_size": 45, + "recall_at_1": 0.8571428571428571, + "recall_at_5": 1, + "mean_gold_rank": 1.2142857142857142, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.14383529034315978, + "items": [ + { + "id": "specificity-001", + "query": "tell me about my dog", + "effective_query": "tell me about my dog", + 
"gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 3, + "gold_score": 0.41588713324638765, + "top1_text": "The user has a dog named Apollo.", + "top1_score": 0.4531912105925747, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-002", + "query": "what kind of pet do I have?", + "effective_query": "what kind of pet do I have?", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 2, + "gold_score": 0.3644336367800417, + "top1_text": "The user has a dog named Apollo.", + "top1_score": 0.4007600036283448, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-003", + "query": "do I own a bike?", + "effective_query": "do I own a bike?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.5234729437047541, + "top1_text": "The user owns a Bianchi road bike.", + "top1_score": 0.5234729437047541, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-004", + "query": "what brand of bike does the user have?", + "effective_query": "what brand of bike does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.6329297203833848, + "top1_text": "The user owns a Bianchi road bike.", + "top1_score": 0.6329297203833848, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-005", + "query": "what computer do I use?", + "effective_query": "what computer do I use?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 0.2943695662572405, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.2943695662572405, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-006", + "query": "what laptop does the user have?", + "effective_query": "what laptop does the user have?", + "gold_in_topk": true, 
+ "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 0.6450468635073591, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.6450468635073591, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-007", + "query": "do I have any musical instruments?", + "effective_query": "do I have any musical instruments?", + "gold_in_topk": true, + "gold_global_key": "specificity#3", + "gold_rank": 1, + "gold_score": 0.28698273062526614, + "top1_text": "The user has a Yamaha P-125 digital piano in the living room.", + "top1_score": 0.28698273062526614, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-008", + "query": "which note-taking app do I use?", + "effective_query": "which note-taking app do I use?", + "gold_in_topk": true, + "gold_global_key": "specificity#4", + "gold_rank": 1, + "gold_score": 0.38230106432962924, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.38230106432962924, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-009", + "query": "where do I like to eat in Lisbon?", + "effective_query": "where do I like to eat in Lisbon?", + "gold_in_topk": true, + "gold_global_key": "specificity#5", + "gold_rank": 1, + "gold_score": 0.695091049183982, + "top1_text": "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.", + "top1_score": 0.695091049183982, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-010", + "query": "what brand sunglasses does the user wear?", + "effective_query": "what brand sunglasses does the user wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#6", + "gold_rank": 1, + "gold_score": 0.6512271965052092, + "top1_text": "The user wears Smith Lowdown sunglasses.", + "top1_score": 0.6512271965052092, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": 
"specificity-011", + "query": "do I read on a Kindle?", + "effective_query": "do I read on a Kindle?", + "gold_in_topk": true, + "gold_global_key": "specificity#7", + "gold_rank": 1, + "gold_score": 0.6508408387843772, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 0.6508408387843772, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-012", + "query": "what espresso machine does the user own?", + "effective_query": "what espresso machine does the user own?", + "gold_in_topk": true, + "gold_global_key": "specificity#8", + "gold_rank": 1, + "gold_score": 0.7366961226077487, + "top1_text": "The user's home espresso machine is a Lelit Bianca v3.", + "top1_score": 0.7366961226077487, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-013", + "query": "what shoes do I wear?", + "effective_query": "what shoes do I wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.25921156505496634, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.25921156505496634, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-014", + "query": "what brand are the user's everyday shoes?", + "effective_query": "what brand are the user's everyday shoes?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.4229939734066537, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.4229939734066537, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "negation", + "n": 12, + "pool_size": 45, + "recall_at_1": 0.9166666666666666, + "recall_at_5": 1, + "mean_gold_rank": 1.0833333333333333, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.2862424829289613, + "items": [ + { + "id": "negation-001", + "query": "does the user drink coffee?", + 
"effective_query": "does the user drink coffee?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 0.7905096599561829, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 0.7905096599561829, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-002", + "query": "what does the user drink in the morning?", + "effective_query": "what does the user drink in the morning?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 0.5285974920275435, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 0.5285974920275435, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-003", + "query": "am I vegetarian?", + "effective_query": "am I vegetarian?", + "gold_in_topk": true, + "gold_global_key": "negation#1", + "gold_rank": 1, + "gold_score": 0.6191490333752342, + "top1_text": "The user is not vegetarian, but avoids red meat.", + "top1_score": 0.6191490333752342, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-004", + "query": "is the user on Twitter?", + "effective_query": "is the user on Twitter?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.5874304481012608, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.5874304481012608, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-005", + "query": "which social networks does the user use?", + "effective_query": "which social networks does the user use?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.5067702908301415, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.5067702908301415, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": 
"negation-006", + "query": "does the user own a car?", + "effective_query": "does the user own a car?", + "gold_in_topk": true, + "gold_global_key": "negation#3", + "gold_rank": 1, + "gold_score": 0.7488288864422115, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 0.7488288864422115, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-007", + "query": "is the user active on LinkedIn?", + "effective_query": "is the user active on LinkedIn?", + "gold_in_topk": true, + "gold_global_key": "negation#4", + "gold_rank": 1, + "gold_score": 0.7880239246717338, + "top1_text": "The user is not on LinkedIn anymore.", + "top1_score": 0.7880239246717338, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-008", + "query": "any foods the user hates?", + "effective_query": "any foods the user hates?", + "gold_in_topk": true, + "gold_global_key": "negation#5", + "gold_rank": 2, + "gold_score": 0.4222244002602081, + "top1_text": "The user is not vegetarian, but avoids red meat.", + "top1_score": 0.4321470878666278, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-009", + "query": "has the user traveled to Asia?", + "effective_query": "has the user traveled to Asia?", + "gold_in_topk": true, + "gold_global_key": "negation#6", + "gold_rank": 1, + "gold_score": 0.6919597377462416, + "top1_text": "The user has never been to Asia.", + "top1_score": 0.6919597377462416, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-010", + "query": "does the user like horror movies?", + "effective_query": "does the user like horror movies?", + "gold_in_topk": true, + "gold_global_key": "negation#7", + "gold_rank": 1, + "gold_score": 0.7303882056719367, + "top1_text": "The user does not enjoy horror movies.", + "top1_score": 0.7303882056719367, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": 
"negation-011", + "query": "can the user eat shrimp?", + "effective_query": "can the user eat shrimp?", + "gold_in_topk": true, + "gold_global_key": "negation#8", + "gold_rank": 1, + "gold_score": 0.7044731236020111, + "top1_text": "The user does not eat shellfish.", + "top1_score": 0.7044731236020111, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-012", + "query": "is the user learning a new language?", + "effective_query": "is the user learning a new language?", + "gold_in_topk": true, + "gold_global_key": "negation#9", + "gold_rank": 1, + "gold_score": 0.6522323599643379, + "top1_text": "The user is not currently learning any new languages.", + "top1_score": 0.6522323599643379, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "control", + "n": 10, + "pool_size": 45, + "recall_at_1": 0, + "recall_at_5": 0, + "mean_gold_rank": null, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": null, + "items": [ + { + "id": "control-001", + "query": "what is the airspeed velocity of an unladen swallow?", + "effective_query": "what is the airspeed velocity of an unladen swallow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user does not eat shellfish.", + "top1_score": 0.25082327505676677, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-002", + "query": "who is the current president of France?", + "effective_query": "who is the current president of France?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.2564335064669229, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-003", + "query": "what is the capital of Mongolia?", + "effective_query": "what is the capital of Mongolia?", + "gold_in_topk": false, + 
"gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user has never been to Asia.", + "top1_score": 0.27119226736778723, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-004", + "query": "how does photosynthesis work?", + "effective_query": "how does photosynthesis work?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.13794111637514647, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-005", + "query": "translate 'goodnight' to Japanese", + "effective_query": "translate 'goodnight' to Japanese", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.", + "top1_score": 0.1911340870733358, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-006", + "query": "what year did World War II end?", + "effective_query": "what year did World War II end?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.2900090433169317, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-007", + "query": "explain entropy in thermodynamics", + "effective_query": "explain entropy in thermodynamics", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.18512903871088907, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-008", + "query": "best way to debug a segfault in C", + "effective_query": "best way to debug a segfault in C", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": 
null, + "gold_score": null, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.23013625373127655, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-009", + "query": "what's the weather going to be tomorrow?", + "effective_query": "what's the weather going to be tomorrow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.1484365589730125, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-010", + "query": "give me a recipe for tiramisu", + "effective_query": "give me a recipe for tiramisu", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.", + "top1_score": 0.3050188550209472, + "top1_is_distractor": false, + "false_positive": false + } + ] + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/baseline.json b/benchmarks/alignbench/runs/baseline.json new file mode 100644 index 0000000..e5023dd --- /dev/null +++ b/benchmarks/alignbench/runs/baseline.json @@ -0,0 +1,990 @@ +{ + "variant": "baseline", + "model": "Xenova/all-MiniLM-L6-v2", + "topk": 5, + "wall_seconds": "0.3", + "composite": { + "recall_at_1": 0.7333333333333333, + "recall_at_5": 0.9333333333333333, + "distractor_top1_rate": 0.06666666666666667, + "n": 60 + }, + "false_positive_rate": 0, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "pool_size": 55, + "recall_at_1": 0.7, + "recall_at_5": 1, + "mean_gold_rank": 1.3, + "distractor_at_top1": 2, + "false_positive_count": 0, + "median_gold_margin": 0.04656683064609568, + "items": [ + { + "id": "pronoun-001", + "query": "what is my name?", + "effective_query": "what is my name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 1, + "gold_score": 
0.3960303973923612, + "top1_text": "The user's name is Alex.", + "top1_score": 0.3960303973923612, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-002", + "query": "what's the user's name?", + "effective_query": "what's the user's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.7294812723464028, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.7771254660839599, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-003", + "query": "who am I?", + "effective_query": "who am I?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.2824821997084312, + "top1_text": "The user is me.", + "top1_score": 0.3964962994980233, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-004", + "query": "what do I do for work?", + "effective_query": "what do I do for work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 2, + "gold_score": 0.18618347998079776, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.19567938172618415, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-005", + "query": "what is the user's job?", + "effective_query": "what is the user's job?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 2, + "gold_score": 0.5754363621337681, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.5767198521038163, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-006", + "query": "where does the user work?", + "effective_query": "where does the user work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 1, + "gold_score": 0.5436570101621021, + "top1_text": "The user works as a software engineer at Acme.", + "top1_score": 0.5436570101621021, + "top1_is_distractor": 
false, + "false_positive": false + }, + { + "id": "pronoun-007", + "query": "what is my dog's name?", + "effective_query": "what is my dog's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.5711816859366411, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 0.5711816859366411, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-008", + "query": "who is Apollo?", + "effective_query": "who is Apollo?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.7072701654355713, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 0.7072701654355713, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-009", + "query": "where do I live?", + "effective_query": "where do I live?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 2, + "gold_score": 0.29566315429472595, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.2960941749345428, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-010", + "query": "what city does the user live in?", + "effective_query": "what city does the user live in?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 1, + "gold_score": 0.6400263303955707, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.6400263303955707, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-011", + "query": "when is my birthday?", + "effective_query": "when is my birthday?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.6279255850758502, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.6279255850758502, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-012", + "query": "when was the user born?", + "effective_query": "when was the user 
born?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.7251037734927323, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.7251037734927323, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-013", + "query": "do I have kids?", + "effective_query": "do I have kids?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.2345547444698841, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 0.2345547444698841, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-014", + "query": "how many children does the user have?", + "effective_query": "how many children does the user have?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.5766094762746933, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 0.5766094762746933, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-015", + "query": "what is my usual coffee order?", + "effective_query": "what is my usual coffee order?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 1, + "gold_score": 0.6001185695455631, + "top1_text": "The user's favorite coffee order is an oat-milk flat white.", + "top1_score": 0.6001185695455631, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-016", + "query": "what coffee does the user drink?", + "effective_query": "what coffee does the user drink?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 2, + "gold_score": 0.5813657391451515, + "top1_text": "The user does not drink coffee. 
They prefer tea.", + "top1_score": 0.7197601756319, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-017", + "query": "what did I study?", + "effective_query": "what did I study?", + "gold_in_topk": true, + "gold_global_key": "pronoun#7", + "gold_rank": 1, + "gold_score": 0.5746472233195816, + "top1_text": "The user studied applied mathematics at university.", + "top1_score": 0.5746472233195816, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-018", + "query": "do I have any allergies?", + "effective_query": "do I have any allergies?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.389079047513309, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.389079047513309, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-019", + "query": "what is the user allergic to?", + "effective_query": "what is the user allergic to?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.7261582549960168, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.7261582549960168, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-020", + "query": "what kind of car do I drive?", + "effective_query": "what kind of car do I drive?", + "gold_in_topk": true, + "gold_global_key": "pronoun#9", + "gold_rank": 1, + "gold_score": 0.39618091590057963, + "top1_text": "The user drives a 2019 Toyota Corolla.", + "top1_score": 0.39618091590057963, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "temporal", + "n": 14, + "pool_size": 55, + "recall_at_1": 0.5, + "recall_at_5": 0.7142857142857143, + "mean_gold_rank": 5.5, + "distractor_at_top1": 2, + "false_positive_count": 0, + "median_gold_margin": 0.049944619619024966, + "items": [ + { + "id": "temporal-001", + "query": "where does the user live now?", + 
"effective_query": "where does the user live now?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 3, + "gold_score": 0.550366425409688, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.5504016420681116, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-002", + "query": "where did the user used to live?", + "effective_query": "where did the user used to live?", + "gold_in_topk": true, + "gold_global_key": "temporal#1", + "gold_rank": 1, + "gold_score": 0.5490901457590983, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.5490901457590983, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-003", + "query": "when did the user move?", + "effective_query": "when did the user move?", + "gold_in_topk": true, + "gold_global_key": "temporal#2", + "gold_rank": 1, + "gold_score": 0.5076513080537272, + "top1_text": "The user moved from Berlin to Lisbon in 2024.", + "top1_score": 0.5076513080537272, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-004", + "query": "where is the user currently based?", + "effective_query": "where is the user currently based?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 11, + "gold_score": 0.42833906054622706, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.4962051712532369, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-005", + "query": "what is the user reading?", + "effective_query": "what is the user reading?", + "gold_in_topk": true, + "gold_global_key": "temporal#3", + "gold_rank": 8, + "gold_score": 0.4600491260020972, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 0.5634334413727431, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-006", + "query": "what did the user read last year?", + "effective_query": "what did 
the user read last year?", + "gold_in_topk": true, + "gold_global_key": "temporal#4", + "gold_rank": 1, + "gold_score": 0.5028289729480597, + "top1_text": "Last year the user read 'Project Hail Mary'.", + "top1_score": 0.5028289729480597, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-007", + "query": "what is the user working on these days?", + "effective_query": "what is the user working on these days?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 4, + "gold_score": 0.4513265913655406, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.48543171981436045, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-008", + "query": "what did the user finish last month?", + "effective_query": "what did the user finish last month?", + "gold_in_topk": true, + "gold_global_key": "temporal#6", + "gold_rank": 1, + "gold_score": 0.5544356167423142, + "top1_text": "The user finished the Sprint-4 reranker training last month.", + "top1_score": 0.5544356167423142, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-009", + "query": "what LLM does the user prefer?", + "effective_query": "what LLM does the user prefer?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 1, + "gold_score": 0.6226655286181774, + "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.", + "top1_score": 0.6226655286181774, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-010", + "query": "which model did the user use before?", + "effective_query": "which model did the user use before?", + "gold_in_topk": true, + "gold_global_key": "temporal#8", + "gold_rank": 1, + "gold_score": 0.5205286761987613, + "top1_text": "The user used GPT-4 as their primary model in 2024.", + "top1_score": 0.5205286761987613, + "top1_is_distractor": false, + "false_positive": false + }, + { 
+ "id": "temporal-011", + "query": "did the user get a new phone recently?", + "effective_query": "did the user get a new phone recently?", + "gold_in_topk": true, + "gold_global_key": "temporal#9", + "gold_rank": 1, + "gold_score": 0.6113316689987077, + "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.", + "top1_score": 0.6113316689987077, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-012", + "query": "is the user still in Berlin?", + "effective_query": "is the user still in Berlin?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 5, + "gold_score": 0.4089445689504921, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.7214467722165934, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-013", + "query": "what is the user up to?", + "effective_query": "what is the user up to?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 19, + "gold_score": 0.2672680179526975, + "top1_text": "The user is asking a question.", + "top1_score": 0.5409891051249414, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-014", + "query": "which model is the user on right now?", + "effective_query": "which model is the user on right now?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 20, + "gold_score": 0.31088477125594804, + "top1_text": "The user's name is Alex.", + "top1_score": 0.40861487066233426, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "specificity", + "n": 14, + "pool_size": 55, + "recall_at_1": 0.8571428571428571, + "recall_at_5": 1, + "mean_gold_rank": 1.2142857142857142, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.14383529034315978, + "items": [ + { + "id": "specificity-001", + "query": "tell me about my dog", + "effective_query": "tell me about my dog", + 
"gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 3, + "gold_score": 0.41588713324638765, + "top1_text": "The user has a dog named Apollo.", + "top1_score": 0.4531912105925747, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-002", + "query": "what kind of pet do I have?", + "effective_query": "what kind of pet do I have?", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 2, + "gold_score": 0.3644336367800417, + "top1_text": "The user has a dog named Apollo.", + "top1_score": 0.4007600036283448, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-003", + "query": "do I own a bike?", + "effective_query": "do I own a bike?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.5234729437047541, + "top1_text": "The user owns a Bianchi road bike.", + "top1_score": 0.5234729437047541, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-004", + "query": "what brand of bike does the user have?", + "effective_query": "what brand of bike does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.6329297203833848, + "top1_text": "The user owns a Bianchi road bike.", + "top1_score": 0.6329297203833848, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-005", + "query": "what computer do I use?", + "effective_query": "what computer do I use?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 0.2943695662572405, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.2943695662572405, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-006", + "query": "what laptop does the user have?", + "effective_query": "what laptop does the user have?", + "gold_in_topk": true, 
+ "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 0.6450468635073591, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.6450468635073591, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-007", + "query": "do I have any musical instruments?", + "effective_query": "do I have any musical instruments?", + "gold_in_topk": true, + "gold_global_key": "specificity#3", + "gold_rank": 1, + "gold_score": 0.28698273062526614, + "top1_text": "The user has a Yamaha P-125 digital piano in the living room.", + "top1_score": 0.28698273062526614, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-008", + "query": "which note-taking app do I use?", + "effective_query": "which note-taking app do I use?", + "gold_in_topk": true, + "gold_global_key": "specificity#4", + "gold_rank": 1, + "gold_score": 0.38230106432962924, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.38230106432962924, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-009", + "query": "where do I like to eat in Lisbon?", + "effective_query": "where do I like to eat in Lisbon?", + "gold_in_topk": true, + "gold_global_key": "specificity#5", + "gold_rank": 1, + "gold_score": 0.695091049183982, + "top1_text": "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.", + "top1_score": 0.695091049183982, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-010", + "query": "what brand sunglasses does the user wear?", + "effective_query": "what brand sunglasses does the user wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#6", + "gold_rank": 1, + "gold_score": 0.6512271965052092, + "top1_text": "The user wears Smith Lowdown sunglasses.", + "top1_score": 0.6512271965052092, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": 
"specificity-011", + "query": "do I read on a Kindle?", + "effective_query": "do I read on a Kindle?", + "gold_in_topk": true, + "gold_global_key": "specificity#7", + "gold_rank": 1, + "gold_score": 0.6508408387843772, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 0.6508408387843772, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-012", + "query": "what espresso machine does the user own?", + "effective_query": "what espresso machine does the user own?", + "gold_in_topk": true, + "gold_global_key": "specificity#8", + "gold_rank": 1, + "gold_score": 0.7366961226077487, + "top1_text": "The user's home espresso machine is a Lelit Bianca v3.", + "top1_score": 0.7366961226077487, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-013", + "query": "what shoes do I wear?", + "effective_query": "what shoes do I wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.25921156505496634, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.25921156505496634, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-014", + "query": "what brand are the user's everyday shoes?", + "effective_query": "what brand are the user's everyday shoes?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.4229939734066537, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.4229939734066537, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "negation", + "n": 12, + "pool_size": 55, + "recall_at_1": 0.9166666666666666, + "recall_at_5": 1, + "mean_gold_rank": 1.0833333333333333, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.26762270427014134, + "items": [ + { + "id": "negation-001", + "query": "does the user drink coffee?", + 
"effective_query": "does the user drink coffee?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 0.7905096599561829, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 0.7905096599561829, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-002", + "query": "what does the user drink in the morning?", + "effective_query": "what does the user drink in the morning?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 0.5285974920275435, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 0.5285974920275435, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-003", + "query": "am I vegetarian?", + "effective_query": "am I vegetarian?", + "gold_in_topk": true, + "gold_global_key": "negation#1", + "gold_rank": 1, + "gold_score": 0.6191490333752342, + "top1_text": "The user is not vegetarian, but avoids red meat.", + "top1_score": 0.6191490333752342, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-004", + "query": "is the user on Twitter?", + "effective_query": "is the user on Twitter?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.5874304481012608, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.5874304481012608, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-005", + "query": "which social networks does the user use?", + "effective_query": "which social networks does the user use?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.5067702908301415, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.5067702908301415, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": 
"negation-006", + "query": "does the user own a car?", + "effective_query": "does the user own a car?", + "gold_in_topk": true, + "gold_global_key": "negation#3", + "gold_rank": 1, + "gold_score": 0.7488288864422115, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 0.7488288864422115, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-007", + "query": "is the user active on LinkedIn?", + "effective_query": "is the user active on LinkedIn?", + "gold_in_topk": true, + "gold_global_key": "negation#4", + "gold_rank": 1, + "gold_score": 0.7880239246717338, + "top1_text": "The user is not on LinkedIn anymore.", + "top1_score": 0.7880239246717338, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-008", + "query": "any foods the user hates?", + "effective_query": "any foods the user hates?", + "gold_in_topk": true, + "gold_global_key": "negation#5", + "gold_rank": 2, + "gold_score": 0.4222244002602081, + "top1_text": "The user is not vegetarian, but avoids red meat.", + "top1_score": 0.4321470878666278, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-009", + "query": "has the user traveled to Asia?", + "effective_query": "has the user traveled to Asia?", + "gold_in_topk": true, + "gold_global_key": "negation#6", + "gold_rank": 1, + "gold_score": 0.6919597377462416, + "top1_text": "The user has never been to Asia.", + "top1_score": 0.6919597377462416, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-010", + "query": "does the user like horror movies?", + "effective_query": "does the user like horror movies?", + "gold_in_topk": true, + "gold_global_key": "negation#7", + "gold_rank": 1, + "gold_score": 0.7303882056719367, + "top1_text": "The user does not enjoy horror movies.", + "top1_score": 0.7303882056719367, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": 
"negation-011", + "query": "can the user eat shrimp?", + "effective_query": "can the user eat shrimp?", + "gold_in_topk": true, + "gold_global_key": "negation#8", + "gold_rank": 1, + "gold_score": 0.7044731236020111, + "top1_text": "The user does not eat shellfish.", + "top1_score": 0.7044731236020111, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-012", + "query": "is the user learning a new language?", + "effective_query": "is the user learning a new language?", + "gold_in_topk": true, + "gold_global_key": "negation#9", + "gold_rank": 1, + "gold_score": 0.6522323599643379, + "top1_text": "The user is not currently learning any new languages.", + "top1_score": 0.6522323599643379, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "control", + "n": 10, + "pool_size": 55, + "recall_at_1": 0, + "recall_at_5": 0, + "mean_gold_rank": null, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": null, + "items": [ + { + "id": "control-001", + "query": "what is the airspeed velocity of an unladen swallow?", + "effective_query": "what is the airspeed velocity of an unladen swallow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user does not eat shellfish.", + "top1_score": 0.25082327505676677, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-002", + "query": "who is the current president of France?", + "effective_query": "who is the current president of France?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.2564335064669229, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-003", + "query": "what is the capital of Mongolia?", + "effective_query": "what is the capital of Mongolia?", + "gold_in_topk": false, + 
"gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user has never been to Asia.", + "top1_score": 0.27119226736778723, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-004", + "query": "how does photosynthesis work?", + "effective_query": "how does photosynthesis work?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.13794111637514647, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-005", + "query": "translate 'goodnight' to Japanese", + "effective_query": "translate 'goodnight' to Japanese", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.", + "top1_score": 0.1911340870733358, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-006", + "query": "what year did World War II end?", + "effective_query": "what year did World War II end?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.2900090433169317, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-007", + "query": "explain entropy in thermodynamics", + "effective_query": "explain entropy in thermodynamics", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.18512903871088907, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-008", + "query": "best way to debug a segfault in C", + "effective_query": "best way to debug a segfault in C", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": 
null, + "gold_score": null, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.23013625373127655, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-009", + "query": "what's the weather going to be tomorrow?", + "effective_query": "what's the weather going to be tomorrow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of May 14, 2026, Apollo is a term mentioned in the conversation.", + "top1_score": 0.18279764320601347, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "control-010", + "query": "give me a recipe for tiramisu", + "effective_query": "give me a recipe for tiramisu", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.", + "top1_score": 0.3050188550209472, + "top1_is_distractor": false, + "false_positive": false + } + ] + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/combined.json b/benchmarks/alignbench/runs/combined.json new file mode 100644 index 0000000..f9acb6b --- /dev/null +++ b/benchmarks/alignbench/runs/combined.json @@ -0,0 +1,990 @@ +{ + "variant": "combined", + "model": "Xenova/all-MiniLM-L6-v2", + "topk": 5, + "wall_seconds": "0.6", + "composite": { + "recall_at_1": 0.65, + "recall_at_5": 0.9333333333333333, + "distractor_top1_rate": 0.08333333333333333, + "n": 60 + }, + "false_positive_rate": 1, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "pool_size": 108, + "recall_at_1": 0.65, + "recall_at_5": 0.95, + "mean_gold_rank": 2.1, + "distractor_at_top1": 2, + "false_positive_count": 0, + "median_gold_margin": 0.057432213194370973, + "items": [ + { + "id": "pronoun-001", + "query": "what is my name?", + "effective_query": "what is the user's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 1, + 
"gold_score": 0.9531048072303069, + "top1_text": "The user's name is Alex.", + "top1_score": 0.9531048072303069, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-002", + "query": "what's the user's name?", + "effective_query": "what's the user's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.9659583575722968, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.9867746931367039, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-003", + "query": "who am I?", + "effective_query": "who am the user?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 4, + "gold_score": 0.6302996998867183, + "top1_text": "I am me.", + "top1_score": 0.811597275977491, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-004", + "query": "what do I do for work?", + "effective_query": "what do the user do for work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 4, + "gold_score": 0.6525810032024324, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.9315629118324019, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-005", + "query": "what is the user's job?", + "effective_query": "what is the user's job?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 10, + "gold_score": 0.6920306628772953, + "top1_text": "The user's name is Alex.", + "top1_score": 0.8452295596626733, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-006", + "query": "where does the user work?", + "effective_query": "where does the user work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 5, + "gold_score": 0.7016151842914757, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.8553753574007144, + 
"top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-007", + "query": "what is my dog's name?", + "effective_query": "what is the user's dog's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.9830121625541873, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 0.9830121625541873, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-008", + "query": "who is Apollo?", + "effective_query": "who is Apollo?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.9783254585435431, + "top1_text": "My dog is named Apollo.", + "top1_score": 0.9783254585435431, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-009", + "query": "where do I live?", + "effective_query": "where do the user live?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 1, + "gold_score": 0.7741606960366889, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.7741606960366889, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-010", + "query": "what city does the user live in?", + "effective_query": "what city does the user live in?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 1, + "gold_score": 0.8871245980434483, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.8871245980434483, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-011", + "query": "when is my birthday?", + "effective_query": "when is the user's birthday?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's birthday is March 14.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-012", + "query": "when was the user born?", + "effective_query": "when was the user born?", + 
"gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.7067061203590985, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.7067061203590985, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-013", + "query": "do I have kids?", + "effective_query": "do the user have kids?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.7745183790985406, + "top1_text": "I have two children, Maya and Theo.", + "top1_score": 0.7745183790985406, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-014", + "query": "how many children does the user have?", + "effective_query": "how many children does the user have?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.8830521647230488, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 0.8830521647230488, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-015", + "query": "what is my usual coffee order?", + "effective_query": "what is the user's usual coffee order?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's favorite coffee order is an oat-milk flat white.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-016", + "query": "what coffee does the user drink?", + "effective_query": "what coffee does the user drink?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 2, + "gold_score": 0.6219843435521409, + "top1_text": "The user does not drink coffee. 
They prefer tea.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-017", + "query": "what did I study?", + "effective_query": "what did the user study?", + "gold_in_topk": true, + "gold_global_key": "pronoun#7", + "gold_rank": 1, + "gold_score": 0.89575, + "top1_text": "The user studied applied mathematics at university.", + "top1_score": 0.89575, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-018", + "query": "do I have any allergies?", + "effective_query": "do the user have any allergies?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.7223693332633677, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.7223693332633677, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-019", + "query": "what is the user allergic to?", + "effective_query": "what is the user allergic to?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-020", + "query": "what kind of car do I drive?", + "effective_query": "what kind of car do the user drive?", + "gold_in_topk": true, + "gold_global_key": "pronoun#9", + "gold_rank": 2, + "gold_score": 0.7002924800490933, + "top1_text": "I do not own a car; they bike or use public transit.", + "top1_score": 0.9137331184077017, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "temporal", + "n": 14, + "pool_size": 108, + "recall_at_1": 0.5, + "recall_at_5": 0.7857142857142857, + "mean_gold_rank": 5.285714285714286, + "distractor_at_top1": 3, + "false_positive_count": 0, + "median_gold_margin": 0.11447343705680679, + "items": [ + { + "id": "temporal-001", + "query": "where does the user live now?", + "effective_query": "where does the 
user live now?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 3, + "gold_score": 0.7015808512434873, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.729722742382345, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-002", + "query": "where did the user used to live?", + "effective_query": "where did the user used to live?", + "gold_in_topk": true, + "gold_global_key": "temporal#1", + "gold_rank": 4, + "gold_score": 0.7268414774245511, + "top1_text": "The user moved from Berlin to Lisbon in 2024.", + "top1_score": 0.8198598847083589, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-003", + "query": "when did the user move?", + "effective_query": "when did the user move?", + "gold_in_topk": true, + "gold_global_key": "temporal#2", + "gold_rank": 2, + "gold_score": 0.8637093178778421, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.9309845833792503, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-004", + "query": "where is the user currently based?", + "effective_query": "where is the user currently based?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 16, + "gold_score": 0.6237210510001825, + "top1_text": "The user is me.", + "top1_score": 0.8091389415487087, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-005", + "query": "what is the user reading?", + "effective_query": "what is the user reading?", + "gold_in_topk": true, + "gold_global_key": "temporal#3", + "gold_rank": 1, + "gold_score": 0.8963167330268929, + "top1_text": "As of April 2026, the user is reading 'The Power Broker'.", + "top1_score": 0.8963167330268929, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-006", + "query": "what did the user read last year?", + "effective_query": "what did the user read last year?", + "gold_in_topk": true, 
+ "gold_global_key": "temporal#4", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "Last year the user read 'Project Hail Mary'.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-007", + "query": "what is the user working on these days?", + "effective_query": "what is the user working on these days?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 1, + "gold_score": 0.9636797079360075, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.9636797079360075, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-008", + "query": "what did the user finish last month?", + "effective_query": "what did the user finish last month?", + "gold_in_topk": true, + "gold_global_key": "temporal#6", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user finished the Sprint-4 reranker training last month.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-009", + "query": "what LLM does the user prefer?", + "effective_query": "what LLM does the user prefer?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 1, + "gold_score": 0.7981511497774132, + "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.", + "top1_score": 0.7981511497774132, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-010", + "query": "which model did the user use before?", + "effective_query": "which model did the user use before?", + "gold_in_topk": true, + "gold_global_key": "temporal#8", + "gold_rank": 1, + "gold_score": 0.9218020917135962, + "top1_text": "The user used GPT-4 as their primary model in 2024.", + "top1_score": 0.9218020917135962, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-011", + "query": "did the user get a new phone recently?", + "effective_query": "did the 
user get a new phone recently?", + "gold_in_topk": true, + "gold_global_key": "temporal#9", + "gold_rank": 1, + "gold_score": 0.944084372003835, + "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.", + "top1_score": 0.944084372003835, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-012", + "query": "is the user still in Berlin?", + "effective_query": "is the user still in Berlin?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 5, + "gold_score": 0.5384570728328757, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-013", + "query": "what is the user up to?", + "effective_query": "what is the user up to?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 19, + "gold_score": 0.4820469206032369, + "top1_text": "The user is asking a question.", + "top1_score": 0.7983877869260091, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-014", + "query": "which model is the user on right now?", + "effective_query": "which model is the user on right now?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 18, + "gold_score": 0.6134604774711914, + "top1_text": "The user is not on LinkedIn anymore.", + "top1_score": 0.881723970958755, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "specificity", + "n": 14, + "pool_size": 108, + "recall_at_1": 0.5, + "recall_at_5": 1, + "mean_gold_rank": 1.7142857142857142, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.03548021534901058, + "items": [ + { + "id": "specificity-001", + "query": "tell me about my dog", + "effective_query": "tell the user about the user's dog", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 3, + "gold_score": 0.7743385636424491, + "top1_text": 
"The user's dog is named Apollo.", + "top1_score": 0.9703829855046132, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-002", + "query": "what kind of pet do I have?", + "effective_query": "what kind of pet do the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 3, + "gold_score": 0.6345967882412202, + "top1_text": "I have a dog named Apollo.", + "top1_score": 0.883261556964638, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-003", + "query": "do I own a bike?", + "effective_query": "do the user own a bike?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 2, + "gold_score": 0.8662753596268623, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 0.8890501788742958, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-004", + "query": "what brand of bike does the user have?", + "effective_query": "what brand of bike does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.9277061278857603, + "top1_text": "The user owns a Bianchi road bike.", + "top1_score": 0.9277061278857603, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-005", + "query": "what computer do I use?", + "effective_query": "what computer do the user use?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 2, + "gold_score": 0.6612878214300802, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.7106184333803608, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-006", + "query": "what laptop does the user have?", + "effective_query": "what laptop does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 
0.9411685874536975, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.9411685874536975, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-007", + "query": "do I have any musical instruments?", + "effective_query": "do the user have any musical instruments?", + "gold_in_topk": true, + "gold_global_key": "specificity#3", + "gold_rank": 2, + "gold_score": 0.7324865742655697, + "top1_text": "The user is not currently learning any new languages.", + "top1_score": 0.8134921163906099, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-008", + "query": "which note-taking app do I use?", + "effective_query": "which note-taking app do the user use?", + "gold_in_topk": true, + "gold_global_key": "specificity#4", + "gold_rank": 2, + "gold_score": 0.6547377436617509, + "top1_text": "I do not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.712198423325952, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-009", + "query": "where do I like to eat in Lisbon?", + "effective_query": "where do the user like to eat in Lisbon?", + "gold_in_topk": true, + "gold_global_key": "specificity#5", + "gold_rank": 3, + "gold_score": 0.8041812823837996, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.8406389607939662, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-010", + "query": "what brand sunglasses does the user wear?", + "effective_query": "what brand sunglasses does the user wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#6", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user wears Smith Lowdown sunglasses.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-011", + "query": "do I read on a Kindle?", + "effective_query": "do the user read on a Kindle?", + "gold_in_topk": true, + 
"gold_global_key": "specificity#7", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-012", + "query": "what espresso machine does the user own?", + "effective_query": "what espresso machine does the user own?", + "gold_in_topk": true, + "gold_global_key": "specificity#8", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's home espresso machine is a Lelit Bianca v3.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-013", + "query": "what shoes do I wear?", + "effective_query": "what shoes do the user wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.7416206214516285, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.7416206214516285, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-014", + "query": "what brand are the user's everyday shoes?", + "effective_query": "what brand are the user's everyday shoes?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.7357707060378677, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.7357707060378677, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "negation", + "n": 12, + "pool_size": 108, + "recall_at_1": 1, + "recall_at_5": 1, + "mean_gold_rank": 1, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.49515934695105424, + "items": [ + { + "id": "negation-001", + "query": "does the user drink coffee?", + "effective_query": "does the user drink coffee?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not drink coffee. 
They prefer tea.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-002", + "query": "what does the user drink in the morning?", + "effective_query": "what does the user drink in the morning?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-003", + "query": "am I vegetarian?", + "effective_query": "am the user vegetarian?", + "gold_in_topk": true, + "gold_global_key": "negation#1", + "gold_rank": 1, + "gold_score": 0.94940384673181, + "top1_text": "I am not vegetarian, but avoids red meat.", + "top1_score": 0.94940384673181, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-004", + "query": "is the user on Twitter?", + "effective_query": "is the user on Twitter?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.9064307475058953, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.9064307475058953, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-005", + "query": "which social networks does the user use?", + "effective_query": "which social networks does the user use?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-006", + "query": "does the user own a car?", + "effective_query": "does the user own a car?", + "gold_in_topk": true, + "gold_global_key": "negation#3", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 1, + 
"top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-007", + "query": "is the user active on LinkedIn?", + "effective_query": "is the user active on LinkedIn?", + "gold_in_topk": true, + "gold_global_key": "negation#4", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user is not on LinkedIn anymore.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-008", + "query": "any foods the user hates?", + "effective_query": "any foods the user hates?", + "gold_in_topk": true, + "gold_global_key": "negation#5", + "gold_rank": 1, + "gold_score": 0.6856226645347278, + "top1_text": "The user dislikes cilantro intensely.", + "top1_score": 0.6856226645347278, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-009", + "query": "has the user traveled to Asia?", + "effective_query": "has the user traveled to Asia?", + "gold_in_topk": true, + "gold_global_key": "negation#6", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user has never been to Asia.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-010", + "query": "does the user like horror movies?", + "effective_query": "does the user like horror movies?", + "gold_in_topk": true, + "gold_global_key": "negation#7", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not enjoy horror movies.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-011", + "query": "can the user eat shrimp?", + "effective_query": "can the user eat shrimp?", + "gold_in_topk": true, + "gold_global_key": "negation#8", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not eat shellfish.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-012", + "query": "is the user learning a new language?", + "effective_query": "is the user learning a 
new language?", + "gold_in_topk": true, + "gold_global_key": "negation#9", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user is not currently learning any new languages.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "control", + "n": 10, + "pool_size": 108, + "recall_at_1": 0, + "recall_at_5": 0, + "mean_gold_rank": null, + "distractor_at_top1": 0, + "false_positive_count": 10, + "median_gold_margin": null, + "items": [ + { + "id": "control-001", + "query": "what is the airspeed velocity of an unladen swallow?", + "effective_query": "what is the airspeed velocity of an unladen swallow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user's favorite coffee order is an oat-milk flat white.", + "top1_score": 0.7462706361415277, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-002", + "query": "who is the current president of France?", + "effective_query": "who is the current president of France?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.8963307525203645, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-003", + "query": "what is the capital of Mongolia?", + "effective_query": "what is the capital of Mongolia?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of April 2026, the user is reading 'The Power Broker'.", + "top1_score": 0.7321019562425386, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-004", + "query": "how does photosynthesis work?", + "effective_query": "how does photosynthesis work?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user does not eat 
shellfish.", + "top1_score": 0.8448230497006028, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-005", + "query": "translate 'goodnight' to Japanese", + "effective_query": "translate 'goodnight' to Japanese", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "I have never been to Asia.", + "top1_score": 0.7858603613747184, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-006", + "query": "what year did World War II end?", + "effective_query": "what year did World War II end?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "Last year I read 'Project Hail Mary'.", + "top1_score": 0.6729541267175014, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-007", + "query": "explain entropy in thermodynamics", + "effective_query": "explain entropy in thermodynamics", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "I lives in Lisbon.", + "top1_score": 0.7556669023961569, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-008", + "query": "best way to debug a segfault in C", + "effective_query": "best way to debug a segfault in C", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "I am currently working on a memory benchmark project.", + "top1_score": 0.7108768572279539, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-009", + "query": "what's the weather going to be tomorrow?", + "effective_query": "what's the weather going to be tomorrow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.", + "top1_score": 0.6556307580300376, + "top1_is_distractor": 
false, + "false_positive": true + }, + { + "id": "control-010", + "query": "give me a recipe for tiramisu", + "effective_query": "give the user a recipe for tiramisu", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.7954885232893061, + "top1_is_distractor": true, + "false_positive": true + } + ] + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/demo-stress-2026-05-15.json b/benchmarks/alignbench/runs/demo-stress-2026-05-15.json new file mode 100644 index 0000000..3923877 --- /dev/null +++ b/benchmarks/alignbench/runs/demo-stress-2026-05-15.json @@ -0,0 +1,945 @@ +{ + "baseline": { + "recall_at_1": 0.5666666666666667, + "recall_at_5": 0.5666666666666667, + "gold_present_rate": 0.5666666666666667, + "meta_at_top1": 0, + "per_item": [ + { + "id": "name-001", + "gold_rank": 1, + "gold_score": 0.3960303068161011, + "top1_score": 0.3960303068161011, + "top1_text": "The user's name is Alex.", + "top1_is_meta": false + }, + { + "id": "name-002", + "gold_rank": null, + "gold_score": null, + "top1_score": 0.340599000453949, + "top1_text": "The user goes by the name Sam.", + "top1_is_meta": false + }, + { + "id": "name-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "name-004", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "pet-001", + "gold_rank": 1, + "gold_score": 0.4510817527770996, + "top1_score": 0.4510817527770996, + "top1_text": "User has a golden retriever named Apollo.", + "top1_is_meta": false + }, + { + "id": "pet-002", + "gold_rank": 1, + "gold_score": 0.5039844512939453, + "top1_score": 0.5039844512939453, + "top1_text": "The user has a cat named Luna that sleeps on the user's keyboard.", + "top1_is_meta": false + }, + { + "id": "pet-003", + "gold_rank": null, + "gold_score": null, + 
"top1_is_meta": false, + "top1_text": null + }, + { + "id": "job-001", + "gold_rank": 1, + "gold_score": 0.19333378970623016, + "top1_score": 0.19333378970623016, + "top1_text": "User works as a software engineer at a startup", + "top1_is_meta": false + }, + { + "id": "job-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "job-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "city-001", + "gold_rank": 1, + "gold_score": 0.295663058757782, + "top1_score": 0.295663058757782, + "top1_text": "The user lives in Lisbon.", + "top1_is_meta": false + }, + { + "id": "city-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "city-003", + "gold_rank": 1, + "gold_score": 0.3179514408111572, + "top1_score": 0.3179514408111572, + "top1_text": "User is based in Toronto", + "top1_is_meta": false + }, + { + "id": "food-001", + "gold_rank": 1, + "gold_score": 0.4597603678703308, + "top1_score": 0.4597603678703308, + "top1_text": "The user is vegetarian.", + "top1_is_meta": false + }, + { + "id": "food-002", + "gold_rank": 1, + "gold_score": 0.41524738073349, + "top1_score": 0.41524738073349, + "top1_text": "User is severely allergic to peanuts", + "top1_is_meta": false + }, + { + "id": "food-003", + "gold_rank": 1, + "gold_score": 0.3149101436138153, + "top1_score": 0.3149101436138153, + "top1_text": "User does not drink coffee and only drinks tea.", + "top1_is_meta": false + }, + { + "id": "hobby-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "hobby-002", + "gold_rank": 1, + "gold_score": 0.387075811624527, + "top1_score": 0.387075811624527, + "top1_text": "The user's main sport is rock climbing.", + "top1_is_meta": false + }, + { + "id": "hobby-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + 
}, + { + "id": "family-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "family-002", + "gold_rank": 1, + "gold_score": 0.5129045844078064, + "top1_score": 0.5129045844078064, + "top1_text": "The user's partner's name is Casey.", + "top1_is_meta": false + }, + { + "id": "family-003", + "gold_rank": 1, + "gold_score": 0.6258488893508911, + "top1_score": 0.6258488893508911, + "top1_text": "The user's mom lives in Vancouver.", + "top1_is_meta": false + }, + { + "id": "vehicle-001", + "gold_rank": 1, + "gold_score": 0.35032275319099426, + "top1_score": 0.35032275319099426, + "top1_text": "User drives a blue Subaru Outback.", + "top1_is_meta": false + }, + { + "id": "vehicle-002", + "gold_rank": null, + "gold_score": null, + "top1_score": 0.383465051651001, + "top1_text": "User does not own a car and bikes everywhere for transportation.", + "top1_is_meta": false + }, + { + "id": "edu-001", + "gold_rank": 1, + "gold_score": 0.43980997800827026, + "top1_score": 0.43980997800827026, + "top1_text": "User studied applied mathematics in college.", + "top1_is_meta": false + }, + { + "id": "edu-002", + "gold_rank": 1, + "gold_score": 0.6381034255027771, + "top1_score": 0.6381034255027771, + "top1_text": "User obtained an MBA from UCLA two years ago", + "top1_is_meta": false + }, + { + "id": "tech-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "tech-002", + "gold_rank": 1, + "gold_score": 0.4922914206981659, + "top1_score": 0.4922914206981659, + "top1_text": "User prefers Neovim over VS Code as their text editor.", + "top1_is_meta": false + }, + { + "id": "music-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "music-002", + "gold_rank": 1, + "gold_score": 0.5527176260948181, + "top1_score": 0.5527176260948181, + "top1_text": "The user's all-time favorite band is Radiohead.", + "top1_is_meta": 
false + } + ] + }, + "filtered": { + "recall_at_1": 0.5666666666666667, + "recall_at_5": 0.5666666666666667, + "gold_present_rate": 0.5666666666666667, + "meta_at_top1": 0, + "per_item": [ + { + "id": "name-001", + "gold_rank": 1, + "gold_score": 0.3960303068161011, + "top1_score": 0.3960303068161011, + "top1_text": "The user's name is Alex.", + "top1_is_meta": false + }, + { + "id": "name-002", + "gold_rank": null, + "gold_score": null, + "top1_score": 0.340599000453949, + "top1_text": "The user goes by the name Sam.", + "top1_is_meta": false + }, + { + "id": "name-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "name-004", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "pet-001", + "gold_rank": 1, + "gold_score": 0.4510817527770996, + "top1_score": 0.4510817527770996, + "top1_text": "User has a golden retriever named Apollo.", + "top1_is_meta": false + }, + { + "id": "pet-002", + "gold_rank": 1, + "gold_score": 0.5039844512939453, + "top1_score": 0.5039844512939453, + "top1_text": "The user has a cat named Luna that sleeps on the user's keyboard.", + "top1_is_meta": false + }, + { + "id": "pet-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "job-001", + "gold_rank": 1, + "gold_score": 0.19333378970623016, + "top1_score": 0.19333378970623016, + "top1_text": "User works as a software engineer at a startup", + "top1_is_meta": false + }, + { + "id": "job-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "job-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "city-001", + "gold_rank": 1, + "gold_score": 0.295663058757782, + "top1_score": 0.295663058757782, + "top1_text": "The user lives in Lisbon.", + "top1_is_meta": false + }, + { + "id": "city-002", + "gold_rank": 
null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "city-003", + "gold_rank": 1, + "gold_score": 0.3179514408111572, + "top1_score": 0.3179514408111572, + "top1_text": "User is based in Toronto", + "top1_is_meta": false + }, + { + "id": "food-001", + "gold_rank": 1, + "gold_score": 0.4597603678703308, + "top1_score": 0.4597603678703308, + "top1_text": "The user is vegetarian.", + "top1_is_meta": false + }, + { + "id": "food-002", + "gold_rank": 1, + "gold_score": 0.41524738073349, + "top1_score": 0.41524738073349, + "top1_text": "User is severely allergic to peanuts", + "top1_is_meta": false + }, + { + "id": "food-003", + "gold_rank": 1, + "gold_score": 0.3149101436138153, + "top1_score": 0.3149101436138153, + "top1_text": "User does not drink coffee and only drinks tea.", + "top1_is_meta": false + }, + { + "id": "hobby-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "hobby-002", + "gold_rank": 1, + "gold_score": 0.387075811624527, + "top1_score": 0.387075811624527, + "top1_text": "The user's main sport is rock climbing.", + "top1_is_meta": false + }, + { + "id": "hobby-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "family-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "family-002", + "gold_rank": 1, + "gold_score": 0.5129045844078064, + "top1_score": 0.5129045844078064, + "top1_text": "The user's partner's name is Casey.", + "top1_is_meta": false + }, + { + "id": "family-003", + "gold_rank": 1, + "gold_score": 0.6258488893508911, + "top1_score": 0.6258488893508911, + "top1_text": "The user's mom lives in Vancouver.", + "top1_is_meta": false + }, + { + "id": "vehicle-001", + "gold_rank": 1, + "gold_score": 0.35032275319099426, + "top1_score": 0.35032275319099426, + "top1_text": "User drives a blue Subaru Outback.", + "top1_is_meta": false 
+ }, + { + "id": "vehicle-002", + "gold_rank": null, + "gold_score": null, + "top1_score": 0.383465051651001, + "top1_text": "User does not own a car and bikes everywhere for transportation.", + "top1_is_meta": false + }, + { + "id": "edu-001", + "gold_rank": 1, + "gold_score": 0.43980997800827026, + "top1_score": 0.43980997800827026, + "top1_text": "User studied applied mathematics in college.", + "top1_is_meta": false + }, + { + "id": "edu-002", + "gold_rank": 1, + "gold_score": 0.6381034255027771, + "top1_score": 0.6381034255027771, + "top1_text": "User obtained an MBA from UCLA two years ago", + "top1_is_meta": false + }, + { + "id": "tech-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "tech-002", + "gold_rank": 1, + "gold_score": 0.4922914206981659, + "top1_score": 0.4922914206981659, + "top1_text": "User prefers Neovim over VS Code as their text editor.", + "top1_is_meta": false + }, + { + "id": "music-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "music-002", + "gold_rank": 1, + "gold_score": 0.5527176260948181, + "top1_score": 0.5527176260948181, + "top1_text": "The user's all-time favorite band is Radiohead.", + "top1_is_meta": false + } + ] + }, + "n": 30, + "rows": [ + { + "id": "name-001", + "turns": [ + "My name is Alex.", + "Got it." + ], + "query": "what is my name?", + "gold": "name is Alex", + "facts": [ + "The user's name is Alex." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-002", + "turns": [ + "I go by Sam.", + "OK Sam." + ], + "query": "what's my name?", + "gold": "go by Sam", + "facts": [ + "The user goes by the name Sam." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-003", + "turns": [ + "You can call me Riley.", + "Hi Riley." 
+ ], + "query": "what should you call me?", + "gold": "call me Riley", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "name-004", + "turns": [ + "I'm Jordan.", + "Nice to meet you." + ], + "query": "who am I?", + "gold": "Jordan", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "pet-001", + "turns": [ + "I have a golden retriever named Apollo.", + "How sweet." + ], + "query": "what is my dog's name?", + "gold": "Apollo", + "facts": [ + "User has a golden retriever named Apollo." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "pet-002", + "turns": [ + "My cat Luna sleeps on my keyboard.", + "Classic cat." + ], + "query": "what's my cat's name?", + "gold": "Luna", + "facts": [ + "The user has a cat named Luna that sleeps on the user's keyboard." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "pet-003", + "turns": [ + "I just adopted a beagle puppy. Her name is Penny.", + "Congrats!" + ], + "query": "what kind of dog do I have?", + "gold": "beagle", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "job-001", + "turns": [ + "I work as a software engineer at a startup.", + "Cool field." + ], + "query": "what do I do for work?", + "gold": "software engineer", + "facts": [ + "User works as a software engineer at a startup" + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "job-002", + "turns": [ + "I'm a high school chemistry teacher.", + "That's important work." + ], + "query": "what is my profession?", + "gold": "chemistry teacher", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "job-003", + "turns": [ + "I freelance as a graphic designer.", + "Nice." 
+ ], + "query": "what's my job?", + "gold": "graphic designer", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "city-001", + "turns": [ + "I live in Lisbon now.", + "Beautiful city." + ], + "query": "where do I live?", + "gold": "Lisbon", + "facts": [ + "The user lives in Lisbon." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-002", + "turns": [ + "I just moved to Berlin last month.", + "Welcome to Berlin." + ], + "query": "what city am I in?", + "gold": "Berlin", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "city-003", + "turns": [ + "I'm based in Toronto.", + "Cold this time of year." + ], + "query": "where am I located?", + "gold": "Toronto", + "facts": [ + "User is based in Toronto" + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-001", + "turns": [ + "I'm vegetarian.", + "Got it." + ], + "query": "do I eat meat?", + "gold": "vegetarian", + "facts": [ + "The user is vegetarian." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-002", + "turns": [ + "I'm severely allergic to peanuts.", + "Noted, will avoid." + ], + "query": "do I have any allergies?", + "gold": "peanut", + "facts": [ + "User is severely allergic to peanuts" + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-003", + "turns": [ + "I don't drink coffee \u2014 only tea.", + "Tea is great too." + ], + "query": "what do I drink in the morning?", + "gold": "tea", + "facts": [ + "User does not drink coffee and only drinks tea." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-001", + "turns": [ + "I play classical piano.", + "Lovely hobby." 
+ ], + "query": "what instrument do I play?", + "gold": "piano", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "hobby-002", + "turns": [ + "My main sport is rock climbing.", + "Cool." + ], + "query": "what sport do I do?", + "gold": "rock climbing", + "facts": [ + "The user's main sport is rock climbing." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-003", + "turns": [ + "I've been knitting for about ten years.", + "Impressive." + ], + "query": "what's a hobby I have?", + "gold": "knitting", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "family-001", + "turns": [ + "I have two kids, Maya and Theo.", + "What ages?" + ], + "query": "how many children do I have?", + "gold": "two", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "family-002", + "turns": [ + "My partner's name is Casey.", + "Nice." + ], + "query": "who is my partner?", + "gold": "Casey", + "facts": [ + "The user's partner's name is Casey." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-003", + "turns": [ + "My mom lives in Vancouver.", + "Far from you?" + ], + "query": "where does my mom live?", + "gold": "Vancouver", + "facts": [ + "The user's mom lives in Vancouver." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "vehicle-001", + "turns": [ + "I drive a blue Subaru Outback.", + "Reliable car." + ], + "query": "what kind of car do I have?", + "gold": "Subaru", + "facts": [ + "User drives a blue Subaru Outback." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "vehicle-002", + "turns": [ + "I don't own a car. I bike everywhere.", + "Healthy lifestyle." + ], + "query": "do I have a car?", + "gold": "don't own", + "facts": [ + "User does not own a car and bikes everywhere for transportation." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "edu-001", + "turns": [ + "I studied applied mathematics in college.", + "Tough major." + ], + "query": "what was my major?", + "gold": "applied mathematics", + "facts": [ + "User studied applied mathematics in college." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "edu-002", + "turns": [ + "I got my MBA from UCLA two years ago.", + "Congrats." + ], + "query": "where did I get my MBA?", + "gold": "UCLA", + "facts": [ + "User obtained an MBA from UCLA two years ago" + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "tech-001", + "turns": [ + "My main laptop is a 16-inch MacBook Pro.", + "Solid machine." + ], + "query": "what computer do I use?", + "gold": "MacBook", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "tech-002", + "turns": [ + "I prefer Neovim over VS Code.", + "Editor preferences are personal." + ], + "query": "what editor do I use?", + "gold": "Neovim", + "facts": [ + "User prefers Neovim over VS Code as their text editor." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "music-001", + "turns": [ + "I've been getting into bluegrass lately.", + "Fun genre." + ], + "query": "what music am I into these days?", + "gold": "bluegrass", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "music-002", + "turns": [ + "My all-time favorite band is Radiohead.", + "Great band." + ], + "query": "what's my favorite band?", + "gold": "Radiohead", + "facts": [ + "The user's all-time favorite band is Radiohead." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/demo-stress-v2-prompt-tuned-2026-05-15.json b/benchmarks/alignbench/runs/demo-stress-v2-prompt-tuned-2026-05-15.json new file mode 100644 index 0000000..83af4e4 --- /dev/null +++ b/benchmarks/alignbench/runs/demo-stress-v2-prompt-tuned-2026-05-15.json @@ -0,0 +1,969 @@ +{ + "baseline": { + "recall_at_1": 0.7333333333333333, + "recall_at_5": 0.7333333333333333, + "gold_present_rate": 0.7333333333333333, + "meta_at_top1": 0, + "per_item": [ + { + "id": "name-001", + "gold_rank": 1, + "gold_score": 0.3960303068161011, + "top1_score": 0.3960303068161011, + "top1_text": "The user's name is Alex.", + "top1_is_meta": false + }, + { + "id": "name-002", + "gold_rank": 1, + "gold_score": 0.24057143926620483, + "top1_score": 0.24057143926620483, + "top1_text": "The user goes by Sam.", + "top1_is_meta": false + }, + { + "id": "name-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "name-004", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "pet-001", + "gold_rank": 1, + "gold_score": 0.46046364307403564, + "top1_score": 0.46046364307403564, + "top1_text": "The user has a golden retriever named Apollo.", + "top1_is_meta": false + }, + { + "id": "pet-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "pet-003", + "gold_rank": 1, + "gold_score": 0.4039832651615143, + "top1_score": 0.4039832651615143, + "top1_text": "The user adopted a beagle puppy named Penny.", + "top1_is_meta": false + }, + { + "id": "job-001", + "gold_rank": 1, + "gold_score": 0.18614771962165833, + "top1_score": 0.18614771962165833, + "top1_text": "The user works as a software engineer at a startup.", + "top1_is_meta": false + }, + { + "id": "job-002", + "gold_rank": 1, + "gold_score": 
0.2045474350452423, + "top1_score": 0.2045474350452423, + "top1_text": "The user is a high school chemistry teacher.", + "top1_is_meta": false + }, + { + "id": "job-003", + "gold_rank": 1, + "gold_score": 0.3352947533130646, + "top1_score": 0.3352947533130646, + "top1_text": "The user freelances as a graphic designer.", + "top1_is_meta": false + }, + { + "id": "city-001", + "gold_rank": 1, + "gold_score": 0.2956629991531372, + "top1_score": 0.2956629991531372, + "top1_text": "The user lives in Lisbon.", + "top1_is_meta": false + }, + { + "id": "city-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "city-003", + "gold_rank": 1, + "gold_score": 0.33762556314468384, + "top1_score": 0.33762556314468384, + "top1_text": "The user is based in Toronto.", + "top1_is_meta": false + }, + { + "id": "food-001", + "gold_rank": 1, + "gold_score": 0.459760457277298, + "top1_score": 0.459760457277298, + "top1_text": "The user is vegetarian.", + "top1_is_meta": false + }, + { + "id": "food-002", + "gold_rank": 1, + "gold_score": 0.38580864667892456, + "top1_score": 0.38580864667892456, + "top1_text": "The user is severely allergic to peanuts.", + "top1_is_meta": false + }, + { + "id": "food-003", + "gold_rank": 1, + "gold_score": 0.30525362491607666, + "top1_score": 0.30525362491607666, + "top1_text": "The user does not drink coffee and only drinks tea.", + "top1_is_meta": false + }, + { + "id": "hobby-001", + "gold_rank": 1, + "gold_score": 0.38078930974006653, + "top1_score": 0.38078930974006653, + "top1_text": "The user plays classical piano.", + "top1_is_meta": false + }, + { + "id": "hobby-002", + "gold_rank": 1, + "gold_score": 0.38707590103149414, + "top1_score": 0.38707590103149414, + "top1_text": "The user's main sport is rock climbing.", + "top1_is_meta": false + }, + { + "id": "hobby-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "family-001", + 
"gold_rank": 1, + "gold_score": 0.28777992725372314, + "top1_score": 0.28777992725372314, + "top1_text": "The user has two children named Maya and Theo.", + "top1_is_meta": false + }, + { + "id": "family-002", + "gold_rank": 1, + "gold_score": 0.5129046440124512, + "top1_score": 0.5129046440124512, + "top1_text": "The user's partner's name is Casey.", + "top1_is_meta": false + }, + { + "id": "family-003", + "gold_rank": 1, + "gold_score": 0.6085690259933472, + "top1_score": 0.6085690259933472, + "top1_text": "The user's mother lives in Vancouver.", + "top1_is_meta": false + }, + { + "id": "vehicle-001", + "gold_rank": 1, + "gold_score": 0.33847203850746155, + "top1_score": 0.33847203850746155, + "top1_text": "The user drives a blue Subaru Outback.", + "top1_is_meta": false + }, + { + "id": "vehicle-002", + "gold_rank": null, + "gold_score": null, + "top1_score": 0.410287082195282, + "top1_text": "The user does not own a car and bikes everywhere.", + "top1_is_meta": false + }, + { + "id": "edu-001", + "gold_rank": 1, + "gold_score": 0.45382001996040344, + "top1_score": 0.45382001996040344, + "top1_text": "The user studied applied mathematics in college.", + "top1_is_meta": false + }, + { + "id": "edu-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "tech-001", + "gold_rank": 1, + "gold_score": 0.30372458696365356, + "top1_score": 0.30372458696365356, + "top1_text": "The user's main laptop is a 16-inch MacBook Pro.", + "top1_is_meta": false + }, + { + "id": "tech-002", + "gold_rank": 1, + "gold_score": 0.3180965483188629, + "top1_score": 0.3180965483188629, + "top1_text": "The user prefers Neovim over VS Code.", + "top1_is_meta": false + }, + { + "id": "music-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "music-002", + "gold_rank": 1, + "gold_score": 0.5527178049087524, + "top1_score": 0.5527178049087524, + "top1_text": "The user's all-time 
favorite band is Radiohead.", + "top1_is_meta": false + } + ] + }, + "filtered": { + "recall_at_1": 0.7333333333333333, + "recall_at_5": 0.7333333333333333, + "gold_present_rate": 0.7333333333333333, + "meta_at_top1": 0, + "per_item": [ + { + "id": "name-001", + "gold_rank": 1, + "gold_score": 0.3960303068161011, + "top1_score": 0.3960303068161011, + "top1_text": "The user's name is Alex.", + "top1_is_meta": false + }, + { + "id": "name-002", + "gold_rank": 1, + "gold_score": 0.24057143926620483, + "top1_score": 0.24057143926620483, + "top1_text": "The user goes by Sam.", + "top1_is_meta": false + }, + { + "id": "name-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "name-004", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "pet-001", + "gold_rank": 1, + "gold_score": 0.46046364307403564, + "top1_score": 0.46046364307403564, + "top1_text": "The user has a golden retriever named Apollo.", + "top1_is_meta": false + }, + { + "id": "pet-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "pet-003", + "gold_rank": 1, + "gold_score": 0.4039832651615143, + "top1_score": 0.4039832651615143, + "top1_text": "The user adopted a beagle puppy named Penny.", + "top1_is_meta": false + }, + { + "id": "job-001", + "gold_rank": 1, + "gold_score": 0.18614771962165833, + "top1_score": 0.18614771962165833, + "top1_text": "The user works as a software engineer at a startup.", + "top1_is_meta": false + }, + { + "id": "job-002", + "gold_rank": 1, + "gold_score": 0.2045474350452423, + "top1_score": 0.2045474350452423, + "top1_text": "The user is a high school chemistry teacher.", + "top1_is_meta": false + }, + { + "id": "job-003", + "gold_rank": 1, + "gold_score": 0.3352947533130646, + "top1_score": 0.3352947533130646, + "top1_text": "The user freelances as a graphic designer.", + "top1_is_meta": false + }, + { + 
"id": "city-001", + "gold_rank": 1, + "gold_score": 0.2956629991531372, + "top1_score": 0.2956629991531372, + "top1_text": "The user lives in Lisbon.", + "top1_is_meta": false + }, + { + "id": "city-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "city-003", + "gold_rank": 1, + "gold_score": 0.33762556314468384, + "top1_score": 0.33762556314468384, + "top1_text": "The user is based in Toronto.", + "top1_is_meta": false + }, + { + "id": "food-001", + "gold_rank": 1, + "gold_score": 0.459760457277298, + "top1_score": 0.459760457277298, + "top1_text": "The user is vegetarian.", + "top1_is_meta": false + }, + { + "id": "food-002", + "gold_rank": 1, + "gold_score": 0.38580864667892456, + "top1_score": 0.38580864667892456, + "top1_text": "The user is severely allergic to peanuts.", + "top1_is_meta": false + }, + { + "id": "food-003", + "gold_rank": 1, + "gold_score": 0.30525362491607666, + "top1_score": 0.30525362491607666, + "top1_text": "The user does not drink coffee and only drinks tea.", + "top1_is_meta": false + }, + { + "id": "hobby-001", + "gold_rank": 1, + "gold_score": 0.38078930974006653, + "top1_score": 0.38078930974006653, + "top1_text": "The user plays classical piano.", + "top1_is_meta": false + }, + { + "id": "hobby-002", + "gold_rank": 1, + "gold_score": 0.38707590103149414, + "top1_score": 0.38707590103149414, + "top1_text": "The user's main sport is rock climbing.", + "top1_is_meta": false + }, + { + "id": "hobby-003", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "family-001", + "gold_rank": 1, + "gold_score": 0.28777992725372314, + "top1_score": 0.28777992725372314, + "top1_text": "The user has two children named Maya and Theo.", + "top1_is_meta": false + }, + { + "id": "family-002", + "gold_rank": 1, + "gold_score": 0.5129046440124512, + "top1_score": 0.5129046440124512, + "top1_text": "The user's partner's name is Casey.", + 
"top1_is_meta": false + }, + { + "id": "family-003", + "gold_rank": 1, + "gold_score": 0.6085690259933472, + "top1_score": 0.6085690259933472, + "top1_text": "The user's mother lives in Vancouver.", + "top1_is_meta": false + }, + { + "id": "vehicle-001", + "gold_rank": 1, + "gold_score": 0.33847203850746155, + "top1_score": 0.33847203850746155, + "top1_text": "The user drives a blue Subaru Outback.", + "top1_is_meta": false + }, + { + "id": "vehicle-002", + "gold_rank": null, + "gold_score": null, + "top1_score": 0.410287082195282, + "top1_text": "The user does not own a car and bikes everywhere.", + "top1_is_meta": false + }, + { + "id": "edu-001", + "gold_rank": 1, + "gold_score": 0.45382001996040344, + "top1_score": 0.45382001996040344, + "top1_text": "The user studied applied mathematics in college.", + "top1_is_meta": false + }, + { + "id": "edu-002", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "tech-001", + "gold_rank": 1, + "gold_score": 0.30372458696365356, + "top1_score": 0.30372458696365356, + "top1_text": "The user's main laptop is a 16-inch MacBook Pro.", + "top1_is_meta": false + }, + { + "id": "tech-002", + "gold_rank": 1, + "gold_score": 0.3180965483188629, + "top1_score": 0.3180965483188629, + "top1_text": "The user prefers Neovim over VS Code.", + "top1_is_meta": false + }, + { + "id": "music-001", + "gold_rank": null, + "gold_score": null, + "top1_is_meta": false, + "top1_text": null + }, + { + "id": "music-002", + "gold_rank": 1, + "gold_score": 0.5527178049087524, + "top1_score": 0.5527178049087524, + "top1_text": "The user's all-time favorite band is Radiohead.", + "top1_is_meta": false + } + ] + }, + "n": 30, + "rows": [ + { + "id": "name-001", + "turns": [ + "My name is Alex.", + "Got it." + ], + "query": "what is my name?", + "gold": "name is Alex", + "facts": [ + "The user's name is Alex." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-002", + "turns": [ + "I go by Sam.", + "OK Sam." + ], + "query": "what's my name?", + "gold": "go by Sam", + "facts": [ + "The user goes by Sam." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-003", + "turns": [ + "You can call me Riley.", + "Hi Riley." + ], + "query": "what should you call me?", + "gold": "call me Riley", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "name-004", + "turns": [ + "I'm Jordan.", + "Nice to meet you." + ], + "query": "who am I?", + "gold": "Jordan", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "pet-001", + "turns": [ + "I have a golden retriever named Apollo.", + "How sweet." + ], + "query": "what is my dog's name?", + "gold": "Apollo", + "facts": [ + "The user has a golden retriever named Apollo." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "pet-002", + "turns": [ + "My cat Luna sleeps on my keyboard.", + "Classic cat." + ], + "query": "what's my cat's name?", + "gold": "Luna", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "pet-003", + "turns": [ + "I just adopted a beagle puppy. Her name is Penny.", + "Congrats!" + ], + "query": "what kind of dog do I have?", + "gold": "beagle", + "facts": [ + "The user adopted a beagle puppy named Penny." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "job-001", + "turns": [ + "I work as a software engineer at a startup.", + "Cool field." + ], + "query": "what do I do for work?", + "gold": "software engineer", + "facts": [ + "The user works as a software engineer at a startup." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "job-002", + "turns": [ + "I'm a high school chemistry teacher.", + "That's important work." 
+ ], + "query": "what is my profession?", + "gold": "chemistry teacher", + "facts": [ + "The user is a high school chemistry teacher." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "job-003", + "turns": [ + "I freelance as a graphic designer.", + "Nice." + ], + "query": "what's my job?", + "gold": "graphic designer", + "facts": [ + "The user freelances as a graphic designer." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-001", + "turns": [ + "I live in Lisbon now.", + "Beautiful city." + ], + "query": "where do I live?", + "gold": "Lisbon", + "facts": [ + "The user lives in Lisbon." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-002", + "turns": [ + "I just moved to Berlin last month.", + "Welcome to Berlin." + ], + "query": "what city am I in?", + "gold": "Berlin", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "city-003", + "turns": [ + "I'm based in Toronto.", + "Cold this time of year." + ], + "query": "where am I located?", + "gold": "Toronto", + "facts": [ + "The user is based in Toronto." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-001", + "turns": [ + "I'm vegetarian.", + "Got it." + ], + "query": "do I eat meat?", + "gold": "vegetarian", + "facts": [ + "The user is vegetarian." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-002", + "turns": [ + "I'm severely allergic to peanuts.", + "Noted, will avoid." + ], + "query": "do I have any allergies?", + "gold": "peanut", + "facts": [ + "The user is severely allergic to peanuts." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-003", + "turns": [ + "I don't drink coffee \u2014 only tea.", + "Tea is great too." 
+ ], + "query": "what do I drink in the morning?", + "gold": "tea", + "facts": [ + "The user does not drink coffee and only drinks tea." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-001", + "turns": [ + "I play classical piano.", + "Lovely hobby." + ], + "query": "what instrument do I play?", + "gold": "piano", + "facts": [ + "The user plays classical piano." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-002", + "turns": [ + "My main sport is rock climbing.", + "Cool." + ], + "query": "what sport do I do?", + "gold": "rock climbing", + "facts": [ + "The user's main sport is rock climbing." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-003", + "turns": [ + "I've been knitting for about ten years.", + "Impressive." + ], + "query": "what's a hobby I have?", + "gold": "knitting", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "family-001", + "turns": [ + "I have two kids, Maya and Theo.", + "What ages?" + ], + "query": "how many children do I have?", + "gold": "two", + "facts": [ + "The user has two children named Maya and Theo." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-002", + "turns": [ + "My partner's name is Casey.", + "Nice." + ], + "query": "who is my partner?", + "gold": "Casey", + "facts": [ + "The user's partner's name is Casey." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-003", + "turns": [ + "My mom lives in Vancouver.", + "Far from you?" + ], + "query": "where does my mom live?", + "gold": "Vancouver", + "facts": [ + "The user's mother lives in Vancouver." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "vehicle-001", + "turns": [ + "I drive a blue Subaru Outback.", + "Reliable car." 
+ ], + "query": "what kind of car do I have?", + "gold": "Subaru", + "facts": [ + "The user drives a blue Subaru Outback." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "vehicle-002", + "turns": [ + "I don't own a car. I bike everywhere.", + "Healthy lifestyle." + ], + "query": "do I have a car?", + "gold": "don't own", + "facts": [ + "The user does not own a car and bikes everywhere." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "edu-001", + "turns": [ + "I studied applied mathematics in college.", + "Tough major." + ], + "query": "what was my major?", + "gold": "applied mathematics", + "facts": [ + "The user studied applied mathematics in college." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "edu-002", + "turns": [ + "I got my MBA from UCLA two years ago.", + "Congrats." + ], + "query": "where did I get my MBA?", + "gold": "UCLA", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "tech-001", + "turns": [ + "My main laptop is a 16-inch MacBook Pro.", + "Solid machine." + ], + "query": "what computer do I use?", + "gold": "MacBook", + "facts": [ + "The user's main laptop is a 16-inch MacBook Pro." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "tech-002", + "turns": [ + "I prefer Neovim over VS Code.", + "Editor preferences are personal." + ], + "query": "what editor do I use?", + "gold": "Neovim", + "facts": [ + "The user prefers Neovim over VS Code." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "music-001", + "turns": [ + "I've been getting into bluegrass lately.", + "Fun genre." + ], + "query": "what music am I into these days?", + "gold": "bluegrass", + "facts": [], + "meta_mask": [], + "n_facts": 0, + "n_meta": 0 + }, + { + "id": "music-002", + "turns": [ + "My all-time favorite band is Radiohead.", + "Great band." 
+ ], + "query": "what's my favorite band?", + "gold": "Radiohead", + "facts": [ + "The user's all-time favorite band is Radiohead." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/demo-stress-v3-prompt+prefill-2026-05-15.json b/benchmarks/alignbench/runs/demo-stress-v3-prompt+prefill-2026-05-15.json new file mode 100644 index 0000000..7b41c9e --- /dev/null +++ b/benchmarks/alignbench/runs/demo-stress-v3-prompt+prefill-2026-05-15.json @@ -0,0 +1,1017 @@ +{ + "baseline": { + "recall_at_1": 0.9666666666666667, + "recall_at_5": 0.9666666666666667, + "gold_present_rate": 0.9666666666666667, + "meta_at_top1": 0, + "per_item": [ + { + "id": "name-001", + "gold_rank": 1, + "gold_score": 0.3960303068161011, + "top1_score": 0.3960303068161011, + "top1_text": "The user's name is Alex.", + "top1_is_meta": false + }, + { + "id": "name-002", + "gold_rank": 1, + "gold_score": 0.33899739384651184, + "top1_score": 0.33899739384651184, + "top1_text": "Sam goes by the name Sam.", + "top1_is_meta": false + }, + { + "id": "name-003", + "gold_rank": 1, + "gold_score": 0.17434966564178467, + "top1_score": 0.17434966564178467, + "top1_text": "The user's name is Riley.", + "top1_is_meta": false + }, + { + "id": "name-004", + "gold_rank": 1, + "gold_score": 0.2914789915084839, + "top1_score": 0.2914789915084839, + "top1_text": "The user's name is Jordan.", + "top1_is_meta": false + }, + { + "id": "pet-001", + "gold_rank": 1, + "gold_score": 0.46046364307403564, + "top1_score": 0.46046364307403564, + "top1_text": "The user has a golden retriever named Apollo.", + "top1_is_meta": false + }, + { + "id": "pet-002", + "gold_rank": 1, + "gold_score": 0.5874672532081604, + "top1_score": 0.5874672532081604, + "top1_text": "The user has a cat named Luna.", + "top1_is_meta": false + }, + { + "id": "pet-003", + "gold_rank": 1, + "gold_score": 0.42153388261795044, + "top1_score": 0.42153388261795044, + 
"top1_text": "The user recently adopted a beagle puppy named Penny.", + "top1_is_meta": false + }, + { + "id": "job-001", + "gold_rank": 1, + "gold_score": 0.18614771962165833, + "top1_score": 0.18614771962165833, + "top1_text": "The user works as a software engineer at a startup.", + "top1_is_meta": false + }, + { + "id": "job-002", + "gold_rank": 1, + "gold_score": 0.2045474350452423, + "top1_score": 0.2045474350452423, + "top1_text": "The user is a high school chemistry teacher.", + "top1_is_meta": false + }, + { + "id": "job-003", + "gold_rank": 1, + "gold_score": 0.3352947533130646, + "top1_score": 0.3352947533130646, + "top1_text": "The user freelances as a graphic designer.", + "top1_is_meta": false + }, + { + "id": "city-001", + "gold_rank": 1, + "gold_score": 0.2956629991531372, + "top1_score": 0.2956629991531372, + "top1_text": "The user lives in Lisbon.", + "top1_is_meta": false + }, + { + "id": "city-002", + "gold_rank": 1, + "gold_score": 0.22115547955036163, + "top1_score": 0.22115547955036163, + "top1_text": "The user moved to Berlin last month.", + "top1_is_meta": false + }, + { + "id": "city-003", + "gold_rank": 1, + "gold_score": 0.33762556314468384, + "top1_score": 0.33762556314468384, + "top1_text": "The user is based in Toronto.", + "top1_is_meta": false + }, + { + "id": "food-001", + "gold_rank": 1, + "gold_score": 0.459760457277298, + "top1_score": 0.459760457277298, + "top1_text": "The user is vegetarian.", + "top1_is_meta": false + }, + { + "id": "food-002", + "gold_rank": 1, + "gold_score": 0.38580864667892456, + "top1_score": 0.38580864667892456, + "top1_text": "The user is severely allergic to peanuts.", + "top1_is_meta": false + }, + { + "id": "food-003", + "gold_rank": 1, + "gold_score": 0.30525362491607666, + "top1_score": 0.30525362491607666, + "top1_text": "The user does not drink coffee and only drinks tea.", + "top1_is_meta": false + }, + { + "id": "hobby-001", + "gold_rank": 1, + "gold_score": 0.38078930974006653, + "top1_score": 
0.38078930974006653, + "top1_text": "The user plays classical piano.", + "top1_is_meta": false + }, + { + "id": "hobby-002", + "gold_rank": 1, + "gold_score": 0.38707590103149414, + "top1_score": 0.38707590103149414, + "top1_text": "The user's main sport is rock climbing.", + "top1_is_meta": false + }, + { + "id": "hobby-003", + "gold_rank": 1, + "gold_score": 0.264183908700943, + "top1_score": 0.264183908700943, + "top1_text": "The user has been knitting for about ten years.", + "top1_is_meta": false + }, + { + "id": "family-001", + "gold_rank": 1, + "gold_score": 0.28777992725372314, + "top1_score": 0.28777992725372314, + "top1_text": "The user has two children named Maya and Theo.", + "top1_is_meta": false + }, + { + "id": "family-002", + "gold_rank": 1, + "gold_score": 0.5129046440124512, + "top1_score": 0.5129046440124512, + "top1_text": "The user's partner's name is Casey.", + "top1_is_meta": false + }, + { + "id": "family-003", + "gold_rank": 1, + "gold_score": 0.6258491277694702, + "top1_score": 0.6258491277694702, + "top1_text": "The user's mom lives in Vancouver.", + "top1_is_meta": false + }, + { + "id": "vehicle-001", + "gold_rank": 1, + "gold_score": 0.33847203850746155, + "top1_score": 0.33847203850746155, + "top1_text": "The user drives a blue Subaru Outback.", + "top1_is_meta": false + }, + { + "id": "vehicle-002", + "gold_rank": null, + "gold_score": null, + "top1_score": 0.4807737469673157, + "top1_text": "The user does not own a car.", + "top1_is_meta": false + }, + { + "id": "edu-001", + "gold_rank": 1, + "gold_score": 0.45382001996040344, + "top1_score": 0.45382001996040344, + "top1_text": "The user studied applied mathematics in college.", + "top1_is_meta": false + }, + { + "id": "edu-002", + "gold_rank": 1, + "gold_score": 0.6818706393241882, + "top1_score": 0.6818706393241882, + "top1_text": "The user obtained an MBA from UCLA two years ago.", + "top1_is_meta": false + }, + { + "id": "tech-001", + "gold_rank": 1, + "gold_score": 
0.30372458696365356, + "top1_score": 0.30372458696365356, + "top1_text": "The user's main laptop is a 16-inch MacBook Pro.", + "top1_is_meta": false + }, + { + "id": "tech-002", + "gold_rank": 1, + "gold_score": 0.3180965483188629, + "top1_score": 0.3180965483188629, + "top1_text": "The user prefers Neovim over VS Code.", + "top1_is_meta": false + }, + { + "id": "music-001", + "gold_rank": 1, + "gold_score": 0.33930984139442444, + "top1_score": 0.33930984139442444, + "top1_text": "The user has been getting into bluegrass music recently.", + "top1_is_meta": false + }, + { + "id": "music-002", + "gold_rank": 1, + "gold_score": 0.5527178049087524, + "top1_score": 0.5527178049087524, + "top1_text": "The user's all-time favorite band is Radiohead.", + "top1_is_meta": false + } + ] + }, + "filtered": { + "recall_at_1": 0.9666666666666667, + "recall_at_5": 0.9666666666666667, + "gold_present_rate": 0.9666666666666667, + "meta_at_top1": 0, + "per_item": [ + { + "id": "name-001", + "gold_rank": 1, + "gold_score": 0.3960303068161011, + "top1_score": 0.3960303068161011, + "top1_text": "The user's name is Alex.", + "top1_is_meta": false + }, + { + "id": "name-002", + "gold_rank": 1, + "gold_score": 0.33899739384651184, + "top1_score": 0.33899739384651184, + "top1_text": "Sam goes by the name Sam.", + "top1_is_meta": false + }, + { + "id": "name-003", + "gold_rank": 1, + "gold_score": 0.17434966564178467, + "top1_score": 0.17434966564178467, + "top1_text": "The user's name is Riley.", + "top1_is_meta": false + }, + { + "id": "name-004", + "gold_rank": 1, + "gold_score": 0.2914789915084839, + "top1_score": 0.2914789915084839, + "top1_text": "The user's name is Jordan.", + "top1_is_meta": false + }, + { + "id": "pet-001", + "gold_rank": 1, + "gold_score": 0.46046364307403564, + "top1_score": 0.46046364307403564, + "top1_text": "The user has a golden retriever named Apollo.", + "top1_is_meta": false + }, + { + "id": "pet-002", + "gold_rank": 1, + "gold_score": 0.5874672532081604, 
+ "top1_score": 0.5874672532081604, + "top1_text": "The user has a cat named Luna.", + "top1_is_meta": false + }, + { + "id": "pet-003", + "gold_rank": 1, + "gold_score": 0.42153388261795044, + "top1_score": 0.42153388261795044, + "top1_text": "The user recently adopted a beagle puppy named Penny.", + "top1_is_meta": false + }, + { + "id": "job-001", + "gold_rank": 1, + "gold_score": 0.18614771962165833, + "top1_score": 0.18614771962165833, + "top1_text": "The user works as a software engineer at a startup.", + "top1_is_meta": false + }, + { + "id": "job-002", + "gold_rank": 1, + "gold_score": 0.2045474350452423, + "top1_score": 0.2045474350452423, + "top1_text": "The user is a high school chemistry teacher.", + "top1_is_meta": false + }, + { + "id": "job-003", + "gold_rank": 1, + "gold_score": 0.3352947533130646, + "top1_score": 0.3352947533130646, + "top1_text": "The user freelances as a graphic designer.", + "top1_is_meta": false + }, + { + "id": "city-001", + "gold_rank": 1, + "gold_score": 0.2956629991531372, + "top1_score": 0.2956629991531372, + "top1_text": "The user lives in Lisbon.", + "top1_is_meta": false + }, + { + "id": "city-002", + "gold_rank": 1, + "gold_score": 0.22115547955036163, + "top1_score": 0.22115547955036163, + "top1_text": "The user moved to Berlin last month.", + "top1_is_meta": false + }, + { + "id": "city-003", + "gold_rank": 1, + "gold_score": 0.33762556314468384, + "top1_score": 0.33762556314468384, + "top1_text": "The user is based in Toronto.", + "top1_is_meta": false + }, + { + "id": "food-001", + "gold_rank": 1, + "gold_score": 0.459760457277298, + "top1_score": 0.459760457277298, + "top1_text": "The user is vegetarian.", + "top1_is_meta": false + }, + { + "id": "food-002", + "gold_rank": 1, + "gold_score": 0.38580864667892456, + "top1_score": 0.38580864667892456, + "top1_text": "The user is severely allergic to peanuts.", + "top1_is_meta": false + }, + { + "id": "food-003", + "gold_rank": 1, + "gold_score": 0.30525362491607666, 
+ "top1_score": 0.30525362491607666, + "top1_text": "The user does not drink coffee and only drinks tea.", + "top1_is_meta": false + }, + { + "id": "hobby-001", + "gold_rank": 1, + "gold_score": 0.38078930974006653, + "top1_score": 0.38078930974006653, + "top1_text": "The user plays classical piano.", + "top1_is_meta": false + }, + { + "id": "hobby-002", + "gold_rank": 1, + "gold_score": 0.38707590103149414, + "top1_score": 0.38707590103149414, + "top1_text": "The user's main sport is rock climbing.", + "top1_is_meta": false + }, + { + "id": "hobby-003", + "gold_rank": 1, + "gold_score": 0.264183908700943, + "top1_score": 0.264183908700943, + "top1_text": "The user has been knitting for about ten years.", + "top1_is_meta": false + }, + { + "id": "family-001", + "gold_rank": 1, + "gold_score": 0.28777992725372314, + "top1_score": 0.28777992725372314, + "top1_text": "The user has two children named Maya and Theo.", + "top1_is_meta": false + }, + { + "id": "family-002", + "gold_rank": 1, + "gold_score": 0.5129046440124512, + "top1_score": 0.5129046440124512, + "top1_text": "The user's partner's name is Casey.", + "top1_is_meta": false + }, + { + "id": "family-003", + "gold_rank": 1, + "gold_score": 0.6258491277694702, + "top1_score": 0.6258491277694702, + "top1_text": "The user's mom lives in Vancouver.", + "top1_is_meta": false + }, + { + "id": "vehicle-001", + "gold_rank": 1, + "gold_score": 0.33847203850746155, + "top1_score": 0.33847203850746155, + "top1_text": "The user drives a blue Subaru Outback.", + "top1_is_meta": false + }, + { + "id": "vehicle-002", + "gold_rank": null, + "gold_score": null, + "top1_score": 0.4807737469673157, + "top1_text": "The user does not own a car.", + "top1_is_meta": false + }, + { + "id": "edu-001", + "gold_rank": 1, + "gold_score": 0.45382001996040344, + "top1_score": 0.45382001996040344, + "top1_text": "The user studied applied mathematics in college.", + "top1_is_meta": false + }, + { + "id": "edu-002", + "gold_rank": 1, + 
"gold_score": 0.6818706393241882, + "top1_score": 0.6818706393241882, + "top1_text": "The user obtained an MBA from UCLA two years ago.", + "top1_is_meta": false + }, + { + "id": "tech-001", + "gold_rank": 1, + "gold_score": 0.30372458696365356, + "top1_score": 0.30372458696365356, + "top1_text": "The user's main laptop is a 16-inch MacBook Pro.", + "top1_is_meta": false + }, + { + "id": "tech-002", + "gold_rank": 1, + "gold_score": 0.3180965483188629, + "top1_score": 0.3180965483188629, + "top1_text": "The user prefers Neovim over VS Code.", + "top1_is_meta": false + }, + { + "id": "music-001", + "gold_rank": 1, + "gold_score": 0.33930984139442444, + "top1_score": 0.33930984139442444, + "top1_text": "The user has been getting into bluegrass music recently.", + "top1_is_meta": false + }, + { + "id": "music-002", + "gold_rank": 1, + "gold_score": 0.5527178049087524, + "top1_score": 0.5527178049087524, + "top1_text": "The user's all-time favorite band is Radiohead.", + "top1_is_meta": false + } + ] + }, + "n": 30, + "rows": [ + { + "id": "name-001", + "turns": [ + "My name is Alex.", + "Got it." + ], + "query": "what is my name?", + "gold": "name is Alex", + "facts": [ + "The user's name is Alex." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-002", + "turns": [ + "I go by Sam.", + "OK Sam." + ], + "query": "what's my name?", + "gold": "go by Sam", + "facts": [ + "Sam goes by the name Sam." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-003", + "turns": [ + "You can call me Riley.", + "Hi Riley." + ], + "query": "what should you call me?", + "gold": "call me Riley", + "facts": [ + "The user's name is Riley." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-004", + "turns": [ + "I'm Jordan.", + "Nice to meet you." + ], + "query": "who am I?", + "gold": "Jordan", + "facts": [ + "The user's name is Jordan." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "pet-001", + "turns": [ + "I have a golden retriever named Apollo.", + "How sweet." + ], + "query": "what is my dog's name?", + "gold": "Apollo", + "facts": [ + "The user has a golden retriever named Apollo." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "pet-002", + "turns": [ + "My cat Luna sleeps on my keyboard.", + "Classic cat." + ], + "query": "what's my cat's name?", + "gold": "Luna", + "facts": [ + "The user has a cat named Luna.", + "Luna sleeps on the user's keyboard." + ], + "meta_mask": [ + false, + false + ], + "n_facts": 2, + "n_meta": 0 + }, + { + "id": "pet-003", + "turns": [ + "I just adopted a beagle puppy. Her name is Penny.", + "Congrats!" + ], + "query": "what kind of dog do I have?", + "gold": "beagle", + "facts": [ + "The user recently adopted a beagle puppy named Penny.", + "The user's beagle puppy Penny is female." + ], + "meta_mask": [ + false, + false + ], + "n_facts": 2, + "n_meta": 0 + }, + { + "id": "job-001", + "turns": [ + "I work as a software engineer at a startup.", + "Cool field." + ], + "query": "what do I do for work?", + "gold": "software engineer", + "facts": [ + "The user works as a software engineer at a startup." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "job-002", + "turns": [ + "I'm a high school chemistry teacher.", + "That's important work." + ], + "query": "what is my profession?", + "gold": "chemistry teacher", + "facts": [ + "The user is a high school chemistry teacher." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "job-003", + "turns": [ + "I freelance as a graphic designer.", + "Nice." + ], + "query": "what's my job?", + "gold": "graphic designer", + "facts": [ + "The user freelances as a graphic designer." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-001", + "turns": [ + "I live in Lisbon now.", + "Beautiful city." + ], + "query": "where do I live?", + "gold": "Lisbon", + "facts": [ + "The user lives in Lisbon." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-002", + "turns": [ + "I just moved to Berlin last month.", + "Welcome to Berlin." + ], + "query": "what city am I in?", + "gold": "Berlin", + "facts": [ + "The user moved to Berlin last month." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-003", + "turns": [ + "I'm based in Toronto.", + "Cold this time of year." + ], + "query": "where am I located?", + "gold": "Toronto", + "facts": [ + "The user is based in Toronto." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-001", + "turns": [ + "I'm vegetarian.", + "Got it." + ], + "query": "do I eat meat?", + "gold": "vegetarian", + "facts": [ + "The user is vegetarian." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-002", + "turns": [ + "I'm severely allergic to peanuts.", + "Noted, will avoid." + ], + "query": "do I have any allergies?", + "gold": "peanut", + "facts": [ + "The user is severely allergic to peanuts." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-003", + "turns": [ + "I don't drink coffee \u2014 only tea.", + "Tea is great too." + ], + "query": "what do I drink in the morning?", + "gold": "tea", + "facts": [ + "The user does not drink coffee and only drinks tea." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-001", + "turns": [ + "I play classical piano.", + "Lovely hobby." + ], + "query": "what instrument do I play?", + "gold": "piano", + "facts": [ + "The user plays classical piano." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-002", + "turns": [ + "My main sport is rock climbing.", + "Cool." + ], + "query": "what sport do I do?", + "gold": "rock climbing", + "facts": [ + "The user's main sport is rock climbing." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-003", + "turns": [ + "I've been knitting for about ten years.", + "Impressive." + ], + "query": "what's a hobby I have?", + "gold": "knitting", + "facts": [ + "The user has been knitting for about ten years." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-001", + "turns": [ + "I have two kids, Maya and Theo.", + "What ages?" + ], + "query": "how many children do I have?", + "gold": "two", + "facts": [ + "The user has two children named Maya and Theo." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-002", + "turns": [ + "My partner's name is Casey.", + "Nice." + ], + "query": "who is my partner?", + "gold": "Casey", + "facts": [ + "The user's partner's name is Casey." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-003", + "turns": [ + "My mom lives in Vancouver.", + "Far from you?" + ], + "query": "where does my mom live?", + "gold": "Vancouver", + "facts": [ + "The user's mom lives in Vancouver." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "vehicle-001", + "turns": [ + "I drive a blue Subaru Outback.", + "Reliable car." + ], + "query": "what kind of car do I have?", + "gold": "Subaru", + "facts": [ + "The user drives a blue Subaru Outback." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "vehicle-002", + "turns": [ + "I don't own a car. I bike everywhere.", + "Healthy lifestyle." 
+ ], + "query": "do I have a car?", + "gold": "don't own", + "facts": [ + "The user does not own a car.", + "The user bikes everywhere for transportation." + ], + "meta_mask": [ + false, + false + ], + "n_facts": 2, + "n_meta": 0 + }, + { + "id": "edu-001", + "turns": [ + "I studied applied mathematics in college.", + "Tough major." + ], + "query": "what was my major?", + "gold": "applied mathematics", + "facts": [ + "The user studied applied mathematics in college." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "edu-002", + "turns": [ + "I got my MBA from UCLA two years ago.", + "Congrats." + ], + "query": "where did I get my MBA?", + "gold": "UCLA", + "facts": [ + "The user obtained an MBA from UCLA two years ago." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "tech-001", + "turns": [ + "My main laptop is a 16-inch MacBook Pro.", + "Solid machine." + ], + "query": "what computer do I use?", + "gold": "MacBook", + "facts": [ + "The user's main laptop is a 16-inch MacBook Pro." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "tech-002", + "turns": [ + "I prefer Neovim over VS Code.", + "Editor preferences are personal." + ], + "query": "what editor do I use?", + "gold": "Neovim", + "facts": [ + "The user prefers Neovim over VS Code." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "music-001", + "turns": [ + "I've been getting into bluegrass lately.", + "Fun genre." + ], + "query": "what music am I into these days?", + "gold": "bluegrass", + "facts": [ + "The user has been getting into bluegrass music recently." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "music-002", + "turns": [ + "My all-time favorite band is Radiohead.", + "Great band." + ], + "query": "what's my favorite band?", + "gold": "Radiohead", + "facts": [ + "The user's all-time favorite band is Radiohead." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/demo-stress-v4-final-2026-05-15.json b/benchmarks/alignbench/runs/demo-stress-v4-final-2026-05-15.json new file mode 100644 index 0000000..a9fc648 --- /dev/null +++ b/benchmarks/alignbench/runs/demo-stress-v4-final-2026-05-15.json @@ -0,0 +1,1017 @@ +{ + "baseline": { + "recall_at_1": 1.0, + "recall_at_5": 1.0, + "gold_present_rate": 1.0, + "meta_at_top1": 0, + "per_item": [ + { + "id": "name-001", + "gold_rank": 1, + "gold_score": 0.3960301876068115, + "top1_score": 0.3960301876068115, + "top1_text": "The user's name is Alex.", + "top1_is_meta": false + }, + { + "id": "name-002", + "gold_rank": 1, + "gold_score": 0.33899739384651184, + "top1_score": 0.33899739384651184, + "top1_text": "Sam goes by the name Sam.", + "top1_is_meta": false + }, + { + "id": "name-003", + "gold_rank": 1, + "gold_score": 0.1743495762348175, + "top1_score": 0.1743495762348175, + "top1_text": "The user's name is Riley.", + "top1_is_meta": false + }, + { + "id": "name-004", + "gold_rank": 1, + "gold_score": 0.29147881269454956, + "top1_score": 0.29147881269454956, + "top1_text": "The user's name is Jordan.", + "top1_is_meta": false + }, + { + "id": "pet-001", + "gold_rank": 1, + "gold_score": 0.46046364307403564, + "top1_score": 0.46046364307403564, + "top1_text": "The user has a golden retriever named Apollo.", + "top1_is_meta": false + }, + { + "id": "pet-002", + "gold_rank": 1, + "gold_score": 0.5874672532081604, + "top1_score": 0.5874672532081604, + "top1_text": "The user has a cat named Luna.", + "top1_is_meta": false + }, + { + "id": "pet-003", + "gold_rank": 1, + "gold_score": 0.43342381715774536, + "top1_score": 0.43342381715774536, + "top1_text": "The user's beagle puppy is female.", + "top1_is_meta": false + }, + { + "id": "job-001", + "gold_rank": 1, + "gold_score": 0.18614771962165833, + "top1_score": 0.18614771962165833, + "top1_text": 
"The user works as a software engineer at a startup.", + "top1_is_meta": false + }, + { + "id": "job-002", + "gold_rank": 1, + "gold_score": 0.2045476883649826, + "top1_score": 0.2045476883649826, + "top1_text": "The user is a high school chemistry teacher.", + "top1_is_meta": false + }, + { + "id": "job-003", + "gold_rank": 1, + "gold_score": 0.33529481291770935, + "top1_score": 0.33529481291770935, + "top1_text": "The user freelances as a graphic designer.", + "top1_is_meta": false + }, + { + "id": "city-001", + "gold_rank": 1, + "gold_score": 0.29566308856010437, + "top1_score": 0.29566308856010437, + "top1_text": "The user lives in Lisbon.", + "top1_is_meta": false + }, + { + "id": "city-002", + "gold_rank": 1, + "gold_score": 0.22115540504455566, + "top1_score": 0.22115540504455566, + "top1_text": "The user moved to Berlin last month.", + "top1_is_meta": false + }, + { + "id": "city-003", + "gold_rank": 1, + "gold_score": 0.3376256227493286, + "top1_score": 0.3376256227493286, + "top1_text": "The user is based in Toronto.", + "top1_is_meta": false + }, + { + "id": "food-001", + "gold_rank": 1, + "gold_score": 0.45976030826568604, + "top1_score": 0.45976030826568604, + "top1_text": "The user is vegetarian.", + "top1_is_meta": false + }, + { + "id": "food-002", + "gold_rank": 1, + "gold_score": 0.3858085870742798, + "top1_score": 0.3858085870742798, + "top1_text": "The user is severely allergic to peanuts.", + "top1_is_meta": false + }, + { + "id": "food-003", + "gold_rank": 1, + "gold_score": 0.30525368452072144, + "top1_score": 0.30525368452072144, + "top1_text": "The user does not drink coffee and only drinks tea.", + "top1_is_meta": false + }, + { + "id": "hobby-001", + "gold_rank": 1, + "gold_score": 0.3807893395423889, + "top1_score": 0.3807893395423889, + "top1_text": "The user plays classical piano.", + "top1_is_meta": false + }, + { + "id": "hobby-002", + "gold_rank": 1, + "gold_score": 0.3870759606361389, + "top1_score": 0.3870759606361389, + 
"top1_text": "The user's main sport is rock climbing.", + "top1_is_meta": false + }, + { + "id": "hobby-003", + "gold_rank": 1, + "gold_score": 0.26418396830558777, + "top1_score": 0.26418396830558777, + "top1_text": "The user has been knitting for about ten years.", + "top1_is_meta": false + }, + { + "id": "family-001", + "gold_rank": 1, + "gold_score": 0.28778010606765747, + "top1_score": 0.28778010606765747, + "top1_text": "The user has two children named Maya and Theo.", + "top1_is_meta": false + }, + { + "id": "family-002", + "gold_rank": 1, + "gold_score": 0.5129045248031616, + "top1_score": 0.5129045248031616, + "top1_text": "The user's partner's name is Casey.", + "top1_is_meta": false + }, + { + "id": "family-003", + "gold_rank": 1, + "gold_score": 0.6258488893508911, + "top1_score": 0.6258488893508911, + "top1_text": "The user's mom lives in Vancouver.", + "top1_is_meta": false + }, + { + "id": "vehicle-001", + "gold_rank": 1, + "gold_score": 0.33847203850746155, + "top1_score": 0.33847203850746155, + "top1_text": "The user drives a blue Subaru Outback.", + "top1_is_meta": false + }, + { + "id": "vehicle-002", + "gold_rank": 1, + "gold_score": 0.48077383637428284, + "top1_score": 0.48077383637428284, + "top1_text": "The user does not own a car.", + "top1_is_meta": false + }, + { + "id": "edu-001", + "gold_rank": 1, + "gold_score": 0.4538198709487915, + "top1_score": 0.4538198709487915, + "top1_text": "The user studied applied mathematics in college.", + "top1_is_meta": false + }, + { + "id": "edu-002", + "gold_rank": 1, + "gold_score": 0.6818708181381226, + "top1_score": 0.6818708181381226, + "top1_text": "The user obtained an MBA from UCLA two years ago.", + "top1_is_meta": false + }, + { + "id": "tech-001", + "gold_rank": 1, + "gold_score": 0.3037244975566864, + "top1_score": 0.3037244975566864, + "top1_text": "The user's main laptop is a 16-inch MacBook Pro.", + "top1_is_meta": false + }, + { + "id": "tech-002", + "gold_rank": 1, + "gold_score": 
0.3180965185165405, + "top1_score": 0.3180965185165405, + "top1_text": "The user prefers Neovim over VS Code.", + "top1_is_meta": false + }, + { + "id": "music-001", + "gold_rank": 1, + "gold_score": 0.33930978178977966, + "top1_score": 0.33930978178977966, + "top1_text": "The user has been getting into bluegrass music recently.", + "top1_is_meta": false + }, + { + "id": "music-002", + "gold_rank": 1, + "gold_score": 0.5527176260948181, + "top1_score": 0.5527176260948181, + "top1_text": "The user's all-time favorite band is Radiohead.", + "top1_is_meta": false + } + ] + }, + "filtered": { + "recall_at_1": 1.0, + "recall_at_5": 1.0, + "gold_present_rate": 1.0, + "meta_at_top1": 0, + "per_item": [ + { + "id": "name-001", + "gold_rank": 1, + "gold_score": 0.3960301876068115, + "top1_score": 0.3960301876068115, + "top1_text": "The user's name is Alex.", + "top1_is_meta": false + }, + { + "id": "name-002", + "gold_rank": 1, + "gold_score": 0.33899739384651184, + "top1_score": 0.33899739384651184, + "top1_text": "Sam goes by the name Sam.", + "top1_is_meta": false + }, + { + "id": "name-003", + "gold_rank": 1, + "gold_score": 0.1743495762348175, + "top1_score": 0.1743495762348175, + "top1_text": "The user's name is Riley.", + "top1_is_meta": false + }, + { + "id": "name-004", + "gold_rank": 1, + "gold_score": 0.29147881269454956, + "top1_score": 0.29147881269454956, + "top1_text": "The user's name is Jordan.", + "top1_is_meta": false + }, + { + "id": "pet-001", + "gold_rank": 1, + "gold_score": 0.46046364307403564, + "top1_score": 0.46046364307403564, + "top1_text": "The user has a golden retriever named Apollo.", + "top1_is_meta": false + }, + { + "id": "pet-002", + "gold_rank": 1, + "gold_score": 0.5874672532081604, + "top1_score": 0.5874672532081604, + "top1_text": "The user has a cat named Luna.", + "top1_is_meta": false + }, + { + "id": "pet-003", + "gold_rank": 1, + "gold_score": 0.43342381715774536, + "top1_score": 0.43342381715774536, + "top1_text": "The user's 
beagle puppy is female.", + "top1_is_meta": false + }, + { + "id": "job-001", + "gold_rank": 1, + "gold_score": 0.18614771962165833, + "top1_score": 0.18614771962165833, + "top1_text": "The user works as a software engineer at a startup.", + "top1_is_meta": false + }, + { + "id": "job-002", + "gold_rank": 1, + "gold_score": 0.2045476883649826, + "top1_score": 0.2045476883649826, + "top1_text": "The user is a high school chemistry teacher.", + "top1_is_meta": false + }, + { + "id": "job-003", + "gold_rank": 1, + "gold_score": 0.33529481291770935, + "top1_score": 0.33529481291770935, + "top1_text": "The user freelances as a graphic designer.", + "top1_is_meta": false + }, + { + "id": "city-001", + "gold_rank": 1, + "gold_score": 0.29566308856010437, + "top1_score": 0.29566308856010437, + "top1_text": "The user lives in Lisbon.", + "top1_is_meta": false + }, + { + "id": "city-002", + "gold_rank": 1, + "gold_score": 0.22115540504455566, + "top1_score": 0.22115540504455566, + "top1_text": "The user moved to Berlin last month.", + "top1_is_meta": false + }, + { + "id": "city-003", + "gold_rank": 1, + "gold_score": 0.3376256227493286, + "top1_score": 0.3376256227493286, + "top1_text": "The user is based in Toronto.", + "top1_is_meta": false + }, + { + "id": "food-001", + "gold_rank": 1, + "gold_score": 0.45976030826568604, + "top1_score": 0.45976030826568604, + "top1_text": "The user is vegetarian.", + "top1_is_meta": false + }, + { + "id": "food-002", + "gold_rank": 1, + "gold_score": 0.3858085870742798, + "top1_score": 0.3858085870742798, + "top1_text": "The user is severely allergic to peanuts.", + "top1_is_meta": false + }, + { + "id": "food-003", + "gold_rank": 1, + "gold_score": 0.30525368452072144, + "top1_score": 0.30525368452072144, + "top1_text": "The user does not drink coffee and only drinks tea.", + "top1_is_meta": false + }, + { + "id": "hobby-001", + "gold_rank": 1, + "gold_score": 0.3807893395423889, + "top1_score": 0.3807893395423889, + "top1_text": "The 
user plays classical piano.", + "top1_is_meta": false + }, + { + "id": "hobby-002", + "gold_rank": 1, + "gold_score": 0.3870759606361389, + "top1_score": 0.3870759606361389, + "top1_text": "The user's main sport is rock climbing.", + "top1_is_meta": false + }, + { + "id": "hobby-003", + "gold_rank": 1, + "gold_score": 0.26418396830558777, + "top1_score": 0.26418396830558777, + "top1_text": "The user has been knitting for about ten years.", + "top1_is_meta": false + }, + { + "id": "family-001", + "gold_rank": 1, + "gold_score": 0.28778010606765747, + "top1_score": 0.28778010606765747, + "top1_text": "The user has two children named Maya and Theo.", + "top1_is_meta": false + }, + { + "id": "family-002", + "gold_rank": 1, + "gold_score": 0.5129045248031616, + "top1_score": 0.5129045248031616, + "top1_text": "The user's partner's name is Casey.", + "top1_is_meta": false + }, + { + "id": "family-003", + "gold_rank": 1, + "gold_score": 0.6258488893508911, + "top1_score": 0.6258488893508911, + "top1_text": "The user's mom lives in Vancouver.", + "top1_is_meta": false + }, + { + "id": "vehicle-001", + "gold_rank": 1, + "gold_score": 0.33847203850746155, + "top1_score": 0.33847203850746155, + "top1_text": "The user drives a blue Subaru Outback.", + "top1_is_meta": false + }, + { + "id": "vehicle-002", + "gold_rank": 1, + "gold_score": 0.48077383637428284, + "top1_score": 0.48077383637428284, + "top1_text": "The user does not own a car.", + "top1_is_meta": false + }, + { + "id": "edu-001", + "gold_rank": 1, + "gold_score": 0.4538198709487915, + "top1_score": 0.4538198709487915, + "top1_text": "The user studied applied mathematics in college.", + "top1_is_meta": false + }, + { + "id": "edu-002", + "gold_rank": 1, + "gold_score": 0.6818708181381226, + "top1_score": 0.6818708181381226, + "top1_text": "The user obtained an MBA from UCLA two years ago.", + "top1_is_meta": false + }, + { + "id": "tech-001", + "gold_rank": 1, + "gold_score": 0.3037244975566864, + "top1_score": 
0.3037244975566864, + "top1_text": "The user's main laptop is a 16-inch MacBook Pro.", + "top1_is_meta": false + }, + { + "id": "tech-002", + "gold_rank": 1, + "gold_score": 0.3180965185165405, + "top1_score": 0.3180965185165405, + "top1_text": "The user prefers Neovim over VS Code.", + "top1_is_meta": false + }, + { + "id": "music-001", + "gold_rank": 1, + "gold_score": 0.33930978178977966, + "top1_score": 0.33930978178977966, + "top1_text": "The user has been getting into bluegrass music recently.", + "top1_is_meta": false + }, + { + "id": "music-002", + "gold_rank": 1, + "gold_score": 0.5527176260948181, + "top1_score": 0.5527176260948181, + "top1_text": "The user's all-time favorite band is Radiohead.", + "top1_is_meta": false + } + ] + }, + "n": 30, + "rows": [ + { + "id": "name-001", + "turns": [ + "My name is Alex.", + "Got it." + ], + "query": "what is my name?", + "gold": "name is Alex", + "facts": [ + "The user's name is Alex." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-002", + "turns": [ + "I go by Sam.", + "OK Sam." + ], + "query": "what's my name?", + "gold": "go by Sam", + "facts": [ + "Sam goes by the name Sam." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-003", + "turns": [ + "You can call me Riley.", + "Hi Riley." + ], + "query": "what should you call me?", + "gold": "call me Riley", + "facts": [ + "The user's name is Riley." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "name-004", + "turns": [ + "I'm Jordan.", + "Nice to meet you." + ], + "query": "who am I?", + "gold": "Jordan", + "facts": [ + "The user's name is Jordan." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "pet-001", + "turns": [ + "I have a golden retriever named Apollo.", + "How sweet." + ], + "query": "what is my dog's name?", + "gold": "Apollo", + "facts": [ + "The user has a golden retriever named Apollo." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "pet-002", + "turns": [ + "My cat Luna sleeps on my keyboard.", + "Classic cat." + ], + "query": "what's my cat's name?", + "gold": "Luna", + "facts": [ + "The user has a cat named Luna.", + "Luna sleeps on the user's keyboard." + ], + "meta_mask": [ + false, + false + ], + "n_facts": 2, + "n_meta": 0 + }, + { + "id": "pet-003", + "turns": [ + "I just adopted a beagle puppy. Her name is Penny.", + "Congrats!" + ], + "query": "what kind of dog do I have?", + "gold": "beagle", + "facts": [ + "The user recently adopted a beagle puppy named Penny.", + "The user's beagle puppy is female." + ], + "meta_mask": [ + false, + false + ], + "n_facts": 2, + "n_meta": 0 + }, + { + "id": "job-001", + "turns": [ + "I work as a software engineer at a startup.", + "Cool field." + ], + "query": "what do I do for work?", + "gold": "software engineer", + "facts": [ + "The user works as a software engineer at a startup." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "job-002", + "turns": [ + "I'm a high school chemistry teacher.", + "That's important work." + ], + "query": "what is my profession?", + "gold": "chemistry teacher", + "facts": [ + "The user is a high school chemistry teacher." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "job-003", + "turns": [ + "I freelance as a graphic designer.", + "Nice." + ], + "query": "what's my job?", + "gold": "graphic designer", + "facts": [ + "The user freelances as a graphic designer." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-001", + "turns": [ + "I live in Lisbon now.", + "Beautiful city." + ], + "query": "where do I live?", + "gold": "Lisbon", + "facts": [ + "The user lives in Lisbon." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-002", + "turns": [ + "I just moved to Berlin last month.", + "Welcome to Berlin." + ], + "query": "what city am I in?", + "gold": "Berlin", + "facts": [ + "The user moved to Berlin last month." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "city-003", + "turns": [ + "I'm based in Toronto.", + "Cold this time of year." + ], + "query": "where am I located?", + "gold": "Toronto", + "facts": [ + "The user is based in Toronto." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-001", + "turns": [ + "I'm vegetarian.", + "Got it." + ], + "query": "do I eat meat?", + "gold": "vegetarian", + "facts": [ + "The user is vegetarian." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-002", + "turns": [ + "I'm severely allergic to peanuts.", + "Noted, will avoid." + ], + "query": "do I have any allergies?", + "gold": "peanut", + "facts": [ + "The user is severely allergic to peanuts." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "food-003", + "turns": [ + "I don't drink coffee \u2014 only tea.", + "Tea is great too." + ], + "query": "what do I drink in the morning?", + "gold": "tea", + "facts": [ + "The user does not drink coffee and only drinks tea." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-001", + "turns": [ + "I play classical piano.", + "Lovely hobby." + ], + "query": "what instrument do I play?", + "gold": "piano", + "facts": [ + "The user plays classical piano." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-002", + "turns": [ + "My main sport is rock climbing.", + "Cool." + ], + "query": "what sport do I do?", + "gold": "rock climbing", + "facts": [ + "The user's main sport is rock climbing." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "hobby-003", + "turns": [ + "I've been knitting for about ten years.", + "Impressive." + ], + "query": "what's a hobby I have?", + "gold": "knitting", + "facts": [ + "The user has been knitting for about ten years." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-001", + "turns": [ + "I have two kids, Maya and Theo.", + "What ages?" + ], + "query": "how many children do I have?", + "gold": "two", + "facts": [ + "The user has two children named Maya and Theo." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-002", + "turns": [ + "My partner's name is Casey.", + "Nice." + ], + "query": "who is my partner?", + "gold": "Casey", + "facts": [ + "The user's partner's name is Casey." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "family-003", + "turns": [ + "My mom lives in Vancouver.", + "Far from you?" + ], + "query": "where does my mom live?", + "gold": "Vancouver", + "facts": [ + "The user's mom lives in Vancouver." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "vehicle-001", + "turns": [ + "I drive a blue Subaru Outback.", + "Reliable car." + ], + "query": "what kind of car do I have?", + "gold": "Subaru", + "facts": [ + "The user drives a blue Subaru Outback." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "vehicle-002", + "turns": [ + "I don't own a car. I bike everywhere.", + "Healthy lifestyle." + ], + "query": "do I have a car?", + "gold": "does not own", + "facts": [ + "The user does not own a car.", + "The user bikes everywhere for transportation." + ], + "meta_mask": [ + false, + false + ], + "n_facts": 2, + "n_meta": 0 + }, + { + "id": "edu-001", + "turns": [ + "I studied applied mathematics in college.", + "Tough major." 
+ ], + "query": "what was my major?", + "gold": "applied mathematics", + "facts": [ + "The user studied applied mathematics in college." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "edu-002", + "turns": [ + "I got my MBA from UCLA two years ago.", + "Congrats." + ], + "query": "where did I get my MBA?", + "gold": "UCLA", + "facts": [ + "The user obtained an MBA from UCLA two years ago." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "tech-001", + "turns": [ + "My main laptop is a 16-inch MacBook Pro.", + "Solid machine." + ], + "query": "what computer do I use?", + "gold": "MacBook", + "facts": [ + "The user's main laptop is a 16-inch MacBook Pro." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "tech-002", + "turns": [ + "I prefer Neovim over VS Code.", + "Editor preferences are personal." + ], + "query": "what editor do I use?", + "gold": "Neovim", + "facts": [ + "The user prefers Neovim over VS Code." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "music-001", + "turns": [ + "I've been getting into bluegrass lately.", + "Fun genre." + ], + "query": "what music am I into these days?", + "gold": "bluegrass", + "facts": [ + "The user has been getting into bluegrass music recently." + ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + }, + { + "id": "music-002", + "turns": [ + "My all-time favorite band is Radiohead.", + "Great band." + ], + "query": "what's my favorite band?", + "gold": "Radiohead", + "facts": [ + "The user's all-time favorite band is Radiohead." 
+ ], + "meta_mask": [ + false + ], + "n_facts": 1, + "n_meta": 0 + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/dual-storage.json b/benchmarks/alignbench/runs/dual-storage.json new file mode 100644 index 0000000..ba852cd --- /dev/null +++ b/benchmarks/alignbench/runs/dual-storage.json @@ -0,0 +1,990 @@ +{ + "variant": "dual-storage", + "model": "Xenova/all-MiniLM-L6-v2", + "topk": 5, + "wall_seconds": "0.5", + "composite": { + "recall_at_1": 0.7833333333333333, + "recall_at_5": 0.9333333333333333, + "distractor_top1_rate": 0.06666666666666667, + "n": 60 + }, + "false_positive_rate": 0, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "pool_size": 108, + "recall_at_1": 0.8, + "recall_at_5": 1, + "mean_gold_rank": 1.2, + "distractor_at_top1": 2, + "false_positive_count": 0, + "median_gold_margin": 0.027444863118702423, + "items": [ + { + "id": "pronoun-001", + "query": "what is my name?", + "effective_query": "what is my name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 1, + "gold_score": 0.6464170828870618, + "top1_text": "My name is Alex.", + "top1_score": 0.6464170828870618, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-002", + "query": "what's the user's name?", + "effective_query": "what's the user's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.7294812723464028, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.7771254660839599, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-003", + "query": "who am I?", + "effective_query": "who am I?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.3799511962658216, + "top1_text": "I am me.", + "top1_score": 0.5210136429765079, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-004", + "query": "what do I do for work?", + 
"effective_query": "what do I do for work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 1, + "gold_score": 0.4066653080358072, + "top1_text": "I works as a software engineer at Acme.", + "top1_score": 0.4066653080358072, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-005", + "query": "what is the user's job?", + "effective_query": "what is the user's job?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 2, + "gold_score": 0.5754363621337681, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.5767198521038163, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-006", + "query": "where does the user work?", + "effective_query": "where does the user work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 1, + "gold_score": 0.5436570101621021, + "top1_text": "The user works as a software engineer at Acme.", + "top1_score": 0.5436570101621021, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-007", + "query": "what is my dog's name?", + "effective_query": "what is my dog's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.7021426325090143, + "top1_text": "My dog is named Apollo.", + "top1_score": 0.7021426325090143, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-008", + "query": "who is Apollo?", + "effective_query": "who is Apollo?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.7072701654355713, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 0.7072701654355713, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-009", + "query": "where do I live?", + "effective_query": "where do I live?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 1, + 
"gold_score": 0.46363626173102523, + "top1_text": "I lives in Lisbon.", + "top1_score": 0.46363626173102523, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-010", + "query": "what city does the user live in?", + "effective_query": "what city does the user live in?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 1, + "gold_score": 0.6400263303955707, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.6400263303955707, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-011", + "query": "when is my birthday?", + "effective_query": "when is my birthday?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.7775242455582675, + "top1_text": "My birthday is March 14.", + "top1_score": 0.7775242455582675, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-012", + "query": "when was the user born?", + "effective_query": "when was the user born?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.7251037734927323, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.7251037734927323, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-013", + "query": "do I have kids?", + "effective_query": "do I have kids?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.38989854641830624, + "top1_text": "I have two children, Maya and Theo.", + "top1_score": 0.38989854641830624, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-014", + "query": "how many children does the user have?", + "effective_query": "how many children does the user have?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.5766094762746933, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 
0.5766094762746933, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-015", + "query": "what is my usual coffee order?", + "effective_query": "what is my usual coffee order?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 1, + "gold_score": 0.6405250896286532, + "top1_text": "My favorite coffee order is an oat-milk flat white.", + "top1_score": 0.6405250896286532, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-016", + "query": "what coffee does the user drink?", + "effective_query": "what coffee does the user drink?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 2, + "gold_score": 0.5813657391451515, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 0.7197601756319, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-017", + "query": "what did I study?", + "effective_query": "what did I study?", + "gold_in_topk": true, + "gold_global_key": "pronoun#7", + "gold_rank": 1, + "gold_score": 0.6096865792083074, + "top1_text": "I studied applied mathematics at university.", + "top1_score": 0.6096865792083074, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-018", + "query": "do I have any allergies?", + "effective_query": "do I have any allergies?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.545551579531866, + "top1_text": "I am allergic to peanuts.", + "top1_score": 0.545551579531866, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-019", + "query": "what is the user allergic to?", + "effective_query": "what is the user allergic to?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.7261582549960168, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.7261582549960168, + "top1_is_distractor": false, + 
"false_positive": false + }, + { + "id": "pronoun-020", + "query": "what kind of car do I drive?", + "effective_query": "what kind of car do I drive?", + "gold_in_topk": true, + "gold_global_key": "pronoun#9", + "gold_rank": 1, + "gold_score": 0.5645994319440694, + "top1_text": "I drives a 2019 Toyota Corolla.", + "top1_score": 0.5645994319440694, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "temporal", + "n": 14, + "pool_size": 108, + "recall_at_1": 0.5, + "recall_at_5": 0.7142857142857143, + "mean_gold_rank": 5.5, + "distractor_at_top1": 2, + "false_positive_count": 0, + "median_gold_margin": 0.049944619619024966, + "items": [ + { + "id": "temporal-001", + "query": "where does the user live now?", + "effective_query": "where does the user live now?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 3, + "gold_score": 0.550366425409688, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.5504016420681116, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-002", + "query": "where did the user used to live?", + "effective_query": "where did the user used to live?", + "gold_in_topk": true, + "gold_global_key": "temporal#1", + "gold_rank": 1, + "gold_score": 0.5490901457590983, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.5490901457590983, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-003", + "query": "when did the user move?", + "effective_query": "when did the user move?", + "gold_in_topk": true, + "gold_global_key": "temporal#2", + "gold_rank": 1, + "gold_score": 0.5076513080537272, + "top1_text": "The user moved from Berlin to Lisbon in 2024.", + "top1_score": 0.5076513080537272, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-004", + "query": "where is the user currently based?", + "effective_query": "where is the user currently based?", + "gold_in_topk": 
true, + "gold_global_key": "temporal#0", + "gold_rank": 11, + "gold_score": 0.42833906054622706, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.4962051712532369, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-005", + "query": "what is the user reading?", + "effective_query": "what is the user reading?", + "gold_in_topk": true, + "gold_global_key": "temporal#3", + "gold_rank": 8, + "gold_score": 0.4600491260020972, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 0.5634334413727431, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-006", + "query": "what did the user read last year?", + "effective_query": "what did the user read last year?", + "gold_in_topk": true, + "gold_global_key": "temporal#4", + "gold_rank": 1, + "gold_score": 0.5028289729480597, + "top1_text": "Last year the user read 'Project Hail Mary'.", + "top1_score": 0.5028289729480597, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-007", + "query": "what is the user working on these days?", + "effective_query": "what is the user working on these days?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 4, + "gold_score": 0.4513265913655406, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.48543171981436045, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-008", + "query": "what did the user finish last month?", + "effective_query": "what did the user finish last month?", + "gold_in_topk": true, + "gold_global_key": "temporal#6", + "gold_rank": 1, + "gold_score": 0.5544356167423142, + "top1_text": "The user finished the Sprint-4 reranker training last month.", + "top1_score": 0.5544356167423142, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-009", + "query": "what LLM does the user prefer?", + "effective_query": 
"what LLM does the user prefer?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 1, + "gold_score": 0.6226655286181774, + "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.", + "top1_score": 0.6226655286181774, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-010", + "query": "which model did the user use before?", + "effective_query": "which model did the user use before?", + "gold_in_topk": true, + "gold_global_key": "temporal#8", + "gold_rank": 1, + "gold_score": 0.5205286761987613, + "top1_text": "The user used GPT-4 as their primary model in 2024.", + "top1_score": 0.5205286761987613, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-011", + "query": "did the user get a new phone recently?", + "effective_query": "did the user get a new phone recently?", + "gold_in_topk": true, + "gold_global_key": "temporal#9", + "gold_rank": 1, + "gold_score": 0.6113316689987077, + "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.", + "top1_score": 0.6113316689987077, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-012", + "query": "is the user still in Berlin?", + "effective_query": "is the user still in Berlin?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 5, + "gold_score": 0.4089445689504921, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.7214467722165934, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-013", + "query": "what is the user up to?", + "effective_query": "what is the user up to?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 19, + "gold_score": 0.2672680179526975, + "top1_text": "The user is asking a question.", + "top1_score": 0.5409891051249414, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-014", + "query": "which model 
is the user on right now?", + "effective_query": "which model is the user on right now?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 20, + "gold_score": 0.31088477125594804, + "top1_text": "The user's name is Alex.", + "top1_score": 0.40861487066233426, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "specificity", + "n": 14, + "pool_size": 108, + "recall_at_1": 0.8571428571428571, + "recall_at_5": 1, + "mean_gold_rank": 1.2142857142857142, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.15120813892175938, + "items": [ + { + "id": "specificity-001", + "query": "tell me about my dog", + "effective_query": "tell me about my dog", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 3, + "gold_score": 0.4727109277650852, + "top1_text": "I have a dog named Apollo.", + "top1_score": 0.5132527569891849, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-002", + "query": "what kind of pet do I have?", + "effective_query": "what kind of pet do I have?", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 2, + "gold_score": 0.43959063553360933, + "top1_text": "I have a dog named Apollo.", + "top1_score": 0.5181634785804181, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-003", + "query": "do I own a bike?", + "effective_query": "do I own a bike?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.6758443024030343, + "top1_text": "I owns a Bianchi road bike.", + "top1_score": 0.6758443024030343, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-004", + "query": "what brand of bike does the user have?", + "effective_query": "what brand of bike does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 
0.6329297203833848, + "top1_text": "The user owns a Bianchi road bike.", + "top1_score": 0.6329297203833848, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-005", + "query": "what computer do I use?", + "effective_query": "what computer do I use?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 0.33136554535868123, + "top1_text": "My primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.33136554535868123, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-006", + "query": "what laptop does the user have?", + "effective_query": "what laptop does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 0.6450468635073591, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.6450468635073591, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-007", + "query": "do I have any musical instruments?", + "effective_query": "do I have any musical instruments?", + "gold_in_topk": true, + "gold_global_key": "specificity#3", + "gold_rank": 1, + "gold_score": 0.3865674783833039, + "top1_text": "I have a Yamaha P-125 digital piano in the living room.", + "top1_score": 0.3865674783833039, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-008", + "query": "which note-taking app do I use?", + "effective_query": "which note-taking app do I use?", + "gold_in_topk": true, + "gold_global_key": "specificity#4", + "gold_rank": 1, + "gold_score": 0.45506558306415273, + "top1_text": "I uses Logseq for personal notes and Notion for work.", + "top1_score": 0.45506558306415273, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-009", + "query": "where do I like to eat in Lisbon?", + "effective_query": "where do I like to eat in Lisbon?", + "gold_in_topk": true, + 
"gold_global_key": "specificity#5", + "gold_rank": 1, + "gold_score": 0.7771199647073028, + "top1_text": "My favorite restaurant in Lisbon is Cervejaria Ramiro.", + "top1_score": 0.7771199647073028, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-010", + "query": "what brand sunglasses does the user wear?", + "effective_query": "what brand sunglasses does the user wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#6", + "gold_rank": 1, + "gold_score": 0.6512271965052092, + "top1_text": "The user wears Smith Lowdown sunglasses.", + "top1_score": 0.6512271965052092, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-011", + "query": "do I read on a Kindle?", + "effective_query": "do I read on a Kindle?", + "gold_in_topk": true, + "gold_global_key": "specificity#7", + "gold_rank": 1, + "gold_score": 0.7110555603582231, + "top1_text": "I reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 0.7110555603582231, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-012", + "query": "what espresso machine does the user own?", + "effective_query": "what espresso machine does the user own?", + "gold_in_topk": true, + "gold_global_key": "specificity#8", + "gold_rank": 1, + "gold_score": 0.7366961226077487, + "top1_text": "The user's home espresso machine is a Lelit Bianca v3.", + "top1_score": 0.7366961226077487, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-013", + "query": "what shoes do I wear?", + "effective_query": "what shoes do I wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.42414870117196674, + "top1_text": "I wears Allbirds Wool Runners daily.", + "top1_score": 0.42414870117196674, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-014", + "query": "what brand are the user's everyday shoes?", + 
"effective_query": "what brand are the user's everyday shoes?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.4238019274298934, + "top1_text": "I wears Allbirds Wool Runners daily.", + "top1_score": 0.4238019274298934, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "negation", + "n": 12, + "pool_size": 108, + "recall_at_1": 1, + "recall_at_5": 1, + "mean_gold_rank": 1, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.26762270427014134, + "items": [ + { + "id": "negation-001", + "query": "does the user drink coffee?", + "effective_query": "does the user drink coffee?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 0.7905096599561829, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 0.7905096599561829, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-002", + "query": "what does the user drink in the morning?", + "effective_query": "what does the user drink in the morning?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 0.5285974920275435, + "top1_text": "The user does not drink coffee. 
They prefer tea.", + "top1_score": 0.5285974920275435, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-003", + "query": "am I vegetarian?", + "effective_query": "am I vegetarian?", + "gold_in_topk": true, + "gold_global_key": "negation#1", + "gold_rank": 1, + "gold_score": 0.7051238708423307, + "top1_text": "I am not vegetarian, but avoids red meat.", + "top1_score": 0.7051238708423307, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-004", + "query": "is the user on Twitter?", + "effective_query": "is the user on Twitter?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.5874304481012608, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.5874304481012608, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-005", + "query": "which social networks does the user use?", + "effective_query": "which social networks does the user use?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.5067702908301415, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.5067702908301415, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-006", + "query": "does the user own a car?", + "effective_query": "does the user own a car?", + "gold_in_topk": true, + "gold_global_key": "negation#3", + "gold_rank": 1, + "gold_score": 0.7488288864422115, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 0.7488288864422115, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-007", + "query": "is the user active on LinkedIn?", + "effective_query": "is the user active on LinkedIn?", + "gold_in_topk": true, + "gold_global_key": "negation#4", + "gold_rank": 1, + "gold_score": 0.7880239246717338, + "top1_text": 
"The user is not on LinkedIn anymore.", + "top1_score": 0.7880239246717338, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-008", + "query": "any foods the user hates?", + "effective_query": "any foods the user hates?", + "gold_in_topk": true, + "gold_global_key": "negation#5", + "gold_rank": 1, + "gold_score": 0.46360991866022544, + "top1_text": "I dislikes cilantro intensely.", + "top1_score": 0.46360991866022544, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-009", + "query": "has the user traveled to Asia?", + "effective_query": "has the user traveled to Asia?", + "gold_in_topk": true, + "gold_global_key": "negation#6", + "gold_rank": 1, + "gold_score": 0.6919597377462416, + "top1_text": "The user has never been to Asia.", + "top1_score": 0.6919597377462416, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-010", + "query": "does the user like horror movies?", + "effective_query": "does the user like horror movies?", + "gold_in_topk": true, + "gold_global_key": "negation#7", + "gold_rank": 1, + "gold_score": 0.7303882056719367, + "top1_text": "The user does not enjoy horror movies.", + "top1_score": 0.7303882056719367, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-011", + "query": "can the user eat shrimp?", + "effective_query": "can the user eat shrimp?", + "gold_in_topk": true, + "gold_global_key": "negation#8", + "gold_rank": 1, + "gold_score": 0.7044731236020111, + "top1_text": "The user does not eat shellfish.", + "top1_score": 0.7044731236020111, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-012", + "query": "is the user learning a new language?", + "effective_query": "is the user learning a new language?", + "gold_in_topk": true, + "gold_global_key": "negation#9", + "gold_rank": 1, + "gold_score": 0.6522323599643379, + "top1_text": "The user is not currently learning any new 
languages.", + "top1_score": 0.6522323599643379, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "control", + "n": 10, + "pool_size": 108, + "recall_at_1": 0, + "recall_at_5": 0, + "mean_gold_rank": null, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": null, + "items": [ + { + "id": "control-001", + "query": "what is the airspeed velocity of an unladen swallow?", + "effective_query": "what is the airspeed velocity of an unladen swallow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "I am asking a question.", + "top1_score": 0.25122827069301523, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "control-002", + "query": "who is the current president of France?", + "effective_query": "who is the current president of France?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.2564335064669229, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-003", + "query": "what is the capital of Mongolia?", + "effective_query": "what is the capital of Mongolia?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "I have never been to Asia.", + "top1_score": 0.3106507170301111, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-004", + "query": "how does photosynthesis work?", + "effective_query": "how does photosynthesis work?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "I am asking a question.", + "top1_score": 0.15843567725842153, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "control-005", + "query": "translate 'goodnight' to Japanese", + "effective_query": "translate 'goodnight' to 
Japanese", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "I am not currently learning any new languages.", + "top1_score": 0.2331788870062421, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-006", + "query": "what year did World War II end?", + "effective_query": "what year did World War II end?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "Before 2024, I lived in Berlin.", + "top1_score": 0.32252117603205516, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-007", + "query": "explain entropy in thermodynamics", + "effective_query": "explain entropy in thermodynamics", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.18512903871088907, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-008", + "query": "best way to debug a segfault in C", + "effective_query": "best way to debug a segfault in C", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "I am currently working on a memory benchmark project.", + "top1_score": 0.24122260587509706, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-009", + "query": "what's the weather going to be tomorrow?", + "effective_query": "what's the weather going to be tomorrow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of May 14, 2026, I am a term mentioned in the conversation.", + "top1_score": 0.25613897573144834, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "control-010", + "query": "give me a recipe for tiramisu", + "effective_query": "give me a recipe for tiramisu", + 
"gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "My favorite restaurant is Cervejaria Ramiro.", + "top1_score": 0.32863639833819, + "top1_is_distractor": false, + "false_positive": false + } + ] + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/hybrid-bm25.json b/benchmarks/alignbench/runs/hybrid-bm25.json new file mode 100644 index 0000000..f199f6a --- /dev/null +++ b/benchmarks/alignbench/runs/hybrid-bm25.json @@ -0,0 +1,990 @@ +{ + "variant": "hybrid-bm25", + "model": "Xenova/all-MiniLM-L6-v2", + "topk": 5, + "wall_seconds": "0.3", + "composite": { + "recall_at_1": 0.6166666666666667, + "recall_at_5": 0.9166666666666666, + "distractor_top1_rate": 0.06666666666666667, + "n": 60 + }, + "false_positive_rate": 1, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "pool_size": 55, + "recall_at_1": 0.6, + "recall_at_5": 0.95, + "mean_gold_rank": 1.95, + "distractor_at_top1": 2, + "false_positive_count": 0, + "median_gold_margin": 0.03374568959009672, + "items": [ + { + "id": "pronoun-001", + "query": "what is my name?", + "effective_query": "what is my name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's name is Alex.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-002", + "query": "what's the user's name?", + "effective_query": "what's the user's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.9576851044161381, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.958092359862655, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-003", + "query": "who am I?", + "effective_query": "who am I?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.4284196416000849, + "top1_text": "The user is me.", + 
"top1_score": 0.6, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-004", + "query": "what do I do for work?", + "effective_query": "what do I do for work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 3, + "gold_score": 0.5811840395869337, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.854461597488777, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-005", + "query": "what is the user's job?", + "effective_query": "what is the user's job?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 10, + "gold_score": 0.6045280628239001, + "top1_text": "The user's name is Alex.", + "top1_score": 0.7953675650047598, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-006", + "query": "where does the user work?", + "effective_query": "where does the user work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 4, + "gold_score": 0.6056863546267723, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.8849990304585116, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-007", + "query": "what is my dog's name?", + "effective_query": "what is my dog's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.9958263380031263, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 0.9958263380031263, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-008", + "query": "who is Apollo?", + "effective_query": "who is Apollo?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-009", + "query": "where do I 
live?", + "effective_query": "where do I live?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 2, + "gold_score": 0.5993887537891871, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.6, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-010", + "query": "what city does the user live in?", + "effective_query": "what city does the user live in?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 1, + "gold_score": 0.877017959407961, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.877017959407961, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-011", + "query": "when is my birthday?", + "effective_query": "when is my birthday?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's birthday is March 14.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-012", + "query": "when was the user born?", + "effective_query": "when was the user born?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.6052989966186167, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.6052989966186167, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-013", + "query": "do I have kids?", + "effective_query": "do I have kids?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.6, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 0.6, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-014", + "query": "how many children does the user have?", + "effective_query": "how many children does the user have?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user 
has two children, Maya and Theo.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-015", + "query": "what is my usual coffee order?", + "effective_query": "what is my usual coffee order?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's favorite coffee order is an oat-milk flat white.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-016", + "query": "what coffee does the user drink?", + "effective_query": "what coffee does the user drink?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 2, + "gold_score": 0.6025728313819994, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-017", + "query": "what did I study?", + "effective_query": "what did I study?", + "gold_in_topk": true, + "gold_global_key": "pronoun#7", + "gold_rank": 1, + "gold_score": 0.6, + "top1_text": "The user studied applied mathematics at university.", + "top1_score": 0.6, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-018", + "query": "do I have any allergies?", + "effective_query": "do I have any allergies?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.6, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.6, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-019", + "query": "what is the user allergic to?", + "effective_query": "what is the user allergic to?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-020", + "query": "what kind of car do I 
drive?", + "effective_query": "what kind of car do I drive?", + "gold_in_topk": true, + "gold_global_key": "pronoun#9", + "gold_rank": 2, + "gold_score": 0.6, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 0.9330272288700376, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "temporal", + "n": 14, + "pool_size": 55, + "recall_at_1": 0.5, + "recall_at_5": 0.7857142857142857, + "mean_gold_rank": 5.142857142857143, + "distractor_at_top1": 2, + "false_positive_count": 0, + "median_gold_margin": 0.18738929012250227, + "items": [ + { + "id": "temporal-001", + "query": "where does the user live now?", + "effective_query": "where does the user live now?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 4, + "gold_score": 0.6066879474966196, + "top1_text": "The user does not eat shellfish.", + "top1_score": 0.6348007612409563, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-002", + "query": "where did the user used to live?", + "effective_query": "where did the user used to live?", + "gold_in_topk": true, + "gold_global_key": "temporal#1", + "gold_rank": 4, + "gold_score": 0.606449574798833, + "top1_text": "The user moved from Berlin to Lisbon in 2024.", + "top1_score": 0.754314259838859, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-003", + "query": "when did the user move?", + "effective_query": "when did the user move?", + "gold_in_topk": true, + "gold_global_key": "temporal#2", + "gold_rank": 2, + "gold_score": 0.8537709758404959, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.9047816753185527, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-004", + "query": "where is the user currently based?", + "effective_query": "where is the user currently based?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 15, + 
"gold_score": 0.5204413965610561, + "top1_text": "The user is not currently learning any new languages.", + "top1_score": 0.7399852806383946, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-005", + "query": "what is the user reading?", + "effective_query": "what is the user reading?", + "gold_in_topk": true, + "gold_global_key": "temporal#3", + "gold_rank": 1, + "gold_score": 0.8769262331140877, + "top1_text": "As of April 2026, the user is reading 'The Power Broker'.", + "top1_score": 0.8769262331140877, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-006", + "query": "what did the user read last year?", + "effective_query": "what did the user read last year?", + "gold_in_topk": true, + "gold_global_key": "temporal#4", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "Last year the user read 'Project Hail Mary'.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-007", + "query": "what is the user working on these days?", + "effective_query": "what is the user working on these days?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 1, + "gold_score": 0.9501075843236121, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.9501075843236121, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-008", + "query": "what did the user finish last month?", + "effective_query": "what did the user finish last month?", + "gold_in_topk": true, + "gold_global_key": "temporal#6", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user finished the Sprint-4 reranker training last month.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-009", + "query": "what LLM does the user prefer?", + "effective_query": "what LLM does the user prefer?", + "gold_in_topk": true, + "gold_global_key": 
"temporal#7", + "gold_rank": 1, + "gold_score": 0.7943511233809831, + "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.", + "top1_score": 0.7943511233809831, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-010", + "query": "which model did the user use before?", + "effective_query": "which model did the user use before?", + "gold_in_topk": true, + "gold_global_key": "temporal#8", + "gold_rank": 1, + "gold_score": 0.9153791248018733, + "top1_text": "The user used GPT-4 as their primary model in 2024.", + "top1_score": 0.9153791248018733, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-011", + "query": "did the user get a new phone recently?", + "effective_query": "did the user get a new phone recently?", + "gold_in_topk": true, + "gold_global_key": "temporal#9", + "gold_rank": 1, + "gold_score": 0.9454096158904209, + "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.", + "top1_score": 0.9454096158904209, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-012", + "query": "is the user still in Berlin?", + "effective_query": "is the user still in Berlin?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 5, + "gold_score": 0.454434056359795, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-013", + "query": "what is the user up to?", + "effective_query": "what is the user up to?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 18, + "gold_score": 0.3782959535726875, + "top1_text": "The user is asking a question.", + "top1_score": 0.7138787297189697, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-014", + "query": "which model is the user on right now?", + "effective_query": "which model is the user on right now?", + 
"gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 17, + "gold_score": 0.5029854069075197, + "top1_text": "The user used GPT-4 as their primary model in 2024.", + "top1_score": 0.8573596067886893, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "specificity", + "n": 14, + "pool_size": 55, + "recall_at_1": 0.5, + "recall_at_5": 0.9285714285714286, + "mean_gold_rank": 1.8571428571428572, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.009640204109421346, + "items": [ + { + "id": "specificity-001", + "query": "tell me about my dog", + "effective_query": "tell me about my dog", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 3, + "gold_score": 0.7584346404038529, + "top1_text": "The user has a dog named Apollo.", + "top1_score": 0.8530735681861806, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-002", + "query": "what kind of pet do I have?", + "effective_query": "what kind of pet do I have?", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 2, + "gold_score": 0.5555778772988805, + "top1_text": "The user has a dog named Apollo.", + "top1_score": 0.6, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-003", + "query": "do I own a bike?", + "effective_query": "do I own a bike?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 2, + "gold_score": 0.8918705240259431, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 0.9056616692307067, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-004", + "query": "what brand of bike does the user have?", + "effective_query": "what brand of bike does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.9108344961613679, + "top1_text": "The user 
owns a Bianchi road bike.", + "top1_score": 0.9108344961613679, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-005", + "query": "what computer do I use?", + "effective_query": "what computer do I use?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 2, + "gold_score": 0.6, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.7243317748899458, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-006", + "query": "what laptop does the user have?", + "effective_query": "what laptop does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-007", + "query": "do I have any musical instruments?", + "effective_query": "do I have any musical instruments?", + "gold_in_topk": true, + "gold_global_key": "specificity#3", + "gold_rank": 2, + "gold_score": 0.6, + "top1_text": "The user is not currently learning any new languages.", + "top1_score": 0.7662387492993549, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-008", + "query": "which note-taking app do I use?", + "effective_query": "which note-taking app do I use?", + "gold_in_topk": true, + "gold_global_key": "specificity#4", + "gold_rank": 2, + "gold_score": 0.6, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.6979807773255753, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-009", + "query": "where do I like to eat in Lisbon?", + "effective_query": "where do I like to eat in Lisbon?", + "gold_in_topk": true, + "gold_global_key": "specificity#5", + "gold_rank": 1, + "gold_score": 0.8290638598887166, + "top1_text": 
"The user's favorite restaurant in Lisbon is Cervejaria Ramiro.", + "top1_score": 0.8290638598887166, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-010", + "query": "what brand sunglasses does the user wear?", + "effective_query": "what brand sunglasses does the user wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#6", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user wears Smith Lowdown sunglasses.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-011", + "query": "do I read on a Kindle?", + "effective_query": "do I read on a Kindle?", + "gold_in_topk": true, + "gold_global_key": "specificity#7", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-012", + "query": "what espresso machine does the user own?", + "effective_query": "what espresso machine does the user own?", + "gold_in_topk": true, + "gold_global_key": "specificity#8", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user's home espresso machine is a Lelit Bianca v3.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-013", + "query": "what shoes do I wear?", + "effective_query": "what shoes do I wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.6, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.6, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-014", + "query": "what brand are the user's everyday shoes?", + "effective_query": "what brand are the user's everyday shoes?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 6, + "gold_score": 0.6116802638773472, + "top1_text": "The user's favorite restaurant is 
Cervejaria Ramiro.", + "top1_score": 0.7130016116548469, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "negation", + "n": 12, + "pool_size": 55, + "recall_at_1": 0.9166666666666666, + "recall_at_5": 1, + "mean_gold_rank": 1.1666666666666667, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.5607048274399469, + "items": [ + { + "id": "negation-001", + "query": "does the user drink coffee?", + "effective_query": "does the user drink coffee?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-002", + "query": "what does the user drink in the morning?", + "effective_query": "what does the user drink in the morning?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not drink coffee. 
They prefer tea.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-003", + "query": "am I vegetarian?", + "effective_query": "am I vegetarian?", + "gold_in_topk": true, + "gold_global_key": "negation#1", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user is not vegetarian, but avoids red meat.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-004", + "query": "is the user on Twitter?", + "effective_query": "is the user on Twitter?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.9150475533063944, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.9150475533063944, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-005", + "query": "which social networks does the user use?", + "effective_query": "which social networks does the user use?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-006", + "query": "does the user own a car?", + "effective_query": "does the user own a car?", + "gold_in_topk": true, + "gold_global_key": "negation#3", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-007", + "query": "is the user active on LinkedIn?", + "effective_query": "is the user active on LinkedIn?", + "gold_in_topk": true, + "gold_global_key": "negation#4", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user is not on LinkedIn anymore.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + 
"id": "negation-008", + "query": "any foods the user hates?", + "effective_query": "any foods the user hates?", + "gold_in_topk": true, + "gold_global_key": "negation#5", + "gold_rank": 3, + "gold_score": 0.5952157764050423, + "top1_text": "The user is not currently learning any new languages.", + "top1_score": 0.6185955328000405, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-009", + "query": "has the user traveled to Asia?", + "effective_query": "has the user traveled to Asia?", + "gold_in_topk": true, + "gold_global_key": "negation#6", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user has never been to Asia.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-010", + "query": "does the user like horror movies?", + "effective_query": "does the user like horror movies?", + "gold_in_topk": true, + "gold_global_key": "negation#7", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not enjoy horror movies.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-011", + "query": "can the user eat shrimp?", + "effective_query": "can the user eat shrimp?", + "gold_in_topk": true, + "gold_global_key": "negation#8", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user does not eat shellfish.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-012", + "query": "is the user learning a new language?", + "effective_query": "is the user learning a new language?", + "gold_in_topk": true, + "gold_global_key": "negation#9", + "gold_rank": 1, + "gold_score": 1, + "top1_text": "The user is not currently learning any new languages.", + "top1_score": 1, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "control", + "n": 10, + "pool_size": 55, + "recall_at_1": 0, + "recall_at_5": 0, + "mean_gold_rank": null, + 
"distractor_at_top1": 0, + "false_positive_count": 10, + "median_gold_margin": null, + "items": [ + { + "id": "control-001", + "query": "what is the airspeed velocity of an unladen swallow?", + "effective_query": "what is the airspeed velocity of an unladen swallow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user's favorite coffee order is an oat-milk flat white.", + "top1_score": 0.7351279172813607, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-002", + "query": "who is the current president of France?", + "effective_query": "who is the current president of France?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.9123350671733821, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-003", + "query": "what is the capital of Mongolia?", + "effective_query": "what is the capital of Mongolia?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of May 14, 2026, Apollo is a term mentioned in the conversation.", + "top1_score": 0.7795522818480163, + "top1_is_distractor": true, + "false_positive": true + }, + { + "id": "control-004", + "query": "how does photosynthesis work?", + "effective_query": "how does photosynthesis work?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.8130935993675803, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-005", + "query": "translate 'goodnight' to Japanese", + "effective_query": "translate 'goodnight' to Japanese", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user 
has never been to Asia.", + "top1_score": 0.8046049367670489, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-006", + "query": "what year did World War II end?", + "effective_query": "what year did World War II end?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "Last year the user read 'Project Hail Mary'.", + "top1_score": 0.7019848467597081, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-007", + "query": "explain entropy in thermodynamics", + "effective_query": "explain entropy in thermodynamics", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.7312421987766862, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-008", + "query": "best way to debug a segfault in C", + "effective_query": "best way to debug a segfault in C", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.7099948998710806, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-009", + "query": "what's the weather going to be tomorrow?", + "effective_query": "what's the weather going to be tomorrow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.", + "top1_score": 0.7680416836559597, + "top1_is_distractor": false, + "false_positive": true + }, + { + "id": "control-010", + "query": "give me a recipe for tiramisu", + "effective_query": "give me a recipe for tiramisu", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user is me.", + "top1_score": 0.6231932527713353, + 
"top1_is_distractor": true, + "false_positive": true + } + ] + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/modal-ablation.json b/benchmarks/alignbench/runs/modal-ablation.json new file mode 100644 index 0000000..e5f7fb9 --- /dev/null +++ b/benchmarks/alignbench/runs/modal-ablation.json @@ -0,0 +1,346 @@ +{ + "models": [ + { + "model": "sentence-transformers/all-MiniLM-L6-v2", + "wall_seconds": 5.9, + "composite": { + "recall_at_1": 0.7333333333333333, + "recall_at_5": 0.9333333333333333, + "distractor_top1_rate": 0.06666666666666667, + "n": 60 + }, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "recall_at_1": 0.7, + "recall_at_5": 1.0, + "mean_gold_rank": 1.3, + "median_gold_margin": 0.04656702280044556, + "distractor_at_top1": 2 + }, + { + "axis": "temporal", + "n": 14, + "recall_at_1": 0.5, + "recall_at_5": 0.7142857142857143, + "mean_gold_rank": 5.5, + "median_gold_margin": 0.04994499683380127, + "distractor_at_top1": 2 + }, + { + "axis": "specificity", + "n": 14, + "recall_at_1": 0.8571428571428571, + "recall_at_5": 1.0, + "mean_gold_rank": 1.2142857142857142, + "median_gold_margin": 0.14383524656295776, + "distractor_at_top1": 0 + }, + { + "axis": "negation", + "n": 12, + "recall_at_1": 0.9166666666666666, + "recall_at_5": 1.0, + "mean_gold_rank": 1.0833333333333333, + "median_gold_margin": 0.2676225006580353, + "distractor_at_top1": 0 + }, + { + "axis": "control", + "n": 10, + "recall_at_1": 0.0, + "recall_at_5": 0.0, + "mean_gold_rank": null, + "median_gold_margin": null, + "distractor_at_top1": 0 + } + ] + }, + { + "model": "sentence-transformers/all-mpnet-base-v2", + "wall_seconds": 6.8, + "composite": { + "recall_at_1": 0.7333333333333333, + "recall_at_5": 0.95, + "distractor_top1_rate": 0.08333333333333333, + "n": 60 + }, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "recall_at_1": 0.55, + "recall_at_5": 0.95, + "mean_gold_rank": 1.8, + "median_gold_margin": 0.0059460848569869995, + "distractor_at_top1": 3 
+ }, + { + "axis": "temporal", + "n": 14, + "recall_at_1": 0.5714285714285714, + "recall_at_5": 0.8571428571428571, + "mean_gold_rank": 6.5, + "median_gold_margin": 0.054799675941467285, + "distractor_at_top1": 2 + }, + { + "axis": "specificity", + "n": 14, + "recall_at_1": 0.9285714285714286, + "recall_at_5": 1.0, + "mean_gold_rank": 1.0714285714285714, + "median_gold_margin": 0.12878745794296265, + "distractor_at_top1": 0 + }, + { + "axis": "negation", + "n": 12, + "recall_at_1": 1.0, + "recall_at_5": 1.0, + "mean_gold_rank": 1.0, + "median_gold_margin": 0.20404121279716492, + "distractor_at_top1": 0 + }, + { + "axis": "control", + "n": 10, + "recall_at_1": 0.0, + "recall_at_5": 0.0, + "mean_gold_rank": null, + "median_gold_margin": null, + "distractor_at_top1": 0 + } + ] + }, + { + "model": "BAAI/bge-small-en-v1.5", + "wall_seconds": 5.9, + "composite": { + "recall_at_1": 0.5333333333333333, + "recall_at_5": 0.8, + "distractor_top1_rate": 0.36666666666666664, + "n": 60 + }, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "recall_at_1": 0.5, + "recall_at_5": 0.8, + "mean_gold_rank": 3.6, + "median_gold_margin": 0.0, + "distractor_at_top1": 8 + }, + { + "axis": "temporal", + "n": 14, + "recall_at_1": 0.21428571428571427, + "recall_at_5": 0.42857142857142855, + "mean_gold_rank": 9.714285714285714, + "median_gold_margin": -0.042997002601623535, + "distractor_at_top1": 10 + }, + { + "axis": "specificity", + "n": 14, + "recall_at_1": 0.7142857142857143, + "recall_at_5": 1.0, + "mean_gold_rank": 1.6428571428571428, + "median_gold_margin": 0.07060164213180542, + "distractor_at_top1": 2 + }, + { + "axis": "negation", + "n": 12, + "recall_at_1": 0.75, + "recall_at_5": 1.0, + "mean_gold_rank": 1.5, + "median_gold_margin": 0.07531774044036865, + "distractor_at_top1": 2 + }, + { + "axis": "control", + "n": 10, + "recall_at_1": 0.0, + "recall_at_5": 0.0, + "mean_gold_rank": null, + "median_gold_margin": null, + "distractor_at_top1": 0 + } + ] + }, + { + "model": 
"BAAI/bge-base-en-v1.5", + "wall_seconds": 6.1, + "composite": { + "recall_at_1": 0.6166666666666667, + "recall_at_5": 0.7833333333333333, + "distractor_top1_rate": 0.25, + "n": 60 + }, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "recall_at_1": 0.7, + "recall_at_5": 0.9, + "mean_gold_rank": 2.35, + "median_gold_margin": 0.0298384428024292, + "distractor_at_top1": 5 + }, + { + "axis": "temporal", + "n": 14, + "recall_at_1": 0.2857142857142857, + "recall_at_5": 0.42857142857142855, + "mean_gold_rank": 10.357142857142858, + "median_gold_margin": -0.050244808197021484, + "distractor_at_top1": 7 + }, + { + "axis": "specificity", + "n": 14, + "recall_at_1": 0.7142857142857143, + "recall_at_5": 1.0, + "mean_gold_rank": 1.5, + "median_gold_margin": 0.07876402139663696, + "distractor_at_top1": 2 + }, + { + "axis": "negation", + "n": 12, + "recall_at_1": 0.75, + "recall_at_5": 0.75, + "mean_gold_rank": 3.5, + "median_gold_margin": 0.03547412157058716, + "distractor_at_top1": 1 + }, + { + "axis": "control", + "n": 10, + "recall_at_1": 0.0, + "recall_at_5": 0.0, + "mean_gold_rank": null, + "median_gold_margin": null, + "distractor_at_top1": 0 + } + ] + }, + { + "model": "intfloat/e5-small-v2", + "wall_seconds": 6.4, + "composite": { + "recall_at_1": 0.45, + "recall_at_5": 0.7166666666666667, + "distractor_top1_rate": 0.4166666666666667, + "n": 60 + }, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "recall_at_1": 0.35, + "recall_at_5": 0.85, + "mean_gold_rank": 3.6, + "median_gold_margin": -0.008539855480194092, + "distractor_at_top1": 10 + }, + { + "axis": "temporal", + "n": 14, + "recall_at_1": 0.35714285714285715, + "recall_at_5": 0.42857142857142855, + "mean_gold_rank": 9.785714285714286, + "median_gold_margin": -0.02019256353378296, + "distractor_at_top1": 7 + }, + { + "axis": "specificity", + "n": 14, + "recall_at_1": 0.6428571428571429, + "recall_at_5": 0.7142857142857143, + "mean_gold_rank": 3.2857142857142856, + "median_gold_margin": 
0.011939942836761475, + "distractor_at_top1": 3 + }, + { + "axis": "negation", + "n": 12, + "recall_at_1": 0.5, + "recall_at_5": 0.8333333333333334, + "mean_gold_rank": 3.0, + "median_gold_margin": 0.0028389692306518555, + "distractor_at_top1": 5 + }, + { + "axis": "control", + "n": 10, + "recall_at_1": 0.0, + "recall_at_5": 0.0, + "mean_gold_rank": null, + "median_gold_margin": null, + "distractor_at_top1": 0 + } + ] + }, + { + "model": "intfloat/e5-base-v2", + "wall_seconds": 6.6, + "composite": { + "recall_at_1": 0.7166666666666667, + "recall_at_5": 0.9333333333333333, + "distractor_top1_rate": 0.2, + "n": 60 + }, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "recall_at_1": 0.65, + "recall_at_5": 0.95, + "mean_gold_rank": 1.95, + "median_gold_margin": 0.0074617862701416016, + "distractor_at_top1": 6 + }, + { + "axis": "temporal", + "n": 14, + "recall_at_1": 0.7142857142857143, + "recall_at_5": 0.7857142857142857, + "mean_gold_rank": 6.214285714285714, + "median_gold_margin": 0.007385969161987305, + "distractor_at_top1": 3 + }, + { + "axis": "specificity", + "n": 14, + "recall_at_1": 0.7142857142857143, + "recall_at_5": 1.0, + "mean_gold_rank": 1.5714285714285714, + "median_gold_margin": 0.02679300308227539, + "distractor_at_top1": 2 + }, + { + "axis": "negation", + "n": 12, + "recall_at_1": 0.8333333333333334, + "recall_at_5": 1.0, + "mean_gold_rank": 1.3333333333333333, + "median_gold_margin": 0.03661489486694336, + "distractor_at_top1": 1 + }, + { + "axis": "control", + "n": 10, + "recall_at_1": 0.0, + "recall_at_5": 0.0, + "mean_gold_rank": null, + "median_gold_margin": null, + "distractor_at_top1": 0 + } + ] + } + ] +} \ No newline at end of file diff --git a/benchmarks/alignbench/runs/query-rewrite.json b/benchmarks/alignbench/runs/query-rewrite.json new file mode 100644 index 0000000..3958fdc --- /dev/null +++ b/benchmarks/alignbench/runs/query-rewrite.json @@ -0,0 +1,990 @@ +{ + "variant": "query-rewrite", + "model": "Xenova/all-MiniLM-L6-v2", + 
"topk": 5, + "wall_seconds": "0.4", + "composite": { + "recall_at_1": 0.7333333333333333, + "recall_at_5": 0.9333333333333333, + "distractor_top1_rate": 0.08333333333333333, + "n": 60 + }, + "false_positive_rate": 0, + "per_axis": [ + { + "axis": "pronoun", + "n": 20, + "pool_size": 55, + "recall_at_1": 0.7, + "recall_at_5": 1, + "mean_gold_rank": 1.35, + "distractor_at_top1": 3, + "false_positive_count": 0, + "median_gold_margin": 0.030952307744915752, + "items": [ + { + "id": "pronoun-001", + "query": "what is my name?", + "effective_query": "what is the user's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.7334653859058853, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.8007663012810273, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-002", + "query": "what's the user's name?", + "effective_query": "what's the user's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 2, + "gold_score": 0.7294812723464028, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.7771254660839599, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-003", + "query": "who am I?", + "effective_query": "who am the user?", + "gold_in_topk": true, + "gold_global_key": "pronoun#0", + "gold_rank": 3, + "gold_score": 0.5786921118974316, + "top1_text": "The user is me.", + "top1_score": 0.7631024695500505, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "pronoun-004", + "query": "what do I do for work?", + "effective_query": "what do the user do for work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 2, + "gold_score": 0.5617654510890236, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.5678380216459898, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-005", + "query": "what is the 
user's job?", + "effective_query": "what is the user's job?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 2, + "gold_score": 0.5754363621337681, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.5767198521038163, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-006", + "query": "where does the user work?", + "effective_query": "where does the user work?", + "gold_in_topk": true, + "gold_global_key": "pronoun#1", + "gold_rank": 1, + "gold_score": 0.5436570101621021, + "top1_text": "The user works as a software engineer at Acme.", + "top1_score": 0.5436570101621021, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-007", + "query": "what is my dog's name?", + "effective_query": "what is the user's dog's name?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.7274535409399159, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 0.7274535409399159, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-008", + "query": "who is Apollo?", + "effective_query": "who is Apollo?", + "gold_in_topk": true, + "gold_global_key": "pronoun#2", + "gold_rank": 1, + "gold_score": 0.7072701654355713, + "top1_text": "The user's dog is named Apollo.", + "top1_score": 0.7072701654355713, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-009", + "query": "where do I live?", + "effective_query": "where do the user live?", + "gold_in_topk": true, + "gold_global_key": "pronoun#3", + "gold_rank": 1, + "gold_score": 0.6020680792400349, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.6020680792400349, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-010", + "query": "what city does the user live in?", + "effective_query": "what city does the user live in?", + "gold_in_topk": true, + "gold_global_key": 
"pronoun#3", + "gold_rank": 1, + "gold_score": 0.6400263303955707, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.6400263303955707, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-011", + "query": "when is my birthday?", + "effective_query": "when is the user's birthday?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.8738209257715547, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.8738209257715547, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-012", + "query": "when was the user born?", + "effective_query": "when was the user born?", + "gold_in_topk": true, + "gold_global_key": "pronoun#4", + "gold_rank": 1, + "gold_score": 0.7251037734927323, + "top1_text": "The user's birthday is March 14.", + "top1_score": 0.7251037734927323, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-013", + "query": "do I have kids?", + "effective_query": "do the user have kids?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.5676560796225887, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 0.5676560796225887, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-014", + "query": "how many children does the user have?", + "effective_query": "how many children does the user have?", + "gold_in_topk": true, + "gold_global_key": "pronoun#5", + "gold_rank": 1, + "gold_score": 0.5766094762746933, + "top1_text": "The user has two children, Maya and Theo.", + "top1_score": 0.5766094762746933, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-015", + "query": "what is my usual coffee order?", + "effective_query": "what is the user's usual coffee order?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 1, + "gold_score": 0.6470547548911678, + 
"top1_text": "The user's favorite coffee order is an oat-milk flat white.", + "top1_score": 0.6470547548911678, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-016", + "query": "what coffee does the user drink?", + "effective_query": "what coffee does the user drink?", + "gold_in_topk": true, + "gold_global_key": "pronoun#6", + "gold_rank": 2, + "gold_score": 0.5813657391451515, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 0.7197601756319, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-017", + "query": "what did I study?", + "effective_query": "what did the user study?", + "gold_in_topk": true, + "gold_global_key": "pronoun#7", + "gold_rank": 1, + "gold_score": 0.6274053315379636, + "top1_text": "The user studied applied mathematics at university.", + "top1_score": 0.6274053315379636, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-018", + "query": "do I have any allergies?", + "effective_query": "do the user have any allergies?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.6119512366841727, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.6119512366841727, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-019", + "query": "what is the user allergic to?", + "effective_query": "what is the user allergic to?", + "gold_in_topk": true, + "gold_global_key": "pronoun#8", + "gold_rank": 1, + "gold_score": 0.7261582549960168, + "top1_text": "The user is allergic to peanuts.", + "top1_score": 0.7261582549960168, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "pronoun-020", + "query": "what kind of car do I drive?", + "effective_query": "what kind of car do the user drive?", + "gold_in_topk": true, + "gold_global_key": "pronoun#9", + "gold_rank": 1, + "gold_score": 0.6161446193769499, + "top1_text": "The 
user drives a 2019 Toyota Corolla.", + "top1_score": 0.6161446193769499, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "temporal", + "n": 14, + "pool_size": 55, + "recall_at_1": 0.5, + "recall_at_5": 0.7142857142857143, + "mean_gold_rank": 5.5, + "distractor_at_top1": 2, + "false_positive_count": 0, + "median_gold_margin": 0.049944619619024966, + "items": [ + { + "id": "temporal-001", + "query": "where does the user live now?", + "effective_query": "where does the user live now?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 3, + "gold_score": 0.550366425409688, + "top1_text": "The user lives in Lisbon.", + "top1_score": 0.5504016420681116, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-002", + "query": "where did the user used to live?", + "effective_query": "where did the user used to live?", + "gold_in_topk": true, + "gold_global_key": "temporal#1", + "gold_rank": 1, + "gold_score": 0.5490901457590983, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.5490901457590983, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-003", + "query": "when did the user move?", + "effective_query": "when did the user move?", + "gold_in_topk": true, + "gold_global_key": "temporal#2", + "gold_rank": 1, + "gold_score": 0.5076513080537272, + "top1_text": "The user moved from Berlin to Lisbon in 2024.", + "top1_score": 0.5076513080537272, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-004", + "query": "where is the user currently based?", + "effective_query": "where is the user currently based?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 11, + "gold_score": 0.42833906054622706, + "top1_text": "The user asked for the user's name.", + "top1_score": 0.4962051712532369, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-005", + 
"query": "what is the user reading?", + "effective_query": "what is the user reading?", + "gold_in_topk": true, + "gold_global_key": "temporal#3", + "gold_rank": 8, + "gold_score": 0.4600491260020972, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 0.5634334413727431, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-006", + "query": "what did the user read last year?", + "effective_query": "what did the user read last year?", + "gold_in_topk": true, + "gold_global_key": "temporal#4", + "gold_rank": 1, + "gold_score": 0.5028289729480597, + "top1_text": "Last year the user read 'Project Hail Mary'.", + "top1_score": 0.5028289729480597, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-007", + "query": "what is the user working on these days?", + "effective_query": "what is the user working on these days?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 4, + "gold_score": 0.4513265913655406, + "top1_text": "The user works at Acme as a software engineer.", + "top1_score": 0.48543171981436045, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-008", + "query": "what did the user finish last month?", + "effective_query": "what did the user finish last month?", + "gold_in_topk": true, + "gold_global_key": "temporal#6", + "gold_rank": 1, + "gold_score": 0.5544356167423142, + "top1_text": "The user finished the Sprint-4 reranker training last month.", + "top1_score": 0.5544356167423142, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-009", + "query": "what LLM does the user prefer?", + "effective_query": "what LLM does the user prefer?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 1, + "gold_score": 0.6226655286181774, + "top1_text": "As of May 2026, the user's preferred LLM is Claude Sonnet 4.6.", + "top1_score": 0.6226655286181774, + 
"top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-010", + "query": "which model did the user use before?", + "effective_query": "which model did the user use before?", + "gold_in_topk": true, + "gold_global_key": "temporal#8", + "gold_rank": 1, + "gold_score": 0.5205286761987613, + "top1_text": "The user used GPT-4 as their primary model in 2024.", + "top1_score": 0.5205286761987613, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-011", + "query": "did the user get a new phone recently?", + "effective_query": "did the user get a new phone recently?", + "gold_in_topk": true, + "gold_global_key": "temporal#9", + "gold_rank": 1, + "gold_score": 0.6113316689987077, + "top1_text": "The user upgraded their phone to an iPhone 17 in March 2026.", + "top1_score": 0.6113316689987077, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-012", + "query": "is the user still in Berlin?", + "effective_query": "is the user still in Berlin?", + "gold_in_topk": true, + "gold_global_key": "temporal#0", + "gold_rank": 5, + "gold_score": 0.4089445689504921, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.7214467722165934, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "temporal-013", + "query": "what is the user up to?", + "effective_query": "what is the user up to?", + "gold_in_topk": true, + "gold_global_key": "temporal#5", + "gold_rank": 19, + "gold_score": 0.2672680179526975, + "top1_text": "The user is asking a question.", + "top1_score": 0.5409891051249414, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": "temporal-014", + "query": "which model is the user on right now?", + "effective_query": "which model is the user on right now?", + "gold_in_topk": true, + "gold_global_key": "temporal#7", + "gold_rank": 20, + "gold_score": 0.31088477125594804, + "top1_text": "The user's name is Alex.", + "top1_score": 
0.40861487066233426, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "specificity", + "n": 14, + "pool_size": 55, + "recall_at_1": 0.8571428571428571, + "recall_at_5": 1, + "mean_gold_rank": 1.3571428571428572, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.14383529034315978, + "items": [ + { + "id": "specificity-001", + "query": "tell me about my dog", + "effective_query": "tell the user about the user's dog", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 4, + "gold_score": 0.4351817244869386, + "top1_text": "The user has a dog named Apollo.", + "top1_score": 0.6044216178749056, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-002", + "query": "what kind of pet do I have?", + "effective_query": "what kind of pet do the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#0", + "gold_rank": 3, + "gold_score": 0.5000743267639344, + "top1_text": "The user has a dog named Apollo.", + "top1_score": 0.594187253255591, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-003", + "query": "do I own a bike?", + "effective_query": "do the user own a bike?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.6923054137966281, + "top1_text": "The user owns a Bianchi road bike.", + "top1_score": 0.6923054137966281, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-004", + "query": "what brand of bike does the user have?", + "effective_query": "what brand of bike does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#1", + "gold_rank": 1, + "gold_score": 0.6329297203833848, + "top1_text": "The user owns a Bianchi road bike.", + "top1_score": 0.6329297203833848, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-005", + "query": "what computer do I 
use?", + "effective_query": "what computer do the user use?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 0.5008318094173164, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.5008318094173164, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-006", + "query": "what laptop does the user have?", + "effective_query": "what laptop does the user have?", + "gold_in_topk": true, + "gold_global_key": "specificity#2", + "gold_rank": 1, + "gold_score": 0.6450468635073591, + "top1_text": "The user's primary laptop is a 16-inch MacBook Pro M4.", + "top1_score": 0.6450468635073591, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-007", + "query": "do I have any musical instruments?", + "effective_query": "do the user have any musical instruments?", + "gold_in_topk": true, + "gold_global_key": "specificity#3", + "gold_rank": 1, + "gold_score": 0.48813840204057846, + "top1_text": "The user has a Yamaha P-125 digital piano in the living room.", + "top1_score": 0.48813840204057846, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-008", + "query": "which note-taking app do I use?", + "effective_query": "which note-taking app do the user use?", + "gold_in_topk": true, + "gold_global_key": "specificity#4", + "gold_rank": 1, + "gold_score": 0.4744915847013103, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.4744915847013103, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-009", + "query": "where do I like to eat in Lisbon?", + "effective_query": "where do the user like to eat in Lisbon?", + "gold_in_topk": true, + "gold_global_key": "specificity#5", + "gold_rank": 1, + "gold_score": 0.7219279253072203, + "top1_text": "The user's favorite restaurant in Lisbon is Cervejaria Ramiro.", + 
"top1_score": 0.7219279253072203, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-010", + "query": "what brand sunglasses does the user wear?", + "effective_query": "what brand sunglasses does the user wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#6", + "gold_rank": 1, + "gold_score": 0.6512271965052092, + "top1_text": "The user wears Smith Lowdown sunglasses.", + "top1_score": 0.6512271965052092, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-011", + "query": "do I read on a Kindle?", + "effective_query": "do the user read on a Kindle?", + "gold_in_topk": true, + "gold_global_key": "specificity#7", + "gold_rank": 1, + "gold_score": 0.7268333697934469, + "top1_text": "The user reads on a Kindle Paperwhite 11th-gen.", + "top1_score": 0.7268333697934469, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-012", + "query": "what espresso machine does the user own?", + "effective_query": "what espresso machine does the user own?", + "gold_in_topk": true, + "gold_global_key": "specificity#8", + "gold_rank": 1, + "gold_score": 0.7366961226077487, + "top1_text": "The user's home espresso machine is a Lelit Bianca v3.", + "top1_score": 0.7366961226077487, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-013", + "query": "what shoes do I wear?", + "effective_query": "what shoes do the user wear?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + "gold_score": 0.44632515589821203, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.44632515589821203, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "specificity-014", + "query": "what brand are the user's everyday shoes?", + "effective_query": "what brand are the user's everyday shoes?", + "gold_in_topk": true, + "gold_global_key": "specificity#9", + "gold_rank": 1, + 
"gold_score": 0.4229939734066537, + "top1_text": "The user wears Allbirds Wool Runners daily.", + "top1_score": 0.4229939734066537, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "negation", + "n": 12, + "pool_size": 55, + "recall_at_1": 0.9166666666666666, + "recall_at_5": 1, + "mean_gold_rank": 1.0833333333333333, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": 0.26762270427014134, + "items": [ + { + "id": "negation-001", + "query": "does the user drink coffee?", + "effective_query": "does the user drink coffee?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 0.7905096599561829, + "top1_text": "The user does not drink coffee. They prefer tea.", + "top1_score": 0.7905096599561829, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-002", + "query": "what does the user drink in the morning?", + "effective_query": "what does the user drink in the morning?", + "gold_in_topk": true, + "gold_global_key": "negation#0", + "gold_rank": 1, + "gold_score": 0.5285974920275435, + "top1_text": "The user does not drink coffee. 
They prefer tea.", + "top1_score": 0.5285974920275435, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-003", + "query": "am I vegetarian?", + "effective_query": "am the user vegetarian?", + "gold_in_topk": true, + "gold_global_key": "negation#1", + "gold_rank": 1, + "gold_score": 0.754238802786161, + "top1_text": "The user is not vegetarian, but avoids red meat.", + "top1_score": 0.754238802786161, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-004", + "query": "is the user on Twitter?", + "effective_query": "is the user on Twitter?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.5874304481012608, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.5874304481012608, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-005", + "query": "which social networks does the user use?", + "effective_query": "which social networks does the user use?", + "gold_in_topk": true, + "gold_global_key": "negation#2", + "gold_rank": 1, + "gold_score": 0.5067702908301415, + "top1_text": "The user does not use Twitter; they use Bluesky and Mastodon.", + "top1_score": 0.5067702908301415, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-006", + "query": "does the user own a car?", + "effective_query": "does the user own a car?", + "gold_in_topk": true, + "gold_global_key": "negation#3", + "gold_rank": 1, + "gold_score": 0.7488288864422115, + "top1_text": "The user does not own a car; they bike or use public transit.", + "top1_score": 0.7488288864422115, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-007", + "query": "is the user active on LinkedIn?", + "effective_query": "is the user active on LinkedIn?", + "gold_in_topk": true, + "gold_global_key": "negation#4", + "gold_rank": 1, + "gold_score": 0.7880239246717338, + 
"top1_text": "The user is not on LinkedIn anymore.", + "top1_score": 0.7880239246717338, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-008", + "query": "any foods the user hates?", + "effective_query": "any foods the user hates?", + "gold_in_topk": true, + "gold_global_key": "negation#5", + "gold_rank": 2, + "gold_score": 0.4222244002602081, + "top1_text": "The user is not vegetarian, but avoids red meat.", + "top1_score": 0.4321470878666278, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-009", + "query": "has the user traveled to Asia?", + "effective_query": "has the user traveled to Asia?", + "gold_in_topk": true, + "gold_global_key": "negation#6", + "gold_rank": 1, + "gold_score": 0.6919597377462416, + "top1_text": "The user has never been to Asia.", + "top1_score": 0.6919597377462416, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-010", + "query": "does the user like horror movies?", + "effective_query": "does the user like horror movies?", + "gold_in_topk": true, + "gold_global_key": "negation#7", + "gold_rank": 1, + "gold_score": 0.7303882056719367, + "top1_text": "The user does not enjoy horror movies.", + "top1_score": 0.7303882056719367, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-011", + "query": "can the user eat shrimp?", + "effective_query": "can the user eat shrimp?", + "gold_in_topk": true, + "gold_global_key": "negation#8", + "gold_rank": 1, + "gold_score": 0.7044731236020111, + "top1_text": "The user does not eat shellfish.", + "top1_score": 0.7044731236020111, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "negation-012", + "query": "is the user learning a new language?", + "effective_query": "is the user learning a new language?", + "gold_in_topk": true, + "gold_global_key": "negation#9", + "gold_rank": 1, + "gold_score": 0.6522323599643379, + "top1_text": "The user is 
not currently learning any new languages.", + "top1_score": 0.6522323599643379, + "top1_is_distractor": false, + "false_positive": false + } + ] + }, + { + "axis": "control", + "n": 10, + "pool_size": 55, + "recall_at_1": 0, + "recall_at_5": 0, + "mean_gold_rank": null, + "distractor_at_top1": 0, + "false_positive_count": 0, + "median_gold_margin": null, + "items": [ + { + "id": "control-001", + "query": "what is the airspeed velocity of an unladen swallow?", + "effective_query": "what is the airspeed velocity of an unladen swallow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user does not eat shellfish.", + "top1_score": 0.25082327505676677, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-002", + "query": "who is the current president of France?", + "effective_query": "who is the current president of France?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of January 2026, the user lives in Lisbon.", + "top1_score": 0.2564335064669229, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-003", + "query": "what is the capital of Mongolia?", + "effective_query": "what is the capital of Mongolia?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user has never been to Asia.", + "top1_score": 0.27119226736778723, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-004", + "query": "how does photosynthesis work?", + "effective_query": "how does photosynthesis work?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.13794111637514647, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-005", + "query": 
"translate 'goodnight' to Japanese", + "effective_query": "translate 'goodnight' to Japanese", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.", + "top1_score": 0.1911340870733358, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-006", + "query": "what year did World War II end?", + "effective_query": "what year did World War II end?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "Before 2024, the user lived in Berlin.", + "top1_score": 0.2900090433169317, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-007", + "query": "explain entropy in thermodynamics", + "effective_query": "explain entropy in thermodynamics", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user uses Logseq for personal notes and Notion for work.", + "top1_score": 0.18512903871088907, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-008", + "query": "best way to debug a segfault in C", + "effective_query": "best way to debug a segfault in C", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user is currently working on a memory benchmark project.", + "top1_score": 0.23013625373127655, + "top1_is_distractor": false, + "false_positive": false + }, + { + "id": "control-009", + "query": "what's the weather going to be tomorrow?", + "effective_query": "what's the weather going to be tomorrow?", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "As of May 14, 2026, Apollo is a term mentioned in the conversation.", + "top1_score": 0.18279764320601347, + "top1_is_distractor": true, + "false_positive": false + }, + { + "id": 
"control-010", + "query": "give me a recipe for tiramisu", + "effective_query": "give the user a recipe for tiramisu", + "gold_in_topk": false, + "gold_global_key": null, + "gold_rank": null, + "gold_score": null, + "top1_text": "The user's favorite restaurant is Cervejaria Ramiro.", + "top1_score": 0.35106130753511783, + "top1_is_distractor": false, + "false_positive": false + } + ] + } + ] +} \ No newline at end of file diff --git a/src/memory/__tests__/atomicmemory-provider.test.ts b/src/memory/__tests__/atomicmemory-provider.test.ts index 72332eb..96dd30a 100644 --- a/src/memory/__tests__/atomicmemory-provider.test.ts +++ b/src/memory/__tests__/atomicmemory-provider.test.ts @@ -99,6 +99,33 @@ describe('ingest', () => { const body = JSON.parse(mockFetch.mock.calls[0][1].body); expect(body.conversation).toBe('user: Hi\nassistant: Hello'); }); + + it('maps scope.thread to session_id', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce( + jsonResponse({ + episode_id: 'e3', + facts_extracted: 1, + memories_stored: 1, + memories_updated: 0, + memories_deleted: 0, + memories_skipped: 0, + stored_memory_ids: ['m3'], + updated_memory_ids: [], + links_created: 0, + composites_created: 0, + }) + ); + + await provider.ingest({ + mode: 'text', + content: 'Hello thread', + scope: { user: 'u1', thread: 'thread-1' }, + }); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.session_id).toBe('thread-1'); + }); }); // --------------------------------------------------------------------------- @@ -140,6 +167,58 @@ describe('search', () => { expect(page.results[0].relevance).toBe(0.84); expect(page.results[0].memory.id).toBe('s1'); }); + + it('maps scope.thread to search session_id', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce(jsonResponse({ memories: [], count: 0 })); + + await provider.search({ + query: 'test', + scope: { user: 'u1', thread: 'thread-1' }, + }); + + const body = 
JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.session_id).toBe('thread-1'); + }); + + it('rejects thread-scoped search rows without matching session_id', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce(jsonResponse({ + memories: [{ id: 's1', content: 'wrong thread' }], + count: 1, + })); + + await expect(provider.search({ + query: 'test', + scope: { user: 'u1', thread: 'thread-1' }, + })).rejects.toThrow(/session_id/); + }); + + it('rejects thread-scoped search rows with mismatched session_id', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce(jsonResponse({ + memories: [{ id: 's1', content: 'wrong thread', session_id: 'thread-2' }], + count: 1, + })); + + await expect(provider.search({ + query: 'test', + scope: { user: 'u1', thread: 'thread-1' }, + })).rejects.toThrow(/session_id/); + }); + + it('rejects namespace-scoped search rows with mismatched namespace', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce(jsonResponse({ + memories: [{ id: 's1', content: 'wrong namespace', namespace: 'other' }], + count: 1, + })); + + await expect(provider.search({ + query: 'test', + scope: { user: 'u1', namespace: 'expected' }, + })).rejects.toThrow(/namespace/); + }); }); // --------------------------------------------------------------------------- @@ -205,7 +284,12 @@ describe('list', () => { const provider = createProvider(); mockFetch.mockResolvedValueOnce( jsonResponse({ - memories: [{ id: 'l1', content: 'item' }], + memories: [{ + id: 'l1', + content: 'item', + namespace: 'project-a', + session_id: 'thread-a', + }], count: 1, }) ); @@ -216,6 +300,52 @@ describe('list', () => { expect(url).toBe(`${API_URL}/v1/memories/list?user_id=u1&limit=10&offset=0`); expect(page.memories).toHaveLength(1); expect(page.memories[0].id).toBe('l1'); + expect(page.memories[0].scope).toEqual({ + user: 'u1', + namespace: 'project-a', + thread: 'thread-a', + }); + }); + + 
it('maps scope.thread to list session_id query param', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce(jsonResponse({ memories: [], count: 0 })); + + await provider.list({ + scope: { user: 'u1', thread: 'thread-1' }, + limit: 10, + }); + + const [url] = mockFetch.mock.calls[0]; + expect(url).toBe( + `${API_URL}/v1/memories/list?user_id=u1&limit=10&offset=0&session_id=thread-1`, + ); + }); + + it('rejects thread-scoped list rows without matching session_id', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce(jsonResponse({ + memories: [{ id: 'l1', content: 'missing session' }], + count: 1, + })); + + await expect(provider.list({ + scope: { user: 'u1', thread: 'thread-1' }, + limit: 10, + })).rejects.toThrow(/session_id/); + }); + + it('rejects thread-scoped list rows with mismatched session_id', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce(jsonResponse({ + memories: [{ id: 'l1', content: 'wrong session', session_id: 'thread-2' }], + count: 1, + })); + + await expect(provider.list({ + scope: { user: 'u1', thread: 'thread-1' }, + limit: 10, + })).rejects.toThrow(/session_id/); }); it('returns cursor when results fill the limit', async () => { @@ -304,6 +434,45 @@ describe('package', () => { expect(pkg.budgetConstrained).toBe(false); }); + it('maps scope.thread to package session_id', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce( + jsonResponse({ + memories: [], + injection_text: '', + estimated_context_tokens: 0, + budget_constrained: false, + }) + ); + + await provider.package({ + query: 'what did I say', + scope: { user: 'u1', thread: 'thread-1' }, + }); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.session_id).toBe('thread-1'); + }); + + it('rejects thread-scoped package rows without matching session_id', async () => { + const provider = createProvider(); + 
mockFetch.mockResolvedValueOnce( + jsonResponse({ + memories: [{ id: 'p1', content: 'wrong thread', score: 0.9 }], + injection_text: 'wrong thread', + estimated_context_tokens: 2, + budget_constrained: false, + }) + ); + + await expect( + provider.package({ + query: 'what did I say', + scope: { user: 'u1', thread: 'thread-1' }, + }) + ).rejects.toThrow(/session_id/); + }); + it('propagates budget_constrained=true from the backend', async () => { const provider = createProvider(); mockFetch.mockResolvedValueOnce( @@ -349,6 +518,27 @@ describe('package', () => { }); }); +// --------------------------------------------------------------------------- +// searchAsOf() — TemporalSearch +// --------------------------------------------------------------------------- + +describe('searchAsOf', () => { + it('maps scope.thread to temporal search session_id', async () => { + const provider = createProvider(); + mockFetch.mockResolvedValueOnce(jsonResponse({ memories: [] })); + + await provider.searchAsOf({ + query: 'what did I say', + scope: { user: 'u1', thread: 'thread-1' }, + asOf: new Date('2026-05-16T12:00:00.000Z'), + }); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.session_id).toBe('thread-1'); + expect(body.as_of).toBe('2026-05-16T12:00:00.000Z'); + }); +}); + // --------------------------------------------------------------------------- // Scope validation // --------------------------------------------------------------------------- diff --git a/src/memory/__tests__/meta-fact-filter.test.ts b/src/memory/__tests__/meta-fact-filter.test.ts new file mode 100644 index 0000000..4d2fd54 --- /dev/null +++ b/src/memory/__tests__/meta-fact-filter.test.ts @@ -0,0 +1,210 @@ +/** + * @file MetaFactFilter unit tests + * + * Covers the three public surfaces of meta-fact-filter: + * - DEFAULT_META_FACT_PATTERNS / isMetaFact: pattern matching + * - resolveMetaFactPatterns: replace vs extend modes + * - filterMetaFacts: end-to-end drop with onDrop 
telemetry + * + * Item shapes are deliberately the same the SDK uses (SearchResult.memory.content) + * to keep the integration risk on the call-site low. + */ + +import { describe, it, expect, vi } from 'vitest'; +import { + DEFAULT_META_FACT_PATTERNS, + filterMetaFacts, + isMetaFact, + resolveMetaFactPatterns, + type MetaFactFilterConfig, +} from '../meta-fact-filter'; + +describe('isMetaFact', () => { + it.each([ + "The user asked for the user's name.", + "The user is asking a question.", + 'The user is me.', + 'The user requested information.', + 'The user said something.', + 'As of May 14, 2026, Apollo is a term mentioned in the conversation.', + 'As of January 2026, the user is a term mentioned in the conversation.', + 'A name was mentioned in the conversation.', + 'The conversation involves the user.', + 'The user has started a conversation.', + ])('matches the partner-demo meta-fact shape: "%s"', (content) => { + expect(isMetaFact(content)).toBe(true); + }); + + it.each([ + "User's name is SgtPooki", + 'The user lives in Lisbon.', + "The user's dog is named Apollo.", + 'The user prefers oat-milk flat whites.', + 'As of January 2026, the user lives in Lisbon.', // temporal anchor on a real fact, not a meta-fact + ])('does not match a durable user fact: "%s"', (content) => { + expect(isMetaFact(content)).toBe(false); + }); + + it('is case-insensitive on the leading "The user"', () => { + expect(isMetaFact('THE USER ASKED FOR THE USER\'S NAME.')).toBe(true); + expect(isMetaFact('the user is me.')).toBe(true); + }); + + it.each([null, undefined, 42, {}, [], ''])( + 'returns false on non-string / empty input (%s)', + (input) => { + expect(isMetaFact(input as unknown)).toBe(false); + }, + ); + + it('uses caller-supplied patterns instead of defaults when provided', () => { + const custom = [/^transcript: /i]; + expect(isMetaFact('transcript: hello', custom)).toBe(true); + // The default rules would NOT match this; with custom rules, it does. 
+ expect(isMetaFact("The user is me.", custom)).toBe(false); + }); +}); + +describe('resolveMetaFactPatterns', () => { + it('returns the default set when patterns is omitted', () => { + const config: MetaFactFilterConfig = { enabled: true }; + expect(resolveMetaFactPatterns(config)).toBe(DEFAULT_META_FACT_PATTERNS); + }); + + it("'replace' mode (default) returns only the caller's patterns", () => { + const config: MetaFactFilterConfig = { + enabled: true, + patterns: [/^foo$/], + }; + const resolved = resolveMetaFactPatterns(config); + expect(resolved).toHaveLength(1); + expect(resolved[0]).toEqual(/^foo$/); + }); + + it("'extend' mode unions caller patterns with defaults", () => { + const config: MetaFactFilterConfig = { + enabled: true, + patterns: [/^foo$/], + mode: 'extend', + }; + const resolved = resolveMetaFactPatterns(config); + expect(resolved.length).toBe(DEFAULT_META_FACT_PATTERNS.length + 1); + expect(resolved[0]).toEqual(/^foo$/); + }); +}); + +describe('filterMetaFacts', () => { + interface FakeResult { + memory: { content: string }; + score: number; + } + const items: FakeResult[] = [ + { memory: { content: "User's name is SgtPooki" }, score: 0.51 }, + { memory: { content: "The user asked for the user's name." }, score: 0.40 }, + { memory: { content: 'The user is me.' }, score: 0.35 }, + { memory: { content: 'The user lives in Lisbon.' 
}, score: 0.32 }, + ]; + + it('is a no-op when filter is disabled', () => { + const out = filterMetaFacts(items, (r) => r.memory.content, { + enabled: false, + }); + expect(out).toEqual(items); + expect(out).not.toBe(items); // returns a copy + }); + + it('drops items whose content matches the default patterns', () => { + const out = filterMetaFacts(items, (r) => r.memory.content, { + enabled: true, + }); + expect(out).toHaveLength(2); + expect(out.map((r) => r.memory.content)).toEqual([ + "User's name is SgtPooki", + 'The user lives in Lisbon.', + ]); + }); + + it('preserves original order of kept items', () => { + const ordered: FakeResult[] = [ + { memory: { content: 'real-1' }, score: 1 }, + { memory: { content: 'The user is me.' }, score: 0.9 }, + { memory: { content: 'real-2' }, score: 0.8 }, + { memory: { content: 'The user asked for the user\'s name.' }, score: 0.7 }, + { memory: { content: 'real-3' }, score: 0.6 }, + ]; + const out = filterMetaFacts(ordered, (r) => r.memory.content, { + enabled: true, + }); + expect(out.map((r) => r.memory.content)).toEqual(['real-1', 'real-2', 'real-3']); + }); + + it('invokes onDrop once per dropped item with pattern index', () => { + const dropped: Array<{ content: string; index: number }> = []; + filterMetaFacts(items, (r) => r.memory.content, { + enabled: true, + onDrop: (content, index) => dropped.push({ content, index }), + }); + expect(dropped).toHaveLength(2); + expect(dropped[0].content).toBe("The user asked for the user's name."); + expect(dropped[1].content).toBe('The user is me.'); + // Both match pattern index 0 (the first DEFAULT pattern) — which is the + // catch-all "The user (asked|requested|said|is asking|is me)" rule. 
+ expect(dropped[0].index).toBe(0); + expect(dropped[1].index).toBe(0); + }); + + it('swallows onDrop exceptions so filtering never breaks recall', () => { + const out = filterMetaFacts(items, (r) => r.memory.content, { + enabled: true, + onDrop: () => { + throw new Error('telemetry blew up'); + }, + }); + expect(out).toHaveLength(2); + }); + + it('honours custom patterns in replace mode', () => { + const out = filterMetaFacts(items, (r) => r.memory.content, { + enabled: true, + patterns: [/^User's name/], + }); + // Custom pattern drops "User's name is SgtPooki" but lets meta-facts through. + expect(out.map((r) => r.memory.content)).toEqual([ + "The user asked for the user's name.", + 'The user is me.', + 'The user lives in Lisbon.', + ]); + }); + + it('honours custom patterns in extend mode (union with defaults)', () => { + const out = filterMetaFacts(items, (r) => r.memory.content, { + enabled: true, + patterns: [/^User's name/], + mode: 'extend', + }); + // Both the custom rule AND the defaults fire. + expect(out.map((r) => r.memory.content)).toEqual([ + 'The user lives in Lisbon.', + ]); + }); + + it('handles non-string content gracefully without dropping the item', () => { + const weird = [ + ...items, + { memory: { content: null as unknown as string }, score: 0.1 }, + ]; + const out = filterMetaFacts(weird, (r) => r.memory.content, { + enabled: true, + }); + // Real facts + the null-content item survive; meta-facts dropped. 
+ expect(out).toHaveLength(3); + }); + + it('returns the original list when the resolved pattern set is empty', () => { + const out = filterMetaFacts(items, (r) => r.memory.content, { + enabled: true, + patterns: [], + }); + expect(out).toEqual(items); + }); +}); diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/list.mapped.json b/src/memory/atomicmemory-provider/__tests__/fixtures/list.mapped.json index 915a0f9..d0f4fe7 100644 --- a/src/memory/atomicmemory-provider/__tests__/fixtures/list.mapped.json +++ b/src/memory/atomicmemory-provider/__tests__/fixtures/list.mapped.json @@ -3,7 +3,8 @@ "id": "FIXTURE-MEM-2", "content": "user's library card expires in March 2027.", "scope": { - "user": "fixture-capture" + "user": "fixture-capture", + "namespace": "site/fixture/quick/ingest" }, "createdAt": "2026-04-24T10:00:00.000Z", "provenance": { @@ -19,7 +20,8 @@ "id": "FIXTURE-MEM-1", "content": "User prefers aisle seats on flights longer than four hours.", "scope": { - "user": "fixture-capture" + "user": "fixture-capture", + "namespace": "site/fixture/full/ingest" }, "createdAt": "2026-04-24T10:00:00.000Z", "provenance": { diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.mapped.json b/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.mapped.json index 482546f..896549c 100644 --- a/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.mapped.json +++ b/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.mapped.json @@ -4,7 +4,8 @@ "id": "FIXTURE-MEM-2", "content": "user's library card expires in March 2027.", "scope": { - "user": "fixture-capture" + "user": "fixture-capture", + "thread": "fixture-fast-thread-quick" }, "createdAt": "2026-04-24T10:00:00.000Z", "provenance": { @@ -24,7 +25,8 @@ "id": "FIXTURE-MEM-1", "content": "User prefers aisle seats on flights longer than four hours.", "scope": { - "user": "fixture-capture" + "user": "fixture-capture", + "thread": "fixture-fast-thread-full" }, 
"createdAt": "2026-04-24T10:00:00.000Z", "provenance": { diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.raw.json b/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.raw.json index 4af990c..ca6fe3b 100644 --- a/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.raw.json +++ b/src/memory/atomicmemory-provider/__tests__/fixtures/search-fast.raw.json @@ -16,6 +16,7 @@ "relevance": 0.14691642279927353, "importance": 0.6, "source_site": "fixture-quick-ingest", + "session_id": "fixture-fast-thread-quick", "created_at": "2026-04-24T10:00:00.000Z" }, { @@ -28,6 +29,7 @@ "relevance": 0.6055988308430426, "importance": 0.6, "source_site": "fixture-full-ingest", + "session_id": "fixture-fast-thread-full", "created_at": "2026-04-24T10:00:00.000Z" } ], diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/search.mapped.json b/src/memory/atomicmemory-provider/__tests__/fixtures/search.mapped.json index 6763ec7..f17f6a5 100644 --- a/src/memory/atomicmemory-provider/__tests__/fixtures/search.mapped.json +++ b/src/memory/atomicmemory-provider/__tests__/fixtures/search.mapped.json @@ -4,7 +4,8 @@ "id": "FIXTURE-MEM-2", "content": "user's library card expires in March 2027.", "scope": { - "user": "fixture-capture" + "user": "fixture-capture", + "thread": "fixture-thread-quick" }, "createdAt": "2026-04-24T10:00:00.000Z", "provenance": { @@ -24,7 +25,8 @@ "id": "FIXTURE-MEM-1", "content": "User prefers aisle seats on flights longer than four hours.", "scope": { - "user": "fixture-capture" + "user": "fixture-capture", + "thread": "fixture-thread-full" }, "createdAt": "2026-04-24T10:00:00.000Z", "provenance": { diff --git a/src/memory/atomicmemory-provider/__tests__/fixtures/search.raw.json b/src/memory/atomicmemory-provider/__tests__/fixtures/search.raw.json index 4614899..e025027 100644 --- a/src/memory/atomicmemory-provider/__tests__/fixtures/search.raw.json +++ 
b/src/memory/atomicmemory-provider/__tests__/fixtures/search.raw.json @@ -16,6 +16,7 @@ "relevance": 0.14691642279927353, "importance": 0.6, "source_site": "fixture-quick-ingest", + "session_id": "fixture-thread-quick", "created_at": "2026-04-24T10:00:00.000Z" }, { @@ -28,6 +29,7 @@ "relevance": 0.6055988308430426, "importance": 0.6, "source_site": "fixture-full-ingest", + "session_id": "fixture-thread-full", "created_at": "2026-04-24T10:00:00.000Z" } ], diff --git a/src/memory/atomicmemory-provider/__tests__/namespace-base-routes.test.ts b/src/memory/atomicmemory-provider/__tests__/namespace-base-routes.test.ts index 33d6941..14afa45 100644 --- a/src/memory/atomicmemory-provider/__tests__/namespace-base-routes.test.ts +++ b/src/memory/atomicmemory-provider/__tests__/namespace-base-routes.test.ts @@ -127,6 +127,20 @@ describe('atomicmemory.ingestFull', () => { expect(body.agent_scope).toBeUndefined(); }); + it('forwards thread scope as session_id on ingest', async () => { + mockFetch.mockResolvedValueOnce( + jsonResponse({ episode_id:'e1', facts_extracted:0, memories_stored:0, memories_updated:0, memories_deleted:0, memories_skipped:0, stored_memory_ids: [], updated_memory_ids: [], links_created:0, composites_created:0 }), + ); + const handle = createHandle(); + + await handle.ingestFull( + { conversation: 'x', sourceSite: 's' }, + { ...USER_SCOPE, thread: 'thread-1' }, + ); + + expect(capturedCall(mockFetch).body?.session_id).toBe('thread-1'); + }); + it('forwards visibility on workspace scope', async () => { mockFetch.mockResolvedValueOnce( jsonResponse({ episode_id:'e1', facts_extracted:0, memories_stored:0, memories_updated:0, memories_deleted:0, memories_skipped:0, stored_memory_ids: [], updated_memory_ids: [], links_created:0, composites_created:0 }), @@ -232,6 +246,45 @@ describe('atomicmemory.search', () => { expect(result.citations).toEqual(['m1', 'm2']); expect(result.observability).toBeDefined(); }); + + it('forwards thread scope and maps returned 
session_id', async () => { + mockFetch.mockResolvedValueOnce( + jsonResponse({ + count: 1, + retrieval_mode: 'flat', + memories: [{ id: 'm1', content: 'a', session_id: 'thread-1' }], + }), + ); + + const handle = createHandle(); + const result = await handle.search( + { query: 'q' }, + { ...USER_SCOPE, thread: 'thread-1' }, + ); + + const call = capturedCall(mockFetch); + expect(call.body?.session_id).toBe('thread-1'); + expect(result.results[0].memory.scope).toEqual({ + ...USER_SCOPE, + thread: 'thread-1', + }); + }); + + it('rejects thread-scoped rows without matching session_id', async () => { + mockFetch.mockResolvedValueOnce( + jsonResponse({ + count: 1, + retrieval_mode: 'flat', + memories: [{ id: 'm1', content: 'a' }], + }), + ); + + const handle = createHandle(); + await expect( + handle.search({ query: 'q' }, { ...USER_SCOPE, thread: 'thread-1' }), + ).rejects.toThrow(/session_id/); + }); + }); describe('atomicmemory.searchFast', () => { @@ -299,6 +352,33 @@ describe('atomicmemory.list', () => { expect(call.url).toContain('workspace_id=ws1'); expect(call.url).toContain('agent_id=a1'); }); + + it('forwards thread scope and maps returned session_id', async () => { + mockFetch.mockResolvedValueOnce( + jsonResponse({ + memories: [{ id: 'm1', content: 'a', session_id: 'thread-1' }], + count: 1, + }), + ); + const handle = createHandle(); + const page = await handle.list({ ...WORKSPACE_SCOPE, thread: 'thread-1' }); + const call = capturedCall(mockFetch); + expect(call.url).toContain('session_id=thread-1'); + expect(page.memories[0].scope).toEqual({ + ...WORKSPACE_SCOPE, + thread: 'thread-1', + }); + }); + + it('rejects thread-scoped list rows without matching session_id', async () => { + mockFetch.mockResolvedValueOnce( + jsonResponse({ memories: [{ id: 'm1', content: 'a' }], count: 1 }), + ); + const handle = createHandle(); + await expect( + handle.list({ ...USER_SCOPE, thread: 'thread-1' }), + ).rejects.toThrow(/session_id/); + }); }); 
describe('atomicmemory.get', () => { diff --git a/src/memory/atomicmemory-provider/atomicmemory-provider.ts b/src/memory/atomicmemory-provider/atomicmemory-provider.ts index 7ebf1ab..79ce10b 100644 --- a/src/memory/atomicmemory-provider/atomicmemory-provider.ts +++ b/src/memory/atomicmemory-provider/atomicmemory-provider.ts @@ -48,6 +48,10 @@ import { } from './mappers'; import type { AtomicMemoryHandle } from './handle'; import { createAtomicMemoryHandle } from './handle-impl'; +import { + filterMetaFacts, + type MetaFactFilterConfig, +} from '../meta-fact-filter'; export class AtomicMemoryProvider extends BaseMemoryProvider @@ -60,6 +64,12 @@ export class AtomicMemoryProvider * Empty string disables prefixing (legacy deployments only). */ private readonly apiPrefix: string; + /** + * Opt-in post-retrieval meta-fact filter. `undefined` (default) means + * filtering is off. See `MetaFactFilterConfig` and + * `benchmarks/alignbench/RESULTS.md` for motivation. + */ + private readonly metaFactFilter?: MetaFactFilterConfig; constructor(config: AtomicMemoryProviderConfig) { super(); @@ -71,6 +81,24 @@ export class AtomicMemoryProvider this.apiPrefix = normalizeApiVersion( config.apiVersion ?? DEFAULT_API_VERSION, ); + this.metaFactFilter = config.metaFactFilter; + } + + /** + * Drop meta-fact entries from a SearchResult list when the filter is enabled. + * + * Called once per search-style endpoint (regular search, temporal search, + * package) so meta-facts never reach the caller. No-op when + * `this.metaFactFilter` is `undefined` or `enabled: false` — matches the + * pre-filter behaviour byte-for-byte. + */ + private applyMetaFactFilter(results: SearchResult[]): SearchResult[] { + if (!this.metaFactFilter || !this.metaFactFilter.enabled) return results; + return filterMetaFacts( + results, + (r) => r.memory.content, + this.metaFactFilter, + ); } /** Prepend the configured API-version prefix to a route path. 
*/ @@ -91,6 +119,7 @@ export class AtomicMemoryProvider source_site: input.provenance?.source ?? 'sdk', source_url: input.provenance?.sourceUrl ?? '', }; + if (input.scope.thread) body.session_id = input.scope.thread; if (isVerbatim) body.skip_extraction = true; // Forward caller-supplied metadata to the wire ONLY on the // verbatim path. Core honors `metadata` only on @@ -140,6 +169,7 @@ export class AtomicMemoryProvider limit: request.limit, threshold: request.threshold, namespace_scope: request.scope.namespace, + session_id: request.scope.thread, }; const raw = await fetchJson<{ memories: any[]; count: number }>( @@ -149,8 +179,8 @@ export class AtomicMemoryProvider ); return { - results: raw.memories.map((m: any) => - toSearchResult(m, request.scope) + results: this.applyMetaFactFilter( + raw.memories.map((m: any) => toSearchResult(m, request.scope)), ), }; } @@ -185,7 +215,7 @@ export class AtomicMemoryProvider count: number; }>( this.http, - this.route(`/memories/list?user_id=${encodeURIComponent(request.scope.user ?? 
'')}&limit=${limit}&offset=${offset}`) + this.route(buildListPath(request.scope, limit, offset)) ); const nextOffset = offset + raw.memories.length; @@ -294,6 +324,7 @@ export class AtomicMemoryProvider limit: request.limit, threshold: request.threshold, namespace_scope: request.scope.namespace, + session_id: request.scope.thread, retrieval_mode: mapPackageFormat(request.format), token_budget: request.tokenBudget, skip_repair: true, @@ -315,8 +346,8 @@ export class AtomicMemoryProvider ); } - const results: SearchResult[] = raw.memories.map((m: any) => - toSearchResult(m, request.scope) + const results: SearchResult[] = this.applyMetaFactFilter( + raw.memories.map((m: any) => toSearchResult(m, request.scope)), ); return { @@ -342,6 +373,7 @@ export class AtomicMemoryProvider threshold: request.threshold, as_of: request.asOf.toISOString(), namespace_scope: request.scope.namespace, + session_id: request.scope.thread, }; const raw = await fetchJson<{ @@ -352,8 +384,8 @@ export class AtomicMemoryProvider }); return { - results: raw.memories.map((m: any) => - toSearchResult(m, request.scope) + results: this.applyMetaFactFilter( + raw.memories.map((m: any) => toSearchResult(m, request.scope)), ), }; } @@ -396,6 +428,16 @@ export class AtomicMemoryProvider // Helpers // --------------------------------------------------------------------------- +function buildListPath(scope: Scope, limit: number, offset: number): string { + const params = new URLSearchParams({ + user_id: scope.user ?? 
'', + limit: String(limit), + offset: String(offset), + }); + if (scope.thread) params.set('session_id', scope.thread); + return `/memories/list?${params.toString()}`; +} + function ingestInputToConversation(input: IngestInput): string { switch (input.mode) { case 'text': diff --git a/src/memory/atomicmemory-provider/handle-impl.ts b/src/memory/atomicmemory-provider/handle-impl.ts index 0651b1b..71e2604 100644 --- a/src/memory/atomicmemory-provider/handle-impl.ts +++ b/src/memory/atomicmemory-provider/handle-impl.ts @@ -72,6 +72,7 @@ import { scopeToFields, scopeToQueryParams, stripAgentScope, + stripReadFilters, } from './scope-mapper'; export function createAtomicMemoryHandle( @@ -114,7 +115,7 @@ export function createAtomicMemoryHandle( ); // Echo back the scope WITHOUT agentScope: core didn't apply that // filter on expand, so returned memories must not claim otherwise. - const echoedScope = stripAgentScope(scope); + const echoedScope = stripReadFilters(scope); return raw.memories.map((m) => toAtomicMemoryMemory(m, echoedScope)); }, async list(scope, options) { @@ -126,7 +127,7 @@ export function createAtomicMemoryHandle( // SDK so the mismatch surfaces at the call site. assertListOptionsScopeCompat(scope, options); - const params = scopeToQueryParams(scope); + const params = scopeToQueryParams(scope, { includeThread: true }); if (options?.limit !== undefined) params.set('limit', String(options.limit)); if (options?.offset !== undefined) params.set('offset', String(options.offset)); if (options?.sourceSite) params.set('source_site', options.sourceSite); @@ -150,19 +151,21 @@ export function createAtomicMemoryHandle( }; }, async get(id, scope) { - // agent_scope deliberately omitted — core's /:id GET drops it. - const params = scopeToQueryParams(scope); + // agent_scope/thread deliberately omitted — core's /:id GET is id-keyed + // and does not apply those read filters. The returned scope reflects the + // persisted row, not the caller's unapplied filter. 
+ const params = scopeToQueryParams(stripReadFilters(scope)); const raw = await fetchJsonOrNull( http, route(`/memories/${encodeURIComponent(id)}?${params.toString()}`), ); if (!raw) return null; - // Echoed scope drops agentScope — see expand() note above. - return toAtomicMemoryMemory(raw, stripAgentScope(scope)); + // Echoed scope drops unapplied filters — see expand() note above. + return toAtomicMemoryMemory(raw, stripReadFilters(scope)); }, async delete(id, scope) { - // agent_scope deliberately omitted — core's /:id DELETE drops it. - const params = scopeToQueryParams(scope); + // agent_scope/thread deliberately omitted — core's /:id DELETE is id-keyed. + const params = scopeToQueryParams(stripReadFilters(scope)); try { await fetchVoid( http, @@ -229,7 +232,7 @@ async function postIngest( assertScopeAllowsVisibility(scope, input.visibility); const body: Record = { - ...scopeToFields(scope), + ...scopeToFields(scope, { includeThread: true }), conversation: input.conversation, source_site: input.sourceSite, source_url: input.sourceUrl ?? '', @@ -290,7 +293,10 @@ async function postSearch( scope: MemoryScope, ): Promise { // agent_scope is honored ONLY on search routes — opt in here. - const scopeFields = scopeToFields(scope, { includeAgentScope: true }); + const scopeFields = scopeToFields(scope, { + includeAgentScope: true, + includeThread: true, + }); const body: Record = { ...scopeFields, query: request.query, @@ -333,6 +339,7 @@ interface RawMemoryResponse { created_at?: string; updated_at?: string; metadata?: Record; + session_id?: string | null; } interface RawSearchResponse { @@ -381,7 +388,7 @@ function toAtomicMemoryMemory( const result: AtomicMemoryMemory = { id: r.id, content: r.content ?? '', - scope, + scope: buildMemoryScope(r, scope), createdAt: r.created_at ? 
new Date(r.created_at) : new Date(), }; if (r.updated_at) result.updatedAt = new Date(r.updated_at); @@ -394,6 +401,26 @@ function toAtomicMemoryMemory( return result; } +function buildMemoryScope( + raw: RawMemoryResponse, + requestedScope: MemoryScope, +): MemoryScope { + if (requestedScope.thread !== undefined) { + if (!raw.session_id) { + throw new Error( + 'atomicmemory-provider: backend response missing required `session_id` for thread-scoped request', + ); + } + if (raw.session_id !== requestedScope.thread) { + throw new Error( + 'atomicmemory-provider: backend response `session_id` did not match requested thread scope', + ); + } + } + if (!raw.session_id) return requestedScope; + return { ...requestedScope, thread: raw.session_id }; +} + function toAtomicMemorySearchResult( raw: RawMemoryResponse, scope: MemoryScope, diff --git a/src/memory/atomicmemory-provider/handle.ts b/src/memory/atomicmemory-provider/handle.ts index 43a9f6d..82b8059 100644 --- a/src/memory/atomicmemory-provider/handle.ts +++ b/src/memory/atomicmemory-provider/handle.ts @@ -49,12 +49,13 @@ export type AgentScope = * at `atomicmemory-core/src/services/memory-service-types.ts:142-144`. */ export type MemoryScope = - | { kind: 'user'; userId: string } + | { kind: 'user'; userId: string; thread?: string } | { kind: 'workspace'; userId: string; workspaceId: string; agentId: string; + thread?: string; agentScope?: AgentScope; }; diff --git a/src/memory/atomicmemory-provider/mappers.ts b/src/memory/atomicmemory-provider/mappers.ts index 00d6c1b..e7d8689 100644 --- a/src/memory/atomicmemory-provider/mappers.ts +++ b/src/memory/atomicmemory-provider/mappers.ts @@ -30,6 +30,8 @@ interface RawMemory { source_url?: string; /** Present on list responses; dropped from search responses today. 
*/ episode_id?: string; + namespace?: string; + session_id?: string | null; created_at?: string; } @@ -72,13 +74,38 @@ export function toMemory(raw: RawMemory, scope: Scope): Memory { return { id: raw.id, content: raw.content, - scope, + scope: buildScope(raw, scope), createdAt: raw.created_at ? new Date(raw.created_at) : new Date(), provenance: buildProvenance(raw), metadata: buildMetadata(raw), }; } +function buildScope(raw: RawMemory, scope: Scope): Scope { + if (scope.namespace !== undefined && raw.namespace && raw.namespace !== scope.namespace) { + throw new Error( + 'atomicmemory-provider: backend response `namespace` did not match requested namespace scope', + ); + } + if (scope.thread !== undefined) { + if (!raw.session_id) { + throw new Error( + 'atomicmemory-provider: backend response missing required `session_id` for thread-scoped request', + ); + } + if (raw.session_id !== scope.thread) { + throw new Error( + 'atomicmemory-provider: backend response `session_id` did not match requested thread scope', + ); + } + } + return { + ...scope, + ...(raw.namespace ? { namespace: raw.namespace } : {}), + ...(raw.session_id ? { thread: raw.session_id } : {}), + }; +} + /** * Both `source_site` and `source_url` are SDK-side `provenance` * fields. Returns `undefined` when neither is present so we don't diff --git a/src/memory/atomicmemory-provider/scope-mapper.ts b/src/memory/atomicmemory-provider/scope-mapper.ts index 77d11b3..42778ef 100644 --- a/src/memory/atomicmemory-provider/scope-mapper.ts +++ b/src/memory/atomicmemory-provider/scope-mapper.ts @@ -23,6 +23,7 @@ interface ScopeFields { workspace_id?: string; agent_id?: string; agent_scope?: AgentScope; + session_id?: string; } interface ScopeSerializeOptions { @@ -36,6 +37,12 @@ interface ScopeSerializeOptions { * Defaults to `false`. Search route bindings opt in explicitly. */ includeAgentScope?: boolean; + /** + * Emit `session_id` on the wire. Core honors this on ingest, search, and + * list. 
Routes such as get/delete/expand do not filter by session, so they + * must not send or echo it. + */ + includeThread?: boolean; } export function scopeToFields( @@ -43,7 +50,11 @@ export function scopeToFields( options: ScopeSerializeOptions = {}, ): ScopeFields { if (scope.kind === 'user') { - return { user_id: scope.userId }; + const fields: ScopeFields = { user_id: scope.userId }; + if (options.includeThread && scope.thread) { + fields.session_id = scope.thread; + } + return fields; } const fields: ScopeFields = { user_id: scope.userId, @@ -53,6 +64,9 @@ export function scopeToFields( if (options.includeAgentScope && scope.agentScope !== undefined) { fields.agent_scope = scope.agentScope; } + if (options.includeThread && scope.thread) { + fields.session_id = scope.thread; + } return fields; } @@ -83,6 +97,7 @@ export function scopeToQueryParams( params.set('agent_scope', fields.agent_scope); } } + if (fields.session_id) params.set('session_id', fields.session_id); return params; } @@ -107,7 +122,7 @@ export function assertScopeAllowsVisibility( /** * Strip `agentScope` from a `MemoryScope` for routes that do NOT honor - * agent_scope on the backend (expand / list / get / delete). Used to + * agent_scope on the backend. Used to * echo scope back on returned memories honestly — so a caller who * passed `{ agentScope: 'self' }` does not receive memories whose * `.scope.agentScope` field implies the filter was applied when it @@ -122,6 +137,24 @@ export function stripAgentScope(scope: MemoryScope): MemoryScope { userId: scope.userId, workspaceId: scope.workspaceId, agentId: scope.agentId, + ...(scope.thread !== undefined ? { thread: scope.thread } : {}), }; return stripped; } + +/** + * Strip filters that the target route did not apply before echoing scope onto + * returned memories. Search/list can preserve thread because Core applies the + * filter and projects `session_id`; expand/get/delete cannot. 
+ */ +export function stripReadFilters(scope: MemoryScope): MemoryScope { + if (scope.kind === 'user') { + return { kind: 'user', userId: scope.userId }; + } + return { + kind: 'workspace', + userId: scope.userId, + workspaceId: scope.workspaceId, + agentId: scope.agentId, + }; +} diff --git a/src/memory/atomicmemory-provider/types.ts b/src/memory/atomicmemory-provider/types.ts index c31a2ae..2e66254 100644 --- a/src/memory/atomicmemory-provider/types.ts +++ b/src/memory/atomicmemory-provider/types.ts @@ -2,6 +2,8 @@ * @file AtomicMemory Provider Configuration */ +import type { MetaFactFilterConfig } from '../meta-fact-filter'; + export interface AtomicMemoryProviderConfig { /** Base URL of the atomicmemory-core instance, e.g. `http://localhost:3050`. */ apiUrl: string; @@ -23,6 +25,22 @@ export interface AtomicMemoryProviderConfig { * that never versioned their mount). */ apiVersion?: string; + /** + * Opt-in post-retrieval filter that drops extraction-style meta-facts + * (e.g. "The user asked for the user's name.", "As of , X is a term + * mentioned in the conversation.") before they reach the caller. + * + * Empirically motivated by `benchmarks/alignbench/RESULTS.md`: meta-facts + * are the dominant cause of partner-visible recall failures, outranking + * real user facts at thin cosine margins. Filtering them post-hoc gives + * cleaner search results today while a durable upstream extraction-prompt + * fix rolls out in core. + * + * When omitted, the filter is OFF and behaviour is unchanged. Set + * `{ enabled: true }` to activate with the built-in pattern set, or pass + * additional `patterns` / `mode` per `MetaFactFilterConfig`. + */ + metaFactFilter?: MetaFactFilterConfig; } /** Default timeout for AtomicMemory provider HTTP requests (ms). 
*/ diff --git a/src/memory/index.ts b/src/memory/index.ts index c494f85..effc382 100644 --- a/src/memory/index.ts +++ b/src/memory/index.ts @@ -14,3 +14,10 @@ export * from './registration'; export * from './atomicmemory-provider'; export * from './mem0-provider'; export * from './hindsight-provider'; +export { + filterMetaFacts, + isMetaFact, + resolveMetaFactPatterns, + DEFAULT_META_FACT_PATTERNS, + type MetaFactFilterConfig, +} from './meta-fact-filter'; diff --git a/src/memory/meta-fact-filter.ts b/src/memory/meta-fact-filter.ts new file mode 100644 index 0000000..b62948e --- /dev/null +++ b/src/memory/meta-fact-filter.ts @@ -0,0 +1,164 @@ +/** + * @file MetaFactFilter + * + * Post-retrieval filter that drops "meta-facts" — extraction artifacts that + * describe the conversation itself rather than recording a durable fact about + * the user. + * + * Empirically motivated by AlignBench v0 (benchmarks/alignbench/RESULTS.md): + * when extraction-style meta-facts ("The user asked for the user's name.", + * "As of , X is a term mentioned in the conversation.") sit in the + * recall pool alongside real user facts, they often outrank the real fact + * for pronoun and temporal queries — at thin cosine margins (~0.05). The + * pre-registered "fix the query side" hypothesis was falsified; the dominant + * fixable lift came from removing meta-facts from the pool. + * + * Long-term, core should not emit these facts at extraction time. This + * SDK-side filter is the safety net so apps consuming the SDK today see + * cleaner recall results without waiting on a core release. + * + * Default patterns target the verbatim shapes observed in the partner demo + * (atomicmem.filecoin.cloud). Apps can extend or replace them via + * `MetaFactFilterConfig.patterns`. 
/**
 * Design constraints for the meta-fact filter:
 *   - pure (no I/O, no LLM calls — deterministic regex application);
 *   - opt-in (a no-op unless `MetaFactFilterConfig.enabled` is `true`);
 *   - the built-in patterns are case-insensitive; caller-supplied patterns
 *     keep whatever flags they were created with;
 *   - additive on request (`mode: 'extend'` applies caller patterns *and*
 *     the built-in defaults).
 */

/**
 * Built-in patterns observed in real partner demos. Each is a case-insensitive
 * regex matched against the memory's content. A match drops the memory from
 * the result set.
 *
 * The patterns cover three meta-fact families:
 *   1. "The user asked/requested/said …" — statements about what the user did
 *      in the conversation, not about the user.
 *   2. "As of …, X is a term mentioned in the conversation." — vacuous
 *      acknowledgements of vocabulary, not durable facts.
 *   3. "A name was mentioned." / "The conversation involves the user." /
 *      "The user has started a conversation" — observations about the chat
 *      session itself.
 *
 * The array is frozen (shallowly): the list cannot be mutated, but the
 * individual `RegExp` objects are ordinary regexes.
 */
export const DEFAULT_META_FACT_PATTERNS: readonly RegExp[] = Object.freeze([
  /^\s*the user (asked|requested|said|is asking|is me)\b/i,
  /^\s*as of [^,]+,\s+.+\s+is a term mentioned in the conversation\.?$/i,
  /^\s*a name was mentioned\b/i,
  /^\s*the conversation involves the user\b/i,
  /^\s*the user has started a conversation\b/i,
]);

/** Configuration accepted by {@link filterMetaFacts} and {@link resolveMetaFactPatterns}. */
export interface MetaFactFilterConfig {
  /**
   * Master switch. When `false`, `filterMetaFacts` returns a shallow copy of
   * its input and applies no patterns.
   *
   * Apps explicitly opt in by setting `true`. This module never reads
   * environment variables, so behaviour is the same on every runtime.
   */
  enabled: boolean;

  /**
   * Patterns to match against the content string. When omitted, the built-in
   * `DEFAULT_META_FACT_PATTERNS` are used.
   *
   * With `mode: 'replace'` (the default) only these patterns are applied.
   * With `mode: 'extend'` these patterns are applied first, followed by the
   * built-in defaults.
   *
   * Patterns carrying the `g` or `y` flag are supported: matching always
   * starts from the beginning of the content, so their `lastIndex` is reset
   * before each test.
   */
  patterns?: readonly RegExp[];

  /**
   * How `patterns` interacts with `DEFAULT_META_FACT_PATTERNS`. Defaults to
   * `'replace'` (the provided list fully replaces the defaults). `'extend'`
   * is the union. Ignored when `patterns` is omitted.
   */
  mode?: 'replace' | 'extend';

  /**
   * Optional callback invoked once per dropped item. Receives the matched
   * content and the index of the first matching pattern within the list
   * returned by `resolveMetaFactPatterns` (in `'extend'` mode the caller's
   * patterns come first, then the defaults).
   *
   * Exceptions thrown by `onDrop` are swallowed so they cannot break recall.
   */
  onDrop?: (content: string, patternIndex: number) => void;
}

/**
 * Resolve the effective pattern list for a config.
 *
 * Does not look at `config.enabled`; it only combines `patterns` and `mode`.
 *
 * @param config - Filter configuration.
 * @returns The defaults when `patterns` is omitted; `[...patterns, ...defaults]`
 *   in `'extend'` mode; otherwise `patterns` itself (same array reference).
 */
export function resolveMetaFactPatterns(
  config: MetaFactFilterConfig,
): readonly RegExp[] {
  const { patterns, mode } = config;
  if (!patterns) return DEFAULT_META_FACT_PATTERNS;
  return mode === 'extend'
    ? [...patterns, ...DEFAULT_META_FACT_PATTERNS]
    : patterns;
}

/**
 * Index of the first pattern in `patterns` that matches `content`, or `-1`.
 *
 * `RegExp.prototype.test` resumes from `lastIndex` on global/sticky regexes,
 * which would make results depend on whatever was tested previously. The
 * cursor is therefore rewound first — only for `g`/`y` patterns, so that
 * stateless (possibly frozen) regexes are never written to.
 */
function findMatchingPatternIndex(
  content: string,
  patterns: readonly RegExp[],
): number {
  if (content.length === 0) return -1;
  return patterns.findIndex((pattern) => {
    if (pattern.global || pattern.sticky) pattern.lastIndex = 0;
    return pattern.test(content);
  });
}

/**
 * Return `true` when `content` matches any of `patterns`.
 *
 * Non-string and empty-string input returns `false`, so a malformed result
 * cannot crash the filter pipeline.
 *
 * @param content - Candidate memory content (any type tolerated).
 * @param patterns - Patterns to test; defaults to `DEFAULT_META_FACT_PATTERNS`.
 */
export function isMetaFact(
  content: unknown,
  patterns: readonly RegExp[] = DEFAULT_META_FACT_PATTERNS,
): boolean {
  return (
    typeof content === 'string' &&
    findMatchingPatternIndex(content, patterns) >= 0
  );
}

/**
 * Remove every item whose `getContent(item)` matches an active meta-fact
 * pattern.
 *
 * Generic over `T` so the same primitive can filter any result shape. The
 * input array is never mutated; a new array is always returned, including
 * when the filter is disabled or the effective pattern list is empty.
 *
 * @param items - Items to filter, in order.
 * @param getContent - Extracts the text to test; non-string or empty results
 *   are never treated as meta-facts.
 * @param config - Filter configuration; see {@link MetaFactFilterConfig}.
 * @returns The surviving items in their original order.
 */
export function filterMetaFacts<T>(
  items: readonly T[],
  getContent: (item: T) => unknown,
  config: MetaFactFilterConfig,
): T[] {
  if (!config.enabled) return [...items];
  const patterns = resolveMetaFactPatterns(config);
  if (patterns.length === 0) return [...items];

  const kept: T[] = [];
  for (const item of items) {
    const content = getContent(item);
    if (typeof content === 'string') {
      const matchedIndex = findMatchingPatternIndex(content, patterns);
      if (matchedIndex >= 0) {
        if (config.onDrop) {
          try {
            config.onDrop(content, matchedIndex);
          } catch {
            // Swallow — telemetry callbacks must never break recall.
          }
        }
        continue;
      }
    }
    kept.push(item);
  }
  return kept;
}