diff --git a/.agents/skills/codestory-grounding/SKILL.md b/.agents/skills/codestory-grounding/SKILL.md
index dd125fa..5bbaa4c 100644
--- a/.agents/skills/codestory-grounding/SKILL.md
+++ b/.agents/skills/codestory-grounding/SKILL.md
@@ -38,7 +38,13 @@ checkout is only the tool artifact unless the user is editing CodeStory itself.
 - When `packet` reports `sufficient` and `follow_up_commands` is empty, answer
   from the packet; budget truncation alone is not a gap. Preserve supported-claim
   wording and include a compact "Support files" list from `answer.citations` and
-  `sufficiency.avoid_opening`.
+  `sufficiency.avoid_opening`. Do not run ordinary source reads, `rg`, `grep`, or
+  `git show` only to verify packet citations; run more commands only for a named
+  unresolved gap, an edit target, or a user-requested worktree proof.
+- When `packet` reports `partial`, read `sufficiency.follow_up_commands` and run
+  those commands in order. Prefer listed targeted `search --why` commands before
+  escalating to a larger packet budget. As soon as a follow-up packet becomes
+  sufficient, stop exploration and answer from that packet.
 - When `search --why` emits `search_plan`, use its subqueries, anchor groups,
   bridge evidence, next commands, and source-truth checks as the follow-up plan,
   not as final answer prose.
@@ -48,9 +54,10 @@ checkout is only the tool artifact unless the user is editing CodeStory itself.
 - Treat repo-text, semantic suggestions, speculative OpenAPI edges, and
   cross-language framework hits as navigation hints until typed graph evidence,
   snippets, trails, or direct source reads support the claim.
-- If `doctor` reports semantic retrieval as partial, stale, or failed, prefer
-  `search --repo-text on --why`, `symbol`, `trail`, and `snippet` until a full
-  refresh and embedding setup restore healthy retrieval.
+- If `doctor` reports retrieval as partial, stale, stubbed, hash-vector, or
+  failed, treat product retrieval as unavailable until `retrieval_mode=full` is
+  restored. Repo-text output is diagnostic only; do not use it as a substitute
+  for mandatory sidecar evidence.
 
 ## Command Routing
 
@@ -75,7 +82,7 @@ Detailed argument tables, output examples, and usage patterns for each command:
 - [ground](references/ground.md) - Compact codebase context snapshot
 - [doctor](references/doctor.md) - Read-only project/cache/index/retrieval health check
 - [packet](references/packet.md) - Broad task packet with sufficiency contract
-- [search](references/search.md) - Search indexed symbols and repo text
+- [search](references/search.md) - Search mandatory sidecar indexes
 - [context](references/context.md) - Deep evidence packet for a concrete target
 - [symbol](references/symbol.md) - Inspect a symbol's details and relationships
 - [trail](references/trail.md) - Follow a symbol's call/reference graph
@@ -83,7 +90,7 @@ Detailed argument tables, output examples, and usage patterns for each command:
 - [drill](references/drill.md) - Build a repeatable evidence packet for agent-grounding drills
 - [drill-suite](references/drill-suite.md) - Run a manifest-defined cross-repo real-repo agent drill matrix
 - [query](references/query.md) - Structured graph query pipelines
-- [explore](references/explore.md) - Interactive terminal exploration with Markdown/JSON fallback
+- [explore](references/explore.md) - Interactive terminal exploration with Markdown/JSON output
 - [files](references/files.md) - Indexed file inventory and coverage markers
 - [affected](references/affected.md) - Changed-file impact analysis
 - [bookmark](references/bookmark.md) - Save reusable investigation focus nodes
diff --git a/.agents/skills/codestory-grounding/references/doctor.md b/.agents/skills/codestory-grounding/references/doctor.md
index 905c00e..29f5b21 100644
--- a/.agents/skills/codestory-grounding/references/doctor.md
+++ b/.agents/skills/codestory-grounding/references/doctor.md
@@ -21,15 +21,15 @@ Reads project/cache/index/retrieval health without mutating the index. Use it at
 
 | Path | Command | Expected result |
 |------|---------|-----------------|
-| Normal path | `<codestory-cli> doctor --project <target-workspace>` | Reports project root, cache path, indexed stats, retrieval state, managed embedding setup, environment hints, and next commands. |
-| Failure path | If cache or index checks warn, run `index --project <target-workspace> --refresh full`; if managed embeddings are missing, run `setup embeddings --project <target-workspace>`; if semantic reports `semantic partial`, `semantic stale`, or `semantic failed`, rebuild before `context` or continue with `search --repo-text on --why` plus focused `symbol`/`trail`/`snippet`. | Separates missing index, missing managed assets, stale semantic docs, partial semantic docs, and lexical fallback. |
+| Normal path | `<codestory-cli> doctor --project <target-workspace>` | Reports project root, cache path, indexed stats, retrieval state, sidecar embedding setup, environment hints, and next commands. |
+| Failure path | If cache or index checks warn, run `index --project <target-workspace> --refresh full`; if mandatory sidecars are missing or stale, run the setup/index commands surfaced by `doctor`; if semantic reports `semantic partial`, `semantic stale`, or `semantic failed`, rebuild before trusting broad packet/search evidence. | Separates missing index, stale semantic docs, partial semantic docs, and mandatory retrieval setup failures. |
 | Integration edge | Use doctor before `ground`, `search --why`, `explore`, `context`, or `serve`; its next commands are the safe follow-up loop. | Prevents read commands from silently querying the wrong or empty cache. |
 
 ## Notes
 
 - `doctor` does not accept `--refresh`; it is a read-only health surface.
 - The `attention:` block repeats warnings first so agents do not miss semantic partial/stale/failure messages buried in the full check list.
-- Environment rows report retrieval-related variables such as `CODESTORY_EMBED_PROFILE`, `CODESTORY_EMBED_BACKEND`, and `CODESTORY_EMBED_RUNTIME_MODE`.
-- The `managed_embeddings` check distinguishes missing managed ONNX assets, installed assets, disabled/hash mode, and intentionally selected external legacy llama.cpp backend state.
-- Treat `semantic ok` as the only health state suitable for broad repository explanation prompts. Treat `semantic partial`, `semantic stale`, and `semantic failed` as instructions to rebuild or use lexical/repo-text fallback.
+- Environment rows report retrieval-related variables such as `CODESTORY_EMBED_BACKEND`, `CODESTORY_EMBED_LLAMACPP_URL`, and sidecar enablement flags.
+- The embedding checks distinguish product llama.cpp sidecar state from hash, ONNX, disabled, or stale diagnostic states.
+- Treat `semantic ok` plus `retrieval_mode=full` as the health state suitable for broad repository explanation prompts. Treat `semantic partial`, `semantic stale`, `semantic failed`, and non-`full` retrieval modes as instructions to repair setup or rebuild before trusting agent-facing evidence.
 - Prefer JSON for CI or doc-contract checks.
diff --git a/.agents/skills/codestory-grounding/references/drill-suite.md b/.agents/skills/codestory-grounding/references/drill-suite.md
index fad0884..0d82c35 100644
--- a/.agents/skills/codestory-grounding/references/drill-suite.md
+++ b/.agents/skills/codestory-grounding/references/drill-suite.md
@@ -83,6 +83,7 @@ Allowed claim classifications are `correct`, `partial`, `misleading`, and
 | `--output-dir` | path | **required** | Directory for aggregate suite reports and per-case drill artifacts |
 | `--refresh` | enum | `full` | Refresh strategy passed to each per-case drill: `auto`, `full`, `incremental`, `none` |
 | `--format` | enum | `json` | Primary aggregate output format: `json` or `markdown` |
+| `--jobs` | integer | `1` | Read-only workers for `--refresh none`; multiple cases run in parallel, and a single case parallelizes its anchor and bridge checks |
 
 ## Output
 
@@ -98,11 +99,20 @@ retrieval mode, anchor resolution, bridge status, source-truth check counts,
 expected-file recall, source-truth target roles/ranking reasons, bridge
 `evidence_kind`, claim classification counts, and next actions. A case can
 be mechanically healthy but still `degraded` when source-truth verification is
-required, bridge evidence is partial, retrieval is symbolic-only, freshness is
+required, bridge evidence is partial, retrieval needs repair, freshness is
 stale, expected files were missed, or the ledger records partial/materially
 revised claims. A failed case is recorded as `blocked` instead of aborting the
 whole suite, so other manifest cases still produce evidence.
 
+`--jobs` defaults to `1` (serial) and only applies to read-only `--refresh none` loops.
+It leaves refreshing or indexing runs serialized, caps worker count
+automatically, preserves final manifest order in aggregate reports, and writes
+each single-case drill's anchor and bridge artifacts in deterministic report
+order.
+Measure it on the target suite before treating it as a speed-up: multi-case
+manifests can benefit from parallel isolated cases, while single-case anchor
+and bridge checks may be limited by storage and graph traversal contention.
+
 Per-case `drill` runs include the broad question search plus bounded
 supplemental searches for terms such as public pages, home components, Payload
 collections, social feeds, comments, and store crates. Those hits are added as
diff --git a/.agents/skills/codestory-grounding/references/drill.md b/.agents/skills/codestory-grounding/references/drill.md
index 9ca6327..8fcad5f 100644
--- a/.agents/skills/codestory-grounding/references/drill.md
+++ b/.agents/skills/codestory-grounding/references/drill.md
@@ -20,6 +20,7 @@ Runs a deterministic evidence collection pass for a realistic codebase question.
 | `--output-dir` | path | **required** | Directory for the drill report and artifacts; created if missing |
 | `--refresh` | enum | `full` | Refresh strategy: `auto`, `full`, `incremental`, `none` |
 | `--format` | enum | `markdown` | Primary output format: `markdown` or `json` |
+| `--jobs` | integer | `1` | Read-only anchor and bridge evidence workers for `--refresh none`; capped automatically |
 
 ## Output
 
@@ -36,7 +37,7 @@ The report includes:
 - chosen anchor, endpoint files, and source-truth verification targets
 - an `evidence_packet` with typed evidence items, repo-text hints, negative evidence, source locations, confidence, and readiness status
 - an Answer Readiness report with `safe_to_say`, `inferred_claims`, `needs_verification`, `next_commands`, and `source_truth_checks`
-- compact mechanical status, retrieval/freshness status, bridge counts, source-truth file list plus target roles/ranking reasons, and verdict/next action in `drill-summary.json`
+- compact mechanical status, retrieval/freshness status, drill runtime timings, bridge counts, source-truth file list plus target roles/ranking reasons, and verdict/next action in `drill-summary.json`
 - an answer-quality contract requiring a CodeStory-only draft before source reads and source-truth verification afterward
 - a fillable claim-ledger template for source-truth classification, correction counts, and material-revision tracking
 - a verification checklist requiring `correct`, `partial`, `misleading`, or `unsupported` classifications
@@ -49,15 +50,27 @@ The report includes:
 
 # JSON-first run for automation, while still writing Markdown too
 <codestory-cli> drill --project <target-workspace> --refresh none --anchors EntryPoint,Coordinator,BackingStore --output-dir target/drill/entrypoint-flow --format json
+
+# Optional read-only anchor and bridge workers against an already-fresh local index
+<codestory-cli> drill --project <target-workspace> --refresh none --anchors EntryPoint,Coordinator,BackingStore --output-dir target/drill/entrypoint-flow --format json --jobs 4
 ```
 
 ## Interpretation
 
 Use the drill report as the CodeStory-only phase. Draft the architecture answer from those artifacts first, then open only files named or implied by the artifacts and classify each claim against source truth. If the answer changes materially after source reads, record that as a CodeStory or agent-UX finding.
 
-Start with `drill-summary.json` for compact health, retrieval/freshness state, bridge status, bridge `evidence_kind`, source-truth target roles, and the verdict next action, then read `evidence_packet.readiness`. Claims in `safe_to_say` are anchored enough for a draft. Claims in `inferred_claims` or `needs_verification` must stay uncertain until the listed `source_truth_checks` or equivalent source reads confirm them. Repo-text and cross-language framework hits are navigation hints unless supported by typed symbol/trail/snippet evidence or source-truth verification. A `source_truth_only` bridge is deliberately not proof; it means CodeStory found the concrete files to read but no typed graph/framework/data path strong enough to answer without source verification.
+Start with `drill-summary.json` for compact health, retrieval/freshness state, drill runtime timings, bridge status, bridge `evidence_kind`, source-truth target roles, and the verdict next action, then read `evidence_packet.readiness`. Claims in `safe_to_say` are anchored enough for a draft. Claims in `inferred_claims` or `needs_verification` must stay uncertain until the listed `source_truth_checks` or equivalent source reads confirm them. Repo-text and cross-language framework hits are navigation hints unless supported by typed symbol/trail/snippet evidence or source-truth verification. A `source_truth_only` bridge is deliberately not proof; it means CodeStory found the concrete files to read but no typed graph/framework/data path strong enough to answer without source verification.
+
+`mechanical.drill_timings` breaks the evidence-collection runtime into setup, question search, anchor resolution, supplemental search, bridge evidence, and evidence assembly. Per-anchor `timings`, command `duration_ms`, and summary `slowest_command` fields further split anchor work into search, query resolution, consumer-summary, and artifact-command costs. Use these fields to localize slow drills before changing ranking or graph traversal logic; they are diagnostic timings, not answer-quality evidence by themselves.
+
+Consumer summaries inspect direct incoming production consumers for the selected anchor first. Related payload/API/native targets are searched only when the selected anchor has no visible graph consumers, so ordinary drills do not pay broad related-target search costs unless the direct graph evidence is missing.
+
+If `drill-summary.json` reports stale freshness, refresh the index before promoting claims. If retrieval is not full or semantic diagnostics report degraded state, repair sidecars before trusting broad natural-language recall; use symbol, trail, snippet, and source-truth files deliberately while the run is degraded.
 
-If `drill-summary.json` reports stale freshness, refresh the index before promoting claims. If retrieval is symbolic-only or semantic fallback is reported, broad natural-language recall is degraded even when exact anchors resolve; use repo-text, symbol, trail, snippet, and source-truth files deliberately.
+`--jobs` defaults to `1` (serial) and is read-only. Use it only with `--refresh none` after
+the index is fresh, and measure the run: multi-case suites can benefit from
+parallel case execution, while single-case anchor resolution and bridge checks
+may be limited by storage and graph traversal contention on some repos.
 
 The optional `question_search` artifact and any `question_supplemental_searches` are intentionally partial discovery evidence. They can add public page, component, collection, and store files to the source-truth checklist when the broad question points there, but they do not prove the architecture by themselves. Use them to avoid missing verification files, then rely on each anchor's symbol/trail/explore/snippet artifacts and focused source reads before promoting claims.
 
diff --git a/.agents/skills/codestory-grounding/references/index.md b/.agents/skills/codestory-grounding/references/index.md
index e111050..e7f7855 100644
--- a/.agents/skills/codestory-grounding/references/index.md
+++ b/.agents/skills/codestory-grounding/references/index.md
@@ -53,14 +53,14 @@ High-signal environment toggles:
 
 | Variable | Use |
 |----------|-----|
-| `CODESTORY_HYBRID_RETRIEVAL_ENABLED=false` | Disable hybrid retrieval and use symbolic ranking. |
 | `CODESTORY_SEMANTIC_DOC_SCOPE=all` | Include all-symbol semantic docs. Accepted all-symbol aliases are `all`, `full`, `all-symbols`, and `all_symbols`; omitted or other values default to durable symbols. |
-| `CODESTORY_EMBED_BACKEND=onnx` | Use the managed ONNX backend. |
-| `CODESTORY_EMBED_RUNTIME_MODE=hash` | Use deterministic hash embeddings for local smoke checks. |
+| `CODESTORY_EMBED_BACKEND=llamacpp` | Use the mandatory local llama.cpp embedding sidecar. |
+| `CODESTORY_EMBED_LLAMACPP_URL=http://127.0.0.1:8080/v1/embeddings` | Product embedding endpoint for bge-base sidecar vectors. |
 | `CODESTORY_SUMMARY_ENDPOINT=local` | Enable deterministic local summaries with `--summarize`. |
 
-Use other embedding, alias, batch-size, tokenizer, provider, llama.cpp, and
-summary tuning variables only for focused profiling or compatibility work.
+Use other embedding, alias, batch-size, tokenizer, provider, hash, ONNX, and
+summary tuning variables only for focused diagnostics or historical comparisons.
+Agent-facing retrieval requires full sidecar readiness.
 
 ## Output
 
diff --git a/.agents/skills/codestory-grounding/references/search.md b/.agents/skills/codestory-grounding/references/search.md
index f0df736..a94991b 100644
--- a/.agents/skills/codestory-grounding/references/search.md
+++ b/.agents/skills/codestory-grounding/references/search.md
@@ -1,6 +1,9 @@
-# `search` — Search Indexed Symbols and Repo Text
+# `search` — Search Mandatory Sidecar Indexes
 
-Searches the symbol index for matching nodes, optionally augmented with grep-style text hits across the repo. Results are ranked by relevance score and deduplicated.
+Searches the mandatory local sidecar indexes for matching symbols, files,
+semantic candidates, and graph-neighborhood evidence. A product search requires
+`retrieval_mode=full`; stale, stubbed, hash-vector, or missing sidecars are
+fail-closed states.
 
 ## Usage
 
@@ -16,20 +19,23 @@ Searches the symbol index for matching nodes, optionally augmented with grep-sty
 | `--cache-dir` | path | *auto* | Override the cache directory |
 | `--query` | string | **required** | Search term — symbol name or natural-language text |
 | `--limit` | integer | `10` | Maximum results per provenance group, capped at 50 |
-| `--repo-text` | enum | `auto` | Repo text scanning: `auto`, `on`, or `off`. `auto` also scans repo text when indexed hits are weak or no exact concrete anchor matched |
+| `--repo-text` | enum | `auto` | Diagnostic repo-text scanning: `auto`, `on`, or `off`. Repo-text hits are navigation clues and must not replace exact sidecar evidence |
 | `--refresh` | enum | `none` | Refresh strategy: `auto`, `full`, `incremental`, `none` |
 | `--format` | enum | `markdown` | Output format: `markdown` or `json` |
 | `--output-file` | path | *stdout* | Write output to a file; the parent directory must already exist |
-| `--hybrid-lexical` | float | runtime default | Override lexical weight for hybrid-search research |
-| `--hybrid-semantic` | float | runtime default | Override semantic weight for hybrid-search research |
-| `--hybrid-graph` | float | runtime default | Override graph-neighborhood weight for hybrid-search research |
+| `--hybrid-*` | n/a | unsupported | Hybrid tuning overrides are rejected under mandatory sidecar search because ignored weights would be misleading |
 
 ## Query Behavior
 
-- **Symbol-like queries** (e.g. `AppController`, `run_indexing`) search the indexed symbol table.
-- **Natural-language queries** (e.g. `"how does incremental indexing work"`) also perform a repo-wide text scan and merge results by score.
+- **Symbol-like queries** (e.g. `AppController`, `run_indexing`) search exact
+  and normalized symbol lanes first.
+- **Natural-language queries** (e.g. `"how does incremental indexing work"`)
+  search semantic and graph-aware sidecar evidence. Repo-text may appear as
+  diagnostic evidence, but it is not proof of a symbol or graph relationship.
 - **Field-qualified queries** filter indexed and repo-text results after candidate retrieval. Supported filters are `kind:<node-kind-or-alias>`, `path:<path-fragment>`, `name:<symbol-fragment>`, and `lang:<language-or-extension>`. Example: `kind:function name:listUsers` or `path:routes.ts /api/users`.
-- **Concrete anchors with weak indexed results** also trigger repo text in `auto` mode. This prevents stale names such as retired UI components from looking like valid direct symbol hits.
+- **Concrete anchors with weak indexed results** may report repo-text diagnostics
+  in `auto` mode. Treat this as an uncertainty signal, not as successful graph
+  grounding.
 - When hybrid retrieval finds strong semantic matches but no lexical match, Markdown and JSON output include `did_you_mean` suggestions.
 - Broad architecture-style queries can include `search_plan`. The plan reports
   extracted and dropped terms, bounded subqueries, candidate windows, anchor
@@ -37,10 +43,11 @@ Searches the symbol index for matching nodes, optionally augmented with grep-sty
   source-truth checks. It is a discovery plan, not final answer prose.
 - Ranking boosts exact and terminal symbol names, CamelCase initials, compound terms, and path co-location. Test, fixture, vendor, and external hits are dampened unless the query asks for them.
 - Import/re-export-looking exact hits are ranked below definition-looking hits when source-line evidence is available.
-- Repo-text fallback remains explicit evidence. Treat repo-text hits as clues to inspect, not as silent graph success.
+- Repo-text evidence remains explicit navigation evidence. Treat repo-text hits
+  as clues to inspect, not as sidecar success.
 - For architecture questions, broad natural-language `search` is discovery only. If `query_assessment` says `weak_top_hit=true` or there is no exact anchor, move to `drill` with concrete anchors from `ground`/`search`; do not answer from broad search hits alone.
 - `symbol`, `trail`, and `snippet` require a resolvable graph target. Semantic suggestions and repo-text hits can guide follow-up searches, but they are not promoted into graph targets by those commands.
-- **Hybrid weight overrides** are intended for benchmarking and tuning. Omit all three `--hybrid-*` flags for production-like runtime defaults.
+- **Hybrid weight overrides** are unsupported. `search --hybrid-*` flags are rejected under mandatory sidecar search; use sidecar configuration and fixture-backed tests for ranking experiments instead.
 
 ## Output
 
@@ -55,7 +62,7 @@ hits: 3
 
 Each hit includes: node ID, display name, kind, file path, line number, relevance score, provenance, and `match_quality` (`exact`, `normalized_exact`, `prefix`, `fuzzy`, `semantic_suggestion`, or `repo_text`).
 
-Search output also includes `query_assessment` with exact symbol hit count, weak-hit/stale-anchor flags, any repo-text fallback reason, and a recommended next action. Use it to avoid treating weak semantic suggestions as proof of an exact anchor.
+Search output also includes `query_assessment` with exact symbol hit count, weak-hit/stale-anchor flags, any repo-text diagnostic reason, and a recommended next action. Use it to avoid treating weak semantic suggestions as proof of an exact anchor.
 
 For broad architecture queries, JSON may include `search_plan`; Markdown renders
 it when `--why` is set. Prefer `typed_anchor` and `promoted` plan groups as
@@ -65,7 +72,9 @@ until direct source verification. Use the plan's next commands to continue with
 
 When a name appears more than once, prefer typed symbol hits such as `[function]`, `[struct]`, `[field]`, or `[file]` over `[unknown]` hits when you are verifying symbol surfacing. `[unknown]` results are often usage-like callsite or reference nodes, not the canonical definition.
 
-Repo-text hits from text-only surfaces such as `.svelte` files are evidence, not graph anchors. Use the excerpt to choose a symbol or open a snippet/source file for verification.
+Repo-text hits from text-only surfaces such as `.svelte` files are navigation
+clues, not sidecar evidence or graph anchors. Use the excerpt to choose a symbol
+or open a snippet/source file for verification.
 
 For ranking or route-search changes, run the search-quality eval and interpret
 failures before promoting the change:
@@ -96,10 +105,10 @@ cargo test -p codestory-cli --test search_json_output -- --ignored --nocapture s
 # Search for a symbol
 <codestory-cli> search --project <target-workspace> --query AppController
 
-# Natural-language search, more results
+# Natural-language sidecar search, more results
 <codestory-cli> search --project <target-workspace> --query "how does the grounding snapshot work" --limit 20
 
-# Force repo text scanning for a symbol-like query
+# Diagnostic repo-text scan for a symbol-like query
 <codestory-cli> search --project <target-workspace> --query AppController --repo-text on
 
 # Narrow an ambiguous result set by kind and file path
diff --git a/.agents/skills/codestory-grounding/references/serve.md b/.agents/skills/codestory-grounding/references/serve.md
index 261b67b..1b0f6b4 100644
--- a/.agents/skills/codestory-grounding/references/serve.md
+++ b/.agents/skills/codestory-grounding/references/serve.md
@@ -43,3 +43,4 @@ Serves the indexed project over either a small HTTP JSON API or an MCP-style JSO
 - `serve` is local by default on `127.0.0.1`; do not bind wider unless the user explicitly needs remote access.
 - HTTP only accepts GET requests for the documented routes.
 - Start it after a successful index or with an intentional refresh mode.
+- In one `serve --stdio` process, identical successful `packet` and search-fragment requests are cached with small LRUs keyed by request arguments, the current SQLite/WAL fingerprint, and a mandatory sidecar-readiness fingerprint. The sidecar fingerprint includes the active embedding backend, sidecar state-file metadata, strict retrieval mode, degraded reason, manifest generation/input hash/backend/dimension, and status errors. This is for repeated agent calls only; changed index files, sidecar state drift, and strict stale/unavailable readiness bypass the cache.
diff --git a/.agents/skills/codestory-grounding/references/setup.md b/.agents/skills/codestory-grounding/references/setup.md
index f9a9513..6916cd1 100644
--- a/.agents/skills/codestory-grounding/references/setup.md
+++ b/.agents/skills/codestory-grounding/references/setup.md
@@ -1,6 +1,8 @@
-# `setup` - Managed Local Embedding Assets
+# `setup` - Local Retrieval Assets
 
-Installs explicit local assets that normal read/index commands should not surprise-download.
+Prepares explicit local assets that normal read/index commands should not
+surprise-download. Agent-facing packet/search evidence still requires
+`retrieval bootstrap`, `retrieval index`, and `retrieval_mode=full`.
 
 ## Usage
 
@@ -14,10 +16,10 @@ Installs explicit local assets that normal read/index commands should not surpri
 |--------|---------|-----|
 | `--project <path>` | `.` | Target workspace used to resolve cache configuration. Always pass it explicitly. |
 | `--cache-dir <path>` | auto | Use an isolated cache root, useful for tests and repros. |
-| `--quant <q8_0|q4_k_m>` | `q8_0` | Legacy GGUF selector retained for CLI compatibility; managed setup now installs the pinned ONNX model. |
-| `--variant <cpu|vulkan>` | `vulkan` | Legacy llama.cpp selector retained for CLI compatibility; managed setup now uses ONNX Runtime. |
-| `--dry-run` | off | Show the managed ONNX asset plan without downloading anything. |
-| `--no-start` | off | Compatibility flag; managed ONNX setup never starts a server. |
+| `--quant <q8_0|q4_k_m>` | `q8_0` | Legacy compatibility selector. Managed `setup embeddings` installs pinned ONNX assets for the local semantic runtime; GGUF llama.cpp sidecar model setup is handled by the retrieval sidecar setup path. |
+| `--variant <cpu|vulkan>` | `vulkan` | Compatibility selector for older setup flows; product sidecar use is governed by `retrieval bootstrap`. |
+| `--dry-run` | off | Show the asset plan without downloading anything. |
+| `--no-start` | off | Compatibility flag; product setup is handled by retrieval sidecar bootstrap. |
 | `--format <markdown|json>` | `markdown` | Human or automation output. |
 | `--output-file <path>` | stdout | Write output to an existing parent directory. |
 
@@ -25,13 +27,19 @@ Installs explicit local assets that normal read/index commands should not surpri
 
 | Path | Command | Expected result |
 |------|---------|-----------------|
-| Normal path | `<codestory-cli> setup embeddings --project <target-workspace>` | Downloads pinned Qdrant BGE-base ONNX graph and tokenizer assets into the user cache, verifies checksums, and derives the pooled ONNX graph that runtime uses for embeddings. |
-| Failure path | If setup fails, run `setup embeddings --project <target-workspace> --dry-run --format json` and inspect the selected asset URLs, cache root, output paths, and checksums. | Separates platform support, download, checksum, extraction, and graph-derivation failures. |
-| Integration edge | Run `doctor --project <target-workspace>` after setup, then `index --project <target-workspace> --refresh full` when semantic docs need the managed runtime. | Keeps first-run model setup explicit and auditable. |
+| Normal path | `node scripts/setup-retrieval-env.mjs --fetch-embed-model`, then `<codestory-cli> retrieval bootstrap --project <target-workspace>` | Downloads the pinned bge-base GGUF for the local llama.cpp sidecar, starts local sidecars, and prepares the product retrieval environment. |
+| Failure path | If setup fails, run `setup embeddings --project <target-workspace> --dry-run --format json` and inspect the selected asset URLs, cache root, output paths, and checksums. | Separates platform support, download, checksum, extraction, and sidecar-readiness failures. |
+| Integration edge | Run `retrieval index --project <target-workspace> --refresh full`, then `retrieval status --project <target-workspace> --format json`. | Product search/packet paths are usable only when status reports `retrieval_mode=full`. |
 
 ## Notes
 
-- Normal commands may use already installed managed assets, but they do not download missing assets.
-- Managed setup seeds local defaults for ONNX Runtime: DirectML on Windows, CPU elsewhere, doc batch `2048`, token budget `32768`, and stored vectors `int8` unless environment variables override them.
-- Set `CODESTORY_EMBED_RUNTIME_MODE=hash` for deterministic local-dev checks without real model inference.
-- Set `CODESTORY_EMBED_BACKEND=llamacpp` and `CODESTORY_EMBED_LLAMACPP_URL` only when intentionally using an external legacy llama.cpp server.
+- Normal commands may use already installed assets, but they do not download
+  missing assets.
+- Plain `index` builds the core SQLite code index only. Run `retrieval index`
+  after sidecars are configured, then require `retrieval status --format json`
+  to report `retrieval_mode=full` before relying on packet/search evidence.
+- Product sidecar evidence requires `CODESTORY_EMBED_BACKEND=llamacpp`, the
+  local llama.cpp endpoint, and a manifest embedding backend of
+  `llamacpp:bge-base-en-v1.5`.
+- Hash embeddings, ONNX-only flows, and non-sidecar embedding paths are
+  diagnostic or historical comparison modes only.
diff --git a/.github/workflows/retrieval-sidecar-smoke.yml b/.github/workflows/retrieval-sidecar-smoke.yml
new file mode 100644
index 0000000..a77bca5
--- /dev/null
+++ b/.github/workflows/retrieval-sidecar-smoke.yml
@@ -0,0 +1,73 @@
+# Windows smoke test for retrieval contracts, including the manifest-missing status shape.
+# Contract: docs/contributors/retrieval-sidecar-smoke-ci.md
+
+name: retrieval-sidecar-smoke
+
+on:
+  pull_request:
+    paths:
+      - crates/codestory-retrieval/**
+      - crates/codestory-contracts/**
+      - crates/codestory-store/Cargo.toml
+      - crates/codestory-store/src/**
+      - crates/codestory-cli/src/retrieval.rs
+      - crates/codestory-cli/src/main.rs
+      - crates/codestory-cli/src/args.rs
+      - crates/codestory-cli/src/runtime.rs
+      - crates/codestory-cli/src/stdio_*.rs
+      - crates/codestory-cli/tests/retrieval_bootstrap_contracts.rs
+      - crates/codestory-cli/tests/search_json_output.rs
+      - crates/codestory-cli/tests/stdio_protocol_contracts.rs
+      - crates/codestory-runtime/src/**
+      - crates/codestory-indexer/Cargo.toml
+      - crates/codestory-indexer/src/lib.rs
+      - crates/codestory-runtime/tests/retrieval_generalization_guard.rs
+      - scripts/lint-retrieval-generalization.mjs
+      - scripts/**retrieval**
+      - docs/ops/retrieval-sidecars.md
+      - docs/contributors/retrieval-sidecar-smoke-ci.md
+      - docs/architecture/retrieval-*.md
+      - docs/testing/retrieval-architecture.md
+      - docker/retrieval-compose.yml
+      - .github/workflows/retrieval-sidecar-smoke.yml
+  workflow_dispatch:
+
+jobs:
+  windows-smoke:
+    runs-on: windows-latest
+    timeout-minutes: 45
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Cache Rust build outputs
+        uses: Swatinem/rust-cache@v2
+        with:
+          cache-on-failure: "true"
+
+      - name: Generalization lint (production paths)
+        run: node scripts/lint-retrieval-generalization.mjs
+
+      - name: Retrieval bootstrap manifest-missing contract
+        run: cargo test -p codestory-cli --test retrieval_bootstrap_contracts
+
+      - name: Runtime library contract tests
+        run: cargo test -p codestory-runtime --lib
+
+      - name: Retrieval generalization guard (Rust)
+        run: cargo test -p codestory-runtime --test retrieval_generalization_guard
+
+      - name: Stdio protocol contract tests
+        run: cargo test -p codestory-cli --test stdio_protocol_contracts
+
+      - name: CLI search JSON fail-closed contract tests
+        run: cargo test -p codestory-cli --test search_json_output
+
+      - name: Retrieval crate unit tests
+        run: cargo test -p codestory-retrieval
diff --git a/AGENTS.md b/AGENTS.md
index 043e0f7..40c70c4 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -3,7 +3,7 @@
 ## Project Structure & Module Organization
 - Rust workspace is defined in `Cargo.toml`; crates live under `crates/`.
 - Primary runtime surface is `crates/codestory-cli`; the repo-local skill lives under `.agents/skills/codestory-grounding`.
-- Workspace crates: `codestory-contracts`, `codestory-workspace`, `codestory-store`, `codestory-indexer`, `codestory-runtime`, `codestory-cli`, `codestory-bench`.
+- Workspace crates: `codestory-contracts`, `codestory-workspace`, `codestory-store`, `codestory-indexer`, `codestory-retrieval`, `codestory-runtime`, `codestory-cli`, `codestory-bench`.
 - Runtime artifacts: user-cache SQLite grounding indexes keyed by repo path; build outputs in `target/`.
 
 ## Architecture Overview
@@ -39,5 +39,8 @@
 - Commit messages are short, lowercase, imperative (e.g., `fix minimap`, `refactor graph style`).
 - PRs should include a summary, tests run, linked issues, and relevant artifacts for behavior changes.
 
+## Retrieval documentation
+- Canonical sidecar retrieval docs are `docs/architecture/retrieval-design.md`, `docs/architecture/retrieval-parser-compat-matrix.md`, `docs/testing/retrieval-architecture.md`, and `docs/ops/retrieval-sidecars.md`.
+
 ## Security & Configuration Tips
 - Keep secrets out of the repo; pass credentials via environment variables.
diff --git a/Cargo.lock b/Cargo.lock
index bc5c7d0..81e8681 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -387,7 +387,7 @@ checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831"
 
 [[package]]
 name = "codestory-bench"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "anyhow",
  "codestory-contracts",
@@ -403,7 +403,7 @@ dependencies = [
 
 [[package]]
 name = "codestory-cli"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "anyhow",
  "clap",
@@ -425,7 +425,7 @@ dependencies = [
 
 [[package]]
 name = "codestory-contracts"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "anyhow",
  "crossbeam-channel",
@@ -440,7 +440,7 @@ dependencies = [
 
 [[package]]
 name = "codestory-indexer"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "anyhow",
  "codestory-contracts",
@@ -475,7 +475,7 @@ dependencies = [
 
 [[package]]
 name = "codestory-retrieval"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "anyhow",
  "chrono",
@@ -494,7 +494,7 @@ dependencies = [
 
 [[package]]
 name = "codestory-runtime"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "anyhow",
  "codestory-contracts",
@@ -519,7 +519,7 @@ dependencies = [
 
 [[package]]
 name = "codestory-store"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "anyhow",
  "codestory-contracts",
@@ -534,7 +534,7 @@ dependencies = [
 
 [[package]]
 name = "codestory-workspace"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "anyhow",
  "codestory-contracts",
diff --git a/README.md b/README.md
index 8f7f552..f4c2886 100644
--- a/README.md
+++ b/README.md
@@ -34,16 +34,18 @@ cargo build --release -p codestory-cli
 $CodeStoryCli = ".\target\release\codestory-cli.exe"
 $TargetWorkspace = "C:\path\to\repo"
 
+cargo retrieval-setup
 & $CodeStoryCli doctor --project $TargetWorkspace
 & $CodeStoryCli setup embeddings --project $TargetWorkspace --dry-run --format json
 & $CodeStoryCli index --project $TargetWorkspace --refresh full
+& $CodeStoryCli retrieval index --project $TargetWorkspace --refresh auto
 & $CodeStoryCli ground --project $TargetWorkspace --why
 ```
 
-The dry run shows whether the managed embedding assets are already installed or
-what CodeStory would download for hybrid retrieval. If managed assets are not
-available and you skip `setup embeddings`, indexing still works and read
-commands report the symbolic or lexical fallback path through `doctor`.
+Current packet/search evidence requires the local Zoekt, Qdrant, SCIP, and
+llama.cpp embedding sidecars to reach `retrieval_mode=full`; missing assets,
+stale manifests, disabled sidecars, or diagnostic embedding modes are setup
+failures to fix before trusting agent context.
 
 After that first index, use narrower commands instead of asking the agent to
 start over:
@@ -60,6 +62,29 @@ is needed.
 
 For task-shaped flows, use [docs/usage.md](docs/usage.md).
 
+## Retrieval sidecars
+
+For Zoekt/Qdrant/SCIP packet retrieval, run once from this repository root
+(Windows, macOS, or Linux):
+
+```sh
+cargo retrieval-setup
+```
+
+`cargo retrieval-setup` builds `codestory-cli` if needed, starts Docker Compose sidecars when
+Docker is available, writes local sidecar state, and waits for health probes. Check status with
+`cargo retrieval-status`.
+
+Bootstrap modifiers (pass through `cargo run`):
+
+```sh
+cargo run -p codestory-cli -- retrieval bootstrap --project . --skip-compose
+cargo run -p codestory-cli -- retrieval bootstrap --project . --wait-secs 120
+```
+
+Thin wrapper (same bootstrap, optional holdout clone): `node scripts/setup-retrieval-env.mjs`.
+Details: [docs/ops/retrieval-sidecars.md](docs/ops/retrieval-sidecars.md).
+
 ## Install As An Agent Skill
 
 Use this path when CodeStory should be installed once as a grounding skill and
diff --git a/crates/codestory-bench/Cargo.toml b/crates/codestory-bench/Cargo.toml
index 874de92..43831c5 100644
--- a/crates/codestory-bench/Cargo.toml
+++ b/crates/codestory-bench/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codestory-bench"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2024"
 publish = false
 
diff --git a/crates/codestory-cli/Cargo.toml b/crates/codestory-cli/Cargo.toml
index 3886651..7112927 100644
--- a/crates/codestory-cli/Cargo.toml
+++ b/crates/codestory-cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codestory-cli"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2024"
 
 [dependencies]
diff --git a/crates/codestory-cli/tests/retrieval_bootstrap_contracts.rs b/crates/codestory-cli/tests/retrieval_bootstrap_contracts.rs
index 9c70a95..523c31c 100644
--- a/crates/codestory-cli/tests/retrieval_bootstrap_contracts.rs
+++ b/crates/codestory-cli/tests/retrieval_bootstrap_contracts.rs
@@ -19,6 +19,20 @@ fn run_bootstrap(project: &std::path::Path, extra_args: &[&str]) -> Value {
     serde_json::from_slice(&output.stdout).expect("parse bootstrap json")
 }
 
+fn run_status(project: &std::path::Path, extra_args: &[&str]) -> Value {
+    let mut command = Command::new(env!("CARGO_BIN_EXE_codestory-cli"));
+    command.args(["retrieval", "status", "--project"]);
+    command.arg(project);
+    command.args(extra_args);
+    let output = command.output().expect("run retrieval status");
+    assert!(
+        output.status.success(),
+        "status failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+    serde_json::from_slice(&output.stdout).expect("parse status json")
+}
+
 fn create_valid_cache_with_cli(project: &std::path::Path, cache: &std::path::Path) {
     let output = Command::new(env!("CARGO_BIN_EXE_codestory-cli"))
         .args(["index", "--project"])
@@ -84,6 +98,46 @@ fn bootstrap_json_includes_storage_repair_fields() {
     );
 }
 
+#[test]
+fn bootstrap_then_status_reports_manifest_missing_before_indexing() {
+    let project = tempdir().expect("project");
+    fs::write(project.path().join("lib.rs"), "pub fn main() {}\n").expect("source");
+    let cache = tempdir().expect("cache");
+    let cache_arg = cache.path().to_str().expect("utf8 cache");
+
+    let bootstrap = run_bootstrap(
+        project.path(),
+        &[
+            "--cache-dir",
+            cache_arg,
+            "--skip-compose",
+            "--wait-secs",
+            "0",
+            "--format",
+            "json",
+        ],
+    );
+    assert!(
+        bootstrap["storage_repair"].is_object(),
+        "bootstrap output missing storage repair: {bootstrap}"
+    );
+
+    let status = run_status(
+        project.path(),
+        &["--cache-dir", cache_arg, "--format", "json"],
+    );
+    assert_eq!(
+        status["degraded_reason"].as_str(),
+        Some("retrieval_manifest_missing"),
+        "status should report manifest-missing shape before indexing: {status}"
+    );
+    assert_ne!(
+        status["retrieval_mode"].as_str(),
+        Some("full"),
+        "manifest-missing status must not report full mode before retrieval index: {status}"
+    );
+}
+
 #[test]
 fn bootstrap_json_surfaces_prune_suppressed_reason_on_scan_errors() {
     let project = tempdir().expect("project");
diff --git a/crates/codestory-contracts/Cargo.toml b/crates/codestory-contracts/Cargo.toml
index c246dd0..88b63f0 100644
--- a/crates/codestory-contracts/Cargo.toml
+++ b/crates/codestory-contracts/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codestory-contracts"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2024"
 
 [dependencies]
diff --git a/crates/codestory-indexer/Cargo.toml b/crates/codestory-indexer/Cargo.toml
index 8a09a6f..c1f247e 100644
--- a/crates/codestory-indexer/Cargo.toml
+++ b/crates/codestory-indexer/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codestory-indexer"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2024"
 
 [dev-dependencies]
diff --git a/crates/codestory-retrieval/Cargo.toml b/crates/codestory-retrieval/Cargo.toml
index f3356e3..56cda6b 100644
--- a/crates/codestory-retrieval/Cargo.toml
+++ b/crates/codestory-retrieval/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codestory-retrieval"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2024"
 
 [dependencies]
diff --git a/crates/codestory-runtime/Cargo.toml b/crates/codestory-runtime/Cargo.toml
index 91c8b9a..719d895 100644
--- a/crates/codestory-runtime/Cargo.toml
+++ b/crates/codestory-runtime/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codestory-runtime"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2024"
 
 [dependencies]
diff --git a/crates/codestory-runtime/src/lib.rs b/crates/codestory-runtime/src/lib.rs
index cf7d856..0802c71 100644
--- a/crates/codestory-runtime/src/lib.rs
+++ b/crates/codestory-runtime/src/lib.rs
@@ -9603,8 +9603,11 @@ mod tests {
                 EnvGuard::remove(EMBEDDING_LAYER_NORM_ENV),
                 EnvGuard::remove(EMBEDDING_TRUNCATE_DIM_ENV),
                 EnvGuard::remove(EMBEDDING_EXPECTED_DIM_ENV),
+                EnvGuard::remove(SEMANTIC_DOC_SCOPE_ENV),
                 EnvGuard::remove(SEMANTIC_DOC_ALIAS_MODE_ENV),
                 EnvGuard::remove(SEMANTIC_DOC_MAX_TOKENS_ENV),
+                EnvGuard::remove(SEMANTIC_STREAM_PENDING_DOCS_ENV),
+                EnvGuard::remove(SEMANTIC_STREAM_SORT_WINDOW_BATCHES_ENV),
             ],
             _lock: lock,
         }
@@ -9834,6 +9837,9 @@ mod tests {
 
     #[test]
     fn semantic_doc_scope_defaults_to_durable_symbols_and_all_scope_is_opt_in() {
+        let _lock = ENV_TEST_LOCK
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner());
         let _env = EnvGuard::remove(SEMANTIC_DOC_SCOPE_ENV);
         assert_eq!(
             semantic_doc_scope_from_env(),
@@ -10259,12 +10265,13 @@ pub fn beta() {}
     fn write_reindex_semantic_fixture(root: &std::path::Path, digest_text: &str) {
         let src = root.join("src");
         fs::create_dir_all(&src).expect("create src dir");
+        let digest_identifier = digest_text.replace(' ', "_");
         fs::write(
             src.join("lib.rs"),
             format!(
                 r#"
 /// {digest_text}
-pub fn build_snapshot_digest() -> &'static str {{
+pub fn build_snapshot_digest({digest_identifier}: &str) -> &'static str {{
     "{digest_text}"
 }}
 
@@ -13932,13 +13939,23 @@ fn build_llm_symbol_doc_text() -> String {
             .expect("initial full index");
 
         let storage = Storage::open(&storage_path).expect("open storage after initial index");
-        let initial_doc = storage
+        let initial_docs = storage
             .get_all_llm_symbol_docs()
             .expect("load initial semantic docs")
             .into_iter()
-            .find(|doc| doc.display_name == "build_snapshot_digest")
-            .expect("initial digest doc");
-        assert!(initial_doc.doc_text.contains("initial compressed digest"));
+            .filter(|doc| doc.display_name == "build_snapshot_digest")
+            .collect::<Vec<_>>();
+        assert!(!initial_docs.is_empty(), "initial digest doc");
+        assert!(
+            initial_docs
+                .iter()
+                .any(|doc| doc.doc_text.contains("initial_compressed_digest")),
+            "initial digest docs should include fixture source text: {:?}",
+            initial_docs
+                .iter()
+                .map(|doc| doc.doc_text.as_str())
+                .collect::<Vec<_>>()
+        );
         drop(storage);
 
         write_reindex_semantic_fixture(workspace.path(), "updated compressed digest");
@@ -13947,15 +13964,27 @@ fn build_llm_symbol_doc_text() -> String {
             .expect("rerun full index");
 
         let storage = Storage::open(&storage_path).expect("open storage after rerun");
-        let updated_doc = storage
+        let updated_docs = storage
             .get_all_llm_symbol_docs()
             .expect("load updated semantic docs")
             .into_iter()
-            .find(|doc| doc.display_name == "build_snapshot_digest")
-            .expect("updated digest doc");
-        assert!(updated_doc.doc_text.contains("updated compressed digest"));
+            .filter(|doc| doc.display_name == "build_snapshot_digest")
+            .collect::<Vec<_>>();
+        assert!(!updated_docs.is_empty(), "updated digest doc");
         assert!(
-            !updated_doc.doc_text.contains("initial compressed digest"),
+            updated_docs
+                .iter()
+                .any(|doc| doc.doc_text.contains("updated_compressed_digest")),
+            "updated digest docs should include fixture source text: {:?}",
+            updated_docs
+                .iter()
+                .map(|doc| doc.doc_text.as_str())
+                .collect::<Vec<_>>()
+        );
+        assert!(
+            !updated_docs
+                .iter()
+                .any(|doc| doc.doc_text.contains("initial_compressed_digest")),
             "full index should rebuild semantic docs instead of reusing stale persisted content"
         );
     }
diff --git a/crates/codestory-store/Cargo.toml b/crates/codestory-store/Cargo.toml
index ab11905..cdc24f7 100644
--- a/crates/codestory-store/Cargo.toml
+++ b/crates/codestory-store/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codestory-store"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2024"
 
 [dependencies]
diff --git a/crates/codestory-workspace/Cargo.toml b/crates/codestory-workspace/Cargo.toml
index e4fd110..96353ee 100644
--- a/crates/codestory-workspace/Cargo.toml
+++ b/crates/codestory-workspace/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codestory-workspace"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2024"
 
 [dependencies]
diff --git a/docs/architecture/indexing-pipeline.md b/docs/architecture/indexing-pipeline.md
index 83db4d9..8f0f699 100644
--- a/docs/architecture/indexing-pipeline.md
+++ b/docs/architecture/indexing-pipeline.md
@@ -212,8 +212,11 @@ Embedding throughput is optimized for the local embedding path:
 
 - pending semantic docs are sorted by generated text length before embedding, which keeps batches close to uniform length
 - the default semantic embedding batch size is `128`, with `CODESTORY_LLM_DOC_EMBED_BATCH_SIZE` available for profiling
-- ONNX embeddings use `CODESTORY_EMBED_ONNX_MODEL`, `CODESTORY_EMBED_ONNX_TOKENIZER`, and `CODESTORY_EMBED_ONNX_PROVIDER` for the local model graph, tokenizer, and execution provider; managed setup points the model path at a derived CLS-pooled graph to avoid returning full token hidden states
-- the managed ONNX path seeds the current local throughput shape: semantic doc window `512`, doc batch `2048`, per-call ONNX token budget `32768`, DirectML on Windows or CPU elsewhere, and stored vectors `int8`; it does not start or retain an embedding server process
+- product sidecar embeddings use `CODESTORY_EMBED_BACKEND=llamacpp` and the
+  local `CODESTORY_EMBED_LLAMACPP_URL` endpoint; the manifest must record
+  `llamacpp:bge-base-en-v1.5`
+- ONNX and hash embedding paths are historical or diagnostic lanes unless a
+  future spec promotes them with fresh sidecar-quality evidence
 
 Keep measured repo-scale timings in [codestory-e2e-stats-log.md](../testing/codestory-e2e-stats-log.md). Architecture explains the lifecycle; the testing log owns time-specific numbers because caches, backends, and workstation state drift.
 
@@ -247,7 +250,9 @@ Full refresh builds a staged database and publishes it only after staged finaliz
 
 Semantic docs are persisted in SQLite with generated-text metadata. Reuse is keyed by schema version, generated text hash, embedding model, and embedding dimension. On full refresh, runtime copies prior semantic docs forward into the staged database before semantic sync checks them. On incremental refresh, runtime passes a touched-file scope so only docs belonging to changed files are rebuilt or pruned.
 
-Cold start still has to embed any semantic doc that has no reusable row. The cold path is kept under control by using the durable-symbol default scope, length-bucketed batches, the managed ONNX batch defaults, and stored vector quantization.
+Cold start still has to embed any semantic doc that has no reusable row. The
+cold path is kept under control by using the durable-symbol default scope,
+length-bucketed batches, full sidecar readiness, and stored vector quantization.
 
 ### What timing output means
 
diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md
index 46b9518..75908cf 100644
--- a/docs/architecture/overview.md
+++ b/docs/architecture/overview.md
@@ -17,7 +17,7 @@ flowchart LR
 User-visible guarantees come from those boundaries:
 
 - Project evidence is stored in a local per-workspace cache.
-- Read commands can report stale, partial, or fallback retrieval state.
+- Read commands can report stale, partial, or non-`full` retrieval state.
 - CLI rendering stays thin; orchestration belongs to runtime.
 - Full refreshes can publish a staged store; incremental refreshes update the
   live store and refresh derived views.
diff --git a/docs/architecture/retrieval-design.md b/docs/architecture/retrieval-design.md
new file mode 100644
index 0000000..d2742ea
--- /dev/null
+++ b/docs/architecture/retrieval-design.md
@@ -0,0 +1,117 @@
+# Mandatory Sidecar Retrieval Design
+
+CodeStory packet/search evidence is sidecar-primary. A product response may be
+served only when the current project has a manifest-backed sidecar generation
+with `retrieval_mode=full`.
+
+`full` means all of the following are true for the same generation:
+
+- Zoekt lexical shard exists, matches the current lexical input hash, and
+  answers smoke queries.
+- Qdrant collection exists, has at least the manifest projection count, uses the
+  product llama.cpp `bge-base-en-v1.5` embedding backend, and answers semantic
+  smoke queries.
+- SCIP graph artifacts exist and are not stub markers.
+- The SQLite `retrieval_index_manifest` has the current schema version,
+  sidecar input hash, sidecar generation, Qdrant collection, embedding backend,
+  embedding dimension, and projection count.
+
+Everything else is diagnostic only. `no_scip`, `no_semantic`, `lexical_only`,
+`unavailable`, stale manifests, stub markers, disabled sidecars, hash vectors,
+ONNX-only paths, old env aliases, and `CODESTORY_RETRIEVAL=0` fail closed for
+agent-facing packet/search.
+
+## Ownership
+
+| Area | Owner | Supporting areas |
+|------|-------|------------------|
+| Sidecar clients, health, index generation, query execution | `codestory-retrieval` | `codestory-cli` |
+| Manifest persistence and migrations | `codestory-store` | `codestory-contracts` |
+| Packet/search routing and fail-closed behavior | `codestory-runtime` | `codestory-contracts` |
+| CLI setup, status, index, query commands | `codestory-cli` | `codestory-retrieval` |
+| Benchmarks and promotion gates | `scripts/` | docs |
+
+## Mode Matrix
+
+| Zoekt | Qdrant | SCIP | Mode | Product behavior |
+|-------|--------|------|------|------------------|
+| up | up | up | `full` | Serve packet/search evidence |
+| up | up | down | `no_scip` | Fail closed |
+| up | down | up | `no_semantic` | Fail closed |
+| up | down | down | `lexical_only` | Fail closed |
+| down | * | * | `unavailable` | Fail closed |
+
+Runtime rules:
+
+- Only `full` can serve primary packet/search results.
+- Non-`full` modes must expose `retrieval_mode` and `degraded_reason`.
+- Guard checks may warn or block promotion, but never switch product behavior to
+  an older retrieval path.
+- Repo-text, hash, stub, and old local search surfaces may be used only as
+  explicitly labeled diagnostics.
+
+## Generation And Reuse
+
+Sidecar generation is content-addressed by project id and sidecar input hash.
+The hash includes local lexical input, symbol projection rows, semantic file
+role metadata, sidecar schema version, Zoekt version pin, embedding backend,
+embedding dimension, and SCIP artifact contract inputs.
+
+`retrieval index --refresh auto` should reuse an unchanged healthy generation.
+If inputs match but health is not `full`, CodeStory rebuilds the unhealthy
+component and persists the manifest only after the full stack is healthy.
+
+## Evidence Rules
+
+- Exact symbol and path evidence remains the precision floor.
+- Semantic and graph evidence can expand or rank candidates, but cannot replace
+  a missing exact sidecar contract.
+- Broad prompt retrieval should let lexical/source evidence compete with
+  semantic evidence and should downrank tests, generated files, benchmarks, and
+  vendor paths unless the query explicitly asks for those roles.
+- Broad packet/search results must preserve provenance and mark weak evidence.
+- Search plans and repo-text diagnostics are discovery aids, not final proof.
+- Promotion metrics must come from one coherent fresh artifact run.
+
+## Cost Envelope
+
+| Tier | Example repos | Cold index budget | Sidecar disk budget | Query process budget |
+|------|---------------|-------------------|---------------------|----------------------|
+| S | `codestory`, `axios` | 8 min | 4 GB | 1.5 GB |
+| M | `ripgrep`, `rootandruntime`, `codex` | 15 min | 8 GB | 3 GB |
+| L | `redis`, `sourcetrail`, `vscode` | 45 min | 25 GB | 6 GB |
+| XL | `vscode` monolith | 60 min | 35 GB | 8 GB |
+
+Promotion is blocked for a tier if cold index exceeds budget by more than 20%
+without a documented exception.
+
+## Promotion Guards
+
+Guard warnings block promotion when any of the following triggers fires on consecutive full local-real runs (`consecutive_runs_required`, currently 2):
+
+| Trigger | Threshold |
+|---------|-----------|
+| p95 packet wall regression | >25% versus current accepted sidecar baseline |
+| retrieval p99 regression | >50% versus current accepted sidecar baseline |
+| quality pass drop | at least one repo worse than prior promotion |
+| sufficient-quality mismatch | any increase |
+| degraded mode rate | >5% of runs |
+| VS Code claim recall | <50% while packet says sufficient |
+
+The file currently named `retrieval-rollback.json` stores these guard
+thresholds. It is not a runtime rollback mechanism.
+
+## Generalization
+
+Local-real tuning repos are `codex`, `rootandruntime`, `sourcetrail`, and
+`vscode`. Holdout repos should be fetched into ignored target directories and
+must not influence ranker/planner tuning. Dogfood results on `codestory` are
+fast regression evidence, not generalization proof.
+
+Promotion requires at least:
+
+- fresh coherent six-lane artifacts,
+- served packet/search rows reporting `retrieval_mode=full`,
+- local-real quality that beats the prior accepted baseline,
+- no diagnostic/stub/hash product evidence,
+- docs and runbooks aligned with the current mandatory sidecar contract.
diff --git a/docs/architecture/retrieval-parser-compat-matrix.md b/docs/architecture/retrieval-parser-compat-matrix.md
new file mode 100644
index 0000000..a297896
--- /dev/null
+++ b/docs/architecture/retrieval-parser-compat-matrix.md
@@ -0,0 +1,46 @@
+# Retrieval parser compatibility matrix (ws-a-parser-compat)
+
+This document records the Step 2 parser compatibility decisions from `retrieval-language-support_038d3ae9.plan.md`, checked against the workspace policy pins:
+
+- `tree-sitter = "0.24"`
+- `tree-sitter-graph = "0.12"`
+
+## Validation method
+
+Checked candidate parser crates in an isolated temporary probe crate (outside workspace members) with this dependency shape:
+
+```toml
+[dependencies]
+tree-sitter = "0.24"
+tree-sitter-graph = "0.12"
+<language-parser-crate> = "=<pinned-version>"
+```
+
+For each language, ran `cargo check` after pinning exactly one parser crate/version.
+
+## Decision matrix
+
+| Language | Candidate crate | Version checked | `cargo check` with 0.24/0.12 | Decision | Notes |
+|---|---|---:|---|---|---|
+| Go | `tree-sitter-go` | `0.23.4` | pass (`cargo check` + parse smoke) | crates.io pin | `0.25.0` compiles but fails at runtime with `LanguageError { version: 15 }` on tree-sitter `0.24`. |
+| Ruby | `tree-sitter-ruby` | `0.23.1` | pass (`cargo check` + parse smoke) | crates.io pin | Wired in indexer with `rules/ruby.scm`. |
+| PHP | `tree-sitter-php` | `0.23.11` | pass (`cargo check` + parse smoke) | crates.io pin | `0.24.2` compiles but fails at runtime with `LanguageError { version: 15 }` on tree-sitter `0.24`. |
+| C# | `tree-sitter-c-sharp` | `=0.23.0` | pass (`cargo check` + parse smoke) | crates.io pin | `0.23.5` compiles but fails at runtime with `LanguageError { version: 15 }` on tree-sitter `0.24`. |
+| Kotlin | `tree-sitter-kotlin-ng` | `1.1.0` | pass | crates.io pin | Use `-ng` crate family for Kotlin parser wiring. |
+| Swift | `tree-sitter-swift` | `0.7.2` | pass | crates.io pin | crates.io source compiles with policy pins. |
+| Dart | `tree-sitter-dart` | `0.2.0` | pass | crates.io pin | crates.io source compiles with policy pins. |
+| HTML | `tree-sitter-html` | `0.23.2` | pass | crates.io pin | Parser is available if structural extraction chooses parser-backed route. |
+| CSS | `tree-sitter-css` | `0.25.0` | pass | crates.io pin | Parser is available if structural extraction chooses parser-backed route. |
+| SQL | `tree-sitter-sequel` | `0.3.11` | pass | crates.io pin | SQL parser candidate compiles with policy pins. |
+| Bash | `tree-sitter-bash` | `0.25.1` | pass | crates.io pin | Supports script-language parser path if/when enabled. |
+
+## Current outcome
+
+- No language in this matrix currently requires a git pin, custom fork, or forced text-only fallback for **parser-policy compatibility**.
+- Go, Ruby, PHP, and C# have parser dependencies, rule assets, and extension
+  routing wired in the current branch.
+- HTML, CSS, and SQL have structural extraction paths, but they are not
+  parser-backed rule assets from this matrix.
+- Kotlin, Swift, Dart, and Bash remain compatibility decisions only. They still
+  need dependency wiring, rule assets, language routing, and fidelity coverage
+  before they should be described as parser-backed runtime support.
diff --git a/docs/architecture/retrieval-rollback.json b/docs/architecture/retrieval-rollback.json
new file mode 100644
index 0000000..bfa4695
--- /dev/null
+++ b/docs/architecture/retrieval-rollback.json
@@ -0,0 +1,63 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "title": "CodeStory sidecar retrieval auto-rollback triggers",
+  "description": "When CODESTORY_RETRIEVAL default is enabled, fail closed and warn if any trigger fires on consecutive full local-real benchmark runs.",
+  "version": 1,
+  "env": {
+    "shadow_mode": "CODESTORY_RETRIEVAL_SHADOW",
+    "zoekt": "CODESTORY_ZOEKT_ENABLED",
+    "qdrant": "CODESTORY_QDRANT_ENABLED"
+  },
+  "baseline_artifact_dir": "target/agent-benchmark/baseline-pre-retrieval",
+  "comparison_artifact": "packet-runtime-summary.json",
+  "consecutive_runs_required": 2,
+  "triggers": [
+    {
+      "id": "p95_packet_wall_regression",
+      "metric": "p95_packet_wall_ms",
+      "threshold": "+25%",
+      "baseline": "baseline-pre-retrieval/local-real",
+      "measurement": "packet-runtime-summary.json"
+    },
+    {
+      "id": "retrieval_p99_regression",
+      "metric": "retrieval_p99_ms",
+      "threshold": "+50%",
+      "baseline": "baseline-pre-retrieval/shadow-sidecar",
+      "measurement": "trace aggregates"
+    },
+    {
+      "id": "quality_pass_runs_drop",
+      "metric": "quality_pass_runs",
+      "threshold": ">=1 repo regression",
+      "baseline": "baseline-pre-retrieval/local-real",
+      "measurement": "packet-runtime-summary.json"
+    },
+    {
+      "id": "sufficient_quality_mismatch_increase",
+      "metric": "sufficient_quality_mismatch_runs",
+      "threshold": "increase vs prior run",
+      "baseline": "baseline-pre-retrieval/local-real",
+      "measurement": "packet-runtime-summary.json"
+    },
+    {
+      "id": "degraded_mode_rate",
+      "metric": "degraded_mode_rate",
+      "threshold": ">5%",
+      "baseline": "baseline-pre-retrieval/shadow-sidecar",
+      "measurement": "retrieval_shadow traces"
+    },
+    {
+      "id": "holdout_claim_recall_floor",
+      "metric": "holdout_claim_recall",
+      "threshold": "<50% when sufficient",
+      "baseline": "baseline-pre-retrieval/local-real",
+      "measurement": "packet-runtime-summary.json"
+    }
+  ],
+  "actions": [
+    "Fail closed; do not set CODESTORY_RETRIEVAL=0",
+    "File incident note in docs/testing/retrieval-architecture.md",
+    "Block promotion until root cause resolved"
+  ]
+}
diff --git a/docs/architecture/runtime-execution-path.md b/docs/architecture/runtime-execution-path.md
index b348139..4bc4289 100644
--- a/docs/architecture/runtime-execution-path.md
+++ b/docs/architecture/runtime-execution-path.md
@@ -35,7 +35,7 @@ sequenceDiagram
 7. Runtime finalizes staged builds through `SnapshotStore` and publishes the finished snapshot when a full refresh completes.
 8. Runtime refreshes the search-symbol projection and synchronizes semantic docs before returning the index summary.
 
-Default index runs do not defer semantic docs. When embedding assets are available, the returned retrieval state should have `semantic_ready = true` and a non-zero semantic doc count. If semantic assets are missing or hybrid retrieval is disabled, runtime still completes graph and lexical state and reports the fallback reason.
+Default index runs do not defer semantic docs. When embedding assets are available, the returned retrieval state should have `semantic_ready = true` and a non-zero semantic doc count. If semantic assets are missing or hybrid retrieval is disabled, runtime still completes graph and lexical state and reports the degraded-state reason.
 
 ## Search Command
 
@@ -44,27 +44,33 @@ sequenceDiagram
     participant CLI as codestory-cli
     participant Runtime as codestory-runtime
     participant Store as codestory-store
+    participant Retrieval as codestory-retrieval
 
     CLI->>Runtime: resolve command and query options
-    Runtime->>Store: open store and read graph/search state
-    Store-->>Runtime: rows, snapshots, and search docs
-    Runtime->>Runtime: prefer hybrid ranking, or fall back to symbolic ranking
-    Runtime->>Runtime: map retrieval state and hits into DTOs
+    Runtime->>Retrieval: strict_sidecar_status(project, storage)
+    Retrieval->>Store: load retrieval manifest and validate live indexable inventory
+    Retrieval-->>Runtime: retrieval_mode + degraded reason
+    Runtime->>Retrieval: execute sidecar query only when retrieval_mode=full
+    Runtime->>Store: resolve sidecar candidates to indexed symbols
+    Runtime->>Runtime: map sidecar trace and resolved hits into DTOs
     Runtime-->>CLI: result DTOs
     CLI->>CLI: render markdown or JSON
 ```
 
 1. CLI resolves the project and query options.
-2. Runtime opens the store and ensures runtime-owned search state is available.
-3. Runtime search prefers hybrid ranking when semantic docs and a local embedding runtime are ready.
-4. When semantic retrieval is unavailable, runtime falls back to symbolic ranking and records the fallback reason in the DTO surface.
-5. For broad architecture-style queries, runtime assembles an optional Search Plan with extracted/dropped terms, bounded subqueries, candidate windows, anchor groups, repo-text promotion status, bridge evidence, next commands, and source-truth checks.
-6. Runtime maps retrieval state plus matches into contract DTOs and CLI renders them.
+2. Runtime asks `codestory-retrieval` for sidecar status before serving results.
+3. Retrieval status loads the stored retrieval manifest, applies stale-manifest checks, and reports the exact degraded reason first, so a healthy sidecar probe cannot mask an invalid manifest.
+4. `retrieval_mode = full` is the only product-serving search path. Missing, stale, partial, or non-product sidecar state fails closed with the degraded reason.
+5. Runtime executes the mandatory sidecar query, resolves returned candidates back into indexed symbols, and rejects unresolved or non-full candidate sets before returning product hits.
+6. Hybrid semantic state, repo-text matches, and local lexical search are diagnostic/navigation surfaces only; they are not a product fallback for `search`.
+7. For broad architecture-style queries, runtime may assemble a Search Plan with extracted/dropped terms, bounded subqueries, candidate windows, anchor groups, bridge evidence, next commands, and source-truth checks.
+8. Runtime maps retrieval state plus resolved sidecar matches into contract DTOs and CLI renders them.
 
 When `search --why` is requested, the CLI renders compact explanations from the
-same DTO surface: origin, fallback state, and lexical/semantic/graph score
-breakdowns when runtime produced hybrid scored hits, plus the Search Plan when
-the broad-query planner emitted one.
+same DTO surface: sidecar origin, degraded/fail-closed state, candidate
+resolution details, and the Search Plan when the broad-query planner emitted
+one. Legacy hybrid score details may appear only as diagnostic data from
+non-serving paths.
 
 ## Context Command
 
@@ -72,20 +78,25 @@ the broad-query planner emitted one.
 sequenceDiagram
     participant CLI as codestory-cli
     participant Runtime as codestory-runtime
-    participant Search as runtime search
+    participant Retrieval as mandatory sidecar retrieval
     participant Graph as runtime graph builders
 
     CLI->>Runtime: concrete target request
-    Runtime->>Search: scored hybrid retrieval
+    Runtime->>Retrieval: validate full sidecar status and resolve target
     Runtime->>Graph: neighborhood, trail, snippets, citations
     Runtime-->>CLI: context packet with trace and evidence
     CLI->>CLI: render markdown/json and optional bundle
 ```
 
-`context` is DB-first and target-first. The CLI resolves `--id`, `--query`, or
-`--bookmark` to one concrete target, then asks runtime-owned retrieval to build
-a deep evidence packet. It is not a question-answering command and does not
-interpret broad natural-language prompts.
+`context` is target-first. The CLI resolves `--id`, `--query`, or `--bookmark`
+to one concrete target. Query target selection may use read-only indexed-symbol
+resolution to choose that target, but context answer/evidence retrieval still
+fails closed unless strict sidecar status reports `retrieval_mode = full`.
+Runtime then builds the deep evidence packet from graph neighborhoods, trails,
+snippets, and citations. It is not a question-answering command and does not
+interpret broad natural-language prompts. Repo-text or hybrid state can guide
+diagnostics, but `retrieval_mode = full` sidecar evidence is the only
+product-serving retrieval state.
 
 ## Ground, Symbol, Trail, and Snippet Commands
 
diff --git a/docs/architecture/subsystems/runtime.md b/docs/architecture/subsystems/runtime.md
index f8556bf..f0adc61 100644
--- a/docs/architecture/subsystems/runtime.md
+++ b/docs/architecture/subsystems/runtime.md
@@ -41,37 +41,23 @@ Important tuning surfaces:
 
 - `CODESTORY_SEMANTIC_DOC_SCOPE`: default durable symbols; use `all` for the older broad symbol set
 - `CODESTORY_SEMANTIC_DOC_ALIAS_MODE`: default `alias_variant`; use `no_alias` for baseline research rows or `current_alias` for the older full alias text
-- `CODESTORY_SEMANTIC_DOC_MAX_TOKENS`: generated semantic-doc token budget; managed ONNX setup seeds `512` unless explicitly set
-- `CODESTORY_EMBED_BACKEND`: `onnx`, `llamacpp`, or `hash`
-- `CODESTORY_EMBED_PROFILE`: built-in profile; defaults to `bge-base-en-v1.5`; explicit profiles include `minilm`, `bge-small-en-v1.5`, `bge-base-en-v1.5`, `qwen3-embedding-0.6b`, `embeddinggemma-300m`, `nomic-embed-text-v1.5`, or `nomic-embed-text-v2-moe`
-- `CODESTORY_EMBED_ONNX_MODEL`: path to the ONNX embedding graph
-- `CODESTORY_EMBED_ONNX_TOKENIZER`: path to the matching Hugging Face `tokenizer.json`
-- `CODESTORY_EMBED_ONNX_PROVIDER`: `directml`, `cpu`, or `auto`
-- `CODESTORY_EMBED_ONNX_BATCH_TOKENS`: max padded tokens per ORT call; managed ONNX setup seeds `32768`
-- `CODESTORY_EMBED_ONNX_THREADS`: optional ORT intra-op thread count for CPU-oriented runs
-- `CODESTORY_EMBED_LLAMACPP_URL`: legacy OpenAI-compatible llama.cpp embedding endpoint for `CODESTORY_EMBED_BACKEND=llamacpp`
-- `CODESTORY_EMBED_LLAMACPP_REQUEST_COUNT`: legacy llama.cpp request concurrency, clamped from `1` to `16`
-- `CODESTORY_LLM_DOC_EMBED_BATCH_SIZE`: semantic doc embedding batch size, default `128`; managed ONNX setup seeds `2048` unless explicitly set
-- `CODESTORY_STORED_VECTOR_ENCODING`: in-memory search-vector encoding; managed ONNX setup seeds `int8` unless explicitly set
-
-ONNX Runtime is the managed real-model path. Runtime loads the tokenizer and
-graph in-process, feeds `input_ids`, `attention_mask`, and `token_type_ids`,
-and accepts either a pooled rank-2 `sentence_embedding` output or a legacy
-rank-3 `last_hidden_state` output. Managed setup derives a CLS-pooled runtime
-graph so normal runs avoid transferring the full token hidden state, then reuse
-the existing normalization and stored-vector path. The `llamacpp` backend remains
-an explicit legacy option for callers that manage their own OpenAI-compatible
-embedding server, and the `hash` backend remains for deterministic local-dev
-and CI checks. Current benchmark findings live in
+- `CODESTORY_SEMANTIC_DOC_MAX_TOKENS`: generated semantic-doc token budget.
+- `CODESTORY_EMBED_BACKEND`: product sidecar indexing requires `llamacpp`.
+- `CODESTORY_EMBED_LLAMACPP_URL`: local OpenAI-compatible llama.cpp embedding endpoint for `CODESTORY_EMBED_BACKEND=llamacpp`.
+- `CODESTORY_EMBED_LLAMACPP_REQUEST_COUNT`: local llama.cpp request concurrency, clamped from `1` to `16`.
+- `CODESTORY_LLM_DOC_EMBED_BATCH_SIZE`: semantic doc embedding batch size, default `128`.
+
+Product packet/search evidence is served through mandatory sidecars. The
+manifest must record `llamacpp:bge-base-en-v1.5` and live health must report
+`retrieval_mode=full`. ONNX, hash, and in-process embedding paths are historical
+or diagnostic research lanes unless a future spec promotes them with fresh
+sidecar-quality evidence. Current benchmark findings live in
 [embedding-backend-benchmarks.md](../../testing/embedding-backend-benchmarks.md).
 
-The CLI owns managed embedding setup. `codestory-cli setup embeddings` installs
-pinned Qdrant BGE-base ONNX assets under the user cache and derives
-`model_optimized_cls_pool.onnx` from the downloaded graph. It does not start a
-server, write server logs, or leave a model process behind. When assets are
-present, CLI runtime preparation sets the pooled ONNX model and tokenizer paths plus
-the managed provider and throughput defaults unless the user already set
-explicit environment values.
+The CLI owns managed embedding setup. `codestory-cli retrieval bootstrap` starts
+the local llama.cpp sidecar when Docker Compose is available; `retrieval index`
+then writes generation-bound sidecar artifacts and manifest metadata. Missing
+or non-product embedding state fails closed for agent-facing retrieval.
 
 Timing fields for this path are in `IndexingPhaseTimings`: `search_projection_rebuild_ms`, `search_symbol_index_ms`, `runtime_cache_publish_ms`, `semantic_doc_build_ms`, `semantic_embedding_ms`, `semantic_db_upsert_ms`, `semantic_reload_ms`, `semantic_prune_ms`, `semantic_docs_reused`, `semantic_docs_embedded`, `semantic_docs_pending`, and `semantic_docs_stale`.
 
diff --git a/docs/contributors/debugging.md b/docs/contributors/debugging.md
index 84907e6..55dbf1b 100644
--- a/docs/contributors/debugging.md
+++ b/docs/contributors/debugging.md
@@ -83,20 +83,25 @@ Check:
 - whether the symbol exists in store-backed search docs
 - whether runtime rebuilt its search state after indexing
 - what retrieval mode `index`, `ground`, or `search` reported for the current run
-- whether semantic retrieval is disabled, ONNX model/tokenizer paths are missing, a legacy llama.cpp endpoint is unreachable, or semantic docs are missing
+- whether semantic retrieval is disabled, ONNX model/tokenizer paths are missing, sidecars are not full, or semantic docs are missing
 - whether `CODESTORY_HYBRID_RETRIEVAL_ENABLED`, `CODESTORY_SEMANTIC_DOC_SCOPE`, `CODESTORY_EMBED_RUNTIME_MODE`, `CODESTORY_EMBED_BACKEND`, or the `CODESTORY_EMBED_ONNX_*` paths changed between runs
 - whether graph-based boosts are overwhelming lexical matches
 
 Recovery order:
 
 1. Confirm whether the miss is in `indexed_symbol_hits`, `repo_text_hits`, or both.
-2. Confirm the reported retrieval mode and fallback reason before touching search ranking code.
-3. If `doctor` reports `missing_managed_assets`, run `codestory-cli setup embeddings --project .` before reindexing. Managed setup installs ONNX assets and should not start a server.
-4. For lightweight local-dev semantic checks, set `CODESTORY_EMBED_RUNTIME_MODE=hash`.
-5. For external legacy real local model assets, set `CODESTORY_EMBED_BACKEND=llamacpp`, start `llama-server --embedding`, and set `CODESTORY_EMBED_LLAMACPP_URL` if it is not on the default endpoint.
-6. If the current machine should stay lexical only, set `CODESTORY_HYBRID_RETRIEVAL_ENABLED=false` and verify the fallback messaging instead of treating it as a runtime regression.
-7. Rebuild once with `index --refresh full`.
-8. If semantic retrieval is still the only failing part, inspect the reported fallback reason before touching lexical ranking or CLI rendering.
+2. Confirm the reported retrieval mode and degraded-state reason before touching search ranking code.
+3. For product sidecar evidence, run `codestory-cli retrieval bootstrap --project .`, set
+   `CODESTORY_EMBED_BACKEND=llamacpp`, and point `CODESTORY_EMBED_LLAMACPP_URL` at the local
+   bge-base-en-v1.5 llama.cpp `/v1/embeddings` endpoint before reindexing.
+4. Rebuild once with `codestory-cli index --project . --refresh full`, then
+   `codestory-cli retrieval index --project . --refresh full`.
+5. Require `codestory-cli retrieval status --project . --format json` to report
+   `retrieval_mode: "full"` before trusting packet/search evidence.
+6. If `doctor` reports `missing_managed_assets`, use `codestory-cli setup embeddings --project .`
+   only for managed ONNX/local semantic diagnostics. Managed setup installs ONNX assets and should
+   not start a server or create the product retrieval manifest.
+7. If semantic retrieval is still the only failing part, inspect the reported degraded-state reason before touching lexical ranking or CLI rendering.
 
 ## If Cold Indexing Is Slow
 
@@ -121,16 +126,19 @@ Check:
 - whether `CODESTORY_SEMANTIC_DOC_SCOPE=all` is forcing the broad all-symbol semantic set
 - whether `CODESTORY_SEMANTIC_DOC_ALIAS_MODE` was changed from the profiled default of `alias_variant`
 - whether `CODESTORY_LLM_DOC_EMBED_BATCH_SIZE` was changed from the profiled default of `128`
-- whether managed ONNX embeddings are installed according to `doctor`
-- whether `CODESTORY_EMBED_ONNX_PROVIDER` is `directml`, `cpu`, or `auto`, whether managed setup is using the pooled ONNX graph, and whether `CODESTORY_EMBED_ONNX_BATCH_TOKENS` is too large for the active output tensor shape
-- whether a legacy `CODESTORY_EMBED_BACKEND=llamacpp` comparison is pointing at a real `llama-server --embedding` endpoint before trusting GGUF speed claims
+- whether mandatory sidecars report `retrieval_mode=full` according to `doctor`
+  and `retrieval status`
+- whether `CODESTORY_EMBED_BACKEND=llamacpp` and the local
+  `CODESTORY_EMBED_LLAMACPP_URL` endpoint match the manifest embedding backend
+- whether an ONNX, hash, or other diagnostic comparison is clearly labeled and
+  excluded from agent-facing sidecar evidence
 
 Recovery order:
 
 1. Run one measured cold E2E and append the headline numbers to `docs/testing/codestory-e2e-stats-log.md`.
 2. Compare semantic embedded/reused counts before changing graph code.
 3. For reuse regressions, inspect semantic doc version, generated text hash, embedding model, and embedding dimension.
-4. For cold-only regressions, inspect durable semantic scope, length-bucket ordering, embedding batch size, ONNX provider, and ORT CPU thread settings.
+4. For cold-only regressions, inspect durable semantic scope, length-bucket ordering, embedding batch size, sidecar health, and local embedding endpoint latency.
 5. For backend experiments, first verify the runtime is using the backend under test, then rerun the speed and quality comparisons documented in `docs/testing/embedding-backend-benchmarks.md`.
 
 ## If Grounding Is Wrong
diff --git a/docs/contributors/getting-started.md b/docs/contributors/getting-started.md
index baf281f..765202d 100644
--- a/docs/contributors/getting-started.md
+++ b/docs/contributors/getting-started.md
@@ -32,17 +32,18 @@ Read commands default to `--refresh none`. If a read command says the cache is e
 
 ## Hybrid Retrieval Setup
 
-Use one of these modes before debugging ranking quality:
+Use the managed full-sidecar path before debugging ranking quality:
 
-- managed real-model setup: `codestory-cli setup embeddings --project .`; use `--dry-run` first to inspect the pinned BGE-base ONNX assets without downloading
-- fast local-dev semantic mode: `CODESTORY_EMBED_RUNTIME_MODE=hash`
-- external legacy model serving: set `CODESTORY_EMBED_BACKEND=llamacpp`, run `llama-server --embedding`, and point `CODESTORY_EMBED_LLAMACPP_URL` at its OpenAI-compatible embeddings endpoint when it is not on the default `http://127.0.0.1:8080/v1/embeddings`
+- managed real-model setup: `node scripts/setup-retrieval-env.mjs --fetch-embed-model`, then `codestory-cli retrieval bootstrap --project .`
 - default semantic scope: durable symbols only; set `CODESTORY_SEMANTIC_DOC_SCOPE=all` when you intentionally need the broad all-symbol semantic doc set
 - default semantic alias mode: compact aliases; set `CODESTORY_SEMANTIC_DOC_ALIAS_MODE=no_alias` or `current_alias` only when reproducing benchmark rows
-- embedding throughput tuning: `CODESTORY_LLM_DOC_EMBED_BATCH_SIZE`, `CODESTORY_EMBED_ONNX_BATCH_TOKENS`, `CODESTORY_EMBED_ONNX_PROVIDER`, and `CODESTORY_EMBED_ONNX_THREADS`
-- lexical-only mode: `CODESTORY_HYBRID_RETRIEVAL_ENABLED=false`
+- embedding throughput tuning: `CODESTORY_LLM_DOC_EMBED_BATCH_SIZE` and local llama.cpp sidecar settings
 
-`index`, `ground`, `search`, `context`, and `doctor` report the active retrieval mode plus any fallback reason when retrieval state is available, so confirm that output before assuming the ranking logic regressed. Default `index` synchronizes semantic docs before returning when embedding assets are available.
+Hash embeddings, ONNX-only flows, and lexical-only switches are diagnostic or
+historical comparison modes only; they are not valid agent-facing retrieval
+setup.
+
+`index`, `ground`, `search`, `context`, and `doctor` report the active retrieval mode plus any degraded-state reason when retrieval state is available, so confirm that output before assuming the ranking logic regressed. Agent-facing retrieval requires `retrieval_mode=full`.
 
 ## Recommended Reading Order
 
@@ -104,4 +105,5 @@ Read these pages first:
 - `index --refresh auto`: chooses full on an empty cache and incremental after that
 - `ground`, `search`, `context`, `symbol`, `trail`, `snippet`, `query`, `explore`, `serve`: default to `--refresh none`
 - `drill`: defaults to `--refresh full` so report bundles are mechanically fresh
+- `drill --jobs N` and `drill-suite --jobs N`: parallel workers are used only with `--refresh none`; refresh/indexing runs stay serialized
 - use `--refresh full` after deleting the cache directory, after schema-affecting changes, or when stale state is suspected
diff --git a/docs/contributors/retrieval-sidecar-smoke-ci.md b/docs/contributors/retrieval-sidecar-smoke-ci.md
new file mode 100644
index 0000000..ee9e79c
--- /dev/null
+++ b/docs/contributors/retrieval-sidecar-smoke-ci.md
@@ -0,0 +1,98 @@
+# CI manifest-missing smoke: `retrieval-sidecar-smoke` (Windows)
+
+**Status:** workflow checked in at [`.github/workflows/retrieval-sidecar-smoke.yml`](../../.github/workflows/retrieval-sidecar-smoke.yml).
+Full index/query on the monorepo may exceed runner budgets; the job runs bootstrap with
+`--skip-compose --wait-secs 0`, asserts `retrieval status` returns the clean pre-index
+`retrieval_manifest_missing` shape through the CLI integration test suite, and runs
+runtime/retrieval protocol plus non-live CLI search contract tests. This job is not a full sidecar
+readiness gate. The workflow restores a Rust build cache before the Cargo steps; a new cache key may
+still pay one cold compile, but later pushes should reuse the warmed target and Cargo dependency
+state.
+
+**Preflight reference:** [`docs/ops/retrieval-sidecars.md`](../ops/retrieval-sidecars.md#preflight-smoke-contract)
+
+---
+
+## Purpose
+
+Fail PRs that touch retrieval/runtime/stdio/search wiring when the manifest-missing status shape
+or associated Rust contracts drift on a clean Windows runner.
+
+## Trigger paths (suggested)
+
+```yaml
+paths:
+  - crates/codestory-retrieval/**
+  - crates/codestory-cli/src/**/retrieval*
+  - crates/codestory-cli/src/stdio_*.rs
+  - crates/codestory-cli/tests/retrieval_bootstrap_contracts.rs
+  - crates/codestory-cli/tests/search_json_output.rs
+  - crates/codestory-cli/tests/stdio_protocol_contracts.rs
+  - crates/codestory-runtime/src/**
+  - crates/codestory-indexer/Cargo.toml
+  - crates/codestory-indexer/src/lib.rs
+  - docs/ops/retrieval-sidecars.md
+```
+
+## Job sketch (PowerShell)
+
+```powershell
+# After checkout, Node setup, Rust toolchain setup, and Rust cache restore:
+node scripts/lint-retrieval-generalization.mjs
+cargo test -p codestory-cli --test retrieval_bootstrap_contracts
+cargo test -p codestory-runtime --lib
+cargo test -p codestory-runtime --test retrieval_generalization_guard
+cargo test -p codestory-cli --test stdio_protocol_contracts
+cargo test -p codestory-cli --test search_json_output
+cargo test -p codestory-retrieval
+```
+
+Use a tiny fixture repo if this workflow later grows to include indexed full-mode smoke coverage;
+bootstrap with `--skip-compose` does not start sidecars, fetch the GGUF model, or create the
+retrieval manifest required for `retrieval_mode == "full"`.
+
+## Pass criteria
+
+1. Generalization lint exits 0.
+2. Rust cache restore/save completes or gracefully misses without masking later failures.
+3. `cargo test -p codestory-cli --test retrieval_bootstrap_contracts` exits 0, including the
+   bootstrap/status assertion that reports `degraded_reason == "retrieval_manifest_missing"` and
+   non-`full` mode on a clean temp project before indexing.
+4. `cargo test -p codestory-runtime --lib` exits 0.
+5. `cargo test -p codestory-runtime --test retrieval_generalization_guard` exits 0.
+6. `cargo test -p codestory-cli --test stdio_protocol_contracts` exits 0.
+7. `cargo test -p codestory-cli --test search_json_output` exits 0 for non-live fail-closed search contracts.
+8. `cargo test -p codestory-retrieval` exits 0.
+
+## Pins
+
+Match the [`docs/ops/retrieval-sidecars.md`](../ops/retrieval-sidecars.md) version table (real Zoekt,
+`qdrant/qdrant:v1.12.5`, generated SCIP graph artifacts).
+
+## Related tests (local substitute)
+
+```powershell
+node scripts/lint-retrieval-generalization.mjs
+cargo test -p codestory-cli --test retrieval_bootstrap_contracts
+cargo test -p codestory-runtime --lib
+cargo test -p codestory-runtime --test retrieval_generalization_guard
+cargo test -p codestory-cli --test stdio_protocol_contracts
+cargo test -p codestory-cli --test search_json_output
+cargo test -p codestory-retrieval
+```
+
+The workflow runs the lint script and focused test targets. The manifest-missing smoke lives in
+`retrieval_bootstrap_contracts` so Cargo builds the CLI through the integration-test path instead of
+paying for a standalone build step before the tests. The Rust cache is configured to save even on
+failure, which keeps failed follow-up pushes from repeatedly paying the full Windows cold-compile
+cost. `retrieval_generalization_guard` invokes the same lint from Rust for cross-platform CI parity.
+This smoke job does not claim stdio, CLI, or runtime full-mode success. Full readiness evidence
+requires a separate fixture run that starts real sidecars, provisions `bge-base-en-v1.5.Q8_0.gguf`,
+runs `retrieval index`, and verifies `retrieval_mode == "full"`. The live success contracts are
+intentionally outside the normal smoke gate: set
+`CODESTORY_STDIO_FULL_RETRIEVAL_TESTS=1` before running the stdio full-mode contracts with
+`-- --ignored --nocapture`, run
+`cargo test -p codestory-cli --test search_json_output -- --ignored --nocapture search_json_emits_sidecar_primary_results_without_repo_text_fallback`
+for the CLI lane, and run the ignored `retrieval_eval_*` tests with
+`CODESTORY_RETRIEVAL_EVAL_FULL_TESTS=1` only after the sidecar fixture is prepared. Without those
+preconditions, the live lanes are blocked/skipped by name rather than silently passing.
diff --git a/docs/contributors/testing-matrix.md b/docs/contributors/testing-matrix.md
index 1cc4326..aece0cc 100644
--- a/docs/contributors/testing-matrix.md
+++ b/docs/contributors/testing-matrix.md
@@ -68,7 +68,9 @@ cargo test -p codestory-runtime
 cargo test -p codestory-runtime --test retrieval_eval
 ```
 
-Run `retrieval_eval` when search or grounding quality may have changed.
+Run `retrieval_eval` when search or grounding quality may have changed. By default it verifies
+that plain indexing fails closed for sidecar-primary search. To run the full quality assertions,
+prepare real sidecars and set `CODESTORY_RETRIEVAL_EVAL_FULL_TESTS=1`.
 The repo-scale runtime integration test is ignored by default because it indexes the full
 `codestory` workspace and can exhaust memory on developer machines.
 Only run it as an explicit heavy lane:
@@ -87,7 +89,12 @@ cargo build --release -p codestory-cli
 cargo test -p codestory-cli --test codestory_repo_e2e_stats -- --ignored --nocapture
 ```
 
-Append the emitted headline metrics to `docs/testing/codestory-e2e-stats-log.md`. Include graph seconds, semantic seconds, semantic docs reused, semantic docs embedded, total index seconds, and whether `retrieval.semantic_ready` was true.
+The real-repo drill portion fails closed unless `CODESTORY_REAL_REPO_DRILL_CASES`
+points at a prepared manifest. Use `CODESTORY_ALLOW_SKIP_REAL_REPO_DRILL_CASES=1`
+only for intentional local stats-only rows; those rows are not real-drill release
+evidence.
+
+Append the emitted headline metrics to `docs/testing/codestory-e2e-stats-log.md`. Include graph seconds, semantic seconds, semantic docs reused, semantic docs embedded, total index seconds, `retrieval_index_seconds`, `retrieval_status_seconds`, and whether both `sidecar_status_after_retrieval_index` and `search.sidecar_shadow_retrieval_mode` reported `full`.
 
 For the current repo-scale baseline, use the latest row in
 [`codestory-e2e-stats-log.md`](../testing/codestory-e2e-stats-log.md). Older
diff --git a/docs/internal/README.md b/docs/internal/README.md
deleted file mode 100644
index 14becd2..0000000
--- a/docs/internal/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Internal Notes
-
-This folder is for planning artifacts, release gates, and operating notes that
-are useful to maintainers but should not be the public learning path.
-
-If a page explains how to use CodeStory today, keep it in the public docs. If it
-explains why future work is gated, what a sprint once intended, or how evidence
-must be promoted, it probably belongs here or in `docs/testing/`.
diff --git a/docs/internal/llm-default-codebase-browser-plan.md b/docs/internal/llm-default-codebase-browser-plan.md
deleted file mode 100644
index c9fc133..0000000
--- a/docs/internal/llm-default-codebase-browser-plan.md
+++ /dev/null
@@ -1,509 +0,0 @@
-# Internal Plan: Make CodeStory An LLM's Default Codebase Browser
-
-**Generated**: 2026-05-06
-**Estimated complexity**: High
-**Status**: historical planning artifact; status reviewed on 2026-05-24
-
-## Overview
-
-CodeStory already has the core substrate for an agent-facing codebase browser:
-local indexing, a SQLite-backed symbol/edge graph, semantic docs, grounding
-snapshots, search, symbol inspection, trails, snippets, DB-first `context`, a TUI
-`explore` path, HTTP routes, and MCP-style stdio serving.
-
-The next product step is not another isolated command. It is making those
-primitives act like one browsing layer that an LLM can use
-before reaching for ad hoc file reads.
-
-The reviewed direction is:
-
-1. Fix unsafe or drifting contracts first.
-2. Add fast always-on browser-path tests.
-3. Cleanly separate read-only browser services from CLI transports.
-4. Make stdio/MCP compatibility explicit and testable.
-5. Improve retrieval quality with a bounded target-context mode.
-6. Add freshness, setup, and performance trust signals.
-7. Improve the existing `explore` and evidence UX before creating a new UI surface.
-
-## Current State
-
-CodeStory's durable promise is strong:
-
-- `codestory-cli index` builds graph state, snapshots, lexical search state, and semantic docs.
-- Read commands default to `--refresh none`, which is the right posture for agent loops over a known cache.
-- `ground`, `search --why`, `context`, `symbol`, `trail`, `snippet`, `query`, `explore`, `doctor`, and `serve` already cover most browser primitives.
-- `serve --stdio` exposes tools, resources, resource templates, and prompts.
-- The architecture docs and contract tests preserve the intended crate split.
-- The repo-scale e2e stats gate already measures index/search/symbol/trail/snippet behavior.
-
-This plan is retained as design history, not as the current delivery backlog.
-The status of the original limitations is:
-
-| Area | Status | Current note |
-| --- | --- | --- |
-| DB-first browser contract | Completed, guarded | High-level retrieval no longer carries local external-agent execution controls; architecture and onboarding contracts protect the read-only boundary. |
-| `.codestory.toml` embedding mapping | Completed | `embedding_profile` and `embedding_model_id` map to runtime env names; legacy `embedding_model` remains a deprecated alias. |
-| Repo-local grounding skill refs | Completed, guarded | Command refs exist for the browser surfaces and onboarding tests check required reference shape. |
-| Trail-only DOT output | Completed, guarded | CLI help and command contracts keep DOT scoped to trail output. |
-| Fast browser golden path | Completed | `cli_golden_path` covers the small always-on index-then-browse loop. |
-| HTTP/stdio schema generation | Still open | Tool schemas and prompts are still handwritten in the CLI. |
-| `context` packet quality | Superseded by packet/search-plan work | `packet`, `search --why`, and structured follow-up commands now carry more of the agent handoff path. |
-| Freshness/profile mismatch signals | Partly complete | `doctor` and read outputs report retrieval/freshness state; continue improving where review evidence shows ambiguity. |
-| Large-repo performance evidence | Still open | Repo-scale and public-core rows exist, but 10k-100k file agent-loop evidence remains future work. |
-
-## Sprint 0: Safety, Drift, And Fast Tripwires
-
-**Goal**: make the current browser surface safer and harder to regress before adding new product behavior.
-
-**Demo/validation**
-
-- `cargo test -p codestory-contracts`
-- `cargo test -p codestory-cli --test onboarding_contracts`
-- `cargo test -p codestory-cli --test architecture_contracts`
-- `cargo test -p codestory-cli --test cli_golden_path`
-- `cargo test -p codestory-cli --test cli_error_contracts`
-
-### Task 0.1: Keep Context DB-First Everywhere
-
-- **Location**: `crates/codestory-contracts/src/api/dto.rs`, `crates/codestory-runtime/src/agent/orchestrator.rs`, CLI context tests.
-- **Description**: keep `AgentAskRequest` as a retrieval-only contract with no external local-agent execution controls.
-- **Acceptance criteria**:
-  - CLI `context` exposes no local-agent flags.
-  - `serve --stdio` context remains read-only and DB-first.
-  - Retrieval trace contains no local-agent execution step.
-- **Validation**:
-  - Add CLI/stdin tests proving context output has only retrieval-owned trace steps.
-
-### Task 0.2: Fix `.codestory.toml` Embedding Config Mapping
-
-- **Location**: `crates/codestory-cli/src/config.rs`, `README.md`, `docs/architecture/subsystems/cli.md`, `docs/contributors/getting-started.md`.
-- **Description**: make config keys map to runtime env names that actually control embeddings.
-- **Recommended shape**:
-  - Add `embedding_profile` -> `CODESTORY_EMBED_PROFILE`.
-  - Add `embedding_model_id` -> `CODESTORY_EMBED_MODEL_ID`.
-  - Keep legacy `embedding_model` as a deprecated alias for `embedding_model_id`.
-  - Stop setting only `CODESTORY_EMBEDDING_MODEL` unless runtime starts reading it.
-- **Acceptance criteria**:
-  - A copy-paste `.codestory.toml` example changes `doctor` output predictably.
-  - Docs explain precedence: explicit env vars win over config defaults; project config overrides home config.
-  - A config test covers profile, model id, legacy alias, and explicit env override behavior.
-- **Validation**:
-  - `cargo test -p codestory-cli config`
-  - `codestory-cli setup embeddings --project . --dry-run --format json`
-  - `codestory-cli doctor --project . --format json` in hash-mode, missing-managed-assets, and missing-llama modes.
-
-### Task 0.3: Repair Agent-Facing Skill Docs
-
-- **Location**: `.agents/skills/codestory-grounding/SKILL.md`, `.agents/skills/codestory-grounding/references/`.
-- **Description**: make the repo-local skill the canonical operational guide for agents.
-- **Acceptance criteria**:
-  - Remove stale crate names: `codestory-app`, `codestory-index`, `codestory-storage`.
-  - Add command refs for `context.md`, `doctor.md`, `explore.md`, and `serve.md`.
-  - Add a short "LLM default browser loop":
-    `doctor` -> `index` when needed -> `ground` -> `search --why` -> `symbol/trail/snippet/explore` -> `context` with citations.
-  - Each command ref includes one normal path, one failure path, and one integration edge.
-- **Validation**:
-  - Skill metadata validation with `quick_validate.py`.
-  - Add or extend CLI docs contract tests so command refs cannot drift silently.
-
-### Task 0.4: Add Fast Browser Golden Path
-
-- **Location**: `crates/codestory-cli/tests/cli_golden_path.rs`.
-- **Description**: add a tiny always-on temp Rust workspace test that proves the core browser loop without indexing the full repo.
-- **Fixture**:
-  - `src/lib.rs` with `AppController`, `open_project`, `run_indexing`.
-  - `src/runtime.rs` with a cross-file call.
-  - isolated `--cache-dir`.
-  - deterministic `CODESTORY_EMBED_RUNTIME_MODE=hash`.
-- **Acceptance criteria**:
-  - `index --refresh full --format json` succeeds.
-  - `doctor`, `ground`, `search`, `symbol`, `trail`, `snippet`, and `query` work with `--refresh none`.
-  - Read commands do not mutate the search directory.
-- **Validation**:
-  - `cargo test -p codestory-cli --test cli_golden_path`.
-
-### Task 0.5: Add Error Contract Tests
-
-- **Location**: `crates/codestory-cli/tests/cli_error_contracts.rs`.
-- **Description**: make common agent-facing failures actionable and stable.
-- **Acceptance criteria**:
-  - Read command without cache exits nonzero and includes a recovery command.
-  - Ambiguous query lists ranked alternatives or exact next steps.
-  - Missing output parent fails before runtime mutation.
-  - Non-`trail` `--format dot` is either absent from help or rejected by a tested pre-runtime error.
-- **Validation**:
-  - `cargo test -p codestory-cli --test cli_error_contracts`.
-
-## Sprint 1: Architecture Boundary Cleanup
-
-**Goal**: create a safe read-only browser boundary before publishing richer protocol metadata or UI surfaces.
-
-**Demo/validation**
-
-- `cargo test -p codestory-cli --test architecture_contracts`
-- `cargo test -p codestory-store`
-- targeted full/incremental refresh tests
-- `cargo check --all-targets`
-
-### Task 1.1: Move Refresh/Inventory Contracts Out Of Workspace Coupling
-
-- **Location**: `crates/codestory-contracts`, `crates/codestory-workspace`, `crates/codestory-store`, `crates/codestory-indexer`, `crates/codestory-runtime`.
-- **Description**: stop `codestory-store` from depending on `codestory-workspace` types.
-- **Acceptance criteria**:
-  - `codestory-store` no longer depends on `codestory-workspace`.
-  - Neutral refresh/inventory value types live in `codestory-contracts` or a tiny internal planning module.
-  - `index --dry-run`, full index, and incremental index preserve current `files_to_index` / `files_to_remove` behavior.
-- **Validation**:
-  - Add architecture test asserting store does not depend on workspace.
-  - Run existing workspace, store, indexer, and runtime incremental tests.
-
-### Task 1.2: Introduce A Runtime Read-Only Browser Service
-
-- **Location**: `crates/codestory-runtime/src/services.rs`, new runtime module such as `crates/codestory-runtime/src/browser.rs`.
-- **Description**: gather read-only browser operations behind a runtime-owned service used by CLI, HTTP, stdio, and future UI.
-- **Initial operations**:
-  - `search`
-  - `symbol`
-  - `definition`
-  - `references`
-  - `symbols`
-  - `trail`
-  - `snippet`
-  - `query`
-  - DB-first `context` packet
-- **Non-goals**:
-  - Do not include file writes, opening IDEs, opening folders, or OS actions.
-  - Do not move socket/stdin transport loops into runtime.
-- **Acceptance criteria**:
-  - CLI transport stays thin.
-  - Read-only capability boundary is explicit.
-  - Existing response shapes remain compatible.
-- **Validation**:
-  - Architecture tests that CLI does not construct browser business logic.
-  - HTTP/stdin regression tests for route/tool names and core JSON shapes.
-
-## Sprint 2: Protocol And Agent Integration Contracts
-
-**Goal**: make CodeStory's agent integration stable, discoverable, and safe for automatic use.
-
-**Demo/validation**
-
-- JSON-lines transcript tests for `serve --stdio`
-- HTTP parity smoke for `/search`, `/definition`, `/references`, `/symbols`, `/trail`
-- `cargo test -p codestory-cli`
-
-### Task 2.1: Add Stdio Transcript Compatibility Tests First
-
-- **Location**: `crates/codestory-cli/tests/stdio_protocol_contracts.rs`.
-- **Description**: test current and intended JSON-RPC/MCP-style behavior before changing metadata.
-- **Acceptance criteria**:
-  - `initialize` preserves request `id` and reports server info/capabilities.
-  - Unknown method, invalid JSON, bad args, and not-found errors return stable JSON-RPC-shaped errors.
-  - `tools/list`, `resources/list`, `resources/templates/list`, `prompts/list`, `resources/read`, and `tools/call` have transcript fixtures.
-- **Validation**:
-  - `cargo test -p codestory-cli --test stdio_protocol_contracts`.
-
-### Task 2.2: Create A Typed Tool/Resource/Prompt Catalog
-
-- **Location**: runtime read-only service or a small transport-neutral module; CLI renders it.
-- **Description**: replace handwritten loose schema generation with a single manifest/catalog.
-- **Acceptance criteria**:
-  - Tool names remain stable.
-  - Input schemas include required fields, enum values, defaults, and bounds.
-  - Output schemas exist for core tools where stable DTOs already exist.
-  - Future write/system tools cannot appear in the read-only catalog without explicit safety metadata.
-- **Validation**:
-  - Snapshot tests for tool/resource/prompt catalog.
-  - Tests comparing catalog command list to browser service operations.
-
-### Task 2.3: Add Safety Metadata And Resource Links
-
-- **Location**: catalog/rendering module, stdio result wrappers.
-- **Description**: make safe automatic use easy for agents.
-- **Acceptance criteria**:
-  - All read-only tools include annotations such as read-only, non-destructive, idempotent, and local-only/open-world false where supported.
-  - `search` and `definition` results expose `codestory://symbol/{node_id}`, snippet, references, and trail links.
-  - `codestory://status` reports project root, cache path, retrieval mode, semantic readiness, fallback reason, and recommended next calls.
-  - `codestory://agent-guide` describes the default browser loop.
-- **Validation**:
-  - `tools/list` snapshot asserts annotations.
-  - Resource read tests for `status` and `agent-guide`.
-  - Tool call tests assert continuation links and payload-size limits.
-
-### Task 2.4: Keep HTTP And Stdio Aligned
-
-- **Location**: CLI transport layer, shared route/tool descriptors.
-- **Description**: prevent route defaults from diverging between HTTP and stdio.
-- **Acceptance criteria**:
-  - `/definition`, `/references`, `/symbols`, `/trail` share default limits/depth semantics with stdio tools.
-  - Existing HTTP paths remain stable.
-- **Validation**:
-  - Handler descriptor tests.
-  - One HTTP smoke against an indexed temp repo.
-
-## Sprint 3: Retrieval Quality And Target Context
-
-**Goal**: make `context` gather deep evidence around concrete integration and architecture anchors instead of hoping a single search query hits.
-
-**Demo/validation**
-
-- `cargo test -p codestory-runtime --test retrieval_eval`
-- new retrieval golden tests
-- CLI `context` JSON/Markdown snapshot tests
-
-### Task 3.1: Build Retrieval Golden Fixtures Before Changing Context
-
-- **Location**: `crates/codestory-runtime/tests/retrieval_browser_contracts.rs`.
-- **Description**: create deterministic fixtures for the browser investigations CodeStory must ground.
-- **Cases**:
-  - exact symbol query
-  - exact file/literal query
-  - broad integration question decomposed into concrete search anchors
-  - ambiguous symbol requiring alternatives
-  - graph/snippet expansion
-  - stale index warning
-  - no-hit query with suggestions and explicit gaps
-- **Acceptance criteria**:
-  - Tests assert citations, selected focus, trace steps, and gap reporting.
-  - Hash embedding mode gives deterministic results.
-- **Validation**:
-  - `cargo test -p codestory-runtime --test retrieval_browser_contracts`.
-
-### Task 3.2: Make Bounded Context The Default
-
-- **Location**: `crates/codestory-cli/src/args.rs`, `crates/codestory-contracts/src/api/dto.rs`, `crates/codestory-runtime/src/agent/orchestrator.rs`, `crates/codestory-runtime/src/agent/profiles.rs`.
-- **Description**: make the deep retrieval path the default for `context`, with no public lightweight/deep split.
-- **Behavior**:
-  - Initial search with current ranking.
-  - Query expansion or exact-symbol/file fallback when first hits are weak.
-  - Bounded graph expansion.
-  - Bounded snippet/source reads.
-  - Citations and "what I checked" trace.
-  - Explicit gaps when confidence is low.
-- **Hard limits**:
-  - Respect latency budget before expensive trail/source phases.
-  - Cap default trail nodes and source bytes.
-  - Keep investigation inside CodeStory's indexed retrieval layer.
-- **Acceptance criteria**:
-  - Integration targets that currently miss relevant symbols return cited hits.
-  - Trace proves multiple retrieval steps only when needed.
-  - `context` stays target-first and does not accept broad question prompts.
-- **Validation**:
-  - Golden target tests.
-  - `context --format json` trace assertions.
-  - Warm latency checks under the performance thresholds.
-
-### Task 3.3: Improve Target Resolution UX
-
-- **Location**: `crates/codestory-cli/src/runtime.rs`, target selection DTOs, CLI renderers.
-- **Description**: reduce ambiguous-query dead ends.
-- **Acceptance criteria**:
-  - Ambiguous results include numbered alternatives and stable node refs.
-  - Add `--choose <N>` or equivalent only if it can be made deterministic without hidden session state.
-  - JSON includes enough data for agents to resolve by id on the next call.
-- **Validation**:
-  - Ambiguous symbol CLI tests.
-  - No silent auto-pick when ranks tie.
-
-### Task 3.4: Redesign Evidence Packet Output
-
-- **Location**: `crates/codestory-cli/src/output.rs`, context renderers, search/ground explanations.
-- **Description**: make Markdown outputs easier for humans and LLMs to consume.
-- **Suggested structure**:
-  - context summary or short finding
-  - confidence
-  - what was checked
-  - gaps/uncertainty
-  - citations
-  - next useful commands
-- **Acceptance criteria**:
-  - Full trace remains in JSON/bundles.
-  - Markdown never hides fallback reasons or low-confidence state.
-- **Validation**:
-  - Snapshot tests for `context`, `search --why`, and `ground --why`.
-
-## Sprint 4: Operational Trust And Performance Evidence
-
-**Goal**: expose the state that agents need to know: freshness, retrieval readiness, semantic profile, and warm-loop performance.
-
-**Demo/validation**
-
-- `doctor` reports useful cache/profile/fallback/freshness state.
-- warm stdio benchmark produces p50/p95/p99.
-- repo-scale e2e stats remain the promotion gate.
-
-### Task 4.1: Add Embedding Profile Contract And Doctor Warnings
-
-- **Location**: `crates/codestory-runtime/src/search/engine.rs`, semantic doc metadata, `doctor` DTO/output.
-- **Description**: represent embedding profile/backend/doc-shape as a stable runtime contract.
-- **Acceptance criteria**:
-  - Stored semantic docs report profile/model/backend/dimension/doc-shape enough to explain reuse or rebuild.
-  - `doctor` warns when stored docs and current env/config disagree.
-  - Missing managed ONNX assets and external legacy llama.cpp endpoint failures remain clear fallbacks, not silent degradation.
-- **Validation**:
-  - hash backend normal path
-  - fake llama.cpp path
-  - missing endpoint failure path
-  - profile mismatch warning
-
-### Task 4.2: Add Index Freshness Signal
-
-- **Location**: runtime project/search/context DTOs, workspace inventory check, `doctor`, `serve --stdio` status resource.
-- **Description**: make stale caches visible without mutating read commands.
-- **Acceptance criteria**:
-  - Freshness check is bounded and read-only.
-  - It reports changed/new/removed counts or "not checked" with reason.
-  - Read commands do not refresh implicitly.
-- **Validation**:
-  - Temp fixture where a file changes after indexing.
-  - Freshness p95 under 250 ms for small repos.
-
-### Task 4.3: Measure Warm `serve --stdio` Agent Loop
-
-- **Location**: CLI stdio test harness or bench, `docs/testing/codestory-e2e-stats-log.md` or a new warm-loop stats doc.
-- **Description**: measure the actual persistent-session shape agents should use.
-- **Metrics**:
-  - startup ms
-  - first tool ms
-  - warm p50/p95/p99 per tool
-  - response bytes
-  - semantic reload ms
-  - fallback reason
-  - search dir unchanged
-- **Acceptance criteria**:
-  - Metrics do not pollute stdout protocol.
-  - Initial report compares warm stdio to cold one-shot CLI timings.
-- **Validation**:
-  - transcript: initialize -> tools/list -> search -> symbol -> trail -> snippet -> resources/read.
-
-### Task 4.4: Add Hard Caps Before Bigger Bundles
-
-- **Location**: repo-text search, context retrieval, future bundle/context tools.
-- **Description**: reduce large-repo footguns before introducing higher-level bundle tools.
-- **Caps**:
-  - repo-text scanned files/bytes/time
-  - bundle output bytes
-  - default context trail nodes
-  - source snippet bytes
-- **Acceptance criteria**:
-  - Truncation is explicit and actionable.
-  - Caps are visible in `--why`, JSON, or retrieval trace.
-- **Validation**:
-  - Large low-match repo-text fixture.
-  - High-fanout trail fixture.
-
-### Task 4.5: Add Stress Lanes Only After Metrics Exist
-
-- **Location**: `crates/codestory-bench`.
-- **Description**: create large-repo stress benches after the warm-loop counters are stable.
-- **Scenarios**:
-  - 1k, 10k, 100k synthetic file sets
-  - high-degree graph nodes
-  - repo-text `auto/on/off`
-  - trail depths 2/4/6
-  - stdio/HTTP concurrency 1/4/16
-- **Acceptance criteria**:
-  - Promotion thresholds documented.
-  - Synthetic results are not treated as real-world proof without at least one real repo run.
-
-## Sprint 5: Delight UX On The Existing Surface
-
-**Goal**: improve the existing browser flow without creating duplicate UI surfaces prematurely.
-
-**Demo/validation**
-
-- `explore` flow improves for keyboard-first navigation.
-- No new `browse` command until its distinction from `explore` is clear.
-- Accessibility and text-equivalent review for any graph-heavy UI.
-
-### Task 5.1: Improve `explore` Before Adding `browse`
-
-- **Location**: `crates/codestory-cli/src/main.rs`, explore rendering/TUI modules if split.
-- **Description**: evolve the current TUI into the default browser path.
-- **Acceptance criteria**:
-  - Project/status pane shows retrieval mode, fallback, freshness, and next useful command.
-  - Search/results/detail/trail/snippet panes are keyboard reachable.
-  - Empty/error states preserve the failed layer: cache, index, semantic runtime, query resolution, output write.
-- **Validation**:
-  - Keyboard-only TUI pass.
-  - JSON/Markdown fallback pass with `--no-tui`.
-
-### Task 5.2: Add Bookmarks As Investigation State
-
-- **Location**: existing bookmark store/runtime surfaces, CLI commands or explore actions.
-- **Description**: expose saved focus sets for repeated investigations.
-- **Acceptance criteria**:
-  - Add/list/remove bookmarks.
-  - `context` or `trail` can use bookmark context if explicitly requested.
-  - Stale bookmarks after reindex degrade gracefully.
-- **Validation**:
-  - CRUD tests.
-  - Reindex stale-node behavior.
-
-### Task 5.3: Add Trail Story Mode
-
-- **Location**: trail renderers and runtime trail DTOs.
-- **Description**: provide a readable narrative of graph paths.
-- **Acceptance criteria**:
-  - Entry points, core flow, side effects, uncertain edges, and tests included/excluded are explicit.
-  - Uncertainty is textual, not only color or graph styling.
-- **Validation**:
-  - Trail fixtures with certain/probable/speculative edges.
-  - Markdown snapshot tests.
-
-### Task 5.4: Defer Web Cockpit Until Contracts Are Stable
-
-- **Description**: only add a separate web UI after read-only service, protocol catalog, status/freshness, and warm-loop telemetry are stable.
-- **Acceptance criteria for starting web work**:
-  - Tool/resource manifest stable.
-  - Warm p95 thresholds are met.
-  - Existing `explore` experience proves the browser workflow.
-  - Screenshot-visible review loop is planned before implementation.
-
-## Suggested First Three PRs
-
-### PR 1: Trust Foundations
-
-- Fix the high-level retrieval request default.
-- Add serde omission tests.
-- Fix `.codestory.toml` embedding config mapping.
-- Add config precedence tests.
-- Update README/CLI docs for config mapping.
-
-### PR 2: Agent Docs And Fast Browser Tests
-
-- Repair `codestory-grounding` skill freshness rules.
-- Add `context`, `doctor`, `explore`, `serve` refs.
-- Add command-reference drift tests.
-- Add `cli_golden_path.rs`.
-- Add `cli_error_contracts.rs`.
-
-### PR 3: Read-Only Browser Boundary
-
-- Move refresh/inventory shared types out of workspace-store coupling.
-- Add architecture guard for store not depending on workspace.
-- Introduce `ReadOnlyBrowserService`.
-- Keep CLI transport loops in CLI.
-- Preserve route/tool names and response shapes.
-
-## Review Risks
-
-- **Protocol overreach**: do not freeze a rich manifest until service boundaries are clean.
-- **UI duplication**: improve `explore` first; defer `browse` and a separate web UI.
-- **Latency waterfall**: deep `context` must be budgeted before graph/source phases.
-- **Repo-text I/O**: add global caps before repo-text participates in high-level bundles.
-- **Config churn**: support legacy `embedding_model` while introducing precise `embedding_profile` and `embedding_model_id`.
-- **Telemetry sprawl**: retrieval state already reports several useful fields; add only counters that explain current blind spots.
-- **Large-repo claims**: CodeStory repo stats are useful but small; do not claim large-monorepo readiness until stress lanes exist.
-
-## Completion Definition
-
-CodeStory is credibly acting as an LLM's default codebase browser when:
-
-- an agent can discover and use the read-only browser loop from the repo-local skill or stdio resources;
-- missing cache, stale cache, semantic fallback, ambiguous symbols, and unsupported format cases produce actionable output;
-- `context` can gather evidence for real integration anchors with cited symbols, snippets, trails, and explicit gaps;
-- MCP/stdio clients receive stable schemas, read-only annotations, JSON-RPC-shaped errors, and continuation resource links;
-- warm stdio/browser-loop p95 timings are measured and bounded;
-- repo-scale and stress-lane gates protect index/search/trail/snippet behavior before releases;
-- `explore` provides a useful browser-style flow without requiring a separate web app.
diff --git a/docs/ops/retrieval-sidecars.md b/docs/ops/retrieval-sidecars.md
new file mode 100644
index 0000000..f928d45
--- /dev/null
+++ b/docs/ops/retrieval-sidecars.md
@@ -0,0 +1,379 @@
+# Retrieval sidecars — Operations runbook
+
+This runbook covers the local Zoekt, Qdrant, and SCIP indexer processes used for sidecar packet retrieval.
+Data directories live under the CodeStory user cache; default ports are fixed for local dev and CI smoke runs.
+
+**Design reference:** [`retrieval-design.md`](../architecture/retrieval-design.md)
+(sidecar pins, degraded modes, preflight).
+
+---
+
+## Prerequisites
+
+- Rust toolchain with `cargo` (primary path)
+- Docker Desktop or Docker Engine for automated Qdrant, Zoekt, and embed sidecars
+- Optional: Node.js 18+ for `scripts/setup-retrieval-env.mjs` wrapper
+- Manual sidecar path: Zoekt webserver on `6070` and Qdrant on `6333` without Docker; all sidecars must still be healthy before agent-facing retrieval is valid
+- Network: localhost only for sidecars; holdout clone needs outbound git
+
+---
+
+## Quick start: one command
+
+From the CodeStory repository root (Windows, macOS, Linux):
+
+```sh
+cargo retrieval-setup
+```
+
+Plain `codestory-cli index` builds the core SQLite code index only. It does not
+generate sidecar artifacts or prove retrieval readiness. Use
+`codestory-cli retrieval index --project <repo>` to generate Zoekt, Qdrant, and
+SCIP sidecar artifacts, then use `codestory-cli retrieval status --project
+<repo> --format json` to verify `retrieval_mode: "full"` before using
+agent-facing packet/search evidence.
+
+Status after bootstrap:
+
+```sh
+cargo retrieval-status
+```
+
+Aliases are defined in [`.cargo/config.toml`](../../.cargo/config.toml). They run
+`codestory-cli retrieval bootstrap --project .` and `codestory-cli retrieval status --project .`,
+building the CLI when needed.
+
+**Bootstrap flags** (via `cargo run -p codestory-cli -- retrieval bootstrap ...`):
+
+| Flag | Purpose |
+|------|---------|
+| `--skip-compose` | Cache dirs + state file only; use only when equivalent local sidecars are already running |
+| `--wait-secs <n>` | Health wait timeout (default `90`; `0` = no wait) |
+| `--compose-file <path>` | Override `docker/retrieval-compose.yml` |
+
+**Optional Node wrapper** (prerequisite checks, optional holdout clone):
+
+```sh
+node scripts/setup-retrieval-env.mjs
+node scripts/setup-retrieval-env.mjs --check-only
+node scripts/setup-retrieval-env.mjs --skip-compose
+node scripts/setup-retrieval-env.mjs --with-holdout-clone
+```
+
+| Wrapper flag | Purpose |
+|------|---------|
+| `--check-only` | Prerequisites report only; exit 1 if required tools missing |
+| `--skip-compose` | Passed to bootstrap |
+| `--skip-build` | Skip `cargo build` (alias still builds on first `cargo retrieval-setup`) |
+| `--with-holdout-clone` | Also run `scripts/fetch-holdout-repos.mjs` (large git clones under `target/`) |
+
+**Direct CLI** (equivalent to alias):
+
+```sh
+cargo run -p codestory-cli -- retrieval bootstrap --project .
+```
+
+Compose file: [`docker/retrieval-compose.yml`](../../docker/retrieval-compose.yml). Env template:
+[`docker/retrieval.env.example`](../../docker/retrieval.env.example).
+
+---
+
+## Version pin policy
+
+| Dependency | Pin policy | Pinned version | Notes |
+|------------|------------|----------------|-------|
+| Zoekt real (Phase 2) | `COMPOSE_PROFILES=real` | `zoekt-20250506123554` | `sourcegraph/zoekt-webserver:0.0.0-20250506123554-490422d1adb4` + lexical shards |
+| Qdrant | Fixed container image tag | `qdrant/qdrant:v1.12.5` | HTTP `6333`, gRPC `6334` |
+| SCIP | CodeStory graph artifact emitter | `graph-<hash>` | Generated local graph artifacts under the sidecar generation |
+
+Update this table when production Zoekt/SCIP toolchains are wired. CI `retrieval-sidecar-smoke` must use the
+same pins as local dev.
+
+---
+
+## Ports and data directories
+
+| Service | Default port | Data dir (Windows) |
+|---------|--------------|---------------------|
+| Zoekt web/search | `6070` | `%LOCALAPPDATA%\codestory\cache\zoekt\` |
+| Qdrant HTTP | `6333` | `%LOCALAPPDATA%\codestory\cache\qdrant\` |
+| Qdrant gRPC | `6334` | same |
+| SCIP artifacts | n/a (files) | `%LOCALAPPDATA%\codestory\cache\scip\<sidecar-generation>\` |
+| Sidecar state | n/a | `%LOCALAPPDATA%\codestory\cache\retrieval-sidecars.json` |
+
+Override ports with `CODESTORY_ZOEKT_PORT`, `CODESTORY_QDRANT_HTTP_PORT`, `CODESTORY_QDRANT_GRPC_PORT`.
+
+Project id is a stable FNV-1a hex hash of the canonical repo root (same scheme as CLI cache hashing).
+Sidecar artifacts are content-addressed by `sidecar_generation = <project-id>-<input-hash-prefix>`.
+The hash covers the local lexical input, symbol projection rows, semantic file roles, embedding
+backend/dim, and sidecar schema version. Re-running `retrieval index` with unchanged inputs validates
+the live generation and reuses it instead of rewriting Zoekt, Qdrant, or SCIP.
+
+`retrieval status` and `retrieval query` fail closed when the manifest is obsolete or stale. A valid
+manifest must include the current sidecar schema version, input hash, derived generation id, derived
+Qdrant collection, and matching stored semantic-doc vector count. If the SQLite projection or stored
+semantic-doc contract changes after the manifest is written, rerun `retrieval index`; runtime paths
+will not infer or reuse bare project-id sidecars.
+`retrieval index --refresh auto` repairs stale stored semantic-doc contracts by retrying once with a
+full refresh when finalization detects that the manifest would be unavailable immediately. Explicit
+`--refresh none` and failed explicit refreshes still fail closed instead of serving degraded sidecars.
+
+Confirm bindings with:
+
+```powershell
+.\target\release\codestory-cli.exe retrieval status --project .
+```
+
+---
+
+## CLI workflow
+
+### Bootstrap (recommended: Compose + cache dirs + wait)
+
+```powershell
+cargo build --release -p codestory-cli
+.\target\release\codestory-cli.exe retrieval bootstrap --project .
+```
+
+Starts `docker/retrieval-compose.yml` when Docker is available (`qdrant/qdrant:v1.12.5`, Zoekt
+webserver on `6070`, and llama.cpp embeddings on `8080`), writes
+`retrieval-sidecars.json`, and waits for Zoekt, Qdrant, and llama.cpp embedding HTTP probes.
+Bootstrap removes stale pre-mandatory `codestory-zoekt-stub` containers before starting the
+real sidecars. It discovers the embed model directory from `CODESTORY_EMBED_MODEL_DIR`,
+`target/retrieval-models`, or `models/gguf/bge-base-en-v1.5` when the GGUF file is present.
+The embed service uses the measured local request geometry (`-np 6`, `-b 1024`, `-ub 1024`).
+Qdrant document vectors are copied from the already-managed local `llm_symbol_doc` semantic
+document table when the stored embedding contract is the product BGE base profile
+(`bge-base-en-v1.5`, 768 dimensions, ONNX or llama.cpp backend). The llama.cpp sidecar remains
+mandatory for query embeddings and live semantic smoke checks, but cold sidecar indexing must not
+re-embed the whole stored semantic corpus just to populate Qdrant.
+Qdrant query-time search uses the current Query API
+`POST /collections/{collection}/points/query` and requires `result.points[]` in the response;
+older search response shapes are treated as contract drift. Exact symbol queries are served from
+exact sidecar evidence first: once SCIP or lexical stages produce an exact symbol anchor, semantic
+and graph expansion lanes are skipped for that query instead of letting broad semantic evidence
+displace the exact hit.
+
+Before Compose starts, bootstrap repairs Qdrant storage:
+
+1. **Protection scan** — builds a protected set from:
+   - every `codestory.db` under the default user cache root (hashed subdirs),
+   - the active `--cache-dir` tree when it differs from the default root (flat `codestory.db` or hashed subdirs),
+   - the active project `storage_path` when not already scanned.
+   Only manifest-recorded generated collections are protected; bare project-id collections are obsolete diagnostics and may be pruned.
+   Unreadable cache roots or DBs are recorded as `storage_repair.scan_errors` in bootstrap output; repair continues with partial protection (bootstrap does **not** abort solely because a cache tree could not be read).
+
+2. **Offline cleanup (Qdrant unreachable only)** — when the Qdrant HTTP probe fails, invalid `collections/codestory_*` dirs without Qdrant config are removed and obsolete in-collection stub markers are migrated to `codestory-stub-markers/`. When Qdrant is reachable, these on-disk steps are skipped to avoid races with a live server. Non-`codestory_*` collection dirs are never deleted by this step.
+
+3. **Retention (fail-closed on scan errors)** — `codestory_*` collections in excess of the cap (64) may be pruned, but only from among **unprotected** collections, ranked by manifest `built_at_epoch_ms` when known, else by directory mtime. When every collection is protected but the count exceeds the cap, bootstrap sets `overflow_protected=true` and prunes nothing. When `storage_repair.scan_errors` is non-empty, **all** retention deletes are skipped (`pruned_collections=0`) and `prune_suppressed_reason` is set to `protection_scan_error` unless `CODESTORY_RETRIEVAL_PRUNE_ON_SCAN_ERROR=1` (default off).
+
+While Qdrant is reachable, pruning uses HTTP `DELETE /collections/{name}`; when offline, stale collection dirs are removed on disk.
+
+### Start sidecars (data dirs + state file only)
+
+```powershell
+.\target\release\codestory-cli.exe retrieval up
+```
+
+Does **not** start Docker. Use `retrieval bootstrap` or the setup script for automated Compose.
+
+### Health check
+
+```powershell
+.\target\release\codestory-cli.exe retrieval status --project .
+```
+
+JSON includes per-component `status`, `latency_ms`, `detail`, `capabilities` flags
+(`lexical`, `semantic`, `graph`), and top-level `retrieval_mode`
+(`full`, `no_scip`, `no_semantic`, `lexical_only`, `unavailable`). Only `full`
+is allowed to serve agent-facing retrieval.
+
+| Component | Healthy when |
+|-----------|--------------|
+| zoekt | HTTP reachable on `6070`, real shard dir (no `.zoekt-stub` marker) |
+| qdrant | collection exists, no stub marker under `{qdrant_data_dir}/codestory-stub-markers/{collection}.qdrant-stub` (obsolete `collections/{collection}/.qdrant-stub` also counts as stubbed), reported point count is at least the manifest projection count when available, and semantic smoke search returns repo-relative paths |
+| scip | `symbols.index.json`, `index.scip`, and non-empty `revision.txt` exist under the manifest generation, with no `index.scip.stub` |
+
+### Mandatory sidecars
+
+The default `docker/retrieval-compose.yml` stack starts the required product sidecars directly.
+Historical compose-profile overrides and hash-vector modes are rejected by product bootstrap/index
+paths. Stubbed, hash-vector, or partial sidecars report a non-`full` mode and fail closed for agent-facing packet/search
+paths. Sidecar-primary packet runs require a project-scoped lexical shard, live llama.cpp
+semantic state, and SCIP graph artifacts. `CODESTORY_RETRIEVAL=0` is unsupported.
+
+**Phase 2 (shipped in crate):**
+
+| Component | Status |
+|-----------|--------|
+| Zoekt | `retrieval index` builds `lexical-index.jsonl` shards for the active sidecar generation; client searches the manifest generation |
+| Qdrant | 768-d bge-base vectors copied from stored local semantic docs are mandatory; `semantic=true` only after smoke search succeeds against the manifest collection and manifest records the product embedding backend |
+| SCIP | Graph symbols emitted to `symbols.index.json` + `index.scip` under the active sidecar generation from the full SQLite symbol projection |
+
+### Real embeddings (bge-base-en-v1.5 + llama.cpp)
+
+Promotion uses **768-d** vectors. Qdrant document vectors come from stored semantic docs produced by
+the managed local embedding runtime. Query vectors still come from the local llama.cpp sidecar so
+retrieval remains sidecar-backed and can smoke-test the live collection. Hash projection is
+diagnostic only and never produces `retrieval_mode=full`.
+
+1. Download GGUF (once): `node scripts/setup-retrieval-env.mjs --fetch-embed-model`
+2. Export (see [`docker/retrieval.env.example`](../../docker/retrieval.env.example)):
+   - `CODESTORY_EMBED_MODEL_DIR=<repo>/target/retrieval-models`
+   - `CODESTORY_EMBED_BACKEND=llamacpp`
+   - `CODESTORY_EMBED_LLAMACPP_URL=http://127.0.0.1:8080/v1/embeddings`
+3. `cargo retrieval-setup` (starts Qdrant, Zoekt webserver, `codestory-embed` on `:8080`)
+4. Dim smoke: `curl -s http://127.0.0.1:8080/v1/embeddings -H "Content-Type: application/json" -d "{\"input\":[\"function\"]}"` → embedding length **768**
+5. `retrieval index --project <repo> --refresh full` (manifest records `embedding_backend`, `embedding_dim`, `sidecar_input_hash`, `sidecar_generation`, and the generated Qdrant collection; the input hash includes stored semantic-doc metadata and embedding contract)
+6. `retrieval status` → `retrieval_mode: full` and `capabilities.semantic=true`
+
+Wrong model dim with `CODESTORY_EMBED_BACKEND=llamacpp` fails loudly (no hash substitution).
+
+### Index project
+
+```powershell
+.\target\release\codestory-cli.exe retrieval index --project . --refresh auto
+```
+
+Runs workspace index (same as `codestory index`) then persists `retrieval_index_manifest` in
+`codestory.db`. Zoekt, Qdrant, and SCIP are mandatory; missing sidecars or empty sidecar
+artifacts fail the command instead of writing stub markers.
+
+Index finalization writes new generations instead of mutating the manifest generation in place:
+
+- Zoekt shard: `zoekt/shards/<sidecar-generation>/`
+- Qdrant collection: `codestory_<project-id>_<input-hash-prefix>`
+- SCIP artifacts: `scip/<sidecar-generation>/`
+
+The manifest is updated only after the generated sidecars are emitted. If the manifest hash,
+schema version, projection count, embedding backend/dim, and live health still match, finalization
+returns the existing manifest and skips the rebuild path. This is the intended fast path for
+iterative evidence loops with `--refresh none` after a successful generation build.
+
+If a previous `retrieval index` attempt emitted generated artifacts but failed before manifest
+persist, finalization probes the would-be generation before rebuilding. Healthy Zoekt shards,
+complete Qdrant collections, and SCIP artifacts are reused independently. Qdrant reuse requires the
+collection's exact point count to be at least the current stored semantic-doc vector count; a
+one-point or otherwise partial collection is rebuilt instead of being blessed by semantic smoke alone.
+
+### Stop sidecars (state file only)
+
+```powershell
+.\target\release\codestory-cli.exe retrieval down
+```
+
+### Standalone query (Phase 2+)
+
+```powershell
+.\target\release\codestory-cli.exe retrieval query "ExtensionService" --project .
+```
+
+---
+
+## Preflight smoke contract
+
+Use the full sequence locally before index/query changes. The CI job
+`retrieval-sidecar-smoke` (Windows) runs the reduced manifest-missing shape contract
+below because full index/query on the monorepo can exceed runner budgets and CI does not fetch the
+GGUF embedding model.
+
+**Local full sequence:**
+
+1. `retrieval up` - exit 0
+2. `retrieval status` - JSON with expected shape; non-`full` status is a failure for agent use
+3. `retrieval index --project <fixture>` - manifest row in SQLite only when all sidecars are real
+4. `retrieval query "<smoke query>"` - Phase 2+
+5. `retrieval down` - clean shutdown
+
+**CI reduced sequence:**
+
+1. generalization lint - exit 0
+2. release `codestory-cli` build - exit 0
+3. `retrieval bootstrap --project . --skip-compose --wait-secs 0` - exit 0
+4. `retrieval status --project .` - JSON reports the clean pre-index
+   `degraded_reason == "retrieval_manifest_missing"` state and must not report `retrieval_mode=full`
+5. `cargo test -p codestory-runtime --lib` - exit 0
+6. `cargo test -p codestory-runtime --test retrieval_generalization_guard` - exit 0
+7. `cargo test -p codestory-cli --test stdio_protocol_contracts` - exit 0
+8. `cargo test -p codestory-cli --test search_json_output` - exit 0 for non-live fail-closed search contracts
+9. `cargo test -p codestory-retrieval` - exit 0
+
+The reduced CI sequence is a manifest-missing shape check only. It creates local cache/state
+directories and verifies status JSON plus runtime/stdio/search/retrieval contracts, but it does
+not start sidecars, fetch `bge-base-en-v1.5.Q8_0.gguf`, or build the project manifest required for
+`retrieval_mode=full`. The included `search_json_output` suite covers non-live fail-closed search
+behavior; it does not claim stdio, CLI, or runtime full-mode success. Full-mode gates must start
+real sidecars, provision the GGUF model, index a fixture or target workspace, and verify
+`retrieval_mode == "full"`. The live full-mode contracts are ignored or env-gated by default and
+should be run explicitly only after those dependencies are prepared: set
+`CODESTORY_STDIO_FULL_RETRIEVAL_TESTS=1` and run the stdio full-mode contracts
+with `-- --ignored --nocapture`; run
+`cargo test -p codestory-cli --test search_json_output -- --ignored --nocapture search_json_emits_sidecar_primary_results_without_repo_text_fallback`;
+and run the ignored `retrieval_eval_*` tests with `CODESTORY_RETRIEVAL_EVAL_FULL_TESTS=1`.
+
+**Failure policy:** PRs touching `codestory-retrieval` or sidecar wiring fail CI if smoke fails.
+
+**Holdout prefetch (benchmark harness, not sidecar CLI):**
+
+```powershell
+node scripts/codestory-agent-ab-benchmark.mjs `
+  --list --task-suite holdout-retrieval --materialize-repos
+```
+
+Clones land in `target/agent-benchmark/repos/` (gitignored).
+
+## Troubleshooting
+
+| Symptom | Likely cause | Action |
+|---------|--------------|--------|
+| `retrieval up` port in use | stale process | `retrieval down`; check Task Manager / `docker ps` |
+| Zoekt unhealthy, unreachable | server not started | start Zoekt on `6070` and rebuild the project shard |
+| Qdrant unhealthy | wrong image tag / volume permissions | `docker run -p 6333:6333 qdrant/qdrant:v1.12.5` |
+| SCIP `scip_unavailable` | graph artifacts missing | fix SCIP emission before using agent-facing retrieval |
+| Smoke > 100ms / 200ms | cold cache or oversized fixture | retry; check tier envelope |
+
+---
+
+## Mandatory sidecar modes (operator view)
+
+When a sidecar is down, the sidecar executor selects a non-`full` mode per the design matrix — see
+[`retrieval-design.md`](../architecture/retrieval-design.md#mandatory-sidecar-mode-matrix).
+Non-`full` modes are diagnostic only and fail closed for product packet/search paths.
+
+| Condition | User-visible mode | Action |
+|-----------|-------------------|--------|
+| Zoekt down | `unavailable` | Fix Zoekt; no product query should run |
+| Qdrant down, Zoekt up | `no_semantic` or `lexical_only` | Fix Qdrant; no product query should run |
+| SCIP down | `no_scip` | Fix SCIP artifacts; no product query should run |
+
+Traces must include `retrieval_mode` and `degraded_reason`.
+
+---
+
+## Environment variables
+
+| Variable | Purpose |
+|----------|---------|
+| `CODESTORY_RETRIEVAL` | unset or `1` uses mandatory sidecar primary when mode is `full`; non-`full` modes fail closed; `0` is unsupported |
+| `CODESTORY_RETRIEVAL_SHADOW` | Historical diagnostic trace switch; unsupported in product benchmarks |
+| `CODESTORY_RETRIEVAL_REAL_EMBEDDINGS` | defaults to `1`; `0` is unsupported for product indexing or packet/search evidence |
+| `CODESTORY_EMBED_BACKEND` | unset/default product mode, `llamacpp`, or `llama_cpp` for sidecar query embeddings; explicit `onnx` is non-product for sidecar retrieval and cannot finalize/report full product mode |
+| `CODESTORY_EMBED_LLAMACPP_URL` | local OpenAI-compatible llama.cpp embedding endpoint (default `http://127.0.0.1:8080/v1/embeddings`) |
+| `CODESTORY_EMBED_MODEL_DIR` | Host directory containing `bge-base-en-v1.5.Q8_0.gguf` for the compose `embed` service |
+| `CODESTORY_EMBED_PORT` | llama.cpp server port (default `8080`) |
+| `CODESTORY_RETRIEVAL_COMPOSE_PROFILE` | `real` by default; every other value is unsupported for product bootstrap |
+| `CODESTORY_ZOEKT_ENABLED` | on by default; `0` is unsupported for product retrieval |
+| `CODESTORY_QDRANT_ENABLED` | on by default; `0` is unsupported for product retrieval |
+| `CODESTORY_ZOEKT_PORT` | Zoekt HTTP port (default `6070`) |
+| `CODESTORY_QDRANT_HTTP_PORT` | Qdrant HTTP (default `6333`) |
+| `CODESTORY_QDRANT_GRPC_PORT` | Qdrant gRPC (default `6334`) |
+| `CODESTORY_RETRIEVAL_PRUNE_ON_SCAN_ERROR` | `1` allows retention deletes despite protection-scan errors (default off; fail-closed prune) |
+| `CODESTORY_EVAL_PROBES` | Test-only benchmark probe catalog switch; production runtime ignores this env var |
+
+---
+
+## Related docs
+
+- [`retrieval-architecture.md`](../testing/retrieval-architecture.md) — promotion guide and checklist
+- [`retrieval-design.md`](../architecture/retrieval-design.md) — mandatory sidecar mode matrix and module contracts
+- [`retrieval-sidecar-smoke-ci.md`](../contributors/retrieval-sidecar-smoke-ci.md) — CI job stub
diff --git a/docs/project-delight-roadmap.md b/docs/project-delight-roadmap.md
index 00f0ad1..02121df 100644
--- a/docs/project-delight-roadmap.md
+++ b/docs/project-delight-roadmap.md
@@ -57,8 +57,8 @@ to trust and harder to misuse:
      current surfaces do not.
 
 5. **Simplify setup**
-   - Managed embeddings, profile selection, and fallback messaging should make
-     first use clear.
+   - Managed embeddings, profile selection, and fail-closed diagnostics should
+     make first use clear.
    - If the model path, backend, or doc shape is stale, `doctor` should say so
      plainly.
 
diff --git a/docs/research.md b/docs/research.md
index 5309476..9d5d874 100644
--- a/docs/research.md
+++ b/docs/research.md
@@ -7,11 +7,11 @@ decisions and points to the comparison matrix, not raw run output.
 
 | Area | Decision | Why it matters |
 | --- | --- | --- |
-| Real local embeddings | Use `CODESTORY_EMBED_BACKEND=onnx`. | Managed setup now installs an in-process ONNX Runtime BGE-base path instead of launching llama.cpp. |
-| Deterministic local checks | Use `CODESTORY_EMBED_RUNTIME_MODE=hash`. | Keeps local-dev and CI checks reproducible without model services. |
+| Real local embeddings | Use `CODESTORY_EMBED_BACKEND=llamacpp` with the local llama.cpp sidecar. | Product packet/search evidence now requires the sidecar manifest to record the 768-d bge-base backend and `retrieval_mode=full`. |
+| Deterministic diagnostics | `CODESTORY_EMBED_RUNTIME_MODE=hash` is diagnostic-only. | Keeps selected local-dev and CI checks reproducible without model services, but is not agent-facing retrieval evidence. |
 | Default model profile | `CODESTORY_EMBED_PROFILE=bge-base-en-v1.5`. | BGE-base remains the best quality/speed family for the active runtime. |
 | Default doc shape | `CODESTORY_SEMANTIC_DOC_ALIAS_MODE=alias_variant`, durable semantic scope. | Compact aliases help retrieval without the noise of full alias text. |
-| Current benchmark baseline | Historical BGE-base Q8 GGUF through llama.cpp/Vulkan remains the last fully scored broad-holdout baseline; the active managed runtime is now BGE-base ONNX and needs a fresh benchmark row. | Do not compare new ONNX speed numbers against old llama.cpp rows without rerunning the quality and cross-repo gates. |
+| Current benchmark baseline | Historical BGE-base Q8 GGUF through llama.cpp/Vulkan remains the last fully scored broad-holdout baseline; the active mandatory sidecar contract needs a fresh coherent benchmark row. | Do not compare new sidecar speed numbers against old mixed-vintage rows without rerunning the quality and cross-repo gates. |
 | Peak memory evidence | Segment-2 q8/r6 baseline measured peak descendant working set `828.726562 MB`; repeat sampled `1019.789062 MB`; `peak_vram_mb` was unavailable on this host. | Memory is now measured explicitly, but sampled peak RAM is noisy enough that tiny memory wins need repeats. |
 | Evidence standard | Quality gates and rank profiles come before speed. | A faster row is rejected when MRR, Hit@10, rank1/rank2-10, or misses regress. |
 
diff --git a/docs/testing/benchmark-results.md b/docs/testing/benchmark-results.md
index 6dc6479..1e0af58 100644
--- a/docs/testing/benchmark-results.md
+++ b/docs/testing/benchmark-results.md
@@ -15,6 +15,10 @@ answer, so tool output alone cannot make a row quality-pass.
 | Lane | Current status | Public claim status |
 | --- | --- | --- |
 | Agent A/B quick check | The 2026-05-23 CodeStory-only quick run passed both arms, but the CodeStory arm used more tokens, more wall time, and more tool starts. | No agent savings claim. |
+| Local-real Codex probe | On 2026-05-25, the narrowed `codex-exec-json-flow` live A/B repeated with a quality-passing CodeStory arm against a failing no-CodeStory arm. Latest corrected-wrapper repeat: `114,510` vs `2,209,856` tokens, `2` vs `39` observed tool calls, `117.37s` vs `262.39s`, and overhead ratio `0.183466`. | Strong exploratory evidence; no promotion claim from this task alone. |
+| Local-real Sourcetrail probe | On 2026-05-25, the `sourcetrail-indexing-to-storage` live A/B passed with CodeStory after source-group/indexing/storage packet fixes. CodeStory used `269,363` vs `5,697,852` tokens, `2` vs `105` observed tool calls, `138.92s` vs `532.68s`, `0` vs `87` source reads, and overhead ratio `0.10904`. | Strong second-repo exploratory evidence; still not promotion-grade because it is one repeat using a local existing cache. |
+| Local-real VS Code probe | On 2026-05-25, the `vscode-workbench-extension-host` packet holdout moved from partial coverage to a sufficient packet, then the live A/B passed with CodeStory after workbench/extension-host packet fixes. CodeStory used `1,070,153` vs `7,296,578` tokens, `2` vs `115` observed tool calls, `329.69s` vs `626.08s`, `0` vs `71` source reads, and overhead ratio `0.230215`. A follow-up release incremental refresh repaired the stale cache provenance, moving VS Code freshness from `74` new files to `0`. | Strong third-repo exploratory evidence; still not promotion-grade because it is one repeat and the no-CodeStory arm failed quality. |
+| Local-real drill-suite probe | On 2026-05-25, a four-repo `drill-suite` matrix exposed a real CodeStory cache-reuse blocker, stale Codex anchor selections, VS Code indexing-error blockage, and Sourcetrail source-truth-only bridges. After the CodeStory cache fix and Rust receiver/return-chain graph pass, this repo's one-case drill is still degraded but now resolves `11/11` anchors with `28/55` graph bridges, `27` partial bridges, and `0` unresolved bridges. | Diagnostic product evidence only; the remaining target is store/workspace execution-plan and snapshot/projection bridge coverage. |
 | Strict packet-first rows | Several with-CodeStory public-checkout rows passed quality, packet-first, and zero ordinary source reads after packet. | Behavior evidence only; paired savings still needs broader quality-passing baselines. |
 | Packet runtime | Public-core warm stdio and cold CLI packet rows passed repeated publishable quality gates. | Runtime evidence, not agent-token savings. |
 | Repo-scale cold index/read timing | The current timing source is the latest row in [codestory-e2e-stats-log.md](codestory-e2e-stats-log.md). | Current only after a fresh row is logged for the relevant change. |
@@ -26,6 +30,33 @@ answer, so tool output alone cannot make a row quality-pass.
   tasks while avoiding ordinary source reads after the answer packet.
 - Repeated packet-runtime rows show `packet` can fit inside an agent workflow
   budget in both cold CLI and warm stdio modes.
+- The local-real harness now separates first-index setup cost from timed
+  cache-reuse agent work, blocks stale or semantic-empty caches from
+  publishable evidence, and records useful-context density for final-answer
+  context instead of raw packet volume alone.
+- The quality-passing local-real Codex live A/B has now been repeated on the
+  same task with the corrected wrapper.
+- Sourcetrail now adds a second realistic repo where the CodeStory arm passed
+  quality and avoided source reads while the no-CodeStory arm failed quality
+  after broad exploration.
+- VS Code now adds a large TypeScript repo where the packet planner can find the
+  workbench startup, extension service, extension host manager, extension-host
+  activation, and command execution anchors without follow-up commands, and the
+  live CodeStory arm passed quality while using far fewer tools and source
+  reads than the no-CodeStory arm.
+- The VS Code cache freshness issue behind the first local-real row is now
+  understood and fixed: TypeScript/TSX factory-call superclass extraction no
+  longer crashes on `extends mock<T>()`, failed attempts are recorded as
+  incomplete files with attached errors, and `../vscode` now reports
+  `10,491/10,491` indexed files as fresh after incremental refresh.
+- CodeStory's own active cache can now recover from stale incremental
+  projection cleanup where cross-file callable state points at a deleted node;
+  release incremental refresh reports fresh inventory with `150/150` indexed
+  files, `0` errors, and `7,794` semantic docs.
+- The tightened CodeStory drill now exposes the CLI-to-runtime-to-indexer path
+  mostly as graph evidence: Rust receiver and return-chain resolution moved the
+  case from `3/55` graph bridges to `28/55`, while preserving explicit
+  source-truth-only status for the remaining unproven bridge pairs.
 - Repo-scale timing history is tracked in the stats log instead of copied into
   prose that silently drifts.
 
@@ -36,6 +67,18 @@ count, wall time, or tool calls. General savings claims require repeated
 controlled with/without-agent measurements from the benchmark harness, not one
 exploratory row or representative estimates.
 
+The 2026-05-25 Codex, Sourcetrail, and VS Code local-real rows are explicitly
+non-promotional. They show a CodeStory advantage on three realistic tasks, but
+they are still single-run or same-task exploratory measurements using local
+cache state. The VS Code cache now has fresh provenance after the follow-up
+repair, but public savings language still needs repeated controlled rows, clean
+pinned checkout provenance, and at least one holdout that was not tuned during
+the implementation loop.
+
+The 2026-05-25 `drill-suite` rows are also non-promotional. They are designed to
+find grounding failures before an agent A/B run, and the current CodeStory case
+still falls back to source-truth-only evidence for `27/55` bridge pairs.
+
 ## Promotion Rules
 
 - Use the same project, cache state, semantic backend, command flags, runner,
diff --git a/docs/testing/codestory-e2e-stats-log.md b/docs/testing/codestory-e2e-stats-log.md
index 9be8f75..f100d29 100644
--- a/docs/testing/codestory-e2e-stats-log.md
+++ b/docs/testing/codestory-e2e-stats-log.md
@@ -35,6 +35,23 @@ Keep the full emitted JSON in the test output when reviewing locally, and add th
 | 2026-05-24 | 7c891af+wt | pass, review remediation e2e | 11.10 | 0.29 | 1.29 | 0.86 | 0.25 | 0.23 | 56,272 | 47,628 | 149 | 0 | 7,501 | true |
 | 2026-05-24 | 663c257+wt | pass, review findings remediation | 12.30 | 0.24 | 1.04 | 0.65 | 0.24 | 0.21 | 56,362 | 47,659 | 149 | 0 | 7,530 | true |
 | 2026-05-24 | 3c62f1e+wt | pass, remove spec docs publish gate | 11.39 | 0.23 | 1.06 | 0.66 | 0.21 | 0.19 | 56,531 | 47,806 | 149 | 0 | 7,566 | true |
+| 2026-05-25 | cba6cfe+wt | pass, packet planner local-real A/B checkpoint | 11.61 | 0.21 | 1.06 | 0.63 | 0.19 | 0.17 | 58,659 | 49,707 | 150 | 0 | 7,827 | true |
+| 2026-05-25 | 49cd906+wt | pass, vscode packet holdout checkpoint | 11.88 | 0.28 | 1.30 | 0.77 | 0.21 | 0.18 | 58,726 | 49,773 | 150 | 0 | 7,834 | true |
+| 2026-05-25 | 73fc42a+wt | pass, vscode cache freshness fix; drill manifest skipped | 11.00 | 0.21 | 1.08 | 0.68 | 0.19 | 0.19 | 58,782 | 49,824 | 150 | 0 | 7,847 | true |
+| 2026-05-25 | 5aad799+wt | pass, projection cleanup FK fix; drill manifest skipped | 11.59 | 0.22 | 1.03 | 0.69 | 0.19 | 0.18 | 58,799 | 49,843 | 150 | 0 | 7,851 | true |
+| 2026-05-25 | a6416ad+wt | pass, rust receiver chain drill bridge pass | 14.81 | 0.25 | 0.98 | 0.47 | 0.23 | 0.20 | 59,456 | 50,381 | 150 | 0 | 7,915 | true |
+| 2026-05-25 | 765fe4b+wt | pass, owner-alias drill evidence and jobs coverage | 14.79 | 0.25 | 0.96 | 0.44 | 0.33 | 0.21 | 59,531 | 50,444 | 150 | 0 | 7,927 | true |
+| 2026-05-25 | bce041a+wt | fail, drill search_plan missing before seed-anchor repair; targeted seed-anchor search repro passed after repair | 15.72 | 0.24 | 0.95 | 0.44 | 0.21 | 0.22 | 59,917 | 50,781 | 150 | 0 | 8,008 | true |
+| 2026-06-01 | 7c4143f6+wt | pass, mandatory sidecar real-embedding e2e plus real drill manifest | 675.82 | 0.38 | 1.43 | 0.63 | 0.35 | 0.43 | 77,912 | 65,529 | 229 | 0 | 10,668 | true |
+| 2026-06-01 | 2deff76e+wt | fail, release e2e stats ok; real drill manifest env missing | 685.56 | 0.34 | 1.29 | 0.54 | 0.32 | 0.31 | 78,795 | 66,280 | 229 | 0 | 10,771 | true |
+| 2026-06-02 | 72d4ea4c+wt | pass, review remediation sidecar e2e; retrieval index 16.34s; drill manifest skipped | 746.30 | 0.31 | 1.14 | 0.46 | 0.24 | 0.22 | 78,247 | 66,075 | 217 | 0 | 10,787 | true |
+| 2026-06-02 | 8f625b5e+wt | fail, release e2e stats ok; real drill manifest env missing; retrieval_index_seconds 18.15 | 1190.65 | 0.48 | 1.38 | 0.53 | 0.28 | 0.32 | 78,212 | 66,040 | 217 | 0 | 10,814 | true |
+| 2026-06-02 | de6436a3+wt | pass, round 3 sidecar contract e2e; optional real drill manifest skipped; retrieval_index_seconds 18.13 | 929.37 | 0.31 | 1.48 | 0.52 | 0.30 | 0.25 | 78,159 | 65,970 | 217 | 0 | 10,806 | true |
+| 2026-06-02 | dbba955b+wt | pass, round 4 sidecar contract e2e; optional real drill manifest skipped; retrieval_index_seconds 16.02 | 874.56 | 0.29 | 1.15 | 0.46 | 0.26 | 0.24 | 78,203 | 66,005 | 217 | 0 | 10,814 | true |
+| 2026-06-02 | 3c3012af+wt | pass, round 6 sidecar cache/status e2e; optional real drill manifest skipped; retrieval_index_seconds 20.58; retrieval_mode full | 890.52 | 0.34 | 1.91 | 0.63 | 0.31 | 0.29 | 78,376 | 66,156 | 217 | 0 | 10,836 | true |
+| 2026-06-02 | 4c616548+wt | blocked, round 7 release e2e index phase did not complete; stopped child after 1075.05s with no stdout/stderr; failed command `index --refresh full --format json`; retrieval_index_seconds n/a; retrieval_mode n/a | n/a | n/a | n/a | n/a | n/a | n/a | n/a | n/a | n/a | n/a | n/a | n/a |
+| 2026-06-02 | 25751a39+wt | fail, round 8 release e2e stats ok; real drill manifest env missing fail-closed; retrieval_index_seconds 17.73; retrieval_status_seconds 0.46; retrieval_mode full | 720.80 | 0.31 | 1.54 | 0.52 | 0.26 | 0.26 | 78,478 | 66,235 | 217 | 0 | 10,839 | true |
+| 2026-06-02 | a23770f+wt | pass, round 9 stats-only release e2e; real drill intentionally skipped with CODESTORY_ALLOW_SKIP_REAL_REPO_DRILL_CASES=1; not real-drill release evidence; retrieval_index_seconds 18.35; retrieval_status_seconds 0.56; retrieval_mode full | 711.31 | 0.32 | 1.77 | 0.59 | 0.32 | 0.27 | 78,582 | 66,332 | 217 | 0 | 10,847 | true |
 
 ## Phase Metrics
 
@@ -64,3 +81,21 @@ Keep the full emitted JSON in the test output when reviewing locally, and add th
 | 2026-05-24 | 7db7fb1+wt | post-rebase benchmark/packet integration E2E | 18.04 | 5.38 | 1.69 | 0 | 7,466 | 0 |
 | 2026-05-24 | 7c891af+wt | review remediation E2E | 11.10 | 5.12 | 0.79 | 0 | 7,501 | 0 |
 | 2026-05-24 | 3c62f1e+wt | remove spec docs publish gate E2E | 11.39 | 5.14 | 0.70 | 0 | 7,566 | 0 |
+| 2026-05-25 | cba6cfe+wt | packet planner local-real A/B checkpoint E2E | 11.61 | 4.99 | 0.80 | 0 | 7,827 | 0 |
+| 2026-05-25 | 49cd906+wt | vscode packet holdout checkpoint E2E | 11.88 | 5.27 | 0.81 | 0 | 7,834 | 0 |
+| 2026-05-25 | 73fc42a+wt | vscode cache freshness fix E2E | 11.00 | 5.20 | 0.94 | 0 | 7,847 | 0 |
+| 2026-05-25 | 5aad799+wt | projection cleanup FK fix E2E | 11.59 | 5.26 | 1.18 | 0 | 7,851 | 0 |
+| 2026-05-25 | a6416ad+wt | rust receiver chain drill bridge E2E | 14.81 | 7.47 | 1.61 | 0 | 7,915 | 0 |
+| 2026-05-25 | 765fe4b+wt | owner-alias drill evidence and jobs coverage E2E | 14.79 | 7.79 | 0.93 | 0 | 7,927 | 0 |
+| 2026-05-25 | bce041a+wt | semantic role-awareness E2E release half; drill half failed before seed-anchor repair | 15.72 | 7.94 | 0.98 | 0 | 8,008 | 0 |
+| 2026-06-01 | 7c4143f6+wt | mandatory sidecar real-embedding e2e with real drill manifest | 675.82 | 9.19 | 643.68 | 0 | 10,668 | 0 |
+| 2026-06-01 | 2deff76e+wt | segment 70 stage probes release e2e; drill manifest env missing | 685.56 | 8.61 | 654.37 | 0 | 10,771 | 0 |
+| 2026-06-02 | 72d4ea4c+wt | review remediation sidecar e2e; retrieval index 16.34s; drill manifest skipped | 746.30 | 10.40 | 727.80 | 0 | 10,787 | 0 |
+| 2026-06-02 | 8f625b5e+wt | review remediation stats pass; drill failed CODESTORY_REAL_REPO_DRILL_CASES missing; retrieval_index_seconds 18.15 | 1190.65 | 9.77 | 1172.38 | 0 | 10,814 | 0 |
+| 2026-06-02 | de6436a3+wt | round 3 sidecar contract e2e; optional real drill manifest skipped; retrieval_index_seconds 18.13 | 929.37 | 11.74 | 906.29 | 0 | 10,806 | 0 |
+| 2026-06-02 | dbba955b+wt | round 4 sidecar contract e2e; optional real drill manifest skipped; retrieval_index_seconds 16.02 | 874.56 | 11.09 | 854.21 | 0 | 10,814 | 0 |
+| 2026-06-02 | b582a9bb+wt | round 5 sidecar contract e2e; optional real drill manifest skipped; retrieval_index_seconds 19.75; retrieval_mode full | 917.01 | 10.30 | 897.84 | 0 | 10,826 | 0 |
+| 2026-06-02 | 3c3012af+wt | round 6 sidecar cache/status e2e; optional real drill manifest skipped; retrieval_index_seconds 20.58; retrieval_mode full | 890.52 | 12.95 | 866.79 | 0 | 10,836 | 0 |
+| 2026-06-02 | 4c616548+wt | round 7 blocked before phase metrics; release index child stopped after 1075.05s with no stdout/stderr; retrieval_index_seconds n/a; retrieval_mode n/a | n/a | n/a | n/a | n/a | n/a | n/a |
+| 2026-06-02 | 25751a39+wt | round 8 release e2e stats ok; real drill manifest env missing fail-closed; retrieval_index_seconds 17.73; retrieval_status_seconds 0.46; retrieval_mode full | 720.80 | 10.27 | 702.18 | 0 | 10,839 | 0 |
+| 2026-06-02 | a23770f+wt | round 9 stats-only release e2e; real drill intentionally skipped with CODESTORY_ALLOW_SKIP_REAL_REPO_DRILL_CASES=1; not real-drill release evidence; retrieval_index_seconds 18.35; retrieval_mode full | 711.31 | 11.08 | 691.07 | 0 | 10,847 | 0 |
diff --git a/docs/testing/codestory-stdio-warm-loop-stats.md b/docs/testing/codestory-stdio-warm-loop-stats.md
index 15e4d74..0d8ea70 100644
--- a/docs/testing/codestory-stdio-warm-loop-stats.md
+++ b/docs/testing/codestory-stdio-warm-loop-stats.md
@@ -63,6 +63,16 @@ From the 2026-05-06 baseline:
 | snippet | 20 | 744 | 744 |
 | resources/read:status | 20 | 1,003 | 1,003 |
 
+## Packet Cache Probe
+
+`serve --stdio` keeps a small in-process LRU for identical successful `packet`
+requests. The key includes request arguments plus the SQLite DB/WAL fingerprint,
+so a changed index bypasses the cached packet.
+
+| Date | Commit | Scenario | First packet ms | Repeated packet ms | Speedup | Same packet id | Trace steps | Protocol stderr |
+| --- | --- | --- | ---: | ---: | ---: | --- | ---: | ---: |
+| 2026-05-25 | pending | CodeStory repo, release binary, `--refresh none`, repeated identical tiny packet | 3495.60 | 0.93 | 3754.27x | true | 13 | 0 bytes |
+
 ## Notes
 
 - The baseline is a small-fixture release-binary smoke, not a repo-scale promotion gate.
diff --git a/docs/testing/embedding-backend-benchmarks.md b/docs/testing/embedding-backend-benchmarks.md
index b94df4e..1eb007d 100644
--- a/docs/testing/embedding-backend-benchmarks.md
+++ b/docs/testing/embedding-backend-benchmarks.md
@@ -8,12 +8,12 @@ tried, what was rejected, and what still needs proof.
 
 | Question | Current answer |
 | --- | --- |
-| Real local embedding backend | `CODESTORY_EMBED_BACKEND=onnx` |
-| Deterministic local-dev backend | `CODESTORY_EMBED_RUNTIME_MODE=hash` |
+| Real local embedding backend | `CODESTORY_EMBED_BACKEND=llamacpp` through the mandatory local sidecar |
+| Deterministic diagnostic backend | `CODESTORY_EMBED_RUNTIME_MODE=hash` |
 | Default profile | `CODESTORY_EMBED_PROFILE=bge-base-en-v1.5` |
 | Default doc shape | `CODESTORY_SEMANTIC_DOC_ALIAS_MODE=alias_variant`, durable semantic scope |
-| Current broad-holdout incumbent candidate | Pending fresh ONNX quality row. Historical incumbent: BGE-base Q8 GGUF through llama.cpp/Vulkan, batch `512`, request count `6`, server batch `1024`, server microbatch `1024`, stored vectors `int8`, full-text enabled |
-| Cross-repo gate status | Historical q8/r6 full-text profile passed the external gate across 4 projects and 225 queries; ONNX needs the same fresh gate before benchmark promotion |
+| Current broad-holdout incumbent candidate | Pending fresh mandatory-sidecar quality row. Historical incumbent: BGE-base Q8 GGUF through llama.cpp/Vulkan, batch `512`, request count `6`, server batch `1024`, server microbatch `1024`, stored vectors `int8`, full-text enabled |
+| Cross-repo gate status | Historical q8/r6 full-text profile passed the external gate across 4 projects and 225 queries; the current sidecar contract needs the same fresh gate before benchmark promotion |
 | Primary metric shape | `pipeline_score = 1000000 * (0.7 * quality + 0.2 * speed + 0.1 * memory) * quality_gate_penalty` |
 | Memory component shape | Model footprint, persisted vector footprint, and cache/index footprint; peak RAM is reported separately when sampled |
 | Default-change rule | Do not promote a faster row when MRR, Hit@10, rank profile, repeat behavior, or cross-repo behavior regresses |
@@ -30,16 +30,16 @@ also showed peak-RAM sampling variance, so tiny memory deltas need repeat
 evidence before they matter. Treat earlier perfect-score r5 evidence and any
 single-pass q5 score as historical/suspect, not current default proof.
 
-As of the ONNX replacement, the llama.cpp rows below are historical baselines.
-Add a fresh ONNX row before calling the new managed backend promoted on quality
-and cross-repo evidence.
+As of the mandatory sidecar reset, older ONNX and hash-projection rows are
+historical diagnostics. Add a fresh sidecar row before calling the active runtime
+promoted on quality and cross-repo evidence.
 
 ## Primary Comparison Matrix
 
 | Candidate or lane | Best relevant evidence | Quality signal | Speed and footprint signal | Decision |
 | --- | --- | --- | --- | --- |
-| Managed BGE-base ONNX Runtime, CLS-pooled graph, DirectML, doc batch 2048, token budget 32768, stored int8 | Large C++ workspace fresh-cache timing on 26,010 semantic docs after batch-fast tokenizer switch and pooled-output graph derivation | Quality gate not run yet; direct CPU ORT check showed pooled graph output exactly matched source `last_hidden_state[:, 0, :]` on sampled inputs; semantic contract and search smoke passed | `semantic_embedding_ms=128.438s`; previous 32k unpooled row was `135.762s`; pooled 65k was slower at `131.807s`; prior managed ONNX 65k first pass was `152.500s`, batch-fast 65k was `138.118s`, 16k was `137.776s`; unpooled 131k was aborted as slower and memory-heavy after sampled peak working set around `3.37 GB` | Active managed default for throughput shape. Still needs full quality and cross-repo gates before replacing the historical llama.cpp benchmark baseline as promoted evidence. |
-| BGE-base llama.cpp, b512/Q8/r6, server batch 1024, microbatch 1024, stored int8, full-text enabled | Segment 2 baseline `909369.110274`; repeat `909844.215726`; earlier fixed-wrapper holdout `910504.353332`; cross-repo `851670.370370` | Local MRR@10 `0.982432`, Hit@10 `1.0`, Hit@1 `0.972973`; cross-repo Hit@10 `1.0`, adversarial Hit@10 `1.0`, MRR@10 `0.826831` across 225 queries | Segment 2 baseline `368.01` docs/sec, cache `74.40 MB`, sampled peak descendant working set `828.73 MB`; repeat `371.89` docs/sec, sampled peak `1019.79 MB`; cross-repo search p95 `84.7 ms` | Historical externally validated baseline. Re-run the same gates for ONNX before treating the replacement as promoted on research evidence. |
+| Managed BGE-base ONNX Runtime, CLS-pooled graph, DirectML, doc batch 2048, token budget 32768, stored int8 | Large C++ workspace fresh-cache timing on 26,010 semantic docs after batch-fast tokenizer switch and pooled-output graph derivation | Quality gate not run yet; direct CPU ORT check showed pooled graph output exactly matched source `last_hidden_state[:, 0, :]` on sampled inputs; semantic contract and search smoke passed | `semantic_embedding_ms=128.438s`; previous 32k unpooled row was `135.762s`; pooled 65k was slower at `131.807s`; prior managed ONNX 65k first pass was `152.500s`, batch-fast 65k was `138.118s`, 16k was `137.776s`; unpooled 131k was aborted as slower and memory-heavy after sampled peak working set around `3.37 GB` | Historical diagnostic lane after mandatory-sidecar reset. Do not treat as promoted product evidence without a fresh sidecar contract and quality gate. |
+| BGE-base llama.cpp, b512/Q8/r6, server batch 1024, microbatch 1024, stored int8, full-text enabled | Segment 2 baseline `909369.110274`; repeat `909844.215726`; earlier fixed-wrapper holdout `910504.353332`; cross-repo `851670.370370` | Local MRR@10 `0.982432`, Hit@10 `1.0`, Hit@1 `0.972973`; cross-repo Hit@10 `1.0`, adversarial Hit@10 `1.0`, MRR@10 `0.826831` across 225 queries | Segment 2 baseline `368.01` docs/sec, cache `74.40 MB`, sampled peak descendant working set `828.73 MB`; repeat `371.89` docs/sec, sampled peak `1019.79 MB`; cross-repo search p95 `84.7 ms` | Historical externally validated baseline and closest prior evidence to the current mandatory sidecar backend. Re-run the same gates under the generation-bound sidecar contract before promotion. |
 | BGE-base llama.cpp, b512/r5, microbatch 1024, stored int8 | Earlier local `pipeline_score=918957.022351`; confirmation `918697.617312`; corrected segment-2 scout `901789.644032` | Earlier perfect local scores triggered the overfit review; corrected segment-2 quality matched q8/r6, but did not improve it | Corrected segment-2 r5 slowed to `327.58` docs/sec and sampled peak rose to `1074.43 MB` | Historical/discarded. Do not treat r5 as the current promoted answer after the corrected broad-holdout pass. |
 | BGE-base llama.cpp, b768/r4 vs b512/Q8/r6 on the 74-query broad holdout | Packet 18 selected Q8/r6 with `pipeline_score=910173.164803` | r4 matched Q8/r6 quality | r4 was slower than Q8/r6 (`361.85` vs `389.22` docs/sec) | Do not promote. Useful comparison only. |
 | BGE-base Q5 against the broad holdout incumbent | Segment 2 first pass `910704.594864`; repeat `901823.678946` | First pass matched q8/r6 quality; repeat failed quality with MRR@10 `0.975676` and Hit@1 `0.959459` | Model footprint shrank to `78.21 MB`, but speed regressed and repeat stayed below baseline | Discard as a default/promotion candidate. Reopen only with a quality-preserving and speed-neutral compression recipe. |
@@ -61,7 +61,7 @@ and cross-repo evidence.
 
 The measured work covered these families:
 
-- ONNX Runtime provider and batch geometry, plus legacy llama.cpp request geometry for historical comparison.
+- ONNX Runtime provider and batch geometry, plus external llama.cpp request geometry for historical comparison.
 - Stored-vector footprint: compact scaled int8 persisted vectors.
 - Quality metric repair: explicit gates, continuous penalties, denominator metrics, and query-rank reporting.
 - Benchmark isolation: leakage guard, tainted-query quarantine, and cache replay blocking.
@@ -80,10 +80,10 @@ The measured work covered these families:
 - True producer-consumer streaming from parsed/indexed symbols into the embedder
   has not been proven. Several scouts reduced or rearranged semantic work, but
   none established a safe end-to-end streaming architecture that beats the
-  historical q8/r6 full-text baseline or a fresh managed ONNX row.
-- The historical external gate covers four useful repository families, but ONNX
-  and any broader default should still be checked on representative repos before
-  treating the profile as universal.
+  historical q8/r6 full-text baseline or a fresh mandatory-sidecar row.
+- The historical external gate covers four useful repository families, but the
+  current sidecar contract and any broader default should still be checked on
+  representative repos before treating the profile as universal.
 - Q5 remains a compression option only if a future quality-preserving recipe
   exists. The current scout saved model footprint but did not beat the q8/r6
   incumbent under the user-weighted metric.
diff --git a/docs/testing/framework-route-coverage.md b/docs/testing/framework-route-coverage.md
index 15df1d0..be3b343 100644
--- a/docs/testing/framework-route-coverage.md
+++ b/docs/testing/framework-route-coverage.md
@@ -15,6 +15,10 @@ single heuristic hit.
 - Rust: Axum, Actix, Rocket.
 - Go: Gin, Chi, Echo, Fiber as text-only partial route extraction until Go
   parser-backed handler links exist.
+- Kotlin/Swift/Dart (unmapped today): Ktor, Vapor, and Shelf heuristics are
+  already implemented in `collect_framework_routes` and take effect once those
+  language paths index source files; fixture coverage lives in
+  `test_framework_route_extractors_cover_requested_web_stacks`.
 - Existing OpenAPI endpoint indexing remains separate and should continue to
   produce endpoint symbols and speculative client-call edges.
 - Payload collection config and usage extraction is tracked as data bridge
diff --git a/docs/testing/performance-review-playbook.md b/docs/testing/performance-review-playbook.md
index 457e675..ecf91a3 100644
--- a/docs/testing/performance-review-playbook.md
+++ b/docs/testing/performance-review-playbook.md
@@ -26,7 +26,7 @@ Before proposing an optimization, record:
 | --- | --- |
 | Command | Exact command line, including `--project`, `--refresh`, `--format`, and relevant environment variables. |
 | Commit | Current commit or working-tree label. If the tree is dirty, say so. |
-| Cache state | Cold cache, warm cache, incremental refresh, lexical-only, hash semantic, managed ONNX, or external embedding backend. |
+| Cache state | Cold cache, warm cache, incremental refresh, full sidecar, lexical-only diagnostic, hash semantic diagnostic, ONNX diagnostic, or external embedding backend. |
 | Sample size | Number of runs and whether the first run was discarded. |
 | Headline metric | Index seconds, graph phase seconds, semantic phase seconds, per-command seconds, p95/max latency, or benchmark score. |
 | Dominant cost | Measured cost center: graph phase, semantic phase, store reads/writes, repo-text scan, source reads, graph traversal, search scoring, CLI rendering, lock contention, or memory pressure. |
@@ -42,6 +42,9 @@ cargo test -p codestory-runtime --test retrieval_eval
 cargo check -p codestory-bench --benches
 ```
 
+`retrieval_eval` needs `CODESTORY_RETRIEVAL_EVAL_FULL_TESTS=1` for full sidecar quality assertions;
+without it, the suite checks that non-full retrieval fails closed.
+
 Use Criterion benches from `crates/codestory-bench` only when the measured hot
 path is narrower than the repo-scale e2e test can explain.
 
diff --git a/docs/testing/retrieval-architecture.md b/docs/testing/retrieval-architecture.md
new file mode 100644
index 0000000..a87868e
--- /dev/null
+++ b/docs/testing/retrieval-architecture.md
@@ -0,0 +1,226 @@
+# Sidecar retrieval — architecture and promotion guide
+
+Sidecar-primary packet retrieval (Zoekt lexical, Qdrant semantic, SCIP graph) is orchestrated by
+`codestory-retrieval` and integrated in `codestory-runtime`. Production packet paths use
+generic symbol/path roles; benchmark-only probe catalogs remain behind test-only eval harness hooks.
+Sidecar retrieval is mandatory for current evidence; `CODESTORY_RETRIEVAL=0` is treated as a
+configuration error, not a diagnostic route.
+
+**Related:** [`../ops/retrieval-sidecars.md`](../ops/retrieval-sidecars.md) (operator runbook),
+[`../architecture/retrieval-design.md`](../architecture/retrieval-design.md) (module contracts).
+
+---
+
+## Implemented stack (Phases 0–5)
+
+| Layer | Location | Role |
+|-------|----------|------|
+| Sidecar clients | `crates/codestory-retrieval/` (`zoekt_client`, `qdrant_client`, `scip_client`, `health`) | HTTP probes, staged search, timeouts |
+| Planner / executor / ranker | `codestory-retrieval` (`planner`, `executor`, `ranker`, `query_features`, `mode`) | Repo-agnostic staged plan, deadlines, degraded modes |
+| Index manifest | `codestory-store` `retrieval_index_manifest` + `codestory-retrieval::index` | Version pins, sidecar input hash, generation id, and mandatory real sidecar artifact paths |
+| CLI lifecycle | `codestory-cli` `retrieval up\|down\|status\|index\|query` | Local data dirs, health JSON, standalone query |
+| Packet integration | `codestory-runtime/src/agent/retrieval_primary.rs` | Primary sidecar path, diagnostic traces, promotion warnings |
+| Nucleo policy | `codestory-runtime/src/agent/nucleo_policy.rs` | Suppresses Nucleo O(n) scan on sidecar primary; disabled sidecars are not valid product evidence |
+| Generalization lint | `scripts/lint-retrieval-generalization.mjs` | Bans repo literals in Rust production retrieval trees (CI via Rust guard test); benchmark/eval harness scripts may name holdout repos only inside their manifest/eval boundary |
+
+**Modes:** `full`, `no_scip`, `no_semantic`, `lexical_only`, `unavailable` — only
+`full` may serve primary packet/search results. All non-`full` modes fail closed. See
+[`retrieval-design.md`](../architecture/retrieval-design.md#mandatory-sidecar-mode-matrix).
+
+**Benchmark manifests:** `benchmarks/tasks/local-real/` is the realistic local
+product corpus; `benchmarks/tasks/holdout-retrieval/` is the public
+generalization corpus. Holdout rows are promotion evidence only, not a tuning
+loop.
+
+## Environment flags
+
+### Runtime variables
+
+`CODESTORY_RETRIEVAL_V2` and `CODESTORY_RETRIEVAL_V2_SHADOW` are no longer migration aliases.
+If either legacy variable is present, packet retrieval fails closed instead of silently mapping it
+to the sidecar-primary contract.
+
+| Variable | Default (production) | Purpose |
+|----------|----------------------|---------|
+| `CODESTORY_RETRIEVAL` | unset → sidecar primary when manifest + `full` mode (else fail closed) | `1` forces a sidecar-primary attempt; `0` is unsupported and fails closed |
+| `CODESTORY_RETRIEVAL_SHADOW` | unsupported for product benchmarks | Historical diagnostic switch; benchmark contract rejects it |
+| `CODESTORY_ZOEKT_ENABLED` | on | `0` is unsupported for product retrieval |
+| `CODESTORY_QDRANT_ENABLED` | on | `0` is unsupported for product retrieval |
+| `CODESTORY_RETRIEVAL_REAL_EMBEDDINGS` | `1` | `0` is unsupported for product retrieval |
+| `CODESTORY_RETRIEVAL_COMPOSE_PROFILE` | `real` | every other profile is unsupported for product bootstrap |
+| `CODESTORY_EMBED_BACKEND` | `llamacpp` | product manifests require llama.cpp bge-base embeddings |
+| `CODESTORY_EMBED_LLAMACPP_URL` | `http://127.0.0.1:8080/v1/embeddings` | local embedding sidecar endpoint |
+| `CODESTORY_ZOEKT_PORT` | `6070` | Zoekt HTTP |
+| `CODESTORY_QDRANT_HTTP_PORT` | `6333` | Qdrant HTTP |
+| `CODESTORY_QDRANT_GRPC_PORT` | `6334` | Qdrant gRPC |
+
+### Benchmark-only flags
+
+Use these only when running benchmark or promotion harnesses. Do not enable them in normal production packet runs.
+
+| Variable | Default | Purpose |
+|----------|---------|---------|
+| `CODESTORY_EVAL_PROBES` | ignored in production runtime | Benchmark-shaped probe catalog (`eval_probes.rs`) is test-only; promotion bundles do not inject it. |
+
+**Sidecar promotion candidate (typical):**
+
+```powershell
+Remove-Item Env:CODESTORY_RETRIEVAL -ErrorAction SilentlyContinue
+Remove-Item Env:CODESTORY_EVAL_PROBES -ErrorAction SilentlyContinue
+.\target\release\codestory-cli.exe retrieval up
+.\target\release\codestory-cli.exe retrieval index --project . --refresh auto
+```
+
+---
+
+## Local workflows
+
+### One-command environment setup
+
+From the CodeStory repository root:
+
+```sh
+cargo retrieval-setup
+cargo retrieval-status
+```
+
+Optional Node wrapper (prerequisite report, optional holdout clone):
+`node scripts/setup-retrieval-env.mjs`.
+See [`../ops/retrieval-sidecars.md`](../ops/retrieval-sidecars.md#quick-start-one-command).
+
+### Sidecars and index
+
+```sh
+cargo retrieval-setup
+cargo run -p codestory-cli -- retrieval index --project <repo-root> --refresh auto
+cargo run -p codestory-cli -- retrieval query "main" --project <repo-root>
+```
+
+`retrieval bootstrap` (alias `cargo retrieval-setup`) starts Docker Compose when Docker is installed.
+`retrieval up` alone only prepares cache dirs and state (see runbook).
+
+### local-real packet suite (in-scope tuning)
+
+Repos: `codex`, `rootandruntime`, `sourcetrail`, `vscode` — manifests under
+`benchmarks/tasks/local-real/`.
+
+```powershell
+node scripts/codestory-agent-ab-benchmark.mjs `
+  --packet-runtime --packet-runtime-mode cold-cli `
+  --task-suite local-real --repeats 1 `
+  --out-dir target/agent-benchmark/packet-runtime-sidecar-promotion `
+  --codestory-cli target/release/codestory-cli.exe `
+  --timeout-ms 300000
+```
+
+Local-real rows are product-development evidence, not public savings claims by
+themselves. They need repeated quality-gated runs against clean pinned checkouts
+before they can support promotion language.
+
+### holdout-retrieval (generalization)
+
+```powershell
+node scripts/fetch-holdout-repos.mjs
+# or:
+node scripts/codestory-agent-ab-benchmark.mjs `
+  --list --task-suite holdout-retrieval --materialize-repos
+
+node scripts/codestory-agent-ab-benchmark.mjs `
+  --packet-runtime --packet-runtime-mode cold-cli `
+  --task-suite holdout-retrieval --materialize-repos `
+  --repeats 1 `
+  --out-dir target/agent-benchmark/holdout-retrieval-smoke `
+  --codestory-cli target/release/codestory-cli.exe `
+  --timeout-ms 180000
+```
+
+Holdout failures should block promotion or trigger diagnosis; do not add
+repo-name/path literals or tune planner/ranker heuristics against holdout rows.
+
+## Fast CI-style checks (automated in Phase 6)
+
+```powershell
+cargo test -p codestory-runtime --test retrieval_generalization_guard
+node --test scripts/tests/codestory-agent-ab-analyzer.test.mjs
+cargo test -p codestory-cli --test onboarding_contracts
+```
+
+Optional broader lane:
+
+```powershell
+cargo test -p codestory-retrieval
+cargo test -p codestory-runtime
+node --test scripts/tests/codestory-agent-ab-analyzer.test.mjs
+```
+
+---
+
+## Promotion checklist
+
+Status as of the Phase 6 documentation pass. **Rows marked `human` require a human run**
+with repos, sidecars, and the release CLI; those results are not claimed here.
+
+### Language support audit alignment
+
+Support claims must be backed by committed benchmark manifests, generated artifacts, or
+tests in the branch. Do not infer support for languages without direct benchmark artifacts.
+
+| Item | Status | Notes |
+|------|--------|-------|
+| Phases 0–5 code landed | done | See implemented stack above |
+| Architecture / design docs | done | `docs/architecture/retrieval-design.md` |
+| Sidecar runbook | done | `docs/ops/retrieval-sidecars.md` |
+| Local-real manifests | done | `benchmarks/tasks/local-real/` |
+| Holdout manifests + fetch script | done | `benchmarks/tasks/holdout-retrieval/`, `scripts/fetch-holdout-repos.mjs` |
+| `freelancer` / `traderotate` removed from default holdouts | done | OSS holdouts only |
+| Generalization lint + guard test | done | `lint-retrieval-generalization.mjs`, `retrieval_generalization_guard` |
+| Warning config | done | `docs/architecture/retrieval-rollback.json` |
+| Markdown link contract (`onboarding_contracts`) | verify | `cargo test -p codestory-cli --test onboarding_contracts` |
+| local-real cold packet + north-star SLOs | **human** | p99 retrieval, quality 3/4, wall targets |
+| holdout-retrieval 2/3 pass | **human** | Requires materialized OSS repos + index |
+| `agent_value_gap` &lt; 0.20 | **human** | Measure from a fresh coherent bundle |
+| Windows `retrieval-sidecar-smoke` CI job | fail-closed sidecar smoke | [`retrieval-sidecar-smoke-ci.md`](../contributors/retrieval-sidecar-smoke-ci.md) |
+| Ragas/Phoenix nightly eval | optional | Not configured |
+
+### North-star SLOs (targets — measure before claiming pass)
+
+| Metric | Target |
+|--------|--------|
+| Retrieval p50 | ≤ 250 ms |
+| Retrieval p90 | ≤ 600 ms |
+| Retrieval p99 | ≤ 1,000 ms |
+| Worst-case packet wall | ≤ 1,500 ms |
+| local-real quality pass | ≥ 3/4 repos |
+| `agent_value_gap` | &lt; 0.20 |
+| holdout generalization | 2/3 of `ripgrep`, `axios`, `redis` |
+| Sidecar planner/ranker repo literals | 0 (lint clean) |
+
+---
+
+## Rollback drill (REQ-RES-005)
+
+After promotion runs, verify rollback warnings:
+
+1. Point `retrieval_rollback` at a baseline `packet-runtime-summary.json` with thresholds that will trip on the current summary (or use unit test `rollback_drill_warns_without_setting_legacy_env` in `retrieval_rollback.rs`).
+2. Confirm `check_and_log_rollback_warnings` logs trigger ids without setting `CODESTORY_RETRIEVAL=0`.
+3. File a one-line incident note in this doc with date and trigger id if rollback fires in production promotion.
+
+**One-shot operator drill (after each promotion run):**
+
+```powershell
+cargo test -p codestory-runtime retrieval_rollback::tests::rollback_drill_warns_without_setting_legacy_env -- --nocapture
+```
+
+Expect rollback warnings only when configured thresholds fire (see `docs/architecture/retrieval-rollback.json`). Sidecar retrieval remains mandatory.
+
+**Closure status (2026-05-27, semantic promotion pass):** Phase A shipped (bge-base 768-d, llama.cpp `embed` compose service, manifest `embedding_backend`/`embedding_dim`, Qdrant collection migration, llamacpp dim hard-fail). Local `retrieval status` reaches `full` with default 768-d vectors after Qdrant re-index. Sidecar-primary is the intended product path, but product promotion remains gated until fresh benchmark evidence passes.
+
+---
+
+## Spec and design references
+
+| Doc | Path |
+|-----|------|
+| Design | [`docs/architecture/retrieval-design.md`](../architecture/retrieval-design.md) |
+| Operations | [`docs/ops/retrieval-sidecars.md`](../ops/retrieval-sidecars.md) |
+| Rollback config | [`docs/architecture/retrieval-rollback.json`](../architecture/retrieval-rollback.json) |
diff --git a/docs/usage.md b/docs/usage.md
index fc18649..19240b7 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -118,9 +118,9 @@ codestory-cli doctor --project <target-workspace>
 ```
 
 If `doctor` reports stale inventory, semantic contract mismatch, missing managed
-assets, or a fallback retrieval mode, fix that layer before investigating answer
-quality. Treat the health report as the first source of truth for cache and
-retrieval state.
+assets, or a non-`full` retrieval mode, fix that layer before investigating
+answer quality. Treat the health report as the first source of truth for cache
+and retrieval state.
 
 ## Core Commands
 
@@ -181,32 +181,56 @@ reset, schema change, or suspected stale-state incident.
 
 ## Retrieval Defaults
 
-Hybrid retrieval is the intended default when local embedding assets are
-available. If they are unavailable, CodeStory falls back to symbolic or lexical
-ranking and reports the fallback reason.
+Sidecar retrieval is mandatory for agent-facing packet/search workflows. A
+project is usable only when the local Zoekt, Qdrant, and SCIP sidecars report
+`retrieval_mode=full`; missing sidecars, stale manifests, or embedding-contract
+drift fail closed instead of falling back to an older local search path.
 
-Managed setup:
+Product sidecar setup:
 
 ```powershell
-codestory-cli setup embeddings --project <target-workspace> --dry-run --format json
-codestory-cli setup embeddings --project <target-workspace>
+codestory-cli retrieval bootstrap --project <target-workspace>
+$env:CODESTORY_EMBED_BACKEND = "llamacpp"
+$env:CODESTORY_EMBED_LLAMACPP_URL = "http://127.0.0.1:8080/v1/embeddings"
 codestory-cli index --project <target-workspace> --refresh full
+codestory-cli retrieval index --project <target-workspace> --refresh full
+codestory-cli retrieval status --project <target-workspace> --format json
 codestory-cli doctor --project <target-workspace>
 ```
 
+Plain `codestory-cli index` builds the core SQLite code index. It does not
+generate or prove sidecar readiness. Run `codestory-cli retrieval index` only
+after the local sidecar services, llama.cpp embedding endpoint, and
+`bge-base-en-v1.5` model configuration are ready, then require
+`retrieval status --format json` to report `retrieval_mode: "full"` before
+trusting agent-facing packet/search evidence.
+
+Legacy managed embedding setup is local semantic/diagnostic only:
+
+```powershell
+codestory-cli setup embeddings --project <target-workspace> --dry-run --format json
+codestory-cli setup embeddings --project <target-workspace>
+```
+
+Those commands install managed ONNX assets. They do not start llama.cpp, create
+the retrieval manifest, or prove product sidecar readiness.
+
 Useful environment knobs:
 
-- `CODESTORY_HYBRID_RETRIEVAL_ENABLED=false`: lexical-only mode.
-- `CODESTORY_EMBED_RUNTIME_MODE=hash`: fast local development semantics.
-- `CODESTORY_EMBED_BACKEND=onnx`, `llamacpp`, or `hash`: backend selection.
-- `CODESTORY_EMBED_PROFILE=bge-base-en-v1.5`: default managed profile unless
-  overridden.
+- `CODESTORY_EMBED_BACKEND=llamacpp`: product embedding sidecar selection.
+- `CODESTORY_EMBED_LLAMACPP_URL=http://127.0.0.1:8080/v1/embeddings`: local
+  bge-base-en-v1.5 embedding endpoint.
 - `CODESTORY_SEMANTIC_DOC_SCOPE=all`: include lower-signal symbols while
   investigating.
 - `CODESTORY_LLM_DOC_EMBED_BATCH_SIZE=<n>`: override only while profiling.
 
+Hash embeddings, ONNX-only experiments, lexical-only switches, and non-sidecar
+embedding paths are diagnostic or historical comparison modes only.
+Agent-facing packet/search evidence requires repaired sidecars and
+`retrieval_mode=full`.
+
 `index`, `ground`, `search`, `context`, and `doctor` report retrieval mode and
-fallback notes when retrieval state is available.
+degraded-state notes when retrieval state is available.
 
 ## Workspace And Config
 
@@ -270,8 +294,8 @@ Low-memory guidance:
 
 - Prefer `index --refresh incremental` over repeated full refreshes.
 - Avoid running multiple Cargo commands at once in this repo.
-- If embedding assets are unavailable or too heavy, symbolic retrieval remains
-  supported and is reported explicitly.
+- If embedding assets or retrieval sidecars are unavailable, fix that setup
+  layer before using packet/search evidence for broad agent grounding.
 - If a cold index is slow, inspect semantic timing before changing parser or
   graph code.
 
@@ -300,6 +324,10 @@ cargo test -p codestory-indexer --test tictactoe_language_coverage
 cargo test -p codestory-runtime --test retrieval_eval
 ```
 
+`retrieval_eval` runs a fail-closed sidecar-primary check by default. Set
+`CODESTORY_RETRIEVAL_EVAL_FULL_TESTS=1` only in an environment with real full sidecars to run the
+semantic quality assertions.
+
 Heavy repo-scale timing lane:
 
 ```powershell
diff --git a/scripts/setup-retrieval-env.mjs b/scripts/setup-retrieval-env.mjs
new file mode 100644
index 0000000..be33b85
--- /dev/null
+++ b/scripts/setup-retrieval-env.mjs
@@ -0,0 +1,315 @@
+#!/usr/bin/env node
+/**
+ * Thin wrapper around `cargo retrieval-setup` (see .cargo/config.toml).
+ *
+ * Primary documented path: `cargo retrieval-setup` from repo root.
+ * This script adds prerequisite reporting, an optional embed-model download, and optional holdout repo clones.
+ *
+ * Prerequisites: Node 18+, cargo, Docker Desktop (unless --skip-compose).
+ * SCIP language indexers are documented only — not installed by this script.
+ */
+import { spawnSync } from "node:child_process";
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const scriptDir = path.dirname(fileURLToPath(import.meta.url)); // directory containing this script
+const repoRoot = path.resolve(scriptDir, ".."); // parent of scripts/, used as cwd and path base below
+
+function usage() { // Print the command-line help (options and examples) to stdout.
+  console.log(`Usage:
+  node scripts/setup-retrieval-env.mjs [options]
+
+Options:
+  --check-only, --dry-run   Verify prerequisites and print planned steps (no changes)
+  --skip-build              Use the prebuilt CLI in target/ instead of "cargo run -p codestory-cli"
+  --skip-compose            Pass --skip-compose to "retrieval bootstrap"
+  --skip-status             Skip final "retrieval status"
+  --with-holdout-clone      Clone holdout-retrieval OSS repos (network; large)
+  --fetch-embed-model       Download bge-base-en-v1.5.Q8_0.gguf into target/retrieval-models
+  --release                 Build and use release CLI (default: debug for speed)
+  --project <path>          Project root for bootstrap and status (default: repo root)
+  --wait-secs <n>           Bootstrap wait timeout (default: 90)
+
+Examples:
+  node scripts/setup-retrieval-env.mjs --check-only
+  node scripts/setup-retrieval-env.mjs
+  node scripts/setup-retrieval-env.mjs --with-holdout-clone
+`);
+}
+
+/**
+ * Parse command-line arguments into an options object.
+ * Boolean flags enable the matching option; `--project <path>` is resolved to an
+ * absolute path; `--wait-secs <n>` must be a non-negative integer; `--help`/`-h`
+ * prints usage and exits with status 0.
+ *
+ * @param {string[]} argv Arguments after the node executable and script path.
+ * @returns {object} Options, with defaults for anything not given.
+ * @throws {Error} On an unknown argument, a missing option value, or a bad --wait-secs.
+ */
+function parseArgs(argv) {
+  const opts = {
+    checkOnly: false,
+    skipBuild: false,
+    skipCompose: false,
+    skipStatus: false,
+    withHoldoutClone: false,
+    fetchEmbedModel: false,
+    release: false,
+    project: repoRoot,
+    waitSecs: 90,
+  };
+  // Each boolean flag and the option it switches on (--dry-run aliases --check-only).
+  const booleanFlags = new Map([
+    ["--check-only", "checkOnly"],
+    ["--dry-run", "checkOnly"],
+    ["--skip-build", "skipBuild"],
+    ["--skip-compose", "skipCompose"],
+    ["--skip-status", "skipStatus"],
+    ["--with-holdout-clone", "withHoldoutClone"],
+    ["--fetch-embed-model", "fetchEmbedModel"],
+    ["--release", "release"],
+  ]);
+  // Read the value of a value-taking option; fail clearly instead of passing undefined along.
+  const takeValue = (flag, index) => {
+    const value = argv[index];
+    if (value === undefined) {
+      throw new Error(`${flag} requires a value`);
+    }
+    return value;
+  };
+  for (let i = 0; i < argv.length; i += 1) {
+    const arg = argv[i];
+    if (arg === "--help" || arg === "-h") {
+      usage();
+      process.exit(0);
+    } else if (booleanFlags.has(arg)) {
+      opts[booleanFlags.get(arg)] = true;
+    } else if (arg === "--project") {
+      opts.project = path.resolve(takeValue(arg, ++i));
+    } else if (arg === "--wait-secs") {
+      opts.waitSecs = Number.parseInt(takeValue(arg, ++i), 10);
+    } else {
+      throw new Error(`Unknown argument: ${arg}`);
+    }
+  }
+  if (!Number.isInteger(opts.waitSecs) || opts.waitSecs < 0) {
+    throw new Error("--wait-secs must be a non-negative integer");
+  }
+  return opts;
+}
+
+/** Return true when the platform lookup tool (`where` on Windows, `which` elsewhere) exits 0 for `name`. */
+function commandExists(name) {
+  const onWindows = process.platform === "win32";
+  const locator = commandName(onWindows ? "where" : "which");
+  const spawnOptions = { encoding: "utf8", shell: false };
+  const { status } = spawnSync(locator, [commandName(name)], spawnOptions);
+  return status === 0;
+}
+
+// On Windows, append ".exe" unless `name` already ends with it (compared
+// case-insensitively); on every other platform return `name` unchanged.
+function commandName(name) {
+  const needsExe = process.platform === "win32" && !name.toLowerCase().endsWith(".exe");
+  return needsExe ? `${name}.exe` : name;
+}
+
+// Cache directory shown in the report: <LOCALAPPDATA>/codestory/cache on Windows
+// when LOCALAPPDATA is set, otherwise <home>/.cache/codestory/cache.
+function codestoryCacheRoot() {
+  const useLocalAppData = process.platform === "win32" && Boolean(process.env.LOCALAPPDATA);
+  const base = useLocalAppData ? [process.env.LOCALAPPDATA] : [os.homedir(), ".cache"];
+  return path.join(...base, "codestory", "cache");
+}
+
+// Path of the codestory-cli binary under target/release (when `release`) or target/debug.
+function cliPath(release) {
+  const binary = process.platform === "win32" ? "codestory-cli.exe" : "codestory-cli";
+  return path.join(repoRoot, "target", release ? "release" : "debug", binary);
+}
+
+/** Log and run `file args` from the repo root with inherited stdio; throw if it cannot start or fails. */
+function runChecked(label, file, args, env = process.env) {
+  console.log(`\n==> ${label}`);
+  console.log(`    ${file} ${args.join(" ")}`);
+  const options = { cwd: repoRoot, env, encoding: "utf8", shell: false, stdio: "inherit" };
+  const result = spawnSync(commandName(file), args, options);
+  if (result.error) { // set by spawnSync when the child could not be run (e.g. ENOENT)
+    throw new Error(`${label} failed to start: ${result.error.message}`);
+  }
+  if (result.status !== 0) {
+    const how = result.signal ? `signal ${result.signal}` : `exit ${result.status ?? "unknown"}`;
+    throw new Error(`${label} failed (${how})`);
+  }
+}
+
+/** Print the prerequisite report and setup plan; return true when a check marked "required" is missing. */
+function printPrereqReport(opts) {
+  const composeFile = path.join(repoRoot, "docker", "retrieval-compose.yml");
+  const cacheRoot = codestoryCacheRoot();
+  const hasDocker = commandExists("docker"); // probed once; reused for the install hints below
+  // The default compose file only matters when compose will run and no override is configured.
+  const composeNote = opts.skipCompose
+    ? "optional (--skip-compose)"
+    : process.env.CODESTORY_RETRIEVAL_COMPOSE_FILE
+      ? "optional (CODESTORY_RETRIEVAL_COMPOSE_FILE is set)"
+      : "required unless CODESTORY_RETRIEVAL_COMPOSE_FILE points elsewhere";
+  const checks = [
+    ["node", commandExists("node"), "required"],
+    ["cargo", commandExists("cargo"), opts.skipBuild ? "optional (--skip-build)" : "required"],
+    ["docker", hasDocker, opts.skipCompose ? "optional (--skip-compose)" : "required for live Qdrant"],
+    [`compose file (${composeFile})`, fs.existsSync(composeFile), composeNote],
+  ];
+
+  console.log("CodeStory retrieval sidecar environment setup");
+  console.log("Primary path: cargo retrieval-setup");
+  console.log(`Repository: ${repoRoot}`);
+  console.log(`Cache root:   ${cacheRoot}`);
+  console.log("\nPrerequisites:");
+  let failed = false;
+  for (const [name, ok, note] of checks) {
+    const mark = ok ? "OK" : "MISSING";
+    console.log(`  [${mark}] ${name} — ${note}`);
+    if (!ok && note.startsWith("required")) {
+      failed = true;
+    }
+  }
+
+  console.log("\nAutomated:");
+  console.log("  - Docker Compose: Qdrant + Zoekt webserver + llama.cpp embed service");
+  console.log("  - codestory retrieval bootstrap (cache dirs, sidecar state, health wait)");
+  console.log("  - codestory retrieval status --project <path>");
+  if (opts.withHoldoutClone) {
+    console.log("  - node scripts/fetch-holdout-repos.mjs");
+  }
+
+  console.log("\nManual (not automated):");
+  console.log("  - SCIP indexers per language (rust-analyzer scip, scip-typescript, etc.)");
+  console.log("  - retrieval index --project <repo> after sidecars are healthy");
+
+  if (!opts.skipCompose && !hasDocker) {
+    console.log("\nDocker install (Windows):");
+    console.log("  https://docs.docker.com/desktop/setup/install/windows-install/");
+    console.log("\nManual Qdrant without compose:");
+    console.log(
+      `  docker run -d --name codestory-qdrant -p 127.0.0.1:6333:6333 -p 127.0.0.1:6334:6334 ` +
+        `-v "${path.join(cacheRoot, "qdrant")}:/qdrant/storage" qdrant/qdrant:v1.12.5`,
+    );
+    console.log("\nZoekt without compose:");
+    console.log("  run sourcegraph/zoekt-webserver on 127.0.0.1:6070 with the CodeStory shard directory mounted");
+  }
+
+  return failed;
+}
+
+const BGE_GGUF = "bge-base-en-v1.5.Q8_0.gguf"; // file name the downloaded model is saved under
+const BGE_URL = // download source; NOTE(review): confirm this Hugging Face path still resolves
+  "https://huggingface.co/BAAI/bge-base-en-v1.5-GGUF/resolve/main/bge-base-en-v1.5.Q8_0.gguf";
+
+// Directory for the embed model: CODESTORY_EMBED_MODEL_DIR (resolved to an absolute
+// path) when set and non-empty, otherwise target/retrieval-models under the repo root.
+function embedModelDir() {
+  const override = process.env.CODESTORY_EMBED_MODEL_DIR;
+  return override ? path.resolve(override) : path.join(repoRoot, "target", "retrieval-models");
+}
+
+/** Download the embed model into embedModelDir() unless a copy over 1,000,000 bytes exists; resolves to its path. */
+async function fetchEmbedModel() {
+  const dest = path.join(embedModelDir(), BGE_GGUF);
+  fs.mkdirSync(path.dirname(dest), { recursive: true });
+  if (fs.existsSync(dest) && fs.statSync(dest).size > 1_000_000) {
+    console.log(`Embed model already present: ${dest}`);
+    return dest;
+  }
+  console.log(`Downloading ${BGE_GGUF} to ${dest} ...`);
+  const response = await fetch(BGE_URL);
+  if (!response.ok) {
+    throw new Error(`Failed to download embed model: HTTP ${response.status}`);
+  }
+  fs.writeFileSync(`${dest}.partial`, Buffer.from(await response.arrayBuffer()));
+  fs.renameSync(`${dest}.partial`, dest); // publish only after a complete write, never a truncated file
+  console.log(`Wrote ${dest} (${fs.statSync(dest).size} bytes)`);
+  return dest;
+}
+
+/**
+ * Script entry point.
+ * Parses options and prints the prerequisite report; with --check-only it exits
+ * right there (status 1 if a required prerequisite is missing, otherwise 0).
+ * Otherwise it optionally downloads the embed model, runs "retrieval bootstrap",
+ * then "retrieval status" unless --skip-status, then the holdout clone if requested.
+ */
+async function main() {
+  const opts = parseArgs(process.argv.slice(2));
+  const prereqMissing = printPrereqReport(opts);
+  if (opts.checkOnly) {
+    process.exit(prereqMissing ? 1 : 0);
+  }
+  if (prereqMissing && !opts.skipCompose) {
+    throw new Error("Fix missing prerequisites (or use --skip-compose / --skip-build where applicable).");
+  }
+
+  if (opts.fetchEmbedModel) {
+    await fetchEmbedModel();
+  }
+
+  // CLI arguments shared by the cargo-driven and prebuilt-binary invocations.
+  const bootstrapCli = [
+    "retrieval",
+    "bootstrap",
+    "--project",
+    opts.project,
+    "--wait-secs",
+    String(opts.waitSecs),
+    ...(opts.skipCompose ? ["--skip-compose"] : []),
+  ];
+  const statusCli = ["retrieval", "status", "--project", opts.project];
+
+  if (opts.skipBuild) {
+    // --skip-build: run the already-built binary directly instead of going through cargo.
+    const cli = cliPath(opts.release);
+    if (!fs.existsSync(cli)) {
+      throw new Error(`CLI not found at ${cli}; drop --skip-build or run cargo retrieval-setup.`);
+    }
+    runChecked("Bootstrap retrieval sidecars", cli, bootstrapCli);
+    if (!opts.skipStatus) {
+      runChecked("Retrieval status", cli, statusCli);
+    }
+  } else {
+    const releaseFlag = opts.release ? ["--release"] : [];
+    // --release goes right after `run` for bootstrap and after `-p codestory-cli` for status.
+    runChecked("Bootstrap retrieval sidecars", "cargo", [
+      "run",
+      ...releaseFlag,
+      "-p",
+      "codestory-cli",
+      "--",
+      ...bootstrapCli,
+    ]);
+    if (!opts.skipStatus) {
+      runChecked("Retrieval status", "cargo", [
+        "run",
+        "-p",
+        "codestory-cli",
+        ...releaseFlag,
+        "--",
+        ...statusCli,
+      ]);
+    }
+  }
+
+  if (opts.withHoldoutClone) {
+    const holdoutScript = path.join(repoRoot, "scripts", "fetch-holdout-repos.mjs");
+    runChecked("Fetch holdout-retrieval repos", process.execPath, [holdoutScript]);
+  }
+
+  console.log("\nSetup complete.");
+  console.log("Next: cargo run -p codestory-cli -- retrieval index --project <repo-root> --refresh auto");
+}
+
+main().catch((error) => { // top-level failure handler: print the message to stderr, exit with status 1
+  console.error(error instanceof Error ? error.message : error);
+  process.exit(1);
+});
diff --git a/scripts/setup-retrieval-env.ps1 b/scripts/setup-retrieval-env.ps1
new file mode 100644
index 0000000..ad651d6
--- /dev/null
+++ b/scripts/setup-retrieval-env.ps1
@@ -0,0 +1,44 @@
+# Optional Windows entry point for scripts/setup-retrieval-env.mjs.
+# Primary path: cargo retrieval-setup (from repo root).
+[CmdletBinding()]
+param(
+    [switch]$CheckOnly,
+    [switch]$DryRun,
+    [switch]$SkipBuild,
+    [switch]$SkipCompose,
+    [switch]$SkipStatus,
+    [switch]$WithHoldoutClone,
+    [switch]$FetchEmbedModel,
+    [switch]$Release,
+    [string]$Project,
+    [int]$WaitSecs = 90
+)
+
+$ErrorActionPreference = "Stop"
+
+function Require-Command {
+    param([string]$Name)
+    if (-not (Get-Command $Name -ErrorAction SilentlyContinue)) {
+        throw "Required command '$Name' was not found on PATH."
+    }
+}
+
+Require-Command node
+
+$mjs = Join-Path $PSScriptRoot "setup-retrieval-env.mjs"
+
+$nodeArgs = @($mjs)
+if ($CheckOnly -or $DryRun) { $nodeArgs += "--check-only" }
+if ($SkipBuild) { $nodeArgs += "--skip-build" }
+if ($SkipCompose) { $nodeArgs += "--skip-compose" }
+if ($SkipStatus) { $nodeArgs += "--skip-status" }
+if ($WithHoldoutClone) { $nodeArgs += "--with-holdout-clone" }
+if ($FetchEmbedModel) { $nodeArgs += "--fetch-embed-model" }
+if ($Release) { $nodeArgs += "--release" }
+if ($Project) { $nodeArgs += @("--project", (Resolve-Path $Project).Path) }
+$nodeArgs += @("--wait-secs", "$WaitSecs")  # always forwarded so the Node script rejects negative values
+
+& node @nodeArgs
+if ($LASTEXITCODE -ne 0) {
+    exit $LASTEXITCODE
+}