diff --git a/.agents/skills/nemo-retriever-evaluate b/.agents/skills/nemo-retriever-evaluate
new file mode 120000
index 0000000000..a5c784b6d6
--- /dev/null
+++ b/.agents/skills/nemo-retriever-evaluate
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate
\ No newline at end of file
diff --git a/.agents/skills/nemo-retriever-ingest b/.agents/skills/nemo-retriever-ingest
new file mode 120000
index 0000000000..50670720dc
--- /dev/null
+++ b/.agents/skills/nemo-retriever-ingest
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest
\ No newline at end of file
diff --git a/.agents/skills/nemo-retriever-query b/.agents/skills/nemo-retriever-query
new file mode 120000
index 0000000000..402b4e2e74
--- /dev/null
+++ b/.agents/skills/nemo-retriever-query
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query
\ No newline at end of file
diff --git a/.agents/skills/nemo-retriever-service b/.agents/skills/nemo-retriever-service
new file mode 120000
index 0000000000..be8bce771d
--- /dev/null
+++ b/.agents/skills/nemo-retriever-service
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service
\ No newline at end of file
diff --git a/.agents/skills/nemo-retriever-setup b/.agents/skills/nemo-retriever-setup
new file mode 120000
index 0000000000..011ed27a8e
--- /dev/null
+++ b/.agents/skills/nemo-retriever-setup
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup
\ No newline at end of file
diff --git a/.claude/skills/nemo-retriever-evaluate b/.claude/skills/nemo-retriever-evaluate
new file mode 120000
index 0000000000..a5c784b6d6
--- /dev/null
+++ b/.claude/skills/nemo-retriever-evaluate
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate
\ No newline at end of file
diff --git a/.claude/skills/nemo-retriever-ingest b/.claude/skills/nemo-retriever-ingest
new file mode 120000
index 0000000000..50670720dc
--- /dev/null
+++ b/.claude/skills/nemo-retriever-ingest
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest
\ No newline at end of file
diff --git a/.claude/skills/nemo-retriever-query b/.claude/skills/nemo-retriever-query
new file mode 120000
index 0000000000..402b4e2e74
--- /dev/null
+++ b/.claude/skills/nemo-retriever-query
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query
\ No newline at end of file
diff --git a/.claude/skills/nemo-retriever-service b/.claude/skills/nemo-retriever-service
new file mode 120000
index 0000000000..be8bce771d
--- /dev/null
+++ b/.claude/skills/nemo-retriever-service
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service
\ No newline at end of file
diff --git a/.claude/skills/nemo-retriever-setup b/.claude/skills/nemo-retriever-setup
new file mode 120000
index 0000000000..011ed27a8e
--- /dev/null
+++ b/.claude/skills/nemo-retriever-setup
@@ -0,0 +1 @@
+../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup
\ No newline at end of file
diff --git a/.claude/skills/nemo-retriever/SKILL.md b/.claude/skills/nemo-retriever/SKILL.md
deleted file mode 100644
index 6e07ff6f76..0000000000
--- a/.claude/skills/nemo-retriever/SKILL.md
+++ /dev/null
@@ -1,92 +0,0 @@
----
-name: nemo-retriever
-description: Use when the user wants to search, index, or answer questions over a folder of PDFs (or other documents) — including building a RAG / search index over PDFs, looking up information across many PDFs, or running the `retriever` CLI (ingest, query, pipeline, recall, eval, etc.).
----
-
-# nemo-retriever
-
-The `retriever` CLI indexes a folder of PDFs into LanceDB (`retriever ingest`) and serves vector search over it (`retriever query`). For any task about searching/answering questions across a folder of PDFs, use this CLI — do not write a custom RAG.
-
-## Setup turn (when `./lancedb/nv-ingest.lance` doesn't exist)
-
-`retriever ingest ./pdfs/` runs the full pipeline (text extraction + page-element detection + OCR + embedding + LanceDB insert). On corpora >~800 pages this often won't fit a typical setup turn budget (10 min) — the OCR + page-element stages dominate and scale roughly linearly with page count. Always build an index — pick the recipe by corpus size:
-
-```bash
-TOTAL_PAGES=$(python -c "import pypdfium2, glob; print(sum(len(pypdfium2.PdfDocument(p)) for p in glob.glob('./pdfs/*.pdf')))" 2>/dev/null || echo 0)
-echo "total_pages=$TOTAL_PAGES"
-if [ "$TOTAL_PAGES" -le 800 ]; then
-  retriever ingest ./pdfs/ --embed-model-name nvidia/llama-nemotron-embed-1b-v2
-else
-  retriever pipeline run ./pdfs/ --run-mode inprocess --method pdfium --no-extract-tables --no-extract-charts --no-extract-page-as-image --evaluation-mode none --embed-model-name nvidia/llama-nemotron-embed-1b-v2
-fi
-```
-
-The `else` branch skips page-element detection, OCR, table extraction, and chart extraction — only pdfium text extraction + embedding. Embedding runs locally via the bundled HuggingFace model by default (no remote NIM needed). It's strictly better to have a text-only index than no index at all: the per-query pdfium text-extract fallback re-extracts a full PDF *per query*, which is both slow and expensive. Page-element detection may emit warning logs when its remote endpoint isn't reachable; the warnings are non-fatal as long as the embedding step itself succeeds.
-
-Don't pre-OCR, don't pre-chunk, don't write Python wrappers — the CLI handles extraction + (optionally) page-element detection + OCR + embedding + LanceDB insert in one shot.
-
-## Query turn — the WHOLE workflow
-
-```bash
-retriever query "<the user's question>" --top-k 10 --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --rerank \
-  | tee /tmp/hits.json \
-  | jq -r '.[] | "rank=\(.rank // 0) page=\(.page_number) pdf=\(.pdf_basename) type=\(.metadata.type // "?") text=\(.text[:200])"'
-```
-
-Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | jq ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The full JSON sits at `/tmp/hits.json` if you need to re-parse it (`jq '.[6]' /tmp/hits.json`), but in the common case the jq summary above is all you need.
-
-That's your FIRST tool call on every query turn. Do not Read, Glob, Grep, or list PDFs before this — those duplicate what `retriever query` already did.
-
-**No narration between tool calls.** Do not write "Let me search…", "I'll now analyze…", "The retriever returned…", or any other commentary. Every assistant token you emit between the `retriever query` Bash call and the `Write` of `./output.json` becomes input tokens (and cached input tokens) for every subsequent turn in this session — quadratic cost. Go straight from reading the jq summary to writing the JSON file. The only assistant text in a query turn should be the tool calls themselves.
-
-Each hit has: `text`, `pdf_basename`, `page_number` (int, **1-indexed**: the first page of a PDF is page `1`), `pdf_page` (string composite key `"<basename>_<page_number>"` — not a number, don't use it as one), `_distance`, and `metadata` (JSON with `type` ∈ `text|table|chart|image`).
-
-**Then write `./output.json` directly from $HITS:**
-
-- `final_answer`: synthesize from the top hits' `text`. Include the exact number / name / date / row / column the question asks for, plus the source PDF and 0-indexed page. One paragraph. No restating the question, no hedging caveats. If the chunks talk *around* the fact but don't state it, run ONE `retriever pdf stage page-elements ./pdfs --method pdfium --json-output-dir /tmp/pdf_text --compact-json` and read `/tmp/pdf_text/<top_pdf>.pdf.pdf_extraction.json` for the rank-1 page (or rank-2 if rank-1 is metadata) — that almost always surfaces the exact figure. Then synthesize. **If after both calls the asked-for fact still isn't in the evidence, write `final_answer` that says so explicitly** — e.g. "The retrieved pages do not state [X] for [entity]; the closest content is [Y]." Do NOT invent, extrapolate, or generate plausible-sounding content from adjacent material. A confidently-wrong answer scores worse than an honest "not in the retrieved pages".
-- `ranked_retrieved`: one entry per hit in the order `retriever query` returned: `{"doc_id": "<pdf_basename without .pdf>", "page_number": <int>, "rank": <i+1>}`. Up to 10. Duplicate `(doc, page)` is fine. **Indexing:** the retriever's `page_number` is 1-indexed. If the task's output schema says 0-indexed (e.g. "first page is page 0"), emit `hit.page_number - 1`; if the task says 1-indexed or doesn't specify, emit `hit.page_number` as-is.
-
-**Before writing `final_answer`, re-read the question.** If it lists multiple entities, years, or categories, your answer must address each one explicitly — even if for some of them the chunks say "not provided" or contain no data. Missing entities lose more judge points than imprecise numbers.
-
-**Charts and images need extra caution — this is the single biggest source of judge=2/3 trials.** When `metadata.type` of a hit is `chart` or `image`, its `text` field is a model-generated transcription that frequently:
-
-- reverses direction words (`increase`↔`decrease`, `rose`↔`fell`, `surge`↔`drop`), and
-- rounds or misreads exact percentages (e.g. transcribing 12% as 20%).
-
-If a question asks for an exact percentage or a directional claim **and the evidence is only a chart/image hit** (no `text`-type hit corroborates the same number or direction):
-
-1. Run the targeted `retriever pdf stage page-elements --method pdfium` text-extract on the rank-1 PDF (this counts as your second tool call) and look for the number in prose.
-2. If prose confirms the chart number, assert it confidently.
-3. If prose doesn't mention it, **quote the chart transcription verbatim with an explicit hedge in `final_answer`**: "The chart on page N indicates [verbatim phrase] (chart-derived, not verified against prose)." Do NOT restate the chart's number as a confident fact.
-
-When both a chart hit and a text hit cover the same fact, always prefer the text hit's number.
-
-After writing the file, STOP. No print, no summary, no further tool calls.
-
-### Hard limits (cost discipline)
-
-- ONE `retriever query` per turn. ONE optional targeted text-extract on the rank-1 PDF if the chunks miss the asked-for fact. That's the budget — it is a hard cap, not a soft preference.
-- After your 2nd tool call, write `final_answer` with what you have and STOP. If both calls left the asked-for fact unresolved, write `final_answer` that **explicitly states the retrieved pages don't contain the requested fact** (naming the closest related content if any) — **do not run more tool calls hunting for it, and do not extrapolate a plausible value.** Long-running query turns (5+ tool calls, 1M+ cache-read tokens) cost ~5× a disciplined turn and usually still produce the wrong answer.
-- Don't read whole PDFs.
-- Don't make speculative Read/Glob/Grep calls "to confirm". The retriever already found the relevant pages — trust the ranking.
-- Don't spawn agents, write plans, or make todo lists. The workflow above is the workflow.
-
-### If the index is missing or `retriever query` returns `[]`
-
-Means ingest didn't complete (e.g. the text-only pipeline still hit the turn wall, or the table is empty). Tight fallback using the retriever's own pdfium-based extractor (always available — same binary the agent just used for `retriever query`):
-1. `ls ./pdfs/` (one call) to see filenames.
-2. Pick the SINGLE PDF whose name best matches the question.
-3. ONE call: `retriever pdf stage page-elements ./pdfs --method pdfium --json-output-dir /tmp/pdf_text --compact-json`. This emits a JSON sidecar per PDF at `/tmp/pdf_text/<basename>.pdf.pdf_extraction.json` containing per-page text primitives — pdfium only, no OCR, no NIM, fast.
-4. `jq` (or read directly) `/tmp/pdf_text/<name>.pdf.pdf_extraction.json` for the chosen PDF and synthesize from the per-page text. If the answer isn't there, still write your best guess based on the filename + extracted pages plus a one-sentence acknowledgement of uncertainty in `final_answer`. Then stop.
-
-Do NOT keep doing text-extract calls across many PDFs to hunt — that exhausts the turn budget. Better to answer partially than to time out. Never re-run `retriever ingest`.
-
-For an unlisted subcommand: `retriever <subcommand> --help`.
-
-## Failure modes
-
-- **First `ingest` takes ~60s+** — vLLM warmup. Expected.
-- **First `query` takes ~10–15s** — embedder cold-start. Expected.
-- **Empty result** — ingest didn't run. Use the fallback above.
-- **`Clamping num_partitions ...`** — informational on tiny corpora, not an error.
-- **Low-relevance top hit on tiny corpus** — look at `_distance` *gaps* between hits, not absolute values.
diff --git a/.claude/skills/nemo-retriever/references/ingest.md b/.claude/skills/nemo-retriever/references/ingest.md
deleted file mode 100644
index b3a52788ce..0000000000
--- a/.claude/skills/nemo-retriever/references/ingest.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# retriever ingest
-
-End-to-end ingestion of documents and media into a LanceDB table — runs the
-full extract → embed → vector-DB pipeline in a single command.
-
-If flags below look stale, re-check `retriever ingest --help`.
-
-## When to use this
-
-- You have one or more supported files (or a directory/glob of files) and want them
-  searchable via `retriever query`.
-- You want the default pipeline: auto-select extraction for PDF/DOC/PPTX,
-  text, HTML, image, audio, or video inputs, then embed and insert into
-  LanceDB. No per-stage tuning needed.
-
-**Use a different command when:**
-
-- You only need a single stage (e.g. just extract text, no embeddings) →
-  `retriever pdf`, `retriever chart`, `retriever image`, etc.
-- You want fine-grained control over the pipeline graph → `retriever pipeline`.
-- You need a long-running service rather than one-shot CLI → `retriever service`.
-- You're benchmarking throughput → `retriever benchmark`.
-- You're iterating on the pipeline locally and want a non-distributed runner →
-  `retriever local`.
-
-## Canonical invocations
-
-Ingest a single file into the default table (`lancedb/nv-ingest.lance`):
-
-```bash
-retriever ingest data/multimodal_test.pdf
-```
-
-Ingest a directory of supported files:
-
-```bash
-retriever ingest data/corpus/
-```
-
-Ingest via glob:
-
-```bash
-retriever ingest "data/**/*"
-```
-
-Force a specific input family:
-
-```bash
-retriever ingest data/slides/ --input-type doc
-retriever ingest data/images/ --input-type image
-retriever ingest data/audio/ --input-type audio
-retriever ingest data/video/ --input-type video
-```
-
-Write to a custom DB / table:
-
-```bash
-retriever ingest data/multimodal_test.pdf \
-  --lancedb-uri ./my-lancedb \
-  --table-name my-corpus
-```
-
-## Inputs
-
-- **Positional `DOCUMENTS...`** — one or more file paths, directories, or
-  shell globs. Required, repeatable.
-- **Supported input types** — `pdf`, `doc` (`.docx`, `.pptx`), `txt`, `html`,
-  `image` (`.jpg`, `.jpeg`, `.png`, `.tiff`, `.tif`, `.bmp`, `.svg`),
-  `audio` (`.mp3`, `.wav`, `.m4a`), and `video` (`.mp4`, `.mov`, `.mkv`).
-
-## Outputs
-
-- A LanceDB dataset at `<lancedb-uri>/<table-name>.lance`. Default:
-  `./lancedb/nemo-retriever.lance`.
-- One row per extracted primitive (text chunk, table, chart, image region),
-  each with: `text`, `source`, `page_number`, `metadata` (JSON: type, bbox, …),
-  and the embedding vector.
-
-## Key flags
-
-| Flag | Default | Notes |
-|---|---|---|
-| `--lancedb-uri` | `lancedb` | Path or URI of the LanceDB database. |
-| `--table-name` | `nv-ingest` | LanceDB table to write into. Must match `retriever query`'s table on read. |
-| `--input-type` | `auto` | Input family to ingest. `auto` detects from file extensions and supports mixed directories. |
-| `--run-mode` | `inprocess` | `inprocess` for local runs; `batch` for the SDK batch ingestor. |
-
-## Pipeline shape
-
-For PDF/DOC/PPTX inputs, `ingest` runs the optimized document pipeline:
-
-1. `DocToPdfConversionActor` — non-PDF inputs → PDF (no-op for PDFs).
-2. `PDFSplitActor` — split into per-page tasks.
-3. `PDFExtractionActor` — extract native text/structure.
-4. `PageElementDetectionActor` — detect tables, charts, images, text blocks.
-5. `OCRV2Actor` — OCR text where native extraction is missing/poor.
-6. `UDFOperator` — user-defined transforms (passthrough by default).
-7. `_BatchEmbedActor` — embed primitives with `llama-nemotron-embed-1b-v2`.
-8. `IngestVdbOperator` — insert rows into LanceDB.
-
-For text, HTML, image, audio, video, or mixed `auto` inputs, `ingest` routes
-through the same GraphIngestor extraction paths used by `retriever pipeline`.
-
-## Common failure modes
-
-- **`Clamping num_partitions from 16 to 7`** — informational, not an error.
-  LanceDB IVF index needs `num_partitions < row_count`; happens on very small
-  ingests.
-- **First run is slow (~60s+ before any pages process)** — vLLM model load and
-  CUDA-graph capture for the embedder. Subsequent runs in the same process
-  are fast; one-shot CLI invocations always pay this cost.
-- **`No existing dataset at …/nemo-retriever.lance, it will be created`** — expected
-  on the first ingest into a new DB. Subsequent ingests append.
-- **HuggingFace download on first run** — the embedder and page-element
-  detector pull weights to `~/.cache/huggingface`. Needs network the first
-  time; cached afterwards.
-
-## Related
-
-- [[query]] — search the table this command writes.
-- `retriever vector-store --help` — utilities for inspecting/moving LanceDB
-  tables.
-- `retriever pipeline --help` — same end-to-end ingest but exposes per-stage
-  knobs.
diff --git a/.claude/skills/nemo-retriever/references/query.md b/.claude/skills/nemo-retriever/references/query.md
deleted file mode 100644
index b9dfe9ccc7..0000000000
--- a/.claude/skills/nemo-retriever/references/query.md
+++ /dev/null
@@ -1,95 +0,0 @@
-# retriever query
-
-Embed a text query and return the top-k nearest rows from a LanceDB table
-previously written by `retriever ingest` (or any compatible pipeline).
-
-If flags below look stale, re-check `retriever query --help`.
-
-## When to use this
-
-- You have already ingested documents and want to retrieve relevant
-  chunks/primitives for a natural-language query.
-- You want a one-shot CLI lookup — no service, no UI.
-
-**Use a different command when:**
-
-- You want recall metrics over a labelled query set → `retriever recall`.
-- You want to grade end-to-end QA quality → `retriever eval`.
-- You want a long-running query endpoint → `retriever service`.
-- You want to compare two retrieval runs → `retriever compare`.
-
-## Canonical invocations
-
-Top-10 search against the default table:
-
-```bash
-retriever query "what is in chart 1?"
-```
-
-Top-3, custom table:
-
-```bash
-retriever query "average frequency ranges for tweeters" \
-  --top-k 3 \
-  --lancedb-uri ./my-lancedb \
-  --table-name my-corpus
-```
-
-## Inputs
-
-- **Positional `QUERY`** — single text string. Required. Quote it in the shell
-  to keep multi-word queries intact.
-
-## Outputs
-
-- JSON array on stdout, one object per hit, sorted by ascending `_distance`
-  (lower = more similar). Each hit includes:
-  - `_distance` — vector distance in the embedding space.
-  - `text` — the retrieved primitive's text content.
-  - `source` / `path` / `source_id` — origin document path.
-  - `page_number`, `pdf_basename`, `pdf_page` — locator.
-  - `metadata` — JSON string with `type` (`text` / `table` / `chart` / `image`)
-    and, where applicable, a normalised `bbox_xyxy_norm`.
-
-Pipe to `jq` for filtering, e.g. only chart hits:
-
-```bash
-retriever query "gadget costs" | jq '[.[] | select(.metadata | fromjson.type == "chart")]'
-```
-
-## Key flags
-
-| Flag | Default | Notes |
-|---|---|---|
-| `--top-k` | `10` | Max hits to return. Must be ≥ 1. |
-| `--lancedb-uri` | `lancedb` | Must match what `ingest` wrote to. |
-| `--table-name` | `nemo-retriever` | Must match what `ingest` wrote to. |
-
-## Distance interpretation
-
-- The embedder (`llama-nemotron-embed-vl-1b-v2`) returns mean-pooled vectors;
-  LanceDB returns L2 distance by default. Typical relevant hits are in the
-  ~1.0–1.7 range for this model on prose queries; treat `_distance` as
-  **ranking-only**, not a calibrated similarity score.
-- The query uses the **VL** variant of the embedder so text queries can match
-  ingested image/chart embeddings as well as text. Expect mixed-modality hits
-  in the result list.
-
-## Common failure modes
-
-- **Empty result array** — table is empty (no ingest run yet) or
-  `--table-name` / `--lancedb-uri` don't match where ingest wrote.
-- **`Table 'nemo-retriever' was not found`** — same root cause: wrong table/URI,
-  or ingest hasn't been run.
-- **First query is slow (~10–15s)** — vLLM startup for the query embedder.
-  Subsequent queries in the same process are sub-second; one-shot CLI
-  invocations always pay this cost.
-- **Surprisingly low-relevance top hit** — for very short corpora, even
-  unrelated queries return *something* with a non-huge distance. Inspect
-  `_distance` gaps between hits rather than absolute values.
-
-## Related
-
-- [[ingest]] — populate the table this command reads.
-- `retriever recall --help` — batch query → recall@k against ground truth.
-- `retriever eval --help` — end-to-end QA evaluation.
diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml
index 9ff55b3ec9..e8571cd1b7 100644
--- a/nemo_retriever/pyproject.toml
+++ b/nemo_retriever/pyproject.toml
@@ -214,6 +214,7 @@ explicit = true
 where = ["src"]
 
 [tool.setuptools.package-data]
+"nemo_retriever" = [".agents/skills/**/*"]
 "nemo_retriever.harness.portal" = ["static/**/*"]
 "nemo_retriever.service" = ["retriever-service.yaml"]
 "nemo_retriever.skill_eval" = ["prompts/*.j2", "configs/*.yaml"]
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/PITFALLS.md
new file mode 100644
index 0000000000..dfa781a8a6
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/PITFALLS.md
@@ -0,0 +1,52 @@
+# Evaluate Pitfalls
+
+## Missing Evaluation Surface
+
+If `retriever recall` or `retriever eval` is unavailable, first try the
+repo-local CLI when a source checkout exists:
+
+```bash
+uv run --project nemo_retriever retriever eval --help
+```
+
+If the command fails while downloading dependencies, retry it before choosing another validation path.
+
+## Table Name Drift
+
+Root CLI ingest defaults to table `nv-ingest`. Some evaluation docs and older
+graph-pipeline examples mention `nemo-retriever`. Export/recall must point at
+the table that was actually written.
+
+## Retrieval JSON Contract
+
+`retriever eval run` in file mode needs a retrieval JSON whose top-level
+`queries` object maps each ground-truth question string to its retrieved
+`chunks`. If the query strings do not match the ground-truth loader's normalized
+form, coverage will drop even if retrieval quality is good.
+
+## Coverage Failures
+
+`retriever eval run` checks retrieval coverage before generation. Low coverage
+usually means the retrieval JSON and QA dataset keys do not align, the wrong
+dataset loader was selected, or the wrong table/query CSV was used.
+
+## LLM Extras And Keys
+
+QA eval needs the `[llm]` extra and generator/judge API configuration. Missing
+`litellm`, `NVIDIA_API_KEY`, `GEN_API_KEY`, or `JUDGE_API_KEY` should be reported
+as setup gaps, not retrieval failures.
+
+## Eval Export Remote Endpoint Gap
+
+`retriever eval export` supports local-HF query embedding with
+`--local-query-embed-backend hf`, `--local-hf-cache-dir`, and
+`--local-hf-device`, but it still does not expose `--embed-invoke-url` /
+`--embedding-http-endpoint` for remote/self-hosted embedding services. If export
+must use a remote endpoint, use `retriever recall ... --embedding-http-endpoint ...`
+for recall metrics or build the retrieval JSON with the Python `Retriever` API
+and explicit `embed_kwargs`.
+
+## Requery Cost
+
+Do not re-ingest or re-query LanceDB when changing only generator/judge models.
+Save and reuse retrieval JSON whenever the retrieval stage is unchanged.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/SKILL.md
new file mode 100644
index 0000000000..aea96be3fa
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/SKILL.md
@@ -0,0 +1,82 @@
+---
+name: nemo-retriever-evaluate
+description: Use when the user asks to measure NeMo Retriever retrieval quality, recall, QA answer quality, compare retrieval outputs, export evaluation JSON, build page markdown indexes, or run `retriever recall` / `retriever eval` workflows. Do not use for ad-hoc answering from a single index; use `nemo-retriever-query`.
+---
+
+# nemo-retriever-evaluate
+
+Use this skill for repeatable retrieval or QA evaluation, not one-off question
+answering.
+
+## Orientation
+
+1. Verify the installed surface: `retriever recall --help`,
+   `retriever recall vdb-recall run --help`, `retriever eval export --help`,
+   and `retriever eval run --help`.
+2. Decide the evaluation type:
+   - Recall metrics over labeled query/page data: `retriever recall vdb-recall run`.
+   - QA generation and judging: `retriever eval export` plus `retriever eval run`,
+     or `retriever eval run --from-env`.
+   - End-to-end ingest plus QA: `retriever pipeline run --evaluation-mode qa`.
+3. If the installed CLI is absent but this is a source checkout, use
+   `uv run --project nemo_retriever retriever ...`. If dependency downloads
+   fail, retry the command before choosing another validation path.
+4. If neither path works, use `nemo-retriever-setup` before debugging evaluation
+   behavior.
+
+## References
+
+- `references/EVALUATE.md`: recall, QA export/run, page markdown, config, and
+  artifact contracts.
+- `PITFALLS.md`: table mismatches, retrieval JSON schema, coverage failures,
+  missing LLM extras, and expensive reruns.
+
+## Workflow
+
+1. Identify the corpus/index and ground-truth data. Query CSVs commonly need
+   `query,pdf_page` or `query,pdf,page`.
+2. For recall:
+
+   ```bash
+   retriever recall vdb-recall run \
+     --query-csv ./queries.csv \
+     --lancedb-uri ./lancedb \
+     --table-name nv-ingest \
+     --top-k 10
+   ```
+
+3. For QA evaluation with reusable retrieval JSON:
+
+   ```bash
+   retriever eval export \
+     --lancedb-uri ./lancedb \
+     --lancedb-table nv-ingest \
+     --query-csv ./qa.csv \
+     --output ./eval/retrieval.json
+
+   retriever eval run --config ./eval_sweep.yaml
+   ```
+
+4. If local HuggingFace query embeddings are required, add
+   `--local-query-embed-backend hf`, `--local-hf-cache-dir`, and
+   `--local-hf-device` to `eval export`.
+5. If `eval export` cannot use the required remote embedding endpoint, build the
+   retrieval JSON with the Python `Retriever` API and `write_retrieval_json`
+   using explicit `embed_kwargs`.
+6. Use `retriever eval build-page-index` when full-page markdown is needed from
+   Parquet produced during ingest.
+7. Preserve generated artifacts and report exact paths so the user can rerun
+   generation/judging without re-querying LanceDB.
+
+## Success Checks
+
+- Recall prints `recall@...` metrics or rich recall output.
+- QA eval reports coverage and writes result JSON.
+- Retrieval JSON has a top-level `queries` object keyed by ground-truth question
+  strings.
+
+## Evaluation Scenarios
+
+- "Measure recall@10 for this LanceDB table and query CSV." Use this skill.
+- "Export retrieval JSON and run QA judging with a config." Use this skill.
+- "What does the index say about a single question?" Use `nemo-retriever-query`.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/references/EVALUATE.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/references/EVALUATE.md
new file mode 100644
index 0000000000..5349ad697c
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/references/EVALUATE.md
@@ -0,0 +1,192 @@
+# Evaluate Reference
+
+## Contents
+
+- [Recall](#recall)
+- [QA Evaluation](#qa-evaluation)
+- [Eval From Environment](#eval-from-environment)
+- [End-To-End QA Shortcut](#end-to-end-qa-shortcut)
+- [LLM Requirements](#llm-requirements)
+
+## Recall
+
+Use recall when the task has labeled expected pages or document/page keys:
+
+```bash
+retriever recall vdb-recall run \
+  --query-csv ./queries.csv \
+  --lancedb-uri ./lancedb \
+  --table-name nv-ingest \
+  --top-k 10
+```
+
+The query CSV expects `query,pdf_page` or `query,pdf,page` columns. The command
+retrieves at least 10 hits internally for recall@10 even when fewer are printed.
+
+Remote query embedding options include:
+
+- `--embedding-endpoint`
+- `--embedding-http-endpoint`
+- `--embedding-grpc-endpoint`
+- `--embedding-model`
+- `--embedding-api-key`
+
+If you omit the embedding endpoint options, recall falls back to local
+HuggingFace embeddings and may download a model. For quick remote-NIM runs,
+pass the endpoint/model/API key explicitly.
+
+## QA Evaluation
+
+Preferred reproducible path:
+
+```bash
+retriever eval export \
+  --lancedb-uri ./lancedb \
+  --lancedb-table nv-ingest \
+  --query-csv ./qa.csv \
+  --output ./eval/retrieval.json
+
+retriever eval run --config ./eval_sweep.yaml
+```
+
+`retriever eval export` writes the retrieval JSON contract consumed by
+`retriever eval run` / `FileRetriever`. It can also use `--page-index` to replace
+sub-page chunks with full-page markdown.
+
+For local HuggingFace query embeddings:
+
+```bash
+retriever eval export \
+  --lancedb-uri ./lancedb \
+  --lancedb-table nv-ingest \
+  --query-csv ./qa.csv \
+  --output ./eval/retrieval.json \
+  --top-k 5 \
+  --embedder nvidia/llama-nemotron-embed-1b-v2 \
+  --local-query-embed-backend hf \
+  --local-hf-cache-dir "$HOME/models/huggingface" \
+  --local-hf-device cuda
+```
+
+Minimal `eval_sweep.yaml` for an existing retrieval JSON:
+
+```yaml
+dataset:
+  source: "csv:./qa.csv"
+
+retrieval:
+  type: "file"
+  file_path: "./eval/retrieval.json"
+
+models:
+  generator:
+    model: "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5"
+    api_key: "${NVIDIA_API_KEY}"
+  judge:
+    model: "nvidia_nim/mistralai/mixtral-8x22b-instruct-v0.1"
+    api_key: "${NVIDIA_API_KEY}"
+
+evaluations:
+  - generator: "generator"
+    judge: "judge"
+    runs: 1
+
+execution:
+  top_k: 5
+  max_workers: 8
+
+output:
+  results_dir: "./eval/results"
+```
+
+`retriever eval export` supports local-HF query embedding options, but it does
+not currently expose a remote embedding endpoint flag. When the index must be
+queried with a remote/self-hosted embedding endpoint, use the Python API to
+create the same FileRetriever JSON contract (in the snippet, `queries` is your own iterable of rows that each have a `query` key):
+
+```python
+from nemo_retriever.export import write_retrieval_json
+from nemo_retriever.retriever import Retriever
+
+retriever = Retriever(
+    top_k=5,
+    vdb_kwargs={"uri": "./lancedb", "table_name": "nv-ingest"},
+    embed_kwargs={
+        "embedding_endpoint": "http://embed:8000/v1",
+        "embed_invoke_url": "http://embed:8000/v1",
+        "model_name": "nvidia/llama-nemotron-embed-1b-v2",
+        "embed_model_name": "nvidia/llama-nemotron-embed-1b-v2",
+    },
+    rerank=False,
+)
+
+all_results = {}
+for row in queries:
+    hits = retriever.query(row["query"])
+    all_results[row["query"]] = {
+        "chunks": [hit.get("text", "") for hit in hits],
+        "metadata": [
+            {
+                "source_id": hit.get("source_id") or hit.get("source"),
+                "page_number": hit.get("page_number"),
+                "distance": hit.get("_distance"),
+            }
+            for hit in hits
+        ],
+    }
+
+write_retrieval_json(all_results, "./eval/retrieval.json", {"vdb_backend": "lancedb"})
+```
+
+Build a page markdown index from ingestion Parquet:
+
+```bash
+retriever eval build-page-index \
+  --parquet-dir ./processed_docs \
+  --output ./page_markdown.json
+```
+
+## Eval From Environment
+
+Run from an existing retrieval JSON:
+
+```bash
+export RETRIEVAL_FILE=./eval/retrieval.json
+export QA_DATASET=csv:./qa.csv
+export RESULTS_DIR=./eval/results
+retriever eval run --from-env
+```
+
+Run live retrieval from LanceDB and optionally save the retrieval JSON for
+repeatable reruns:
+
+```bash
+export LANCEDB_URI=./lancedb
+export LANCEDB_TABLE=nv-ingest
+export QA_DATASET=csv:./qa.csv
+export RETRIEVAL_SAVE_PATH=./eval/retrieval.json
+export RESULTS_DIR=./eval/results
+retriever eval run --from-env
+```
+
+## End-To-End QA Shortcut
+
+`retriever pipeline run` can ingest and run QA in one command:
+
+```bash
+retriever pipeline run ./data/corpus \
+  --lancedb-uri ./lancedb \
+  --evaluation-mode qa \
+  --eval-config ./eval_sweep.yaml \
+  --query-csv ./qa.csv \
+  --retrieval-save-path ./eval/retrieval.json
+```
+
+Use this for development iteration. For benchmark comparisons, prefer the
+separable export/run path so retrieval can be reused.
+
+## LLM Requirements
+
+QA generation and judging need the `nemo-retriever[llm]` extra and model/API
+configuration in the eval config or environment. `NVIDIA_API_KEY` is commonly
+used as a fallback for generator and judge keys.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/PITFALLS.md
new file mode 100644
index 0000000000..61bea87e6c
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/PITFALLS.md
@@ -0,0 +1,84 @@
+# Ingest Pitfalls
+
+## No Installed Surface
+
+First check the installed user surface:
+
+```bash
+retriever --help
+python -c "import importlib.util; print(importlib.util.find_spec('nemo_retriever'))"
+```
+
+If `retriever` is missing and this is explicitly a source checkout, use the
+developer fallback:
+
+```bash
+uv run --project nemo_retriever retriever --help
+```
+
+If that fails because dependencies need to download, retry the command.
+
+Only report the environment as missing after checking the installed command and,
+when a source checkout is actually available, the repo-local fallback.
+
+If there is no installed command, no importable `nemo_retriever` package, and no
+source checkout, switch to environment setup. Do not guess alternate command
+names such as `nemo-retriever` or proceed with a custom RAG implementation.
+
+## False-Positive Ingest Output
+
+`retriever ingest` can exit zero and print `Ingested N document(s)` even when no
+uploadable LanceDB rows were produced. Always validate with `db.table_names()`,
+`table.count_rows()`, and a smoke query before reporting success.
+
+## TXT Requires Transformers
+
+TXT ingestion uses a HuggingFace tokenizer. In a lean environment, missing
+`transformers` can lead to empty extraction or a direct
+`ModuleNotFoundError: No module named 'transformers'`. Recover by installing or
+transiently adding it. Installed environment:
+
+```bash
+uv pip install transformers
+retriever ingest ./docs/*.txt --input-type txt ...
+```
+
+Source checkout:
+
+```bash
+uv run --project nemo_retriever --with transformers retriever ingest ./docs/*.txt --input-type txt ...
+```
+
+## Table Defaults Drift
+
+The root CLI defaults are `--lancedb-uri lancedb` and `--table-name nv-ingest`.
+Some older docs and examples use `nemo-retriever` as the table name. Always
+match the table that was actually written.
+
+## Overwrite Is Default
+
+`retriever ingest` overwrites the target table unless `--append` is passed. Do
+not append on reruns unless duplicates are acceptable.
+
+## First Run Can Be Slow
+
+Local GPU model loading, CUDA graph capture, Ray startup, and first-time model
+downloads can dominate the first run. This is not automatically a failed ingest.
+Look for a non-zero exit or explicit validation error.
+
+## Remote Endpoints Need Matching Query Settings
+
+If ingest used `--embed-invoke-url` or a non-default `--embed-model-name`, query
+and evaluation must use the same endpoint/model pair. Mismatched embeddings can
+return empty or irrelevant hits.
+
+## Single-PDF And Tiny Corpora
+
+Tiny LanceDB tables can emit partition/index warnings or produce weak nearest
+neighbors for unrelated queries. Validate with a query that should be present in
+the corpus and inspect rows before tuning thresholds.
+
+## Stale Documentation
+
+If a flag from docs is rejected, run the command-specific `--help` and adapt to
+the installed CLI. Report the mismatch in your final answer rather than hiding it.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/SKILL.md
new file mode 100644
index 0000000000..4469e26a4c
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/SKILL.md
@@ -0,0 +1,97 @@
+---
+name: nemo-retriever-ingest
+description: Use when the user asks to ingest, index, embed, or make documents searchable with NeMo Retriever, including `retriever ingest`, `retriever pipeline run`, LanceDB creation, extraction outputs, or ingestion validation. Do not use for querying an existing index; use `nemo-retriever-query` instead.
+---
+
+# nemo-retriever-ingest
+
+Use this skill to build a searchable NeMo Retriever corpus. It teaches the
+current CLI/SDK behavior, the defaults that matter across tasks, and validation
+checks that distinguish a real index from a false-positive run.
+
+## Orientation
+
+1. From the active environment, verify the public surface: `retriever --help`,
+   then `retriever ingest --help` or `retriever pipeline run --help`.
+2. If `retriever` is not on PATH but this is a source checkout, bootstrap the
+   CLI with `uv run --project nemo_retriever retriever --help`. If dependencies
+   need to download, retry the command and continue from the validated command
+   surface.
+3. If neither an installed command nor a source checkout is available, this is
+   an environment setup blocker, not an ingest failure. Use the setup workflow
+   first; do not invent a package name or private command.
+4. Choose the simplest ingestion path that satisfies the task:
+   - `retriever ingest ...` for one-shot ingest into LanceDB.
+   - `retriever pipeline run ...` when the task needs saved Parquet, image
+     storage, evaluation mode, service run mode, or lower-level tuning.
+   - Python `create_ingestor(...)` when the user explicitly wants SDK code.
+5. Record the `lancedb_uri`, table name, run mode, and any remote NIM endpoints
+   because query and evaluation tasks must match them exactly.
+
+## References
+
+- `references/INGEST.md`: command choices, defaults, remote inference, SDK notes,
+  and validation checks.
+- `PITFALLS.md`: install gaps, table mismatches, slow startup, empty corpora,
+  model downloads, and stale docs.
+
+## Workflow
+
+1. Identify input paths and supported file types. For directories, expect
+   `retriever ingest` to expand supported files; for `pipeline run`, confirm the
+   desired `--input-type` when the corpus is not obvious.
+2. Decide local versus remote inference before running:
+   - Remote NIM inference: set `NVIDIA_API_KEY` when using build.nvidia.com and
+     pass explicit `--*-invoke-url`, `--embed-invoke-url`, and model flags.
+   - Local inference: confirm the environment has the needed extras, CUDA stack,
+     and model cache. Route HuggingFace downloads to `~/models` when preparing a
+     new environment.
+3. Run ingest with explicit index settings when the index will be reused:
+
+   ```bash
+   retriever ingest ./data/corpus --lancedb-uri ./lancedb --table-name nv-ingest
+   ```
+
+   From this repo checkout, use:
+
+   ```bash
+   uv run --project nemo_retriever retriever ingest ./data/corpus --lancedb-uri ./lancedb --table-name nv-ingest
+   ```
+
+4. Use `retriever pipeline run` when saved intermediates matter:
+
+   ```bash
+   retriever pipeline run ./data/corpus --input-type pdf --save-intermediate ./processed_docs
+   ```
+
+5. Validate the table before declaring success. A zero exit and "Ingested N
+   document(s)" are not sufficient:
+
+   ```python
+   import lancedb
+
+   db = lancedb.connect("./lancedb")
+   print(db.table_names())
+   table = db.open_table("nv-ingest")
+   print(table.count_rows())
+   ```
+
+   Then run a smoke query with the same URI, table, embedding endpoint, and
+   model used for ingest.
+
+## Success Checks
+
+- The command reports the target LanceDB URI and table, or the expected Parquet
+  directory exists for `--save-intermediate`.
+- A focused `retriever query ... --lancedb-uri ... --table-name ...` returns at
+  least one JSON hit for a query that should match the corpus.
+- The query skill can reuse the recorded URI/table/model settings without
+  guessing.
+
+## Evaluation Scenarios
+
+- "Index the PDFs in `data/reports` with NeMo Retriever." Use this skill.
+- "Run a tuned batch ingestion and save Parquet for later page markdown." Use
+  this skill and prefer `retriever pipeline run`.
+- "Answer a question from an existing LanceDB table." Use `nemo-retriever-query`,
+  not this skill.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/references/INGEST.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/references/INGEST.md
new file mode 100644
index 0000000000..00b0dbe21d
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/references/INGEST.md
@@ -0,0 +1,146 @@
+# Ingest Reference
+
+## Contents
+
+- [Command Selection](#command-selection)
+- [Inputs](#inputs)
+- [Remote Inference](#remote-inference)
+- [Python SDK Notes](#python-sdk-notes)
+- [Validation](#validation)
+
+## Command Selection
+
+Use `retriever ingest` for the compact installed-user path:
+
+```bash
+retriever ingest ./data/corpus --lancedb-uri ./lancedb --table-name nv-ingest
+```
+
+When working from this source checkout and the installed command is absent, use
+the project environment instead of stopping:
+
+```bash
+uv run --project nemo_retriever retriever ingest ./data/corpus \
+  --lancedb-uri ./lancedb \
+  --table-name nv-ingest
+```
+
+Observed from the root CLI tests:
+
+- Default `--lancedb-uri` is `lancedb`.
+- Default `--table-name` is `nv-ingest`.
+- Default `--run-mode` is `inprocess`.
+- The command overwrites the target table by default. Use `--append` only when
+  duplicate rows are acceptable or the caller explicitly wants append behavior.
+- Directories are expanded to supported files. Empty directories and unsupported
+  extensions are user-facing errors.
+
+Use `retriever pipeline run` when the task needs lower-level controls:
+
+```bash
+retriever pipeline run ./data/corpus \
+  --input-type pdf \
+  --method pdfium \
+  --save-intermediate ./processed_docs
+```
+
+Important differences:
+
+- `pipeline run` exposes more extraction, chunking, storage, Ray, service, and
+  evaluation flags.
+- `--save-intermediate` writes extraction results as Parquet, which is needed
+  for full-page markdown QA evaluation.
+- `--no-vdb` skips vector DB upload.
+- `--run-mode service` submits work to a running Retriever service.
+- `pipeline run` defaults to `--run-mode batch`; pass `--run-mode inprocess`
+  for a small local smoke test.
+
+## Inputs
+
+`retriever ingest` supports `auto`, `pdf`, `doc`, `txt`, `html`, `image`,
+`audio`, and `video` input types. `doc` covers Office documents such as DOCX and
+PPTX but routes through the PDF/document extraction path.
+
+Some input types and run modes need extra dependencies:
+
+- TXT chunking uses HuggingFace tokenizers. If `txt_file_to_chunks_df` or a
+  text ingest fails with `ModuleNotFoundError: No module named 'transformers'`,
+  install or transiently add the missing dependency, then rerun. In an installed
+  environment use `uv pip install transformers`; in a source checkout use
+  `uv run --project nemo_retriever --with transformers retriever ingest ...`.
+- Audio/video: `ffmpeg` / `ffprobe`.
+- SVG rendering: `cairosvg` and its system dependencies.
+- Local GPU inference: install the `[local]` extra and CUDA-compatible PyTorch.
+
+## Remote Inference
+
+For hosted or self-hosted NIMs, pass the stage endpoints explicitly:
+
+```bash
+export NVIDIA_API_KEY=nvapi-...
+retriever ingest ./data/corpus \
+  --page-elements-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3 \
+  --ocr-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1 \
+  --ocr-version v1 \
+  --graphic-elements-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1 \
+  --table-structure-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1 \
+  --embed-invoke-url https://integrate.api.nvidia.com/v1/embeddings \
+  --embed-model-name nvidia/llama-nemotron-embed-1b-v2
+```
+
+For remote embedding, query with the same embedding endpoint and model. Do not
+mix vectors created by one model with queries embedded by another model.
+
+## Python SDK Notes
+
+When the user wants SDK code, start from:
+
+```python
+from nemo_retriever import create_ingestor
+
+ingestor = create_ingestor(run_mode="batch")
+dataset = ingestor.files(["./data/file.pdf"]).extract().embed().ingest()
+```
+
+Use the CLI for the shortest path to LanceDB. Some docs discuss graph ingestion
+and storage separately; the root CLI adapter has the tested one-shot
+`extract -> embed -> vdb_upload -> ingest` path.
+
+## Validation
+
+After ingest, validate with the concrete LanceDB table and one smoke query. A
+successful process exit alone is not enough.
+
+```python
+import lancedb
+
+db = lancedb.connect("./lancedb")
+print(db.table_names())
+table = db.open_table("nv-ingest")
+print(table.count_rows())
+```
+
+Then use a query that should match the corpus:
+
+```bash
+retriever query "smoke test term from the corpus" --lancedb-uri ./lancedb --table-name nv-ingest --top-k 3
+```
+
+For text ingestion, also validate that extraction produced rows before tuning
+embedding or LanceDB settings:
+
+Installed environment:
+
+```bash
+python -c "from nemo_retriever.txt.split import txt_file_to_chunks_df; print(txt_file_to_chunks_df('file.txt').shape)"
+```
+
+Source checkout:
+
+```bash
+uv run --project nemo_retriever --with transformers python -c \
+  "from nemo_retriever.txt.split import txt_file_to_chunks_df; print(txt_file_to_chunks_df('file.txt').shape)"
+```
+
+If validation fails, read `PITFALLS.md` before changing models, table names, or
+paths.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/PITFALLS.md
new file mode 100644
index 0000000000..987c1e5704
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/PITFALLS.md
@@ -0,0 +1,63 @@
+# Query Pitfalls
+
+## Missing Or Wrong Table
+
+`Table ... was not found`, `[]`, or obviously irrelevant hits often mean the
+query URI/table does not match ingest. Check both:
+
+```bash
+retriever query "known corpus term" --lancedb-uri ./lancedb --table-name nv-ingest --top-k 3
+```
+
+Root CLI default table is `nv-ingest`; some older graph-pipeline examples use
+`nemo-retriever`. Use the table that was actually written.
+
+Validate table existence directly when query says `Table ... was not found`:
+
+Installed environment:
+
+```bash
+python -c "import lancedb; db=lancedb.connect('./lancedb'); print(db.table_names())"
+```
+
+Source checkout:
+
+```bash
+uv run --project nemo_retriever python -c "import lancedb; db=lancedb.connect('./lancedb'); print(db.table_names())"
+```
+
+## Metadata Shape
+
+Current normalized hits expose `metadata` as a dict. Older docs or examples may
+show a JSON string. Do not blindly pipe through `fromjson`; first inspect one
+hit.
+
+## Embedding Mismatch
+
+If ingest used remote embedding or a non-default model, query must use the same
+embedding endpoint and model. Mixed embedding spaces can look like a retrieval
+failure even when the table has rows.
+
+If ingest used the local HuggingFace backend, pass
+`--local-query-embed-backend hf` plus the same cache/device settings used for
+local validation. Otherwise the CLI may try the default local vLLM path and fail
+before retrieval.
+
+## Rerank Is Opt-In
+
+Do not assume rerank is enabled. Use `--rerank` or a reranker endpoint/model
+option when the user asks for reranking or when precision matters enough to pay
+the extra cost.
+
+## Chart And Image Evidence
+
+Chart/image text can be model-generated and may misread exact numbers or
+directions. For exact numeric claims, prefer corroborating text/table hits. If
+only a chart/image transcription supports the answer, label it as chart-derived
+or image-derived rather than making it sound verified by prose.
+
+## Insufficient Evidence
+
+Do not answer from general knowledge when retrieved evidence is missing. State
+that the retrieved pages do not contain the requested fact and name the closest
+related evidence if useful.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/SKILL.md
new file mode 100644
index 0000000000..2998e871e2
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/SKILL.md
@@ -0,0 +1,81 @@
+---
+name: nemo-retriever-query
+description: Use when the user asks to search a NeMo Retriever index, run `retriever query`, retrieve evidence from LanceDB, inspect query hit schemas, answer questions from retrieved documents, or debug missing/empty query results. Do not use for creating indexes; use `nemo-retriever-ingest` instead.
+---
+
+# nemo-retriever-query
+
+Use this skill to retrieve evidence from an existing NeMo Retriever index and
+answer only from that evidence.
+
+## Orientation
+
+1. Verify the active public surface first: `retriever query --help`.
+2. Confirm the LanceDB URI, table name, and embedding settings from the ingest
+   task or project config. Do not guess if the user supplied different values.
+3. If the installed CLI is absent but this is a source checkout, use
+   `uv run --project nemo_retriever retriever query --help`. Retry dependency
+   downloads before choosing another query validation path.
+4. If neither path works, use `nemo-retriever-setup` before debugging query
+   behavior.
+5. Keep source citation indexing straight: `page_number` returned by Retriever
+   is 1-indexed unless an external task schema says otherwise.
+
+## References
+
+- `references/QUERY.md`: CLI and Python query patterns, result schema, rerank
+  behavior, and answer synthesis.
+- `PITFALLS.md`: missing tables, empty hits, metadata shape mistakes, chart
+  uncertainty, and model mismatches.
+
+## Workflow
+
+1. Run a focused query against the known table:
+
+   ```bash
+   retriever query "question text" --lancedb-uri ./lancedb --table-name nv-ingest --top-k 5
+   ```
+
+2. If ingest used remote embedding, include the same query embedding endpoint
+   and model:
+
+   ```bash
+   retriever query "question text" \
+     --lancedb-uri ./lancedb \
+     --table-name nv-ingest \
+     --embed-invoke-url https://integrate.api.nvidia.com/v1/embeddings \
+     --embed-model-name nvidia/llama-nemotron-embed-1b-v2
+   ```
+
+3. If ingest used local HuggingFace embeddings, keep the query backend and cache
+   explicit so the CLI does not fall back to the vLLM local path:
+
+   ```bash
+   retriever query "question text" \
+     --lancedb-uri ./lancedb \
+     --table-name nv-ingest \
+     --embed-model-name nvidia/llama-nemotron-embed-1b-v2 \
+     --local-query-embed-backend hf \
+     --local-hf-cache-dir "$HOME/models/huggingface" \
+     --local-hf-device cuda
+   ```
+
+4. Inspect ranked hits before answering. Use `_distance` as a ranking signal,
+   not a calibrated score.
+5. Synthesize from hit `text`, `source_id` / `path`, `pdf_basename`, and
+   `page_number`. Include document and page when available.
+6. If the evidence does not answer the question, say what is missing instead of
+   inventing a plausible answer.
+
+## Success Checks
+
+- Query output is a JSON array of hits, possibly empty.
+- Each answer claim is supported by one or more retrieved hit texts.
+- The final answer states insufficient evidence when the retrieved text does
+  not contain the requested fact.
+
+## Evaluation Scenarios
+
+- "Use the Retriever index to answer: what was revenue in 2024?" Use this skill.
+- "The query returns no hits." Use this skill and read `PITFALLS.md`.
+- "Index the PDFs first." Use `nemo-retriever-ingest`, not this skill.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/references/QUERY.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/references/QUERY.md
new file mode 100644
index 0000000000..ea37187162
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/references/QUERY.md
@@ -0,0 +1,151 @@
+# Query Reference
+
+## Contents
+
+- [CLI Query](#cli-query)
+- [Python Query](#python-query)
+- [Result Schema](#result-schema)
+- [Answer Synthesis](#answer-synthesis)
+
+## CLI Query
+
+The root CLI command is:
+
+```bash
+retriever query "question" --top-k 5 --lancedb-uri ./lancedb --table-name nv-ingest
+```
+
+From the source checkout, prefix with the project environment when needed:
+
+```bash
+uv run --project nemo_retriever retriever query "question" \
+  --top-k 5 \
+  --lancedb-uri ./lancedb \
+  --table-name nv-ingest
+```
+
+Observed from source/tests:
+
+- Default `--top-k` is `10`.
+- Default `--lancedb-uri` is `lancedb`.
+- Default `--table-name` is `nv-ingest`.
+- The CLI prints clean JSON on stdout on success.
+- Reranking is off by default.
+- `--rerank` enables local reranking. Any reranker URL/model/backend option also
+  implicitly enables reranking.
+- `EMBED_INVOKE_URL` and `RERANKER_INVOKE_URL` environment variables are used
+  when the matching CLI flags are omitted.
+- For local HuggingFace query embeddings, pass `--local-query-embed-backend hf`
+  plus `--local-hf-cache-dir` and `--local-hf-device` when needed.
+
+Summarize the ranked hits with `jq`:
+
+```bash
+retriever query "question" --top-k 5 \
+  | jq -r 'to_entries[] | "rank=\(.key + 1) page=\(.value.page_number) source=\(.value.source_id // .value.path // .value.source) type=\(.value.metadata.type // .value.content_type // "?") text=\(.value.text[:200])"'
+```
+
+Do not use `fromjson` on `.metadata` for current `Retriever.query()` / root CLI
+hits. The normalized API boundary returns `metadata` as a native dict.
+
+## Python Query
+
+For SDK use:
+
+```python
+from nemo_retriever.retriever import Retriever
+
+retriever = Retriever(
+    top_k=5,
+    vdb_kwargs={"uri": "./lancedb", "table_name": "nv-ingest"},
+)
+hits = retriever.query("question")
+```
+
+Remote query embedding:
+
+```python
+retriever = Retriever(
+    top_k=5,
+    vdb_kwargs={"uri": "./lancedb", "table_name": "nv-ingest"},
+    embed_kwargs={
+        "embed_invoke_url": "https://integrate.api.nvidia.com/v1/embeddings",
+        "embedding_endpoint": "https://integrate.api.nvidia.com/v1/embeddings",
+        "model_name": "nvidia/llama-nemotron-embed-1b-v2",
+        "embed_model_name": "nvidia/llama-nemotron-embed-1b-v2",
+    },
+)
+```
+
+The equivalent CLI call must still carry the same LanceDB URI and table used at
+ingest time:
+
+```bash
+retriever query "question" \
+  --lancedb-uri ./lancedb \
+  --table-name nv-ingest \
+  --embed-invoke-url https://integrate.api.nvidia.com/v1/embeddings \
+  --embed-model-name nvidia/llama-nemotron-embed-1b-v2
+```
+
+Local HuggingFace query embedding:
+
+```bash
+retriever query "question" \
+  --lancedb-uri ./lancedb \
+  --table-name nv-ingest \
+  --embed-model-name nvidia/llama-nemotron-embed-1b-v2 \
+  --local-query-embed-backend hf \
+  --local-hf-cache-dir "$HOME/models/huggingface" \
+  --local-hf-device cuda
+```
+
+The same settings in Python:
+
+```python
+from pathlib import Path
+
+from nemo_retriever.params import ModelRuntimeParams
+from nemo_retriever.retriever import Retriever
+
+hf_cache_dir = str(Path.home() / "models/huggingface")
+retriever = Retriever(
+    top_k=5,
+    vdb_kwargs={"uri": "./lancedb", "table_name": "nv-ingest"},
+    embed_kwargs={
+        "model_name": "nvidia/llama-nemotron-embed-1b-v2",
+        "embed_model_name": "nvidia/llama-nemotron-embed-1b-v2",
+        "local_ingest_embed_backend": "hf",
+        "runtime": ModelRuntimeParams(
+            hf_cache_dir=hf_cache_dir,
+            device="cuda",
+        ),
+    },
+)
+```
+
+Use `run_mode="service"` only when you specifically need the CPU HTTP embedding
+path, which requires an embedding endpoint. It is not the same thing as the
+FastAPI ingest service.
+
+## Result Schema
+
+Normalized hits may include:
+
+- `text`: retrieved content.
+- `metadata`: native dict with content metadata such as `type`, page fields, or
+  stored image metadata.
+- `source`, `source_id`, `path`: origin document path/name when known.
+- `pdf_basename`: stem of the source PDF path.
+- `page_number`: integer page number, 1-indexed when present.
+- `pdf_page`: composite key like `<pdf_basename>_<page_number>`.
+- `_distance`: vector distance. Lower is better within the same query/model.
+- `_score` or `_rerank_score`: present for some backends/rerank paths.
+
+## Answer Synthesis
+
+- Prefer direct text evidence over chart/image transcriptions for exact numbers.
+- Cite document/page when present.
+- Preserve 1-indexed pages unless the task explicitly requests 0-indexing.
+- When multiple entities, years, or categories are asked for, address each one
+  explicitly, including "not found in retrieved evidence" where needed.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/PITFALLS.md
new file mode 100644
index 0000000000..7bc0c351ac
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/PITFALLS.md
@@ -0,0 +1,64 @@
+# Service Pitfalls
+
+## Service Versus SDK Run Modes
+
+`retriever service start` runs the FastAPI ingestion service.
+`Retriever(run_mode="service")` means HTTP query embedding in the Python
+retriever. Do not conflate them.
+
+## Auth Token Mismatch
+
+When `auth.api_token` is configured, every non-bypassed request needs
+`Authorization: Bearer <token>`. The CLI can read `NEMO_RETRIEVER_API_TOKEN`.
+Health and docs paths are bypassed by default, so a successful health check does
+not prove ingest requests are authenticated correctly.
+
+## Endpoint Overrides Are Server-Owned
+
+Do not let client payloads set NIM endpoint URLs or API keys. Use YAML, CLI
+overrides, environment variables, or Helm values. Request-level overrides are
+policy-gated and endpoint/api-key keys are denied.
+
+## Legacy Routes
+
+Use `GET /v1/ingest/job/{job_id}/events` for SSE. The old
+`GET /v1/ingest/events` route should be treated as stale.
+
+## Service Ingest CLI Drift
+
+Some current builds expose `retriever service ingest` options but call the
+client with stale keyword arguments, producing:
+
+```text
+TypeError: RetrieverServiceClient.ingest_documents() got an unexpected keyword argument 'use_sse'
+```
+
+Do not stop there. Use the HTTP job API directly or the current Python client
+signature: `ingest_documents(files=..., show_progress=True, pipeline_spec=...)`.
+
+## Default Service Extraction Is PDF
+
+The document upload route defaults to `extraction_mode='pdf'`. Uploading a TXT
+file without a pipeline override fails with `Input file type(s) do not match
+extraction_mode='pdf'`. For service smoke tests, use a PDF fixture or provide a
+valid `pipeline.extraction_mode` and the dependencies needed by that mode.
+
+## Page-Elements 401 Can Be Non-Fatal
+
+A text-only PDF smoke upload can complete even if page-elements detection logs
+an HTTP 401 inside `page_elements_v3.error`, as long as text extraction produced
+rows and the job status is `completed`. Treat the embedded stage error as a
+capability/config warning, not automatically as failed service ingestion.
+
+## Helm Replica Limit
+
+The Helm chart currently uses SQLite on a single ReadWriteOnce PVC, which caps
+the service at one replica until a shared database backend is introduced.
+
+## ffmpeg For Audio And Video
+
+Audio/video extraction requires `ffmpeg` and `ffprobe`. The service image can
+install them at startup with `service.installFfmpeg=true`, but that requires
+network egress, writable root filesystem, and a security policy allowing the
+scoped sudo path. Locked-down clusters should use a custom image with ffmpeg
+already installed.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/SKILL.md
new file mode 100644
index 0000000000..956dd65111
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/SKILL.md
@@ -0,0 +1,72 @@
+---
+name: nemo-retriever-service
+description: Use when the user asks to run, deploy, configure, operate, or call the NeMo Retriever service, including `retriever service start`, `retriever service ingest`, FastAPI `/v1` endpoints, service YAML, Helm chart deployment, auth tokens, NIM endpoint wiring, or service-mode troubleshooting.
+---
+
+# nemo-retriever-service
+
+Use this skill for NeMo Retriever service operation. Do not use it for a simple
+local one-shot ingest unless the user specifically wants a long-running service.
+
+## Orientation
+
+1. Verify the installed service commands: `retriever service --help`,
+   `retriever service start --help`, and `retriever service ingest --help`.
+2. Decide local service versus Kubernetes Helm:
+   - Local: `retriever service start` plus HTTP health checks.
+   - Kubernetes: `nemo_retriever/helm` chart and NIM endpoint/secret wiring.
+3. If the installed CLI is absent but this is a source checkout, use
+   `uv run --project nemo_retriever retriever service ...`. Retry dependency
+   downloads before choosing another service validation path.
+4. If neither path works, use `nemo-retriever-setup` before debugging service
+   behavior.
+
+## References
+
+- `references/SERVICE.md`: service commands, key endpoints, YAML settings, Helm
+  deployment choices, and auth behavior.
+- `PITFALLS.md`: endpoint policy, missing NIMs, token mismatch, SQLite replica
+  limits, ffmpeg runtime install, and stale service routes.
+
+## Workflow
+
+1. Locate or create a service config. Discovery order is explicit `--config`,
+   `./retriever-service.yaml`, then the bundled package default.
+2. Start locally when appropriate:
+
+   ```bash
+   retriever service start --config ./retriever-service.yaml --host 0.0.0.0 --port 7670
+   ```
+
+3. Verify health before submitting work:
+
+   ```bash
+   curl http://localhost:7670/v1/health
+   ```
+
+4. Submit files through the CLI client:
+
+   ```bash
+   retriever service ingest ./data/file.pdf --server-url http://localhost:7670
+   ```
+
+5. If service ingest CLI raises `TypeError: ... unexpected keyword argument
+   'use_sse'`, use the HTTP job API directly: `POST /v1/ingest/job`, then
+   `POST /v1/ingest/job/{job_id}/document`, then poll
+   `GET /v1/ingest/job/{job_id}?include_documents=true`.
+6. For Kubernetes, use Helm and decide whether NIMs are operator-managed or
+   external URLs supplied through `serviceConfig.nimEndpoints.*`.
+
+## Success Checks
+
+- `/v1/health` responds.
+- The CLI client or HTTP job API can create a job, accept a document, and report
+  completion or useful job status.
+- If auth is enabled, requests include the same bearer token configured by
+  `--api-token`, YAML `auth.api_token`, or `NEMO_RETRIEVER_API_TOKEN`.
+
+## Evaluation Scenarios
+
+- "Start a Retriever service for document ingestion." Use this skill.
+- "Deploy Retriever with external NIM endpoints in Kubernetes." Use this skill.
+- "Run a one-shot local ingest into LanceDB." Use `nemo-retriever-ingest`.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/references/SERVICE.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/references/SERVICE.md
new file mode 100644
index 0000000000..54b7d28c9d
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/references/SERVICE.md
@@ -0,0 +1,128 @@
+# Service Reference
+
+## Contents
+
+- [Local Commands](#local-commands)
+- [Service Config](#service-config)
+- [HTTP Surface](#http-surface)
+- [Helm Deployment](#helm-deployment)
+
+## Local Commands
+
+Start a local service:
+
+```bash
+retriever service start --config ./retriever-service.yaml --port 7670
+```
+
+From this source checkout:
+
+```bash
+uv run --project nemo_retriever retriever service start --host 127.0.0.1 --port 7670
+```
+
+Submit files to a running service:
+
+```bash
+BASE_URL=http://localhost:7670
+retriever service ingest ./data/file.pdf --server-url "$BASE_URL"
+```
+
+If the service ingest CLI raises `TypeError: RetrieverServiceClient.ingest_documents()
+got an unexpected keyword argument 'use_sse'`, drive the public HTTP API
+directly. If auth is enabled, uncomment the second `AUTH` line. Replace `<job_id>` with the ID of the job created by the first request.
+
+```bash
+BASE_URL=http://localhost:7670
+AUTH=()
+# AUTH=(-H "Authorization: Bearer $NEMO_RETRIEVER_API_TOKEN")
+
+curl -sS -X POST "$BASE_URL/v1/ingest/job" \
+  "${AUTH[@]}" \
+  -H 'Content-Type: application/json' \
+  -d '{"expected_documents":1,"label":"smoke"}'
+
+curl -sS -X POST "$BASE_URL/v1/ingest/job/<job_id>/document" \
+  "${AUTH[@]}" \
+  -F file=@./data/file.pdf \
+  -F metadata='{"filename":"file.pdf"}'
+
+curl -sS "$BASE_URL/v1/ingest/job/<job_id>?include_documents=true" \
+  "${AUTH[@]}"
+```
+
+The service CLI supports:
+
+- `--nim-api-key` for NIM endpoints, overriding YAML / `NVIDIA_API_KEY`.
+- `--api-token` for service bearer-token auth, also read from
+  `NEMO_RETRIEVER_API_TOKEN`.
+- `--gpu-devices` to override service resource config.
+- `--server-url` and `--api-token` on client ingest. The current client path may
+  reject `--sse/--no-sse` or `--poll-interval`; use the HTTP job API above if
+  that happens.
+
+## Service Config
+
+The bundled default is `nemo_retriever.service/retriever-service.yaml`.
+Discovery order:
+
+1. `retriever service start --config /path/to/retriever-service.yaml`
+2. `./retriever-service.yaml`
+3. bundled package default
+
+Important config sections:
+
+- `server.host` / `server.port`
+- `nim_endpoints.*_invoke_url` and `nim_endpoints.api_key`
+- `pipeline.realtime_workers` / `pipeline.batch_workers`
+- `auth.api_token`
+- `pipeline_overrides.mode` and sink allow lists
+
+Client-supplied endpoint URLs and API keys are trust-sensitive. The policy layer
+rejects them when they arrive as request overrides; configure them server-side.
+
+For a cheap PDF text-only smoke upload, use allowed per-request extraction
+overrides to disable expensive table/chart/image extraction:
+
+```bash
+curl -sS -X POST "$BASE_URL/v1/ingest/job/<job_id>/document" \
+  "${AUTH[@]}" \
+  -F file=@./data/file.pdf \
+  -F metadata='{"filename":"file.pdf","pipeline":{"extraction_mode":"pdf","extract_params":{"method":"pdfium","extract_tables":false,"extract_charts":false,"extract_images":false,"extract_page_as_image":false},"stage_order":[]}}'
+```
+
+Do not include `use_page_elements` in request overrides unless the service
+operator widened the allow list; the default policy rejects that key.
+
+## HTTP Surface
+
+Common public endpoints:
+
+- `GET /v1/health`
+- `POST /v1/ingest/job`
+- `POST /v1/ingest/job/{job_id}/document`
+- `GET /v1/ingest/job/{job_id}`
+- `GET /v1/ingest/job/{job_id}/events`
+- `GET /v1/ingest/pipeline-config`
+- `GET /v1/ingest/metrics`
+- `POST /v1/query` when the vectordb route is configured
+
+The legacy firehose `GET /v1/ingest/events` has been removed. Use the per-job
+events route, `GET /v1/ingest/job/{job_id}/events`, instead.
+
+## Helm Deployment
+
+The chart at `nemo_retriever/helm` deploys the service and optionally NIM
+Operator resources. For external NIM endpoints:
+
+```bash
+helm install retriever ./nemo_retriever/helm \
+  --set nims.enabled=false \
+  --set serviceConfig.nimEndpoints.pageElementsInvokeUrl=http://page-elements.svc:8000/v1/infer \
+  --set serviceConfig.nimEndpoints.tableStructureInvokeUrl=http://table-structure.svc:8000/v1/infer \
+  --set serviceConfig.nimEndpoints.ocrInvokeUrl=http://ocr.svc:8000/v1/infer \
+  --set serviceConfig.nimEndpoints.embedInvokeUrl=http://embed.svc:8000/v1/embeddings
+```
+
+For NGC image pulls or build.nvidia.com endpoints, configure the relevant
+`NGC_API_KEY` / `NVIDIA_API_KEY` secrets through the chart values.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/PITFALLS.md
new file mode 100644
index 0000000000..0278ac0002
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/PITFALLS.md
@@ -0,0 +1,41 @@
+# Setup Pitfalls
+
+## Python Version
+
+Use Python 3.12. Older or newer Python versions can fail dependency resolution
+or import checks because the package metadata requires `>=3.12,<3.13`.
+
+## Missing Installed Surface
+
+`retriever`, `nemo-retriever`, and `nemo_retriever` are not interchangeable
+names. The public CLI command is `retriever`; the Python import package is
+`nemo_retriever`; the distribution (install) name is `nemo-retriever`.
+
+## Source Checkout Versus Installed Package
+
+`uv run --project nemo_retriever retriever ...` is a developer-checkout fallback.
+For installed-package validation, install the package into an isolated
+environment and run `retriever --help` without relying on the source tree.
+
+## Optional Extras
+
+The base install is enough for remote NIM workflows. Local GPU inference needs
+the `local` extra. Audio/video and SVG workflows need `multimedia` plus system
+dependencies. QA generation/judging needs `llm`.
+
+## System Dependencies
+
+`ffmpeg-python` and `nemo-retriever[multimedia]` do not install the `ffmpeg` and
+`ffprobe` binaries. Install those through the operating system or use a service
+image/cluster configuration that provides them.
+
+## Model Downloads
+
+Local inference may download large HuggingFace assets on first use. Route caches
+to `~/models` for reproducible agent work and avoid writing model assets into
+the repository.
+
+## Remote Credentials
+
+Hosted NIM endpoints need `NVIDIA_API_KEY`. Missing keys should be reported as a
+setup gap, not as an ingest/query failure.
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/SKILL.md
new file mode 100644
index 0000000000..d5933d9add
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/SKILL.md
@@ -0,0 +1,88 @@
+---
+name: nemo-retriever-setup
+description: Use when the user asks to install, verify, or orient to NeMo Retriever, when `retriever` is missing, when choosing extras or model/API prerequisites, or before another Retriever workflow can run. Do not use for a specific ingest, query, service, or evaluation task once the CLI works; use that task skill instead.
+---
+
+# nemo-retriever-setup
+
+Use this skill to get an agent into a working NeMo Retriever environment before
+running task-specific workflows.
+
+## Orientation
+
+1. Verify the intended public entry point:
+
+   ```bash
+   retriever --help
+   ```
+
+2. If this is a source checkout, the developer fallback is:
+
+   ```bash
+   uv run --project nemo_retriever retriever --help
+   ```
+
+3. If neither the installed CLI nor the source fallback works, report setup as
+   the blocker before attempting ingest/query/service/evaluation.
+
+## References
+
+- `PITFALLS.md`: Python version, missing package, optional extras, system
+  dependencies, API keys, and model-cache issues.
+
+## Workflow
+
+1. Confirm Python 3.12. NeMo Retriever requires Python `>=3.12,<3.13`.
+2. Choose install shape:
+   - Remote NIM inference, no local GPU models: install the base package.
+   - Local GPU inference: install the `local` extra and verify CUDA/PyTorch.
+   - Audio/video or SVG inputs: add the `multimedia` extra and system `ffmpeg`
+     / `ffprobe` when needed.
+   - QA generation or judging: add the `llm` extra and configure model keys.
+3. Create an isolated environment:
+
+   ```bash
+   uv python install 3.12
+   uv venv retriever --python 3.12
+   source retriever/bin/activate
+   uv pip install nemo-retriever
+   ```
+
+   For local GPU inference, install the `local` extra instead:
+
+   ```bash
+   uv pip install "nemo-retriever[local]"
+   ```
+
+4. Route first-time HuggingFace downloads outside the repo when preparing local
+   inference:
+
+   ```bash
+   export HF_HOME="$HOME/models/huggingface"
+   export HF_HUB_CACHE="$HOME/models/huggingface/hub"
+   ```
+
+5. For remote hosted NIMs, configure credentials before ingest/query:
+
+   ```bash
+   export NVIDIA_API_KEY=nvapi-...
+   ```
+
+6. Re-run the public-surface checks. Once `retriever --help` and the relevant
+   subcommand help work, switch to the task skill for ingest, query, service, or
+   evaluation.
+
+## Success Checks
+
+- `retriever --help` shows `ingest`, `query`, `service`, `recall`, `eval`, and
+  `pipeline` commands.
+- `python -c "import nemo_retriever"` succeeds in the same environment.
+- The chosen task command's `--help` output is visible before running expensive
+  model or data work.
+
+## Evaluation Scenarios
+
+- "Install NeMo Retriever and verify the CLI." Use this skill.
+- "`retriever` is not found; what should I do?" Use this skill.
+- "Index the PDFs in `data/reports`." Use `nemo-retriever-ingest` once the
+  environment is working.
diff --git a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py
index d55f553fd5..ef9bfefd53 100644
--- a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py
+++ b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py
@@ -329,6 +329,21 @@ def query_command(
         "--embed-model-name",
         help="Optional embedding model name override.",
     ),
+    local_query_embed_backend: LocalIngestEmbedBackendValue | None = typer.Option(
+        None,
+        "--local-query-embed-backend",
+        help="Local query-time text embedder when --embed-invoke-url is unset.",
+    ),
+    local_hf_cache_dir: str | None = typer.Option(
+        None,
+        "--local-hf-cache-dir",
+        help="HuggingFace cache directory for local query embedding.",
+    ),
+    local_hf_device: str | None = typer.Option(
+        None,
+        "--local-hf-device",
+        help="Torch device for local HuggingFace query embedding, such as 'cuda' or 'cpu'.",
+    ),
     reranker_invoke_url: str | None = typer.Option(None, "--reranker-invoke-url", help="Reranker NIM endpoint URL."),
     reranker_model_name: str | None = typer.Option(
         None,
@@ -392,6 +407,9 @@ def query_command(
                 table_name=table_name,
                 embed_invoke_url=embed_invoke_url,
                 embed_model_name=embed_model_name,
+                local_query_embed_backend=local_query_embed_backend,
+                local_hf_cache_dir=local_hf_cache_dir,
+                local_hf_device=local_hf_device,
                 reranker_invoke_url=reranker_invoke_url,
                 reranker_model_name=reranker_model_name,
                 reranker_backend=reranker_backend,
@@ -414,6 +432,9 @@ def query_command(
                     table_name=table_name,
                     embed_invoke_url=embed_invoke_url,
                     embed_model_name=embed_model_name,
+                    local_query_embed_backend=local_query_embed_backend,
+                    local_hf_cache_dir=local_hf_cache_dir,
+                    local_hf_device=local_hf_device,
                     reranker_invoke_url=reranker_invoke_url,
                     reranker_model_name=reranker_model_name,
                     reranker_backend=reranker_backend,
diff --git a/nemo_retriever/src/nemo_retriever/adapters/cli/sdk_workflow.py b/nemo_retriever/src/nemo_retriever/adapters/cli/sdk_workflow.py
index 3bb3c78d1b..18fd816c44 100644
--- a/nemo_retriever/src/nemo_retriever/adapters/cli/sdk_workflow.py
+++ b/nemo_retriever/src/nemo_retriever/adapters/cli/sdk_workflow.py
@@ -16,6 +16,7 @@
     EmbedParams,
     ExtractParams,
     HtmlChunkParams,
+    ModelRuntimeParams,
     TextChunkParams,
     VdbUploadParams,
     VideoFrameParams,
@@ -196,6 +197,8 @@ def _build_embed_kwargs(
     embed_invoke_url: str | None,
     embed_model_name: str | None,
     local_ingest_embed_backend: LocalIngestEmbedBackendValue | None = None,
+    local_hf_cache_dir: str | None = None,
+    local_hf_device: str | None = None,
     embed_workers: int | None = None,
     embed_batch_size: int | None = None,
     embed_cpus_per_actor: float | None = None,
@@ -210,6 +213,11 @@ def _build_embed_kwargs(
         embed_kwargs["embed_model_name"] = embed_model_name
     if local_ingest_embed_backend is not None:
         embed_kwargs["local_ingest_embed_backend"] = local_ingest_embed_backend
+    if local_hf_cache_dir is not None or local_hf_device is not None:
+        embed_kwargs["runtime"] = ModelRuntimeParams(
+            device=local_hf_device,
+            hf_cache_dir=local_hf_cache_dir,
+        )
     embed_tuning = _build_embed_batch_tuning(
         embed_workers=embed_workers,
         embed_batch_size=embed_batch_size,
@@ -452,6 +460,9 @@ def query_documents(
     table_name: str = "nv-ingest",
     embed_invoke_url: str | None = None,
     embed_model_name: str | None = None,
+    local_query_embed_backend: LocalIngestEmbedBackendValue | None = None,
+    local_hf_cache_dir: str | None = None,
+    local_hf_device: str | None = None,
     reranker_invoke_url: str | None = None,
     reranker_model_name: str | None = None,
     reranker_backend: str | None = None,
@@ -462,7 +473,13 @@ def query_documents(
     Reranking is opt-in: pass ``rerank=True`` (or any of the rerank-related
     args via the CLI, which implicitly set ``rerank=True``) to enable.
     """
-    embed_kwargs = _build_embed_kwargs(embed_invoke_url, embed_model_name)
+    embed_kwargs = _build_embed_kwargs(
+        embed_invoke_url,
+        embed_model_name,
+        local_ingest_embed_backend=local_query_embed_backend,
+        local_hf_cache_dir=local_hf_cache_dir,
+        local_hf_device=local_hf_device,
+    )
     retriever_kwargs: dict[str, Any] = {
         "top_k": top_k,
         "vdb_kwargs": {"uri": lancedb_uri, "table_name": table_name},
diff --git a/nemo_retriever/src/nemo_retriever/evaluation/cli.py b/nemo_retriever/src/nemo_retriever/evaluation/cli.py
index f847d6fc1a..8d6b4da6bf 100644
--- a/nemo_retriever/src/nemo_retriever/evaluation/cli.py
+++ b/nemo_retriever/src/nemo_retriever/evaluation/cli.py
@@ -16,7 +16,7 @@
 import os
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Literal, Optional
 
 import typer
 
@@ -368,6 +368,21 @@ def export_cmd(
         "--embedder",
         help="Embedding model name.",
     ),
+    local_query_embed_backend: Literal["vllm", "hf"] | None = typer.Option(
+        None,
+        "--local-query-embed-backend",
+        help="Local query-time text embedder when no remote embedding endpoint is used.",
+    ),
+    local_hf_cache_dir: str | None = typer.Option(
+        None,
+        "--local-hf-cache-dir",
+        help="HuggingFace cache directory for local query embedding.",
+    ),
+    local_hf_device: str | None = typer.Option(
+        None,
+        "--local-hf-device",
+        help="Torch device for local HuggingFace query embedding, such as 'cuda' or 'cpu'.",
+    ),
     page_index: Path = typer.Option(
         None,
         "--page-index",
@@ -404,6 +419,9 @@ def export_cmd(
         output_path=str(output),
         top_k=top_k,
         embedder=embedder,
+        local_query_embed_backend=local_query_embed_backend,
+        local_hf_cache_dir=local_hf_cache_dir,
+        local_hf_device=local_hf_device,
         page_index=page_idx,
     )
     elapsed = time.monotonic() - t0
diff --git a/nemo_retriever/src/nemo_retriever/export.py b/nemo_retriever/src/nemo_retriever/export.py
index 7c99045b9b..74bbc663dc 100644
--- a/nemo_retriever/src/nemo_retriever/export.py
+++ b/nemo_retriever/src/nemo_retriever/export.py
@@ -106,6 +106,9 @@ def query_lancedb(
     *,
     top_k: int = 5,
     embedder: str = "nvidia/llama-nemotron-embed-1b-v2",
+    local_query_embed_backend: str | None = None,
+    local_hf_cache_dir: str | None = None,
+    local_hf_device: str | None = None,
     page_index: dict[str, dict[str, str]] | None = None,
     batch_size: int = 50,
 ) -> tuple[dict[str, dict], dict[str, Any]]:
@@ -123,6 +126,12 @@ def query_lancedb(
         Number of chunks to retrieve per query.
     embedder : str
         Embedding model name for the Retriever.
+    local_query_embed_backend : str, optional
+        Local backend for query embeddings, e.g. ``"hf"``.
+    local_hf_cache_dir : str, optional
+        HuggingFace cache directory for local query embeddings.
+    local_hf_device : str, optional
+        Torch device for local HuggingFace query embeddings.
     page_index : dict, optional
         ``{source_id: {page_str: markdown}}``.  When provided, chunk hits are
         expanded to full-page markdown.
@@ -136,14 +145,24 @@ def query_lancedb(
         ``{"chunks": [...], "metadata": [...]}`` and *metadata* is the
         envelope metadata dict.
     """
+    from nemo_retriever.params import ModelRuntimeParams
     from nemo_retriever.retriever import Retriever
 
+    embed_kwargs: dict[str, Any] = {"model_name": embedder, "embed_model_name": embedder}
+    if local_query_embed_backend is not None:
+        embed_kwargs["local_ingest_embed_backend"] = local_query_embed_backend
+    if local_hf_cache_dir is not None or local_hf_device is not None:
+        embed_kwargs["runtime"] = ModelRuntimeParams(
+            device=local_hf_device,
+            hf_cache_dir=local_hf_cache_dir,
+        )
+
     retriever = Retriever(
         vdb_kwargs={
             "vdb_op": "lancedb",
             "vdb_kwargs": {"uri": lancedb_uri, "table_name": lancedb_table},
         },
-        embed_kwargs={"model_name": embedder, "embed_model_name": embedder},
+        embed_kwargs=embed_kwargs,
         top_k=top_k,
         rerank=False,
     )
@@ -184,6 +203,12 @@ def query_lancedb(
         "chunk_mode": chunk_mode,
         "query_count": len(all_results),
     }
+    if local_query_embed_backend is not None:
+        meta["local_query_embed_backend"] = local_query_embed_backend
+    if local_hf_cache_dir is not None:
+        meta["local_hf_cache_dir"] = local_hf_cache_dir
+    if local_hf_device is not None:
+        meta["local_hf_device"] = local_hf_device
     if use_fullpage:
         meta["page_index_misses"] = total_page_misses
 
@@ -233,6 +258,9 @@ def export_retrieval_json(
     *,
     top_k: int = 5,
     embedder: str = "nvidia/llama-nemotron-embed-1b-v2",
+    local_query_embed_backend: str | None = None,
+    local_hf_cache_dir: str | None = None,
+    local_hf_device: str | None = None,
     page_index: dict[str, dict[str, str]] | None = None,
     batch_size: int = 50,
 ) -> dict:
@@ -254,6 +282,12 @@ def export_retrieval_json(
         Number of chunks to retrieve per query.
     embedder : str
         Embedding model name for the Retriever.
+    local_query_embed_backend : str, optional
+        Local backend for query embeddings, e.g. ``"hf"``.
+    local_hf_cache_dir : str, optional
+        HuggingFace cache directory for local query embeddings.
+    local_hf_device : str, optional
+        Torch device for local HuggingFace query embeddings.
     page_index : dict, optional
         ``{source_id: {page_str: markdown}}``.  When provided, chunk hits are
         expanded to full-page markdown.
@@ -271,6 +305,9 @@ def export_retrieval_json(
         queries=queries,
         top_k=top_k,
         embedder=embedder,
+        local_query_embed_backend=local_query_embed_backend,
+        local_hf_cache_dir=local_hf_cache_dir,
+        local_hf_device=local_hf_device,
         page_index=page_index,
         batch_size=batch_size,
     )
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
index 6c42736327..67df046c52 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
@@ -29,7 +29,7 @@
 
 DEFAULT_ORDER = ("c1_base", "c2_retriever", "c3_retriever_skill")
 
-app = typer.Typer(help="Benchmark Claude with vs. without the /nemo-retriever skill on a folder of PDFs.")
+app = typer.Typer(help="Benchmark Claude with vs. without NeMo Retriever skills on a folder of PDFs.")
 logger = logging.getLogger(__name__)
 
 
@@ -162,9 +162,7 @@ def run_command(
     domain_order = sorted(by_domain.keys())
     typer.echo(f"Domains in this run: {domain_order} ({sum(len(v) for v in by_domain.values())} entries total)")
 
-    skill_source = Path(
-        str(cfg.get("skill_source_dir") or REPO_ROOT / ".claude" / "skills" / "nemo-retriever")
-    ).expanduser()
+    skill_source = Path(str(cfg.get("skill_source_dir") or REPO_ROOT / ".claude" / "skills")).expanduser()
     workdir_root = Path(str(cfg.get("per_trial_workdir_root", "/tmp/skill_eval"))).expanduser()
     workdir_root.mkdir(parents=True, exist_ok=True)
     model = str(cfg.get("agent_model", "claude-opus-4-7"))
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
index f0636be9af..45dbd54f70 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
@@ -69,14 +69,14 @@ per_trial_workdir_root: /tmp/skill_eval
 conditions:
   - c1_base               # retriever blocked → forces Read-only baseline
   - c2_retriever          # retriever available, NL prompt, skill auto-discovery
-  - c3_retriever_skill    # retriever available, explicit /nemo-retriever slash
+  - c3_retriever_skill    # retriever available, explicit split-skill slash commands
 
 # ---------------------------------------------------------------------------
 # Skill source override (rarely needed)
 # ---------------------------------------------------------------------------
-# Defaults to <repo>/.claude/skills/nemo-retriever when unset. Set this only
-# if you want to A/B-test an alternate skill tree.
-# skill_source_dir: /path/to/.claude/skills/nemo-retriever
+# Defaults to <repo>/.claude/skills when unset. Point this at either a directory
+# containing multiple skill directories or one standalone skill directory.
+# skill_source_dir: /path/to/.claude/skills
 
 # ---------------------------------------------------------------------------
 # LLM-as-judge
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
index e3c2e89963..4dd38e72e0 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
@@ -54,15 +54,15 @@ def _select_prompt(candidates: list[dict[str, Any]], selected_variant: int | Non
 
 
 def _normalize_slash_command(prompt: str) -> str:
-    """Rewrite SDG-generated slash commands to this project's actual skill name.
+    """Rewrite SDG-generated slash commands to this project's actual skill names.
 
     The agent_scenario_manifest contains slash-command scenarios using made-up
-    aliases (``/vidore-ingest``, ``/vidore``, ``/vidore_hr``); the real skill
-    that ships with this repo is ``nemo-retriever``. Rewriting at load time
-    avoids editing the upstream manifest (frozen baseline) while making the
-    slash_ingest / slash_retrieval scenarios actually exercisable. The token
-    boundary after the alias is preserved so the trailing args/query carry
-    over verbatim.
+    aliases (``/vidore-ingest``, ``/vidore``, ``/vidore_hr``). The skills that
+    ship with this repo are task-specific, so ingest aliases route to
+    ``nemo-retriever-ingest`` and retrieval aliases route to
+    ``nemo-retriever-query``. Rewriting at load time avoids editing the upstream
+    manifest while making slash scenarios exercisable. The token boundary after
+    the alias is preserved so trailing args/query carry over verbatim.
     """
     s = prompt.lstrip()
     if not s.startswith("/"):
@@ -70,9 +70,9 @@ def _normalize_slash_command(prompt: str) -> str:
     # Order matters: rewrite ``/vidore-ingest`` (which carries the ingest
     # subcommand intent) before the bare ``/vidore`` prefix.
     rewrites = [
-        ("/vidore-ingest ", "/nemo-retriever ingest "),
-        ("/vidore_hr ", "/nemo-retriever "),
-        ("/vidore ", "/nemo-retriever "),
+        ("/vidore-ingest ", "/nemo-retriever-ingest "),
+        ("/vidore_hr ", "/nemo-retriever-query "),
+        ("/vidore ", "/nemo-retriever-query "),
     ]
     for old, new in rewrites:
         if s.startswith(old):
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/prompts/setup_slash.j2 b/nemo_retriever/src/nemo_retriever/skill_eval/prompts/setup_slash.j2
index ec0aa27cea..b4d2d7e208 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/prompts/setup_slash.j2
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/prompts/setup_slash.j2
@@ -1 +1 @@
-/nemo-retriever ingest ./pdfs/
+/nemo-retriever-ingest ./pdfs/
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/prompts/trial_user_slash.j2 b/nemo_retriever/src/nemo_retriever/skill_eval/prompts/trial_user_slash.j2
index 7707bab45d..023f579074 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/prompts/trial_user_slash.j2
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/prompts/trial_user_slash.j2
@@ -1,4 +1,4 @@
-/nemo-retriever query "{{ original_query }}"
+/nemo-retriever-query "{{ original_query }}"
 
 After running this query, use the retrieved hits to write your final result to ./output.json with EXACTLY this schema:
 
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
index 8c9a74bcf1..8d49a00f02 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
@@ -115,14 +115,25 @@ def _write_shim(shim_dir: Path, name: str) -> None:
     shim.chmod(shim.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
 
 
-def _copy_skill(skill_source: Path, dest: Path) -> None:
-    dest.mkdir(parents=True, exist_ok=True)
+def _copy_skill_dir(skill_source: Path, dest: Path) -> None:
     if (dest / "SKILL.md").exists():
         return
-    shutil.copy2(skill_source / "SKILL.md", dest / "SKILL.md")
-    ref_src = skill_source / "references"
-    if ref_src.is_dir():
-        shutil.copytree(ref_src, dest / "references", dirs_exist_ok=True)
+    shutil.copytree(skill_source, dest, dirs_exist_ok=True)
+
+
+def _copy_skills(skill_source: Path, dest: Path) -> None:
+    """Copy either one skill directory or a directory containing many skills."""
+    dest.mkdir(parents=True, exist_ok=True)
+    if (skill_source / "SKILL.md").is_file():
+        _copy_skill_dir(skill_source, dest / skill_source.name)
+        return
+
+    skill_dirs = [path for path in sorted(skill_source.iterdir()) if (path / "SKILL.md").is_file()]
+    if not skill_dirs:
+        raise FileNotFoundError(f"No skill directories found under {skill_source}")
+
+    for skill_dir in skill_dirs:
+        _copy_skill_dir(skill_dir, dest / skill_dir.name)
 
 
 # Bash patterns that route the agent into the nemo_retriever library, regardless
@@ -168,7 +179,7 @@ def _build_condition_workdir(
 
     Workdir contents:
       - pdfs/ symlink farm into the source PDF folder
-      - .claude/ sandbox (settings + per-condition skill copy)
+      - .claude/ sandbox (settings + per-condition skill copies)
       - .bin/retriever shim (c1 only) so retriever is unavailable on PATH
 
     The agent itself creates any retrieval artifacts (e.g., ./lancedb/) inside the
@@ -182,10 +193,10 @@ def _build_condition_workdir(
     # c1 gets explicit Bash deny rules; c2/c3 keep the empty settings.json.
     settings_text = _c1_settings_json() if condition == "c1_base" else "{}\n"
     (workdir / ".claude" / "settings.json").write_text(settings_text, encoding="utf-8")
-    # c2 and c3 both have retriever installed AND the nemo-retriever skill loaded.
+    # c2 and c3 both have retriever installed AND the NeMo Retriever skills loaded.
     # The c2/c3 distinction is purely the prompt style (NL vs explicit slash command).
     if condition in ("c2_retriever", "c3_retriever_skill"):
-        _copy_skill(skill_source, workdir / ".claude" / "skills" / "nemo-retriever")
+        _copy_skills(skill_source, workdir / ".claude" / "skills")
     if condition == "c1_base":
         _write_shim(workdir / ".bin", "retriever")
         # Empty HuggingFace cache redirect; env vars are wired up in _env_for.
diff --git a/nemo_retriever/tests/skill_eval/test_split_skills.py b/nemo_retriever/tests/skill_eval/test_split_skills.py
new file mode 100644
index 0000000000..e8e6042e57
--- /dev/null
+++ b/nemo_retriever/tests/skill_eval/test_split_skills.py
@@ -0,0 +1,104 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemo_retriever.skill_eval.dataset import DatasetEntry, _normalize_slash_command
+from nemo_retriever.skill_eval.runner import _copy_skills, _render_prompt, _render_setup_prompt
+
+
+def _write_skill(root: Path, name: str) -> None:
+    skill_dir = root / name
+    (skill_dir / "references").mkdir(parents=True)
+    (skill_dir / "SKILL.md").write_text(f"---\nname: {name}\n---\n\n# {name}\n", encoding="utf-8")
+    (skill_dir / "PITFALLS.md").write_text("# Pitfalls\n", encoding="utf-8")
+    (skill_dir / "references" / "REFERENCE.md").write_text("# Reference\n", encoding="utf-8")
+
+
+def test_copy_skills_copies_split_skill_tree(tmp_path: Path) -> None:
+    source = tmp_path / "skills"
+    _write_skill(source, "nemo-retriever-ingest")
+    _write_skill(source, "nemo-retriever-query")
+
+    dest = tmp_path / "workdir" / ".claude" / "skills"
+
+    _copy_skills(source, dest)
+
+    assert (dest / "nemo-retriever-ingest" / "SKILL.md").is_file()
+    assert (dest / "nemo-retriever-ingest" / "PITFALLS.md").is_file()
+    assert (dest / "nemo-retriever-ingest" / "references" / "REFERENCE.md").is_file()
+    assert (dest / "nemo-retriever-query" / "SKILL.md").is_file()
+
+
+def test_copy_skills_accepts_single_skill_directory(tmp_path: Path) -> None:
+    _write_skill(tmp_path, "nemo-retriever-query")
+
+    dest = tmp_path / "dest"
+
+    _copy_skills(tmp_path / "nemo-retriever-query", dest)
+
+    assert (dest / "nemo-retriever-query" / "PITFALLS.md").is_file()
+
+
+def test_copy_skills_follows_compatibility_symlinks(tmp_path: Path) -> None:
+    package_skills = tmp_path / "package" / "skills"
+    _write_skill(package_skills, "nemo-retriever-ingest")
+
+    claude_skills = tmp_path / ".claude" / "skills"
+    claude_skills.mkdir(parents=True)
+    (claude_skills / "nemo-retriever-ingest").symlink_to(
+        package_skills / "nemo-retriever-ingest",
+        target_is_directory=True,
+    )
+
+    dest = tmp_path / "dest"
+
+    _copy_skills(claude_skills, dest)
+
+    assert (dest / "nemo-retriever-ingest" / "SKILL.md").is_file()
+    assert not (dest / "nemo-retriever-ingest").is_symlink()
+
+
+def test_copy_skills_accepts_mixed_symlink_and_local_skill_dirs(tmp_path: Path) -> None:
+    package_skills = tmp_path / "package" / "skills"
+    _write_skill(package_skills, "nemo-retriever-query")
+
+    root_skills = tmp_path / ".agents" / "skills"
+    root_skills.mkdir(parents=True)
+    (root_skills / "nemo-retriever-query").symlink_to(
+        package_skills / "nemo-retriever-query",
+        target_is_directory=True,
+    )
+    _write_skill(root_skills, "contributor-workflow")
+    (root_skills / "notes").mkdir()
+
+    dest = tmp_path / "dest"
+
+    _copy_skills(root_skills, dest)
+
+    assert (dest / "nemo-retriever-query" / "SKILL.md").is_file()
+    assert (dest / "contributor-workflow" / "SKILL.md").is_file()
+    assert not (dest / "notes").exists()
+
+
+def test_slash_prompts_use_task_specific_skill_names() -> None:
+    entry = DatasetEntry(
+        entry_id=1,
+        query_id="q1",
+        taxonomy_slot_id="retrieval",
+        original_query="What was revenue in 2024?",
+        paraphrased_prompt="Answer the revenue question.",
+        ground_truth_pages=[],
+    )
+
+    assert _render_setup_prompt("c3_retriever_skill").strip() == "/nemo-retriever-ingest ./pdfs/"
+    assert '/nemo-retriever-query "What was revenue in 2024?"' in _render_prompt(entry, "c3_retriever_skill")
+
+
+def test_manifest_slash_aliases_rewrite_to_split_skills() -> None:
+    assert _normalize_slash_command("/vidore-ingest ./pdfs/") == "/nemo-retriever-ingest ./pdfs/"
+    assert _normalize_slash_command("/vidore What was revenue?") == "/nemo-retriever-query What was revenue?"
+    assert _normalize_slash_command("/vidore_hr Find relevant pages") == "/nemo-retriever-query Find relevant pages"
diff --git a/nemo_retriever/tests/test_evaluation_retrievers.py b/nemo_retriever/tests/test_evaluation_retrievers.py
index 5e45ebf065..38b9cf63fc 100644
--- a/nemo_retriever/tests/test_evaluation_retrievers.py
+++ b/nemo_retriever/tests/test_evaluation_retrievers.py
@@ -13,14 +13,19 @@
 
 from __future__ import annotations
 
+import importlib
 import json
 from pathlib import Path
 from unittest.mock import patch
 
 import pytest
+from typer.testing import CliRunner
 
 from nemo_retriever.evaluation.retrievers import FileRetriever
 from nemo_retriever.llm.types import RetrievalResult
+from nemo_retriever.params import ModelRuntimeParams
+
+RUNNER = CliRunner()
 
 _SAMPLE_QUERIES: dict[str, dict] = {
     "What is the range of the 767?": {
@@ -216,6 +221,82 @@ def queries(self, queries):
     assert meta["collection_name"] == "nv-ingest"
 
 
+def test_query_lancedb_passes_local_hf_embed_options(monkeypatch) -> None:
+    """Check that ``query_lancedb`` forwards the local HF options to ``Retriever`` and reports them in ``meta``."""
+    from nemo_retriever.export import query_lancedb
+
+    hf_options = {
+        "local_query_embed_backend": "hf",
+        "local_hf_cache_dir": "/models/huggingface",
+        "local_hf_device": "cuda",
+    }
+    init_kwargs: dict[str, object] = {}  # keyword arguments the stub constructor receives
+
+    class _StubRetriever:
+        def __init__(self, **kwargs):
+            init_kwargs.update(kwargs)
+
+        def queries(self, queries):
+            return [[{"text": "range chunk", "source": "spec.pdf", "page_number": 3, "_distance": 0.1}]]
+
+    monkeypatch.setattr(importlib.import_module("nemo_retriever.retriever"), "Retriever", _StubRetriever)
+    _, meta = query_lancedb(
+        lancedb_uri="/tmp/lancedb",
+        lancedb_table="nv-ingest",
+        queries=[{"query": "What is the range of the 767?"}],
+        embedder="embedder",
+        **hf_options,
+    )
+
+    embed_kwargs = init_kwargs["embed_kwargs"]
+    assert embed_kwargs["local_ingest_embed_backend"] == "hf"  # query backend is passed under the ingest key
+    runtime = embed_kwargs["runtime"]
+    assert isinstance(runtime, ModelRuntimeParams)
+    assert (runtime.hf_cache_dir, runtime.device) == ("/models/huggingface", "cuda")
+    for option, expected in hf_options.items():
+        assert meta[option] == expected
+
+
+def test_eval_export_cli_passes_local_hf_embed_options(monkeypatch, tmp_path: Path) -> None:
+    """Check that the ``export`` command forwards its local HF embed flags to ``export_retrieval_json``."""
+    cli = importlib.import_module("nemo_retriever.evaluation.cli")
+    query_csv = tmp_path / "queries.csv"
+    output = tmp_path / "retrieval.json"
+    query_csv.write_text("query\nWhat is the range of the 767?\n", encoding="utf-8")  # header row + one query
+    captured_kwargs: dict[str, object] = {}  # keyword arguments the command passes to the export function
+
+    def fake_export_retrieval_json(**kwargs):
+        captured_kwargs.update(kwargs)
+        return {"queries": {"What is the range of the 767?": {"chunks": [], "metadata": []}}}
+
+    monkeypatch.setattr("nemo_retriever.export.export_retrieval_json", fake_export_retrieval_json)
+    result = RUNNER.invoke(
+        cli.app,
+        [
+            "export",
+            "--lancedb-uri",
+            "/tmp/lancedb",
+            "--lancedb-table",
+            "nv-ingest",
+            "--query-csv",
+            str(query_csv),
+            "--output",
+            str(output),
+            "--local-query-embed-backend",
+            "hf",
+            "--local-hf-cache-dir",
+            "/models/huggingface",
+            "--local-hf-device",
+            "cuda",
+        ],
+    )
+
+    assert result.exit_code == 0, result.output  # surface the CLI output when the command fails
+    assert captured_kwargs["local_query_embed_backend"] == "hf"
+    assert captured_kwargs["local_hf_cache_dir"] == "/models/huggingface"
+    assert captured_kwargs["local_hf_device"] == "cuda"
+
+
 def test_from_lancedb_no_save_path_keeps_memory_label() -> None:
     """Without ``save_path`` the instance reports the in-memory origin."""
     fake_meta = {"lancedb_uri": "mock"}
diff --git a/nemo_retriever/tests/test_root_cli_workflow.py b/nemo_retriever/tests/test_root_cli_workflow.py
index cccc6ce6b0..3f00d75a85 100644
--- a/nemo_retriever/tests/test_root_cli_workflow.py
+++ b/nemo_retriever/tests/test_root_cli_workflow.py
@@ -15,7 +15,14 @@
 
 import nemo_retriever.adapters.cli.sdk_workflow as sdk_workflow
 from nemo_retriever.graph_ingestor import GraphIngestor
-from nemo_retriever.params import AudioChunkParams, EmbedParams, ExtractParams, TextChunkParams, VideoFrameParams
+from nemo_retriever.params import (
+    AudioChunkParams,
+    EmbedParams,
+    ExtractParams,
+    ModelRuntimeParams,
+    TextChunkParams,
+    VideoFrameParams,
+)
 
 
 RUNNER = CliRunner()
@@ -628,6 +635,45 @@ def query(self, query: str) -> list[dict[str, Any]]:
     assert json.loads(result.output) == []
 
 
+def test_root_query_passes_local_hf_embed_options(monkeypatch) -> None:
+    """Check that the ``query`` command forwards the model name and local HF embed flags to ``Retriever``."""
+    retriever_calls: list[dict[str, Any]] = []  # constructor kwargs, one entry per FakeRetriever created
+
+    class FakeRetriever:
+        def __init__(self, **kwargs: Any) -> None:
+            retriever_calls.append(kwargs)
+
+        def query(self, query: str) -> list[dict[str, Any]]:
+            return []
+
+    monkeypatch.setattr(sdk_workflow, "Retriever", FakeRetriever)
+    result = RUNNER.invoke(
+        cli_main.app,
+        [
+            "query",
+            "Which passages mention deployment?",
+            "--embed-model-name",
+            "nvidia/llama-nemotron-embed-1b-v2",
+            "--local-query-embed-backend",
+            "hf",
+            "--local-hf-cache-dir",
+            "/models/huggingface",
+            "--local-hf-device",
+            "cuda",
+        ],
+    )
+
+    assert result.exit_code == 0, result.output  # surface the CLI output when the command fails
+    embed_kwargs = retriever_calls[0]["embed_kwargs"]
+    assert embed_kwargs["model_name"] == "nvidia/llama-nemotron-embed-1b-v2"
+    assert embed_kwargs["embed_model_name"] == "nvidia/llama-nemotron-embed-1b-v2"  # same value under both keys
+    assert embed_kwargs["local_ingest_embed_backend"] == "hf"  # query backend is passed under the ingest key
+    runtime = embed_kwargs["runtime"]
+    assert isinstance(runtime, ModelRuntimeParams)
+    assert runtime.hf_cache_dir == "/models/huggingface"
+    assert runtime.device == "cuda"
+
+
 def test_root_query_passes_reranker_url(monkeypatch) -> None:
     retriever_calls: list[dict[str, Any]] = []
     query_calls: list[str] = []