diff --git a/.agents/skills/nemo-retriever-evaluate b/.agents/skills/nemo-retriever-evaluate new file mode 120000 index 0000000000..a5c784b6d6 --- /dev/null +++ b/.agents/skills/nemo-retriever-evaluate @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate \ No newline at end of file diff --git a/.agents/skills/nemo-retriever-ingest b/.agents/skills/nemo-retriever-ingest new file mode 120000 index 0000000000..50670720dc --- /dev/null +++ b/.agents/skills/nemo-retriever-ingest @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest \ No newline at end of file diff --git a/.agents/skills/nemo-retriever-query b/.agents/skills/nemo-retriever-query new file mode 120000 index 0000000000..402b4e2e74 --- /dev/null +++ b/.agents/skills/nemo-retriever-query @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query \ No newline at end of file diff --git a/.agents/skills/nemo-retriever-service b/.agents/skills/nemo-retriever-service new file mode 120000 index 0000000000..be8bce771d --- /dev/null +++ b/.agents/skills/nemo-retriever-service @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service \ No newline at end of file diff --git a/.agents/skills/nemo-retriever-setup b/.agents/skills/nemo-retriever-setup new file mode 120000 index 0000000000..011ed27a8e --- /dev/null +++ b/.agents/skills/nemo-retriever-setup @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup \ No newline at end of file diff --git a/.claude/skills/nemo-retriever-evaluate b/.claude/skills/nemo-retriever-evaluate new file mode 120000 index 0000000000..a5c784b6d6 --- /dev/null +++ b/.claude/skills/nemo-retriever-evaluate @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate \ No newline at end of file diff --git a/.claude/skills/nemo-retriever-ingest b/.claude/skills/nemo-retriever-ingest 
new file mode 120000 index 0000000000..50670720dc --- /dev/null +++ b/.claude/skills/nemo-retriever-ingest @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest \ No newline at end of file diff --git a/.claude/skills/nemo-retriever-query b/.claude/skills/nemo-retriever-query new file mode 120000 index 0000000000..402b4e2e74 --- /dev/null +++ b/.claude/skills/nemo-retriever-query @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query \ No newline at end of file diff --git a/.claude/skills/nemo-retriever-service b/.claude/skills/nemo-retriever-service new file mode 120000 index 0000000000..be8bce771d --- /dev/null +++ b/.claude/skills/nemo-retriever-service @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service \ No newline at end of file diff --git a/.claude/skills/nemo-retriever-setup b/.claude/skills/nemo-retriever-setup new file mode 120000 index 0000000000..011ed27a8e --- /dev/null +++ b/.claude/skills/nemo-retriever-setup @@ -0,0 +1 @@ +../../nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup \ No newline at end of file diff --git a/.claude/skills/nemo-retriever/SKILL.md b/.claude/skills/nemo-retriever/SKILL.md deleted file mode 100644 index 6e07ff6f76..0000000000 --- a/.claude/skills/nemo-retriever/SKILL.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -name: nemo-retriever -description: Use when the user wants to search, index, or answer questions over a folder of PDFs (or other documents) — including building a RAG / search index over PDFs, looking up information across many PDFs, or running the `retriever` CLI (ingest, query, pipeline, recall, eval, etc.). ---- - -# nemo-retriever - -The `retriever` CLI indexes a folder of PDFs into LanceDB (`retriever ingest`) and serves vector search over it (`retriever query`). For any task about searching/answering questions across a folder of PDFs, use this CLI — do not write a custom RAG. 
- -## Setup turn (when `./lancedb/nv-ingest.lance` doesn't exist) - -`retriever ingest ./pdfs/` runs the full pipeline (text extraction + page-element detection + OCR + embedding + LanceDB insert). On corpora >~800 pages this often won't fit a typical setup turn budget (10 min) — the OCR + page-element stages dominate and scale roughly linearly with page count. Always build an index — pick the recipe by corpus size: - -```bash -TOTAL_PAGES=$(python -c "import pypdfium2, glob; print(sum(len(pypdfium2.PdfDocument(p)) for p in glob.glob('./pdfs/*.pdf')))" 2>/dev/null || echo 0) -echo "total_pages=$TOTAL_PAGES" -if [ "$TOTAL_PAGES" -le 800 ]; then - retriever ingest ./pdfs/ --embed-model-name nvidia/llama-nemotron-embed-1b-v2 -else - retriever pipeline run ./pdfs/ --run-mode inprocess --method pdfium --no-extract-tables --no-extract-charts --no-extract-page-as-image --evaluation-mode none --embed-model-name nvidia/llama-nemotron-embed-1b-v2 -fi -``` - -The `else` branch skips page-element detection, OCR, table extraction, and chart extraction — only pdfium text extraction + embedding. Embedding runs locally via the bundled HuggingFace model by default (no remote NIM needed). It's strictly better to have a text-only index than no index at all: the per-query pdfium text-extract fallback re-extracts a full PDF *per query*, which is both slow and expensive. Page-element detection may emit warning logs when its remote endpoint isn't reachable; the warnings are non-fatal as long as the embedding step itself succeeds. - -Don't pre-OCR, don't pre-chunk, don't write Python wrappers — the CLI handles extraction + (optionally) page-element detection + OCR + embedding + LanceDB insert in one shot. 
- -## Query turn — the WHOLE workflow - -```bash -retriever query "" --top-k 10 --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --rerank \ - | tee /tmp/hits.json \ - | jq -r '.[] | "rank=\(.rank // 0) page=\(.page_number) pdf=\(.pdf_basename) type=\(.metadata.type // "?") text=\(.text[:200])"' -``` - -Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | jq ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The full JSON sits at `/tmp/hits.json` if you need to re-parse it (`jq '.[6]' /tmp/hits.json`), but in the common case the jq summary above is all you need. - -That's your FIRST tool call on every query turn. Do not Read, Glob, Grep, or list PDFs before this — those duplicate what `retriever query` already did. - -**No narration between tool calls.** Do not write "Let me search…", "I'll now analyze…", "The retriever returned…", or any other commentary. Every assistant token you emit between the `retriever query` Bash call and the `Write` of `./output.json` becomes input tokens (and cached input tokens) for every subsequent turn in this session — quadratic cost. Go straight from reading the jq summary to writing the JSON file. The only assistant text in a query turn should be the tool calls themselves. - -Each hit has: `text`, `pdf_basename`, `page_number` (int, **1-indexed**: the first page of a PDF is page `1`), `pdf_page` (string composite key `"_"` — not a number, don't use it as one), `_distance`, and `metadata` (JSON with `type` ∈ `text|table|chart|image`). - -**Then write `./output.json` directly from $HITS:** - -- `final_answer`: synthesize from the top hits' `text`. Include the exact number / name / date / row / column the question asks for, plus the source PDF and 0-indexed page. One paragraph. 
No restating the question, no hedging caveats. If the chunks talk *around* the fact but don't state it, run ONE `retriever pdf stage page-elements ./pdfs --method pdfium --json-output-dir /tmp/pdf_text --compact-json` and read `/tmp/pdf_text/.pdf.pdf_extraction.json` for the rank-1 page (or rank-2 if rank-1 is metadata) — that almost always surfaces the exact figure. Then synthesize. **If after both calls the asked-for fact still isn't in the evidence, write `final_answer` that says so explicitly** — e.g. "The retrieved pages do not state [X] for [entity]; the closest content is [Y]." Do NOT invent, extrapolate, or generate plausible-sounding content from adjacent material. A confidently-wrong answer scores worse than an honest "not in the retrieved pages". -- `ranked_retrieved`: one entry per hit in the order `retriever query` returned: `{"doc_id": "", "page_number": , "rank": }`. Up to 10. Duplicate `(doc, page)` is fine. **Indexing:** the retriever's `page_number` is 1-indexed. If the task's output schema says 0-indexed (e.g. "first page is page 0"), emit `hit.page_number - 1`; if the task says 1-indexed or doesn't specify, emit `hit.page_number` as-is. - -**Before writing `final_answer`, re-read the question.** If it lists multiple entities, years, or categories, your answer must address each one explicitly — even if for some of them the chunks say "not provided" or contain no data. Missing entities lose more judge points than imprecise numbers. - -**Charts and images need extra caution — this is the single biggest source of judge=2/3 trials.** When `metadata.type` of a hit is `chart` or `image`, its `text` field is a model-generated transcription that frequently: - -- reverses direction words (`increase`↔`decrease`, `rose`↔`fell`, `surge`↔`drop`), and -- rounds or misreads exact percentages (e.g. transcribing 12% as 20%). 
- -If a question asks for an exact percentage or a directional claim **and the evidence is only a chart/image hit** (no `text`-type hit corroborates the same number or direction): - -1. Run the targeted `retriever pdf stage page-elements --method pdfium` text-extract on the rank-1 PDF (this counts as your second tool call) and look for the number in prose. -2. If prose confirms the chart number, assert it confidently. -3. If prose doesn't mention it, **quote the chart transcription verbatim with an explicit hedge in `final_answer`**: "The chart on page N indicates [verbatim phrase] (chart-derived, not verified against prose)." Do NOT restate the chart's number as a confident fact. - -When both a chart hit and a text hit cover the same fact, always prefer the text hit's number. - -After writing the file, STOP. No print, no summary, no further tool calls. - -### Hard limits (cost discipline) - -- ONE `retriever query` per turn. ONE optional targeted text-extract on the rank-1 PDF if the chunks miss the asked-for fact. That's the budget — it is a hard cap, not a soft preference. -- After your 2nd tool call, write `final_answer` with what you have and STOP. If both calls left the asked-for fact unresolved, write `final_answer` that **explicitly states the retrieved pages don't contain the requested fact** (naming the closest related content if any) — **do not run more tool calls hunting for it, and do not extrapolate a plausible value.** Long-running query turns (5+ tool calls, 1M+ cache-read tokens) cost ~5× a disciplined turn and usually still produce the wrong answer. -- Don't read whole PDFs. -- Don't make speculative Read/Glob/Grep calls "to confirm". The retriever already found the relevant pages — trust the ranking. -- Don't spawn agents, write plans, or make todo lists. The workflow above is the workflow. - -### If the index is missing or `retriever query` returns `[]` - -Means ingest didn't complete (e.g. 
the text-only pipeline still hit the turn wall, or the table is empty). Tight fallback using the retriever's own pdfium-based extractor (always available — same binary the agent just used for `retriever query`): -1. `ls ./pdfs/` (one call) to see filenames. -2. Pick the SINGLE PDF whose name best matches the question. -3. ONE call: `retriever pdf stage page-elements ./pdfs --method pdfium --json-output-dir /tmp/pdf_text --compact-json`. This emits a JSON sidecar per PDF at `/tmp/pdf_text/.pdf.pdf_extraction.json` containing per-page text primitives — pdfium only, no OCR, no NIM, fast. -4. `jq` (or read directly) `/tmp/pdf_text/.pdf.pdf_extraction.json` for the chosen PDF and synthesize from the per-page text. If the answer isn't there, still write your best guess based on the filename + extracted pages plus a one-sentence acknowledgement of uncertainty in `final_answer`. Then stop. - -Do NOT keep doing text-extract calls across many PDFs to hunt — that exhausts the turn budget. Better to answer partially than to time out. Never re-run `retriever ingest`. - -For an unlisted subcommand: `retriever --help`. - -## Failure modes - -- **First `ingest` takes ~60s+** — vLLM warmup. Expected. -- **First `query` takes ~10–15s** — embedder cold-start. Expected. -- **Empty result** — ingest didn't run. Use the fallback above. -- **`Clamping num_partitions ...`** — informational on tiny corpora, not an error. -- **Low-relevance top hit on tiny corpus** — look at `_distance` *gaps* between hits, not absolute values. diff --git a/.claude/skills/nemo-retriever/references/ingest.md b/.claude/skills/nemo-retriever/references/ingest.md deleted file mode 100644 index b3a52788ce..0000000000 --- a/.claude/skills/nemo-retriever/references/ingest.md +++ /dev/null @@ -1,124 +0,0 @@ -# retriever ingest - -End-to-end ingestion of documents and media into a LanceDB table — runs the -full extract → embed → vector-DB pipeline in a single command. 
- -If flags below look stale, re-check `retriever ingest --help`. - -## When to use this - -- You have one or more supported files (or a directory/glob of files) and want them - searchable via `retriever query`. -- You want the default pipeline: auto-select extraction for PDF/DOC/PPTX, - text, HTML, image, audio, or video inputs, then embed and insert into - LanceDB. No per-stage tuning needed. - -**Use a different command when:** - -- You only need a single stage (e.g. just extract text, no embeddings) → - `retriever pdf`, `retriever chart`, `retriever image`, etc. -- You want fine-grained control over the pipeline graph → `retriever pipeline`. -- You need a long-running service rather than one-shot CLI → `retriever service`. -- You're benchmarking throughput → `retriever benchmark`. -- You're iterating on the pipeline locally and want a non-distributed runner → - `retriever local`. - -## Canonical invocations - -Ingest a single file into the default table (`lancedb/nv-ingest.lance`): - -```bash -retriever ingest data/multimodal_test.pdf -``` - -Ingest a directory of supported files: - -```bash -retriever ingest data/corpus/ -``` - -Ingest via glob: - -```bash -retriever ingest "data/**/*" -``` - -Force a specific input family: - -```bash -retriever ingest data/slides/ --input-type doc -retriever ingest data/images/ --input-type image -retriever ingest data/audio/ --input-type audio -retriever ingest data/video/ --input-type video -``` - -Write to a custom DB / table: - -```bash -retriever ingest data/multimodal_test.pdf \ - --lancedb-uri ./my-lancedb \ - --table-name my-corpus -``` - -## Inputs - -- **Positional `DOCUMENTS...`** — one or more file paths, directories, or - shell globs. Required, repeatable. -- **Supported input types** — `pdf`, `doc` (`.docx`, `.pptx`), `txt`, `html`, - `image` (`.jpg`, `.jpeg`, `.png`, `.tiff`, `.tif`, `.bmp`, `.svg`), - `audio` (`.mp3`, `.wav`, `.m4a`), and `video` (`.mp4`, `.mov`, `.mkv`). 
- -## Outputs - -- A LanceDB dataset at `/.lance`. Default: - `./lancedb/nemo-retriever.lance`. -- One row per extracted primitive (text chunk, table, chart, image region), - each with: `text`, `source`, `page_number`, `metadata` (JSON: type, bbox, …), - and the embedding vector. - -## Key flags - -| Flag | Default | Notes | -|---|---|---| -| `--lancedb-uri` | `lancedb` | Path or URI of the LanceDB database. | -| `--table-name` | `nv-ingest` | LanceDB table to write into. Must match `retriever query`'s table on read. | -| `--input-type` | `auto` | Input family to ingest. `auto` detects from file extensions and supports mixed directories. | -| `--run-mode` | `inprocess` | `inprocess` for local runs; `batch` for the SDK batch ingestor. | - -## Pipeline shape - -For PDF/DOC/PPTX inputs, `ingest` runs the optimized document pipeline: - -1. `DocToPdfConversionActor` — non-PDF inputs → PDF (no-op for PDFs). -2. `PDFSplitActor` — split into per-page tasks. -3. `PDFExtractionActor` — extract native text/structure. -4. `PageElementDetectionActor` — detect tables, charts, images, text blocks. -5. `OCRV2Actor` — OCR text where native extraction is missing/poor. -6. `UDFOperator` — user-defined transforms (passthrough by default). -7. `_BatchEmbedActor` — embed primitives with `llama-nemotron-embed-1b-v2`. -8. `IngestVdbOperator` — insert rows into LanceDB. - -For text, HTML, image, audio, video, or mixed `auto` inputs, `ingest` routes -through the same GraphIngestor extraction paths used by `retriever pipeline`. - -## Common failure modes - -- **`Clamping num_partitions from 16 to 7`** — informational, not an error. - LanceDB IVF index needs `num_partitions < row_count`; happens on very small - ingests. -- **First run is slow (~60s+ before any pages process)** — vLLM model load and - CUDA-graph capture for the embedder. Subsequent runs in the same process - are fast; one-shot CLI invocations always pay this cost. 
-- **`No existing dataset at …/nemo-retriever.lance, it will be created`** — expected - on the first ingest into a new DB. Subsequent ingests append. -- **HuggingFace download on first run** — the embedder and page-element - detector pull weights to `~/.cache/huggingface`. Needs network the first - time; cached afterwards. - -## Related - -- [[query]] — search the table this command writes. -- `retriever vector-store --help` — utilities for inspecting/moving LanceDB - tables. -- `retriever pipeline --help` — same end-to-end ingest but exposes per-stage - knobs. diff --git a/.claude/skills/nemo-retriever/references/query.md b/.claude/skills/nemo-retriever/references/query.md deleted file mode 100644 index b9dfe9ccc7..0000000000 --- a/.claude/skills/nemo-retriever/references/query.md +++ /dev/null @@ -1,95 +0,0 @@ -# retriever query - -Embed a text query and return the top-k nearest rows from a LanceDB table -previously written by `retriever ingest` (or any compatible pipeline). - -If flags below look stale, re-check `retriever query --help`. - -## When to use this - -- You have already ingested documents and want to retrieve relevant - chunks/primitives for a natural-language query. -- You want a one-shot CLI lookup — no service, no UI. - -**Use a different command when:** - -- You want recall metrics over a labelled query set → `retriever recall`. -- You want to grade end-to-end QA quality → `retriever eval`. -- You want a long-running query endpoint → `retriever service`. -- You want to compare two retrieval runs → `retriever compare`. - -## Canonical invocations - -Top-10 search against the default table: - -```bash -retriever query "what is in chart 1?" -``` - -Top-3, custom table: - -```bash -retriever query "average frequency ranges for tweeters" \ - --top-k 3 \ - --lancedb-uri ./my-lancedb \ - --table-name my-corpus -``` - -## Inputs - -- **Positional `QUERY`** — single text string. Required. Quote it in the shell - to keep multi-word queries intact. 
- -## Outputs - -- JSON array on stdout, one object per hit, sorted by ascending `_distance` - (lower = more similar). Each hit includes: - - `_distance` — vector distance in the embedding space. - - `text` — the retrieved primitive's text content. - - `source` / `path` / `source_id` — origin document path. - - `page_number`, `pdf_basename`, `pdf_page` — locator. - - `metadata` — JSON string with `type` (`text` / `table` / `chart` / `image`) - and, where applicable, a normalised `bbox_xyxy_norm`. - -Pipe to `jq` for filtering, e.g. only chart hits: - -```bash -retriever query "gadget costs" | jq '[.[] | select(.metadata | fromjson.type == "chart")]' -``` - -## Key flags - -| Flag | Default | Notes | -|---|---|---| -| `--top-k` | `10` | Max hits to return. Must be ≥ 1. | -| `--lancedb-uri` | `lancedb` | Must match what `ingest` wrote to. | -| `--table-name` | `nemo-retriever` | Must match what `ingest` wrote to. | - -## Distance interpretation - -- The embedder (`llama-nemotron-embed-vl-1b-v2`) returns mean-pooled vectors; - LanceDB returns L2 distance by default. Typical relevant hits are in the - ~1.0–1.7 range for this model on prose queries; treat `_distance` as - **ranking-only**, not a calibrated similarity score. -- The query uses the **VL** variant of the embedder so text queries can match - ingested image/chart embeddings as well as text. Expect mixed-modality hits - in the result list. - -## Common failure modes - -- **Empty result array** — table is empty (no ingest run yet) or - `--table-name` / `--lancedb-uri` don't match where ingest wrote. -- **`Table 'nemo-retriever' was not found`** — same root cause: wrong table/URI, - or ingest hasn't been run. -- **First query is slow (~10–15s)** — vLLM startup for the query embedder. - Subsequent queries in the same process are sub-second; one-shot CLI - invocations always pay this cost. 
-- **Surprisingly low-relevance top hit** — for very short corpora, even - unrelated queries return *something* with a non-huge distance. Inspect - `_distance` gaps between hits rather than absolute values. - -## Related - -- [[ingest]] — populate the table this command reads. -- `retriever recall --help` — batch query → recall@k against ground truth. -- `retriever eval --help` — end-to-end QA evaluation. diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index 9ff55b3ec9..e8571cd1b7 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -214,6 +214,7 @@ explicit = true where = ["src"] [tool.setuptools.package-data] +"nemo_retriever" = [".agents/skills/**/*"] "nemo_retriever.harness.portal" = ["static/**/*"] "nemo_retriever.service" = ["retriever-service.yaml"] "nemo_retriever.skill_eval" = ["prompts/*.j2", "configs/*.yaml"] diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/PITFALLS.md new file mode 100644 index 0000000000..dfa781a8a6 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/PITFALLS.md @@ -0,0 +1,52 @@ +# Evaluate Pitfalls + +## Missing Evaluation Surface + +If `retriever recall` or `retriever eval` is unavailable, first try the +repo-local CLI when a source checkout exists: + +```bash +uv run --project nemo_retriever retriever eval --help +``` + +Retry dependency downloads before choosing another validation path. + +## Table Name Drift + +Root CLI ingest defaults to table `nv-ingest`. Some evaluation docs and older +graph-pipeline examples mention `nemo-retriever`. Export/recall must point at +the table that was actually written. + +## Retrieval JSON Contract + +`retriever eval run` in file mode needs a retrieval JSON whose top-level +`queries` object maps each ground-truth question string to retrieved `chunks`. 
+If query strings differ from the ground truth loader's normalization, coverage +will drop even if retrieval quality is good. + +## Coverage Failures + +`retriever eval run` checks retrieval coverage before generation. Low coverage +usually means the retrieval JSON and QA dataset keys do not align, the wrong +dataset loader was selected, or the wrong table/query CSV was used. + +## LLM Extras And Keys + +QA eval needs the `[llm]` extra and generator/judge API configuration. Missing +`litellm`, `NVIDIA_API_KEY`, `GEN_API_KEY`, or `JUDGE_API_KEY` should be reported +as setup gaps, not retrieval failures. + +## Eval Export Remote Endpoint Gap + +`retriever eval export` supports local-HF query embedding with +`--local-query-embed-backend hf`, `--local-hf-cache-dir`, and +`--local-hf-device`, but it still does not expose `--embed-invoke-url` / +`--embedding-http-endpoint` for remote/self-hosted embedding services. If export +must use a remote endpoint, use `retriever recall ... --embedding-http-endpoint ...` +for recall metrics or build the retrieval JSON with the Python `Retriever` API +and explicit `embed_kwargs`. + +## Requery Cost + +Do not re-ingest or re-query LanceDB when changing only generator/judge models. +Save and reuse retrieval JSON whenever the retrieval stage is unchanged. diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/SKILL.md new file mode 100644 index 0000000000..aea96be3fa --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/SKILL.md @@ -0,0 +1,82 @@ +--- +name: nemo-retriever-evaluate +description: Use when the user asks to measure NeMo Retriever retrieval quality, recall, QA answer quality, compare retrieval outputs, export evaluation JSON, build page markdown indexes, or run `retriever recall` / `retriever eval` workflows. 
Do not use for ad-hoc answering from a single index; use `nemo-retriever-query`. +--- + +# nemo-retriever-evaluate + +Use this skill for repeatable retrieval or QA evaluation, not one-off question +answering. + +## Orientation + +1. Verify the installed surface: `retriever recall --help`, + `retriever recall vdb-recall run --help`, `retriever eval export --help`, + and `retriever eval run --help`. +2. Decide the evaluation type: + - Recall metrics over labeled query/page data: `retriever recall vdb-recall run`. + - QA generation and judging: `retriever eval export` plus `retriever eval run`, + or `retriever eval run --from-env`. + - End-to-end ingest plus QA: `retriever pipeline run --evaluation-mode qa`. +3. If the installed CLI is absent but this is a source checkout, use + `uv run --project nemo_retriever retriever ...`. Retry dependency downloads + before choosing another evaluation validation path. +4. If neither path works, use `nemo-retriever-setup` before debugging evaluation + behavior. + +## References + +- `references/EVALUATE.md`: recall, QA export/run, page markdown, config, and + artifact contracts. +- `PITFALLS.md`: table mismatches, retrieval JSON schema, coverage failures, + missing LLM extras, and expensive reruns. + +## Workflow + +1. Identify the corpus/index and ground-truth data. Query CSVs commonly need + `query,pdf_page` or `query,pdf,page`. +2. For recall: + + ```bash + retriever recall vdb-recall run \ + --query-csv ./queries.csv \ + --lancedb-uri ./lancedb \ + --table-name nv-ingest \ + --top-k 10 + ``` + +3. For QA evaluation with reusable retrieval JSON: + + ```bash + retriever eval export \ + --lancedb-uri ./lancedb \ + --lancedb-table nv-ingest \ + --query-csv ./qa.csv \ + --output ./eval/retrieval.json + + retriever eval run --config ./eval_sweep.yaml + ``` + +4. If local HuggingFace query embeddings are required, add + `--local-query-embed-backend hf`, `--local-hf-cache-dir`, and + `--local-hf-device` to `eval export`. +5. 
If `eval export` cannot use the required remote embedding endpoint, build the + retrieval JSON with the Python `Retriever` API and `write_retrieval_json` + using explicit `embed_kwargs`. +6. Use `retriever eval build-page-index` when full-page markdown is needed from + Parquet produced during ingest. +7. Preserve generated artifacts and report exact paths so the user can rerun + generation/judging without re-querying LanceDB. + +## Success Checks + +- Recall prints `recall@...` metrics or rich recall output. +- QA eval reports coverage and writes result JSON. +- Retrieval JSON has a top-level `queries` object keyed by ground-truth question + strings. + +## Evaluation Scenarios + +- "Measure recall@10 for this LanceDB table and query CSV." Use this skill. +- "Export retrieval JSON and run QA judging with a config." Use this skill. +- "What does the index say about a single question?" Use `nemo-retriever-query`. diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/references/EVALUATE.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/references/EVALUATE.md new file mode 100644 index 0000000000..5349ad697c --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-evaluate/references/EVALUATE.md @@ -0,0 +1,192 @@ +# Evaluate Reference + +## Contents + +- [Recall](#recall) +- [QA Evaluation](#qa-evaluation) +- [Eval From Environment](#eval-from-environment) +- [End-To-End QA Shortcut](#end-to-end-qa-shortcut) +- [LLM Requirements](#llm-requirements) + +## Recall + +Use recall when the task has labeled expected pages or document/page keys: + +```bash +retriever recall vdb-recall run \ + --query-csv ./queries.csv \ + --lancedb-uri ./lancedb \ + --table-name nv-ingest \ + --top-k 10 +``` + +The query CSV expects `query,pdf_page` or `query,pdf,page`. The command retrieves +at least 10 internally for recall@10 even when fewer hits are printed. 
+ +Remote query embedding options include: + +- `--embedding-endpoint` +- `--embedding-http-endpoint` +- `--embedding-grpc-endpoint` +- `--embedding-model` +- `--embedding-api-key` + +If you omit the embedding endpoint options, recall falls back to local +HuggingFace embeddings and may download a model. For quick remote-NIM runs, +pass the endpoint/model/API key explicitly. + +## QA Evaluation + +Preferred reproducible path: + +```bash +retriever eval export \ + --lancedb-uri ./lancedb \ + --lancedb-table nv-ingest \ + --query-csv ./qa.csv \ + --output ./eval/retrieval.json + +retriever eval run --config ./eval_sweep.yaml +``` + +`retriever eval export` writes the retrieval JSON contract consumed by +`retriever eval run` / `FileRetriever`. It can also use `--page-index` to replace +sub-page chunks with full-page markdown. + +For local HuggingFace query embeddings: + +```bash +retriever eval export \ + --lancedb-uri ./lancedb \ + --lancedb-table nv-ingest \ + --query-csv ./qa.csv \ + --output ./eval/retrieval.json \ + --top-k 5 \ + --embedder nvidia/llama-nemotron-embed-1b-v2 \ + --local-query-embed-backend hf \ + --local-hf-cache-dir "$HOME/models/huggingface" \ + --local-hf-device cuda +``` + +Minimal `eval_sweep.yaml` for an existing retrieval JSON: + +```yaml +dataset: + source: "csv:./qa.csv" + +retrieval: + type: "file" + file_path: "./eval/retrieval.json" + +models: + generator: + model: "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5" + api_key: "${NVIDIA_API_KEY}" + judge: + model: "nvidia_nim/mistralai/mixtral-8x22b-instruct-v0.1" + api_key: "${NVIDIA_API_KEY}" + +evaluations: + - generator: "generator" + judge: "judge" + runs: 1 + +execution: + top_k: 5 + max_workers: 8 + +output: + results_dir: "./eval/results" +``` + +`retriever eval export` supports local-HF query embedding options, but it does +not currently expose a remote embedding endpoint flag. 
When the index must be +queried with a remote/self-hosted embedding endpoint, use the Python API to +create the same FileRetriever JSON contract (`queries` below is not defined by the snippet; supply your own iterable of rows whose `query` key holds the ground-truth question string): + +```python +from nemo_retriever.export import write_retrieval_json +from nemo_retriever.retriever import Retriever + +retriever = Retriever( + top_k=5, + vdb_kwargs={"uri": "./lancedb", "table_name": "nv-ingest"}, + embed_kwargs={ + "embedding_endpoint": "http://embed:8000/v1", + "embed_invoke_url": "http://embed:8000/v1", + "model_name": "nvidia/llama-nemotron-embed-1b-v2", + "embed_model_name": "nvidia/llama-nemotron-embed-1b-v2", + }, + rerank=False, +) + +all_results = {} +for row in queries: + hits = retriever.query(row["query"]) + all_results[row["query"]] = { + "chunks": [hit.get("text", "") for hit in hits], + "metadata": [ + { + "source_id": hit.get("source_id") or hit.get("source"), + "page_number": hit.get("page_number"), + "distance": hit.get("_distance"), + } + for hit in hits + ], + } + +write_retrieval_json(all_results, "./eval/retrieval.json", {"vdb_backend": "lancedb"}) +``` + +Build a page markdown index from ingestion Parquet: + +```bash +retriever eval build-page-index \ + --parquet-dir ./processed_docs \ + --output ./page_markdown.json +``` + +## Eval From Environment + +Run from an existing retrieval JSON: + +```bash +export RETRIEVAL_FILE=./eval/retrieval.json +export QA_DATASET=csv:./qa.csv +export RESULTS_DIR=./eval/results +retriever eval run --from-env +``` + +Run live retrieval from LanceDB and optionally save the retrieval JSON for +repeatable reruns: + +```bash +export LANCEDB_URI=./lancedb +export LANCEDB_TABLE=nv-ingest +export QA_DATASET=csv:./qa.csv +export RETRIEVAL_SAVE_PATH=./eval/retrieval.json +export RESULTS_DIR=./eval/results +retriever eval run --from-env +``` + +## End-To-End QA Shortcut + +`retriever pipeline run` can ingest and run QA in one command: + +```bash +retriever pipeline run ./data/corpus \ + --lancedb-uri ./lancedb \ + --evaluation-mode qa \ + --eval-config 
./eval_sweep.yaml \ + --query-csv ./qa.csv \ + --retrieval-save-path ./eval/retrieval.json +``` + +Use this for development iteration. For benchmark comparisons, prefer the +separable export/run path so retrieval can be reused. + +## LLM Requirements + +QA generation and judging need the `nemo-retriever[llm]` extra and model/API +configuration in the eval config or environment. `NVIDIA_API_KEY` is commonly +used as a fallback for generator and judge keys. diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/PITFALLS.md new file mode 100644 index 0000000000..61bea87e6c --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/PITFALLS.md @@ -0,0 +1,84 @@ +# Ingest Pitfalls + +## No Installed Surface + +First check the installed user surface: + +```bash +retriever --help +python -c "import importlib.util; print(importlib.util.find_spec('nemo_retriever'))" +``` + +If `retriever` is missing and this is explicitly a source checkout, use the +developer fallback: + +```bash +uv run --project nemo_retriever retriever --help +``` + +If that fails because dependencies need to download, retry the command. + +Only report the environment as missing after checking the installed command and, +when a source checkout is actually available, the repo-local fallback. + +If there is no installed command, no importable `nemo_retriever` package, and no +source checkout, switch to environment setup. Do not guess alternate command +names such as `nemo-retriever` or proceed with a custom RAG implementation. + +## False-Positive Ingest Output + +`retriever ingest` can exit zero and print `Ingested N document(s)` even when no +uploadable LanceDB rows were produced. Always validate with `db.table_names()`, +`table.count_rows()`, and a smoke query before reporting success. 
+ +## TXT Requires Transformers + +TXT ingestion uses a HuggingFace tokenizer. In a lean environment, missing +`transformers` can lead to empty extraction or direct +`ModuleNotFoundError: No module named 'transformers'`. Recover by installing or +transiently adding it. Installed environment: + +```bash +uv pip install transformers +retriever ingest ./docs/*.txt --input-type txt ... +``` + +Source checkout: + +```bash +uv run --project nemo_retriever --with transformers retriever ingest ./docs/*.txt --input-type txt ... +``` + +## Table Defaults Drift + +The root CLI defaults are `--lancedb-uri lancedb` and `--table-name nv-ingest`. +Some older docs and examples mention `nemo-retriever`. Always match the table +that was actually written. + +## Overwrite Is Default + +`retriever ingest` overwrites the target table unless `--append` is passed. Do +not append on reruns unless duplicates are acceptable. + +## First Run Can Be Slow + +Local GPU model loading, CUDA graph capture, Ray startup, and first-time model +downloads can dominate the first run. This is not automatically a failed ingest. +Look for a non-zero exit or explicit validation error. + +## Remote Endpoints Need Matching Query Settings + +If ingest used `--embed-invoke-url` or a non-default `--embed-model-name`, query +and evaluation must use the same endpoint/model pair. Mismatched embeddings can +return empty or irrelevant hits. + +## Single-PDF And Tiny Corpora + +Tiny LanceDB tables can emit partition/index warnings or produce weak nearest +neighbors for unrelated queries. Validate with a query that should be present in +the corpus and inspect rows before tuning thresholds. + +## Stale Documentation + +If a flag from docs is rejected, run the command-specific `--help` and adapt to +the installed CLI. Teach the mismatch in your final answer rather than hiding it. 
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/SKILL.md new file mode 100644 index 0000000000..4469e26a4c --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/SKILL.md @@ -0,0 +1,97 @@ +--- +name: nemo-retriever-ingest +description: Use when the user asks to ingest, index, embed, or make documents searchable with NeMo Retriever, including `retriever ingest`, `retriever pipeline run`, LanceDB creation, extraction outputs, or ingestion validation. Do not use for querying an existing index; use `nemo-retriever-query` instead. +--- + +# nemo-retriever-ingest + +Use this skill to build a searchable NeMo Retriever corpus. It teaches the +current CLI/SDK behavior, the defaults that matter across tasks, and validation +checks that distinguish a real index from a false-positive run. + +## Orientation + +1. From the active environment, verify the public surface: `retriever --help`, + then `retriever ingest --help` or `retriever pipeline run --help`. +2. If `retriever` is not on PATH but this is a source checkout, bootstrap the + CLI with `uv run --project nemo_retriever retriever --help`. If dependencies + need to download, retry the command and continue from the validated command + surface. +3. If neither an installed command nor a source checkout is available, this is + an environment setup blocker, not an ingest failure. Use the setup workflow + first; do not invent a package name or private command. +4. Choose the simplest ingestion path that satisfies the task: + - `retriever ingest ...` for one-shot ingest into LanceDB. + - `retriever pipeline run ...` when the task needs saved Parquet, image + storage, evaluation mode, service run mode, or lower-level tuning. + - Python `create_ingestor(...)` when the user explicitly wants SDK code. +5. 
Record the `lancedb_uri`, table name, run mode, and any remote NIM endpoints + because query and evaluation tasks must match them exactly. + +## References + +- `references/INGEST.md`: command choices, defaults, remote inference, SDK notes, + and validation checks. +- `PITFALLS.md`: install gaps, table mismatches, slow startup, empty corpora, + model downloads, and stale docs. + +## Workflow + +1. Identify input paths and supported file types. For directories, expect + `retriever ingest` to expand supported files; for `pipeline run`, confirm the + desired `--input-type` when the corpus is not obvious. +2. Decide local versus remote inference before running: + - Remote NIM inference: set `NVIDIA_API_KEY` when using build.nvidia.com and + pass explicit `--*-invoke-url`, `--embed-invoke-url`, and model flags. + - Local inference: confirm the environment has the needed extras, CUDA stack, + and model cache. Route HuggingFace downloads to `~/models` when preparing a + new environment. +3. Run ingest with explicit index settings when the index will be reused: + + ```bash + retriever ingest ./data/corpus --lancedb-uri ./lancedb --table-name nv-ingest + ``` + + From this repo checkout, use: + + ```bash + uv run --project nemo_retriever retriever ingest ./data/corpus --lancedb-uri ./lancedb --table-name nv-ingest + ``` + +4. Use `retriever pipeline run` when saved intermediates matter: + + ```bash + retriever pipeline run ./data/corpus --input-type pdf --save-intermediate ./processed_docs + ``` + +5. Validate the table before declaring success. A zero exit and "Ingested N + document(s)" are not sufficient: + + ```python + import lancedb + + db = lancedb.connect("./lancedb") + print(db.table_names()) + table = db.open_table("nv-ingest") + print(table.count_rows()) + ``` + + Then run a smoke query with the same URI, table, embedding endpoint, and + model used for ingest. 
+ +## Success Checks + +- The command reports the target LanceDB URI and table, or the expected Parquet + directory exists for `--save-intermediate`. +- A focused `retriever query ... --lancedb-uri ... --table-name ...` returns at + least one JSON hit for a query that should match the corpus. +- The query skill can reuse the recorded URI/table/model settings without + guessing. + +## Evaluation Scenarios + +- "Index the PDFs in `data/reports` with NeMo Retriever." Use this skill. +- "Run a tuned batch ingestion and save Parquet for later page markdown." Use + this skill and prefer `retriever pipeline run`. +- "Answer a question from an existing LanceDB table." Use `nemo-retriever-query`, + not this skill. diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/references/INGEST.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/references/INGEST.md new file mode 100644 index 0000000000..00b0dbe21d --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-ingest/references/INGEST.md @@ -0,0 +1,146 @@ +# Ingest Reference + +## Contents + +- [Command Selection](#command-selection) +- [Inputs](#inputs) +- [Remote Inference](#remote-inference) +- [Python SDK Notes](#python-sdk-notes) +- [Validation](#validation) + +## Command Selection + +Use `retriever ingest` for the compact installed-user path: + +```bash +retriever ingest ./data/corpus --lancedb-uri ./lancedb --table-name nv-ingest +``` + +When working from this source checkout and the installed command is absent, use +the project environment instead of stopping: + +```bash +uv run --project nemo_retriever retriever ingest ./data/corpus \ + --lancedb-uri ./lancedb \ + --table-name nv-ingest +``` + +Observed from the root CLI tests: + +- Default `--lancedb-uri` is `lancedb`. +- Default `--table-name` is `nv-ingest`. +- Default `--run-mode` is `inprocess`. +- The command overwrites the target table by default. 
Use `--append` only when + duplicate rows are acceptable or the caller explicitly wants append behavior. +- Directories are expanded to supported files. Empty directories and unsupported + extensions are user-facing errors. + +Use `retriever pipeline run` when the task needs lower-level controls: + +```bash +retriever pipeline run ./data/corpus \ + --input-type pdf \ + --method pdfium \ + --save-intermediate ./processed_docs +``` + +Important differences: + +- `pipeline run` exposes more extraction, chunking, storage, Ray, service, and + evaluation flags. +- `--save-intermediate` writes extraction results as Parquet, which is needed + for full-page markdown QA evaluation. +- `--no-vdb` skips vector DB upload. +- `--run-mode service` submits work to a running Retriever service. +- `pipeline run` defaults to `--run-mode batch`; pass `--run-mode inprocess` + for a small local smoke test. + +## Inputs + +`retriever ingest` supports `auto`, `pdf`, `doc`, `txt`, `html`, `image`, +`audio`, and `video` input types. `doc` covers Office documents such as DOCX and +PPTX but routes through the PDF/document extraction path. + +Media workflows need extra system dependencies: + +- TXT chunking uses HuggingFace tokenizers. If `txt_file_to_chunks_df` or a + text ingest fails with `ModuleNotFoundError: No module named 'transformers'`, + install or transiently add the missing dependency, then rerun. In an installed + environment use `uv pip install transformers`; in a source checkout use + `uv run --project nemo_retriever --with transformers retriever ingest ...`. +- Audio/video: `ffmpeg` / `ffprobe`. +- SVG rendering: `cairosvg` and its system dependencies. +- Local GPU inference: install the `[local]` extra and CUDA-compatible PyTorch. + +## Remote Inference + +For hosted or self-hosted NIMs, pass the stage endpoints explicitly: + +```bash +export NVIDIA_API_KEY=nvapi-... 
+retriever ingest ./data/corpus \ + --page-elements-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3 \ + --ocr-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1 \ + --ocr-version v1 \ + --graphic-elements-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1 \ + --table-structure-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1 \ + --embed-invoke-url https://integrate.api.nvidia.com/v1/embeddings \ + --embed-model-name nvidia/llama-nemotron-embed-1b-v2 +``` + +For remote embedding, query with the same embedding endpoint and model. Do not +mix vectors created by one model with queries embedded by another model. + +## Python SDK Notes + +When the user wants SDK code, start from: + +```python +from nemo_retriever import create_ingestor + +ingestor = create_ingestor(run_mode="batch") +dataset = ingestor.files(["./data/file.pdf"]).extract().embed().ingest() +``` + +Use the CLI for the shortest path to LanceDB. Some docs discuss graph ingestion +and storage separately; the root CLI adapter has the tested one-shot +`extract -> embed -> vdb_upload -> ingest` path. + +## Validation + +After ingest, validate with the concrete LanceDB table and one smoke query. A +successful process exit alone is not enough. 
+ +```python +import lancedb + +db = lancedb.connect("./lancedb") +print(db.table_names()) +table = db.open_table("nv-ingest") +print(table.count_rows()) +``` + +Then use a query that should match the corpus: + +```bash +retriever query "smoke test term from the corpus" --lancedb-uri ./lancedb --table-name nv-ingest --top-k 3 +``` + +For text ingestion, also validate that extraction produced rows before tuning +embedding or LanceDB settings: + +Installed environment: + +```bash +python -c "from nemo_retriever.txt.split import txt_file_to_chunks_df; print(txt_file_to_chunks_df('file.txt').shape)" +``` + +Source checkout: + +```bash +uv run --project nemo_retriever --with transformers python -c \ + "from nemo_retriever.txt.split import txt_file_to_chunks_df; print(txt_file_to_chunks_df('file.txt').shape)" +``` + +If validation fails, read `PITFALLS.md` before changing models, table names, or +paths. diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/PITFALLS.md new file mode 100644 index 0000000000..987c1e5704 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/PITFALLS.md @@ -0,0 +1,63 @@ +# Query Pitfalls + +## Missing Or Wrong Table + +`Table ... was not found`, `[]`, or obviously irrelevant hits often mean the +query URI/table does not match ingest. Check both: + +```bash +retriever query "known corpus term" --lancedb-uri ./lancedb --table-name nv-ingest --top-k 3 +``` + +Root CLI default table is `nv-ingest`; some older graph-pipeline examples use +`nemo-retriever`. Use the table that was actually written. + +Validate table existence directly when query says `Table ... 
was not found`: + +Installed environment: + +```bash +python -c "import lancedb; db=lancedb.connect('./lancedb'); print(db.table_names())" +``` + +Source checkout: + +```bash +uv run --project nemo_retriever python -c "import lancedb; db=lancedb.connect('./lancedb'); print(db.table_names())" +``` + +## Metadata Shape + +Current normalized hits expose `metadata` as a dict. Older docs or examples may +show a JSON string. Do not blindly pipe through `fromjson`; first inspect one +hit. + +## Embedding Mismatch + +If ingest used remote embedding or a non-default model, query must use the same +embedding endpoint and model. Mixed embedding spaces can look like a retrieval +failure even when the table has rows. + +If ingest used the local HuggingFace backend, pass +`--local-query-embed-backend hf` plus the same cache/device settings used for +local validation. Otherwise the CLI may try the default local vLLM path and fail +before retrieval. + +## Rerank Is Opt-In + +Do not assume rerank is enabled. Use `--rerank` or a reranker endpoint/model +option when the user asks for reranking or when precision matters enough to pay +the extra cost. + +## Chart And Image Evidence + +Chart/image text can be model-generated and may misread exact numbers or +directions. For exact numeric claims, prefer corroborating text/table hits. If +only a chart/image transcription supports the answer, label it as chart-derived +or image-derived rather than making it sound verified by prose. + +## Insufficient Evidence + +Do not answer from general knowledge when retrieved evidence is missing. State +that the retrieved pages do not contain the requested fact and name the closest +related evidence if useful. 
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/SKILL.md new file mode 100644 index 0000000000..2998e871e2 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/SKILL.md @@ -0,0 +1,81 @@ +--- +name: nemo-retriever-query +description: Use when the user asks to search a NeMo Retriever index, run `retriever query`, retrieve evidence from LanceDB, inspect query hit schemas, answer questions from retrieved documents, or debug missing/empty query results. Do not use for creating indexes; use `nemo-retriever-ingest` instead. +--- + +# nemo-retriever-query + +Use this skill to retrieve evidence from an existing NeMo Retriever index and +answer only from that evidence. + +## Orientation + +1. Verify the active public surface first: `retriever query --help`. +2. Confirm the LanceDB URI, table name, and embedding settings from the ingest + task or project config. Do not guess if the user supplied different values. +3. If the installed CLI is absent but this is a source checkout, use + `uv run --project nemo_retriever retriever query --help`. Retry dependency + downloads before choosing another query validation path. +4. If neither path works, use `nemo-retriever-setup` before debugging query + behavior. +5. Keep source citation indexing straight: `page_number` returned by Retriever + is 1-indexed unless an external task schema says otherwise. + +## References + +- `references/QUERY.md`: CLI and Python query patterns, result schema, rerank + behavior, and answer synthesis. +- `PITFALLS.md`: missing tables, empty hits, metadata shape mistakes, chart + uncertainty, and model mismatches. + +## Workflow + +1. Run a focused query against the known table: + + ```bash + retriever query "question text" --lancedb-uri ./lancedb --table-name nv-ingest --top-k 5 + ``` + +2. 
If ingest used remote embedding, include the same query embedding endpoint + and model: + + ```bash + retriever query "question text" \ + --lancedb-uri ./lancedb \ + --table-name nv-ingest \ + --embed-invoke-url https://integrate.api.nvidia.com/v1/embeddings \ + --embed-model-name nvidia/llama-nemotron-embed-1b-v2 + ``` + +3. If ingest used local HuggingFace embeddings, keep the query backend and cache + explicit so the CLI does not fall back to the vLLM local path: + + ```bash + retriever query "question text" \ + --lancedb-uri ./lancedb \ + --table-name nv-ingest \ + --embed-model-name nvidia/llama-nemotron-embed-1b-v2 \ + --local-query-embed-backend hf \ + --local-hf-cache-dir "$HOME/models/huggingface" \ + --local-hf-device cuda + ``` + +4. Inspect ranked hits before answering. Use `_distance` as a ranking signal, + not a calibrated score. +5. Synthesize from hit `text`, `source_id` / `path`, `pdf_basename`, and + `page_number`. Include document and page when available. +6. If the evidence does not answer the question, say what is missing instead of + inventing a plausible answer. + +## Success Checks + +- Query output is a JSON array of hits, possibly empty. +- Each answer claim is supported by one or more retrieved hit texts. +- The final answer states insufficient evidence when the retrieved text does + not contain the requested fact. + +## Evaluation Scenarios + +- "Use the Retriever index to answer: what was revenue in 2024?" Use this skill. +- "The query returns no hits." Use this skill and read `PITFALLS.md`. +- "Index the PDFs first." Use `nemo-retriever-ingest`, not this skill. 
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/references/QUERY.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/references/QUERY.md new file mode 100644 index 0000000000..ea37187162 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-query/references/QUERY.md @@ -0,0 +1,151 @@ +# Query Reference + +## Contents + +- [CLI Query](#cli-query) +- [Python Query](#python-query) +- [Result Schema](#result-schema) +- [Answer Synthesis](#answer-synthesis) + +## CLI Query + +The root CLI command is: + +```bash +retriever query "question" --top-k 5 --lancedb-uri ./lancedb --table-name nv-ingest +``` + +From the source checkout, prefix with the project environment when needed: + +```bash +uv run --project nemo_retriever retriever query "question" \ + --top-k 5 \ + --lancedb-uri ./lancedb \ + --table-name nv-ingest +``` + +Observed from source/tests: + +- Default `--top-k` is `10`. +- Default `--lancedb-uri` is `lancedb`. +- Default `--table-name` is `nv-ingest`. +- The CLI prints clean JSON on stdout on success. +- Reranking is off by default. +- `--rerank` enables local reranking. Any reranker URL/model/backend option also + implicitly enables reranking. +- `EMBED_INVOKE_URL` and `RERANKER_INVOKE_URL` environment variables are used + when the matching CLI flags are omitted. +- For local HuggingFace query embeddings, pass `--local-query-embed-backend hf` + plus `--local-hf-cache-dir` and `--local-hf-device` when needed. + +Useful summary: + +```bash +retriever query "question" --top-k 5 \ + | jq -r 'to_entries[] | "rank=\(.key + 1) page=\(.value.page_number) source=\(.value.source_id // .value.path // .value.source) type=\(.value.metadata.type // .value.content_type // "?") text=\(.value.text[:200])"' +``` + +Do not use `fromjson` on `.metadata` for current `Retriever.query()` / root CLI +hits. The normalized API boundary returns `metadata` as a native dict. 
+ +## Python Query + +For SDK use: + +```python +from nemo_retriever.retriever import Retriever + +retriever = Retriever( + top_k=5, + vdb_kwargs={"uri": "./lancedb", "table_name": "nv-ingest"}, +) +hits = retriever.query("question") +``` + +Remote query embedding: + +```python +retriever = Retriever( + top_k=5, + vdb_kwargs={"uri": "./lancedb", "table_name": "nv-ingest"}, + embed_kwargs={ + "embed_invoke_url": "https://integrate.api.nvidia.com/v1/embeddings", + "embedding_endpoint": "https://integrate.api.nvidia.com/v1/embeddings", + "model_name": "nvidia/llama-nemotron-embed-1b-v2", + "embed_model_name": "nvidia/llama-nemotron-embed-1b-v2", + }, +) +``` + +The equivalent CLI call must still carry the same LanceDB URI and table used at +ingest time: + +```bash +retriever query "question" \ + --lancedb-uri ./lancedb \ + --table-name nv-ingest \ + --embed-invoke-url https://integrate.api.nvidia.com/v1/embeddings \ + --embed-model-name nvidia/llama-nemotron-embed-1b-v2 +``` + +Local HuggingFace query embedding: + +```bash +retriever query "question" \ + --lancedb-uri ./lancedb \ + --table-name nv-ingest \ + --embed-model-name nvidia/llama-nemotron-embed-1b-v2 \ + --local-query-embed-backend hf \ + --local-hf-cache-dir "$HOME/models/huggingface" \ + --local-hf-device cuda +``` + +The same settings in Python: + +```python +from pathlib import Path + +from nemo_retriever.params import ModelRuntimeParams +from nemo_retriever.retriever import Retriever + +hf_cache_dir = str(Path.home() / "models/huggingface") +retriever = Retriever( + top_k=5, + vdb_kwargs={"uri": "./lancedb", "table_name": "nv-ingest"}, + embed_kwargs={ + "model_name": "nvidia/llama-nemotron-embed-1b-v2", + "embed_model_name": "nvidia/llama-nemotron-embed-1b-v2", + "local_ingest_embed_backend": "hf", + "runtime": ModelRuntimeParams( + hf_cache_dir=hf_cache_dir, + device="cuda", + ), + }, +) +``` + +Use `run_mode="service"` only when you specifically need the CPU HTTP embedding +path to require an 
endpoint. It is not the same thing as the FastAPI ingest +service. + +## Result Schema + +Normalized hits may include: + +- `text`: retrieved content. +- `metadata`: native dict with content metadata such as `type`, page fields, or + stored image metadata. +- `source`, `source_id`, `path`: origin document path/name when known. +- `pdf_basename`: stem of the source PDF path. +- `page_number`: integer page number, 1-indexed when present. +- `pdf_page`: composite key like `<pdf_basename>_<page_number>`. +- `_distance`: vector distance. Lower is better within the same query/model. +- `_score` or `_rerank_score`: present for some backends/rerank paths. + +## Answer Synthesis + +- Prefer direct text evidence over chart/image transcriptions for exact numbers. +- Cite document/page when present. +- Preserve 1-indexed pages unless the task explicitly requests 0-indexing. +- When multiple entities, years, or categories are asked for, address each one + explicitly, including "not found in retrieved evidence" where needed. diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/PITFALLS.md new file mode 100644 index 0000000000..7bc0c351ac --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/PITFALLS.md @@ -0,0 +1,64 @@ +# Service Pitfalls + +## Service Versus SDK Run Modes + +`retriever service start` runs the FastAPI ingestion service. +`Retriever(run_mode="service")` means HTTP query embedding in the Python +retriever. Do not conflate them. + +## Auth Token Mismatch + +When `auth.api_token` is configured, every non-bypassed request needs +`Authorization: Bearer <token>`. The CLI can read `NEMO_RETRIEVER_API_TOKEN`. +Health and docs paths are bypassed by default, so a successful health check does +not prove ingest requests are authenticated correctly. + +## Endpoint Overrides Are Server-Owned + +Do not let client payloads set NIM endpoint URLs or API keys. 
Use YAML, CLI +overrides, environment variables, or Helm values. Request-level overrides are +policy-gated and endpoint/api-key keys are denied. + +## Legacy Routes + +Use `GET /v1/ingest/job/{job_id}/events` for SSE. The old +`GET /v1/ingest/events` route should be treated as stale. + +## Service Ingest CLI Drift + +Some current builds expose `retriever service ingest` options but call the +client with stale keyword arguments, producing: + +```text +TypeError: RetrieverServiceClient.ingest_documents() got an unexpected keyword argument 'use_sse' +``` + +Do not stop there. Use the HTTP job API directly or the current Python client +signature: `ingest_documents(files=..., show_progress=True, pipeline_spec=...)`. + +## Default Service Extraction Is PDF + +The document upload route defaults to `extraction_mode='pdf'`. Uploading a TXT +file without a pipeline override fails with `Input file type(s) do not match +extraction_mode='pdf'`. For service smoke tests, use a PDF fixture or provide a +valid `pipeline.extraction_mode` and the dependencies needed by that mode. + +## Page-Elements 401 Can Be Non-Fatal + +A text-only PDF smoke upload can complete even if page-elements detection logs +an HTTP 401 inside `page_elements_v3.error`, as long as text extraction produced +rows and the job status is `completed`. Treat the embedded stage error as a +capability/config warning, not automatically as failed service ingestion. + +## Helm Replica Limit + +The Helm chart currently uses SQLite on a single ReadWriteOnce PVC, which caps +the service at one replica until a shared database backend is introduced. + +## ffmpeg For Audio And Video + +Audio/video extraction requires `ffmpeg` and `ffprobe`. The service image can +install them at startup with `service.installFfmpeg=true`, but that requires +network egress, writable root filesystem, and a security policy allowing the +scoped sudo path. Locked-down clusters should use a custom image with ffmpeg +already installed. 
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/SKILL.md new file mode 100644 index 0000000000..956dd65111 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/SKILL.md @@ -0,0 +1,72 @@ +--- +name: nemo-retriever-service +description: Use when the user asks to run, deploy, configure, operate, or call the NeMo Retriever service, including `retriever service start`, `retriever service ingest`, FastAPI `/v1` endpoints, service YAML, Helm chart deployment, auth tokens, NIM endpoint wiring, or service-mode troubleshooting. +--- + +# nemo-retriever-service + +Use this skill for NeMo Retriever service operation. Do not use it for a simple +local one-shot ingest unless the user specifically wants a long-running service. + +## Orientation + +1. Verify the installed service commands: `retriever service --help`, + `retriever service start --help`, and `retriever service ingest --help`. +2. Decide local service versus Kubernetes Helm: + - Local: `retriever service start` plus HTTP health checks. + - Kubernetes: `nemo_retriever/helm` chart and NIM endpoint/secret wiring. +3. If the installed CLI is absent but this is a source checkout, use + `uv run --project nemo_retriever retriever service ...`. Retry dependency + downloads before choosing another service validation path. +4. If neither path works, use `nemo-retriever-setup` before debugging service + behavior. + +## References + +- `references/SERVICE.md`: service commands, key endpoints, YAML settings, Helm + deployment choices, and auth behavior. +- `PITFALLS.md`: endpoint policy, missing NIMs, token mismatch, SQLite replica + limits, ffmpeg runtime install, and stale service routes. + +## Workflow + +1. Locate or create a service config. Discovery order is explicit `--config`, + `./retriever-service.yaml`, then the bundled package default. +2. 
Start locally when appropriate: + + ```bash + retriever service start --config ./retriever-service.yaml --host 0.0.0.0 --port 7670 + ``` + +3. Verify health before submitting work: + + ```bash + curl http://localhost:7670/v1/health + ``` + +4. Submit files through the CLI client: + + ```bash + retriever service ingest ./data/file.pdf --server-url http://localhost:7670 + ``` + +5. If service ingest CLI raises `TypeError: ... unexpected keyword argument + 'use_sse'`, use the HTTP job API directly: `POST /v1/ingest/job`, then + `POST /v1/ingest/job/{job_id}/document`, then poll + `GET /v1/ingest/job/{job_id}?include_documents=true`. +6. For Kubernetes, use Helm and decide whether NIMs are operator-managed or + external URLs supplied through `serviceConfig.nimEndpoints.*`. + +## Success Checks + +- `/v1/health` responds. +- The CLI client or HTTP job API can create a job, accept a document, and report + completion or useful job status. +- If auth is enabled, requests include the same bearer token configured by + `--api-token`, YAML `auth.api_token`, or `NEMO_RETRIEVER_API_TOKEN`. + +## Evaluation Scenarios + +- "Start a Retriever service for document ingestion." Use this skill. +- "Deploy Retriever with external NIM endpoints in Kubernetes." Use this skill. +- "Run a one-shot local ingest into LanceDB." Use `nemo-retriever-ingest`. 
diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/references/SERVICE.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/references/SERVICE.md new file mode 100644 index 0000000000..54b7d28c9d --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-service/references/SERVICE.md @@ -0,0 +1,128 @@ +# Service Reference + +## Contents + +- [Local Commands](#local-commands) +- [Service Config](#service-config) +- [HTTP Surface](#http-surface) +- [Helm Deployment](#helm-deployment) + +## Local Commands + +Start a local service: + +```bash +retriever service start --config ./retriever-service.yaml --port 7670 +``` + +From this source checkout: + +```bash +uv run --project nemo_retriever retriever service start --host 127.0.0.1 --port 7670 +``` + +Submit files to a running service: + +```bash +BASE_URL=http://localhost:7670 +retriever service ingest ./data/file.pdf --server-url "$BASE_URL" +``` + +If the service ingest CLI raises `TypeError: RetrieverServiceClient.ingest_documents() +got an unexpected keyword argument 'use_sse'`, drive the public HTTP API +directly. If auth is enabled, uncomment the `AUTH` line. Replace `<job_id>` with the id of the job created by the first request. + +```bash +BASE_URL=http://localhost:7670 +AUTH=() +# AUTH=(-H "Authorization: Bearer $NEMO_RETRIEVER_API_TOKEN") + +curl -sS -X POST "$BASE_URL/v1/ingest/job" \ + "${AUTH[@]}" \ + -H 'Content-Type: application/json' \ + -d '{"expected_documents":1,"label":"smoke"}' + +curl -sS -X POST "$BASE_URL/v1/ingest/job/<job_id>/document" \ + "${AUTH[@]}" \ + -F file=@./data/file.pdf \ + -F metadata='{"filename":"file.pdf"}' + +curl -sS "$BASE_URL/v1/ingest/job/<job_id>?include_documents=true" \ + "${AUTH[@]}" +``` + +The service CLI supports: + +- `--nim-api-key` for NIM endpoints, overriding YAML / `NVIDIA_API_KEY`. +- `--api-token` for service bearer-token auth, also read from + `NEMO_RETRIEVER_API_TOKEN`. +- `--gpu-devices` to override service resource config. 
+- `--server-url` and `--api-token` on client ingest. The current client path may + reject `--sse/--no-sse` or `--poll-interval`; use the HTTP job API above if + that happens. + +## Service Config + +The bundled default is `nemo_retriever.service/retriever-service.yaml`. +Discovery order: + +1. `retriever service start --config /path/to/retriever-service.yaml` +2. `./retriever-service.yaml` +3. bundled package default + +Important config sections: + +- `server.host` / `server.port` +- `nim_endpoints.*_invoke_url` and `nim_endpoints.api_key` +- `pipeline.realtime_workers` / `pipeline.batch_workers` +- `auth.api_token` +- `pipeline_overrides.mode` and sink allow lists + +Client-supplied endpoint URLs and API keys are trust-sensitive. The policy layer +denies those through request overrides; configure them server-side. + +For a cheap PDF text-only smoke upload, use allowed per-request extraction +overrides to disable expensive table/chart/image extraction: + +```bash +curl -sS -X POST "$BASE_URL/v1/ingest/job/<job_id>/document" \ + "${AUTH[@]}" \ + -F file=@./data/file.pdf \ + -F metadata='{"filename":"file.pdf","pipeline":{"extraction_mode":"pdf","extract_params":{"method":"pdfium","extract_tables":false,"extract_charts":false,"extract_images":false,"extract_page_as_image":false},"stage_order":[]}}' +``` + +Do not include `use_page_elements` in request overrides unless the service +operator widened the allow list; the default policy rejects that key. + +## HTTP Surface + +Common public endpoints: + +- `GET /v1/health` +- `POST /v1/ingest/job` +- `POST /v1/ingest/job/{job_id}/document` +- `GET /v1/ingest/job/{job_id}` +- `GET /v1/ingest/job/{job_id}/events` +- `GET /v1/ingest/pipeline-config` +- `GET /v1/ingest/metrics` +- `POST /v1/query` when the vectordb route is configured + +The legacy firehose `GET /v1/ingest/events` is removed. Use the per-job events +route. 
+ +## Helm Deployment + +The chart at `nemo_retriever/helm` deploys the service and optionally NIM +Operator resources. For external NIM endpoints: + +```bash +helm install retriever ./nemo_retriever/helm \ + --set nims.enabled=false \ + --set serviceConfig.nimEndpoints.pageElementsInvokeUrl=http://page-elements.svc:8000/v1/infer \ + --set serviceConfig.nimEndpoints.tableStructureInvokeUrl=http://table-structure.svc:8000/v1/infer \ + --set serviceConfig.nimEndpoints.ocrInvokeUrl=http://ocr.svc:8000/v1/infer \ + --set serviceConfig.nimEndpoints.embedInvokeUrl=http://embed.svc:8000/v1/embeddings +``` + +For NGC image pulls or build.nvidia.com endpoints, configure the relevant +`NGC_API_KEY` / `NVIDIA_API_KEY` secrets through the chart values. diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/PITFALLS.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/PITFALLS.md new file mode 100644 index 0000000000..0278ac0002 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/PITFALLS.md @@ -0,0 +1,41 @@ +# Setup Pitfalls + +## Python Version + +Use Python 3.12. Older or newer Python versions can fail dependency resolution +or import checks because the package metadata requires `>=3.12,<3.13`. + +## Missing Installed Surface + +`retriever`, `nemo-retriever`, and `nemo_retriever` are not interchangeable +command names. The public CLI command is `retriever`; the Python import package +is `nemo_retriever`; the distribution name is `nemo-retriever`. + +## Source Checkout Versus Installed Package + +`uv run --project nemo_retriever retriever ...` is a developer-checkout fallback. +For installed-package validation, install the package into an isolated +environment and run `retriever --help` without relying on the source tree. + +## Optional Extras + +The base install is enough for remote NIM workflows. Local GPU inference needs +the `local` extra. 
Audio/video and SVG workflows need `multimedia` plus system +dependencies. QA generation/judging needs `llm`. + +## System Dependencies + +`ffmpeg-python` and `nemo-retriever[multimedia]` do not install the `ffmpeg` and +`ffprobe` binaries. Install those through the operating system or use a service +image/cluster configuration that provides them. + +## Model Downloads + +Local inference may download large HuggingFace assets on first use. Route caches +to `~/models` for reproducible agent work and avoid writing model assets into +the repository. + +## Remote Credentials + +Hosted NIM endpoints need `NVIDIA_API_KEY`. Missing keys should be reported as a +setup gap, not as an ingest/query failure. diff --git a/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/SKILL.md b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/SKILL.md new file mode 100644 index 0000000000..d5933d9add --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/.agents/skills/nemo-retriever-setup/SKILL.md @@ -0,0 +1,88 @@ +--- +name: nemo-retriever-setup +description: Use when the user asks to install, verify, or orient to NeMo Retriever, when `retriever` is missing, when choosing extras or model/API prerequisites, or before another Retriever workflow can run. Do not use for a specific ingest, query, service, or evaluation task once the CLI works; use that task skill instead. +--- + +# nemo-retriever-setup + +Use this skill to get an agent into a working NeMo Retriever environment before +running task-specific workflows. + +## Orientation + +1. Verify the intended public entry points: + + ```bash + retriever --help + ``` + +2. If this is a source checkout, the developer fallback is: + + ```bash + uv run --project nemo_retriever retriever --help + ``` + +3. If neither the installed CLI nor the source fallback works, report setup as + the blocker before attempting ingest/query/service/evaluation. 
+ +## References + +- `PITFALLS.md`: Python version, missing package, optional extras, system + dependencies, API keys, and model-cache issues. + +## Workflow + +1. Confirm Python 3.12. NeMo Retriever requires Python `>=3.12,<3.13`. +2. Choose install shape: + - Remote NIM inference, no local GPU models: install the base package. + - Local GPU inference: install the `local` extra and verify CUDA/PyTorch. + - Audio/video or SVG inputs: add the `multimedia` extra and system `ffmpeg` + / `ffprobe` when needed. + - QA generation or judging: add the `llm` extra and configure model keys. +3. Create an isolated environment: + + ```bash + uv python install 3.12 + uv venv retriever --python 3.12 + source retriever/bin/activate + uv pip install nemo-retriever + ``` + + For local GPU inference, install the appropriate extra instead: + + ```bash + uv pip install "nemo-retriever[local]" + ``` + +4. Route first-time HuggingFace downloads outside the repo when preparing local + inference: + + ```bash + export HF_HOME="$HOME/models/huggingface" + export HF_HUB_CACHE="$HOME/models/huggingface/hub" + ``` + +5. For remote hosted NIMs, configure credentials before ingest/query: + + ```bash + export NVIDIA_API_KEY=nvapi-... + ``` + +6. Re-run the public-surface checks. Once `retriever --help` and the relevant + subcommand help work, switch to the task skill for ingest, query, service, or + evaluation. + +## Success Checks + +- `retriever --help` shows `ingest`, `query`, `service`, `recall`, `eval`, and + `pipeline` commands. +- `python -c "import nemo_retriever"` succeeds in the same environment. +- The chosen task command's `--help` output is visible before running expensive + model or data work. + +## Evaluation Scenarios + +- "Install NeMo Retriever and verify the CLI." Use this skill. +- "`retriever` is not found; what should I do?" Use this skill. +- "Index the PDFs in `data/reports`." Use `nemo-retriever-ingest` once the + environment is working. 
diff --git a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py index d55f553fd5..ef9bfefd53 100644 --- a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py +++ b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py @@ -329,6 +329,21 @@ def query_command( "--embed-model-name", help="Optional embedding model name override.", ), + local_query_embed_backend: LocalIngestEmbedBackendValue | None = typer.Option( + None, + "--local-query-embed-backend", + help="Local query-time text embedder when --embed-invoke-url is unset.", + ), + local_hf_cache_dir: str | None = typer.Option( + None, + "--local-hf-cache-dir", + help="HuggingFace cache directory for local query embedding.", + ), + local_hf_device: str | None = typer.Option( + None, + "--local-hf-device", + help="Torch device for local HuggingFace query embedding, such as 'cuda' or 'cpu'.", + ), reranker_invoke_url: str | None = typer.Option(None, "--reranker-invoke-url", help="Reranker NIM endpoint URL."), reranker_model_name: str | None = typer.Option( None, @@ -392,6 +407,9 @@ def query_command( table_name=table_name, embed_invoke_url=embed_invoke_url, embed_model_name=embed_model_name, + local_query_embed_backend=local_query_embed_backend, + local_hf_cache_dir=local_hf_cache_dir, + local_hf_device=local_hf_device, reranker_invoke_url=reranker_invoke_url, reranker_model_name=reranker_model_name, reranker_backend=reranker_backend, @@ -414,6 +432,9 @@ def query_command( table_name=table_name, embed_invoke_url=embed_invoke_url, embed_model_name=embed_model_name, + local_query_embed_backend=local_query_embed_backend, + local_hf_cache_dir=local_hf_cache_dir, + local_hf_device=local_hf_device, reranker_invoke_url=reranker_invoke_url, reranker_model_name=reranker_model_name, reranker_backend=reranker_backend, diff --git a/nemo_retriever/src/nemo_retriever/adapters/cli/sdk_workflow.py b/nemo_retriever/src/nemo_retriever/adapters/cli/sdk_workflow.py index 
3bb3c78d1b..18fd816c44 100644 --- a/nemo_retriever/src/nemo_retriever/adapters/cli/sdk_workflow.py +++ b/nemo_retriever/src/nemo_retriever/adapters/cli/sdk_workflow.py @@ -16,6 +16,7 @@ EmbedParams, ExtractParams, HtmlChunkParams, + ModelRuntimeParams, TextChunkParams, VdbUploadParams, VideoFrameParams, @@ -196,6 +197,8 @@ def _build_embed_kwargs( embed_invoke_url: str | None, embed_model_name: str | None, local_ingest_embed_backend: LocalIngestEmbedBackendValue | None = None, + local_hf_cache_dir: str | None = None, + local_hf_device: str | None = None, embed_workers: int | None = None, embed_batch_size: int | None = None, embed_cpus_per_actor: float | None = None, @@ -210,6 +213,11 @@ def _build_embed_kwargs( embed_kwargs["embed_model_name"] = embed_model_name if local_ingest_embed_backend is not None: embed_kwargs["local_ingest_embed_backend"] = local_ingest_embed_backend + if local_hf_cache_dir is not None or local_hf_device is not None: + embed_kwargs["runtime"] = ModelRuntimeParams( + device=local_hf_device, + hf_cache_dir=local_hf_cache_dir, + ) embed_tuning = _build_embed_batch_tuning( embed_workers=embed_workers, embed_batch_size=embed_batch_size, @@ -452,6 +460,9 @@ def query_documents( table_name: str = "nv-ingest", embed_invoke_url: str | None = None, embed_model_name: str | None = None, + local_query_embed_backend: LocalIngestEmbedBackendValue | None = None, + local_hf_cache_dir: str | None = None, + local_hf_device: str | None = None, reranker_invoke_url: str | None = None, reranker_model_name: str | None = None, reranker_backend: str | None = None, @@ -462,7 +473,13 @@ def query_documents( Reranking is opt-in: pass ``rerank=True`` (or any of the rerank-related args via the CLI, which implicitly set ``rerank=True``) to enable. 
""" - embed_kwargs = _build_embed_kwargs(embed_invoke_url, embed_model_name) + embed_kwargs = _build_embed_kwargs( + embed_invoke_url, + embed_model_name, + local_ingest_embed_backend=local_query_embed_backend, + local_hf_cache_dir=local_hf_cache_dir, + local_hf_device=local_hf_device, + ) retriever_kwargs: dict[str, Any] = { "top_k": top_k, "vdb_kwargs": {"uri": lancedb_uri, "table_name": table_name}, diff --git a/nemo_retriever/src/nemo_retriever/evaluation/cli.py b/nemo_retriever/src/nemo_retriever/evaluation/cli.py index f847d6fc1a..8d6b4da6bf 100644 --- a/nemo_retriever/src/nemo_retriever/evaluation/cli.py +++ b/nemo_retriever/src/nemo_retriever/evaluation/cli.py @@ -16,7 +16,7 @@ import os import time from pathlib import Path -from typing import Optional +from typing import Literal, Optional import typer @@ -368,6 +368,21 @@ def export_cmd( "--embedder", help="Embedding model name.", ), + local_query_embed_backend: Literal["vllm", "hf"] | None = typer.Option( + None, + "--local-query-embed-backend", + help="Local query-time text embedder when no remote embedding endpoint is used.", + ), + local_hf_cache_dir: str | None = typer.Option( + None, + "--local-hf-cache-dir", + help="HuggingFace cache directory for local query embedding.", + ), + local_hf_device: str | None = typer.Option( + None, + "--local-hf-device", + help="Torch device for local HuggingFace query embedding, such as 'cuda' or 'cpu'.", + ), page_index: Path = typer.Option( None, "--page-index", @@ -404,6 +419,9 @@ def export_cmd( output_path=str(output), top_k=top_k, embedder=embedder, + local_query_embed_backend=local_query_embed_backend, + local_hf_cache_dir=local_hf_cache_dir, + local_hf_device=local_hf_device, page_index=page_idx, ) elapsed = time.monotonic() - t0 diff --git a/nemo_retriever/src/nemo_retriever/export.py b/nemo_retriever/src/nemo_retriever/export.py index 7c99045b9b..74bbc663dc 100644 --- a/nemo_retriever/src/nemo_retriever/export.py +++ 
b/nemo_retriever/src/nemo_retriever/export.py @@ -106,6 +106,9 @@ def query_lancedb( *, top_k: int = 5, embedder: str = "nvidia/llama-nemotron-embed-1b-v2", + local_query_embed_backend: str | None = None, + local_hf_cache_dir: str | None = None, + local_hf_device: str | None = None, page_index: dict[str, dict[str, str]] | None = None, batch_size: int = 50, ) -> tuple[dict[str, dict], dict[str, Any]]: @@ -123,6 +126,12 @@ def query_lancedb( Number of chunks to retrieve per query. embedder : str Embedding model name for the Retriever. + local_query_embed_backend : str, optional + Local backend for query embeddings, e.g. ``"hf"``. + local_hf_cache_dir : str, optional + HuggingFace cache directory for local query embeddings. + local_hf_device : str, optional + Torch device for local HuggingFace query embeddings. page_index : dict, optional ``{source_id: {page_str: markdown}}``. When provided, chunk hits are expanded to full-page markdown. @@ -136,14 +145,24 @@ def query_lancedb( ``{"chunks": [...], "metadata": [...]}`` and *metadata* is the envelope metadata dict. 
""" + from nemo_retriever.params import ModelRuntimeParams from nemo_retriever.retriever import Retriever + embed_kwargs: dict[str, Any] = {"model_name": embedder, "embed_model_name": embedder} + if local_query_embed_backend is not None: + embed_kwargs["local_ingest_embed_backend"] = local_query_embed_backend + if local_hf_cache_dir is not None or local_hf_device is not None: + embed_kwargs["runtime"] = ModelRuntimeParams( + device=local_hf_device, + hf_cache_dir=local_hf_cache_dir, + ) + retriever = Retriever( vdb_kwargs={ "vdb_op": "lancedb", "vdb_kwargs": {"uri": lancedb_uri, "table_name": lancedb_table}, }, - embed_kwargs={"model_name": embedder, "embed_model_name": embedder}, + embed_kwargs=embed_kwargs, top_k=top_k, rerank=False, ) @@ -184,6 +203,12 @@ def query_lancedb( "chunk_mode": chunk_mode, "query_count": len(all_results), } + if local_query_embed_backend is not None: + meta["local_query_embed_backend"] = local_query_embed_backend + if local_hf_cache_dir is not None: + meta["local_hf_cache_dir"] = local_hf_cache_dir + if local_hf_device is not None: + meta["local_hf_device"] = local_hf_device if use_fullpage: meta["page_index_misses"] = total_page_misses @@ -233,6 +258,9 @@ def export_retrieval_json( *, top_k: int = 5, embedder: str = "nvidia/llama-nemotron-embed-1b-v2", + local_query_embed_backend: str | None = None, + local_hf_cache_dir: str | None = None, + local_hf_device: str | None = None, page_index: dict[str, dict[str, str]] | None = None, batch_size: int = 50, ) -> dict: @@ -254,6 +282,12 @@ def export_retrieval_json( Number of chunks to retrieve per query. embedder : str Embedding model name for the Retriever. + local_query_embed_backend : str, optional + Local backend for query embeddings, e.g. ``"hf"``. + local_hf_cache_dir : str, optional + HuggingFace cache directory for local query embeddings. + local_hf_device : str, optional + Torch device for local HuggingFace query embeddings. 
page_index : dict, optional ``{source_id: {page_str: markdown}}``. When provided, chunk hits are expanded to full-page markdown. @@ -271,6 +305,9 @@ def export_retrieval_json( queries=queries, top_k=top_k, embedder=embedder, + local_query_embed_backend=local_query_embed_backend, + local_hf_cache_dir=local_hf_cache_dir, + local_hf_device=local_hf_device, page_index=page_index, batch_size=batch_size, ) diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py index 6c42736327..67df046c52 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py +++ b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py @@ -29,7 +29,7 @@ DEFAULT_ORDER = ("c1_base", "c2_retriever", "c3_retriever_skill") -app = typer.Typer(help="Benchmark Claude with vs. without the /nemo-retriever skill on a folder of PDFs.") +app = typer.Typer(help="Benchmark Claude with vs. without NeMo Retriever skills on a folder of PDFs.") logger = logging.getLogger(__name__) @@ -162,9 +162,7 @@ def run_command( domain_order = sorted(by_domain.keys()) typer.echo(f"Domains in this run: {domain_order} ({sum(len(v) for v in by_domain.values())} entries total)") - skill_source = Path( - str(cfg.get("skill_source_dir") or REPO_ROOT / ".claude" / "skills" / "nemo-retriever") - ).expanduser() + skill_source = Path(str(cfg.get("skill_source_dir") or REPO_ROOT / ".claude" / "skills")).expanduser() workdir_root = Path(str(cfg.get("per_trial_workdir_root", "/tmp/skill_eval"))).expanduser() workdir_root.mkdir(parents=True, exist_ok=True) model = str(cfg.get("agent_model", "claude-opus-4-7")) diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml index f0636be9af..45dbd54f70 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml +++ b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml @@ -69,14 +69,14 @@ 
per_trial_workdir_root: /tmp/skill_eval conditions: - c1_base # retriever blocked → forces Read-only baseline - c2_retriever # retriever available, NL prompt, skill auto-discovery - c3_retriever_skill # retriever available, explicit /nemo-retriever slash + - c3_retriever_skill # retriever available, explicit split-skill slash commands # --------------------------------------------------------------------------- # Skill source override (rarely needed) # --------------------------------------------------------------------------- -# Defaults to /.claude/skills/nemo-retriever when unset. Set this only -# if you want to A/B-test an alternate skill tree. -# skill_source_dir: /path/to/.claude/skills/nemo-retriever +# Defaults to <repo_root>/.claude/skills when unset. Point this at either a directory +# containing multiple skill directories or one standalone skill directory. +# skill_source_dir: /path/to/.claude/skills # --------------------------------------------------------------------------- # LLM-as-judge diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py index e3c2e89963..4dd38e72e0 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py +++ b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py @@ -54,15 +54,15 @@ def _select_prompt(candidates: list[dict[str, Any]], selected_variant: int | Non def _normalize_slash_command(prompt: str) -> str: - """Rewrite SDG-generated slash commands to this project's actual skill name. + """Rewrite SDG-generated slash commands to this project's actual skill names. The agent_scenario_manifest contains slash-command scenarios using made-up - aliases (``/vidore-ingest``, ``/vidore``, ``/vidore_hr``); the real skill - that ships with this repo is ``nemo-retriever``. Rewriting at load time - avoids editing the upstream manifest (frozen baseline) while making the - slash_ingest / slash_retrieval scenarios actually exercisable. 
The token - boundary after the alias is preserved so the trailing args/query carry - over verbatim. + aliases (``/vidore-ingest``, ``/vidore``, ``/vidore_hr``). The skills that + ship with this repo are task-specific, so ingest aliases route to + ``nemo-retriever-ingest`` and retrieval aliases route to + ``nemo-retriever-query``. Rewriting at load time avoids editing the upstream + manifest while making slash scenarios exercisable. The token boundary after + the alias is preserved so trailing args/query carry over verbatim. """ s = prompt.lstrip() if not s.startswith("/"): @@ -70,9 +70,9 @@ def _normalize_slash_command(prompt: str) -> str: # Order matters: rewrite ``/vidore-ingest`` (which carries the ingest # subcommand intent) before the bare ``/vidore`` prefix. rewrites = [ - ("/vidore-ingest ", "/nemo-retriever ingest "), - ("/vidore_hr ", "/nemo-retriever "), - ("/vidore ", "/nemo-retriever "), + ("/vidore-ingest ", "/nemo-retriever-ingest "), + ("/vidore_hr ", "/nemo-retriever-query "), + ("/vidore ", "/nemo-retriever-query "), ] for old, new in rewrites: if s.startswith(old): diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/prompts/setup_slash.j2 b/nemo_retriever/src/nemo_retriever/skill_eval/prompts/setup_slash.j2 index ec0aa27cea..b4d2d7e208 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/prompts/setup_slash.j2 +++ b/nemo_retriever/src/nemo_retriever/skill_eval/prompts/setup_slash.j2 @@ -1 +1 @@ -/nemo-retriever ingest ./pdfs/ +/nemo-retriever-ingest ./pdfs/ diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/prompts/trial_user_slash.j2 b/nemo_retriever/src/nemo_retriever/skill_eval/prompts/trial_user_slash.j2 index 7707bab45d..023f579074 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/prompts/trial_user_slash.j2 +++ b/nemo_retriever/src/nemo_retriever/skill_eval/prompts/trial_user_slash.j2 @@ -1,4 +1,4 @@ -/nemo-retriever query "{{ original_query }}" +/nemo-retriever-query "{{ original_query }}" After running this query, 
use the retrieved hits to write your final result to ./output.json with EXACTLY this schema: diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py index 8c9a74bcf1..8d49a00f02 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py +++ b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py @@ -115,14 +115,25 @@ def _write_shim(shim_dir: Path, name: str) -> None: shim.chmod(shim.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH) -def _copy_skill(skill_source: Path, dest: Path) -> None: - dest.mkdir(parents=True, exist_ok=True) +def _copy_skill_dir(skill_source: Path, dest: Path) -> None: if (dest / "SKILL.md").exists(): return - shutil.copy2(skill_source / "SKILL.md", dest / "SKILL.md") - ref_src = skill_source / "references" - if ref_src.is_dir(): - shutil.copytree(ref_src, dest / "references", dirs_exist_ok=True) + shutil.copytree(skill_source, dest, dirs_exist_ok=True) + + +def _copy_skills(skill_source: Path, dest: Path) -> None: + """Copy either one skill directory or a directory containing many skills.""" + dest.mkdir(parents=True, exist_ok=True) + if (skill_source / "SKILL.md").is_file(): + _copy_skill_dir(skill_source, dest / skill_source.name) + return + + skill_dirs = [path for path in sorted(skill_source.iterdir()) if (path / "SKILL.md").is_file()] + if not skill_dirs: + raise FileNotFoundError(f"No skill directories found under {skill_source}") + + for skill_dir in skill_dirs: + _copy_skill_dir(skill_dir, dest / skill_dir.name) # Bash patterns that route the agent into the nemo_retriever library, regardless @@ -168,7 +179,7 @@ def _build_condition_workdir( Workdir contents: - pdfs/ symlink farm into the source PDF folder - - .claude/ sandbox (settings + per-condition skill copy) + - .claude/ sandbox (settings + per-condition skill copies) - .bin/retriever shim (c1 only) so retriever is unavailable on PATH The agent itself creates any retrieval artifacts (e.g., 
./lancedb/) inside the @@ -182,10 +193,10 @@ def _build_condition_workdir( # c1 gets explicit Bash deny rules; c2/c3 keep the empty settings.json. settings_text = _c1_settings_json() if condition == "c1_base" else "{}\n" (workdir / ".claude" / "settings.json").write_text(settings_text, encoding="utf-8") - # c2 and c3 both have retriever installed AND the nemo-retriever skill loaded. + # c2 and c3 both have retriever installed AND the NeMo Retriever skills loaded. # The c2/c3 distinction is purely the prompt style (NL vs explicit slash command). if condition in ("c2_retriever", "c3_retriever_skill"): - _copy_skill(skill_source, workdir / ".claude" / "skills" / "nemo-retriever") + _copy_skills(skill_source, workdir / ".claude" / "skills") if condition == "c1_base": _write_shim(workdir / ".bin", "retriever") # Empty HuggingFace cache redirect; env vars are wired up in _env_for. diff --git a/nemo_retriever/tests/skill_eval/test_split_skills.py b/nemo_retriever/tests/skill_eval/test_split_skills.py new file mode 100644 index 0000000000..e8e6042e57 --- /dev/null +++ b/nemo_retriever/tests/skill_eval/test_split_skills.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from pathlib import Path + +from nemo_retriever.skill_eval.dataset import DatasetEntry, _normalize_slash_command +from nemo_retriever.skill_eval.runner import _copy_skills, _render_prompt, _render_setup_prompt + + +def _write_skill(root: Path, name: str) -> None: + skill_dir = root / name + (skill_dir / "references").mkdir(parents=True) + (skill_dir / "SKILL.md").write_text(f"---\nname: {name}\n---\n\n# {name}\n", encoding="utf-8") + (skill_dir / "PITFALLS.md").write_text("# Pitfalls\n", encoding="utf-8") + (skill_dir / "references" / "REFERENCE.md").write_text("# Reference\n", encoding="utf-8") + + +def test_copy_skills_copies_split_skill_tree(tmp_path: Path) -> None: + source = tmp_path / "skills" + _write_skill(source, "nemo-retriever-ingest") + _write_skill(source, "nemo-retriever-query") + + dest = tmp_path / "workdir" / ".claude" / "skills" + + _copy_skills(source, dest) + + assert (dest / "nemo-retriever-ingest" / "SKILL.md").is_file() + assert (dest / "nemo-retriever-ingest" / "PITFALLS.md").is_file() + assert (dest / "nemo-retriever-ingest" / "references" / "REFERENCE.md").is_file() + assert (dest / "nemo-retriever-query" / "SKILL.md").is_file() + + +def test_copy_skills_accepts_single_skill_directory(tmp_path: Path) -> None: + _write_skill(tmp_path, "nemo-retriever-query") + + dest = tmp_path / "dest" + + _copy_skills(tmp_path / "nemo-retriever-query", dest) + + assert (dest / "nemo-retriever-query" / "PITFALLS.md").is_file() + + +def test_copy_skills_follows_compatibility_symlinks(tmp_path: Path) -> None: + package_skills = tmp_path / "package" / "skills" + _write_skill(package_skills, "nemo-retriever-ingest") + + claude_skills = tmp_path / ".claude" / "skills" + claude_skills.mkdir(parents=True) + (claude_skills / "nemo-retriever-ingest").symlink_to( + package_skills / "nemo-retriever-ingest", + target_is_directory=True, + ) + + dest = tmp_path / "dest" + + 
_copy_skills(claude_skills, dest) + + assert (dest / "nemo-retriever-ingest" / "SKILL.md").is_file() + assert not (dest / "nemo-retriever-ingest").is_symlink() + + +def test_copy_skills_accepts_mixed_symlink_and_local_skill_dirs(tmp_path: Path) -> None: + package_skills = tmp_path / "package" / "skills" + _write_skill(package_skills, "nemo-retriever-query") + + root_skills = tmp_path / ".agents" / "skills" + root_skills.mkdir(parents=True) + (root_skills / "nemo-retriever-query").symlink_to( + package_skills / "nemo-retriever-query", + target_is_directory=True, + ) + _write_skill(root_skills, "contributor-workflow") + (root_skills / "notes").mkdir() + + dest = tmp_path / "dest" + + _copy_skills(root_skills, dest) + + assert (dest / "nemo-retriever-query" / "SKILL.md").is_file() + assert (dest / "contributor-workflow" / "SKILL.md").is_file() + assert not (dest / "notes").exists() + + +def test_slash_prompts_use_task_specific_skill_names() -> None: + entry = DatasetEntry( + entry_id=1, + query_id="q1", + taxonomy_slot_id="retrieval", + original_query="What was revenue in 2024?", + paraphrased_prompt="Answer the revenue question.", + ground_truth_pages=[], + ) + + assert _render_setup_prompt("c3_retriever_skill").strip() == "/nemo-retriever-ingest ./pdfs/" + assert '/nemo-retriever-query "What was revenue in 2024?"' in _render_prompt(entry, "c3_retriever_skill") + + +def test_manifest_slash_aliases_rewrite_to_split_skills() -> None: + assert _normalize_slash_command("/vidore-ingest ./pdfs/") == "/nemo-retriever-ingest ./pdfs/" + assert _normalize_slash_command("/vidore What was revenue?") == "/nemo-retriever-query What was revenue?" 
+ assert _normalize_slash_command("/vidore_hr Find relevant pages") == "/nemo-retriever-query Find relevant pages" diff --git a/nemo_retriever/tests/test_evaluation_retrievers.py b/nemo_retriever/tests/test_evaluation_retrievers.py index 5e45ebf065..38b9cf63fc 100644 --- a/nemo_retriever/tests/test_evaluation_retrievers.py +++ b/nemo_retriever/tests/test_evaluation_retrievers.py @@ -13,14 +13,19 @@ from __future__ import annotations +import importlib import json from pathlib import Path from unittest.mock import patch import pytest +from typer.testing import CliRunner from nemo_retriever.evaluation.retrievers import FileRetriever from nemo_retriever.llm.types import RetrievalResult +from nemo_retriever.params import ModelRuntimeParams + +RUNNER = CliRunner() _SAMPLE_QUERIES: dict[str, dict] = { "What is the range of the 767?": { @@ -216,6 +221,82 @@ def queries(self, queries): assert meta["collection_name"] == "nv-ingest" +def test_query_lancedb_passes_local_hf_embed_options(monkeypatch) -> None: + from nemo_retriever.export import query_lancedb + + captured_kwargs: dict[str, object] = {} + + class _FakeRetriever: + def __init__(self, **kwargs): + captured_kwargs.update(kwargs) + + def queries(self, queries): + return [[{"text": "range chunk", "source": "spec.pdf", "page_number": 3, "_distance": 0.1}]] + + retriever_module = importlib.import_module("nemo_retriever.retriever") + monkeypatch.setattr(retriever_module, "Retriever", _FakeRetriever) + + _all_results, meta = query_lancedb( + lancedb_uri="/tmp/lancedb", + lancedb_table="nv-ingest", + queries=[{"query": "What is the range of the 767?"}], + embedder="embedder", + local_query_embed_backend="hf", + local_hf_cache_dir="/models/huggingface", + local_hf_device="cuda", + ) + + embed_kwargs = captured_kwargs["embed_kwargs"] + assert embed_kwargs["local_ingest_embed_backend"] == "hf" + runtime = embed_kwargs["runtime"] + assert isinstance(runtime, ModelRuntimeParams) + assert runtime.hf_cache_dir == 
"/models/huggingface" + assert runtime.device == "cuda" + assert meta["local_query_embed_backend"] == "hf" + assert meta["local_hf_cache_dir"] == "/models/huggingface" + assert meta["local_hf_device"] == "cuda" + + +def test_eval_export_cli_passes_local_hf_embed_options(monkeypatch, tmp_path: Path) -> None: + cli = importlib.import_module("nemo_retriever.evaluation.cli") + query_csv = tmp_path / "queries.csv" + output = tmp_path / "retrieval.json" + query_csv.write_text("query\nWhat is the range of the 767?\n", encoding="utf-8") + captured_kwargs: dict[str, object] = {} + + def fake_export_retrieval_json(**kwargs): + captured_kwargs.update(kwargs) + return {"queries": {"What is the range of the 767?": {"chunks": [], "metadata": []}}} + + monkeypatch.setattr("nemo_retriever.export.export_retrieval_json", fake_export_retrieval_json) + + result = RUNNER.invoke( + cli.app, + [ + "export", + "--lancedb-uri", + "/tmp/lancedb", + "--lancedb-table", + "nv-ingest", + "--query-csv", + str(query_csv), + "--output", + str(output), + "--local-query-embed-backend", + "hf", + "--local-hf-cache-dir", + "/models/huggingface", + "--local-hf-device", + "cuda", + ], + ) + + assert result.exit_code == 0 + assert captured_kwargs["local_query_embed_backend"] == "hf" + assert captured_kwargs["local_hf_cache_dir"] == "/models/huggingface" + assert captured_kwargs["local_hf_device"] == "cuda" + + def test_from_lancedb_no_save_path_keeps_memory_label() -> None: """Without ``save_path`` the instance reports the in-memory origin.""" fake_meta = {"lancedb_uri": "mock"} diff --git a/nemo_retriever/tests/test_root_cli_workflow.py b/nemo_retriever/tests/test_root_cli_workflow.py index cccc6ce6b0..3f00d75a85 100644 --- a/nemo_retriever/tests/test_root_cli_workflow.py +++ b/nemo_retriever/tests/test_root_cli_workflow.py @@ -15,7 +15,14 @@ import nemo_retriever.adapters.cli.sdk_workflow as sdk_workflow from nemo_retriever.graph_ingestor import GraphIngestor -from nemo_retriever.params import 
AudioChunkParams, EmbedParams, ExtractParams, TextChunkParams, VideoFrameParams +from nemo_retriever.params import ( + AudioChunkParams, + EmbedParams, + ExtractParams, + ModelRuntimeParams, + TextChunkParams, + VideoFrameParams, +) RUNNER = CliRunner() @@ -628,6 +635,45 @@ def query(self, query: str) -> list[dict[str, Any]]: assert json.loads(result.output) == [] +def test_root_query_passes_local_hf_embed_options(monkeypatch) -> None: + retriever_calls: list[dict[str, Any]] = [] + + class FakeRetriever: + def __init__(self, **kwargs: Any) -> None: + retriever_calls.append(kwargs) + + def query(self, query: str) -> list[dict[str, Any]]: + return [] + + monkeypatch.setattr(sdk_workflow, "Retriever", FakeRetriever) + + result = RUNNER.invoke( + cli_main.app, + [ + "query", + "Which passages mention deployment?", + "--embed-model-name", + "nvidia/llama-nemotron-embed-1b-v2", + "--local-query-embed-backend", + "hf", + "--local-hf-cache-dir", + "/models/huggingface", + "--local-hf-device", + "cuda", + ], + ) + + assert result.exit_code == 0 + embed_kwargs = retriever_calls[0]["embed_kwargs"] + assert embed_kwargs["model_name"] == "nvidia/llama-nemotron-embed-1b-v2" + assert embed_kwargs["embed_model_name"] == "nvidia/llama-nemotron-embed-1b-v2" + assert embed_kwargs["local_ingest_embed_backend"] == "hf" + runtime = embed_kwargs["runtime"] + assert isinstance(runtime, ModelRuntimeParams) + assert runtime.hf_cache_dir == "/models/huggingface" + assert runtime.device == "cuda" + + def test_root_query_passes_reranker_url(monkeypatch) -> None: retriever_calls: list[dict[str, Any]] = [] query_calls: list[str] = []