NVIDIA · jioffe502 · May 21, 2026
@@ -1,17 +1,16 @@
 # retriever ingest
 
-End-to-end ingestion of documents and media into a LanceDB table — runs the
-full extract → embed → vector-DB pipeline in a single command.
+End-to-end ingestion of supported documents and media into a LanceDB table — runs the full
+extract → embed → vector-DB pipeline in a single command.
 
 If flags below look stale, re-check `retriever ingest --help`.
 
 ## When to use this
 
 - You have one or more supported files (or a directory/glob of files) and want them
   searchable via `retriever query`.
-- You want the default pipeline: auto-select extraction for PDF/DOC/PPTX,
-  text, HTML, image, audio, or video inputs, then embed and insert into
-  LanceDB. No per-stage tuning needed.
+- You want the default pipeline: PDF split → extraction → page-element
+  detection → OCRv2 → embedding → LanceDB insert. No per-stage tuning needed.
 
 **Use a different command when:**
 
@@ -25,7 +24,7 @@ If flags below look stale, re-check `retriever ingest --help`.
 
 ## Canonical invocations
 
-Ingest a single file into the default table (`lancedb/nv-ingest.lance`):
+Ingest a single PDF into the default table (`lancedb/nv-ingest.lance`):
 
 ```bash
 retriever ingest data/multimodal_test.pdf
@@ -43,15 +42,6 @@ Ingest via glob:
 retriever ingest "data/**/*"
 ```
 
-Force a specific input family:
-
-```bash
-retriever ingest data/slides/ --input-type doc
-retriever ingest data/images/ --input-type image
-retriever ingest data/audio/ --input-type audio
-retriever ingest data/video/ --input-type video
-```
-
 Write to a custom DB / table:
 
 ```bash
@@ -62,11 +52,8 @@ retriever ingest data/multimodal_test.pdf \
 
 ## Inputs
 
-- **Positional `DOCUMENTS...`** — one or more file paths, directories, or
-  shell globs. Required, repeatable.
-- **Supported input types** — `pdf`, `doc` (`.docx`, `.pptx`), `txt`, `html`,
-  `image` (`.jpg`, `.jpeg`, `.png`, `.tiff`, `.tif`, `.bmp`, `.svg`),
-  `audio` (`.mp3`, `.wav`, `.m4a`), and `video` (`.mp4`, `.mov`, `.mkv`).
+- **Positional `DOCUMENTS...`** — one or more of: PDF file paths, directories
+  containing PDFs, or shell globs. Required, repeatable.
 
 ## Outputs
 
@@ -81,13 +68,12 @@ retriever ingest data/multimodal_test.pdf \
 | Flag | Default | Notes |
 |---|---|---|
 | `--lancedb-uri` | `lancedb` | Path or URI of the LanceDB database. |
-| `--table-name` | `nv-ingest` | LanceDB table to write into. Must match `retriever query`'s table on read. |
-| `--input-type` | `auto` | Input family to ingest. `auto` detects from file extensions and supports mixed directories. |
+| `--table-name` | `nv-ingest` | LanceDB table to write into. Must match `retriever query`'s table on read. |
 | `--run-mode` | `inprocess` | `inprocess` for local runs; `batch` for the SDK batch ingestor. |
 
 ## Pipeline shape
 
-For PDF/DOC/PPTX inputs, `ingest` runs the optimized document pipeline:
+The default `ingest` runs 8 stages, in order:
 
 1. `DocToPdfConversionActor` — non-PDF inputs → PDF (no-op for PDFs).
 2. `PDFSplitActor` — split into per-page tasks.
@@ -98,9 +84,6 @@ For PDF/DOC/PPTX inputs, `ingest` runs the optimized document pipeline:
 7. `_BatchEmbedActor` — embed primitives with `llama-nemotron-embed-1b-v2`.
 8. `IngestVdbOperator` — insert rows into LanceDB.
 
-For text, HTML, image, audio, video, or mixed `auto` inputs, `ingest` routes
-through the same GraphIngestor extraction paths used by `retriever pipeline`.
-
 ## Common failure modes
 
 - **`Clamping num_partitions from 16 to 7`** — informational, not an error.

@@ -17,12 +17,9 @@
 import typer
 
 from nemo_retriever.adapters.cli.sdk_workflow import (
-    IngestInputTypeValue,
     IngestRunModeValue,
-    LocalIngestEmbedBackendValue,
     OcrLangValue,
     OcrVersionValue,
-    TableOutputFormatValue,
     ingest_documents,
     query_documents,
 )
@@ -145,12 +142,7 @@ def main() -> None:
 def ingest_command(
     documents: list[str] = typer.Argument(
         ...,
-        help="One or more file paths, directories, or globs to ingest.",
-    ),
-    input_type: IngestInputTypeValue = typer.Option(
-        "auto",
-        "--input-type",
-        help="Input type: auto, pdf, doc, txt, html, image, audio, or video.",
+        help="One or more files, directories, or globs. Supported file types are detected automatically.",
     ),
     lancedb_uri: str = typer.Option("lancedb", "--lancedb-uri", help="LanceDB database URI."),
     table_name: str = typer.Option("nv-ingest", "--table-name", help="LanceDB table name."),
@@ -199,22 +191,12 @@ def ingest_command(
         "--table-structure-invoke-url",
         help="Table-structure NIM endpoint URL.",
     ),
-    table_output_format: TableOutputFormatValue | None = typer.Option(
-        None,
-        "--table-output-format",
-        help="Table text format. 'markdown' enables local table-structure extraction.",
-    ),
     embed_invoke_url: str | None = typer.Option(None, "--embed-invoke-url", help="Embedding NIM endpoint URL."),
     embed_model_name: str | None = typer.Option(
         None,
         "--embed-model-name",
         help="Optional embedding model name override.",
     ),
-    local_ingest_embed_backend: LocalIngestEmbedBackendValue | None = typer.Option(
-        None,
-        "--local-ingest-embed-backend",
-        help="Local ingest-time text embedder when --embed-invoke-url is unset.",
-    ),
     pdf_extract_workers: int | None = typer.Option(
         None,
         "--pdf-extract-workers",
@@ -251,12 +233,6 @@ def ingest_command(
         min=0.0,
         help="CPUs reserved per page-element detection actor in batch mode.",
     ),
-    page_elements_gpus_per_actor: float | None = typer.Option(
-        None,
-        "--page-elements-gpus-per-actor",
-        min=0.0,
-        help="GPUs reserved per local page-element detection actor in batch mode.",
-    ),
     ocr_workers: int | None = typer.Option(
         None,
         "--ocr-workers",
@@ -275,36 +251,6 @@ def ingest_command(
         min=0.0,
         help="CPUs reserved per OCR actor in batch mode.",
     ),
-    ocr_gpus_per_actor: float | None = typer.Option(
-        None,
-        "--ocr-gpus-per-actor",
-        min=0.0,
-        help="GPUs reserved per local OCR actor in batch mode.",
-    ),
-    table_structure_workers: int | None = typer.Option(
-        None,
-        "--table-structure-workers",
-        min=1,
-        help="Number of Ray actors for table-structure extraction in batch mode.",
-    ),
-    table_structure_batch_size: int | None = typer.Option(
-        None,
-        "--table-structure-batch-size",
-        min=1,
-        help="Table-structure extraction batch size per actor in batch mode.",
-    ),
-    table_structure_cpus_per_actor: float | None = typer.Option(
-        None,
-        "--table-structure-cpus-per-actor",
-        min=0.0,
-        help="CPUs reserved per table-structure actor in batch mode.",
-    ),
-    table_structure_gpus_per_actor: float | None = typer.Option(
-        None,
-        "--table-structure-gpus-per-actor",
-        min=0.0,
-        help="GPUs reserved per local table-structure actor in batch mode.",
-    ),
     embed_workers: int | None = typer.Option(
         None,
         "--embed-workers",
@@ -323,12 +269,6 @@ def ingest_command(
         min=0.0,
         help="CPUs reserved per embedding actor in batch mode.",
     ),
-    embed_gpus_per_actor: float | None = typer.Option(
-        None,
-        "--embed-gpus-per-actor",
-        min=0.0,
-        help="GPUs reserved per local embedding actor in batch mode.",
-    ),
     quiet: bool = typer.Option(
         False,
         "--quiet",
@@ -347,7 +287,6 @@ def ingest_command(
         with capture:
             summary = ingest_documents(
                 documents,
-                input_type=input_type,
                 run_mode=run_mode,
                 ray_address=ray_address,
                 ray_log_to_driver=ray_log_to_driver,
@@ -360,29 +299,20 @@ def ingest_command(
                 ocr_lang=ocr_lang,
                 graphic_elements_invoke_url=graphic_elements_invoke_url,
                 table_structure_invoke_url=table_structure_invoke_url,
-                table_output_format=table_output_format,
                 embed_invoke_url=embed_invoke_url,
                 embed_model_name=embed_model_name,
-                local_ingest_embed_backend=local_ingest_embed_backend,
                 pdf_extract_workers=pdf_extract_workers,
                 pdf_extract_batch_size=pdf_extract_batch_size,
                 pdf_extract_cpus_per_task=pdf_extract_cpus_per_task,
                 page_elements_workers=page_elements_workers,
                 page_elements_batch_size=page_elements_batch_size,
                 page_elements_cpus_per_actor=page_elements_cpus_per_actor,
-                page_elements_gpus_per_actor=page_elements_gpus_per_actor,
                 ocr_workers=ocr_workers,
                 ocr_batch_size=ocr_batch_size,
                 ocr_cpus_per_actor=ocr_cpus_per_actor,
-                ocr_gpus_per_actor=ocr_gpus_per_actor,
-                table_structure_workers=table_structure_workers,
-                table_structure_batch_size=table_structure_batch_size,
-                table_structure_cpus_per_actor=table_structure_cpus_per_actor,
-                table_structure_gpus_per_actor=table_structure_gpus_per_actor,
                 embed_workers=embed_workers,
                 embed_batch_size=embed_batch_size,
                 embed_cpus_per_actor=embed_cpus_per_actor,
-                embed_gpus_per_actor=embed_gpus_per_actor,
             )
     except _ROOT_CLI_ERRORS as exc:
         typer.echo(f"Error: {exc}", err=True)