NVIDIA · jdye64 · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
     inputs:
       version:
-        description: 'Chart version (e.g. 26.05-RC1)'
+        description: 'Chart version (e.g. 26.5.0)'
         required: true
         type: string
       source-ref:

@@ -18,8 +18,8 @@ description: |
   shared PostgreSQL backend so the service can scale horizontally.
 
 type: application
-version: 26.05-RC1
-appVersion: "26.05-RC1"
+version: "26.5.0"
+appVersion: "26.5.0"
 kubeVersion: ">=1.25.0-0"
 home: https://github.com/NVIDIA/NeMo-Retriever
 sources:

@@ -67,13 +67,13 @@ imagePullSecrets: []
 # =============================================================================
 service:
   image:
-    # Default points at the staging image published to NGC. Override
+    # Default points at the GA image published to NGC. Override
     # `repository` / `tag` to pin a different build, e.g. one produced by:
-    #   docker build -f nemo_retriever/Dockerfile --target service \
+    #   docker build -f Dockerfile --target service \
     #       -t <your-registry>/nemo-retriever-service:<tag> .
-    repository: localhost:32000/nemo-retriever-service
-    tag: "latest"
-    pullPolicy: Always
+    repository: nvcr.io/nvidia/nemo-microservices/nrl-service
+    tag: "26.5.0"
+    pullPolicy: IfNotPresent
 
   # Number of pod replicas. Must stay at 1 while persistence is SQLite-backed
   # (RWO PVC + single writer). Bumping this requires switching to a shared

@@ -52,7 +52,7 @@ dependencies = [
   # HTTP clients
   "httpx>=0.27.0",
   "requests>=2.32.5",
-  "urllib3>=2.7.0",
+  "urllib3==2.7.0",
   # Utilities
   "pydantic>=2.8.0",
   "rich>=13.7.0",
-  "urllib3==2.7.0",
-  # Utilities
-  "pydantic>=2.8.0",
-  "rich>=13.7.0",
+  "urllib3>=2.7.0",
+  # Utilities
+  "pydantic>=2.8.0",
+  "rich>=13.7.0",
-  "urllib3==2.7.0",
-  # Utilities
-  "pydantic>=2.8.0",
-  "rich>=13.7.0",
+  "urllib3>=2.7.0",
+  # Utilities
+  "pydantic>=2.8.0",
+  "rich>=13.7.0",
@@ -65,9 +65,9 @@ dependencies = [
   # Document parsing and NIM client libs
   "pypdfium2==4.30.0",
   "pillow==12.2.0",
-  "nltk>=3.9.4",
+  "nltk==3.9.4",
-  "nltk==3.9.4",
+  "nltk>=3.9.4",
-  "nltk==3.9.4",
+  "nltk>=3.9.4",
   "markitdown",
-  "langchain-nvidia-ai-endpoints>=0.3.0",
+  "langchain-nvidia-ai-endpoints>=1.4.0",
   # Default VDB solution
   "lancedb",
   # gRPC client for Parakeet/Riva ASR. Required for ASRCPUActor when it
@@ -123,11 +123,10 @@ local = [
   "scikit-learn>=1.6.0",
   "timm==1.0.22",
   "albumentations==2.0.8",
-  "nemotron-page-elements-v3>=0.dev0",
-  "nemotron-graphic-elements-v1>=0.dev0",
-  "nemotron-table-structure-v1>=0.dev0",
-  # Accept the 2.0.0 stable release and newer OCR dev/final trains.
-  "nemotron-ocr>=2.0.0.dev0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')",
+  "nemotron-page-elements-v3==3.0.1",
+  "nemotron-graphic-elements-v1==1.0.0",
+  "nemotron-table-structure-v1==1.0.0",
+  "nemotron-ocr>=2.0.0,<3; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')",
   "nvidia-ml-py",
   "apscheduler>=3.10",
   "psutil>=5.9.0",
@@ -165,7 +164,7 @@ tabular = [
   "duckdb>=1.2.0",
   "duckdb-engine>=0.13.0",
   "neo4j>=5.0",
-  "langgraph>=1.1.0a2",
+  "langgraph>=1.2.0",
 ]
 
 # BEIR benchmarking and evaluation tools (not needed for production use).
@@ -181,7 +180,7 @@ benchmarks = [
 # or construct an ``LLMJudge`` / ``LiteLLMClient`` directly.  Powers both the
 # live-RAG SDK and the batch evaluation framework.
 llm = [
-  "litellm>=1.86.0rc1",
+  "litellm>=1.86.0,<2",
 ]
 
 dev = [
@@ -202,10 +201,6 @@ retriever-harness = "nemo_retriever.harness:main"
 version = {attr = "nemo_retriever.version.get_build_version"}
 
 [tool.uv.sources]
-nemotron-page-elements-v3 = { index = "test-pypi" }
-nemotron-graphic-elements-v1 = { index = "test-pypi" }
-nemotron-table-structure-v1 = { index = "test-pypi" }
-nemotron-ocr = { index = "test-pypi" }
 # On Linux, resolve torch/torchvision from the CUDA wheel index.
 # On Mac, fall through to PyPI to get CPU wheels.
 torch = [

@@ -167,9 +167,9 @@ def ingest_command(
     lancedb_uri: str = typer.Option(DEFAULT_LANCEDB_URI, "--lancedb-uri", help="LanceDB database URI."),
     table_name: str = typer.Option(DEFAULT_TABLE_NAME, "--table-name", help="LanceDB table name."),
     run_mode: IngestRunModeValue = typer.Option(
-        "batch",
+        "inprocess",
         "--run-mode",
-        help="Execution mode for the SDK ingestor. Defaults to batch; use inprocess to skip Ray for local debug/CI.",
+        help="Execution mode for the SDK ingestor. Defaults to inprocess; use batch for Ray Data scale-out.",
     ),
     dry_run: bool = typer.Option(
         False,
@@ -557,8 +557,8 @@ def ingest_command(
     # Report input-file count alongside the actual landed-row count from the
     # LanceDB table — they diverge whenever one document explodes into multiple
     # chunks (PDFs → page elements, video → audio_visual segments) or
-    # shrinks to zero rows when every NIM call failed. The previous message
-    # only reported inputs and hid both cases. ``n_rows`` is None when the
+    # shrinks to zero rows when every NIM call failed. The SDK rejects empty
+    # or unverifiable ingests before we get here; ``n_rows`` is None when the
     # table read itself failed (caller can still see file count + URI).
     n_files = len(summary["documents"])
     table_path = f"{summary['lancedb_uri']}/{summary['table_name']}"

@@ -505,7 +505,7 @@ def resolve_ingest_plan(
     *,
     profile: IngestProfileValue = "auto",
     input_type: IngestInputTypeValue = "auto",
-    run_mode: IngestRunModeValue = "batch",
+    run_mode: IngestRunModeValue = "inprocess",
     method: str | None = None,
     dpi: int | None = None,
     extract_text: bool | None = None,
@@ -567,9 +567,8 @@ def resolve_ingest_plan(
 ) -> ResolvedIngestPlan:
     """Resolve root ingest options into ordinary params for one extract call.
 
-    Root ``retriever ingest`` intentionally defaults to ``run_mode="batch"``.
-    Programmatic callers that need Ray-free local execution should pass
-    ``run_mode="inprocess"`` explicitly. ``input_type`` remains a private
+    Root ``retriever ingest`` defaults to ``run_mode="inprocess"`` (no Ray).
+    Pass ``run_mode="batch"`` for Ray Data scale-out. ``input_type`` remains a private
     expansion/validation constraint; extraction still routes from the manifest.
     """
 
@@ -706,7 +705,7 @@ def ingest_documents(
     *,
     profile: IngestProfileValue = "auto",
     input_type: IngestInputTypeValue = "auto",
-    run_mode: IngestRunModeValue = "batch",
+    run_mode: IngestRunModeValue = "inprocess",
     dry_run: bool = False,
     method: str | None = None,
     dpi: int | None = None,
@@ -778,9 +777,8 @@ def ingest_documents(
     Batch tuning arguments are opt-in and are translated into
     ``BatchTuningParams`` for extraction or embedding; they are meaningful for
     ``run_mode="batch"`` and ignored by callers that leave them unset.
-    Root ``retriever ingest`` intentionally defaults to ``run_mode="batch"``;
-    pass ``run_mode="inprocess"`` explicitly for local debug or CI callers
-    that need to skip Ray startup.
+    Root ``retriever ingest`` defaults to ``run_mode="inprocess"``; pass
+    ``run_mode="batch"`` for Ray Data scale-out.
     The legacy ``input_type`` argument constrains directory expansion and file
     validation only; extraction routing remains manifest-planned.
     """

@@ -228,9 +228,8 @@ def build_dataset(self, data: Any, **kwargs: Any) -> Any:
 
         Returns
         -------
-        pandas.DataFrame
-            The materialized result after executing the Ray Data pipeline
-            (``ds.to_pandas()``).
+        ray.data.Dataset
+            The lazy Ray dataset with all graph stages appended.
         """
         import ray
         import ray.data as rd

@@ -16,7 +16,7 @@
     from nemo_retriever.params import ExtractParams, EmbedParams
 
     result_ds = (
-        GraphIngestor(run_mode="batch")
+        GraphIngestor(run_mode="inprocess")
         .files(["/data/*.pdf"])
         .extract(ExtractParams(method="pdfium"))
         .embed(EmbedParams(model_name="nvidia/llama-nemotron-embed-1b-v2"))
@@ -387,8 +387,8 @@ class GraphIngestor(ingestor):
     Parameters
     ----------
     run_mode
-        ``"batch"`` (Ray Data, default) or ``"inprocess"`` (single-process
-        pandas).
+        ``"inprocess"`` (single-process pandas, default) or ``"batch"`` (Ray
+        Data).
     ray_address
         Ray cluster address. ``None`` starts a local cluster.
     batch_size
@@ -415,7 +415,7 @@ class GraphIngestor(ingestor):
     def __init__(
         self,
         *,
-        run_mode: str = "batch",
+        run_mode: str = "inprocess",
         documents: Optional[List[str]] = None,
         ray_address: Optional[str] = None,
         ray_log_to_driver: bool = True,

@@ -73,7 +73,7 @@ class HarnessConfig:
     dataset_dir: str
     dataset_label: str
     preset: str
-    run_mode: str = "batch"
+    run_mode: str = "inprocess"
 
     query_csv: str | None = None
     input_type: str = "pdf"

@@ -8,15 +8,14 @@
 
 Examples::
 
-    # Batch mode (Ray) with PDF extraction + embedding
+    # In-process mode (default; no Ray) for local extraction + embedding
     retriever pipeline run /data/pdfs \\
-        --run-mode batch \\
-        --embed-invoke-url http://localhost:8000/v1
+        --ocr-invoke-url http://localhost:9000/v1
 
-    # In-process mode (no Ray) for quick local testing
+    # Batch mode (Ray) for large-scale throughput
     retriever pipeline run /data/pdfs \\
-        --run-mode inprocess \\
-        --ocr-invoke-url http://localhost:9000/v1
+        --run-mode batch \\
+        --embed-invoke-url http://localhost:8000/v1
 
     # Service mode (delegate to a running retriever service)
     retriever pipeline run /data/pdfs \\
@@ -979,10 +978,10 @@ def run(
     ),
     # --- I/O and execution ------------------------------------------------
     run_mode: str = typer.Option(
-        "batch",
+        "inprocess",
         "--run-mode",
         help=(
-            "Execution mode: 'batch' (Ray Data), 'inprocess' (pandas, no Ray), "
+            "Execution mode: 'inprocess' (pandas, no Ray, default), 'batch' (Ray Data), "
             "or 'service' (remote retriever service)."
         ),
         rich_help_panel=_PANEL_IO,

@@ -247,7 +247,7 @@ def create_app(config: ServiceConfig) -> FastAPI:
     app = FastAPI(
         title="Retriever Service",
         description="Low-latency document ingestion service powered by nemo-retriever",
-        version="1.0.0",
+        version="26.5.0",
         docs_url="/docs",
         lifespan=_lifespan,
     )

@@ -216,6 +216,7 @@ async def _create_job(
         *,
         expected_documents: int,
         label: str | None = None,
+        retain_results: bool = False,
     ) -> str:
         """Open a server-side job aggregate and return the assigned ``job_id``.
 
@@ -224,7 +225,10 @@ async def _create_job(
         call sized to the number of files supplied.
         """
         url = f"{self._base_url}/v1/ingest/job"
-        payload: dict[str, Any] = {"expected_documents": expected_documents}
+        payload: dict[str, Any] = {
+            "expected_documents": expected_documents,
+            "retain_results": retain_results,
+        }
         if label is not None:
             payload["label"] = label
         resp = await client.post(url, json=payload)
@@ -639,6 +643,7 @@ async def aingest_documents_stream(
         files: list[Path],
         *,
         pipeline_spec: dict[str, Any] | None = None,
+        retain_results: bool = False,
     ) -> AsyncIterator[dict[str, Any]]:
         """Async generator: upload files, yield events as documents complete.
 
@@ -665,7 +670,11 @@ async def aingest_documents_stream(
             limits=pool_limits,
             headers=self._auth_headers,
         ) as client:
-            job_id = await self._create_job(client, expected_documents=len(files))
+            job_id = await self._create_job(
+                client,
+                expected_documents=len(files),
+                retain_results=retain_results,
+            )
             yield {
                 "event": "job_created",
                 "job_id": job_id,

@@ -46,3 +46,12 @@ class JobCreateRequest(RichModel):
     expected_documents: int = Field(ge=1, description="Number of documents this job will receive")
     label: str | None = Field(default=None, description="Optional human-readable tag for the dashboard")
     metadata: dict[str, Any] = Field(default_factory=dict)
+    retain_results: bool = Field(
+        default=False,
+        description=(
+            "When false (default), completed documents keep only ``result_rows`` in the "
+            "job tracker; row payloads are discarded after the pipeline finishes. Set true "
+            "when the client will poll ``GET /v1/ingest/status/{id}`` to fetch "
+            "``result_data``."
+        ),
+    )