diff --git a/.gitattributes b/.gitattributes
index c8d189184..0a7e469ce 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,4 @@
 data/dataset.zip filter=lfs diff=lfs merge=lfs -text
 data/ filter=lfs diff=lfs merge=lfs -text
+examples/rag_event_ingest/data/**/*.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/rag_event_ingest/data/**/*.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/publish-artifacts.yml b/.github/workflows/publish-artifacts.yml
index 2be3979e1..7cb97dbe2 100644
--- a/.github/workflows/publish-artifacts.yml
+++ b/.github/workflows/publish-artifacts.yml
@@ -7,6 +7,16 @@ on:
     - cron: '30 18 * * *'
   workflow_dispatch:
     inputs:
+      JOBS_TO_RUN:
+        description: 'Jobs to run (manual trigger only)'
+        required: true
+        default: 'all'
+        type: choice
+        options:
+          - all
+          - wheel-only
+          - containers-only
+          - helm-chart-only
       CONTAINER_TAG:
         description: 'Custom tag for containers (optional)'
         required: false
@@ -15,6 +25,26 @@ on:
         description: 'Artifactory version (optional, defaults to auto-generated from get_version.sh)'
         required: false
         default: ''
+      HELM_CHART_VERSION:
+        description: 'Helm chart version for NGC (optional, defaults to auto-generated from get_version.sh)'
+        required: false
+        default: ''
+      # Container-level selection (applies when JOBS_TO_RUN is 'all' or 'containers-only')
+      PUBLISH_RAG_SERVER:
+        description: 'Publish rag-server container'
+        required: false
+        default: true
+        type: boolean
+      PUBLISH_INGESTOR_SERVER:
+        description: 'Publish ingestor-server container'
+        required: false
+        default: true
+        type: boolean
+      PUBLISH_RAG_FRONTEND:
+        description: 'Publish rag-frontend container'
+        required: false
+        default: true
+        type: boolean
 
 env:
   RELEASE_TYPE: dev
@@ -26,6 +56,7 @@ jobs:
   publish-wheel:
     name: Build and Publish Python Wheel
     runs-on: ubuntu-latest
+    if: github.event_name != 'workflow_dispatch' || github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'wheel-only'
     container:
       image: python:3.10
     steps:
@@ -106,6 +137,7 @@ jobs:
   publish-rag-server:
     name: Build and Publish RAG Server Container
     runs-on: ubuntu-latest
+    if: github.event_name != 'workflow_dispatch' || ((github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'containers-only') && github.event.inputs.PUBLISH_RAG_SERVER != 'false')
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -147,7 +179,7 @@ jobs:
           # Tag and push to NGC Container Registry
           echo "Pushing rag-server to NGC Container Registry..."
           docker push nvcr.io/nvstaging/blueprint/rag-server:$TAG
-          docker tag nvcr.io/nvstaging/blueprint/rag-server:$TAG nvcr.io/nvstaging/blueprint/rag-server:latest
+          docker tag nvcr.io/nvstaging/blueprint/rag-server:$TAG nvcr.io/nvstaging/blueprint/rag-server:latest
           docker push nvcr.io/nvstaging/blueprint/rag-server:latest
           echo "RAG server container publishing completed successfully"
 
@@ -164,6 +196,7 @@ jobs:
   publish-ingestor-server:
     name: Build and Publish Ingestor Server Container
     runs-on: ubuntu-latest
+    if: github.event_name != 'workflow_dispatch' || ((github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'containers-only') && github.event.inputs.PUBLISH_INGESTOR_SERVER != 'false')
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -205,7 +238,7 @@ jobs:
           # Tag and push to NGC Container Registry
           echo "Pushing ingestor-server to NGC Container Registry..."
           docker push nvcr.io/nvstaging/blueprint/ingestor-server:$TAG
-          docker tag nvcr.io/nvstaging/blueprint/ingestor-server:$TAG nvcr.io/nvstaging/blueprint/ingestor-server:latest
+          docker tag nvcr.io/nvstaging/blueprint/ingestor-server:$TAG nvcr.io/nvstaging/blueprint/ingestor-server:latest
           docker push nvcr.io/nvstaging/blueprint/ingestor-server:latest
           echo "Ingestor server container publishing completed successfully"
 
@@ -222,6 +255,7 @@ jobs:
   publish-rag-frontend:
     name: Build and Publish RAG Frontend Container
     runs-on: ubuntu-latest
+    if: github.event_name != 'workflow_dispatch' || ((github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'containers-only') && github.event.inputs.PUBLISH_RAG_FRONTEND != 'false')
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -263,7 +297,7 @@ jobs:
           # Tag and push to NGC Container Registry
           echo "Pushing rag-frontend to NGC Container Registry..."
           docker push nvcr.io/nvstaging/blueprint/rag-frontend:$TAG
-          docker tag nvcr.io/nvstaging/blueprint/rag-frontend:$TAG nvcr.io/nvstaging/blueprint/rag-frontend:latest
+          docker tag nvcr.io/nvstaging/blueprint/rag-frontend:$TAG nvcr.io/nvstaging/blueprint/rag-frontend:latest
           docker push nvcr.io/nvstaging/blueprint/rag-frontend:latest
           echo "RAG frontend container publishing completed successfully"
 
@@ -274,3 +308,103 @@ jobs:
           docker images | grep "rag-frontend" | awk '{print $3}' | xargs -r docker rmi -f || echo "No rag-frontend images to delete"
           docker system prune -f || true
 
+  # ============================================================================
+  # PUBLISH HELM CHART TO NGC
+  # ============================================================================
+  # Packages deploy/helm/nvidia-blueprint-rag and pushes it to the nvstaging NGC org.
+  # Runs on every non-manual trigger, and on manual runs when JOBS_TO_RUN is 'all'
+  # or 'helm-chart-only'.
+  publish-helm-chart:
+    name: Build and Publish Helm Chart to NGC
+    runs-on: ubuntu-latest
+    if: github.event_name != 'workflow_dispatch' || github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'helm-chart-only'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install Helm
+        uses: azure/setup-helm@v4
+        with:
+          version: 'v3.17.0'
+
+      - name: Install NGC CLI
+        env:
+          NGC_API_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
+        run: |
+          echo "Installing NGC CLI..."
+          wget --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/4.9.10/files/ngccli_linux.zip -O ngccli_linux.zip
+          unzip -o ngccli_linux.zip
+          chmod u+x ngc-cli/ngc
+          # Put the unpacked CLI directory on PATH for the remaining steps of this job.
+          echo "$(pwd)/ngc-cli" >> "$GITHUB_PATH"
+          echo "NGC CLI installed successfully"
+
+      - name: Determine Helm chart version
+        id: helm_version
+        env:
+          # Hand the manual input to the script through the environment; expanding the
+          # expression directly inside the script would let its contents run as shell code.
+          REQUESTED_CHART_VERSION: ${{ github.event.inputs.HELM_CHART_VERSION }}
+        run: |
+          if [ -n "$REQUESTED_CHART_VERSION" ]; then
+            echo "Using custom Helm chart version: $REQUESTED_CHART_VERSION"
+            VERSION="$REQUESTED_CHART_VERSION"
+          else
+            echo "Using auto-generated version from get_version.sh"
+            chmod +x ./ci/get_version.sh
+            VERSION=$(./ci/get_version.sh)
+            echo "Generated version: $VERSION"
+          fi
+          # Refuse empty values and anything outside the SemVer alphabet (this also rules out
+          # newlines, which would otherwise add extra entries to GITHUB_ENV).
+          case "$VERSION" in
+            ''|*[!0-9A-Za-z.+-]*)
+              echo "Invalid Helm chart version: '$VERSION'" >&2
+              exit 1
+              ;;
+          esac
+          # Publish the version as a step output and as HELM_CHART_VERSION for later steps.
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+          echo "HELM_CHART_VERSION=$VERSION" >> "$GITHUB_ENV"
+
+      - name: Add Helm repositories
+        env:
+          NGC_API_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
+        run: |
+          cd deploy/helm
+          # '$oauthtoken' is single-quoted so the shell passes it verbatim as the username.
+          helm repo add nvidia-nim https://helm.ngc.nvidia.com/nim/nvidia/ --username='$oauthtoken' --password="$NGC_API_KEY"
+          helm repo add nim https://helm.ngc.nvidia.com/nim/ --username='$oauthtoken' --password="$NGC_API_KEY"
+          helm repo add nemo-microservices https://helm.ngc.nvidia.com/nvidia/nemo-microservices --username='$oauthtoken' --password="$NGC_API_KEY"
+          helm repo add baidu-nim https://helm.ngc.nvidia.com/nim/baidu --username='$oauthtoken' --password="$NGC_API_KEY"
+          helm repo add bitnami https://charts.bitnami.com/bitnami
+          helm repo add elastic https://helm.elastic.co
+          helm repo add otel https://open-telemetry.github.io/opentelemetry-helm-charts
+          helm repo add zipkin https://zipkin.io/zipkin-helm
+          helm repo add prometheus https://prometheus-community.github.io/helm-charts
+          helm repo update
+
+      - name: Package Helm chart
+        env:
+          NGC_API_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
+        run: |
+          cd deploy/helm
+          helm dependency update nvidia-blueprint-rag
+          # HELM_CHART_VERSION comes from GITHUB_ENV (set by the version step); read it as a
+          # plain shell variable instead of re-expanding it into the script text.
+          helm package nvidia-blueprint-rag/ --version "$HELM_CHART_VERSION"
+          CHART_TGZ=$(ls nvidia-blueprint-rag-*.tgz)
+          echo "Created: $CHART_TGZ"
+
+      - name: Push Helm chart to NGC
+        env:
+          NGC_API_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
+        run: |
+          cd deploy/helm
+          CHART_TGZ="nvidia-blueprint-rag-${HELM_CHART_VERSION}.tgz"
+          TARGET="nvstaging/blueprint/nvidia-blueprint-rag:${HELM_CHART_VERSION}"
+          # Remove existing version to overwrite (ignore error if version does not exist)
+          ngc registry chart remove "$TARGET" --org nvstaging -y 2>/dev/null || true
+          ngc registry chart push "$TARGET" --source "$CHART_TGZ" --org nvstaging
+          echo "Helm chart published to NGC: $TARGET"
+
diff --git a/.gitignore b/.gitignore
index 9dded62bf..3611412e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,4 +80,9 @@ coverage/
 cover/
 *.log
 tests/data/
+# Agent skills (installed via npx skills add)
+/.agents/
+/.claude/
+skills-lock.json
+
 # Workbench Project Layout
\ No newline at end of file
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 000000000..183677906
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,86 @@
+# NVIDIA RAG Blueprint
+
+Reference implementation for a Retrieval Augmented Generation pipeline. Python 3.11+ backend (FastAPI + LangChain), React/TypeScript frontend, deployable via Docker Compose or Helm.
+
+## Project structure
+
+```
+src/nvidia_rag/
+├── rag_server/        # RAG query/response server (FastAPI)
+├── ingestor_server/   # Document ingestion server (FastAPI)
+└── utils/             # Shared utilities
+frontend/              # React + TypeScript UI (pnpm)
+deploy/
+├── compose/           # Docker Compose files and env configs
+└── helm/              # Helm charts (standard + MIG-slicing)
+docs/                  # User-facing documentation (Sphinx, RST/MD)
+tests/
+├── unit/              # No network calls allowed
+└── integration/       # Network calls permitted
+notebooks/             # Jupyter notebooks for evaluation and examples
+```
+
+## Development commands
+
+### Backend (Python)
+
+```bash
+uv sync                              # Install all deps
+uv run pytest tests/unit/            # Unit tests
+uv run pytest tests/integration/     # Integration tests
+ruff check --fix src/                # Lint + autofix
+ruff format src/                     # Format
+pre-commit run --all-files           # Run all pre-commit hooks
+```
+
+### Frontend (TypeScript)
+
+```bash
+cd frontend
+pnpm install
+pnpm run dev                         # Dev server
+pnpm run lint                        # ESLint
+pnpm exec tsc --noEmit               # Type check
+pnpm run test:run                    # Tests
+```
+
+## Code conventions
+
+- **Python**: Ruff for linting and formatting (line-length 88, double quotes, space indent). Config in `pyproject.toml`.
+- **Type hints**: Required on all function signatures.
+- **Imports**: Sorted by isort via Ruff. No in-function imports.
+- **Tests**: Mirror source tree (`src/nvidia_rag/rag_server/server.py` → `tests/unit/rag_server/test_server.py`).
+- **Frontend**: ESLint + TypeScript strict mode. Function components with hooks.
+- **Env files**: `deploy/compose/nvdev.env` (NVIDIA-hosted NIMs) and `deploy/compose/.env` (self-hosted). These are the source of truth for Docker deployments — shell-only exports are lost on restart.
+
+## Deployment modes
+
+1. **Docker Compose** — `deploy/compose/` with env-file configs. Multiple profiles: standard, retrieval-only, NVIDIA-hosted.
+2. **Helm** — `deploy/helm/nvidia-blueprint-rag/` chart with `values.yaml`. Supports MIG GPU slicing via `deploy/helm/mig-slicing/`.
+3. **Library** — Import `nvidia_rag` as a Python package for custom pipelines.
+
+## Key files
+
+- `pyproject.toml` — All Python deps, ruff config, project metadata
+- `deploy/compose/nvdev.env` — Default env file for NVIDIA API Catalog deployments
+- `src/nvidia_rag/rag_server/prompt.yaml` — System prompt templates
+- `docs/support-matrix.md` — GPU requirements per deployment mode
+- `docs/service-port-gpu-reference.md` — Port mappings and GPU assignments
+
+## PR and commit guidelines
+
+- Target the `develop` branch, never `main`.
+- All commits must be signed off (DCO).
+- Run `pre-commit run --all-files` before submitting.
+- See `CONTRIBUTING.md` for full workflow.
+
+## Operations — `rag-blueprint` skill
+
+For any operational task — deploying, configuring, troubleshooting, or shutting down the RAG Blueprint — read and follow the skill at `.agents/skills/rag-blueprint/SKILL.md`.
+
+The skill handles:
+
+- **Deploy** — Docker Compose (standard, retrieval-only, NVIDIA-hosted), Helm, MIG-slicing, library mode
+- **Configure** — VLM, guardrails, query rewriting, ingestion, search & retrieval, models, observability, summarization, multimodal, MCP, evaluation, notebooks, UI, and more
+- **Troubleshoot** — Debug unhealthy services, container errors, GPU issues, connectivity failures
+- **Shutdown** — Stop, tear down, and clean up services
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 000000000..e16f0c9d6
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,84 @@
+# NVIDIA RAG Blueprint
+
+Reference implementation for a Retrieval Augmented Generation pipeline. Python 3.11+ backend (FastAPI + LangChain), React/TypeScript frontend, deployable via Docker Compose or Helm.
+
+## Project structure
+
+```
+src/nvidia_rag/
+├── rag_server/        # RAG query/response server (FastAPI)
+├── ingestor_server/   # Document ingestion server (FastAPI)
+└── utils/             # Shared utilities
+frontend/              # React + TypeScript UI (pnpm)
+deploy/
+├── compose/           # Docker Compose files and env configs
+└── helm/              # Helm charts (standard + MIG-slicing)
+docs/                  # User-facing documentation (Sphinx, RST/MD)
+tests/
+├── unit/              # No network calls allowed
+└── integration/       # Network calls permitted
+notebooks/             # Jupyter notebooks for evaluation and examples
+```
+
+## Development commands
+
+### Backend (Python)
+
+```bash
+uv sync                              # Install all deps
+uv run pytest tests/unit/            # Unit tests
+uv run pytest tests/integration/     # Integration tests
+ruff check --fix src/                # Lint + autofix
+ruff format src/                     # Format
+pre-commit run --all-files           # Run all pre-commit hooks
+```
+
+### Frontend (TypeScript)
+
+```bash
+cd frontend
+pnpm install
+pnpm run dev                         # Dev server
+pnpm run lint                        # ESLint
+pnpm exec tsc --noEmit               # Type check
+pnpm run test:run                    # Tests
+```
+
+## Code conventions
+
+- **Python**: Ruff for linting and formatting (line-length 88, double quotes, space indent). Config in `pyproject.toml`.
+- **Type hints**: Required on all function signatures.
+- **Imports**: Sorted by isort via Ruff. No in-function imports.
+- **Tests**: Mirror source tree (`src/nvidia_rag/rag_server/server.py` → `tests/unit/rag_server/test_server.py`).
+- **Frontend**: ESLint + TypeScript strict mode. Function components with hooks.
+- **Env files**: `deploy/compose/nvdev.env` (NVIDIA-hosted NIMs) and `deploy/compose/.env` (self-hosted). These are the source of truth for Docker deployments — shell-only exports are lost on restart.
+
+## Deployment modes
+
+1. **Docker Compose** — `deploy/compose/` with env-file configs. Multiple profiles: standard, retrieval-only, NVIDIA-hosted.
+2. **Helm** — `deploy/helm/nvidia-blueprint-rag/` chart with `values.yaml`. Supports MIG GPU slicing via `deploy/helm/mig-slicing/`.
+3. **Library** — Import `nvidia_rag` as a Python package for custom pipelines.
+
+## Key files
+
+- `pyproject.toml` — All Python deps, ruff config, project metadata
+- `deploy/compose/nvdev.env` — Default env file for NVIDIA API Catalog deployments
+- `src/nvidia_rag/rag_server/prompt.yaml` — System prompt templates
+- `docs/support-matrix.md` — GPU requirements per deployment mode
+- `docs/service-port-gpu-reference.md` — Port mappings and GPU assignments
+
+## PR and commit guidelines
+
+- Target the `develop` branch, never `main`.
+- All commits must be signed off (DCO).
+- Run `pre-commit run --all-files` before submitting.
+- See `CONTRIBUTING.md` for full workflow.
+
+## Operations — `/rag-blueprint` skill
+
+For any operational task, use the `rag-blueprint` skill (`.agents/skills/rag-blueprint/`).
+
+- **Deploy** — Docker Compose (standard, retrieval-only, NVIDIA-hosted), Helm, MIG-slicing, library mode
+- **Configure** — VLM, guardrails, query rewriting, ingestion, search & retrieval, models, observability, summarization, multimodal, MCP, evaluation, notebooks, UI, and more
+- **Troubleshoot** — Debug unhealthy services, container errors, GPU issues, connectivity failures
+- **Shutdown** — Stop, tear down, and clean up services
diff --git a/README.md b/README.md
index edea2e72a..c400dd410 100644
--- a/README.md
+++ b/README.md
@@ -105,9 +105,9 @@ This modular design ensures efficient query processing, accurate retrieval of in
 
     - [NVIDIA NIM llama-3_2-nv-embedqa-1b-v2](https://build.nvidia.com/nvidia/llama-3_2-nv-embedqa-1b-v2)
     - [NVIDIA NIM llama-3_2-nv-rerankqa-1b-v2](https://build.nvidia.com/nvidia/llama-3_2-nv-rerankqa-1b-v2)
-    - [NeMo Retriever Page Elements NIM](https://build.nvidia.com/nvidia/nemoretriever-page-elements-v3)
-    - [NeMo Retriever Table Structure NIM](https://build.nvidia.com/nvidia/nemoretriever-table-structure-v1)
-    - [NeMo Retriever Graphic Elements NIM](https://build.nvidia.com/nvidia/nemoretriever-graphic-elements-v1)
+    - [NeMo Retriever Page Elements NIM](https://build.nvidia.com/nvidia/nemotron-page-elements-v3)
+    - [NeMo Retriever Table Structure NIM](https://build.nvidia.com/nvidia/nemotron-table-structure-v1)
+    - [NeMo Retriever Graphic Elements NIM](https://build.nvidia.com/nvidia/nemotron-graphic-elements-v1)
     - [NeMo Retriever OCR NIM](https://build.nvidia.com/nvidia/nemoretriever-ocr)
 
 - Optional NIMs
@@ -162,6 +162,29 @@ The following is a step-by-step explanation of the workflow from the end-user pe
 
 
 
+## AI Agent Skill
+
+An agent skill is included that enables AI coding assistants (Claude Code, Cursor, etc.) to deploy, configure, troubleshoot, and manage the RAG Blueprint autonomously.
+
+### Install
+
+```bash
+npx skills add .
+```
+
+This installs the `rag-blueprint` skill from `skill-source/`. After installation, the agent handles requests like:
+
+- *"Deploy RAG on Docker with NVIDIA-hosted models"*
+- *"Enable VLM image captioning and restart the ingestor"*
+- *"Ingestion failed for 3 files, can you check why?"*
+- *"Switch from Docker to library mode"*
+- *"Shut down all RAG services"*
+
+> **Note:** If the agent doesn't pick up the skill automatically (e.g., for short or ambiguous queries), invoke it explicitly with `/rag-blueprint <your request>`.
+
+For skill architecture details, see [`skill-source/README.md`](skill-source/README.md).
+
+
 ## Get Started With NVIDIA RAG Blueprint
 
 The recommended way to get started is to deploy the NVIDIA RAG Blueprint
@@ -202,9 +225,9 @@ Use of the models in this blueprint is governed by the [NVIDIA AI Foundation Mod
 
 ## Terms of Use
 This blueprint is governed by the [NVIDIA Agreements | Enterprise Software | NVIDIA Software License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/) and the [NVIDIA Agreements | Enterprise Software | Product Specific Terms for AI Product](https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/). The models are governed by the [NVIDIA Agreements | Enterprise Software | NVIDIA Community Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/) and the [NVIDIA RAG dataset](./data/multimodal/) which is governed by the [NVIDIA Asset License Agreement](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/data/LICENSE.DATA).
-The following models that are built with Llama are governed by the Llama 3.2 Community License Agreement: nvidia/llama-3.2-nv-embedqa-1b-v2 and nvidia/llama-3.2-nv-rerankqa-1b-v2 and llama-3.2-nemoretriever-1b-vlm-embed-v1.
+The following models that are built with Llama are governed by the Llama 3.2 Community License Agreement: nvidia/llama-nemotron-embed-1b-v2, nvidia/llama-nemotron-rerank-1b-v2, and llama-3.2-nemoretriever-1b-vlm-embed-v1.
 
 ## Additional Information
 
-The [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/) for the llama-3.1-nemotron-nano-vl-8b-v1, llama-3.1-nemoguard-8b-content-safety and llama-3.1-nemoguard-8b-topic-control models. The [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/) for the nvidia/llama-3.2-nv-embedqa-1b-v2, nvidia/llama-3.2-nv-rerankqa-1b-v2 and llama-3.2-nemoretriever-1b-vlm-embed-v1 models. The [Llama 3.3 Community License Agreement](https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/LICENSE) for the llama-3.3-nemotron-super-49b-v1.5 models. Built with Llama. Apache 2.0 for NVIDIA Ingest and for the nemoretriever-page-elements-v2, nemoretriever-table-structure-v1, nemoretriever-graphic-elements-v1, paddleocr and nemoretriever-ocr-v1 models.
+The [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/) for the llama-3.1-nemotron-nano-vl-8b-v1, llama-3.1-nemoguard-8b-content-safety and llama-3.1-nemoguard-8b-topic-control models. The [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/) for the nvidia/llama-nemotron-embed-1b-v2, nvidia/llama-nemotron-rerank-1b-v2 and llama-3.2-nemoretriever-1b-vlm-embed-v1 models. The [Llama 3.3 Community License Agreement](https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/LICENSE) for the llama-3.3-nemotron-super-49b-v1.5 models. Built with Llama. Apache 2.0 for NVIDIA Ingest and for the nemoretriever-page-elements-v2, nemotron-table-structure-v1, nemotron-graphic-elements-v1, paddleocr and nemoretriever-ocr-v1 models.
 
diff --git a/ci/publish_wheel.sh b/ci/publish_wheel.sh
index f59165fee..711abb462 100755
--- a/ci/publish_wheel.sh
+++ b/ci/publish_wheel.sh
@@ -25,8 +25,8 @@ if [ -n "$ARTIFACTORY_VERSION" ]; then
     echo "Using custom Artifactory version: $ARTIFACTORY_VERSION"
     ARTIFACTORY_VERSION_FINAL=$ARTIFACTORY_VERSION
 else
-    echo "Using default Artifactory version: 2.4.0.dev"
-    ARTIFACTORY_VERSION_FINAL="2.4.0.dev"
+    echo "Using default Artifactory version: 2.5.0.dev"
+    ARTIFACTORY_VERSION_FINAL="2.5.0.dev"
 fi
 
 # Build first wheel for GitLab Package Registry
diff --git a/deploy/compose/.env b/deploy/compose/.env
index 9f6ccf796..cc80cfaf0 100644
--- a/deploy/compose/.env
+++ b/deploy/compose/.env
@@ -22,8 +22,8 @@ export NVIDIA_API_KEY=${NGC_API_KEY}
 export APP_LLM_SERVERURL=nim-llm:8000
 export APP_FILTEREXPRESSIONGENERATOR_SERVERURL=nim-llm:8000
 export SUMMARY_LLM_SERVERURL=nim-llm:8000
-export APP_EMBEDDINGS_SERVERURL=nemoretriever-embedding-ms:8000/v1
-export APP_RANKING_SERVERURL=nemoretriever-ranking-ms:8000
+export APP_EMBEDDINGS_SERVERURL=nemotron-embedding-ms:8000/v1
+export APP_RANKING_SERVERURL=nemotron-ranking-ms:8000
 export OCR_GRPC_ENDPOINT=nemoretriever-ocr:8001
 export OCR_HTTP_ENDPOINT=http://nemoretriever-ocr:8000/v1/infer
 export OCR_INFER_PROTOCOL=grpc
@@ -50,11 +50,11 @@ export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=grpc
 # export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
 # export OCR_INFER_PROTOCOL=http
 # export OCR_MODEL_NAME=scene_text_ensemble
-# export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3
+# export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
 # export YOLOX_INFER_PROTOCOL=http
-# export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
+# export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
 # export YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
-# export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
+# export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
 # export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
 # export APP_QUERYREWRITER_SERVERURL=""
 # export APP_QUERYREWRITER_MODELNAME="nvidia/llama-3.3-nemotron-super-49b-v1.5"
diff --git a/deploy/compose/docker-compose-ingestor-server.yaml b/deploy/compose/docker-compose-ingestor-server.yaml
index 964a9c02c..1d284d53d 100644
--- a/deploy/compose/docker-compose-ingestor-server.yaml
+++ b/deploy/compose/docker-compose-ingestor-server.yaml
@@ -3,7 +3,7 @@ services:
   # Main ingestor server which is responsible for ingestion
   ingestor-server:
     container_name: ingestor-server
-    image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/ingestor-server:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../
@@ -75,8 +75,8 @@ services:
 
       ##===Embedding Model specific configurations===
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000/v1"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-embedding-ms:8000/v1"}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-1b-v2}
       # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
       # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-vlm-embedding-ms:8000/v1"}
       # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-vl-1b-v2}
@@ -95,7 +95,8 @@ services:
       APP_NVINGEST_EXTRACTPAGEASIMAGE: ${APP_NVINGEST_EXTRACTPAGEASIMAGE:-False}
       APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY: ${APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY:-""} # Select from "image", "text_image"
       APP_NVINGEST_IMAGE_ELEMENTS_MODALITY: ${APP_NVINGEST_IMAGE_ELEMENTS_MODALITY:-""} # Select from "image"
-      APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemoretron_parse, None
+      APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemotron_parse, None
+      APP_NVINGEST_EXTRACTTABLESMETHOD: ${APP_NVINGEST_EXTRACTTABLESMETHOD:-yolox} # yolox, nemotron_parse, or None
       # Extract text by "page" only recommended for documents with pages like .pdf, .docx, etc.
       APP_NVINGEST_TEXTDEPTH: ${APP_NVINGEST_TEXTDEPTH:-page} # extract by "page" or "document"
 
@@ -168,7 +169,7 @@ services:
       - "6379:6379"
 
   nv-ingest-ms-runtime:
-    image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.1
+    image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.2
     # cpuset: "0-15" # Uncomment to restrict this container to CPU cores 0–15
     shm_size: 40gb # Should be at minimum 30% of assigned memory per Ray documentation
     volumes:
@@ -234,13 +235,13 @@ services:
       - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer}
       - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc}
       # build.nvidia.com hosted yolox-graphics-elements endpoints.
-      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
+      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
       #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
       - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001}
       - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer}
       - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc}
       # build.nvidia.com hosted  yolox-table-elements endpoints.
-      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
+      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
       #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
       - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001}
       - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer}
diff --git a/deploy/compose/docker-compose-rag-server.yaml b/deploy/compose/docker-compose-rag-server.yaml
index b3e20808f..dc04c5329 100644
--- a/deploy/compose/docker-compose-rag-server.yaml
+++ b/deploy/compose/docker-compose-rag-server.yaml
@@ -3,7 +3,7 @@ services:
   # Main orchestrator server which stiches together all calls to different services to fulfill the user request
   rag-server:
     container_name: rag-server
-    image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/rag-server:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../
@@ -74,11 +74,11 @@ services:
       LLM_MAX_TOKENS: ${LLM_MAX_TOKENS:-32768}
       LLM_TEMPERATURE: ${LLM_TEMPERATURE:-0}
       LLM_TOP_P: ${LLM_TOP_P:-1.0}
-      
-      # Enable/disable thinking/reasoning for nemotron-3-nano models (30b variant)
-      # Set to "true" to enable reasoning mode with reasoning_budget
-      # Set to "false" to disable reasoning and get direct answers
-      ENABLE_NEMOTRON_3_NANO_THINKING: ${ENABLE_NEMOTRON_3_NANO_THINKING:-true}
+
+      # Reasoning configuration (supported by Nemotron 3 and other reasoning models)
+      LLM_ENABLE_THINKING: ${LLM_ENABLE_THINKING:-false}
+      LLM_REASONING_BUDGET: ${LLM_REASONING_BUDGET:-0}
+      LLM_LOW_EFFORT: ${LLM_LOW_EFFORT:-false}
 
       ##===Query Rewriter Model specific configurations===
       APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"nvidia/llama-3.3-nemotron-super-49b-v1.5"}
@@ -94,8 +94,8 @@ services:
 
       ##===Embedding Model specific configurations===
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000/v1"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-embedding-ms:8000/v1"}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-1b-v2}
       APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048}
       # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
       # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-vlm-embedding-ms:8000/v1"}
@@ -103,8 +103,8 @@ services:
 
       ##===Reranking Model specific configurations===
       # url on which ranking model is hosted. If "", Nvidia hosted API is used
-      APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"}
-      APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"}
+      APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemotron-ranking-ms:8000"}
+      APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-nemotron-rerank-1b-v2"}
       ENABLE_RERANKER: ${ENABLE_RERANKER:-True}
       # Default score threshold for filtering documents by reranker relevance (0.0 to 1.0)
       RERANKER_SCORE_THRESHOLD: ${RERANKER_SCORE_THRESHOLD:-${RERANKER_CONFIDENCE_THRESHOLD:-0.0}}
@@ -211,7 +211,7 @@ services:
   # Sample UI container which interacts with APIs exposed by rag-server container
   rag-frontend:
     container_name: rag-frontend
-    image: nvcr.io/nvstaging/blueprint/rag-frontend:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/rag-frontend:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../frontend
diff --git a/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml b/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
index 1e014fc9f..17200db05 100644
--- a/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
+++ b/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
@@ -17,5 +17,7 @@ rails:
       - content safety check input $model=content_safety
       - topic safety check input $model=topic_control
   output:
+    streaming:
+      enabled: true
     flows:
       - content safety check output $model=content_safety
\ No newline at end of file
diff --git a/deploy/compose/nemotron3-super-cloud.env b/deploy/compose/nemotron3-super-cloud.env
new file mode 100644
index 000000000..468bd2fb7
--- /dev/null
+++ b/deploy/compose/nemotron3-super-cloud.env
@@ -0,0 +1,49 @@
+# ==============================================================================
+# Nemotron 3 Super - NVIDIA-hosted (cloud) endpoints
+# ==============================================================================
+# Self-contained cloud + Nemotron 3 Super. Source after .env so cloud endpoints
+# override on-prem defaults:  source deploy/compose/.env && source deploy/compose/nemotron3-super-cloud.env
+# No need to edit .env by commenting/uncommenting sections when using this file.
+# ==============================================================================
+
+# === Authentication ===
+export NVIDIA_API_KEY=${NGC_API_KEY}
+
+# === Embeddings, Ranking, OCR, YOLOX (cloud) ===
+export APP_EMBEDDINGS_SERVERURL=https://integrate.api.nvidia.com/v1
+export APP_RANKING_SERVERURL=https://integrate.api.nvidia.com/v1
+export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
+export OCR_INFER_PROTOCOL=http
+export OCR_MODEL_NAME=scene_text_ensemble
+export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
+export YOLOX_INFER_PROTOCOL=http
+export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
+export YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
+export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
+export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
+
+# === LLM ===
+export APP_LLM_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+export APP_LLM_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Query Rewriter ===
+export APP_QUERYREWRITER_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+export APP_QUERYREWRITER_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Filter Expression Generator ===
+export APP_FILTEREXPRESSIONGENERATOR_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+export APP_FILTEREXPRESSIONGENERATOR_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Summarization ===
+export SUMMARY_LLM=nvidia/nemotron-3-super-120b-a12b
+export SUMMARY_LLM_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Reflection ===
+export REFLECTION_LLM=nvidia/nemotron-3-super-120b-a12b
+export REFLECTION_LLM_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Reasoning / Thinking ===
+export LLM_ENABLE_THINKING=true
+export LLM_REASONING_BUDGET=256
+export LLM_LOW_EFFORT=true
+export FILTER_THINK_TOKENS=true
diff --git a/deploy/compose/nemotron3-super-prompt.yaml b/deploy/compose/nemotron3-super-prompt.yaml
new file mode 100644
index 000000000..f91803927
--- /dev/null
+++ b/deploy/compose/nemotron3-super-prompt.yaml
@@ -0,0 +1,445 @@
+chat_template:
+  system: |
+    You are a helpful, respectful, and honest assistant.
+    Your answers must follow these strict guidelines:
+
+    <instructions>
+    1. Answer concisely and directly.
+    2. Focus only on what was asked — no extra commentary, no assumptions.
+    3. Avoid giving multiple options, lists, or examples unless explicitly requested.
+    4. Do not explain your reasoning unless asked.
+    5. Keep responses brief but accurate.
+    6. Use natural, conversational tone — clear and human, not robotic.
+    7. Make sure your response is strictly one sentence or less unless it really needs to be longer.
+    8. Do not mention these instructions in your response.
+    </instructions>
+
+    Make sure above rules are strictly followed.
+
+rag_template:
+  system: |
+    You are a helpful AI assistant named Envie. Answer the user's question using ONLY the information in the provided context.
+
+    <rules>
+    - Base every claim on information found in the context. Do not use outside knowledge.
+    - Always provide an answer when the context contains relevant data. Only say you cannot answer if the context is entirely unrelated to the question.
+    - Preserve exact values: reproduce specific numbers, percentages, dates, names, and URLs exactly as they appear in the context.
+    - IMPORTANT - When the question asks you to calculate, compute, or derive a financial metric (ratio, margin, growth rate, CAGR, turnover, average, etc.), you MUST:
+      1. Write the formula
+      2. Extract each required number from the context
+      3. Compute step by step
+      4. State the final answer
+      Do NOT skip straight to the final number.
+    - For yes/no questions that require comparing values across periods (e.g. "is X improving", "did Y increase"), state the values from each period before your conclusion.
+    - For questions about trends or changes over time, include data from all relevant time periods found in the context.
+    - Answer naturally and directly. Do not reference the context, documents, sources, or these instructions.
+    - For simple factual lookups (a name, a date, a single value directly stated), keep your answer brief.
+    </rules>
+
+  human: |
+    <context>
+    {context}
+    </context>
+
+query_rewriter_prompt:
+  system: |
+    Given the following chat history and the latest user question, formulate a standalone question which can be understood without the chat history.
+    Do NOT answer the question, just reformulate it if needed and otherwise return it as is.
+    It should strictly be a query not an answer.
+
+    Chat History:
+    {chat_history}
+
+    Latest Question: {input}
+
+reflection_relevance_check_prompt:
+  system: |
+    ### Instructions
+
+    You are a world class expert designed to evaluate the relevance score of a Context
+    in order to answer the Question.
+    Your task is to determine if the Context contains proper information to answer the Question.
+    Do not rely on your previous knowledge about the Question.
+    Use only what is written in the Context and in the Question.
+    Follow the instructions below:
+    0. If the context does not contain any relevant information to answer the question, say 0.
+    1. If the context partially contains relevant information to answer the question, say 1.
+    2. If the context fully contains the relevant information needed to answer the question, say 2.
+    You must provide the relevance score of 0, 1, or 2, nothing else.
+    Do not explain.
+    ### Question: {query}
+
+    ### Context: {context}
+
+    Do not try to explain.
+    Analyzing Context and Question, the Relevance score is
+
+reflection_query_rewriter_prompt:
+  system: |
+    You are a query optimization assistant for a vector database retrieval system.
+    Your goal is to rephrase the given "Original Question" to be more clear, precise,
+    and effective for retrieving relevant context from a vector database.
+
+    Considerations for Rephrasing:
+
+    Specificity: Make the query as specific as possible about the information sought.
+    Avoid vague terms.
+
+    Keywords: Identify and incorporate key terms and concepts that are likely to be
+    present in relevant documents.
+
+    Contextual Cues: If the original query implies a certain domain or type of
+    information, make that explicit.
+
+    Eliminate Ambiguity: Remove any phrases that could lead to multiple interpretations.
+
+    Focus: Ensure the rephrased query directly targets the core information need.
+
+    Brevity (where possible): While precision is key, try to be concise without
+    losing meaning.
+
+    Only output the rewritten question with no other information.
+
+    Original Question: {query}
+
+    Rewritten Question:
+
+reflection_groundedness_check_prompt:
+  system: |
+    ### Instruction
+
+    You are a world class expert designed to evaluate the groundedness of an assertion.
+    You will be provided with an assertion and a context.
+    Your task is to determine if the assertion is supported by the context.
+    Follow the instructions below:
+    A. If there is no context or no assertion or context is empty or assertion is empty, say 0.
+    B. If the assertion is not supported by the context, say 0.
+    C. If the assertion is partially supported by the context, say 1.
+    D. If the assertion is fully supported by the context, say 2.
+    You must provide a rating of 0, 1, or 2, nothing else.
+
+    ### Context:
+    <{context}>
+
+    ### Assertion:
+    <{response}>
+
+    Analyzing Context and Response, the Groundedness score is
+
+reflection_response_regeneration_prompt:
+  system: |
+    You are tasked with creating a new "Response" based solely on the provided
+    "Context" and "Query". Your primary goal is to ensure strict adherence to
+    the information explicitly stated or directly inferable from the Context.
+
+    Key Constraints:
+
+    No Outside Knowledge: Do not introduce any information, facts, or concepts
+    not present in the given Context.
+
+    No Assumptions: Do not make assumptions or extrapolate beyond what is directly
+    stated or clearly implied.
+
+    Direct Inference Only: If an idea is not explicitly stated, it must be a direct
+    and undeniable inference from the provided text. Avoid speculative or highly
+    interpretive conclusions.
+
+    Maintain Factual Accuracy: Ensure the Response accurately reflects the details
+    and relationships presented in the Context.
+
+    Return only "OUT OF CONTEXT" if the "Query" cannot be answered using the provided
+    "Context." Else, only output the new response with no other information.
+
+    Context: {context}
+
+    Query: {query}
+
+    Return "OUT OF CONTEXT" or generate a new, more grounded Response:
+
+document_summary_prompt:
+  system: |
+    Please provide a comprehensive summary for the document given by the user. Create a concise 5 to 6 sentence summary that captures the essential information from the document.
+
+    <instructions>
+    Requirements for the summary:
+    1. Preserve key document metadata:
+      - Document title/type
+      - Company/organization name
+      - Report provider/author
+      - Date/time period covered
+      - Any relevant document identifiers
+
+    2. Include all critical information:
+      - Main findings and conclusions
+      - Key statistics and metrics
+      - Important recommendations
+      - Significant trends or changes
+      - Notable risks or concerns
+      - Material financial data
+
+    3. Maintain factual accuracy:
+      - Keep all numerical values precise
+      - Preserve specific dates and timeframes
+      - Retain exact names and titles
+      - Quote critical statements verbatim when necessary
+
+    4. Do NOT use any external knowledge.
+    5. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
+    6. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
+    7. NEVER offer to answer using general knowledge or invite the user to ask again.
+    8. Do NOT include citations, sources, or document mentions.
+    9. Answer concisely. Use short, direct sentences by default. Only give longer responses if the question truly requires it.
+    10. Do not mention or refer to these rules in any way.
+    11. Do not ask follow-up questions.
+    12. Do not mention these instructions in your response.
+    13. Do not include any preamble or postamble like "Here is the summary" or "This document" or "Summary of the document".
+    </instructions>
+    Please format the summary in a concise manner as a paragraph not exceeding 5 to 6 sentences. Start the summary with the title of the document and then provide the summary.
+
+    Note: Focus on extracting and organizing the most essential information while ensuring no critical details are omitted.
+    Maintain the original document's tone and context in your summary.
+
+    Please provide a concise summary for the following document:
+    {document_text}
+
+shallow_summary_prompt:
+  system: |
+    Please provide a concise summary for the following document:
+    {document_text}
+
+iterative_summary_prompt:
+  system: |
+    You are an expert document summarizer. Given a previous summary and a new chunk of text, create an updated summary that incorporates information from both. Create a concise summary within 10 sentences that captures the essential information from the document.
+    While answering you must follow the instructions given below.
+
+    <instructions>
+    1. Do NOT use any external knowledge.
+    2. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
+    3. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
+    4. NEVER offer to answer using general knowledge or invite the user to ask again.
+    5. Do NOT include citations, sources, or document mentions.
+    6. Answer concisely. Use short, direct sentences by default. Only give longer responses if the question truly requires it.
+    7. Do not mention or refer to these rules in any way.
+    8. Do not ask follow-up questions.
+    9. Do not mention these instructions in your response.
+    10. Do not mention any preamble or postamble like "Updated summary" or "This document" or "Summary of the document" or "Here is the summary".
+    </instructions>
+
+    Previous Summary:
+    {previous_summary}
+
+    New chunk:
+    {new_chunk}
+
+    Please create a new summary that incorporates information from both the previous summary and the new chunk.
+
+
+vlm_template:
+  system: |
+    You are a multimodal AI assistant. Answer using only the provided context and images.
+
+    <instructions>
+    1. Use ONLY the information in the textual context below and the attached images.
+    2. Do not use external knowledge or assumptions beyond the provided inputs.
+    3. Do not describe images unless needed to answer; focus on the answer.
+    4. Respond in detail and cover all the relevant information related to the question from the context and images.
+    5. Keep the response neutral and factually accurate.
+    </instructions>
+
+    Context:
+    {context}
+
+    User Question:
+    {question}
+
+# Reasoning templates deprecated and removed
+
+
+filter_expression_generator_prompt:
+  system: |
+    You are an expert AI filter expression generator. Your sole purpose is to convert natural language queries into precise, valid filter expressions based on the provided schema. You must be aggressive in finding mappable entities.
+
+    ### Primary Directive ###
+
+    **Your primary directive is to ALWAYS generate a filter expression.** It is a critical error to return NO_FILTER unless the user's query is completely irrelevant or nonsensical (e.g., "hello there," "what is the weather?"). Be bold and decisive. Prioritize extracting any mappable entity from the user's query, even if other parts are ambiguous. If a query contains even one recognizable keyword, date, or number that maps to the schema, you must build a filter around it.
+
+    ### Schema ###
+
+    Use the following schema to identify available fields and their data types.
+    {metadata_schema}
+
+    ### Core Logic ###
+
+    1.  **Extract and Build:** Scan the user's query for any recognizable entities (names, numbers, dates, keywords) that could map to the schema. Build a filter using every piece of information you can extract. Ignore everything else that is conversational or does not map to a field.
+    2.  **Field Format:** The field format is always content_metadata["field_name"].
+    3.  **Operators:** Use uppercase logical operators: AND, OR, NOT. Use parentheses () to group expressions.
+
+    ### Operators & Data Types (Complete List) ###
+
+    1.  **String**: ==, !=, in, like
+        * Example: content_metadata["doc_type"] in ["report", "summary"]
+    2.  **Number**: ==, !=, >, >=, <, <=, in, between
+        * Example: content_metadata["page_count"] > 10
+    3.  **Datetime** (Format: YYYY-MM-DDTHH:MM:SS): ==, !=, >, >=, <, <=
+        * Example: content_metadata["created_at"] >= "2024-01-01T00:00:00"
+    4.  **Boolean**: ==, !=
+        * Example: content_metadata["is_public"] == true
+    5.  **Array**: array_contains, array_contains_any, array_contains_all, array_length
+        * Single value: array_contains(content_metadata["category"], "AI")
+        * Multiple values (any): array_contains_any(content_metadata["regions"], ["EMEA", "APAC"])
+        * Multiple values (all): array_contains_all(content_metadata["tags"], ["urgent", "review"])
+
+    ### Intelligent Mapping Examples ###
+
+    * **Query:** "Project X"
+        * **Action:** Recognizes "Project X" as a single mappable entity and builds a filter.
+        * **Output:** content_metadata["project"] == "Project X"
+    * **Query:** "approved"
+        * **Action:** Recognizes "approved" as a status and builds a filter just for that.
+        * **Output:** content_metadata["status"] == "approved"
+    * **Query:** "Find the latest financial reports for Project X"
+        * **Action:** Ignore "latest" as it's subjective. Extract "financial reports" and "Project X".
+        * **Output:** (content_metadata["doc_type"] == "financial_report" AND content_metadata["project"] == "Project X")
+    * **Query:** "I think I need the document from Q2 last year about compliance"
+        * **Action:** Ignore "I think I need". Extract "Q2 last year" (2024) and "compliance".
+        * **Output:** (content_metadata["created_at"] >= "2024-04-01T00:00:00" AND content_metadata["created_at"] < "2024-07-01T00:00:00" AND array_contains(content_metadata["tags"], "compliance"))
+
+    ### Your Task ###
+
+    Convert the following user query into a filter expression.
+    {user_request}
+
+    ### Response Format ###
+
+    Your response **MUST** be only the raw filter expression string and nothing else. Do not use explanations, comments, or markdown.
+
+    1.  **On Success:** The filter expression string.
+        * content_metadata["year"] == 2024
+
+    2.  **On Absolute Failure:** The exact text NO_FILTER.
+        * **Use this ONLY if the query is completely unrelated to the schema**, like "what is your name?" or "tell me a joke".
+
+    3.  **On Logical Conflict:** The exact text UNSUPPORTED.
+        * **Use this ONLY for impossible logic**, like "year is 2022 and year is 2023".
+
+query_decomposition_multiquery_prompt:
+  system: |
+    You are an AI assistant designed to break down a user's complex question into a list of simpler, focused subqueries.
+    The purpose of this decomposition is to improve the accuracy of a retrieval-augmented generation (RAG) system.
+
+    <instructions>
+    1. Analyze the user's main question to identify its key components.
+    2. Decompose the question into 1-3 distinct, self-contained subqueries.
+    3. If the original question is simple and already focused, return the query directly.
+    4. Each subquery should be a clear, direct question that, when answered, contributes to a comprehensive response to the original question.
+    5. Avoid creating redundant or overly broad subqueries. Focus on the core information needed to answer the original prompt.
+    </instructions>
+
+    Return only the subqueries as a numbered list, without any additional text.
+    Original question: {question}
+
+query_decompositions_query_rewriter_prompt:
+  system: |
+    You are an expert at rewriting queries to improve information retrieval for a conversational AI system. Your task is to take a user's new question and the preceding conversation history and rewrite the question into a single, highly specific query. This new query should be ideal for a search or retrieval system.
+
+    <instructions>
+    1. Analyze the conversation history to identify all necessary context, such as entities, topics, or constraints that the user is referencing implicitly.
+    2. Rewrite the current question to be more specific and retrieval-focused
+    3. Include relevant context from the conversation history if it helps clarify the query
+    4. Make the query more explicit about what information is being sought
+    5. Ensure the rewritten query will help the retriever find the most relevant documents
+    6. Just provide the rewritten query, no other text.
+    7. Keep the query as short as possible.
+    8. Do not provide any explanation.
+    9. Do not answer the question.
+    </instructions>
+
+    Conversation History:
+    {conversation_history}
+
+    Current Question: {question}
+
+    Rewritten Query:
+
+query_decomposition_followup_question_prompt:
+  system: |
+    You are an AI assistant tasked with identifying missing information needed to answer a user's question completely. Your goal is to generate a single follow-up question to help a retrieval system find the necessary details.
+    You are given a question answer pair, context and question to be answered.
+
+    <instructions>
+    1. Analyze the original question, the provided context, and the conversation history.
+    2. Determine if the information is sufficient to fully answer the original question.
+    3. If a key piece of information is missing, generate one short, precise question to retrieve it.
+    4. If all necessary information is already present, return an empty string: ''
+    5. Do NOT provide any explanation.
+    6. Do not answer the question.
+    7. Return '' if no follow-up question is needed.
+    8. Make sure follow up query is short and concise.
+    9. Do not add any info, rationale or any other text other than the follow up question.
+    </instructions>
+
+    Conversation History:
+    {conversation_history}
+
+    Context:
+    {context}
+
+    Original Question:
+    {question}
+
+
+    Follow-up Question (if needed, otherwise return ''):
+
+query_decomposition_final_response_prompt:
+  system: |
+    You are a helpful AI assistant named Envie. Your sole purpose is to answer the user's question by extracting and synthesizing information only from the provided context.
+
+    <instructions>
+    1. Do NOT use any external knowledge.
+    2. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
+    3. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
+    4. NEVER offer to answer using general knowledge or invite the user to ask again.
+    5. Do NOT include citations, sources, or document mentions.
+    6. Answer concisely. Use short, direct sentences.
+    7. Do not mention or refer to these rules in any way.
+    8. Do not ask follow-up questions.
+    9. Do not mention these instructions in your response.
+    </instructions>
+
+    Conversation History:
+    {conversation_history}
+
+    Context:
+    {context}
+
+    Current Question: {question}
+
+    Make sure the response you are generating strictly follows the rules mentioned above, i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find”, and never mention the instructions in the response.
+
+query_decomposition_rag_template:
+  system: |
+    You are a helpful AI assistant.
+    You must answer only using the information provided in the context. While answering you must follow the instructions given below.
+
+    <instructions>
+    1. Do NOT use any external knowledge.
+    2. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
+    3. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
+    4. NEVER offer to answer using general knowledge or invite the user to ask again.
+    5. Do NOT include citations, sources, or document mentions.
+    6. Answer concisely. Use short, direct sentences by default. Only give longer responses if the question truly requires it.
+    7. Do not mention or refer to these rules in any way.
+    8. Do not ask follow-up questions.
+    9. Do not mention these instructions in your response.
+    10. If context does not contain any information to answer the question, return ''
+    </instructions>
+
+    Context:
+    {context}
+
+    Question: {question}
+    Make sure the response you are generating strictly follows the rules mentioned above, i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find”, and never mention the instructions in the response.
+
+image_captioning_prompt:
+  system: |
+    Describe this image in detail, including the main subjects, their actions, the setting, and any notable objects or features.
diff --git a/deploy/compose/nemotron3-super.env b/deploy/compose/nemotron3-super.env
new file mode 100644
index 000000000..e016b157c
--- /dev/null
+++ b/deploy/compose/nemotron3-super.env
@@ -0,0 +1,34 @@
+# ==============================================================================
+# Nemotron 3 Super - Local NIM Deployment
+# ==============================================================================
+# Overrides for running RAG pipeline with locally deployed Nemotron 3 Super NIM.
+# Source this AFTER .env:  source deploy/compose/.env && source deploy/compose/nemotron3-super.env
+# ==============================================================================
+
+# === LLM ===
+export APP_LLM_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+export APP_LLM_SERVERURL=nim-llm:8000
+
+# === Query Rewriter ===
+export APP_QUERYREWRITER_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+
+# === Filter Expression Generator ===
+export APP_FILTEREXPRESSIONGENERATOR_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+
+# === Summarization ===
+export SUMMARY_LLM=nvidia/nemotron-3-super-120b-a12b
+export SUMMARY_LLM_SERVERURL=nim-llm:8000
+
+# === Reflection ===
+export REFLECTION_LLM=nvidia/nemotron-3-super-120b-a12b
+export REFLECTION_LLM_SERVERURL=nim-llm:8000
+
+# === Reasoning / Thinking ===
+export LLM_ENABLE_THINKING=true
+export LLM_REASONING_BUDGET=256
+export LLM_LOW_EFFORT=true
+export FILTER_THINK_TOKENS=true
+
+# === LLM_MAX_TOKENS (for RTX 6000 Pro when using NIM_MAX_MODEL_LEN=32768) ===
+# Uncomment and set: 16256
+# export LLM_MAX_TOKENS=16256
diff --git a/deploy/compose/nims.yaml b/deploy/compose/nims.yaml
index f376d9a64..2bca3dce2 100644
--- a/deploy/compose/nims.yaml
+++ b/deploy/compose/nims.yaml
@@ -31,9 +31,9 @@ services:
       retries: 100
     profiles: ["", "rag"]
 
-  nemoretriever-embedding-ms:
-    container_name: nemoretriever-embedding-ms
-    image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.10.1
+  nemotron-embedding-ms:
+    container_name: nemotron-embedding-ms
+    image: nvcr.io/nim/nvidia/llama-nemotron-embed-1b-v2:1.13.0
     volumes:
     - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache
     ports:
@@ -91,9 +91,9 @@ services:
       start_period: 10m
     profiles: ["vlm-embed", "vlm-ingest"]
 
-  nemoretriever-ranking-ms:
-    container_name: nemoretriever-ranking-ms
-    image: nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:1.8.0
+  nemotron-ranking-ms:
+    container_name: nemotron-ranking-ms
+    image: nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2:1.10.0
     volumes:
     - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache
     ports:
@@ -108,6 +108,7 @@ services:
       interval: 10s
       timeout: 20s
       retries: 100
+    shm_size: 16gb
     deploy:
       resources:
         reservations:
@@ -119,7 +120,7 @@ services:
     profiles: ["", "rag", "vlm-generation"]
 
   page-elements:
-    image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-page-elements-v3}:${YOLOX_TAG:-1.7.0}
+    image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemotron-page-elements-v3}:${YOLOX_TAG:-1.8.0}
     shm_size: 16gb
     ports:
       - "8000:8000"
@@ -157,7 +158,7 @@ services:
     profiles: ["", "ingest", "vlm-ingest"]
 
   graphic-elements:
-    image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.6.0}
+    image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemotron-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.8.0}
     shm_size: 16gb
     ports:
       - "8003:8000"
@@ -183,7 +184,7 @@ services:
     profiles: ["", "ingest", "vlm-ingest"]
 
   table-structure:
-    image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.6.0}
+    image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemotron-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.8.0}
     shm_size: 16gb
     ports:
       - "8006:8000"
@@ -323,6 +324,7 @@ services:
       interval: 10s
       timeout: 20s
       retries: 100
+    shm_size: 16gb
     deploy:
       resources:
         reservations:
diff --git a/deploy/compose/nvdev.env b/deploy/compose/nvdev.env
index b92e4500b..d5a919153 100644
--- a/deploy/compose/nvdev.env
+++ b/deploy/compose/nvdev.env
@@ -20,24 +20,24 @@ export APP_LLM_MODELNAME=nvidia/llama-3.3-nemotron-super-49b-v1.5
 # export APP_LLM_MODELNAME=nvidia/nemotron-3-nano-30b-a3b
 # Note: For locally deployed nemotron-3-nano, use: nvidia/nemotron-3-nano
 export APP_FILTEREXPRESSIONGENERATOR_MODELNAME=nvidia/llama-3.3-nemotron-super-49b-v1.5
-export APP_EMBEDDINGS_MODELNAME=nvdev/nvidia/llama-3.2-nv-embedqa-1b-v2
+export APP_EMBEDDINGS_MODELNAME=nvidia/llama-nemotron-embed-1b-v2
 # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
 # export APP_EMBEDDINGS_MODELNAME=nvdev/nvidia/llama-nemotron-embed-vl-1b-v2
-export APP_RANKING_MODELNAME=nvidia/llama-3.2-nv-rerankqa-1b-v2
+export APP_RANKING_MODELNAME=nvidia/llama-nemotron-rerank-1b-v2
 export ENABLE_RERANKER=True
 export APP_EMBEDDINGS_SERVERURL=https://integrate.api.nvidia.com/v1
 export APP_LLM_SERVERURL=""
 export APP_FILTEREXPRESSIONGENERATOR_SERVERURL=""
 export APP_RANKING_SERVERURL=""
-# export APP_RANKING_SERVERURL=https://ai.api.nvidia.com/v1/nvdev/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking/v1
+# export APP_RANKING_SERVERURL=https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-nemotron-rerank-1b-v2/reranking
 export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
 export OCR_INFER_PROTOCOL=http
 export OCR_MODEL_NAME=scene_text_ensemble
-export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3
+export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
 export YOLOX_INFER_PROTOCOL=http
-export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvdev/nvidia/nemoretriever-graphic-elements-v1
+export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
 export YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
-export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvdev/nvidia/nemoretriever-table-structure-v1
+export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
 export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
 export SUMMARY_LLM="nvidia/llama-3.3-nemotron-super-49b-v1.5"
 export SUMMARY_LLM_SERVERURL=""
diff --git a/deploy/helm/mig-slicing/mig-config.yaml b/deploy/helm/mig-slicing/mig-config-h100.yaml
similarity index 100%
rename from deploy/helm/mig-slicing/mig-config.yaml
rename to deploy/helm/mig-slicing/mig-config-h100.yaml
diff --git a/deploy/helm/mig-slicing/mig-config-rtx6000.yaml b/deploy/helm/mig-slicing/mig-config-rtx6000.yaml
new file mode 100644
index 000000000..14272b497
--- /dev/null
+++ b/deploy/helm/mig-slicing/mig-config-rtx6000.yaml
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: custom-mig-config
+data:
+  config.yaml: |
+    version: v1
+    mig-configs:
+      all-disabled:
+        - devices: all
+          mig-enabled: false
+      # Slice totals (6x 1g.24gb, 1x 2g.48gb, 1x 4g.96gb) match the requests in values-mig-rtx6000.yaml.
+      custom-rtx6000-4x1g24-2x1g24-1x2g48-1x4g96:
+        - devices: [0]
+          mig-enabled: true
+          mig-devices:
+            "1g.24gb": 4
+        - devices: [1]
+          mig-enabled: true
+          mig-devices:
+            "1g.24gb": 2
+            "2g.48gb": 1
+        - devices: [2]
+          mig-enabled: true
+          mig-devices:
+            "4g.96gb": 1
diff --git a/deploy/helm/mig-slicing/values-mig.yaml b/deploy/helm/mig-slicing/values-mig-h100.yaml
similarity index 100%
rename from deploy/helm/mig-slicing/values-mig.yaml
rename to deploy/helm/mig-slicing/values-mig-h100.yaml
diff --git a/deploy/helm/mig-slicing/values-mig-rtx6000.yaml b/deploy/helm/mig-slicing/values-mig-rtx6000.yaml
new file mode 100644
index 000000000..e7ae285da
--- /dev/null
+++ b/deploy/helm/mig-slicing/values-mig-rtx6000.yaml
@@ -0,0 +1,114 @@
+# MIG-optimized resource configuration for RAG Blueprint on RTX 6000 GPUs (pairs with mig-config-rtx6000.yaml)
+# This file only overrides GPU resource requirements to use MIG slices
+
+# NV-Ingest configuration
+nv-ingest:
+  # Milvus - uses 1g.24gb MIG slice
+  milvus:
+    standalone:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"  # zero whole GPUs; the MIG slice below is the GPU resource
+          nvidia.com/mig-1g.24gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+
+  # NV-Ingest NIM Operator overrides
+  nimOperator:
+    # Page Elements - uses 1g.24gb
+    page_elements:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+      storage:
+        pvc:
+          storageClass: ""  # NOTE(review): empty string; confirm how the operator interprets it (cluster default vs. no class)
+
+    # Graphic Elements - uses 1g.24gb
+    graphic_elements:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+      storage:
+        pvc:
+          storageClass: ""
+
+    # Table Structure - uses 1g.24gb
+    table_structure:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+      storage:
+        pvc:
+          storageClass: ""
+
+    # OCR - uses 2g.48gb (larger slice)
+    nemoretriever_ocr_v1:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-2g.48gb: 1  # the only 2g.48gb slice defined in mig-config-rtx6000.yaml
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-2g.48gb: 1
+      storage:
+        pvc:
+          storageClass: ""
+# Main NIM Operator overrides for MIG
+nimOperator:
+  # LLM - uses 4g.96gb
+  nim-llm:
+    resources:
+      limits:
+        nvidia.com/gpu: "0"  # zero whole GPUs; the MIG slice below is the GPU resource
+        nvidia.com/mig-4g.96gb: 1
+      requests:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-4g.96gb: 1
+    storage:  # sibling of resources (same layout as values.yaml), not nested inside it
+      pvc:
+        storageClass: ""
+    model:
+      engine: tensorrt_llm
+      precision: "fp8"
+      qosProfile: "throughput"
+      tensorParallelism: "1"  # single 4g.96gb slice, so no tensor parallelism
+      gpus:
+        - product: "rtx6000_blackwell_sv"
+  # Embedding - uses 1g.24gb
+  nvidia-nim-llama-32-nv-embedqa-1b-v2:
+    resources:
+      limits:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-1g.24gb: 1
+      requests:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-1g.24gb: 1
+    storage:  # sibling of resources (same layout as values.yaml), not nested inside it
+      pvc:
+        storageClass: ""
+  # Reranking - uses 1g.24gb
+  nvidia-nim-llama-32-nv-rerankqa-1b-v2:
+    resources:
+      limits:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-1g.24gb: 1
+      requests:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-1g.24gb: 1
+    storage:  # sibling of resources (same layout as values.yaml), not nested inside it
+      pvc:
+        storageClass: ""
diff --git a/deploy/helm/nvidia-blueprint-rag/Chart.lock b/deploy/helm/nvidia-blueprint-rag/Chart.lock
index 7b479e4aa..723660bfd 100644
--- a/deploy/helm/nvidia-blueprint-rag/Chart.lock
+++ b/deploy/helm/nvidia-blueprint-rag/Chart.lock
@@ -1,7 +1,7 @@
 dependencies:
 - name: nv-ingest
   repository: https://helm.ngc.nvidia.com/nvidia/nemo-microservices
-  version: 26.1.1
+  version: 26.1.2
 - name: eck-elasticsearch
   repository: https://helm.elastic.co
   version: 0.18.0
@@ -14,5 +14,5 @@ dependencies:
 - name: kube-prometheus-stack
   repository: https://prometheus-community.github.io/helm-charts
   version: 76.3.0
-digest: sha256:7f85073bdf19922173b3372d9b5a877d6c2f783b431ce7a2f783308f67806c66
-generated: "2026-02-04T07:29:44.453434343Z"
+digest: sha256:a65037bbcb6fa587af3d15b949a32b059cf26d1102a2166d0e77daed29a0f520
+generated: "2026-03-02T16:48:31.702049307+05:30"
diff --git a/deploy/helm/nvidia-blueprint-rag/Chart.yaml b/deploy/helm/nvidia-blueprint-rag/Chart.yaml
index a8a459279..afe7a5cbd 100644
--- a/deploy/helm/nvidia-blueprint-rag/Chart.yaml
+++ b/deploy/helm/nvidia-blueprint-rag/Chart.yaml
@@ -1,10 +1,10 @@
 apiVersion: v2
-appVersion: v2.4.0
+appVersion: v2.5.0
 dependencies:
 - condition: nv-ingest.enabled
   name: nv-ingest
   repository: https://helm.ngc.nvidia.com/nvidia/nemo-microservices
-  version: 26.1.1
+  version: 26.1.2
 - condition: eck-elasticsearch.enabled
   name: eck-elasticsearch
   repository: https://helm.elastic.co
@@ -24,4 +24,4 @@ dependencies:
 description: An end to end Helm chart for the NVIDIA RAG Blueprint
 name: nvidia-blueprint-rag
 type: application
-version: v2.4.0
+version: v2.5.0
diff --git a/deploy/helm/nvidia-blueprint-rag/endpoints.md b/deploy/helm/nvidia-blueprint-rag/endpoints.md
index 7609053d5..e62b0d1fb 100644
--- a/deploy/helm/nvidia-blueprint-rag/endpoints.md
+++ b/deploy/helm/nvidia-blueprint-rag/endpoints.md
@@ -24,11 +24,11 @@ This document describes the configurable endpoints used by the RAG server and it
 
 ### Embedding Model
 - **APP_EMBEDDINGS_SERVERURL**: URL for the embedding model service (default: "nemo-retriever-embedding-ms:8000")
-- **APP_EMBEDDINGS_MODELNAME**: Name of the embedding model (default: "nvidia/llama-3.2-nv-embedqa-1b-v2")
+- **APP_EMBEDDINGS_MODELNAME**: Name of the embedding model (default: "nvidia/llama-nemotron-embed-1b-v2")
 
 ### Reranking Model
 - **APP_RANKING_SERVERURL**: URL for the ranking model service (default: "nemo-retriever-reranking-ms:8000")
-- **APP_RANKING_MODELNAME**: Name of the ranking model (default: "nvidia/llama-3.2-nv-rerankqa-1b-v2")
+- **APP_RANKING_MODELNAME**: Name of the ranking model (default: "nvidia/llama-nemotron-rerank-1b-v2")
 
 ### Reflection Model
 - **REFLECTION_LLM_SERVERURL**: URL for the reflection LLM service (default: "nim-llm:8000")
@@ -42,8 +42,8 @@ This document describes the configurable endpoints used by the RAG server and it
 
 ### Model Configuration
 - **NEXT_PUBLIC_MODEL_NAME**: Name of the LLM model used in the frontend (default: "nvidia/llama-3.3-nemotron-super-49b-v1.5")
-- **VITE_EMBEDDING_MODEL**: Name of the embedding model used in the frontend (default: "nvidia/llama-3.2-nv-embedqa-1b-v2")
-- **VITE_RERANKER_MODEL**: Name of the reranker model used in the frontend (default: "nvidia/llama-3.2-nv-rerankqa-1b-v2")
+- **VITE_EMBEDDING_MODEL**: Name of the embedding model used in the frontend (default: "nvidia/llama-nemotron-embed-1b-v2")
+- **VITE_RERANKER_MODEL**: Name of the reranker model used in the frontend (default: "nvidia/llama-nemotron-rerank-1b-v2")
 
 ## Monitoring and Tracing Endpoints
 
diff --git a/deploy/helm/nvidia-blueprint-rag/files/prompt.yaml b/deploy/helm/nvidia-blueprint-rag/files/prompt.yaml
index f82c83655..d73036509 100644
--- a/deploy/helm/nvidia-blueprint-rag/files/prompt.yaml
+++ b/deploy/helm/nvidia-blueprint-rag/files/prompt.yaml
@@ -487,6 +487,7 @@ query_decomposition_rag_template:
     Context:
     {context}
 
+    Question: {question}
     Make sure the response you are generating strictly follow the rules mentioned above i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find” and mention about the instruction in response.
 
 image_captioning_prompt:
diff --git a/deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml b/deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
new file mode 100644
index 000000000..d042a6c44
--- /dev/null
+++ b/deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
@@ -0,0 +1,25 @@
+# Override values for Nemotron 3 Super on RTX 6000 Pro only.
+# Use after nemotron3-super-values.yaml:
+#   -f deploy/helm/nvidia-blueprint-rag/values.yaml \
+#   -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml \
+#   -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
+# See docs/nemotron3-super-deployment.md. Requires host GRUB/reboot for RTX 6000 Pro.
+
+envVars:
+  LLM_MAX_TOKENS: "16256"  # use "1024" for non-reasoning mode
+
+nimOperator:
+  nim-llm:
+    env:  # Helm replaces lists wholesale across -f files, so the shared entries are repeated here
+      - name: NIM_HTTP_API_PORT
+        value: "8000"
+      - name: NIM_TRITON_LOG_VERBOSE
+        value: "1"
+      - name: NIM_SERVED_MODEL_NAME
+        value: "nvidia/nemotron-3-super-120b-a12b"
+      - name: NIM_MAX_MODEL_LEN
+        value: "32768"  # lower than the "131072" set in nemotron3-super-values.yaml
+      - name: NCCL_P2P_DISABLE
+        value: "1"
+      - name: NIM_KVCACHE_PERCENT
+        value: "0.9"  # NOTE(review): presumably the fraction of GPU memory given to the KV cache - confirm
diff --git a/deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml b/deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml
new file mode 100644
index 000000000..710fff1fe
--- /dev/null
+++ b/deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml
@@ -0,0 +1,39 @@
+# Override values for Nemotron 3 Super LLM NIM (all hardware).
+# Use with: -f deploy/helm/nvidia-blueprint-rag/values.yaml -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml
+# For RTX 6000 Pro, add: -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
+# See docs/nemotron3-super-deployment.md.
+
+envVars:
+  # Every LLM role below is pointed at the same served model name (see NIM_SERVED_MODEL_NAME).
+  APP_LLM_MODELNAME: "nvidia/nemotron-3-super-120b-a12b"
+  APP_QUERYREWRITER_MODELNAME: "nvidia/nemotron-3-super-120b-a12b"
+  APP_FILTEREXPRESSIONGENERATOR_MODELNAME: "nvidia/nemotron-3-super-120b-a12b"
+  REFLECTION_LLM: "nvidia/nemotron-3-super-120b-a12b"
+
+ingestor-server:
+  envVars:
+    SUMMARY_LLM: "nvidia/nemotron-3-super-120b-a12b"
+
+nimOperator:
+  nim-llm:
+    image:
+      repository: nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b
+      pullPolicy: IfNotPresent
+      tag: "1.8.0"
+    resources:
+      limits:
+        nvidia.com/gpu: 2  # same count as tensorParallelism: "2" below
+      requests:
+        nvidia.com/gpu: 2
+    model:
+      engine: vllm
+      precision: "fp8"
+      tensorParallelism: "2"
+    env:  # Helm replaces lists wholesale across -f files, so this is the complete env list
+      - name: NIM_HTTP_API_PORT
+        value: "8000"
+      - name: NIM_TRITON_LOG_VERBOSE
+        value: "1"
+      - name: NIM_SERVED_MODEL_NAME
+        value: "nvidia/nemotron-3-super-120b-a12b"
+      - name: NIM_MAX_MODEL_LEN
+        value: "131072"  # overridden to "32768" by nemotron3-super-rtx6000-values.yaml
diff --git a/deploy/helm/nvidia-blueprint-rag/templates/llm-nim.yaml b/deploy/helm/nvidia-blueprint-rag/templates/llm-nim.yaml
index 60f043973..f103a1a72 100644
--- a/deploy/helm/nvidia-blueprint-rag/templates/llm-nim.yaml
+++ b/deploy/helm/nvidia-blueprint-rag/templates/llm-nim.yaml
@@ -61,4 +61,8 @@ spec:
   {{- end }}
   expose:
 {{ toYaml $nimLlm.expose | nindent 4 }}
+  {{- with $nimLlm.startupProbe }}
+  startupProbe:
+{{ toYaml . | nindent 4 }}
+  {{- end }}
 {{- end }}
\ No newline at end of file
diff --git a/deploy/helm/nvidia-blueprint-rag/values.yaml b/deploy/helm/nvidia-blueprint-rag/values.yaml
index 47ef09b68..00e6914b0 100644
--- a/deploy/helm/nvidia-blueprint-rag/values.yaml
+++ b/deploy/helm/nvidia-blueprint-rag/values.yaml
@@ -56,8 +56,8 @@ apiKeysSecret:
 
 # -- RAG server container image
 image:
-  repository: nvcr.io/nvstaging/blueprint/rag-server
-  tag: "2.4.0"
+  repository: nvcr.io/nvidia/blueprint/rag-server
+  tag: "2.5.0"
   pullPolicy: Always
 
 # -- RAG server service configuration
@@ -160,15 +160,11 @@ envVars:
   # URL on which LLM model is hosted. If "", Nvidia hosted API is used
   APP_LLM_SERVERURL: "nim-llm:8000"
   # LLM model parameters
-  LLM_MAX_TOKENS: "32768"
+  # For Nemotron 3 Super on RTX 6000 Pro: change the LLM_MAX_TOKENS value below to "16256" (reasoning) or "1024" (non-reasoning)
+  LLM_MAX_TOKENS: "32768" # "16256"
   LLM_TEMPERATURE: "0"
   LLM_TOP_P: "1.0"
 
-  # Enable/disable thinking/reasoning for nemotron-3-nano models (30b variant)
-  # Set to "true" to enable reasoning mode with reasoning_budget
-  # Set to "false" to disable reasoning and get direct answers
-  ENABLE_NEMOTRON_3_NANO_THINKING: "true"
-
   ##===Query Rewriter Model specific configurations===
   APP_QUERYREWRITER_MODELNAME: "nvidia/llama-3.3-nemotron-super-49b-v1.5"
   # URL on which query rewriter model is hosted. If "", Nvidia hosted API is used
@@ -183,14 +179,14 @@ envVars:
 
   ##===Embedding Model specific configurations===
   # URL on which embedding model is hosted. If "", Nvidia hosted API is used
-  APP_EMBEDDINGS_SERVERURL: "nemoretriever-embedding-ms:8000/v1"
-  APP_EMBEDDINGS_MODELNAME: "nvidia/llama-3.2-nv-embedqa-1b-v2"
+  APP_EMBEDDINGS_SERVERURL: "nemotron-embedding-ms:8000/v1"
+  APP_EMBEDDINGS_MODELNAME: "nvidia/llama-nemotron-embed-1b-v2"
   APP_EMBEDDINGS_DIMENSIONS: "2048"
 
   ##===Reranking Model specific configurations===
   # URL on which ranking model is hosted. If "", Nvidia hosted API is used
-  APP_RANKING_SERVERURL: "nemoretriever-ranking-ms:8000"
-  APP_RANKING_MODELNAME: "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+  APP_RANKING_SERVERURL: "nemotron-ranking-ms:8000"
+  APP_RANKING_MODELNAME: "nvidia/llama-nemotron-rerank-1b-v2"
   ENABLE_RERANKER: "True"
   # Default score threshold for filtering documents by reranker relevance (0.0 to 1.0)
   RERANKER_SCORE_THRESHOLD: "0.0"
@@ -260,6 +256,11 @@ envVars:
   # Whether to filter content within <think></think> tags in model responses
   FILTER_THINK_TOKENS: "true"
 
+  # Reasoning configuration (supported by Nemotron 3 and other reasoning models)
+  LLM_ENABLE_THINKING: "false"
+  LLM_REASONING_BUDGET: "0"
+  LLM_LOW_EFFORT: "false"
+
   NEMO_GUARDRAILS_URL: "nemo-guardrails:7331"
 
   # enable iterative query decomposition
@@ -289,8 +290,8 @@ ingestor-server:
     password: ""
 
   image:
-    repository: nvcr.io/nvstaging/blueprint/ingestor-server
-    tag: "2.4.0"
+    repository: nvcr.io/nvidia/blueprint/ingestor-server
+    tag: "2.5.0"
     pullPolicy: Always
 
   # -- Service config for ingestor-server
@@ -349,8 +350,8 @@ ingestor-server:
     ## APP_EMBEDDINGS_APIKEY and SUMMARY_LLM_APIKEY are loaded from secrets automatically.
 
     # === Embeddings Configurations ===
-    APP_EMBEDDINGS_SERVERURL: "nemoretriever-embedding-ms:8000/v1"
-    APP_EMBEDDINGS_MODELNAME: "nvidia/llama-3.2-nv-embedqa-1b-v2"
+    APP_EMBEDDINGS_SERVERURL: "nemotron-embedding-ms:8000/v1"
+    APP_EMBEDDINGS_MODELNAME: "nvidia/llama-nemotron-embed-1b-v2"
     APP_EMBEDDINGS_DIMENSIONS: "2048"
 
     # === NV-Ingest Configurations ===
@@ -359,6 +360,7 @@ ingestor-server:
 
     # === NV-Ingest extraction configurations ===
     APP_NVINGEST_PDFEXTRACTMETHOD: "None"  # Method used for text extraction from "None", "pdfium", "nemotron_parse"
+    APP_NVINGEST_EXTRACTTABLESMETHOD: "yolox"  # Method for table extraction: "yolox", "nemotron_parse", or None
     APP_NVINGEST_EXTRACTTEXT: "True"  # Enable text extraction
     APP_NVINGEST_EXTRACTINFOGRAPHICS: "False"  # Enable infographic extraction
     APP_NVINGEST_EXTRACTTABLES: "True"  # Enable table extraction
@@ -452,9 +454,9 @@ frontend:
   replicaCount: 1
 
   image:
-    repository: nvcr.io/nvstaging/blueprint/rag-frontend
+    repository: nvcr.io/nvidia/blueprint/rag-frontend
     pullPolicy: IfNotPresent
-    tag: "2.4.0"
+    tag: "2.5.0"
 
   imagePullSecret:
     name: "ngc-secret"
@@ -657,11 +659,22 @@ nimOperator:
       repository: nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5
       pullPolicy: IfNotPresent
       tag: "1.14.0"
+# -- For Nemotron 3 Super: uncomment the block below and comment the image block above
+#    image:
+#      repository: nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b
+#      pullPolicy: IfNotPresent
+#      tag: "1.8.0"
     resources:
       limits:
         nvidia.com/gpu: 1
       requests:
         nvidia.com/gpu: 1
+# -- For Nemotron 3 Super (all hardware): uncomment the block below and comment the resources block above
+#    resources:
+#      limits:
+#        nvidia.com/gpu: 2
+#      requests:
+#        nvidia.com/gpu: 2
     nodeSelector: {}
     tolerations: []
     model:
@@ -672,6 +685,10 @@ nimOperator:
 #      tensorParallelism: "1"
 #      gpus:
 #        - product: "rtx6000_blackwell_sv"
+# -- For Nemotron 3 Super (all hardware): comment "engine: tensorrt_llm" above and uncomment the three lines below
+#      engine: vllm
+#      precision: "fp8"
+#      tensorParallelism: "2"
     storage:
       pvc:
         create: true
@@ -702,6 +719,15 @@ nimOperator:
         value: "1"
       - name: NIM_SERVED_MODEL_NAME
         value: "nvidia/llama-3.3-nemotron-super-49b-v1.5"
+      - name: NIM_MAX_MODEL_LEN
+        value: "131072"
+# -- For Nemotron 3 Super on RTX 6000 Pro: comment the NIM_MAX_MODEL_LEN entry above and uncomment the block below
+#      - name: NIM_MAX_MODEL_LEN
+#        value: "32768"
+#      - name: NCCL_P2P_DISABLE
+#        value: "1"
+#      - name: NIM_KVCACHE_PERCENT
+#        value: "0.9"
 #      - name: CUDA_VISIBLE_DEVICES
 #        value: "0"
     expose:
@@ -710,16 +736,27 @@ nimOperator:
         type: ClusterIP
         port: 8000
         grpcPort: 8001
+    startupProbe:
+      enabled: true
+      probe:
+        httpGet:
+          path: /v1/health/ready
+          port: 8000
+        initialDelaySeconds: 60
+        periodSeconds: 10
+        failureThreshold: 750
+        timeoutSeconds: 5
+
 # subsection: nvidia-nim-llama-32-nv-embedqa-1b-v2
 # NIM Text Embedding
   nvidia-nim-llama-32-nv-embedqa-1b-v2:
     enabled: true
     replicas: 1
     service:
-      name: "nemoretriever-embedding-ms"
+      name: "nemotron-embedding-ms"
     image:
-      repository: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2
-      tag: "1.10.1"
+      repository: nvcr.io/nim/nvidia/llama-nemotron-embed-1b-v2
+      tag: "1.13.0"
       pullPolicy: IfNotPresent
     resources:
       limits:
@@ -795,10 +832,10 @@ nimOperator:
     enabled: true
     replicas: 1
     service:
-      name: "nemoretriever-ranking-ms"
+      name: "nemotron-ranking-ms"
     image:
-      repository: nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2
-      tag: "1.8.0"
+      repository: nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2
+      tag: "1.10.0"
       pullPolicy: IfNotPresent
     resources:
       limits:
@@ -870,7 +907,7 @@ nv-ingest:
     create: false
   image:
     repository: "nvcr.io/nvidia/nemo-microservices/nv-ingest"
-    tag: "26.1.1"
+    tag: "26.1.2"
   resources:
     limits:
       nvidia.com/gpu: 0
@@ -896,8 +933,8 @@ nv-ingest:
     RAY_num_server_call_thread: "1"
     RAY_worker_num_grpc_internal_threads: "1"
     
-    EMBEDDING_NIM_ENDPOINT: "http://nemoretriever-embedding-ms:8000/v1"
-    EMBEDDING_NIM_MODEL_NAME: "nvidia/llama-3.2-nv-embedqa-1b-v2"
+    EMBEDDING_NIM_ENDPOINT: "http://nemotron-embedding-ms:8000/v1"
+    EMBEDDING_NIM_MODEL_NAME: "nvidia/llama-nemotron-embed-1b-v2"
     MESSAGE_CLIENT_HOST: "rag-redis-master"
     MESSAGE_CLIENT_PORT: 6379
     MESSAGE_CLIENT_TYPE: "redis"
@@ -1015,7 +1052,7 @@ nv-ingest:
       replicaCount: 1
       image:
         repository: nvcr.io/nim/nvidia/nemoretriever-ocr-v1
-        tag: "1.2.0"
+        tag: "1.2.1"
       imagePullSecrets:
         - name: ngc-secret
       env:
@@ -1049,8 +1086,8 @@ nv-ingest:
       tolerations: []
       replicaCount: 1
       image:
-        repository: nvcr.io/nim/nvidia/nemoretriever-graphic-elements-v1
-        tag: "1.6.0"
+        repository: nvcr.io/nim/nvidia/nemotron-graphic-elements-v1
+        tag: "1.8.0"
       env:
         - name: NIM_HTTP_API_PORT
           value: "8000"
@@ -1082,8 +1119,8 @@ nv-ingest:
       tolerations: []
       replicaCount: 1
       image:
-        repository: nvcr.io/nim/nvidia/nemoretriever-page-elements-v3
-        tag: "1.7.0"
+        repository: nvcr.io/nim/nvidia/nemotron-page-elements-v3
+        tag: "1.8.0"
       env:
         - name: NIM_HTTP_API_PORT
           value: "8000"
@@ -1133,8 +1170,8 @@ nv-ingest:
       tolerations: []
       replicaCount: 1
       image:
-        repository: nvcr.io/nim/nvidia/nemoretriever-table-structure-v1
-        tag: "1.6.0"
+        repository: nvcr.io/nim/nvidia/nemotron-table-structure-v1
+        tag: "1.8.0"
       env:
         - name: NIM_HTTP_API_PORT
           value: "8000"
diff --git a/deploy/workbench/README.md b/deploy/workbench/README.md
index 6d02a360e..179c32ec5 100644
--- a/deploy/workbench/README.md
+++ b/deploy/workbench/README.md
@@ -75,4 +75,4 @@ Use of the models in this blueprint is governed by the [NVIDIA AI Foundation Mod
 ## Terms of Use
 This blueprint is governed by the [NVIDIA Agreements | Enterprise Software | NVIDIA Software License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/) and the [NVIDIA Agreements | Enterprise Software | Product Specific Terms for AI Product](https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/). The models are governed by the [NVIDIA Agreements | Enterprise Software | NVIDIA Community Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/) and the [NVIDIA RAG dataset](https://github.com/NVIDIA-AI-Blueprints/rag/tree/v2.0.0/data/multimodal) which is governed by the [NVIDIA Asset License Agreement](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/data/LICENSE.DATA).
 
-The following models that are built with Llama are governed by the [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/): nvidia/llama-3.3-nemotron-super-49b-v1, nvidia/llama-3.2-nv-embedqa-1b-v2, and nvidia/llama-3.2-nv-rerankqa-1b-v2.
+The following models that are built with Llama are governed by the [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/): nvidia/llama-3.3-nemotron-super-49b-v1, nvidia/llama-nemotron-embed-1b-v2, and nvidia/llama-nemotron-rerank-1b-v2.
diff --git a/deploy/workbench/compose.yaml b/deploy/workbench/compose.yaml
index 04cfdd2e2..91d4b3d28 100644
--- a/deploy/workbench/compose.yaml
+++ b/deploy/workbench/compose.yaml
@@ -28,9 +28,9 @@ services:
       retries: 100
     profiles: ["local"]
 
-  nemoretriever-embedding-ms:
-    container_name: nemoretriever-embedding-ms
-    image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.10.1
+  nemotron-embedding-ms:
+    container_name: nemotron-embedding-ms
+    image: nvcr.io/nim/nvidia/llama-nemotron-embed-1b-v2:1.13.0
     volumes:
     - ${MODEL_DIRECTORY:-/tmp}:/opt/nim/.cache
     ports:
@@ -58,9 +58,9 @@ services:
       start_period: 10m
     profiles: ["local"]
 
-  nemoretriever-ranking-ms:
-    container_name: nemoretriever-ranking-ms
-    image: nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:1.8.0
+  nemotron-ranking-ms:
+    container_name: nemotron-ranking-ms
+    image: nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2:1.10.0
     volumes:
     - ${MODEL_DIRECTORY:-/tmp}:/opt/nim/.cache
     ports:
@@ -86,7 +86,7 @@ services:
     profiles: ["local"]
 
   page-elements:
-    image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-page-elements-v3}:${YOLOX_TAG:-1.7.0}
+    image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemotron-page-elements-v3}:${YOLOX_TAG:-1.8.0}
     ports:
       - "8000:8000"
       - "8001:8001"
@@ -122,7 +122,7 @@ services:
     profiles: ["local"]
 
   graphic-elements:
-    image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.6.0}
+    image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemotron-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.8.0}
     ports:
       - "8003:8000"
       - "8004:8001"
@@ -147,7 +147,7 @@ services:
     profiles: ["local"]
 
   table-structure:
-    image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.6.0}
+    image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemotron-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.8.0}
     ports:
       - "8006:8000"
       - "8007:8001"
@@ -200,7 +200,7 @@ services:
   # Main ingestor server which is responsible for ingestion
   ingestor-server:
     container_name: ingestor-server
-    image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/ingestor-server:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../
@@ -256,8 +256,8 @@ services:
 
       ##===Embedding Model specific configurations===
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000/v1"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-embedding-ms:8000/v1"}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-1b-v2}
       APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048}
 
       ##===NV-Ingest Connection Configurations=======
@@ -333,7 +333,7 @@ services:
     profiles: ["ingest"]
 
   nv-ingest-ms-runtime:
-    image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.1
+    image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.2
     # cpuset: "0-15" # Uncomment to restrict this container to CPU cores 0–15
     shm_size: 40gb # Should be at minimum 30% of assigned memory per Ray documentation
     volumes:
@@ -399,20 +399,20 @@ services:
       - REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue
       # Self-hosted redis endpoints.
       # build.nvidia.com hosted yolox endpoints.
-      # - YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3
+      # - YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
       # - YOLOX_INFER_PROTOCOL=http
       - YOLOX_PAGE_IMAGE_FORMAT=JPEG
       - YOLOX_GRPC_ENDPOINT=${YOLOX_GRPC_ENDPOINT:-page-elements:8001}
       - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer}
       - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc}
       # build.nvidia.com hosted yolox-graphics-elements endpoints.
-      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
+      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
       #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
       - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001}
       - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer}
       - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc}
       # build.nvidia.com hosted  yolox-table-elements endpoints.
-      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
+      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
       #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
       - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001}
       - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer}
@@ -432,7 +432,7 @@ services:
   # Main orchestrator server which stiches together all calls to different services to fulfill the user request
   rag-server:
     container_name: rag-server
-    image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/rag-server:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../
@@ -495,13 +495,13 @@ services:
 
       ##===Embedding Model specific configurations===
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000/v1"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-embedding-ms:8000/v1"}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-1b-v2}
 
       ##===Reranking Model specific configurations===
       # url on which ranking model is hosted. If "", Nvidia hosted API is used
-      APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"}
-      APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"}
+      APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemotron-ranking-ms:8000"}
+      APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-nemotron-rerank-1b-v2"}
       ENABLE_RERANKER: ${ENABLE_RERANKER:-True}
 
       ##===VLM Model specific configurations===
@@ -569,7 +569,7 @@ services:
   # Sample UI container which interacts with APIs exposed by rag-server container
   rag-frontend:
     container_name: rag-frontend
-    image: nvcr.io/nvstaging/blueprint/rag-frontend:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/rag-frontend:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../frontend
diff --git a/deploy/workbench/quickstart.ipynb b/deploy/workbench/quickstart.ipynb
index d9a15a71a..00c524aba 100644
--- a/deploy/workbench/quickstart.ipynb
+++ b/deploy/workbench/quickstart.ipynb
@@ -966,10 +966,10 @@
     "    \"enable_citations\": True,\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
     "    \"llm_endpoint\": \"nim-llm:8000\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
-    "    \"embedding_endpoint\": \"nemoretriever-embedding-ms:8000/v1\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"reranker_endpoint\": \"nemoretriever-ranking-ms:8000\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
+    "    \"embedding_endpoint\": \"nemotron-embedding-ms:8000/v1\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"reranker_endpoint\": \"nemotron-ranking-ms:8000\",\n",
     "    \"stop\": [],\n",
     "}\n",
     "\n",
@@ -1030,10 +1030,10 @@
     "    \"enable_citations\": True,\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
     "    \"llm_endpoint\": \"nim-llm:8000\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
-    "    \"embedding_endpoint\": \"nemoretriever-embedding-ms:8000/v1\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"reranker_endpoint\": \"nemoretriever-ranking-ms:8000\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
+    "    \"embedding_endpoint\": \"nemotron-embedding-ms:8000/v1\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"reranker_endpoint\": \"nemotron-ranking-ms:8000\",\n",
     "    \"stop\": [],\n",
     "}\n",
     "\n",
@@ -1175,10 +1175,10 @@
     "    ],\n",
     "    \"enable_query_rewriting\": False,\n",
     "    \"enable_reranker\": False,\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
-    "    \"embedding_endpoint\": \"nemoretriever-embedding-ms:8000/v1\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"reranker_endpoint\": \"nemoretriever-ranking-ms:8000\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
+    "    \"embedding_endpoint\": \"nemotron-embedding-ms:8000/v1\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"reranker_endpoint\": \"nemotron-ranking-ms:8000\",\n",
     "}\n",
     "\n",
     "\n",
@@ -1233,10 +1233,10 @@
     "    ],\n",
     "    \"enable_query_rewriting\": False,\n",
     "    \"enable_reranker\": True,\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
-    "    \"embedding_endpoint\": \"nemoretriever-embedding-ms:8000/v1\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"reranker_endpoint\": \"nemoretriever-ranking-ms:8000\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
+    "    \"embedding_endpoint\": \"nemotron-embedding-ms:8000/v1\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"reranker_endpoint\": \"nemotron-ranking-ms:8000\",\n",
     "}\n",
     "\n",
     "\n",
diff --git a/docs/accuracy-benchmarks.md b/docs/accuracy-benchmarks.md
new file mode 100644
index 000000000..6ae3a4529
--- /dev/null
+++ b/docs/accuracy-benchmarks.md
@@ -0,0 +1,126 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# Benchmarking RAG Accuracy: Evaluating LLM Reasoning and VLM Integration
+
+In the fast-moving world of Retrieval-Augmented Generation (RAG), the gap between a “good” system and one that’s truly production-ready often depends on how effectively the pipeline manages complex reasoning and multimodal data. To measure these advancements, our team conducted extensive benchmarks across multiple configurations, examining the influence of LLM reasoning (“Think” mode) and Vision-Language Models (VLM).
+
+## Benchmarked Datasets
+
+Our analysis centered on seven major public datasets encompassing a broad range of challenges, from financial reasoning to intricate structural document parsing.
+
+| Dataset | Domain | Corpus Language | Main Modalities | # Pages | # Queries |
+|---|---|---|---|---|---|
+| [RAGBattlePacket](https://www.eyelevel.ai/post/most-accurate-rag) | Finance, Tax & Consulting | English | Text, Tables, Charts, Infographics | 1,141 | 92 |
+| [KG-RAG](https://github.com/docugami/KG-RAG-datasets/tree/main/sec-10-q/data/v1) | Finance (SEC 10-Q) | English | Text, Tables | 1,037 | 195 |
+| [FinanceBench](https://github.com/patronus-ai/financebench) | Finance (Public Equity) | English | Text, Tables | 54,057 | 150 |
+| [DC767](https://digitalcorpora.org/) | General (Gov, NGO, Health) | English | Text, Tables | 54,730 | 488 |
+| [HotPotQA](https://huggingface.co/datasets/hotpotqa/hotpot_qa) | Wikipedia-based question-answer pairs | English | Text | 2,673 (txt files) | 979 |
+| [Google Frames](https://huggingface.co/datasets/google/frames-benchmark) | History, Sports, Science, Animals, Health | English | Text | 31,708 | 824 |
+
+### [Vidore-V3 Dataset](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3#public-datasets)
+
+| Dataset | Domain | Corpus Language | Main Modalities | # Pages | # Queries (with translations) |
+|---|---|---|---|---|---|
+| French Public Company Annual Reports | Finance-FR | French | Text, Table, Charts | 2,384 | 1,920 |
+| U.S. Public Company Annual Reports | Finance-EN | English | Text, Table | 2,942 | 1,854 |
+| Computer Science Textbooks | Computer Science | English | Text, Infographic, Tables | 1,360 | 1,290 |
+| HR Reports from EU | HR | English | Text, Table, Charts | 1,110 | 1,908 |
+| French Governmental Energy Reports | Energy | French | Text, Charts | 2,229 | 1,848 |
+| USAF Technical Orders | Industrial | English | Text, Tables, Infographics, Images | 5,244 | 1,698 |
+| FDA Reports | Pharmaceuticals | English | Text, Charts, Images, Infographic, Tables | 2,313 | 2,184 |
+| French Physics Lectures | Physics | French | Text, Images, Infographics | 1,674 | 1,812 |
+
+
+## Evaluation Methodology
+
+Our primary evaluation metric is end-to-end RAG answer accuracy, measured using the [NVIDIA Answer Accuracy metric from RAGAS](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/nvidia_metrics/). Each response is rated on a 0–4 scale by an LLM judge, with scores normalized to a 0–1 range for reporting. We chose [mistralai/Mixtral-8x22B-Instruct-v0.1](https://build.nvidia.com/mistralai/mixtral-8x22b-instruct) as the LLM judge, guided by performance on the [Judge’s Verdict](https://huggingface.co/spaces/nvidia/judges-verdict) benchmark.
+
+Full evaluation pipeline: [evaluation_01_ragas.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/evaluation_01_ragas.ipynb)
+
+- Metric: Accuracy, defined as the degree to which generated responses align with the ground truth answers.
+- Pipeline configuration: All experiments were run using the default configuration.
+- Generation models:
+  - LLM: nvidia/llama-3.3-nemotron-super-49b-v1.5
+  - VLM: nvidia/nemotron-nano-vl-12b-v2
+- Judge model: mistralai/Mixtral-8x22B-Instruct-v0.1
+
+## Configuration and Accuracy Results
+
+We tested four main configurations to evaluate how ["Reasoning" (Think On)](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/enable-nemotron-thinking.md) and ["Vision Language Model" (VLM)](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/vlm.md) features influence accuracy. In the VLM-based generation pipeline, image captioning was enabled during data ingestion. For text-only datasets, we excluded the VLM-based generation setup from evaluation.
+
+| Dataset | LLM (Reasoning Off) | LLM (Reasoning On) | VLM (Reasoning Off) | VLM (Reasoning On) |
+|---|---|---|---|---|
+| FinanceBench | 0.612 | 0.668 | 0.622 | 0.697 |
+| KG-RAG | 0.569 | 0.593 | 0.596 | 0.643 |
+| RAGBattlePacket | 0.812 | 0.818 | 0.867 | 0.842 |
+| DC767 | 0.906 | 0.899 | 0.907 | 0.897 |
+| HotPotQA | 0.672 | 0.676 | n/a | n/a |
+| Google Frames | 0.486 | 0.597 | n/a | n/a |
+
+The preceding table summarizes the accuracy scores for each dataset across our experimental configurations.
+
+### Vidore-V3 Results
+
+For the Vidore-v3 evaluation, we combined all domains into a single collection and then performed domain-specific evaluations.
+
+| Dataset subsets | LLM (Reasoning Off) | LLM (Reasoning On) | VLM (Reasoning Off) | VLM (Reasoning On) |
+|---|---|---|---|---|
+| Computer Science | 0.894 | 0.882 | 0.927 | 0.931 |
+| Energy | 0.751 | 0.765 | 0.802 | 0.824 |
+| Finance EN | 0.699 | 0.718 | 0.758 | 0.766 |
+| Pharmaceuticals | 0.759 | 0.775 | 0.849 | 0.858 |
+| HR | 0.726 | 0.735 | 0.767 | 0.804 |
+| Industrial | 0.677 | 0.674 | 0.733 | 0.758 |
+| Physics | 0.840 | 0.806 | 0.903 | 0.910 |
+| Finance FR | 0.639 | 0.647 | 0.683 | 0.687 |
+
+
+## Key Results
+
+The following sections describe the key results from our analysis.
+
+### The "Reasoning Dividend" in FinanceBench and KG-RAG
+
+For the FinanceBench and KG-RAG datasets, we observed improved accuracy with reasoning turned on.
+
+Why it makes sense
+
+- FinanceBench is heavily table-centric—about 75% of queries involve tables—and many of these require mathematical operations or extracting data across multiple line items. Simple retrieval is not sufficient; the model must perform an explicit reasoning step to carry out the necessary arithmetic and cross-referencing to match the human-annotated ground truth.
+
+- KG-RAG requires temporal reasoning (for example, comparing Q3 2022 with Q1 2023). Without reasoning enabled, the model may retrieve the correct company but the wrong fiscal quarter. Turning Reasoning On lets the LLM check dates and periods before finalizing its answer.
+
+### The Multimodal Unlock: Decoding Visual Complexity in ViDoRe and RAGBattlePacket
+
+Across both the ViDoRe benchmark and RAGBattlePacket, we saw best results when moving from a text-only LLM to a VLM. RAGBattlePacket reached its highest baseline accuracy (0.867) simply by enabling the VLM, and ViDoRe showed broad gains across nearly all of its diverse sub-domains.
+
+Why it makes sense
+
+- Preserving Spatial Layouts (ViDoRe): Sub-domains like Finance and Pharmaceuticals depend on rigid tables and charts that text-only pipelines often fail to capture. A VLM can directly “see” and preserve these structures, leading to higher accuracy on this benchmark.
+- Targeting Visual Queries (RAGBattlePacket): About 10% of RAGBattlePacket queries focus on charts, bar graphs, and customer journey diagrams, which standard pipelines often hallucinate on or ignore. A VLM can directly interpret these visuals, returning precise percentages and preserving the underlying structure.
+
+### Semantic Robustness in DC767
+
+This dataset showed the highest overall stability, maintaining roughly 0.90 or higher accuracy across almost all configurations.
+
+Why it makes sense
+
+Because the dataset is about 70% text-based prose, it relies heavily on high-quality embeddings and semantic search. Our core retriever is clearly optimized for dense text retrieval, as adding Vision or Reasoning produced only a marginal change (about 1.1% between the best and worst configurations). This suggests that our base RAG engine is already very strong for standard retriever-focused tasks.
+
+### Reasoning as the Catalyst in Google Frames
+
+This dataset demonstrated the true impact of active reasoning on complex, multi-hop queries. By turning reasoning on, the model achieved a massive leap in overall performance. This gain represents our most significant improvement driven purely by logical processing.
+
+Why it makes sense
+
+Google Frames targets complex queries that require synthesizing facts across multiple documents while tracking overlapping constraints. A standard LLM often struggles to keep all these parameters in mind in a single pass. Turning on reasoning enables the model to systematically decompose multi-step logic and verify dependencies, which is essential for accurate factual extraction.
+
+## Related Topics
+
+- [Evaluate Your NVIDIA RAG Blueprint System](evaluate.md)
+- [Enable Reasoning in Nemotron LLM Models](enable-nemotron-thinking.md)
+- [VLM-Based Inferencing in RAG](vlm.md)
+- [Image Captioning Support](image_captioning.md)
+- [Best Practices for Common Settings](accuracy_perf.md)
\ No newline at end of file
diff --git a/docs/accuracy_perf.md b/docs/accuracy_perf.md
index 8ee491fa6..c24f0c9a5 100644
--- a/docs/accuracy_perf.md
+++ b/docs/accuracy_perf.md
@@ -14,7 +14,7 @@ Change the setting if you want different behavior.
 | Name                 | Default    | Description         | Advantages           | Disadvantages            |
 |----------------------|------------|---------------------|----------------------|--------------------------|
 | `APP_NVINGEST_CHUNKOVERLAP` | `150` | Increase overlap to ensure smooth transitions between chunks. | - Larger overlap provides smoother transitions between chunks. <br/>  | - Might increase processing overhead. <br/> |
-| `APP_NVINGEST_CHUNKSIZE` | `512` | Increase chunk size for more context. | - Larger chunks retain more context, improving coherence. <br/> | - Larger chunks increase embedding size, slowing retrieval. <br/> - Longer chunks might increase latency due to larger prompt size. <br/> |
+| `APP_NVINGEST_CHUNKSIZE` | `512` | Increase chunk size for more context. | - Larger chunks retain more context, improving coherence. <br/> | - Larger chunks increase compute time for embedding creation. <br/> - Larger chunks can lead to longer retrieved context, increasing generation latency. <br/> - Very large chunks may dilute semantic focus, reducing embedding precision. <br/> |
 | `APP_NVINGEST_ENABLEPDFSPLITTER` | `true` | Set to `true` to perform chunk-based splitting of pdfs after the default page-level extraction occurs. Recommended for PDFs that are mostly text content. | - Provides more granular content segmentation. <br/> | - Can increase the number of chunks and slow down the ingestion process. <br/> |
 | `APP_NVINGEST_EXTRACTCHARTS` | `true` | Set to `true` to extract charts. | - Improves accuracy for documents that contain charts. <br/> | - Increases ingestion time. <br/> |
 | `APP_NVINGEST_EXTRACTIMAGES` | `false` | Set to `true` to enable image captioning during ingestion. For details, refer to [Image Captioning Support](image_captioning.md). | - Enhances multimodal retrieval accuracy for documents having images. <br/> | - Increased processing time during ingestion. <br/> - Requires additional GPU resources for VLM model deployment. <br/> |
@@ -30,14 +30,14 @@ Change the setting if you want different behavior.
 
 | Name                 | Default    | Description         | Advantages           | Disadvantages            |
 |----------------------|------------|---------------------|----------------------|--------------------------|
-| - `APP_LLM_MODELNAME` <br/> - `APP_EMBEDDINGS_MODELNAME` <br/> - `APP_RANKING_MODELNAME` <br/> | See description | The default models are the following: <br/>- `nvidia/llama-3.3-nemotron-super-49b-v1.5` <br/> - `nvidia/llama-3.2-nv-embedqa-1b-v2` <br/> - `nvidia/llama-3.2-nv-rerankqa-1b-v2` <br/><br/>You can use larger models.  For details, refer to [Change the Inference or Embedding Model](change-model.md). | - Higher accuracy with better reasoning and a larger context length. <br/> | - Slower response time. <br/> - Higher inference cost. <br/> - Higher GPU requirement. <br/>  |
+| - `APP_LLM_MODELNAME` <br/> - `APP_EMBEDDINGS_MODELNAME` <br/> - `APP_RANKING_MODELNAME` <br/> | See description | The default models are the following: <br/>- `nvidia/llama-3.3-nemotron-super-49b-v1.5` <br/> - `nvidia/llama-nemotron-embed-1b-v2` <br/> - `nvidia/llama-nemotron-rerank-1b-v2` <br/><br/>You can use larger models.  For details, refer to [Change the Inference or Embedding Model](change-model.md). | - Higher accuracy with better reasoning and a larger context length. <br/> | - Slower response time. <br/> - Higher inference cost. <br/> - Higher GPU requirement. <br/>  |
 | `APP_VECTORSTORE_SEARCHTYPE` | `dense` | Set to `hybrid` to enable hybrid search. For details, refer to [Hybrid Search Support](hybrid_search.md). | - Can provide better retrieval accuracy for domain-specific content. <br/> | - Can induce higher latency for large number of documents. <br/> |
 | `ENABLE_GUARDRAILS` | `false` | Set to `true` to enable NeMo Guardrails. For details, refer to [Nemo Guardrails Support](nemo-guardrails.md). | - Applies input/output constraints for better safety and consistency. <br/> | - Significant increased processing overhead for additional LLM calls. <br/> - Needs additional GPUs to deploy guardrails-specific models locally. <br/> |
 | `ENABLE_QUERYREWRITER` | `false` | Set to `true` to enable query rewriting.  For details, refer to [Multi-Turn Conversation Support](multiturn.md). | - Enhances retrieval accuracy for multi-turn scenarios by rephrasing the query. <br/> | - Adds an extra LLM call, increasing latency. <br/> |
 | `ENABLE_REFLECTION` | `false` | Set to `true` to enable self-reflection. For details, refer to [Self-Reflection Support](self-reflection.md). | - Can improve the response quality by refining intermediate retrieval and final LLM output. <br/> | - Significantly higher latency due to multiple iterations of LLM model call. <br/> - You might need to deploy a separate judge LLM model, increasing GPU requirement. <br/> |
 | `ENABLE_RERANKER`    | `true` | Set to `true` to use the reranking model.    | - Improves accuracy by selecting better documents for response generation. <br/> | - Increases latency due to additional processing. <br/> - Additional hardware requirements for self-hosted on premises deployment. <br/>   |
 | `ENABLE_VLM_INFERENCE` | `false`    | Set to `true` to use the Vision-Language Model (VLM) for response generation. For details, refer to [VLM for Generation](vlm.md).  | - Enables analysis of retrieved images alongside text for richer, multimodal responses. <br/> - Can process up to 4 images per citation. <br/> - Useful for document Q&A, visual search, and multimodal chatbots. <br/> | - Requires additional GPU resources for VLM model deployment. <br/> - Increases latency due to image processing. <br/> |
-| Reasoning in `llama-3.3-nemotron-super-49b-v1.5` | `/no_think` | Use `/think` to enable reasoning. For details, refer to [Enable Reasoning](enable-nemotron-thinking.md). | - Improves response quality through enhanced reasoning capabilities. <br/> - Yields more precise responses. The default model is verbose and works best with reasoning enabled. <br/> | - Can increase response latency due to additional thinking process. <br/> - Can increase token usage and computational overhead. <br/> |
+| `LLM_ENABLE_THINKING` | `false` | Set to `true` to enable reasoning for Nemotron 3 models. Use `LLM_REASONING_BUDGET` and `LLM_LOW_EFFORT` for fine-grained control. For Nemotron 1.5 models, use the `/think` system prompt instead. For details, refer to [Enable Reasoning](enable-nemotron-thinking.md). | - Improves response quality through enhanced reasoning capabilities. <br/> - Yields more precise responses. <br/> | - Can increase response latency due to additional thinking process. <br/> - Can increase token usage and computational overhead. <br/> |
 | `RERANKER_SCORE_THRESHOLD` | `0.0` | Filters out retrieved chunks if reranker relevance is lower than this threshold. We recommend that you set this value between `0.3` and `0.5` to balance quality and coverage. For details, refer to [Use the Python Package](python-client.md). | - Faster retrieval by processing fewer documents. <br/> - Can improve accuracy by excluding low-relevance documents. <br/> | - Requires `ENABLE_RERANKER` set to `true` for effective filtering. <br/> - Might filter out too many chunks if the threshold is set high, causing no response from the RAG server. <br/> |
 | `RERANKER TOP K` | 10 | Increase `reranker TOP K` to increase the probability of relevant context being part of the top-k contexts. | Increasing the value can improve accuracy. | Increasing the value can increase latency. |
 | `VDB TOP K` | 100 | Increase `VDB TOP K` to provide a larger candidate pool for reranking. | Increasing the value can improve accuracy. | Increasing the value can increase latency. |
diff --git a/docs/api-ingestor.md b/docs/api-ingestor.md
index 443a521a6..adeeb4cf0 100644
--- a/docs/api-ingestor.md
+++ b/docs/api-ingestor.md
@@ -8,7 +8,7 @@
 This documentation contains the OpenAPI reference for the ingestor server.
 
 :::{tip}
-To view this documentation on docs.nvidia.com, go to https://docs.nvidia.com/rag/latest/api-ingestor.html.
+To view this documentation on docs.nvidia.com, browse to [https://docs.nvidia.com/rag/latest/api-ingestor](https://docs.nvidia.com/rag/latest/api-ingestor.html).
 :::
 
 
@@ -41,7 +41,7 @@ The status response includes progress metrics updated after each batch completes
 For more granular progress updates during batch processing, use the `nv_ingest_status` object described below, which tracks individual document extraction progress and updates more frequently than the batch-level metrics.
 :::
 
-### NV-Ingest Extraction Status
+### Extraction status
 
 The `/status` endpoint response includes an `nv_ingest_status` object that provides real-time document extraction progress, updating more frequently than batch-level metrics. This is useful for monitoring individual document processing when polling the status endpoint:
 
@@ -53,7 +53,7 @@ The `/status` endpoint response includes an `nv_ingest_status` object that provi
 | Status | Description |
 |--------|-------------|
 | `not_started` | Document queued, extraction not yet initiated |
-| `submitted` | Document submitted to NV-Ingest for processing |
+| `submitted` | Document submitted to NeMo Retriever Library for processing |
 | `processing` | Document extraction is in progress |
 | `completed` | Document extraction completed successfully |
 | `failed` | Document extraction failed |
diff --git a/docs/api-rag.md b/docs/api-rag.md
index 366d44b0f..7a15d8890 100644
--- a/docs/api-rag.md
+++ b/docs/api-rag.md
@@ -8,8 +8,8 @@
 This documentation contains the OpenAPI reference for the RAG server.
 
 :::{tip}
-To view this documentation on docs.nvidia.com, go to https://docs.nvidia.com/rag/latest/api-rag.html.
+To view this documentation on docs.nvidia.com, browse to [https://docs.nvidia.com/rag/latest/api-rag](https://docs.nvidia.com/rag/latest/api-rag.html).
 :::
 
 
 :::{swagger-plugin} ../docs/api_reference/openapi_schema_rag_server.json
diff --git a/docs/api_reference/openapi_schema_rag_server.json b/docs/api_reference/openapi_schema_rag_server.json
index 63bbb4e33..5bcf2ec7d 100644
--- a/docs/api_reference/openapi_schema_rag_server.json
+++ b/docs/api_reference/openapi_schema_rag_server.json
@@ -707,7 +707,7 @@
             "maxLength": 256,
             "title": "Embedding Model",
             "description": "Name of the embedding model used for vectorization.",
-            "default": "nvdev/nvidia/llama-3.2-nv-embedqa-1b-v2"
+            "default": "nvdev/nvidia/llama-nemotron-embed-1b-v2"
           },
           "embedding_endpoint": {
             "type": "string",
@@ -721,7 +721,7 @@
             "maxLength": 256,
             "title": "Reranker Model",
             "description": "Name of the reranker model used for ranking results.",
-            "default": "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+            "default": "nvidia/llama-nemotron-rerank-1b-v2"
           },
           "reranker_endpoint": {
             "anyOf": [
@@ -1342,7 +1342,7 @@
             "maxLength": 256,
             "title": "Embedding Model",
             "description": "Name of the embedding model used for vectorization.",
-            "default": "nvdev/nvidia/llama-3.2-nv-embedqa-1b-v2"
+            "default": "nvdev/nvidia/llama-nemotron-embed-1b-v2"
           },
           "embedding_endpoint": {
             "anyOf": [
@@ -1363,7 +1363,7 @@
             "maxLength": 256,
             "title": "Reranker Model",
             "description": "Name of the reranker model used for ranking results.",
-            "default": "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+            "default": "nvidia/llama-nemotron-rerank-1b-v2"
           },
           "reranker_endpoint": {
             "anyOf": [
diff --git a/docs/assets/perf-benchmarks/bo767_h100_performance.png b/docs/assets/perf-benchmarks/bo767_h100_performance.png
new file mode 100644
index 000000000..05e202279
Binary files /dev/null and b/docs/assets/perf-benchmarks/bo767_h100_performance.png differ
diff --git a/docs/assets/perf-benchmarks/cross_dataset_llm_reasoning_off.png b/docs/assets/perf-benchmarks/cross_dataset_llm_reasoning_off.png
new file mode 100644
index 000000000..42eed61f9
Binary files /dev/null and b/docs/assets/perf-benchmarks/cross_dataset_llm_reasoning_off.png differ
diff --git a/docs/assets/perf-benchmarks/hotpotqa_h100_performance.png b/docs/assets/perf-benchmarks/hotpotqa_h100_performance.png
new file mode 100644
index 000000000..a4190fe68
Binary files /dev/null and b/docs/assets/perf-benchmarks/hotpotqa_h100_performance.png differ
diff --git a/docs/assets/perf-benchmarks/kgrag_h100_performance.png b/docs/assets/perf-benchmarks/kgrag_h100_performance.png
new file mode 100644
index 000000000..496b77500
Binary files /dev/null and b/docs/assets/perf-benchmarks/kgrag_h100_performance.png differ
diff --git a/docs/assets/perf-benchmarks/ragbattlepacket_h100_performance.png b/docs/assets/perf-benchmarks/ragbattlepacket_h100_performance.png
new file mode 100644
index 000000000..aaf068308
Binary files /dev/null and b/docs/assets/perf-benchmarks/ragbattlepacket_h100_performance.png differ
diff --git a/docs/assets/perf-benchmarks/wikipedia_synthetic_h100_performance.png b/docs/assets/perf-benchmarks/wikipedia_synthetic_h100_performance.png
new file mode 100644
index 000000000..29047d872
Binary files /dev/null and b/docs/assets/perf-benchmarks/wikipedia_synthetic_h100_performance.png differ
diff --git a/docs/audio_ingestion.md b/docs/audio_ingestion.md
index 55fcbc132..399ea7a64 100644
--- a/docs/audio_ingestion.md
+++ b/docs/audio_ingestion.md
@@ -132,7 +132,7 @@ When using Helm deployment, the Audio NIM service requires an additional GPU.
 
 The `APP_NVINGEST_SEGMENTAUDIO` environment variable controls whether audio segmentation is enabled during the ingestion process.
 
-When set to `True`, NV-Ingest will segment audio files based on commas and other punctuation marks, resulting in more granular audio chunks. This can improve downstream processing and retrieval accuracy for audio content. Note that splitting on captions will occur regardless of this setting; enabling `APP_NVINGEST_SEGMENTAUDIO` simply adds additional segmentation based on punctuation.
+When set to `True`, NeMo Retriever Library will segment audio files based on commas and other punctuation marks, resulting in more granular audio chunks. This can improve downstream processing and retrieval accuracy for audio content. Note that splitting on captions will occur regardless of this setting; enabling `APP_NVINGEST_SEGMENTAUDIO` simply adds additional segmentation based on punctuation.
 
 To enable audio segmentation, add the following export command to your environment configuration:
 
diff --git a/docs/change-model.md b/docs/change-model.md
index d0173462a..871d8f5f3 100644
--- a/docs/change-model.md
+++ b/docs/change-model.md
@@ -46,6 +46,10 @@ The `nemotron-3-nano-30b` model has different naming conventions depending on th
 
 Both names refer to the same underlying model. Use the appropriate name based on your deployment type.
 
+##### Nemotron 3 Super
+
+Nemotron 3 Super is a larger model with different GPU and environment requirements: local NIM deployment requires at least 2 GPUs (FP8 TP2), and you may need a dedicated prompt config and reasoning settings. For full deployment steps (Docker and Helm), see the [Nemotron 3 Super deployment guide](nemotron3-super-deployment.md).
+
 
 ### Change the Embedding Model
 
@@ -77,7 +81,7 @@ Always use same embedding model or model having same tokinizers for both ingesti
 
 ### Configure Embedding Dimensions
 
-The default embedding model (`nvidia/llama-3.2-nv-embedqa-1b-v2`) uses **2048 dimensions** by default. When changing to a different embedding model, you may need to update the dimensions to match the model's output.
+The default embedding model (`nvidia/llama-nemotron-embed-1b-v2`) uses **2048 dimensions** by default. When changing to a different embedding model, you may need to update the dimensions to match the model's output.
 
 **Important:** Some embedding models have **fixed output dimensions** and do not accept a `dimensions` parameter. For example, `nvidia/nv-embedqa-e5-v5` always outputs 1024-dimensional embeddings. If you use such a model without configuring the dimensions, you may encounter an error like:
 
@@ -124,13 +128,13 @@ You can specify the model for NVIDIA NIM containers to use in the [nims.yaml](..
        image: nvcr.io/nim/<image>:<tag>
        ...
 
-     nemoretriever-embedding-ms:
-       container_name: nemoretriever-embedding-ms
+     nemotron-embedding-ms:
+       container_name: nemotron-embedding-ms
        image: nvcr.io/nim/<image>:<tag>
 
 
-     nemoretriever-ranking-ms:
-       container_name: nemoretriever-ranking-ms
+     nemotron-ranking-ms:
+       container_name: nemotron-ranking-ms
        image: nvcr.io/nim/<image>:<tag>
    ```
 
@@ -173,11 +177,11 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
 
       # === Embeddings ===
       APP_EMBEDDINGS_MODELNAME: "<embedding-model-name>"
-      APP_EMBEDDINGS_SERVERURL: "nemoretriever-embedding-ms:8000/v1"
+      APP_EMBEDDINGS_SERVERURL: "nemotron-embedding-ms:8000/v1"
 
       # === Reranker ===
       APP_RANKING_MODELNAME: "<reranker-model-name>"
-      APP_RANKING_SERVERURL: "nemoretriever-ranking-ms:8000"
+      APP_RANKING_SERVERURL: "nemotron-ranking-ms:8000"
     ```
 
 3. Configure the NIM microservices that host those models. Replace `<image>:<tag>` with the image you selected (format `nvcr.io/nim/<image>:<tag>`) in [values.yaml](../deploy/helm/nvidia-blueprint-rag/values.yaml).
@@ -215,7 +219,7 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
       enabled: true
       replicas: 1
       service:
-        name: "nemoretriever-embedding-ms"
+        name: "nemotron-embedding-ms"
       image:
         # nvcr.io/nim/<image>:<tag>
         repository: nvcr.io/nim/<image>
@@ -237,7 +241,7 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
       enabled: true
       replicas: 1
       service:
-        name: "nemoretriever-ranking-ms"
+        name: "nemotron-ranking-ms"
       image:
         # nvcr.io/nim/<image>:<tag>
         repository: nvcr.io/nim/<image>
@@ -264,7 +268,19 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
     **If only the vLLM profile is available**
 
    When only a vLLM profile is available for a model, such as on H100 and RTX GPUs, you must use the vLLM engine. First [run the list-model-profiles command](model-profiles.md#list-available-profiles) to confirm which profiles are available and then apply the following configurations.
-
+    **For Nemotron Nano models (vLLM profile)**
+
+    When deploying `nvidia/nvidia-nemotron-nano-9b-v2` or `nvidia/nemotron-3-nano`, use the following command to check whether a `tensorrt_llm` profile is available for your model.
+    
+    ```bash
+    # Change model name as needed
+    USERID=$(id -u) docker run --rm --gpus all \
+      nvcr.io/nim/nvidia/nvidia-nemotron-nano-9b-v2:latest \
+      list-model-profiles
+    ```
+    
+    If only the `vllm` profile is available, you must use the **vLLM engine** and add these specific configurations:
+    
     ```yaml
     nimOperator:
       nim-llm:
@@ -292,4 +308,5 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
 - [Deploy with Docker (Self-Hosted Models)](deploy-docker-self-hosted.md)
 - [Deploy with Docker (NVIDIA-Hosted Models)](deploy-docker-nvidia-hosted.md)
 - [Deploy with Helm](deploy-helm.md)
+- [Nemotron 3 Super deployment (Docker and Helm)](nemotron3-super-deployment.md)
 - [Service-Specific API Keys](api-key.md#service-specific-api-keys)
diff --git a/docs/change-vectordb.md b/docs/change-vectordb.md
index a4dc993b8..36f4d4f9f 100644
--- a/docs/change-vectordb.md
+++ b/docs/change-vectordb.md
@@ -1001,7 +1001,7 @@ Update your [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) fil
 
 ### Disable Default Vector Database and Add Custom Helm Chart
 
-1. **Disable Milvus in the NV-Ingest configuration:**
+1. **Disable Milvus in the NeMo Retriever Library configuration:**
    ```yaml
    nv-ingest:
      enabled: true
diff --git a/docs/conf.py b/docs/conf.py
index f0ffa9e07..9c2a17c11 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025-2026, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,10 +23,10 @@
 import os
 import sys
 
-project = " NVIDIA-RAG-blueprint"
-copyright = "2025, NVIDIA Corporation"
+project = " NVIDIA RAG blueprint"
+copyright = "2025-%Y, NVIDIA Corporation"
 author = "NVIDIA Corporation"
-release = "2.4.0"
+release = "2.5.0"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -74,8 +74,7 @@
             "icon": "fa-brands fa-github",
         }
     ],
-    # Version switcher disabled: set "switcher": {"json_url": "...", "version_match": release}
-    # and ensure versions1.json is at the json_url path when using versioned doc deployments.
+    "switcher": {"json_url": "../versions1.json", "version_match": release},
     "extra_head": {
         """
     <script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" ></script>
@@ -88,6 +87,7 @@
     },
 }
 
+
 # Add any paths that contain custom static files (such as style sheets) here,
 html_css_files = ["swagger-nvidia.css"]
 
diff --git a/docs/continuous-ingestion-object-storage.md b/docs/continuous-ingestion-object-storage.md
new file mode 100644
index 000000000..4baa9daf7
--- /dev/null
+++ b/docs/continuous-ingestion-object-storage.md
@@ -0,0 +1,127 @@
+<!--
+  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+  SPDX-License-Identifier: Apache-2.0
+-->
+# Continuous Ingestion from Object Storage RAG Blueprint
+
+Continuous ingestion from object storage connects the [RAG blueprint](readme.md) to object storage. This enables an event-driven pipeline that automatically indexes documents. Continuous ingestion means that when you add documents to a storage bucket, the system detects new uploads, routes them for processing, and indexes their content—making all data immediately searchable and available for analysis through the [RAG Frontend](user-interface.md).
+
+## Hardware Requirements
+
+| Requirement | Details |
+|-------------|---------|
+| **GPU** | 2x RTX PRO 6000 Blackwell or 2x H100 |
+| **OS** | Ubuntu 22.04 or later |
+| **Docker** | Docker 24.0+ with Docker Compose v2 |
+| **NVIDIA Driver** | 570+ |
+| **NVIDIA Container Toolkit** | Required |
+
+
+## Overview
+
+You can create an event-driven continuous ingestion pipeline that works as follows:
+
+1. Upload documents to object storage.
+
+2. The system detects new uploads via storage events and routes them for processing.
+
+3. Content is automatically indexed into the RAG vector store.
+
+4. You can then query the ingested content through the RAG UI or API.
+
+Continuous ingestion supports documents such as PDF, DOCX, and other formats supported by the [ingestor](api-ingestor.md).
+
+## Architecture
+
+The continuous ingestion architecture features the following high-level flow:
+
+1. Object storage: Files are written to storage using a protocol that emits events (for example, MinIO configured with Kafka notifications).
+
+2. Event trigger: Upload events are published to a Kafka topic.
+
+3. Consumer: A Kafka consumer subscribes to the topic, retrieves the events, downloads the corresponding files from object storage, and routes them for processing.
+
+4. Document path: Files are passed to a file-based processing pipeline (such as the NeMo Retriever Library or ingestor-server) and then indexed in the vector database.
+
+The continuous ingestion architecture follows the end-to-end sequence described above and can be summarized as:
+
+- Document ingestion flow: (1) → (2) → (3) → file-based processing → VectorDB → RAG Agent.
+
+## Implementation Components
+
+The reference implementation includes the following components:
+
+- Object storage (MinIO): A bucket configured with Kafka notifications on put (and optionally delete) events.
+
+- Kafka: A broker and topic (for example, aidp-topic) used to publish storage event notifications.
+
+- Kafka consumer: A service that:
+
+  - Subscribes to the Kafka topic and consumes storage events.
+
+  - Downloads new objects from MinIO.
+
+  - Sends files to the RAG ingestor for indexing.
+
+The deployment is defined in `examples/rag_event_ingest/deploy/docker-compose.yaml`, which runs MinIO, Kafka, and the Kafka consumer on the same Docker network as the RAG stack (`nvidia-rag`).
+
+### Prerequisites
+
+- [Deploy the NVIDIA RAG Blueprint](deploy-docker-self-hosted.md) (NIMs, Milvus, ingestor-server, RAG server) so the consumer can reach the ingestor and the rest of the stack.
+- Ensure the `nvidia-rag` Docker network exists (created by the RAG deployment).
+- For the notebook, clone the repo, set `NGC_API_KEY`, and have the required hardware (see notebook for GPU and software requirements).
+
+### Option 1: Use the Notebook
+
+The notebook provides a guided walkthrough of the following steps:
+
+- Environment setup
+- NVIDIA RAG deployment
+- Continuous ingestion pipeline deployment (Kafka, MinIO, and consumer)
+- Testing document uploads with RAG queries
+- Cleanup
+
+To follow along, open and run: [rag_event_ingest.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_event_ingest.ipynb).
+
+### Option 2: Deploy the Example with Docker Compose
+
+From the repository root, after the RAG stack is up:
+
+```bash
+docker compose -f examples/rag_event_ingest/deploy/docker-compose.yaml up -d
+```
+
+This command launches the following components:
+
+- Kafka (with an optional Kafka UI available on port 8080)
+- MinIO (object storage and console using ports 9201 and 9211 in the example)
+- Kafka consumer — connects to the ingestor at `INGESTOR_SERVER_URL` (default: `http://ingestor-server:8082`) and uses `COLLECTION_NAME` (default: `aidp_bucket`)
+
+After deployment, upload documents and query ingested content as follows:
+
+1. Open the MinIO Console UI at `http://<host-ip>:9211/login`.
+2. Log in with the default credentials (`minioadmin` / `minioadmin`).
+3. Navigate to the `aidp-bucket` bucket and upload your documents (PDF, DOCX, etc.).
+4. The system automatically publishes upload events to Kafka, the consumer retrieves the files, and documents are sent to the ingestor for indexing into the `aidp_bucket` collection.
+5. Query the ingested content through the RAG Frontend UI at `http://<host-ip>:8090` (select the `aidp_bucket` collection) or via the RAG API at `http://<host-ip>:8081/generate`.
+
+### Key Environment Variables
+
+The following environment variables configure the Kafka consumer. For details, refer to `examples/rag_event_ingest/deploy/docker-compose.yaml`.
+
+Consumer environment variables
+
+| Variable | Description | Default Value|
+|----------|---------|--------|
+| `KAFKA_BOOTSTRAP_SERVERS` | Address of the Kafka broker(s). | `kafka:9092` |
+| `KAFKA_TOPIC` | Kafka topic used for object storage events. | `aidp-topic` |
+| `MINIO_ENDPOINT` | MinIO endpoint in `<host>:<port>` format. | `minio-source-1:9000` |
+| `INGESTOR_SERVER_URL` | Base URL for the RAG ingestor service. | `http://ingestor-server:8082` |
+| `COLLECTION_NAME` | Target RAG collection for content indexing. | `aidp_bucket` |
+
+## Reference
+
+- [RAG Blueprint deployment (Docker self-hosted)](deploy-docker-self-hosted.md)
+- [Ingestor API](api-ingestor.md)
+- [Notebook: Document continuous ingestion from object storage](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_event_ingest.ipynb)
+- [Example: `examples/rag_event_ingest/`](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/examples/rag_event_ingest/) — Kafka consumer and `deploy/docker-compose.yaml`
diff --git a/docs/custom-metadata.md b/docs/custom-metadata.md
index dba96e461..ce96b63ba 100644
--- a/docs/custom-metadata.md
+++ b/docs/custom-metadata.md
@@ -233,12 +233,12 @@ The system automatically manages certain metadata fields that are added to all c
 | Field Name | Type | Description | Auto-Populated | User Override |
 |------------|------|-------------|----------------|---------------|
 | **`filename`** | `string` | Name of the uploaded file | ✅ RAG system | ✅ Yes - define in schema |
-| **`page_number`** | `integer` | Page number where content appears (1-indexed) | ✅ nv-ingest | ✅ Yes - define in schema |
-| **`start_time`** | `integer` | Start timestamp in milliseconds for audio/video segments | ✅ nv-ingest | ✅ Yes - define in schema |
-| **`end_time`** | `integer` | End timestamp in milliseconds for audio/video segments | ✅ nv-ingest | ✅ Yes - define in schema |
+| **`page_number`** | `integer` | Page number where content appears (1-indexed) | ✅ NeMo Retriever Library | ✅ Yes - define in schema |
+| **`start_time`** | `integer` | Start timestamp in milliseconds for audio/video segments | ✅ NeMo Retriever Library | ✅ Yes - define in schema |
+| **`end_time`** | `integer` | End timestamp in milliseconds for audio/video segments | ✅ NeMo Retriever Library | ✅ Yes - define in schema |
 
 :::{note}
-The following field names are **reserved** by NV-Ingest and cannot be used in custom metadata schemas: `type`, `subtype`, and `location`. These fields are exclusively managed by NV-Ingest during document processing and attempting to use them will result in a validation error.
+The following field names are **reserved** by NeMo Retriever Library and cannot be used in custom metadata schemas: `type`, `subtype`, and `location`. These fields are exclusively managed by NeMo Retriever Library during document processing and attempting to use them will result in a validation error.
 :::
 
 #### System-Managed Field Behavior
@@ -246,7 +246,7 @@ The following field names are **reserved** by NV-Ingest and cannot be used in cu
 - **Auto-Addition**: These fields are automatically added to your collection schema if you don't define them
 - **Auto-Population**: 
   - `filename` is populated by the RAG system during ingestion
-  - `page_number`, `start_time`, `end_time` are extracted and populated by nv-ingest during document processing
+  - `page_number`, `start_time`, `end_time` are extracted and populated by NeMo Retriever Library during document processing
 - **User Override**: You can define any of these fields in your schema with custom properties (e.g., different description, constraints)
   - If you provide a definition, your definition takes priority
   - If you don't provide a definition, the system auto-adds them with default settings
@@ -258,7 +258,7 @@ The following field names are **reserved** by NV-Ingest and cannot be used in cu
 :::{note}
 **Example**: If you upload a multi-page PDF without defining `page_number` in your schema, the system will:
 1. Automatically add the `page_number` field to your collection schema
-2. nv-ingest will extract the page number from each chunk during processing
+2. NeMo Retriever Library extracts the page number from each chunk during processing
 3. The page number will be available for filtering (e.g., `content_metadata["page_number"] == 5`)
 4. The page number will appear in citations when generating responses
 :::
diff --git a/docs/debugging.md b/docs/debugging.md
index 66a580419..1fb5aea0d 100644
--- a/docs/debugging.md
+++ b/docs/debugging.md
@@ -33,7 +33,7 @@ docker logs -f nim-llm-ms
 watch -n 10 'du -sh ~/.cache/model-cache/'
 
 # Check specific container resource usage
-docker stats nim-llm-ms nemoretriever-embedding-ms nemoretriever-ranking-ms
+docker stats nim-llm-ms nemotron-embedding-ms nemotron-ranking-ms
 ```
 
 The expected timeline for Docker (Self-Hosted) deployment is the following:
@@ -124,12 +124,12 @@ docker ps | grep -E "(ingestor-server|nv-ingest|nemoretriever-embedding|milvus|r
    milvus-standalone                       Up 36 minutes (healthy)
    milvus-minio                            Up 35 minutes (healthy)
    milvus-etcd                             Up 35 minutes (healthy)
-   nemoretriever-ranking-ms                Up 38 minutes (healthy)
+   nemotron-ranking-ms                     Up 38 minutes (healthy)
    compose-page-elements-1                 Up 38 minutes
    compose-nemoretriever-ocr-1             Up 38 minutes
    compose-graphic-elements-1              Up 38 minutes
    compose-table-structure-1               Up 38 minutes
-   nemoretriever-embedding-ms              Up 38 minutes (healthy)
+   nemotron-embedding-ms                   Up 38 minutes (healthy)
    nim-llm-ms                              Up 38 minutes (healthy)
    ```
 
@@ -141,7 +141,7 @@ docker ps | grep -E "(ingestor-server|nv-ingest|nemoretriever-embedding|milvus|r
 # Check ingestor server health with all dependencies
 curl -X GET "http://localhost:8082/v1/health?check_dependencies=true" | jq
 
-# Verify NV-Ingest runtime is ready for processing
+# Verify NeMo Retriever Library runtime is ready for processing
 curl -X GET "http://localhost:7670/v1/health/ready"
 
 # Check embedding service is responding
@@ -219,11 +219,11 @@ Start by examining the logs of key ingestion services to identify the specific e
 # Check ingestor server logs for API errors
 docker logs ingestor-server --tail 100
 
-# Check NV-Ingest runtime logs for processing errors
+# Check NeMo Retriever Library runtime logs for processing errors
 docker logs nv-ingest-ms-runtime --tail 100
 
 # Check embedding service logs for model issues
-docker logs nemoretriever-embedding-ms --tail 100
+docker logs nemotron-embedding-ms --tail 100
 ```
 
 ### 2. Common Ingestion Problems and Solutions
@@ -245,15 +245,15 @@ docker logs milvus-standalone --tail 50
 **Embedding Service Issues:**
 ```bash
 # Check embedding service logs
-docker logs nemoretriever-embedding-ms --tail 100
+docker logs nemotron-embedding-ms --tail 100
 
 # Verify GPU availability and memory
 nvidia-smi
 ```
 
-**NV-Ingest Processing Errors:**
+**NeMo Retriever Library Processing Errors:**
 ```bash
-# Check NV-Ingest logs for processing errors
+# Check NeMo Retriever Library logs for processing errors
 docker logs nv-ingest-ms-runtime --tail 200 | grep -i error
 
 # Check Redis connectivity for task queue
@@ -288,7 +288,7 @@ docker logs rag-server --tail 100
 docker logs nim-llm-ms --tail 100
 
 # Check ranking service logs for reranking errors
-docker logs nemoretriever-ranking-ms --tail 100
+docker logs nemotron-ranking-ms --tail 100
 ```
 
 ### 2. Common Retrieval Problems and Solutions
diff --git a/docs/deploy-docker-nvidia-hosted.md b/docs/deploy-docker-nvidia-hosted.md
index 4487edff3..2aabd06ce 100644
--- a/docs/deploy-docker-nvidia-hosted.md
+++ b/docs/deploy-docker-nvidia-hosted.md
@@ -111,7 +111,7 @@ Use the following procedure to start all containers needed for this blueprint.
         ],
         "processing": [
             {
-                "service": "NV-Ingest",
+                "service": "NeMo Retriever Library",
                 "status": "healthy",
                 ...
             }
@@ -238,7 +238,7 @@ After the first time you deploy the RAG Blueprint successfully, you can consider
 
 - If you don't have a GPU available, you can switch to CPU-only Milvus by following the instructions in [milvus-configuration.md](./milvus-configuration.md).
 
-- If you have a requirement to build the NVIDIA Ingest runtime container from source, you can do it by following instructions [here](https://github.com/NVIDIA/nv-ingest).
+- If you have a requirement to build the NeMo Retriever Library runtime container from source, you can do it by following instructions [here](https://github.com/NVIDIA/NeMo-Retriever).
 
 
 
diff --git a/docs/deploy-docker-self-hosted.md b/docs/deploy-docker-self-hosted.md
index 0efe64ea6..4913be36a 100644
--- a/docs/deploy-docker-self-hosted.md
+++ b/docs/deploy-docker-self-hosted.md
@@ -110,7 +110,7 @@ Use the following procedure to start all containers needed for this blueprint.
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml up -d
    ```
 
-5. Check the status of the deployment by running the following code. Wait until all services are up and the `nemoretriever-ranking-ms`, `nemoretriever-embedding-ms` and `nim-llm-ms`  NIMs are in healthy state before proceeding further.
+5. Check the status of the deployment by running the following command. Wait until all services are up and the `nemotron-ranking-ms`, `nemotron-embedding-ms`, and `nim-llm-ms` NIMs are in a healthy state before proceeding further.
 
      ```bash
      watch -n 2 'docker ps --format "table {{.Names}}\t{{.Status}}"'
@@ -121,10 +121,10 @@ Use the following procedure to start all containers needed for this blueprint.
         NAMES                                   STATUS
 
         nim-llm-ms                    Up 4 minutes (healthy)
-        nemoretriever-ranking-ms      Up 4 minutes (healthy)
+        nemotron-ranking-ms           Up 4 minutes (healthy)
         compose-graphic-elements-1    Up 4 minutes
         compose-page-elements-1       Up 4 minutes
-        nemoretriever-embedding-ms    Up 4 minutes (healthy)
+        nemotron-embedding-ms         Up 4 minutes (healthy)
         compose-nemoretriever-ocr-1   Up 4 minutes
         compose-table-structure-1     Up 4 minutes
      ```
@@ -174,7 +174,7 @@ Use the following procedure to start all containers needed for this blueprint.
         ],
         "processing": [
             {
-                "service": "NV-Ingest",
+                "service": "NeMo Retriever Library",
                 "status": "healthy",
                 ...
             }
@@ -253,10 +253,10 @@ Use the following procedure to start all containers needed for this blueprint.
     340bc8210a0d   milvus-minio                     Up 3 minutes (healthy)
     0be702b87ad6   milvus-etcd                      Up 3 minutes (healthy)
     62eabf1d9f65   nim-llm-ms                       Up 10 minutes (healthy)
-    fe2751bfa734   nemoretriever-ranking-ms         Up 10 minutes (healthy)
+    fe2751bfa734   nemotron-ranking-ms              Up 10 minutes (healthy)
     7b5ddabf8be7   compose-graphic-elements-1       Up 10 minutes
     ecfaa5190302   compose-page-elements-1          Up 10 minutes
-    ea8c7fdf20d1   nemoretriever-embedding-ms       Up 10 minutes (healthy)
+    ea8c7fdf20d1   nemotron-embedding-ms            Up 10 minutes (healthy)
     6d62008a9b42   compose-nemoretriever-ocr-1      Up 10 minutes
     969b9f5c987c   compose-table-structure-1        Up 10 minutes
     ```
@@ -333,11 +333,11 @@ After the first time you deploy the RAG Blueprint successfully, you can consider
 - For improved accuracy, consider enabling reasoning mode. For details, refer to [Enable thinking](./enable-nemotron-thinking.md).
 
 
-- NeMo Retriever OCR is now the default OCR service. To use legacy Paddle OCR instead, refer to [OCR Configuration Guide](nemoretriever-ocr.md).
+- NeMo Retriever Library OCR is now the default OCR service. To use legacy Paddle OCR instead, refer to [OCR Configuration Guide](nemoretriever-ocr.md).
 
 - For advanced users who need direct filesystem access to extraction results, refer to [Ingestor Server Volume Mounting](mount-ingestor-volume.md).
 
-- A single NVIDIA A100-80GB or H100-80GB, B200 GPU can be used to start non-LLM NIMs (nemoretriever-embedding-ms, nemoretriever-ranking-ms, and ingestion services like page-elements, ocr, graphic-elements, and table-structure) for ingestion and RAG workflows. You can control which GPU is used for each service by setting these environment variables in `deploy/compose/.env` file before launching. For a complete list of all services and their default GPU assignments, see [Service Port and GPU Reference](service-port-gpu-reference.md).
+- A single NVIDIA A100-80GB, H100-80GB, or B200 GPU can be used to start non-LLM NIMs (nemotron-embedding-ms, nemotron-ranking-ms, and ingestion services like page-elements, ocr, graphic-elements, and table-structure) for ingestion and RAG workflows. You can control which GPU is used for each service by setting these environment variables in the `deploy/compose/.env` file before launching. For a complete list of all services and their default GPU assignments, see [Service Port and GPU Reference](service-port-gpu-reference.md).
 
    ```bash
    EMBEDDING_MS_GPU_ID=0
diff --git a/docs/deploy-helm-from-repo.md b/docs/deploy-helm-from-repo.md
index 04c39829a..e57c9ea26 100644
--- a/docs/deploy-helm-from-repo.md
+++ b/docs/deploy-helm-from-repo.md
@@ -14,7 +14,7 @@ The following are the core services that you install:
 
 - RAG server
 - Ingestor server
-- NV-Ingest
+- NeMo Retriever Library
 
 
 ## Prerequisites
diff --git a/docs/deploy-helm.md b/docs/deploy-helm.md
index bf8e792c5..18940aaf6 100644
--- a/docs/deploy-helm.md
+++ b/docs/deploy-helm.md
@@ -14,7 +14,7 @@ The following are the core services that you install:
 
 - RAG server
 - Ingestor server
-- NV-Ingest
+- NeMo Retriever Library
 
 
 ## Prerequisites
@@ -37,7 +37,7 @@ Plan for additional space if you are enabling persistence for multiple services.
 
 4. Verify that you have Kubernetes v1.34.2 installed and running on Ubuntu 22.04/24.04. For more information, see [Kubernetes documentation](https://kubernetes.io/docs/setup/) and [NVIDIA Cloud Native Stack](https://github.com/NVIDIA/cloud-native-stack).
 
-5. Verify that you have installed Helm 3.  To install Helm 3 (and avoid Helm 4), follow the official Helm v3 installation instructions for your platform, for example by using the `get-helm-3` script described in the [Helm documentation](https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3).
+5. Verify that you have installed Helm 3.  To install Helm 3 (and avoid Helm 4), follow the official Helm v3 installation instructions for your platform, for example by using the `get-helm-3` script described in the [Helm documentation](https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3).
 
 6. Verify that you have a default storage class available in the cluster for PVC provisioning. One option is the local path provisioner by Rancher.   Refer to the [installation](https://github.com/rancher/local-path-provisioner?tab=readme-ov-file#installation) section of the README in the GitHub repository.
 
@@ -87,7 +87,7 @@ To deploy End-to-End RAG Server and Ingestor Server, use the following procedure
 2. Install the Helm chart by running the following command.
 
     ```sh
-    helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+    helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
     --username '$oauthtoken' \
     --password "${NGC_API_KEY}" \
     --set imagePullSecret.password=$NGC_API_KEY \
@@ -112,7 +112,7 @@ To deploy End-to-End RAG Server and Ingestor Server, use the following procedure
    
    Then install using the modified values.yaml:
    ```sh
-   helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+   helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
      --username '$oauthtoken' \
      --password "${NGC_API_KEY}" \
      --set imagePullSecret.password=$NGC_API_KEY \
@@ -125,6 +125,8 @@ To deploy End-to-End RAG Server and Ingestor Server, use the following procedure
    Refer to [NIM Model Profile Configuration](model-profiles.md) for using non-default NIM LLM profile.
    :::
 
+   For **Nemotron 3 Super** on Helm, see the [Nemotron 3 Super deployment guide](nemotron3-super-deployment.md#helm-deployment-nemotron-3-super).
+
 
 ## Verify a Deployment
 
@@ -146,11 +148,11 @@ To verify a deployment, use the following procedure.
     NAME                                                 READY   STATUS      RESTARTS   AGE
     ingestor-server-6cc886bcdf-6rfwm                     1/1     Running     0          54m
     milvus-standalone-7dd5db4755-ctqzg                   1/1     Running     0          54m
-    nemoretriever-embedding-ms-86f75c8f65-dfhd2          1/1     Running     0          39m
+    nemotron-embedding-ms-86f75c8f65-dfhd2               1/1     Running     0          39m
     nemoretriever-graphic-elements-v1-67d9d65bdc-ftbkw   1/1     Running     0          33m
     nemoretriever-ocr-v1-78f56cddb9-f4852                1/1     Running     0          40m
     nemoretriever-page-elements-v3-56ddcf9b4b-qsg82      1/1     Running     0          49m
-    nemoretriever-ranking-ms-5ff774889f-fwrlm            1/1     Running     0          40m
+    nemotron-ranking-ms-5ff774889f-fwrlm                 1/1     Running     0          40m
     nemoretriever-table-structure-v1-696c9f5665-l9sxn    1/1     Running     0          37m
     nim-llm-7cb9bdcc89-hwpkq                             1/1     Running     0          11m
     nim-llm-cache-job-77hpc                              0/1     Completed   0          94s
@@ -209,11 +211,11 @@ To verify a deployment, use the following procedure.
     NAME                                TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)              AGE
     ingestor-server                     ClusterIP   10.107.12.217    <none>        8082/TCP             54m
     milvus                              ClusterIP   10.99.110.203    <none>        19530/TCP,9091/TCP   54m
-    nemoretriever-embedding-ms          ClusterIP   10.104.99.15     <none>        8000/TCP,8001/TCP    54m
+    nemotron-embedding-ms               ClusterIP   10.104.99.15     <none>        8000/TCP,8001/TCP    54m
     nemoretriever-graphic-elements-v1   ClusterIP   10.96.115.45     <none>        8000/TCP,8001/TCP    54m
     nemoretriever-ocr-v1                ClusterIP   10.100.107.215   <none>        8000/TCP,8001/TCP    54m
     nemoretriever-page-elements-v3      ClusterIP   10.102.237.196   <none>        8000/TCP,8001/TCP    54m
-    nemoretriever-ranking-ms            ClusterIP   10.96.114.244    <none>        8000/TCP,8001/TCP    54m
+    nemotron-ranking-ms                 ClusterIP   10.96.114.244    <none>        8000/TCP,8001/TCP    54m
     nemoretriever-table-structure-v1    ClusterIP   10.107.227.139   <none>        8000/TCP,8001/TCP    54m
     nim-llm                             ClusterIP   10.104.60.155    <none>        8000/TCP,8001/TCP    54m
     rag-etcd                            ClusterIP   10.104.74.116    <none>        2379/TCP,2380/TCP    54m
@@ -250,7 +252,7 @@ Port-forwarding is provided as a quick method to try out the UI. However, large
 To change an existing deployment, after you modify the [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) file, run the following code.
 
 ```sh
-helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
 --username '$oauthtoken' \
 --password "${NGC_API_KEY}" \
 --set imagePullSecret.password=$NGC_API_KEY \
diff --git a/docs/documentation.md b/docs/documentation.md
index de15a9254..53b943dfa 100644
--- a/docs/documentation.md
+++ b/docs/documentation.md
@@ -8,6 +8,9 @@
 - [Documentation Development](#documentation-development)
   - [Build the Documentation](#build-the-documentation)
   - [Live Building](#live-building)
+  - [Documentation Version](#documentation-version)
+    - [Publishing multiple versions on the public site](#publishing-multiple-versions-on-the-public-site)
+    - [Multi-version build script](#multi-version-build-script)
 
 ## Build the Documentation
 
@@ -40,4 +43,46 @@ The three files below control the version switcher. Before you attempt to publis
 
 * docs/versions1.json
 * docs/project.json
-* docs/conf.py
\ No newline at end of file
+* docs/conf.py
+
+Validate the manifest and that `release` matches `project.json` before building:
+
+```sh
+uv run python docs/scripts/verify_doc_version_manifest.py
+```
+
+### Publishing multiple versions on the public site
+
+Use the **same** `docs/versions1.json` content for every release line you build (list every published version; `preferred` should be `true` only for the default, usually the latest). On each **release branch or tag**, set `release` in `conf.py` and `version` in `project.json` to that line’s version (for example `2.4.0` on the `2.4.x` branch), then build:
+
+```sh
+uv run --group docs sphinx-build . _build/html
+```
+
+Deploy the HTML so each line lives as a **sibling** folder, for example `2.3.0/`, `2.4.0/`, `2.5.0/`. The theme resolves `../versions1.json` from the version **index** page to a file **next to** those folders (the parent directory). Copy the same `docs/versions1.json` to that parent as `versions1.json` when you publish, or ensure your pipeline deploys it there once per release. If you add a version to the manifest, rebuild (or redeploy) each affected tree and refresh the root `versions1.json`; invalidate CDN cache if the menu still looks stale.
+
+### Multi-version build script
+
+From the repository root, you can build several release lines into one tree: `docs/_build/multiversion/{version}/` plus a root `versions1.json`. The script reads your current `docs/versions1.json` as the canonical manifest, then for each version checks out git tag `v{version}` if it exists, otherwise branch `release-v{version}`, writes that manifest into `docs/versions1.json`, runs the verifier, and runs Sphinx. Your original `HEAD` is restored at the end.
+
+Preview which refs will be used (no git or build):
+
+```powershell
+.\docs\scripts\build_multiversion_docs.ps1 -DryRun
+```
+
+Full build (requires a clean working tree, or pass `-AllowDirty`):
+
+```powershell
+.\docs\scripts\build_multiversion_docs.ps1 -Versions @('2.3.0','2.4.0','2.5.0')
+```
+
+On Linux or macOS:
+
+```sh
+chmod +x docs/scripts/build_multiversion_docs.sh
+./docs/scripts/build_multiversion_docs.sh --dry-run
+./docs/scripts/build_multiversion_docs.sh --versions 2.3.0,2.4.0,2.5.0
+```
+
+Serve the result locally, for example: `python -m http.server 8080 --directory docs/_build/multiversion` and open `http://localhost:8080/2.5.0/` to confirm the switcher.
\ No newline at end of file
diff --git a/docs/enable-nemotron-thinking.md b/docs/enable-nemotron-thinking.md
index 182a09ea1..dc95b4285 100644
--- a/docs/enable-nemotron-thinking.md
+++ b/docs/enable-nemotron-thinking.md
@@ -19,9 +19,106 @@ This guide explains how to enable reasoning for different Nemotron models, each
 
 | Model | Control Method | Thinking Budget Parameters |
 |-------|----------------|----------------------------|
+| Nemotron 3 (Nano 30B, and others) | Environment variables | `LLM_ENABLE_THINKING`, `LLM_REASONING_BUDGET`, `LLM_LOW_EFFORT` |
 | Nemotron 1.5 | System prompts | None |
 | Nemotron-3-Nano 9B | System prompts | min/max thinking tokens |
-| Nemotron-3-Nano 30B | Environment variable | max thinking tokens only |
+
+## Enable Reasoning for Nemotron 3 Models
+
+Nemotron 3 models (such as `nvidia/nemotron-3-nano-30b-a3b`) use environment variables to control reasoning.
+
+Set the following environment variables on the RAG server container (via Docker Compose, Helm values, or shell export):
+
+**`LLM_ENABLE_THINKING`**
+: Enable or disable the reasoning phase. When `true`, the model emits reasoning tokens before the final answer. Default: `false`.
+
+**`LLM_REASONING_BUDGET`**
+: Maximum number of tokens allocated for reasoning. Only used when `LLM_ENABLE_THINKING` is `true`. Default: `0`.
+
+**`LLM_LOW_EFFORT`**
+: Low-effort reasoning mode for faster, cheaper responses with shorter reasoning. Only used when `LLM_ENABLE_THINKING` is `true`. Default: `false`.
+
+**`FILTER_THINK_TOKENS`**
+: Filter content between `<think>` and `</think>` tags in model responses. Keep `true` for production to return only the final answer. Set `false` to see the full reasoning process. Default: `true`.
+
+:::{important}
+**Disabling reasoning:** To disable reasoning, set **`LLM_ENABLE_THINKING=false`**. Setting `LLM_REASONING_BUDGET=0` alone does not disable reasoning: when the budget is `0`, the RAG pipeline does not pass it to the LLM, and the model uses its default reasoning behavior. Always set `LLM_ENABLE_THINKING=false` to turn reasoning off.
+:::
+
+## Configuration Examples for Nemotron 3 Models
+
+Nemotron 3 models (such as `nvidia/nemotron-3-super-120b-a12b` and `nvidia/nemotron-3-nano-30b-a3b`) use environment variables to control reasoning.
+
+### Basic Configuration
+
+```bash
+export LLM_ENABLE_THINKING=true
+```
+
+### Configure Reasoning Budget (Optional)
+
+Limit the number of reasoning tokens to control latency and cost:
+
+```bash
+export LLM_ENABLE_THINKING=true
+export LLM_REASONING_BUDGET=8192
+```
+
+### Low-Effort Mode (Optional)
+
+For faster responses where deep reasoning is unnecessary:
+
+```bash
+export LLM_ENABLE_THINKING=true
+export LLM_LOW_EFFORT=true
+```
+
+### Configure Model Parameters
+
+After you enable reasoning, configure the model parameters for optimal reasoning performance:
+
+```bash
+export LLM_TEMPERATURE=0.6
+export LLM_TOP_P=0.95
+```
+
+### Nemotron-3-Nano 30B
+
+For `nvidia/nemotron-3-nano-30b-a3b`, reasoning is controlled with the same `LLM_ENABLE_THINKING` variable. The reasoning budget can be set with either `LLM_REASONING_BUDGET` or `LLM_MAX_THINKING_TOKENS`:
+
+```bash
+export LLM_ENABLE_THINKING=true
+export LLM_REASONING_BUDGET=8192
+```
+
+The 30B model also supports a maximum thinking token limit directly in API requests:
+
+```json
+{
+  "model": "nvidia/nemotron-3-nano-30b-a3b",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is the capital of France?"
+    }
+  ],
+  "max_thinking_tokens": 8192
+}
+```
+
+**Thinking budget parameters:**
+
+**`max_thinking_tokens`**
+: Maximum number of reasoning tokens allowed before generating the final answer.
+
+:::{important}
+The key differences for the 30B model are the following:
+
+- Uses only `max_thinking_tokens` (not `min_thinking_tokens`)
+- Reasoning is available in the model output's `reasoning_content` field (not wrapped in `<think>` tags)
+- The `reasoning_content` field is present in the model output but isn't exposed in the generate API response
+- No filtering is needed because reasoning is already separated from the final answer
+:::
 
 ## Enable Reasoning for Nemotron 1.5
 
@@ -81,7 +178,7 @@ export FILTER_THINK_TOKENS=false
 For most production use cases, keep `FILTER_THINK_TOKENS=true` (default) to provide cleaner responses to end users.
 :::
 
-## Enable Reasoning for Nemotron-3-Nano 9B
+## Enable Reasoning for Nemotron Nano 9B
 
 The `nvidia/nvidia-nemotron-nano-9b-v2` model uses system prompts to control reasoning similar to Nemotron 1.5. It also adds support for thinking budget parameters to control the extent of reasoning.
 
@@ -132,63 +229,6 @@ The key differences for the 9B model are the following:
 - No filtering is needed because reasoning is already separated from the final answer
 :::
 
-## Enable Reasoning for Nemotron-3-Nano 30B
-
-The `nvidia/nemotron-3-nano-30b-a3b` model uses a different approach for reasoning control. Instead of system prompts, you control reasoning through an environment variable.
-
-### Enable Reasoning Through an Environment Variable
-
-Set the environment variable to enable or disable reasoning:
-
-```bash
-# Enable reasoning (default)
-export ENABLE_NEMOTRON_3_NANO_THINKING=true
-
-# Disable reasoning
-export ENABLE_NEMOTRON_3_NANO_THINKING=false
-```
-
-### Configure Thinking Budget (Optional)
-
-The 30B model supports a maximum thinking token limit to control the reasoning phase:
-
-```json
-{
-  "model": "nvidia/nemotron-3-nano-30b-a3b",
-  "messages": [
-    {
-      "role": "user",
-      "content": "What is the capital of France?"
-    }
-  ],
-  "max_thinking_tokens": 8192
-}
-```
-
-**Thinking budget parameters:**
-
-**`max_thinking_tokens`**
-: Maximum number of reasoning tokens allowed before generating the final answer.
-
-:::{important}
-The key differences for the 30B model are the following:
-
-- Uses only `max_thinking_tokens` (not `min_thinking_tokens`)
-- Reasoning is available in the model output's `reasoning_content` field (not wrapped in `<think>` tags)
-- The `reasoning_content` field is present in the model output but isn't exposed in the generate API response
-- No filtering is needed because reasoning is already separated from the final answer
-:::
-
-### Model Naming
-
-Use the correct model name based on your deployment:
-
-**Locally deployed NIMs**
-: `nvidia/nemotron-3-nano`
-
-**NVIDIA-hosted models**
-: `nvidia/nemotron-3-nano-30b-a3b`
-
 ## Deploy with Reasoning Enabled
 
 After you configure reasoning settings in `prompt.yaml` or environment variables, redeploy your services:
@@ -220,6 +260,7 @@ Adjust the thinking budget based on your use case:
 
 - **Lower values (1024-4096)**: Faster responses for simpler questions
 - **Higher values (8192-16384)**: More thorough reasoning for complex queries
+- **Low-effort mode**: Use `LLM_LOW_EFFORT=true` for fast, low-cost reasoning when deep thought is not required
 :::
 
 ## Related Topics
diff --git a/docs/evaluate.md b/docs/evaluate.md
index 0d83e83b0..2485682dc 100644
--- a/docs/evaluate.md
+++ b/docs/evaluate.md
@@ -7,6 +7,8 @@
 After you [deploy your NVIDIA RAG Blueprint system](readme.md#deployment-options-for-rag-blueprint),
 you can evaluate it by using [Ragas](https://docs.ragas.io/en/stable/) metrics specifically designed for Large Language Model (LLM) Applications.
 
+For published benchmark results across multiple datasets and configurations, refer to [RAG Accuracy Benchmarks](accuracy-benchmarks.md).
+
 
 ## Ragas Metrics
 
@@ -36,3 +38,4 @@ For more information, refer to the notebook [Evaluate Your RAG Pipeline with Rag
 - [NVIDIA RAG Blueprint Documentation](readme.md)
 - [Get Started](deploy-docker-self-hosted.md)
 - [Notebooks](notebooks.md)
+- [RAG Accuracy Benchmarks](accuracy-benchmarks.md)
diff --git a/docs/index.md b/docs/index.md
index 941b2f9d8..fc5df2ecf 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,5 +1,6 @@
 <!--
-  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+  SPDX-FileCopyrightText: Copyright (c) 2025, 2026
+   NVIDIA CORPORATION & AFFILIATES. All rights reserved.
   SPDX-License-Identifier: Apache-2.0
 -->
 # NVIDIA RAG Blueprint Documentation
@@ -81,6 +82,7 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 - Data Ingestion & Processing
 
     - [Audio Ingestion Support](audio_ingestion.md)
+    - [Continuous Ingestion from Object Storage](continuous-ingestion-object-storage.md)
     - [Custom Metadata Support](custom-metadata.md)
     - [File System Access to Extraction Results](mount-ingestor-volume.md)
     - [Multimodal Embedding Support (Early Access)](vlm-embed.md)
@@ -110,6 +112,8 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 - Evaluation
 
     - [Evaluate Your NVIDIA RAG Blueprint System](evaluate.md)
+    - [RAG Accuracy Benchmarks](accuracy-benchmarks.md)
+    - [RAG Performance Benchmarks](perf-benchmarks.md)
 
 - Governance
 
@@ -141,7 +145,7 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 
 ## Blog Posts
 
-- [NVIDIA NeMo Retriever Delivers Accurate Multimodal PDF Data Extraction 15x Faster](https://developer.nvidia.com/blog/nvidia-nemo-retriever-delivers-accurate-multimodal-pdf-data-extraction-15x-faster/)
+- [NVIDIA NeMo Retriever Library Delivers Accurate Multimodal PDF Data Extraction 15x Faster](https://developer.nvidia.com/blog/nvidia-nemo-retriever-delivers-accurate-multimodal-pdf-data-extraction-15x-faster/)
 - [Finding the Best Chunking Strategy for Accurate AI Responses](https://developer.nvidia.com/blog/finding-the-best-chunking-strategy-for-accurate-ai-responses/)
 
 
@@ -177,6 +181,7 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
    :maxdepth: 1
    :hidden:
 
+   Deploy with Docker (Self-Hosted Models) <deploy-docker-self-hosted.md>
    Deploy with Docker (NVIDIA-Hosted Models) <deploy-docker-nvidia-hosted.md>
    Deploy on Kubernetes with Helm <deploy-helm.md>
    Deploy on Kubernetes with Helm from the repository <deploy-helm-from-repo.md>
@@ -211,13 +216,14 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
    :hidden:
 
    Audio Ingestion Support <audio_ingestion.md>
+   Continuous Ingestion from Object Storage <continuous-ingestion-object-storage.md>
    Custom metadata Support <custom-metadata.md>
    Data Catalog for Collections and Documents <data-catalog.md>
    File System Access to Results <mount-ingestor-volume.md>
    Multimodal Embedding Support (Early Access) <vlm-embed.md>
    OCR Configuration Guide <nemoretriever-ocr.md>
    Enhanced PDF Extraction <nemotron-parse-extraction.md>
-   Standalone NV-Ingest <nv-ingest-standalone.md>
+   Standalone NeMo Retriever Library <nv-ingest-standalone.md>
    Text-Only Ingestion <text_only_ingest.md>
    MCP Server Usage <mcp.md>
 ```
@@ -255,6 +261,8 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
    :hidden:
 
    Evaluate Your RAG System <evaluate.md>
+   RAG Accuracy Benchmarks <accuracy-benchmarks.md>
+   RAG Performance Benchmarks <perf-benchmarks.md>
 ```
 
 
diff --git a/docs/mig-deployment.md b/docs/mig-deployment.md
index bc4793ab1..d2ee3cc5e 100644
--- a/docs/mig-deployment.md
+++ b/docs/mig-deployment.md
@@ -15,10 +15,10 @@ refer to the [MIG Supported Hardware List](https://docs.nvidia.com/datacenter/te
 
 Before you deploy, verify that you have the following:
 
-* A Kubernetes cluster with NVIDIA H100 GPUs
+* A Kubernetes cluster with NVIDIA H100 or RTX PRO 6000 GPUs
 
    :::{note}
-   This section showcases MIG support for `NVIDIA H100 80GB HBM3` GPU. The MIG profiles used in the `mig-config.yaml` are specific to this GPU.
+   This section showcases MIG support for `NVIDIA H100 80GB HBM3` GPU. The MIG profiles used in the `mig-config-h100.yaml` are specific to this GPU.
    Refer to the [MIG User Guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/) for MIG profiles of other GPU types.
    :::
 
@@ -35,9 +35,9 @@ For monitoring deployment progress, refer to [Deploy on Kubernetes with Helm](./
 
 3. Verify that you have the NGC CLI available on your client computer. You can download the CLI from <https://ngc.nvidia.com/setup/installers/cli>.
 
-4. Verify that you have Kubernetes v1.34.2 installed and running on Ubuntu 22.04/24.04. For more information, see [Kubernetes documentation](https://kubernetes.io/docs/setup/) and [NVIDIA Cloud Native Stack 17.0](https://github.com/NVIDIA/cloud-native-stack/tree/17.0).
+4. Verify that you have Kubernetes v1.34.2 installed and running on Ubuntu 22.04/24.04. For more information, see [Kubernetes documentation](https://kubernetes.io/docs/setup/) and [NVIDIA Cloud Native Stack 17.0](https://github.com/NVIDIA/cloud-native-stack/tree/25.12.0).
 
-5. Verify that you have installed Helm 3 or later (Helm v3.20.0 recommended). For installation instructions, see [Helm Installation](https://helm.sh/docs/intro/install).
+5. Verify that you have installed Helm 3. To install Helm 3 (and avoid Helm 4), follow the official Helm v3 installation instructions for your platform, for example by using the [`get-helm-3` script](https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3) from the Helm repository.
 
 6. Verify that you have a default storage class available in the cluster for PVC provisioning. One option is the local path provisioner by Rancher.   Refer to the [installation](https://github.com/rancher/local-path-provisioner?tab=readme-ov-file#installation) section of the README in the GitHub repository.
 
@@ -99,7 +99,7 @@ For monitoring deployment progress, refer to [Deploy on Kubernetes with Helm](./
 
 ## Step 2: Apply the MIG configuration
 
-Edit the MIG configuration file [`mig-config.yaml`](../deploy/helm/mig-slicing/mig-config.yaml) to adjust the slicing pattern as needed.
+Edit the MIG configuration file [`mig-config-h100.yaml`](../deploy/helm/mig-slicing/mig-config-h100.yaml) to adjust the slicing pattern as needed.
 The following example enables a custom configuration with mixed MIG slice sizes on the same GPU.
 
 
@@ -139,7 +139,7 @@ data:
 Apply the custom MIG configuration configMap to the node and update the ClusterPolicy, by running the following code.
 
 ```bash
-kubectl apply -n nvidia-gpu-operator -f mig-slicing/mig-config.yaml
+kubectl apply -n nvidia-gpu-operator -f mig-slicing/mig-config-h100.yaml
 kubectl patch clusterpolicies.nvidia.com/cluster-policy \
   --type='json' \
   -p='[{"op":"replace", "path":"/spec/migManager/config/name", "value":"custom-mig-config"}]'
@@ -151,6 +151,20 @@ Label the node with MIG configuration, by running the following code.
 kubectl label nodes <node-name> nvidia.com/mig.config=custom-7x1g10-2x1g20-1x3g40-1x7g80 --overwrite
 ```
 
+:::{important}
+**For NVIDIA RTX6000 Pro Deployments:**
+
+Use [`mig-config-rtx6000.yaml`](../deploy/helm/mig-slicing/mig-config-rtx6000.yaml) instead:
+
+```bash
+kubectl apply -n nvidia-gpu-operator -f mig-slicing/mig-config-rtx6000.yaml
+kubectl patch clusterpolicies.nvidia.com/cluster-policy \
+  --type='json' \
+  -p='[{"op":"replace", "path":"/spec/migManager/config/name", "value":"custom-mig-config"}]'
+kubectl label nodes <node-name> nvidia.com/mig.config=custom-rtx6000-4x1g24-2x1g24-1x2g48-1x4g96 --overwrite
+```
+:::
+
 Verify that the MIG configuration is successfully applied, by running the following code.
 
 ```bash
@@ -174,39 +188,26 @@ You should see output similar to the following.
 Run the following code to install the RAG Blueprint Helm Chart.
 
 ```bash
-helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
   --username '$oauthtoken' \
   --password "${NGC_API_KEY}" \
   --set imagePullSecret.password=$NGC_API_KEY \
   --set ngcApiSecret.password=$NGC_API_KEY \
-  -f mig-slicing/values-mig.yaml
+  -f mig-slicing/values-mig-h100.yaml
 ```
 
 :::{important}
 **For NVIDIA RTX6000 Pro Deployments:**
 
-If you are deploying on NVIDIA RTX6000 Pro GPUs (instead of H100 GPUs), you need to configure the NIM LLM model profile. The required configuration is already present but commented out in the [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) file.
-
-Uncomment and modify the following section under `nimOperator.nim-llm.model` in [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml):
-```yaml
-model:
-  engine: tensorrt_llm
-  precision: "fp8"
-  qosProfile: "throughput"
-  tensorParallelism: "1"
-  gpus:
-    - product: "rtx6000_blackwell_sv"
-```
+If you are deploying on NVIDIA RTX6000 Pro GPUs (instead of H100 GPUs), use [`values-mig-rtx6000.yaml`](../deploy/helm/mig-slicing/values-mig-rtx6000.yaml) and [`mig-config-rtx6000.yaml`](../deploy/helm/mig-slicing/mig-config-rtx6000.yaml), which include the RTX6000-specific MIG profiles and NIM LLM model configuration.
 
-Then install using the modified values.yaml along with MIG values:
 ```sh
-helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
   --username '$oauthtoken' \
   --password "${NGC_API_KEY}" \
   --set imagePullSecret.password=$NGC_API_KEY \
   --set ngcApiSecret.password=$NGC_API_KEY \
-  -f values.yaml \
-  -f mig-slicing/values-mig.yaml
+  -f mig-slicing/values-mig-rtx6000.yaml
 ```
 :::
 
@@ -235,14 +236,14 @@ You should see output similar to the following.
 Resource                                    Requested   Limit    Allocatable  Free
 nvidia.com/mig-1g.10gb                      (86%) 6.0   (86%) 6.0     7.0        1.0
 ├─ milvus-standalone-...                   1.0     1.0
-├─ nemoretriever-embedding-ms-...          1.0     1.0
+├─ nemotron-embedding-ms-...               1.0     1.0
 ├─ rag-nv-ingest-...                       1.0     1.0
 ├─ nemoretriever-graphic-elements-v1-...   1.0     1.0
 ├─ nemoretriever-page-elements-v3-...      1.0     1.0
 └─ nemoretriever-table-structure-v1-...    1.0     1.0
 
 nvidia.com/mig-1g.20gb                      (100%) 2.0  (100%) 2.0     2.0        0.0
-├─ nemoretriever-ranking-ms-...            1.0     1.0
+├─ nemotron-ranking-ms-...                 1.0     1.0
 └─ <other-workload>                        1.0     1.0
 
 nvidia.com/mig-3g.40gb                      (100%) 1.0  (100%) 1.0     1.0        0.0
@@ -303,7 +304,7 @@ GPU 3: NVIDIA H100 80GB HBM3 (UUID: ...)
 
 * Ensure you have the correct MIG strategy (`mixed`) configured.
 * Verify that `nvidia.com/mig.config.state` is `success` before deploying.
-* Customize `values-mig.yaml` to specify the correct MIG GPU resource requests for each pod.
+* Customize `values-mig-h100.yaml` or `values-mig-rtx6000.yaml` to specify the correct MIG GPU resource requests for each pod.
 
 
 
diff --git a/docs/mount-ingestor-volume.md b/docs/mount-ingestor-volume.md
index a9bbb43c4..ff776e34d 100644
--- a/docs/mount-ingestor-volume.md
+++ b/docs/mount-ingestor-volume.md
@@ -4,7 +4,7 @@
 -->
 # Ingestor Server Volume Mounting for NVIDIA RAG Blueprint
 
-You can mount a host directory to access NV-Ingest extraction results directly from the filesystem when you use the [NVIDIA RAG Blueprint](readme.md). Designed for advanced developers who need programmatic access to raw extraction results for custom processing pipelines or external vector database integration.
+You can mount a host directory to access extraction results from NeMo Retriever Library directly from the filesystem when you use the [NVIDIA RAG Blueprint](readme.md). This feature is designed for advanced developers who need programmatic access to raw extraction results for custom processing pipelines or external vector database integration.
 
 ## Configuration
 
diff --git a/docs/multi-collection-retrieval.md b/docs/multi-collection-retrieval.md
index cd80c337f..07d05be12 100644
--- a/docs/multi-collection-retrieval.md
+++ b/docs/multi-collection-retrieval.md
@@ -38,10 +38,10 @@ The reranker settings are configured in `deploy/compose/docker-compose-rag-serve
 export ENABLE_RERANKER=True
 
 # Set reranker model (default is already configured)
-export APP_RANKING_MODELNAME="nvidia/llama-3.2-nv-rerankqa-1b-v2"
+export APP_RANKING_MODELNAME="nvidia/llama-nemotron-rerank-1b-v2"
 
 # Reranker service URL (default is already configured)
-export APP_RANKING_SERVERURL="nemoretriever-ranking-ms:8000"
+export APP_RANKING_SERVERURL="nemotron-ranking-ms:8000"
 ```
 
 ### For Helm Deployment
@@ -54,7 +54,7 @@ envVars:
   ENABLE_RERANKER: "True"
   
   # Reranker model name (default is already configured)
-  APP_RANKING_MODELNAME: "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+  APP_RANKING_MODELNAME: "nvidia/llama-nemotron-rerank-1b-v2"
   
   # Reranker service URL (default is already configured)
   APP_RANKING_SERVERURL: "nemoretriever-reranking-ms:8000"
diff --git a/docs/nemoretriever-ocr.md b/docs/nemoretriever-ocr.md
index d8ef4f3c5..a76a7c113 100644
--- a/docs/nemoretriever-ocr.md
+++ b/docs/nemoretriever-ocr.md
@@ -11,17 +11,17 @@ This guide explains the OCR (Optical Character Recognition) services available i
 
 The NVIDIA RAG Blueprint supports two OCR services:
 
-1. **NeMo Retriever OCR** (Default) - High-performance OCR service offering 2x+ faster performance
+1. **NeMo Retriever Library OCR** (Default) - High-performance OCR service offering 2x+ faster performance
 2. **Paddle OCR** (Legacy) - General-purpose OCR service maintained for compatibility
 
 :::{tip}
-**NeMo Retriever OCR is now the default OCR service** and is recommended for all new deployments due to its superior performance and efficiency.
+**NeMo Retriever Library OCR is now the default OCR service** and is recommended for all new deployments due to its superior performance and efficiency.
 :::
 
 
-## NeMo Retriever OCR (Default)
+## NeMo Retriever Library OCR (Default)
 
-NeMo Retriever OCR is the default and recommended OCR service for the NVIDIA RAG Blueprint, providing:
+NeMo Retriever Library OCR is the default and recommended OCR service for the NVIDIA RAG Blueprint, providing:
 
 - **2x+ faster performance** compared to Paddle OCR
 - Optimized text extraction from documents and images
@@ -38,7 +38,7 @@ NeMo Retriever OCR is the default and recommended OCR service for the NVIDIA RAG
 
 ### Default Configuration
 
-By default, the NVIDIA RAG Blueprint is configured to use NeMo Retriever OCR with the following settings:
+By default, the NVIDIA RAG Blueprint is configured to use NeMo Retriever Library OCR with the following settings:
 
 | Variable | Default Value | Description |
 |----------|---------------|-------------|
@@ -49,11 +49,11 @@ By default, the NVIDIA RAG Blueprint is configured to use NeMo Retriever OCR wit
 
 ### Hardware Requirements
 
-For detailed hardware requirements and GPU support, refer to the [NeMo Retriever OCR Support Matrix](https://docs.nvidia.com/nim/ingestion/image-ocr/1.2.0/support-matrix.html).
+For detailed hardware requirements and GPU support, refer to the [NeMo Retriever Library OCR Support Matrix](https://docs.nvidia.com/nim/ingestion/image-ocr/1.2.0/support-matrix.html).
 
 ### Docker Configuration
 
-The NeMo Retriever OCR service is configured in the Docker Compose file with the following key settings:
+The NeMo Retriever Library OCR service is configured in the Docker Compose file with the following key settings:
 
 - **Image**: `nvcr.io/nim/nvidia/nemoretriever-ocr-v1:1.2.0`
 - **GPU Memory**: 8192 MB (default)
@@ -72,7 +72,7 @@ export OCR_OMP_NUM_THREADS=8  # Set OpenMP threads
 
 ## Paddle OCR (Legacy)
 
-Paddle OCR is maintained as a legacy option for compatibility with existing workflows. While still functional, it is recommended to migrate to NeMo Retriever OCR for better performance.
+Paddle OCR is maintained as a legacy option for compatibility with existing workflows. While still functional, it is recommended to migrate to NeMo Retriever Library OCR for better performance.
 
 ### When to Use Paddle OCR
 
@@ -83,8 +83,6 @@ Consider using Paddle OCR if you:
 
 ### Hardware Requirements
 
-For detailed hardware requirements, refer to the [Paddle OCR Support Matrix](https://docs.nvidia.com/nim/ingestion/table-extraction/latest/support-matrix.html#supported-hardware).
-
 ### Docker Configuration
 
 The Paddle OCR service configuration:
@@ -94,7 +92,7 @@ The Paddle OCR service configuration:
 - **Ports**: 8009 (HTTP), 8010 (gRPC), 8011 (Metrics)
 
 :::{note}
-**Legacy Service**: Paddle OCR is maintained as a legacy option. For new deployments, we recommend using the default NeMo Retriever OCR service for better performance.
+**Legacy Service**: Paddle OCR is maintained as a legacy option. For new deployments, we recommend using the default NeMo Retriever Library OCR service for better performance.
 :::
 
 
@@ -102,9 +100,9 @@ The Paddle OCR service configuration:
 
 ### Docker Compose Deployment
 
-#### Using NeMo Retriever OCR (Default)
+#### Using NeMo Retriever Library OCR (Default)
 
-NeMo Retriever OCR is deployed by default when you follow the standard deployment guide. No additional configuration is required.
+NeMo Retriever Library OCR is deployed by default when you follow the standard deployment guide. No additional configuration is required.
 
 1. **Prerequisites**: Follow the [deployment guide](deploy-docker-self-hosted.md) for standard setup.
 
@@ -114,7 +112,7 @@ NeMo Retriever OCR is deployed by default when you follow the standard deploymen
    ```
 
    :::{tip}
-   NeMo Retriever OCR is included in the default profile and will start automatically.
+   NeMo Retriever Library OCR is included in the default profile and will start automatically.
    :::
 
 3. **Verify Service Status**:
@@ -136,7 +134,7 @@ If you need to use Paddle OCR instead:
    export OCR_MODEL_NAME=paddle
    ```
 
-3. **Stop NeMo Retriever OCR if running**:
+3. **Stop NeMo Retriever Library OCR if running**:
    ```bash
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml down nemoretriever-ocr
    ```
@@ -146,7 +144,7 @@ If you need to use Paddle OCR instead:
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml --profile paddle up -d
    ```
 
-5. **Restart Ingestor Server and NV-Ingest Runtime**:
+5. **Restart Ingestor Server and NeMo Retriever Library Runtime**:
    ```bash
    docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d
    ```
@@ -156,9 +154,9 @@ If you need to use Paddle OCR instead:
 
 ### NVIDIA-Hosted Deployment
 
-#### Using NeMo Retriever OCR (Default)
+#### Using NeMo Retriever Library OCR (Default)
 
-Follow the standard [NVIDIA-hosted deployment guide](deploy-docker-nvidia-hosted.md) - NeMo Retriever OCR is the default configuration.
+Follow the standard [NVIDIA-hosted deployment guide](deploy-docker-nvidia-hosted.md) - NeMo Retriever Library OCR is the default configuration.
 
 #### Using Paddle OCR with NVIDIA-Hosted Deployment
 
@@ -178,13 +176,13 @@ Follow the standard [NVIDIA-hosted deployment guide](deploy-docker-nvidia-hosted
 
 ### Helm Deployment
 
-#### Using NeMo Retriever OCR (Default)
+#### Using NeMo Retriever Library OCR (Default)
 
-NeMo Retriever OCR is deployed by default with Helm installations. Follow the standard [Helm Deployment Guide](deploy-helm.md) - no additional OCR configuration is required.
+NeMo Retriever Library OCR is deployed by default with Helm installations. Follow the standard [Helm Deployment Guide](deploy-helm.md) - no additional OCR configuration is required.
 
 #### Using Paddle OCR with Helm
 
-To use Paddle OCR instead of the default NeMo Retriever OCR:
+To use Paddle OCR instead of the default NeMo Retriever Library OCR:
 
 Modify [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) to override the OCR service image:
 
@@ -216,7 +214,7 @@ For detailed Helm deployment instructions, see [Helm Deployment Guide](deploy-he
 
 ### Environment Variables
 
-| Variable | Description | NeMo Retriever Default | Paddle Default | Required |
+| Variable | Description | NeMo Retriever Library Default | Paddle Default | Required |
 |----------|-------------|------------------------|----------------|----------|
 | `OCR_GRPC_ENDPOINT` | gRPC endpoint for OCR service | `nemoretriever-ocr:8001` | `paddle:8001` | Yes (on-premises) |
 | `OCR_HTTP_ENDPOINT` | HTTP endpoint for OCR service | `http://nemoretriever-ocr:8000/v1/infer` | `http://paddle:8000/v1/infer` | Yes |
@@ -240,16 +238,16 @@ Replace `workstation_ip` with the actual IP address of the machine running the O
 
 ## Switching Between OCR Services
 
-### Migrating from Paddle OCR to NeMo Retriever OCR
+### Migrating from Paddle OCR to NeMo Retriever Library OCR
 
-To switch to the default NeMo Retriever OCR service:
+To switch to the default NeMo Retriever Library OCR service:
 
 1. **Stop Paddle OCR**:
    ```bash
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml down paddle
    ```
 
-2. **Configure NeMo Retriever OCR environment variables**:
+2. **Configure NeMo Retriever Library OCR environment variables**:
    ```bash
    export OCR_GRPC_ENDPOINT=nemoretriever-ocr:8001
    export OCR_HTTP_ENDPOINT=http://nemoretriever-ocr:8000/v1/infer
@@ -257,7 +255,7 @@ To switch to the default NeMo Retriever OCR service:
    export OCR_MODEL_NAME=scene_text_ensemble
    ```
 
-3. **Start NeMo Retriever OCR**:
+3. **Start NeMo Retriever Library OCR**:
    ```bash
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml up -d nemoretriever-ocr
    ```
@@ -267,14 +265,14 @@ To switch to the default NeMo Retriever OCR service:
    docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d
    ```
 
-### Migrating from NeMo Retriever OCR to Paddle OCR
+### Migrating from NeMo Retriever Library OCR to Paddle OCR
 
 Follow the steps in [Switching to Paddle OCR](#switching-to-paddle-ocr) above.
 
 
 ## Performance Comparison
 
-| Feature | NeMo Retriever OCR | Paddle OCR |
+| Feature | NeMo Retriever Library OCR | Paddle OCR |
 |---------|-------------------|------------|
 | **Performance** | 2x+ faster | Baseline |
 | **GPU Memory** | 8 GB (default) | 3 GB (default) |
@@ -299,13 +297,13 @@ Follow the steps in [Switching to Paddle OCR](#switching-to-paddle-ocr) above.
 
 3. **Performance Issues**
    - Consider increasing `OCR_CUDA_MEMORY_POOL_MB`
-   - Adjust `OCR_BATCH_SIZE` for NeMo Retriever OCR
+   - Adjust `OCR_BATCH_SIZE` for NeMo Retriever Library OCR
    - Verify GPU has sufficient memory
 
 ### Getting Logs
 
 ```bash
-# NeMo Retriever OCR logs
+# NeMo Retriever Library OCR logs
 docker logs nemoretriever-ocr
 
 # Paddle OCR logs
diff --git a/docs/nemotron-parse-extraction.md b/docs/nemotron-parse-extraction.md
index a23dca7f4..0e2fc0b11 100644
--- a/docs/nemotron-parse-extraction.md
+++ b/docs/nemotron-parse-extraction.md
@@ -62,7 +62,7 @@ When using NVIDIA hosted endpoints, you may encounter rate limiting with larger
 
 ## Using Helm
 
-To enable PDF extraction with Nemotron Parse using Helm, you need to enable the Nemotron Parse service and configure the ingestor-server to use it.
+To enable PDF extraction with Nemotron Parse using Helm, enable the Nemotron Parse service and configure the ingestor-server to use it.
 
 ### Prerequisites
 - Ensure you have sufficient GPU resources. Nemotron Parse requires a dedicated GPU.
@@ -71,7 +71,7 @@ To enable PDF extraction with Nemotron Parse using Helm, you need to enable the
 
 To deploy with Nemotron Parse enabled:
 
-Modify [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) to enable Nemotron Parse:
+Modify [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) to enable Nemotron Parse and configure the ingestor-server:
 
 ```yaml
 # Enable Nemotron Parse NIM
@@ -93,9 +93,136 @@ For detailed HELM deployment instructions, see [Helm Deployment Guide](deploy-he
 :::{note}
 **Key Configuration Changes:**
 - `nv-ingest.nimOperator.nemotron_parse.enabled=true` - Enables Nemotron Parse NIM
-- `ingestor-server.envVars.APP_NVINGEST_PDFEXTRACTMETHOD="nemotron_parse"` - Configures ingestor to use Nemotron Parse
+- `ingestor-server.envVars.APP_NVINGEST_PDFEXTRACTMETHOD="nemotron_parse"` - Configures ingestor to use Nemotron Parse for PDF extraction
 :::
 
+## Experimental: Nemotron-parse-only extraction
+
+:::{note}
+The steps in this section describe a nemotron-parse-only pipeline. For production use, the default pipeline (Nemotron Parse with page-elements and table-structure NIMs) is recommended for better accuracy.
+:::
+
+The **default** Nemotron Parse pipeline uses the **page-elements** and **table-structure** NIMs together with the Nemotron Parse NIM in the extraction pipeline. This combination provides better accuracy for PDF and table extraction. 
+To **experiment** with a nemotron-parse-only extraction pipeline (using only the Nemotron Parse NIM, without OCR, page-elements, graphic-elements, or table-structure NIMs), use the following steps.
+
+### Key configuration
+
+- **PDF extraction method** — Set `APP_NVINGEST_PDFEXTRACTMETHOD` to `nemotron_parse` so the ingestor uses Nemotron Parse for PDF text extraction.
+- **Table extraction method** — Set `APP_NVINGEST_EXTRACTTABLESMETHOD` to `nemotron_parse` so the ingestor uses Nemotron Parse for table extraction instead of the default YOLOX-based table NIMs. This is required for a nemotron-parse-only pipeline.
+- **nv-ingest health check** — Set `COMPONENTS_TO_READY_CHECK` to an empty string (`""`) in the **nv-ingest** service environment. By default, nv-ingest readiness waits for other ingest NIMs. With only Nemotron Parse running, the readiness probe would otherwise never pass. Emptying this value allows nv-ingest to become ready when only Nemotron Parse is available.
+
+### Using Docker Compose (nemotron-parse-only)
+
+#### On-prem models
+
+1. **Prerequisites**: Follow the [deployment guide](deploy-docker-self-hosted.md) up to and including the step labelled "Start all required NIMs."
+
+2. Start only the Nemotron Parse service (and any other non-ingest services your setup needs):
+   ```bash
+   USERID=$(id -u) docker compose --profile rag --profile nemotron-parse -f deploy/compose/nims.yaml up -d
+   ```
+   You can skip the OCR, page-elements, graphic-elements, and table-structure NIMs if you want a nemotron-parse-only pipeline.
+
+3. Configure the ingestor-server and nv-ingest for nemotron-parse-only. Set these environment variables:
+
+   **Ingestor-server** (ingestor-server environment):
+   ```bash
+   export APP_NVINGEST_PDFEXTRACTMETHOD=nemotron_parse
+   export APP_NVINGEST_EXTRACTTABLESMETHOD=nemotron_parse
+   ```
+
+   **nv-ingest** (nv-ingest service environment, e.g. in the compose file where nv-ingest runs):
+   ```bash
+   export COMPONENTS_TO_READY_CHECK=""
+   ```
+   This ensures the nv-ingest readiness probe passes when other ingest NIMs are not running.
+
+4. Deploy the ingestor-server and rag-server containers following the remaining steps in the deployment guide.
+
+5. Ingest PDFs using the [ingestion API usage notebook](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/ingestion_api_usage.ipynb).
+
+#### NVIDIA hosted API endpoints
+
+1. **Prerequisites**: Follow the [deployment guide](deploy-docker-nvidia-hosted.md) up to and including the step labelled "Start the vector db containers from the repo root."
+
+2. Export variables for the Nemotron Parse API:
+   ```bash
+   export NEMOTRON_PARSE_HTTP_ENDPOINT=https://integrate.api.nvidia.com/v1/chat/completions
+   export NEMOTRON_PARSE_MODEL_NAME=nvidia/nemotron-parse
+   export NEMOTRON_PARSE_INFER_PROTOCOL=http
+   ```
+
+3. Configure the ingestor-server and nv-ingest for nemotron-parse-only:
+
+   **Ingestor-server**:
+   ```bash
+   export APP_NVINGEST_PDFEXTRACTMETHOD=nemotron_parse
+   export APP_NVINGEST_EXTRACTTABLESMETHOD=nemotron_parse
+   ```
+
+   **nv-ingest** (so readiness passes without other NIMs):
+   ```bash
+   export COMPONENTS_TO_READY_CHECK=""
+   ```
+
+4. Deploy the ingestor-server and rag-server containers following the remaining steps in the deployment guide.
+
+5. Ingest PDFs using the [ingestion API usage notebook](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/ingestion_api_usage.ipynb).
+
+:::{note}
+When using NVIDIA hosted endpoints, you may encounter rate limiting with larger file ingestions (>10 files).
+:::
+
+### Using Helm (nemotron-parse-only)
+
+To run only Nemotron Parse for PDF and table extraction with Helm:
+
+1. **Prerequisites**: Ensure you have sufficient GPU resources. Nemotron Parse requires a dedicated GPU.
+
+2. Edit [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml):
+
+   - **Enable Nemotron Parse** and **disable the other ingest NIMs** under `nv-ingest.nimOperator`:
+
+   ```yaml
+   nv-ingest:
+     nimOperator:
+       nemotron_parse:
+         enabled: true
+       nemoretriever_ocr_v1:
+         enabled: false
+       graphic_elements:
+         enabled: false
+       page_elements:
+         enabled: false
+       table_structure:
+         enabled: false
+     envVars:
+       COMPONENTS_TO_READY_CHECK: ""
+   ```
+
+   - **Configure the ingestor-server** to use Nemotron Parse for both PDF and table extraction:
+
+   ```yaml
+   ingestor-server:
+     envVars:
+       APP_NVINGEST_PDFEXTRACTMETHOD: "nemotron_parse"
+       APP_NVINGEST_EXTRACTTABLESMETHOD: "nemotron_parse"
+   ```
+
+3. Apply the changes as described in [Change a Deployment](deploy-helm.md#change-a-deployment).
+
+4. For full Helm deployment steps, see the [Helm Deployment Guide](deploy-helm.md).
+
+**Summary of nemotron-parse-only Helm settings:**
+
+| Setting | Purpose |
+|---------|---------|
+| `nv-ingest.nimOperator.nemotron_parse.enabled: true` | Enable the Nemotron Parse NIM. |
+| `nv-ingest.nimOperator.<other_nims>.enabled: false` | Disable OCR, page-elements, graphic-elements, and table-structure NIMs. |
+| `nv-ingest.envVars.COMPONENTS_TO_READY_CHECK: ""` | nv-ingest health check: readiness passes without other NIMs. |
+| `ingestor-server.envVars.APP_NVINGEST_PDFEXTRACTMETHOD: "nemotron_parse"` | Use Nemotron Parse for PDF extraction. |
+| `ingestor-server.envVars.APP_NVINGEST_EXTRACTTABLESMETHOD: "nemotron_parse"` | Use Nemotron Parse for table extraction. |
+
 ## Limitations and Requirements
 
 When using Nemotron Parse for PDF extraction, consider the following:
@@ -105,7 +232,7 @@ When using Nemotron Parse for PDF extraction, consider the following:
 - The extraction quality may vary depending on the PDF structure and content.
 - Nemotron Parse is not supported on NVIDIA B200 GPUs or RTX Pro 6000 GPUs.
 
-For detailed information about hardware requirements and supported GPUs for all NeMo Retriever extraction NIMs, refer to the [Nemotron Parse Support Matrix](https://docs.nvidia.com/nim/vision-language-models/latest/support-matrix.html#nemotron-parse).
+For detailed information about hardware requirements and supported GPUs for extraction NIMs used by NeMo Retriever Library, refer to the [Nemotron Parse Support Matrix](https://docs.nvidia.com/nim/vision-language-models/latest/support-matrix.html#nemotron-parse).
 
 ## Available PDF Extraction Methods
 
@@ -115,6 +242,8 @@ The `APP_NVINGEST_PDFEXTRACTMETHOD` environment variable supports the following
 - `pdfium`: Uses the default PDFium-based extraction
 - `None`: Uses the default extraction method
 
+**Table extraction method:** The `APP_NVINGEST_EXTRACTTABLESMETHOD` environment variable controls how tables are extracted. Set it to `nemotron_parse` to use Nemotron Parse for table extraction (recommended for a nemotron-parse-only pipeline). The default is `yolox`, which uses the YOLOX-based table NIMs.
+
 :::{note}
 The Nemotron Parse service requires GPU resources and must run on a dedicated GPU. Make sure you have sufficient GPU resources available before enabling this feature.
 :::
diff --git a/docs/nemotron3-super-deployment.md b/docs/nemotron3-super-deployment.md
new file mode 100644
index 000000000..8b4295945
--- /dev/null
+++ b/docs/nemotron3-super-deployment.md
@@ -0,0 +1,180 @@
+# Using Nemotron-3-Super-120B-A12B LLM NIM
+
+[Nemotron-3-Super-120B-A12B](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b/modelcard) is a large language model (LLM) trained by NVIDIA, designed to deliver strong agentic, reasoning, and conversational capabilities. It is optimized for collaborative agents and high-volume workloads such as IT ticket automation. This LLM can considerably improve the accuracy of the RAG pipeline, especially with reasoning enabled. ([Model card](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b/modelcard))
+
+We recommend using the model in low-effort reasoning mode with a reasoning budget of 256 to balance accuracy and performance. You can switch to non-reasoning mode for maximum performance or use full reasoning mode for best accuracy.
+
+## Hardware requirements
+
+For Docker and Kubernetes deployment, see the following:
+
+- **Docker (local NIM):** [Hardware Requirements (Docker)](support-matrix.md#hardware-requirements-docker)
+- **Kubernetes (Helm):** [Hardware Requirements (Kubernetes)](support-matrix.md#hardware-requirements-kubernetes)
+
+For [self-hosted local NIM](deploy-docker-self-hosted.md) deployment with `nemotron-3-super-120b-a12b`, you need one of the following:
+
+- 3 x H100
+- 3 x B200
+- 3 x RTX PRO 6000
+
+### Hardware Requirements (Kubernetes)
+
+To deploy with [Helm](deploy-helm.md) using `nemotron-3-super-120b-a12b`, you need one of the following:
+
+- 9 x H100-80GB
+- 9 x B200
+- 9 x RTX PRO 6000
+
+---
+
+## Start services using NVIDIA-hosted models
+
+No local GPU needed for the LLM. The file `deploy/compose/nemotron3-super-cloud.env` sets all NVIDIA-hosted (cloud) endpoints and the `nemotron-3-super-120b-a12b` model.
+
+1. [Set your API key](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/api-key.md) and prompt config, then source the env files:
+
+```bash
+export NGC_API_KEY=<ngc-api-key>
+source deploy/compose/.env
+source deploy/compose/nemotron3-super-cloud.env
+export PROMPT_CONFIG_FILE=$(pwd)/deploy/compose/nemotron3-super-prompt.yaml
+```
+
+2. Follow [Start services using NVIDIA-hosted models](deploy-docker-nvidia-hosted.md#start-services-using-nvidia-hosted-models) to start the vectorstore, rag-server, and ingestor-server.
+
+---
+
+## Start services using self-hosted on-premises models
+
+1. Update `nims.yaml`
+
+   Edit `deploy/compose/nims.yaml` and change the `nim-llm` service image and GPU allocation:
+
+   ```yaml
+   nim-llm:
+     image: nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b:1.8.0
+     ...
+     user: "0"
+     environment:
+       NGC_API_KEY: ${NGC_API_KEY}
+       NIM_MAX_MODEL_LEN: "32768"  # required for TP2 profile
+       NIM_KVCACHE_PERCENT: "0.9"
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               device_ids: ['1','2']  # 2 GPUs for FP8 TP2
+               capabilities: [gpu]
+   ```
+
+   > Note: To deploy TP2 profiles, you must limit `NIM_MAX_MODEL_LEN` to `32768`.
+
+   To confirm that a TP2 profile is available for your hardware, run:
+
+   ```bash
+   docker run -ti --rm --gpus all nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b:1.8.0 list-model-profiles
+   ```
+
+   Check the [model page](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b/modelcard) for more details.
+
+   > Note: For RTX 6000 Pro GPUs, additional NIM environment variables are required — see [RTX 6000 Pro](#rtx-6000-pro) below.
+
+2. Set nemotron-3-super specific environment variables.
+
+   Ensure the section **`Endpoints for using cloud NIMs`** in `deploy/compose/.env` is **commented out** (so on-prem endpoints are used).
+
+   ```bash
+   source deploy/compose/.env
+   source deploy/compose/nemotron3-super.env
+   export PROMPT_CONFIG_FILE=$(pwd)/deploy/compose/nemotron3-super-prompt.yaml
+   export LLM_MAX_TOKENS=16256
+   ```
+
+   Follow [Start services using self-hosted on-premises models](deploy-docker-self-hosted.md#start-services-using-self-hosted-on-premises-models) to start the vectorstore, rag-server, NIMs, and ingestor-server.
+
+### RTX 6000 Pro
+
+> Note: To deploy TP2 profiles on RTX PRO 6000 Blackwell Server Edition, complete the following steps. You can skip these steps if you are using a TP4 or TP8 profile.
+
+1. Edit `/etc/default/grub` and set:
+
+   ```text
+   GRUB_CMDLINE_LINUX_DEFAULT="quiet splash iommu=pt"
+   ```
+
+2. Run:
+
+   ```bash
+   sudo update-grub2
+   sudo reboot
+   ```
+
+3. In `nims.yaml`, add under the `nim-llm` `environment:` block:
+
+   ```yaml
+   environment:
+     # In addition to the variables already set when you updated nims.yaml
+     NCCL_P2P_DISABLE: "1"
+   ```
+
+---
+
+## Helm deployment (`nemotron-3-super-120b-a12b`)
+
+From the repository root, run:
+
+```bash
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
+  --username '$oauthtoken' \
+  --password "${NGC_API_KEY}" \
+  --set imagePullSecret.password=$NGC_API_KEY \
+  --set ngcApiSecret.password=$NGC_API_KEY \
+  -f deploy/helm/nvidia-blueprint-rag/values.yaml \
+  -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml
+```
+
+The prompt file `deploy/compose/nemotron3-super-prompt.yaml` is tuned for `nemotron-3-super-120b-a12b`. To customize it, see [Prompt customization in Helm chart](prompt-customization.md#prompt-customization-in-helm-chart).
+
+**RTX 6000 Pro**
+
+> Note: To deploy TP2 profiles on RTX PRO 6000 Blackwell Server Edition, complete the following steps. You can skip these steps if you are using a TP4 or TP8 profile.
+
+1. Edit `/etc/default/grub` and set:
+
+   ```text
+   GRUB_CMDLINE_LINUX_DEFAULT="quiet splash iommu=pt"
+   ```
+
+2. Run:
+
+   ```bash
+   sudo update-grub2
+   sudo reboot
+   ```
+
+3. From the repository root, run:
+
+   ```bash
+   helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
+     --username '$oauthtoken' \
+     --password "${NGC_API_KEY}" \
+     --set imagePullSecret.password=$NGC_API_KEY \
+     --set ngcApiSecret.password=$NGC_API_KEY \
+     -f deploy/helm/nvidia-blueprint-rag/values.yaml \
+     -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml \
+     -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
+   ```
+
+---
+
+## Reasoning and non-reasoning mode
+
+To disable reasoning mode, set the following environment variables:
+
+```bash
+export LLM_ENABLE_THINKING=false
+export LLM_REASONING_BUDGET=0
+```
+
+For other options (e.g. full reasoning budget), see [Enable reasoning for Nemotron 3 models](enable-nemotron-thinking.md).
diff --git a/docs/notebooks.md b/docs/notebooks.md
index a88952f79..35b2b176b 100644
--- a/docs/notebooks.md
+++ b/docs/notebooks.md
@@ -101,7 +101,9 @@ Use the following notebooks to learn comprehensive Python client usage, metadata
 
 - [rag_library_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_usage.ipynb) – Demonstrates native usage of the NVIDIA RAG Python client, including environment setup, document ingestion, collection management, and querying. This notebook provides end-to-end API usage examples for interacting directly with the RAG system from Python, covering both ingestion and retrieval workflows.
 
-- [rag_library_lite_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_lite_usage.ipynb) – Demonstrates containerless deployment of the NVIDIA RAG Python package in lite mode. Uses Milvus Lite (embedded vector database) and NV-Ingest subprocess mode for a simplified setup without Docker containers. Leverages NVIDIA cloud APIs for embeddings, ranking, and LLM inference. **Note**: This mode does not support image/table/chart citations or document summarization.
+- [rag_library_lite_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_lite_usage.ipynb) – Demonstrates containerless deployment of the NVIDIA RAG Python package in lite mode. Uses Milvus Lite (embedded vector database) and NeMo Retriever Library subprocess mode for a simplified setup without Docker containers. Leverages NVIDIA cloud APIs for embeddings, ranking, and LLM inference. **Note**: This mode does not support image/table/chart citations or document summarization.
+
+- [langchain_nvidia_retriever.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/langchain_nvidia_retriever.ipynb) – Showcases **LangChain integration** with the NVIDIA RAG Blueprint. Run [ingestion_api_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/ingestion_api_usage.ipynb) first to ingest documents, then use `NVIDIARAGRetriever` for retrieval (sync/async), custom parameters, error handling, and optional RAG chaining with `ChatNVIDIA`.
 
 
 
@@ -122,55 +124,7 @@ Use the following notebooks to learn how to how to extend the system with custom
 
 Use the following notebook for cloud deployment scenarios.
 
-- [launchable.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/launchable.ipynb) – A deployment-ready notebook intended to run in a [Brev environment](https://console.brev.dev/environment/new). To learn more about Brev, refer to [Brev](https://docs.nvidia.com/brev/latest/about-brev.html). Follow the instructions for running Jupyter notebooks in a cloud-based environment based on the hardware requirements specified in the launchable.
-
-
-
-## Set Up the Notebook Environment
-
-To run a notebook, use the following procedure with [uv](https://docs.astral.sh/uv/) - a fast Python package manager.
-
-> **Note**: Python version **3.11 or higher** is required.
-
-1. Install uv (if not already installed):
-
-    ```bash
-    curl -LsSf https://astral.sh/uv/0.8.12/install.sh | sh
-    ```
-
-2. Create and activate a virtual environment:
-
-    ```bash
-    uv venv --python=python3.12
-    source .venv/bin/activate
-    ```
-
-3. Install JupyterLab:
-
-    ```bash
-    uv pip install jupyterlab
-    ```
-
-4. Start JupyterLab:
-
-    ```bash
-    jupyter lab --allow-root --ip=0.0.0.0 --NotebookApp.token='' --port=8889 --no-browser
-    ```
-
-### Set-up Notes
-- Ensure that API keys and credentials are correctly set up before you run a notebook.
-- Modify endpoints or request parameters as necessary to match your specific use case.
-- For the custom VDB operator notebook, ensure that Docker is available for running OpenSearch services.
-
-
-
-## Run a Notebook
-
-After you set up your notebook environment, to run a notebook, use the following procedure.
-
-1. Access JupyterLab by opening a browser and navigating to `http://<your-server-ip>:8889`.
-2. Navigate to the notebook and run the cells sequentially.
-
+- [launchable.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/launchable.ipynb) – A deployment-ready notebook intended to run in a [Brev environment](https://console.brev.dev/environment/new). To learn more about Brev, refer to [Brev](https://developer.nvidia.com/brev). Follow the instructions for running Jupyter notebooks in a cloud-based environment based on the hardware requirements specified in the launchable.
 
 
 ## Related Topics
diff --git a/docs/nv-ingest-standalone.md b/docs/nv-ingest-standalone.md
index f09a37970..14319ad94 100644
--- a/docs/nv-ingest-standalone.md
+++ b/docs/nv-ingest-standalone.md
@@ -3,19 +3,19 @@
   SPDX-License-Identifier: Apache-2.0
 -->
 
-# Deploy NV-Ingest Standalone for NVIDIA RAG Blueprint
+# Deploy NeMo Retriever Library Standalone for NVIDIA RAG Blueprint
 
-This guide explains how to deploy and use NV-Ingest as a standalone service for [NVIDIA RAG Blueprint](readme.md) without deploying the full ingestor server. This is useful when you want to ingest documents directly using Python scripts.
+This guide explains how to deploy and use NeMo Retriever Library as a standalone service for [NVIDIA RAG Blueprint](readme.md) without deploying the full ingestor server. This is useful when you want to ingest documents directly using Python scripts.
 
 For more details and advanced usage, refer to:
-- [NVIDIA/nv-ingest repository](https://github.com/NVIDIA/nv-ingest)
-- [Official NV-Ingest Quickstart Guide](https://github.com/NVIDIA/nv-ingest/blob/main/docs/docs/extraction/quickstart-guide.md)
+- [NVIDIA/NeMo-Retriever repository](https://github.com/NVIDIA/NeMo-Retriever)
+- [Official NeMo Retriever Library Quickstart Guide](https://docs.nvidia.com/nemo/retriever/)
 
 ## Limitations
 
-When using NV-Ingest in standalone mode, consider the following limitations:
+When using NeMo Retriever Library in standalone mode, consider the following limitations:
 
-1. **Citations Disabled**: The RAG server's citation feature will be disabled for documents ingested through standalone NV-Ingest. This is because the citation metadata requires additional processing that is handled by the full ingestor server.
+1. **Citations Disabled**: The RAG server's citation feature will be disabled for documents ingested through standalone NeMo Retriever Library. This is because the citation metadata requires additional processing that is handled by the full ingestor server.
 
 2. **No Web UI**: The standalone deployment does not include the web-based upload interface. All document ingestion must be done through Python scripts.
 
@@ -92,7 +92,7 @@ COLLECTION_NAME = "multimodal_data_nvingest"
 MILVUS_URI = "http://localhost:19530"
 MINIO_ENDPOINT = "localhost:9010"
 
-# Server Mode (Create NV-Ingest client)
+# Server Mode (Create NeMo Retriever Library client)
 client = NvIngestClient(
     message_client_hostname="localhost",
     message_client_port=7670
@@ -118,10 +118,10 @@ ingestor = ingestor.split(
             )
 
 ingestor = ingestor.embed(
-    # For self-hosted: "http://nemoretriever-embedding-ms:8000/v1"
+    # For self-hosted: "http://nemotron-embedding-ms:8000/v1"
     # For cloud (NVIDIA-hosted): "https://integrate.api.nvidia.com/v1"
-    endpoint_url="http://nemoretriever-embedding-ms:8000/v1",
-    model_name="nvidia/llama-3.2-nv-embedqa-1b-v2"
+    endpoint_url="http://nemotron-embedding-ms:8000/v1",
+    model_name="nvidia/llama-nemotron-embed-1b-v2"
 )
 
 ingestor = ingestor.vdb_upload(
diff --git a/docs/observability.md b/docs/observability.md
index 0c4bb2665..587c6a70e 100644
--- a/docs/observability.md
+++ b/docs/observability.md
@@ -45,13 +45,13 @@ Use the following procedure to enable observability with Docker.
 
 After tracing is enabled and the system is running, you can **view the traces** in **Zipkin** by opening:
 
-<p align="center">
-<img src="assets/zipkin_ui.png" width="750">
-</p>
+```{image} assets/zipkin_ui.png
+:width: 750px
+:align: center
+```
 
 Open the Zipkin UI at: **http://localhost:9411**
 
-
 ## View Metrics in Grafana
 
 As part of the tracing, the RAG service also exports metrics like API request counts, LLM prompt and completion token count and words per chunk.
@@ -104,11 +104,10 @@ After tracing is enabled and running, you can view inputs and outputs of differe
 
 3. Similarly, you can view inputs and outputs for sub stages within the workflows by clicking on a substage and finding the `traceloop.entity.input` and `traceloop.entity.ouput` rows.
 
-  <p align="center">
-  <img src="assets/zipkin_ui_labelled.png" width="750">
-  </p>
-
-
+```{image} assets/zipkin_ui_labelled.png
+:width: 750px
+:align: center
+```
 
 ## Enable Observability with Helm
 
diff --git a/docs/perf-benchmarks.md b/docs/perf-benchmarks.md
new file mode 100644
index 000000000..7029b68bc
--- /dev/null
+++ b/docs/perf-benchmarks.md
@@ -0,0 +1,232 @@
+<!--
+  SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+  SPDX-License-Identifier: Apache-2.0
+-->
+
+# RAG Performance Measurement Methodology
+
+[GenAI Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf), NVIDIA’s open-source benchmarking tool, evaluates end-to-end RAG pipeline performance under realistic load conditions. By testing across varying levels of concurrency, it offers a consistent and reproducible way to compare different RAG deployment configurations.
+
+## Key Terms
+
+| Term | Description |
+|------|-------------|
+| **Total Requests** | The total number of questions issued to the RAG server in a single benchmark run. Controls the size of the workload and is kept constant across configurations for fair comparison. |
+| **Concurrency** | The number of simultaneously active worker threads sending requests to the server. A higher concurrency simulates a heavier multi-user load. |
+| **N_Times** | The number of measured benchmark iterations performed after warm-up. Multiple iterations improve statistical stability of the reported metrics. In all experiments reported in this document, `N_Times` is set to 3. |
+| **Input Sequence Length (ISL)** | The number of tokens in the input prompt sent to the RAG server. |
+| **Output Sequence Length (OSL)** | The number of tokens in the RAG server's generated response. |
+| **TTFT (Time to First Token)** | The elapsed time from when a request is submitted until the first output token is returned. A key indicator of perceived responsiveness in streaming deployments. |
+| **Inter-Token Latency (ITL)** | Defined as (E2E Latency - TTFT) / (OSL - 1), where OSL is the number of output tokens generated per request, averaged across all requests in the benchmark run. |
+| **KV Cache** | A memory buffer on the GPU (HBM) storing the key-value attention states computed for all tokens in an active request. KV cache size grows with sequence length, number of model layers, and hidden dimension. When aggregate KV cache across concurrent requests saturates HBM, new requests queue rather than execute, driving up TTFT. |
+| **HBM (High Bandwidth Memory)** | The GPU's on-package memory (e.g., 80 GB on H100). |
+| **Prefill** | The stage in which the model processes the full input prompt simultaneously to construct the KV cache. |
+| **Decode** | The autoregressive phase where output tokens are generated one at a time. |
+| **Batch / Effective Batch Size** | The set of requests processed simultaneously in a single GPU forward pass during the decode phase. At each decode step, the GPU computes attention over the accumulated KV caches of all requests in the active batch and generates one new token per request. A larger batch means more requests share the same forward pass, increasing per-token contention for GPU memory bandwidth and raising ITL. The effective batch size at any moment is constrained by available HBM: once the aggregate KV cache of active requests saturates HBM, additional requests queue rather than enter the batch. Configurations with a smaller per-request KV cache footprint (smaller model, shorter context) sustain a larger effective batch size under the same HBM budget. |
+| **Reasoning Chain / Chain-of-Thought** | An extended internal monologue generated by the model before producing its final answer, used to decompose complex questions into intermediate reasoning steps. For Llama-3.3-Nemotron-Super-49B, reasoning is activated by setting the system prompt to "detailed thinking on" and suppressed by "detailed thinking off" — no model weight change occurs between modes. When active, reasoning tokens are generated autoregressively in the same decode phase as the final answer and occupy KV cache slots for the full duration of the request, increasing TTFT and ITL relative to reasoning-off. |
+
+## Benchmarking Modes
+
+The benchmark supports two distinct modes depending on the evaluation objective.
+
+### Mode 1 — Synthetic Sequence-Length Benchmarking
+
+In this mode, the benchmarking workload is defined by a target input sequence length and a target output sequence length rather than by real questions. Synthetic queries are programmatically generated to match the specified token lengths, enabling precise control over the load profile and allowing users to isolate the performance impact of sequence length independently of question content. To support retrieval in this mode, a Wikipedia dataset of 50,000 records is pre-ingested into the Vector Database, providing a sufficiently large and diverse document corpus for the retrieval stage to operate under realistic conditions.
+
+### Mode 2 — Dataset-Driven Benchmarking
+
+A curated set of domain-specific questions serves as the request pool. To prevent unbounded generation from obscuring true system throughput, the maximum output token length is capped at 32,000 tokens, ensuring responses remain within a well-defined generation budget and that results are directly comparable across runs.
+
+| Dataset | Number of Questions | Source Documents | Size | QA Characteristics |
+|---------|--------------------:|------------------|------|-------------------|
+| [RagBattlePacket](https://www.eyelevel.ai/post/most-accurate-rag) | 92 | Deloitte public tax PDFs | 1,146 pages | 92 questions across text, tabular, and graphical categories; visually dense corpus with rich tables and figures requiring cross-modal understanding |
+| [KG-RAG](https://github.com/docugami/KG-RAG-datasets/tree/main/sec-10-q/data/v1) | 195 | SEC 10-Q PDFs + KG triples | 1,037 pages | Entity-centric factual QA over structured financial filings; questions target specific named entities and numerical facts with minimal visual content |
+| [HotPotQA](https://huggingface.co/datasets/hotpotqa/hotpot_qa) | 979 | Wikipedia paragraphs | ~113K QA pairs; 2,673 source documents | Multi-hop reasoning requiring the model to chain facts across multiple documents (bridge and comparison); plain text only with no tables or figures |
+| [BO767](https://digitalcorpora.org/) | 487 | 767 PDFs | 54,730 pages | Varied, heterogeneous content across a large-scale mixed corpus of forensic and operational documents; high proportion of image and structured content alongside text |
+
+## How It Works
+The following sections describe how benchmarking works.
+### Request Pool Construction
+
+Depending on the selected mode, the request pool is constructed differently:
+
+- **Synthetic mode:** Synthetic prompts are generated with ISL and OSL set to 128.
+- **Dataset mode:** Questions are drawn sequentially from the curated benchmark dataset in a round-robin fashion. Once all questions in the dataset have been issued, the cycle restarts from the beginning, continuing until the total number of requests defined by `total_requests` is reached. This ensures uniform dataset coverage regardless of the configured workload size.
+
+In both modes, the `total_requests` parameter guarantees that every configuration is evaluated against an identical, fixed-size workload, enabling fair and reproducible comparisons across deployment variants.
+
+### Concurrency Sweep
+
+GenAI Perf spawns *N* worker threads, where *N* is driven by a concurrency parameter. The blueprint sweeps across the following concurrency levels: **1, 10, 25, 50, 75, 100, 125** — allowing users to observe how the system scales and identify the point at which latency or throughput begins to degrade.
+
+### Sequential Request Dispatch
+
+Each thread draws questions from the pool one at a time in sequence, taking the next request only after a response to the current one has been received. This models realistic per-user session behavior and avoids artificially inflating throughput through intra-thread batching.
+
+### Warm-Up and Measured Runs
+
+Before recording any metrics, GenAI Perf executes an initial warm-up run to bring the RAG servers to a steady operational state, eliminating cold-start artifacts. The benchmark is then repeated for a configurable number of iterations (`N_Times`), ensuring that the collected statistics are stable and reproducible.
+
+### Performance Result Collection
+
+For every request across all threads and iterations, GenAI Perf records timing and outcome data. These are aggregated into a performance result store, from which key metrics — including Time-to-First-Token and Inter-Token Latency — are computed and reported per concurrency level.
+
+## Configuration and Performance Results
+The following sections provide further information about configuration and performance results.
+### Configuration and Setup
+
+The following deployment configurations are evaluated:
+
+- **LLM-49B** — [Llama-3.3-Nemotron-Super-49B](https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1_5/modelcard): A key feature is the reasoning toggle — setting the system prompt to "detailed thinking on" causes the model to generate an internal chain-of-thought before the final answer; "detailed thinking off" produces a direct response. This is a system-prompt switch — no weight change occurs between modes.
+- **VLM nano** — [Nemotron Nano VL](https://build.nvidia.com/nvidia/nemotron-nano-12b-v2-vl/modelcard): Its smaller parameter count means significantly lower KV cache memory consumption per request compared to the 49B model.
+- **Ingestion setup:**
+  - Default ingestion: The default ingestion that RAG 2.4.0 uses.
+  - Default ingestion with VLM image captioning enabled: Details in [Image Captioning Support](image_captioning.md). VLM is enabled during ingestion to extract image and structured content from documents, and also enabled at query time to process retrieved image and structured chunks via the multimodal pipeline.
+
+| # | LLM Model | Embedding Model | Reasoning On/Off | Ingestion Method |
+|---|-----------|----------------|-------------------|-----------------|
+| 1 | LLM-49B | Default Embedding | On | Default Ingestion |
+| 2 | LLM-49B | Default Embedding | Off | Default Ingestion |
+| 3 | VLM nano | Default Embedding | On | Default Ingestion with VLM image captioning enabled |
+| 4 | VLM nano | Default Embedding | Off | Default Ingestion with VLM image captioning enabled |
+
+### Impact Factors
+
+TTFT (Time to First Token) at high concurrency is primarily determined by KV cache memory pressure. Each active request occupies HBM proportional to sequence length × model layers × hidden dimension. When aggregate KV cache across concurrent requests saturates HBM, incoming requests queue, driving up p95 TTFT.
+
+Factors governing TTFT:
+
+- **Model size:** A larger model (49B) has more layers and a wider hidden dimension, consuming more HBM per request than a smaller model (12B) at the same input length.
+- **Reasoning chain:** When reasoning is enabled, the model generates a full chain-of-thought before the first answer token. The caller's perceived TTFT includes the full thinking chain duration, and the extended request lifetime occupies KV cache slots for longer, accelerating HBM saturation under concurrency.
+- **Image processing pipeline (VLM configurations only):** When VLM inference is enabled and retrieved documents contain chunks with `content_metadata.type = "image"` or `"structured"`, the pipeline fetches thumbnail images from MinIO object storage, encodes them to base64 PNG, and injects them into the prompt alongside text before generation begins. This pre-generation overhead adds directly to TTFT per request, independent of KV cache pressure.
+
+**ITL (Inter-Token Latency) measures batch contention during the decode phase.** At each decode step, the GPU computes attention over all active requests' accumulated KV caches simultaneously. The more concurrent requests sharing a decode step, the longer each individual token waits — resulting in higher ITL.
+
+Factors governing ITL:
+
+- **Effective batch size:** Configurations that fit more concurrent requests into HBM simultaneously (small model + short outputs) produce higher ITL due to greater per-token contention.
+- **Output length per request:** Longer outputs per request (reasoning chains) reduce how many requests can coexist in HBM at once, lowering batch contention and ITL.
+
+### Dataset Mode Results
+
+The purpose of this mode is to evaluate the performance of RAG under different usage of LLM and VLM, along with enabling reasoning on or off. The following Helm chart configuration is applied for retrieval:
+
+| # | Configuration | Description | Value |
+|---|--------------|-------------|-------|
+| 1 | LLM/VLM #GPUs | Number of GPUs allocated to LLM/VLM | 2 |
+| 2 | Reranker / Embedding / VectorDB #GPUs | Number of GPUs allocated to each service | 1 |
+| 3 | Citation | Whether citation source should be returned | Off |
+| 4 | VDB K | Number of records taken from Vector DB | 100 |
+| 5 | Reranker K | Top number of records returned after reranking | 10 |
+| 6 | total_requests | Total requests sent to RAG server per concurrency | MAX(100, 5 × Concurrency) |
+
+#### KG-RAG
+
+![KG-RAG — H100 Performance](assets/perf-benchmarks/kgrag_h100_performance.png)
+
+- TTFT is highest (~210s p95) for the LLM-Reasoning-On configuration due to the large per-request KV cache of the 49B model and extended request lifetimes from reasoning-chain generation.
+- TTFT is lowest for VLM-Reasoning-Off, consistent with the VLM nano model's smaller KV cache on a text-dominant structured corpus.
+- Both LLM-Reasoning-Off and VLM-Reasoning-On configurations show intermediate TTFT values.
+- ITL is highest for VLM-Reasoning-Off (~260ms) because the small VLM nano KV cache allows for a high number of concurrent requests, maximizing batch contention during each decode step.
+- VLM-Reasoning-On records significantly lower ITL than VLM-Reasoning-Off — approximately 20% of the reasoning-off value — despite using the same 12B model. When reasoning is enabled, the extended chain-of-thought output occupies KV cache slots for a much longer duration per request, substantially reducing the number of requests the scheduler can hold in the active batch simultaneously. This ITL reduction is proportionally similar to the LLM-Reasoning-On vs. LLM-Reasoning-Off drop, but far larger in absolute milliseconds — because VLM-Reasoning-Off reaches ~260ms due to its large active batch, so shrinking that batch via reasoning has much more room to reduce contention than on the LLM side, where the batch was already small.
+- ITL is lowest for LLM-Reasoning-On (< 20ms, near-flat). The 49B model with its long reasoning-chain outputs effectively reduces the concurrent batch size, thereby limiting per-token contention.
+- ITL for LLM-Reasoning-Off is intermediate (above LLM-Reasoning-On). This is because the same large 49B model with short outputs permits more requests to share decode steps, leading to increased contention relative to the reasoning-on case.
+
+#### RagBattlePacket
+
+![RagBattlePacket — H100 Performance](assets/perf-benchmarks/ragbattlepacket_h100_performance.png)
+
+- VLM-Reasoning-On and LLM-Reasoning-On converge at the highest TTFT values, both exceeding 140s at concurrency=125.
+- For VLM configurations on RagBattlePacket, VLM-enabled ingestion produces chunks with `content_metadata.type = "image"` and `"structured"` in addition to plain text. At query time, the pipeline fetches thumbnails from MinIO and constructs a multimodal prompt (text + images) sent to VLM nano. This image processing overhead — MinIO fetch, base64 PNG encoding, and a larger multimodal prefill — adds directly to TTFT per request.
+- LLM-Reasoning-Off records substantially lower TTFT than both VLM configurations. Default ingestion produces text-only chunks and `enable_vlm_inference=False` means no image processing pipeline is triggered — generation begins immediately on a text-only prompt.
+- LLM-Reasoning-On records the lowest ITL, remaining near-flat below 20ms throughout.
+- VLM-Reasoning-Off records the highest ITL, plateauing near 360ms from concurrency=25 onwards — the steepest rise to a plateau observed across all datasets.
+- The rapid ITL rise at low concurrency (concurrency=10 to 25) followed by a plateau indicates the system reaches maximum batch occupancy early, after which the scheduler begins queuing rather than further expanding the active batch.
+- VLM-Reasoning-On records approximately 6× lower ITL than VLM-Reasoning-Off on this dataset. Both VLM modes go through the same image processing pipeline, so the delta is driven entirely by output length. VLM-Reasoning-Off produces short outputs — the 12B model's small KV cache already allows a large active batch, and short per-request lifetimes keep that batch continuously full, sustaining high decode contention. When reasoning is enabled, each request generates a long chain-of-thought over the visually complex tax documents before producing its final answer, holding each request in the active batch for a far longer duration, preventing the scheduler from admitting new requests and shrinking the effective batch size significantly. The magnitude of the drop is amplified by RagBattlePacket's document complexity, which elicits longer reasoning chains than simpler text corpora.
+
+#### HotPotQA
+
+![HotPotQA — H100 Performance](assets/perf-benchmarks/hotpotqa_h100_performance.png)
+
+- LLM-Reasoning-On produces the highest p95 TTFT of all datasets, exceeding 250s at concurrency=125.
+  - Reason: HotPotQA's multi-hop questions require chaining facts across multiple source documents, which elicits longer reasoning chains from a thinking-enabled model. Each request holds a large KV cache slot (49B model) for an extended duration, accelerating HBM saturation at scale.
+- VLM-Reasoning-Off records the lowest TTFT.
+  - Reason: HotPotQA source documents are plain Wikipedia text containing no tables or figures, so VLM ingestion does not inflate retrieved context. The model-size advantage of VLM nano translates directly into lower KV cache pressure and reduced queuing latency.
+- VLM-Reasoning-Off again records the highest ITL, rising continuously to approximately 330ms at concurrency=125. This indicates the system has not yet reached batch saturation at concurrency=125 on this dataset, consistent with HotPotQA's short Wikipedia paragraph chunks producing compact retrieved contexts that allow the VLM nano model to continue accepting additional concurrent requests at the highest tested concurrency.
+- LLM-Reasoning-On remains the lowest ITL configuration, near-flat below 25ms.
+
+#### BO767
+
+![BO767 — H100 Performance](assets/perf-benchmarks/bo767_h100_performance.png)
+
+- On BO767, LLM-Reasoning-Off achieves lower TTFT than VLM-Reasoning-Off, even though LLM-49B is the larger model. This reversal is driven by the additional image processing pipeline overhead (thumbnail fetch and encoding) that only VLM configurations incur on this visually dense corpus.
+- LLM-Reasoning-On records the highest TTFT, reaching approximately 165s at concurrency=125, driven by the 49B model's large per-request KV cache and extended request lifetimes from reasoning-chain generation.
+- VLM configurations carry additional per-request TTFT overhead from the image processing pipeline. The BO767 VLM-ingested index contains 45,819 image chunks and 31,030 structured chunks (49.2% of total records). When these chunk types appear in the retrieved top-10, the pipeline fetches thumbnails from MinIO, encodes them to base64 PNG, and sends a multimodal prompt to VLM nano — adding latency before generation begins on every affected request.
+- LLM configurations are immune to this overhead regardless of index content. With `enable_vlm_inference=False`, the query-time pipeline performs text-only generation with no MinIO fetch, explaining why LLM-Reasoning-Off achieves lower TTFT than VLM-Reasoning-Off despite being the larger model.
+- VLM-Reasoning-Off ITL rises to approximately 380ms by concurrency=50 and then holds a sustained plateau through concurrency=125 — indicating the system reaches maximum batch occupancy early on this corpus.
+- LLM-Reasoning-Off plateaus at approximately 40ms, well below VLM-Reasoning-Off, consistent with the 49B model's larger per-request KV cache limiting the concurrent batch size.
+- LLM-Reasoning-On records the lowest ITL, near-flat below 25ms across all concurrency levels.
+
+### Cross-Dataset Patterns
+
+**TTFT Ordering (Time to First Token)** — Across all four datasets, the following TTFT ordering holds consistently:
+
+- LLM-Reasoning-On produces the highest or joint-highest TTFT, driven by the combination of a large per-request KV cache and long request lifetimes from reasoning-chain generation.
+- VLM-Reasoning-Off produces the lowest or joint-lowest TTFT, benefiting from the VLM nano model's small KV cache footprint and the absence of reasoning-chain latency.
+- The relative ordering of LLM-Reasoning-Off vs. VLM-Reasoning-Off depends on corpus visual content. On visually dense corpora (RagBattlePacket, BO767), VLM-enabled ingestion produces image and structured chunks that trigger per-request image processing overhead at query time (MinIO thumbnail fetch, base64 encoding, multimodal prompt construction), adding directly to TTFT for VLM configurations. LLM configurations bypass this pipeline entirely, achieving lower TTFT despite the larger model. On text-dominant corpora (HotPotQA, KG-RAG), no image processing is triggered and the 12B model's smaller KV cache footprint gives VLM-Reasoning-Off the lower TTFT.
+
+**ITL Ordering (Inter-Token Latency)** — Across all datasets, the ITL ordering is fully consistent:
+
+| Rank | Configuration | Mechanism |
+|------|--------------|-----------|
+| **Highest ITL** | VLM-Reasoning-Off | Small 12B model + short outputs = maximum concurrent requests in HBM = highest batch contention per decode step. |
+| **2nd** | LLM-Reasoning-Off | Large 49B model limits concurrency, but short outputs allow moderate batch occupancy. |
+| **3rd** | VLM-Reasoning-On | Reasoning chain extends output length, reducing concurrent requests in HBM vs. reasoning-off. |
+| **Lowest ITL** | LLM-Reasoning-On | Large 49B model + very long reasoning-chain outputs = minimum concurrent requests in HBM = lowest batch contention. |
+
+**ITL Plateau Behavior** — VLM-Reasoning-Off ITL plateaus or slightly declines at very high concurrency on several datasets (RagBattlePacket from c=25, BO767 from c=50). This reflects the onset of request queuing: once HBM is saturated, the scheduler queues incoming requests rather than expanding the active decode batch, which caps batch contention and prevents further ITL growth.
+
+### Synthetic Mode Results
+
+The purpose of this mode is to re-evaluate the latency difference between LLM and VLM in isolation, removing dataset-specific effects such as visual content and reasoning-chain variability. A Wikipedia dataset of 50,000 records is pre-ingested into the Vector Database, providing a sufficiently large and diverse text-only corpus for the retrieval stage to operate under realistic conditions. The same Helm chart configuration as dataset mode is applied. The workload is fixed at ISL=128, OSL=128, representing an ordinary conversational use case — short questions, short answers — where each request occupies minimal KV cache.
+
+The same hardware allocation as dataset mode is applied: 2 GPUs for the LLM/VLM model server and 1 GPU each for the reranker, embedding model, and vector database.
+
+![Wikipedia — H100 LLM vs VLM (Reasoning Off)](assets/perf-benchmarks/wikipedia_synthetic_h100_performance.png)
+
+With reasoning disabled on both configurations and a uniform text-only corpus, the results expose the pure effect of model size on each metric:
+
+- **TTFT:** LLM-Reasoning-Off records marginally higher TTFT than VLM-Reasoning-Off across all concurrency levels, reaching approximately 65s vs. 60s at concurrency=125. Both curves rise linearly throughout the tested range, consistent with a chat-style workload where each request places minimal KV cache pressure. This confirms that for conversational workloads, both LLM and VLM configurations sustain responsive TTFT without entering a queuing collapse, and the model-size difference has negligible practical impact on user-perceived latency in this regime. The narrow gap between the two configurations reflects the fact that at these short sequence lengths, both models receive equally small inputs and produce equally short outputs — the KV cache size per request is nearly identical, leaving model parameter count as the only differentiator with a proportionally small impact.
+
+- **ITL:** The two configurations diverge sharply. LLM-Reasoning-Off ITL plateaus and remains flat at approximately 40ms from concurrency=25 onwards, indicating the decode batch has reached its HBM capacity and the scheduler has begun queuing excess requests beyond that point. VLM-Reasoning-Off ITL rises steeply and continuously, reaching approximately 220ms at concurrency=125 with no plateau visible — the 12B model's small per-request KV cache allows the scheduler to keep admitting more concurrent requests into the active decode batch as concurrency grows, continuously increasing per-token contention. The 49B LLM model's larger KV cache footprint caps the effective batch size early, preventing further ITL growth beyond concurrency=25.
+
+Taken together, the two metrics reveal two distinct saturation thresholds: decode batch saturation, which the LLM hits at low concurrency and is reflected in the ITL plateau, and full system TTFT saturation, which neither configuration reaches within the tested concurrency range for this chat-scale workload. This synthetic result serves as a clean baseline confirmation of the theoretical framework — on a text-only corpus with no image processing overhead, model size is the sole differentiating factor, producing the expected TTFT ordering (LLM slightly higher) and ITL ordering (VLM significantly higher at scale).
+
+### Cross-Dataset Latency with LLM-Reasoning-Off
+
+![LLM-Reasoning-Off — All Datasets (H100)](assets/perf-benchmarks/cross_dataset_llm_reasoning_off.png)
+
+In addition to per-dataset views, all four benchmarks plus the synthetic Wikipedia workload are aggregated into a single comparison using the same Llama-3.3-Nemotron-Super-49B configuration with reasoning disabled. This chart reports p95 Time To First Token (TTFT) and Inter-Token Latency (ITL) as concurrency increases on H100 GPUs.
+
+Wikipedia synthetic serves as a lower bound: with fixed ISL=128 and OSL=128, no retrieval, and text-only inputs, it represents the lightest possible workload for the model. All real RAG datasets sit above this baseline. HotPotQA and KG-RAG show the highest TTFT and ITL, reflecting their heavier retrieved contexts (multi-hop Wikipedia reasoning and SEC 10-Q filings). RagBattlePacket falls in the middle, while BO767 exhibits the lowest ITL among the RAG datasets and closely tracks the Wikipedia ITL plateau. Its TTFT remains higher than Wikipedia's because real retrieval still adds pre-generation overhead.
+
+Taken together, this cross-dataset view confirms that the latency behavior of the 49B LLM is consistent and predictable: for a fixed model and hardware configuration, TTFT increases roughly linearly with concurrency for every dataset, and datasets that supply more complex or extensive retrieved context exhibit proportionally higher TTFT and ITL than the synthetic baseline.
+
+## Key Takeaways
+
+**Model size governs the TTFT/ITL trade-off direction, but corpus visual content determines its magnitude.** On text-only corpora, the smaller VLM nano (12B) delivers lower TTFT than the larger LLM-49B due to its reduced KV cache footprint — but this advantage is fully reversed on visually dense corpora (BO767, RagBattlePacket), where the VLM image processing pipeline (MinIO thumbnail fetch, base64 encoding, multimodal prompt construction) adds per-request overhead that outweighs the model-size benefit. As a result, on visually dense corpora, LLM-Reasoning-Off achieves lower TTFT than VLM-Reasoning-Off despite being the larger model — the image processing overhead is the dominant factor, not model size.
+
+**Reasoning is a force multiplier on TTFT, not just an accuracy switch.** Enabling reasoning on LLM-49B produces the highest TTFT in nearly every dataset tested. The chain-of-thought is generated autoregressively before the first answer token is returned, extending request lifetime and occupying KV cache slots for longer — accelerating HBM saturation under concurrency. This is a system-level cost, not just a per-request latency cost.
+
+**TTFT and ITL pull in opposite directions by design.** A small model with short outputs lets the scheduler pack more requests in parallel — this keeps TTFT low but creates a large, contended decode batch that drives ITL up. A large model with long reasoning outputs does the opposite: it shrinks the active batch, keeping ITL low but consuming more HBM per request and causing queuing that raises TTFT. Across every dataset tested, the lowest-TTFT configuration always has the highest ITL, and vice versa. No single configuration optimizes both metrics simultaneously — configuration selection is a deliberate trade-off between response latency and decode throughput.
+
+**ITL plateau is the earliest signal of HBM saturation.** Before TTFT shows non-linear growth, ITL flattening reveals that the scheduler has already begun queuing requests and capping the decode batch. In the synthetic experiment, LLM ITL plateaus at concurrency=25 while TTFT is still rising linearly, consistent with a chat-scale workload (ISL=128, OSL=128) where each request is small and queuing remains well controlled. In this regime, both LLM and VLM configurations keep end-user response times within an acceptable band.
+
+## Related Topics
+
+- [RAG Accuracy Benchmarks](accuracy-benchmarks.md)
+- [Evaluate Your NVIDIA RAG Blueprint System](evaluate.md)
+- [Enable Reasoning in Nemotron LLM Models](enable-nemotron-thinking.md)
+- [VLM-Based Inferencing in RAG](vlm.md)
+- [Image Captioning Support](image_captioning.md)
+- [Best Practices for Common Settings](accuracy_perf.md)
diff --git a/docs/project.json b/docs/project.json
index 66344b5d0..9b67aad99 100644
--- a/docs/project.json
+++ b/docs/project.json
@@ -1,4 +1,4 @@
 {
     "name": "NVIDIA-RAG-blueprint",
-    "version": "2.4.0"
+    "version": "2.5.0"
 }
\ No newline at end of file
diff --git a/docs/python-client.md b/docs/python-client.md
index 73b432eaf..5c9bc33ea 100644
--- a/docs/python-client.md
+++ b/docs/python-client.md
@@ -155,12 +155,12 @@ Verify all containers are running and healthy.
 
 ```output
 NAMES                           STATUS
-nemoretriever-ranking-ms        Up ... (healthy)
+nemotron-ranking-ms             Up ... (healthy)
 compose-page-elements-1         Up ...
 compose-nemoretriever-ocr-1     Up ...
 compose-graphic-elements-1      Up ...
 compose-table-structure-1       Up ...
-nemoretriever-embedding-ms      Up ... (healthy)
+nemotron-embedding-ms           Up ... (healthy)
 nim-llm-ms                      Up ... (healthy)
 ```
 
@@ -170,32 +170,32 @@ nim-llm-ms                      Up ... (healthy)
 
 `DEPLOYMENT_MODE = "cloud"`
 
-2.  Configure NV-Ingest to use NVIDIA hosted cloud APIs using the following hosted models.
+2.  Configure NeMo Retriever Library to use NVIDIA hosted cloud APIs using the following hosted models.
 
 - os.environ["OCR_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr"
 
 - os.environ["OCR_INFER_PROTOCOL"] = "http"
 os.environ["YOLOX_HTTP_ENDPOINT"] = (
-    "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
+    "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3"
 )
 
 - os.environ["YOLOX_INFER_PROTOCOL"] = "http"
 
 - os.environ["YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT"] = (
-    "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1"
+    "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1"
 )
 
 - os.environ["YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL"] = "http"
 
 - os.environ["YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT"] = (
-    "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1"
+    "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1"
 )
 os.environ["YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL"] = "http"
 
 
-### Setup NVIDIA Ingest Runtime and Redis Service
+### Setup NeMo Retriever Library Runtime and Redis Service
 
-Use the following command to setup your NVIDIA Ingest Runtime and Redis Service.
+Use the following command to set up your NeMo Retriever Library Runtime and Redis Service.
 
 `docker compose -f ../deploy/compose/docker-compose-ingestor-server.yaml up nv-ingest-ms-runtime redis -d`
 
@@ -247,7 +247,7 @@ if DEPLOYMENT_MODE == "cloud":
     config_ingestor.llm.server_url = ""  # Empty uses NVIDIA API catalog
     config_ingestor.summarizer.server_url = ""  # Empty uses NVIDIA API catalog
 else:
-    config_ingestor.embeddings.server_url = "http://nemoretriever-embedding-ms:8000/v1"
+    config_ingestor.embeddings.server_url = "http://nemotron-embedding-ms:8000/v1"
 
 ingestor = NvidiaRAGIngestor(config=config_ingestor)
 ```
@@ -357,11 +357,11 @@ from nvidia_rag.utils.configuration import NvidiaRAGConfig
 #         "server_url": "",
 #     },
 #     "embeddings": {
-#         "model_name": "nvidia/llama-3.2-nv-embedqa-1b-v2",
+#         "model_name": "nvidia/llama-nemotron-embed-1b-v2",
 #         "server_url": "https://integrate.api.nvidia.com/v1",
 #     },
 #     "ranking": {
-#         "model_name": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
+#         "model_name": "nvidia/llama-nemotron-rerank-1b-v2",
 #         "server_url": "",
 #     },
 # })
diff --git a/docs/query_decomposition.md b/docs/query_decomposition.md
index 8f346f847..b6826d668 100644
--- a/docs/query_decomposition.md
+++ b/docs/query_decomposition.md
@@ -26,7 +26,7 @@ Each subquery is processed independently to gather comprehensive context, which
 
 ## Accuracy Improvement Example
 
-The following example that uses the [HotpotQA](https://hotpotqa.github.io/) dataset demonstrates the accuracy improvement from enabling query decomposition.
+The following example that uses the [Google Frames](https://huggingface.co/datasets/google/frames-benchmark) benchmark demonstrates the accuracy improvement from enabling query decomposition.
 
 ```text
 Query: I am thinking of a Ancient Roman City. The city was destroyed by volcanic eruption. The eruption occurred in the year 79 AD. The volcano was a stratovolcano. Where was the session held where it was decided that the city would be named a UNESCO world heritage site?
diff --git a/docs/readme.md b/docs/readme.md
index ec8c5cf8e..acc20cd5e 100644
--- a/docs/readme.md
+++ b/docs/readme.md
@@ -113,6 +113,7 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 - Evaluation
 
     - [Evaluate Your NVIDIA RAG Blueprint System](evaluate.md)
+    - [RAG Accuracy Benchmarks](accuracy-benchmarks.md)
 
 
 - Governance
@@ -147,5 +148,5 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 
 ## Blog Posts
 
-- [NVIDIA NeMo Retriever Delivers Accurate Multimodal PDF Data Extraction 15x Faster](https://developer.nvidia.com/blog/nvidia-nemo-retriever-delivers-accurate-multimodal-pdf-data-extraction-15x-faster/)
+- [NVIDIA NeMo Retriever Library Delivers Accurate Multimodal PDF Data Extraction 15x Faster](https://developer.nvidia.com/blog/nvidia-nemo-retriever-delivers-accurate-multimodal-pdf-data-extraction-15x-faster/)
 - [Finding the Best Chunking Strategy for Accurate AI Responses](https://developer.nvidia.com/blog/finding-the-best-chunking-strategy-for-accurate-ai-responses/)
diff --git a/docs/release-notes.md b/docs/release-notes.md
index 96c48a121..6dd8e6911 100644
--- a/docs/release-notes.md
+++ b/docs/release-notes.md
@@ -8,7 +8,40 @@ This documentation contains the release notes for [NVIDIA RAG Blueprint](readme.
 
 
 
-## Release 2.4.0 (26-02-TBD)
+## Release 2.5.0 (2026-03-17)
+
+This release introduces support for the Nemotron-super-3 model, updates NIMs to the latest versions, upgrades NV-Ingest, and adds continuous ingestion along with RTX 6000 MIG support.
+
+### Highlights
+
+This release includes the following key updates:
+
+- **Nemotron-super-3 model support.** You can now integrate the Nemotron-super-3 model by following the steps outlined in [Change the Inference or Embedding Model](change-model.md).
+- **NIMs updated to latest versions.** 
+  The following model updates are included:
+  - `nvidia/llama-3.2-nv-embedqa-1b-v2` → `nvidia/llama-nemotron-embed-1b-v2`
+  - `nvidia/llama-3.2-nv-rerankqa-1b-v2` → `nvidia/llama-nemotron-rerank-1b-v2`
+  - `nemoretriever-page-elements-v3` → `nemotron-page-elements-v3`
+  - `nemoretriever-graphic-elements-v1` → `nemotron-graphic-elements-v1`
+  - `nemoretriever-table-structure-v1` → `nemotron-table-structure-v1`
+  - `nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1` → `nvidia/llama-nemotron-embed-vl-1b-v2`
+- Updated NV-Ingest to [version 26.1.2](https://github.com/NVIDIA/NeMo-Retriever/releases/tag/26.1.2).
+- Added an example demonstrating the continuous ingestion pipeline. For more information, see [rag_event_ingest.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_event_ingest.ipynb).
+- **Added MIG support for RTX 6000.** For details, refer to [MIG Deployment](mig-deployment.md) and use `values-mig-rtx6000.yaml` and `mig-config-rtx6000.yaml`.
+- Added documentation for the experimental Nemotron-parse-only ingestion pipeline. This configuration allows you to perform extraction using only Nemotron Parse through NV-Ingest, without relying on OCR, page-elements, graphic-elements, or table-structure NIMs. For more information, refer to [nemotron-parse-extraction.md](nemotron-parse-extraction.md#experimental-nemotron-parse-only-extraction).
+- Several bug fixes, including frontend CVE resolutions, improved multimodal content concatenation for VLM embeddings, enhanced VDB serialization for high-concurrency parallel ingestion, and updates to observability and NeMo Guardrails configurations.
+- Added agentic skills support: the `rag-blueprint` skill enables AI coding assistants (Claude Code, Cursor, Codex, etc.) to deploy, configure, troubleshoot, and manage the RAG Blueprint autonomously. For details, refer to [RAG Blueprint Agent Skill](../skill-source/README.md).
+- Added [accuracy benchmark results](accuracy-benchmarks.md) across seven public datasets (RagBattlepacket, KG-RAG, Financebench, DC767, HotPotQA, Google Frames, and Vidore), comparing LLM and VLM configurations with reasoning on/off. Benchmarks use the NVIDIA Answer Accuracy metric from RAGAS.
+
+### Fixed Known Issues
+
+The following known issues have been resolved in this release:
+
+- Addressed frontend CVEs.
+
+- Resolved VDB indexing issues during high-concurrency batch parallel ingestion by implementing VDB serialization.
+
+## Release 2.4.0 (2026-02-20)
 
 This release adds new features to the RAG pipeline for supporting agent workflows and enhances generations with VLMs augmenting multimodal input.
 
@@ -16,10 +49,10 @@ This release adds new features to the RAG pipeline for supporting agent workflow
 
 This release contains the following key changes:
 
-- Updated NIMs and code to support  [NVIDIA Ingest 26.01 release](https://docs.nvidia.com/nemo/retriever/latest/extraction/releasenotes-nv-ingest/).
+- Updated NIMs and code to support [NeMo Retriever Library 26.01 release](https://docs.nvidia.com/nemo/retriever/latest/extraction/releasenotes-nv-ingest/).
 - Added support for non-NIM models including OpenAI, models hosted on AWS and Azure, OSS models, and others. Supported through service-specific API keys. For details, refer to [Get an API Key](api-key.md).
-- The RAG Blueprint now uses [nemoretriever-ocr-v1](https://build.nvidia.com/nvidia/nemoretriever-ocr-v1/modelcard) as the default OCR model. For details, refer to [NeMo Retriever OCR Configuration Guide](nemoretriever-ocr.md).
-- The Vision-Language Model (VLM) inference feature now uses the model [nemotron-nano-12b-v2-vl](https://build.nvidia.com/nvidia/nemotron-nano-12b-v2-vl/modelcard). For details, refer to [VLM for Generation](vlm.md).
+- The RAG Blueprint now uses [nemoretriever-ocr-v1](https://build.nvidia.com/nvidia/nemoretriever-ocr-v1/modelcard) as the default OCR model. For details, refer to [NeMo Retriever Library OCR Configuration Guide](nemoretriever-ocr.md).
+- Improved VLM-based generation support. The Vision-Language Model (VLM) inference feature now uses the model [nemotron-nano-12b-v2-vl](https://build.nvidia.com/nvidia/nemotron-nano-12b-v2-vl/modelcard). For details, refer to [VLM for Generation](vlm.md).
 - User interface improvements including catalog display, image and text query, and others. For details, refer to [User Interface](user-interface.md).
 - Added ingestion metrics endpoint support with OpenTelemetry (OTEL) for monitoring document uploads, elements ingested, and pages processed. For details, refer to [Observability](observability.md).
 - Support image and text as input query. For details, refer to [Multimodal Query Support](multimodal-query.md).
@@ -40,7 +73,7 @@ This release contains the following key changes:
   - Shallow summarization support
   - Easy model switches and dedicated configurations
   - Ease of prompt changes
-- Reserved field names `type`, `subtype`, and `location` for NV-Ingest exclusive use in metadata schemas.
+- Reserved field names `type`, `subtype`, and `location` for NeMo Retriever Library exclusive use in metadata schemas.
 - Added support for [rag_library_lite_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_lite_usage.ipynb) which demonstrates containerless deployment of the NVIDIA RAG Python package in lite mode.
 - Added example showcasing [NeMo Agent Toolkit integration](https://github.com/NVIDIA/NeMo-Agent-Toolkit) with NVIDIA RAG.
 - Added [weighted hybrid search](hybrid_search.md#weighted-hybrid-search) support with configurable weights.
@@ -77,7 +110,7 @@ The following are the known issues for the NVIDIA RAG Blueprint:
 - Optional features reflection and image captioning are not available in Helm-based deployment.
 - Currently, Helm-based deployment is not supported for [NeMo Guardrails](nemo-guardrails.md).
 - The Blueprint responses can have significant latency when using [NVIDIA API Catalog cloud hosted models](deploy-docker-nvidia-hosted.md).
-- The accuracy of the pipeline is optimized for certain file types like `.pdf`, `.txt`, `.docx`. The accuracy may be poor for other file types supported by NV-Ingest, since image captioning is disabled by default.
+- The accuracy of the pipeline is optimized for certain file types like `.pdf`, `.txt`, `.docx`. The accuracy may be poor for other file types supported by NeMo Retriever Library, since image captioning is disabled by default.
 - When updating model configurations in Kubernetes `values.yaml` (for example, changing from 70B to 8B models), the RAG UI automatically detects and displays the new model configuration from the backend. No container rebuilds are required - simply redeploy the Helm chart with updated values and refresh the UI to see the new model settings in the Settings panel.
 - The NeMo LLM microservice can take 5-6 minutes to start for every deployment.
 - B200 GPUs are not supported for the following advanced features. For these features, use H100 or A100 GPUs instead.
diff --git a/docs/retrieval-only-deployment.md b/docs/retrieval-only-deployment.md
index 3cfc5d30a..7f7f94475 100644
--- a/docs/retrieval-only-deployment.md
+++ b/docs/retrieval-only-deployment.md
@@ -88,11 +88,11 @@ Choose one of the following options based on your deployment preference.
 Instead of starting all NIMs, use the `text-embed` profile to start only the embedding and reranking services:
 
 ```bash
-USERID=$(id -u) docker compose -f deploy/compose/nims.yaml up -d nemoretriever-ranking-ms nemoretriever-embedding-ms
+USERID=$(id -u) docker compose -f deploy/compose/nims.yaml up -d nemotron-ranking-ms nemotron-embedding-ms
 ```
 
 :::{note}
-The `text-embed` profile starts only `nemoretriever-embedding-ms` and `nemoretriever-ranking-ms `, which is sufficient for retrieval operations. The LLM NIM (`nim-llm-ms`) is not started, saving significant GPU memory.
+The `text-embed` profile starts only `nemotron-embedding-ms` and `nemotron-ranking-ms`, which is sufficient for retrieval operations. The LLM NIM (`nim-llm-ms`) is not started, saving significant GPU memory.
 :::
 
 Wait for the services to become healthy:
@@ -105,8 +105,8 @@ Expected output:
 
 ```output
 NAMES                          STATUS
-nemoretriever-ranking-ms       Up 5 minutes (healthy)
-nemoretriever-embedding-ms     Up 5 minutes (healthy)
+nemotron-ranking-ms            Up 5 minutes (healthy)
+nemotron-embedding-ms          Up 5 minutes (healthy)
 ```
 
 #### Option B: NVIDIA-Hosted NIMs
@@ -308,7 +308,7 @@ This is expected behavior in retrieval-only mode. The `/generate` endpoint requi
 Check the embedding NIM logs:
 
 ```bash
-docker logs nemoretriever-embedding-ms
+docker logs nemotron-embedding-ms
 ```
 
 Ensure the model cache directory has proper permissions:
diff --git a/docs/scripts/build_multiversion_docs.ps1 b/docs/scripts/build_multiversion_docs.ps1
new file mode 100644
index 000000000..4aa131798
--- /dev/null
+++ b/docs/scripts/build_multiversion_docs.ps1
@@ -0,0 +1,165 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+<#
+.SYNOPSIS
+  Build Sphinx HTML for multiple release lines into a single publish layout.
+
+.DESCRIPTION
+  For each version, checks out git ref v{version} (tag) or release-v{version} (branch),
+  writes the canonical docs/versions1.json (so every build lists the same versions),
+  runs verify_doc_version_manifest.py, then sphinx-build into
+  docs/_build/multiversion/{version}/.
+
+  Copies the same manifest to docs/_build/multiversion/versions1.json for the
+  version switcher when the site root is this folder.
+
+  Requires a clean working tree unless -AllowDirty is used.
+
+.PARAMETER Versions
+  Semver strings without a leading v, e.g. 2.3.0, 2.4.0, 2.5.0
+
+.PARAMETER CanonicalManifest
+  Path to the versions1.json to inject on every checkout (default: docs/versions1.json
+  from the working tree at script start — save a backup first if needed).
+
+.PARAMETER OutputRoot
+  Directory under docs/ that will contain per-version folders and root versions1.json
+  (default: docs/_build/multiversion).
+
+.EXAMPLE
+  .\docs\scripts\build_multiversion_docs.ps1 -Versions @('2.3.0','2.4.0','2.5.0')
+
+.EXAMPLE
+  .\docs\scripts\build_multiversion_docs.ps1 -DryRun
+#>
+
+[CmdletBinding()]
+param(
+    [Parameter(Position = 0)]
+    [string[]]$Versions = @('2.3.0', '2.4.0', '2.5.0'),  # release lines to build, without a leading "v"
+
+    [string]$CanonicalManifest = '',  # empty => docs\versions1.json under the repo root (resolved below)
+
+    [string]$OutputRoot = '',  # empty => docs\_build\multiversion under the repo root (resolved below)
+
+    [switch]$DryRun,  # only print the version/ref/destination mapping; no checkout, no build
+
+    [switch]$AllowDirty  # skip the clean-working-tree guard
+)
+
+Set-StrictMode -Version Latest
+$ErrorActionPreference = 'Stop'
+
+$RepoRoot = (Resolve-Path (Join-Path $PSScriptRoot '..\..')).Path  # two directory levels above this script's folder
+Set-Location $RepoRoot  # the git and uv commands below use repo-relative paths
+
+if (-not $OutputRoot) {
+    $OutputRoot = Join-Path $RepoRoot 'docs\_build\multiversion'
+} else {
+    $OutputRoot = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($OutputRoot)  # made absolute; relative input resolves against the location set above
+}
+
+if (-not $CanonicalManifest) {
+    $CanonicalManifest = Join-Path $RepoRoot 'docs\versions1.json'
+} else {
+    $CanonicalManifest = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($CanonicalManifest)  # made absolute; existence is checked later with Test-Path
+}
+
+function Resolve-VersionGitRef {  # map a version such as 2.5.0 to an existing ref: tag v2.5.0 first, then branch release-v2.5.0; throws if neither exists
+    param([string]$Version)
+    $tag = "v$Version"
+    git rev-parse -q --verify "refs/tags/$tag" 2>$null | Out-Null  # output is discarded; only the exit code is used
+    if ($LASTEXITCODE -eq 0) {
+        return $tag
+    }
+    $branch = "release-v$Version"
+    git rev-parse -q --verify "refs/heads/$branch" 2>$null | Out-Null  # refs/heads covers local branches only, not remote-tracking ones
+    if ($LASTEXITCODE -eq 0) {
+        return $branch
+    }
+    throw "No git tag '$tag' or branch '$branch' found for version $Version"
+}
+
+if (-not $DryRun) {
+    $dirty = git status --porcelain  # any output means uncommitted or untracked changes
+    if ($dirty -and -not $AllowDirty) {
+        throw "Working tree is dirty. Commit or stash changes, or pass -AllowDirty."
+    }
+}
+
+if (-not (Test-Path -LiteralPath $CanonicalManifest)) {
+    throw "Canonical manifest not found: $CanonicalManifest"
+}
+# Read the manifest once, before any checkout, so every version receives identical content.
+$canonicalJson = [System.IO.File]::ReadAllText($CanonicalManifest)
+# Prefer the branch name so the final restore puts HEAD back on the branch; use the SHA only when HEAD is already detached.
+$origHead = git symbolic-ref -q --short HEAD; if ($LASTEXITCODE -ne 0) { $origHead = git rev-parse HEAD }
+
+try {  # build every requested version; the finally block always switches the checkout back to $origHead
+    if (-not $DryRun) {
+        New-Item -ItemType Directory -Force -Path $OutputRoot | Out-Null
+    }
+
+    foreach ($ver in $Versions) {
+        $ref = Resolve-VersionGitRef -Version $ver
+        $dest = Join-Path $OutputRoot $ver
+
+        Write-Host "==> Version $ver <= ref $ref => $dest" -ForegroundColor Cyan
+
+        if ($DryRun) {
+            continue  # dry run: report the mapping only; no checkout, no build
+        }
+        git checkout $ref
+        if ($LASTEXITCODE -ne 0) { throw "git checkout failed for $ver (ref $ref)" }  # otherwise the previously checked-out ref would be built into $dest
+        [System.IO.File]::WriteAllText(
+            (Join-Path $RepoRoot 'docs\versions1.json'),
+            $canonicalJson,
+            [System.Text.UTF8Encoding]::new($false)  # $false => UTF-8 without a byte-order mark
+        )
+
+        & uv run python docs/scripts/verify_doc_version_manifest.py
+        if ($LASTEXITCODE -ne 0) {
+            throw "verify_doc_version_manifest.py failed for $ver (ref $ref)"
+        }
+
+        if (Test-Path -LiteralPath $dest) {
+            Remove-Item -LiteralPath $dest -Recurse -Force  # start from an empty output folder for this version
+        }
+        New-Item -ItemType Directory -Force -Path $dest | Out-Null
+
+        & uv run --group docs sphinx-build docs $dest
+        if ($LASTEXITCODE -ne 0) {
+            throw "sphinx-build failed for $ver"
+        }
+    }
+
+    if (-not $DryRun) {
+        $rootManifest = Join-Path $OutputRoot 'versions1.json'  # copy of the manifest at the root of the output folder
+        [System.IO.File]::WriteAllText(
+            $rootManifest,
+            $canonicalJson,
+            [System.Text.UTF8Encoding]::new($false)
+        )
+        Write-Host "Wrote $rootManifest" -ForegroundColor Green
+    }
+}
+finally {
+    if (-not $DryRun) {
+        git checkout $origHead
+        Write-Host "Restored HEAD to $origHead" -ForegroundColor DarkGray
+    }
+}
+
+Write-Host 'Done.' -ForegroundColor Green
diff --git a/docs/scripts/build_multiversion_docs.sh b/docs/scripts/build_multiversion_docs.sh
new file mode 100644
index 000000000..23643b35f
--- /dev/null
+++ b/docs/scripts/build_multiversion_docs.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Build Sphinx HTML for multiple release lines into docs/_build/multiversion/.
+# See build_multiversion_docs.ps1 for behavior and options.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute directory containing this script
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"  # two levels above SCRIPT_DIR
+cd "${REPO_ROOT}"  # the git and uv commands below use repo-relative paths
+
+DRY_RUN=0  # 1 => only print the version/ref/destination mapping
+ALLOW_DIRTY=0  # 1 => skip the clean-working-tree guard
+VERSIONS=(2.3.0 2.4.0 2.5.0)  # default release lines; replaced by --versions
+CANONICAL_MANIFEST="${REPO_ROOT}/docs/versions1.json"  # manifest written into every checkout; replaced by --manifest
+OUTPUT_ROOT="${REPO_ROOT}/docs/_build/multiversion"  # receives one subfolder per version; replaced by --output-root
+
+usage() {  # print the option summary to stderr and exit with status 1 (also reached via -h/--help)
+  echo "Usage: $0 [--dry-run] [--allow-dirty] [--versions V1,V2,...] [--manifest PATH] [--output-root PATH]" >&2
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  # A value-taking option given as the last argument would otherwise abort with
+  # an opaque "unbound variable" error under set -u; show the usage text instead.
+  if [[ "$1" =~ ^--(versions|manifest|output-root)$ && $# -lt 2 ]]; then echo "Missing value for $1" >&2; usage; fi
+  case "$1" in
+    --dry-run) DRY_RUN=1; shift ;;
+    --allow-dirty) ALLOW_DIRTY=1; shift ;;
+    --versions) IFS=',' read -r -a VERSIONS <<< "$2"; shift 2 ;;  # comma-separated list, e.g. 2.4.0,2.5.0
+    --manifest) CANONICAL_MANIFEST="$2"; shift 2 ;;
+    --output-root) OUTPUT_ROOT="$2"; shift 2 ;;
+    -h|--help) usage ;;
+    *) echo "Unknown option: $1" >&2; usage ;;
+  esac
+done
+
+resolve_ref() {  # echo the git ref for version $1: tag v<ver> first, then local branch release-v<ver>; return 1 if neither exists
+  local ver="$1"
+  local tag="v${ver}"
+  local branch="release-v${ver}"
+  if git rev-parse -q --verify "refs/tags/${tag}" >/dev/null 2>&1; then  # output discarded; only the exit status is used
+    echo "${tag}"
+    return
+  fi
+  if git rev-parse -q --verify "refs/heads/${branch}" >/dev/null 2>&1; then  # refs/heads covers local branches only
+    echo "${branch}"
+    return
+  fi
+  echo "No git tag ${tag} or branch ${branch} for ${ver}" >&2
+  return 1
+}
+
+if [[ "${DRY_RUN}" -eq 0 ]]; then
+  if [[ -n "$(git status --porcelain)" && "${ALLOW_DIRTY}" -eq 0 ]]; then  # any output means uncommitted or untracked changes
+    echo "Working tree is dirty. Commit or stash, or pass --allow-dirty." >&2
+    exit 1
+  fi
+fi
+
+if [[ ! -f "${CANONICAL_MANIFEST}" ]]; then
+  echo "Canonical manifest not found: ${CANONICAL_MANIFEST}" >&2
+  exit 1
+fi
+# Read the manifest once, before any checkout, so every version receives identical content.
+canonical_json="$(cat "${CANONICAL_MANIFEST}")"
+# Prefer the branch name so the EXIT trap puts HEAD back on the branch; use the SHA only when HEAD is already detached.
+orig_head="$(git symbolic-ref -q --short HEAD || git rev-parse HEAD)"
+if [[ "${DRY_RUN}" -eq 0 ]]; then
+  mkdir -p "${OUTPUT_ROOT}"
+  trap 'git checkout "${orig_head}"' EXIT  # runs on success and on failure alike
+fi
+
+for ver in "${VERSIONS[@]}"; do
+  ref="$(resolve_ref "${ver}")"  # under set -e an unresolvable version aborts the script here
+  dest="${OUTPUT_ROOT}/${ver}"
+  echo "==> Version ${ver} <= ref ${ref} => ${dest}"
+
+  if [[ "${DRY_RUN}" -ne 0 ]]; then
+    continue  # dry run: report the mapping only; no checkout, no build
+  fi
+
+  git checkout "${ref}"  # NOTE(review): the manifest written below may block the next checkout if versions1.json differs between refs - confirm
+  printf '%s' "${canonical_json}" >"${REPO_ROOT}/docs/versions1.json"  # written without a trailing newline
+
+  uv run python docs/scripts/verify_doc_version_manifest.py
+
+  rm -rf "${dest}"  # start from an empty output folder for this version
+  mkdir -p "${dest}"
+  uv run --group docs sphinx-build docs "${dest}"
+done
+
+if [[ "${DRY_RUN}" -eq 0 ]]; then
+  printf '%s' "${canonical_json}" >"${OUTPUT_ROOT}/versions1.json"  # copy of the manifest at the root of the output folder
+  echo "Wrote ${OUTPUT_ROOT}/versions1.json"
+fi
+
+echo "Done."
diff --git a/docs/scripts/verify_doc_version_manifest.py b/docs/scripts/verify_doc_version_manifest.py
new file mode 100644
index 000000000..6392064ed
--- /dev/null
+++ b/docs/scripts/verify_doc_version_manifest.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Validate docs/versions1.json and consistency with conf.py / project.json.
+
+Run from the repository root:
+
+    uv run python docs/scripts/verify_doc_version_manifest.py
+
+Use before building and publishing documentation so the version switcher manifest
+is well-formed and matches the current branch's declared release.
+"""
+
+from __future__ import annotations
+
+import argparse
+import ast
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def _docs_dir() -> Path:  # default docs folder: the parent of the directory that holds this script
+    return Path(__file__).resolve().parent.parent
+
+
+def _read_release_from_conf(conf_path: Path) -> str:  # value of a string-literal `release = "..."` assignment in conf.py
+    tree = ast.parse(conf_path.read_text(encoding="utf-8"))  # conf.py is parsed, never executed
+    for node in ast.walk(tree):  # walks every node, so nested assignments are considered too
+        if isinstance(node, ast.Assign):
+            for target in node.targets:
+                if isinstance(target, ast.Name) and target.id == "release":
+                    value = node.value
+                    if isinstance(value, ast.Constant) and isinstance(
+                        value.value, str
+                    ):  # non-literal right-hand sides (f-strings, calls) are skipped
+                        return value.value
+    raise ValueError(f'Could not find release = "..." string in {conf_path}')
+
+
+def _validate_versions_payload(data: object) -> list[dict[str, object]]:  # shape check only: a JSON array of objects
+    if not isinstance(data, list):
+        raise ValueError("versions1.json must be a JSON array")
+    rows: list[dict[str, object]] = []
+    for i, item in enumerate(data):
+        if not isinstance(item, dict):
+            raise ValueError(f"Entry {i} must be an object")  # the first bad entry stops validation
+        rows.append(item)
+    return rows  # per-field checks are done by the caller
+
+
+def main() -> int:  # returns the process exit status: 0 when every check passes, 1 otherwise
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--docs-dir",
+        type=Path,
+        default=_docs_dir(),
+        help="Path to the docs/ folder (default: next to this script)",
+    )
+    args = parser.parse_args()
+    docs = args.docs_dir.resolve()
+    versions_path = docs / "versions1.json"
+    conf_path = docs / "conf.py"
+    project_path = docs / "project.json"
+
+    errors: list[str] = []  # collected so that all problems are reported in one run
+
+    try:
+        payload = json.loads(versions_path.read_text(encoding="utf-8"))
+        rows = _validate_versions_payload(payload)
+    except (OSError, json.JSONDecodeError, ValueError) as e:
+        print(f"ERROR: {versions_path}: {e}", file=sys.stderr)
+        return 1  # an unreadable or malformed manifest ends the run immediately
+
+    preferred_count = 0
+    url_re = re.compile(r"^\.\./[0-9]+\.[0-9]+\.[0-9]+/$")  # relative sibling-folder URL, e.g. ../2.5.0/
+    for i, row in enumerate(rows):
+        ver = row.get("version")
+        url = row.get("url")
+        if not isinstance(ver, str) or not ver.strip():
+            errors.append(f"Entry {i}: missing or invalid 'version'")
+        if not isinstance(url, str) or not url_re.match(url):
+            errors.append(
+                f"Entry {i}: 'url' must look like '../M.m.p/' (got {url!r})"
+            )
+        if row.get("preferred") is True:
+            preferred_count += 1
+        elif "preferred" in row and row["preferred"] not in (False, None):  # false and null are tolerated as well
+            errors.append(f"Entry {i}: 'preferred' must be true or omitted")
+
+    if preferred_count != 1:
+        errors.append(
+            f"Expected exactly one entry with 'preferred': true, got {preferred_count}"
+        )
+
+    try:
+        release = _read_release_from_conf(conf_path)
+    except (OSError, ValueError) as e:
+        errors.append(f"conf.py: {e}")
+        release = None
+
+    proj_ver: str | None = None
+    try:
+        proj = json.loads(project_path.read_text(encoding="utf-8"))
+        if isinstance(proj, dict):
+            v = proj.get("version")
+            proj_ver = v if isinstance(v, str) else None
+        if proj_ver is None:
+            errors.append("project.json: missing top-level string 'version'")
+    except (OSError, json.JSONDecodeError) as e:
+        errors.append(f"project.json: {e}")
+
+    if not errors and release is not None and proj_ver is not None:  # compared only when nothing else has failed
+        if proj_ver != release:
+            errors.append(
+                f"docs/conf.py release ({release!r}) != docs/project.json "
+                f"version ({proj_ver!r}) — they should match for this branch"
+            )
+
+    if errors:
+        print(f"Validation failed for {versions_path}:", file=sys.stderr)
+        for msg in errors:
+            print(f"  - {msg}", file=sys.stderr)
+        return 1
+
+    print(f"OK: {versions_path} ({len(rows)} versions)")
+    if release is not None:
+        print(f"OK: conf.py release and project.json version both {release!r}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # the process exit status is main()'s return value
diff --git a/docs/service-port-gpu-reference.md b/docs/service-port-gpu-reference.md
index 648d1bd32..ed24b39f2 100644
--- a/docs/service-port-gpu-reference.md
+++ b/docs/service-port-gpu-reference.md
@@ -13,23 +13,23 @@ The following table provides a comprehensive reference of all services, their po
 | RAG Server | `rag-server` | 8081 | 8081 | N/A (CPU) | Main RAG API endpoint |
 | Ingestor Server | `ingestor-server` | 8082 | 8082 | N/A (CPU) | Document ingestion API |
 | RAG Frontend | `rag-frontend` | 8090 | 3000 | N/A (CPU) | Web UI |
-| NV-Ingest Runtime | `nv-ingest-ms-runtime` | 7670, 7671, 8265 | 7670, 7671, 8265 | N/A (CPU) | Main orchestrator (Ray dashboard: 8265) |
+| NeMo Retriever Library Runtime | `nv-ingest-ms-runtime` | 7670, 7671, 8265 | 7670, 7671, 8265 | N/A (CPU) | Main orchestrator (Ray dashboard: 8265) |
 
 ## NIM Microservices
 
 | Service | Container Name | Host Port(s) | Container Port(s) | Default GPU ID | Environment Variable | Notes |
 |---------|---------------|--------------|-------------------|----------------|---------------------|-------|
 | LLM | `nim-llm-ms` | 8999 | 8000 | 1 | `LLM_MS_GPU_ID` | Main language model |
-| Embedding | `nemoretriever-embedding-ms` | 9080 | 8000 | 0 | `EMBEDDING_MS_GPU_ID` | Text embeddings |
+| Embedding | `nemotron-embedding-ms` | 9080 | 8000 | 0 | `EMBEDDING_MS_GPU_ID` | Text embeddings |
 | VLM Embedding | `nemotron-vlm-embedding-ms` | 9081 | 8000 | 0 | `VLM_EMBEDDING_MS_GPU_ID` | Vision-language embeddings (opt-in, profile: vlm-embed) |
-| Ranking | `nemoretriever-ranking-ms` | 1976 | 8000 | 0 | `RANKING_MS_GPU_ID` | Reranking model |
+| Ranking | `nemotron-ranking-ms` | 1976 | 8000 | 0 | `RANKING_MS_GPU_ID` | Reranking model |
 | VLM | `nemo-vlm-microservice` | 1977 | 8000 | 5 | `VLM_MS_GPU_ID` | Vision-language model (opt-in, profile: vlm-only, vlm-generation) |
 | Nemotron Parse | `compose-nemotron-parse-1` | 8015, 8016, 8017 | 8000, 8001, 8002 | 1 | `NEMOTRON_PARSE_MS_GPU_ID` | PDF parsing (opt-in, profile: nemotron-parse) |
 | RIVA ASR | `compose-audio-1` | 8021, 8022 | 50051, 9000 | 0 | `AUDIO_MS_GPU_ID` | Audio speech recognition (opt-in, profile: audio) |
 | Page Elements | `compose-page-elements-1` | 8000, 8001, 8002 | 8000, 8001, 8002 | 0 | `YOLOX_MS_GPU_ID` | Object detection for pages |
 | Graphic Elements | `compose-graphic-elements-1` | 8003, 8004, 8005 | 8000, 8001, 8002 | 0 | `YOLOX_GRAPHICS_MS_GPU_ID` | Graphics detection |
 | Table Structure | `compose-table-structure-1` | 8006, 8007, 8008 | 8000, 8001, 8002 | 0 | `YOLOX_TABLE_MS_GPU_ID` | Table structure detection |
-| NeMo Retriever OCR | `compose-nemoretriever-ocr-1` | 8012, 8013, 8014 | 8000, 8001, 8002 | 0 | `OCR_MS_GPU_ID` | OCR service (default) |
+| NeMo Retriever Library OCR | `compose-nemoretriever-ocr-1` | 8012, 8013, 8014 | 8000, 8001, 8002 | 0 | `OCR_MS_GPU_ID` | OCR service (default) |
 
 ## Vector Database and Infrastructure
 
diff --git a/docs/support-matrix.md b/docs/support-matrix.md
index 6344822e1..e0e2f88ae 100644
--- a/docs/support-matrix.md
+++ b/docs/support-matrix.md
@@ -78,8 +78,8 @@ The following are requirements and recommendations for the individual components
 - **LLM NIM (llama-3.3-nemotron-super-49b-v1.5)** – Refer to the [Support Matrix](https://docs.nvidia.com/nim/large-language-models/latest/supported-models.html#llama-3-3-nemotron-super-49b-v1-5).
 - **Embedding NIM (Llama-3.2-NV-EmbedQA-1B-v2 )** – Refer to the [Support Matrix](https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html#llama-3-2-nv-embedqa-1b-v2).
 - **Reranking NIM (llama-3_2-nv-rerankqa-1b-v2 )**: Refer to the [Support Matrix](https://docs.nvidia.com/nim/nemo-retriever/text-reranking/latest/support-matrix.html#llama-3-2-nv-rerankqa-1b-v2).
-- **NeMo Retriever OCR (Default)**: Refer to the [Support Matrix](https://docs.nvidia.com/nim/ingestion/image-ocr/1.2.0/support-matrix.html).
-- **NVIDIA NIM for Image OCR (baidu/paddleocr - Legacy)**: Refer to the [Support Matrix](https://docs.nvidia.com/nim/ingestion/table-extraction/latest/support-matrix.html#supported-hardware).
+- **NVIDIA NIM for Image OCR (baidu/paddleocr)**: Refer to the [Support Matrix](https://docs.nvidia.com/nemo/retriever/latest/extraction/support-matrix/).
+- **NeMo Retriever OCR**: Refer to the [Support Matrix](https://docs.nvidia.com/nemo/retriever/latest/extraction/support-matrix/).
 - **NVIDIA NIMs for Object Detection**:
   - NeMo Retriever Page Elements v3 [Support Matrix](https://docs.nvidia.com/nim/ingestion/object-detection/latest/support-matrix.html#nemo-retriever-page-elements-v3)
   - NeMo Retriever Graphic Elements v1 [Support Matrix](https://docs.nvidia.com/nim/ingestion/object-detection/latest/support-matrix.html#nemo-retriever-graphic-elements-v1)
diff --git a/docs/text_only_ingest.md b/docs/text_only_ingest.md
index 784c08978..c2a22afd6 100644
--- a/docs/text_only_ingest.md
+++ b/docs/text_only_ingest.md
@@ -19,7 +19,7 @@ You can enable text-only ingestion for the [NVIDIA RAG Blueprint](readme.md). Fo
    ```
 
    :::{important}
-   When disabling nv-ingest dependent services, you must set `COMPONENTS_TO_READY_CHECK=""` to ensure the nv-ingest container reaches ready state. Without this setting, nv-ingest will wait indefinitely for the disabled components.
+   When disabling NeMo Retriever Library dependent services, you must set `COMPONENTS_TO_READY_CHECK=""` to ensure the NeMo Retriever Library container reaches ready state. Without this setting, the NeMo Retriever Library container will wait indefinitely for the disabled components.
    :::
 
    Then deploy the ingestor-server:
@@ -43,8 +43,8 @@ You can enable text-only ingestion for the [NVIDIA RAG Blueprint](readme.md). Fo
    ```output
       NAMES                                   STATUS
 
-      nemoretriever-ranking-ms                Up 14 minutes (healthy)
-      nemoretriever-embedding-ms              Up 14 minutes (healthy)
+      nemotron-ranking-ms                     Up 14 minutes (healthy)
+      nemotron-embedding-ms                   Up 14 minutes (healthy)
       nim-llm-ms                              Up 14 minutes (healthy)
    ```
 
@@ -70,7 +70,7 @@ In case you are [interacting with cloud hosted models](deploy-docker-nvidia-host
    export APP_EMBEDDINGS_SERVERURL=""
    export APP_LLM_SERVERURL=""
    export APP_RANKING_SERVERURL=""
-   export YOLOX_HTTP_ENDPOINT="https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
+   export YOLOX_HTTP_ENDPOINT="https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3"
    export YOLOX_INFER_PROTOCOL="http"
    ```
 :::
@@ -113,7 +113,7 @@ Additionally, ensure that **table extraction**, **chart extraction**, and **imag
 2. Then use the modified [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) file in your Helm upgrade command:
 
 ```bash
-helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
   --username '$oauthtoken' \
   --password "${NGC_API_KEY}" \
   --values deploy/helm/nvidia-blueprint-rag/values.yaml \
@@ -131,9 +131,9 @@ helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprin
 ```
 
 :::{important}
-**Disabling NV-Ingest Components for GPU Resource Management:**
+**Disabling NeMo Retriever Library Components for GPU Resource Management:**
 
-If you disable any nv-ingest dependent services (such as `table_structure`, `graphic_elements`, `nemoretriever_ocr_v1`, etc.) to free up GPU resources for customization, you must set the `COMPONENTS_TO_READY_CHECK` parameter to an empty string in the `nv-ingest.envVars` section of your [values.yaml](../deploy/helm/nvidia-blueprint-rag/values.yaml) file:
+If you disable any NeMo Retriever Library dependent services (such as `table_structure`, `graphic_elements`, `nemoretriever_ocr_v1`, etc.) to free up GPU resources for customization, you must set the `COMPONENTS_TO_READY_CHECK` parameter to an empty string in the `nv-ingest.envVars` section of your [values.yaml](../deploy/helm/nvidia-blueprint-rag/values.yaml) file:
 
 ```yaml
 nv-ingest:
@@ -141,6 +141,6 @@ nv-ingest:
     COMPONENTS_TO_READY_CHECK: ""
 ```
 
-This ensures the nv-ingest pod reaches ready state even when some dependent components are disabled. Without this setting, the nv-ingest pod will wait indefinitely for the disabled components to become ready.
+This ensures the NeMo Retriever Library pod reaches ready state even when some dependent components are disabled. Without this setting, the NeMo Retriever Library pod will wait indefinitely for the disabled components to become ready.
 
 :::
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 319056bf0..782176ed2 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -87,10 +87,10 @@ During first-time deployments, large models are downloaded without visible progr
 docker logs -f nim-llm-ms
 
 # Monitor embedding service
-docker logs -f nemoretriever-embedding-ms
+docker logs -f nemotron-embedding-ms
 
 # Monitor ranking service
-docker logs -f nemoretriever-ranking-ms
+docker logs -f nemotron-ranking-ms
 ```
 
 **Check disk usage to verify download progress:**
@@ -105,7 +105,7 @@ watch -n 10 'du -sh ~/.cache/model-cache/'
 **Check container stats:**
 ```bash
 # View resource usage and verify containers are active
-docker stats nim-llm-ms nemoretriever-embedding-ms nemoretriever-ranking-ms
+docker stats nim-llm-ms nemotron-embedding-ms nemotron-ranking-ms
 ```
 
 ### Kubernetes/Helm Deployments
@@ -340,7 +340,7 @@ If the above error related to dependency conflicts are seen while building conta
 We've integrated VDB and embedding creation directly into the pipeline with caching included for expediency.
 However, in a production environment, it's better to use a separately managed VDB service.
 
-NVIDIA offers optimized models and tools like NVIDIA NeMo Retriever ([build.nvidia.com/explore/retrieval](https://build.nvidia.com/explore/retrieval))
+NVIDIA offers optimized models and tools like NVIDIA NeMo Retriever Library ([build.nvidia.com/explore/retrieval](https://build.nvidia.com/explore/retrieval))
 and cuVS ([github.com/rapidsai/cuvs](https://github.com/rapidsai/cuvs)).
 
 
@@ -367,7 +367,7 @@ Adding this information may impact response accuracy, especially when partial in
 ## Helm Deployment Issues
 
 ### PVCs in Pending state (StorageClass issues)
-If NIM Cache PVCs (e.g., `nemoretriever-embedding-ms-cache-pvc`) remain in `Pending` state, check if they are requesting a `storageClassName: default` that does not exist.
+If NIM Cache PVCs (e.g., `nemotron-embedding-ms-cache-pvc`) remain in `Pending` state, check if they are requesting a `storageClassName: default` that does not exist.
 **Fix:** Ensure you have a default storage class. If using `local-path`, you can create an alias:
 ```yaml
 apiVersion: storage.k8s.io/v1
diff --git a/docs/versions1.json b/docs/versions1.json
index d0731c374..2a3a5ee92 100644
--- a/docs/versions1.json
+++ b/docs/versions1.json
@@ -1,6 +1,10 @@
 [
     {
         "preferred": true,
+        "version": "2.5.0",
+        "url": "../2.5.0/"
+    },
+    {
         "version": "2.4.0",
         "url": "../2.4.0/"
     },
diff --git a/docs/vlm-embed.md b/docs/vlm-embed.md
index 5b9913232..01ab062a8 100644
--- a/docs/vlm-embed.md
+++ b/docs/vlm-embed.md
@@ -2,7 +2,7 @@
   SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
   SPDX-License-Identifier: Apache-2.0
 -->
-# Use Multimodal (VLM) Embedding for Ingestion for NVIDIA RAG Blueprint (Early Access)
+# Use Multimodal (VLM) Embedding for Ingestion for NVIDIA RAG Blueprint
 
 This guide shows how to enable and use the multimodal embedding model `nvidia/llama-nemotron-embed-vl-1b-v2` with the [NVIDIA RAG Blueprint](readme.md) ingestion pipeline.
 
@@ -153,8 +153,8 @@ To deploy the VLM embedding service with Helm, update the image and model settin
 nvidia-nim-llama-nemotron-embed-vl-1b-v2:
   enabled: true
   image:
-    repository: nvcr.io/nvidia/nemo-microservices/llama-3.2-nemoretriever-1b-vlm-embed-v1
-    tag: "1.7.0"
+    repository: nvcr.io/nim/nvidia/llama-nemotron-embed-vl-1b-v2
+    tag: "1.12.0"
 
 # Optional: disable the default text embedding NIM
 nvidia-nim-llama-32-nv-embedqa-1b-v2:
@@ -182,8 +182,6 @@ After modifying [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml)
 
 For detailed HELM deployment instructions, see [Helm Deployment Guide](deploy-helm.md).
 
-
-
 ## Additional Configuration: Extraction and Embedding Modalities
 
 To configure how content is extracted and embedded (similar to the Docker configurations shown above), you can add extraction and modality settings to your [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml):
@@ -214,7 +212,7 @@ ingestor-server:
 
 nv-ingest:
   envVars:
-    # NV-Ingest runtime embedding target
+    # NeMo Retriever Library runtime embedding target
     EMBEDDING_NIM_ENDPOINT: "http://nemotron-vlm-embedding-ms:8000/v1"
     EMBEDDING_NIM_MODEL_NAME: "nvidia/llama-nemotron-embed-vl-1b-v2"
 ```
diff --git a/docs/vlm.md b/docs/vlm.md
index 4a61b2f52..64c6af176 100644
--- a/docs/vlm.md
+++ b/docs/vlm.md
@@ -124,7 +124,7 @@ Continue with [Deploy with Docker (NVIDIA-Hosted Models)](deploy-docker-nvidia-h
 ## Enable VLM with Helm
 
 :::{note}
-**GPU requirements for Helm**: VLM uses the same GPU normally assigned to LLM (GPU 1). With MIG slicing, assign a dedicated MIG slice to the VLM—see [mig-deployment.md](mig-deployment.md) and [values-mig.yaml](../deploy/helm/mig-slicing/values-mig.yaml). To run both VLM and LLM simultaneously, an additional GPU is required.
+**GPU requirements for Helm**: VLM uses the same GPU normally assigned to LLM (GPU 1). With MIG slicing, assign a dedicated MIG slice to the VLM—see [mig-deployment.md](mig-deployment.md) and [values-mig-h100.yaml](../deploy/helm/mig-slicing/values-mig-h100.yaml) or [values-mig-rtx6000.yaml](../deploy/helm/mig-slicing/values-mig-rtx6000.yaml). To run both VLM and LLM simultaneously, an additional GPU is required.
 :::
 
 1. In [values.yaml](../deploy/helm/nvidia-blueprint-rag/values.yaml), under the `rag-server` `envVars` section, set:
diff --git a/examples/README.md b/examples/README.md
index 0d56c4781..b2e991ca6 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -8,6 +8,8 @@ This directory contains example integrations and extensions for NVIDIA RAG.
 |---------|-------------|---------------|
 | [rag_react_agent](./rag_react_agent/) | Integration with [NeMo Agent Toolkit (NAT)](https://github.com/NVIDIA/NeMo-Agent-Toolkit) providing RAG query and search capabilities for agent workflows | [README](./rag_react_agent/README.md) |
 | [nvidia_rag_mcp](./nvidia_rag_mcp/) | MCP (Model Context Protocol) server and client for exposing NVIDIA RAG capabilities to MCP-compatible applications | [Documentation](../docs/mcp.md) |
+| [rag_event_ingest](./rag_event_ingest/) | Automated document ingestion from object storage (MinIO) via Kafka | [Notebook](../notebooks/rag_event_ingest.ipynb) |
+| [google-cloud-netapp-volumes-data-ingestor](./google-cloud-netapp-volumes-data-ingestor/) | Helm chart for deploying the GCNV data ingestor with PVC-backed storage and configurable runtime settings | [README](./google-cloud-netapp-volumes-data-ingestor/README.md) |
 
 ## rag_react_agent
 
@@ -27,3 +29,22 @@ This example provides an MCP server and client that exposes NVIDIA RAG and Inges
 - Manage collections and documents in the vector database
 
 See the [MCP documentation](../docs/mcp.md) for detailed setup and usage instructions.
+
+## rag_event_ingest
+
+This example deploys an event-driven ingestion pipeline that monitors MinIO object storage for new file uploads via Kafka events. Documents are automatically indexed through the RAG Ingestor and become queryable through the RAG Agent.
+
+Components:
+- **kafka_consumer/** - Event-driven consumer that routes files to RAG based on file type
+- **deploy/** - Docker Compose for Kafka, MinIO, and the consumer
+- **data/** - Sample documents for testing
+
+See the [notebook](../notebooks/rag_event_ingest.ipynb) for step-by-step deployment and testing.
+
+## google-cloud-netapp-volumes-data-ingestor
+
+This example packages a GCNV data ingestor deployment as a reusable Helm chart. It is intended for Kubernetes environments where application state and source data are mounted from PVCs, including storage backed by Google Cloud NetApp Volumes (GCNV).
+
+The chart supports configurable image settings, PVC creation or reuse, health probes, service exposure, and runtime environment overrides for connecting to an NVIDIA ingestor endpoint.
+
+See the [google-cloud-netapp-volumes-data-ingestor README](./google-cloud-netapp-volumes-data-ingestor/README.md) for prerequisites, installation, and configuration details.
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/Chart.yaml b/examples/google-cloud-netapp-volumes-data-ingestor/Chart.yaml
new file mode 100644
index 000000000..6fa7cd8b9
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: gcnv-data-ingestor
+description: Public Helm chart for the GCNV data ingestor deployment
+type: application
+version: 0.1.0  # chart version; combined with the chart name in the helm.sh/chart label
+appVersion: "0.1.0"  # emitted as the app.kubernetes.io/version label
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/README.md b/examples/google-cloud-netapp-volumes-data-ingestor/README.md
new file mode 100644
index 000000000..50c0fd09c
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/README.md
@@ -0,0 +1,178 @@
+# Google Cloud NetApp Volumes (GCNV) Data Ingestor Helm Chart
+
+This reusable Helm chart, located at `examples/google-cloud-netapp-volumes-data-ingestor`, packages the deployment of the GCNV Data Ingestor, which integrates with the NVIDIA Foundational RAG pipeline.
+
+Create or target the namespace externally with `--namespace ... --create-namespace`. Chart-managed namespace creation is intentionally not supported because Helm cannot reliably create the release namespace from within the same chart.
+
+## Prerequisites
+
+Before installing this chart, make sure the cluster can provision or expose the required PVCs from Google Cloud NetApp Volumes (GCNV).
+
+1. Install and configure NetApp Trident in the target cluster.
+2. Create or use a Trident `StorageClass` that maps to your GCNV backend.
+3. Decide how you want the chart to get storage:
+   - Let the chart create PVCs by setting `appData.storageClassName` and `sourceData.storageClassName` to Trident-backed classes.
+   - Or create the PVCs ahead of time with Trident and set `appData.create=false`, `appData.existingClaim=<claim>`, `sourceData.create=false`, and `sourceData.existingClaim=<claim>`.
+   - If you set `create=false`, the matching `existingClaim` is required.
+4. Make sure the Docker Hub image and tag you want to deploy are available.
+5. If the Docker Hub repository is private, create an image pull secret in the target namespace and set `image.pullSecrets`.
+
+## Chart Layout
+
+```text
+examples/google-cloud-netapp-volumes-data-ingestor/
+├── Chart.yaml
+├── values.yaml
+├── values.schema.json
+├── README.md
+└── templates/
+    ├── _helpers.tpl
+    ├── deployment.yaml
+    ├── pvc.yaml
+    ├── service.yaml
+    ├── validate.yaml
+    └── serviceaccount.yaml
+```
+
+## Important Values
+
+Update these values before install:
+
+- `image.repository`: set to your Docker Hub image path
+- `image.tag`: set to the image tag you want to deploy
+- `appData.storageClassName`: set to your Trident-backed app PVC class when the chart creates the PVC
+- `appData.size`: app PVC size request, defaults to `50Gi`
+- `appData.existingClaim`: use an already-created PVC instead of letting the chart create one; required when `appData.create=false`
+- `sourceData.storageClassName`: set to your Trident-backed GCNV source PVC class when the chart creates the PVC
+- `sourceData.size`: source PVC size request, defaults to `200Gi`
+- `sourceData.existingClaim`: use an already-created source PVC instead of letting the chart create one; required when `sourceData.create=false`
+- `env.nvIngestEndpoint`: set to the reachable NVIDIA ingestor-server `/v1` base URL
+
+The chart validates required values during `helm lint`, `helm template`, `helm install`, and `helm upgrade`.
+
+## Install
+
+You can either edit `values.yaml` directly or use an override file.
+
+Example override file:
+
+```yaml
+image:
+  repository: docker.io/acme/netapp_volumes_rag_ingestor
+  tag: "REPLACE_WITH_REAL_TAG"
+
+appData:
+  storageClassName: trident-app
+
+sourceData:
+  storageClassName: trident-gcnv
+
+env:
+  nvIngestEndpoint: http://YOUR_INGESTOR_SERVER:8082/v1
+```
+
+Install with:
+
+```bash
+helm install gcnv-data-ingestor ./examples/google-cloud-netapp-volumes-data-ingestor \
+  --namespace gcnv-data-ingestor \
+  --create-namespace \
+  -f my-values.yaml
+```
+
+If your Docker Hub repository is private, add an image pull secret. `image.pullSecrets` must be a YAML list of secret names:
+
+```yaml
+image:
+  pullSecrets:
+    - dockerhub-secret
+```
+
+## Common Overrides
+
+Resize the chart-managed PVCs:
+
+```yaml
+appData:
+  size: 100Gi
+  storageClassName: trident-app
+
+sourceData:
+  size: 500Gi
+  storageClassName: trident-gcnv
+```
+
+## Use Existing PVCs
+
+If Trident or another workflow already created the claims you want to mount, use overrides like this:
+
+```yaml
+appData:
+  create: false
+  existingClaim: gcnv-ingestor-config-data
+
+sourceData:
+  create: false
+  existingClaim: gcnv-data-for-rag
+```
+
+The chart will mount those existing claims into the Pod instead of creating new PVCs.
+
+If you set `create: false` and leave `existingClaim` empty, the chart now fails fast during Helm validation instead of creating a broken release.
+
+Expose the service differently:
+
+```yaml
+service:
+  type: ClusterIP
+  port: 8000
+```
+
+Tune runtime resources:
+
+```yaml
+resources:
+  requests:
+    cpu: 1
+    memory: 2Gi
+  limits:
+    cpu: 4
+    memory: 8Gi
+```
+
+Pass extra environment variables using normal Kubernetes `env` list syntax:
+
+```yaml
+env:
+  extra:
+    - name: EXTRA_FLAG
+      value: "1"
+```
+
+## Supported Values
+
+The chart supports overrides for the following areas in `values.yaml`:
+
+- Naming: `nameOverride`, `fullnameOverride`
+- Labels: `selectorLabels`, `podLabels`, `podAnnotations`
+- Deployment: `replicaCount`, `strategy`
+- Image: `image.repository`, `image.tag`, `image.pullPolicy`, `image.pullSecrets`
+- Service account: `serviceAccount.create`, `serviceAccount.name`, `serviceAccount.automount`, `serviceAccount.annotations`
+- Service: `service.type`, `service.port`, `service.annotations`
+- App PVC: `appData.create`, `appData.existingClaim`, `appData.name`, `appData.accessModes`, `appData.size`, `appData.storageClassName`, `appData.mountPath`
+- Source PVC: `sourceData.create`, `sourceData.existingClaim`, `sourceData.name`, `sourceData.accessModes`, `sourceData.size`, `sourceData.storageClassName`, `sourceData.mountPath`, `sourceData.readOnly`
+- Environment: `env.scanOutputRoot`, `env.appDbPath`, `env.defaultIncrementalSchedulerMins`, `env.nvIngestMode`, `env.nvIngestEndpoint`, `env.extra`
+- Health checks: `probes.liveness.*`, `probes.readiness.*`
+- Scheduling and placement: `nodeSelector`, `tolerations`, `affinity`
+- Resource limits: `resources`
+
+## Verify
+
+```bash
+helm template gcnv-data-ingestor ./examples/google-cloud-netapp-volumes-data-ingestor -n gcnv-data-ingestor
+kubectl get pods,svc,pvc -n gcnv-data-ingestor
+```
+
+The service defaults to `NodePort` with service port `8000`, matching the source manifest. Kubernetes assigns the external node port automatically unless you customize the Service separately.
+
+The default PVC access modes are `ReadWriteOnce`, so increasing `replicaCount` beyond `1` may require different storage semantics or pod placement constraints.
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/templates/_helpers.tpl b/examples/google-cloud-netapp-volumes-data-ingestor/templates/_helpers.tpl
new file mode 100644
index 000000000..60cb834ba
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/templates/_helpers.tpl
@@ -0,0 +1,80 @@
+{{/*
+Chart name: nameOverride when set, otherwise .Chart.Name; truncated to 63 characters with a trailing "-" trimmed.
+*/}}
+{{- define "gcnv-data-ingestor.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Fully qualified app name: fullnameOverride when set; otherwise the release name alone if it already contains the chart name (or nameOverride), else "<release>-<name>". Each form is truncated to 63 characters with a trailing "-" trimmed.
+*/}}
+{{- define "gcnv-data-ingestor.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Chart label value: "<chart name>-<chart version>" with "+" replaced by "_", truncated to 63 characters with a trailing "-" trimmed.
+*/}}
+{{- define "gcnv-data-ingestor.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Common labels: helm.sh/chart, the selector labels, app.kubernetes.io/managed-by (the release service), and app.kubernetes.io/version (the quoted chart appVersion).
+*/}}
+{{- define "gcnv-data-ingestor.labels" -}}
+helm.sh/chart: {{ include "gcnv-data-ingestor.chart" . }}
+{{ include "gcnv-data-ingestor.selectorLabels" . }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end -}}
+
+{{/*
+Selector labels copied from the source deployment. Values come from .Values.selectorLabels and are quoted so boolean- or number-like values still render as YAML strings.
+*/}}
+{{- define "gcnv-data-ingestor.selectorLabels" -}}
+app.kubernetes.io/name: {{ .Values.selectorLabels.name | quote }}
+app.kubernetes.io/instance: {{ .Values.selectorLabels.instance | quote }}
+{{- end -}}
+
+{{/*
+Service account name: when serviceAccount.create is true, serviceAccount.name or "<fullname>-sa" if unset; otherwise serviceAccount.name or "default" if unset.
+*/}}
+{{- define "gcnv-data-ingestor.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create -}}
+{{- default (printf "%s-sa" (include "gcnv-data-ingestor.fullname" .)) .Values.serviceAccount.name -}}
+{{- else -}}
+{{- default "default" .Values.serviceAccount.name -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+App PVC name: appData.existingClaim when set, otherwise appData.name.
+*/}}
+{{- define "gcnv-data-ingestor.appPvcName" -}}
+{{- if .Values.appData.existingClaim -}}
+{{- .Values.appData.existingClaim -}}
+{{- else -}}
+{{- .Values.appData.name -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Source PVC name: sourceData.existingClaim when set, otherwise sourceData.name.
+*/}}
+{{- define "gcnv-data-ingestor.sourcePvcName" -}}
+{{- if .Values.sourceData.existingClaim -}}
+{{- .Values.sourceData.existingClaim -}}
+{{- else -}}
+{{- .Values.sourceData.name -}}
+{{- end -}}
+{{- end -}}
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/templates/deployment.yaml b/examples/google-cloud-netapp-volumes-data-ingestor/templates/deployment.yaml
new file mode 100644
index 000000000..c1fbec3d9
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/templates/deployment.yaml
@@ -0,0 +1,99 @@
+apiVersion: apps/v1
+kind: Deployment  # runs the single gcnv-data-ingestor container with the app-data and source-data PVCs mounted
+metadata:
+  name: {{ include "gcnv-data-ingestor.fullname" . }}
+  labels:
+    {{- include "gcnv-data-ingestor.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.replicaCount }}
+  strategy:
+    type: {{ .Values.strategy.type }}
+  selector:
+    matchLabels:  # same helper as the pod template labels below, so selector and pods always match
+      {{- include "gcnv-data-ingestor.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "gcnv-data-ingestor.selectorLabels" . | nindent 8 }}
+        {{- with .Values.podLabels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- with .Values.podAnnotations }}
+      annotations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    spec:
+      serviceAccountName: {{ include "gcnv-data-ingestor.serviceAccountName" . }}
+      {{- with .Values.image.pullSecrets }}
+      imagePullSecrets:  # image.pullSecrets is a list of secret names; each one is rendered as a name entry
+        {{- range . }}
+        - name: {{ . }}
+        {{- end }}
+      {{- end }}
+      containers:
+        - name: gcnv-data-ingestor
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          ports:
+            - name: http
+              containerPort: {{ .Values.service.port }}  # container port reuses service.port; probes reference it by the name "http"
+              protocol: TCP
+          env:  # fixed variables first, then any entries from env.extra appended as-is
+            - name: MOUNT_PATH
+              value: {{ .Values.sourceData.mountPath | quote }}  # same path the source-data volume is mounted at
+            - name: SCAN_OUTPUT_ROOT
+              value: {{ .Values.env.scanOutputRoot | quote }}
+            - name: APP_DB_PATH
+              value: {{ .Values.env.appDbPath | quote }}
+            - name: DEFAULT_INCREMENTAL_SCHEDULER_MINS
+              value: {{ .Values.env.defaultIncrementalSchedulerMins | quote }}
+            - name: NV_INGEST_MODE
+              value: {{ .Values.env.nvIngestMode | quote }}
+            - name: NV_INGEST_ENDPOINT
+              value: {{ .Values.env.nvIngestEndpoint | quote }}
+            {{- with .Values.env.extra }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          livenessProbe:
+            httpGet:
+              path: {{ .Values.probes.liveness.path }}
+              port: http
+            initialDelaySeconds: {{ .Values.probes.liveness.initialDelaySeconds }}
+            periodSeconds: {{ .Values.probes.liveness.periodSeconds }}
+            timeoutSeconds: {{ .Values.probes.liveness.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.liveness.failureThreshold }}
+          readinessProbe:
+            httpGet:
+              path: {{ .Values.probes.readiness.path }}
+              port: http
+            initialDelaySeconds: {{ .Values.probes.readiness.initialDelaySeconds }}
+            periodSeconds: {{ .Values.probes.readiness.periodSeconds }}
+            timeoutSeconds: {{ .Values.probes.readiness.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.readiness.failureThreshold }}
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          volumeMounts:
+            - name: app-data
+              mountPath: {{ .Values.appData.mountPath | quote }}
+            - name: source-data
+              mountPath: {{ .Values.sourceData.mountPath | quote }}
+              readOnly: {{ .Values.sourceData.readOnly }}
+      volumes:  # claim names come from the PVC-name helpers: existingClaim when set, otherwise the configured PVC name
+        - name: app-data
+          persistentVolumeClaim:
+            claimName: {{ include "gcnv-data-ingestor.appPvcName" . }}
+        - name: source-data
+          persistentVolumeClaim:
+            claimName: {{ include "gcnv-data-ingestor.sourcePvcName" . }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/templates/pvc.yaml b/examples/google-cloud-netapp-volumes-data-ingestor/templates/pvc.yaml
new file mode 100644
index 000000000..590166056
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/templates/pvc.yaml
@@ -0,0 +1,35 @@
+{{- if and .Values.appData.create (not .Values.appData.existingClaim) }}
+apiVersion: v1
+kind: PersistentVolumeClaim  # app-data claim; rendered only when appData.create is true and no existingClaim is given
+metadata:
+  name: {{ include "gcnv-data-ingestor.appPvcName" . }}
+  labels:
+    {{- include "gcnv-data-ingestor.labels" . | nindent 4 }}
+spec:
+  accessModes:
+    {{- toYaml .Values.appData.accessModes | nindent 4 }}
+  resources:
+    requests:
+      storage: {{ .Values.appData.size }}
+  {{- with .Values.appData.storageClassName }}
+  storageClassName: {{ . | quote }}  # emitted only when appData.storageClassName is set
+  {{- end }}
+{{- end }}
+{{- if and .Values.sourceData.create (not .Values.sourceData.existingClaim) }}
+---
+apiVersion: v1
+kind: PersistentVolumeClaim  # source-data claim; rendered only when sourceData.create is true and no existingClaim is given
+metadata:
+  name: {{ include "gcnv-data-ingestor.sourcePvcName" . }}
+  labels:
+    {{- include "gcnv-data-ingestor.labels" . | nindent 4 }}
+spec:
+  accessModes:
+    {{- toYaml .Values.sourceData.accessModes | nindent 4 }}
+  resources:
+    requests:
+      storage: {{ .Values.sourceData.size }}
+  {{- with .Values.sourceData.storageClassName }}
+  storageClassName: {{ . | quote }}  # emitted only when sourceData.storageClassName is set
+  {{- end }}
+{{- end }}
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/templates/service.yaml b/examples/google-cloud-netapp-volumes-data-ingestor/templates/service.yaml
new file mode 100644
index 000000000..6d0ebbc36
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/templates/service.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: Service  # exposes the pods matched by the selectorLabels helper on service.port
+metadata:
+  name: {{ include "gcnv-data-ingestor.fullname" . }}
+  labels:
+    {{- include "gcnv-data-ingestor.labels" . | nindent 4 }}
+  {{- with .Values.service.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  type: {{ .Values.service.type }}
+  selector:
+    {{- include "gcnv-data-ingestor.selectorLabels" . | nindent 4 }}
+  ports:
+    - name: http
+      port: {{ .Values.service.port }}
+      targetPort: http  # targets the container port by name rather than by number
+      protocol: TCP
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/templates/serviceaccount.yaml b/examples/google-cloud-netapp-volumes-data-ingestor/templates/serviceaccount.yaml
new file mode 100644
index 000000000..67fe6bf5f
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/templates/serviceaccount.yaml
@@ -0,0 +1,13 @@
+{{- if .Values.serviceAccount.create }}
+apiVersion: v1
+kind: ServiceAccount  # rendered only when serviceAccount.create is true
+metadata:
+  name: {{ include "gcnv-data-ingestor.serviceAccountName" . }}
+  labels:
+    {{- include "gcnv-data-ingestor.labels" . | nindent 4 }}
+  {{- with .Values.serviceAccount.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+automountServiceAccountToken: {{ .Values.serviceAccount.automount }}  # taken from serviceAccount.automount
+{{- end }}
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/templates/validate.yaml b/examples/google-cloud-netapp-volumes-data-ingestor/templates/validate.yaml
new file mode 100644
index 000000000..60f03ba4f
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/templates/validate.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.namespace.create }}{{/* Guard: chart-managed namespace creation is rejected at render time. */}}
+{{- fail "namespace.create is not supported by this chart. Create the namespace outside the chart with --create-namespace or kubectl create namespace." }}
+{{- end }}
+
+{{- if and (not .Values.appData.create) (not .Values.appData.existingClaim) }}{{/* Guard: with appData.create=false there must be an existing claim to mount. */}}
+{{- fail "appData.existingClaim is required when appData.create=false." }}
+{{- end }}
+
+{{- if and (not .Values.sourceData.create) (not .Values.sourceData.existingClaim) }}{{/* Guard: with sourceData.create=false there must be an existing claim to mount. */}}
+{{- fail "sourceData.existingClaim is required when sourceData.create=false." }}
+{{- end }}
+
+{{- if and (not .Values.serviceAccount.create) (not .Values.serviceAccount.name) }}{{/* Guard: with serviceAccount.create=false an explicit account name is required. */}}
+{{- fail "serviceAccount.name is required when serviceAccount.create=false." }}
+{{- end }}
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/values.schema.json b/examples/google-cloud-netapp-volumes-data-ingestor/values.schema.json
new file mode 100644
index 000000000..d61f87d29
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/values.schema.json
@@ -0,0 +1,510 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "properties": {
+    "nameOverride": {
+      "type": "string"
+    },
+    "fullnameOverride": {
+      "type": "string"
+    },
+    "namespace": {
+      "type": "object",
+      "properties": {
+        "create": {
+          "type": "boolean"
+        }
+      }
+    },
+    "selectorLabels": {
+      "type": "object",
+      "properties": {
+        "name": {
+          "type": "string",
+          "minLength": 1
+        },
+        "instance": {
+          "type": "string",
+          "minLength": 1
+        }
+      },
+      "required": [
+        "name",
+        "instance"
+      ]
+    },
+    "replicaCount": {
+      "type": "integer",
+      "minimum": 1
+    },
+    "strategy": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": [
+            "Recreate",
+            "RollingUpdate"
+          ]
+        }
+      },
+      "required": [
+        "type"
+      ]
+    },
+    "image": {
+      "type": "object",
+      "properties": {
+        "repository": {
+          "type": "string",
+          "minLength": 1
+        },
+        "tag": {
+          "type": "string",
+          "minLength": 1
+        },
+        "pullPolicy": {
+          "type": "string",
+          "enum": [
+            "Always",
+            "IfNotPresent",
+            "Never"
+          ]
+        },
+        "pullSecrets": {
+          "type": "array",
+          "items": {
+            "type": "string",
+            "minLength": 1
+          }
+        }
+      },
+      "required": [
+        "repository",
+        "tag",
+        "pullPolicy",
+        "pullSecrets"
+      ]
+    },
+    "serviceAccount": {
+      "type": "object",
+      "properties": {
+        "create": {
+          "type": "boolean"
+        },
+        "name": {
+          "type": "string"
+        },
+        "automount": {
+          "type": "boolean"
+        },
+        "annotations": {
+          "type": "object",
+          "additionalProperties": {
+            "type": "string"
+          }
+        }
+      },
+      "required": [
+        "create",
+        "name",
+        "automount",
+        "annotations"
+      ],
+      "allOf": [
+        {
+          "if": {
+            "properties": {
+              "create": {
+                "const": false
+              }
+            },
+            "required": [
+              "create"
+            ]
+          },
+          "then": {
+            "properties": {
+              "name": {
+                "minLength": 1
+              }
+            }
+          }
+        }
+      ]
+    },
+    "service": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": [
+            "ClusterIP",
+            "NodePort",
+            "LoadBalancer"
+          ]
+        },
+        "port": {
+          "type": "integer",
+          "minimum": 1,
+          "maximum": 65535
+        },
+        "annotations": {
+          "type": "object",
+          "additionalProperties": {
+            "type": "string"
+          }
+        }
+      },
+      "required": [
+        "type",
+        "port",
+        "annotations"
+      ]
+    },
+    "appData": {
+      "type": "object",
+      "properties": {
+        "create": {
+          "type": "boolean"
+        },
+        "existingClaim": {
+          "type": "string"
+        },
+        "name": {
+          "type": "string",
+          "minLength": 1
+        },
+        "accessModes": {
+          "type": "array",
+          "minItems": 1,
+          "items": {
+            "type": "string",
+            "enum": [
+              "ReadWriteOnce",
+              "ReadOnlyMany",
+              "ReadWriteMany",
+              "ReadWriteOncePod"
+            ]
+          }
+        },
+        "size": {
+          "type": "string",
+          "minLength": 1
+        },
+        "storageClassName": {
+          "type": "string"
+        },
+        "mountPath": {
+          "type": "string",
+          "minLength": 1
+        }
+      },
+      "required": [
+        "create",
+        "existingClaim",
+        "name",
+        "accessModes",
+        "size",
+        "storageClassName",
+        "mountPath"
+      ],
+      "allOf": [
+        {
+          "if": {
+            "properties": {
+              "create": {
+                "const": false
+              }
+            },
+            "required": [
+              "create"
+            ]
+          },
+          "then": {
+            "properties": {
+              "existingClaim": {
+                "minLength": 1
+              }
+            }
+          }
+        },
+        {
+          "if": {
+            "properties": {
+              "create": {
+                "const": true
+              },
+              "existingClaim": {
+                "maxLength": 0
+              }
+            },
+            "required": [
+              "create",
+              "existingClaim"
+            ]
+          },
+          "then": {
+            "properties": {
+              "storageClassName": {
+                "type": "string",
+                "minLength": 1,
+                "pattern": "^[a-z0-9]([-a-z0-9.]*[a-z0-9])?$"
+              }
+            }
+          }
+        }
+      ]
+    },
+    "sourceData": {
+      "type": "object",
+      "properties": {
+        "create": {
+          "type": "boolean"
+        },
+        "existingClaim": {
+          "type": "string"
+        },
+        "name": {
+          "type": "string",
+          "minLength": 1
+        },
+        "accessModes": {
+          "type": "array",
+          "minItems": 1,
+          "items": {
+            "type": "string",
+            "enum": [
+              "ReadWriteOnce",
+              "ReadOnlyMany",
+              "ReadWriteMany",
+              "ReadWriteOncePod"
+            ]
+          }
+        },
+        "size": {
+          "type": "string",
+          "minLength": 1
+        },
+        "storageClassName": {
+          "type": "string"
+        },
+        "mountPath": {
+          "type": "string",
+          "minLength": 1
+        },
+        "readOnly": {
+          "type": "boolean"
+        }
+      },
+      "required": [
+        "create",
+        "existingClaim",
+        "name",
+        "accessModes",
+        "size",
+        "storageClassName",
+        "mountPath",
+        "readOnly"
+      ],
+      "allOf": [
+        {
+          "if": {
+            "properties": {
+              "create": {
+                "const": false
+              }
+            },
+            "required": [
+              "create"
+            ]
+          },
+          "then": {
+            "properties": {
+              "existingClaim": {
+                "minLength": 1
+              }
+            }
+          }
+        },
+        {
+          "if": {
+            "properties": {
+              "create": {
+                "const": true
+              },
+              "existingClaim": {
+                "maxLength": 0
+              }
+            },
+            "required": [
+              "create",
+              "existingClaim"
+            ]
+          },
+          "then": {
+            "properties": {
+              "storageClassName": {
+                "type": "string",
+                "minLength": 1,
+                "pattern": "^[a-z0-9]([-a-z0-9.]*[a-z0-9])?$"
+              }
+            }
+          }
+        }
+      ]
+    },
+    "env": {
+      "type": "object",
+      "properties": {
+        "scanOutputRoot": {
+          "type": "string",
+          "minLength": 1
+        },
+        "appDbPath": {
+          "type": "string",
+          "minLength": 1
+        },
+        "defaultIncrementalSchedulerMins": {
+          "type": "string",
+          "pattern": "^[0-9]+$"
+        },
+        "nvIngestMode": {
+          "type": "string",
+          "minLength": 1
+        },
+        "nvIngestEndpoint": {
+          "type": "string",
+          "minLength": 1,
+          "pattern": "^https?://.+"
+        },
+        "extra": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "name": {
+                "type": "string",
+                "minLength": 1
+              },
+              "value": {
+                "type": "string"
+              },
+              "valueFrom": {
+                "type": "object"
+              }
+            },
+            "required": [
+              "name"
+            ],
+            "anyOf": [
+              {
+                "required": [
+                  "value"
+                ]
+              },
+              {
+                "required": [
+                  "valueFrom"
+                ]
+              }
+            ]
+          }
+        }
+      },
+      "required": [
+        "scanOutputRoot",
+        "appDbPath",
+        "defaultIncrementalSchedulerMins",
+        "nvIngestMode",
+        "nvIngestEndpoint",
+        "extra"
+      ]
+    },
+    "probes": {
+      "type": "object",
+      "properties": {
+        "liveness": {
+          "$ref": "#/definitions/httpProbe"
+        },
+        "readiness": {
+          "$ref": "#/definitions/httpProbe"
+        }
+      },
+      "required": [
+        "liveness",
+        "readiness"
+      ]
+    },
+    "resources": {
+      "type": "object"
+    },
+    "podAnnotations": {
+      "type": "object",
+      "additionalProperties": {
+        "type": "string"
+      }
+    },
+    "podLabels": {
+      "type": "object",
+      "additionalProperties": {
+        "type": "string"
+      }
+    },
+    "nodeSelector": {
+      "type": "object",
+      "additionalProperties": {
+        "type": "string"
+      }
+    },
+    "tolerations": {
+      "type": "array"
+    },
+    "affinity": {
+      "type": "object"
+    }
+  },
+  "required": [
+    "image",
+    "serviceAccount",
+    "service",
+    "appData",
+    "sourceData",
+    "env",
+    "probes"
+  ],
+  "definitions": {
+    "httpProbe": {
+      "type": "object",
+      "properties": {
+        "path": {
+          "type": "string",
+          "minLength": 1
+        },
+        "initialDelaySeconds": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "periodSeconds": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "timeoutSeconds": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "failureThreshold": {
+          "type": "integer",
+          "minimum": 1
+        }
+      },
+      "required": [
+        "path",
+        "initialDelaySeconds",
+        "periodSeconds",
+        "timeoutSeconds",
+        "failureThreshold"
+      ]
+    }
+  }
+}
diff --git a/examples/google-cloud-netapp-volumes-data-ingestor/values.yaml b/examples/google-cloud-netapp-volumes-data-ingestor/values.yaml
new file mode 100644
index 000000000..483b51672
--- /dev/null
+++ b/examples/google-cloud-netapp-volumes-data-ingestor/values.yaml
@@ -0,0 +1,90 @@
+nameOverride: ""
+fullnameOverride: gcnv-data-ingestor
+
+namespace:
+  # Helm charts cannot reliably create their own target namespace during install.
+  # Use `helm install --create-namespace` instead.
+  create: false  # must stay false: templates/validate.yaml fails the render when it is true
+
+selectorLabels:
+  name: nvidia-gcnv-rag-manager  # schema: non-empty string
+  instance: netapp-volumes-rag-ingestor  # schema: non-empty string
+
+replicaCount: 1  # schema: integer >= 1
+
+strategy:
+  type: Recreate  # schema: Recreate or RollingUpdate
+
+image:
+  repository: ""  # must be set by the user: the schema rejects an empty string
+  tag: ""  # must be set by the user: the schema rejects an empty string
+  pullPolicy: IfNotPresent  # schema: Always, IfNotPresent or Never
+  pullSecrets: []  # schema: list of non-empty strings
+
+serviceAccount:
+  create: true
+  name: gcnv-data-ingestor-sa  # must be non-empty when create is false
+  automount: true  # rendered as automountServiceAccountToken
+  annotations: {}  # schema: string values only
+
+service:
+  type: NodePort  # schema: ClusterIP, NodePort or LoadBalancer
+  port: 8000  # schema: 1-65535
+  annotations: {}  # schema: string values only
+
+appData:
+  create: true
+  existingClaim: ""  # must be non-empty when create is false
+  name: gcnv-ingestor-config-data
+  accessModes:
+    - ReadWriteOnce
+  size: 50Gi
+  storageClassName: ""  # schema requires a non-empty name when create is true and existingClaim is empty
+  mountPath: /data
+
+sourceData:
+  create: true
+  existingClaim: ""  # must be non-empty when create is false
+  name: gcnv-data-for-rag
+  accessModes:
+    - ReadWriteOnce
+  size: 200Gi
+  storageClassName: ""  # schema requires a non-empty name when create is true and existingClaim is empty
+  mountPath: /source
+  readOnly: true
+
+env:
+  scanOutputRoot: /data/scans  # located under appData.mountPath
+  appDbPath: /data/state/app.db  # located under appData.mountPath
+  defaultIncrementalSchedulerMins: "0"  # schema: a string of digits, not an integer
+  nvIngestMode: ingestor
+  nvIngestEndpoint: ""  # must be set by the user: schema requires an http:// or https:// URL
+  extra: []  # schema: items need `name` plus either `value` or `valueFrom`
+
+probes:
+  liveness:  # schema requires all five fields below for each probe
+    path: /healthz
+    initialDelaySeconds: 20
+    periodSeconds: 15
+    timeoutSeconds: 5
+    failureThreshold: 3
+  readiness:
+    path: /healthz
+    initialDelaySeconds: 5
+    periodSeconds: 10
+    timeoutSeconds: 5
+    failureThreshold: 3
+
+resources:
+  requests:
+    cpu: 500m
+    memory: 1Gi
+  limits:
+    cpu: "2"
+    memory: 4Gi
+
+podAnnotations: {}  # schema: string values only
+podLabels: {}  # schema: string values only
+nodeSelector: {}  # schema: string values only
+tolerations: []
+affinity: {}
diff --git a/examples/nvidia_rag_mcp/mcp_server.py b/examples/nvidia_rag_mcp/mcp_server.py
index e93744933..c2ac626a4 100644
--- a/examples/nvidia_rag_mcp/mcp_server.py
+++ b/examples/nvidia_rag_mcp/mcp_server.py
@@ -33,6 +33,9 @@
 Environment variables:
   - VITE_API_CHAT_URL: Base URL for RAG HTTP API (default http://localhost:8081)
   - INGESTOR_URL: Base URL for Ingestor API (default http://127.0.0.1:8082)
+  - MCP_UPLOAD_DIR: Allowed base directory for file uploads (default: cwd).
+    File paths passed to upload/update tools are validated to be within this
+    directory, preventing path-traversal attacks.
 """
 
 import argparse
@@ -68,6 +71,36 @@ def _rag_base_url() -> str:
     return os.environ.get("VITE_API_CHAT_URL", "http://localhost:8081").rstrip("/")
 
 
+def _upload_base_dir() -> str:
+    """
+    Return the base directory that file upload paths must reside within.
+    Controlled by the ``MCP_UPLOAD_DIR`` environment variable; defaults to
+    the current working directory when unset.
+    """
+    return os.environ.get("MCP_UPLOAD_DIR", os.getcwd())
+
+
+def _validate_file_path(path: str) -> str:
+    """
+    Resolve *path* (following symlinks via ``os.path.realpath``) and verify
+    it lies within the allowed upload directory (``MCP_UPLOAD_DIR``), so
+    that ``../`` traversal and symlink escapes are rejected.
+
+    Returns the resolved path; raises ``ValueError`` when it is outside.
+    """
+    base = os.path.realpath(_upload_base_dir())
+    resolved = os.path.realpath(path)
+    # A root base ("/" or "C:\\") already ends with the separator; appending
+    # another would yield "//" and wrongly reject every path beneath it.
+    prefix = base if base.endswith(os.sep) else base + os.sep
+    if resolved != base and not resolved.startswith(prefix):
+        raise ValueError(
+            f"Path {path!r} (resolved to {resolved!r}) is not within the "
+            f"allowed upload directory {base!r}"
+        )
+    return resolved
+
+
 @server.tool(
     "generate",
     description="""Generate an answer using NVIDIA RAG (optionally with knowledge base).
@@ -503,6 +536,7 @@ async def tool_update_documents(
     form_data = aiohttp.FormData()
 
     for path in file_paths or []:
+        path = _validate_file_path(path)
         try:
             if os.path.exists(path):
                 with open(path, "rb") as f:
@@ -818,6 +852,7 @@ async def tool_upload_documents(
     form_data = aiohttp.FormData()
     # Add files
     for path in file_paths or []:
+        path = _validate_file_path(path)
         try:
             if os.path.exists(path):
                 with open(path, "rb") as f:
diff --git a/examples/rag_event_ingest/data/documents/Seahawks-Patriots in Super Bowl LX_ What We Learned from Seattle's 29-13 win.pdf b/examples/rag_event_ingest/data/documents/Seahawks-Patriots in Super Bowl LX_ What We Learned from Seattle's 29-13 win.pdf
new file mode 100644
index 000000000..3d750564d
Binary files /dev/null and b/examples/rag_event_ingest/data/documents/Seahawks-Patriots in Super Bowl LX_ What We Learned from Seattle's 29-13 win.pdf differ
diff --git a/examples/rag_event_ingest/data/videos/Seattle Seahawks vs New England Patriots - Super Bowl LX Game Highlights.mp4 b/examples/rag_event_ingest/data/videos/Seattle Seahawks vs New England Patriots - Super Bowl LX Game Highlights.mp4
new file mode 100644
index 000000000..164dc505a
--- /dev/null
+++ b/examples/rag_event_ingest/data/videos/Seattle Seahawks vs New England Patriots - Super Bowl LX Game Highlights.mp4	
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:618e8d92f34e1a7c3b5ea139b49bce1cf1d00eb4f15fd1963ee53ea8302f6c70
+size 83123435
diff --git a/examples/rag_event_ingest/deploy/docker-compose.yaml b/examples/rag_event_ingest/deploy/docker-compose.yaml
new file mode 100644
index 000000000..05e5bbc0f
--- /dev/null
+++ b/examples/rag_event_ingest/deploy/docker-compose.yaml
@@ -0,0 +1,164 @@
+# AIDP - AI Data Pipeline Docker Compose
+# Event-driven document ingestion with Kafka + MinIO sources
+#
+# Usage:
+#   docker compose -f docker-compose.yaml up -d
+#
+# Prerequisites:
+#   - RAG stack running (from launchable.ipynb)
+#   - nvidia-rag network exists
+
+services:
+  # =============================================================================
+  # KAFKA STACK (KRaft - no Zookeeper needed)
+  # =============================================================================
+  kafka:
+    image: apache/kafka:latest  # NOTE(review): floating tag; pin a release if reproducibility matters
+    container_name: kafka
+    restart: unless-stopped
+    ports:
+      - "9092:9092"  # PLAINTEXT listener, advertised as kafka:9092
+      - "9094:9094"  # EXTERNAL listener, advertised as HOST_IP (default localhost) on port 9094
+    environment:
+      - KAFKA_NODE_ID=1
+      - KAFKA_PROCESS_ROLES=broker,controller  # this single node is both broker and controller
+      - KAFKA_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093,EXTERNAL://:9094
+      - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092,EXTERNAL://${HOST_IP:-localhost}:9094
+      - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT  # every listener is unencrypted
+      - KAFKA_CONTROLLER_QUORUM_VOTERS=1@kafka:9093  # node 1 (this container) is the only voter
+      - KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER
+      - KAFKA_INTER_BROKER_LISTENER_NAME=PLAINTEXT
+      - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1  # factors of 1 here and below: only one broker is defined
+      - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1
+      - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1
+      - KAFKA_AUTO_CREATE_TOPICS_ENABLE=true
+      - KAFKA_LOG_RETENTION_HOURS=168  # 7 days
+      - CLUSTER_ID=MkU3OEVBNTcwNTJENDM2Qk
+    volumes:
+      - kafka-data:/var/lib/kafka/data  # NOTE(review): KAFKA_LOG_DIRS is not set; confirm the image keeps its logs under this path
+    networks:
+      - nvidia-rag
+    healthcheck:
+      test: ["CMD-SHELL", "/opt/kafka/bin/kafka-topics.sh --bootstrap-server localhost:9092 --list || exit 1"]  # healthy once a topic listing succeeds
+      interval: 30s
+      timeout: 10s
+      retries: 5
+
+  kafka-ui:
+    image: provectuslabs/kafka-ui:latest  # NOTE(review): floating tag; pin a release if reproducibility matters
+    container_name: aidp-kafka-ui
+    depends_on:
+      kafka:
+        condition: service_healthy  # start only after the kafka healthcheck passes
+    environment:
+      KAFKA_CLUSTERS_0_NAME: aidp-cluster
+      KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:9092  # the in-network PLAINTEXT listener
+    ports:
+      - "8080:8080"
+    networks:
+      - nvidia-rag
+
+  # =============================================================================
+  # MINIO (Data Source)
+  # =============================================================================
+  minio-source-1:
+    image: minio/minio:RELEASE.2024-01-18T22-51-28Z
+    container_name: aidp-minio
+    command: server /data --console-address ":9001"
+    environment:
+      MINIO_ROOT_USER: minioadmin  # demo credentials; the minio-mc entrypoint hard-codes the same pair
+      MINIO_ROOT_PASSWORD: minioadmin
+      # Kafka notification target with ID "AIDP" (minio-mc refers to it as arn:minio:sqs::AIDP:kafka)
+      MINIO_NOTIFY_KAFKA_ENABLE_AIDP: "on"
+      MINIO_NOTIFY_KAFKA_BROKERS_AIDP: "kafka:9092"
+      MINIO_NOTIFY_KAFKA_TOPIC_AIDP: "aidp-topic"  # same as the kafka-consumer default for KAFKA_TOPIC
+    volumes:
+      - minio-data:/data
+    ports:
+      - "9201:9000"  # host 9201 -> container 9000 (the port MINIO_ENDPOINT points at)
+      - "9211:9001"  # host 9211 -> container 9001 (the --console-address port)
+    networks:
+      - nvidia-rag
+    healthcheck:
+      test: ["CMD", "mc", "ready", "local"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # Bootstraps the bucket and its Kafka notification, then idles so further mc commands can be run in the container
+  minio-mc:
+    image: minio/mc:latest
+    container_name: aidp-minio-mc
+    depends_on:
+      minio-source-1:
+        condition: service_healthy
+      kafka:
+        condition: service_healthy
+    entrypoint: >
+      /bin/sh -c "
+      echo 'Waiting for MinIO...';
+      sleep 5;
+
+      echo 'Setting up MinIO...';
+      mc alias set minio http://minio-source-1:9000 minioadmin minioadmin;
+      mc mb --ignore-existing minio/aidp-bucket;
+      mc event add minio/aidp-bucket arn:minio:sqs::AIDP:kafka --event put,delete --ignore-existing || echo 'WARNING: could not register the Kafka notification on aidp-bucket';
+
+      echo 'MinIO setup complete!';
+      echo 'Bucket: aidp-bucket on minio-source-1';
+
+      echo 'Keeping container alive for mc commands...';
+      tail -f /dev/null
+      "
+    networks:
+      - nvidia-rag
+
+  # =============================================================================
+  # KAFKA CONSUMER (Event-driven Ingestion)
+  # =============================================================================
+  kafka-consumer:
+    build:
+      context: ../kafka_consumer  # relative to the directory containing this compose file
+      dockerfile: Dockerfile
+    image: kafka-consumer:local
+    container_name: kafka-consumer
+    depends_on:
+      kafka:
+        condition: service_healthy
+      minio-source-1:
+        condition: service_healthy
+    environment:
+      # Kafka (the topic default equals MINIO_NOTIFY_KAFKA_TOPIC_AIDP on minio-source-1)
+      - KAFKA_BOOTSTRAP_SERVERS=kafka:9092
+      - KAFKA_TOPIC=${KAFKA_TOPIC:-aidp-topic}
+      - CONSUMER_GROUP=${CONSUMER_GROUP:-nvingest-consumer-group}
+      # MinIO (credential defaults equal MINIO_ROOT_USER / MINIO_ROOT_PASSWORD on minio-source-1)
+      - MINIO_ENDPOINT=minio-source-1:9000
+      - MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-minioadmin}
+      - MINIO_SECRET_KEY=${MINIO_SECRET_KEY:-minioadmin}
+      - MINIO_SECURE=false
+      # RAG Ingestor (ingestor-server is not defined in this file; presumably it runs in the RAG stack on the nvidia-rag network - verify)
+      - INGESTOR_SERVER_URL=${INGESTOR_SERVER_URL:-http://ingestor-server:8082}
+      - COLLECTION_NAME=${COLLECTION_NAME:-aidp_bucket}
+      # Logging
+      - LOG_LEVEL=${LOG_LEVEL:-INFO}
+    restart: unless-stopped
+    networks:
+      - nvidia-rag
+
+# =============================================================================
+# VOLUMES
+# =============================================================================
+volumes:
+  kafka-data:  # mounted by the kafka service
+    driver: local
+  minio-data:  # mounted by minio-source-1 at /data
+    driver: local
+
+# =============================================================================
+# NETWORKS
+# =============================================================================
+networks:
+  nvidia-rag:
+    external: true  # not created by this file; it must already exist (see Prerequisites at the top)
+    name: nvidia-rag
diff --git a/examples/rag_event_ingest/kafka_consumer/Dockerfile b/examples/rag_event_ingest/kafka_consumer/Dockerfile
new file mode 100644
index 000000000..d1ff4fc62
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/Dockerfile
@@ -0,0 +1,12 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install dependencies before copying the sources so this layer is reused when only code changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application code (the whole build context) into /app
+COPY . /app/
+
+CMD ["python", "-u", "main.py"]
diff --git a/examples/rag_event_ingest/kafka_consumer/config/__init__.py b/examples/rag_event_ingest/kafka_consumer/config/__init__.py
new file mode 100644
index 000000000..6bd140441
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/config/__init__.py
@@ -0,0 +1,150 @@
+# config/__init__.py
+"""Configuration package for Kafka MinIO Consumer.
+
+Usage:
+    import config.settings as cfg
+    print(cfg.INGESTOR_SERVER_URL)
+    
+    from config.constants import DOCUMENT_EXTENSIONS, DEST_RAG
+"""
+
+# Settings (env vars)
+from .settings import (
+    # Kafka
+    KAFKA_BOOTSTRAP_SERVERS,
+    KAFKA_CONSUMER_GROUP,
+    KAFKA_TOPIC,
+    KAFKA_AUTO_OFFSET_RESET,
+    KAFKA_MAX_POLL_RECORDS,
+    KAFKA_MAX_POLL_INTERVAL_MS,
+    KAFKA_SESSION_TIMEOUT_MS,
+    KAFKA_HEARTBEAT_INTERVAL_MS,
+    # Services
+    INGESTOR_SERVER_URL,
+    INGESTOR_TIMEOUT,
+    # MinIO
+    MINIO_ENDPOINT,
+    MINIO_ACCESS_KEY,
+    MINIO_SECRET_KEY,
+    MINIO_SECURE,
+    MINIO_DEFAULT_COLLECTION,
+    MINIO_SOURCES,
+    # Features
+    ENABLE_IMAGE_PROCESSING,
+    ENABLE_AUDIO_PROCESSING,
+    # Collection
+    EMBEDDING_DIMENSION,
+    CHUNK_SIZE,
+    CHUNK_OVERLAP,
+    # Logging
+    LOG_LEVEL,
+    LOG_FORMAT,
+    # History
+    HISTORY_FILE,
+    # API Endpoints (configurable via env)
+    API_INGESTOR_DOCUMENTS,
+    API_INGESTOR_COLLECTIONS,
+    API_INGESTOR_COLLECTION,
+    API_INGESTOR_STATUS,
+)
+
+# Constants
+from .constants import (
+    # File extensions
+    DOCUMENT_EXTENSIONS,
+    IMAGE_EXTENSIONS,
+    AUDIO_EXTENSIONS,
+    SKIP_EXTENSIONS,
+    # Content types
+    CONTENT_TYPE_MAP,
+    DEFAULT_CONTENT_TYPE,
+    # Routing
+    DEST_RAG,
+    DEST_SKIP,
+    DEST_UNKNOWN,
+    # S3 Event fields
+    EVENT_NAME,
+    EVENT_RECORDS,
+    EVENT_S3,
+    EVENT_BUCKET,
+    EVENT_OBJECT,
+    EVENT_KEY,
+    EVENT_SIZE,
+    EVENT_ETAG,
+    EVENT_NAME_FIELD,
+    EVENT_FIRST_RECORD_INDEX,
+    EVENT_PREFIX_CREATED,
+    EVENT_PREFIX_REMOVED,
+    EVENT_TYPE_CREATE,
+    EVENT_TYPE_DELETE,
+    # Record field names (dataclass attributes)
+    FIELD_FILE_NAME,
+    FIELD_BUCKET,
+    FIELD_COLLECTION,
+    FIELD_STATUS,
+    FIELD_START_TIME,
+    FIELD_END_TIME,
+    FIELD_DURATION_SECONDS,
+    FIELD_ERROR_MESSAGE,
+    FIELD_TASK_ID,
+    # Record serialization output keys
+    RECORD_FILE_NAME,
+    RECORD_BUCKET,
+    RECORD_COLLECTION,
+    RECORD_START_TIME,
+    RECORD_END_TIME,
+    RECORD_DURATION,
+    RECORD_STATUS,
+    RECORD_ERROR,
+    RECORD_TASK_ID,
+    # Status
+    STATUS_PENDING,
+    STATUS_PROCESSING,
+    STATUS_FINISHED,
+    STATUS_FAILED,
+    STATUS_SKIPPED,
+    STATUS_DELETED,
+    STATUS_SUCCESS,
+    # Config keys (MinIO sources)
+    CFG_ENDPOINT,
+    CFG_ACCESS,
+    CFG_SECRET,
+    CFG_SECURE,
+    CFG_COLLECTION,
+    CFG_BUCKETS,
+    # API request fields (Ingestor)
+    FIELD_COLLECTION_NAME,
+    FIELD_BLOCKING,
+    FIELD_SPLIT_OPTIONS,
+    FIELD_CHUNK_SIZE,
+    FIELD_CHUNK_OVERLAP,
+    FIELD_GENERATE_SUMMARY,
+    FIELD_EMBEDDING_DIMENSION,
+    FIELD_TASK_ID,
+    # API response fields
+    RESP_MESSAGE,
+    RESP_ERROR,
+    RESP_COLLECTIONS,
+    RESP_TASK_ID,
+    RESP_STATE,
+    RESP_RESULT,
+    RESP_FAILED_DOCUMENTS,
+    RESP_VALIDATION_ERRORS,
+    # Timeouts
+    TIMEOUT_DEFAULT,
+    TIMEOUT_UPLOAD,
+    TIMEOUT_TASK_CHECK,
+    TIMEOUT_MAX_TASK_WAIT,
+    # Kafka defaults
+    KAFKA_DEFAULT_TOPIC,
+    KAFKA_DEFAULT_CONSUMER_GROUP,
+    KAFKA_DEFAULT_AUTO_OFFSET_RESET,
+    KAFKA_DEFAULT_MAX_POLL_RECORDS,
+    KAFKA_DEFAULT_MAX_POLL_INTERVAL_MS,
+    KAFKA_DEFAULT_SESSION_TIMEOUT_MS,
+    KAFKA_DEFAULT_HEARTBEAT_INTERVAL_MS,
+    # Collection defaults
+    COLLECTION_EMBEDDING_DIMENSION,
+    COLLECTION_CHUNK_SIZE,
+    COLLECTION_CHUNK_OVERLAP,
+)
diff --git a/examples/rag_event_ingest/kafka_consumer/config/constants.py b/examples/rag_event_ingest/kafka_consumer/config/constants.py
new file mode 100644
index 000000000..050cf70c4
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/config/constants.py
@@ -0,0 +1,296 @@
+# config/constants.py
+"""Static constants that don't change at runtime.
+
+For configurable values from environment, see settings.py
+"""
+
+# ==================== File Extensions ====================
+
+DOCUMENT_EXTENSIONS = frozenset({  # lowercase, with leading dot; every entry has a CONTENT_TYPE_MAP key
+    '.pdf', '.docx', '.doc', '.txt', '.md', '.rst',
+    '.html', '.htm', '.pptx', '.ppt', '.xlsx', '.xls',
+    '.csv', '.json', '.xml'
+})
+
+IMAGE_EXTENSIONS = frozenset({  # every entry has a CONTENT_TYPE_MAP key
+    '.jpg', '.jpeg', '.png', '.gif', 
+    '.webp', '.bmp', '.tiff', '.svg'
+})
+
+AUDIO_EXTENSIONS = frozenset({  # every entry has a CONTENT_TYPE_MAP key
+    '.mp3', '.wav', '.flac', '.aac', 
+    '.ogg', '.m4a', '.wma'
+})
+
+SKIP_EXTENSIONS = frozenset({  # none of these has a CONTENT_TYPE_MAP key
+    '.tmp', '.log', '.bak', '.swp', '.DS_Store',
+    '.gitkeep', '.gitignore'  # NOTE(review): '.DS_Store', '.gitkeep', '.gitignore' are whole file names; os.path.splitext() returns '' as their extension - confirm how the router matches them
+})
+
+
+# ==================== Content Types ====================
+
+CONTENT_TYPE_MAP = {  # file extension (lowercase, with leading dot) -> MIME type
+    # Documents (same members as DOCUMENT_EXTENSIONS)
+    '.pdf': 'application/pdf',
+    '.txt': 'text/plain',
+    '.doc': 'application/msword',
+    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    '.html': 'text/html',
+    '.htm': 'text/html',
+    '.xml': 'application/xml',
+    '.json': 'application/json',
+    '.csv': 'text/csv',
+    '.md': 'text/markdown',
+    '.rst': 'text/x-rst',
+    '.ppt': 'application/vnd.ms-powerpoint',
+    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+    '.xls': 'application/vnd.ms-excel',
+    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    # Images (same members as IMAGE_EXTENSIONS)
+    '.jpg': 'image/jpeg',
+    '.jpeg': 'image/jpeg',
+    '.png': 'image/png',
+    '.gif': 'image/gif',
+    '.webp': 'image/webp',
+    '.bmp': 'image/bmp',
+    '.tiff': 'image/tiff',
+    '.svg': 'image/svg+xml',
+    # Audio (same members as AUDIO_EXTENSIONS)
+    '.mp3': 'audio/mpeg',
+    '.wav': 'audio/wav',
+    '.flac': 'audio/flac',
+    '.aac': 'audio/aac',
+    '.ogg': 'audio/ogg',
+    '.m4a': 'audio/mp4',
+    '.wma': 'audio/x-ms-wma',
+}
+
+DEFAULT_CONTENT_TYPE = 'application/octet-stream'  # presumably the fallback for extensions missing from CONTENT_TYPE_MAP - confirm at the call site
+
+
+# ==================== Routing ====================
+
+# Destinations
+DEST_RAG = 'rag'
+DEST_SKIP = 'skip'
+DEST_UNKNOWN = 'unknown'
+
+# Route result keys (dictionary keys; presumably of the router's result - verify against the router)
+KEY_DESTINATION = 'destination'
+KEY_FILE_TYPE = 'file_type'
+KEY_EXTENSION = 'extension'
+KEY_REASON = 'reason'
+
+# File types (one per extension set above, plus 'skip' and 'unknown')
+FILE_TYPE_DOCUMENT = 'document'
+FILE_TYPE_IMAGE = 'image'
+FILE_TYPE_AUDIO = 'audio'
+FILE_TYPE_SKIP = 'skip'  # same string as DEST_SKIP
+FILE_TYPE_UNKNOWN = 'unknown'  # same string as DEST_UNKNOWN
+
+# Config keys (names for the four extension sets and the two feature flags)
+CFG_DOCUMENT_EXTENSIONS = 'document_extensions'
+CFG_IMAGE_EXTENSIONS = 'image_extensions'
+CFG_AUDIO_EXTENSIONS = 'audio_extensions'
+CFG_SKIP_EXTENSIONS = 'skip_extensions'
+CFG_ENABLE_IMAGE_PROCESSING = 'enable_image_processing'
+CFG_ENABLE_AUDIO_PROCESSING = 'enable_audio_processing'
+
+
+# ==================== S3 Event Fields ====================
+
+# Kafka S3 event structure
+EVENT_NAME = 'EventName'  # not to be confused with EVENT_NAME_FIELD ('name') below
+EVENT_RECORDS = 'Records'
+EVENT_FIRST_RECORD_INDEX = 0  # S3 events typically contain single record
+EVENT_S3 = 's3'
+EVENT_BUCKET = 'bucket'
+EVENT_OBJECT = 'object'
+EVENT_KEY = 'key'
+EVENT_SIZE = 'size'
+EVENT_ETAG = 'eTag'
+EVENT_NAME_FIELD = 'name'  # presumably the bucket's name key inside the 'bucket' object - verify against the event parser
+
+# Event type prefixes (event names start with one of these, e.g. 's3:ObjectCreated:Put')
+EVENT_PREFIX_CREATED = 's3:ObjectCreated:'
+EVENT_PREFIX_REMOVED = 's3:ObjectRemoved:'
+
+# Event type values (internal labels, not S3 event names)
+EVENT_TYPE_CREATE = 'create'
+EVENT_TYPE_DELETE = 'delete'
+
+
+# ==================== Record Fields ====================
+
+# IngestionRecord field names (dataclass attributes)
+FIELD_FILE_NAME = 'file_name'
+FIELD_BUCKET = 'bucket'
+FIELD_COLLECTION = 'collection'
+FIELD_STATUS = 'status'
+FIELD_START_TIME = 'start_time'
+FIELD_END_TIME = 'end_time'
+FIELD_DURATION_SECONDS = 'duration_seconds'
+FIELD_ERROR_MESSAGE = 'error_message'
+FIELD_TASK_ID = 'task_id'  # assigned again, with the same value, under "API Request Fields"
+
+# IngestionRecord serialization output keys (aliases: each equals the matching FIELD_* name)
+RECORD_FILE_NAME = FIELD_FILE_NAME
+RECORD_BUCKET = FIELD_BUCKET
+RECORD_COLLECTION = FIELD_COLLECTION
+RECORD_START_TIME = FIELD_START_TIME
+RECORD_END_TIME = FIELD_END_TIME
+RECORD_DURATION = FIELD_DURATION_SECONDS
+RECORD_STATUS = FIELD_STATUS
+RECORD_ERROR = FIELD_ERROR_MESSAGE
+RECORD_TASK_ID = FIELD_TASK_ID
+
+
+# ==================== Task Status ====================
+
+STATUS_PENDING = 'PENDING'
+STATUS_PROCESSING = 'PROCESSING'
+STATUS_FINISHED = 'FINISHED'
+STATUS_FAILED = 'FAILED'
+STATUS_SKIPPED = 'SKIPPED'
+STATUS_DELETED = 'DELETED'
+STATUS_SUCCESS = 'SUCCESS'
+
+
+# ==================== Config Keys ====================
+
+# MinIO/S3 source config keys (presumably the keys of each MINIO_SOURCES entry - verify in settings.py)
+CFG_ENDPOINT = 'endpoint'
+CFG_ACCESS = 'access'
+CFG_SECRET = 'secret'
+CFG_SECURE = 'secure'
+CFG_COLLECTION = 'collection'
+CFG_BUCKETS = 'buckets'
+
+
+# ==================== API Request Fields ====================
+
+# Ingestor request fields
+FIELD_COLLECTION_NAME = 'collection_name'
+FIELD_BLOCKING = 'blocking'
+FIELD_SPLIT_OPTIONS = 'split_options'
+FIELD_CHUNK_SIZE = 'chunk_size'  # key name; the default value is COLLECTION_CHUNK_SIZE
+FIELD_CHUNK_OVERLAP = 'chunk_overlap'  # key name; the default value is COLLECTION_CHUNK_OVERLAP
+FIELD_GENERATE_SUMMARY = 'generate_summary'
+FIELD_EMBEDDING_DIMENSION = 'embedding_dimension'  # key name; the default value is COLLECTION_EMBEDDING_DIMENSION
+# The task-id request field reuses FIELD_TASK_ID ('task_id') from "Record Fields" above; it is not assigned a second time.
+
+
+# ==================== API Response Fields ====================
+
+# Common response fields
+RESP_CONTENT = 'content'
+RESP_RESPONSE = 'response'
+RESP_TEXT = 'text'
+RESP_CHOICES = 'choices'
+RESP_MESSAGE = 'message'
+RESP_ERROR = 'error'
+
+# Ingestor response fields
+RESP_COLLECTIONS = 'collections'
+RESP_TASK_ID = 'task_id'  # same string as FIELD_TASK_ID
+RESP_STATE = 'state'
+RESP_RESULT = 'result'
+RESP_FAILED_DOCUMENTS = 'failed_documents'
+RESP_VALIDATION_ERRORS = 'validation_errors'
+
+
+# ==================== Timeouts (seconds) ====================
+
+TIMEOUT_DEFAULT = 30
+TIMEOUT_UPLOAD = 600  # 10 min; settings.py imports this one (presumably as the INGESTOR_TIMEOUT default - verify)
+TIMEOUT_TASK_CHECK = 30
+TIMEOUT_MAX_TASK_WAIT = 300  # 5 min
+
+
+# ==================== Kafka Defaults ====================
+
+KAFKA_DEFAULT_TOPIC = 'aidp-topic'  # same topic MinIO publishes to in deploy/docker-compose.yaml
+KAFKA_DEFAULT_CONSUMER_GROUP = 'nvingest-consumer-group'
+KAFKA_DEFAULT_AUTO_OFFSET_RESET = 'earliest'
+KAFKA_DEFAULT_MAX_POLL_RECORDS = 1
+KAFKA_DEFAULT_MAX_POLL_INTERVAL_MS = 600000   # 10 min (same length as TIMEOUT_UPLOAD)
+KAFKA_DEFAULT_SESSION_TIMEOUT_MS = 60000      # 60s
+KAFKA_DEFAULT_HEARTBEAT_INTERVAL_MS = 20000   # 20s (one third of the session timeout)
+
+
+# ==================== Collection Defaults ====================
+
+COLLECTION_EMBEDDING_DIMENSION = 2048
+COLLECTION_CHUNK_SIZE = 512
+COLLECTION_CHUNK_OVERLAP = 150
+
+
+# ==================== Environment Variable Keys ====================
+
+# Kafka
+ENV_KAFKA_BOOTSTRAP_SERVERS = 'KAFKA_BOOTSTRAP_SERVERS'
+ENV_KAFKA_TOPIC = 'KAFKA_TOPIC'
+ENV_CONSUMER_GROUP = 'CONSUMER_GROUP'  # note: the variable has no KAFKA_ prefix, unlike its neighbours
+ENV_KAFKA_AUTO_OFFSET_RESET = 'KAFKA_AUTO_OFFSET_RESET'
+ENV_KAFKA_MAX_POLL_RECORDS = 'KAFKA_MAX_POLL_RECORDS'
+ENV_KAFKA_MAX_POLL_INTERVAL_MS = 'KAFKA_MAX_POLL_INTERVAL_MS'
+ENV_KAFKA_SESSION_TIMEOUT_MS = 'KAFKA_SESSION_TIMEOUT_MS'
+ENV_KAFKA_HEARTBEAT_INTERVAL_MS = 'KAFKA_HEARTBEAT_INTERVAL_MS'
+
+# Service URLs
+ENV_INGESTOR_SERVER_URL = 'INGESTOR_SERVER_URL'
+ENV_INGESTOR_TIMEOUT = 'INGESTOR_TIMEOUT'
+
+# API Endpoints (defaults are the DEFAULT_API_INGESTOR_* values below)
+ENV_API_INGESTOR_DOCUMENTS = 'API_INGESTOR_DOCUMENTS'
+ENV_API_INGESTOR_COLLECTIONS = 'API_INGESTOR_COLLECTIONS'
+ENV_API_INGESTOR_COLLECTION = 'API_INGESTOR_COLLECTION'
+ENV_API_INGESTOR_STATUS = 'API_INGESTOR_STATUS'
+
+# MinIO
+ENV_MINIO_ENDPOINT = 'MINIO_ENDPOINT'
+ENV_MINIO_ACCESS_KEY = 'MINIO_ACCESS_KEY'
+ENV_MINIO_SECRET_KEY = 'MINIO_SECRET_KEY'
+ENV_MINIO_SECURE = 'MINIO_SECURE'
+ENV_COLLECTION_NAME = 'COLLECTION_NAME'  # note: the variable has no MINIO_ prefix
+ENV_MINIO_SOURCES = 'MINIO_SOURCES'
+
+# Feature Flags
+ENV_ENABLE_IMAGE_PROCESSING = 'ENABLE_IMAGE_PROCESSING'
+ENV_ENABLE_AUDIO_PROCESSING = 'ENABLE_AUDIO_PROCESSING'
+
+# Collection Settings (defaults are the COLLECTION_* values above)
+ENV_EMBEDDING_DIMENSION = 'EMBEDDING_DIMENSION'
+ENV_CHUNK_SIZE = 'CHUNK_SIZE'
+ENV_CHUNK_OVERLAP = 'CHUNK_OVERLAP'
+
+# Logging
+ENV_LOG_LEVEL = 'LOG_LEVEL'
+ENV_LOG_FORMAT = 'LOG_FORMAT'
+
+# History
+ENV_HISTORY_FILE = 'HISTORY_FILE'
+
+# ==================== API Endpoint Defaults ====================
+
+DEFAULT_API_INGESTOR_DOCUMENTS = '/v1/documents'
+DEFAULT_API_INGESTOR_COLLECTIONS = '/v1/collections'  # plural
+DEFAULT_API_INGESTOR_COLLECTION = '/v1/collection'  # singular; a distinct path from the one above
+DEFAULT_API_INGESTOR_STATUS = '/v1/status'
+
+
+# ==================== Logging Defaults ====================
+
+DEFAULT_LOG_LEVEL = 'INFO'
+DEFAULT_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+
+# ==================== History Defaults ====================
+
+DEFAULT_HISTORY_FILE = '/tmp/ingestion_history.jsonl'  # NOTE(review): deploy/docker-compose.yaml mounts no volume for the consumer, so this file presumably does not survive container re-creation
+
+
+# ==================== MinIO Defaults ====================
+
+DEFAULT_COLLECTION_NAME = 'multimodal_data'  # deploy/docker-compose.yaml sets COLLECTION_NAME to aidp_bucket by default, so this presumably applies only outside that setup
diff --git a/examples/rag_event_ingest/kafka_consumer/config/settings.py b/examples/rag_event_ingest/kafka_consumer/config/settings.py
new file mode 100644
index 000000000..bbfe4824c
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/config/settings.py
@@ -0,0 +1,135 @@
+# config/settings.py
+"""Runtime settings loaded from environment variables."""
+
+import os
+
+from .constants import (
+    # Default values
+    KAFKA_DEFAULT_TOPIC,
+    KAFKA_DEFAULT_CONSUMER_GROUP,
+    KAFKA_DEFAULT_AUTO_OFFSET_RESET,
+    KAFKA_DEFAULT_MAX_POLL_RECORDS,
+    KAFKA_DEFAULT_MAX_POLL_INTERVAL_MS,
+    KAFKA_DEFAULT_SESSION_TIMEOUT_MS,
+    KAFKA_DEFAULT_HEARTBEAT_INTERVAL_MS,
+    TIMEOUT_UPLOAD,
+    COLLECTION_EMBEDDING_DIMENSION,
+    COLLECTION_CHUNK_SIZE,
+    COLLECTION_CHUNK_OVERLAP,
+    # API Endpoint defaults
+    DEFAULT_API_INGESTOR_DOCUMENTS,
+    DEFAULT_API_INGESTOR_COLLECTIONS,
+    DEFAULT_API_INGESTOR_COLLECTION,
+    DEFAULT_API_INGESTOR_STATUS,
+    # Logging defaults
+    DEFAULT_LOG_LEVEL,
+    DEFAULT_LOG_FORMAT,
+    # History defaults
+    DEFAULT_HISTORY_FILE,
+    # MinIO defaults
+    DEFAULT_COLLECTION_NAME,
+    # Environment variable keys
+    ENV_KAFKA_BOOTSTRAP_SERVERS,
+    ENV_KAFKA_TOPIC,
+    ENV_CONSUMER_GROUP,
+    ENV_KAFKA_AUTO_OFFSET_RESET,
+    ENV_KAFKA_MAX_POLL_RECORDS,
+    ENV_KAFKA_MAX_POLL_INTERVAL_MS,
+    ENV_KAFKA_SESSION_TIMEOUT_MS,
+    ENV_KAFKA_HEARTBEAT_INTERVAL_MS,
+    ENV_INGESTOR_SERVER_URL,
+    ENV_INGESTOR_TIMEOUT,
+    ENV_API_INGESTOR_DOCUMENTS,
+    ENV_API_INGESTOR_COLLECTIONS,
+    ENV_API_INGESTOR_COLLECTION,
+    ENV_API_INGESTOR_STATUS,
+    ENV_MINIO_ENDPOINT,
+    ENV_MINIO_ACCESS_KEY,
+    ENV_MINIO_SECRET_KEY,
+    ENV_MINIO_SECURE,
+    ENV_COLLECTION_NAME,
+    ENV_MINIO_SOURCES,
+    ENV_ENABLE_IMAGE_PROCESSING,
+    ENV_ENABLE_AUDIO_PROCESSING,
+    ENV_EMBEDDING_DIMENSION,
+    ENV_CHUNK_SIZE,
+    ENV_CHUNK_OVERLAP,
+    ENV_LOG_LEVEL,
+    ENV_LOG_FORMAT,
+    ENV_HISTORY_FILE,
+)
+
+
+# ==================== Helper Functions ====================
+
+def _get_bool(key: str, default: bool = False) -> bool:
+    """Get boolean from environment variable.
+
+    Args:
+        key: Name of the environment variable to read.
+        default: Result when the variable is unset or blank.
+
+    Returns:
+        True when the value, ignoring case and surrounding whitespace, is
+        one of 'true', '1', 'yes' or 'on'; False for any other value.
+    """
+    # A blank value falls back to the default, the same way _get_int ends up
+    # at its default because int('') raises ValueError.
+    raw = os.getenv(key, '').strip() or str(default)
+    return raw.lower() in ('true', '1', 'yes', 'on')
+
+
+def _get_int(key: str, default: int) -> int:
+    """Get integer from environment variable.
+
+    Args:
+        key: Name of the environment variable to read.
+        default: Result when the variable is unset, blank, or not an integer.
+
+    Returns:
+        The parsed integer, or `default` when parsing is not possible.
+    """
+    raw = os.getenv(key, '').strip()
+    if not raw:
+        return default
+    try:
+        return int(raw)
+    except ValueError:
+        # Keep the lenient fallback, but make the misconfiguration visible
+        # instead of silently running with a different value.
+        import logging
+        logging.getLogger(__name__).warning(
+            "Invalid integer %r for %s; using default %s", raw, key, default
+        )
+        return default
+
+
+# ==================== Kafka Settings ====================
+
+# Read without a default: the value is None when the variable is unset, and
+# this module does not validate it.
+KAFKA_BOOTSTRAP_SERVERS = os.getenv(ENV_KAFKA_BOOTSTRAP_SERVERS)  # Required
+KAFKA_CONSUMER_GROUP = os.getenv(ENV_CONSUMER_GROUP, KAFKA_DEFAULT_CONSUMER_GROUP)
+KAFKA_TOPIC = os.getenv(ENV_KAFKA_TOPIC, KAFKA_DEFAULT_TOPIC)
+KAFKA_AUTO_OFFSET_RESET = os.getenv(ENV_KAFKA_AUTO_OFFSET_RESET, KAFKA_DEFAULT_AUTO_OFFSET_RESET)
+# Numeric settings go through _get_int, so a non-numeric value yields the default.
+KAFKA_MAX_POLL_RECORDS = _get_int(ENV_KAFKA_MAX_POLL_RECORDS, KAFKA_DEFAULT_MAX_POLL_RECORDS)
+KAFKA_MAX_POLL_INTERVAL_MS = _get_int(ENV_KAFKA_MAX_POLL_INTERVAL_MS, KAFKA_DEFAULT_MAX_POLL_INTERVAL_MS)
+KAFKA_SESSION_TIMEOUT_MS = _get_int(ENV_KAFKA_SESSION_TIMEOUT_MS, KAFKA_DEFAULT_SESSION_TIMEOUT_MS)
+KAFKA_HEARTBEAT_INTERVAL_MS = _get_int(ENV_KAFKA_HEARTBEAT_INTERVAL_MS, KAFKA_DEFAULT_HEARTBEAT_INTERVAL_MS)
+
+
+# ==================== Service URLs ====================
+
+# None when unset; not validated in this module.
+INGESTOR_SERVER_URL = os.getenv(ENV_INGESTOR_SERVER_URL)  # Required
+# Falls back to TIMEOUT_UPLOAD from constants.
+INGESTOR_TIMEOUT = _get_int(ENV_INGESTOR_TIMEOUT, TIMEOUT_UPLOAD)
+
+# API Endpoints - Ingestor Server
+# Each endpoint can be overridden through its environment variable and
+# otherwise uses the matching DEFAULT_API_INGESTOR_* constant.
+API_INGESTOR_DOCUMENTS = os.getenv(ENV_API_INGESTOR_DOCUMENTS, DEFAULT_API_INGESTOR_DOCUMENTS)
+API_INGESTOR_COLLECTIONS = os.getenv(ENV_API_INGESTOR_COLLECTIONS, DEFAULT_API_INGESTOR_COLLECTIONS)
+API_INGESTOR_COLLECTION = os.getenv(ENV_API_INGESTOR_COLLECTION, DEFAULT_API_INGESTOR_COLLECTION)
+API_INGESTOR_STATUS = os.getenv(ENV_API_INGESTOR_STATUS, DEFAULT_API_INGESTOR_STATUS)
+
+
+# ==================== MinIO Settings ====================
+
+# The three values below are None when unset; not validated in this module.
+MINIO_ENDPOINT = os.getenv(ENV_MINIO_ENDPOINT)  # Required
+MINIO_ACCESS_KEY = os.getenv(ENV_MINIO_ACCESS_KEY)  # Required
+MINIO_SECRET_KEY = os.getenv(ENV_MINIO_SECRET_KEY)  # Required
+MINIO_SECURE = _get_bool(ENV_MINIO_SECURE, False)
+# Single collection for all buckets - matches RAG server's COLLECTION_NAME
+MINIO_DEFAULT_COLLECTION = os.getenv(ENV_COLLECTION_NAME, DEFAULT_COLLECTION_NAME)
+# Kept as the raw string (None when unset); it is not parsed in this module.
+MINIO_SOURCES = os.getenv(ENV_MINIO_SOURCES)  # JSON config for multi-source
+
+
+# ==================== Feature Flags ====================
+
+# Both flags are off unless the variable holds a value _get_bool accepts as true.
+ENABLE_IMAGE_PROCESSING = _get_bool(ENV_ENABLE_IMAGE_PROCESSING, False)
+ENABLE_AUDIO_PROCESSING = _get_bool(ENV_ENABLE_AUDIO_PROCESSING, False)
+
+
+# ==================== Collection Settings ====================
+
+# Defaults come from the COLLECTION_* constants.
+EMBEDDING_DIMENSION = _get_int(ENV_EMBEDDING_DIMENSION, COLLECTION_EMBEDDING_DIMENSION)
+CHUNK_SIZE = _get_int(ENV_CHUNK_SIZE, COLLECTION_CHUNK_SIZE)
+CHUNK_OVERLAP = _get_int(ENV_CHUNK_OVERLAP, COLLECTION_CHUNK_OVERLAP)
+
+
+# ==================== Logging Settings ====================
+
+# LOG_LEVEL is stored as the raw string; no case normalisation happens here.
+LOG_LEVEL = os.getenv(ENV_LOG_LEVEL, DEFAULT_LOG_LEVEL)
+LOG_FORMAT = os.getenv(ENV_LOG_FORMAT, DEFAULT_LOG_FORMAT)
+
+
+# ==================== History Settings ====================
+
+# Path of the ingestion history file.
+HISTORY_FILE = os.getenv(ENV_HISTORY_FILE, DEFAULT_HISTORY_FILE)
+
diff --git a/examples/rag_event_ingest/kafka_consumer/consumer.py b/examples/rag_event_ingest/kafka_consumer/consumer.py
new file mode 100644
index 000000000..87a5c0538
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/consumer.py
@@ -0,0 +1,197 @@
+# consumer.py
+"""Kafka consumer for MinIO S3 events."""
+
+import json
+import logging
+from datetime import datetime
+from typing import Dict, Optional
+from kafka import KafkaConsumer
+
+import config.settings as cfg
+from pathlib import Path
+from config.constants import DEST_RAG, DEST_SKIP, STATUS_FAILED, KEY_DESTINATION, KEY_FILE_TYPE, KEY_REASON
+from router import FileRouter
+from models.events import S3Event, HandlerResult, IngestionRecord
+from handlers.base import BaseHandler
+from services.storage import ObjectStorage
+
+logger = logging.getLogger(__name__)
+
+
+class KafkaEventConsumer:
+    """Kafka consumer that routes MinIO events to handlers.
+
+    Each message value is parsed into an S3Event, dispatched either to the
+    delete path or to the handler chosen by FileRouter, and the outcome is
+    appended to a JSON-lines history file.
+    """
+
+    def __init__(
+        self,
+        handlers: Dict[str, BaseHandler],
+        storage: ObjectStorage,
+        history_file: str = '/tmp/ingestion_history.jsonl'
+    ):
+        """Initialize Kafka consumer.
+
+        Args:
+            handlers: Handlers keyed by routing destination (e.g. DEST_RAG).
+            storage: Object storage client; its get_collection_for_bucket
+                resolves the collection of each event.
+            history_file: Path of the JSON-lines file that receives one
+                record per parsed event.
+
+        Raises:
+            ValueError: If KAFKA_BOOTSTRAP_SERVERS is unset or contains no
+                server address.
+        """
+        # Local import: these keys are only needed to build the router config.
+        from config.constants import (
+            CFG_ENABLE_IMAGE_PROCESSING,
+            CFG_ENABLE_AUDIO_PROCESSING,
+        )
+
+        # The setting is read without a default, so it is None when unset.
+        # Fail with a clear message instead of an AttributeError on .split().
+        bootstrap_servers = [
+            server.strip()
+            for server in (cfg.KAFKA_BOOTSTRAP_SERVERS or '').split(',')
+            if server.strip()
+        ]
+        if not bootstrap_servers:
+            raise ValueError(
+                "KAFKA_BOOTSTRAP_SERVERS must be set to a comma-separated "
+                "list of host:port addresses"
+            )
+
+        self.handlers = handlers
+        self.storage = storage
+        self.history_file = history_file
+        # Hand the feature flags from settings to the router; a bare
+        # FileRouter() leaves image and audio processing disabled no matter
+        # what the environment says.
+        self.router = FileRouter({
+            CFG_ENABLE_IMAGE_PROCESSING: cfg.ENABLE_IMAGE_PROCESSING,
+            CFG_ENABLE_AUDIO_PROCESSING: cfg.ENABLE_AUDIO_PROCESSING,
+        })
+
+        logger.info(f"Connecting to Kafka: {cfg.KAFKA_BOOTSTRAP_SERVERS}")
+        logger.info(f"Consumer group: {cfg.KAFKA_CONSUMER_GROUP}")
+
+        self.kafka_consumer = KafkaConsumer(
+            cfg.KAFKA_TOPIC,
+            bootstrap_servers=bootstrap_servers,
+            value_deserializer=lambda m: json.loads(m.decode('utf-8')),
+            group_id=cfg.KAFKA_CONSUMER_GROUP,
+            auto_offset_reset=cfg.KAFKA_AUTO_OFFSET_RESET,
+            enable_auto_commit=True,
+            max_poll_records=cfg.KAFKA_MAX_POLL_RECORDS,
+            max_poll_interval_ms=cfg.KAFKA_MAX_POLL_INTERVAL_MS,
+            session_timeout_ms=cfg.KAFKA_SESSION_TIMEOUT_MS,
+            heartbeat_interval_ms=cfg.KAFKA_HEARTBEAT_INTERVAL_MS
+        )
+
+        logger.info("Kafka consumer initialized")
+        logger.info(f"Registered handlers: {list(self.handlers.keys())}")
+
+    def process_event(self, raw_event: dict) -> Optional[HandlerResult]:
+        """Process a single MinIO S3 event.
+
+        Args:
+            raw_event: Decoded Kafka message value.
+
+        Returns:
+            The handler's result, a failed result when processing raised,
+            or None when the message could not be parsed into an S3Event.
+            A history record is written whenever an S3Event was parsed.
+        """
+        start_time = datetime.now()
+        event: Optional[S3Event] = None
+        result: Optional[HandlerResult] = None
+
+        try:
+            logger.info(f"Received event: {json.dumps(raw_event, indent=2)}")
+
+            event = S3Event.from_kafka_message(
+                raw_event,
+                collection_resolver=self.storage.get_collection_for_bucket
+            )
+
+            if not event:
+                logger.warning("Invalid event format, skipping")
+                return None
+
+            logger.info(f"Processing: {event.bucket}/{event.key} ({event.size} bytes)")
+
+            if event.event_type == 'delete':
+                result = self._handle_delete(event)
+            else:
+                result = self._handle_create(event)
+
+            return result
+
+        except (json.JSONDecodeError, KeyError, ValueError) as e:
+            logger.error(f"Invalid event data: {e}")
+            result = HandlerResult.failed_result(str(e))
+            return result
+
+        except (IOError, OSError) as e:
+            logger.error(f"Storage error: {e}")
+            result = HandlerResult.failed_result(str(e))
+            return result
+
+        except Exception as e:
+            # Anything else must not escape into run(), where it would end
+            # the consumer loop; log the traceback and record the failure.
+            logger.exception(f"Unexpected error processing event: {e}")
+            result = HandlerResult.failed_result(str(e))
+            return result
+
+        finally:
+            # Runs on every exit path, including the early returns above.
+            if event:
+                self._save_record(event, result, start_time)
+
+    def _handle_delete(self, event: S3Event) -> HandlerResult:
+        """Handle S3 delete event.
+
+        Uses the indexer of the handler registered under DEST_RAG to delete
+        the event's key from the event's collection.
+
+        Returns:
+            A DELETED result when the indexer reports success, otherwise a
+            failed result.
+        """
+        logger.info(f"🗑️  DELETE event for {event.key}")
+
+        doc_handler = self.handlers.get(DEST_RAG)
+        if not doc_handler or not hasattr(doc_handler, 'indexer'):
+            return HandlerResult.failed_result("Delete failed - no indexer available")
+
+        indexer = doc_handler.indexer
+        success = indexer.delete(event.key, event.collection)
+
+        if success:
+            logger.info(f"✓ Deleted {event.key} from Milvus")
+            return HandlerResult(success=True, status='DELETED')
+
+        return HandlerResult.failed_result("Delete failed")
+
+    def _handle_create(self, event: S3Event) -> HandlerResult:
+        """Handle S3 create event.
+
+        Routes the key through FileRouter; skipped files produce a skipped
+        result, everything else goes to the handler for the destination,
+        falling back to the DEST_RAG handler when none is registered.
+        """
+        route_info = self.router.route(event.key)
+        destination = route_info[KEY_DESTINATION]
+
+        logger.info(f"📁 {route_info[KEY_FILE_TYPE]} → {destination}")
+
+        if destination == DEST_SKIP:
+            reason = route_info.get(KEY_REASON, 'Skipped by router')
+            logger.info(f"⏭️  Skipping: {reason}")
+            return HandlerResult.skipped_result(reason)
+
+        handler = self.handlers.get(destination)
+        if not handler:
+            handler = self.handlers.get(DEST_RAG)
+
+        if not handler:
+            return HandlerResult.failed_result(f"No handler for {destination}")
+
+        return handler.handle(event)
+
+    def _save_record(self, event: S3Event, result: Optional[HandlerResult], start_time: datetime):
+        """Save ingestion record to history file.
+
+        Args:
+            event: The parsed event the record describes.
+            result: Outcome of processing; None is recorded as STATUS_FAILED
+                with no error message or task id.
+            start_time: When processing of the event started.
+
+        A failure to write the history file is logged and not raised.
+        """
+        end_time = datetime.now()
+        duration = (end_time - start_time).total_seconds()
+
+        record = IngestionRecord(
+            file_name=event.key,
+            bucket=event.bucket,
+            collection=event.collection,
+            status=result.status if result else STATUS_FAILED,
+            start_time=start_time,
+            end_time=end_time,
+            duration_seconds=duration,
+            error_message=result.error_message if result else None,
+            task_id=result.task_id if result else None
+        )
+
+        try:
+            with open(self.history_file, 'a') as f:
+                f.write(json.dumps(record.to_dict()) + '\n')
+        except (IOError, OSError) as e:
+            logger.error(f"Failed to save history: {e}")
+
+        status_emoji = '✓' if record.status in ('SUCCESS', 'DELETED', 'SKIPPED') else '✗'
+        logger.info(
+            f"{status_emoji} SUMMARY: {event.key} | "
+            f"Collection: {event.collection} | "
+            f"Duration: {duration:.2f}s | "
+            f"Status: {record.status}"
+        )
+
+    def run(self):
+        """Main consumer loop.
+
+        Processes messages until interrupted with KeyboardInterrupt; the
+        Kafka consumer is closed on the way out in every case.
+        """
+        logger.info("Starting Kafka consumer loop...")
+        logger.info(f"Subscribed topics: {self.kafka_consumer.subscription()}")
+        logger.info("Waiting for messages...")
+
+        try:
+            message_count = 0
+            for message in self._poll_messages():
+                message_count += 1
+                logger.info(
+                    f"[{message_count}] Message from "
+                    f"partition {message.partition}, offset {message.offset}"
+                )
+                self.process_event(message.value)
+
+        except KeyboardInterrupt:
+            logger.info("Shutting down...")
+        finally:
+            self.kafka_consumer.close()
+            logger.info("Consumer closed")
+
+    def _poll_messages(self):
+        """Generator that yields messages from Kafka.
+
+        Polls for at most one record at a time with a 5000 ms timeout and
+        never terminates on its own; an empty poll just triggers the next one.
+        """
+        while True:
+            msg_pack = self.kafka_consumer.poll(timeout_ms=5000, max_records=1)
+
+            if not msg_pack:
+                logger.debug("No messages, continuing...")
+                continue
+
+            for messages in msg_pack.values():
+                yield from messages
diff --git a/examples/rag_event_ingest/kafka_consumer/handlers/__init__.py b/examples/rag_event_ingest/kafka_consumer/handlers/__init__.py
new file mode 100644
index 000000000..e6f3efcff
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/handlers/__init__.py
@@ -0,0 +1,5 @@
+# Handlers package
+from .base import BaseHandler
+from .document import DocumentHandler
+
+__all__ = ['BaseHandler', 'DocumentHandler']
diff --git a/examples/rag_event_ingest/kafka_consumer/handlers/base.py b/examples/rag_event_ingest/kafka_consumer/handlers/base.py
new file mode 100644
index 000000000..6745f2e09
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/handlers/base.py
@@ -0,0 +1,43 @@
+# handlers/base.py
+"""Base handler abstract class."""
+
+from abc import ABC, abstractmethod
+import logging
+
+from models.events import S3Event, HandlerResult
+
+logger = logging.getLogger(__name__)
+
+
+class BaseHandler(ABC):
+    """Common interface for handlers that process S3 events.
+
+    Subclasses supply `name` and `handle`; the log_* helpers write
+    messages prefixed with the handler's name.
+    """
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Short identifier used as the prefix of this handler's log lines."""
+
+    @abstractmethod
+    def handle(self, event: S3Event) -> HandlerResult:
+        """Process one S3 event.
+
+        Args:
+            event: The S3 event to act on.
+
+        Returns:
+            A HandlerResult carrying the success status and, optionally,
+            a task_id.
+        """
+
+    def log_start(self, event: S3Event):
+        """Write an info line announcing that the event's object is being processed."""
+        logger.info(f"[{self.name}] Processing {event.bucket}/{event.key}")
+
+    def log_success(self, event: S3Event, result: HandlerResult):
+        """Write an info line with the event key and the resulting status."""
+        logger.info(f"[{self.name}] ✓ {event.key} → {result.status}")
+
+    def log_failure(self, event: S3Event, result: HandlerResult):
+        """Write an error line with the event key and the result's error message."""
+        logger.error(f"[{self.name}] ✗ {event.key}: {result.error_message}")
diff --git a/examples/rag_event_ingest/kafka_consumer/handlers/document.py b/examples/rag_event_ingest/kafka_consumer/handlers/document.py
new file mode 100644
index 000000000..1df03946d
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/handlers/document.py
@@ -0,0 +1,89 @@
+# handlers/document.py
+"""Handler for document files (PDF, DOCX, TXT, etc.)."""
+
+import logging
+
+import requests
+
+from .base import BaseHandler
+from models.events import S3Event, HandlerResult
+from services.storage import ObjectStorage
+from services.document_indexer import DocumentIndexer
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentHandler(BaseHandler):
+    """Handler for document files - sends to RAG ingestor."""
+
+    def __init__(self, storage: ObjectStorage, indexer: DocumentIndexer):
+        """Initialize document handler.
+
+        Args:
+            storage: Object storage for file downloads
+            indexer: Document indexer for RAG pipeline
+        """
+        self.storage = storage
+        self.indexer = indexer
+
+    @property
+    def name(self) -> str:
+        """Handler name used as the prefix of its log lines."""
+        return "DocumentHandler"
+
+    def handle(self, event: S3Event) -> HandlerResult:
+        """Process document file.
+
+        1. Download from MinIO
+        2. Delete existing entries (for updates)
+        3. Upload to ingestor
+        4. Wait for completion
+
+        The download comes first so that a file which cannot be fetched
+        does not cost the index its existing entries for that key.
+
+        Args:
+            event: S3 event with document info
+
+        Returns:
+            HandlerResult with task_id for status tracking; a failed result
+            when the upload yields no task id, the status check reports
+            failure, or a network/storage error is raised.
+        """
+        self.log_start(event)
+
+        try:
+            # Step 1: Download from storage
+            logger.info("📥 Downloading from storage...")
+            file_data = self.storage.download(event.bucket, event.key)
+
+            # Step 2: Delete existing entries (handles updates). Done only
+            # once the new content is in hand.
+            logger.info(f"🔄 Checking for existing entries of {event.key}...")
+            self.indexer.delete(event.key, event.collection)
+
+            # Step 3: Upload to indexer
+            logger.info("📤 Sending to indexer...")
+            task_id = self.indexer.upload(
+                file_data=file_data,
+                filename=event.key,
+                collection=event.collection
+            )
+
+            if not task_id:
+                result = HandlerResult.failed_result("Indexer upload failed")
+                self.log_failure(event, result)
+                return result
+
+            # Step 4: Wait for completion
+            logger.info(f"⏳ Waiting for indexing (task_id: {task_id})...")
+            success, message = self.indexer.check_status(task_id)
+
+            if success:
+                result = HandlerResult.success_result(task_id=task_id)
+                self.log_success(event, result)
+                return result
+
+            result = HandlerResult.failed_result(message, task_id=task_id)
+            self.log_failure(event, result)
+            return result
+
+        except requests.RequestException as e:
+            logger.error(f"Network error processing document: {e}")
+            return HandlerResult.failed_result(str(e))
+        except (IOError, OSError) as e:
+            logger.error(f"Storage error processing document: {e}")
+            return HandlerResult.failed_result(str(e))
diff --git a/examples/rag_event_ingest/kafka_consumer/main.py b/examples/rag_event_ingest/kafka_consumer/main.py
new file mode 100644
index 000000000..df4384d4f
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/main.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# main.py
+"""Entry point for Kafka MinIO consumer."""
+
+import logging
+
+import config.settings as cfg
+from config.constants import DEST_RAG
+from services import ObjectStorage, DocumentIndexer
+from handlers import DocumentHandler
+from consumer import KafkaEventConsumer
+
+# getattr() alone is not enough: for a lowercase name such as 'debug' it
+# returns the logging.debug function, which basicConfig rejects. Normalise
+# the case and accept only real numeric levels, otherwise fall back to INFO.
+_log_level = getattr(logging, str(cfg.LOG_LEVEL).upper(), None)
+logging.basicConfig(
+    level=_log_level if isinstance(_log_level, int) else logging.INFO,
+    format=cfg.LOG_FORMAT
+)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Initialize and run the Kafka consumer.
+
+    Builds the storage client and the document indexer from settings,
+    registers the document handler under DEST_RAG, and then runs the
+    consumer loop until it returns.
+    """
+    logger.info("=" * 60)
+    logger.info("Starting Kafka MinIO Consumer")
+    logger.info("=" * 60)
+
+    # Initialize services
+    logger.info("Initializing services...")
+    storage = ObjectStorage()
+    # Pass the configured timeout; without it the INGESTOR_TIMEOUT setting
+    # is never handed to the indexer, which keeps its own default.
+    indexer = DocumentIndexer(cfg.INGESTOR_SERVER_URL, timeout=cfg.INGESTOR_TIMEOUT)
+
+    # Initialize handlers
+    logger.info("Initializing handlers...")
+    handlers = {
+        DEST_RAG: DocumentHandler(storage, indexer),
+    }
+
+    # Initialize consumer
+    logger.info("Initializing Kafka consumer...")
+    consumer = KafkaEventConsumer(handlers=handlers, storage=storage, history_file=cfg.HISTORY_FILE)
+
+    # Run consumer loop
+    logger.info("Starting consumer loop...")
+    consumer.run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/rag_event_ingest/kafka_consumer/models/__init__.py b/examples/rag_event_ingest/kafka_consumer/models/__init__.py
new file mode 100644
index 000000000..2abce8a0d
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/models/__init__.py
@@ -0,0 +1,4 @@
+# Models package
+from .events import S3Event, HandlerResult, IngestionRecord
+
+__all__ = ['S3Event', 'HandlerResult', 'IngestionRecord']
diff --git a/examples/rag_event_ingest/kafka_consumer/models/events.py b/examples/rag_event_ingest/kafka_consumer/models/events.py
new file mode 100644
index 000000000..4baf7112f
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/models/events.py
@@ -0,0 +1,138 @@
+# models/events.py
+"""Data models for Kafka consumer events and results."""
+
+from dataclasses import dataclass, field, fields
+from datetime import datetime
+from typing import Any, Callable, ClassVar, Dict, Optional
+from urllib.parse import unquote_plus
+
+from config.constants import (
+    STATUS_SUCCESS,
+    STATUS_FAILED,
+    STATUS_SKIPPED,
+    # S3 Event fields
+    EVENT_NAME,
+    EVENT_RECORDS,
+    EVENT_FIRST_RECORD_INDEX,
+    EVENT_S3,
+    EVENT_BUCKET,
+    EVENT_OBJECT,
+    EVENT_KEY,
+    EVENT_SIZE,
+    EVENT_ETAG,
+    EVENT_NAME_FIELD,
+    EVENT_PREFIX_CREATED,
+    EVENT_PREFIX_REMOVED,
+    EVENT_TYPE_CREATE,
+    EVENT_TYPE_DELETE,
+    # Record field names (for transformers)
+    FIELD_START_TIME,
+    FIELD_END_TIME,
+    FIELD_DURATION_SECONDS,
+)
+
+
+@dataclass
+class S3Event:
+    """A MinIO S3 notification received through Kafka, reduced to the fields used downstream."""
+    bucket: str
+    key: str
+    size: int
+    etag: str
+    event_type: str
+    collection: str = ''
+
+    @classmethod
+    def from_kafka_message(
+        cls,
+        event: Dict[str, Any],
+        collection_resolver: Callable[[str], str]
+    ) -> Optional['S3Event']:
+        """Build an S3Event from a decoded Kafka message value.
+
+        Only the record at EVENT_FIRST_RECORD_INDEX is read. The object key
+        is URL-decoded with unquote_plus; size and etag default to 0 and ''.
+
+        Args:
+            event: Raw Kafka message value
+            collection_resolver: Function to resolve bucket -> collection name
+
+        Returns:
+            The parsed event, or None when the event-name field is absent,
+            the name matches neither the created nor the removed prefix, or
+            the record list is missing or empty.
+        """
+        if EVENT_NAME not in event:
+            return None
+
+        event_name = event[EVENT_NAME]
+        prefix_to_type = (
+            (EVENT_PREFIX_CREATED, EVENT_TYPE_CREATE),
+            (EVENT_PREFIX_REMOVED, EVENT_TYPE_DELETE),
+        )
+        for prefix, kind in prefix_to_type:
+            if event_name.startswith(prefix):
+                event_type = kind
+                break
+        else:
+            # Neither prefix matched: not an event this consumer handles.
+            return None
+
+        records = event.get(EVENT_RECORDS, [])
+        if not records:
+            return None
+
+        s3_section = records[EVENT_FIRST_RECORD_INDEX][EVENT_S3]
+        bucket_name = s3_section[EVENT_BUCKET][EVENT_NAME_FIELD]
+        object_info = s3_section[EVENT_OBJECT]
+
+        return cls(
+            bucket=bucket_name,
+            key=unquote_plus(object_info[EVENT_KEY]),
+            size=object_info.get(EVENT_SIZE, 0),
+            etag=object_info.get(EVENT_ETAG, ''),
+            event_type=event_type,
+            collection=collection_resolver(bucket_name)
+        )
+
+
+@dataclass
+class HandlerResult:
+    """Outcome of running a handler on one event."""
+    success: bool
+    status: str  # SUCCESS, FAILED, SKIPPED, DELETED
+    error_message: Optional[str] = None
+    task_id: Optional[str] = None  # For RAG status tracking
+
+    @classmethod
+    def success_result(cls, task_id: Optional[str] = None) -> 'HandlerResult':
+        """Return a successful result with STATUS_SUCCESS and the given task id."""
+        return cls(
+            success=True,
+            status=STATUS_SUCCESS,
+            task_id=task_id,
+        )
+
+    @classmethod
+    def failed_result(cls, error: str, task_id: Optional[str] = None) -> 'HandlerResult':
+        """Return an unsuccessful result with STATUS_FAILED and `error` as its message."""
+        return cls(
+            success=False,
+            status=STATUS_FAILED,
+            error_message=error,
+            task_id=task_id,
+        )
+
+    @classmethod
+    def skipped_result(cls, reason: str) -> 'HandlerResult':
+        """Return a STATUS_SKIPPED result; it counts as success and keeps `reason` in error_message."""
+        return cls(
+            success=True,
+            status=STATUS_SKIPPED,
+            error_message=reason,
+        )
+
+
+@dataclass
+class IngestionRecord:
+    """One history entry describing how a single file was processed."""
+    file_name: str
+    bucket: str
+    collection: str
+    status: str
+    start_time: datetime
+    end_time: datetime = field(default_factory=datetime.now)
+    duration_seconds: float = 0.0
+    error_message: Optional[str] = None
+    task_id: Optional[str] = None
+
+    # Per-field converters applied by to_dict(); being a ClassVar, this
+    # mapping is not itself a dataclass field.
+    _TRANSFORMERS: ClassVar[Dict[str, Callable]] = {
+        FIELD_START_TIME: lambda v: v.isoformat(),
+        FIELD_END_TIME: lambda v: v.isoformat(),
+        FIELD_DURATION_SECONDS: lambda v: round(v, 2),
+    }
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the record as a dict keyed by field name.
+
+        Fields with an entry in _TRANSFORMERS are passed through that
+        converter; every other field is copied unchanged.
+        """
+        def _export(name: str) -> Any:
+            raw = getattr(self, name)
+            converter = self._TRANSFORMERS.get(name)
+            return converter(raw) if converter else raw
+
+        return {spec.name: _export(spec.name) for spec in fields(self)}
diff --git a/examples/rag_event_ingest/kafka_consumer/requirements.txt b/examples/rag_event_ingest/kafka_consumer/requirements.txt
new file mode 100644
index 000000000..3f3818161
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/requirements.txt
@@ -0,0 +1,4 @@
+kafka-python==2.0.2
+minio==7.2.0
+requests==2.31.0
+requests-toolbelt==1.0.0
diff --git a/examples/rag_event_ingest/kafka_consumer/router.py b/examples/rag_event_ingest/kafka_consumer/router.py
new file mode 100644
index 000000000..41f5b8f23
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/router.py
@@ -0,0 +1,91 @@
+# router.py
+"""File routing module for MinIO event processing."""
+
+import logging
+from pathlib import Path
+from typing import Dict, Any, List, Set, Union
+
+from config.constants import (
+    DOCUMENT_EXTENSIONS,
+    IMAGE_EXTENSIONS,
+    AUDIO_EXTENSIONS,
+    SKIP_EXTENSIONS,
+    DEST_RAG,
+    DEST_SKIP,
+    KEY_DESTINATION,
+    KEY_FILE_TYPE,
+    KEY_EXTENSION,
+    KEY_REASON,
+    FILE_TYPE_DOCUMENT,
+    FILE_TYPE_IMAGE,
+    FILE_TYPE_AUDIO,
+    FILE_TYPE_SKIP,
+    FILE_TYPE_UNKNOWN,
+    CFG_DOCUMENT_EXTENSIONS,
+    CFG_IMAGE_EXTENSIONS,
+    CFG_AUDIO_EXTENSIONS,
+    CFG_SKIP_EXTENSIONS,
+    CFG_ENABLE_IMAGE_PROCESSING,
+    CFG_ENABLE_AUDIO_PROCESSING,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class FileRouter:
+    """Decides, from a file's extension, where the file should be sent."""
+
+    def __init__(self, config: Union[Dict[str, Any], Any] = None):
+        """Set up the extension sets and feature flags.
+
+        Args:
+            config: Optional overrides. Either a dict keyed by the CFG_*
+                constants, or an object with `__dataclass_fields__` whose
+                attributes are copied into such a dict. Missing extension
+                entries fall back to the module constants; missing flags
+                fall back to False.
+        """
+        if config is None:
+            config = {}
+        elif hasattr(config, '__dataclass_fields__'):
+            # Flatten the dataclass-style object into the dict form used below.
+            attr_by_key = (
+                (CFG_DOCUMENT_EXTENSIONS, 'document_extensions'),
+                (CFG_IMAGE_EXTENSIONS, 'image_extensions'),
+                (CFG_AUDIO_EXTENSIONS, 'audio_extensions'),
+                (CFG_SKIP_EXTENSIONS, 'skip_extensions'),
+                (CFG_ENABLE_IMAGE_PROCESSING, 'enable_image_processing'),
+                (CFG_ENABLE_AUDIO_PROCESSING, 'enable_audio_processing'),
+            )
+            config = {key: getattr(config, attr) for key, attr in attr_by_key}
+
+        self.config = config
+        lookup = config.get
+        self.document_extensions = self._to_set(lookup(CFG_DOCUMENT_EXTENSIONS, DOCUMENT_EXTENSIONS))
+        self.image_extensions = self._to_set(lookup(CFG_IMAGE_EXTENSIONS, IMAGE_EXTENSIONS))
+        self.audio_extensions = self._to_set(lookup(CFG_AUDIO_EXTENSIONS, AUDIO_EXTENSIONS))
+        self.skip_extensions = self._to_set(lookup(CFG_SKIP_EXTENSIONS, SKIP_EXTENSIONS))
+        self.enable_image_processing = lookup(CFG_ENABLE_IMAGE_PROCESSING, False)
+        self.enable_audio_processing = lookup(CFG_ENABLE_AUDIO_PROCESSING, False)
+
+        logger.info(f"FileRouter initialized - Documents: {len(self.document_extensions)} types")
+
+    @staticmethod
+    def _to_set(value: Union[List, Set, None]) -> Set[str]:
+        """Turn None into an empty set and a list/tuple into a set; return anything else as-is."""
+        if value is None:
+            return set()
+        if isinstance(value, (list, tuple)):
+            return set(value)
+        return value
+
+    def route(self, filename: str) -> dict:
+        """Pick the destination for a file from its lower-cased suffix.
+
+        Checks run in this order: skip list, documents, images, audio.
+        Images and audio go to DEST_RAG only when the matching flag is on,
+        otherwise they are skipped. An extension matching nothing is still
+        sent to DEST_RAG, tagged as unknown.
+
+        Returns:
+            A dict with destination, file type and extension, plus a reason
+            for skipped and unknown files.
+        """
+        ext = Path(filename).suffix.lower()
+
+        if ext in self.skip_extensions:
+            return {KEY_DESTINATION: DEST_SKIP, KEY_FILE_TYPE: FILE_TYPE_SKIP, KEY_EXTENSION: ext, KEY_REASON: 'File extension in skip list'}
+
+        if ext in self.document_extensions:
+            return {KEY_DESTINATION: DEST_RAG, KEY_FILE_TYPE: FILE_TYPE_DOCUMENT, KEY_EXTENSION: ext}
+
+        # Image and audio follow the same pattern and differ only in their data.
+        gated_types = (
+            (self.image_extensions, self.enable_image_processing, FILE_TYPE_IMAGE, 'Image processing not enabled'),
+            (self.audio_extensions, self.enable_audio_processing, FILE_TYPE_AUDIO, 'Audio processing not enabled'),
+        )
+        for extensions, enabled, file_type, disabled_reason in gated_types:
+            if ext not in extensions:
+                continue
+            if enabled:
+                return {KEY_DESTINATION: DEST_RAG, KEY_FILE_TYPE: file_type, KEY_EXTENSION: ext}
+            return {KEY_DESTINATION: DEST_SKIP, KEY_FILE_TYPE: file_type, KEY_EXTENSION: ext, KEY_REASON: disabled_reason}
+
+        return {KEY_DESTINATION: DEST_RAG, KEY_FILE_TYPE: FILE_TYPE_UNKNOWN, KEY_EXTENSION: ext, KEY_REASON: 'Unknown extension, attempting RAG ingestion'}
+
+    def is_document(self, filename: str) -> bool:
+        """Return True when the file's lower-cased suffix is a document extension."""
+        suffix = Path(filename).suffix.lower()
+        return suffix in self.document_extensions
diff --git a/examples/rag_event_ingest/kafka_consumer/services/__init__.py b/examples/rag_event_ingest/kafka_consumer/services/__init__.py
new file mode 100644
index 000000000..db2c0e347
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/services/__init__.py
@@ -0,0 +1,7 @@
+# services/__init__.py
+"""External service clients."""
+
+from .storage import ObjectStorage
+from .document_indexer import DocumentIndexer
+
+__all__ = ['ObjectStorage', 'DocumentIndexer']
diff --git a/examples/rag_event_ingest/kafka_consumer/services/document_indexer.py b/examples/rag_event_ingest/kafka_consumer/services/document_indexer.py
new file mode 100644
index 000000000..ac60d41a2
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/services/document_indexer.py
@@ -0,0 +1,227 @@
+# services/document_indexer.py
+"""Document indexing service for RAG pipeline."""
+
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Optional, Tuple
+import requests
+
+from config import (
+    API_INGESTOR_DOCUMENTS,
+    API_INGESTOR_COLLECTIONS,
+    API_INGESTOR_COLLECTION,
+    API_INGESTOR_STATUS,
+    STATUS_PENDING,
+    STATUS_PROCESSING,
+    STATUS_FINISHED,
+    STATUS_FAILED,
+    TIMEOUT_DEFAULT,
+    TIMEOUT_MAX_TASK_WAIT,
+    COLLECTION_EMBEDDING_DIMENSION,
+    COLLECTION_CHUNK_SIZE,
+    COLLECTION_CHUNK_OVERLAP,
+    CONTENT_TYPE_MAP,
+    DEFAULT_CONTENT_TYPE,
+    FIELD_COLLECTION_NAME,
+    FIELD_BLOCKING,
+    FIELD_SPLIT_OPTIONS,
+    FIELD_CHUNK_SIZE,
+    FIELD_CHUNK_OVERLAP,
+    FIELD_GENERATE_SUMMARY,
+    FIELD_EMBEDDING_DIMENSION,
+    FIELD_TASK_ID,
+    RESP_COLLECTIONS,
+    RESP_TASK_ID,
+    RESP_STATE,
+    RESP_RESULT,
+    RESP_FAILED_DOCUMENTS,
+    RESP_VALIDATION_ERRORS,
+    RESP_MESSAGE,
+    RESP_ERROR,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentIndexer:
+    """HTTP client for the ingestor server; failures are logged and returned, not raised."""
+
+    def __init__(self, base_url: str, timeout: int = 600):
+        """Store the base URL (trailing '/' stripped) and the timeout used for uploads."""
+        self.base_url = base_url.rstrip('/')
+        self.timeout = timeout
+        # Names already confirmed on the server, so repeat calls skip the HTTP round-trip.
+        self._created_collections: set = set()
+        logger.info(f"DocumentIndexer initialized: {self.base_url}")
+
+    def ensure_collection_exists(self, collection_name: str) -> bool:
+        """Ensure ``collection_name`` exists on the ingestor, creating it when it is not listed.
+
+        Returns:
+            True when the collection is listed or the create call answers 200/201 (the
+            name is then cached); False on a request error or any other create status.
+        """
+        if collection_name in self._created_collections:
+            return True
+
+        try:
+            response = requests.get(f'{self.base_url}{API_INGESTOR_COLLECTIONS}', timeout=TIMEOUT_DEFAULT)
+        except requests.RequestException as e:
+            logger.error(f"Error checking collections: {e}")
+            return False
+
+        if response.status_code == 200:
+            try:
+                collections = response.json().get(RESP_COLLECTIONS) or []
+            except (ValueError, AttributeError) as e:
+                # Body is not a JSON object: treat as "not listed" and fall through to create.
+                logger.warning(f"Unparseable collections response: {e}")
+                collections = []
+            # Entries may be plain names or objects; object entries are assumed to carry the
+            # name under FIELD_COLLECTION_NAME - TODO confirm against the ingestor schema.
+            names = [c.get(FIELD_COLLECTION_NAME) if isinstance(c, dict) else c for c in collections]
+            if collection_name in names:
+                logger.info(f"Collection '{collection_name}' already exists")
+                self._created_collections.add(collection_name)
+                return True
+
+        logger.info(f"Creating collection '{collection_name}'...")
+        payload = {
+            FIELD_COLLECTION_NAME: collection_name,
+            FIELD_EMBEDDING_DIMENSION: COLLECTION_EMBEDDING_DIMENSION,
+            'metadata_schema': [],
+        }
+        try:
+            create_response = requests.post(
+                f'{self.base_url}{API_INGESTOR_COLLECTION}', json=payload,
+                headers={'Content-Type': 'application/json'}, timeout=TIMEOUT_DEFAULT,
+            )
+        except requests.RequestException as e:
+            logger.error(f"Error creating collection: {e}")
+            return False
+        if create_response.status_code in [200, 201]:
+            logger.info(f"✓ Collection '{collection_name}' created")
+            self._created_collections.add(collection_name)
+            return True
+        logger.error(f"Failed to create collection: {create_response.status_code}")
+        return False
+
+    def upload(
+        self, file_data: bytes, filename: str, collection: str,
+        chunk_size: int = COLLECTION_CHUNK_SIZE, chunk_overlap: int = COLLECTION_CHUNK_OVERLAP,
+    ) -> Optional[str]:
+        """Submit one document to the ingestor with the blocking flag set to False.
+
+        Args:
+            file_data: Raw bytes of the document.
+            filename: Name sent with the file; its suffix selects the content type.
+            collection: Target collection; created first if it is missing.
+            chunk_size: Chunk size forwarded in the split options.
+            chunk_overlap: Chunk overlap forwarded in the split options.
+
+        Returns:
+            The task id to poll with ``check_status``, or None on any failure.
+        """
+        if not self.ensure_collection_exists(collection):
+            logger.error("Failed to ensure collection exists")
+            return None
+
+        files = {'documents': (filename, file_data, self._get_content_type(filename))}
+        data_config = {
+            FIELD_COLLECTION_NAME: collection,
+            FIELD_BLOCKING: False,
+            FIELD_SPLIT_OPTIONS: {FIELD_CHUNK_SIZE: chunk_size, FIELD_CHUNK_OVERLAP: chunk_overlap},
+            FIELD_GENERATE_SUMMARY: False,
+        }
+        logger.info(f"Uploading to collection: {collection}")
+        try:
+            response = requests.post(
+                f'{self.base_url}{API_INGESTOR_DOCUMENTS}', files=files,
+                data={'data': json.dumps(data_config)}, timeout=self.timeout,
+            )
+        except requests.RequestException as e:
+            logger.error(f"Error uploading document: {e}")
+            return None
+
+        if response.status_code not in [200, 201, 202]:
+            logger.error(f"Upload failed: {response.status_code} - {response.text}")
+            return None
+        try:
+            task_id = response.json().get(RESP_TASK_ID)
+        except (ValueError, AttributeError) as e:
+            logger.error(f"Upload response is not a JSON object: {e}")
+            return None
+        if task_id:
+            logger.info(f"✓ File uploaded, task_id: {task_id}")
+            return task_id
+        logger.error("No task_id in response")
+        return None
+
+    def check_status(self, task_id: str, max_wait: int = TIMEOUT_MAX_TASK_WAIT) -> Tuple[bool, str]:
+        """Poll the status endpoint, sleeping 1 s between polls, until the task ends or ``max_wait`` seconds pass.
+
+        Returns:
+            ``(ok, message)``. ok is True only for a finished task with no failed
+            documents or validation errors; otherwise message describes the failure.
+        """
+        start_time = time.time()
+        while time.time() - start_time < max_wait:
+            try:
+                response = requests.get(
+                    f'{self.base_url}{API_INGESTOR_STATUS}',
+                    params={FIELD_TASK_ID: task_id}, timeout=TIMEOUT_DEFAULT,
+                )
+            except requests.RequestException as e:
+                return False, str(e)
+            if response.status_code != 200:
+                return False, f"Status check failed: {response.status_code}"
+            try:
+                result = response.json()
+                state = result.get(RESP_STATE, 'UNKNOWN')
+            except (ValueError, AttributeError) as e:
+                return False, f"Invalid status response: {e}"
+            if state == STATUS_FAILED:
+                return False, result.get(RESP_ERROR, 'Unknown error')
+            if state == STATUS_FINISHED:
+                return self._parse_finished_result(result)
+            if state in [STATUS_PENDING, STATUS_PROCESSING]:
+                elapsed = int(time.time() - start_time)
+                if elapsed % 5 == 0:  # progress is logged only on multiples of 5 elapsed seconds
+                    logger.info(f"Task {task_id}: {state} ({elapsed}s)")
+            time.sleep(1)  # unrecognised states are simply polled again
+
+        return False, f"Timeout after {max_wait}s"
+
+    def _parse_finished_result(self, result: dict) -> Tuple[bool, str]:
+        """Map a finished-task payload to ``(ok, message)``; ok requires no failed docs or errors."""
+        task_result = result.get(RESP_RESULT) or {}  # a null result is handled like a missing one
+        failed_docs = task_result.get(RESP_FAILED_DOCUMENTS, [])
+        validation_errors = task_result.get(RESP_VALIDATION_ERRORS, [])
+        if failed_docs or validation_errors:
+            return False, f"Failed: {failed_docs}, Errors: {validation_errors}"
+        return True, task_result.get(RESP_MESSAGE, 'Completed')
+
+    def delete(self, filename: str, collection: str) -> bool:
+        """Delete ``filename`` from ``collection``; returns True only for HTTP 200/201/204."""
+        logger.info(f"Deleting '{filename}' from '{collection}'")
+        try:
+            response = requests.delete(
+                f'{self.base_url}{API_INGESTOR_DOCUMENTS}',
+                params={FIELD_COLLECTION_NAME: collection}, json=[filename],
+                headers={'Content-Type': 'application/json'}, timeout=TIMEOUT_DEFAULT,
+            )
+        except requests.RequestException as e:
+            logger.error(f"Error deleting document: {e}")
+            return False
+
+        if response.status_code in [200, 201, 204]:
+            logger.info(f"Deleted '{filename}'")
+            return True
+        logger.error(f"Delete failed: {response.status_code}")
+        return False
+
+    def _get_content_type(self, filename: str) -> str:
+        """Return the content type mapped to the file's lowercased suffix, else DEFAULT_CONTENT_TYPE."""
+        return CONTENT_TYPE_MAP.get(Path(filename).suffix.lower(), DEFAULT_CONTENT_TYPE)
diff --git a/examples/rag_event_ingest/kafka_consumer/services/storage.py b/examples/rag_event_ingest/kafka_consumer/services/storage.py
new file mode 100644
index 000000000..8f50a1f7b
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/services/storage.py
@@ -0,0 +1,195 @@
+# services/storage.py
+"""S3-compatible object storage service."""
+
+import io
+import json
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Optional
+
+from minio import Minio
+from minio.error import S3Error
+
+from config import (
+    MINIO_ENDPOINT,
+    MINIO_ACCESS_KEY,
+    MINIO_SECRET_KEY,
+    MINIO_SECURE,
+    MINIO_DEFAULT_COLLECTION,
+    MINIO_SOURCES,
+    CFG_ENDPOINT,
+    CFG_ACCESS,
+    CFG_SECRET,
+    CFG_SECURE,
+    CFG_COLLECTION,
+    CFG_BUCKETS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Abstract Interface
+# =============================================================================
+
+class StorageBackend(ABC):
+    """Contract that every object-storage backend must fulfil.
+
+    Subclass it to plug in another provider (Azure Blob, GCS, etc.).
+    """
+
+    @abstractmethod
+    def download(self, bucket: str, key: str) -> bytes:
+        """Return the content stored under ``key`` in ``bucket`` as bytes."""
+        ...
+
+    @abstractmethod
+    def upload(self, bucket: str, key: str, data: bytes, content_type: Optional[str] = None) -> None:
+        """Store ``data`` under ``key`` in ``bucket``, optionally tagged with a content type."""
+        ...
+
+    @abstractmethod
+    def delete(self, bucket: str, key: str) -> None:
+        """Remove the object stored under ``key`` in ``bucket``."""
+        ...
+
+    @abstractmethod
+    def exists(self, bucket: str, key: str) -> bool:
+        """Tell whether an object is stored under ``key`` in ``bucket``."""
+        ...
+
+
+# =============================================================================
+# S3 Implementation
+# =============================================================================
+
+class S3Backend(StorageBackend):
+    """StorageBackend implementation backed by a ``minio.Minio`` client."""
+
+    def __init__(self, client: Minio):
+        """Wrap an already constructed Minio client."""
+        self._client = client
+
+    @classmethod
+    def create(cls, endpoint: str, access_key: str, secret_key: str, secure: bool = False) -> 'S3Backend':
+        """Build a Minio client from the connection settings and wrap it in a backend."""
+        minio_client = Minio(
+            endpoint, access_key=access_key, secret_key=secret_key, secure=secure,
+        )
+        logger.info(f"Created S3 client: {endpoint}")
+        return cls(minio_client)
+
+    def download(self, bucket: str, key: str) -> bytes:
+        """Read the whole object into memory; the response is closed and released either way."""
+        obj = self._client.get_object(bucket, key)
+        try:
+            payload = obj.read()
+        finally:
+            obj.close()
+            obj.release_conn()
+        logger.info(f"Downloaded {bucket}/{key} ({len(payload)} bytes)")
+        return payload
+
+    def upload(self, bucket: str, key: str, data: bytes, content_type: Optional[str] = None) -> None:
+        """Store ``data`` at ``bucket``/``key``; a falsy content type becomes application/octet-stream."""
+        mime = content_type or 'application/octet-stream'
+        stream = io.BytesIO(data)
+        self._client.put_object(bucket, key, stream, length=len(data), content_type=mime)
+        logger.info(f"Uploaded {bucket}/{key}")
+
+    def delete(self, bucket: str, key: str) -> None:
+        """Remove the object, then log the deletion."""
+        self._client.remove_object(bucket, key)
+        logger.info(f"Deleted {bucket}/{key}")
+
+    def exists(self, bucket: str, key: str) -> bool:
+        """Report whether ``stat_object`` succeeds for ``bucket``/``key``."""
+        try:
+            self._client.stat_object(bucket, key)
+        except S3Error:
+            # Every S3Error is read as "does not exist", whatever its cause.
+            return False
+        return True
+
+
+# =============================================================================
+# Object Storage (Factory + Bucket Mapping)
+# =============================================================================
+
+class ObjectStorage:
+    """Facade over one or more S3 backends that also maps buckets to collections.
+
+    Backends come from MINIO_SOURCES when it is set, otherwise from the MINIO_* settings.
+    """
+
+    def __init__(self):
+        """Create empty registries, then build the backends from configuration."""
+        self._backends: Dict[str, StorageBackend] = {}  # source name -> backend
+        self._bucket_to_backend: Dict[str, str] = {}  # bucket -> source name
+        self._bucket_to_collection: Dict[str, str] = {}  # bucket -> collection
+        self._default_collection = MINIO_DEFAULT_COLLECTION
+        self._configure()
+
+    def _configure(self):
+        """Use the multi-source setup when MINIO_SOURCES is truthy, single-source otherwise."""
+        if not MINIO_SOURCES:
+            self._configure_single_source()
+            return
+        self._configure_multi_source(MINIO_SOURCES)
+
+    def _configure_single_source(self):
+        """Register one backend named 'default' built from the MINIO_* settings."""
+        logger.info(f"Single S3 mode: {MINIO_ENDPOINT}")
+        backend = S3Backend.create(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, MINIO_SECURE)
+        self._backends['default'] = backend
+
+    def _configure_multi_source(self, sources_json: str):
+        """Parse the JSON object of ``name -> source settings`` and configure each entry."""
+        for source_name, settings in json.loads(sources_json).items():
+            self._configure_source(source_name, settings)
+
+    def _configure_source(self, name: str, src: dict):
+        """Create the backend for source ``name`` and register the buckets it serves."""
+        endpoint = src[CFG_ENDPOINT]
+        logger.info(f"Configuring S3 source '{name}': {endpoint}")
+
+        # Credentials fall back to the global MINIO_* values; the secure flag defaults to False.
+        access_key = src.get(CFG_ACCESS, MINIO_ACCESS_KEY)
+        secret_key = src.get(CFG_SECRET, MINIO_SECRET_KEY)
+        secure = src.get(CFG_SECURE, False)
+        self._backends[name] = S3Backend.create(endpoint, access_key, secret_key, secure)
+
+        # Without an explicit collection, derive one from the source name ('-' -> '_').
+        collection = src.get(CFG_COLLECTION, name.replace('-', '_'))
+        self._register_buckets(name, src.get(CFG_BUCKETS, []), collection)
+
+    def _register_buckets(self, backend_name: str, buckets: list, collection: str):
+        """Point every bucket in ``buckets`` at ``backend_name`` and at ``collection``."""
+        for bucket in buckets:
+            self._bucket_to_backend[bucket] = backend_name
+            self._bucket_to_collection[bucket] = collection
+            logger.info(f"  {bucket} → {collection}")
+
+    def _get_backend(self, bucket: str) -> StorageBackend:
+        """Return the backend mapped to ``bucket``, or the first configured backend if unmapped."""
+        if bucket not in self._bucket_to_backend:
+            # Unmapped bucket: dicts keep insertion order, so this is the first backend added.
+            return next(iter(self._backends.values()))
+        return self._backends[self._bucket_to_backend[bucket]]
+
+    def download(self, bucket: str, key: str) -> bytes:
+        """Download ``key`` from ``bucket`` through the backend selected for that bucket."""
+        backend = self._get_backend(bucket)
+        return backend.download(bucket, key)
+
+    def get_collection_for_bucket(self, bucket: str) -> str:
+        """Resolve the collection name to use for ``bucket``.
+
+        Lookup order: the bucket's explicit mapping, then the default collection
+        (MINIO_DEFAULT_COLLECTION) if it is truthy, then the bucket name with '-' -> '_'.
+        """
+        try:
+            return self._bucket_to_collection[bucket]
+        except KeyError:
+            # No explicit mapping was registered for this bucket.
+            return self._default_collection or bucket.replace('-', '_')
diff --git a/examples/rag_react_agent/pyproject.toml b/examples/rag_react_agent/pyproject.toml
index fcebbcb3a..c4967a58a 100644
--- a/examples/rag_react_agent/pyproject.toml
+++ b/examples/rag_react_agent/pyproject.toml
@@ -20,7 +20,7 @@ dependencies = [
   # Keep package version constraints as open as possible to avoid conflicts with other packages. Always define a minimum
   # version when adding a new package. If unsure, default to using `~=` instead of `==`. Does not apply to nvidia-nat packages.
   # Keep sorted!!!
-  "langgraph>=1.0.7",  # Required for react_agent workflow
+  "langgraph>=1.0.8",  # Required for react_agent workflow
   "langchain_classic",
   "nvidia-nat>=1.5.0a0,<2.0",  # Allow pre-release versions
   "nvidia-nat-langchain>=1.5.0a0,<2.0",  # Allow pre-release versions
diff --git a/examples/rag_react_agent/uv.lock b/examples/rag_react_agent/uv.lock
index 0554ef787..16af3b48d 100644
--- a/examples/rag_react_agent/uv.lock
+++ b/examples/rag_react_agent/uv.lock
@@ -324,6 +324,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
 ]
 
+[[package]]
+name = "blinker"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
+]
+
 [[package]]
 name = "boto3"
 version = "1.40.61"
@@ -597,21 +606,20 @@ name = "datasets"
 version = "4.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "dill" },
-    { name = "filelock" },
-    { name = "fsspec", extra = ["http"] },
-    { name = "httpx" },
-    { name = "huggingface-hub" },
-    { name = "multiprocess" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
-    { name = "packaging" },
-    { name = "pandas" },
-    { name = "pyarrow" },
-    { name = "pyyaml" },
-    { name = "requests" },
-    { name = "tqdm" },
-    { name = "xxhash" },
+    { name = "dill", marker = "python_full_version >= '3.12'" },
+    { name = "filelock", marker = "python_full_version >= '3.12'" },
+    { name = "fsspec", extra = ["http"], marker = "python_full_version >= '3.12'" },
+    { name = "httpx", marker = "python_full_version >= '3.12'" },
+    { name = "huggingface-hub", marker = "python_full_version >= '3.12'" },
+    { name = "multiprocess", marker = "python_full_version >= '3.12'" },
+    { name = "numpy", marker = "python_full_version >= '3.12'" },
+    { name = "packaging", marker = "python_full_version >= '3.12'" },
+    { name = "pandas", marker = "python_full_version >= '3.12'" },
+    { name = "pyarrow", marker = "python_full_version >= '3.12'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.12'" },
+    { name = "requests", marker = "python_full_version >= '3.12'" },
+    { name = "tqdm", marker = "python_full_version >= '3.12'" },
+    { name = "xxhash", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/55/bf/bb927bde63d649296c83e883171ae77074717c1b80fe2868b328bd0dbcbb/datasets-4.5.0.tar.gz", hash = "sha256:00c698ce1c2452e646cc5fad47fef39d3fe78dd650a8a6eb205bb45eb63cd500", size = 588384, upload-time = "2026-01-14T18:27:54.297Z" }
 wheels = [
@@ -736,6 +744,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
 ]
 
+[[package]]
+name = "flask"
+version = "3.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "blinker" },
+    { name = "click" },
+    { name = "itsdangerous" },
+    { name = "jinja2" },
+    { name = "markupsafe" },
+    { name = "werkzeug" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" },
+]
+
 [[package]]
 name = "flatbuffers"
 version = "25.12.19"
@@ -828,7 +853,7 @@ wheels = [
 
 [package.optional-dependencies]
 http = [
-    { name = "aiohttp" },
+    { name = "aiohttp", marker = "python_full_version >= '3.12'" },
 ]
 
 [[package]]
@@ -1078,6 +1103,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" },
 ]
 
+[[package]]
+name = "itsdangerous"
+version = "2.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
+]
+
 [[package]]
 name = "jinja2"
 version = "3.1.6"
@@ -1235,18 +1269,17 @@ wheels = [
 
 [[package]]
 name = "langchain-aws"
-version = "1.0.0"
+version = "1.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "boto3" },
     { name = "langchain-core" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "pydantic" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/90/52/7e57fb7fc34c386625f66f0ab31da9cf2788b03ef15ae78ccd4c627b30cf/langchain_aws-1.0.0.tar.gz", hash = "sha256:597342bda0e7384e13590e9ab69c872ddcfbbf07d81ac6bb0f8a67970252212e", size = 214146, upload-time = "2025-10-17T19:06:49.001Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/52/1d/bb306951b1c394b7a27effb8eb6c9ee65dd77fcc4be7c20f76e3299a9e1e/langchain_aws-1.1.0.tar.gz", hash = "sha256:1e2f8570328eae4907c3cf7e900dc68d8034ddc865d9dc96823c9f9d8cccb901", size = 393899, upload-time = "2025-11-24T14:35:24.216Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/67/5d/5b3c07780a8eb4b916ffe504893896f87f318924c86dcbeb89562baa2d20/langchain_aws-1.0.0-py3-none-any.whl", hash = "sha256:68f6965b5030d0779b02e731ce1c910a5f4518bfe0e2ae82999a5342bc46dbd5", size = 150400, upload-time = "2025-10-17T19:06:47.926Z" },
+    { url = "https://files.pythonhosted.org/packages/26/33/91b8d2a7570657b371382b45054142c54165a51706990a5c1b4cc40c0e9a/langchain_aws-1.1.0-py3-none-any.whl", hash = "sha256:8ec074615b42839e035354063717374c32c63f5028ef5221ba073fd5f3ef5e37", size = 152432, upload-time = "2025-11-24T14:35:23.004Z" },
 ]
 
 [[package]]
@@ -1278,8 +1311,7 @@ dependencies = [
     { name = "langchain-classic" },
     { name = "langchain-core" },
     { name = "langsmith" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "pydantic-settings" },
     { name = "pyyaml" },
     { name = "requests" },
@@ -1293,7 +1325,7 @@ wheels = [
 
 [[package]]
 name = "langchain-core"
-version = "1.2.7"
+version = "1.2.17"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "jsonpatch" },
@@ -1305,9 +1337,23 @@ dependencies = [
     { name = "typing-extensions" },
     { name = "uuid-utils" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a2/0e/664d8d81b3493e09cbab72448d2f9d693d1fa5aa2bcc488602203a9b6da0/langchain_core-1.2.7.tar.gz", hash = "sha256:e1460639f96c352b4a41c375f25aeb8d16ffc1769499fb1c20503aad59305ced", size = 837039, upload-time = "2026-01-09T17:44:25.505Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1d/93/36226f593df52b871fc24d494c274f3a6b2ac76763a2806e7d35611634a1/langchain_core-1.2.17.tar.gz", hash = "sha256:54aa267f3311e347fb2e50951fe08e53761cebfb999ab80e6748d70525bbe872", size = 836130, upload-time = "2026-03-02T22:47:55.846Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6e/6f/34a9fba14d191a67f7e2ee3dbce3e9b86d2fa7310e2c7f2c713583481bd2/langchain_core-1.2.7-py3-none-any.whl", hash = "sha256:452f4fef7a3d883357b22600788d37e3d8854ef29da345b7ac7099f33c31828b", size = 490232, upload-time = "2026-01-09T17:44:24.236Z" },
+    { url = "https://files.pythonhosted.org/packages/be/90/073f33ab383a62908eca7ea699586dfea280e77182176e33199c80ddf22a/langchain_core-1.2.17-py3-none-any.whl", hash = "sha256:bf6bd6ce503874e9c2da1669a69383e967c3de1ea808921d19a9a6bff1a9fbbe", size = 502727, upload-time = "2026-03-02T22:47:54.537Z" },
+]
+
+[[package]]
+name = "langchain-huggingface"
+version = "1.2.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "huggingface-hub" },
+    { name = "langchain-core" },
+    { name = "tokenizers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/5b/4910551367de5c6ec246616fcc0ddb0bc6f9e5d353d4a22dcb5ab1f87e60/langchain_huggingface-1.2.1.tar.gz", hash = "sha256:33d52a30a56775380c6b4321b78136a410eb079132a80fe7120ddd4b954b4efa", size = 253106, upload-time = "2026-03-02T18:44:39.163Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/90/a1440bfa467a6dd9025ad80f3c239554de28aec49dacfb369fda92871556/langchain_huggingface-1.2.1-py3-none-any.whl", hash = "sha256:0930c216a457d2c8dc7b39a756c39c567f1d88593bfee2c3441f3ae718435f0f", size = 30924, upload-time = "2026-03-02T18:44:37.745Z" },
 ]
 
 [[package]]
@@ -1338,16 +1384,17 @@ wheels = [
 
 [[package]]
 name = "langchain-nvidia-ai-endpoints"
-version = "1.0.3"
+version = "1.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohttp" },
     { name = "filetype" },
     { name = "langchain-core" },
+    { name = "requests" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5a/9e/30814da280f7a79b168f83180f6a0396c166f86a566e56bb9877bf562611/langchain_nvidia_ai_endpoints-1.0.3.tar.gz", hash = "sha256:11c48fd24e4a9d4c86c65bcef943400f4e709497c93254c7dc97c43f68c2be89", size = 46526, upload-time = "2026-01-28T22:04:33.93Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/4b/e417af1b2b7f861f37e26bf4fa4b05cda4052002e3f84a966f0735baf94f/langchain_nvidia_ai_endpoints-1.2.0.tar.gz", hash = "sha256:4bd63b812707ea348a86539001aa9a89b3cba3ee56ade7379247a955e4bfd3eb", size = 53851, upload-time = "2026-03-10T17:55:08.127Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/67/04/c83f61106a245b74de11c1e075c1cc1e70462ece1dd9fc0584ad992a776d/langchain_nvidia_ai_endpoints-1.0.3-py3-none-any.whl", hash = "sha256:e5f170ad0a335637298bb90fb3df119793821e316355f61ab82f0106913eebbf", size = 50130, upload-time = "2026-01-28T22:04:33.065Z" },
+    { url = "https://files.pythonhosted.org/packages/66/e4/186f1a99e4d30bd91c8438d024dc73a71c8f7e0657c7acb6e79658aa19cf/langchain_nvidia_ai_endpoints-1.2.0-py3-none-any.whl", hash = "sha256:c8e075d5b3d31216374af0cfa9e690ab28ada3ebbde34dd6d36fe16a26d883cc", size = 58269, upload-time = "2026-03-10T17:55:06.339Z" },
 ]
 
 [[package]]
@@ -1393,7 +1440,7 @@ wheels = [
 
 [[package]]
 name = "langgraph"
-version = "1.0.7"
+version = "1.0.10"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain-core" },
@@ -1403,9 +1450,9 @@ dependencies = [
     { name = "pydantic" },
     { name = "xxhash" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/72/5b/f72655717c04e33d3b62f21b166dc063d192b53980e9e3be0e2a117f1c9f/langgraph-1.0.7.tar.gz", hash = "sha256:0cfdfee51e6e8cfe503ecc7367c73933437c505b03fa10a85c710975c8182d9a", size = 497098, upload-time = "2026-01-22T16:57:47.303Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/55/92/14df6fefba28c10caf1cb05aa5b8c7bf005838fe32a86d903b6c7cc4018d/langgraph-1.0.10.tar.gz", hash = "sha256:73bd10ee14a8020f31ef07e9cd4c1a70c35cc07b9c2b9cd637509a10d9d51e29", size = 511644, upload-time = "2026-02-27T21:04:38.743Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7e/0e/fe80144e3e4048e5d19ccdb91ac547c1a7dc3da8dbd1443e210048194c14/langgraph-1.0.7-py3-none-any.whl", hash = "sha256:9d68e8f8dd8f3de2fec45f9a06de05766d9b075b78fb03171779893b7a52c4d2", size = 157353, upload-time = "2026-01-22T16:57:45.997Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/60/260e0c04620a37ba8916b712766c341cc5fc685dabc6948c899494bbc2ae/langgraph-1.0.10-py3-none-any.whl", hash = "sha256:7c298bef4f6ea292fcf9824d6088fe41a6727e2904ad6066f240c4095af12247", size = 160920, upload-time = "2026-02-27T21:04:35.932Z" },
 ]
 
 [[package]]
@@ -1423,15 +1470,15 @@ wheels = [
 
 [[package]]
 name = "langgraph-prebuilt"
-version = "1.0.7"
+version = "1.0.8"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain-core" },
     { name = "langgraph-checkpoint" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a7/59/711aecd1a50999456850dc328f3cad72b4372d8218838d8d5326f80cb76f/langgraph_prebuilt-1.0.7.tar.gz", hash = "sha256:38e097e06de810de4d0e028ffc0e432bb56d1fb417620fb1dfdc76c5e03e4bf9", size = 163692, upload-time = "2026-01-22T16:45:22.801Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0d/06/dd61a5c2dce009d1b03b1d56f2a85b3127659fdddf5b3be5d8f1d60820fb/langgraph_prebuilt-1.0.8.tar.gz", hash = "sha256:0cd3cf5473ced8a6cd687cc5294e08d3de57529d8dd14fdc6ae4899549efcf69", size = 164442, upload-time = "2026-02-19T18:14:39.083Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/47/49/5e37abb3f38a17a3487634abc2a5da87c208cc1d14577eb8d7184b25c886/langgraph_prebuilt-1.0.7-py3-none-any.whl", hash = "sha256:e14923516504405bb5edc3977085bc9622c35476b50c1808544490e13871fe7c", size = 35324, upload-time = "2026-01-22T16:45:21.784Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/41/ec966424ad3f2ed3996d24079d3342c8cd6c0bd0653c12b2a917a685ec6c/langgraph_prebuilt-1.0.8-py3-none-any.whl", hash = "sha256:d16a731e591ba4470f3e313a319c7eee7dbc40895bcf15c821f985a3522a7ce0", size = 35648, upload-time = "2026-02-19T18:14:37.611Z" },
 ]
 
 [[package]]
@@ -1592,20 +1639,20 @@ name = "mcp"
 version = "1.26.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyio" },
-    { name = "httpx" },
-    { name = "httpx-sse" },
-    { name = "jsonschema" },
-    { name = "pydantic" },
-    { name = "pydantic-settings" },
-    { name = "pyjwt", extra = ["crypto"] },
-    { name = "python-multipart" },
-    { name = "pywin32", marker = "sys_platform == 'win32'" },
-    { name = "sse-starlette" },
-    { name = "starlette" },
-    { name = "typing-extensions" },
-    { name = "typing-inspection" },
-    { name = "uvicorn", marker = "sys_platform != 'emscripten'" },
+    { name = "anyio", marker = "python_full_version >= '3.12'" },
+    { name = "httpx", marker = "python_full_version >= '3.12'" },
+    { name = "httpx-sse", marker = "python_full_version >= '3.12'" },
+    { name = "jsonschema", marker = "python_full_version >= '3.12'" },
+    { name = "pydantic", marker = "python_full_version >= '3.12'" },
+    { name = "pydantic-settings", marker = "python_full_version >= '3.12'" },
+    { name = "pyjwt", extra = ["crypto"], marker = "python_full_version >= '3.12'" },
+    { name = "python-multipart", marker = "python_full_version >= '3.12'" },
+    { name = "pywin32", marker = "python_full_version >= '3.12' and sys_platform == 'win32'" },
+    { name = "sse-starlette", marker = "python_full_version >= '3.12'" },
+    { name = "starlette", marker = "python_full_version >= '3.12'" },
+    { name = "typing-extensions", marker = "python_full_version >= '3.12'" },
+    { name = "typing-inspection", marker = "python_full_version >= '3.12'" },
+    { name = "uvicorn", marker = "python_full_version >= '3.12' and sys_platform != 'emscripten'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fc/6d/62e76bbb8144d6ed86e202b5edd8a4cb631e7c8130f3f4893c3f90262b10/mcp-1.26.0.tar.gz", hash = "sha256:db6e2ef491eecc1a0d93711a76f28dec2e05999f93afd48795da1c1137142c66", size = 608005, upload-time = "2026-01-24T19:40:32.468Z" }
 wheels = [
@@ -1746,7 +1793,7 @@ name = "multiprocess"
 version = "0.70.18"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "dill" },
+    { name = "dill", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/72/fd/2ae3826f5be24c6ed87266bc4e59c46ea5b059a103f3d7e7eb76a52aeecb/multiprocess-0.70.18.tar.gz", hash = "sha256:f9597128e6b3e67b23956da07cf3d2e5cba79e2f4e0fba8d7903636663ec6d0d", size = 1798503, upload-time = "2025-04-17T03:11:27.742Z" }
 wheels = [
@@ -1770,6 +1817,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
 ]
 
+[[package]]
+name = "narwhals"
+version = "2.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/75/59/81d0f4cad21484083466f278e6b392addd9f4205b48d45b5c8771670ebf8/narwhals-2.17.0.tar.gz", hash = "sha256:ebd5bc95bcfa2f8e89a8ac09e2765a63055162837208e67b42d6eeb6651d5e67", size = 620306, upload-time = "2026-02-23T09:44:34.142Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4b/27/20770bd6bf8fbe1e16f848ba21da9df061f38d2e6483952c29d2bb5d1d8b/narwhals-2.17.0-py3-none-any.whl", hash = "sha256:2ac5307b7c2b275a7d66eeda906b8605e3d7a760951e188dcfff86e8ebe083dd", size = 444897, upload-time = "2026-02-23T09:44:32.006Z" },
+]
+
 [[package]]
 name = "nest-asyncio"
 version = "1.6.0"
@@ -1797,41 +1853,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
 ]
 
-[[package]]
-name = "numpy"
-version = "1.26.4"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.12'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" },
-    { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" },
-    { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" },
-    { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" },
-    { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" },
-    { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" },
-    { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" },
-    { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" },
-    { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" },
-    { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" },
-]
-
 [[package]]
 name = "numpy"
 version = "2.4.1"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.13'",
-    "python_full_version == '3.12.*'",
-]
 sdist = { url = "https://files.pythonhosted.org/packages/24/62/ae72ff66c0f1fd959925b4c11f8c2dea61f47f6acaea75a08512cdfe3fed/numpy-2.4.1.tar.gz", hash = "sha256:a1ceafc5042451a858231588a104093474c6a5c57dcc724841f5c888d237d690", size = 20721320, upload-time = "2026-01-10T06:44:59.619Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/a5/34/2b1bc18424f3ad9af577f6ce23600319968a70575bd7db31ce66731bbef9/numpy-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0cce2a669e3c8ba02ee563c7835f92c153cf02edff1ae05e1823f1dde21b16a5", size = 16944563, upload-time = "2026-01-10T06:42:14.615Z" },
@@ -1890,67 +1915,129 @@ wheels = [
 name = "nvidia-nat"
 version = "1.5.0a20260112"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.13'",
+    "python_full_version == '3.12.*'",
+]
+dependencies = [
+    { name = "aioboto3", marker = "python_full_version >= '3.12'" },
+    { name = "authlib", marker = "python_full_version >= '3.12'" },
+    { name = "click", marker = "python_full_version >= '3.12'" },
+    { name = "colorama", marker = "python_full_version >= '3.12'" },
+    { name = "datasets", marker = "python_full_version >= '3.12'" },
+    { name = "expandvars", marker = "python_full_version >= '3.12'" },
+    { name = "fastapi", marker = "python_full_version >= '3.12'" },
+    { name = "httpx", marker = "python_full_version >= '3.12'" },
+    { name = "jinja2", marker = "python_full_version >= '3.12'" },
+    { name = "jsonpath-ng", marker = "python_full_version >= '3.12'" },
+    { name = "mcp", marker = "python_full_version >= '3.12'" },
+    { name = "nest-asyncio2", marker = "python_full_version >= '3.12'" },
+    { name = "networkx", marker = "python_full_version >= '3.12'" },
+    { name = "numpy", marker = "python_full_version >= '3.12'" },
+    { name = "openinference-semantic-conventions", marker = "python_full_version >= '3.12'" },
+    { name = "openpyxl", marker = "python_full_version >= '3.12'" },
+    { name = "optuna", marker = "python_full_version >= '3.12'" },
+    { name = "pip", marker = "python_full_version >= '3.12'" },
+    { name = "pkce", marker = "python_full_version >= '3.12'" },
+    { name = "pkginfo", marker = "python_full_version >= '3.12'" },
+    { name = "platformdirs", marker = "python_full_version >= '3.12'" },
+    { name = "pydantic", marker = "python_full_version >= '3.12'" },
+    { name = "pymilvus", marker = "python_full_version >= '3.12'" },
+    { name = "python-dotenv", marker = "python_full_version >= '3.12'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.12'" },
+    { name = "ragas", marker = "python_full_version >= '3.12'" },
+    { name = "rich", marker = "python_full_version >= '3.12'" },
+    { name = "tabulate", marker = "python_full_version >= '3.12'" },
+    { name = "uvicorn", extra = ["standard"], marker = "python_full_version >= '3.12'" },
+    { name = "wikipedia", marker = "python_full_version >= '3.12'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/e0/c7426ed15d1eb528eb0c9135efb66da033b0a56b63f42d4099b2fe05fd24/nvidia_nat-1.5.0a20260112-py3-none-any.whl", hash = "sha256:3d05c948efe0e3ab58e3d7a58ab90510d1a1128eb678810e1ef62efc5dfc9681", size = 950027, upload-time = "2026-01-12T10:46:15.705Z" },
+]
+
+[[package]]
+name = "nvidia-nat"
+version = "1.5.0a20260223"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.12'",
+]
+dependencies = [
+    { name = "nvidia-nat-core", marker = "python_full_version < '3.12'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/07/7e/6e984de1473e8264d5cf6598d14f1c01f6dabf22f2fedda5f8e97140ae05/nvidia_nat-1.5.0a20260223-py3-none-any.whl", hash = "sha256:137461b310af90ed12e0496bac90ddb62297b00287707c80df48208437e2502a", size = 52704, upload-time = "2026-02-23T10:04:57.955Z" },
+]
+
+[[package]]
+name = "nvidia-nat-core"
+version = "1.5.0a20260223"
+source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aioboto3" },
     { name = "authlib" },
     { name = "click" },
     { name = "colorama" },
-    { name = "datasets" },
     { name = "expandvars" },
     { name = "fastapi" },
+    { name = "flask" },
     { name = "httpx" },
     { name = "jinja2" },
     { name = "jsonpath-ng" },
-    { name = "mcp" },
     { name = "nest-asyncio2" },
     { name = "networkx" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "openinference-semantic-conventions" },
-    { name = "openpyxl" },
     { name = "optuna" },
+    { name = "pandas" },
     { name = "pip" },
     { name = "pkce" },
     { name = "pkginfo" },
     { name = "platformdirs" },
+    { name = "plotly" },
     { name = "pydantic" },
+    { name = "pyjwt" },
     { name = "pymilvus" },
     { name = "python-dotenv" },
+    { name = "python-multipart" },
     { name = "pyyaml" },
-    { name = "ragas" },
     { name = "rich" },
     { name = "tabulate" },
+    { name = "urllib3" },
     { name = "uvicorn", extra = ["standard"] },
     { name = "wikipedia" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a1/e0/c7426ed15d1eb528eb0c9135efb66da033b0a56b63f42d4099b2fe05fd24/nvidia_nat-1.5.0a20260112-py3-none-any.whl", hash = "sha256:3d05c948efe0e3ab58e3d7a58ab90510d1a1128eb678810e1ef62efc5dfc9681", size = 950027, upload-time = "2026-01-12T10:46:15.705Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/23/b043caadf08a72e4eb2c95bb65fed5083e7bf40af48ea92305fabf3b2820/nvidia_nat_core-1.5.0a20260223-py3-none-any.whl", hash = "sha256:5262cae48d66efbd53f98134e7820759121a1b4398b339e1d14d307ed2195a21", size = 762259, upload-time = "2026-02-23T10:01:44.692Z" },
 ]
 
 [[package]]
 name = "nvidia-nat-langchain"
-version = "1.5.0a20260112"
+version = "1.5.0a20260223"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain" },
     { name = "langchain-aws" },
     { name = "langchain-classic" },
+    { name = "langchain-community" },
     { name = "langchain-core" },
+    { name = "langchain-huggingface" },
     { name = "langchain-litellm" },
     { name = "langchain-milvus" },
     { name = "langchain-nvidia-ai-endpoints" },
     { name = "langchain-openai" },
     { name = "langchain-tavily" },
     { name = "langgraph" },
-    { name = "nvidia-nat" },
+    { name = "nvidia-nat-core" },
+    { name = "openevals" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/84/2a/7a2cd2e7444ef03bdebb6c9637e63a9eee33da84e7c23baceb18f83f2250/nvidia_nat_langchain-1.5.0a20260112-py3-none-any.whl", hash = "sha256:cba64b0192d589f325cbbc2de60da8eb514efc27b49157b3eafca204ab989a55", size = 60925, upload-time = "2026-01-12T10:43:35.414Z" },
+    { url = "https://files.pythonhosted.org/packages/98/65/e565dc570ecfdf4c4ca34d0d873794d33fdf11a8889a9b2b1a78ad15b589/nvidia_nat_langchain-1.5.0a20260223-py3-none-any.whl", hash = "sha256:87c70294c1f38fcd09252a79dd5e9038aee73326678dd9d7519b8064b914d7e4", size = 160480, upload-time = "2026-02-23T10:06:08.62Z" },
 ]
 
 [[package]]
 name = "nvidia-rag"
-version = "2.4.0.dev0"
+version = "2.5.0.dev0"
 source = { editable = "../../" }
 dependencies = [
     { name = "anyio" },
@@ -2016,16 +2103,16 @@ requires-dist = [
     { name = "langchain-elasticsearch", marker = "extra == 'all'", specifier = ">=0.3" },
     { name = "langchain-elasticsearch", marker = "extra == 'elasticsearch'", specifier = ">=0.3" },
     { name = "langchain-milvus", specifier = ">=0.3.0" },
-    { name = "langchain-nvidia-ai-endpoints", specifier = ">=1.0.3" },
+    { name = "langchain-nvidia-ai-endpoints", specifier = ">=1.2.0" },
     { name = "langchain-openai", marker = "extra == 'all'", specifier = ">=0.2" },
     { name = "langchain-openai", marker = "extra == 'ingest'", specifier = ">=0.2" },
     { name = "langchain-openai", marker = "extra == 'rag'", specifier = ">=0.2" },
     { name = "lark", specifier = ">=1.2.2" },
     { name = "minio", specifier = ">=7.2,<8.0" },
-    { name = "nv-ingest-api", marker = "extra == 'all'", specifier = "==26.1.1" },
-    { name = "nv-ingest-api", marker = "extra == 'ingest'", specifier = "==26.1.1" },
-    { name = "nv-ingest-client", marker = "extra == 'all'", specifier = "==26.1.1" },
-    { name = "nv-ingest-client", marker = "extra == 'ingest'", specifier = "==26.1.1" },
+    { name = "nv-ingest-api", marker = "extra == 'all'", specifier = "==26.1.2" },
+    { name = "nv-ingest-api", marker = "extra == 'ingest'", specifier = "==26.1.2" },
+    { name = "nv-ingest-client", marker = "extra == 'all'", specifier = "==26.1.2" },
+    { name = "nv-ingest-client", marker = "extra == 'ingest'", specifier = "==26.1.2" },
     { name = "opentelemetry-api", marker = "extra == 'all'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-api", marker = "extra == 'ingest'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-api", marker = "extra == 'rag'", specifier = ">=1.29,<2.0" },
@@ -2094,8 +2181,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "coloredlogs" },
     { name = "flatbuffers" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "packaging" },
     { name = "protobuf" },
     { name = "sympy" },
@@ -2129,6 +2215,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b5/df/c306f7375d42bafb379934c2df4c2fa3964656c8c782bac75ee10c102818/openai-2.15.0-py3-none-any.whl", hash = "sha256:6ae23b932cd7230f7244e52954daa6602716d6b9bf235401a107af731baea6c3", size = 1067879, upload-time = "2026-01-09T22:10:06.446Z" },
 ]
 
+[[package]]
+name = "openevals"
+version = "0.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "langchain" },
+    { name = "langchain-openai" },
+    { name = "langsmith" },
+    { name = "rich" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d4/37/31e23ef661fa4c3c6a3c979afd884b30205512b4dde680b36d5909550500/openevals-0.1.3.tar.gz", hash = "sha256:9b00df1a7738464676aa887d4d950b77d3ef7024f6e8a54be3a83c82f485ea65", size = 100828, upload-time = "2025-12-18T04:09:03.034Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0d/68/162b0d273ffef5b0ad557ebccb790725bf94d78969702324dd5726828cf0/openevals-0.1.3-py3-none-any.whl", hash = "sha256:aed448df0cfdded732e24cda026eda065435a71ffb8c406a3ce73e590156d9f9", size = 67802, upload-time = "2025-12-18T04:09:01.59Z" },
+]
+
 [[package]]
 name = "openinference-semantic-conventions"
 version = "0.1.25"
@@ -2143,7 +2244,7 @@ name = "openpyxl"
 version = "3.1.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "et-xmlfile" },
+    { name = "et-xmlfile", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
 wheels = [
@@ -2392,8 +2493,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "alembic" },
     { name = "colorlog" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "packaging" },
     { name = "pyyaml" },
     { name = "sqlalchemy" },
@@ -2506,8 +2606,7 @@ name = "pandas"
 version = "2.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "python-dateutil" },
     { name = "pytz" },
     { name = "tzdata" },
@@ -2668,6 +2767,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl", hash = "sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31", size = 18731, upload-time = "2025-12-05T13:52:56.823Z" },
 ]
 
+[[package]]
+name = "plotly"
+version = "6.6.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "narwhals" },
+    { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/fb/41efe84970cfddefd4ccf025e2cbfafe780004555f583e93dba3dac2cdef/plotly-6.6.0.tar.gz", hash = "sha256:b897f15f3b02028d69f755f236be890ba950d0a42d7dfc619b44e2d8cea8748c", size = 7027956, upload-time = "2026-03-02T21:10:25.321Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/52/d2/c6e44dba74f17c6216ce1b56044a9b93a929f1c2d5bdaff892512b260f5e/plotly-6.6.0-py3-none-any.whl", hash = "sha256:8d6daf0f87412e0c0bfe72e809d615217ab57cc715899a1e5145135a7800d1d0", size = 9910315, upload-time = "2026-03-02T21:10:18.131Z" },
+]
+
 [[package]]
 name = "ply"
 version = "3.11"
@@ -2963,7 +3075,7 @@ wheels = [
 
 [package.optional-dependencies]
 crypto = [
-    { name = "cryptography" },
+    { name = "cryptography", marker = "python_full_version >= '3.12'" },
 ]
 
 [[package]]
@@ -2994,8 +3106,7 @@ name = "pymilvus-model"
 version = "0.3.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "onnxruntime" },
     { name = "protobuf" },
     { name = "scipy" },
@@ -3142,7 +3253,8 @@ source = { editable = "." }
 dependencies = [
     { name = "langchain-classic" },
     { name = "langgraph" },
-    { name = "nvidia-nat" },
+    { name = "nvidia-nat", version = "1.5.0a20260112", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "nvidia-nat", version = "1.5.0a20260223", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
     { name = "nvidia-nat-langchain" },
     { name = "nvidia-rag", extra = ["rag"] },
     { name = "transformers" },
@@ -3151,7 +3263,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
     { name = "langchain-classic" },
-    { name = "langgraph", specifier = ">=1.0.7" },
+    { name = "langgraph", specifier = ">=1.0.8" },
     { name = "nvidia-nat", specifier = ">=1.5.0a0,<2.0" },
     { name = "nvidia-nat-langchain", specifier = ">=1.5.0a0,<2.0" },
     { name = "nvidia-rag", extras = ["rag"], editable = "../../" },
@@ -3163,19 +3275,18 @@ name = "ragas"
 version = "0.2.15"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "appdirs" },
-    { name = "datasets" },
-    { name = "diskcache" },
-    { name = "langchain" },
-    { name = "langchain-community" },
-    { name = "langchain-core" },
-    { name = "langchain-openai" },
-    { name = "nest-asyncio" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
-    { name = "openai" },
-    { name = "pydantic" },
-    { name = "tiktoken" },
+    { name = "appdirs", marker = "python_full_version >= '3.12'" },
+    { name = "datasets", marker = "python_full_version >= '3.12'" },
+    { name = "diskcache", marker = "python_full_version >= '3.12'" },
+    { name = "langchain", marker = "python_full_version >= '3.12'" },
+    { name = "langchain-community", marker = "python_full_version >= '3.12'" },
+    { name = "langchain-core", marker = "python_full_version >= '3.12'" },
+    { name = "langchain-openai", marker = "python_full_version >= '3.12'" },
+    { name = "nest-asyncio", marker = "python_full_version >= '3.12'" },
+    { name = "numpy", marker = "python_full_version >= '3.12'" },
+    { name = "openai", marker = "python_full_version >= '3.12'" },
+    { name = "pydantic", marker = "python_full_version >= '3.12'" },
+    { name = "tiktoken", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6c/0f/04fddfa94744b1c3d8901aed8832a6b4193cc8e4886881f1bb88ff055350/ragas-0.2.15.tar.gz", hash = "sha256:2d0cd77b315a9c9c02ceb0a19ca8a48e82e1d02416587a2944ea51e6e327cd7b", size = 40867766, upload-time = "2025-04-24T16:39:28.734Z" }
 wheels = [
@@ -3438,8 +3549,7 @@ name = "scipy"
 version = "1.17.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/56/3e/9cca699f3486ce6bc12ff46dc2031f1ec8eb9ccc9a320fdaf925f1417426/scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e", size = 30396830, upload-time = "2026-01-10T21:34:23.009Z" }
 wheels = [
@@ -3569,7 +3679,7 @@ name = "sse-starlette"
 version = "3.0.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyio" },
+    { name = "anyio", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/db/3c/fa6517610dc641262b77cc7bf994ecd17465812c1b0585fe33e11be758ab/sse_starlette-3.0.3.tar.gz", hash = "sha256:88cfb08747e16200ea990c8ca876b03910a23b547ab3bd764c0d8eb81019b971", size = 21943, upload-time = "2025-10-30T18:44:20.117Z" }
 wheels = [
@@ -3703,8 +3813,7 @@ version = "5.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "huggingface-hub" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "packaging" },
     { name = "pyyaml" },
     { name = "regex" },
@@ -3976,6 +4085,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
 ]
 
+[[package]]
+name = "werkzeug"
+version = "3.1.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/61/f1/ee81806690a87dab5f5653c1f146c92bc066d7f4cebc603ef88eb9e13957/werkzeug-3.1.6.tar.gz", hash = "sha256:210c6bede5a420a913956b4791a7f4d6843a43b6fcee4dfa08a65e93007d0d25", size = 864736, upload-time = "2026-02-19T15:17:18.884Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4d/ec/d58832f89ede95652fd01f4f24236af7d32b70cab2196dfcc2d2fd13c5c2/werkzeug-3.1.6-py3-none-any.whl", hash = "sha256:7ddf3357bb9564e407607f988f683d72038551200c704012bb9a4c523d42f131", size = 225166, upload-time = "2026-02-19T15:17:17.475Z" },
+]
+
 [[package]]
 name = "wikipedia"
 version = "1.4.0"
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
index 1c0a6961d..c6f3e3b01 100644
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -94,7 +94,7 @@ RUN if [ "$DOWNLOAD_LEGAL_COMPLIANCE" = "true" ] && [ -d /legal ]; then \
 
 # Production stage - NVIDIA distroless (pre-approved)
 # Updated to latest version to address CVE-2025-9230 (libssl3)
-FROM nvcr.io/nvidia/distroless/node:24-v3.1.3
+FROM nvcr.io/nvidia/distroless/node:24-v4.0.2
 
 # Copy built application and config for production preview
 WORKDIR /app/frontend
diff --git a/frontend/package.json b/frontend/package.json
index fe77b7146..bdb3500f5 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -51,5 +51,12 @@
   },
   "resolutions": {
     "@kui/foundations": "./src/assets/kui-foundations-react-external-0.504.1.tgz"
+  },
+  "pnpm": {
+    "overrides": {
+      "rollup": ">=4.59.0",
+      "minimatch@3.1.2": "3.1.4",
+      "minimatch@9.0.5": "9.0.7"
+    }
   }
 }
\ No newline at end of file
diff --git a/frontend/pnpm-lock.yaml b/frontend/pnpm-lock.yaml
index ea8207293..cc1a633fc 100644
--- a/frontend/pnpm-lock.yaml
+++ b/frontend/pnpm-lock.yaml
@@ -6,6 +6,9 @@ settings:
 
 overrides:
   '@kui/foundations': ./src/assets/kui-foundations-react-external-0.504.1.tgz
+  rollup: '>=4.59.0'
+  minimatch@3.1.2: 3.1.4
+  minimatch@9.0.5: 9.0.7
 
 importers:
 
@@ -1233,113 +1236,141 @@ packages:
   '@rolldown/pluginutils@1.0.0-beta.27':
     resolution: {integrity: sha512-+d0F4MKMCbeVUJwG96uQ4SgAznZNSq93I3V+9NHA4OpvqG8mRCpGdKmK8l/dl02h2CCDHwW2FqilnTyDcAnqjA==}
 
-  '@rollup/rollup-android-arm-eabi@4.53.3':
-    resolution: {integrity: sha512-mRSi+4cBjrRLoaal2PnqH82Wqyb+d3HsPUN/W+WslCXsZsyHa9ZeQQX/pQsZaVIWDkPcpV6jJ+3KLbTbgnwv8w==}
+  '@rollup/rollup-android-arm-eabi@4.59.0':
+    resolution: {integrity: sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg==}
     cpu: [arm]
     os: [android]
 
-  '@rollup/rollup-android-arm64@4.53.3':
-    resolution: {integrity: sha512-CbDGaMpdE9sh7sCmTrTUyllhrg65t6SwhjlMJsLr+J8YjFuPmCEjbBSx4Z/e4SmDyH3aB5hGaJUP2ltV/vcs4w==}
+  '@rollup/rollup-android-arm64@4.59.0':
+    resolution: {integrity: sha512-hZ+Zxj3SySm4A/DylsDKZAeVg0mvi++0PYVceVyX7hemkw7OreKdCvW2oQ3T1FMZvCaQXqOTHb8qmBShoqk69Q==}
     cpu: [arm64]
     os: [android]
 
-  '@rollup/rollup-darwin-arm64@4.53.3':
-    resolution: {integrity: sha512-Nr7SlQeqIBpOV6BHHGZgYBuSdanCXuw09hon14MGOLGmXAFYjx1wNvquVPmpZnl0tLjg25dEdr4IQ6GgyToCUA==}
+  '@rollup/rollup-darwin-arm64@4.59.0':
+    resolution: {integrity: sha512-W2Psnbh1J8ZJw0xKAd8zdNgF9HRLkdWwwdWqubSVk0pUuQkoHnv7rx4GiF9rT4t5DIZGAsConRE3AxCdJ4m8rg==}
     cpu: [arm64]
     os: [darwin]
 
-  '@rollup/rollup-darwin-x64@4.53.3':
-    resolution: {integrity: sha512-DZ8N4CSNfl965CmPktJ8oBnfYr3F8dTTNBQkRlffnUarJ2ohudQD17sZBa097J8xhQ26AwhHJ5mvUyQW8ddTsQ==}
+  '@rollup/rollup-darwin-x64@4.59.0':
+    resolution: {integrity: sha512-ZW2KkwlS4lwTv7ZVsYDiARfFCnSGhzYPdiOU4IM2fDbL+QGlyAbjgSFuqNRbSthybLbIJ915UtZBtmuLrQAT/w==}
     cpu: [x64]
     os: [darwin]
 
-  '@rollup/rollup-freebsd-arm64@4.53.3':
-    resolution: {integrity: sha512-yMTrCrK92aGyi7GuDNtGn2sNW+Gdb4vErx4t3Gv/Tr+1zRb8ax4z8GWVRfr3Jw8zJWvpGHNpss3vVlbF58DZ4w==}
+  '@rollup/rollup-freebsd-arm64@4.59.0':
+    resolution: {integrity: sha512-EsKaJ5ytAu9jI3lonzn3BgG8iRBjV4LxZexygcQbpiU0wU0ATxhNVEpXKfUa0pS05gTcSDMKpn3Sx+QB9RlTTA==}
     cpu: [arm64]
     os: [freebsd]
 
-  '@rollup/rollup-freebsd-x64@4.53.3':
-    resolution: {integrity: sha512-lMfF8X7QhdQzseM6XaX0vbno2m3hlyZFhwcndRMw8fbAGUGL3WFMBdK0hbUBIUYcEcMhVLr1SIamDeuLBnXS+Q==}
+  '@rollup/rollup-freebsd-x64@4.59.0':
+    resolution: {integrity: sha512-d3DuZi2KzTMjImrxoHIAODUZYoUUMsuUiY4SRRcJy6NJoZ6iIqWnJu9IScV9jXysyGMVuW+KNzZvBLOcpdl3Vg==}
     cpu: [x64]
     os: [freebsd]
 
-  '@rollup/rollup-linux-arm-gnueabihf@4.53.3':
-    resolution: {integrity: sha512-k9oD15soC/Ln6d2Wv/JOFPzZXIAIFLp6B+i14KhxAfnq76ajt0EhYc5YPeX6W1xJkAdItcVT+JhKl1QZh44/qw==}
+  '@rollup/rollup-linux-arm-gnueabihf@4.59.0':
+    resolution: {integrity: sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==}
     cpu: [arm]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-arm-musleabihf@4.53.3':
-    resolution: {integrity: sha512-vTNlKq+N6CK/8UktsrFuc+/7NlEYVxgaEgRXVUVK258Z5ymho29skzW1sutgYjqNnquGwVUObAaxae8rZ6YMhg==}
+  '@rollup/rollup-linux-arm-musleabihf@4.59.0':
+    resolution: {integrity: sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==}
     cpu: [arm]
     os: [linux]
+    libc: [musl]
 
-  '@rollup/rollup-linux-arm64-gnu@4.53.3':
-    resolution: {integrity: sha512-RGrFLWgMhSxRs/EWJMIFM1O5Mzuz3Xy3/mnxJp/5cVhZ2XoCAxJnmNsEyeMJtpK+wu0FJFWz+QF4mjCA7AUQ3w==}
+  '@rollup/rollup-linux-arm64-gnu@4.59.0':
+    resolution: {integrity: sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==}
     cpu: [arm64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-arm64-musl@4.53.3':
-    resolution: {integrity: sha512-kASyvfBEWYPEwe0Qv4nfu6pNkITLTb32p4yTgzFCocHnJLAHs+9LjUu9ONIhvfT/5lv4YS5muBHyuV84epBo/A==}
+  '@rollup/rollup-linux-arm64-musl@4.59.0':
+    resolution: {integrity: sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==}
     cpu: [arm64]
     os: [linux]
+    libc: [musl]
 
-  '@rollup/rollup-linux-loong64-gnu@4.53.3':
-    resolution: {integrity: sha512-JiuKcp2teLJwQ7vkJ95EwESWkNRFJD7TQgYmCnrPtlu50b4XvT5MOmurWNrCj3IFdyjBQ5p9vnrX4JM6I8OE7g==}
+  '@rollup/rollup-linux-loong64-gnu@4.59.0':
+    resolution: {integrity: sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==}
     cpu: [loong64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-ppc64-gnu@4.53.3':
-    resolution: {integrity: sha512-EoGSa8nd6d3T7zLuqdojxC20oBfNT8nexBbB/rkxgKj5T5vhpAQKKnD+h3UkoMuTyXkP5jTjK/ccNRmQrPNDuw==}
+  '@rollup/rollup-linux-loong64-musl@4.59.0':
+    resolution: {integrity: sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==}
+    cpu: [loong64]
+    os: [linux]
+    libc: [musl]
+
+  '@rollup/rollup-linux-ppc64-gnu@4.59.0':
+    resolution: {integrity: sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==}
     cpu: [ppc64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-riscv64-gnu@4.53.3':
-    resolution: {integrity: sha512-4s+Wped2IHXHPnAEbIB0YWBv7SDohqxobiiPA1FIWZpX+w9o2i4LezzH/NkFUl8LRci/8udci6cLq+jJQlh+0g==}
+  '@rollup/rollup-linux-ppc64-musl@4.59.0':
+    resolution: {integrity: sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==}
+    cpu: [ppc64]
+    os: [linux]
+    libc: [musl]
+
+  '@rollup/rollup-linux-riscv64-gnu@4.59.0':
+    resolution: {integrity: sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==}
     cpu: [riscv64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-riscv64-musl@4.53.3':
-    resolution: {integrity: sha512-68k2g7+0vs2u9CxDt5ktXTngsxOQkSEV/xBbwlqYcUrAVh6P9EgMZvFsnHy4SEiUl46Xf0IObWVbMvPrr2gw8A==}
+  '@rollup/rollup-linux-riscv64-musl@4.59.0':
+    resolution: {integrity: sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==}
     cpu: [riscv64]
     os: [linux]
+    libc: [musl]
 
-  '@rollup/rollup-linux-s390x-gnu@4.53.3':
-    resolution: {integrity: sha512-VYsFMpULAz87ZW6BVYw3I6sWesGpsP9OPcyKe8ofdg9LHxSbRMd7zrVrr5xi/3kMZtpWL/wC+UIJWJYVX5uTKg==}
+  '@rollup/rollup-linux-s390x-gnu@4.59.0':
+    resolution: {integrity: sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==}
     cpu: [s390x]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-x64-gnu@4.53.3':
-    resolution: {integrity: sha512-3EhFi1FU6YL8HTUJZ51imGJWEX//ajQPfqWLI3BQq4TlvHy4X0MOr5q3D2Zof/ka0d5FNdPwZXm3Yyib/UEd+w==}
+  '@rollup/rollup-linux-x64-gnu@4.59.0':
+    resolution: {integrity: sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==}
     cpu: [x64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-x64-musl@4.53.3':
-    resolution: {integrity: sha512-eoROhjcc6HbZCJr+tvVT8X4fW3/5g/WkGvvmwz/88sDtSJzO7r/blvoBDgISDiCjDRZmHpwud7h+6Q9JxFwq1Q==}
+  '@rollup/rollup-linux-x64-musl@4.59.0':
+    resolution: {integrity: sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==}
     cpu: [x64]
     os: [linux]
+    libc: [musl]
+
+  '@rollup/rollup-openbsd-x64@4.59.0':
+    resolution: {integrity: sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==}
+    cpu: [x64]
+    os: [openbsd]
 
-  '@rollup/rollup-openharmony-arm64@4.53.3':
-    resolution: {integrity: sha512-OueLAWgrNSPGAdUdIjSWXw+u/02BRTcnfw9PN41D2vq/JSEPnJnVuBgw18VkN8wcd4fjUs+jFHVM4t9+kBSNLw==}
+  '@rollup/rollup-openharmony-arm64@4.59.0':
+    resolution: {integrity: sha512-tt9KBJqaqp5i5HUZzoafHZX8b5Q2Fe7UjYERADll83O4fGqJ49O1FsL6LpdzVFQcpwvnyd0i+K/VSwu/o/nWlA==}
     cpu: [arm64]
     os: [openharmony]
 
-  '@rollup/rollup-win32-arm64-msvc@4.53.3':
-    resolution: {integrity: sha512-GOFuKpsxR/whszbF/bzydebLiXIHSgsEUp6M0JI8dWvi+fFa1TD6YQa4aSZHtpmh2/uAlj/Dy+nmby3TJ3pkTw==}
+  '@rollup/rollup-win32-arm64-msvc@4.59.0':
+    resolution: {integrity: sha512-V5B6mG7OrGTwnxaNUzZTDTjDS7F75PO1ae6MJYdiMu60sq0CqN5CVeVsbhPxalupvTX8gXVSU9gq+Rx1/hvu6A==}
     cpu: [arm64]
     os: [win32]
 
-  '@rollup/rollup-win32-ia32-msvc@4.53.3':
-    resolution: {integrity: sha512-iah+THLcBJdpfZ1TstDFbKNznlzoxa8fmnFYK4V67HvmuNYkVdAywJSoteUszvBQ9/HqN2+9AZghbajMsFT+oA==}
+  '@rollup/rollup-win32-ia32-msvc@4.59.0':
+    resolution: {integrity: sha512-UKFMHPuM9R0iBegwzKF4y0C4J9u8C6MEJgFuXTBerMk7EJ92GFVFYBfOZaSGLu6COf7FxpQNqhNS4c4icUPqxA==}
     cpu: [ia32]
     os: [win32]
 
-  '@rollup/rollup-win32-x64-gnu@4.53.3':
-    resolution: {integrity: sha512-J9QDiOIZlZLdcot5NXEepDkstocktoVjkaKUtqzgzpt2yWjGlbYiKyp05rWwk4nypbYUNoFAztEgixoLaSETkg==}
+  '@rollup/rollup-win32-x64-gnu@4.59.0':
+    resolution: {integrity: sha512-laBkYlSS1n2L8fSo1thDNGrCTQMmxjYY5G0WFWjFFYZkKPjsMBsgJfGf4TLxXrF6RyhI60L8TMOjBMvXiTcxeA==}
     cpu: [x64]
     os: [win32]
 
-  '@rollup/rollup-win32-x64-msvc@4.53.3':
-    resolution: {integrity: sha512-UhTd8u31dXadv0MopwGgNOBpUVROFKWVQgAg5N1ESyCz8AuBcMqm4AuTjrwgQKGDfoFuz02EuMRHQIw/frmYKQ==}
+  '@rollup/rollup-win32-x64-msvc@4.59.0':
+    resolution: {integrity: sha512-2HRCml6OztYXyJXAvdDXPKcawukWY2GpR5/nxKp4iBgiO3wcoEGkAaqctIbZcNB6KlUQBIqt8VYkNSj2397EfA==}
     cpu: [x64]
     os: [win32]
 
@@ -1381,24 +1412,28 @@ packages:
     engines: {node: '>= 10'}
     cpu: [arm64]
     os: [linux]
+    libc: [glibc]
 
   '@tailwindcss/oxide-linux-arm64-musl@4.1.17':
     resolution: {integrity: sha512-HvZLfGr42i5anKtIeQzxdkw/wPqIbpeZqe7vd3V9vI3RQxe3xU1fLjss0TjyhxWcBaipk7NYwSrwTwK1hJARMg==}
     engines: {node: '>= 10'}
     cpu: [arm64]
     os: [linux]
+    libc: [musl]
 
   '@tailwindcss/oxide-linux-x64-gnu@4.1.17':
     resolution: {integrity: sha512-M3XZuORCGB7VPOEDH+nzpJ21XPvK5PyjlkSFkFziNHGLc5d6g3di2McAAblmaSUNl8IOmzYwLx9NsE7bplNkwQ==}
     engines: {node: '>= 10'}
     cpu: [x64]
     os: [linux]
+    libc: [glibc]
 
   '@tailwindcss/oxide-linux-x64-musl@4.1.17':
     resolution: {integrity: sha512-k7f+pf9eXLEey4pBlw+8dgfJHY4PZ5qOUFDyNf7SI6lHjQ9Zt7+NcscjpwdCEbYi6FI5c2KDTDWyf2iHcCSyyQ==}
     engines: {node: '>= 10'}
     cpu: [x64]
     os: [linux]
+    libc: [musl]
 
   '@tailwindcss/oxide-wasm32-wasi@4.1.17':
     resolution: {integrity: sha512-cEytGqSSoy7zK4JRWiTCx43FsKP/zGr0CsuMawhH67ONlH+T79VteQeJQRO/X7L0juEUA8ZyuYikcRBf0vsxhg==}
@@ -1677,6 +1712,10 @@ packages:
   balanced-match@1.0.2:
     resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==}
 
+  balanced-match@4.0.4:
+    resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==}
+    engines: {node: 18 || 20 || >=22}
+
   baseline-browser-mapping@2.9.6:
     resolution: {integrity: sha512-v9BVVpOTLB59C9E7aSnmIF8h7qRsFpx+A2nugVMTszEOMcfjlZMsXRm4LF23I3Z9AJxc8ANpIvzbzONoX9VJlg==}
     hasBin: true
@@ -1684,8 +1723,9 @@ packages:
   brace-expansion@1.1.12:
     resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==}
 
-  brace-expansion@2.0.2:
-    resolution: {integrity: sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==}
+  brace-expansion@5.0.4:
+    resolution: {integrity: sha512-h+DEnpVvxmfVefa4jFbCf5HdH5YMDXRsmKflpf1pILZWRFlTbJpxeU55nJl4Smt5HQaGzg1o6RHFPJaOqnmBDg==}
+    engines: {node: 18 || 20 || >=22}
 
   browserslist@4.28.1:
     resolution: {integrity: sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==}
@@ -2135,24 +2175,28 @@ packages:
     engines: {node: '>= 12.0.0'}
     cpu: [arm64]
     os: [linux]
+    libc: [glibc]
 
   lightningcss-linux-arm64-musl@1.30.2:
     resolution: {integrity: sha512-5Vh9dGeblpTxWHpOx8iauV02popZDsCYMPIgiuw97OJ5uaDsL86cnqSFs5LZkG3ghHoX5isLgWzMs+eD1YzrnA==}
     engines: {node: '>= 12.0.0'}
     cpu: [arm64]
     os: [linux]
+    libc: [musl]
 
   lightningcss-linux-x64-gnu@1.30.2:
     resolution: {integrity: sha512-Cfd46gdmj1vQ+lR6VRTTadNHu6ALuw2pKR9lYq4FnhvgBc4zWY1EtZcAc6EffShbb1MFrIPfLDXD6Xprbnni4w==}
     engines: {node: '>= 12.0.0'}
     cpu: [x64]
     os: [linux]
+    libc: [glibc]
 
   lightningcss-linux-x64-musl@1.30.2:
     resolution: {integrity: sha512-XJaLUUFXb6/QG2lGIW6aIk6jKdtjtcffUT0NKvIqhSBY3hh9Ch+1LCeH80dR9q9LBjG3ewbDjnumefsLsP6aiA==}
     engines: {node: '>= 12.0.0'}
     cpu: [x64]
     os: [linux]
+    libc: [musl]
 
   lightningcss-win32-arm64-msvc@1.30.2:
     resolution: {integrity: sha512-FZn+vaj7zLv//D/192WFFVA0RgHawIcHqLX9xuWiQt7P0PtdFEVaxgF9rjM/IRYHQXNnk61/H/gb2Ei+kUQ4xQ==}
@@ -2214,11 +2258,11 @@ packages:
     resolution: {integrity: sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==}
     engines: {node: '>=4'}
 
-  minimatch@3.1.2:
-    resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==}
+  minimatch@3.1.4:
+    resolution: {integrity: sha512-twmL+S8+7yIsE9wsqgzU3E8/LumN3M3QELrBZ20OdmQ9jB2JvW5oZtBEmft84k/Gs5CG9mqtWc6Y9vW+JEzGxw==}
 
-  minimatch@9.0.5:
-    resolution: {integrity: sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==}
+  minimatch@9.0.7:
+    resolution: {integrity: sha512-MOwgjc8tfrpn5QQEvjijjmDVtMw2oL88ugTevzxQnzRLm6l3fVEF2gzU0kYeYYKD8C66+IdGX6peJ4MyUlUnPg==}
     engines: {node: '>=16 || 14 >=14.17'}
 
   minipass@7.1.2:
@@ -2400,8 +2444,8 @@ packages:
     resolution: {integrity: sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==}
     engines: {node: '>=4'}
 
-  rollup@4.53.3:
-    resolution: {integrity: sha512-w8GmOxZfBmKknvdXU1sdM9NHcoQejwF/4mNgj2JuEEdRaHwwF12K7e9eXn1nLZ07ad+du76mkVsyeb2rKGllsA==}
+  rollup@4.59.0:
+    resolution: {integrity: sha512-2oMpl67a3zCH9H79LeMcbDhXW/UmWG/y2zuqnF2jQq5uq9TbM9TVyXvA4+t+ne2IIkBdrLpAaRQAvo7YI/Yyeg==}
     engines: {node: '>=18.0.0', npm: '>=8.0.0'}
     hasBin: true
 
@@ -3031,7 +3075,7 @@ snapshots:
     dependencies:
       '@eslint/object-schema': 2.1.7
       debug: 4.4.3
-      minimatch: 3.1.2
+      minimatch: 3.1.4
     transitivePeerDependencies:
       - supports-color
 
@@ -3052,7 +3096,7 @@ snapshots:
       ignore: 5.3.2
       import-fresh: 3.3.1
       js-yaml: 4.1.1
-      minimatch: 3.1.2
+      minimatch: 3.1.4
       strip-json-comments: 3.1.1
     transitivePeerDependencies:
       - supports-color
@@ -3910,70 +3954,79 @@ snapshots:
 
   '@rolldown/pluginutils@1.0.0-beta.27': {}
 
-  '@rollup/rollup-android-arm-eabi@4.53.3':
+  '@rollup/rollup-android-arm-eabi@4.59.0':
+    optional: true
+
+  '@rollup/rollup-android-arm64@4.59.0':
+    optional: true
+
+  '@rollup/rollup-darwin-arm64@4.59.0':
     optional: true
 
-  '@rollup/rollup-android-arm64@4.53.3':
+  '@rollup/rollup-darwin-x64@4.59.0':
     optional: true
 
-  '@rollup/rollup-darwin-arm64@4.53.3':
+  '@rollup/rollup-freebsd-arm64@4.59.0':
     optional: true
 
-  '@rollup/rollup-darwin-x64@4.53.3':
+  '@rollup/rollup-freebsd-x64@4.59.0':
     optional: true
 
-  '@rollup/rollup-freebsd-arm64@4.53.3':
+  '@rollup/rollup-linux-arm-gnueabihf@4.59.0':
     optional: true
 
-  '@rollup/rollup-freebsd-x64@4.53.3':
+  '@rollup/rollup-linux-arm-musleabihf@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-arm-gnueabihf@4.53.3':
+  '@rollup/rollup-linux-arm64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-arm-musleabihf@4.53.3':
+  '@rollup/rollup-linux-arm64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-arm64-gnu@4.53.3':
+  '@rollup/rollup-linux-loong64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-arm64-musl@4.53.3':
+  '@rollup/rollup-linux-loong64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-loong64-gnu@4.53.3':
+  '@rollup/rollup-linux-ppc64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-ppc64-gnu@4.53.3':
+  '@rollup/rollup-linux-ppc64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-riscv64-gnu@4.53.3':
+  '@rollup/rollup-linux-riscv64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-riscv64-musl@4.53.3':
+  '@rollup/rollup-linux-riscv64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-s390x-gnu@4.53.3':
+  '@rollup/rollup-linux-s390x-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-x64-gnu@4.53.3':
+  '@rollup/rollup-linux-x64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-x64-musl@4.53.3':
+  '@rollup/rollup-linux-x64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-openharmony-arm64@4.53.3':
+  '@rollup/rollup-openbsd-x64@4.59.0':
     optional: true
 
-  '@rollup/rollup-win32-arm64-msvc@4.53.3':
+  '@rollup/rollup-openharmony-arm64@4.59.0':
     optional: true
 
-  '@rollup/rollup-win32-ia32-msvc@4.53.3':
+  '@rollup/rollup-win32-arm64-msvc@4.59.0':
     optional: true
 
-  '@rollup/rollup-win32-x64-gnu@4.53.3':
+  '@rollup/rollup-win32-ia32-msvc@4.59.0':
     optional: true
 
-  '@rollup/rollup-win32-x64-msvc@4.53.3':
+  '@rollup/rollup-win32-x64-gnu@4.59.0':
+    optional: true
+
+  '@rollup/rollup-win32-x64-msvc@4.59.0':
     optional: true
 
   '@tailwindcss/node@4.1.17':
@@ -4198,7 +4251,7 @@ snapshots:
       '@typescript-eslint/types': 8.49.0
       '@typescript-eslint/visitor-keys': 8.49.0
       debug: 4.4.3
-      minimatch: 9.0.5
+      minimatch: 9.0.7
       semver: 7.7.3
       tinyglobby: 0.2.15
       ts-api-utils: 2.1.0(typescript@5.8.3)
@@ -4355,6 +4408,8 @@ snapshots:
 
   balanced-match@1.0.2: {}
 
+  balanced-match@4.0.4: {}
+
   baseline-browser-mapping@2.9.6: {}
 
   brace-expansion@1.1.12:
@@ -4362,9 +4417,9 @@ snapshots:
       balanced-match: 1.0.2
       concat-map: 0.0.1
 
-  brace-expansion@2.0.2:
+  brace-expansion@5.0.4:
     dependencies:
-      balanced-match: 1.0.2
+      balanced-match: 4.0.4
 
   browserslist@4.28.1:
     dependencies:
@@ -4557,7 +4612,7 @@ snapshots:
       is-glob: 4.0.3
       json-stable-stringify-without-jsonify: 1.0.1
       lodash.merge: 4.6.2
-      minimatch: 3.1.2
+      minimatch: 3.1.4
       natural-compare: 1.4.0
       optionator: 0.9.4
     optionalDependencies:
@@ -4639,7 +4694,7 @@ snapshots:
     dependencies:
       foreground-child: 3.3.1
       jackspeak: 3.4.3
-      minimatch: 9.0.5
+      minimatch: 9.0.7
       minipass: 7.1.2
       package-json-from-dist: 1.0.1
       path-scurry: 1.11.1
@@ -4871,13 +4926,13 @@ snapshots:
 
   min-indent@1.0.1: {}
 
-  minimatch@3.1.2:
+  minimatch@3.1.4:
     dependencies:
       brace-expansion: 1.1.12
 
-  minimatch@9.0.5:
+  minimatch@9.0.7:
     dependencies:
-      brace-expansion: 2.0.2
+      brace-expansion: 5.0.4
 
   minipass@7.1.2: {}
 
@@ -5082,32 +5137,35 @@ snapshots:
 
   resolve-from@4.0.0: {}
 
-  rollup@4.53.3:
+  rollup@4.59.0:
     dependencies:
       '@types/estree': 1.0.8
     optionalDependencies:
-      '@rollup/rollup-android-arm-eabi': 4.53.3
-      '@rollup/rollup-android-arm64': 4.53.3
-      '@rollup/rollup-darwin-arm64': 4.53.3
-      '@rollup/rollup-darwin-x64': 4.53.3
-      '@rollup/rollup-freebsd-arm64': 4.53.3
-      '@rollup/rollup-freebsd-x64': 4.53.3
-      '@rollup/rollup-linux-arm-gnueabihf': 4.53.3
-      '@rollup/rollup-linux-arm-musleabihf': 4.53.3
-      '@rollup/rollup-linux-arm64-gnu': 4.53.3
-      '@rollup/rollup-linux-arm64-musl': 4.53.3
-      '@rollup/rollup-linux-loong64-gnu': 4.53.3
-      '@rollup/rollup-linux-ppc64-gnu': 4.53.3
-      '@rollup/rollup-linux-riscv64-gnu': 4.53.3
-      '@rollup/rollup-linux-riscv64-musl': 4.53.3
-      '@rollup/rollup-linux-s390x-gnu': 4.53.3
-      '@rollup/rollup-linux-x64-gnu': 4.53.3
-      '@rollup/rollup-linux-x64-musl': 4.53.3
-      '@rollup/rollup-openharmony-arm64': 4.53.3
-      '@rollup/rollup-win32-arm64-msvc': 4.53.3
-      '@rollup/rollup-win32-ia32-msvc': 4.53.3
-      '@rollup/rollup-win32-x64-gnu': 4.53.3
-      '@rollup/rollup-win32-x64-msvc': 4.53.3
+      '@rollup/rollup-android-arm-eabi': 4.59.0
+      '@rollup/rollup-android-arm64': 4.59.0
+      '@rollup/rollup-darwin-arm64': 4.59.0
+      '@rollup/rollup-darwin-x64': 4.59.0
+      '@rollup/rollup-freebsd-arm64': 4.59.0
+      '@rollup/rollup-freebsd-x64': 4.59.0
+      '@rollup/rollup-linux-arm-gnueabihf': 4.59.0
+      '@rollup/rollup-linux-arm-musleabihf': 4.59.0
+      '@rollup/rollup-linux-arm64-gnu': 4.59.0
+      '@rollup/rollup-linux-arm64-musl': 4.59.0
+      '@rollup/rollup-linux-loong64-gnu': 4.59.0
+      '@rollup/rollup-linux-loong64-musl': 4.59.0
+      '@rollup/rollup-linux-ppc64-gnu': 4.59.0
+      '@rollup/rollup-linux-ppc64-musl': 4.59.0
+      '@rollup/rollup-linux-riscv64-gnu': 4.59.0
+      '@rollup/rollup-linux-riscv64-musl': 4.59.0
+      '@rollup/rollup-linux-s390x-gnu': 4.59.0
+      '@rollup/rollup-linux-x64-gnu': 4.59.0
+      '@rollup/rollup-linux-x64-musl': 4.59.0
+      '@rollup/rollup-openbsd-x64': 4.59.0
+      '@rollup/rollup-openharmony-arm64': 4.59.0
+      '@rollup/rollup-win32-arm64-msvc': 4.59.0
+      '@rollup/rollup-win32-ia32-msvc': 4.59.0
+      '@rollup/rollup-win32-x64-gnu': 4.59.0
+      '@rollup/rollup-win32-x64-msvc': 4.59.0
       fsevents: 2.3.3
 
   rrweb-cssom@0.8.0: {}
@@ -5192,7 +5250,7 @@ snapshots:
     dependencies:
       '@istanbuljs/schema': 0.1.3
       glob: 10.5.0
-      minimatch: 9.0.5
+      minimatch: 9.0.7
 
   tinybench@2.9.0: {}
 
@@ -5308,7 +5366,7 @@ snapshots:
       fdir: 6.5.0(picomatch@4.0.3)
       picomatch: 4.0.3
       postcss: 8.5.6
-      rollup: 4.53.3
+      rollup: 4.59.0
       tinyglobby: 0.2.15
     optionalDependencies:
       '@types/node': 24.10.3
diff --git a/frontend/src/store/__tests__/useSettingsStore.test.tsx b/frontend/src/store/__tests__/useSettingsStore.test.tsx
index 0d8165473..541451f20 100644
--- a/frontend/src/store/__tests__/useSettingsStore.test.tsx
+++ b/frontend/src/store/__tests__/useSettingsStore.test.tsx
@@ -52,7 +52,7 @@ const mockHealthResponse: HealthResponse = {
       status: 'healthy',
       latency_ms: 40,
       error: null,
-      model: 'nvidia/llama-3.2-nv-rerankqa-1b-v2',
+      model: 'nvidia/llama-nemotron-rerank-1b-v2',
       message: null,
       http_status: 200
     }
@@ -121,7 +121,7 @@ describe('useHealthInitialization', () => {
       const state = useSettingsStore.getState();
       expect(state.model).toBe('meta/llama-3.1-8b-instruct');
       expect(state.embeddingModel).toBe('nvidia/nv-embedqa-e5-v5');
-      expect(state.rerankerModel).toBe('nvidia/llama-3.2-nv-rerankqa-1b-v2');
+      expect(state.rerankerModel).toBe('nvidia/llama-nemotron-rerank-1b-v2');
     });
 
     // Verify endpoints are also set
@@ -140,7 +140,7 @@ describe('useHealthInitialization', () => {
         llmEndpoint: 'http://llm:8000',
         embeddingModel: 'nvidia/nv-embedqa-e5-v5',
         embeddingEndpoint: 'http://embeddings:8001',
-        rerankerModel: 'nvidia/llama-3.2-nv-rerankqa-1b-v2',
+        rerankerModel: 'nvidia/llama-nemotron-rerank-1b-v2',
         rerankerEndpoint: 'http://reranker:8002'
       })
     );
@@ -167,7 +167,7 @@ describe('useHealthInitialization', () => {
       expect(state.model).toBe('user-selected-llm-model');
       expect(state.embeddingModel).toBe('user-selected-embedding-model');
       // Should still populate undefined fields
-      expect(state.rerankerModel).toBe('nvidia/llama-3.2-nv-rerankqa-1b-v2');
+      expect(state.rerankerModel).toBe('nvidia/llama-nemotron-rerank-1b-v2');
     });
   });
 
diff --git a/notebooks/.env_library b/notebooks/.env_library
index a0d998752..eb5b68eb5 100644
--- a/notebooks/.env_library
+++ b/notebooks/.env_library
@@ -16,8 +16,8 @@ export MINIO_ACCESSKEY=minioadmin
 export MINIO_SECRETKEY=minioadmin
 
 # === Embedding Model specific configurations ===
-export APP_EMBEDDINGS_SERVERURL=nemoretriever-embedding-ms:8000/v1
-export APP_EMBEDDINGS_MODELNAME=nvidia/llama-3.2-nv-embedqa-1b-v2
+export APP_EMBEDDINGS_SERVERURL=nemotron-embedding-ms:8000/v1
+export APP_EMBEDDINGS_MODELNAME=nvidia/llama-nemotron-embed-1b-v2
 export APP_EMBEDDINGS_DIMENSIONS=2048
 # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
 # export APP_EMBEDDINGS_SERVERURL=localhost:9081
@@ -85,7 +85,7 @@ export ENABLE_FILTER_GENERATOR=False
 
 # === Reranking Model specific configurations ===
 export APP_RANKING_SERVERURL=localhost:1976
-export APP_RANKING_MODELNAME="nvidia/llama-3.2-nv-rerankqa-1b-v2"
+export APP_RANKING_MODELNAME="nvidia/llama-nemotron-rerank-1b-v2"
 export ENABLE_RERANKER=True
 
 # === VLM Model specific configurations ===
diff --git a/notebooks/building_rag_vdb_operator.ipynb b/notebooks/building_rag_vdb_operator.ipynb
index fec360d84..1852f1d9c 100644
--- a/notebooks/building_rag_vdb_operator.ipynb
+++ b/notebooks/building_rag_vdb_operator.ipynb
@@ -360,12 +360,12 @@
     "Ensure all the below are running and healthy before proceeding further\n",
     "```output\n",
     "NAMES                           STATUS\n",
-    "nemoretriever-ranking-ms        Up ... (healthy)\n",
+    "nemotron-ranking-ms             Up ... (healthy)\n",
     "compose-page-elements-1         Up ...\n",
     "compose-nemoretriever-ocr-1     Up ...\n",
     "compose-graphic-elements-1      Up ...\n",
     "compose-table-structure-1       Up ...\n",
-    "nemoretriever-embedding-ms      Up ... (healthy)\n",
+    "nemotron-embedding-ms           Up ... (healthy)\n",
     "nim-llm-ms                      Up ... (healthy)\n",
     "```"
    ]
@@ -390,15 +390,15 @@
     "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
     "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
     ")\n",
     "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\""
    ]
@@ -2081,13 +2081,13 @@
     "\n",
     "# IMPORTANT: Two different embedding URLs are needed:\n",
     "# 1. config_ingestor.embeddings.server_url → Used by nv-ingest (runs in Docker)\n",
-    "#    Must use Docker network hostname: nemoretriever-embedding-ms:8000\n",
+    "#    Must use Docker network hostname: nemotron-embedding-ms:8000\n",
     "# 2. embedding_model for VDB operator → Used for queries (runs locally in notebook)\n",
     "#    Must use localhost: localhost:9080\n",
     "\n",
     "if DEPLOYMENT_MODE == \"on_prem\":\n",
     "    # nv-ingest runs inside Docker, needs Docker network hostname\n",
-    "    config_ingestor.embeddings.server_url = \"http://nemoretriever-embedding-ms:8000/v1\"\n",
+    "    config_ingestor.embeddings.server_url = \"http://nemotron-embedding-ms:8000/v1\"\n",
     "if DEPLOYMENT_MODE == \"cloud\":\n",
     "    config_ingestor.embeddings.server_url = \"https://integrate.api.nvidia.com/v1\"\n",
     "    config_ingestor.llm.server_url = \"\"  # Empty uses NVIDIA API catalog\n",
diff --git a/notebooks/config.yaml b/notebooks/config.yaml
index ae9551877..f5a8eb53b 100644
--- a/notebooks/config.yaml
+++ b/notebooks/config.yaml
@@ -55,14 +55,14 @@ filter_expression_generator:
 
 # Embedding Configuration
 embeddings:
-  model_name: "nvidia/llama-3.2-nv-embedqa-1b-v2"  # Model for generating text embeddings
+  model_name: "nvidia/llama-nemotron-embed-1b-v2"  # Model for generating text embeddings
   dimensions: 2048  # Dimensionality of the embedding vectors
   server_url: "http://localhost:9080/v1"  # URL endpoint for embedding service (on-prem NIM default)
   # api_key: ""  # Optional: API key for embeddings (overrides NVIDIA_API_KEY environment variable)
 
 # Ranking Configuration
 ranking:
-  model_name: "nvidia/llama-3.2-nv-rerankqa-1b-v2"  # Model for reranking retrieved documents
+  model_name: "nvidia/llama-nemotron-rerank-1b-v2"  # Model for reranking retrieved documents
   server_url: "http://localhost:1976"  # URL endpoint for reranking service (on-prem NIM default)
   enable_reranker: true  # Enable reranking of retrieved documents before generation
   # api_key: ""  # Optional: API key for reranking (overrides NVIDIA_API_KEY environment variable)
diff --git a/notebooks/evaluation_01_ragas.ipynb b/notebooks/evaluation_01_ragas.ipynb
index 4b50aceb0..d3f12933d 100644
--- a/notebooks/evaluation_01_ragas.ipynb
+++ b/notebooks/evaluation_01_ragas.ipynb
@@ -1,550 +1,559 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "f1de2541",
-   "metadata": {},
-   "source": [
-    "# Evaluate Your RAG Pipeline with Ragas: Answer Accuracy, Context Relevancy, and Groundedness\n",
-    "\n",
-    "In this notebook, we will evaluate our RAG system using three key metrics with the [Ragas](https://docs.ragas.io/en/stable/) library. \n",
-    "\n",
-    "Ragas provides a set of metrics that you can use to evaluate the performance of your LLM application. These metrics are designed to help you objectively measure the performance of your application. \n",
-    "\n",
-    "## Evaluation Metrics\n",
-    "\n",
-    "In this notebook, we will use the following three metrics, introduced to Ragas by NVIDIA:\n",
-    "- **Answer Accuracy** – Measures the agreement between a model’s response and a reference ground truth for a given question.\n",
-    "- **Context Relevancy** – Evaluates whether the retrieved contexts (chunks or passages) are pertinent to the user input. \n",
-    "- **Response Groundedness** – Measures how well a response is supported or \"grounded\" by the retrieved contexts. It assesses whether each claim in the response can be found, either wholly or partially, in the provided contexts.\n",
-    "\n",
-    "## Prerequisites\n",
-    "\n",
-    "This notebook assumes you are familiar with the RAG system and you have both `rag-server` and `ingestor-server` up and running. If you have not done that, you can refer to [Get Started](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md) to start the RAG server.\n",
-    "\n",
-    "## 1. Download Evaluation Documents\n",
-    "\n",
-    "First, let's download the FinanceBench dataset to evaluate our RAG system. This dataset includes PDF files with information and reports about publicly traded companies, as well as ground truth question and answer pairs.\n",
-    "\n",
-    "We'll clone the repository into our data directory in a subdirectory called `financebench`. The PDFs can be found in the `pdfs` subdirectory.\n"
-   ]
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "f1de2541",
+      "metadata": {},
+      "source": [
+        "# Evaluate Your RAG Pipeline with Ragas: Answer Accuracy, Context Relevancy, and Groundedness\n",
+        "\n",
+        "In this notebook, we will evaluate our RAG system using three key metrics with the [Ragas](https://docs.ragas.io/en/stable/) library. \n",
+        "\n",
+        "Ragas provides a set of metrics that you can use to evaluate the performance of your LLM application. These metrics are designed to help you objectively measure the performance of your application. \n",
+        "\n",
+        "## Evaluation Metrics\n",
+        "\n",
+        "In this notebook, we will use the following three metrics, introduced to Ragas by NVIDIA:\n",
+        "- **Answer Accuracy** – Measures the agreement between a model’s response and a reference ground truth for a given question.\n",
+        "- **Context Relevancy** – Evaluates whether the retrieved contexts (chunks or passages) are pertinent to the user input. \n",
+        "- **Response Groundedness** – Measures how well a response is supported or \"grounded\" by the retrieved contexts. It assesses whether each claim in the response can be found, either wholly or partially, in the provided contexts.\n",
+        "\n",
+        "## Prerequisites\n",
+        "\n",
+        "This notebook assumes you are familiar with the RAG system and you have both `rag-server` and `ingestor-server` up and running. If you have not done that, you can refer to [Get Started](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md) to start the RAG server.\n",
+        "\n",
+        "## 1. Download Evaluation Documents\n",
+        "\n",
+        "First, let's download the FinanceBench dataset to evaluate our RAG system. This dataset includes PDF files with information and reports about publicly traded companies, as well as ground truth question and answer pairs.\n",
+        "\n",
+        "We'll clone the repository into our data directory in a subdirectory called `financebench`. The PDFs can be found in the `pdfs` subdirectory.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d87b89d8",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "! git clone https://github.com/patronus-ai/financebench.git ../data/financebench"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "702a5f07",
+      "metadata": {},
+      "source": [
+        "## 2. Ingest Evaluation Documents\n",
+        "\n",
+        "For evaluation, we will use the FinanceBench dataset. In the data directory, we have the PDF files for the FinanceBench dataset, as well as the `financebench_open_source.jsonl` file, which includes ground truth question and answer pairs. \n",
+        "\n",
+        "Let's start by creating a collection called `financebench` and uploading the relevant documents.\n",
+        "\n",
+        "This process is similar to the `ingestion_api_usage` notebook. First, we'll install the required packages and set up our API connections."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "0b88ef79",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Installing required Python packages\n",
+        "! pip install aiohttp langchain-nvidia-ai-endpoints ragas httpx"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "11bcb3fe",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import aiohttp\n",
+        "import os\n",
+        "import json\n",
+        "import glob\n",
+        "import httpx"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "fa7a4226",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "IPADDRESS = \"ingestor-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\" # Replace this with the correct IP address\n",
+        "INGESTOR_SERVER_PORT = \"8082\"\n",
+        "INGESTOR_BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"  # Replace with your server URL\n",
+        "\n",
+        "async def print_response(response):\n",
+        "    \"\"\"Helper to print API response.\"\"\"\n",
+        "    try:\n",
+        "        response_json = await response.json()\n",
+        "        print(json.dumps(response_json, indent=2))\n",
+        "    except aiohttp.ClientResponseError:\n",
+        "        print(await response.text())\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "47cc6774",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "async def create_collection(\n",
+        "    collection_name: str = None,\n",
+        "    metadata_schema: list = []\n",
+        "):\n",
+        "    \"\"\"Create a new collection in the vector database.\"\"\"\n",
+        "    data = {\n",
+        "        \"collection_name\": collection_name,\n",
+        "        \"metadata_schema\": metadata_schema\n",
+        "    }\n",
+        "\n",
+        "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.post(f\"{INGESTOR_BASE_URL}/v1/collection\", json=data, headers=HEADERS) as response:\n",
+        "                await print_response(response)\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            return 500, {\"error\": str(e)}\n",
+        "\n",
+        "# Create the financebench collection\n",
+        "await create_collection(\n",
+        "    collection_name=\"financebench\",\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "92418e23",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Get all PDF files from the financebench directory\n",
+        "FILEPATHS = glob.glob(os.path.join(\"../data/financebench/pdfs\", \"*.pdf\"))\n",
+        "\n",
+        "async def upload_documents(collection_name: str = \"\"):\n",
+        "    \"\"\"Upload documents to the specified collection.\"\"\"\n",
+        "    data = {\n",
+        "        \"collection_name\": collection_name,\n",
+        "        \"blocking\": False,  # If True, upload is blocking; else async. Status API not needed when blocking\n",
+        "        \"split_options\": {\n",
+        "            \"chunk_size\": 512,\n",
+        "            \"chunk_overlap\": 150\n",
+        "        },\n",
+        "        \"generate_summary\": False  # Set to True to optionally generate summaries for all documents after ingestion\n",
+        "    }\n",
+        "\n",
+        "    form_data = aiohttp.FormData()\n",
+        "    \n",
+        "    # Add all PDF files to the form data\n",
+        "    for file_path in FILEPATHS:\n",
+        "        form_data.add_field(\"documents\", open(file_path, \"rb\"), filename=os.path.basename(file_path), content_type=\"application/pdf\")\n",
+        "\n",
+        "    form_data.add_field(\"data\", json.dumps(data), content_type=\"application/json\")\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.post(f\"{INGESTOR_BASE_URL}/v1/documents\", data=form_data) as response: # Replace with session.patch for reingesting\n",
+        "                await print_response(response)\n",
+        "                # Return the response JSON for task_id extraction\n",
+        "                response_json = await response.json()\n",
+        "                return response_json\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            print(f\"Error uploading documents: {e}\")\n",
+        "            return None\n",
+        "\n",
+        "# Store the response and extract task_id\n",
+        "upload_response = await upload_documents(collection_name=\"financebench\")\n",
+        "task_id = upload_response.get(\"task_id\") if upload_response else None\n",
+        "print(f\"Extracted task_id: {task_id}\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "798b7771",
+      "metadata": {},
+      "source": [
+        "**⚠️ Note**: During the document ingestion process, two files (`INTEL_2023_8K_dated-2023-08-16.pdf` and `INTEL_2023_8K_dated-2023-02-10.pdf`) may fail to process due to formatting issues. This is expected and can be safely ignored, as it will not affect the evaluation methodology or results. The remaining documents in the dataset are sufficient for comprehensive evaluation."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "82b3e199",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# This might take a few minutes to complete depending on the number of documents uploaded\n",
+        "async def get_task_status(\n",
+        "    task_id: str\n",
+        "):\n",
+        "\n",
+        "    params = {\n",
+        "        \"task_id\": task_id,\n",
+        "    }\n",
+        "\n",
+        "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.get(f\"{INGESTOR_BASE_URL}/v1/status\", params=params, headers=HEADERS) as response:\n",
+        "                await print_response(response)\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            return 500, {\"error\": str(e)}\n",
+        "\n",
+        "# Use the extracted task_id from the upload_documents response\n",
+        "if task_id:\n",
+        "    await get_task_status(task_id=task_id)\n",
+        "else:\n",
+        "    print(\"No task_id available. Please run the upload_documents cell first.\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "8bb5edff",
+      "metadata": {},
+      "source": [
+        "## 3. Create Dataset for Ragas Evaluation\n",
+        "\n",
+        "In `../data/financebench/data`, there is a file called `financebench_open_source.jsonl`. This file contains questions about the PDFs, as well as the corresponding ground truth answers.\n",
+        "\n",
+        "For each ground-truth question and answer pair, we will:\n",
+        "1. Generate an answer from our RAG system\n",
+        "2. Retrieve the relevant document contexts\n",
+        "3. Create a dataset suitable for Ragas evaluation\n",
+        "\n",
+        "The answer and context retrieval from the RAG system is similar to the `retriever_api_usage` notebook.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "b96c09f1",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "IPADDRESS = \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\" #Replace this with the correct IP address\n",
+        "RAG_SERVER_PORT = \"8081\"\n",
+        "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"  # Replace with your server URL\n",
+        "\n",
+        "generate_url = f\"{RAG_BASE_URL}/v1/generate\"\n",
+        "\n",
+        "async def generate_answer(payload):\n",
+        "    \"\"\"Generate an answer using the RAG server.\"\"\"\n",
+        "    rag_response = \"\"\n",
+        "    citations = []\n",
+        "    is_first_token = True\n",
+        "\n",
+        "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
+        "        try:\n",
+        "            async with client.stream(\"POST\", url=generate_url, json=payload) as response:\n",
+        "                # Raise an exception for bad status codes like 4xx or 5xx\n",
+        "                response.raise_for_status()\n",
+        "\n",
+        "                # iterate over the response lines\n",
+        "                async for line in response.aiter_lines():\n",
+        "                    if line.startswith(\"data: \"):\n",
+        "                        json_str = line[6:].strip()\n",
+        "                        if not json_str:\n",
+        "                            continue\n",
+        "\n",
+        "                        try:\n",
+        "                            data = json.loads(json_str)\n",
+        "\n",
+        "                            # --- Extract the response from the RAG server ---\n",
+        "                            message = data.get(\"choices\", [{}])[0].get(\"message\", {}).get(\"content\", \"\")\n",
+        "                            if message:\n",
+        "                                rag_response += message\n",
+        "\n",
+        "                            # --- Extract the citations from the RAG server ---\n",
+        "                            if is_first_token and data.get(\"citations\"):\n",
+        "                                for result in data.get(\"citations\", {}).get(\"results\", []):\n",
+        "                                    description = result.get(\"metadata\", {}).get(\"description\")\n",
+        "                                    if description:\n",
+        "                                        citations.append(description)\n",
+        "                                is_first_token = False\n",
+        "\n",
+        "                            finish_reason = data.get(\"choices\", [{}])[0].get(\"finish_reason\")\n",
+        "                            if finish_reason == \"stop\":\n",
+        "                                return rag_response, citations\n",
+        "\n",
+        "                        except json.JSONDecodeError:\n",
+        "                            print(f\"Skipping malformed JSON line: {json_str}\")\n",
+        "                            continue\n",
+        "        \n",
+        "        except httpx.HTTPStatusError as e:\n",
+        "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
+        "        except httpx.RequestError as e:\n",
+        "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
+        "        except Exception as e:\n",
+        "            print(f\"An error occurred: {e}\")\n",
+        "\n",
+        "    return rag_response, citations\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "805c5744",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Load the question and ground-truth answer pairs from the FinanceBench dataset\n",
+        "with open('../data/financebench/data/financebench_open_source.jsonl', 'r') as file:\n",
+        "    gt_qa_pairs = [json.loads(line) for line in file]\n",
+        "\n",
+        "print(f\"Loaded {len(gt_qa_pairs)} question-answer pairs from FinanceBench dataset\")\n",
+        "\n",
+        "dataset = []\n",
+        "\n",
+        "# For the purposes of keeping this demo brief, we will only evaluate on 50 questions. \n",
+        "# You can increase this to the full dataset for more comprehensive results.\n",
+        "n = 50 \n",
+        "print(f\"Evaluating on {n} questions...\")\n",
+        "\n",
+        "for idx, qa_pair in enumerate(gt_qa_pairs[:n]):\n",
+        "    question = qa_pair['question']\n",
+        "    \n",
+        "    print(f\"Processing question {idx + 1}/{n}: {question[:100]}...\")\n",
+        "\n",
+        "    generate_payload = {\n",
+        "        \"messages\": [\n",
+        "            {\n",
+        "                \"role\": \"user\",\n",
+        "                \"content\": question\n",
+        "            }\n",
+        "        ],\n",
+        "        \"use_knowledge_base\": True,\n",
+        "        \"reranker_top_k\": 2,\n",
+        "        \"vdb_top_k\": 10,\n",
+        "        \"vdb_endpoint\": \"http://milvus:19530\",\n",
+        "        \"collection_names\": [\"financebench\"],\n",
+        "        \"enable_reranker\": True,\n",
+        "        \"enable_citations\": True,\n",
+        "        \"stop\": [],\n",
+        "        \"filter_expr\": ''\n",
+        "    }\n",
+        "    \n",
+        "    rag_answer, citations = await generate_answer(generate_payload)\n",
+        "\n",
+        "    dataset.append({\n",
+        "        \"user_input\": question,\n",
+        "        \"retrieved_contexts\": citations,\n",
+        "        \"response\": rag_answer,\n",
+        "        \"reference\": qa_pair['answer'],\n",
+        "    })\n",
+        "\n",
+        "print(f\"Created dataset with {len(dataset)} entries for evaluation\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "43e68742",
+      "metadata": {},
+      "source": [
+        "\n",
+        "## 4. Evaluate with Ragas\n",
+        "\n",
+        "In this example, we will use the NVIDIA hosted endpoint for our judge model. To use this endpoint, please provide your NVIDIA API Key below. \n",
+        "\n",
+        "### Rate Limiting Considerations\n",
+        "\n",
+        "When using the public endpoint for the Judge LLM, you will likely encounter rate limit errors. We can try to reduce the number of errors by adjusting the configuration, which we do below. \n",
+        "\n",
+        "Alternatively, you can use self-hosted NIM Microservices endpoints to avoid these errors altogether. If you're using a self-hosted NIM, you do not need to provide your API Key.\n",
+        "\n",
+        "### Getting Your NVIDIA API Key\n",
+        "\n",
+        "To generate an API Key:\n",
+        "1. Go to [build.nvidia.com](https://build.nvidia.com/)\n",
+        "2. Click the green \"Get API Key\" button in the top right corner\n",
+        "3. Paste your key below to save it as an environment variable\n",
+        "\n",
+        "### Self-Hosted Option\n",
+        "\n",
+        "To deploy the Judge LLM as a NIM on your own infrastructure, follow the instructions [here](https://build.nvidia.com/openai/gpt-oss-120b/deploy).\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "32df51d0",
+      "metadata": {},
+      "source": [
+        "Note: Mixtral 8x22b is the preferred model if you have the required compute available. You can deploy it by following the steps [here](https://build.nvidia.com/mistralai/mixtral-8x22b-instruct/deploy)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "31df3819",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "from getpass import getpass\n",
+        "# del os.environ['NVIDIA_API_KEY']  ## delete key and reset if needed\n",
+        "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+        "    print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n",
+        "else:\n",
+        "    candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n",
+        "    assert candidate_api_key.startswith(\"nvapi-\"), (\n",
+        "        f\"{candidate_api_key[:5]}... is not a valid key\"\n",
+        "    )\n",
+        "    os.environ[\"NVIDIA_API_KEY\"] = candidate_api_key"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "78fb75fe",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "\n",
+        "# Note: Models on build.nvidia.com are rate limited.\n",
+        "# To avoid rate-limit issues, either deploy the judge model locally (self-hosted NIM)\n",
+        "# or use any OpenAI-compatible LLM as the judge for evaluation.\n",
+        "from langchain_nvidia_ai_endpoints.chat_models import ChatNVIDIA\n",
+        "\n",
+        "# Initialize the judge LLM for evaluation\n",
+        "# You can use any other model by creating a ChatNVIDIA object with a different model id\n",
+        "llm = ChatNVIDIA(model=\"openai/gpt-oss-120b\") # For using NVIDIA hosted endpoint\n",
+        "# llm = ChatNVIDIA(model=\"openai/gpt-oss-120b\", base_url=\"http://0.0.0.0:8000/v1\") # If using self-hosted NIM"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "928a3c8a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Create the evaluation dataset from our collected data\n",
+        "from ragas import EvaluationDataset\n",
+        "\n",
+        "evaluation_dataset = EvaluationDataset.from_list(dataset)\n",
+        "print(f\"Created evaluation dataset with {len(evaluation_dataset)} samples\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "b3ec24f4",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Import the required metrics and evaluation components\n",
+        "from ragas.metrics import AnswerAccuracy, ContextRelevance, ResponseGroundedness\n",
+        "from ragas import evaluate\n",
+        "from ragas.llms import LangchainLLMWrapper\n",
+        "\n",
+        "# Wrap the LLM for use with Ragas\n",
+        "evaluator_llm = LangchainLLMWrapper(llm)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "9f2f4245",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from ragas.run_config import RunConfig\n",
+        "\n",
+        "custom_config = RunConfig(max_workers=1, max_wait=120)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "3a3571af",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Run the evaluation with our three metrics\n",
+        "print(\"Starting Ragas evaluation...\")\n",
+        "print(\"This may take several minutes depending on the dataset size.\")\n",
+        "\n",
+        "results = evaluate(\n",
+        "    dataset=evaluation_dataset,\n",
+        "    metrics=[AnswerAccuracy(), ContextRelevance(), ResponseGroundedness()],\n",
+        "    llm=evaluator_llm, \n",
+        "    run_config=custom_config\n",
+        ")\n",
+        "\n",
+        "print(\"Evaluation completed!\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "bac9dde6",
+      "metadata": {},
+      "source": [
+        "## 5. Analyze Results\n",
+        "\n",
+        "Finally, let's examine our evaluation results. We'll look at both the overall metrics and individual sample performance."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "4c90647f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "results"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "2da683a1",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Convert results to pandas DataFrame for detailed analysis of individual queries\n",
+        "results_df = results.to_pandas()\n",
+        "\n",
+        "import pandas as pd\n",
+        "\n",
+        "# 1. Set the option to display ALL columns, preventing the '...'\n",
+        "pd.set_option('display.max_columns', None)\n",
+        "\n",
+        "# 2. To prevent long text in cells from being cut off, you can set the column width\n",
+        "pd.set_option('display.max_colwidth', 80)\n",
+        "\n",
+        "results_df.head()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "evaluate",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.9"
+    }
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d87b89d8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! git clone https://github.com/patronus-ai/financebench.git ../data/financebench"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "702a5f07",
-   "metadata": {},
-   "source": [
-    "## 2. Ingest Evaluation Documents\n",
-    "\n",
-    "For evaluation, we will use the FinanceBench dataset. In the data directory, we have the PDF files for the FinanceBench dataset, as well as the `financebench_open_source.jsonl` file, which includes ground truth question and answer pairs. \n",
-    "\n",
-    "Let's start by creating a collection called `financebench` and upload the relevant documents.\n",
-    "\n",
-    "This process is similar to the `ingestion_api_usage` notebook. First, we'll install the required packages and set up our API connections."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0b88ef79",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Installing required Python packages\n",
-    "! pip install aiohttp langchain-nvidia-ai-endpoints ragas httpx"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "11bcb3fe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import aiohttp\n",
-    "import os\n",
-    "import json\n",
-    "import glob\n",
-    "import httpx"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fa7a4226",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "IPADDRESS = \"ingestor-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\" # Replace this with the correct IP address\n",
-    "INGESTOR_SERVER_PORT = \"8082\"\n",
-    "INGESTOR_BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"  # Replace with your server URL\n",
-    "\n",
-    "async def print_response(response):\n",
-    "    \"\"\"Helper to print API response.\"\"\"\n",
-    "    try:\n",
-    "        response_json = await response.json()\n",
-    "        print(json.dumps(response_json, indent=2))\n",
-    "    except aiohttp.ClientResponseError:\n",
-    "        print(await response.text())\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "47cc6774",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "async def create_collection(\n",
-    "    collection_name: str = None,\n",
-    "    metadata_schema: list = []\n",
-    "):\n",
-    "    \"\"\"Create a new collection in the vector database.\"\"\"\n",
-    "    data = {\n",
-    "        \"collection_name\": collection_name,\n",
-    "        \"metadata_schema\": metadata_schema\n",
-    "    }\n",
-    "\n",
-    "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.post(f\"{INGESTOR_BASE_URL}/v1/collection\", json=data, headers=HEADERS) as response:\n",
-    "                await print_response(response)\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            return 500, {\"error\": str(e)}\n",
-    "\n",
-    "# Create the financebench collection\n",
-    "await create_collection(\n",
-    "    collection_name=\"financebench\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "92418e23",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get all PDF files from the financebench directory\n",
-    "FILEPATHS = glob.glob(os.path.join(\"../data/financebench/pdfs\", \"*.pdf\"))\n",
-    "\n",
-    "async def upload_documents(collection_name: str = \"\"):\n",
-    "    \"\"\"Upload documents to the specified collection.\"\"\"\n",
-    "    data = {\n",
-    "        \"collection_name\": collection_name,\n",
-    "        \"blocking\": False,  # If True, upload is blocking; else async. Status API not needed when blocking\n",
-    "        \"split_options\": {\n",
-    "            \"chunk_size\": 512,\n",
-    "            \"chunk_overlap\": 150\n",
-    "        },\n",
-    "        \"generate_summary\": False  # Set to True to optionally generate summaries for all documents after ingestion\n",
-    "    }\n",
-    "\n",
-    "    form_data = aiohttp.FormData()\n",
-    "    \n",
-    "    # Add all PDF files to the form data\n",
-    "    for file_path in FILEPATHS:\n",
-    "        form_data.add_field(\"documents\", open(file_path, \"rb\"), filename=os.path.basename(file_path), content_type=\"application/pdf\")\n",
-    "\n",
-    "    form_data.add_field(\"data\", json.dumps(data), content_type=\"application/json\")\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.post(f\"{INGESTOR_BASE_URL}/v1/documents\", data=form_data) as response: # Replace with session.patch for reingesting\n",
-    "                await print_response(response)\n",
-    "                # Return the response JSON for task_id extraction\n",
-    "                response_json = await response.json()\n",
-    "                return response_json\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            print(f\"Error uploading documents: {e}\")\n",
-    "            return None\n",
-    "\n",
-    "# Store the response and extract task_id\n",
-    "upload_response = await upload_documents(collection_name=\"financebench\")\n",
-    "task_id = upload_response.get(\"task_id\") if upload_response else None\n",
-    "print(f\"Extracted task_id: {task_id}\")\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "798b7771",
-   "metadata": {},
-   "source": [
-    "**⚠️ Note**: During the document ingestion process, two files (`INTEL_2023_8K_dated-2023-08-16.pdf` and `INTEL_2023_8K_dated-2023-02-10.pdf`) may fail to process due to formatting issues. This is expected and can be safely ignored, as it will not affect the evaluation methodology or results. The remaining documents in the dataset are sufficient for comprehensive evaluation."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "82b3e199",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# This might take a few minutes to complete depending on the number of documents uploaded\n",
-    "async def get_task_status(\n",
-    "    task_id: str\n",
-    "):\n",
-    "\n",
-    "    params = {\n",
-    "        \"task_id\": task_id,\n",
-    "    }\n",
-    "\n",
-    "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.get(f\"{INGESTOR_BASE_URL}/v1/status\", params=params, headers=HEADERS) as response:\n",
-    "                await print_response(response)\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            return 500, {\"error\": str(e)}\n",
-    "\n",
-    "# Use the extracted task_id from the upload_documents response\n",
-    "if task_id:\n",
-    "    await get_task_status(task_id=task_id)\n",
-    "else:\n",
-    "    print(\"No task_id available. Please run the upload_documents cell first.\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8bb5edff",
-   "metadata": {},
-   "source": [
-    "## 3. Create Dataset for Ragas Evaluation\n",
-    "\n",
-    "In `data/financebench/data`, there is a file called `financebench_open_source.jsonl`. This file contains questions about the PDFs, as well as corresponding ground truth answers.\n",
-    "\n",
-    "For each ground-truth question and answer pair, we will:\n",
-    "1. Generate an answer from our RAG system\n",
-    "2. Retrieve the relevant document contexts\n",
-    "3. Create a dataset suitable for Ragas evaluation\n",
-    "\n",
-    "The answer and context retrieval from the RAG system is similar to the `retriever_api_usage` notebook.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b96c09f1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "IPADDRESS = \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\" #Replace this with the correct IP address\n",
-    "RAG_SERVER_PORT = \"8081\"\n",
-    "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"  # Replace with your server URL\n",
-    "\n",
-    "generate_url = f\"{RAG_BASE_URL}/v1/generate\"\n",
-    "\n",
-    "async def generate_answer(payload):\n",
-    "    \"\"\"Generate an answer using the RAG server.\"\"\"\n",
-    "    rag_response = \"\"\n",
-    "    citations = []\n",
-    "    is_first_token = True\n",
-    "\n",
-    "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
-    "        try:\n",
-    "            async with client.stream(\"POST\", url=generate_url, json=payload) as response:\n",
-    "                # Raise an exception for bad status codes like 4xx or 5xx\n",
-    "                response.raise_for_status()\n",
-    "\n",
-    "                # iterate over the response lines\n",
-    "                async for line in response.aiter_lines():\n",
-    "                    if line.startswith(\"data: \"):\n",
-    "                        json_str = line[6:].strip()\n",
-    "                        if not json_str:\n",
-    "                            continue\n",
-    "\n",
-    "                        try:\n",
-    "                            data = json.loads(json_str)\n",
-    "\n",
-    "                            # --- Extract the response from the RAG server ---\n",
-    "                            message = data.get(\"choices\", [{}])[0].get(\"message\", {}).get(\"content\", \"\")\n",
-    "                            if message:\n",
-    "                                rag_response += message\n",
-    "\n",
-    "                            # --- Extract the citations from the RAG server ---\n",
-    "                            if is_first_token and data.get(\"citations\"):\n",
-    "                                for result in data.get(\"citations\", {}).get(\"results\", []):\n",
-    "                                    description = result.get(\"metadata\", {}).get(\"description\")\n",
-    "                                    if description:\n",
-    "                                        citations.append(description)\n",
-    "                                is_first_token = False\n",
-    "\n",
-    "                            finish_reason = data.get(\"choices\", [{}])[0].get(\"finish_reason\")\n",
-    "                            if finish_reason == \"stop\":\n",
-    "                                return rag_response, citations\n",
-    "\n",
-    "                        except json.JSONDecodeError:\n",
-    "                            print(f\"Skipping malformed JSON line: {json_str}\")\n",
-    "                            continue\n",
-    "        \n",
-    "        except httpx.HTTPStatusError as e:\n",
-    "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
-    "        except httpx.RequestError as e:\n",
-    "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"An error occurred: {e}\")\n",
-    "\n",
-    "    return rag_response, citations\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "805c5744",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load the question and ground-truth answer pairs from the FinanceBench dataset\n",
-    "with open('../data/financebench/data/financebench_open_source.jsonl', 'r') as file:\n",
-    "    gt_qa_pairs = [json.loads(line) for line in file]\n",
-    "\n",
-    "print(f\"Loaded {len(gt_qa_pairs)} question-answer pairs from FinanceBench dataset\")\n",
-    "\n",
-    "dataset = []\n",
-    "\n",
-    "# For the purposes of keeping this demo brief, we will only evaluate on 50 questions. \n",
-    "# You can increase this to the full dataset for more comprehensive results.\n",
-    "n = 50 \n",
-    "print(f\"Evaluating on {n} questions...\")\n",
-    "\n",
-    "for idx, qa_pair in enumerate(gt_qa_pairs[:n]):\n",
-    "    question = qa_pair['question']\n",
-    "    \n",
-    "    print(f\"Processing question {idx + 1}/{n}: {question[:100]}...\")\n",
-    "\n",
-    "    generate_payload = {\n",
-    "        \"messages\": [\n",
-    "            {\n",
-    "                \"role\": \"user\",\n",
-    "                \"content\": question\n",
-    "            }\n",
-    "        ],\n",
-    "        \"use_knowledge_base\": True,\n",
-    "        \"reranker_top_k\": 2,\n",
-    "        \"vdb_top_k\": 10,\n",
-    "        \"vdb_endpoint\": \"http://milvus:19530\",\n",
-    "        \"collection_names\": [\"financebench\"],\n",
-    "        \"enable_reranker\": True,\n",
-    "        \"enable_citations\": True,\n",
-    "        \"stop\": [],\n",
-    "        \"filter_expr\": ''\n",
-    "    }\n",
-    "    \n",
-    "    rag_answer, citations = await generate_answer(generate_payload)\n",
-    "\n",
-    "    dataset.append({\n",
-    "        \"user_input\": question,\n",
-    "        \"retrieved_contexts\": citations,\n",
-    "        \"response\": rag_answer,\n",
-    "        \"reference\": qa_pair['answer'],\n",
-    "    })\n",
-    "\n",
-    "print(f\"Created dataset with {len(dataset)} entries for evaluation\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "43e68742",
-   "metadata": {},
-   "source": [
-    "\n",
-    "## 4. Evaluate with Ragas\n",
-    "\n",
-    "In this example, we will use the NVIDIA hosted endpoint for our judge model. To use this endpoint, please provide your NVIDIA API Key below. \n",
-    "\n",
-    "### Rate Limiting Considerations\n",
-    "\n",
-    "When using the public endpoint for the Judge LLM, you will likely encounter rate limit errors. We can try to reduce the number of errors by adjusting the configuration, which we do below. \n",
-    "\n",
-    "Alternatively, you can use self-hosted NIM Microservices endpoints to avoid these errors altogether. If you're using a self-hosted NIM, you do not need to provide your API Key.\n",
-    "\n",
-    "### Getting Your NVIDIA API Key\n",
-    "\n",
-    "To generate an API Key:\n",
-    "1. Go to [build.nvidia.com](https://build.nvidia.com/)\n",
-    "2. Click the green \"Get API Key\" button in the top right corner\n",
-    "3. Paste your key below to save it as an environment variable\n",
-    "\n",
-    "### Self-Hosted Option\n",
-    "\n",
-    "To deploy the Judge LLM as a NIM on your own infrastructure, follow the instructions [here](https://build.nvidia.com/mistralai/mixtral-8x22b-instruct/deploy).\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "31df3819",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "from getpass import getpass\n",
-    "# del os.environ['NVIDIA_API_KEY']  ## delete key and reset if needed\n",
-    "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
-    "    print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n",
-    "else:\n",
-    "    candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n",
-    "    assert candidate_api_key.startswith(\"nvapi-\"), (\n",
-    "        f\"{candidate_api_key[:5]}... is not a valid key\"\n",
-    "    )\n",
-    "    os.environ[\"NVIDIA_API_KEY\"] = candidate_api_key"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "78fb75fe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "# Note: Models on build.nvidia.com are rate limited.\n",
-    "# To avoid rate-limit issues, either deploy the judge model locally (self-hosted NIM)\n",
-    "# or use any OpenAI-compatible LLM as the judge for evaluation.\n",
-    "from langchain_nvidia_ai_endpoints.chat_models import ChatNVIDIA\n",
-    "\n",
-    "# Initialize the judge LLM for evaluation\n",
-    "# You can use any other model by creating Chat Model object\n",
-    "llm = ChatNVIDIA(model=\"openai/gpt-oss-120b\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "928a3c8a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create the evaluation dataset from our collected data\n",
-    "from ragas import EvaluationDataset\n",
-    "\n",
-    "evaluation_dataset = EvaluationDataset.from_list(dataset)\n",
-    "print(f\"Created evaluation dataset with {len(evaluation_dataset)} samples\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b3ec24f4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import the required metrics and evaluation components\n",
-    "from ragas.metrics import AnswerAccuracy, ContextRelevance, ResponseGroundedness\n",
-    "from ragas import evaluate\n",
-    "from ragas.llms import LangchainLLMWrapper\n",
-    "\n",
-    "# Wrap the LLM for use with Ragas\n",
-    "evaluator_llm = LangchainLLMWrapper(llm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9f2f4245",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from ragas.run_config import RunConfig\n",
-    "\n",
-    "custom_config = RunConfig(max_workers=1, max_wait=120)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3a3571af",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run the evaluation with our three metrics\n",
-    "print(\"Starting Ragas evaluation...\")\n",
-    "print(\"This may take several minutes depending on the dataset size.\")\n",
-    "\n",
-    "results = evaluate(\n",
-    "    dataset=evaluation_dataset,\n",
-    "    metrics=[AnswerAccuracy(), ContextRelevance(), ResponseGroundedness()],\n",
-    "    llm=evaluator_llm, \n",
-    "    run_config=custom_config\n",
-    ")\n",
-    "\n",
-    "print(\"Evaluation completed!\")\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "bac9dde6",
-   "metadata": {},
-   "source": [
-    "## 5. Analyze Results\n",
-    "\n",
-    "Finally, let's examine our evaluation results. We'll look at both the overall metrics and individual sample performance."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4c90647f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "results"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2da683a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Convert results to pandas DataFrame for detailed analysis of individual queries\n",
-    "results_df = results.to_pandas()\n",
-    "\n",
-    "import pandas as pd\n",
-    "\n",
-    "# 1. Set the option to display ALL columns, preventing the '...'\n",
-    "pd.set_option('display.max_columns', None)\n",
-    "\n",
-    "# 2. To prevent long text in cells from being cut off, you can set the column width\n",
-    "pd.set_option('display.max_colwidth', 80)\n",
-    "\n",
-    "results_df.head()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "evaluate",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+  "nbformat": 4,
+  "nbformat_minor": 5
 }
diff --git a/notebooks/image_input.ipynb b/notebooks/image_input.ipynb
index 622ec849b..5698b7985 100644
--- a/notebooks/image_input.ipynb
+++ b/notebooks/image_input.ipynb
@@ -1,992 +1,992 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "e20e694c",
-   "metadata": {},
-   "source": [
-    "# Retriever API Usage with Multimodal Query Support\n",
-    "\n",
-    "This notebook demonstrates how to use the NVIDIA RAG retriever APIs with **multimodal queries** (text + images). You'll learn how to:\n",
-    "\n",
-    "- 🔍 Search for relevant documents using queries that contain images\n",
-    "- 🤖 Generate AI responses using the end-to-end RAG API with vision-language models (VLMs)\n",
-    "- 📊 Work with multimodal embeddings and vector databases\n",
-    "\n",
-    "**Use Case**: Query documents with images (e.g., \"What is the price of this item?\" + product image)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0152f1eb",
-   "metadata": {},
-   "source": [
-    "## 📦 Setting up the Dependencies\n",
-    "\n",
-    "This section will guide you through:\n",
-    "1. Configuring your NGC API key for accessing NVIDIA services\n",
-    "2. Deploying the Milvus vector database\n",
-    "3. Setting up NVIDIA NIMs (NVIDIA Inference Microservices) for embeddings and VLM\n",
-    "4. Starting the NVIDIA Ingest runtime for document processing\n",
-    "5. Launching the RAG server\n",
-    "\n",
-    "**Note**: This setup uses Docker Compose to orchestrate all services."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d77a630e",
-   "metadata": {},
-   "source": [
-    "### 0. Create a Virtual Environment (Recommended)\n",
-    "\n",
-    "Before running this notebook, create a virtual environment using `uv` to isolate dependencies:\n",
-    "\n",
-    "```bash\n",
-    "# Create a virtual environment\n",
-    "uv venv .venv\n",
-    "\n",
-    "# Activate the virtual environment\n",
-    "source .venv/bin/activate  # Linux/macOS\n",
-    "# .venv\\Scripts\\activate   # Windows\n",
-    "\n",
-    "# Install Jupyter Lab and ipykernel (if not already installed)\n",
-    "uv pip install jupyterlab ipykernel\n",
-    "\n",
-    "# Register the venv as a Jupyter kernel\n",
-    "python -m ipykernel install --user --name=.venv --display-name=\"Python (.venv)\"\n",
-    "```\n",
-    "\n",
-    "After setup, select the venv as the kernel for this notebook:\n",
-    "1. In Jupyter/VS Code/Cursor, click on the kernel selector (top right)\n",
-    "2. Choose **\".venv\"** or **\"Python (.venv)\"** as the kernel\n",
-    "\n",
-    "This ensures all packages installed via `uv pip install` in the notebook cells are installed into the isolated environment.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c39e628e",
-   "metadata": {},
-   "source": [
-    "### 1. Setup the Default Configurations\n",
-    "\n",
-    "Import necessary libraries for environment management."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c03780a7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install python-dotenv for environment variable management\n",
-    "! uv pip install python-dotenv\n",
-    "\n",
-    "import os\n",
-    "from getpass import getpass"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a19cef7",
-   "metadata": {},
-   "source": [
-    "Provide your NGC_API_KEY after executing the cell below. You can obtain a key by following steps [here](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/quickstart.md##obtain-an-api-key)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c1f7ffa3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Check if NGC_API_KEY is already set, otherwise prompt for it\n",
-    "# Uncomment the line below to reset your API key\n",
-    "# del os.environ['NGC_API_KEY']\n",
-    "\n",
-    "if os.environ.get(\"NGC_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
-    "    print(\"Valid NGC_API_KEY already in environment. Delete to reset\")\n",
-    "else:\n",
-    "    candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n",
-    "    assert candidate_api_key.startswith(\"nvapi-\"), (\n",
-    "        f\"{candidate_api_key[:5]}... is not a valid key\"\n",
-    "    )\n",
-    "    os.environ[\"NGC_API_KEY\"] = candidate_api_key"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "20ec8b61",
-   "metadata": {},
-   "source": [
-    "Login to nvcr.io which is needed for pulling the containers of dependencies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "03972882",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Login to NVIDIA Container Registry (nvcr.io) to pull required containers\n",
-    "!echo \"${NGC_API_KEY}\" | docker login nvcr.io -u '$oauthtoken' --password-stdin"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "84642fbb",
-   "metadata": {},
-   "source": [
-    "### 2. Setup the Milvus Vector Database\n",
-    "\n",
-    "Milvus is a high-performance vector database used to store and search multimodal embeddings.\n",
-    "\n",
-    "**Configuration Notes**:\n",
-    "- By default, Milvus uses GPU indexing for faster performance\n",
-    "- Ensure you have provided the correct GPU ID below\n",
-    "- If you don't have a GPU available, you can switch to CPU-only Milvus by following the instructions in [milvus-configuration.md](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/milvus-configuration.md)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8125f717",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Specify which GPU to use for Milvus (change if using a different GPU)\n",
-    "os.environ[\"VECTORSTORE_GPU_DEVICE_ID\"] = \"0\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3e2d3457",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Start Milvus vector database service\n",
-    "# This will run in the background (-d flag)\n",
-    "!docker compose -f ../deploy/compose/vectordb.yaml up -d"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "afe17557",
-   "metadata": {},
-   "source": [
-    "### 3. Setup NVIDIA Inference Microservices (NIMs)\n",
-    "\n",
-    "NIMs provide optimized inference for AI models. For multimodal RAG, we need:\n",
-    "- **VLM (Vision-Language Model)**: `nvidia/nemotron-nano-12b-v2-vl` for understanding images and generating responses\n",
-    "- **Embedding Model**: `llama-3.2-nemoretriever-1b-vlm-embed-v1` for creating multimodal embeddings"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "89a135eb",
-   "metadata": {},
-   "source": [
-    "#### Deploy On-Premise Models\n",
-    "\n",
-    "This section deploys NIMs locally using Docker. Models will be cached to avoid re-downloading."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1b3d2e5c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create the model cache directory\n",
-    "!mkdir -p ~/.cache/model-cache"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "390df52d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Set the MODEL_DIRECTORY environment variable to specify where models are cached\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"MODEL_DIRECTORY\"] = os.path.expanduser(\"~/.cache/model-cache\")\n",
-    "print(\"MODEL_DIRECTORY set to:\", os.environ[\"MODEL_DIRECTORY\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "62a9946a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Deploy NIMs with VLM and embedding profiles\n",
-    "# ⚠️ WARNING: This may take 10-20 minutes as models download (~10GB+)\n",
-    "# If the kernel times out, just rerun this cell - it will resume where it left off\n",
-    "# Select a free GPU for VLM Microservice\n",
-    "os.environ[\"VLM_MS_GPU_ID\"] = \"1\"\n",
-    "! USERID=$(id -u) docker compose --profile vlm-ingest --profile vlm-only -f ../deploy/compose/nims.yaml up -d"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e91f511a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Monitor the status of running containers\n",
-    "# Run this cell repeatedly to check if all services are healthy\n",
-    "# Look for STATUS showing \"healthy\" or \"Up\" for all containers\n",
-    "!docker ps"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cfb34a6a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Configure the model names and service URLs for the RAG pipeline\n",
-    "# These settings tell the RAG server which models and endpoints to use\n",
-    "\n",
-    "# VLM (Vision-Language Model) configuration\n",
-    "os.environ[\"APP_VLM_MODELNAME\"] = \"nvidia/nemotron-nano-12b-v2-vl\"\n",
-    "os.environ[\"APP_VLM_SERVERURL\"] = \"http://vlm-ms:8000/v1\"\n",
-    "\n",
-    "# Multimodal embedding model configuration\n",
-    "os.environ[\"APP_EMBEDDINGS_MODELNAME\"] = \"nvidia/llama-nemotron-embed-vl-1b-v2\"\n",
-    "os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"nemotron-vlm-embedding-ms:8000/v1\"\n",
-    "os.environ[\"ENABLE_VLM_INFERENCE\"] = \"true\"\n",
-    "os.environ[\"VLM_TO_LLM_FALLBACK\"] = \"false\"\n",
-    "os.environ[\"ENABLE_RERANKER\"] = \"false\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e62c7037",
-   "metadata": {},
-   "source": [
-    "#### Cloud based deployment\n",
-    "Using NVIDIA hosted cloud model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "82084d4d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "# OCR and document processing endpoints - cloud hosted\n",
-    "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
-    "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
-    "os.environ[\"OCR_MODEL_NAME\"] = \"scene_text_ensemble\"\n",
-    "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
-    "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
-    "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
-    "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
-    "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
-    "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\"\n",
-    "os.environ[\"APP_NVINGEST_CAPTIONENDPOINTURL\"] = \"https://integrate.api.nvidia.com/v1/chat/completions\"\n",
-    "\n",
-    "# VLM Model configuration - cloud hosted\n",
-    "os.environ[\"APP_VLM_MODELNAME\"] = \"nvidia/nemotron-nano-12b-v2-vl\"\n",
-    "os.environ[\"APP_VLM_SERVERURL\"] = \"https://integrate.api.nvidia.com/v1\"\n",
-    "os.environ[\"APP_LLM_SERVERURL\"] = \"\"\n",
-    "\n",
-    "# Multimodal embedding model configuration - cloud hosted\n",
-    "os.environ[\"APP_EMBEDDINGS_MODELNAME\"] = \"nvidia/llama-nemotron-embed-vl-1b-v2\"\n",
-    "os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"https://integrate.api.nvidia.com/v1\"\n",
-    "os.environ[\"ENABLE_VLM_INFERENCE\"] = \"true\"\n",
-    "os.environ[\"VLM_TO_LLM_FALLBACK\"] = \"false\"\n",
-    "os.environ[\"ENABLE_RERANKER\"] = \"false\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7cbcfa50",
-   "metadata": {},
-   "source": [
-    "### 4. Setup NVIDIA Ingest Runtime\n",
-    "\n",
-    "NVIDIA Ingest processes documents to extract text, images, and other elements. We'll configure it to:\n",
-    "- Extract images from documents\n",
-    "- Handle multimodal content"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a5e0d73f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Configure NVIDIA Ingest to extract and process images from documents\n",
-    "os.environ[\"APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY\"] = \"\"  # No special handling for structured elements\n",
-    "os.environ[\"APP_NVINGEST_IMAGE_ELEMENTS_MODALITY\"] = \"image\"  # Process image elements as images\n",
-    "os.environ[\"APP_NVINGEST_EXTRACTIMAGES\"] = \"True\"  # Extract images from documents\n",
-    "\n",
-    "# Start the ingestor server with Redis\n",
-    "! docker compose -f ../deploy/compose/docker-compose-ingestor-server.yaml up -d --build"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "da1bd9a3",
-   "metadata": {},
-   "source": [
-    "### 5. Setup the NVIDIA RAG Server\n",
-    "\n",
-    "The RAG server provides the main API endpoints for search and generation. It orchestrates all the components (embeddings, vector DB, VLM) to deliver intelligent responses."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "38ba7752",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Start the RAG server (accessible at localhost:8081)\n",
-    "os.environ[\"APP_RANKING_SERVERURL\"] = \"\"\n",
-    "! docker compose -f ../deploy/compose/docker-compose-rag-server.yaml up -d --build"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ce492ce3",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "## 📚 Document Ingestion Workflow\n",
-    "\n",
-    "Now that all services are running, let's ingest documents into a collection.\n",
-    "\n",
-    "### 6. Create a Collection\n",
-    "\n",
-    "A collection is a logical grouping of documents in the vector database. Think of it as a database table optimized for similarity search."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a8611aa1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install aiohttp for async HTTP requests\n",
-    "! uv pip install aiohttp\n",
-    "\n",
-    "# Configure the ingestor server URL\n",
-    "# Use \"ingestor-server\" when running in AI Workbench, otherwise \"localhost\"\n",
-    "IPADDRESS = (\n",
-    "    \"ingestor-server\"\n",
-    "    if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\"\n",
-    "    else \"localhost\"\n",
-    ")\n",
-    "INGESTOR_SERVER_PORT = \"8082\"\n",
-    "BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"\n",
-    "\n",
-    "async def print_response(response):\n",
-    "    \"\"\"Helper function to pretty-print API responses.\"\"\"\n",
-    "    try:\n",
-    "        response_json = await response.json()\n",
-    "        print(json.dumps(response_json, indent=2))\n",
-    "    except aiohttp.ClientResponseError:\n",
-    "        print(await response.text())\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "688bc70f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define a unique name for your collection\n",
-    "# Change this if you want to create a different collection\n",
-    "collection_name = \"multimodal_query\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "24378f6f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import aiohttp\n",
-    "import json\n",
-    "\n",
-    "\n",
-    "async def create_collection(\n",
-    "    collection_name: str | None = None,\n",
-    "    metadata_schema: list = [],\n",
-    "):\n",
-    "    \"\"\"\n",
-    "    Create a new collection in the vector database.\n",
-    "    \n",
-    "    Args:\n",
-    "        collection_name: Unique identifier for the collection\n",
-    "        metadata_schema: Optional schema for metadata fields\n",
-    "    \"\"\"\n",
-    "    data = {\n",
-    "        \"collection_name\": collection_name,\n",
-    "        \"metadata_schema\": metadata_schema,\n",
-    "    }\n",
-    "\n",
-    "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.post(\n",
-    "                f\"{BASE_URL}/v1/collection\", json=data, headers=HEADERS\n",
-    "            ) as response:\n",
-    "                await print_response(response)\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            return 500, {\"error\": str(e)}\n",
-    "\n",
-    "\n",
-    "# Create the collection\n",
-    "# The embedding dimension is 2048 for the multimodal embedding model we're using\n",
-    "await create_collection(\n",
-    "    collection_name=collection_name,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a29f4633",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Specify the documents to upload\n",
-    "# This PDF contains product images with pricing information\n",
-    "FILEPATHS = [\n",
-    "    \"../data/multimodal/product_catalog.pdf\",\n",
-    "]\n",
-    "\n",
-    "async def upload_documents(collection_name: str = \"\"):\n",
-    "    \"\"\"\n",
-    "    Upload and process documents into the collection.\n",
-    "    \n",
-    "    This will:\n",
-    "    1. Extract text and images from the PDFs\n",
-    "    2. Chunk the content for optimal retrieval\n",
-    "    3. Generate multimodal embeddings\n",
-    "    4. Store everything in the vector database\n",
-    "    \"\"\"\n",
-    "    data = {\n",
-    "        \"collection_name\": collection_name,\n",
-    "        \"blocking\": False,  # Async upload - use status API to check progress\n",
-    "        \"split_options\": {\n",
-    "            \"chunk_size\": 512,        # Characters per chunk\n",
-    "            \"chunk_overlap\": 150      # Overlap between chunks for context\n",
-    "        },\n",
-    "        \"generate_summary\": False  # Set to True to generate document summaries\n",
-    "    }\n",
-    "\n",
-    "    form_data = aiohttp.FormData()\n",
-    "    \n",
-    "    # Add all PDF files to the form data\n",
-    "    for file_path in FILEPATHS:\n",
-    "        form_data.add_field(\"documents\", open(file_path, \"rb\"), \n",
-    "                          filename=os.path.basename(file_path), \n",
-    "                          content_type=\"application/pdf\")\n",
-    "\n",
-    "    form_data.add_field(\"data\", json.dumps(data), content_type=\"application/json\")\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            # Use POST for new uploads, PATCH for re-ingesting existing documents\n",
-    "            async with session.post(f\"{BASE_URL}/v1/documents\", data=form_data) as response:\n",
-    "                await print_response(response)\n",
-    "                response_json = await response.json()\n",
-    "                return response_json\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            print(f\"Error uploading documents: {e}\")\n",
-    "            return None\n",
-    "\n",
-    "# Upload the documents and get the task ID for tracking progress\n",
-    "upload_response = await upload_documents(collection_name=collection_name)\n",
-    "task_id = upload_response.get(\"task_id\") if upload_response else None\n",
-    "print(f\"\\nTask ID for tracking: {task_id}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2234e059",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "async def get_task_status(task_id: str):\n",
-    "    \"\"\"\n",
-    "    Check the status of an asynchronous ingestion task.\n",
-    "    \n",
-    "    Possible statuses:\n",
-    "    - \"pending\": Task is queued\n",
-    "    - \"processing\": Currently processing documents\n",
-    "    - \"completed\": Successfully finished\n",
-    "    - \"failed\": Error occurred\n",
-    "    \"\"\"\n",
-    "    params = {\n",
-    "        \"task_id\": task_id,\n",
-    "    }\n",
-    "\n",
-    "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.get(\n",
-    "                f\"{BASE_URL}/v1/status\", params=params, headers=HEADERS\n",
-    "            ) as response:\n",
-    "                await print_response(response)\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            return 500, {\"error\": str(e)}\n",
-    "\n",
-    "\n",
-    "# Check the ingestion status\n",
-    "# Run this cell multiple times until status shows \"completed\"\n",
-    "await get_task_status(\n",
-    "    task_id=[task_id]\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e2c1f1a8",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "## 🔍 Querying with Multimodal Inputs\n",
-    "\n",
-    "Now that documents are ingested, let's query them using both text and images!\n",
-    "\n",
-    "### 7. Using the Search and Generate APIs\n",
-    "\n",
-    "We'll demonstrate two approaches:\n",
-    "1. **Search API**: Find relevant documents without generating a response\n",
-    "2. **Generate API**: Get an AI-generated answer with citations"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3990ca33",
-   "metadata": {},
-   "source": [
-    "#### Prepare a Multimodal Query\n",
-    "\n",
-    "To query with an image, we need to:\n",
-    "1. Convert the image to base64 encoding\n",
-    "2. Format it according to the OpenAI vision API format\n",
-    "3. Combine it with a text prompt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "02dde830",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! uv pip install requests httpx\n",
-    "import base64\n",
-    "import requests\n",
-    "from IPython.display import Image, Markdown, display\n",
-    "\n",
-    "def get_base64_image(image_source: str) -> str:\n",
-    "    \"\"\"\n",
-    "    Convert an image to base64 encoding.\n",
-    "    \n",
-    "    Args:\n",
-    "        image_source: Local file path or URL to the image\n",
-    "        \n",
-    "    Returns:\n",
-    "        Base64 encoded string of the image\n",
-    "    \"\"\"\n",
-    "    if image_source.startswith(('http://', 'https://')):\n",
-    "        # Download image from URL\n",
-    "        response = requests.get(image_source)\n",
-    "        return base64.b64encode(response.content).decode()\n",
-    "    else:\n",
-    "        # Read local file\n",
-    "        with open(image_source, \"rb\") as image_file:\n",
-    "            return base64.b64encode(image_file.read()).decode()\n",
-    "\n",
-    "# Convert the query image to base64\n",
-    "# Try different images to test different queries:\n",
-    "image_b64 = get_base64_image(\"../data/multimodal/Creme_clutch_purse1-small.jpg\")\n",
-    "\n",
-    "# Display the query image for reference\n",
-    "query_image_path = \"../data/multimodal/Creme_clutch_purse1-small.jpg\"\n",
-    "print(\"📷 Query Image:\")\n",
-    "display(Image(filename=query_image_path, width=300))\n",
-    "\n",
-    "# Format as a data URL\n",
-    "image_input = f\"data:image/png;base64,{image_b64}\"\n",
-    "\n",
-    "# Create the multimodal query with text + image\n",
-    "# This follows the OpenAI vision API format\n",
-    "query_1 = \"What material is this made of?\"\n",
-    "image_query = [\n",
-    "    {\"type\": \"text\", \"text\": query_1},\n",
-    "    {\n",
-    "        \"type\": \"image_url\",\n",
-    "        \"image_url\": {\n",
-    "            \"url\": image_input,\n",
-    "            \"detail\": \"auto\"  # Let the model decide the appropriate detail level\n",
-    "        }\n",
-    "    }\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c311f9d0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import httpx\n",
-    "import json\n",
-    "from IPython.display import Image, Markdown, display\n",
-    "\n",
-    "RAG_BASE_URL = \"http://localhost:8081\"\n",
-    "\n",
-    "async def search_documents(payload):\n",
-    "    \"\"\"\n",
-    "    Search for relevant documents using a multimodal query.\n",
-    "    \n",
-    "    This performs similarity search in the vector database and optionally\n",
-    "    reranks results for better relevance.\n",
-    "    \"\"\"\n",
-    "    search_url = f\"{RAG_BASE_URL}/v1/search\"\n",
-    "    \n",
-    "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
-    "        try:\n",
-    "            response = await client.post(url=search_url, json=payload)\n",
-    "            response.raise_for_status()\n",
-    "            \n",
-    "            search_results = response.json()\n",
-    "            print(\"Search Results:\")\n",
-    "            \n",
-    "            # Display search results with nice formatting\n",
-    "            if \"results\" in search_results:\n",
-    "                for idx, result in enumerate(search_results[\"results\"]):\n",
-    "                    doc_type = result.get(\"document_type\", \"text\")\n",
-    "                    content = result.get(\"content\", \"\")\n",
-    "                    doc_name = result.get(\"document_name\", f\"Result {idx + 1}\")\n",
-    "                    score = result.get(\"score\", \"N/A\")\n",
-    "                    \n",
-    "                    display(Markdown(f\"**Result {idx + 1}: {doc_name} (Score: {score})**\"))\n",
-    "                    try:\n",
-    "                        if doc_type == \"image\":\n",
-    "                            # Display image results\n",
-    "                            image_bytes = base64.b64decode(content)\n",
-    "                            display(Image(data=image_bytes))\n",
-    "                        else:\n",
-    "                            # Display text results\n",
-    "                            display(Markdown(f\"```\\n{content}\\n```\"))\n",
-    "                    except Exception as e:\n",
-    "                        print(f\"Error displaying content: {e}\")\n",
-    "                        display(Markdown(f\"```\\n{content}\\n```\"))\n",
-    "            \n",
-    "            return search_results\n",
-    "            \n",
-    "        except httpx.HTTPStatusError as e:\n",
-    "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
-    "        except httpx.RequestError as e:\n",
-    "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"An error occurred: {e}\")\n",
-    "\n",
-    "# Configure the search parameters\n",
-    "search_payload = {\n",
-    "    \"query\": image_query,                      # Our multimodal query (text + image)\n",
-    "    \"messages\": [],                            # No conversation history\n",
-    "    \"use_knowledge_base\": True,                # Search the vector database\n",
-    "    \"collection_names\": [collection_name],     # Which collection to search\n",
-    "    \"vdb_top_k\": 5,                           # Retrieve top 5 results from vector DB\n",
-    "    \"vdb_endpoint\": \"http://milvus:19530\",    # Milvus connection string\n",
-    "    \"enable_reranker\": False,                  # Set to True for better relevance (slower)\n",
-    "    \"reranker_top_k\": 3,                      # If reranker enabled, return top 3\n",
-    "    \"filter_expr\": \"\",                        # Optional metadata filter\n",
-    "}\n",
-    "\n",
-    "# Execute the search\n",
-    "print(\"🔍 Searching for documents matching the query...\\n\")\n",
-    "search_result = await search_documents(search_payload)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bc5b1545",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import base64\n",
-    "import json\n",
-    "from IPython.display import Image, Markdown, display\n",
-    "\n",
-    "\n",
-    "async def print_streaming_response_and_citations(response_generator):\n",
-    "    \"\"\"\n",
-    "    Helper function to display streaming responses with citations.\n",
-    "    \n",
-    "    This function:\n",
-    "    1. Streams the AI-generated response token by token\n",
-    "    2. Extracts citations from the first chunk\n",
-    "    3. Displays citations (text or images) after the response completes\n",
-    "    \"\"\"\n",
-    "    first_chunk_data = None\n",
-    "    \n",
-    "    async for chunk in response_generator:\n",
-    "        # Parse Server-Sent Events (SSE) format\n",
-    "        if chunk.startswith(\"data: \"):\n",
-    "            chunk = chunk[len(\"data: \") :].strip()\n",
-    "        if not chunk:\n",
-    "            continue\n",
-    "            \n",
-    "        try:\n",
-    "            data = json.loads(chunk)\n",
-    "        except Exception as e:\n",
-    "            print(f\"JSON decode error: {e}\")\n",
-    "            continue\n",
-    "            \n",
-    "        choices = data.get(\"choices\", [])\n",
-    "        if not choices:\n",
-    "            continue\n",
-    "            \n",
-    "        # Save the first chunk with citations\n",
-    "        if first_chunk_data is None and data.get(\"citations\"):\n",
-    "            first_chunk_data = data\n",
-    "            \n",
-    "        # Print streaming text\n",
-    "        delta = choices[0].get(\"delta\", {})\n",
-    "        text = delta.get(\"content\")\n",
-    "        if not text:\n",
-    "            message = choices[0].get(\"message\", {})\n",
-    "            text = message.get(\"content\", \"\")\n",
-    "        print(text, end=\"\", flush=True)\n",
-    "        \n",
-    "    print()  # Newline after streaming\n",
-    "\n",
-    "    # Display citations after streaming is done\n",
-    "    if first_chunk_data and first_chunk_data.get(\"citations\"):\n",
-    "        print(\"\\n📚 Citations:\")\n",
-    "        citations = first_chunk_data[\"citations\"]\n",
-    "        for idx, citation in enumerate(citations.get(\"results\", [])):\n",
-    "            doc_type = citation.get(\"document_type\", \"text\")\n",
-    "            content = citation.get(\"content\", \"\")\n",
-    "            doc_name = citation.get(\"document_name\", f\"Citation {idx + 1}\")\n",
-    "            display(Markdown(f\"**Citation {idx + 1}: {doc_name}**\"))\n",
-    "            try:\n",
-    "                # Try to display as image\n",
-    "                image_bytes = base64.b64decode(content)\n",
-    "                display(Image(data=image_bytes))\n",
-    "            except Exception:\n",
-    "                # Fall back to text display\n",
-    "                display(Markdown(f\"```\\n{content}\\n```\"))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "31071359",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import httpx\n",
-    "\n",
-    "# Configure RAG server URL\n",
-    "IPADDRESS = \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\"\n",
-    "RAG_SERVER_PORT = \"8081\"\n",
-    "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"\n",
-    "generate_url = f\"{RAG_BASE_URL}/v1/generate\"\n",
-    "\n",
-    "async def generate_answer(payload):\n",
-    "    \"\"\"\n",
-    "    Generate an AI answer using the RAG pipeline.\n",
-    "    \n",
-    "    This function:\n",
-    "    1. Sends the query to the RAG server\n",
-    "    2. Retrieves relevant context from the vector database\n",
-    "    3. Streams the AI-generated response\n",
-    "    4. Displays citations (sources) used to generate the answer\n",
-    "    \"\"\"\n",
-    "    rag_response = \"\"\n",
-    "    citations = []\n",
-    "    is_first_token = True\n",
-    "\n",
-    "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
-    "        try:\n",
-    "            async with client.stream(\"POST\", url=generate_url, json=payload) as response:\n",
-    "                # Raise an exception for bad status codes like 4xx or 5xx\n",
-    "                response.raise_for_status()\n",
-    "\n",
-    "                # Iterate over the streaming response\n",
-    "                async for line in response.aiter_lines():\n",
-    "                    if line.startswith(\"data: \"):\n",
-    "                        json_str = line[6:].strip()\n",
-    "                        if not json_str:\n",
-    "                            continue\n",
-    "\n",
-    "                        try:\n",
-    "                            data = json.loads(json_str)\n",
-    "\n",
-    "                            # Extract and display the streaming response\n",
-    "                            message = data.get(\"choices\", [{}])[0].get(\"message\", {}).get(\"content\", \"\")\n",
-    "                            if message:\n",
-    "                                rag_response += message\n",
-    "\n",
-    "                            # Extract and display citations from the first chunk\n",
-    "                            if is_first_token and data.get(\"citations\"):\n",
-    "                                print(\"\\n📚 Citations:\")\n",
-    "                                citations = data[\"citations\"]\n",
-    "                                for idx, citation in enumerate(citations.get(\"results\", [])):\n",
-    "                                    doc_type = citation.get(\"document_type\", \"text\")\n",
-    "                                    content = citation.get(\"content\", \"\")\n",
-    "                                    doc_name = citation.get(\"document_name\", f\"Citation {idx + 1}\")\n",
-    "                                    display(Markdown(f\"**Citation {idx + 1}: {doc_name}**\"))\n",
-    "                                    try:\n",
-    "                                        # Display image citations\n",
-    "                                        image_bytes = base64.b64decode(content)\n",
-    "                                        display(Image(data=image_bytes))\n",
-    "                                    except Exception:\n",
-    "                                        # Display text citations\n",
-    "                                        display(Markdown(f\"```\\n{content}\\n```\"))\n",
-    "                                is_first_token = False\n",
-    "\n",
-    "                            # Check if streaming is complete\n",
-    "                            finish_reason = data.get(\"choices\", [{}])[0].get(\"finish_reason\")\n",
-    "                            if finish_reason == \"stop\":\n",
-    "                                return rag_response\n",
-    "\n",
-    "                        except json.JSONDecodeError:\n",
-    "                            print(f\"Skipping malformed JSON line: {json_str}\")\n",
-    "                            continue\n",
-    "        \n",
-    "        except httpx.HTTPStatusError as e:\n",
-    "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
-    "        except httpx.RequestError as e:\n",
-    "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"An error occurred: {e}\")\n",
-    "\n",
-    "    print(\"\\n✅ Response complete!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c3049696",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Format the query as a chat message\n",
-    "messages = [\n",
-    "    {\n",
-    "        \"role\": \"user\",\n",
-    "        \"content\": image_query  # Our multimodal query (text + image)\n",
-    "    }\n",
-    "]\n",
-    "\n",
-    "# Configure the generate API parameters\n",
-    "payload = {\n",
-    "    \"messages\": messages,                      # Chat conversation\n",
-    "    \"use_knowledge_base\": True,                # Enable RAG - use vector DB for context\n",
-    "    \"temperature\": 0.2,                        # Lower = more deterministic, higher = more creative\n",
-    "    \"top_p\": 0.7,                             # Nucleus sampling parameter\n",
-    "    \"max_tokens\": 1024,                       # Maximum response length\n",
-    "    \"reranker_top_k\": 2,                      # Keep top 2 results after reranking\n",
-    "    \"vdb_top_k\": 10,                          # Retrieve top 10 from vector DB initially\n",
-    "    \"vdb_endpoint\": \"http://milvus:19530\",    # Milvus connection\n",
-    "    \"collection_names\": [collection_name],     # Which collection to search\n",
-    "    \"enable_query_rewriting\": True,            # Improve query before searching\n",
-    "    \"enable_citations\": True,                  # Include source citations in response\n",
-    "    \"stop\": [],                               # Optional stop sequences\n",
-    "    \"filter_expr\": \"\",                        # Optional metadata filter    \n",
-    "}\n",
-    "\n",
-    "# Generate the answer with RAG\n",
-    "print(\"🤖 Generating answer with RAG...\\n\")\n",
-    "await generate_answer(payload)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "dcddc78d",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "## 🎉 Summary\n",
-    "\n",
-    "Congratulations! You've successfully:\n",
-    "\n",
-    "✅ **Set up the infrastructure**: Deployed Milvus vector DB, NVIDIA NIMs, and RAG services  \n",
-    "✅ **Ingested multimodal documents**: Uploaded PDFs with images and extracted their content  \n",
-    "✅ **Created multimodal queries**: Combined text and images in your search queries  \n",
-    "✅ **Retrieved relevant context**: Used semantic search to find matching documents  \n",
-    "✅ **Generated AI responses**: Got intelligent answers with source citations  \n",
-    "\n",
-    "### Next Steps\n",
-    "\n",
-    "- **Try different queries**: Change the query text or use different query images\n",
-    "- **Upload more documents**: Add more PDFs to enrich your knowledge base\n",
-    "- **Experiment with parameters**: Adjust `temperature`, `top_k`, reranker settings\n",
-    "- **Build applications**: Integrate these APIs into your own applications\n",
-    "\n",
-    "### Cleanup\n",
-    "\n",
-    "To stop all services and free up resources:\n",
-    "\n",
-    "```bash\n",
-    "cd ../deploy/compose\n",
-    "docker compose -f docker-compose-rag-server.yaml down\n",
-    "docker compose -f docker-compose-ingestor-server.yaml down\n",
-    "docker compose -f nims.yaml down\n",
-    "docker compose -f vectordb.yaml down\n",
-    "```\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "e20e694c",
+      "metadata": {},
+      "source": [
+        "# Retriever API Usage with Multimodal Query Support\n",
+        "\n",
+        "This notebook demonstrates how to use the NVIDIA RAG retriever APIs with **multimodal queries** (text + images). You'll learn how to:\n",
+        "\n",
+        "- 🔍 Search for relevant documents using queries that contain images\n",
+        "- 🤖 Generate AI responses using the end-to-end RAG API with vision-language models (VLMs)\n",
+        "- 📊 Work with multimodal embeddings and vector databases\n",
+        "\n",
+        "**Use Case**: Query documents with images (e.g., \"What is the price of this item?\" + product image)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "0152f1eb",
+      "metadata": {},
+      "source": [
+        "## 📦 Setting up the Dependencies\n",
+        "\n",
+        "This section will guide you through:\n",
+        "1. Configuring your NGC API key for accessing NVIDIA services\n",
+        "2. Deploying the Milvus vector database\n",
+        "3. Setting up NVIDIA NIMs (NVIDIA Inference Microservices) for embeddings and VLM\n",
+        "4. Starting the NVIDIA Ingest runtime for document processing\n",
+        "5. Launching the RAG server\n",
+        "\n",
+        "**Note**: This setup uses Docker Compose to orchestrate all services."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "d77a630e",
+      "metadata": {},
+      "source": [
+        "### 0. Create a Virtual Environment (Recommended)\n",
+        "\n",
+        "Before running this notebook, create a virtual environment using `uv` to isolate dependencies:\n",
+        "\n",
+        "```bash\n",
+        "# Create a virtual environment\n",
+        "uv venv .venv\n",
+        "\n",
+        "# Activate the virtual environment\n",
+        "source .venv/bin/activate  # Linux/macOS\n",
+        "# .venv\\Scripts\\activate   # Windows\n",
+        "\n",
+        "# Install Jupyter Lab and ipykernel (if not already installed)\n",
+        "uv pip install jupyterlab ipykernel\n",
+        "\n",
+        "# Register the venv as a Jupyter kernel\n",
+        "python -m ipykernel install --user --name=.venv --display-name=\"Python (.venv)\"\n",
+        "```\n",
+        "\n",
+        "After setup, select the venv as the kernel for this notebook:\n",
+        "1. In Jupyter/VS Code/Cursor, click on the kernel selector (top right)\n",
+        "2. Choose **\".venv\"** or **\"Python (.venv)\"** as the kernel\n",
+        "\n",
+        "This ensures all packages installed via `uv pip install` in the notebook cells are installed into the isolated environment.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "c39e628e",
+      "metadata": {},
+      "source": [
+        "### 1. Setup the Default Configurations\n",
+        "\n",
+        "Import necessary libraries for environment management."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c03780a7",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Install python-dotenv for environment variable management\n",
+        "! uv pip install python-dotenv\n",
+        "\n",
+        "import os\n",
+        "from getpass import getpass"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "8a19cef7",
+      "metadata": {},
+      "source": [
+        "Provide your NGC_API_KEY after executing the cell below. You can obtain a key by following the steps [here](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/api-key.md)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c1f7ffa3",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check if NGC_API_KEY is already set, otherwise prompt for it\n",
+        "# Uncomment the line below to reset your API key\n",
+        "# del os.environ['NGC_API_KEY']\n",
+        "\n",
+        "if os.environ.get(\"NGC_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+        "    print(\"Valid NGC_API_KEY already in environment. Delete to reset\")\n",
+        "else:\n",
+        "    candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n",
+        "    assert candidate_api_key.startswith(\"nvapi-\"), (\n",
+        "        f\"{candidate_api_key[:5]}... is not a valid key\"\n",
+        "    )\n",
+        "    os.environ[\"NGC_API_KEY\"] = candidate_api_key"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "20ec8b61",
+      "metadata": {},
+      "source": [
+        "Log in to nvcr.io, which is required to pull the containers for the dependencies."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "03972882",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Login to NVIDIA Container Registry (nvcr.io) to pull required containers\n",
+        "!echo \"${NGC_API_KEY}\" | docker login nvcr.io -u '$oauthtoken' --password-stdin"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "84642fbb",
+      "metadata": {},
+      "source": [
+        "### 2. Setup the Milvus Vector Database\n",
+        "\n",
+        "Milvus is a high-performance vector database used to store and search multimodal embeddings.\n",
+        "\n",
+        "**Configuration Notes**:\n",
+        "- By default, Milvus uses GPU indexing for faster performance\n",
+        "- Ensure you have provided the correct GPU ID below\n",
+        "- If you don't have a GPU available, you can switch to CPU-only Milvus by following the instructions in [milvus-configuration.md](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/milvus-configuration.md)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "8125f717",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Specify which GPU to use for Milvus (change if using a different GPU)\n",
+        "os.environ[\"VECTORSTORE_GPU_DEVICE_ID\"] = \"0\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "3e2d3457",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Start Milvus vector database service\n",
+        "# This will run in the background (-d flag)\n",
+        "!docker compose -f ../deploy/compose/vectordb.yaml up -d"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "afe17557",
+      "metadata": {},
+      "source": [
+        "### 3. Setup NVIDIA Inference Microservices (NIMs)\n",
+        "\n",
+        "NIMs provide optimized inference for AI models. For multimodal RAG, we need:\n",
+        "- **VLM (Vision-Language Model)**: `nvidia/nemotron-nano-12b-v2-vl` for understanding images and generating responses\n",
+        "- **Embedding Model**: `nvidia/llama-nemotron-embed-vl-1b-v2` for creating multimodal embeddings"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "89a135eb",
+      "metadata": {},
+      "source": [
+        "#### Deploy On-Premise Models\n",
+        "\n",
+        "This section deploys NIMs locally using Docker. Models will be cached to avoid re-downloading."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "1b3d2e5c",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Create the model cache directory\n",
+        "!mkdir -p ~/.cache/model-cache"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "390df52d",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Set the MODEL_DIRECTORY environment variable to specify where models are cached\n",
+        "import os\n",
+        "\n",
+        "os.environ[\"MODEL_DIRECTORY\"] = os.path.expanduser(\"~/.cache/model-cache\")\n",
+        "print(\"MODEL_DIRECTORY set to:\", os.environ[\"MODEL_DIRECTORY\"])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "62a9946a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Deploy NIMs with VLM and embedding profiles\n",
+        "# ⚠️ WARNING: This may take 10-20 minutes as models download (~10GB+)\n",
+        "# If the kernel times out, just rerun this cell - it will resume where it left off\n",
+        "# Select a free GPU for VLM Microservice\n",
+        "os.environ[\"VLM_MS_GPU_ID\"] = \"1\"\n",
+        "! USERID=$(id -u) docker compose --profile vlm-ingest --profile vlm-only -f ../deploy/compose/nims.yaml up -d"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "e91f511a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Monitor the status of running containers\n",
+        "# Run this cell repeatedly to check if all services are healthy\n",
+        "# Look for STATUS showing \"healthy\" or \"Up\" for all containers\n",
+        "!docker ps"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "cfb34a6a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Configure the model names and service URLs for the RAG pipeline\n",
+        "# These settings tell the RAG server which models and endpoints to use\n",
+        "\n",
+        "# VLM (Vision-Language Model) configuration\n",
+        "os.environ[\"APP_VLM_MODELNAME\"] = \"nvidia/nemotron-nano-12b-v2-vl\"\n",
+        "os.environ[\"APP_VLM_SERVERURL\"] = \"http://vlm-ms:8000/v1\"\n",
+        "\n",
+        "# Multimodal embedding model configuration\n",
+        "os.environ[\"APP_EMBEDDINGS_MODELNAME\"] = \"nvidia/llama-nemotron-embed-vl-1b-v2\"\n",
+        "os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"nemotron-vlm-embedding-ms:8000/v1\"\n",
+        "os.environ[\"ENABLE_VLM_INFERENCE\"] = \"true\"\n",
+        "os.environ[\"VLM_TO_LLM_FALLBACK\"] = \"false\"\n",
+        "os.environ[\"ENABLE_RERANKER\"] = \"false\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "e62c7037",
+      "metadata": {},
+      "source": [
+        "#### Cloud-Based Deployment\n",
+        "Alternatively, use NVIDIA-hosted cloud models instead of the locally deployed NIMs."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "82084d4d",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "\n",
+        "# OCR and document processing endpoints - cloud hosted\n",
+        "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
+        "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
+        "os.environ[\"OCR_MODEL_NAME\"] = \"scene_text_ensemble\"\n",
+        "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
+        "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
+        "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
+        "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
+        "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
+        "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\"\n",
+        "os.environ[\"APP_NVINGEST_CAPTIONENDPOINTURL\"] = \"https://integrate.api.nvidia.com/v1/chat/completions\"\n",
+        "\n",
+        "# VLM Model configuration - cloud hosted\n",
+        "os.environ[\"APP_VLM_MODELNAME\"] = \"nvidia/nemotron-nano-12b-v2-vl\"\n",
+        "os.environ[\"APP_VLM_SERVERURL\"] = \"https://integrate.api.nvidia.com/v1\"\n",
+        "os.environ[\"APP_LLM_SERVERURL\"] = \"\"\n",
+        "\n",
+        "# Multimodal embedding model configuration - cloud hosted\n",
+        "os.environ[\"APP_EMBEDDINGS_MODELNAME\"] = \"nvidia/llama-nemotron-embed-vl-1b-v2\"\n",
+        "os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"https://integrate.api.nvidia.com/v1\"\n",
+        "os.environ[\"ENABLE_VLM_INFERENCE\"] = \"true\"\n",
+        "os.environ[\"VLM_TO_LLM_FALLBACK\"] = \"false\"\n",
+        "os.environ[\"ENABLE_RERANKER\"] = \"false\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "7cbcfa50",
+      "metadata": {},
+      "source": [
+        "### 4. Setup NVIDIA Ingest Runtime\n",
+        "\n",
+        "NVIDIA Ingest processes documents to extract text, images, and other elements. We'll configure it to:\n",
+        "- Extract images from documents\n",
+        "- Handle multimodal content"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a5e0d73f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Configure NVIDIA Ingest to extract and process images from documents\n",
+        "os.environ[\"APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY\"] = \"\"  # No special handling for structured elements\n",
+        "os.environ[\"APP_NVINGEST_IMAGE_ELEMENTS_MODALITY\"] = \"image\"  # Process image elements as images\n",
+        "os.environ[\"APP_NVINGEST_EXTRACTIMAGES\"] = \"True\"  # Extract images from documents\n",
+        "\n",
+        "# Start the ingestor server with Redis\n",
+        "! docker compose -f ../deploy/compose/docker-compose-ingestor-server.yaml up -d --build"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "da1bd9a3",
+      "metadata": {},
+      "source": [
+        "### 5. Setup the NVIDIA RAG Server\n",
+        "\n",
+        "The RAG server provides the main API endpoints for search and generation. It orchestrates all the components (embeddings, vector DB, VLM) to deliver intelligent responses."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "38ba7752",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Start the RAG server (accessible at localhost:8081)\n",
+        "os.environ[\"APP_RANKING_SERVERURL\"] = \"\"\n",
+        "! docker compose -f ../deploy/compose/docker-compose-rag-server.yaml up -d --build"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "ce492ce3",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "\n",
+        "## 📚 Document Ingestion Workflow\n",
+        "\n",
+        "Now that all services are running, let's ingest documents into a collection.\n",
+        "\n",
+        "### 6. Create a Collection\n",
+        "\n",
+        "A collection is a logical grouping of documents in the vector database. Think of it as a database table optimized for similarity search."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a8611aa1",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Install aiohttp for async HTTP requests\n",
+        "! uv pip install aiohttp\n",
+        "\n",
+        "# Configure the ingestor server URL\n",
+        "# Use \"ingestor-server\" when running in AI Workbench, otherwise \"localhost\"\n",
+        "IPADDRESS = (\n",
+        "    \"ingestor-server\"\n",
+        "    if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\"\n",
+        "    else \"localhost\"\n",
+        ")\n",
+        "INGESTOR_SERVER_PORT = \"8082\"\n",
+        "BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"\n",
+        "\n",
+        "async def print_response(response):\n",
+        "    \"\"\"Helper function to pretty-print API responses.\"\"\"\n",
+        "    try:\n",
+        "        response_json = await response.json()\n",
+        "        print(json.dumps(response_json, indent=2))\n",
+        "    except aiohttp.ClientResponseError:\n",
+        "        print(await response.text())\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "688bc70f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Define a unique name for your collection\n",
+        "# Change this if you want to create a different collection\n",
+        "collection_name = \"multimodal_query\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "24378f6f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import aiohttp\n",
+        "import json\n",
+        "\n",
+        "\n",
+        "async def create_collection(\n",
+        "    collection_name: str | None = None,\n",
+        "    metadata_schema: list = [],\n",
+        "):\n",
+        "    \"\"\"\n",
+        "    Create a new collection in the vector database.\n",
+        "    \n",
+        "    Args:\n",
+        "        collection_name: Unique identifier for the collection\n",
+        "        metadata_schema: Optional schema for metadata fields\n",
+        "    \"\"\"\n",
+        "    data = {\n",
+        "        \"collection_name\": collection_name,\n",
+        "        \"metadata_schema\": metadata_schema,\n",
+        "    }\n",
+        "\n",
+        "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.post(\n",
+        "                f\"{BASE_URL}/v1/collection\", json=data, headers=HEADERS\n",
+        "            ) as response:\n",
+        "                await print_response(response)\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            return 500, {\"error\": str(e)}\n",
+        "\n",
+        "\n",
+        "# Create the collection\n",
+        "# The embedding dimension is 2048 for the multimodal embedding model we're using\n",
+        "await create_collection(\n",
+        "    collection_name=collection_name,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a29f4633",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Specify the documents to upload\n",
+        "# This PDF contains product images with pricing information\n",
+        "FILEPATHS = [\n",
+        "    \"../data/multimodal/product_catalog.pdf\",\n",
+        "]\n",
+        "\n",
+        "async def upload_documents(collection_name: str = \"\"):\n",
+        "    \"\"\"\n",
+        "    Upload and process documents into the collection.\n",
+        "    \n",
+        "    This will:\n",
+        "    1. Extract text and images from the PDFs\n",
+        "    2. Chunk the content for optimal retrieval\n",
+        "    3. Generate multimodal embeddings\n",
+        "    4. Store everything in the vector database\n",
+        "    \"\"\"\n",
+        "    data = {\n",
+        "        \"collection_name\": collection_name,\n",
+        "        \"blocking\": False,  # Async upload - use status API to check progress\n",
+        "        \"split_options\": {\n",
+        "            \"chunk_size\": 512,        # Characters per chunk\n",
+        "            \"chunk_overlap\": 150      # Overlap between chunks for context\n",
+        "        },\n",
+        "        \"generate_summary\": False  # Set to True to generate document summaries\n",
+        "    }\n",
+        "\n",
+        "    form_data = aiohttp.FormData()\n",
+        "    \n",
+        "    # Add all PDF files to the form data\n",
+        "    for file_path in FILEPATHS:\n",
+        "        form_data.add_field(\"documents\", open(file_path, \"rb\"), \n",
+        "                          filename=os.path.basename(file_path), \n",
+        "                          content_type=\"application/pdf\")\n",
+        "\n",
+        "    form_data.add_field(\"data\", json.dumps(data), content_type=\"application/json\")\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            # Use POST for new uploads, PATCH for re-ingesting existing documents\n",
+        "            async with session.post(f\"{BASE_URL}/v1/documents\", data=form_data) as response:\n",
+        "                await print_response(response)\n",
+        "                response_json = await response.json()\n",
+        "                return response_json\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            print(f\"Error uploading documents: {e}\")\n",
+        "            return None\n",
+        "\n",
+        "# Upload the documents and get the task ID for tracking progress\n",
+        "upload_response = await upload_documents(collection_name=collection_name)\n",
+        "task_id = upload_response.get(\"task_id\") if upload_response else None\n",
+        "print(f\"\\nTask ID for tracking: {task_id}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "2234e059",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "async def get_task_status(task_id: str):\n",
+        "    \"\"\"\n",
+        "    Check the status of an asynchronous ingestion task.\n",
+        "    \n",
+        "    Possible statuses:\n",
+        "    - \"pending\": Task is queued\n",
+        "    - \"processing\": Currently processing documents\n",
+        "    - \"completed\": Successfully finished\n",
+        "    - \"failed\": Error occurred\n",
+        "    \"\"\"\n",
+        "    params = {\n",
+        "        \"task_id\": task_id,\n",
+        "    }\n",
+        "\n",
+        "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.get(\n",
+        "                f\"{BASE_URL}/v1/status\", params=params, headers=HEADERS\n",
+        "            ) as response:\n",
+        "                await print_response(response)\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            return 500, {\"error\": str(e)}\n",
+        "\n",
+        "\n",
+        "# Check the ingestion status\n",
+        "# Run this cell multiple times until status shows \"completed\"\n",
+        "await get_task_status(\n",
+        "    task_id=task_id\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "e2c1f1a8",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "\n",
+        "## 🔍 Querying with Multimodal Inputs\n",
+        "\n",
+        "Now that documents are ingested, let's query them using both text and images!\n",
+        "\n",
+        "### 7. Using the Search and Generate APIs\n",
+        "\n",
+        "We'll demonstrate two approaches:\n",
+        "1. **Search API**: Find relevant documents without generating a response\n",
+        "2. **Generate API**: Get an AI-generated answer with citations"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "3990ca33",
+      "metadata": {},
+      "source": [
+        "#### Prepare a Multimodal Query\n",
+        "\n",
+        "To query with an image, we need to:\n",
+        "1. Convert the image to base64 encoding\n",
+        "2. Format it according to the OpenAI vision API format\n",
+        "3. Combine it with a text prompt"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "02dde830",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "! uv pip install requests httpx\n",
+        "import base64\n",
+        "import requests\n",
+        "from IPython.display import Image, Markdown, display\n",
+        "\n",
+        "def get_base64_image(image_source: str) -> str:\n",
+        "    \"\"\"\n",
+        "    Convert an image to base64 encoding.\n",
+        "    \n",
+        "    Args:\n",
+        "        image_source: Local file path or URL to the image\n",
+        "        \n",
+        "    Returns:\n",
+        "        Base64 encoded string of the image\n",
+        "    \"\"\"\n",
+        "    if image_source.startswith(('http://', 'https://')):\n",
+        "        # Download image from URL\n",
+        "        response = requests.get(image_source)\n",
+        "        return base64.b64encode(response.content).decode()\n",
+        "    else:\n",
+        "        # Read local file\n",
+        "        with open(image_source, \"rb\") as image_file:\n",
+        "            return base64.b64encode(image_file.read()).decode()\n",
+        "\n",
+        "# Convert the query image to base64\n",
+        "# Try different images to test different queries:\n",
+        "image_b64 = get_base64_image(\"../data/multimodal/Creme_clutch_purse1-small.jpg\")\n",
+        "\n",
+        "# Display the query image for reference\n",
+        "query_image_path = \"../data/multimodal/Creme_clutch_purse1-small.jpg\"\n",
+        "print(\"📷 Query Image:\")\n",
+        "display(Image(filename=query_image_path, width=300))\n",
+        "\n",
+        "# Format as a data URL (the query image is a JPEG, so declare the image/jpeg media type)\n",
+        "image_input = f\"data:image/jpeg;base64,{image_b64}\"\n",
+        "\n",
+        "# Create the multimodal query with text + image\n",
+        "# This follows the OpenAI vision API format\n",
+        "query_1 = \"What material is this made of?\"\n",
+        "image_query = [\n",
+        "    {\"type\": \"text\", \"text\": query_1},\n",
+        "    {\n",
+        "        \"type\": \"image_url\",\n",
+        "        \"image_url\": {\n",
+        "            \"url\": image_input,\n",
+        "            \"detail\": \"auto\"  # Let the model decide the appropriate detail level\n",
+        "        }\n",
+        "    }\n",
+        "]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c311f9d0",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import httpx\n",
+        "import json\n",
+        "from IPython.display import Image, Markdown, display\n",
+        "\n",
+        "RAG_BASE_URL = \"http://localhost:8081\"\n",
+        "\n",
+        "async def search_documents(payload):\n",
+        "    \"\"\"\n",
+        "    Search for relevant documents using a multimodal query.\n",
+        "    \n",
+        "    This performs similarity search in the vector database and optionally\n",
+        "    reranks results for better relevance.\n",
+        "    \"\"\"\n",
+        "    search_url = f\"{RAG_BASE_URL}/v1/search\"\n",
+        "    \n",
+        "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
+        "        try:\n",
+        "            response = await client.post(url=search_url, json=payload)\n",
+        "            response.raise_for_status()\n",
+        "            \n",
+        "            search_results = response.json()\n",
+        "            print(\"Search Results:\")\n",
+        "            \n",
+        "            # Display search results with nice formatting\n",
+        "            if \"results\" in search_results:\n",
+        "                for idx, result in enumerate(search_results[\"results\"]):\n",
+        "                    doc_type = result.get(\"document_type\", \"text\")\n",
+        "                    content = result.get(\"content\", \"\")\n",
+        "                    doc_name = result.get(\"document_name\", f\"Result {idx + 1}\")\n",
+        "                    score = result.get(\"score\", \"N/A\")\n",
+        "                    \n",
+        "                    display(Markdown(f\"**Result {idx + 1}: {doc_name} (Score: {score})**\"))\n",
+        "                    try:\n",
+        "                        if doc_type == \"image\":\n",
+        "                            # Display image results\n",
+        "                            image_bytes = base64.b64decode(content)\n",
+        "                            display(Image(data=image_bytes))\n",
+        "                        else:\n",
+        "                            # Display text results\n",
+        "                            display(Markdown(f\"```\\n{content}\\n```\"))\n",
+        "                    except Exception as e:\n",
+        "                        print(f\"Error displaying content: {e}\")\n",
+        "                        display(Markdown(f\"```\\n{content}\\n```\"))\n",
+        "            \n",
+        "            return search_results\n",
+        "            \n",
+        "        except httpx.HTTPStatusError as e:\n",
+        "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
+        "        except httpx.RequestError as e:\n",
+        "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
+        "        except Exception as e:\n",
+        "            print(f\"An error occurred: {e}\")\n",
+        "\n",
+        "# Configure the search parameters\n",
+        "search_payload = {\n",
+        "    \"query\": image_query,                      # Our multimodal query (text + image)\n",
+        "    \"messages\": [],                            # No conversation history\n",
+        "    \"use_knowledge_base\": True,                # Search the vector database\n",
+        "    \"collection_names\": [collection_name],     # Which collection to search\n",
+        "    \"vdb_top_k\": 5,                           # Retrieve top 5 results from vector DB\n",
+        "    \"vdb_endpoint\": \"http://milvus:19530\",    # Milvus connection string\n",
+        "    \"enable_reranker\": False,                  # Set to True for better relevance (slower)\n",
+        "    \"reranker_top_k\": 3,                      # If reranker enabled, return top 3\n",
+        "    \"filter_expr\": \"\",                        # Optional metadata filter\n",
+        "}\n",
+        "\n",
+        "# Execute the search\n",
+        "print(\"🔍 Searching for documents matching the query...\\n\")\n",
+        "search_result = await search_documents(search_payload)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "bc5b1545",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import base64\n",
+        "import json\n",
+        "from IPython.display import Image, Markdown, display\n",
+        "\n",
+        "\n",
+        "async def print_streaming_response_and_citations(response_generator):\n",
+        "    \"\"\"\n",
+        "    Helper function to display streaming responses with citations.\n",
+        "    \n",
+        "    This function:\n",
+        "    1. Streams the AI-generated response token by token\n",
+        "    2. Extracts citations from the first chunk\n",
+        "    3. Displays citations (text or images) after the response completes\n",
+        "    \"\"\"\n",
+        "    first_chunk_data = None\n",
+        "    \n",
+        "    async for chunk in response_generator:\n",
+        "        # Parse Server-Sent Events (SSE) format\n",
+        "        if chunk.startswith(\"data: \"):\n",
+        "            chunk = chunk[len(\"data: \") :].strip()\n",
+        "        if not chunk:\n",
+        "            continue\n",
+        "            \n",
+        "        try:\n",
+        "            data = json.loads(chunk)\n",
+        "        except Exception as e:\n",
+        "            print(f\"JSON decode error: {e}\")\n",
+        "            continue\n",
+        "            \n",
+        "        choices = data.get(\"choices\", [])\n",
+        "        if not choices:\n",
+        "            continue\n",
+        "            \n",
+        "        # Save the first chunk with citations\n",
+        "        if first_chunk_data is None and data.get(\"citations\"):\n",
+        "            first_chunk_data = data\n",
+        "            \n",
+        "        # Print streaming text\n",
+        "        delta = choices[0].get(\"delta\", {})\n",
+        "        text = delta.get(\"content\")\n",
+        "        if not text:\n",
+        "            message = choices[0].get(\"message\", {})\n",
+        "            text = message.get(\"content\", \"\")\n",
+        "        print(text, end=\"\", flush=True)\n",
+        "        \n",
+        "    print()  # Newline after streaming\n",
+        "\n",
+        "    # Display citations after streaming is done\n",
+        "    if first_chunk_data and first_chunk_data.get(\"citations\"):\n",
+        "        print(\"\\n📚 Citations:\")\n",
+        "        citations = first_chunk_data[\"citations\"]\n",
+        "        for idx, citation in enumerate(citations.get(\"results\", [])):\n",
+        "            doc_type = citation.get(\"document_type\", \"text\")\n",
+        "            content = citation.get(\"content\", \"\")\n",
+        "            doc_name = citation.get(\"document_name\", f\"Citation {idx + 1}\")\n",
+        "            display(Markdown(f\"**Citation {idx + 1}: {doc_name}**\"))\n",
+        "            try:\n",
+        "                # Try to display as image\n",
+        "                image_bytes = base64.b64decode(content)\n",
+        "                display(Image(data=image_bytes))\n",
+        "            except Exception:\n",
+        "                # Fall back to text display\n",
+        "                display(Markdown(f\"```\\n{content}\\n```\"))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "31071359",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import httpx\n",
+        "\n",
+        "# Configure RAG server URL\n",
+        "IPADDRESS = \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\"\n",
+        "RAG_SERVER_PORT = \"8081\"\n",
+        "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"\n",
+        "generate_url = f\"{RAG_BASE_URL}/v1/generate\"\n",
+        "\n",
+        "async def generate_answer(payload):\n",
+        "    \"\"\"\n",
+        "    Generate an AI answer using the RAG pipeline.\n",
+        "    \n",
+        "    This function:\n",
+        "    1. Sends the query to the RAG server\n",
+        "    2. Retrieves relevant context from the vector database\n",
+        "    3. Streams the AI-generated response\n",
+        "    4. Displays citations (sources) used to generate the answer\n",
+        "    \"\"\"\n",
+        "    rag_response = \"\"\n",
+        "    citations = []\n",
+        "    is_first_token = True\n",
+        "\n",
+        "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
+        "        try:\n",
+        "            async with client.stream(\"POST\", url=generate_url, json=payload) as response:\n",
+        "                # Raise an exception for bad status codes like 4xx or 5xx\n",
+        "                response.raise_for_status()\n",
+        "\n",
+        "                # Iterate over the streaming response\n",
+        "                async for line in response.aiter_lines():\n",
+        "                    if line.startswith(\"data: \"):\n",
+        "                        json_str = line[6:].strip()\n",
+        "                        if not json_str:\n",
+        "                            continue\n",
+        "\n",
+        "                        try:\n",
+        "                            data = json.loads(json_str)\n",
+        "\n",
+        "                            # Extract and display the streaming response\n",
+        "                            message = data.get(\"choices\", [{}])[0].get(\"message\", {}).get(\"content\", \"\")\n",
+        "                            if message:\n",
+        "                                rag_response += message\n",
+        "\n",
+        "                            # Extract and display citations from the first chunk\n",
+        "                            if is_first_token and data.get(\"citations\"):\n",
+        "                                print(\"\\n📚 Citations:\")\n",
+        "                                citations = data[\"citations\"]\n",
+        "                                for idx, citation in enumerate(citations.get(\"results\", [])):\n",
+        "                                    doc_type = citation.get(\"document_type\", \"text\")\n",
+        "                                    content = citation.get(\"content\", \"\")\n",
+        "                                    doc_name = citation.get(\"document_name\", f\"Citation {idx + 1}\")\n",
+        "                                    display(Markdown(f\"**Citation {idx + 1}: {doc_name}**\"))\n",
+        "                                    try:\n",
+        "                                        # Display image citations\n",
+        "                                        image_bytes = base64.b64decode(content)\n",
+        "                                        display(Image(data=image_bytes))\n",
+        "                                    except Exception:\n",
+        "                                        # Display text citations\n",
+        "                                        display(Markdown(f\"```\\n{content}\\n```\"))\n",
+        "                                is_first_token = False\n",
+        "\n",
+        "                            # Check if streaming is complete\n",
+        "                            finish_reason = data.get(\"choices\", [{}])[0].get(\"finish_reason\")\n",
+        "                            if finish_reason == \"stop\":\n",
+        "                                return rag_response\n",
+        "\n",
+        "                        except json.JSONDecodeError:\n",
+        "                            print(f\"Skipping malformed JSON line: {json_str}\")\n",
+        "                            continue\n",
+        "        \n",
+        "        except httpx.HTTPStatusError as e:\n",
+        "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
+        "        except httpx.RequestError as e:\n",
+        "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
+        "        except Exception as e:\n",
+        "            print(f\"An error occurred: {e}\")\n",
+        "\n",
+        "    print(\"\\n✅ Response complete!\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c3049696",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Format the query as a chat message\n",
+        "messages = [\n",
+        "    {\n",
+        "        \"role\": \"user\",\n",
+        "        \"content\": image_query  # Our multimodal query (text + image)\n",
+        "    }\n",
+        "]\n",
+        "\n",
+        "# Configure the generate API parameters\n",
+        "payload = {\n",
+        "    \"messages\": messages,                      # Chat conversation\n",
+        "    \"use_knowledge_base\": True,                # Enable RAG - use vector DB for context\n",
+        "    \"temperature\": 0.2,                        # Lower = more deterministic, higher = more creative\n",
+        "    \"top_p\": 0.7,                             # Nucleus sampling parameter\n",
+        "    \"max_tokens\": 1024,                       # Maximum response length\n",
+        "    \"reranker_top_k\": 2,                      # Keep top 2 results after reranking\n",
+        "    \"vdb_top_k\": 10,                          # Retrieve top 10 from vector DB initially\n",
+        "    \"vdb_endpoint\": \"http://milvus:19530\",    # Milvus connection\n",
+        "    \"collection_names\": [collection_name],     # Which collection to search\n",
+        "    \"enable_query_rewriting\": True,            # Improve query before searching\n",
+        "    \"enable_citations\": True,                  # Include source citations in response\n",
+        "    \"stop\": [],                               # Optional stop sequences\n",
+        "    \"filter_expr\": \"\",                        # Optional metadata filter    \n",
+        "}\n",
+        "\n",
+        "# Generate the answer with RAG\n",
+        "print(\"🤖 Generating answer with RAG...\\n\")\n",
+        "await generate_answer(payload)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "dcddc78d",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "\n",
+        "## 🎉 Summary\n",
+        "\n",
+        "Congratulations! You've successfully:\n",
+        "\n",
+        "✅ **Set up the infrastructure**: Deployed Milvus vector DB, NVIDIA NIMs, and RAG services  \n",
+        "✅ **Ingested multimodal documents**: Uploaded PDFs with images and extracted their content  \n",
+        "✅ **Created multimodal queries**: Combined text and images in your search queries  \n",
+        "✅ **Retrieved relevant context**: Used semantic search to find matching documents  \n",
+        "✅ **Generated AI responses**: Got intelligent answers with source citations  \n",
+        "\n",
+        "### Next Steps\n",
+        "\n",
+        "- **Try different queries**: Change the query text or use different query images\n",
+        "- **Upload more documents**: Add more PDFs to enrich your knowledge base\n",
+        "- **Experiment with parameters**: Adjust `temperature`, `top_p`, `vdb_top_k`, and the reranker settings (`enable_reranker`, `reranker_top_k`)\n",
+        "- **Build applications**: Integrate these APIs into your own applications\n",
+        "\n",
+        "### Cleanup\n",
+        "\n",
+        "To stop all services and free up resources:\n",
+        "\n",
+        "```bash\n",
+        "cd ../deploy/compose\n",
+        "docker compose -f docker-compose-rag-server.yaml down\n",
+        "docker compose -f docker-compose-ingestor-server.yaml down\n",
+        "docker compose -f nims.yaml down\n",
+        "docker compose -f vectordb.yaml down\n",
+        "```\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
 }
diff --git a/notebooks/langchain_nvidia_retriever.ipynb b/notebooks/langchain_nvidia_retriever.ipynb
new file mode 100644
index 000000000..02b05b6db
--- /dev/null
+++ b/notebooks/langchain_nvidia_retriever.ipynb
@@ -0,0 +1,315 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "303aa520",
+      "metadata": {},
+      "source": [
+        "# NVIDIARAGRetriever Connector – LangChain Integration\n",
+        "\n",
+        "**Motivation:** This notebook showcases the **LangChain integration** with the NVIDIA RAG Blueprint. The `NVIDIARAGRetriever` from `langchain-nvidia-ai-endpoints` connects to the NVIDIA RAG `/v1/search` endpoint and returns standard LangChain `Document` objects, enabling seamless use in chains, agents, and RAG pipelines without custom HTTP code.\n",
+        "\n",
+        "---\n",
+        "\n",
+        "## Prerequisite: Run Ingestion First\n",
+        "\n",
+        "**You must ingest documents before using this notebook.** Use the [ingestion_api_usage.ipynb](./ingestion_api_usage.ipynb) notebook:\n",
+        "\n",
+        "1. Open [ingestion_api_usage.ipynb](./ingestion_api_usage.ipynb).\n",
+        "2. Execute the following cells **in order** (top to bottom):\n",
+        "   - **1. Install Dependencies** – `pip install aiohttp`\n",
+        "   - **2. Setup Base Configuration** – ingestor URL (port 8082)\n",
+        "   - **3. Health Check** – verify ingestor is running\n",
+        "   - **4. Create collection** – creates `multimodal_data` collection\n",
+        "   - **4. Upload Document** – FILEPATHS cell, then `upload_documents` cell\n",
+        "   - **5. Get Task Status** – poll until state is `FINISHED`\n",
+        "3. When ingestion is complete, return here and run the cells below.\n",
+        "\n",
+        "Ensure the **RAG server** (port 8081) is running. See [Get Started](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "c7a2a7dd",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "## Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "e6fe7153",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!pip install langchain-nvidia-ai-endpoints langchain-core\n",
+        "\n",
+        "import os\n",
+        "\n",
+        "# RAG server URL (use collection from ingestion_api_usage.ipynb)\n",
+        "RAG_IPADDRESS = (\n",
+        "    \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\"\n",
+        ")\n",
+        "RAG_BASE_URL = f\"http://{RAG_IPADDRESS}:8081\"\n",
+        "\n",
+        "# Collection from ingestion_api_usage.ipynb (default: multimodal_data)\n",
+        "COLLECTION_NAME = \"multimodal_data\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "640eee93",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "## Retrieval with NVIDIARAGRetriever\n",
+        "\n",
+        "The `NVIDIARAGRetriever` from `langchain-nvidia-ai-endpoints` connects to the NVIDIA RAG Blueprint `/v1/search` endpoint and returns LangChain `Document` objects. Use `COLLECTION_NAME` to match the collection you created in [ingestion_api_usage.ipynb](./ingestion_api_usage.ipynb)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 2.1 Basic Sync Retrieval"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "4b09138a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain_nvidia_ai_endpoints import NVIDIARAGRetriever\n",
+        "\n",
+        "retriever = NVIDIARAGRetriever(\n",
+        "    base_url=RAG_BASE_URL,\n",
+        "    collection_names=[COLLECTION_NAME],\n",
+        "    k=5,\n",
+        ")\n",
+        "\n",
+        "query = \"What are the main topics or products discussed?\"\n",
+        "docs = retriever.invoke(query)\n",
+        "\n",
+        "print(f\"Query: {query}\")\n",
+        "print(f\"Retrieved {len(docs)} documents:\\n\")\n",
+        "for i, doc in enumerate(docs, 1):\n",
+        "    content_preview = (doc.page_content or \"\")[:300] + \"...\" if len(doc.page_content or \"\") > 300 else (doc.page_content or \"\")\n",
+        "    score = doc.metadata.get(\"score\", \"N/A\")\n",
+        "    source = doc.metadata.get(\"document_name\", \"N/A\")\n",
+        "    print(f\"--- Document {i} ---\")\n",
+        "    print(f\"Score: {score} | Source: {source}\")\n",
+        "    print(f\"Content: {content_preview}\")\n",
+        "    print()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 2.2 Custom Retrieval Parameters"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "35d7f118",
+      "metadata": {},
+      "source": [
+        "For details on retrieval parameters, filter expressions, and metadata:\n",
+        "- [Custom metadata & filter expressions](../docs/custom-metadata.md) – `filter_expr` syntax (Milvus), metadata schema\n",
+        "- [Multi-turn & query rewriting](../docs/multiturn.md) – `enable_query_rewriting` for decontextualizing follow-up questions\n",
+        "- [Retriever API usage](./retriever_api_usage.ipynb) – Search endpoint payload parameters"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "6e5aa122",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "retriever_custom = NVIDIARAGRetriever(\n",
+        "    base_url=RAG_BASE_URL,\n",
+        "    collection_names=[COLLECTION_NAME],\n",
+        "    # Result counts\n",
+        "    k=3,  # Number of document chunks to return (0-25, maps to reranker_top_k)\n",
+        "    vdb_top_k=50,  # Top results from vector DB before reranking (0-400)\n",
+        "    # Feature toggles\n",
+        "    enable_reranker=True,  # Rerank results for relevance\n",
+        "    enable_query_rewriting=False,  # Rewrite query for better retrieval\n",
+        "    enable_filter_generator=False,  # Auto-generate filters from query\n",
+        "    enable_citations=True,  # Include image/table/chart citations in metadata\n",
+        "    # Filtering\n",
+        "    confidence_threshold=0.0,  # Min confidence (0.0-1.0, requires enable_reranker=True)\n",
+        "    filter_expr=None,  # Milvus filter expression, e.g. 'content_metadata[\"file_name\"] == \"doc.pdf\"'\n",
+        "    # Advanced\n",
+        "    vdb_endpoint=\"http://milvus:19530\",  # Vector DB endpoint (override if needed)\n",
+        "    messages=[],  # Conversation history for context-aware retrieval\n",
+        "    timeout=60.0,  # HTTP request timeout in seconds\n",
+        ")\n",
+        "\n",
+        "docs = retriever_custom.invoke(\"Summarize key information\")\n",
+        "print(f\"Retrieved {len(docs)} documents\")\n",
+        "for i, doc in enumerate(docs, 1):\n",
+        "    print(f\"  {i}. {doc.metadata.get('document_name', 'N/A')} (score: {doc.metadata.get('score', 'N/A')})\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 2.3 Async Retrieval"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "docs = await retriever.ainvoke(\"What features or benefits are described?\")\n",
+        "print(f\"Async retrieval: {len(docs)} documents\")\n",
+        "for i, doc in enumerate(docs[:3], 1):\n",
+        "    print(f\"  {i}. {doc.metadata.get('document_name', 'N/A')}\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 2.4 Error Handling"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "bfcc9f22",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain_nvidia_ai_endpoints.retrievers import (\n",
+        "    NVIDIARAGConnectionError,\n",
+        "    NVIDIARAGServerError,\n",
+        "    NVIDIARAGValidationError,\n",
+        ")\n",
+        "\n",
+        "try:\n",
+        "    bad_retriever = NVIDIARAGRetriever(\n",
+        "        base_url=\"http://invalid-host:8081\",\n",
+        "        collection_names=[COLLECTION_NAME],\n",
+        "    )\n",
+        "    bad_retriever.invoke(\"test\")\n",
+        "except NVIDIARAGConnectionError as e:\n",
+        "    print(f\"Connection error (expected): {e}\")\n",
+        "except NVIDIARAGValidationError as e:\n",
+        "    print(f\"Validation error: {e}\")\n",
+        "except NVIDIARAGServerError as e:\n",
+        "    print(f\"Server error ({e.status_code}): {e}\")\n",
+        "\n",
+        "print(\"\\nError handling works as expected.\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "3b5824c6",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "## RAG Chain (Optional)\n",
+        "\n",
+        "Chain `NVIDIARAGRetriever` with `ChatNVIDIA` for end-to-end question answering. Requires `NVIDIA_API_KEY` to call the NVIDIA API Catalog.\n",
+        "\n",
+        "**Get an API key:** See [Get an API Key](../docs/api-key.md) for instructions."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d82a5d54",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Set NVIDIA_API_KEY if not already set (see ../docs/api-key.md to get a key)\n",
+        "if not os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+        "    import getpass\n",
+        "    key = getpass.getpass(\"Enter your NVIDIA API key (nvapi-...): \")\n",
+        "    if key.startswith(\"nvapi-\"):\n",
+        "        os.environ[\"NVIDIA_API_KEY\"] = key\n",
+        "    else:\n",
+        "        print(\"NVIDIA_API_KEY not set. Set it to run the RAG chain. See [Get an API Key](../docs/api-key.md)\")\n",
+        "\n",
+        "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+        "    from langchain_core.output_parsers import StrOutputParser\n",
+        "    from langchain_core.prompts import ChatPromptTemplate\n",
+        "    from langchain_core.runnables import RunnablePassthrough\n",
+        "\n",
+        "    from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIARAGRetriever\n",
+        "\n",
+        "    retriever = NVIDIARAGRetriever(\n",
+        "        base_url=RAG_BASE_URL,\n",
+        "        collection_names=[COLLECTION_NAME],\n",
+        "        k=4,\n",
+        "    )\n",
+        "\n",
+        "    def format_docs(docs):\n",
+        "        return \"\\n\\n\".join(d.page_content for d in docs)\n",
+        "\n",
+        "    prompt = ChatPromptTemplate.from_messages([\n",
+        "        (\"system\", \"Answer based only on the context below.\\n\\n{context}\"),\n",
+        "        (\"human\", \"{question}\"),\n",
+        "    ])\n",
+        "\n",
+        "    # Model aligned with rag-server default (nvidia/llama-3.3-nemotron-super-49b-v1.5)\n",
+        "    llm = ChatNVIDIA(model=\"nvidia/llama-3.3-nemotron-super-49b-v1.5\")\n",
+        "    chain = (\n",
+        "        {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
+        "        | prompt\n",
+        "        | llm\n",
+        "        | StrOutputParser()\n",
+        "    )\n",
+        "\n",
+        "    answer = chain.invoke(\"What are the main topics or products?\")\n",
+        "    print(answer)\n",
+        "else:\n",
+        "    print(\"NVIDIA_API_KEY not set. Set it (e.g. os.environ['NVIDIA_API_KEY'] = 'nvapi-...') or run this cell again to be prompted. See [Get an API Key](../docs/api-key.md)\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "4c95fdc6",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "## Cleanup (Optional)\n",
+        "\n",
+        "To remove the collection and documents, use the delete cells in [ingestion_api_usage.ipynb](./ingestion_api_usage.ipynb) (sections 7 and 9)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Use ingestion_api_usage.ipynb sections 7 (Delete Documents) and 9 (Delete Collections)\n",
+        "# to remove the multimodal_data collection when finished."
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11.12"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
diff --git a/notebooks/launchable.ipynb b/notebooks/launchable.ipynb
index 8e5779af2..cccba6eba 100644
--- a/notebooks/launchable.ipynb
+++ b/notebooks/launchable.ipynb
@@ -127,6 +127,16 @@
     "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"\n",
     "INGESTOR_BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"\n",
     "\n",
+    "# NIM services to deploy (excludes nim-llm and vlm-ms since we use NVIDIA-hosted endpoints)\n",
+    "NIM_SERVICES = (\n",
+    "    \"nemotron-embedding-ms \"\n",
+    "    \"nemotron-ranking-ms \"\n",
+    "    \"page-elements \"\n",
+    "    \"graphic-elements \"\n",
+    "    \"table-structure \"\n",
+    "    \"nemoretriever-ocr\"\n",
+    ")\n",
+    "\n",
     "\n",
     "# =============================================================================\n",
     "# DOCKER COMPOSE HELPERS\n",
@@ -792,7 +802,7 @@
     "            print(f\"   • {warn}\")\n",
     "    print(\"\\n\" + \"=\" * 70)\n",
     "    print(\"Please fix the above errors before continuing.\")\n",
-    "    print(\"See: https://docs.nvidia.com/ai-blueprints/rag/latest/support-matrix.html\")\n",
+    "    print(\"See: https://docs.nvidia.com/rag/latest/support-matrix.html\")\n",
     "    print(\"=\" * 70)\n",
     "elif warnings:\n",
     "    print(\"\\n✅ REQUIREMENTS MET (with warnings)\")\n",
@@ -914,7 +924,7 @@
     "import subprocess\n",
     "\n",
     "REPO_URL = \"https://github.com/NVIDIA-AI-Blueprints/rag.git\"\n",
-    "BRANCH = \"release-v2.4.0\"\n",
+    "BRANCH = \"release-v2.5.0\"\n",
     "#BRANCH = \"develop\"\n",
     "# Check if we're already in the rag repo (look for deploy/compose)\n",
     "if os.path.exists(\"deploy/compose\"):\n",
@@ -1295,10 +1305,10 @@
     "902445432dde   milvus-standalone                Up 3 minutes\n",
     "340bc8210a0d   milvus-minio                     Up 3 minutes (healthy)\n",
     "0be702b87ad6   milvus-etcd                      Up 3 minutes (healthy)\n",
-    "fe2751bfa734   nemoretriever-ranking-ms         Up 10 minutes (healthy)\n",
+    "fe2751bfa734   nemotron-ranking-ms              Up 4 seconds (healthy)\n",
     "7b5ddabf8be7   compose-graphic-elements-1       Up 10 minutes\n",
     "ecfaa5190302   compose-page-elements-1          Up 10 minutes\n",
-    "ea8c7fdf20d1   nemoretriever-embedding-ms       Up 10 minutes (healthy)\n",
+    "ea8c7fdf20d1   nemotron-embedding-ms            Up 4 seconds  (healthy)\n",
     "6d62008a9b42   compose-nemoretriever-ocr-1      Up 10 minutes\n",
     "969b9f5c987c   compose-table-structure-1        Up 10 minutes\n",
     "```\n",
@@ -2030,9 +2040,9 @@
     "\n",
     "## 📚 Additional Resources\n",
     "\n",
-    "- **Documentation**: https://docs.nvidia.com/ai-blueprints/rag/latest/\n",
+    "- **Documentation**: https://docs.nvidia.com/rag/latest/\n",
     "- **GitHub**: https://github.com/NVIDIA-AI-Blueprints/rag\n",
-    "- **Support Matrix**: https://docs.nvidia.com/ai-blueprints/rag/latest/support-matrix.html\n",
+    "- **Support Matrix**: https://docs.nvidia.com/rag/latest/support-matrix.html\n",
     "\n",
     "---\n",
     "\n",
@@ -2056,7 +2066,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.12"
+   "version": "3.12.13"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/nb_metadata.ipynb b/notebooks/nb_metadata.ipynb
index ad79dd7c7..39203d07b 100644
--- a/notebooks/nb_metadata.ipynb
+++ b/notebooks/nb_metadata.ipynb
@@ -951,8 +951,8 @@
     "  \"enable_reranker\": True,\n",
     "  \"enable_citations\": True,\n",
     "  \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "  \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "  \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "  \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "  \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "  # Provide url of the model endpoints if deployed elsewhere\n",
     "  # \"llm_endpoint\": \"\",\n",
     "  #\"embedding_endpoint\": \"\",\n",
@@ -1004,8 +1004,8 @@
     "  \"enable_reranker\": True,\n",
     "  \"enable_citations\": True,\n",
     "  \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "  \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "  \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "  \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "  \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "  # Provide url of the model endpoints if deployed elsewhere\n",
     "  # \"llm_endpoint\": \"\",\n",
     "  #\"embedding_endpoint\": \"\",\n",
@@ -1119,8 +1119,8 @@
     "    \"enable_citations\": True,\n",
     "    \"enable_filter_generator\": False,  # Disable to use manual complex filter\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    \"stop\": [],\n",
     "    \"filter_expr\": '(content_metadata[\"manufacturer\"] like \"%ford%\" and content_metadata[\"rating\"] > 4.0 and content_metadata[\"created_date\"] between \"2020-01-01\" and \"2024-12-31\" and content_metadata[\"is_public\"] == true) or (content_metadata[\"model\"] like \"%edge%\" and content_metadata[\"year\"] >= 2020 and content_metadata[\"tags\"] in [\"technology\", \"safety\", \"latest\"] and content_metadata[\"rating\"] >= 4.0)'\n",
     "}\n",
@@ -1186,8 +1186,8 @@
     "    \"enable_reranker\": True,\n",
     "    \"enable_citations\": True,\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    \"stop\": [],\n",
     "    \"filter_expr\": 'array_contains(content_metadata[\"tags\"], \"eco-friendly\")'\n",
     "}\n",
@@ -1257,8 +1257,8 @@
     "    \"enable_citations\": True,\n",
     "    \"enable_filter_generator\": True,  # 🎯 NEW FEATURE - Enable AI filter generation\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    \"stop\": [],\n",
     "    \"filter_expr\": \"\"  # Will be generated automatically by AI\n",
     "}\n",
@@ -1323,8 +1323,8 @@
     "  \"enable_reranker\": False,\n",
     "  \"enable_citations\": False,\n",
     "  \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "  \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "  \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "  \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "  \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "  # Provide url of the model endpoints if deployed elsewhere\n",
     "  # \"llm_endpoint\": \"\",\n",
     "  #\"embedding_endpoint\": \"\",\n",
@@ -1391,8 +1391,8 @@
     "    \"enable_citations\": True,\n",
     "    \"enable_filter_generator\": False,\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    \"stop\": [],\n",
     "    \"filter_expr\": 'content_metadata[\"nonexistent_field\"] == \"value\"'  # This will cause an error\n",
     "}\n",
diff --git a/notebooks/rag_event_ingest.ipynb b/notebooks/rag_event_ingest.ipynb
new file mode 100644
index 000000000..a38f976af
--- /dev/null
+++ b/notebooks/rag_event_ingest.ipynb
@@ -0,0 +1,793 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Document Continuous Ingestion from Object Storage\n",
+        "\n",
+        "## Purpose\n",
+        "\n",
+        "This notebook demonstrates an **automated document ingestion pipeline** that:\n",
+        "\n",
+        "1. Monitors emulated object storage for new uploads via Kafka events\n",
+        "2. Routes documents to appropriate AI services for indexing\n",
+        "3. Enables RAG Agent for semantic search and contextual Q&A over all ingested content\n",
+        "\n",
+        "## What Gets Deployed\n",
+        "\n",
+        "1. **NVIDIA RAG** - Document indexing, vector search, and AI-powered Q&A (NIMs, Milvus, Ingestor)\n",
+        "2. **Continuous Ingestion** - Event-driven ingestion pipeline (Kafka, MinIO, Consumer)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Prerequisites\n",
+        "\n",
+        "### Hardware\n",
+        "- **GPU**: 2x RTX PRO 6000 Blackwell or 2x H100\n",
+        "\n",
+        "#### Default GPU Assignment\n",
+        "\n",
+        "| GPU | Service |\n",
+        "|-----|---------|\n",
+        "| 0 | RAG NIMs (Embedding, Reranker) |\n",
+        "| 1 | RAG LLM NIM (Llama-3.3-Nemotron-Super-49B) |\n",
+        "\n",
+        "\n",
+        "### Software (pre-installed required)\n",
+        "- Ubuntu 22.04 or later\n",
+        "- Docker 24.0+ with Docker Compose v2\n",
+        "- NVIDIA Driver 570+\n",
+        "- NVIDIA Container Toolkit\n",
+        "\n",
+        "### API Keys\n",
+        "\n",
+        "<table style=\"margin-left: 0;\">\n",
+        "<tr><th>Key</th><th>Purpose</th><th>How to Get</th></tr>\n",
+        "<tr><td><code>NGC_API_KEY</code></td><td>Docker login, NIM deployments</td><td><a href=\"https://org.ngc.nvidia.com/setup/api-keys\">NGC Portal</a> → Generate API Key</td></tr>\n",
+        "</table>\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Table of Contents\n",
+        "\n",
+        "<table style=\"margin-left: 0;\">\n",
+        "<tr><th>Section</th><th>Description</th></tr>\n",
+        "<tr><td><b>Setup</b></td><td>Clone repo, install deps, set API keys, load helpers</td></tr>\n",
+        "<tr><td><b>Deploy RAG</b></td><td>NIMs, Vector DB, Ingestor, RAG Server</td></tr>\n",
+        "<tr><td><b>Deploy Continuous Ingestion</b></td><td>Kafka, MinIO, Consumer</td></tr>\n",
+        "<tr><td><b>Testing</b></td><td>Upload documents, query RAG</td></tr>\n",
+        "<tr><td><b>Clean Up</b></td><td>Stop services, clean data</td></tr>\n",
+        "</table>\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## References\n",
+        "\n",
+        "- **RAG Blueprint**: [NVIDIA RAG Documentation](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md)\n",
+        "- **NIM**: [NVIDIA NIM Documentation](https://docs.nvidia.com/nim/index.html)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Setup\n",
+        "\n",
+        "Clone the repository, configure API keys, and load helper functions.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 1. Clone Repository\n",
+        "\n",
+        "Clone the RAG Blueprint repo to `~/rag`. This includes the consumer source code, deploy configs, and sample test data.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import subprocess, sys, os, shutil\n",
+        "\n",
+        "RAG_REPO_DIR = os.path.expanduser(\"~/rag\")\n",
+        "RAG_REPO_URL = \"https://github.com/NVIDIA-AI-Blueprints/rag.git\"\n",
+        "\n",
+        "# Ensure git-lfs is installed before any LFS operations\n",
+        "if not shutil.which(\"git-lfs\"):\n",
+        "    print(\"[INSTALLING] git-lfs...\")\n",
+        "    subprocess.run(\"sudo apt-get update && sudo apt-get install -y git-lfs && git lfs install\", shell=True, check=True)\n",
+        "else:\n",
+        "    print(\"[OK] git-lfs found\")\n",
+        "\n",
+        "# Clone the repository's default branch (skip if the directory already exists)\n",
+        "if not os.path.exists(RAG_REPO_DIR):\n",
+        "    subprocess.run(f\"git clone {RAG_REPO_URL} {RAG_REPO_DIR}\", shell=True, check=True)\n",
+        "else:\n",
+        "    print(f\"[OK] RAG repo already exists: {RAG_REPO_DIR}\")\n",
+        "subprocess.run(\"git lfs pull\", shell=True, cwd=RAG_REPO_DIR, check=True)\n",
+        "\n",
+        "# Verify\n",
+        "for path in [\"deploy/compose\", \"examples/rag_event_ingest/kafka_consumer\", \"examples/rag_event_ingest/data\"]:\n",
+        "    status = \"[OK]\" if os.path.exists(os.path.join(RAG_REPO_DIR, path)) else \"[MISSING]\"\n",
+        "    print(f\"  {status} {path}\")\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 2. Install Dependencies\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "! python3 -m ensurepip --upgrade"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Ensure pip is available (some minimal Python installs lack it)\n",
+        "subprocess.run([sys.executable, \"-m\", \"ensurepip\", \"--upgrade\"], capture_output=True)\n",
+        "\n",
+        "def check_install_system_pkg(cmd: str, install_cmd: str):\n",
+        "    if shutil.which(cmd):\n",
+        "        print(f\"  [OK] {cmd} found\")\n",
+        "        return True\n",
+        "    print(f\"  [INSTALLING] {cmd}...\")\n",
+        "    result = subprocess.run(install_cmd, shell=True, capture_output=True, text=True)\n",
+        "    if result.returncode == 0:\n",
+        "        print(f\"  [OK] {cmd} installed\")\n",
+        "        return True\n",
+        "    print(f\"  [ERROR] Failed to install {cmd}. Please install manually: {install_cmd}\")\n",
+        "    return False\n",
+        "\n",
+        "check_install_system_pkg(\"git\", \"sudo apt-get update && sudo apt-get install -y git\")\n",
+        "\n",
+        "# Install Python packages\n",
+        "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"minio\", \"aiohttp\", \"requests\", \"python-dotenv\", \"pyyaml\"])\n",
+        "print(\"[OK] Ready\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 3. Set API Keys\n",
+        "\n",
+        "Configure NGC API key for NIM deployments.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import getpass\n",
+        "\n",
+        "def set_api_key(env_var: str, prompt: str, required: bool = True):\n",
+        "    if os.environ.get(env_var):\n",
+        "        print(f\"  [OK] {env_var} already set ({os.environ[env_var][:10]}...)\")\n",
+        "        return True\n",
+        "    key = getpass.getpass(prompt)\n",
+        "    if key:\n",
+        "        os.environ[env_var] = key\n",
+        "        print(f\"  [OK] {env_var} set\")\n",
+        "        return True\n",
+        "    if required:\n",
+        "        print(f\"  [ERROR] {env_var} is required\")\n",
+        "        return False\n",
+        "    print(f\"  [SKIP] {env_var} (optional)\")\n",
+        "    return True\n",
+        "\n",
+        "set_api_key(\"NGC_API_KEY\", \"Enter NGC_API_KEY (starts with 'nvapi-'): \", required=True)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 4. Helper Functions\n",
+        "\n",
+        "Shared utilities for deployment, file upload, status checks, and RAG queries.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Install dependencies\n",
+        "import sys\n",
+        "!{sys.executable} -m pip install -q minio aiohttp requests python-dotenv"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os, sys, json, re, subprocess, time, socket, asyncio\n",
+        "import aiohttp, requests\n",
+        "from typing import List, Optional, Dict\n",
+        "\n",
+        "try:\n",
+        "    from minio import Minio\n",
+        "    from minio.error import S3Error\n",
+        "except ImportError:\n",
+        "    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"minio\"])\n",
+        "    from minio import Minio\n",
+        "    from minio.error import S3Error\n",
+        "\n",
+        "# =============================================================================\n",
+        "# CONFIGURATION\n",
+        "# =============================================================================\n",
+        "\n",
+        "# Paths relative to RAG repo root\n",
+        "RAG_REPO_DIR = os.path.expanduser(\"~/rag\")\n",
+        "EXAMPLE_DIR = os.path.join(RAG_REPO_DIR, \"examples/rag_event_ingest\")\n",
+        "AIDP_COMPOSE_FILE = os.path.join(EXAMPLE_DIR, \"deploy/docker-compose.yaml\")\n",
+        "DATA_DIR = os.path.join(EXAMPLE_DIR, \"data\")\n",
+        "RAG_SERVER_URL = \"http://localhost:8081\"\n",
+        "INGESTOR_URL = \"http://localhost:8082\"\n",
+        "\n",
+        "LOCAL_NIM_CACHE = os.path.expanduser(\"~/.cache/nim\")\n",
+        "\n",
+        "MINIO_ENDPOINT = \"localhost:9201\"\n",
+        "MINIO_ACCESS_KEY = \"minioadmin\"\n",
+        "MINIO_SECRET_KEY = \"minioadmin\"\n",
+        "MINIO_BUCKET = \"aidp-bucket\"\n",
+        "MINIO_COLLECTION = \"aidp_bucket\"\n",
+        "MINIO_CONSOLE_PORT = 9211\n",
+        "\n",
+        "# =============================================================================\n",
+        "# SHARED UTILITIES\n",
+        "# =============================================================================\n",
+        "\n",
+        "def run_command(cmd: str, capture: bool = False) -> Optional[str]:\n",
+        "    \"\"\"Execute a shell command and print it.\"\"\"\n",
+        "    print(f\"$ {cmd}\")\n",
+        "    result = subprocess.run(cmd, shell=True, capture_output=capture, text=True)\n",
+        "    return result.stdout if capture else None\n",
+        "\n",
+        "def get_host_ip() -> str:\n",
+        "    \"\"\"Get host IP address for external access URLs.\"\"\"\n",
+        "    try:\n",
+        "        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)\n",
+        "        s.connect((\"8.8.8.8\", 80))\n",
+        "        ip = s.getsockname()[0]\n",
+        "        s.close()\n",
+        "        return ip\n",
+        "    except OSError:\n",
+        "        return \"localhost\"\n",
+        "\n",
+        "def get_minio_client() -> Minio:\n",
+        "    \"\"\"Create MinIO client for AIDP bucket operations.\"\"\"\n",
+        "    return Minio(MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)\n",
+        "\n",
+        "def upload_file(local_path: str, object_name: Optional[str] = None) -> bool:\n",
+        "    \"\"\"Upload a local file to MinIO AIDP bucket.\"\"\"\n",
+        "    if not os.path.exists(local_path):\n",
+        "        print(f\"[ERROR] File not found: {local_path}\")\n",
+        "        return False\n",
+        "    obj = object_name or os.path.basename(local_path)\n",
+        "    try:\n",
+        "        client = get_minio_client()\n",
+        "        if not client.bucket_exists(MINIO_BUCKET):\n",
+        "            client.make_bucket(MINIO_BUCKET)\n",
+        "        client.fput_object(MINIO_BUCKET, obj, local_path)\n",
+        "        print(f\"[OK] Uploaded: {obj}\")\n",
+        "        return True\n",
+        "    except S3Error as e:\n",
+        "        print(f\"[ERROR] {e}\")\n",
+        "        return False\n",
+        "\n",
+        "def verify_file_in_storage(object_name: str, bucket: str = MINIO_BUCKET) -> bool:\n",
+        "    \"\"\"Check if a file exists in MinIO bucket and print verification status.\"\"\"\n",
+        "    try:\n",
+        "        client = get_minio_client()\n",
+        "        stat = client.stat_object(bucket, object_name)\n",
+        "        print(f\"[OK] File verified in storage:\")\n",
+        "        print(f\"  Bucket:   {bucket}\")\n",
+        "        print(f\"  Object:   {object_name}\")\n",
+        "        print(f\"  Size:     {stat.size:,} bytes\")\n",
+        "        print(f\"  Modified: {stat.last_modified}\")\n",
+        "        return True\n",
+        "    except S3Error as e:\n",
+        "        print(f\"[ERROR] File not found in storage: {object_name}\")\n",
+        "        print(f\"  Error: {e}\")\n",
+        "        return False\n",
+        "\n",
+        "def get_consumer_logs(lines: int = 30) -> None:\n",
+        "    \"\"\"Show recent Kafka consumer logs.\"\"\"\n",
+        "    run_command(f\"docker logs kafka-consumer --tail {lines}\")\n",
+        "\n",
+        "async def query_rag(question: str, collection: str = None) -> Optional[str]:\n",
+        "    \"\"\"Query RAG system and print the answer.\"\"\"\n",
+        "    coll = collection or MINIO_COLLECTION\n",
+        "    print(f\"Q: {question}\\nCollection: {coll}\\n\" + \"-\" * 40)\n",
+        "\n",
+        "    payload = {\n",
+        "        \"messages\": [{\"role\": \"user\", \"content\": question}],\n",
+        "        \"use_knowledge_base\": True,\n",
+        "        \"collection_names\": [coll],\n",
+        "    }\n",
+        "    try:\n",
+        "        async with aiohttp.ClientSession() as session:\n",
+        "            async with session.post(\n",
+        "                f\"{RAG_SERVER_URL}/generate\", json=payload,\n",
+        "                timeout=aiohttp.ClientTimeout(total=120),\n",
+        "            ) as resp:\n",
+        "                text = await resp.text()\n",
+        "                # Parse SSE response: extract content from each \"data: {...}\" line\n",
+        "                chunks = []\n",
+        "                for line in text.split(\"\\n\"):\n",
+        "                    if not line.startswith(\"data: \") or line[6:] == \"[DONE]\":\n",
+        "                        continue\n",
+        "                    try:\n",
+        "                        msg = json.loads(line[6:]).get(\"choices\", [{}])[0].get(\"message\", {})\n",
+        "                        if msg.get(\"content\"):\n",
+        "                            chunks.append(msg[\"content\"])\n",
+        "                    except json.JSONDecodeError:\n",
+        "                        pass\n",
+        "                answer = \"\".join(chunks)\n",
+        "                print(f\"Answer: {answer}\")\n",
+        "                return answer\n",
+        "    except aiohttp.ClientError as e:\n",
+        "        print(f\"[ERROR] {e}\")\n",
+        "        return None\n",
+        "\n",
+        "print(f\"[OK] Helpers loaded | Host IP: {get_host_ip()}\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Deploy NVIDIA RAG\n",
+        "\n",
+        "Deploy the NVIDIA RAG stack: NIMs (LLM, Embedding, Reranker), the Milvus vector database, the Ingestor server, and the RAG server.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "ngc_key = os.environ.get(\"NGC_API_KEY\")\n",
+        "if not ngc_key:\n",
+        "    raise RuntimeError(\"NGC_API_KEY not set! Run the API keys cell first.\")\n",
+        "\n",
+        "os.chdir(RAG_REPO_DIR)\n",
+        "\n",
+        "# Set env vars needed by docker compose\n",
+        "os.environ[\"NGC_API_KEY\"] = ngc_key\n",
+        "os.environ[\"USERID\"] = f\"{os.getuid()}:{os.getgid()}\"\n",
+        "os.environ[\"COLLECTION_NAME\"] = MINIO_COLLECTION\n",
+        "\n",
+        "# Load RAG .env defaults (MODEL_DIRECTORY, etc.)\n",
+        "from dotenv import load_dotenv\n",
+        "env_file = os.path.join(RAG_REPO_DIR, \"deploy/compose/.env\")\n",
+        "if os.path.exists(env_file):\n",
+        "    load_dotenv(env_file, override=False)\n",
+        "\n",
+        "# Login to nvcr.io\n",
+        "subprocess.run(f\"echo {ngc_key} | docker login nvcr.io -u '$oauthtoken' --password-stdin\",\n",
+        "               shell=True, capture_output=True, text=True, executable=\"/bin/bash\")\n",
+        "\n",
+        "# Deploy components\n",
+        "for label, compose_file in [\n",
+        "    (\"NIMs\",      \"deploy/compose/nims.yaml\"),\n",
+        "    (\"Vector DB\", \"deploy/compose/vectordb.yaml\"),\n",
+        "]:\n",
+        "    print(f\"Deploying {label}...\")\n",
+        "    run_command(f\"USERID=$(id -u) docker compose -f {compose_file} up -d\")\n",
+        "\n",
+        "print(\"Waiting 30s for Milvus...\")\n",
+        "time.sleep(30)\n",
+        "\n",
+        "for label, compose_file in [\n",
+        "    (\"Ingestor\", \"deploy/compose/docker-compose-ingestor-server.yaml\"),\n",
+        "    (\"RAG Server\", \"deploy/compose/docker-compose-rag-server.yaml\"),\n",
+        "]:\n",
+        "    print(f\"Deploying {label}...\")\n",
+        "    run_command(f\"docker compose -f {compose_file} up -d\")\n",
+        "\n",
+        "ip = get_host_ip()\n",
+        "print(f\"\\nRAG deployed: http://{ip}:8081 (server) | http://{ip}:8082 (ingestor) | http://{ip}:8090 (UI)\")\n",
+        "print(f\"COLLECTION_NAME: {MINIO_COLLECTION}\")\n",
+        "print(\"Wait ~10 minutes for NIMs to load models, then run the status check cell.\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Verify RAG services are healthy. Wait ~10 minutes for NIMs to load models.\n",
+        "\n",
+        "The deployment status should be:\n",
+        "```\n",
+        "NAMES                            STATUS\n",
+        "rag-frontend                     Up About a minute\n",
+        "rag-server                       Up About a minute\n",
+        "ingestor-server                  Up About a minute\n",
+        "milvus-standalone                Up 2 minutes (healthy)\n",
+        "milvus-etcd                      Up 2 minutes (healthy)\n",
+        "milvus-minio                     Up 2 minutes (healthy)\n",
+        "nim-llm-ms                       Up 2 minutes (healthy)\n",
+        "nemotron-embedding-ms            Up 2 minutes (healthy)\n",
+        "nemotron-ranking-ms              Up 2 minutes (healthy)\n",
+        "```\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check service status and print access URLs\n",
+        "print(\"Wait ~10 minutes for services to become healthy.\")\n",
+        "print(\"Run this cell again after waiting.\\n\")\n",
+        "\n",
+        "ip = get_host_ip()\n",
+        "for name, port, path in [\n",
+        "    (\"RAG Server\", 8081, \"/health\"), (\"Ingestor\", 8082, \"/health\"),\n",
+        "    (\"Frontend\", 8090, \"/\"), (\"Milvus\", 19530, \"/v1/vector/collections\"),\n",
+        "]:\n",
+        "    try:\n",
+        "        s = \"[OK]\" if requests.get(f\"http://localhost:{port}{path}\", timeout=10).status_code == 200 else \"[WARN]\"\n",
+        "    except requests.ConnectionError:\n",
+        "        s = \"[DOWN]\"\n",
+        "    except requests.Timeout:\n",
+        "        s = \"[TIMEOUT]\"\n",
+        "    print(f\"  {s} {name}: http://{ip}:{port}\")\n",
+        "run_command(\"docker ps --format 'table {{.Names}}\\t{{.Status}}' | grep -E '(rag|milvus|ingestor|nim|nemotron|NAMES)'\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Deploy Continuous Ingestion from emulated object storage\n",
+        "\n",
+        "Deploy the Continuous Ingestion stack: the Kafka message broker, MinIO object storage, and the Kafka consumer for automated ingestion.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 1. Deploy Services\n",
+        "\n",
+        "Deploy Kafka, MinIO, and the Kafka consumer."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Verify prerequisites\n",
+        "net_check = subprocess.run(\"docker network inspect nvidia-rag\", shell=True, capture_output=True)\n",
+        "if net_check.returncode != 0:\n",
+        "    raise RuntimeError(\"nvidia-rag network not found. Deploy RAG first.\")\n",
+        "\n",
+        "ngc_key = os.environ.get(\"NGC_API_KEY\", \"\")\n",
+        "if not ngc_key:\n",
+        "    raise RuntimeError(\"NGC_API_KEY not set!\")\n",
+        "\n",
+        "host_ip = get_host_ip()\n",
+        "\n",
+        "# Set environment variables for docker compose\n",
+        "os.environ[\"HOST_IP\"] = host_ip\n",
+        "\n",
+        "# Login + pull + build\n",
+        "subprocess.run(f\"echo {ngc_key} | docker login nvcr.io -u '$oauthtoken' --password-stdin\",\n",
+        "               shell=True, capture_output=True, text=True, executable=\"/bin/bash\")\n",
+        "\n",
+        "compose = f\"docker compose -f {AIDP_COMPOSE_FILE}\"\n",
+        "subprocess.run(f\"{compose} pull --ignore-pull-failures\", shell=True, capture_output=True, text=True, executable=\"/bin/bash\")\n",
+        "subprocess.run(f\"{compose} up -d --build\", shell=True, capture_output=True, text=True, executable=\"/bin/bash\")\n",
+        "\n",
+        "print(f\"Continuous Ingestion deployed:\")\n",
+        "print(f\"  Kafka UI:      http://{host_ip}:8080\")\n",
+        "print(f\"  MinIO Console: http://{host_ip}:{MINIO_CONSOLE_PORT}\")\n",
+        "print(f\"  Credentials:   minioadmin / minioadmin\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Verify continuous ingestion services are running.\n",
+        "\n",
+        "The deployment status should be:\n",
+        "```\n",
+        "NAMES                            STATUS\n",
+        "kafka-consumer                   Up About a minute\n",
+        "aidp-kafka-ui                    Up About a minute\n",
+        "aidp-minio-mc                    Up About a minute\n",
+        "aidp-minio                       Up About a minute (healthy)\n",
+        "kafka                            Up About a minute (healthy)\n",
+        "```\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check service status and print access URLs\n",
+        "ip = get_host_ip()\n",
+        "for name, port, path in [\n",
+        "    (\"Kafka UI\", 8080, \"/\"),\n",
+        "    (\"MinIO Console\", MINIO_CONSOLE_PORT, \"/\"),\n",
+        "]:\n",
+        "    try:\n",
+        "        s = \"[OK]\" if requests.get(f\"http://localhost:{port}{path}\", timeout=10).status_code == 200 else \"[WARN]\"\n",
+        "    except requests.ConnectionError:\n",
+        "        s = \"[DOWN]\"\n",
+        "    except requests.Timeout:\n",
+        "        s = \"[TIMEOUT]\"\n",
+        "    print(f\"  {s} {name}: http://{ip}:{port}\")\n",
+        "\n",
+        "# Check kafka-consumer container status\n",
+        "result = subprocess.run(\"docker inspect -f '{{.State.Status}}' kafka-consumer 2>/dev/null\",\n",
+        "                        shell=True, capture_output=True, text=True)\n",
+        "status = result.stdout.strip()\n",
+        "s = \"[OK]\" if status == \"running\" else \"[DOWN]\"\n",
+        "print(f\"  {s} Kafka Consumer: {status or 'not found'}\")\n",
+        "\n",
+        "run_command(\"docker ps --format 'table {{.Names}}\\t{{.Status}}' | grep -E '(kafka|minio|NAMES)'\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Testing\n",
+        "\n",
+        "Test the deployment by uploading documents, then querying via RAG.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 1. Document Upload\n",
+        "\n",
+        "Upload a PDF document to MinIO, which triggers automatic ingestion via the Kafka consumer.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 1.1 Upload to Storage\n",
+        "\n",
+        "Upload the document to MinIO object storage.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Sample documents are included in the repo under examples/rag_event_ingest/data/\n",
+        "pdf_path = os.path.join(DATA_DIR, \"documents\", \"Seahawks-Patriots in Super Bowl LX_ What We Learned from Seattle's 29-13 win.pdf\")\n",
+        "upload_file(pdf_path, \"Seahawks-Patriots_SuperBowl_LX_Analysis.pdf\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 1.2 Verify File in Storage"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Confirm that the uploaded file landed in the MinIO bucket before checking ingestion.\n",
+        "The output should report the bucket, object name, size, and modification time:\n",
+        "```\n",
+        "[OK] File verified in storage:\n",
+        "  Bucket:   aidp-bucket\n",
+        "  Object:   Seahawks-Patriots_SuperBowl_LX_Analysis.pdf\n",
+        "  Size:     ... bytes\n",
+        "  Modified: ...\n",
+        "```\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Verify file landed in object storage\n",
+        "verify_file_in_storage(\"Seahawks-Patriots_SuperBowl_LX_Analysis.pdf\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 1.3 Verify Document Ingestion\n",
+        "\n",
+        "Check consumer logs to verify document processing status.\n",
+        "\n",
+        "The logs should show the document being picked up and successfully ingested:\n",
+        "```\n",
+        "services.document_indexer - INFO - Task ...: PENDING (0s)\n",
+        "services.document_indexer - INFO - Task ...: PENDING (5s)\n",
+        "handlers.base - INFO - [DocumentHandler] ✓ Seahawks-Patriots_SuperBowl_LX_Analysis.pdf → SUCCESS\n",
+        "consumer - INFO - ✓ SUMMARY: Seahawks-Patriots_SuperBowl_LX_Analysis.pdf | Collection: aidp_bucket | Duration: 12.76s | Status: SUCCESS\n",
+        "```"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check consumer logs for ingestion status\n",
+        "print(\"Waiting for document processing...\")\n",
+        "get_consumer_logs(50)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 1.4 Query Document via RAG\n",
+        "\n",
+        "You can query the ingested document either **programmatically** below or via the **RAG Frontend UI**.\n",
+        "\n",
+        "> **💡 RAG Frontend**: Open `http://<host-ip>:8090` in your browser for an interactive Q&A interface.\n",
+        "> Make sure to select the collection **`aidp_bucket`** in the UI.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Query the document\n",
+        "await query_rag(\"What was the final score and who won Super Bowl LX?\", MINIO_COLLECTION)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Ask another question about the document.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Query about key takeaways\n",
+        "await query_rag(\"What were the key lessons learned from Seattle's victory in Super Bowl LX?\", MINIO_COLLECTION)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Clean Up\n",
+        "\n",
+        "Stop all services and clean up ingested data.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 1. Stop RAG Deployment\n",
+        "\n",
+        "Stop all RAG services (NIMs, Milvus, Ingestor, RAG server).\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "os.chdir(RAG_REPO_DIR)\n",
+        "for f in [\n",
+        "    \"deploy/compose/docker-compose-rag-server.yaml\",\n",
+        "    \"deploy/compose/docker-compose-ingestor-server.yaml\",\n",
+        "    \"deploy/compose/vectordb.yaml\",\n",
+        "    \"deploy/compose/nims.yaml\",\n",
+        "]:\n",
+        "    run_command(f\"docker compose -f {f} down\")\n",
+        "print(\"[OK] RAG stopped\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 2. Stop Continuous Ingestion Deployment\n",
+        "\n",
+        "Stop the Continuous Ingestion services (Kafka, MinIO, Consumer).\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "run_command(f\"docker compose -f {AIDP_COMPOSE_FILE} down\")\n",
+        "print(\"[OK] Continuous ingestion stopped\")\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 4
+}
diff --git a/notebooks/rag_library_lite_usage.ipynb b/notebooks/rag_library_lite_usage.ipynb
index eb91c80b6..915eb00ca 100644
--- a/notebooks/rag_library_lite_usage.ipynb
+++ b/notebooks/rag_library_lite_usage.ipynb
@@ -36,7 +36,7 @@
     "\n",
     "Install nv-ingest library using below command - **OR** - Run the cell below if Jupyter notebook is started in the same environment:\n",
     "```bash\n",
-    "uv pip install nv-ingest==26.1.1\n",
+    "uv pip install nv-ingest==26.1.2\n",
     "```"
    ]
   },
@@ -71,7 +71,7 @@
     "# !uv pip install ../dist/nvidia_rag-*-py3-none-any.whl[all]\n",
     "\n",
     "# Install NV-Ingest library in the same environment to run NV-Ingest pipeline\n",
-    "!uv pip install nv-ingest==26.1.1"
+    "!uv pip install nv-ingest==26.1.2"
    ]
   },
   {
@@ -150,15 +150,15 @@
     "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
     "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
     ")\n",
     "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\""
    ]
@@ -292,7 +292,7 @@
     "\n",
     "config_ingestor = NvidiaRAGConfig.from_yaml(\"config.yaml\")\n",
     "# You can update the config object to use different models and endpoints like below\n",
-    "# config_ingestor.embeddings.model_name = \"nvidia/llama-3.2-nv-embedqa-1b-v2\"\n",
+    "# config_ingestor.embeddings.model_name = \"nvidia/llama-nemotron-embed-1b-v2\"\n",
     "# config_ingestor.embeddings.server_url = \"https://integrate.api.nvidia.com/v1\"\n",
     "\n",
     "# Set config for rag lite library mode\n",
diff --git a/notebooks/rag_library_usage.ipynb b/notebooks/rag_library_usage.ipynb
index 01f07fb4a..894e82620 100644
--- a/notebooks/rag_library_usage.ipynb
+++ b/notebooks/rag_library_usage.ipynb
@@ -75,7 +75,8 @@
    "outputs": [],
    "source": [
     "# Option A: Install from PyPI (recommended)\n",
-    "# Uncomment the line below to install from PyPI\n",
+    "# Uncomment the line below to install from PyPI.\n",
+    "# Note: This will require a restart of the kernel after installation if you are using this notebook in a JupyterLab session.\n",
     "# !uv pip install nvidia-rag[all]\n",
     "\n",
     "# Option B: Install from source in development mode (for contributors)\n",
@@ -307,12 +308,12 @@
     "Ensure all the below are running and healthy before proceeding further\n",
     "```output\n",
     "NAMES                           STATUS\n",
-    "nemoretriever-ranking-ms        Up ... (healthy)\n",
+    "nemotron-ranking-ms             Up ... (healthy)\n",
     "compose-page-elements-1         Up ...\n",
     "compose-nemoretriever-ocr-1     Up ...\n",
     "compose-graphic-elements-1      Up ...\n",
     "compose-table-structure-1       Up ...\n",
-    "nemoretriever-embedding-ms      Up ... (healthy)\n",
+    "nemotron-embedding-ms           Up ... (healthy)\n",
     "nim-llm-ms                      Up ... (healthy)\n",
     "```"
    ]
@@ -337,15 +338,15 @@
     "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
     "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
     ")\n",
     "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\""
    ]
@@ -439,7 +440,7 @@
     "    config_ingestor.llm.server_url = \"\"  # Empty uses NVIDIA API catalog\n",
     "    config_ingestor.summarizer.server_url = \"\"  # Empty uses NVIDIA API catalog\n",
     "else:\n",
-    "    config_ingestor.embeddings.server_url = \"http://nemoretriever-embedding-ms:8000/v1\"\n",
+    "    config_ingestor.embeddings.server_url = \"http://nemotron-embedding-ms:8000/v1\"\n",
     "ingestor = NvidiaRAGIngestor(config=config_ingestor)"
    ]
   },
@@ -624,11 +625,11 @@
     "#             \"server_url\": \"\",\n",
     "#         },\n",
     "#     \"embeddings\": {\n",
-    "#             \"model_name\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "#             \"model_name\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "#             \"server_url\": \"https://integrate.api.nvidia.com/v1\",\n",
     "#         },\n",
     "#     \"ranking\": {\n",
-    "#             \"model_name\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
+    "#             \"model_name\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
     "#             \"server_url\": \"https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking/v1\",\n",
     "#         },\n",
     "#     }\n",
diff --git a/notebooks/retriever_api_usage.ipynb b/notebooks/retriever_api_usage.ipynb
index 47466e3ed..9b52647ed 100644
--- a/notebooks/retriever_api_usage.ipynb
+++ b/notebooks/retriever_api_usage.ipynb
@@ -145,8 +145,8 @@
     "    \"filter_expr\": \"\",\n",
     "    # Override model endpoints and details if needed\n",
     "    #\"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    #\"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    #\"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    #\"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    #\"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    #\"llm_endpoint\": \"\",\n",
     "    #\"embedding_endpoint\": \"\",\n",
     "    #\"reranker_endpoint\": \"\",\n",
@@ -271,8 +271,8 @@
     "    \"enable_query_rewriting\": False,\n",
     "    \"enable_reranker\": True,\n",
     "    # Override model endpoints and details if needed\n",
-    "    #\"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    #\"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    #\"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    #\"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    #\"embedding_endpoint\": \"\",\n",
     "    #\"reranker_endpoint\": \"\",\n",
     "}\n",
diff --git a/notebooks/summarization.ipynb b/notebooks/summarization.ipynb
index f824f8e7e..d0c7ef285 100644
--- a/notebooks/summarization.ipynb
+++ b/notebooks/summarization.ipynb
@@ -388,12 +388,12 @@
     "Ensure all the below are running and healthy before proceeding further\n",
     "```output\n",
     "NAMES                           STATUS\n",
-    "nemoretriever-ranking-ms        Up ... (healthy)\n",
+    "nemotron-ranking-ms             Up ... (healthy)\n",
     "compose-page-elements-1         Up ...\n",
     "compose-nemoretriever-ocr-1     Up ...\n",
     "compose-graphic-elements-1      Up ...\n",
     "compose-table-structure-1       Up ...\n",
-    "nemoretriever-embedding-ms      Up ... (healthy)\n",
+    "nemotron-embedding-ms           Up ... (healthy)\n",
     "nim-llm-ms                      Up ... (healthy)\n",
     "```"
    ]
@@ -419,15 +419,15 @@
     "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
     "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
     ")\n",
     "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\""
    ]
@@ -548,8 +548,8 @@
     "    config.ranking.server_url = \"https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking/v1\"\n",
     "    config.summarizer.server_url = \"\"  # Empty uses NVIDIA API catalog\n",
     "else:\n",
-    "    config.embeddings.server_url = \"nemoretriever-embedding-ms:8000/v1\"\n",
-    "    config.ranking.server_url = \"nemoretriever-ranking-ms:8000\"\n",
+    "    config.embeddings.server_url = \"nemotron-embedding-ms:8000/v1\"\n",
+    "    config.ranking.server_url = \"nemotron-ranking-ms:8000\"\n",
     "    config.summarizer.server_url = \"nim-llm:8000\"\n",
     "    config.llm.server_url = \"nim-llm:8000\"\n",
     "\n",
@@ -967,7 +967,7 @@
     "else:\n",
     "    os.environ[\"SUMMARY_LLM_SERVERURL\"] = \"nim-llm:8000\"\n",
     "    os.environ[\"LLM_SERVER_URL\"] = \"nim-llm:8000\"\n",
-    "    os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"nemoretriever-embedding-ms:8000/v1\"\n",
+    "    os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"nemotron-embedding-ms:8000/v1\"\n",
     "    print(\"✓ Configured for on-prem NIMs\")\n",
     "\n",
     "os.environ[\"LOGLEVEL\"] = \"INFO\"\n",
diff --git a/pyproject.toml b/pyproject.toml
index 09b647403..acdf4f53c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "nvidia_rag"
-version = "2.4.0.dev"
+version = "2.5.0.dev"
 description = "This blueprint serves as a reference solution for a foundational Retrieval Augmented Generation (RAG) pipeline."
 readme = "README.md"
 license = "Apache-2.0"
@@ -23,7 +23,7 @@ dependencies = [
     "langchain>=1.2.7",
     "langchain-community>=0.4",
     "langchain-milvus>=0.3.0",
-    "langchain-nvidia-ai-endpoints>=1.0.3",
+    "langchain-nvidia-ai-endpoints>=1.2.0",
     "minio>=7.2,<8.0",
     "pdfplumber>=0.11.9",
     "pydantic>=2.11,<3.0",
@@ -58,8 +58,8 @@ rag = [
 ]
 ingest = [
     # nv-ingest dependencies (required for ingestion operations)
-    "nv-ingest-api==26.1.1",
-    "nv-ingest-client==26.1.1",
+    "nv-ingest-api==26.1.2",
+    "nv-ingest-client==26.1.2",
     "tritonclient==2.57.0",
     # Other ingest dependencies
     "langchain-openai>=0.2",
@@ -80,8 +80,8 @@ ingest = [
 ]
 all = [
     # nv-ingest dependencies (required for ingestion operations)
-    "nv-ingest-api==26.1.1",
-    "nv-ingest-client==26.1.1",
+    "nv-ingest-api==26.1.2",
+    "nv-ingest-client==26.1.2",
     "tritonclient==2.57.0",
     # RAG + Ingest dependencies
     "langchain-openai>=0.2",
diff --git a/skill-source/.agents/skills/rag-blueprint/SKILL.md b/skill-source/.agents/skills/rag-blueprint/SKILL.md
new file mode 100644
index 000000000..8f48d2858
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/SKILL.md
@@ -0,0 +1,136 @@
+---
+name: rag-blueprint
+description: "NVIDIA RAG Blueprint — deploy, configure, troubleshoot, and manage. Handles any RAG action: deploy, install, start, enable, disable, toggle, change, configure, troubleshoot, debug, fix, shutdown, stop, or tear down any RAG feature or service (VLM, guardrails, query rewriting, models, search, ingestion, observability, summarization, and more)."
+argument-hint: deploy RAG | enable feature | disable feature | configure | troubleshoot | shutdown
+allowed-tools: Bash(echo *), Bash(nvidia-smi *), Bash(curl *), Bash(docker ps *), Bash(docker exec *), Bash(docker info *), Bash(docker --version *), Bash(docker compose version *), Bash(docker logs *), Bash(docker system *), Bash(kubectl get *), Bash(kubectl describe *), Bash(kubectl version *), Bash(kubectl logs *), Bash(helm version *), Bash(helm list *), Bash(git rev-parse *), Bash(git describe *), Bash(git status *), Bash(python3 --version *), Bash(pip3 show *), Bash(df *), Bash(du *), Bash(cat /proc/*), Bash(cat /etc/os-release *), Bash(ss *), Bash(netstat *), Bash(ls *), Bash(grep *), Bash(lsof *), Bash(ps aux *), Read, Grep, Glob
+license: Apache-2.0
+metadata:
+  author: nvidia-rag-team
+  version: "1.0"
+---
+
+# NVIDIA RAG Blueprint
+
+## Autonomy Principles
+
+- Auto-detect everything: GPU, VRAM, drivers, Docker, CUDA, disk, OS, ports, existing services, NGC key, repo state.
+- If it can be checked with a command, check it — don't ask the user.
+- Ask only when user action is required: providing an API key, confirming data deletion, or choosing between equally valid options.
+- Once analysis is done, route to the correct workflow and execute.
+
+## Intent Detection
+
+Determine what the user wants and route immediately:
+
+| User Intent | Action |
+|-------------|--------|
+| Deploy, install, set up, start RAG | Read and follow `references/deploy.md` |
+| Configure, enable, change, toggle a feature | Use the **Configure** section below |
+| Troubleshoot, debug, fix, error, unhealthy | Read and follow `references/troubleshoot.md` |
+| Stop, shutdown, tear down, clean up | Read and follow `references/shutdown.md` |
+
+If the intent is ambiguous, infer from context (e.g., "RAG isn't working" → troubleshoot; "get RAG running" → deploy). Only ask if genuinely unclear.
+
+---
+
+## Configure
+
+Requires a running RAG deployment. If services are not running, deploy first via `references/deploy.md`.
+
+Match the user's request to a reference file, then read and follow it:
+
+| Feature Keywords | Reference |
+|-----------------|-----------|
+| VLM, VLM embeddings, image captioning | `references/configure/vlm.md` |
+| NeMo Guardrails | `references/configure/guardrails.md` |
+| Query rewriting, decomposition, multi-turn | `references/configure/query-and-conversation.md` |
+| Ingestion (text-only, audio, Nemotron Parse, OCR, batch CLI, NV-Ingest, volume mount, performance) | `references/configure/ingestion.md` |
+| Search, retrieval, hybrid search, multi-collection, metadata, filters, reranker, topK, accuracy/performance | `references/configure/search-and-retrieval.md` |
+| LLM/embedding/ranking model changes, vector DB, Milvus/Elasticsearch auth, service keys, model profiles, ports/GPU | `references/configure/models-and-infrastructure.md` |
+| Reasoning, self-reflection, prompts, generation params (tokens, temperature, citations), per-request LLM params | `references/configure/reasoning-and-generation.md` |
+| Summarization | `references/configure/summarization.md` |
+| Observability (tracing, Zipkin, Grafana, Prometheus) | `references/configure/observability.md` |
+| Multimodal query (image + text) | `references/configure/multimodal-query.md` |
+| Data catalog (collection/document metadata) | `references/configure/data-catalog.md` |
+| User interface (UI settings) | `references/configure/user-interface.md` |
+| API reference (endpoints, schemas) | `references/configure/api-reference.md` |
+| Evaluation (RAGAS metrics) | `references/configure/evaluation.md` |
+| MCP server & client, agent toolkit | `references/configure/mcp.md` |
+| Migration (version upgrades) | `references/configure/migration.md` |
+| Notebooks (setup and catalog) | `references/configure/notebooks.md` |
+
+### Configure Flow
+
+1. Match the user's request to a reference file from the table above.
+
+2. Detect what's running:
+   ```bash
+   echo "=== NIM ===" && docker ps --format '{{.Names}}' 2>/dev/null | grep -iE '(nim-llm|nemoretriever-embedding|nemoretriever-ranking|nemotron-embedding|nemotron-ranking|nemo-vlm|nemotron-vlm)' || echo "NO_LOCAL_NIMS"; echo "=== RAG ===" && docker ps --format '{{.Names}}' 2>/dev/null | grep -iE '(rag-server|ingestor-server|milvus)' || echo "NO_DOCKER_RAG"; echo "=== K8S ===" && { kubectl get pods -n rag 2>/dev/null || echo "NO_K8S"; } | head -5; echo "=== LIBRARY ===" && ps aux 2>/dev/null | grep -E '(nvidia_rag|uvicorn.*rag)' | grep -v grep || echo "NO_LIBRARY"
+   ```
+
+3. Use this table to determine platform, deployment type, and where config lives:
+
+   | Local NIMs running? | RAG services running? | Deployment Type | Config Location |
+   |---------------------|-----------------------|-----------------|-----------------|
+   | Yes (Docker) | Any | Self-hosted | `deploy/compose/.env` |
+   | No | Yes (Docker) | NVIDIA-hosted | `deploy/compose/nvdev.env` |
+   | Yes (K8s pods) | Any | Self-hosted | `values.yaml` (NIM sections) |
+   | No | Yes (K8s pods) | NVIDIA-hosted | `values.yaml` (envVars) |
+   | — | Library processes | Library mode | `notebooks/config.yaml` |
+   | No | No | Not running | Deploy first via `references/deploy.md` |
+
+   Tell the user what you detected and ask to confirm. Example: "I see local NIM containers running (nim-llm-ms, nemotron-embedding-ms) — this is a self-hosted deployment. Config file is `deploy/compose/.env`. Correct?"
+
+4. Check current feature state before changing anything — read the config location from step 3, then cross-check the live service:
+   - Docker: `docker exec rag-server env 2>/dev/null | grep -E "<VAR_NAME>"`
+   - Helm: `kubectl get pod -n rag -l app=rag-server -o jsonpath='{.items[0].spec.containers[0].env}' 2>/dev/null`
+
+   If the config file and live service disagree, tell the user the service has stale config and will need a restart.
+
+5. If the feature needs extra GPUs, check availability against hardware restrictions (see below):
+   ```bash
+   nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv,noheader 2>/dev/null || echo "NO_GPU"
+   ```
+
+6. Read the reference file and apply changes:
+   - **Docker**: edit the env file (uncomment to enable, re-comment to disable — the env file is the source of truth). Then restart the affected service:
+     ```
+     source <env-file> && docker compose -f deploy/compose/<compose-file> up -d
+     ```
+     | Service | Compose File |
+     |---------|-------------|
+     | rag-server | `docker-compose-rag-server.yaml` |
+     | ingestor-server | `docker-compose-ingestor-server.yaml` |
+     | milvus, etcd, minio | `vectordb.yaml` |
+     | NIM containers (LLM, embedding, ranking, VLM, OCR) | `nims.yaml` |
+     | guardrails | `docker-compose-nemo-guardrails.yaml` |
+     | observability (Grafana, Prometheus, Zipkin) | `observability.yaml` |
+   - **Helm**: edit `values.yaml`, then upgrade: `helm upgrade rag <chart> -n rag -f values.yaml`
+   - **Library**: edit `notebooks/config.yaml`, then restart the Python process
+
+7. Verify:
+   - Docker: `docker ps --format "table {{.Names}}\t{{.Status}}" | head -20; curl -s 'http://localhost:8081/v1/health?check_dependencies=true' 2>/dev/null | head -1`
+   - Helm: `kubectl get pods -n rag; kubectl rollout status deployment/rag-server -n rag --timeout=120s`
+   - Library: `curl -s http://localhost:8081/v1/health 2>/dev/null | head -1`
+
+8. If the restart fails, read `references/troubleshoot.md`. If multiple features were requested, repeat from step 1 for each one.
+
+### When User Says "Configure" Without Specifics
+
+Run steps 2–3 above, then read the identified config file to list what's currently enabled:
+```bash
+grep -E "^(export )?(ENABLE_|APP_)" <config-file> 2>/dev/null | sort
+```
+Summarize what's running and enabled, then ask which feature to change.
+
+---
+
+## Hardware Restrictions
+
+Read `docs/support-matrix.md` for current GPU requirements per deployment mode.
+Read `docs/service-port-gpu-reference.md` for port mappings and GPU assignments.
+
+| GPU | Feature Restrictions |
+|-----|---------------------|
+| B200 | No VLM, No Guardrails, No Nemotron Parse. May need multi-GPU LLM (`LLM_MS_GPU_ID`). |
+| RTX PRO 6000 | No Nemotron Parse. No Audio on Helm. |
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/api-reference.md b/skill-source/.agents/skills/rag-blueprint/references/configure/api-reference.md
new file mode 100644
index 000000000..056d814ba
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/api-reference.md
@@ -0,0 +1,29 @@
+# API Reference
+
+## When to Use
+- User needs to call RAG or Ingestor APIs directly
+- User asks about endpoints, request/response formats, or task status tracking
+
+## Process
+1. Read `docs/api-rag.md` for RAG server endpoints (port 8081)
+2. Read `docs/api-ingestor.md` for Ingestor server endpoints (port 8082)
+3. Consult OpenAPI schemas for exact request/response shapes
+
+## Agent-Specific Notes
+- RAG Server runs on port 8081: `/v1/generate`, `/v1/search`, `/v1/health`, `/v1/configuration`, `/v1/metrics`, `/v1/summary`
+- Ingestor Server runs on port 8082: `/v1/documents`, `/v1/collection`, `/v1/collections`, `/v1/status`
+- `POST /v1/documents` returns a `task_id` — poll `GET /v1/status?task_id=<id>` for progress
+- Task states: `PENDING` → `FINISHED` or `FAILED` (also `UNKNOWN` if not found)
+- NV-Ingest extraction states: `not_started` → `submitted` → `processing` → `completed` or `failed`
+- Max file size: 400 MB per document
+- Full health check: `GET /v1/health?check_dependencies=true`
+
+## Notebooks
+- `notebooks/ingestion_api_usage.ipynb` — ingestion API usage examples
+- `notebooks/retriever_api_usage.ipynb` — RAG retriever API: search and query examples
+
+## Source Documentation
+- `docs/api-rag.md` -- RAG server API details
+- `docs/api-ingestor.md` -- Ingestor server API details
+- `docs/api_reference/openapi_schema_rag_server.json` -- RAG server OpenAPI schema
+- `docs/api_reference/openapi_schema_ingestor_server.json` -- Ingestor server OpenAPI schema
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/data-catalog.md b/skill-source/.agents/skills/rag-blueprint/references/configure/data-catalog.md
new file mode 100644
index 000000000..7b2d6c106
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/data-catalog.md
@@ -0,0 +1,36 @@
+# Data Catalog
+
+## When to Use
+- User wants to manage collection or document metadata for governance
+- User asks about tagging, ownership, or lifecycle status of collections
+- User wants to list or update collection metadata
+
+## Restrictions
+- None — available automatically after deployment, no additional configuration needed
+- Works with both Milvus and Elasticsearch (full feature parity)
+
+## Process
+1. Read `docs/data-catalog.md` for full API reference, field definitions, and examples
+2. All endpoints are on the ingestor server (port `8082`)
+3. Use PATCH endpoints for updates (merge updates — only provided fields change)
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Add governance metadata | `docs/data-catalog.md` | POST `/v1/collection` with description, tags, owner |
+| Update lifecycle status | `docs/data-catalog.md` | PATCH with `status: "Archived"` |
+| Track content types | `docs/data-catalog.md` | Read auto-populated `has_tables`, `has_images` metrics |
+| Filter during retrieval | See custom metadata docs | Use `metadata_schema` + `filter_expr` (not data catalog) |
+
+## Agent-Specific Notes
+- Auto-populated metrics (`number_of_files`, `last_indexed`, `has_tables`, etc.) are system-set — not user-editable
+- `date_created` and `last_updated` timestamps are automatic
+- PATCH is a merge update — omitted fields keep current values
+- Different from custom metadata: catalog = governance/discovery, custom metadata = retrieval filtering
+
+## Notebooks
+- `notebooks/ingestion_api_usage.ipynb` — ingestion and collection management examples
+
+## Source Documentation
+- `docs/data-catalog.md` — full API reference, catalog fields, auto-populated metrics, Python client examples
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/evaluation.md b/skill-source/.agents/skills/rag-blueprint/references/configure/evaluation.md
new file mode 100644
index 000000000..90f9c6206
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/evaluation.md
@@ -0,0 +1,26 @@
+# Evaluation
+
+## When to Use
+- User wants to measure RAG pipeline quality
+- User asks about accuracy, relevancy, groundedness, or recall metrics
+
+## Process
+1. Read `docs/evaluate.md` for full evaluation methodology and setup
+2. Choose the appropriate notebook based on metrics needed
+3. Run evaluation against the deployed RAG pipeline
+
+## Agent-Specific Notes
+- Uses RAGAS framework for all metrics
+- Answer Accuracy, Context Relevancy, and Groundedness are covered in one notebook
+- Recall is measured separately at top-k cutoffs (1, 3, 5, 10)
+
+## Notebooks
+| Notebook | Metrics |
+|----------|---------|
+| `notebooks/evaluation_01_ragas.ipynb` | Answer Accuracy, Context Relevancy, Groundedness |
+| `notebooks/evaluation_02_recall.ipynb` | Recall at top-k cutoffs |
+
+## Source Documentation
+- `docs/evaluate.md` -- full evaluation guide and metric definitions
+- [RAGAS documentation](https://docs.ragas.io/en/stable/)
+- [NVIDIA RAGAS metrics](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/nvidia_metrics/)
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/guardrails.md b/skill-source/.agents/skills/rag-blueprint/references/configure/guardrails.md
new file mode 100644
index 000000000..309a18611
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/guardrails.md
@@ -0,0 +1,30 @@
+# NeMo Guardrails
+
+## When to Use
+- User wants content safety, topic control, or jailbreak prevention
+- User asks to enable/disable guardrails
+
+## Restrictions
+- Not available on B200 GPUs
+- Requires 2 extra GPUs with 48GB+ each (H100, A100 SXM 80GB, or RTX PRO 6000)
+- Not supported in library mode or Helm deployments
+- Jailbreak detection model not yet available out-of-the-box
+
+## Process
+
+1. Confirm the deployment is Docker-based (guardrails are Docker-only — not supported on Helm or in library mode), then edit the active env file
+2. Read `docs/nemo-guardrails.md` for full setup and configuration
+3. Choose deployment mode: self-hosted (local NIMs) or cloud-hosted (NVIDIA API)
+4. For self-hosted: assign GPU IDs — read `docs/service-port-gpu-reference.md` for default GPU assignments and adjust for your system
+5. Verify all three services healthy: `nemo-guardrails-microservice`, content-safety NIM, topic-control NIM
+6. Enable in UI: Settings > Output Preferences > Guardrails toggle
+
+## Agent-Specific Notes
+- Cloud mode (`nemoguard_cloud` config) skips local NIM containers — only the microservice is needed
+- Per-request toggle via `enable_guardrails` in `/generate` body requires server-level `ENABLE_GUARDRAILS=true` first
+- Override guardrails URL with `NEMO_GUARDRAILS_URL` if running on a different host
+- Content-safety and topic-control models are trained on single-turn data — multi-turn conversations may get inconsistent safety classifications
+- Current guardrails only produce simple refusal responses ("I'm sorry. I can't respond to that.")
+
+## Source Documentation
+- `docs/nemo-guardrails.md` -- full setup, configuration, and customization of guardrail rules
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/ingestion.md b/skill-source/.agents/skills/rag-blueprint/references/configure/ingestion.md
new file mode 100644
index 000000000..ec5e6251f
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/ingestion.md
@@ -0,0 +1,53 @@
+# Ingestion: Text-Only, Audio, Nemotron Parse, OCR & Batch
+
+## When to Use
+User wants to configure ingestion mode (text-only, audio, Nemotron Parse), switch OCR engines, save extraction results to disk, use standalone NV-Ingest, tune ingestion performance, or run batch ingestion.
+
+## Restrictions
+- Nemotron Parse: not available on B200 or RTX PRO 6000 GPUs (requires H100 or A100 SXM 80GB)
+- Audio on Helm: not supported on RTX PRO 6000
+- Nemotron Parse GPU conflict: read `docs/service-port-gpu-reference.md` for default GPU assignments. Nemotron Parse defaults to the same GPU as LLM — reassign on limited-GPU systems
+
+## Process
+
+1. Detect the deployment mode (Docker self-hosted / NVIDIA-hosted / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read the relevant source doc for detailed configuration
+3. Apply the required env vars to the active config, restart ingestor (and NIM services if enabling new profiles)
+4. Verify: upload a test document and check ingestion status
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Text-only ingestion | `docs/text_only_ingest.md` | Set extract vars to False, set `COMPONENTS_TO_READY_CHECK=""` |
+| Audio ingestion | `docs/audio_ingestion.md` | Start audio NIM (`--profile audio`), set `AUDIO_MS_GPU_ID` |
+| Nemotron Parse | `docs/nemotron-parse-extraction.md` | `APP_NVINGEST_PDFEXTRACTMETHOD=nemotron_parse`, start NIM |
+| OCR config/switch | `docs/nemoretriever-ocr.md` | Switch between NeMo Retriever OCR and Paddle OCR |
+| Save to disk | `docs/mount-ingestor-volume.md` | `APP_NVINGEST_SAVETODISK=True`, mount volume |
+| Standalone NV-Ingest | `docs/nv-ingest-standalone.md` | Direct Python client, no full ingestor server |
+| Batch ingestion | See `scripts/batch_ingestion.py` | `python scripts/batch_ingestion.py --folder ... --collection-name ...` |
+| Tune performance | `docs/accuracy_perf.md` | Adjust chunk size, overlap, batch settings |
+| Summarization at ingest | `references/configure/summarization.md` | `generate_summary: true` in upload payload |
+
+## Agent-Specific Notes
+
+- Text-only mode: set `COMPONENTS_TO_READY_CHECK=""` in the active env file so NV-Ingest does not wait for disabled extraction services. If the compose file hardcodes `COMPONENTS_TO_READY_CHECK=ALL`, update it to `${COMPONENTS_TO_READY_CHECK:-ALL}` so the env var takes effect
+- Use `--profile rag` with nims.yaml to skip OCR/detection NIMs in text-only mode
+- Audio formats supported: `.mp3`, `.wav`, `.mp4`, `.avi`, `.mov`, `.mkv`
+- Riva ASR requires ~8GB VRAM
+- NeMo Retriever OCR is 2x+ faster than Paddle OCR but needs 8GB vs 3GB VRAM
+- Batch CLI: `pip install -r scripts/requirements.txt` first; idempotent (skips already-ingested files)
+- MIG deployments: reduce batch sizes for large bulk ingestion jobs
+
+## Notebooks
+- `notebooks/ingestion_api_usage.ipynb` — Ingestor API: collections, uploads, document management
+
+## Source Documentation
+- `docs/text_only_ingest.md` — Text-only ingestion (skip OCR/detection)
+- `docs/audio_ingestion.md` — Audio/video ingestion via ASR
+- `docs/nemotron-parse-extraction.md` — Nemotron Parse PDF extraction
+- `docs/nemoretriever-ocr.md` — OCR configuration and switching
+- `docs/mount-ingestor-volume.md` — Volume mount for extraction results
+- `docs/nv-ingest-standalone.md` — Standalone NV-Ingest without ingestor server
+- `docs/accuracy_perf.md` — Ingestion tuning settings (chunk size, overlap, batch params)
+- `docs/service-port-gpu-reference.md` — OCR port mappings and GPU assignments
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/mcp.md b/skill-source/.agents/skills/rag-blueprint/references/configure/mcp.md
new file mode 100644
index 000000000..0fc9516e8
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/mcp.md
@@ -0,0 +1,26 @@
+# MCP Server & Client
+
+## When to Use
+- User wants to expose RAG APIs as MCP tools for agentic workflows
+- User asks about MCP transport modes, NeMo Agent Toolkit integration, or ReAct agents
+
+## Process
+1. Read `docs/mcp.md` for full MCP server/client setup and configuration
+2. Choose transport mode: `sse`, `streamable_http`, or `stdio`
+3. Run MCP server from `examples/nvidia_rag_mcp/mcp_server.py`
+4. For agentic RAG, see ReAct agent example in `examples/rag_react_agent/`
+
+## Agent-Specific Notes
+- MCP wraps both RAG tools (`generate`, `search`, `get_summary`) and Ingestor tools (`create_collection`, `upload_documents`, etc.) via FastMCP
+- `stdio` transport does not require a running server — client spawns it directly
+- ReAct agent requires: Python 3.11+, `NVIDIA_API_KEY`, and data already ingested into Milvus
+- Configure Milvus endpoint in `examples/rag_react_agent/src/rag_react_agent/configs/config.yml` or via `APP_VECTORSTORE_URL`
+
+## Notebooks
+| Notebook | Description |
+|----------|-------------|
+| `notebooks/mcp_server_usage.ipynb` | End-to-end MCP workflow: collection creation, upload, RAG queries |
+| `notebooks/nat_mcp_integration.ipynb` | NeMo Agent Toolkit integration with RAG MCP server |
+
+## Source Documentation
+- `docs/mcp.md` -- full MCP server/client documentation and transport configuration
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/migration.md b/skill-source/.agents/skills/rag-blueprint/references/configure/migration.md
new file mode 100644
index 000000000..c56e020ee
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/migration.md
@@ -0,0 +1,35 @@
+# Migration Guide
+
+## When to Use
+- User is upgrading between RAG Blueprint versions
+- User encounters breaking API changes or deprecated endpoints after an update
+
+## Process
+1. Read `docs/migration_guide.md` for full version-by-version migration details
+2. Identify the user's current and target versions
+3. Apply changes sequentially for each version gap
+
+## Agent-Specific Notes
+
+### v2.2.0 → v2.3.0
+- New `confidence_threshold` field in `/generate` and `/search` (0.0–1.0, default 0.0)
+- New `summary_options` parameter with `page_filter`, `shallow_summary`, `summarization_strategy`
+- `SUMMARY_LLM_MAX_CHUNK_LENGTH` and `SUMMARY_CHUNK_OVERLAP` changed from character-based to token-based — divide old values by ~4
+
+### v2.1.0 → v2.2.0
+- Added `generate_summary` to `/documents`, new `GET /summary` endpoint
+- `POST /collection` (singular) replaces `POST /collections` for single collection creation
+- `collection_names: List[str]` replaces `collection_name: str` in `/generate` and `/search`
+
+### v2.0.0 → v2.1.0
+- `POST /documents` gained `blocking: bool` (default `true`); set `blocking: false` and poll `GET /status` for async ingestion
+
+### v1.0.0 → v2.0.0 (Breaking)
+- Single server split into RAG Server (port 8081) and Ingestion Server (port 8082)
+- Collections must be explicitly created before uploading documents
+- Default changed from cloud-hosted to on-prem models
+
+## Source Documentation
+- `docs/migration_guide.md` — Full migration guide with examples and env var changes
+- `docs/release-notes.md` — Release notes and version history
+- `docs/query-to-answer-pipeline.md` — Query-to-answer pipeline architecture overview
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/models-and-infrastructure.md b/skill-source/.agents/skills/rag-blueprint/references/configure/models-and-infrastructure.md
new file mode 100644
index 000000000..68add08bc
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/models-and-infrastructure.md
@@ -0,0 +1,68 @@
+# Models, Vector DB & Service API Keys
+
+## When to Use
+User wants to change LLM, embedding, or ranking models; switch vector DB (Milvus/Elasticsearch); configure Milvus auth, GPU mode, or custom endpoints; set service-specific API keys; or build a custom VDB operator.
+
+## Process
+
+Detect the deployment mode before making changes. Docker: edit the active env file. Helm: edit `values.yaml` under `nimOperator` and `envVars` sections. Library: edit `notebooks/config.yaml`.
+
+### Change Models (LLM, Embedding, Ranking)
+1. Read `docs/change-model.md` for full model change instructions
+2. Read `docs/model-profiles.md` for NIM profile selection and GPU-specific profiles
+3. Key env vars: `APP_LLM_MODELNAME`, `APP_EMBEDDINGS_MODELNAME`, `APP_RANKING_MODELNAME`
+4. Embedding model change requires re-ingesting all documents — update `APP_EMBEDDINGS_DIMENSIONS` to match
+5. Restart affected services (RAG server + ingestor for embedding changes)
+6. Verify via health endpoint
+
+### Switch Vector DB (Milvus to Elasticsearch)
+1. Read `docs/change-vectordb.md` for full setup (Docker and Helm)
+2. Key env vars: `APP_VECTORSTORE_URL`, `APP_VECTORSTORE_NAME`
+3. Data is not migrated — re-ingest all documents after switching
+4. Elasticsearch requires port 9200; check for conflicts
+
+### Milvus Configuration
+1. Read `docs/milvus-configuration.md` for indexing, GPU, auth, and tuning
+2. Read `docs/milvus-schema.md` for collection schema requirements
+3. CPU mode: set `APP_VECTORSTORE_ENABLEGPUSEARCH=False`, `APP_VECTORSTORE_ENABLEGPUINDEX=False`, change Milvus image to non-GPU
+4. Auth: download milvus.yaml, enable `authorizationEnabled`, set password before first deployment
+
+### API Keys
+1. Read `docs/api-key.md` for NGC API key setup and per-service keys
+2. Fallback order: service-specific key > `NVIDIA_API_KEY` > `NGC_API_KEY`
+3. Per-service keys: `APP_LLM_APIKEY`, `APP_EMBEDDINGS_APIKEY`, `APP_RANKING_APIKEY`, `APP_VLM_APIKEY`, etc.
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Change LLM | `docs/change-model.md` | Set `APP_LLM_MODELNAME`, restart RAG server |
+| Change embedding | `docs/change-model.md` | Set `APP_EMBEDDINGS_MODELNAME` + `APP_EMBEDDINGS_DIMENSIONS`, re-ingest |
+| Change reranker | `docs/change-model.md` | Set `APP_RANKING_MODELNAME`, restart RAG server |
+| Switch to Elasticsearch | `docs/change-vectordb.md` | Create data dir, start ES profile, set env vars, re-ingest |
+| Milvus auth | `docs/milvus-configuration.md` | Download config, enable auth, mount volume |
+| Milvus CPU mode | `docs/milvus-configuration.md` | Change image, disable GPU env vars |
+| Custom VDB | `docs/change-vectordb.md` | Implement `VDBRag`, register in `__init__.py` |
+| NIM profiles | `docs/model-profiles.md` | List profiles, set `NIM_MODEL_PROFILE` |
+| Service API keys | `docs/api-key.md` | Set per-service `*_APIKEY` vars |
+| Collection schema | `docs/milvus-schema.md` | Required fields: pk, vector, text, source, content_metadata |
+
+## Agent-Specific Notes
+
+- Nemotron-3-Nano naming: `nvidia/nemotron-3-nano-30b-a3b` (NVIDIA-hosted) vs `nvidia/nemotron-3-nano` (self-hosted NIM) — same model, different names
+- Helm model changes go in `values.yaml` under `nimOperator` and `envVars` sections
+- Custom VDB operator requires implementing `VDBRag` base class — see `docs/change-vectordb.md` "Custom Vector Database Operator" section
+- VDB auth tokens can be passed per-request via `Authorization: Bearer <token>` header
+- Milvus password persists in etcd volume — to change after deployment, must delete volumes (destroys data)
+
+## Notebooks
+- `notebooks/building_rag_vdb_operator.ipynb` — Custom VDB operator implementation (OpenSearch example)
+
+## Source Documentation
+- `docs/change-model.md` — Model changes (LLM, embedding, ranking, NIM images)
+- `docs/change-vectordb.md` — Vector DB switching, Elasticsearch setup, custom VDB operator
+- `docs/milvus-configuration.md` — Milvus indexing, GPU config, auth, tuning
+- `docs/milvus-schema.md` — Collection schema fields and requirements
+- `docs/model-profiles.md` — NIM profile definitions and selection
+- `docs/api-key.md` — NGC API key setup, per-service keys, fallback order
+- `docs/service-port-gpu-reference.md` — Port mappings and GPU assignments for all services
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/multimodal-query.md b/skill-source/.agents/skills/rag-blueprint/references/configure/multimodal-query.md
new file mode 100644
index 000000000..783b6c209
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/multimodal-query.md
@@ -0,0 +1,35 @@
+# Multimodal Query (Image + Text)
+
+## When to Use
+- User wants to query knowledge base with images and text together
+- User asks about VLM (Vision Language Model) deployment for RAG
+- User wants image-based document understanding or visual Q&A
+
+## Restrictions
+- Not available with Elasticsearch — Milvus only
+- Reranker must be disabled (`ENABLE_RERANKER=false`)
+- Summarization not supported (VLM replaces LLM)
+- On-prem: requires NVIDIA H100 or A100 SXM 80GB GPU
+- Single-page retrieval only — image queries return content from one page per document
+
+## Process
+1. Detect the deployment mode (Docker / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read `docs/multimodal-query.md` for full env var configuration and commands
+3. Choose variant: self-hosted (Docker), NVIDIA-hosted (cloud), or Helm
+4. Deploy VLM + VLM Embedding NIMs per source doc instructions
+5. Set VLM env vars in the active config and switch embedding model to VLM embedding
+6. Restart ingestor + RAG server (Docker: add `--build` flag) and verify
+
+## Agent-Specific Notes
+- Must select a collection before querying — queries without a collection return no results
+- First VLM deployment: model downloads take 10–20 min (~10GB+)
+- `VLM_MS_GPU_ID` — read `docs/service-port-gpu-reference.md` for the default GPU assignment and override if needed
+- Cloud rate limits apply for ingestion of >10 files
+- For Helm with MIG: ensure dedicated MIG slice is assigned to VLM
+- Image extraction must be enabled: `APP_NVINGEST_EXTRACTIMAGES=True`, `APP_NVINGEST_IMAGE_ELEMENTS_MODALITY=image`
+
+## Notebooks
+- `notebooks/image_input.ipynb` — end-to-end multimodal query examples, image upload, VLM querying
+
+## Source Documentation
+- `docs/multimodal-query.md` — full Docker/cloud/Helm configuration, env vars, API usage, limitations
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/notebooks.md b/skill-source/.agents/skills/rag-blueprint/references/configure/notebooks.md
new file mode 100644
index 000000000..544de03bb
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/notebooks.md
@@ -0,0 +1,50 @@
+# Notebooks
+
+## When to Use
+- Hands-on examples of NVIDIA RAG Blueprint features are needed
+- There are questions about Jupyter notebooks, tutorials, or code samples
+
+## Process
+1. Read `docs/notebooks.md` for full notebook descriptions and prerequisites.
+2. Set up the environment: virtualenv, `jupyterlab`, and `git lfs pull` for test data.
+3. Open JupyterLab at `http://<server-ip>:8889`.
+
+## Agent-Specific Notes
+- Git LFS is required because several notebooks rely on large data files (`git lfs install && git lfs pull`).
+- In Docker mode, deploy NVIDIA RAG Blueprint first, then run notebooks against the running services.
+- In library mode, use `rag_library_usage.ipynb` (full) or `rag_library_lite_usage.ipynb` (containerless).
+- The custom VDB operator notebook requires Docker for OpenSearch services.
+
+## Notebook Catalog
+
+### Beginner
+| Notebook                    | Topic                               |
+|-----------------------------|-------------------------------------|
+| `ingestion_api_usage.ipynb` | Document ingestion through the API  |
+| `retriever_api_usage.ipynb` | Search and retrieval API            |
+| `image_input.ipynb`         | Image upload and multimodal queries |
+
+### Intermediate
+| Notebook                       | Topic                                  |
+|--------------------------------|----------------------------------------|
+| `summarization.ipynb`          | Document summarization strategies      |
+| `evaluation_01_ragas.ipynb`    | RAGAS accuracy, relevancy, groundedness|
+| `evaluation_02_recall.ipynb`   | Recall at top-k cutoffs                |
+| `nb_metadata.ipynb`            | Custom metadata and filtered retrieval |
+| `rag_library_usage.ipynb`      | Full library mode end-to-end           |
+| `rag_library_lite_usage.ipynb` | Lite, containerless library mode       |
+
+### Advanced
+| Notebook                          | Topic                               |
+|-----------------------------------|-------------------------------------|
+| `building_rag_vdb_operator.ipynb` | Custom OpenSearch VDB operator      |
+| `mcp_server_usage.ipynb`          | MCP server with transport modes     |
+| `nat_mcp_integration.ipynb`       | NeMo Agent Toolkit plus MCP         |
+
+### Deployment
+| Notebook           | Topic                 |
+|--------------------|-----------------------|
+| `launchable.ipynb` | Brev cloud deployment |
+
+## Source Documentation
+- `docs/notebooks.md` — full notebook descriptions, setup, and prerequisites.
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/observability.md b/skill-source/.agents/skills/rag-blueprint/references/configure/observability.md
new file mode 100644
index 000000000..5b291d339
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/observability.md
@@ -0,0 +1,29 @@
+# Observability
+
+## When to Use
+- User wants tracing, metrics, or monitoring for the RAG pipeline
+- User asks about latency debugging, Zipkin, Grafana, or Prometheus
+
+## Process
+1. Detect the deployment mode. Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read `docs/observability.md` for full setup (Docker and Helm)
+3. Set `OPENTELEMETRY_CONFIG_FILE` and `APP_TRACING_ENABLED=True` in the active config
+4. Start observability stack and restart RAG server
+5. Import Grafana dashboard from `deploy/config/rag-metrics-dashboard.json`
+
+## Agent-Specific Notes
+- Library mode: set `OPENTELEMETRY_CONFIG_FILE` in the environment for tracing; the Docker-based Prometheus/Grafana stack is independent
+- Helm: Prometheus Operator CRDs must be installed before deploying with observability enabled
+- Default Grafana credentials: `admin` / `admin`
+- Zipkin spans cover: `query-rewriter`, `retriever`, `context-reranker`, `llm-stream`
+- Span I/O visible via `traceloop.entity.input` / `traceloop.entity.output` fields
+
+### Quick Latency Triage
+| Symptom | Check |
+|---------|-------|
+| Slow first token | `rag_ttft_ms` — compare retriever and reranker spans |
+| Slow full response | `llm_generation_time_ms` / `llm-stream` span |
+| Retrieval heavy | Compare `retrieval_time_ms` vs `context_reranker_time_ms` |
+
+## Source Documentation
+- `docs/observability.md` — full Docker/Helm setup, env vars, metrics reference, and dashboard import
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/query-and-conversation.md b/skill-source/.agents/skills/rag-blueprint/references/configure/query-and-conversation.md
new file mode 100644
index 000000000..2f092cd97
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/query-and-conversation.md
@@ -0,0 +1,82 @@
+```markdown
+# Query Rewriting, Query Decomposition, and Multi-Turn
+
+Use these features when you want the system to understand follow-up questions, rewrite queries for better retrieval, or break complex questions into smaller parts.
+
+## When to use
+
+Use these settings when:
+
+- You want to enable multi-turn conversations or support follow-up questions.
+- You want query rewriting to improve retrieval accuracy.
+- You need complex multi-hop query decomposition.
+- You are configuring or debugging conversation history behavior.
+
+## Restrictions
+
+- Query rewriting and multi-turn both require `CONVERSATION_HISTORY > 0`. If it is set to 0, query rewriting has no effect.
+- Query decomposition works only when `use_knowledge_base=true` and with a single collection.
+- On Helm, query rewriting is supported only with an on-prem LLM, not with cloud-hosted models.
+
+## Dependencies
+
+`CONVERSATION_HISTORY` is shared by query rewriting and multi-turn, so changing it affects both behaviors.
+
+| Setting                 | Depends on                 | Side effect when changed                                  |
+|-------------------------|----------------------------|-----------------------------------------------------------|
+| `ENABLE_QUERYREWRITER`  | `CONVERSATION_HISTORY > 0` | Enabling requires conversation history; disabling has no side effects |
+| `CONVERSATION_HISTORY`  | —                          | Setting to `0` also effectively disables query rewriting  |
+
+## Process
+
+First detect the deployment mode.  
+- Docker: edit the active environment file.  
+- Helm: edit `values.yaml`.  
+- Library: edit `notebooks/config.yaml`.
+
+### Query rewriting
+
+1. Review `docs/multiturn.md` for full configuration details.
+2. To enable, set `ENABLE_QUERYREWRITER=True`. If `CONVERSATION_HISTORY` is `0`, set it to `5` or another positive value.
+3. To disable, unset or comment out `ENABLE_QUERYREWRITER`.
+4. Restart the RAG server.
+
+### Multi-turn
+
+1. Review `docs/multiturn.md` for configuration, retrieval strategies, and API usage.
+2. To enable, set `CONVERSATION_HISTORY > 0` and choose the retrieval strategy you want to use.
+3. To disable, set `CONVERSATION_HISTORY=0`.
+4. Restart the RAG server.
+
+### Query decomposition
+
+1. Review `docs/query_decomposition.md` for the decomposition algorithm, limitations, and examples.
+2. Set `ENABLE_QUERY_DECOMPOSITION=true` and `MAX_RECURSION_DEPTH=3` (or a different depth that fits your use case).
+3. Restart the RAG server.
+
+## Decision table
+
+| Goal                          | Source doc                 | Key settings                                              |
+|-------------------------------|----------------------------|-----------------------------------------------------------|
+| Multi-turn with best accuracy | `docs/multiturn.md`        | `CONVERSATION_HISTORY=5`, `ENABLE_QUERYREWRITER=True`    |
+| Multi-turn with low latency   | `docs/multiturn.md`        | `CONVERSATION_HISTORY=5`, `MULTITURN_RETRIEVER_SIMPLE=True` |
+| Complex multi-hop queries     | `docs/query_decomposition.md` | `ENABLE_QUERY_DECOMPOSITION=true`, `MAX_RECURSION_DEPTH=3` |
+| Disable multi-turn (default)  | —                          | `CONVERSATION_HISTORY=0`                                 |
+
+## Agent-specific notes
+
+- `MULTITURN_RETRIEVER_SIMPLE` only applies when query rewriting is disabled. If both are configured, query rewriting takes precedence.
+- You can toggle query rewriting per request by setting `enable_query_rewriting: true` in `POST /generate`, but `CONVERSATION_HISTORY` must still be greater than 0.
+- By default, multi-turn is disabled with `CONVERSATION_HISTORY=0`.
+- Query decomposition adds latency and is most useful for multi-hop queries that involve multiple entities or steps.
+- In library mode, configure these settings in `notebooks/config.yaml` instead of using environment variables.
+
+## Notebooks
+
+- `notebooks/retriever_api_usage.ipynb`: RAG retriever API usage with search and end-to-end query examples.
+
+## Source documentation
+
+- `docs/query_decomposition.md`: Decomposition algorithm details, when to use it, and recursion depth guidance.
+- `docs/multiturn.md`: Conversation history behavior, retrieval strategies, API usage, and Helm configuration.
+```
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/reasoning-and-generation.md b/skill-source/.agents/skills/rag-blueprint/references/configure/reasoning-and-generation.md
new file mode 100644
index 000000000..d5bb24114
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/reasoning-and-generation.md
@@ -0,0 +1,57 @@
+# Reasoning, Self-Reflection & Prompt Customization
+
+## When to Use
+User wants to enable reasoning/thinking mode, configure self-reflection, customize prompts, adjust generation parameters (max tokens, temperature, citations), or understand thinking budget options.
+
+## Process
+1. Detect the deployment mode (Docker / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read the relevant source doc for the specific feature
+3. Apply env vars to the active config or edit prompt files, restart RAG server
+4. Prompt changes require `--build` flag (Docker); env var changes only need restart
+5. Verify: test with a query and check for reasoning output or changed behavior
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Enable reasoning (Nemotron 1.5) | `docs/enable-nemotron-thinking.md` | Edit `prompt.yaml`: `/no_think` → `/think`, set temperature |
+| Enable reasoning (Nano 30B) | `docs/enable-nemotron-thinking.md` | `ENABLE_NEMOTRON_3_NANO_THINKING=true` |
+| Self-reflection | `docs/self-reflection.md` | `ENABLE_REFLECTION=true`, set thresholds |
+| Prompt customization | `docs/prompt-customization.md` | `PROMPT_CONFIG_FILE=/path/to/custom.yaml` or edit prompt.yaml |
+| Generation parameters | `docs/llm-params.md` | `LLM_MAX_TOKENS`, `LLM_TEMPERATURE`, `ENABLE_CITATIONS` |
+| Per-request overrides | `docs/llm-params.md` | `temperature`, `top_p`, `max_tokens`, `stop` in API payload |
+
+## Agent-Specific Notes
+
+- Prompt changes need `--build` flag on restart; env var changes do not
+- Self-reflection: streaming not supported during groundedness checks
+- Self-reflection uses same LLM by default; override with `REFLECTION_LLM`, `REFLECTION_LLM_SERVERURL`, `REFLECTION_LLM_APIKEY`
+- Helm: only on-premises reflection is supported
+- GPU requirements for reflection: see `docs/self-reflection.md` for optimal GPU configurations
+- Debug reflection: set `LOGLEVEL=INFO` to observe iteration counts
+- `FILTER_THINK_TOKENS=false` to see full reasoning output (filtered by default)
+- 18 prompt templates available in `prompt.yaml` — custom file only overrides specified keys
+
+### Reasoning Model Comparison
+
+| Model | Control | Thinking Budget | Output Format |
+|-------|---------|-----------------|---------------|
+| Nemotron 1.5 | System prompt (`/think`) | None | `<think>` tags (filtered by default) |
+| Nemotron-3-Nano 9B | System prompt (`/think`) | `min_thinking_tokens` + `max_thinking_tokens` | `reasoning_content` field |
+| Nemotron-3-Nano 30B | `ENABLE_NEMOTRON_3_NANO_THINKING` env var | `max_thinking_tokens` only | `reasoning_content` field |
+
+### Thinking Budget Recommendations
+
+| Range | Use Case |
+|-------|----------|
+| 1024–4096 | Faster responses for simpler questions |
+| 8192–16384 | More thorough reasoning for complex queries |
+
+## Notebooks
+- `notebooks/retriever_api_usage.ipynb` — end-to-end query examples showing generation behavior
+
+## Source Documentation
+- `docs/enable-nemotron-thinking.md` — Reasoning mode for all Nemotron models
+- `docs/self-reflection.md` — Self-reflection configuration and thresholds
+- `docs/prompt-customization.md` — Prompt template catalog and customization
+- `docs/llm-params.md` — Generation parameters (temperature, max tokens, etc.)
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/search-and-retrieval.md b/skill-source/.agents/skills/rag-blueprint/references/configure/search-and-retrieval.md
new file mode 100644
index 000000000..b311c958d
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/search-and-retrieval.md
@@ -0,0 +1,67 @@
+# Search & Retrieval: Hybrid Search, Multi-Collection, Metadata & Profiles
+
+## When to Use
+User wants to enable hybrid search, query multiple collections, add custom metadata/filters, tune retrieval performance, configure reranker, enable natural language filter generation, or switch accuracy/performance profiles.
+
+## Process
+
+1. Detect the deployment mode (Docker / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read the relevant source doc for detailed configuration
+3. Apply the required env vars to the active config and restart affected services
+4. Verify via search/generate API call
+
+## Decision Table
+
+| Goal | Source Doc | Key Env Vars |
+|------|-----------|-------------|
+| Hybrid search | `docs/hybrid_search.md` | `APP_VECTORSTORE_SEARCHTYPE=hybrid` |
+| Multi-collection | `docs/multi-collection-retrieval.md` | `enable_reranker: True` in API payload |
+| Custom metadata | `docs/custom-metadata.md` | Metadata in upload payload, `vdb_filter_expression` in query |
+| Accuracy profile | `docs/accuracy_perf.md` | Copy values from `deploy/compose/accuracy_profile.env` into the active env file |
+| Performance profile | `docs/accuracy_perf.md` | Copy values from `deploy/compose/perf_profile.env` into the active env file |
+| Filter generation | `docs/custom-metadata.md` | `ENABLE_FILTER_GENERATOR=True` |
+
+## Agent-Specific Notes
+
+- Hybrid search requires re-ingesting — existing collections created with `dense` must be re-created
+- Multi-collection: limited to 5 collections per query; reranker is mandatory
+- Multi-collection not supported when `ENABLE_QUERY_DECOMPOSITION=true`
+- Elasticsearch RRF not supported in open-source version — must use `weighted` ranker
+- Ingestor must be restarted alongside RAG server when enabling hybrid search
+- `RERANKER_CONFIDENCE_THRESHOLD` is a legacy alias for `RERANKER_SCORE_THRESHOLD`
+- Recommended `RERANKER_SCORE_THRESHOLD` range: 0.3–0.5 (too high filters out too many chunks)
+
+### Advanced Tuning (not fully documented elsewhere)
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `APP_VECTORSTORE_INDEXTYPE` | `GPU_CAGRA` | Vector index type |
+| `APP_VECTORSTORE_EF` | `100` | Search accuracy/speed trade-off (must be >= `VECTOR_DB_TOPK`) |
+| `VECTOR_DB_TOPK` | `100` | Candidates from vector DB (input to reranker) |
+| `APP_RETRIEVER_TOPK` | `10` | Chunks sent to LLM prompt (after reranking) |
+| `ENABLE_RERANKER` | `True` | Toggle reranking model |
+| `RERANKER_SCORE_THRESHOLD` | `0.0` | Minimum reranker score (0.0–1.0) |
+| `COLLECTION_NAME` | `multimodal_data` | Default collection name |
+
+### Partial Filtering
+- Strict (default): fails if any collection doesn't support the filter
+- Flexible (`allow_partial_filtering: true` in config.yaml): succeeds if at least one collection supports it
+
+### VDB Filter Support
+
+| Feature | Milvus | Elasticsearch |
+|---------|--------|---------------|
+| NL filter generation | LLM-powered | Not supported (manual DSL) |
+| Filter syntax | String expressions | List of dicts (ES Query DSL) |
+| UI support | Full filtering interface | API only |
+
+## Notebooks
+- `notebooks/retriever_api_usage.ipynb` — RAG retriever API: search and end-to-end queries
+- `notebooks/nb_metadata.ipynb` — Metadata ingestion, filtering, and extraction from queries
+
+## Source Documentation
+- `docs/hybrid_search.md` — Hybrid dense + sparse search configuration
+- `docs/multi-collection-retrieval.md` — Multi-collection querying
+- `docs/custom-metadata.md` — Custom metadata schema, filtering expressions, filter generation
+- `docs/accuracy_perf.md` — Best practices for tuning ingestion/retrieval/generation settings
+- `docs/python-client.md` — Python library API for search and filtering
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/summarization.md b/skill-source/.agents/skills/rag-blueprint/references/configure/summarization.md
new file mode 100644
index 000000000..299c41e89
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/summarization.md
@@ -0,0 +1,40 @@
+# Document Summarization
+
+## When to Use
+- User wants to generate summaries during document ingestion
+- User asks about summarization strategies or options
+- User wants to check summary status or progress
+
+## Restrictions
+- Not supported in lite mode (containerless/library-only deployment)
+- Requires Redis for status tracking and rate limiting
+- Collection must exist before uploading with `generate_summary: true`
+
+## Process
+1. Detect the deployment mode. Docker: edit the active env file. Helm: configure under `ingestor-server.envVars` in `values.yaml`. Library: use the upload API parameters directly (no env vars needed)
+2. Read `docs/summarization.md` for full configuration, env vars, and prompt customization
+3. Set `generate_summary: true` in the upload payload (per-request, no global toggle)
+4. Optionally configure `summary_options`: strategy, shallow mode, page filter
+5. Retrieve summary via `GET /v1/summary?collection_name=...&file_name=...`
+
+## Decision Table
+
+| Goal | Strategy | Notes |
+|------|----------|-------|
+| Fastest overview | `"single"` + `shallow_summary=true` + `page_filter` | Quick text-only extraction |
+| Best quality | `null` (iterative, default) + `shallow_summary=false` | Sequential refinement |
+| Balanced | `"hierarchical"` + `shallow_summary=true` | Parallel tree-based |
+
+## Agent-Specific Notes
+- The `CONVERSATION_HISTORY` prerequisite does not apply to summarization — it only affects query rewriting and multi-turn conversations
+- `SUMMARY_LLM_SERVERURL=""` (empty) routes to NVIDIA cloud; `"nim-llm:8000"` for self-hosted
+- `SUMMARY_LLM_MAX_CHUNK_LENGTH` should be below the model's context window to leave room for prompt + output
+- Redis semaphore auto-resets on ingestor startup (prevents stale values from crashes)
+- If Redis is unavailable, summaries still generate but no real-time status tracking
+- Status entries have 24-hour TTL in Redis
+
+## Notebooks
+- `notebooks/summarization.ipynb` — complete examples for all strategies, status polling, library mode usage
+
+## Source Documentation
+- `docs/summarization.md` — env var reference, prompt customization, rate limiting, chunking details
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/user-interface.md b/skill-source/.agents/skills/rag-blueprint/references/configure/user-interface.md
new file mode 100644
index 000000000..7fbad8165
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/user-interface.md
@@ -0,0 +1,27 @@
+# User Interface
+
+## When to Use
+- User asks about the RAG UI, uploading documents, settings, or metadata filtering
+- User wants to configure features via the web interface
+
+## Restrictions
+- Sample/experimentation UI — not intended for production
+- 100-file limit per upload batch; use multiple batches or API for bulk uploads
+- 10 MB max per image attachment
+
+## Process
+1. Read `docs/user-interface.md` for full UI documentation
+2. Access at `http://localhost:8090` (or `http://<workstation-ip>:8090` for remote)
+3. Configure RAG settings and feature toggles via Settings panel
+4. Use Filter Bar above chat input for metadata-filtered queries
+
+## Agent-Specific Notes
+- VLM Inference must be enabled in Settings > Feature Toggles before image attachments work
+- Multi-file uploads may fail with ECONNRESET errors — recommend the API for bulk operations
+- Document summaries generate asynchronously; UI shows "Generating summary..." until complete
+- Document count in UI may lag slightly after ingestion
+- Metadata filtering supports AND/OR logic between filters (toggle via logic button)
+- Custom metadata schema is set during collection creation via the Metadata Schema Editor
+
+## Source Documentation
+- `docs/user-interface.md` — full UI documentation including settings, file types, metadata, and health monitoring
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/vlm.md b/skill-source/.agents/skills/rag-blueprint/references/configure/vlm.md
new file mode 100644
index 000000000..d573cd0c2
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/vlm.md
@@ -0,0 +1,56 @@
+# VLM, VLM Embeddings & Image Captioning
+
+## When to Use
+User wants image understanding, visual content analysis, VLM inference, multimodal embeddings, or image captioning during ingestion.
+
+## Restrictions
+- Not available on B200 GPUs — use H100, A100 SXM 80GB, or RTX PRO 6000
+- Requires extra GPU (GPU 1+ for 2-GPU systems, GPU 2+ for 3+ GPUs with fallback)
+- VLM embeddings: experimental, PDF-only, no summarization, no citations with page-as-image
+- Image captioning on Helm: on-prem only (modify `values.yaml` to enable)
+
+## Process
+1. Detect the deployment mode (Docker / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read the relevant source doc for detailed steps:
+   - VLM generation: `docs/vlm.md`
+   - VLM embeddings: `docs/vlm-embed.md`
+   - Image captioning: `docs/image_captioning.md`
+3. Start VLM NIM (self-hosted) or configure cloud endpoint (NVIDIA-hosted)
+4. Set the required variables in the active config:
+   - Enabling: `ENABLE_VLM_INFERENCE=true` and `APP_NVINGEST_EXTRACTIMAGES=True`
+   - Disabling: re-comment those variables in the env file
+5. Restart affected services and verify with a health check + image-containing document query
+
+## Decision Table
+
+| Goal | Source Doc | Docker Profile | Notes |
+|------|-----------|---------------|-------|
+| VLM replaces LLM | `docs/vlm.md` | `--profile vlm-generation` | LLM not started; set `VLM_TO_LLM_FALLBACK=false` |
+| VLM + LLM fallback | `docs/vlm.md` | `--profile vlm-only` | Needs 3+ GPUs; both VLM and LLM running |
+| VLM embeddings | `docs/vlm-embed.md` | `--profile vlm-embed` | Experimental; requires re-ingestion |
+| Image captioning | `docs/image_captioning.md` | `--profile vlm-only` | Requires VLM NIM; Helm: on-prem only |
+| Multimodal query | `docs/multimodal-query.md` | (depends on VLM mode) | Image + text querying |
+
+## Agent-Specific Notes
+
+- `--profile vlm-generation` skips the LLM entirely — use `--profile vlm-only` for fallback mode
+- `VLM_TO_LLM_FALLBACK` defaults to `true`, but the `vlm-generation` profile does not start the LLM — set `VLM_TO_LLM_FALLBACK=false` when using that profile
+- Helm VLM: disable `nim-llm` and enable `nim-vlm` (VLM uses LLM's GPU allocation)
+- Helm fallback: keep both `nim-vlm` and `nim-llm` enabled, set `VLM_TO_LLM_FALLBACK: "true"`
+- VLM context window is limited — keep queries self-contained
+- Image captioning known issue: files without graphs/charts/tables/plots fail to ingest when captioning is enabled
+
+### Key Env Vars (always needed)
+- `ENABLE_VLM_INFERENCE=true` — master toggle
+- `APP_NVINGEST_EXTRACTIMAGES=True` — extract images during ingestion
+- `VLM_MS_GPU_ID=<gpu-id>` — self-hosted GPU assignment
+
+## Notebooks
+- `notebooks/image_input.ipynb` — Multimodal queries with VLM (text + image)
+
+## Source Documentation
+- `docs/vlm.md` — VLM generation (self-hosted, NVIDIA-hosted, Helm, Library)
+- `docs/vlm-embed.md` — VLM embeddings (experimental)
+- `docs/image_captioning.md` — Image captioning during ingestion
+- `docs/multimodal-query.md` — Image + text querying
+- `docs/service-port-gpu-reference.md` — default GPU assignments for VLM and other NIMs
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy.md b/skill-source/.agents/skills/rag-blueprint/references/deploy.md
new file mode 100644
index 000000000..96d712892
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy.md
@@ -0,0 +1,119 @@
+# RAG Blueprint Deployment
+
+## Phase 1: Environment Analysis
+
+Run this single command to collect all environment information at once:
+
+```bash
+echo "=== GPU ===" && nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader 2>/dev/null || echo "NO_GPU"; echo "=== VRAM ===" && nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END {print (s+0) "MB total"}' || echo "0MB total"; echo "=== DRIVER ===" && head -n 1 /proc/driver/nvidia/version 2>/dev/null || echo "NO_DRIVER"; echo "=== CUDA ===" && nvcc --version 2>/dev/null | grep "release" || echo "NO_CUDA_TOOLKIT"; echo "=== DOCKER ===" && docker --version 2>/dev/null || echo "NO_DOCKER"; echo "=== COMPOSE ===" && docker compose version 2>/dev/null || echo "NO_COMPOSE"; echo "=== NVIDIA_TOOLKIT ===" && docker info 2>/dev/null | grep -i "runtimes.*nvidia" || echo "NO_NVIDIA_TOOLKIT"; echo "=== PYTHON ===" && python3 --version 2>/dev/null || echo "NO_PYTHON"; echo "=== DISK ===" && df -h --output=avail / | tail -1; echo "=== OS ===" && cat /etc/os-release 2>/dev/null | grep -E "^(NAME|VERSION)="; echo "=== NGC_KEY ===" && if [ -n "$NGC_API_KEY" ]; then echo "NGC_KEY_SET"; elif [ -n "$NVIDIA_API_KEY" ]; then echo "NVIDIA_KEY_SET"; elif grep -h "NGC_API_KEY=" deploy/compose/.env deploy/compose/nvdev.env 2>/dev/null | grep -qv "nvapi-your-key"; then echo "DOTENV_SET"; else echo "NOT_SET"; fi; echo "=== RUNNING ===" && docker ps --format "{{.Names}}" 2>/dev/null | grep -m 10 -E "(rag-server|ingestor-server|nim-llm|milvus)" || echo "NO_RUNNING_SERVICES"; echo "=== PORTS ===" && (ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null) | grep -E ":(8081|8082|8090|19530) " || echo "PORTS_FREE"; echo "=== REPO ===" && git rev-parse --show-toplevel 2>/dev/null && git describe --tags 2>/dev/null || echo "NO_GIT_REPO"; echo "=== CACHE ===" && du -sh ~/.cache/model-cache/ 2>/dev/null || echo "NO_CACHE"
+```
+
+Present a summary table:
+
+| Check | Result |
+|-------|--------|
+| GPU(s) | (list with VRAM, or NO_GPU) |
+| Total VRAM | (sum in MB/GB) |
+| NVIDIA Driver | (version or NO_DRIVER) |
+| CUDA Toolkit | (version or NO_CUDA_TOOLKIT) |
+| Docker | (version or NO_DOCKER) |
+| Docker Compose | (version or NO_COMPOSE) |
+| NVIDIA Container Toolkit | (detected or NO_NVIDIA_TOOLKIT) |
+| Python | (version or NO_PYTHON) |
+| Free disk | (value) |
+| OS | (name + version) |
+| NGC_API_KEY | NGC_KEY_SET / NVIDIA_KEY_SET / DOTENV_SET / NOT_SET |
+| Existing services | (list or none) |
+| Port availability | (free or list conflicts) |
+| Repo | (tag/branch or NO_GIT_REPO) |
+| Model cache | (size or NO_CACHE) |
+
+### Existing Services Warning
+
+If RAG services are already running, tell the user briefly: "Existing RAG services detected (list). Proceeding will restart them." Continue unless the user objects.
+
+If the user wants to **switch deployment modes** (e.g., NVIDIA-hosted → self-hosted, or Docker → library), shut down the existing deployment first via `references/shutdown.md`, then proceed with the new mode.
+
+If ports are occupied by non-RAG processes, tell the user which ports conflict and suggest stopping the conflicting process. This is a blocker.
+
+## Phase 2: NGC_API_KEY Handling
+
+Check in this order:
+
+1. If `NGC_API_KEY` is set in the shell environment → proceed.
+2. If `NVIDIA_API_KEY` is set (common in library mode) → proceed silently.
+3. If `NGC_API_KEY` is in `deploy/compose/.env` or `deploy/compose/nvdev.env` (and not the placeholder `nvapi-your-key`) → load it and proceed.
+4. If none found → tell the user: "NGC_API_KEY is required. Get one from https://org.ngc.nvidia.com/setup/api-keys and run: `export NGC_API_KEY="nvapi-..."` — then tell me when done."
+5. After the user confirms → re-check silently. If still not set, write a placeholder entry to `deploy/compose/.env` and tell the user to edit it.
+
+## Phase 3: Blocker Checks
+
+Automatically check and report all blockers at once (don't stop at the first one):
+
+Read `docs/support-matrix.md` for current minimum versions and disk requirements, then check:
+
+- **Docker Compose below minimum**: "Upgrade Docker Compose. See https://docs.docker.com/compose/install/linux/"
+- **NVIDIA Driver below minimum** (if self-hosted): "Upgrade NVIDIA driver. See `docs/support-matrix.md` for required version."
+- **NVIDIA Container Toolkit missing** (and self-hosted needed): "Install NVIDIA Container Toolkit. See https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html"
+- **Insufficient disk**: "Check `docs/support-matrix.md` for disk requirements per deployment mode."
+- **No Docker and no Python 3.11+**: "Install Docker or Python 3.11+ to proceed."
+
+List all blockers together so the user can fix them in one pass — don't make them fix one, re-run, fix another.
+
+## Phase 4: Route to Deployment Mode
+
+### User explicitly requests a mode
+- "library mode" / "lite mode" / "no docker" / "python mode" → read and follow `deploy/library.md`
+- "docker" / "self-hosted" / "local" → read and follow `deploy/docker.md` with mode **self-hosted**
+- "cloud" / "nvidia-hosted" / "hosted" → read and follow `deploy/docker.md` with mode **nvidia-hosted**
+- "retrieval only" / "search only" / "no LLM" → read and follow `deploy/docker.md` with mode **retrieval-only**
+- "kubernetes" / "k8s" / "helm" → read and follow `deploy/helm.md`
+- "workbench" / "ai workbench" → tell user to follow `deploy/workbench/README.md` (AI Workbench uses its own UI-driven workflow)
+
+### Docker is available (Docker + Compose detected)
+
+**Self-hosted eligible** — read `docs/support-matrix.md` ("Hardware Requirements (Docker)" section) for current GPU requirements. All of the following must also be true:
+- GPU count and type matches the Docker self-hosted requirements from the support matrix
+- ≥200 GB free disk (per `docs/support-matrix.md` "Disk Space Requirements")
+- NVIDIA Container Toolkit detected
+- NVIDIA driver meets minimum version from `docs/support-matrix.md` ("Driver Versions")
+
+If self-hosted eligible → read and follow `deploy/docker.md` with mode **self-hosted**
+
+**Otherwise with Docker** → read and follow `deploy/docker.md` with mode **nvidia-hosted**
+
+Tell the user WHY if they have some GPU but not enough:
+- "You have [X GPU] with [Y GB] VRAM. Self-hosted requires [requirements from docs/support-matrix.md]. Deploying with NVIDIA-hosted cloud NIMs instead — faster startup, no model download."
+
+### Docker is available but Compose is not
+
+Tell the user: "Docker is installed but Docker Compose is missing or below the minimum version (see `docs/support-matrix.md`). Install it: https://docs.docker.com/compose/install/linux/ — or use library mode instead."
+
+If user chooses library mode → read and follow `deploy/library.md`
+
+### Docker is not available
+
+- Python 3.11+ available → read and follow `deploy/library.md` with mode **lite**
+- No Python → tell user to install Python 3.11+ or Docker
+
+## After Deployment
+
+Once deployment completes, verify health:
+
+```bash
+echo "=== RAG Server ===" && curl -s "http://localhost:8081/v1/health?check_dependencies=true" 2>/dev/null || echo "RAG_SERVER_NOT_READY"; echo "=== Ingestor ===" && curl -s "http://localhost:8082/v1/health?check_dependencies=true" 2>/dev/null || echo "INGESTOR_NOT_READY"
+```
+
+If healthy, tell the user:
+- "RAG Blueprint is running and healthy."
+- "Ask me to configure features like VLM, query rewriting, guardrails, etc."
+- "Ask me to shutdown when you're done."
+
+If unhealthy, read `references/troubleshoot.md` and diagnose. Match error output against known issues, fix, and retry. Escalate to the user only if the fix requires their action (API key, data deletion).
+
+## Notebooks
+- `notebooks/launchable.ipynb` — Cloud deployment via Brev (alternative to local deployment)
+
+## Source Documentation
+- `docs/support-matrix.md` — GPU requirements, driver versions, disk space, supported platforms
+- `docs/service-port-gpu-reference.md` — port mappings and GPU assignments for all services
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-nvidia-hosted.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-nvidia-hosted.md
new file mode 100644
index 000000000..f4c6ede07
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-nvidia-hosted.md
@@ -0,0 +1,38 @@
+# Docker Deployment (NVIDIA-Hosted NIMs)
+
+## When to Use
+- User wants fast deployment without local model downloads
+- User has no GPU or limited GPU
+- User asks about cloud-hosted or NVIDIA API deployment
+- User wants to avoid 15–30 min NIM startup time
+
+## Restrictions
+- Requires internet access (calls NVIDIA cloud APIs)
+- NVIDIA-hosted endpoints have rate limits — large ingestions (>10 files) may hit 429 errors
+- NGC_API_KEY required for cloud API access
+- Docker and Compose minimum versions per `docs/support-matrix.md`
+
+## Process
+1. Read `docs/deploy-docker-nvidia-hosted.md` for full commands and env configuration
+2. Use `deploy/compose/nvdev.env` — pre-configured for cloud endpoints. Source it before compose commands: `source deploy/compose/nvdev.env`
+3. Start vector DB → ingestor → RAG server + frontend (no NIM startup needed)
+4. Verify: `docker ps` shows containers; UI at `http://localhost:8090`
+
+## Decision Table
+
+| Goal | Key Action |
+|------|------------|
+| Standard cloud deployment | Use `nvdev.env` (pre-configured for cloud) |
+| Zero-GPU (no Milvus GPU) | Also switch Milvus image to CPU-only |
+| Large file ingestion | Reduce batch/concurrency settings to avoid 429s |
+| Maximum throughput | Use self-hosted deployment instead |
+
+## Agent-Specific Notes
+- First run: 5–10 min (image pulls only); subsequent: 1–2 min
+- No `nims.yaml` startup — all model inference is cloud-hosted
+- All subsequent configure/restart operations should source the same env file used for the initial deploy (`deploy/compose/nvdev.env`)
+- For zero-GPU: switch Milvus to CPU-only by changing the GPU image tag to the equivalent non-GPU tag and setting `APP_VECTORSTORE_ENABLEGPUSEARCH=False`. See `docs/deploy-docker-nvidia-hosted.md` for the current image tags
+- Rate limit mitigation for large ingestions: reduce `NV_INGEST_FILES_PER_BATCH`, `NV_INGEST_CONCURRENT_BATCHES`, `MAX_INGEST_PROCESS_WORKERS`, `NV_INGEST_MAX_UTIL` to minimum values
+
+## Source Documentation
+- `docs/deploy-docker-nvidia-hosted.md` — full step-by-step commands, env var blocks, CPU Milvus setup
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-retrieval-only.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-retrieval-only.md
new file mode 100644
index 000000000..fb4935cf8
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-retrieval-only.md
@@ -0,0 +1,37 @@
+# Retrieval-Only Deployment
+
+## When to Use
+- User wants search/retrieval without LLM generation
+- User asks to deploy only embedding + reranking services
+- User wants `/search` endpoint with an external LLM
+- User wants a lightweight, low-GPU deployment
+
+## Restrictions
+- `/generate` endpoint returns an error — no LLM is deployed
+- Self-hosted: 1 GPU, ~24 GB memory
+- NVIDIA-hosted: 0 GPUs (cloud embedding + reranking)
+
+## Process
+1. Read `docs/retrieval-only-deployment.md` for full commands, env vars, and API examples
+2. Choose variant: self-hosted (local NIMs), NVIDIA-hosted (cloud), or Helm
+3. For self-hosted: start only embedding + ranking NIMs, skip LLM
+4. For NVIDIA-hosted: set embedding/ranking server URLs to empty, skip NIM startup entirely
+5. For Helm: set `nimOperator.nim-llm.enabled=false`
+6. Start vector DB → ingestor → RAG server
+7. Verify health: `GET http://localhost:8081/v1/health?check_dependencies=true`
+
+## Decision Table
+
+| Goal | Variant | Key Difference |
+|------|---------|----------------|
+| Minimal GPU usage with local models | Self-hosted | 1 GPU, ~24 GB |
+| Zero GPU, cloud APIs | NVIDIA-hosted | Set server URLs to empty, skip NIM startup |
+| Kubernetes | Helm | Disable `nim-llm` in values.yaml |
+
+## Agent-Specific Notes
+- Permission errors on model cache → try `USERID=0` or `chmod -R 755 ~/.cache/model-cache`
+- Empty search results → verify documents ingested: `GET http://localhost:8082/v1/documents?collection_name=<name>`
+- Users can send `/search` results to their own external LLM for generation
+
+## Source Documentation
+- `docs/retrieval-only-deployment.md` — full deployment commands, API examples, search payload options
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-self-hosted.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-self-hosted.md
new file mode 100644
index 000000000..dc18ef649
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-self-hosted.md
@@ -0,0 +1,49 @@
+# Docker Deployment (Self-Hosted NIMs)
+
+## When to Use
+- User wants full on-premises deployment with local NIM containers
+- User has supported GPUs and wants models running locally
+- User asks to deploy RAG Blueprint with Docker
+
+## Restrictions
+
+Read `docs/support-matrix.md` for current GPU requirements. Feature restrictions per GPU type:
+
+| GPU | Cannot Use |
+|-----|------------|
+| B200 | VLM, Guardrails, Nemotron Parse |
+| RTX PRO 6000 | Nemotron Parse |
+
+- Read `docs/support-matrix.md` for current minimum NVIDIA Driver, CUDA, Docker, and Compose versions
+- NVIDIA Container Toolkit required (`docker info` shows nvidia runtime)
+- Disk space per `docs/support-matrix.md` ("Disk Space Requirements")
+- If any prerequisite is missing, tell the user what to install before proceeding
+
+## Process
+1. Read `docs/deploy-docker-self-hosted.md` for full commands and env configuration
+2. Read `docs/support-matrix.md` for GPU compatibility and supported model combinations
+3. Verify container toolkit, prepare model cache directory, source `.env`
+4. Apply GPU-specific config per source docs
+5. Start NIMs → wait for healthy → start remaining services
+6. Verify: `docker ps` shows all containers healthy; UI at `http://localhost:8090`
+
+## Decision Table
+
+| Goal | Profile Flag | Notes |
+|------|-------------|-------|
+| Full deployment (default) | (none) | LLM + embedding + ranking + OCR + detection |
+| Text-only RAG (lighter) | `--profile rag` | Skip OCR/detection NIMs |
+| Ingestion workload only | `--profile ingest` | Embedding + OCR + detection |
+| VLM replaces LLM | `--profile vlm-generation` | Not on B200 |
+| Advanced PDF extraction | `--profile nemotron-parse` | Not on B200 or RTX PRO 6000 |
+
+## Agent-Specific Notes
+- First run: 15–30 min (model downloads ~100–150 GB, no progress bar); subsequent: 2–5 min
+- Monitor download progress: `du -sh ~/.cache/model-cache/`
+- Permission error on model cache → try `USERID=0` instead of `USERID=$(id -u)`
+- Cloud NIM section in `deploy/compose/.env` must be commented out for self-hosted
+- Rebuild after code changes: add `--build` flag to compose up commands
+
+## Source Documentation
+- `docs/deploy-docker-self-hosted.md` — full step-by-step commands, env vars, GPU assignments
+- `docs/support-matrix.md` — GPU compatibility, supported models, hardware requirements
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/docker.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker.md
new file mode 100644
index 000000000..eb31fb1a6
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker.md
@@ -0,0 +1,88 @@
+# RAG Docker Deployment
+
+## Determine Mode
+
+If routed here from the deploy workflow, the mode (self-hosted, nvidia-hosted, or retrieval-only) was already decided. Use it.
+
+If invoked directly without a mode, auto-detect:
+
+```bash
+echo "=== COMPOSE ===" && docker compose version 2>/dev/null || echo "NO_COMPOSE"; echo "=== GPU ===" && nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo "NO_GPU"; echo "=== DISK ===" && df -h --output=avail / | tail -1; echo "=== RUNNING ===" && docker ps --format "{{.Names}}" 2>/dev/null | grep -E "(rag-server|ingestor-server|nim-llm|milvus)" | head -10 || echo "NONE_RUNNING"
+```
+
+If NO_COMPOSE: stop and tell the user to install Docker Compose (see `docs/support-matrix.md` for minimum version).
+
+Read `docs/support-matrix.md` ("Hardware Requirements (Docker)" section) for current GPU requirements, then:
+- GPU count/type meets self-hosted requirements from the support matrix, and 200+ GB free disk → **self-hosted**
+- Any GPU or no GPU with ≥50 GB free disk → **nvidia-hosted**
+- User explicitly says "retrieval only" / "no LLM" / "search only" → **retrieval-only** (explicit user intent overrides the hardware-based routing above)
+
+Auto-route based on hardware. Only ask if two modes are equally valid and the user's intent is ambiguous.
+
+## Verify NGC_API_KEY
+
+Auto-check all possible locations before asking:
+
+```bash
+[ -n "$NGC_API_KEY" ] && echo "ENV_SET" || (grep -h "NGC_API_KEY=" deploy/compose/.env deploy/compose/nvdev.env 2>/dev/null | grep -qv "nvapi-your-key" && echo "DOTENV_SET" || echo "NOT_SET")
+```
+
+- **ENV_SET**: proceed silently.
+- **DOTENV_SET**: load the env file that contains the key and proceed.
+- **NOT_SET**: ask the user to provide it. This is the only thing to ask for.
+
+## Docker Login
+
+Auto-check if already logged in:
+
+```bash
+grep -q "nvcr.io" ~/.docker/config.json 2>/dev/null && echo "ALREADY_LOGGED_IN" || echo "NOT_LOGGED_IN"
+```
+
+If already logged in → proceed silently.
+
+If not logged in → tell the user to run this themselves (if the agent ran it, the expanded key would appear in agent logs):
+
+> Please run in your terminal: `echo "${NGC_API_KEY}" | docker login nvcr.io -u '$oauthtoken' --password-stdin`
+
+Wait for confirmation only if login was needed.
+
+## Deploy
+
+Based on the mode, read and follow the appropriate reference:
+
+- **Self-hosted**: read and follow `docker-self-hosted.md`
+- **NVIDIA-hosted**: read and follow `docker-nvidia-hosted.md`
+- **Retrieval-only**: read and follow `docker-retrieval-only.md`
+
+## Post-Deploy Verification
+
+Run health checks:
+
+```bash
+sleep 5; echo "=== RAG ===" && curl -s http://localhost:8081/v1/health?check_dependencies=true 2>/dev/null || echo "RAG_NOT_READY"; echo "=== INGESTOR ===" && curl -s http://localhost:8082/v1/health?check_dependencies=true 2>/dev/null || echo "INGESTOR_NOT_READY"; echo "=== CONTAINERS ===" && docker ps --format "table {{.Names}}\t{{.Status}}" 2>/dev/null | grep -E "(rag|milvus|nim|ingest)" | head -15
+```
+
+If services are still initializing, automatically poll every 30 seconds:
+- **NVIDIA-hosted**: poll until healthy or 5 minutes elapsed (no model downloads needed).
+- **Self-hosted**: poll until healthy or 30 minutes elapsed (first-run model downloads take 15–30 min).
+- **Retrieval-only**: poll until healthy or 5 minutes elapsed.
+
+Show progress to the user during polling.
+
+## On Success
+
+Tell the user:
+- "RAG Blueprint is running and healthy. Open http://localhost:8090 to use the UI." (skip for retrieval-only)
+- "Ask me to configure features (VLM, query rewriting, guardrails, etc.)"
+- "Ask me to shut down when you're done."
+
+## On Error
+
+1. Read the error output from the failed command.
+2. Read `references/troubleshoot.md` to match against common issues (port conflict, disk full, NGC auth, GPU OOM).
+3. Apply the fix and retry.
+4. If still failing, report the specific error to the user with the fix that was attempted.
+
+## Source Documentation
+- `docs/support-matrix.md` — GPU requirements, hardware compatibility, disk space
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-mig.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-mig.md
new file mode 100644
index 000000000..dd3dd8ce3
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-mig.md
@@ -0,0 +1,38 @@
+# MIG GPU Deployment
+
+## When to Use
+- User wants fine-grained GPU allocation on Kubernetes using MIG slices
+- User has H100 GPUs and wants to share them across RAG services
+- User asks about Multi-Instance GPU deployment
+
+## Restrictions
+- Default configuration requires H100 80GB HBM3 GPUs (MIG-compatible); an RTX PRO 6000 variant is described in `docs/mig-deployment.md`
+- MIG profiles in this guide are specific to H100 80GB — other GPUs need different profiles
+- Requires cloned repository (MIG config files in `deploy/helm/`)
+- All standard Helm prerequisites apply (GPU Operator, NIM Operator, StorageClass)
+- Ingestion profile is scaled down with MIG — large bulk ingestion jobs may fail
+
+## Process
+1. Read `docs/mig-deployment.md` for full configuration, commands, and MIG slice definitions
+2. Enable MIG with mixed strategy on ClusterPolicy
+3. Apply MIG ConfigMap and label the node
+4. Verify node labels show `mig.config.state: "success"` before proceeding
+5. Install Helm chart with `-f mig-slicing/values-mig.yaml`
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Standard MIG on H100 | `docs/mig-deployment.md` | Apply MIG config, label node, install chart |
+| RTX PRO 6000 with MIG | `docs/mig-deployment.md` | Also uncomment model section in values.yaml |
+| Custom MIG profiles | NVIDIA MIG User Guide | Modify `mig-config.yaml` for different GPU types |
+
+## Agent-Specific Notes
+- Must wait for `mig.config.state: "success"` on the node before Helm install — if not present, wait and re-check
+- Default H100 MIG layout (see `docs/mig-deployment.md` for current GPU count and slice definitions): GPU 0 → small slices, GPU 1 → mixed slices, GPU 2 → full-GPU slice
+- LLM gets the largest slice (`7g.80gb`); embedding/Milvus/ingest share small slices
+- RTX PRO 6000 variant: uncomment model section in values.yaml, then use both `-f values.yaml -f mig-slicing/values-mig.yaml`
+- Uninstall follows standard Helm procedure (see Helm deployment docs)
+
+## Source Documentation
+- `docs/mig-deployment.md` — full MIG config, ClusterPolicy patches, node labeling, verification, Helm install commands
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-standard.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-standard.md
new file mode 100644
index 000000000..df2630660
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-standard.md
@@ -0,0 +1,51 @@
+# Helm Deployment
+
+## When to Use
+- User wants to deploy RAG Blueprint on Kubernetes
+- User asks about Helm chart installation (from NGC or local repo)
+- User mentions Kubernetes, k8s, or Helm in deployment context
+
+## Restrictions
+
+Read `docs/support-matrix.md` for current Kubernetes, Helm, and OS version requirements.
+
+- Requires GPU Operator + NIM Operator pre-installed
+- Default StorageClass must be configured for PVC provisioning
+- Disk space per `docs/support-matrix.md`
+- NeMo Guardrails not available in Helm deployment
+- Image captioning: on-prem only (requires `values.yaml` changes; see `docs/image_captioning.md`)
+
+## Process
+
+### Option A: Deploy from NGC (Remote Chart)
+1. Read `docs/deploy-helm.md` for full commands and values
+2. Ensure prerequisites: GPU Operator, NIM Operator, StorageClass, NGC_API_KEY
+3. Install chart, monitor pods, port-forward frontend
+
+### Option B: Deploy from Repository (Local Chart)
+1. Read `docs/deploy-helm-from-repo.md` for full commands and repo setup
+2. Add required Helm repos, run `helm dependency update`, install from local path
+
+### RTX PRO 6000 Variant
+1. Uncomment model section under `nimOperator.nim-llm.model` in `values.yaml`
+2. See source docs for engine/precision/GPU settings
+
+## Decision Table
+
+| Goal | Option | Key Action |
+|------|--------|------------|
+| Quick deploy from published chart | NGC (Option A) | `helm upgrade --install` with NGC URL |
+| Customized chart | Local repo (Option B) | Clone, modify values, `helm dependency update` |
+| RTX PRO 6000 GPUs | Either option | Uncomment model section in values.yaml |
+| Retrieval-only (no LLM) | Either option | `--set nimOperator.nim-llm.enabled=false` |
+
+## Agent-Specific Notes
+- First deployment: 60–70 min (model cache download); subsequent: 10–15 min
+- It is normal for pods to stay in `ContainerCreating`/`Init` for an extended time during cache download
+- PVCs are not removed by `helm uninstall` — delete manually: `kubectl delete nimcache --all -n rag && kubectl delete pvc --all -n rag`
+- Port-forwarding may time out for large file ingestion — not suitable for bulk uploads
+- All configurable endpoints documented in `deploy/helm/nvidia-blueprint-rag/endpoints.md`
+
+## Source Documentation
+- `docs/deploy-helm.md` — NGC remote chart deployment, prerequisites, monitoring
+- `docs/deploy-helm-from-repo.md` — local chart deployment, repo setup, dependency management
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/helm.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm.md
new file mode 100644
index 000000000..b381f4dcc
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm.md
@@ -0,0 +1,103 @@
+# RAG Helm Deployment
+
+If routed here from the deploy workflow, proceed directly to Phase 1.
+
+## Phase 1: Prerequisites Check
+
+Run all checks at once:
+
+```bash
+echo "=== KUBECTL ===" && kubectl version --client 2>/dev/null || echo "NO_KUBECTL"; echo "=== HELM ===" && helm version --short 2>/dev/null || echo "NO_HELM"; echo "=== STORAGECLASS ===" && kubectl get storageclass 2>/dev/null || echo "NO_STORAGECLASS"; echo "=== NODES ===" && kubectl get nodes -o wide 2>/dev/null || echo "NO_CLUSTER_ACCESS"; echo "=== GPU_OPERATOR ===" && kubectl get pods -n gpu-operator 2>/dev/null | grep -i running || echo "NO_GPU_OPERATOR"; echo "=== NIM_OPERATOR ===" && kubectl get pods -n nim-operator 2>/dev/null | grep -i running || echo "NO_NIM_OPERATOR"; echo "=== NAMESPACE ===" && kubectl get namespace rag 2>/dev/null && echo "NAMESPACE_EXISTS" || echo "NO_NAMESPACE"; echo "=== HELM_RELEASE ===" && helm list -n rag 2>/dev/null | grep rag || echo "NO_EXISTING_RELEASE"; echo "=== PODS ===" && kubectl get pods -n rag 2>/dev/null | head -10 || echo "NO_PODS"; echo "=== NGC_KEY ===" && [ -n "$NGC_API_KEY" ] && echo "NGC_API_KEY SET" || echo "NGC_API_KEY NOT_SET"; echo "=== GPU_RESOURCES ===" && kubectl get nodes -o json 2>/dev/null | grep -o '"nvidia.com/gpu": "[0-9]*"' || echo "NO_GPU_RESOURCES"
+```
+
+Read `docs/support-matrix.md` for current Kubernetes, Helm, and OS version requirements.
+
+| Requirement | Check |
+|-------------|-------|
+| Kubernetes | Per `docs/support-matrix.md` |
+| Helm | Per `docs/support-matrix.md` |
+| NVIDIA GPU Operator | Installed and running |
+| NVIDIA NIM Operator | Installed and running |
+| Default StorageClass | Configured (e.g. local-path-provisioner) |
+| Disk space | ≥200 GB per node |
+| NGC_API_KEY | Set in environment |
+
+Report all missing prerequisites together so the user can fix everything in one pass.
+
+If NGC_API_KEY is NOT_SET: this is the one thing we must ask the user for.
+
+If an existing Helm release is detected: warn "Existing RAG Helm release found. Proceeding will upgrade it." Continue unless user objects.
+
+## Phase 2: Route to Reference
+
+Auto-detect the GPU variant from cluster nodes (not the local machine):
+
+```bash
+echo "=== GPU_LABELS ===" && kubectl get nodes -o json 2>/dev/null | grep -oE '"nvidia.com/gpu.product":\s*"[^"]*"' | sort -u || echo "NO_GPU_LABELS"; echo "=== MIG ===" && kubectl get nodes -o json 2>/dev/null | grep -oE '"nvidia.com/mig.strategy":\s*"[^"]*"' || echo "NO_MIG"
+```
+
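+Determine the variant from the node GPU labels printed above (`nvidia.com/gpu.product` for the GPU model, `nvidia.com/mig.strategy` for MIG).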
+
+Route based on detection:
+
+- **MIG enabled** → read and follow `helm-mig.md`
+- **RTX PRO 6000** → read and follow `helm-standard.md` (use the RTX values.yaml variant described there)
+- **Standard (everything else)** → read and follow `helm-standard.md`
+
+Ask the user only if the variant is genuinely ambiguous. Default to standard deployment.
+
+## Phase 3: Expected Timelines
+
+Set expectations with the user:
+
+| Scenario | Duration |
+|----------|----------|
+| First deployment | 60–70 min (NIM cache download ~40–50 min, NIMService init ~10–15 min, pod startup ~5–10 min) |
+| Subsequent deployments | 10–15 min (model caches already populated) |
+
+It is normal for pods to remain in `ContainerCreating` or `Init` state for extended periods — models download in the background without progress indicators.
+
+## Phase 4: Verification
+
+After deployment completes, verify:
+
+```bash
+echo "=== PODS ===" && kubectl get pods -n rag; echo "=== NIMCACHE ===" && kubectl get nimcache -n rag; echo "=== NIMSERVICE ===" && kubectl get nimservice -n rag
+```
+
+Wait for all pods to reach `Running` status. Poll every 60 seconds for up to 70 minutes (first deployment involves model downloads). Show progress.
+
+Once pods are running, port-forward and verify health:
+
+```bash
+kubectl port-forward -n rag service/rag-server 8081:8081 --address 0.0.0.0 & kubectl port-forward -n rag service/rag-frontend 3000:3000 --address 0.0.0.0 & sleep 3 && curl -s http://localhost:8081/v1/health?check_dependencies=true 2>/dev/null || echo "RAG_NOT_READY"
+```
+
+## Phase 5: Uninstall
+
+If the user wants to tear down:
+
+```bash
+helm uninstall rag -n rag
+kubectl delete nimcache --all -n rag
+kubectl delete pvc --all -n rag
+```
+
+## On Success
+
+Tell the user:
+- "RAG Blueprint is running on Kubernetes. Access the UI at http://localhost:3000 (via port-forward)."
+- "Ask me to configure features (VLM, query rewriting, guardrails, etc.)"
+- "Ask me to shut down when you're done."
+
+## On Error
+
+1. Check pod status and events: `kubectl describe pod <failing-pod> -n rag` and `kubectl get events -n rag --sort-by='.lastTimestamp' | tail -20`.
+2. Read pod logs: `kubectl logs <failing-pod> -n rag --tail 50`.
+3. Read `references/troubleshoot.md` to match against common issues (PVC pending, OOM, image pull failure, port conflict).
+4. Apply the fix and retry. If the fix requires data deletion (PVCs, namespace), confirm with user first.
+
+## Source Documentation
+- `docs/support-matrix.md` — Kubernetes/Helm version requirements, GPU compatibility
+- `docs/deploy-helm.md` — standard Helm deployment from NGC
+- `docs/deploy-helm-from-repo.md` — Helm deployment from local repo
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/library-full.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/library-full.md
new file mode 100644
index 000000000..1a386cb19
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/library-full.md
@@ -0,0 +1,43 @@
+# Library Mode (Full)
+
+## When to Use
+- User wants programmatic Python access to RAG via `nvidia_rag` package
+- User prefers code-level configuration over Docker-based servers
+- User asks about library mode, Python client, or `NvidiaRAG`/`NvidiaRAGIngestor`
+
+## Restrictions
+- Python 3.11+ (< 3.14)
+- Docker still required for backend services (Milvus, NV-Ingest, Redis, optionally NIMs)
+- Self-hosted NIMs require supported GPUs (see `docs/support-matrix.md`)
+
+## Process
+1. Read `docs/python-client.md` for full API reference, configuration, and backend setup
+2. Create virtual environment and install `nvidia-rag[all]`
+3. Start backend services via Docker (Milvus, NV-Ingest + Redis, optionally NIMs)
+4. Load config from `notebooks/config.yaml` using `NvidiaRAGConfig.from_yaml()`
+5. Create `NvidiaRAGIngestor` and `NvidiaRAG` instances
+6. Use `ingestor.create_collection()`, `ingestor.upload_documents()`, `rag.generate()`, `rag.search()`
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Self-hosted (local GPUs) | `docs/python-client.md` | Start nims.yaml + set on-prem config |
+| Cloud (NVIDIA-hosted) | `docs/python-client.md` | Skip nims.yaml, override server URLs in config |
+| Custom prompts | `docs/python-client.md` | Pass `prompts=` to NvidiaRAG constructor |
+| Summarization | `docs/python-client.md` | `generate_summary=True` in upload_documents |
+
+## Agent-Specific Notes
+- Config file: `notebooks/config.yaml`; env file: `notebooks/.env_library`
+- Docker login is interactive — tell user to run `docker login nvcr.io` themselves
+- For cloud deployment: override `config.embeddings.server_url`, `config.llm.server_url`, etc. in code
+- Config changes take effect immediately (no container restart needed, unlike Docker mode)
+- Prompt customization via constructor: `NvidiaRAG(config=config, prompts="custom_prompts.yaml")`
+- `upload_documents()` is async — returns `task_id` for status polling
+- NV-Ingest cloud endpoints must be exported before starting NV-Ingest container
+
+## Notebooks
+- `notebooks/rag_library_usage.ipynb` — complete walkthrough: setup, ingestion, querying, search, summaries
+
+## Source Documentation
+- `docs/python-client.md` — full API reference, backend setup, configuration, cloud/self-hosted options
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/library-lite.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/library-lite.md
new file mode 100644
index 000000000..bf7da9041
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/library-lite.md
@@ -0,0 +1,37 @@
+# Library Mode (Lite / Containerless)
+
+## When to Use
+- Quick prototyping with zero infrastructure (no Docker, no GPU)
+- User wants the fastest path to try RAG
+- CI/CD pipelines needing lightweight RAG testing
+
+## Restrictions
+- No image/table/chart citations
+- No document summarization
+- Subject to NVIDIA API rate limits (cloud-hosted inference)
+- Requires Python 3.11+ (< 3.14), internet access, and `NGC_API_KEY`
+
+## Process
+1. Read `docs/python-client.md` for full library mode documentation
+2. Create virtualenv and install: `pip install "nvidia-rag[all]"` (quote the extras so shells such as zsh do not glob the brackets)
+3. Ensure `NGC_API_KEY` is exported — it must also be copied to `NVIDIA_API_KEY`, which the `nvidia_rag` package reads (see Agent-Specific Notes)
+4. Run the lite notebook: `jupyter lab notebooks/rag_library_lite_usage.ipynb`
+
+## Agent-Specific Notes
+- `NVIDIA_API_KEY` (used by `nvidia_rag` package) must be set from `NGC_API_KEY`: `os.environ["NVIDIA_API_KEY"] = os.environ.get("NGC_API_KEY", "")`
+- Lite config lives in `notebooks/config.yaml`; override `server_url` for embeddings to the NVIDIA API Catalog endpoint (see `docs/python-client.md` for current URL), and set LLM/ranking URLs to empty string for cloud defaults
+- Milvus Lite runs embedded (no container), NV-Ingest runs as subprocess (no container)
+- Also install `python-dotenv jupyterlab` for notebook support
+
+## When Not to Use
+- Production workloads — use Docker or Kubernetes
+- Large-scale ingestion — rate limits apply
+- Need citations from images/tables/charts or document summarization
+
+## Notebooks
+| Notebook | Description |
+|----------|-------------|
+| `notebooks/rag_library_lite_usage.ipynb` | End-to-end lite mode: collection creation, ingestion, querying, search |
+
+## Source Documentation
+- `docs/python-client.md` — full library mode documentation (lite and full)
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/library.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/library.md
new file mode 100644
index 000000000..a5162be7a
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/library.md
@@ -0,0 +1,54 @@
+# RAG Library Mode Setup
+
+## Determine Mode
+
+If routed here from the deploy workflow, the mode (full or lite) may already be decided. Use it.
+
+If invoked directly, auto-detect:
+
+```bash
+echo "=== DOCKER ===" && docker --version 2>/dev/null || echo "NO_DOCKER"; echo "=== GPU ===" && nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo "NO_GPU"; echo "=== PYTHON ===" && python3 --version 2>/dev/null || echo "NO_PYTHON"; echo "=== PKG_MANAGER ===" && which uv 2>/dev/null && echo "UV_AVAILABLE" || (which pip3 2>/dev/null && echo "PIP_AVAILABLE" || echo "NO_PKG_MANAGER"); echo "=== VENV ===" && ls -d .venv/ venv/ nvidia-rag-env/ 2>/dev/null || echo "NO_EXISTING_VENV"; echo "=== INSTALLED ===" && pip3 show nvidia_rag 2>/dev/null | head -3 || echo "NOT_INSTALLED"
+```
+
+- Docker available → **full** (Python API + Docker backend services)
+- No Docker or user explicitly says "lite" / "no docker" / "containerless" → **lite**
+
+Auto-route based on Docker availability. Only ask if both modes are equally valid.
+
+## Verify NGC_API_KEY
+
+Auto-check all locations:
+
+```bash
+if [ -n "$NGC_API_KEY" ]; then echo "NGC_KEY_SET"; elif [ -n "$NVIDIA_API_KEY" ]; then echo "NVIDIA_KEY_SET"; else echo "NOT_SET"; fi
+```
+
+If NOT_SET: ask the user. Otherwise proceed silently.
+
+## Deploy
+
+Based on the mode:
+
+- **Full**: read and follow `library-full.md`
+- **Lite**: read and follow `library-lite.md`
+
+## On Success
+
+Tell the user:
+- Which mode was set up and how to start using it (notebook or Python script)
+- "Ask me to configure features, change models, etc."
+- "Ask me to shut down backend services when done (if full mode)."
+
+## On Error
+
+1. Read the error output (pip install failure, import error, service connection error).
+2. Read `references/troubleshoot.md` to match against common issues.
+3. Common fixes to try:
+   - `pip install` failure → try `uv pip install` or check Python version ≥3.11.
+   - Import error → check if virtual environment is activated.
+   - Connection error to backend services → check Docker containers are running.
+4. Retry the failed step after fixing.
+5. If still failing, report the specific error to the user.
+
+## Source Documentation
+- `docs/python-client.md` — Python library API, installation, full and lite mode setup
diff --git a/skill-source/.agents/skills/rag-blueprint/references/shutdown.md b/skill-source/.agents/skills/rag-blueprint/references/shutdown.md
new file mode 100644
index 000000000..7407b63d8
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/shutdown.md
@@ -0,0 +1,128 @@
+# RAG Shutdown
+
+Stopping containers and processes does not require confirmation. Deleting data (volumes, cache, images) does.
+
+## Step 1: Detect What Is Running
+
+Detect all deployment modes — Docker, K8s, and library:
+
+```bash
+echo "=== DOCKER ===" && docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" 2>/dev/null || echo "NO_DOCKER"; echo "=== LIBRARY ===" && ps aux | grep -E "(nvidia_rag|uvicorn|jupyter)" | grep -v grep || echo "NO_LIBRARY_PROCESSES"; echo "=== K8S ===" && kubectl get pods -n rag 2>/dev/null | head -10 || echo "NO_K8S"; echo "=== HELM ===" && helm list -n rag 2>/dev/null | grep rag || echo "NO_HELM_RELEASE"
+```
+
+Based on what's detected, execute the appropriate shutdown path below. If multiple modes are active (e.g., Docker + library), stop all of them.
+
+## Step 2: Stop Services (Reverse Startup Order)
+
+Stop in this order — reverse of deployment. Only stop what is actually running (detected in Step 1).
+
+### 2a: Optional Services
+
+Stop these first if they are running:
+
+```bash
+docker compose -f deploy/compose/docker-compose-nemo-guardrails.yaml down 2>/dev/null; docker compose -f deploy/compose/observability.yaml down 2>/dev/null
+```
+
+### 2b: Application Services
+
+```bash
+docker compose -f deploy/compose/docker-compose-rag-server.yaml down; docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down
+```
+
+### 2c: Vector DB
+
+```bash
+docker compose -f deploy/compose/vectordb.yaml down
+```
+
+If using Elasticsearch instead of Milvus:
+```bash
+docker compose -f deploy/compose/vectordb.yaml --profile elasticsearch down
+```
+
+### 2d: NIMs (Self-Hosted Only)
+
+Only present if self-hosted deployment was used:
+
+```bash
+docker compose -f deploy/compose/nims.yaml down
+```
+
+This stops ALL NIM containers (LLM, embedding, ranking, OCR, detection, and any profile-specific NIMs like VLM, audio, nemotron-parse).
+
+### 2e: Library Mode Processes
+
+If library mode is active (detected Python processes):
+
+```bash
+pkill -f "nvidia_rag" 2>/dev/null; pkill -f "uvicorn.*rag" 2>/dev/null; docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down 2>/dev/null; docker compose -f deploy/compose/vectordb.yaml down 2>/dev/null
+```
+
+### 2f: Kubernetes (Helm) Deployment
+
+If a K8s deployment was detected, use the release name and namespace from the `helm list` output in Step 1:
+
+```bash
+helm uninstall <release-name> -n <namespace> 2>/dev/null
+```
+
+To also clean up persistent data (only if user requests full cleanup):
+```bash
+kubectl delete nimcache --all -n <namespace> 2>/dev/null; kubectl delete pvc --all -n <namespace> 2>/dev/null
+```
+
+## Step 3: Verify Everything Stopped
+
+```bash
+echo "=== REMAINING ===" && docker ps --format "table {{.Names}}\t{{.Status}}" 2>/dev/null; echo "=== K8S ===" && kubectl get pods -n rag 2>/dev/null | head -10 || echo "NOT_K8S"; helm list -n rag 2>/dev/null || true
+```
+
+If any RAG-related containers remain, force remove:
+```bash
+docker ps -a --format "{{.Names}}" | grep -E "(rag|milvus|nim|ingest|redis|nemo|grafana|prometheus|embedding|ranking|vlm|ocr|page-elements|graphic-elements|table-structure)" | xargs -r docker rm -f
+```
+
+If pods remain after `helm uninstall`, force delete:
+```bash
+kubectl delete pods --all -n rag --force --grace-period=0 2>/dev/null
+```
+
+## Step 4: Optional Cleanup
+
+Ask the user if they want to clean up data/volumes:
+
+- **Remove Docker volumes** (deletes ingested data and Milvus indices; note that this prunes unused Docker volumes host-wide, not only RAG ones):
+  ```bash
+  docker volume prune -f
+  ```
+
+- **Remove model cache** (frees 100-200 GB for self-hosted):
+  ```bash
+  rm -rf ~/.cache/model-cache/
+  ```
+
+- **Remove Docker images** (frees disk space):
+  ```bash
+  docker images | grep -E "nvcr.io/nvidia|milvusdb" | awk '{print $3}' | xargs -r docker rmi
+  ```
+
+Only perform cleanup if the user explicitly requests it.
+
+## Quick One-Liner (All Docker Services)
+
+If the user wants a fast full teardown:
+
+```bash
+cd "$(git rev-parse --show-toplevel)" && \
+docker compose -f deploy/compose/docker-compose-nemo-guardrails.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/observability.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/docker-compose-rag-server.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/vectordb.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/nims.yaml down 2>/dev/null; \
+echo "All RAG services stopped."
+```
+
+## Source Documentation
+- `docs/troubleshooting.md` — if services won't stop or containers hang
diff --git a/skill-source/.agents/skills/rag-blueprint/references/troubleshoot.md b/skill-source/.agents/skills/rag-blueprint/references/troubleshoot.md
new file mode 100644
index 000000000..bf0308f8b
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/troubleshoot.md
@@ -0,0 +1,146 @@
+# RAG Troubleshooting
+
+## Auto-Triage: Run First
+
+Start with this diagnostic sweep:
+
+```bash
+echo "=== CONTAINERS ===" && docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" 2>/dev/null | grep -E "(rag|milvus|nim|ingest|redis|etcd|minio)" | head -20; echo "=== HEALTH ===" && curl -s http://localhost:8081/v1/health?check_dependencies=true 2>/dev/null || echo "RAG_UNREACHABLE"; curl -s http://localhost:8082/v1/health?check_dependencies=true 2>/dev/null || echo "INGESTOR_UNREACHABLE"; echo "=== LOGS ===" && for svc in rag-server ingestor-server nim-llm-ms nemoretriever-embedding-ms nemoretriever-ranking-ms; do echo "--- $svc ---"; docker logs --tail 20 "$svc" 2>/dev/null | grep -iE "(error|fail|exception|timeout|oom)" || echo "OK"; done; echo "=== GPU ===" && nvidia-smi 2>/dev/null | head -20 || echo "NO_GPU"; echo "=== DISK ===" && df -h / | tail -1; echo "=== DOCKER_DISK ===" && docker system df 2>/dev/null; echo "=== K8S ===" && kubectl get pods -n rag 2>/dev/null | head -20 || echo "NOT_K8S"
+```
+
+Analyze all output, then diagnose and fix. If Auto-Triage doesn't reveal the cause, dig deeper into the specific failing service's logs (`docker logs <service> --tail 100` or `kubectl logs <pod> -n rag --tail 100`).
+
+Confirm with the user before deleting data (volumes, collections, model cache), changing deployment mode, or modifying API keys.
+
+## Source Documentation for Detailed Diagnosis
+
+Read these docs to find specific issue descriptions, causes, and fixes:
+
+- `docs/troubleshooting.md` — primary reference: all common issues with detailed symptoms/fixes
+- `docs/debugging.md` — Pipeline debugging: monitoring deployment, verifying endpoints, tracing requests
+- `docs/service-port-gpu-reference.md` — Complete port/GPU mapping table for all services
+
+## Expected Deployment Times
+
+If user reports "deployment is taking too long," compare against these baselines:
+
+| Mode | First Run | Subsequent |
+|------|-----------|------------|
+| Docker (self-hosted) | 15–30 min (model downloads) | 2–5 min |
+| Docker (NVIDIA-hosted) | 5–10 min (no model downloads) | 1–2 min |
+| K8s/Helm | 60–70 min (NIM cache 40–50 min + init 10–15 min + pod startup 5–10 min) | 10–15 min |
+
+If deployment exceeds these times, check NIM container logs: `docker logs nim-llm-ms --tail 50` and model cache disk usage: `watch -n 10 'du -sh ~/.cache/model-cache/'`.
+
+## Service Health Endpoints
+
+Read `docs/service-port-gpu-reference.md` for the complete port/GPU mapping. Quick check:
+
+| Service | URL | Expected |
+|---------|-----|----------|
+| RAG Server | `http://localhost:8081/v1/health?check_dependencies=true` | `{"status":"healthy"}` |
+| Ingestor | `http://localhost:8082/v1/health?check_dependencies=true` | `{"status":"healthy"}` |
+| NV-Ingest | `http://localhost:7670/v1/health/ready` | 200 OK |
+| Embedding NIM | `http://localhost:9080/v1/health/ready` | 200 OK |
+| LLM NIM | `http://localhost:8999/v1/health/ready` | 200 OK |
+| Ranking NIM | `http://localhost:1976/v1/health/ready` | 200 OK |
+| Milvus | `http://localhost:9091/healthz` | 200 OK |
+
+## Kubernetes Monitoring Commands
+
+```bash
+kubectl get nimcache -n rag
+kubectl get pods -n rag
+kubectl logs -f <pod-name> -n rag
+kubectl get pvc -n rag
+kubectl get events -n rag --sort-by='.lastTimestamp'
+```
+
+Pods in `ContainerCreating` or `Init` state during model download are expected. Use `kubectl get nimcache -n rag -w` to watch download progress.
+
+## Enable Debug Logging
+
+```bash
+export LOGLEVEL=DEBUG
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --no-deps ingestor-server
+docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --no-deps rag-server
+```
+
+---
+
+## Symptom-to-Fix Quick Index
+
+Match the symptom from Auto-Triage output, then read `docs/troubleshooting.md` for the detailed fix. For pipeline debugging steps, read `docs/debugging.md`.
+
+| Symptom | Category | Quick Fix |
+|---------|----------|-----------|
+| NIM container stuck at `(health: starting)` >30min | NIM Startup | Check GPU memory, NGC auth, disk space. First-run model downloads are slow — wait and monitor cache size. |
+| Milvus unhealthy / search returns nothing | Milvus | Restart vectordb compose. Check etcd/MinIO. Port 19530 conflict. Corrupt data → `down -v` (destroys data). |
+| Document upload fails / ingestor health check fails | NV-Ingest | Check Redis, OCR NIMs. Rate limit (429) → reduce batch vars. Large PDFs → reduce batch size. |
+| Chat returns errors / /generate fails | RAG Server | Check LLM NIM health, embedding NIM, cloud API key. Verify `APP_LLM_MODELNAME` matches deployed NIM. |
+| DNS resolution failed for `<service>:<port>` | Networking | Service container not running. Check `docker ps`, restart missing service. |
+| Port already in use | Networking | `lsof -i :<port>` to find conflicting process. See port table above. |
+| GPU out of memory / `torch.OutOfMemoryError` | GPU | Kill other GPU processes, use `--profile rag` for fewer NIMs, or set correct `NIM_MODEL_PROFILE`. |
+| `nvidia-container-cli: unknown device` | GPU | GPU ID exceeds available GPUs. Run `nvidia-smi -L`, adjust `*_GPU_ID` vars to valid IDs. |
+| Disk full / insufficient space | Disk | `docker system prune -f`, remove unused images, check model cache size. |
+| `no configuration file provided: not found` | Docker Compose | Run from the repo root directory. |
+| `too many open files` | Docker Compose | Set `LimitNOFILE=65536` in containerd override, restart containerd. |
+| PVC stuck in Pending | Helm | Create missing StorageClass or update PVC. |
+| `ProvisioningFailed` access mode mismatch | Helm | Patch NIMCache to `ReadWriteOnce`. |
+| Ingestor OOMKilled | Helm | Increase memory limits in values.yaml. Set `SUMMARY_MAX_PARALLELIZATION=1`. |
+| Elasticsearch timeout during ingestion | Elasticsearch | Increase `ES_REQUEST_TIMEOUT` (default 600s). |
+| Hallucination / out-of-context responses | Quality | Add missing-info handling to prompt in `prompt.yaml`. |
+| Embedding dimensions mismatch | Models | Set `APP_EMBEDDINGS_DIMENSIONS` to match model output. Re-ingest. |
+| Hybrid/dense search type mismatch | Search | Align `APP_VECTORSTORE_SEARCHTYPE` on ingestor and rag-server. Re-ingest. |
+| Confidence threshold filtering all results | Search | Lower `RERANKER_SCORE_THRESHOLD` (range 0.0–1.0, default 0.0). |
+| OCR not starting / connection errors | OCR | Check GPU memory, NGC auth. Verify `OCR_GRPC_ENDPOINT`/`OCR_HTTP_ENDPOINT` match running service. |
+| NVIDIA API credits exhausted | Cloud | Contact NVIDIA representative for additional credits. |
+| Image-only PDFs not ingesting | Ingestion | Enable `APP_NVINGEST_EXTRACTINFOGRAPHICS`. Consider image captioning. |
+
+---
+
+## Troubleshooting Checklists
+
+### Ingestion Checklist
+- [ ] All required containers running (ingestor-server, nv-ingest-ms-runtime, milvus, redis)
+- [ ] Vector database accessible (`curl http://localhost:9091/healthz`)
+- [ ] Embedding service healthy (`curl http://localhost:9080/v1/health/ready`)
+- [ ] File format supported and size <= 400 MB
+- [ ] Sufficient disk space (`df -h /`)
+- [ ] GPU resources available (`nvidia-smi`)
+
+### Retrieval Checklist
+- [ ] RAG server running and healthy
+- [ ] LLM service accessible (`curl http://localhost:8999/v1/health/ready`)
+- [ ] Vector database contains data (collection exists with documents)
+- [ ] Collection name is correct
+- [ ] Query format is valid
+
+### Quality Checklist
+- [ ] Reranker is enabled and healthy
+- [ ] Top-K values are appropriate
+- [ ] Collection has sufficient relevant data
+- [ ] Query rewriting configured correctly
+- [ ] Prompt template appropriate for use case
+
+---
+
+## Full Reset
+
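+Destroys all data (volumes, images, caches), including unused Docker resources from other projects on this host, because `docker system prune -af --volumes` is not limited to RAG. Confirm with the user before running.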
+
+If nothing else works and the user confirms:
+
+```bash
+cd "$(git rev-parse --show-toplevel)"
+docker compose -f deploy/compose/docker-compose-nemo-guardrails.yaml down 2>/dev/null
+docker compose -f deploy/compose/observability.yaml down 2>/dev/null
+docker compose -f deploy/compose/docker-compose-rag-server.yaml down 2>/dev/null
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down 2>/dev/null
+docker compose -f deploy/compose/vectordb.yaml down -v 2>/dev/null
+docker compose -f deploy/compose/nims.yaml down 2>/dev/null
+
+docker system prune -af --volumes
+```
+
+Then deploy fresh using the deploy workflow.
diff --git a/skill-source/README.md b/skill-source/README.md
new file mode 100644
index 000000000..f5a787677
--- /dev/null
+++ b/skill-source/README.md
@@ -0,0 +1,111 @@
+# RAG Blueprint Agent Skill
+
+A single agent skill that enables AI coding assistants (Claude Code, Cursor, Codex, etc.) to deploy, configure, troubleshoot, and manage the NVIDIA RAG Blueprint autonomously.
+
+## Installation
+
+```bash
+npx skills add .
+```
+
+Select **rag-blueprint** — it includes all capabilities (deploy, configure, shutdown, troubleshoot) in one skill.
+
+## Architecture: Skills = Process, Docs = Truth
+
+```
+SKILL.md           = ROUTER (intent detection, autonomy rules, configure routing table)
+Reference files    = WHAT/HOW (deployment workflows, feature playbooks, diagnostics)
+docs/*.md          = SOURCE OF TRUTH (never copied into skills)
+notebooks/*.ipynb  = RUNNABLE EXAMPLES (referenced from relevant skills)
+```
+
+The SKILL.md detects user intent and routes to the correct reference file. Reference files are concise playbooks that point to `docs/*.md` for detailed configuration — this prevents staleness from duplicated content.
+
+## Skill Structure
+
+```
+skill-source/.agents/skills/rag-blueprint/
+  SKILL.md                              ← Single entry point (intent router)
+  references/
+    deploy.md                           ← Deployment: env analysis, NGC key, routing
+    deploy/
+      docker.md                         ← Docker Compose deployment workflow
+      docker-self-hosted.md             ← Self-hosted NIMs (local GPU inference)
+      docker-nvidia-hosted.md           ← Cloud NIMs (NVIDIA API endpoints)
+      docker-retrieval-only.md          ← Search/retrieve only (no LLM)
+      helm.md                           ← Kubernetes / Helm deployment workflow
+      helm-standard.md                  ← Standard Helm chart deployment
+      helm-mig.md                       ← Multi-Instance GPU deployment
+      library.md                        ← Python library mode workflow
+      library-full.md                   ← Python API + Docker backend
+      library-lite.md                   ← Containerless (Milvus Lite + cloud APIs)
+    configure/
+      vlm.md                            ← VLM, VLM embeddings, image captioning
+      guardrails.md                     ← NeMo Guardrails
+      query-and-conversation.md         ← Query rewriting, decomposition, multi-turn
+      ingestion.md                      ← Text-only, audio, Nemotron Parse, OCR, batch CLI
+      search-and-retrieval.md           ← Hybrid search, multi-collection, metadata, filters
+      models-and-infrastructure.md      ← Model changes, vector DB, auth, API keys, profiles
+      reasoning-and-generation.md       ← Reasoning, self-reflection, prompts, generation params
+      summarization.md                  ← Document summarization during ingestion
+      observability.md                  ← Tracing, Zipkin, Grafana, Prometheus
+      multimodal-query.md               ← Image + text querying with VLM embeddings
+      data-catalog.md                   ← Collection/document metadata management
+      user-interface.md                 ← RAG UI settings and usage
+      api-reference.md                  ← REST API endpoints and schemas
+      evaluation.md                     ← RAGAS quality metrics
+      mcp.md                            ← MCP server & client tools
+      migration.md                      ← Version upgrade guide
+      notebooks.md                      ← Notebook environment and catalog
+    shutdown.md                         ← Stop and tear down services
+    troubleshoot.md                     ← Diagnose and fix common issues
+```
+
+## How It Works
+
+1. User says "deploy RAG" → SKILL.md routes to `references/deploy.md` → env analysis → routes to `deploy/docker.md`, `deploy/helm.md`, or `deploy/library.md`
+2. User says "enable VLM" → SKILL.md routes to `references/configure/vlm.md` → reads `docs/vlm.md` for detailed steps
+3. User says "RAG is broken" → SKILL.md routes to `references/troubleshoot.md` → auto-triage diagnostic sweep
+4. User says "stop RAG" → SKILL.md routes to `references/shutdown.md` → detects and stops all services
+
+## Supported Deployment Modes
+
+Read `docs/support-matrix.md` for current hardware requirements per mode.
+
+| Mode | Docker Required | Description |
+|------|-----------------|-------------|
+| Docker (self-hosted) | Yes | Full on-prem with local NIM inference |
+| Docker (NVIDIA-hosted) | Yes | Cloud APIs for model inference |
+| Docker (retrieval-only) | Yes | No LLM, search/retrieve only |
+| Helm / Kubernetes | No (K8s) | Production K8s with NIM Operator |
+| Library (full) | Yes (backend) | Python API with Docker backend services |
+| Library (lite) | No | Milvus Lite + cloud APIs, zero infrastructure |
+
+## NGC_API_KEY Handling
+
+Skills never expose the API key value to the LLM. The approach:
+
+1. Check if `NGC_API_KEY` is set: `[ -n "$NGC_API_KEY" ] && echo "SET" || echo "NOT_SET"`
+2. If not set, ask the user to run `export NGC_API_KEY="nvapi-your-key"` in the terminal
+3. For `docker login`, the user runs it themselves (the command expands the key)
+4. As a fallback, offer to write a placeholder to `deploy/compose/.env` for the user to replace
+
+## Notebook Integration
+
+All 13 notebooks are referenced from relevant reference files:
+
+| Notebook | Referenced In |
+|----------|--------------|
+| `ingestion_api_usage.ipynb` | `references/configure/ingestion.md` |
+| `retriever_api_usage.ipynb` | `references/configure/search-and-retrieval.md` |
+| `image_input.ipynb` | `references/configure/vlm.md`, `references/configure/multimodal-query.md` |
+| `summarization.ipynb` | `references/configure/summarization.md` |
+| `evaluation_01_ragas.ipynb` | `references/configure/evaluation.md` |
+| `evaluation_02_recall.ipynb` | `references/configure/evaluation.md` |
+| `nb_metadata.ipynb` | `references/configure/search-and-retrieval.md` |
+| `rag_library_usage.ipynb` | `references/deploy/library-full.md` |
+| `rag_library_lite_usage.ipynb` | `references/deploy/library-lite.md` |
+| `building_rag_vdb_operator.ipynb` | `references/configure/models-and-infrastructure.md` |
+| `mcp_server_usage.ipynb` | `references/configure/mcp.md` |
+| `nat_mcp_integration.ipynb` | `references/configure/mcp.md` |
+| `launchable.ipynb` | `SKILL.md` |
diff --git a/src/nvidia_rag/ingestor_server/main.py b/src/nvidia_rag/ingestor_server/main.py
index f28750e38..88df2c957 100644
--- a/src/nvidia_rag/ingestor_server/main.py
+++ b/src/nvidia_rag/ingestor_server/main.py
@@ -94,6 +94,7 @@
 from nvidia_rag.utils.summary_status_handler import SUMMARY_STATUS_HANDLER
 from nvidia_rag.utils.vdb import DEFAULT_DOCUMENT_INFO_COLLECTION, _get_vdb_op
 from nvidia_rag.utils.vdb.vdb_base import VDBRag
+from nvidia_rag.utils.vdb.vdb_ingest_base import SerializedVDBWrapper
 
 # Initialize logger
 logger = logging.getLogger(__name__)
@@ -752,10 +753,13 @@ async def __build_ingestion_response(
         uploaded_documents = []
         for filepath in filepaths:
             if os.path.basename(filepath) not in failures_filepaths:
-                doc_type_counts, _, total_elements, raw_text_elements_size = (
-                    self._get_document_type_counts(
-                        [filename_to_result_map.get(os.path.basename(filepath), [])]
-                    )
+                (
+                    doc_type_counts,
+                    _,
+                    total_elements,
+                    raw_text_elements_size,
+                ) = self._get_document_type_counts(
+                    [filename_to_result_map.get(os.path.basename(filepath), [])]
                 )
 
                 document_info = create_document_metadata(
@@ -2265,6 +2269,13 @@ async def __run_nvingest_batched_ingestion(
                 logger.info(
                     f"Processing batches in parallel with concurrency: {state_manager.concurrent_batches}"
                 )
+
+                if vdb_op is not None and SerializedVDBWrapper is not None:
+                    vdb_op = SerializedVDBWrapper(vdb_op)
+                    logger.info(
+                        "VDB write serialization enabled — extraction runs in parallel, VDB writes are sequential"
+                    )
+
                 all_results = []
                 all_failures = []
                 tasks = []
@@ -2850,9 +2861,12 @@ def _log_result_info(
         Returns:
             dict[str, Any]: Document info with metrics
         """
-        doc_type_counts, total_documents, total_elements, raw_text_elements_size = (
-            self._get_document_type_counts(results)
-        )
+        (
+            doc_type_counts,
+            total_documents,
+            total_elements,
+            raw_text_elements_size,
+        ) = self._get_document_type_counts(results)
 
         document_info = {
             "doc_type_counts": doc_type_counts,
diff --git a/src/nvidia_rag/ingestor_server/nvingest.py b/src/nvidia_rag/ingestor_server/nvingest.py
index 55e218c53..0f02aa1bb 100644
--- a/src/nvidia_rag/ingestor_server/nvingest.py
+++ b/src/nvidia_rag/ingestor_server/nvingest.py
@@ -141,6 +141,8 @@ def get_nv_ingest_ingestor(
             "extract_audio_params": {"segment_audio": config.nv_ingest.segment_audio},
             "extract_page_as_image": config.nv_ingest.extract_page_as_image,
         }
+        if config.nv_ingest.extract_tables_method is not None:
+            extract_kwargs["extract_tables_method"] = config.nv_ingest.extract_tables_method
 
     if remove_extract_method or config.nv_ingest.pdf_extract_method is None:
         extract_kwargs.pop("extract_method", None)
diff --git a/src/nvidia_rag/rag_server/main.py b/src/nvidia_rag/rag_server/main.py
index 044120926..55aaabede 100644
--- a/src/nvidia_rag/rag_server/main.py
+++ b/src/nvidia_rag/rag_server/main.py
@@ -262,7 +262,9 @@ def __init__(
         # Load prompts and other utilities
         self.prompts = get_prompts(prompts)
         self.vdb_top_k = int(self.config.retriever.vdb_top_k)
-        self.StreamingFilterThinkParser = get_streaming_filter_think_parser_async()
+        self.StreamingFilterThinkParser = get_streaming_filter_think_parser_async(
+            enable_thinking=self.config.llm.parameters.enable_thinking
+        )
 
         if self._init_errors:
             logger.warning(
@@ -1368,18 +1370,6 @@ def _handle_prompt_processing(
         conversation_history = []
         user_message = []
 
-        is_nemotron_v1 = str(model).endswith("llama-3.3-nemotron-super-49b-v1")
-
-        # Nemotron controls thinking using system prompt, if nemotron v1 model is used update system prompt to enable/disable think
-        if is_nemotron_v1:
-            logger.info("Nemotron v1 model detected, updating system prompt")
-            if os.environ.get("ENABLE_NEMOTRON_THINKING", "false").lower() == "true":
-                logger.info("Setting system prompt as detailed thinking on")
-                system_prompt = "detailed thinking on"
-            else:
-                logger.info("Setting system prompt as detailed thinking off")
-                system_prompt = "detailed thinking off"
-
         # Process chat history
         for message in chat_history:
             # Overwrite system message if provided in conversation history
@@ -1883,27 +1873,46 @@ def _build_retriever_query_from_content(self, content: Any) -> tuple[str, bool]:
             tuple[str, bool]: Query string that may include base64 image data for VLM embeddings
             bool: True if image URL is provided, False otherwise
         """
+        is_image_query = False
         if isinstance(content, str):
-            return content, False
+            return content, is_image_query
         elif isinstance(content, list):
-            # Build multimodal query with both text and base64 images
-            query_parts = []
-            for item in content:
-                if isinstance(item, dict):
-                    if item.get("type") == "text":
-                        text_content = item.get("text", "").strip()
-                        if text_content:
-                            query_parts.append(text_content)
-                    elif item.get("type") == "image_url":
-                        image_url = item.get("image_url", {}).get("url", "")
-                        if image_url:
-                            # If image URL is provided, return it as is
-                            return image_url, True
-            # If no image URL is provided, return the text content
-            return "\n\n".join(query_parts), False
+            # Build multimodal query with both text and base64 images.
+            
+            # Process text types first, then image_url types.
+            text_items = [
+                item for item in content
+                if isinstance(item, dict) and item.get("type") == "text"
+            ]
+            image_items = [
+                item for item in content
+                if isinstance(item, dict) and item.get("type") == "image_url"
+            ]
+            
+            # Extract text and image parts in separate lists
+            text_parts = []
+            image_parts = []
+            for item in text_items:
+                text_content = item.get("text", "").strip()
+                if text_content:
+                    text_parts.append(text_content)
+            for item in image_items:
+                image_url = item.get("image_url", {}).get("url", "")
+                if image_url:
+                    image_parts.append(image_url)
+                    is_image_query = True
+                    break # only one image is supported
+            
+            text_query = "\n\n".join(text_parts)
+            if image_parts:
+                image_str = " ".join(image_parts)
+                final_query = (text_query + " " + image_str) if text_query else image_str
+            else:
+                final_query = text_query
+            return final_query, is_image_query
         else:
             # Fallback for any other content type
-            return (str(content) if content is not None else ""), False
+            return (str(content) if content is not None else ""), is_image_query
 
     async def _rag_chain(
         self,
diff --git a/src/nvidia_rag/rag_server/prompt.yaml b/src/nvidia_rag/rag_server/prompt.yaml
index f82c83655..d73036509 100644
--- a/src/nvidia_rag/rag_server/prompt.yaml
+++ b/src/nvidia_rag/rag_server/prompt.yaml
@@ -487,6 +487,7 @@ query_decomposition_rag_template:
     Context:
     {context}
 
+    Question: {question}
     Make sure the response you are generating strictly follow the rules mentioned above i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find” and mention about the instruction in response.
 
 image_captioning_prompt:
diff --git a/src/nvidia_rag/rag_server/server.py b/src/nvidia_rag/rag_server/server.py
index dbd5698ed..2ec1db341 100644
--- a/src/nvidia_rag/rag_server/server.py
+++ b/src/nvidia_rag/rag_server/server.py
@@ -741,6 +741,26 @@ def validate_messages_structure(cls, values):
             raise ValueError("The last message must have role='user'")
         return values
 
+    @model_validator(mode="before")
+    @classmethod
+    def derive_query_from_messages(cls, data):
+        """When query is not explicitly provided but messages are, derive query from the last user message."""
+        if isinstance(data, dict) and "query" not in data:
+            messages = data.get("messages", [])
+            for msg in reversed(messages):
+                if (
+                    isinstance(msg, dict)
+                    and msg.get("role") == "user"
+                    and msg.get("content")
+                ):
+                    data["query"] = msg["content"]
+                    break
+            else:
+                raise ValueError(
+                    "Either 'query' must be provided or 'messages' must contain at least one user message with content."
+                )
+        return data
+
 
 # Define the summary response model
 class SummaryResponse(BaseModel):
@@ -1596,6 +1616,11 @@ def sanitize_query_for_logging(query):
 
     request_data = {
         "query": sanitize_query_for_logging(data.query),
+        "messages": [
+            {"role": msg.role, "content": msg.content} for msg in data.messages
+        ]
+        if data.messages
+        else [],
         "reranker_top_k": data.reranker_top_k,
         "vdb_top_k": data.vdb_top_k,
         "collection_names": data.collection_names,
diff --git a/src/nvidia_rag/utils/configuration.py b/src/nvidia_rag/utils/configuration.py
index 9019b7a84..e1c9c1526 100644
--- a/src/nvidia_rag/utils/configuration.py
+++ b/src/nvidia_rag/utils/configuration.py
@@ -302,6 +302,20 @@ def normalize_pdf_extract_method(cls, v: Any) -> Any:
         env="APP_NVINGEST_TEXTDEPTH",
         description="Granularity level for text extraction (page, document)",
     )
+    extract_tables_method: str | None = Field(
+        default=None,
+        env="APP_NVINGEST_EXTRACTTABLESMETHOD",
+        description="Method for table/chart extraction in PDFs (e.g. yolox, nemotron_parse). If None, client default is used.",
+    )
+
+    @field_validator("extract_tables_method", mode="before")
+    @classmethod
+    def normalize_extract_tables_method(cls, v: Any) -> Any:
+        """Normalize the strings 'none'/'null' (any case) and the empty string to Python None."""
+        if isinstance(v, str) and v.lower() in ("none", "null", ""):
+            return None
+        return v
+
     tokenizer: str = Field(
         default="intfloat/e5-large-unsupervised",
         env="APP_NVINGEST_TOKENIZER",
@@ -412,15 +426,30 @@ class ModelParametersConfig(_ConfigBase):
         env="LLM_MIN_TOKENS",
         description="Minimum number of tokens to generate in response",
     )
+    enable_thinking: bool = Field(
+        default=False,
+        env="LLM_ENABLE_THINKING",
+        description="Enable reasoning/thinking mode. Model emits reasoning tokens before the final answer.",
+    )
+    reasoning_budget: int = Field(
+        default=0,
+        env="LLM_REASONING_BUDGET",
+        description="Token budget for reasoning (0 = no budget, model decides depth). Only used when enable_thinking is true.",
+    )
+    low_effort: bool = Field(
+        default=False,
+        env="LLM_LOW_EFFORT",
+        description="Low-effort reasoning mode for faster, cheaper responses with shorter reasoning. Only used when enable_thinking is true.",
+    )
     max_thinking_tokens: int = Field(
         default=0,
         env="LLM_MAX_THINKING_TOKENS",
-        description="Maximum thinking tokens to allocate for reasoning models (0 = disabled by default)",
+        description="Maximum thinking tokens for reasoning models. Used directly by nemotron-nano-9b-v2; for other models acts as an alternative to reasoning_budget (0 = disabled).",
     )
     min_thinking_tokens: int = Field(
         default=0,
         env="LLM_MIN_THINKING_TOKENS",
-        description="Minimum thinking tokens to allocate for reasoning models (0 = disabled by default)",
+        description="Minimum thinking tokens for reasoning models. Only used by nemotron-nano-9b-v2 (0 = disabled).",
     )
     ignore_eos: bool = Field(
         default=False,
@@ -502,6 +531,9 @@ def get_model_parameters(self) -> dict:
             "min_tokens": self.parameters.min_tokens,
             "ignore_eos": self.parameters.ignore_eos,
             "max_tokens": self.parameters.max_tokens,
+            "enable_thinking": self.parameters.enable_thinking,
+            "reasoning_budget": self.parameters.reasoning_budget,
+            "low_effort": self.parameters.low_effort,
             "min_thinking_tokens": self.parameters.min_thinking_tokens,
             "max_thinking_tokens": self.parameters.max_thinking_tokens,
             "temperature": self.parameters.temperature,
@@ -631,7 +663,7 @@ class EmbeddingConfig(_ConfigBase):
     """Embedding configuration."""
 
     model_name: str = Field(
-        default="nvidia/llama-3.2-nv-embedqa-1b-v2",
+        default="nvidia/llama-nemotron-embed-1b-v2",
         env="APP_EMBEDDINGS_MODELNAME",
         description="Model for generating text embeddings",
     )
@@ -671,7 +703,7 @@ class RankingConfig(_ConfigBase):
     """Ranking configuration."""
 
     model_name: str = Field(
-        default="nvidia/llama-3.2-nv-rerankqa-1b-v2",
+        default="nvidia/llama-nemotron-rerank-1b-v2",
         env="APP_RANKING_MODELNAME",
         description="Model for reranking retrieved documents",
     )
diff --git a/src/nvidia_rag/utils/llm.py b/src/nvidia_rag/utils/llm.py
index 639c87d70..e1f226655 100644
--- a/src/nvidia_rag/utils/llm.py
+++ b/src/nvidia_rag/utils/llm.py
@@ -33,6 +33,7 @@
 import yaml
 from langchain_core.language_models.llms import LLM
 from langchain_core.language_models.chat_models import SimpleChatModel
+from langchain_core.messages import AIMessageChunk
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
 
 from nvidia_rag.rag_server.response_generator import APIError, ErrorCodeMapping
@@ -128,117 +129,112 @@ def _is_nvidia_endpoint(url: str | None) -> bool:
     return True
 
 
-def _bind_thinking_tokens_if_configured(
-    llm: LLM | SimpleChatModel, **kwargs
-) -> LLM | SimpleChatModel:
-    """
-    If min_thinking_tokens or max_thinking_tokens are > 0 in kwargs, bind them to the LLM.
-    
-    Supports multiple reasoning/thinking model variants:
-    
-    1. nvidia/nvidia-nemotron-nano-9b-v2:
-       - Uses min_thinking_tokens and max_thinking_tokens parameters
-       - Reasoning content is not available for this model
-    
-    2. nemotron-3-nano variants (nemotron-3-nano-30b-a3b, nvidia/nemotron-3-nano):
-       - Uses reasoning_budget parameter (mapped from max_thinking_tokens)
-       - reasoning_budget is ONLY set when enable_thinking is true
-       - Outputs reasoning in a separate 'reasoning_content' field (not in content)
-       - Does NOT use <think> tags
-       - Can be controlled via ENABLE_NEMOTRON_3_NANO_THINKING env var
-
-    Raises:
-        ValueError: If min_thinking_tokens or max_thinking_tokens is passed but model
-                    is not a supported Nemotron thinking model, or if any of these
-                    parameters have invalid values (0 or negative).
-    """
-    min_think = kwargs.get("min_thinking_tokens", None)
-    max_think = kwargs.get("max_thinking_tokens", None)
-    model = kwargs.get("model", None)
+def _is_nemotron_3(model: str | None) -> bool:
+    """Return True when the lower-cased model name contains 'nemotron-3'; False if unset."""
+    if model:
+        return "nemotron-3" in model.lower()
+    return False
 
-    # Validate model compatibility for thinking tokens
-    has_thinking_tokens = (min_think is not None and min_think > 0) or (
-        max_think is not None and max_think > 0
-    )
 
-    if not has_thinking_tokens:
-        return llm
+def _is_nemotron_3_nano(model: str | None) -> bool:
+    """Return True when the model name contains 'nemotron-3-nano', ignoring letter case."""
+    if model:
+        lowered = model.lower()
+        return "nemotron-3-nano" in lowered
+    return False
 
-    # Check if model is a supported reasoning model (various name formats)
-    # Note: For locally hosted models, use "nvidia/nemotron-3-nano"
-    # For NVIDIA-hosted models, use "nvidia/nemotron-3-nano-30b-a3b"
-    is_nano_9b_v2 = model and "nvidia/nvidia-nemotron-nano-9b-v2" in model
-    is_nemotron_3_nano = model and (
-        "nemotron-3-nano" in model.lower() or 
-        "nvidia/nemotron-3-nano" in model or
-        "nemotron-3-nano-30b-a3b" in model
-    )
-    
-    if has_thinking_tokens and not (is_nano_9b_v2 or is_nemotron_3_nano):
-        raise ValueError(
-            "min_thinking_tokens and max_thinking_tokens are only supported for models "
-            "'nvidia/nvidia-nemotron-nano-9b-v2' and nemotron-3-nano variants "
-            "(e.g., 'nemotron-3-nano-30b-a3b', 'nvidia/nemotron-3-nano'), "
-            f"but got model '{model}'"
-        )
 
-    bind_args = {}
-    if is_nano_9b_v2:
-        # nvidia/nvidia-nemotron-nano-9b-v2: Uses thinking token parameters directly
-        if min_think is not None and min_think > 0:
-            bind_args["min_thinking_tokens"] = min_think
-        else:
-            raise ValueError(
-                f"min_thinking_tokens must be a positive integer, but got {min_think}"
-            )
-        if max_think is not None and max_think > 0:
-            bind_args["max_thinking_tokens"] = max_think
-        else:
-            raise ValueError(
-                f"max_thinking_tokens must be a positive integer, but got {max_think}"
-            )
-        logger.info(
-            "nvidia-nemotron-nano-9b-v2: Setting min_thinking_tokens=%d, max_thinking_tokens=%d",
-            min_think, max_think
+def _is_nemotron_nano_9b_v2(model: str | None) -> bool:
+    """Detect legacy Nemotron Nano 9B v2 via a case-insensitive substring match on the name."""
+    if not model:
+        return False
+    return "nvidia/nvidia-nemotron-nano-9b-v2" in model.lower()
+
+
+def _resolve_enable_thinking(config: NvidiaRAGConfig | None = None, **kwargs) -> bool:
+    """Return True if config or kwargs enable thinking (a falsy kwarg cannot disable it); else use the deprecated env var, else False."""
+    if config is not None:
+        enable = config.llm.parameters.enable_thinking
+        if enable:
+            return True
+    enable = kwargs.get("enable_thinking", False)
+    if enable:
+        return True
+    deprecated = os.getenv("ENABLE_NEMOTRON_3_NANO_THINKING")
+    if deprecated is not None:
+        logger.warning(
+            "ENABLE_NEMOTRON_3_NANO_THINKING is deprecated, use LLM_ENABLE_THINKING instead"
         )
-    elif is_nemotron_3_nano:
-        enable_thinking = os.getenv("ENABLE_NEMOTRON_3_NANO_THINKING", "true").lower() == "true"
-        if not enable_thinking:
-            raise ValueError(
-                "ENABLE_NEMOTRON_3_NANO_THINKING must be set to 'true' to use reasoning budget"
-            )
+        return deprecated.lower() == "true"
+    return False
 
-        # For nemotron-3-nano variants, min_thinking_tokens is not supported
-        if min_think is not None and min_think > 0:
-            logger.warning(
-                "min_thinking_tokens is not supported for nemotron-3-nano variants, "
-                "only max_thinking_tokens (mapped to reasoning_budget or nvext) is supported"
-            )
 
-        if max_think is not None and max_think > 0:
-            # Check if llm_endpoint is provided (locally hosted model)
-            llm_endpoint = kwargs.get("llm_endpoint", None)
+def _bind_reasoning_config(
+    llm: LLM | SimpleChatModel, config: NvidiaRAGConfig | None = None, **kwargs
+) -> LLM | SimpleChatModel:
+    """
+    Bind reasoning parameters to the LLM based on model type and configuration.
+
+    Reads enable_thinking, reasoning_budget, and low_effort from the config
+    object (LLM_ENABLE_THINKING, LLM_REASONING_BUDGET, LLM_LOW_EFFORT env vars).
+    Truthy kwargs override these for backward compatibility; falsy or missing kwargs defer to config.
+
+    Supports:
+    - Nemotron 3 variants: enable_thinking, reasoning_budget, low_effort via chat_template_kwargs
+    - Nemotron 3 Nano: enable_thinking + reasoning_budget (or nvext for local NIM)
+    - Nemotron Nano 9B v2: legacy min_thinking_tokens / max_thinking_tokens
+    - Other models: no reasoning features bound
+    """
+    model = kwargs.get("model", "")
+    enable_thinking = _resolve_enable_thinking(config=config, **kwargs)
+    params = config.llm.parameters if config is not None else None
+    reasoning_budget = kwargs.get("reasoning_budget") or (params.reasoning_budget if params else 0)
+    low_effort = kwargs.get("low_effort") or (params.low_effort if params else False)
+    min_think = kwargs.get("min_thinking_tokens") or (params.min_thinking_tokens if params else 0) or 0
+    max_think = kwargs.get("max_thinking_tokens") or (params.max_thinking_tokens if params else 0) or 0
+
+    # Check specific variants first, then fall through to the general nemotron-3 check
+
+    if _is_nemotron_3_nano(model):
+        llm = llm.bind(chat_template_kwargs={"enable_thinking": enable_thinking})
+        if enable_thinking and (reasoning_budget > 0 or max_think > 0):
+            budget = reasoning_budget if reasoning_budget > 0 else max_think
+            llm_endpoint = kwargs.get("llm_endpoint", "")
             if llm_endpoint:
-                # For locally hosted models, use nvext syntax
-                bind_args["nvext"] = {"max_thinking_tokens": max_think}
-                logger.info(
-                    "nemotron-3-nano (locally hosted): Setting max_thinking_tokens=%d via nvext",
-                    max_think
-                )
+                llm = llm.bind(nvext={"max_thinking_tokens": budget})
+                logger.info("nemotron-3-nano (local): enable_thinking=%s, nvext.max_thinking_tokens=%d", enable_thinking, budget)
             else:
-                # For API catalog models, use reasoning_budget
-                bind_args["reasoning_budget"] = max_think
-                logger.info(
-                    "nemotron-3-nano (API catalog): Setting reasoning_budget=%d",
-                    max_think
-                )
+                llm = llm.bind(reasoning_budget=budget)
+                logger.info("nemotron-3-nano (API): enable_thinking=%s, reasoning_budget=%d", enable_thinking, budget)
         else:
+            logger.info("nemotron-3-nano: enable_thinking=%s", enable_thinking)
+        return llm
+
+    if _is_nemotron_nano_9b_v2(model):
+        if min_think > 0 and max_think > 0:
+            llm = llm.bind(min_thinking_tokens=min_think, max_thinking_tokens=max_think)
+            logger.info("nemotron-nano-9b-v2: min_thinking_tokens=%d, max_thinking_tokens=%d", min_think, max_think)
+        elif min_think > 0 or max_think > 0:
             raise ValueError(
-                f"max_thinking_tokens must be a positive integer, but got {max_think}"
+                "nemotron-nano-9b-v2 requires both min_thinking_tokens and max_thinking_tokens "
+                f"to be positive, got min={min_think}, max={max_think}"
             )
+        return llm
+
+    if _is_nemotron_3(model):
+        template_kwargs: dict = {"enable_thinking": enable_thinking}
+        if enable_thinking and low_effort:
+            template_kwargs["low_effort"] = True
+        budget = reasoning_budget if reasoning_budget > 0 else max_think
+        if enable_thinking and budget > 0:
+            template_kwargs["reasoning_budget"] = budget
+        llm = llm.bind(chat_template_kwargs=template_kwargs)
+        logger.info(
+            "nemotron-3: enable_thinking=%s, reasoning_budget=%d, low_effort=%s",
+            enable_thinking, budget, low_effort,
+        )
+        return llm
 
-    if bind_args:
-        return llm.bind(**bind_args)
     return llm
 
 
@@ -289,16 +285,18 @@ def get_llm(config: NvidiaRAGConfig | None = None, **kwargs) -> LLM | SimpleChat
                     default_headers = {**NVIDIA_API_DEFAULT_HEADERS}
                     if api_key:
                         default_headers["X-Model-Authorization"] = api_key
-                    return ChatOpenAI(
-                        model_name=kwargs.get("model"),
-                        openai_api_base=f"{guardrails_url}/v1/guardrail",
-                        openai_api_key="dummy-value",
-                        default_headers=default_headers,
-                        temperature=kwargs.get("temperature", None),
-                        top_p=kwargs.get("top_p", None),
-                        max_tokens=kwargs.get("max_tokens", None),
-                        stop=kwargs.get("stop", []),
-                    )
+                    openai_kwargs = {
+                        "model_name": kwargs.get("model"),
+                        "openai_api_base": f"{guardrails_url}/v1/guardrail",
+                        "openai_api_key": "dummy-value",
+                        "default_headers": default_headers,
+                        "temperature": kwargs.get("temperature", None),
+                        "top_p": kwargs.get("top_p", None),
+                        "max_tokens": kwargs.get("max_tokens", None),
+                    }
+                    if kwargs.get("stop"):
+                        openai_kwargs["stop"] = kwargs["stop"]
+                    return ChatOpenAI(**openai_kwargs)
                 except (requests.RequestException, requests.ConnectionError) as e:
                     error_msg = f"Guardrails NIM unavailable at {guardrails_url}. Please verify the service is running and accessible."
                     logger.exception(
@@ -318,13 +316,15 @@ def get_llm(config: NvidiaRAGConfig | None = None, **kwargs) -> LLM | SimpleChat
 
             # Build kwargs dict, only including parameters that are set
             # For non-NVIDIA endpoints, exclude NVIDIA-specific parameters
+            # Do not pass stop=[] - some Nemotron 3 APIs reject empty stop arrays
             chat_nvidia_kwargs = {
                 "base_url": url,
                 "model": kwargs.get("model"),
                 "api_key": api_key,
-                "stop": kwargs.get("stop", []),
                 "default_headers": NVIDIA_API_DEFAULT_HEADERS,
             }
+            if kwargs.get("stop"):
+                chat_nvidia_kwargs["stop"] = kwargs["stop"]
             if kwargs.get("temperature") is not None:
                 chat_nvidia_kwargs["temperature"] = kwargs["temperature"]
             if kwargs.get("top_p") is not None:
@@ -342,15 +342,8 @@ def get_llm(config: NvidiaRAGConfig | None = None, **kwargs) -> LLM | SimpleChat
                     chat_nvidia_kwargs["model_kwargs"] = model_kwargs
 
             llm = ChatNVIDIA(**chat_nvidia_kwargs)
-            # Only bind thinking tokens for NVIDIA endpoints
             if is_nvidia:
-                llm = _bind_thinking_tokens_if_configured(llm, **kwargs)
-                # For nemotron-3-nano models, set enable_thinking from env var
-                model = kwargs.get("model")
-                if model and ("nemotron-3-nano" in model.lower() or "nvidia/nemotron-3-nano" in model or "nemotron-3-nano-30b-a3b" in model):
-                    enable_thinking = os.getenv("ENABLE_NEMOTRON_3_NANO_THINKING", "true").lower() == "true"
-                    llm = llm.bind(chat_template_kwargs={"enable_thinking": enable_thinking})
-                    logger.info("nemotron-3-nano: Setting enable_thinking=%s (from ENABLE_NEMOTRON_3_NANO_THINKING)", enable_thinking)
+                llm = _bind_reasoning_config(llm, config=config, **kwargs)
             return llm
 
         logger.debug("Using llm model %s from api catalog", kwargs.get("model"))
@@ -363,23 +356,20 @@ def get_llm(config: NvidiaRAGConfig | None = None, **kwargs) -> LLM | SimpleChat
         if kwargs.get("ignore_eos") is not None:
             model_kwargs["ignore_eos"] = kwargs["ignore_eos"]
 
-        llm = ChatNVIDIA(
-            model=kwargs.get("model"),
-            api_key=api_key,
-            temperature=kwargs.get("temperature", None),
-            top_p=kwargs.get("top_p", None),
-            max_completion_tokens=kwargs.get("max_tokens", None),
-            stop=kwargs.get("stop", []),
-            default_headers=NVIDIA_API_DEFAULT_HEADERS,
+        # Do not pass stop=[] - some Nemotron 3 APIs reject empty stop arrays
+        chat_nvidia_kwargs = {
+            "model": kwargs.get("model"),
+            "api_key": api_key,
+            "temperature": kwargs.get("temperature", None),
+            "top_p": kwargs.get("top_p", None),
+            "max_completion_tokens": kwargs.get("max_tokens", None),
+            "default_headers": NVIDIA_API_DEFAULT_HEADERS,
             **({"model_kwargs": model_kwargs} if model_kwargs else {}),
-        )
-        llm = _bind_thinking_tokens_if_configured(llm, **kwargs)
-        # For nemotron-3-nano models, set enable_thinking from env var
-        model = kwargs.get("model")
-        if model and ("nemotron-3-nano" in model.lower() or "nvidia/nemotron-3-nano" in model or "nemotron-3-nano-30b-a3b" in model):
-            enable_thinking = os.getenv("ENABLE_NEMOTRON_3_NANO_THINKING", "true").lower() == "true"
-            llm = llm.bind(chat_template_kwargs={"enable_thinking": enable_thinking})
-            logger.info("nemotron-3-nano: Setting enable_thinking=%s (from ENABLE_NEMOTRON_3_NANO_THINKING)", enable_thinking)
+        }
+        if kwargs.get("stop"):
+            chat_nvidia_kwargs["stop"] = kwargs["stop"]
+        llm = ChatNVIDIA(**chat_nvidia_kwargs)
+        llm = _bind_reasoning_config(llm, config=config, **kwargs)
         return llm
 
     raise RuntimeError(
@@ -450,6 +440,9 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
     This generator filters content between think tags in streaming LLM responses.
     It handles both complete tags in a single chunk and tags split across multiple tokens.
 
+    When DEBUG logging is enabled (i.e. LOGLEVEL=DEBUG), reasoning tokens are
+    logged from <think> block content or reasoning_content field.
+
     Args:
         chunks (Iterable[str]): Chunks from a streaming LLM response
 
@@ -474,12 +467,19 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
     match_position = 0
     buffer = ""
     output_buffer = ""
+    think_accumulator = ""
+    reasoning_content_accumulator = ""
     chunk_count = 0
 
     for chunk in chunks:
-        content = chunk.content
+        reasoning, content = extract_reasoning_and_content(chunk)
+        content = content or reasoning
         chunk_count += 1
 
+        # Accumulate reasoning tokens when DEBUG logging is enabled (e.g. reasoning_content from nemotron-3-nano)
+        if reasoning and logger.isEnabledFor(logging.DEBUG):
+            reasoning_content_accumulator += reasoning
+
         # Let's first check for full tags - this is the most reliable approach
         buffer += content
 
@@ -496,6 +496,10 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
 
         while state == IN_THINK and FULL_END_TAG in buffer:
             end_idx = buffer.find(FULL_END_TAG)
+            if logger.isEnabledFor(logging.DEBUG):
+                think_content = buffer[:end_idx]
+                if think_content:
+                    think_accumulator += think_content + "\n"
             # Discard everything up to and including end tag
             buffer = buffer[end_idx + len(FULL_END_TAG) :]
             content = buffer
@@ -543,10 +547,14 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
 
         elif state == IN_THINK:
             if content_stripped == END_TAG_PARTS[0].strip():
+                # Accumulate think content before the end tag start
+                think_accumulator += buffer[: -len(content)] if content else buffer
                 state = MATCHING_END
                 match_position = 1
                 buffer = content  # Keep this token in buffer
             else:
+                if logger.isEnabledFor(logging.DEBUG):
+                    think_accumulator += buffer
                 buffer = ""  # Discard content inside think block
 
         elif state == MATCHING_END:
@@ -555,11 +563,15 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
                 match_position += 1
                 if match_position >= len(END_TAG_PARTS):
                     # Complete end tag matched
+                    if think_accumulator and logger.isEnabledFor(logging.DEBUG):
+                        think_accumulator += "\n"
                     state = NORMAL
                     match_position = 0
                     buffer = ""  # Clear buffer
             else:
                 # False match, revert to IN_THINK
+                if logger.isEnabledFor(logging.DEBUG):
+                    think_accumulator += buffer
                 state = IN_THINK
                 buffer = ""  # Discard content
 
@@ -581,6 +593,11 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
         if output_buffer:
             yield output_buffer
 
+    if think_accumulator and logger.isEnabledFor(logging.DEBUG):
+        logger.debug("Reasoning tokens (think): %s", think_accumulator.rstrip())
+    if reasoning_content_accumulator and logger.isEnabledFor(logging.DEBUG):
+        logger.debug("Reasoning tokens: %s", reasoning_content_accumulator)
+
     logger.info(
         "Finished streaming_filter_think processing after %d chunks", chunk_count
     )
@@ -611,14 +628,23 @@ def get_streaming_filter_think_parser():
         return RunnablePassthrough()
 
 
-async def streaming_filter_think_async(chunks):
+async def streaming_filter_think_async(chunks, enable_thinking: bool = False):
     """
     Async version of streaming_filter_think.
     This async generator filters content between think tags in streaming LLM responses.
     It handles both complete tags in a single chunk and tags split across multiple tokens.
 
+    When DEBUG logging is enabled (i.e. LOGLEVEL=DEBUG), reasoning tokens are
+    logged from <think> block content or reasoning_content field.
+
+    When enable_thinking is True and the model uses a separate reasoning_content field
+    (e.g. Nemotron 3), reasoning tokens are dropped and only content is forwarded.
+    The <think> tag filter still runs to handle models that embed reasoning in content.
+
     Args:
         chunks: Async iterable of chunks from a streaming LLM response
+        enable_thinking: When True, drop reasoning_content (genuine chain-of-thought).
+            When False, fall back to reasoning_content if content is empty (model quirk).
 
     Yields:
         str: Filtered content with think blocks removed
@@ -641,12 +667,19 @@ async def streaming_filter_think_async(chunks):
     match_position = 0
     buffer = ""
     output_buffer = ""
+    think_accumulator = ""
+    reasoning_content_accumulator = ""
     chunk_count = 0
 
     async for chunk in chunks:
-        content = chunk.content
+        reasoning, content = extract_reasoning_and_content(chunk)
+        content = content if enable_thinking else (content or reasoning)
         chunk_count += 1
 
+        # Accumulate reasoning when DEBUG logging is enabled (e.g. reasoning_content from nemotron-3-nano)
+        if reasoning and logger.isEnabledFor(logging.DEBUG):
+            reasoning_content_accumulator += reasoning
+
         # Let's first check for full tags - this is the most reliable approach
         buffer += content
 
@@ -663,6 +696,10 @@ async def streaming_filter_think_async(chunks):
 
         while state == IN_THINK and FULL_END_TAG in buffer:
             end_idx = buffer.find(FULL_END_TAG)
+            if logger.isEnabledFor(logging.DEBUG):
+                think_content = buffer[:end_idx]
+                if think_content:
+                    think_accumulator += think_content + "\n"
             # Discard everything up to and including end tag
             buffer = buffer[end_idx + len(FULL_END_TAG) :]
             content = buffer
@@ -710,10 +747,14 @@ async def streaming_filter_think_async(chunks):
 
         elif state == IN_THINK:
             if content_stripped == END_TAG_PARTS[0].strip():
+                # Accumulate think content before the end tag start
+                think_accumulator += buffer[: -len(content)] if content else buffer
                 state = MATCHING_END
                 match_position = 1
                 buffer = content  # Keep this token in buffer
             else:
+                if logger.isEnabledFor(logging.DEBUG):
+                    think_accumulator += buffer
                 buffer = ""  # Discard content inside think block
 
         elif state == MATCHING_END:
@@ -722,11 +763,15 @@ async def streaming_filter_think_async(chunks):
                 match_position += 1
                 if match_position >= len(END_TAG_PARTS):
                     # Complete end tag matched
+                    if think_accumulator and logger.isEnabledFor(logging.DEBUG):
+                        think_accumulator += "\n"
                     state = NORMAL
                     match_position = 0
                     buffer = ""  # Clear buffer
             else:
                 # False match, revert to IN_THINK
+                if logger.isEnabledFor(logging.DEBUG):
+                    think_accumulator += buffer
                 state = IN_THINK
                 buffer = ""  # Discard content
 
@@ -748,32 +793,71 @@ async def streaming_filter_think_async(chunks):
         if output_buffer:
             yield output_buffer
 
+    if think_accumulator and logger.isEnabledFor(logging.DEBUG):
+        logger.debug("Reasoning tokens: %s", think_accumulator.rstrip())
+    if reasoning_content_accumulator and logger.isEnabledFor(logging.DEBUG):
+        logger.debug("Reasoning tokens: %s", reasoning_content_accumulator)
+
     logger.info(
         "Finished streaming_filter_think_async processing after %d chunks", chunk_count
     )
 
 
-def get_streaming_filter_think_parser_async():
+async def _content_fallback_async(chunks, enable_thinking: bool = False):
+    """
+    Re-emit LLM chunks as plain AIMessageChunk objects without stripping reasoning.
+    This is the parser used when FILTER_THINK_TOKENS is not "true".
+
+    With enable_thinking set, a chunk's reasoning_content (if any) is yielded
+    first and its content (if any) second, each as its own AIMessageChunk.
+    Without it, one chunk is yielded holding content, or reasoning_content when
+    content is empty; chunks where both are empty produce nothing.
+
+    Args:
+        chunks: Async iterable of LLM response chunks
+        enable_thinking: Selects forwarding both fields (True) or the fallback (False).
+    """
+    async for item in chunks:
+        thoughts, answer = extract_reasoning_and_content(item)
+
+        if enable_thinking:
+            # Reasoning goes out ahead of the answer text; empty parts are skipped.
+            pieces = [part for part in (thoughts, answer) if part]
+        else:
+            # Single piece: the answer, or the reasoning text standing in for it.
+            fallback = answer or thoughts
+            pieces = [fallback] if fallback else []
+        for piece in pieces:
+            yield AIMessageChunk(content=piece)
+
+
+def get_streaming_filter_think_parser_async(enable_thinking: bool = False):
     """
     Creates and returns an async RunnableGenerator for filtering think tokens.
 
     If FILTER_THINK_TOKENS environment variable is set to "true" (case-insensitive),
     returns a parser that filters out content between <think> and </think> tags.
-    Otherwise, returns a pass-through parser that doesn't modify the content.
+    Otherwise, returns a parser that forwards unfiltered text, falling back to
+    reasoning_content so models like Nemotron 3 that put the reply there still yield text.
+
+    Args:
+        enable_thinking: When True, reasoning_content is genuine chain-of-thought: it is
+            dropped when filtering is enabled and forwarded before content when it is not.
+            When False, reasoning_content is only a fallback for chunks with empty content.
 
     Returns:
-        RunnableGenerator: An async parser for filtering (or not filtering) think tokens
+        RunnableGenerator: An async parser for filtering or content normalization
     """
+    from functools import partial
     from langchain_core.runnables import RunnableGenerator, RunnablePassthrough
 
     # Check environment variable
     filter_enabled = os.getenv("FILTER_THINK_TOKENS", "true").lower() == "true"
 
     if filter_enabled:
-        logger.info("Think token filtering is enabled (async)")
-        return RunnableGenerator(streaming_filter_think_async)
+        logger.info("Think token filtering is enabled (async), enable_thinking=%s", enable_thinking)
+        return RunnableGenerator(partial(streaming_filter_think_async, enable_thinking=enable_thinking))
     else:
-        logger.info("Think token filtering is disabled (async)")
-        # If filtering is disabled, use a passthrough that passes content as-is
-        return RunnablePassthrough()
+        logger.info("Think token filtering is disabled (async), enable_thinking=%s", enable_thinking)
+        return RunnableGenerator(partial(_content_fallback_async, enable_thinking=enable_thinking))
         
\ No newline at end of file
diff --git a/src/nvidia_rag/utils/metadata_validation.py b/src/nvidia_rag/utils/metadata_validation.py
index e1bdef417..3ff404483 100644
--- a/src/nvidia_rag/utils/metadata_validation.py
+++ b/src/nvidia_rag/utils/metadata_validation.py
@@ -2164,7 +2164,10 @@ def comparison(self, args) -> str:
                         logger.debug(f"[comparison] Failed to normalize datetime: {e}")
                         value_val = str(value_token)
                 elif field_info and is_string_type(field_info.type):
-                    value_val = str(value_token).lower()
+                    if field_name == "filename":
+                        value_val = str(value_token)
+                    else:
+                        value_val = str(value_token).lower()
                 else:
                     value_val = str(value_token)
             else:
@@ -2677,15 +2680,15 @@ def _get_error_context(filter_expr: str, error: UnexpectedInput) -> str:
             error_msg = (
                 f"Syntax error at line {line_num}, column {col_num}: '{snippet}'"
             )
-            error_msg += "\n\nExamples of valid filter expressions:"
-            error_msg += "\n• content_metadata[\"title\"] == 'value'"
-            error_msg += "\n• content_metadata[\"title\"] = 'value'"
+            error_msg += "\n\nExamples of valid filter expressions (use double quotes for string values):"
+            error_msg += '\n• content_metadata["title"] == "value"'
+            error_msg += '\n• content_metadata["title"] = "value"'
             error_msg += '\n• content_metadata["rating"] > 5'
-            error_msg += "\n• content_metadata[\"category\"] like '%tech%'"
-            error_msg += "\n• content_metadata[\"tags\"] in ['important', 'urgent']"
-            error_msg += "\n• content_metadata[\"created_date\"] between '2024-01-01' and '2024-12-31'"
+            error_msg += '\n• content_metadata["category"] like "%tech%"'
+            error_msg += '\n• content_metadata["tags"] in ["important", "urgent"]'
+            error_msg += '\n• content_metadata["created_date"] between "2024-01-01" and "2024-12-31"'
             error_msg += '\n• content_metadata["is_public"] == true'
-            error_msg += '\n• content_metadata["file_size"] > 1000 and content_metadata["type"] == \'pdf\''
+            error_msg += '\n• content_metadata["file_size"] > 1000 and content_metadata["type"] == "pdf"'
 
             return error_msg
 
diff --git a/src/nvidia_rag/utils/observability/langchain_callback_handler.py b/src/nvidia_rag/utils/observability/langchain_callback_handler.py
index dcdd7ba23..a35a71bab 100644
--- a/src/nvidia_rag/utils/observability/langchain_callback_handler.py
+++ b/src/nvidia_rag/utils/observability/langchain_callback_handler.py
@@ -40,6 +40,17 @@
 
 from .otel_metrics import OtelMetrics
 
+# Hardcoded attribute keys (replacing deprecated SpanAttributes constants)
+GEN_AI_PROMPTS = "gen_ai.prompt"
+GEN_AI_COMPLETIONS = "gen_ai.completion"
+LLM_REQUEST_MODEL = "gen_ai.request.model"
+LLM_RESPONSE_MODEL = "gen_ai.response.model"
+# Missing in opentelemetry.semconv_ai SpanAttributes (use llm.* to match existing semconv)
+LLM_REQUEST_MAX_TOKENS = "llm.request.max_tokens"
+LLM_REQUEST_TEMPERATURE = "llm.request.temperature"
+LLM_REQUEST_TOP_P = "llm.request.top_p"
+LLM_SYSTEM = "llm.system"
+
 
 class Config:
     exception_logger = None
@@ -137,9 +148,9 @@ def _set_request_params(span, kwargs, span_holder: SpanHolder):
     else:
         model = "unknown"
 
-    span.set_attribute(SpanAttributes.LLM_REQUEST_MODEL, model)
+    span.set_attribute(LLM_REQUEST_MODEL, model)
     # response is not available for LLM requests (as opposed to chat)
-    span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, model)
+    span.set_attribute(LLM_RESPONSE_MODEL, model)
 
     if "invocation_params" in kwargs:
         params = (
@@ -150,13 +161,11 @@ def _set_request_params(span, kwargs, span_holder: SpanHolder):
 
     _set_span_attribute(
         span,
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS,
+        LLM_REQUEST_MAX_TOKENS,
         params.get("max_tokens") or params.get("max_new_tokens"),
     )
-    _set_span_attribute(
-        span, SpanAttributes.LLM_REQUEST_TEMPERATURE, params.get("temperature")
-    )
-    _set_span_attribute(span, SpanAttributes.LLM_REQUEST_TOP_P, params.get("top_p"))
+    _set_span_attribute(span, LLM_REQUEST_TEMPERATURE, params.get("temperature"))
+    _set_span_attribute(span, LLM_REQUEST_TOP_P, params.get("top_p"))
 
 
 def _set_llm_request(
@@ -171,11 +180,11 @@ def _set_llm_request(
     if should_send_prompts():
         for i, msg in enumerate(prompts):
             span.set_attribute(
-                f"{SpanAttributes.LLM_PROMPTS}.{i}.role",
+                f"{GEN_AI_PROMPTS}.{i}.role",
                 "user",
             )
             span.set_attribute(
-                f"{SpanAttributes.LLM_PROMPTS}.{i}.content",
+                f"{GEN_AI_PROMPTS}.{i}.content",
                 msg,
             )
 
@@ -207,18 +216,18 @@ def _set_chat_request(
         for message in messages:
             for msg in message:
                 span.set_attribute(
-                    f"{SpanAttributes.LLM_PROMPTS}.{i}.role",
+                    f"{GEN_AI_PROMPTS}.{i}.role",
                     _message_type_to_role(msg.type),
                 )
                 # if msg.content is string
                 if isinstance(msg.content, str):
                     span.set_attribute(
-                        f"{SpanAttributes.LLM_PROMPTS}.{i}.content",
+                        f"{GEN_AI_PROMPTS}.{i}.content",
                         msg.content,
                     )
                 else:
                     span.set_attribute(
-                        f"{SpanAttributes.LLM_PROMPTS}.{i}.content",
+                        f"{GEN_AI_PROMPTS}.{i}.content",
                         json.dumps(msg.content, cls=CallbackFilteredJSONEncoder),
                     )
                 i += 1
@@ -252,7 +261,7 @@ def _set_chat_response(span: Span, response: LLMResult) -> None:
                 )
                 total_tokens = input_tokens + output_tokens
 
-            prefix = f"{SpanAttributes.LLM_COMPLETIONS}.{i}"
+            prefix = f"{GEN_AI_COMPLETIONS}.{i}"
             if hasattr(generation, "text") and generation.text != "":
                 span.set_attribute(
                     f"{prefix}.content",
@@ -317,11 +326,11 @@ def _set_chat_response(span: Span, response: LLMResult) -> None:
 
     if input_tokens > 0 or output_tokens > 0 or total_tokens > 0:
         span.set_attribute(
-            SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
+            "gen_ai.usage.input_tokens",
             input_tokens,
         )
         span.set_attribute(
-            SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
+            "gen_ai.usage.output_tokens",
             output_tokens,
         )
         span.set_attribute(
@@ -462,7 +471,7 @@ def _create_llm_span(
             entity_path=entity_path,
             metadata=metadata,
         )
-        span.set_attribute(SpanAttributes.LLM_SYSTEM, "Langchain")
+        span.set_attribute(LLM_SYSTEM, "Langchain")
         span.set_attribute(SpanAttributes.LLM_REQUEST_TYPE, request_type.value)
 
         return span
@@ -650,10 +659,10 @@ def on_llm_end(
                 "model_name"
             ) or response.llm_output.get("model_id")
             if model_name is not None:
-                span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, model_name)
+                span.set_attribute(LLM_RESPONSE_MODEL, model_name)
 
                 if self.spans[run_id].request_model is None:
-                    span.set_attribute(SpanAttributes.LLM_REQUEST_MODEL, model_name)
+                    span.set_attribute(LLM_REQUEST_MODEL, model_name)
 
         token_usage = (response.llm_output or {}).get("token_usage") or (
             response.llm_output or {}
@@ -673,12 +682,8 @@ def on_llm_end(
                 prompt_tokens + completion_tokens
             )
 
-            _set_span_attribute(
-                span, SpanAttributes.LLM_USAGE_PROMPT_TOKENS, prompt_tokens
-            )
-            _set_span_attribute(
-                span, SpanAttributes.LLM_USAGE_COMPLETION_TOKENS, completion_tokens
-            )
+            _set_span_attribute(span, "gen_ai.usage.input_tokens", prompt_tokens)
+            _set_span_attribute(span, "gen_ai.usage.output_tokens", completion_tokens)
             _set_span_attribute(
                 span, SpanAttributes.LLM_USAGE_TOTAL_TOKENS, total_tokens
             )
diff --git a/src/nvidia_rag/utils/vdb/milvus/milvus_vdb.py b/src/nvidia_rag/utils/vdb/milvus/milvus_vdb.py
index 3b40dc288..51528ed6b 100644
--- a/src/nvidia_rag/utils/vdb/milvus/milvus_vdb.py
+++ b/src/nvidia_rag/utils/vdb/milvus/milvus_vdb.py
@@ -617,6 +617,21 @@ def get_documents(self, collection_name: str) -> list[dict[str, Any]]:
         )
         return documents_list
 
+    @staticmethod
+    def _escape_milvus_string_literal(value: str) -> str:
+        """Return ``value`` escaped for embedding in a single-quoted literal of
+        a Milvus boolean expression (e.g. ``field == '<value>'``).
+
+        Non-string input is converted with ``str()`` first. Backslashes are
+        doubled before single quotes are backslash-escaped; in that order the
+        backslash added in front of a quote is not doubled again, so a
+        user-controlled value cannot close the surrounding ``'...'`` literal
+        early (filter-expression injection, CWE-89-class).
+        """
+        text = value if isinstance(value, str) else str(value)
+        escaped_backslashes = text.replace("\\", "\\\\")
+        return escaped_backslashes.replace("'", "\\'")
+
     def delete_documents(
         self,
         collection_name: str,
@@ -634,15 +649,26 @@ def delete_documents(
 
         for source_value in source_values:
             doc_name = os.path.basename(source_value)
+            # Escape string literals before interpolating into Milvus filter
+            # expressions to prevent filter-expression injection via document
+            # names that contain single quotes or backslashes.
+            escaped_source_value = self._escape_milvus_string_literal(source_value)
+            escaped_doc_name = self._escape_milvus_string_literal(doc_name)
             logger.info(
                 f"Deleting document {source_value} from collection "
                 f"{collection_name} at {self.vdb_endpoint}"
             )
             try:
-                resp = collection.delete(f"source['source_name'] == '{source_value}'")
+                resp = collection.delete(
+                    f"source['source_name'] == '{escaped_source_value}'"
+                )
                 self._delete_entities(
                     collection_name=DEFAULT_DOCUMENT_INFO_COLLECTION,
-                    filter=f"info_type == 'document' and collection_name == '{collection_name}' and document_name == '{doc_name}'",
+                    filter=(
+                        f"info_type == 'document' and collection_name == "
+                        f"'{collection_name}' and document_name == "
+                        f"'{escaped_doc_name}'"
+                    ),
                 )
             except MilvusException:
                 # Fallback to legacy source field format
@@ -650,7 +676,7 @@ def delete_documents(
                     f"Failed to delete document {source_value}, source name might be "
                     "available in the source field"
                 )
-                resp = collection.delete(f"source == '{source_value}'")
+                resp = collection.delete(f"source == '{escaped_source_value}'")
 
             if result_dict is not None:
                 if resp.delete_count == 0:
diff --git a/src/nvidia_rag/utils/vdb/vdb_ingest_base.py b/src/nvidia_rag/utils/vdb/vdb_ingest_base.py
index 190d69ad4..bbbb61911 100644
--- a/src/nvidia_rag/utils/vdb/vdb_ingest_base.py
+++ b/src/nvidia_rag/utils/vdb/vdb_ingest_base.py
@@ -14,7 +14,8 @@
 # limitations under the License.
 
 """
-This module provides VDBRagIngest, a VDBRag subclass with nv_ingest support.
+This module provides VDBRagIngest, a VDBRag subclass with nv_ingest support,
+and SerializedVDBWrapper for serializing concurrent VDB write operations.
 
 VDBRagIngest combines VDBRag (pure abstract base) with VDB from nv_ingest_client,
 providing full ingestion capabilities. This class should be used by ingestor_server
@@ -25,6 +26,7 @@
 """
 
 import logging
+import threading
 
 from nvidia_rag.utils.vdb.vdb_base import VDBRag
 
@@ -52,6 +54,40 @@ class VDBRagIngest(VDBRag, VDB):
 
         pass
 
+    class SerializedVDBWrapper:
+        """Wraps a VDB op to serialize write operations while keeping reads parallel.
+
+        Overlapping VDB writes from concurrent batches can cause indexing timeouts
+        (e.g., GPU_CAGRA JIT compilation outlasting the client's patience window),
+        so the write methods below share one lock; other attributes pass through.
+        """
+
+        def __init__(self, vdb_op):
+            self._vdb_op = vdb_op
+            self._write_lock = threading.Lock()
+
+        def run_async(self, records):
+            with self._write_lock:
+                return self._vdb_op.run_async(records)
+
+        def run(self, records):
+            with self._write_lock:
+                return self._vdb_op.run(records)
+
+        def write_to_index(self, records, **kwargs):
+            with self._write_lock:
+                return self._vdb_op.write_to_index(records, **kwargs)
+
+        def create_index(self, **kwargs):
+            with self._write_lock:
+                return self._vdb_op.create_index(**kwargs)
+
+        def __getattr__(self, name):
+            # Fetch _vdb_op without __getattr__ so a missing one cannot recurse.
+            return getattr(object.__getattribute__(self, "_vdb_op"), name)
+
+    VDB.register(SerializedVDBWrapper)
+
 except ImportError:
     logger.warning(
         "Optional nv_ingest_client module not installed. "
@@ -59,4 +95,4 @@ class VDBRagIngest(VDBRag, VDB):
     )
     # Fallback: VDBRagIngest is just VDBRag without nv_ingest support
     VDBRagIngest = VDBRag
-
+    SerializedVDBWrapper = None
diff --git a/tests/integration/notebook_test_config.yaml b/tests/integration/notebook_test_config.yaml
new file mode 100644
index 000000000..e0bcc3464
--- /dev/null
+++ b/tests/integration/notebook_test_config.yaml
@@ -0,0 +1,118 @@
+# NVIDIA RAG Configuration
+# This file contains configurable parameters with their values
+# You can override any of these values, and they take precedence over environment variables
+
+# Vector Store Configuration
+vector_store:
+  name: "milvus"  # Name of the vector store backend (e.g., milvus, elasticsearch)
+  url: "http://localhost:19530"  # URL endpoint for the vector store service
+  index_type: "GPU_CAGRA"  # Type of vector index (e.g., GPU_CAGRA, IVF_FLAT)
+  search_type: "dense"  # Type of search to perform (dense, hybrid)
+  enable_gpu_index: true  # Enable GPU acceleration for index building
+  enable_gpu_search: true  # Enable GPU acceleration for search operations
+  default_collection_name: "test_native"  # Default collection/index name for storing vectors
+
+# NV-Ingest Configuration
+nv_ingest:
+  message_client_hostname: "localhost"  # Hostname for NV-Ingest message client
+  message_client_port: 7670  # Port for NV-Ingest message client
+  extract_text: true  # Enable text extraction from documents
+  extract_infographics: false  # Enable infographic extraction from documents
+  extract_tables: true  # Enable table extraction from documents
+  extract_charts: true  # Enable chart extraction from documents
+  extract_images: false  # Enable image extraction from documents
+  pdf_extract_method: null  # Method to use for PDF extraction
+  text_depth: "page"  # Granularity level for text extraction (page, document)
+  chunk_size: 512  # Maximum size of text chunks in tokens
+  chunk_overlap: 150  # Number of overlapping tokens between chunks
+  caption_model_name: "nvidia/nemotron-nano-12b-v2-vl"  # Model name for generating image captions
+  caption_endpoint_url: "http://localhost:1977/v1/chat/completions"  # API endpoint for caption generation service
+  enable_pdf_splitter: true  # Enable PDF page splitting during ingestion
+
+# LLM Configuration
+llm:
+  server_url: "http://localhost:8999"  # URL endpoint for the LLM inference service (on-prem NIM default)
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Name of the language model to use for generation
+  # api_key: ""  # Optional: API key for LLM service (overrides NVIDIA_API_KEY environment variable)
+  parameters:
+    max_tokens: 32768  # Maximum number of tokens to generate in response
+    temperature: 0.0  # Sampling temperature for controlling randomness (0.0 = deterministic)
+    top_p: 1.0  # Nucleus sampling threshold for token selection
+
+# Query Rewriter Configuration
+query_rewriter:
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Model for rewriting user queries to improve retrieval
+  server_url: "localhost:8999"  # Endpoint for query rewriter service (host:port form, no URL scheme)
+  enable_query_rewriter: false  # Enable automatic query rewriting before retrieval
+  # api_key: ""  # Optional: API key for query rewriter (overrides NVIDIA_API_KEY environment variable)
+
+# Filter Expression Generator Configuration
+filter_expression_generator:
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Model for generating metadata filter expressions from queries
+  server_url: "localhost:8999"  # Endpoint for filter expression generator service (host:port form, no URL scheme)
+  enable_filter_generator: false  # Enable automatic filter expression generation from natural language
+  # api_key: ""  # Optional: API key for filter generator (overrides NVIDIA_API_KEY environment variable)
+
+# Embedding Configuration
+embeddings:
+  model_name: "nvidia/llama-nemotron-embed-1b-v2"  # Model for generating text embeddings
+  dimensions: 2048  # Dimensionality of the embedding vectors
+  server_url: "http://localhost:9080/v1"  # URL endpoint for embedding service (on-prem NIM default)
+  # api_key: ""  # Optional: API key for embeddings (overrides NVIDIA_API_KEY environment variable)
+
+# Ranking Configuration
+ranking:
+  model_name: "nvidia/llama-3.2-nv-rerankqa-1b-v2"  # Model for reranking retrieved documents
+  server_url: "http://localhost:1976"  # URL endpoint for reranking service (on-prem NIM default)
+  enable_reranker: true  # Enable reranking of retrieved documents before generation
+  # api_key: ""  # Optional: API key for reranking (overrides NVIDIA_API_KEY environment variable)
+
+# Retriever Configuration
+retriever:
+  top_k: 10  # Number of top documents to return after retrieval and reranking
+  vdb_top_k: 100  # Number of documents to retrieve from vector database before reranking
+  score_threshold: 0.25  # Minimum similarity score threshold for retrieved documents
+
+# Tracing Configuration
+tracing:
+  enabled: false  # Enable distributed tracing and metrics collection
+  otlp_http_endpoint: "http://localhost:4318/v1/traces"  # OpenTelemetry HTTP endpoint for traces
+  otlp_grpc_endpoint: "grpc://localhost:4317"  # OpenTelemetry gRPC endpoint for traces
+
+# Vision-Language Model Configuration
+vlm:
+  server_url: "http://localhost:1977/v1"  # URL endpoint for Vision-Language Model service
+  model_name: "nvidia/nemotron-nano-12b-v2-vl"  # Vision-Language Model for processing images and text
+  # api_key: ""  # Optional: API key for VLM service (overrides NVIDIA_API_KEY environment variable)
+
+# MinIO Configuration
+minio:
+  endpoint: "localhost:9010"  # MinIO object storage endpoint
+  access_key: "minioadmin"  # MinIO access key for authentication
+  secret_key: "minioadmin"  # MinIO secret key for authentication
+
+# Summarizer Configuration
+summarizer:
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Model for generating document summaries
+  server_url: "localhost:8999"  # Endpoint for summarization service (host:port form, no URL scheme)
+  max_chunk_length: 50000  # Maximum character length for chunks to summarize
+  chunk_overlap: 200  # Character overlap between chunks during summarization
+  temperature: 0.0  # Sampling temperature for summary generation
+  top_p: 1.0  # Nucleus sampling threshold for summary generation
+  # api_key: ""  # Optional: API key for summarization (overrides NVIDIA_API_KEY environment variable)
+
+# Reflection Configuration
+reflection:
+  enable_reflection: false  # Enable self-reflection to improve answer quality
+  max_loops: 3  # Maximum number of reflection iterations
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Model for reflection and quality assessment
+  server_url: ""  # URL endpoint for reflection service
+  context_relevance_threshold: 1  # Minimum relevance score for context to be considered useful
+  response_groundedness_threshold: 1  # Minimum groundedness score for response to be considered factual
+  # api_key: ""  # Optional: API key for reflection (overrides NVIDIA_API_KEY environment variable)
+
+# Top-level Configuration Flags
+enable_guardrails: false  # Enable safety guardrails for input/output filtering
+enable_citations: true  # Include source citations in generated responses
+enable_vlm_inference: false  # Enable Vision-Language Model for multimodal queries
+temp_dir: "./tmp-data/"  # Temporary directory for file processing and storage
\ No newline at end of file
diff --git a/tests/integration/test_cases/library_usage.py b/tests/integration/test_cases/library_usage.py
index d67ef4942..c562201ca 100644
--- a/tests/integration/test_cases/library_usage.py
+++ b/tests/integration/test_cases/library_usage.py
@@ -49,8 +49,7 @@ def _get_config(self):
         """Get or create shared config object with common settings"""
         if self._config is None:
             from nvidia_rag.utils.configuration import NvidiaRAGConfig
-            
-            config_path = Path(__file__).parent.parent.parent.parent / "notebooks" / "config.yaml"
+            config_path = Path(__file__).parent.parent.parent.parent / "tests" / "integration" / "notebook_test_config.yaml"
             self._config = NvidiaRAGConfig.from_yaml(str(config_path))
             
             # Common configuration for all library tests
diff --git a/tests/integration/test_cases/multimodal_query.py b/tests/integration/test_cases/multimodal_query.py
index b888b8c7c..b38925211 100644
--- a/tests/integration/test_cases/multimodal_query.py
+++ b/tests/integration/test_cases/multimodal_query.py
@@ -95,7 +95,7 @@
 
     2. Deploy or upgrade the chart:
 
-        helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0-rc1.tgz \\
+        helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0-rc1.tgz \\
           --username '$oauthtoken' \\
           --password "${NGC_API_KEY}" \\
           --set imagePullSecret.password=$NGC_API_KEY \\
diff --git a/tests/unit/test_compose_helm_parity/env_parity_exemptions.yaml b/tests/unit/test_compose_helm_parity/env_parity_exemptions.yaml
index 80cea693c..ed9f45c06 100644
--- a/tests/unit/test_compose_helm_parity/env_parity_exemptions.yaml
+++ b/tests/unit/test_compose_helm_parity/env_parity_exemptions.yaml
@@ -53,8 +53,8 @@ ngcApiKeyPresenceExemptions:
   perService:
     nims.yaml:
       nim-llm: true
-      nemoretriever-embedding-ms: true
-      nemoretriever-ranking-ms: true
+      nemotron-embedding-ms: true
+      nemotron-ranking-ms: true
       vlm-ms: true
 
 
diff --git a/tests/unit/test_compose_helm_parity/test_compose_helm_parity.py b/tests/unit/test_compose_helm_parity/test_compose_helm_parity.py
index 713e50a6c..15a36bbbf 100644
--- a/tests/unit/test_compose_helm_parity/test_compose_helm_parity.py
+++ b/tests/unit/test_compose_helm_parity/test_compose_helm_parity.py
@@ -300,7 +300,7 @@ def test_compose_helm_image_and_env_parity():
                     "ngcAPIKey",
                 ],
             },
-            "nemoretriever-embedding-ms": {
+            "nemotron-embedding-ms": {
                 "values_image_repo_path": [
                     "nimOperator",
                     "nvidia-nim-llama-32-nv-embedqa-1b-v2",
@@ -320,7 +320,7 @@ def test_compose_helm_image_and_env_parity():
                     "ngcAPIKey",
                 ],
             },
-            "nemoretriever-ranking-ms": {
+            "nemotron-ranking-ms": {
                 "values_image_repo_path": [
                     "nimOperator",
                     "nvidia-nim-llama-32-nv-rerankqa-1b-v2",
diff --git a/tests/unit/test_mcp/test_cwe22_path_traversal.py b/tests/unit/test_mcp/test_cwe22_path_traversal.py
new file mode 100644
index 000000000..bb6031f12
--- /dev/null
+++ b/tests/unit/test_mcp/test_cwe22_path_traversal.py
@@ -0,0 +1,232 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# PoC / regression tests for CWE-22 path traversal in MCP server file upload tools.
+#
+# Before the fix, tool_upload_documents and tool_update_documents accepted
+# arbitrary file_paths from MCP clients and read them without path validation,
+# so a client could supply paths like "/etc/passwd" or "../../sensitive.txt" to
+# read arbitrary files from the server's filesystem and exfiltrate them via
+# the ingestor upload. These tests pin the MCP_UPLOAD_DIR confinement.
+
+from __future__ import annotations
+
+import os
+import sys
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+
+import examples.nvidia_rag_mcp.mcp_server as mcp_server
+
+try:
+    from fastmcp.tools import FunctionTool
+except Exception:
+    FunctionTool = None
+
+
+def _tool_fn(tool: Any):
+    """Unwrap a fastmcp FunctionTool to its plain callable; pass others through."""
+    if FunctionTool is None or not isinstance(tool, FunctionTool):
+        return tool
+    unwrapped = getattr(tool, "func", None) or getattr(tool, "__wrapped__", None)
+    return tool if unwrapped is None else unwrapped
+
+
+def _make_fake_aiohttp(captured_files: list):
+    """Build a fake aiohttp whose FormData records each "documents" field in captured_files."""
+
+    class FakeResp:
+        def __init__(self):
+            self.status = 200  # every fake request reports HTTP 200
+
+        async def json(self):
+            return {"ok": True}
+
+        async def text(self):
+            return "ok"
+
+    class FakeFormData:
+        def __init__(self):
+            self.fields: list[tuple] = []  # every add_field call, in order
+
+        def add_field(self, name, value, filename=None, content_type=None):
+            self.fields.append((name, value, filename, content_type))
+            if name == "documents":  # only "documents" fields reach captured_files
+                captured_files.append({"filename": filename, "content": value})
+
+    class FakeSession:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return False  # falsy: exceptions raised inside the block propagate
+
+        def post(self, url, data=None):  # url and data are ignored
+            class Ctx:
+                async def __aenter__(self_inner):
+                    return FakeResp()
+                async def __aexit__(self_inner, exc_type, exc, tb):
+                    return False
+            return Ctx()
+
+        def patch(self, url, data=None):  # same canned response as post()
+            class Ctx:
+                async def __aenter__(self_inner):
+                    return FakeResp()
+                async def __aexit__(self_inner, exc_type, exc, tb):
+                    return False
+            return Ctx()
+
+    FakeClientTimeout = type("ClientTimeout", (), {"__init__": lambda self, total=None: None})
+    return SimpleNamespace(
+        ClientSession=lambda timeout=None: FakeSession(),  # timeout accepted but unused
+        ClientTimeout=FakeClientTimeout,
+        ContentTypeError=Exception,  # plain Exception stands in for the aiohttp error type
+        FormData=FakeFormData,
+    )
+
+
+@pytest.mark.anyio
+async def test_upload_rejects_absolute_path_outside_allowed_dir(monkeypatch, tmp_path):
+    """
+    Path traversal PoC: tool_upload_documents must reject absolute paths
+    outside the allowed upload directory.
+
+    An attacker-controlled MCP client could pass "/etc/passwd" to read
+    arbitrary files. After the fix, a ValueError should be raised.
+    """
+    secret = tmp_path / "secret.txt"
+    secret.write_text("super-secret-data")
+
+    allowed_dir = tmp_path / "uploads"
+    allowed_dir.mkdir()
+    monkeypatch.setenv("MCP_UPLOAD_DIR", str(allowed_dir))
+
+    captured_files: list = []
+    fake = _make_fake_aiohttp(captured_files)
+    monkeypatch.setattr(mcp_server, "aiohttp", fake, raising=True)
+
+    tool = _tool_fn(mcp_server.tool_upload_documents)
+
+    with pytest.raises(ValueError, match="not within the allowed upload directory"):
+        await tool(
+            collection_name="test",
+            file_paths=[str(secret)],
+        )
+
+    assert len(captured_files) == 0, "Sensitive file was read despite being outside allowed dir"
+
+
+@pytest.mark.anyio
+async def test_update_rejects_absolute_path_outside_allowed_dir(monkeypatch, tmp_path):
+    """
+    Same traversal via tool_update_documents (PATCH variant).
+    """
+    secret = tmp_path / "secret.txt"
+    secret.write_text("super-secret-data")
+
+    allowed_dir = tmp_path / "uploads"
+    allowed_dir.mkdir()
+    monkeypatch.setenv("MCP_UPLOAD_DIR", str(allowed_dir))
+
+    captured_files: list = []
+    fake = _make_fake_aiohttp(captured_files)
+    monkeypatch.setattr(mcp_server, "aiohttp", fake, raising=True)
+
+    tool = _tool_fn(mcp_server.tool_update_documents)
+
+    with pytest.raises(ValueError, match="not within the allowed upload directory"):
+        await tool(
+            collection_name="test",
+            file_paths=[str(secret)],
+        )
+    assert len(captured_files) == 0
+
+
+@pytest.mark.anyio
+async def test_upload_rejects_dot_dot_traversal(monkeypatch, tmp_path):
+    """
+    Relative path traversal via '../' must be rejected.
+    """
+    allowed_dir = tmp_path / "uploads"
+    allowed_dir.mkdir()
+
+    secret = tmp_path / "secret.txt"
+    secret.write_text("traversal-secret")
+
+    monkeypatch.setenv("MCP_UPLOAD_DIR", str(allowed_dir))
+
+    captured_files: list = []
+    fake = _make_fake_aiohttp(captured_files)
+    monkeypatch.setattr(mcp_server, "aiohttp", fake, raising=True)
+
+    tool = _tool_fn(mcp_server.tool_upload_documents)
+
+    traversal_path = str(allowed_dir / ".." / "secret.txt")
+    with pytest.raises(ValueError, match="not within the allowed upload directory"):
+        await tool(
+            collection_name="test",
+            file_paths=[traversal_path],
+        )
+    assert len(captured_files) == 0
+
+
+@pytest.mark.anyio
+async def test_upload_allows_file_inside_allowed_dir(monkeypatch, tmp_path):
+    """
+    Files within the allowed upload directory should be accepted.
+    """
+    allowed_dir = tmp_path / "uploads"
+    allowed_dir.mkdir()
+
+    legit_file = allowed_dir / "doc.pdf"
+    legit_file.write_bytes(b"%PDF-1.4 legit content")
+
+    monkeypatch.setenv("MCP_UPLOAD_DIR", str(allowed_dir))
+
+    captured_files: list = []
+    fake = _make_fake_aiohttp(captured_files)
+    monkeypatch.setattr(mcp_server, "aiohttp", fake, raising=True)
+
+    tool = _tool_fn(mcp_server.tool_upload_documents)
+    result = await tool(
+        collection_name="test",
+        file_paths=[str(legit_file)],
+    )
+
+    assert result.get("ok") is True
+    assert len(captured_files) == 1
+    assert captured_files[0]["filename"] == "doc.pdf"
+    assert captured_files[0]["content"] == b"%PDF-1.4 legit content"
+
+
+@pytest.mark.anyio
+async def test_upload_rejects_symlink_escape(monkeypatch, tmp_path):
+    """
+    Symlink escape: a symlink inside the allowed dir pointing outside must be rejected.
+    """
+    allowed_dir = tmp_path / "uploads"
+    allowed_dir.mkdir()
+
+    secret = tmp_path / "secret.txt"
+    secret.write_text("symlink-secret")
+
+    link = allowed_dir / "evil_link.txt"
+    link.symlink_to(secret)
+
+    monkeypatch.setenv("MCP_UPLOAD_DIR", str(allowed_dir))
+
+    captured_files: list = []
+    fake = _make_fake_aiohttp(captured_files)
+    monkeypatch.setattr(mcp_server, "aiohttp", fake, raising=True)
+
+    tool = _tool_fn(mcp_server.tool_upload_documents)
+
+    with pytest.raises(ValueError, match="not within the allowed upload directory"):
+        await tool(
+            collection_name="test",
+            file_paths=[str(link)],
+        )
+    assert len(captured_files) == 0
diff --git a/tests/unit/test_mcp/test_mcp_server.py b/tests/unit/test_mcp/test_mcp_server.py
index 2473b3831..224d75762 100644
--- a/tests/unit/test_mcp/test_mcp_server.py
+++ b/tests/unit/test_mcp/test_mcp_server.py
@@ -289,6 +289,8 @@ async def test_tool_update_documents_uses_patch_and_form(monkeypatch, tmp_path):
     p2 = tmp_path / "b.pdf"
     p2.write_bytes(b"%PDF-1.4 b")
 
+    monkeypatch.setenv("MCP_UPLOAD_DIR", str(tmp_path))
+
     captured: dict[str, Any] = {}
 
     class FakeResp:
@@ -618,6 +620,8 @@ async def test_tool_upload_documents(monkeypatch, tmp_path):
     p = tmp_path / "doc.pdf"
     p.write_bytes(b"%PDF-1.4...")
 
+    monkeypatch.setenv("MCP_UPLOAD_DIR", str(tmp_path))
+
     class FakeResp:
         def __init__(self):
             self.status = 200
diff --git a/tests/unit/test_metadata_validation/test_filter_validator.py b/tests/unit/test_metadata_validation/test_filter_validator.py
index 0bf174973..3f6b34759 100644
--- a/tests/unit/test_metadata_validation/test_filter_validator.py
+++ b/tests/unit/test_metadata_validation/test_filter_validator.py
@@ -3222,6 +3222,30 @@ def test_string_basic_operations(self, mock_config, string_schema):
         assert result["status"] is True
         assert "error_message" not in result
 
+    def test_filename_preserves_case_other_string_lowercased(self, mock_config):
+        """Filename filter preserves case; other string fields are lowercased for matching."""
+        schema = MetadataSchema(
+            schema=[
+                MetadataField(name="filename", type="string", required=False),
+                MetadataField(name="title", type="string", required=False),
+            ]
+        )
+        parser = FilterExpressionParser(schema, mock_config)
+
+        # Filename must preserve case (ingestion stores original case).
+        result = parser.process_filter_expression(
+            'content_metadata["filename"] == "Report.PDF"'
+        )
+        assert result["status"] is True
+        assert '"Report.PDF"' in result["processed_expression"]
+
+        # Other string fields are normalized to lowercase.
+        result = parser.process_filter_expression(
+            'content_metadata["title"] == "Technical"'
+        )
+        assert result["status"] is True
+        assert '"technical"' in result["processed_expression"]
+
     def test_string_like_operations(self, mock_config, string_schema):
         """Test string LIKE operations."""
         parser = FilterExpressionParser(string_schema, mock_config)
diff --git a/tests/unit/test_observability/test_langchain_callback_handler.py b/tests/unit/test_observability/test_langchain_callback_handler.py
index 054f5d18d..80511caab 100644
--- a/tests/unit/test_observability/test_langchain_callback_handler.py
+++ b/tests/unit/test_observability/test_langchain_callback_handler.py
@@ -6,6 +6,12 @@
 import pytest
 from langchain_core.messages import AIMessageChunk
 from langchain_core.outputs import Generation, LLMResult
+from opentelemetry.semconv_ai import (
+    LLMRequestTypeValues,
+    SpanAttributes,
+    SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY,
+    TraceloopSpanKindValues,
+)
 
 
 class SpanMock:
@@ -58,7 +64,9 @@ def handler():
 
 
 def test_on_chat_model_start_sets_input_words_and_prompts(handler):
-    from nvidia_rag.utils.observability.langchain_callback_handler import SpanAttributes
+    from nvidia_rag.utils.observability.langchain_callback_handler import (
+        GEN_AI_PROMPTS,
+    )
 
     run_id = uuid4()
     messages = [
@@ -78,44 +86,8 @@ def test_on_chat_model_start_sets_input_words_and_prompts(handler):
     assert handler.total_input_words == 4
     # span should be created and attributes recorded
     assert run_id in handler.spans
-    span = handler.spans[run_id].span
-    # Check that at least one prompt attribute key prefix was used
-    prompt_prefix = f"{SpanAttributes.LLM_PROMPTS}."
-    prompt_keys = [k for k, _ in span.attributes if k.startswith(prompt_prefix)]
-    assert len(prompt_keys) >= 2
-
-
-def test_on_llm_start_and_end_sets_token_usage_and_ends_span(handler):
-    run_id = uuid4()
 
-    handler.on_llm_start(
-        serialized={"kwargs": {"name": "llm"}},
-        prompts=["What is ML?"],
-        run_id=run_id,
-    )
 
-    # Build a minimal valid LLMResult
-    gen = Generation(
-        text="Answer",
-        generation_info={"finish_reason": "stop"},
-    )
-
-    llm_result = LLMResult(
-        generations=[[gen]],
-        llm_output={
-            "model_name": "test-model",
-            "usage": {"prompt_tokens": 5, "completion_tokens": 7, "total_tokens": 12},
-        },
-    )
-
-    handler.on_llm_end(response=llm_result, run_id=run_id)
-
-    span = handler.spans[run_id].span
-    # Verify span ended
-    assert span.ended is True
-    # Verify some token usage attributes were set from llm_output usage
-    attr_keys = [k for k, _ in span.attributes]
-    assert any("usage" in k.lower() for k in attr_keys)
 
 
 def test_on_chain_end_updates_avg_words_per_chunk(handler):
@@ -156,3 +128,76 @@ def test_on_chain_end_updates_llm_tokens(handler):
 
     # Expect update_llm_tokens called with input words from chat (3) and output words (2)
     assert handler.metrics.token_calls[-1] == (3, 2)
+
+
+# Attribute names on opentelemetry.semconv_ai.SpanAttributes still used in langchain_callback_handler.py.
+# Missing/deprecated ones (LLM_REQUEST_MODEL, LLM_RESPONSE_MODEL, LLM_REQUEST_MAX_TOKENS,
+# LLM_REQUEST_TEMPERATURE, LLM_REQUEST_TOP_P, LLM_SYSTEM) are hardcoded in the handler instead.
+SPAN_ATTRIBUTES_USED = [
+    "LLM_REQUEST_FUNCTIONS",
+    "LLM_USAGE_TOTAL_TOKENS",
+    "TRACELOOP_WORKFLOW_NAME",
+    "TRACELOOP_ENTITY_PATH",
+    "TRACELOOP_SPAN_KIND",
+    "TRACELOOP_ENTITY_NAME",
+    "LLM_REQUEST_TYPE",
+    "TRACELOOP_ENTITY_INPUT",
+    "TRACELOOP_ENTITY_OUTPUT",
+]
+
+
+def test_semconv_ai_span_attributes_exist_and_not_deprecated():
+    """Every name in SPAN_ATTRIBUTES_USED must resolve to a non-empty string on SpanAttributes."""
+    for name in SPAN_ATTRIBUTES_USED:
+        qualified = f"SpanAttributes.{name}"
+        # The attribute must still be defined on the class ...
+        missing_msg = f"{qualified} is missing or was removed from opentelemetry.semconv_ai"
+        assert hasattr(SpanAttributes, name), missing_msg
+        attr = getattr(SpanAttributes, name)
+        # ... be a plain string ...
+        type_msg = f"{qualified} should be a string, got {type(attr).__name__}"
+        assert isinstance(attr, str), type_msg
+        # ... and not be an empty placeholder.
+        assert len(attr) > 0, f"{qualified} is empty (possibly deprecated or placeholder)"
+
+
+def test_semconv_ai_llm_request_type_values_used():
+    """LLMRequestTypeValues must expose CHAT and COMPLETION, each with a string value."""
+    for member_name in ("CHAT", "COMPLETION"):
+        assert hasattr(LLMRequestTypeValues, member_name)
+    for member_name in ("CHAT", "COMPLETION"):
+        assert isinstance(getattr(LLMRequestTypeValues, member_name).value, str)
+
+
+def test_semconv_ai_traceloop_span_kind_values_used():
+    """TraceloopSpanKindValues must expose WORKFLOW, TASK and TOOL, each with a string value."""
+    required_kinds = ("WORKFLOW", "TASK", "TOOL")
+    for kind_name in required_kinds:
+        missing_msg = f"TraceloopSpanKindValues.{kind_name} is missing"
+        assert hasattr(TraceloopSpanKindValues, kind_name), missing_msg
+        assert isinstance(getattr(TraceloopSpanKindValues, kind_name).value, str)
+
+
+def test_langchain_callback_handler_imports_and_constants():
+    """Pin the literal values of the handler's hardcoded attribute-name constants; check the semconv_ai suppression key is a str."""
+    from nvidia_rag.utils.observability.langchain_callback_handler import (
+        GEN_AI_COMPLETIONS,
+        GEN_AI_PROMPTS,
+        LLM_REQUEST_MAX_TOKENS,
+        LLM_REQUEST_MODEL,
+        LLM_REQUEST_TEMPERATURE,
+        LLM_REQUEST_TOP_P,
+        LLM_RESPONSE_MODEL,
+        LLM_SYSTEM,
+    )
+
+    assert GEN_AI_PROMPTS == "gen_ai.prompt"
+    assert GEN_AI_COMPLETIONS == "gen_ai.completion"
+    assert LLM_REQUEST_MODEL == "gen_ai.request.model"
+    assert LLM_RESPONSE_MODEL == "gen_ai.response.model"
+    assert LLM_REQUEST_MAX_TOKENS == "llm.request.max_tokens"
+    assert LLM_REQUEST_TEMPERATURE == "llm.request.temperature"
+    assert LLM_REQUEST_TOP_P == "llm.request.top_p"
+    assert LLM_SYSTEM == "llm.system"
+    assert SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY is not None
+    assert isinstance(SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY, str)
diff --git a/tests/unit/test_rag_server/test_query_rewriting.py b/tests/unit/test_rag_server/test_query_rewriting.py
index 129d82124..78e010474 100644
--- a/tests/unit/test_rag_server/test_query_rewriting.py
+++ b/tests/unit/test_rag_server/test_query_rewriting.py
@@ -17,6 +17,8 @@
 Test suite for query rewriting functionality in the RAG server.
 """
 
+from unittest.mock import AsyncMock, patch
+
 import pytest
 
 
@@ -64,6 +66,7 @@ class DummyVDB:
     """A minimal VDB stub used via monkeypatch on __prepare_vdb_op."""
 
     last_query = None
+    last_retrieval_method = None
 
     def check_collection_exists(self, collection_name: str) -> bool:
         return True
@@ -77,6 +80,15 @@ def get_metadata_schema(self, collection_name: str):
     def retrieval_langchain(self, query, collection_name, vectorstore=None, top_k=None, filter_expr="", otel_ctx=None):
         """Sync method - called in ThreadPoolExecutor or directly."""
         DummyVDB.last_query = query
+        DummyVDB.last_retrieval_method = "langchain"
+        return []
+
+    def retrieval_image_langchain(
+        self, query, collection_name, vectorstore=None, top_k=None, reranker_top_k=None
+    ):
+        """Stub for the image retrieval path: records the query on DummyVDB and tags the call as "image"."""
+        DummyVDB.last_query = query
+        DummyVDB.last_retrieval_method = "image"
         return []
 
 
@@ -253,6 +265,43 @@ async def test_search_combines_history_when_multiturn_enabled(monkeypatch):
     assert fake_vdb.last_query == "What is RAG?. How does it work?"
 
 
+@pytest.mark.asyncio
+async def test_search_skips_query_rewriter_for_image_query(monkeypatch):
+    """When query is multimodal with image, query rewriting is skipped and retrieval_image_langchain is used."""
+    from nvidia_rag.rag_server.main import NvidiaRAG
+
+    monkeypatch.setenv("CONVERSATION_HISTORY", "5")
+    monkeypatch.setenv("ENABLE_REFLECTION", "false")
+
+    DummyVDB.last_query = DummyVDB.last_retrieval_method = None  # class-level; drop stale state
+    fake_vdb = DummyVDB()
+    rag = NvidiaRAG()
+    monkeypatch.setattr(NvidiaRAG, "_prepare_vdb_op", lambda self, **kw: fake_vdb)
+
+    multimodal_query = [
+        {"type": "text", "text": "What is in this image?"},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
+    ]
+    messages = [
+        {"role": "user", "content": "Previous question"},
+        {"role": "assistant", "content": "Previous answer"},
+    ]
+
+    await rag.search(
+        query=multimodal_query,
+        messages=messages,
+        collection_names=["test"],
+        enable_query_rewriting=True,
+        enable_reranker=False,
+        filter_expr="",
+    )
+
+    # Rewriting skipped: last_query is text + image URL (no "REWRITTEN(...)"), via the image path.
+    assert fake_vdb.last_query == "What is in this image? data:image/png;base64,x"
+    assert "REWRITTEN" not in str(fake_vdb.last_query)
+    assert fake_vdb.last_retrieval_method == "image"
+
+
 @pytest.mark.asyncio
 async def test_generate_uses_query_rewriter_when_enabled(monkeypatch):
     """Test that query rewriting is used in generate when enabled with conversation history."""
@@ -319,6 +368,55 @@ async def test_generate_uses_only_current_query_when_history_disabled(monkeypatc
     assert fake_vdb.last_query == "How does it work?"
 
 
+@pytest.mark.asyncio
+async def test_generate_skips_query_rewriter_for_image_query(monkeypatch):
+    """When messages contain multimodal content with image, query rewriting is skipped."""
+    from nvidia_rag.rag_server.main import NvidiaRAG
+
+    monkeypatch.setenv("CONVERSATION_HISTORY", "5")
+    monkeypatch.setenv("ENABLE_REFLECTION", "false")
+    monkeypatch.setenv("MULTITURN_RETRIEVER_SIMPLE", "False")
+
+    DummyVDB.last_query = DummyVDB.last_retrieval_method = None  # class-level; drop stale state
+    fake_vdb = DummyVDB()
+    rag = NvidiaRAG()
+    monkeypatch.setattr(NvidiaRAG, "_prepare_vdb_op", lambda self, **kw: fake_vdb)
+
+    messages = [
+        {"role": "user", "content": "What is RAG?"},
+        {"role": "assistant", "content": "A retrieval-augmented framework."},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this image?"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
+            ],
+        },
+    ]
+
+    async def _stream(*a, **k):
+        yield "ok"
+
+    with patch("nvidia_rag.rag_server.main.VLM") as mock_vlm_class:
+        mock_vlm_instance = mock_vlm_class.return_value
+        mock_vlm_instance.stream_with_messages = _stream
+
+        stream = await rag.generate(
+            messages=messages,
+            use_knowledge_base=True,
+            collection_names=["test"],
+            enable_query_rewriting=True,
+            enable_reranker=False,
+            enable_vlm_inference=True,
+            filter_expr="",
+        )
+
+    # Rewriting skipped: last_query is text + image URL (no "REWRITTEN(...)"), via the image path.
+    assert fake_vdb.last_query == "What is in this image? data:image/png;base64,x"
+    assert "REWRITTEN" not in str(fake_vdb.last_query)
+    assert fake_vdb.last_retrieval_method == "image"
+
+
 @pytest.mark.asyncio
 async def test_generate_combines_history_when_multiturn_enabled(monkeypatch):
     """Test that when multiturn_retrieval_simple is True, history is concatenated."""
@@ -353,4 +451,3 @@ async def test_generate_combines_history_when_multiturn_enabled(monkeypatch):
     # last previous user query is combined with current retriever_query
     # Expected concatenation: "What is RAG?. How does it work?"
     assert fake_vdb.last_query == "What is RAG?. How does it work?"
-
diff --git a/tests/unit/test_rag_server/test_rag_main_advanced_features.py b/tests/unit/test_rag_server/test_rag_main_advanced_features.py
index 4331586a7..30f4ace20 100644
--- a/tests/unit/test_rag_server/test_rag_main_advanced_features.py
+++ b/tests/unit/test_rag_server/test_rag_main_advanced_features.py
@@ -1192,26 +1192,20 @@ def test_handle_prompt_processing_basic(self):
         assert user_message == [("user", "Test human prompt")]
 
     def test_handle_prompt_processing_with_nemotron_v1_model(self):
-        """Test prompt processing with Nemotron v1 model."""
+        """Test prompt processing with Nemotron v1 model uses the chat_template system prompt."""
         rag = NvidiaRAG()
 
         chat_history = []
 
-        # Mock instance attribute
-        mock_prompts = Mock()
         with patch.dict(os.environ, {"ENABLE_NEMOTRON_THINKING": "true"}):
-            mock_prompts.get.return_value = {
-                "system": "Test system prompt",
-                "human": "Test human prompt",
-            }
-
             result = rag._handle_prompt_processing(
                 chat_history, "llama-3.3-nemotron-super-49b-v1", "chat_template"
             )
 
             assert len(result) == 3
             system_message, conversation_history, user_message = result
-            assert system_message == [("system", "detailed thinking on")]
+            expected_system = rag.prompts.get("chat_template", {}).get("system", "")
+            assert system_message == [("system", expected_system)]
 
     def test_handle_prompt_processing_with_system_message_in_history(self):
         """Test prompt processing with system message in chat history."""
diff --git a/tests/unit/test_rag_server/test_rag_main_core_components.py b/tests/unit/test_rag_server/test_rag_main_core_components.py
index cc14c7c08..d4567399c 100644
--- a/tests/unit/test_rag_server/test_rag_main_core_components.py
+++ b/tests/unit/test_rag_server/test_rag_main_core_components.py
@@ -416,8 +416,8 @@ def test_build_retriever_query_from_multimodal_list(self):
         ]
 
         result = rag._build_retriever_query_from_content(content)
-        # When image_url is present, the method returns the image URL
-        assert result == ("http://example.com/image.jpg", True)
+        # Text parts joined with \n\n first, then image URL with space separator
+        assert result == ("Hello\n\nworld http://example.com/image.jpg", True)
 
     def test_build_retriever_query_from_list_without_text(self):
         """Test building retriever query from list without text items."""
@@ -428,6 +428,7 @@ def test_build_retriever_query_from_list_without_text(self):
         ]
 
         result = rag._build_retriever_query_from_content(content)
+        # Image-only: no text, so final query is just the image URL (no leading space)
         assert result == ("http://example.com/image.jpg", True)
 
     def test_build_retriever_query_from_other_type(self):
diff --git a/tests/unit/test_rag_server/test_rag_main_integration.py b/tests/unit/test_rag_server/test_rag_main_integration.py
index 3e3cd8be6..9de59b630 100644
--- a/tests/unit/test_rag_server/test_rag_main_integration.py
+++ b/tests/unit/test_rag_server/test_rag_main_integration.py
@@ -445,8 +445,8 @@ def test_build_retriever_query_from_content_multimodal(self):
         ]
 
         result = rag._build_retriever_query_from_content(content)
-        # When image_url is present, the method returns the image URL
-        assert result == ("http://example.com/image.jpg", True)
+        # Text parts joined with \n\n first, then image URL with space separator
+        assert result == ("Hello\n\nworld http://example.com/image.jpg", True)
 
     def test_print_conversation_history(self):
         """Test __print_conversation_history method."""
diff --git a/tests/unit/test_rag_server/test_self_reflection.py b/tests/unit/test_rag_server/test_self_reflection.py
index f7a6d54c6..6acf3b0e0 100644
--- a/tests/unit/test_rag_server/test_self_reflection.py
+++ b/tests/unit/test_rag_server/test_self_reflection.py
@@ -142,7 +142,7 @@ async def test_check_context_relevance(mocker):
         and structured prompts for consistent, reproducible reflection results.
     """
     # Set up a local ranker for reranking documents
-    local_ranker = get_ranking_model(model="nvidia/llama-3.2-nv-rerankqa-1b-v2", url="")
+    local_ranker = get_ranking_model(model="nvidia/llama-nemotron-rerank-1b-v2", url="")
 
     # Create a mock VDBRag object
     mock_vdb_op = mocker.MagicMock(spec=VDBRag)
diff --git a/tests/unit/test_utils/test_configuration.py b/tests/unit/test_utils/test_configuration.py
index 3525664b7..04c81ba9d 100644
--- a/tests/unit/test_utils/test_configuration.py
+++ b/tests/unit/test_utils/test_configuration.py
@@ -23,8 +23,6 @@
 
 import pytest
 import yaml
-from pydantic import SecretStr, ValidationError
-
 from nvidia_rag.utils.configuration import (
     EmbeddingConfig,
     FilterExpressionGeneratorConfig,
@@ -44,6 +42,7 @@
     VectorStoreConfig,
     VLMConfig,
 )
+from pydantic import SecretStr, ValidationError
 
 
 class TestVectorStoreConfig:
@@ -135,14 +134,16 @@ def test_get_model_parameters_default(self):
         config = LLMConfig()
         params = config.get_model_parameters()
 
-        # Default model contains "llama-3.3-nemotron-super-49b" so it triggers nemotron logic
         expected = {
             "min_tokens": 0,
             "ignore_eos": False,
             "max_tokens": 32768,
+            "enable_thinking": False,
+            "reasoning_budget": 0,
+            "low_effort": False,
             "min_thinking_tokens": 0,
             "max_thinking_tokens": 0,
-            "temperature": 0,
+            "temperature": 0.0,
             "top_p": 1.0,
         }
         assert params == expected
@@ -152,14 +153,16 @@ def test_get_model_parameters_generic(self):
         config = LLMConfig(model_name="meta/llama-3.1-8b-instruct")
         params = config.get_model_parameters()
 
-        # Generic model should use the base parameter values
         expected = {
             "min_tokens": 0,
             "ignore_eos": False,
             "max_tokens": 32768,
+            "enable_thinking": False,
+            "reasoning_budget": 0,
+            "low_effort": False,
             "min_thinking_tokens": 0,
             "max_thinking_tokens": 0,
-            "temperature": 0,
+            "temperature": 0.0,
             "top_p": 1.0,
         }
         assert params == expected
@@ -197,7 +200,7 @@ def test_default_values(self):
         """Test default configuration values."""
         config = EmbeddingConfig()
 
-        assert config.model_name == "nvidia/llama-3.2-nv-embedqa-1b-v2"
+        assert config.model_name == "nvidia/llama-nemotron-embed-1b-v2"
         assert config.model_engine == "nvidia-ai-endpoints"
         assert config.dimensions == 2048
         assert config.server_url == ""
@@ -210,7 +213,7 @@ def test_default_values(self):
         """Test default configuration values."""
         config = RankingConfig()
 
-        assert config.model_name == "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+        assert config.model_name == "nvidia/llama-nemotron-rerank-1b-v2"
         assert config.model_engine == "nvidia-ai-endpoints"
         assert config.server_url == ""
         assert config.enable_reranker is True
diff --git a/tests/unit/test_utils/test_llm.py b/tests/unit/test_utils/test_llm.py
index f691edb9d..c59cabf43 100644
--- a/tests/unit/test_utils/test_llm.py
+++ b/tests/unit/test_utils/test_llm.py
@@ -240,7 +240,6 @@ def test_get_llm_nvidia_endpoints_with_url(self, mock_chatnvidia, mock_sanitize)
                 base_url="http://test-url:8000",
                 model="test-model",
                 api_key="test-api-key",
-                stop=[],
                 default_headers={"source": "rag-blueprint"},
                 temperature=0.7,
                 top_p=0.9,
@@ -272,7 +271,6 @@ def test_get_llm_nvidia_endpoints_api_catalog(self, mock_chatnvidia, mock_saniti
                 temperature=None,
                 top_p=None,
                 max_completion_tokens=None,
-                stop=[],
                 default_headers={"source": "rag-blueprint"},
             )
 
@@ -325,7 +323,6 @@ def test_get_llm_with_guardrails_success(
                     temperature=0.7,
                     top_p=None,
                     max_tokens=None,
-                    stop=[],
                 )
 
     @patch("requests.get")
@@ -419,7 +416,6 @@ def test_get_llm_none_parameters(self, mock_sanitize):
                     temperature=None,
                     top_p=None,
                     max_completion_tokens=None,
-                    stop=[],
                     default_headers={"source": "rag-blueprint"},
                     model_kwargs={"ignore_eos": False},
                 )
@@ -429,9 +425,10 @@ class TestStreamingFilterThink:
     """Test cases for streaming_filter_think function."""
 
     def create_mock_chunk(self, content):
-        """Helper to create mock chunk with content attribute."""
+        """Helper to create mock chunk with content and additional_kwargs (so 'in' works)."""
         chunk = Mock()
         chunk.content = content
+        chunk.additional_kwargs = {}
         return chunk
 
     def test_streaming_filter_think_no_tags(self):
@@ -683,7 +680,6 @@ def test_llm_creation_with_all_parameters(self, mock_chatnvidia):
                     base_url="http://test:8000",
                     model="meta/llama-3.1-8b-instruct",
                     api_key="test-api-key",
-                    stop=[],
                     default_headers={"source": "rag-blueprint"},
                     temperature=0.7,
                     top_p=0.9,
@@ -826,67 +822,83 @@ def test_streaming_filter_complete_workflow(self):
         assert result == expected
 
     def create_mock_chunk(self, content):
-        """Helper to create mock chunk with content attribute."""
+        """Helper to create mock chunk with content and additional_kwargs (so 'in' works)."""
         chunk = Mock()
         chunk.content = content
+        chunk.additional_kwargs = {}
         return chunk
 
 
-class TestThinkingBudgetNemotron3Nano30B:
-    """Tests for thinking budget behavior with nvidia/nemotron-3-nano-30b-a3b."""
+class TestBindReasoningConfigNemotron3Nano:
+    """Tests for _bind_reasoning_config with nemotron-3-nano models."""
 
-    @patch.dict(os.environ, {"ENABLE_NEMOTRON_3_NANO_THINKING": "true"})
-    def test_bind_thinking_tokens_for_nemotron_30b_maps_reasoning_budget(self):
-        """max_thinking_tokens for nemotron-3-nano-30b-a3b maps to reasoning_budget."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+    @patch.dict(os.environ, {"LLM_ENABLE_THINKING": "true"})
+    def test_bind_reasoning_config_nemotron_3_nano_with_budget(self):
+        """enable_thinking + reasoning_budget for nemotron-3-nano binds chat_template_kwargs and reasoning_budget."""
+        from nvidia_rag.utils.llm import _bind_reasoning_config
 
         mock_llm = Mock()
-        bound_llm = _bind_thinking_tokens_if_configured(
+        mock_llm.bind.return_value = mock_llm
+        config = Mock()
+        config.llm.parameters.enable_thinking = True
+        config.llm.parameters.reasoning_budget = 8192
+        config.llm.parameters.low_effort = False
+        config.llm.parameters.min_thinking_tokens = 0
+        config.llm.parameters.max_thinking_tokens = 0
+
+        bound_llm = _bind_reasoning_config(
             mock_llm,
+            config=config,
             model="nvidia/nemotron-3-nano-30b-a3b",
-            max_thinking_tokens=8192,
         )
 
-        mock_llm.bind.assert_called_once_with(
-            reasoning_budget=8192,
+        calls = mock_llm.bind.call_args_list
+        assert any(
+            call.kwargs.get("chat_template_kwargs", {}).get("enable_thinking") is True
+            for call in calls
         )
-        assert bound_llm is mock_llm.bind.return_value
 
-    def test_min_thinking_tokens_alone_raises_for_nemotron_30b(self):
-        """min_thinking_tokens alone raises ValueError for nemotron-3-nano-30b-a3b (max_thinking_tokens required)."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+    def test_bind_reasoning_config_unsupported_model_returns_original(self):
+        """Unsupported model returns original LLM without binding."""
+        from nvidia_rag.utils.llm import _bind_reasoning_config
 
         mock_llm = Mock()
-        with pytest.raises(ValueError, match="max_thinking_tokens must be a positive integer"):
-            _bind_thinking_tokens_if_configured(
-                mock_llm,
-                model="nvidia/nemotron-3-nano-30b-a3b",
-                min_thinking_tokens=1,
-            )
-
-    def test_thinking_tokens_unsupported_model_raises(self):
-        """Using thinking tokens with unsupported model raises ValueError."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+        config = Mock()
+        config.llm.parameters.enable_thinking = False
+        config.llm.parameters.reasoning_budget = 0
+        config.llm.parameters.low_effort = False
+        config.llm.parameters.min_thinking_tokens = 0
+        config.llm.parameters.max_thinking_tokens = 0
+
+        bound_llm = _bind_reasoning_config(
+            mock_llm,
+            config=config,
+            model="meta/llama-3.1-8b-instruct",
+        )
 
-        mock_llm = Mock()
-        with pytest.raises(ValueError):
-            _bind_thinking_tokens_if_configured(
-                mock_llm,
-                model="meta/llama-3.1-8b-instruct",
-                max_thinking_tokens=10,
-            )
+        mock_llm.bind.assert_not_called()
+        assert bound_llm is mock_llm
 
 
-class TestThinkingBudgetNemotronNano9B:
-    """Tests for thinking budget behavior with nvidia/nvidia-nemotron-nano-9b-v2."""
+class TestBindReasoningConfigNemotronNano9B:
+    """Tests for _bind_reasoning_config with nvidia/nvidia-nemotron-nano-9b-v2."""
 
-    def test_bind_thinking_tokens_for_nano_9b_binds_min_and_max(self):
+    def test_bind_reasoning_config_nano_9b_binds_min_and_max(self):
         """Both min_thinking_tokens and max_thinking_tokens bind for nano-9b."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+        from nvidia_rag.utils.llm import _bind_reasoning_config
 
         mock_llm = Mock()
-        bound_llm = _bind_thinking_tokens_if_configured(
+        mock_llm.bind.return_value = mock_llm
+        config = Mock()
+        config.llm.parameters.enable_thinking = False
+        config.llm.parameters.reasoning_budget = 0
+        config.llm.parameters.low_effort = False
+        config.llm.parameters.min_thinking_tokens = 1
+        config.llm.parameters.max_thinking_tokens = 8192
+
+        bound_llm = _bind_reasoning_config(
             mock_llm,
+            config=config,
             model="nvidia/nvidia-nemotron-nano-9b-v2",
             min_thinking_tokens=1,
             max_thinking_tokens=8192,
@@ -898,13 +910,21 @@ def test_bind_thinking_tokens_for_nano_9b_binds_min_and_max(self):
         )
         assert bound_llm is mock_llm.bind.return_value
 
-    def test_no_thinking_tokens_for_nano_9b_returns_original_llm(self):
+    def test_bind_reasoning_config_nano_9b_no_tokens_returns_original(self):
         """If no thinking tokens are provided, nano-9b returns original LLM."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+        from nvidia_rag.utils.llm import _bind_reasoning_config
 
         mock_llm = Mock()
-        bound_llm = _bind_thinking_tokens_if_configured(
+        config = Mock()
+        config.llm.parameters.enable_thinking = False
+        config.llm.parameters.reasoning_budget = 0
+        config.llm.parameters.low_effort = False
+        config.llm.parameters.min_thinking_tokens = 0
+        config.llm.parameters.max_thinking_tokens = 0
+
+        bound_llm = _bind_reasoning_config(
             mock_llm,
+            config=config,
             model="nvidia/nvidia-nemotron-nano-9b-v2",
         )
 
diff --git a/tests/unit/test_utils/test_reranker.py b/tests/unit/test_utils/test_reranker.py
index f012a2a5c..ac60c67fe 100644
--- a/tests/unit/test_utils/test_reranker.py
+++ b/tests/unit/test_utils/test_reranker.py
@@ -66,11 +66,11 @@ def test_get_ranking_model_nvidia_endpoints_with_model_name(
         mock_nvidia_rerank.return_value = mock_reranker
 
         result = _get_ranking_model(
-            "nvidia/llama-3.2-nv-rerankqa-1b-v2", "", 10, config=mock_config
+            "nvidia/llama-nemotron-rerank-1b-v2", "", 10, config=mock_config
         )
 
         mock_nvidia_rerank.assert_called_once_with(
-            model="nvidia/llama-3.2-nv-rerankqa-1b-v2",
+            model="nvidia/llama-nemotron-rerank-1b-v2",
             api_key="test-api-key",
             top_n=10,
             truncate="END",
@@ -297,7 +297,7 @@ def test_complete_ranking_workflow_with_url(
 
             # Test the workflow
             model = get_ranking_model(
-                "nvidia/llama-3.2-nv-rerankqa-1b-v2", "rerank-service:8080", 10
+                "nvidia/llama-nemotron-rerank-1b-v2", "rerank-service:8080", 10
             )
 
             # Test that the model can be used
@@ -325,7 +325,7 @@ def test_complete_ranking_workflow_api_catalog(
             mock_get_model.return_value = mock_reranker
 
             # Test the workflow
-            model = get_ranking_model("nvidia/llama-3.2-nv-rerankqa-1b-v2", "", 5)
+            model = get_ranking_model("nvidia/llama-nemotron-rerank-1b-v2", "", 5)
 
             # Test that the model can be used
             documents = ["doc1", "doc2"]
diff --git a/tests/unit/test_utils/test_vdb/test_milvus_vdb.py b/tests/unit/test_utils/test_vdb/test_milvus_vdb.py
index 973088df8..eb1f23084 100644
--- a/tests/unit/test_utils/test_vdb/test_milvus_vdb.py
+++ b/tests/unit/test_utils/test_vdb/test_milvus_vdb.py
@@ -33,6 +33,19 @@
 from nvidia_rag.utils.vdb.milvus.milvus_vdb import MilvusVDB
 
 
+def _make_dummy_milvus_vdb_for_delete():
+    """Return a MilvusVDB created without calling __init__, so no connection is opened.
+
+    The instance carries only a connection alias, an endpoint and a mocked
+    ``_delete_entities``; that keeps delete_documents tests runnable without Milvus.
+    """
+    stub = object.__new__(MilvusVDB)  # allocate only; skip __init__
+    stub._delete_entities = Mock()
+    stub.vdb_endpoint = "http://localhost:19530"
+    stub.connection_alias = "milvus_dummy_test"
+    return stub
+
+
 class TestMilvusVDB:
     """Test the MilvusVDB class."""
 
@@ -749,81 +762,76 @@ def test_get_documents(self, mock_connections):
                 )
 
     @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.Collection")
-    @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.connections")
-    def test_delete_documents_success(self, mock_connections, mock_collection):
-        """Test delete_documents method with successful deletion."""
+    def test_delete_documents_success(self, mock_collection):
+        """delete_documents returns True and flushes once on deletion (no real Milvus)."""
         mock_collection_obj = Mock()
         mock_resp = Mock()
         mock_resp.delete_count = 5
         mock_collection_obj.delete.return_value = mock_resp
         mock_collection.return_value = mock_collection_obj
 
-        with (
-            patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.urlparse"),
-        ):
-            vdb = MilvusVDB(
-                embedding_model=Mock(),
-                milvus_uri="http://localhost:19530",
-                collection_name="test_collection",
-                config=Mock(),
-            )
+        vdb = _make_dummy_milvus_vdb_for_delete()
+        result = vdb.delete_documents("test_collection", ["file1.txt", "file2.txt"])
 
-            result = vdb.delete_documents("test_collection", ["file1.txt", "file2.txt"])
-
-            assert result is True
-            mock_collection_obj.flush.assert_called_once()
+        assert result is True
+        mock_collection_obj.flush.assert_called_once()
 
     @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.Collection")
-    @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.connections")
-    def test_delete_documents_not_found(self, mock_connections, mock_collection):
-        """Test delete_documents method when document not found."""
+    def test_delete_documents_not_found(self, mock_collection):
+        """delete_documents still returns True when delete_count is 0 (no real Milvus)."""
         mock_collection_obj = Mock()
         mock_resp = Mock()
         mock_resp.delete_count = 0
         mock_collection_obj.delete.return_value = mock_resp
         mock_collection.return_value = mock_collection_obj
 
-        with (
-            patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.urlparse"),
-        ):
-            vdb = MilvusVDB(
-                embedding_model=Mock(),
-                milvus_uri="http://localhost:19530",
-                collection_name="test_collection",
-                config=Mock(),
-            )
+        vdb = _make_dummy_milvus_vdb_for_delete()
+        result = vdb.delete_documents("test_collection", ["file1.txt"])
 
-            result = vdb.delete_documents("test_collection", ["file1.txt"])
-
-            assert result is True
+        assert result is True
 
     @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.Collection")
-    @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.connections")
-    def test_delete_documents_milvus_exception(self, mock_connections, mock_collection):
-        """Test delete_documents method with MilvusException fallback."""
-
+    def test_delete_documents_milvus_exception(self, mock_collection):
+        """First delete raises MilvusException; the second delete call succeeds (no real Milvus)."""
         mock_collection_obj = Mock()
         mock_resp = Mock()
         mock_resp.delete_count = 1
-
-        # First call raises MilvusException, second call succeeds
         mock_collection_obj.delete.side_effect = [MilvusException("Error"), mock_resp]
         mock_collection.return_value = mock_collection_obj
 
-        with (
-            patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.urlparse"),
-        ):
-            vdb = MilvusVDB(
-                embedding_model=Mock(),
-                milvus_uri="http://localhost:19530",
-                collection_name="test_collection",
-                config=Mock(),
-            )
+        vdb = _make_dummy_milvus_vdb_for_delete()
+        result = vdb.delete_documents("test_collection", ["file1.txt"])
 
-            result = vdb.delete_documents("test_collection", ["file1.txt"])
+        assert result is True
+        assert mock_collection_obj.delete.call_count == 2
 
-            assert result is True
-            assert mock_collection_obj.delete.call_count == 2
+    @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.Collection")
+    def test_delete_documents_escapes_filter_injection(self, mock_collection):
+        """Source values containing single quotes must not break out of the
+        Milvus filter expression literal (CWE-89: filter expression injection).
+        """
+        mock_collection_obj = Mock()
+        mock_resp = Mock()
+        mock_resp.delete_count = 1
+        mock_collection_obj.delete.return_value = mock_resp
+        mock_collection.return_value = mock_collection_obj
+
+        vdb = _make_dummy_milvus_vdb_for_delete()
+        # Attempt classic boolean-injection payload
+        malicious = "evil.pdf' or '1'=='1"
+        vdb.delete_documents("test_collection", [malicious])
+
+        # Inspect the filter expression passed to Collection.delete
+        call_args = mock_collection_obj.delete.call_args_list[0]
+        expr = call_args.args[0] if call_args.args else call_args.kwargs.get("expr", "")
+        # The injection fragment must be neutralised by escaping the quote
+        assert " or '1'=='1'" not in expr, (
+            f"delete_documents is vulnerable to filter expression injection: {expr!r}"
+        )
+        # Only the backslash-escaped form counts: "\\'" is backslash + quote, a bare "\'" is just a quote
+        assert "evil.pdf\\' or \\'1\\'==\\'1" in expr, (
+            f"Expected escaped quotes in filter expression, got: {expr!r}"
+        )
 
     @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.MilvusClient")
     @patch("nvidia_rag.utils.vdb.milvus.milvus_vdb.connections")
diff --git a/tests/unit/test_utils/test_vdb/test_vdb_ingest_base.py b/tests/unit/test_utils/test_vdb/test_vdb_ingest_base.py
new file mode 100644
index 000000000..a13877d05
--- /dev/null
+++ b/tests/unit/test_utils/test_vdb/test_vdb_ingest_base.py
@@ -0,0 +1,129 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for SerializedVDBWrapper from vdb_ingest_base module."""
+
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from unittest.mock import MagicMock
+
+import pytest
+from nvidia_rag.utils.vdb.vdb_ingest_base import SerializedVDBWrapper
+
+
+@pytest.fixture
+def mock_vdb_op():
+    """Provide a MagicMock VDB op with a canned return value per stubbed method."""
+    vdb_op = MagicMock()
+    vdb_op.some_read_method.return_value = "read_result"
+    vdb_op.create_index.return_value = "index_result"
+    vdb_op.write_to_index.return_value = "write_result"
+    vdb_op.run.return_value = "run_result"
+    vdb_op.run_async.return_value = "run_async_result"
+    return vdb_op
+
+
+@pytest.fixture
+def wrapper(mock_vdb_op):
+    """Return the SerializedVDBWrapper under test, wrapping the mock_vdb_op fixture."""
+    return SerializedVDBWrapper(mock_vdb_op)
+
+
+@pytest.mark.skipif(
+    SerializedVDBWrapper is None,
+    reason="nv_ingest_client not installed",
+)
+class TestSerializedVDBWrapper:
+    """Delegation, isinstance, serialization and error tests for SerializedVDBWrapper."""
+
+    def test_run_async_delegates_to_wrapped_op(self, wrapper, mock_vdb_op):
+        """Test that run_async delegates to the wrapped VDB op."""
+        records = [{"data": "test"}]
+        result = wrapper.run_async(records)
+
+        mock_vdb_op.run_async.assert_called_once_with(records)
+        assert result == "run_async_result"
+
+    def test_run_delegates_to_wrapped_op(self, wrapper, mock_vdb_op):
+        """Test that run delegates to the wrapped VDB op."""
+        records = [{"data": "test"}]
+        result = wrapper.run(records)
+
+        mock_vdb_op.run.assert_called_once_with(records)
+        assert result == "run_result"
+
+    def test_write_to_index_delegates_with_kwargs(self, wrapper, mock_vdb_op):
+        """Test that write_to_index passes kwargs to the wrapped VDB op."""
+        records = [{"data": "test"}]
+        result = wrapper.write_to_index(records, collection_name="test")
+
+        mock_vdb_op.write_to_index.assert_called_once_with(
+            records, collection_name="test"
+        )
+        assert result == "write_result"
+
+    def test_create_index_delegates_with_kwargs(self, wrapper, mock_vdb_op):
+        """Test that create_index passes kwargs to the wrapped VDB op."""
+        result = wrapper.create_index(collection_name="test")
+
+        mock_vdb_op.create_index.assert_called_once_with(collection_name="test")
+        assert result == "index_result"
+
+    def test_getattr_delegates_non_overridden_methods(self, wrapper, mock_vdb_op):
+        """Test that non-write methods pass through to the wrapped VDB op."""
+        result = wrapper.some_read_method()
+        mock_vdb_op.some_read_method.assert_called_once()
+        assert result == "read_result"
+
+    def test_isinstance_check_with_vdb(self, wrapper):
+        """Test that wrapper passes isinstance check for VDB (was a real bug)."""
+        from nv_ingest_client.util.vdb.adt_vdb import VDB
+
+        assert isinstance(wrapper, VDB)
+
+    def test_write_methods_are_serialized(self, mock_vdb_op):
+        """Test that a second run() cannot start until the first run() has finished."""
+        execution_log = []
+        lock_held = threading.Event()
+
+        def locked_write(records):
+            batch = records[0]
+            execution_log.append(f"acquired_{batch}")
+            if batch == "batch_1":
+                lock_held.set()
+                threading.Event().wait(0.1)  # pause inside batch_1 so an unserialized batch_2 would overlap
+            execution_log.append(f"released_{batch}")
+            return "done"
+
+        mock_vdb_op.run.side_effect = locked_write
+        wrapper = SerializedVDBWrapper(mock_vdb_op)
+
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            f1 = executor.submit(wrapper.run, ["batch_1"])
+            assert lock_held.wait(timeout=2), "batch_1 write never started"
+            f2 = executor.submit(wrapper.run, ["batch_2"])
+            f1.result(timeout=5)
+            f2.result(timeout=5)
+
+        assert execution_log.index("released_batch_1") < execution_log.index(
+            "acquired_batch_2"
+        )
+
+    def test_wrapper_propagates_exceptions(self, wrapper, mock_vdb_op):
+        """Test that exceptions from the wrapped op propagate through the lock."""
+        mock_vdb_op.run.side_effect = ValueError("indexing failed")
+
+        with pytest.raises(ValueError, match="indexing failed"):
+            wrapper.run([{"data": "test"}])
diff --git a/uv.lock b/uv.lock
index dbbf310f2..090443721 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1309,16 +1309,17 @@ wheels = [
 
 [[package]]
 name = "langchain-nvidia-ai-endpoints"
-version = "1.0.3"
+version = "1.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohttp" },
     { name = "filetype" },
     { name = "langchain-core" },
+    { name = "requests" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5a/9e/30814da280f7a79b168f83180f6a0396c166f86a566e56bb9877bf562611/langchain_nvidia_ai_endpoints-1.0.3.tar.gz", hash = "sha256:11c48fd24e4a9d4c86c65bcef943400f4e709497c93254c7dc97c43f68c2be89", size = 46526, upload-time = "2026-01-28T22:04:33.93Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/4b/e417af1b2b7f861f37e26bf4fa4b05cda4052002e3f84a966f0735baf94f/langchain_nvidia_ai_endpoints-1.2.0.tar.gz", hash = "sha256:4bd63b812707ea348a86539001aa9a89b3cba3ee56ade7379247a955e4bfd3eb", size = 53851, upload-time = "2026-03-10T17:55:08.127Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/67/04/c83f61106a245b74de11c1e075c1cc1e70462ece1dd9fc0584ad992a776d/langchain_nvidia_ai_endpoints-1.0.3-py3-none-any.whl", hash = "sha256:e5f170ad0a335637298bb90fb3df119793821e316355f61ab82f0106913eebbf", size = 50130, upload-time = "2026-01-28T22:04:33.065Z" },
+    { url = "https://files.pythonhosted.org/packages/66/e4/186f1a99e4d30bd91c8438d024dc73a71c8f7e0657c7acb6e79658aa19cf/langchain_nvidia_ai_endpoints-1.2.0-py3-none-any.whl", hash = "sha256:c8e075d5b3d31216374af0cfa9e690ab28ada3ebbde34dd6d36fe16a26d883cc", size = 58269, upload-time = "2026-03-10T17:55:06.339Z" },
 ]
 
 [[package]]
@@ -1349,7 +1350,7 @@ wheels = [
 
 [[package]]
 name = "langgraph"
-version = "1.0.7"
+version = "1.0.10"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain-core" },
@@ -1359,9 +1360,9 @@ dependencies = [
     { name = "pydantic" },
     { name = "xxhash" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/72/5b/f72655717c04e33d3b62f21b166dc063d192b53980e9e3be0e2a117f1c9f/langgraph-1.0.7.tar.gz", hash = "sha256:0cfdfee51e6e8cfe503ecc7367c73933437c505b03fa10a85c710975c8182d9a", size = 497098, upload-time = "2026-01-22T16:57:47.303Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/55/92/14df6fefba28c10caf1cb05aa5b8c7bf005838fe32a86d903b6c7cc4018d/langgraph-1.0.10.tar.gz", hash = "sha256:73bd10ee14a8020f31ef07e9cd4c1a70c35cc07b9c2b9cd637509a10d9d51e29", size = 511644, upload-time = "2026-02-27T21:04:38.743Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7e/0e/fe80144e3e4048e5d19ccdb91ac547c1a7dc3da8dbd1443e210048194c14/langgraph-1.0.7-py3-none-any.whl", hash = "sha256:9d68e8f8dd8f3de2fec45f9a06de05766d9b075b78fb03171779893b7a52c4d2", size = 157353, upload-time = "2026-01-22T16:57:45.997Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/60/260e0c04620a37ba8916b712766c341cc5fc685dabc6948c899494bbc2ae/langgraph-1.0.10-py3-none-any.whl", hash = "sha256:7c298bef4f6ea292fcf9824d6088fe41a6727e2904ad6066f240c4095af12247", size = 160920, upload-time = "2026-02-27T21:04:35.932Z" },
 ]
 
 [[package]]
@@ -1379,15 +1380,15 @@ wheels = [
 
 [[package]]
 name = "langgraph-prebuilt"
-version = "1.0.7"
+version = "1.0.8"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain-core" },
     { name = "langgraph-checkpoint" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a7/59/711aecd1a50999456850dc328f3cad72b4372d8218838d8d5326f80cb76f/langgraph_prebuilt-1.0.7.tar.gz", hash = "sha256:38e097e06de810de4d0e028ffc0e432bb56d1fb417620fb1dfdc76c5e03e4bf9", size = 163692, upload-time = "2026-01-22T16:45:22.801Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0d/06/dd61a5c2dce009d1b03b1d56f2a85b3127659fdddf5b3be5d8f1d60820fb/langgraph_prebuilt-1.0.8.tar.gz", hash = "sha256:0cd3cf5473ced8a6cd687cc5294e08d3de57529d8dd14fdc6ae4899549efcf69", size = 164442, upload-time = "2026-02-19T18:14:39.083Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/47/49/5e37abb3f38a17a3487634abc2a5da87c208cc1d14577eb8d7184b25c886/langgraph_prebuilt-1.0.7-py3-none-any.whl", hash = "sha256:e14923516504405bb5edc3977085bc9622c35476b50c1808544490e13871fe7c", size = 35324, upload-time = "2026-01-22T16:45:21.784Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/41/ec966424ad3f2ed3996d24079d3342c8cd6c0bd0653c12b2a917a685ec6c/langgraph_prebuilt-1.0.8-py3-none-any.whl", hash = "sha256:d16a731e591ba4470f3e313a319c7eee7dbc40895bcf15c821f985a3522a7ce0", size = 35648, upload-time = "2026-02-19T18:14:37.611Z" },
 ]
 
 [[package]]
@@ -1753,7 +1754,7 @@ wheels = [
 
 [[package]]
 name = "nv-ingest-api"
-version = "26.1.1"
+version = "26.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "backoff" },
@@ -1767,14 +1768,14 @@ dependencies = [
     { name = "tritonclient" },
     { name = "universal-pathlib" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/41/eb/e0469e918d617175e1d3bcf952f0ca8e9b7756fce7817d5386ac4ddca154/nv_ingest_api-26.1.1.tar.gz", hash = "sha256:063d51f1d560bf03d7a595ff3ecebac1bffae45607cf6bd01e4fa8ca2265a884", size = 259532, upload-time = "2026-01-13T23:44:11.112Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f3/bd/e6e885cca94b89723468d4c32d52d30ccc0235ebe2f1db33b0605402d6b8/nv_ingest_api-26.1.2.tar.gz", hash = "sha256:fea08f9bda064938a5876f1610ef0b92c6a1e4943130c564f329b0c87efa3daf", size = 259604, upload-time = "2026-01-21T14:06:27.092Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/42/51/cd93750a1c5797d8d12e843bb13645b0930e4490f96ca49f458aeb641018/nv_ingest_api-26.1.1-py3-none-any.whl", hash = "sha256:e4f8b860765cedba72622782692e2ffc69a100fd61956e8b8a81a47e6c852d66", size = 357481, upload-time = "2026-01-13T23:44:07.943Z" },
+    { url = "https://files.pythonhosted.org/packages/78/66/21e30e658578b7e5ab30857b99e9a0a5c91728ffdca13dadc3d3dba58b98/nv_ingest_api-26.1.2-py3-none-any.whl", hash = "sha256:8e7539a6b7d52afd821c0030e3197cfffddc011d26a8b093cf7b5ffa8addf02d", size = 357537, upload-time = "2026-01-21T14:06:24.321Z" },
 ]
 
 [[package]]
 name = "nv-ingest-client"
-version = "26.1.1"
+version = "26.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "build" },
@@ -1789,14 +1790,14 @@ dependencies = [
     { name = "setuptools" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/cb/c4/ae5e2b00a8ffdfc1a3cf660ded68c188140ca433b22446adbb72ccfc455d/nv_ingest_client-26.1.1.tar.gz", hash = "sha256:26d6844eac946b4fdb8da2f5f1e77feb22b52ac21e6d772cf6b1c8c21cef4bb8", size = 126865, upload-time = "2026-01-13T23:44:15.615Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/62/386bc4a336b91df9c65afc18d905bb1fe3dd44ef1ce038895a701cba6035/nv_ingest_client-26.1.2.tar.gz", hash = "sha256:7ea4a35d4e7051031c273eb2b15170a0555462b702602e5c4fdce947bd39d446", size = 126061, upload-time = "2026-01-21T14:06:31.836Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/64/ea/042ad6d8ddfa887667159af8b473835bfcc5b9f51ba6e82ff14078e59a3e/nv_ingest_client-26.1.1-py3-none-any.whl", hash = "sha256:df9906c7021e6a1ae64140fbe7a345679b1cfad7dfa486442e38ca75d64f2b39", size = 147197, upload-time = "2026-01-13T23:44:12.761Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/a0/6f082f42ba1ba4e3de751deee98099114405489389d82b840024fa26512e/nv_ingest_client-26.1.2-py3-none-any.whl", hash = "sha256:49ec81c2e2470509fc527d778d886caebe05a7f3012918eef2fd67922ea9b8b4", size = 146091, upload-time = "2026-01-21T14:06:28.586Z" },
 ]
 
 [[package]]
 name = "nvidia-rag"
-version = "2.4.0.dev0"
+version = "2.5.0.dev0"
 source = { virtual = "." }
 dependencies = [
     { name = "anyio" },
@@ -1918,16 +1919,16 @@ requires-dist = [
     { name = "langchain-elasticsearch", marker = "extra == 'all'", specifier = ">=0.3" },
     { name = "langchain-elasticsearch", marker = "extra == 'elasticsearch'", specifier = ">=0.3" },
     { name = "langchain-milvus", specifier = ">=0.3.0" },
-    { name = "langchain-nvidia-ai-endpoints", specifier = ">=1.0.3" },
+    { name = "langchain-nvidia-ai-endpoints", specifier = ">=1.2.0" },
     { name = "langchain-openai", marker = "extra == 'all'", specifier = ">=0.2" },
     { name = "langchain-openai", marker = "extra == 'ingest'", specifier = ">=0.2" },
     { name = "langchain-openai", marker = "extra == 'rag'", specifier = ">=0.2" },
     { name = "lark", specifier = ">=1.2.2" },
     { name = "minio", specifier = ">=7.2,<8.0" },
-    { name = "nv-ingest-api", marker = "extra == 'all'", specifier = "==26.1.1" },
-    { name = "nv-ingest-api", marker = "extra == 'ingest'", specifier = "==26.1.1" },
-    { name = "nv-ingest-client", marker = "extra == 'all'", specifier = "==26.1.1" },
-    { name = "nv-ingest-client", marker = "extra == 'ingest'", specifier = "==26.1.1" },
+    { name = "nv-ingest-api", marker = "extra == 'all'", specifier = "==26.1.2" },
+    { name = "nv-ingest-api", marker = "extra == 'ingest'", specifier = "==26.1.2" },
+    { name = "nv-ingest-client", marker = "extra == 'all'", specifier = "==26.1.2" },
+    { name = "nv-ingest-client", marker = "extra == 'ingest'", specifier = "==26.1.2" },
     { name = "opentelemetry-api", marker = "extra == 'all'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-api", marker = "extra == 'ingest'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-api", marker = "extra == 'rag'", specifier = ">=1.29,<2.0" },
diff --git a/variables.env b/variables.env
index 790e1ad7c..b9003e7b5 100644
--- a/variables.env
+++ b/variables.env
@@ -15,8 +15,8 @@ DOCKER_VOLUME_DIRECTORY=vectordb
 
 # ==== Endpoints for using on-prem NIMs ====
 APP_LLM_SERVERURL=nim-llm:8000
-APP_EMBEDDINGS_SERVERURL=nemoretriever-embedding-ms:8000/v1
-APP_RANKING_SERVERURL=nemoretriever-ranking-ms:8000
+APP_EMBEDDINGS_SERVERURL=nemotron-embedding-ms:8000/v1
+APP_RANKING_SERVERURL=nemotron-ranking-ms:8000
 OCR_GRPC_ENDPOINT=nemoretriever-ocr:8001
 OCR_HTTP_ENDPOINT=http://nemoretriever-ocr:8000/v1/infer
 OCR_INFER_PROTOCOL=grpc
@@ -35,11 +35,11 @@ YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=grpc
 # OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
 # OCR_INFER_PROTOCOL=http
 # OCR_MODEL_NAME=scene_text_ensemble
-# YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3
+# YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
 # YOLOX_INFER_PROTOCOL=http
-# YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
+# YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
 # YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
-# YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
+# YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
 # YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http