diff --git a/.gitattributes b/.gitattributes
index c8d189184..0a7e469ce 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,4 @@
 data/dataset.zip filter=lfs diff=lfs merge=lfs -text
 data/ filter=lfs diff=lfs merge=lfs -text
+examples/rag_event_ingest/data/**/*.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/rag_event_ingest/data/**/*.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/ISSUE_TEMPLATE/bug_report_form.yml b/.github/ISSUE_TEMPLATE/bug_report_form.yml
deleted file mode 100644
index 99eb5fd2f..000000000
--- a/.github/ISSUE_TEMPLATE/bug_report_form.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: Bug Report
-description: File a bug report
-title: "[BUG]: "
-labels: ["bug"]
-
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-
-  - type: input
-    id: version
-    attributes:
-      label: Version
-      description: What version of rag are you running?
-      placeholder: "example: 1.0.0"
-    validations:
-      required: true
-
-  - type: textarea
-    id: description
-    attributes:
-      label: Describe the bug.
-      description: Also tell us, what did you expect to happen?
-      placeholder: XYZ occured, I expected QRS results
-    validations:
-      required: true
-
-  - type: textarea
-    id: mvr
-    attributes:
-      label: Minimum reproducible example
-      description: Please supply a [minimum reproducible code example](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) here
-      render: shell
-
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please paste relevant error and log output here
-      render: shell
-
-  - type: textarea
-    id: env-printout
-    attributes:
-      label: Full env printout
-      description: Please run and paste the output of the `print_env.sh` script here, to gather any other relevant environment details
-      render: shell
-
-  - type: textarea
-    id: misc
-    attributes:
-      label: Other/Misc.
-      description: Please enter any other helpful information here.
-
-  - type: checkboxes
-    id: terms
-    attributes:
-      label: Code of Conduct
-      description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/CODE_OF_CONDUCT.md)
-      options:
-        - label: I agree to follow __THIS PROJECT__'s Code of Conduct
-          required: true
-        - label: I have searched the [open bugs](https://github.com/NVIDIA-AI-Blueprints/rag/issues?q=is%3Aopen+is%3Aissue+label%3Abug) and have found no duplicates for this bug report
-          required: true
diff --git a/.github/ISSUE_TEMPLATE/documentation_request_correction.yml b/.github/ISSUE_TEMPLATE/documentation_request_correction.yml
deleted file mode 100644
index f87a7b9f3..000000000
--- a/.github/ISSUE_TEMPLATE/documentation_request_correction.yml
+++ /dev/null
@@ -1,70 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: Documentation - Correction/Update Request
-description: Request corrections or updates to existing documentation
-title: "[DOC]: "
-labels: ["doc"]
-
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to improve our documentation!
-
-  - type: dropdown
-    id: criticality
-    attributes:
-      label: How would you describe the priority of this documentation request
-      options:
-        - Critical (currently preventing usage)
-        - High
-        - Medium
-        - Low (would be nice)
-    validations:
-      required: true
-
-  - type: input
-    id: correction_location
-    attributes:
-      label: Please provide a link or source to the relevant docs
-      placeholder: "ex: https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/README.md"
-    validations:
-      required: true
-
-  - type: textarea
-    id: problem
-    attributes:
-      label: Describe the problems in the documentation
-      placeholder: The documents say to use foo.func(args) however an AttributeError is thrown
-    validations:
-      required: true
-
-  - type: textarea
-    id: correction
-    attributes:
-      label: (Optional) Propose a correction
-      placeholder: foo.func() was deprecated, replace documentation with foo.new_func()
-
-  - type: checkboxes
-    id: terms
-    attributes:
-      label: Code of Conduct
-      description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/CODE_OF_CONDUCT.md)
-      options:
-        - label: I agree to follow __PROJECT__'s Code of Conduct
-          required: true
-        - label: I have searched the [open documentation issues](https://github.com/NVIDIA-AI-Blueprints/rag/issues?q=is%3Aopen+is%3Aissue+label%3Adoc) and have found no duplicates for this bug report
-          required: true
diff --git a/.github/ISSUE_TEMPLATE/documentation_request_new.yml b/.github/ISSUE_TEMPLATE/documentation_request_new.yml
deleted file mode 100644
index 0d1b543f2..000000000
--- a/.github/ISSUE_TEMPLATE/documentation_request_new.yml
+++ /dev/null
@@ -1,63 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: Documentation - New Documentation Request
-description: Request additions to rag documentation
-title: "[DOC]: "
-labels: ["doc"]
-
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to improve our documentation!
-
-  - type: dropdown
-    id: criticality
-    attributes:
-      label: How would you describe the priority of this documentation request
-      options:
-        - Critical (currently preventing usage)
-        - High
-        - Medium
-        - Low (would be nice)
-    validations:
-      required: true
-
-  - type: textarea
-    id: problem
-    attributes:
-      label: Describe the future/missing documentation
-      placeholder: A code snippet mentions function foo(args) but I cannot find any documentation on it.
-    validations:
-      required: true
-
-  - type: textarea
-    id: search_locs
-    attributes:
-      label: Where have you looked?
-      placeholder: |
-       https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/README.md
-
-  - type: checkboxes
-    id: terms
-    attributes:
-      label: Code of Conduct
-      description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/CODE_OF_CONDUCT.md)
-      options:
-        - label: I agree to follow rag's Code of Conduct
-          required: true
-        - label: I have searched the [open documentation issues](https://github.com/NVIDIA-AI-Blueprints/rag/issues?q=is%3Aopen+is%3Aissue+label%3Adoc) and have found no duplicates for this bug report
-          required: true
diff --git a/.github/ISSUE_TEMPLATE/feature_request_form.yml b/.github/ISSUE_TEMPLATE/feature_request_form.yml
deleted file mode 100644
index b3ba5787f..000000000
--- a/.github/ISSUE_TEMPLATE/feature_request_form.yml
+++ /dev/null
@@ -1,109 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: Feature Request Form
-description: Request new or improved functionality or changes to existing functionality
-title: "[FEA]: "
-labels: ["feature request"]
-
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this feature request!
-
-  - type: dropdown
-    id: new_or_improvement
-    attributes:
-      label: Is this a new feature, an improvement, or a change to existing functionality?
-      options:
-        - New Feature
-        - Improvement
-        - Change
-    validations:
-      required: true
-
-  - type: dropdown
-    id: criticality
-    attributes:
-      label: How would you describe the priority of this feature request
-      options:
-        - Critical (currently preventing usage)
-        - High
-        - Medium
-        - Low (would be nice)
-    validations:
-      required: true
-
-  - type: textarea
-    id: problem
-    attributes:
-      label: Please provide a clear description of problem this feature solves
-      description: Real usage examples are especially helpful, non-code.
-    validations:
-      required: true
-
-  - type: textarea
-    id: Feature_Description
-    attributes:
-      label: Feature Description
-      description: Please provide clear description of the feature you request (refer to [User Story format](https://www.atlassian.com/agile/project-management/user-stories#:~:text=User%20story%20template%20and%20examples) and [EARS format](https://ieeexplore.ieee.org/document/5328509))
-      placeholder: >
-        For new feature request, please use one of the following format to describe the feature
-          1. From End-user perspective, use the following user story format 
-              As a <persona>, I <want to>, <so that>.
-          2. From System perspective, use the following EARS format
-              <Pre-Condition> <System> shall  <process> <object to be process> <condition>
-        For changing or improving existing feature, it's recommended to provide the previoius Feature Request ID.
-    validations:
-      required: true
-
-  - type: textarea
-    id: solution
-    attributes:
-      label: Describe your ideal solution
-      description: Please describe the functionality you would like added.
-      placeholder: >
-        A new function that takes in the information in this form, and triages the issue
-
-        def feature_request(form_info):
-            parse(form_info)
-            return triage_outcome
-    validations:
-      required: true
-
-  - type: textarea
-    id: alternatives
-    attributes:
-      label: Describe any alternatives you have considered
-      description: List any other libraries, or approaches you have looked at or tried.
-      placeholder: I have looked at library xyz and qrs, but they do not offer GPU accleration
-
-  - type: textarea
-    id: misc
-    attributes:
-      label: Additional context
-      description: Add any other context, code examples, or references to existing implementations about the feature request here. If applicable, please list the modules affected.
-
-  - type: checkboxes
-    id: terms
-    attributes:
-      label: Code of Conduct
-      description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/CODE_OF_CONDUCT.md)
-      options:
-        - label: I agree to follow rag's Code of Conduct
-          required: true
-        - label: I have searched the [open feature requests](https://github.com/NVIDIA-AI-Blueprints/rag/issues?q=is%3Aopen+is%3Aissue+label%3A%22feature+request%22%2Cimprovement%2Cenhancement) and have found no duplicates for this feature request
-          required: true
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
deleted file mode 100644
index 3c5b84e91..000000000
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ /dev/null
@@ -1,10 +0,0 @@
-## Description
-<!-- Provide a standalone description of changes in this PR. -->
-<!-- Reference any issues closed by this PR with "closes #1234". -->
-
-## Checklist
-- [ ] I am familiar with the [Contributing Guidelines](../CONTRIBUTING.md).
-- [ ] All commits are signed-off (`git commit -s`) and GPG signed (`git commit -S`).
-- [ ] New or existing tests cover these changes.
-- [ ] The documentation is up to date with these changes.
-- [ ] If adjusting docker-compose.yaml environment variables have you ensured those are mimicked in the Helm values.yaml file.
\ No newline at end of file
diff --git a/.github/workflows/build-push-main.yml b/.github/workflows/build-push-main.yml
new file mode 100644
index 000000000..4600a51ac
--- /dev/null
+++ b/.github/workflows/build-push-main.yml
@@ -0,0 +1,81 @@
+# Builds the RAG server and ingestor server images and pushes them to OCIR,
+# tagged with the contents of the repository's VERSION file.
+name: Build and Push on Main
+
+on:
+  push:
+    branches:
+      - oracle/dev
+
+# The job only reads the repository; registry access uses the OCI secrets below.
+permissions:
+  contents: read
+
+jobs:
+  build-and-push-rag:
+    runs-on: ubuntu-latest
+    env:
+      # OCI AUTH
+      OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
+      OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
+      OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
+      OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
+      OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
+      # IMAGE VARS
+      INGEST_IMAGE_NAME: corrino-devops-repository/nvidia-rag-ingestion-oci
+      RAG_IMAGE_NAME: corrino-devops-repository/nvidia-rag-retrieval-oci
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log into OCIR
+        uses: oracle-actions/login-ocir@v1.3.0
+        id: login-ocir
+        with:
+          auth_token: ${{ secrets.OCI_AUTH_TOKEN }}
+
+      - name: Read version
+        id: version
+        run: |
+          # Fail here instead of building an image with an empty or malformed tag.
+          version="$(tr -d '[:space:]' < VERSION)"
+          if [ -z "$version" ]; then
+            echo "::error file=VERSION::VERSION file is empty"
+            exit 1
+          fi
+          echo "value=$version" >> "$GITHUB_OUTPUT"
+
+      # Tags and image names reach the shell as environment variables rather than
+      # inline expression substitution, so their values are never parsed as script.
+      - name: Build RAG Docker Image
+        env:
+          IMAGE_TAG: "${{ steps.version.outputs.value }}"
+        run: |
+          docker build -t "${RAG_IMAGE_NAME}:${IMAGE_TAG}" -f src/nvidia_rag/rag_server/Dockerfile .
+
+      - name: Tag and Push RAG Docker Image
+        env:
+          IMAGE_REMOTE: "ord.ocir.io/${{ secrets.OCI_TENANCY_NAMESPACE }}/${{ env.RAG_IMAGE_NAME }}"
+          IMAGE_TAG: "${{ steps.version.outputs.value }}"
+        run: |
+          docker tag "${RAG_IMAGE_NAME}:${IMAGE_TAG}" "${IMAGE_REMOTE}:${IMAGE_TAG}"
+          docker push "${IMAGE_REMOTE}:${IMAGE_TAG}"
+
+      - name: Build Ingest Docker Image
+        env:
+          IMAGE_TAG: "${{ steps.version.outputs.value }}"
+        run: |
+          docker build -t "${INGEST_IMAGE_NAME}:${IMAGE_TAG}" -f src/nvidia_rag/ingestor_server/Dockerfile .
+
+      - name: Tag and Push Ingest Docker Image
+        env:
+          IMAGE_REMOTE: "ord.ocir.io/${{ secrets.OCI_TENANCY_NAMESPACE }}/${{ env.INGEST_IMAGE_NAME }}"
+          IMAGE_TAG: "${{ steps.version.outputs.value }}"
+        run: |
+          docker tag "${INGEST_IMAGE_NAME}:${IMAGE_TAG}" "${IMAGE_REMOTE}:${IMAGE_TAG}"
+          docker push "${IMAGE_REMOTE}:${IMAGE_TAG}"
diff --git a/.github/workflows/build-push-pr.yml b/.github/workflows/build-push-pr.yml
new file mode 100644
index 000000000..4ba07aa23
--- /dev/null
+++ b/.github/workflows/build-push-pr.yml
@@ -0,0 +1,78 @@
+# Builds the RAG server and ingestor server images for a pull request and
+# pushes them to OCIR tagged as pr-<short sha>.
+name: Build and Push on PR
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+# The job only reads the repository; registry access uses the OCI secrets below.
+permissions:
+  contents: read
+
+jobs:
+  build-and-push-images:
+    # Secrets are not provided to workflows triggered from forks, so the OCIR
+    # login and push can only succeed for branches in this repository.
+    if: github.event.pull_request.head.repo.full_name == github.repository
+    runs-on: ubuntu-latest
+    env:
+      # OCI AUTH
+      OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
+      OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
+      OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
+      OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
+      OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
+      # IMAGE VARS
+      RAG_IMAGE_NAME: corrino-devops-repository/nvidia-rag-retrieval-oci
+      INGEST_IMAGE_NAME: corrino-devops-repository/nvidia-rag-ingestion-oci
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log into OCIR
+        uses: oracle-actions/login-ocir@v1.3.0
+        id: login-ocir
+        with:
+          auth_token: ${{ secrets.OCI_AUTH_TOKEN }}
+
+      - name: Compute short SHA
+        id: short-sha
+        # On pull_request events GITHUB_SHA is the merge commit of the PR, which is
+        # the revision checked out above.
+        run: echo "value=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
+
+      # Tags and image names reach the shell as environment variables rather than
+      # inline expression substitution, so their values are never parsed as script.
+      - name: Build RAG Docker Image
+        env:
+          IMAGE_TAG: "pr-${{ steps.short-sha.outputs.value }}"
+        run: |
+          docker build -t "${RAG_IMAGE_NAME}:${IMAGE_TAG}" -f src/nvidia_rag/rag_server/Dockerfile .
+
+      - name: Tag and Push RAG Docker Image
+        env:
+          IMAGE_REMOTE: "ord.ocir.io/${{ secrets.OCI_TENANCY_NAMESPACE }}/${{ env.RAG_IMAGE_NAME }}"
+          IMAGE_TAG: "pr-${{ steps.short-sha.outputs.value }}"
+        run: |
+          docker tag "${RAG_IMAGE_NAME}:${IMAGE_TAG}" "${IMAGE_REMOTE}:${IMAGE_TAG}"
+          docker push "${IMAGE_REMOTE}:${IMAGE_TAG}"
+
+      - name: Build Ingest Docker Image
+        env:
+          IMAGE_TAG: "pr-${{ steps.short-sha.outputs.value }}"
+        run: |
+          docker build -t "${INGEST_IMAGE_NAME}:${IMAGE_TAG}" -f src/nvidia_rag/ingestor_server/Dockerfile .
+
+      - name: Tag and Push Ingest Docker Image
+        env:
+          IMAGE_REMOTE: "ord.ocir.io/${{ secrets.OCI_TENANCY_NAMESPACE }}/${{ env.INGEST_IMAGE_NAME }}"
+          IMAGE_TAG: "pr-${{ steps.short-sha.outputs.value }}"
+        run: |
+          docker tag "${INGEST_IMAGE_NAME}:${IMAGE_TAG}" "${IMAGE_REMOTE}:${IMAGE_TAG}"
+          docker push "${IMAGE_REMOTE}:${IMAGE_TAG}"
diff --git a/.github/workflows/ci-pipeline.yml b/.github/workflows/ci-pipeline.yml
deleted file mode 100644
index 6bea384e4..000000000
--- a/.github/workflows/ci-pipeline.yml
+++ /dev/null
@@ -1,966 +0,0 @@
-name: CI Pipeline
-
-# Workflow control - mirroring GitLab workflow rules
-on:
-  # Run on PRs (including forks) - safe jobs without secrets run for all, secret jobs only for same-repo PRs
-  pull_request:
-  # Run on push only for default branch (e.g. after merge); avoids duplicate runs when pushing to a PR branch
-  push:
-    branches:
-      - develop
-      - main
-      - 'release-**'
-  workflow_dispatch:
-  schedule:
-    # Run nightly at 7.30 PM UTC or 1 AM IST
-    - cron: '30 19 * * *'
-
-env:
-  # Common environment variables
-  MILVUS_VERSION: v2.6.5
-  NV_INGEST_MAX_UTIL: 8
-  APP_VECTORSTORE_ENABLEGPUSEARCH: False
-  APP_VECTORSTORE_ENABLEGPUINDEX: False
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  # ============================================================================
-  # TEST STAGE JOBS
-  # ============================================================================
-
-  helm-blueprint-compliance:
-    name: Helm Blueprint Compliance
-    runs-on: ubuntu-latest
-    # Only run if push to develop OR PR from same repo (not fork) - needs secrets
-    if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || github.event.pull_request.head.repo.full_name == github.repository
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Helm
-        uses: azure/setup-helm@v4
-        with:
-          version: 'latest'
-
-      - name: Add Helm repositories
-        env:
-          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          helm repo add nvidia-nim https://helm.ngc.nvidia.com/nim/nvidia/ --username='$oauthtoken' --password=$NGC_API_KEY
-          helm repo add nim https://helm.ngc.nvidia.com/nim/ --username='$oauthtoken' --password=$NGC_API_KEY
-          helm repo add nemo-microservices https://helm.ngc.nvidia.com/nvidia/nemo-microservices --username='$oauthtoken' --password=$NGC_API_KEY
-          helm repo add baidu-nim https://helm.ngc.nvidia.com/nim/baidu --username='$oauthtoken' --password=$NGC_API_KEY
-          helm repo add nvstaging-nim https://helm.ngc.nvidia.com/nvstaging/blueprint --username='$oauthtoken' --password=$CI_NVSTAGING_BLUEPRINT_KEY
-          helm repo update
-
-      - name: Run Helm Blueprint Compliance
-        run: |
-          # Add your helm blueprint compliance checks here
-          echo "Running Helm Blueprint Compliance checks..."
-          # The actual compliance command would depend on the blueprint compliance tool
-
-  lint:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v3
-      - uses: pre-commit/action@v3.0.1
-
-  unit-tests:
-    name: Unit Tests
-    runs-on: ubuntu-latest
-    container:
-      image: python:3.12-slim
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Install system dependencies
-        run: |
-          apt-get update && apt-get install -y gcc && rm -rf /var/lib/apt/lists/*
-
-      - name: Install package with dependencies
-        run: |
-          pip install -e .[all]
-          pip install --no-cache-dir -r tests/unit/requirements-test.txt
-
-      - name: Run unit tests with coverage
-        run: |
-          python -m pytest -v -s --cov=src --cov-report=term-missing tests/unit
-
-  frontend-unit-tests:
-    name: Frontend Unit Tests
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '20'
-
-      - name: Sanitize branch name
-        id: sanitize
-        run: |
-          SANITIZED_REF="${GITHUB_REF_NAME//\//-}"
-          echo "ref_name=$SANITIZED_REF" >> $GITHUB_OUTPUT
-
-      - name: Setup pnpm
-        uses: pnpm/action-setup@v4
-        with:
-          version: 10
-
-      - name: Cache frontend dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            frontend/node_modules
-            frontend/.pnpm-store
-          key: frontend-deps-${{ github.ref_name }}
-          restore-keys: |
-            frontend-deps-
-
-      - name: Configure pnpm and install dependencies
-        working-directory: frontend
-        run: |
-          pnpm config set store-dir .pnpm-store
-          pnpm install --frozen-lockfile || pnpm install
-
-      - name: Run frontend unit tests with coverage
-        working-directory: frontend
-        run: |
-          pnpm test:coverage
-
-      - name: Upload coverage artifacts
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: frontend-coverage-${{ steps.sanitize.outputs.ref_name }}-${{ github.sha }}
-          path: frontend/coverage/
-          retention-days: 7
-
-  check-markdown-links:
-    name: Check Markdown Links
-    runs-on: ubuntu-latest
-    container:
-      image: python:3.12-slim
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Install required packages
-        run: |
-          pip install --no-cache-dir requests
-
-      - name: Run markdown link checker
-        run: |
-          python ci/check_markdown_links.py --root . --no-external
-
-  # ============================================================================
-  # INTEGRATION TESTS STAGE
-  # ============================================================================
-
-  integration-tests:
-    name: Integration Tests
-    runs-on: arc-runners-org-nvidia-ai-bp-2-gpu
-    # Only run if push to develop OR PR from same repo (not fork) - needs secrets
-    if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || github.event.pull_request.head.repo.full_name == github.repository
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Install NGC CLI
-        env:          
-          NGC_API_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Installing NGC CLI..."
-          wget --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/4.9.10/files/ngccli_linux.zip -O ngccli_linux.zip
-          unzip -o ngccli_linux.zip
-          chmod u+x ngc-cli/ngc
-          
-          # Add NGC CLI to PATH for subsequent steps
-          echo "$(pwd)/ngc-cli" >> $GITHUB_PATH
-          
-          echo "NGC CLI installed successfully"
-          
-      - name: Download test data files
-        env:
-          NGC_API_KEY: ${{ secrets.CI_NV_RAG_BLUEPRINT_KEY }}
-        run: |
-          echo "Downloading test data files to tests/data..."
-          # Create tests/data directory if it doesn't exist
-          mkdir -p tests/data
-          
-          # Download integration test dataset from NGC
-          echo "Downloading integration_test_dataset:2.4.0 from NGC..."
-          ngc registry resource download-version "0648981100760671/integration_test_dataset:2.4.0" --dest ./tests/data --org 0648981100760671
-          
-          # Move files from subdirectory to tests/data root
-          echo "Moving files to tests/data root directory..."
-          if [ -d "tests/data/integration_test_dataset_v2.4.0" ]; then
-            mv tests/data/integration_test_dataset_v2.4.0/* tests/data/
-            rmdir tests/data/integration_test_dataset_v2.4.0
-            echo "Files moved successfully"
-          else
-            echo "Warning: integration_test_dataset_v2.4.0 directory not found"
-          fi
-          
-          # Verify downloads
-          echo "Files in tests/data:"
-          ls -lh tests/data/
-          
-          echo "Test data download completed"
-
-      - name: Docker info
-        run: docker info
-
-      - name: Clean up existing containers
-        run: |
-          echo "Cleaning up existing containers..."
-          docker ps -a
-          docker stop $(docker ps -aq) || true
-          docker rm $(docker ps -aq) || true
-
-      - name: Load common environment variables
-        run: |
-          echo "Loading common environment variables..."
-          export TAG=$(echo ${GITHUB_REF_NAME} | sed 's/[^a-zA-Z0-9]/-/g')-${GITHUB_SHA::7}
-          export NGC_API_KEY=${{ secrets.NGC_API_KEY }}
-          export DOCKER_VOLUME_DIRECTORY=/tmp/milvus-${MILVUS_VERSION}
-          export INGESTOR_SERVER_EXTERNAL_VOLUME_MOUNT=/tmp/ingestor-server-data
-          echo "TAG=$TAG" >> $GITHUB_ENV
-          echo "NGC_API_KEY=$NGC_API_KEY" >> $GITHUB_ENV
-          echo "DOCKER_VOLUME_DIRECTORY=$DOCKER_VOLUME_DIRECTORY" >> $GITHUB_ENV
-          echo "INGESTOR_SERVER_EXTERNAL_VOLUME_MOUNT=$INGESTOR_SERVER_EXTERNAL_VOLUME_MOUNT" >> $GITHUB_ENV
-          
-          # Load nvdev.env and export all variables to GITHUB_ENV
-          if [ -f ./deploy/compose/nvdev.env ]; then
-            set -a
-            source ./deploy/compose/nvdev.env
-            set +a
-            # Export all variables from nvdev.env to GITHUB_ENV
-            grep -E '^export ' ./deploy/compose/nvdev.env | sed 's/export //' | while IFS='=' read -r key value; do
-              # Evaluate the value to expand any variable references
-              eval "resolved_value=\"$value\""
-              echo "$key=$resolved_value" >> $GITHUB_ENV
-            done
-          fi
-
-      - name: Docker login
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "$CI_NVSTAGING_BLUEPRINT_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
-
-      - name: Start services
-        run: |
-          echo "Starting vector database services..."
-          docker compose -f tests/integration/vectordb.yaml up -d
-          echo "Starting RAG server..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          echo "Starting ingestor server..."
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          echo "Waiting for services to be ready..."
-          sleep 60
-          echo "Checking service status..."
-          docker ps
-
-      - name: Print logs for running containers
-        run: |
-          echo "=== LOGS FOR RUNNING CONTAINERS ==="
-          docker logs --tail 50 milvus-standalone || echo "No logs for milvus-standalone"
-          docker logs --tail 50 milvus-etcd || echo "No logs for milvus-etcd"
-          docker logs --tail 50 milvus-minio || echo "No logs for milvus-minio"
-          docker logs rag-server || echo "No logs for rag-server"
-          docker logs ingestor-server || echo "No logs for ingestor-server"
-          echo "Deploy stage completed successfully"
-
-      # ========================================================================
-      # BASIC TESTS
-      # ========================================================================
-      
-      - name: Setup Python environment for tests
-        run: |
-          # Ensure required Python tooling is installed before use
-          if command -v apt-get >/dev/null 2>&1; then
-            sudo apt-get update
-            sudo apt-get install -y python3 python3-venv python3-pip fuse lsof
-
-          fi
-
-          python3 --version || echo "Python3 not found"
-          pip3 --version || echo "pip3 not found"
-
-          # Install uv (Python package and environment manager)
-          curl -LsSf https://astral.sh/uv/install.sh | sh
-          export PATH="$HOME/.local/bin:$PATH"
-
-          # Create a fresh virtual environment using uv
-          rm -rf venv || echo "No existing venv to clean up"
-          uv venv venv
-          source venv/bin/activate
-          uv pip install -e .[all]
-          uv pip install -r tests/integration/requirements.txt
-
-      - name: Run basic integration tests
-        id: basic-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running basic integration tests..."
-          python -m tests.integration.main --timeout 1200
-          echo "Basic integration tests completed"
-
-      - name: Collect logs after basic tests
-        if: always()
-        run: |
-          echo "Collecting container logs..."
-          mkdir -p logs/basic-tests
-          docker logs rag-server > logs/basic-tests/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/basic-tests/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/basic-tests/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/basic-tests/ 2>/dev/null || true
-
-      # ========================================================================
-      # QUERY REWRITER TESTS
-      # ========================================================================
-      
-      - name: Configure environment for query rewriter tests
-        run: |
-          echo "ENABLE_QUERYREWRITER=True" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=5" >> $GITHUB_ENV
-
-      - name: Restart services for query rewriter tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting services with query rewriter configuration..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-
-      - name: Run query rewriter integration tests
-        id: query-rewriter-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running query rewriter integration tests..."
-          python -m tests.integration.main --sequence query_rewriter
-          echo "Query rewriter integration tests completed"
-
-      - name: Collect logs after query rewriter tests
-        if: always()
-        run: |
-          mkdir -p logs/query-rewriter
-          docker logs rag-server > logs/query-rewriter/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/query-rewriter/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/query-rewriter/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/query-rewriter/ 2>/dev/null || true
-
-      # ========================================================================
-      # REFLECTION TESTS
-      # ========================================================================
-      
-      - name: Configure environment for reflection tests
-        run: |
-          echo "ENABLE_REFLECTION=True" >> $GITHUB_ENV
-          echo "RESPONSE_GROUNDEDNESS_THRESHOLD=3" >> $GITHUB_ENV
-          # Unset query rewriter settings
-          echo "ENABLE_QUERYREWRITER=False" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=0" >> $GITHUB_ENV
-
-      - name: Restart services for reflection tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting services with reflection configuration..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-
-      - name: Run reflection integration tests
-        id: reflection-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running reflection integration tests..."
-          python -m tests.integration.main --sequence reflection
-          echo "Reflection integration tests completed"
-
-      - name: Collect logs after reflection tests
-        if: always()
-        run: |
-          mkdir -p logs/reflection
-          docker logs rag-server > logs/reflection/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/reflection/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/reflection/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/reflection/ 2>/dev/null || true
-
-      # ========================================================================
-      # NEMO GUARDRAILS TESTS
-      # ========================================================================
-      
-      - name: Configure environment for guardrails tests
-        run: |
-          echo "ENABLE_GUARDRAILS=True" >> $GITHUB_ENV
-          # Unset reflection and query rewriter settings
-          echo "ENABLE_REFLECTION=False" >> $GITHUB_ENV
-          echo "ENABLE_QUERYREWRITER=False" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=0" >> $GITHUB_ENV
-
-      - name: Restart services for guardrails tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting services with guardrails configuration..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-
-      - name: Start NemoGuardrails microservice
-        run: |
-          echo "Starting NemoGuardrails microservice..."
-          docker compose -f deploy/compose/docker-compose-nemo-guardrails.yaml up -d --no-deps nemo-guardrails-microservice
-          sleep 30
-
-      - name: Run NemoGuardrails integration tests
-        id: guardrails-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running NemoGuardrails integration tests..."
-          python -m tests.integration.main --sequence nemo_guardrails
-          echo "NemoGuardrails integration tests completed"
-
-      - name: Collect logs after guardrails tests
-        if: always()
-        run: |
-          mkdir -p logs/nemo-guardrails
-          docker logs rag-server > logs/nemo-guardrails/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/nemo-guardrails/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/nemo-guardrails/nvingest.log 2>&1 || true
-          docker logs nemo-guardrails-microservice > logs/nemo-guardrails/nemo-guardrails.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/nemo-guardrails/ 2>/dev/null || true
-
-      - name: Stop NemoGuardrails microservice
-        if: always()
-        run: |
-          docker compose -f deploy/compose/docker-compose-nemo-guardrails.yaml down nemo-guardrails-microservice || true
-
-      # ========================================================================
-      # IMAGE CAPTIONING TESTS
-      # ========================================================================
-      
-      - name: Configure environment for image captioning tests
-        run: |
-          echo "APP_NVINGEST_EXTRACTIMAGES=True" >> $GITHUB_ENV
-          echo "ENABLE_GUARDRAILS=False" >> $GITHUB_ENV
-          echo "ENABLE_QUERYREWRITER=False" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=0" >> $GITHUB_ENV
-
-      - name: Restart services for image captioning tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting services with image captioning configuration..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-
-      - name: Run image captioning integration tests
-        id: image-captioning-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running image captioning integration tests..."
-          python -m tests.integration.main --sequence image_captioning
-          echo "Image captioning integration tests completed"
-
-      - name: Collect logs after image captioning tests
-        if: always()
-        run: |
-          mkdir -p logs/image-captioning
-          docker logs rag-server > logs/image-captioning/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/image-captioning/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/image-captioning/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/image-captioning/ 2>/dev/null || true
-
-      # ========================================================================
-      # VLM GENERATION TESTS
-      # ========================================================================
-      
-      - name: Configure environment for VLM generation tests
-        run: |
-          echo "ENABLE_VLM_INFERENCE=True" >> $GITHUB_ENV
-          echo "APP_NVINGEST_EXTRACTIMAGES=False" >> $GITHUB_ENV
-          echo "ENABLE_QUERYREWRITER=False" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=0" >> $GITHUB_ENV
-
-      - name: Restart services for VLM generation tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting services with VLM generation configuration..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-
-      - name: Run VLM generation integration tests
-        id: vlm-generation-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running VLM generation integration tests..."
-          python -m tests.integration.main --sequence vlm_generation
-          echo "VLM generation integration tests completed"
-
-      - name: Collect logs after VLM generation tests
-        if: always()
-        run: |
-          mkdir -p logs/vlm-generation
-          docker logs rag-server > logs/vlm-generation/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/vlm-generation/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/vlm-generation/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/vlm-generation/ 2>/dev/null || true
-
-      # ========================================================================
-      # MULTIMODAL QUERY TESTS
-      # ========================================================================
-
-      - name: Prepare multimodal test data
-        run: |
-          mkdir -p data/multimodal/query
-          [ -f tests/data/product_catalog.pdf ] && cp tests/data/product_catalog.pdf data/multimodal/ || true
-          [ -f tests/data/Creme_clutch_purse1-small.jpg ] && cp tests/data/Creme_clutch_purse1-small.jpg data/multimodal/query/ || true
-          [ -f tests/data/query/Creme_clutch_purse1-small.jpg ] && cp tests/data/query/Creme_clutch_purse1-small.jpg data/multimodal/query/ || true
-
-      - name: Configure environment for multimodal query tests
-        run: |
-          # VLM embedding (required for multimodal queries)
-          echo "APP_EMBEDDINGS_MODELNAME=nvidia/llama-nemotron-embed-vl-1b-v2" >> $GITHUB_ENV
-          echo "APP_EMBEDDINGS_SERVERURL=https://integrate.api.nvidia.com/v1" >> $GITHUB_ENV
-          # VLM model for generation
-          echo "ENABLE_VLM_INFERENCE=True" >> $GITHUB_ENV
-          echo "APP_VLM_MODELNAME=nvidia/nemotron-nano-12b-v2-vl" >> $GITHUB_ENV
-          echo "APP_VLM_SERVERURL=https://integrate.api.nvidia.com/v1" >> $GITHUB_ENV
-          # Disable reranker (not supported with multimodal)
-          echo "ENABLE_RERANKER=False" >> $GITHUB_ENV
-          echo "APP_RANKING_SERVERURL=" >> $GITHUB_ENV
-          # Image extraction for ingestion
-          echo "APP_NVINGEST_EXTRACTIMAGES=True" >> $GITHUB_ENV
-          echo "APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY=" >> $GITHUB_ENV
-          echo "APP_NVINGEST_IMAGE_ELEMENTS_MODALITY=image" >> $GITHUB_ENV
-          # Reset other flags
-          echo "ENABLE_QUERYREWRITER=False" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=0" >> $GITHUB_ENV
-
-      - name: Restart services for multimodal query tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting services with multimodal query configuration..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-
-      - name: Run multimodal query integration tests
-        id: multimodal-query-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running multimodal query integration tests..."
-          python -m tests.integration.main --sequence multimodal_query
-          echo "Multimodal query integration tests completed"
-
-      - name: Collect logs after multimodal query tests
-        if: always()
-        run: |
-          mkdir -p logs/multimodal-query
-          docker logs rag-server > logs/multimodal-query/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/multimodal-query/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/multimodal-query/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/multimodal-query/ 2>/dev/null || true
-
-      # ========================================================================
-      # CUSTOM PROMPT TESTS
-      # ========================================================================
-      
-      - name: Configure environment for custom prompt tests
-        run: |
-          echo "PROMPT_CONFIG_FILE=$(pwd)/tests/data/test_prompt.yaml" >> $GITHUB_ENV
-          echo "ENABLE_VLM_INFERENCE=False" >> $GITHUB_ENV
-          echo "ENABLE_QUERYREWRITER=False" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=0" >> $GITHUB_ENV
-
-      - name: Restart services for custom prompt tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting services with custom prompt configuration..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-
-      - name: Run custom prompt integration tests
-        id: custom-prompt-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running custom prompt integration tests..."
-          python -m tests.integration.main --sequence custom_prompt
-          echo "Custom prompt integration tests completed"
-
-      - name: Collect logs after custom prompt tests
-        if: always()
-        run: |
-          mkdir -p logs/custom-prompt
-          docker logs rag-server > logs/custom-prompt/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/custom-prompt/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/custom-prompt/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/custom-prompt/ 2>/dev/null || true
-
-      # ========================================================================
-      # LIBRARY USAGE TESTS
-      # ========================================================================
-      
-      - name: Stop rag-server and ingestor-server for library tests
-        run: |
-          echo "Stopping rag-server and ingestor-server containers (library mode doesn't need them)..."
-          echo "Keeping nv-ingest-ms-runtime, Milvus, Redis, MinIO running for library mode..."
-          docker stop rag-server || true
-          docker stop ingestor-server || true
-          echo "Services stopped. Library tests will use the nvidia_rag library directly."
-          echo "Remaining services (needed for library mode):"
-          docker ps
-      
-      - name: Configure environment for library usage tests
-        run: |
-          echo "PROMPT_CONFIG_FILE=${PWD}/src/nvidia_rag/rag_server/prompt.yaml" >> $GITHUB_ENV
-          echo "ENABLE_QUERYREWRITER=False" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=0" >> $GITHUB_ENV
-
-      - name: Run library usage integration tests
-        id: library-usage-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running library usage integration tests..."
-          python -m tests.integration.main --sequence library_usage
-          echo "Library usage integration tests completed"
-
-      - name: Collect logs after library usage tests
-        if: always()
-        run: |
-          mkdir -p logs/library-usage
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/library-usage/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/library-usage/ 2>/dev/null || true
-
-      # ========================================================================
-      # LIBRARY SUMMARIZATION TESTS
-      # ========================================================================
-      
-      - name: Run library summarization integration tests
-        id: library-summarization-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running library summarization integration tests..."
-          python -m tests.integration.main --sequence library_summarization
-          echo "Library summarization integration tests completed"
-
-      - name: Collect logs after library summarization tests
-        if: always()
-        run: |
-          mkdir -p logs/library-summarization
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/library-summarization/nvingest.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/library-summarization/ 2>/dev/null || true
-
-      # ========================================================================
-      # OBSERVABILITY TESTS
-      # ========================================================================
-      
-      - name: Configure environment for observability tests
-        run: |
-          echo "APP_TRACING_ENABLED=True" >> $GITHUB_ENV
-          echo "OPENTELEMETRY_CONFIG_FILE=$(pwd)/deploy/config/otel-collector-config.yaml" >> $GITHUB_ENV
-          echo "PROMPT_CONFIG_FILE=${PWD}/src/nvidia_rag/rag_server/prompt.yaml" >> $GITHUB_ENV
-          echo "ENABLE_QUERYREWRITER=False" >> $GITHUB_ENV
-          echo "CONVERSATION_HISTORY=0" >> $GITHUB_ENV
-
-      - name: Restart services for observability tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting services with observability configuration..."
-          echo "(rag-server and ingestor-server were stopped for library tests, now restarting)"
-          docker compose -f tests/integration/vectordb.yaml down -v || true
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f tests/integration/vectordb.yaml up -d || true
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-
-      - name: Start observability services
-        run: |
-          echo "Starting observability services..."
-          docker compose -f deploy/compose/observability.yaml up -d otel-collector prometheus
-          sleep 30
-
-      - name: Run observability integration tests
-        id: observability-tests
-        continue-on-error: true
-        run: |
-          source venv/bin/activate
-          echo "Running observability integration tests..."
-          python -m tests.integration.main --sequence observability
-          echo "Observability integration tests completed"
-
-      - name: Collect logs after observability tests
-        if: always()
-        run: |
-          mkdir -p logs/observability
-          echo "=== Container Status (docker ps -a) ===" | tee logs/observability/container-status.log
-          docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | tee -a logs/observability/container-status.log
-          echo "" | tee -a logs/observability/container-status.log
-          echo "=== Milvus Container Stats ===" | tee -a logs/observability/container-status.log
-          docker stats --no-stream milvus-standalone 2>&1 | tee -a logs/observability/container-status.log || echo "Milvus container not running" | tee -a logs/observability/container-status.log
-          docker logs rag-server > logs/observability/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/observability/ingestor-server.log 2>&1 || true
-          docker logs compose-nv-ingest-ms-runtime-1 > logs/observability/nvingest.log 2>&1 || true
-          docker logs milvus-standalone > logs/observability/milvus.log 2>&1 || true
-          docker logs otel-collector > logs/observability/otel-collector.log 2>&1 || true
-          docker logs zipkin > logs/observability/zipkin.log 2>&1 || true
-          docker logs prometheus > logs/observability/prometheus.log 2>&1 || true
-          docker logs grafana-service > logs/observability/grafana.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/observability/ 2>/dev/null || true
-
-      - name: Stop observability services
-        if: always()
-        run: |
-          docker compose -f deploy/compose/observability.yaml down || true
-
-      # ========================================================================
-      # MILVUS VDB AUTH VIA REST TESTS
-      # ========================================================================
-      
-      - name: Configure Milvus VDB auth
-        run: |
-          echo "Configuring Milvus authentication and override compose..."
-          # Set auth credentials for the servers (running in Docker)
-          echo "APP_VECTORSTORE_USERNAME=root" >> $GITHUB_ENV
-          echo "APP_VECTORSTORE_PASSWORD=Milvus" >> $GITHUB_ENV
-          # Set auth token for the integration tests (running on host)
-          echo "VDB_AUTH_TOKEN=root:Milvus" >> $GITHUB_ENV
-          echo "MILVUS_ROOT_TOKEN=root:Milvus" >> $GITHUB_ENV
-          
-          # Stop and clean up existing Milvus containers and volumes
-          echo "Stopping and cleaning up existing Milvus containers..."
-          docker compose -f tests/integration/vectordb.yaml down -v || true
-          
-          # Remove the old data directories to ensure clean start with auth
-          echo "Cleaning up old Milvus data directories..."
-          sudo rm -rf /tmp/milvus-${MILVUS_VERSION}/volumes/milvus || true
-          sudo rm -rf /tmp/milvus-${MILVUS_VERSION}/volumes/etcd || true
-          
-          # Ensure milvus.yaml exists in tests/integration by copying from running container
-          # Since we stopped the container, we need to start a temporary one to extract config
-          mkdir -p tests/integration
-          if [ ! -f tests/integration/milvus.yaml ]; then
-            echo "Creating temporary Milvus container to extract config..."
-            docker run --rm -d --name milvus-temp milvusdb/milvus:${MILVUS_VERSION:-v2.6.5-gpu} sleep 60
-            docker cp milvus-temp:/milvus/configs/milvus.yaml tests/integration/milvus.yaml
-            docker stop milvus-temp || true
-          fi
-          
-          # Update authentication settings in milvus.yaml
-          sed -i 's/authorizationEnabled:.*/authorizationEnabled: true/' tests/integration/milvus.yaml
-          sed -i 's/defaultRootPassword:.*/defaultRootPassword: Milvus/' tests/integration/milvus.yaml
-          echo "MILVUS_CONFIG_FILE=$(pwd)/tests/integration/milvus.yaml" >> $GITHUB_ENV
-          
-          # Verify the changes
-          echo "Verifying milvus.yaml authentication settings:"
-          grep -A2 "security:" tests/integration/milvus.yaml | head -5
-          
-          # Update vectordb.yaml to comment out data volume and enable config volume
-          sed -i 's|- \${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus|# - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus|' tests/integration/vectordb.yaml
-          sed -i 's|# - \${MILVUS_CONFIG_FILE:-./milvus.yaml}:/milvus/configs/milvus.yaml|- ${MILVUS_CONFIG_FILE:-./milvus.yaml}:/milvus/configs/milvus.yaml|' tests/integration/vectordb.yaml
-      
-      - name: Restart vector database with auth
-        run: |
-          echo "Starting vector database with authentication enabled..."
-          docker compose -f tests/integration/vectordb.yaml up -d
-          echo "Waiting for Milvus services to be ready..."
-          sleep 60
-          docker ps
-          echo "Checking Milvus logs..."
-          docker logs --tail 100 milvus-standalone || true
-          docker logs --tail 50 milvus-etcd || true
-          docker logs --tail 50 milvus-minio || true
-      
-      - name: Restart services for Milvus VDB auth tests
-        env:
-          CI_NVSTAGING_BLUEPRINT_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Restarting rag/ingestor services to pick up Milvus auth configuration..."
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down || true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down || true
-          sleep 5
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
-          sleep 30
-          docker ps
-      
-      - name: Run Milvus VDB Auth tests
-        id: milvus-vdb-auth-tests
-        continue-on-error: true
-        env:
-          APP_VECTORSTORE_URL: http://localhost:19530
-        run: |
-          source venv/bin/activate
-          echo "Running Milvus VDB Auth tests..."
-          echo "APP_VECTORSTORE_URL set to: $APP_VECTORSTORE_URL"
-          python -m tests.integration.main --sequence milvus_vdb_auth_through_rest_api
-          echo "Milvus VDB Auth tests completed"
-      
-      - name: Collect logs after Milvus VDB auth tests
-        if: always()
-        run: |
-          mkdir -p logs/milvus-vdb-auth
-          docker logs rag-server > logs/milvus-vdb-auth/rag-server.log 2>&1 || true
-          docker logs ingestor-server > logs/milvus-vdb-auth/ingestor-server.log 2>&1 || true
-          docker logs milvus-standalone > logs/milvus-vdb-auth/milvus.log 2>&1 || true
-          docker logs milvus-etcd > logs/milvus-vdb-auth/etcd.log 2>&1 || true
-          docker logs milvus-minio > logs/milvus-vdb-auth/minio.log 2>&1 || true
-          cp tests/integration/integration_test.log logs/milvus-vdb-auth/ 2>/dev/null || true
-      
-      - name: Revert Milvus VDB auth configurations
-        if: always()
-        run: |
-          echo "Reverting Milvus VDB auth configurations..."
-          # Stop Milvus with auth enabled
-          docker compose -f tests/integration/vectordb.yaml down -v || true
-          
-          # Clean up auth-specific data to ensure fresh start
-          sudo rm -rf /tmp/milvus-${MILVUS_VERSION}/volumes/milvus || true
-          sudo rm -rf /tmp/milvus-${MILVUS_VERSION}/volumes/etcd || true
-          
-          # Remove the auth config file
-          rm -f tests/integration/milvus.yaml || true
-          
-          # Revert vectordb.yaml to original state
-          sed -i 's|- \${MILVUS_CONFIG_FILE:-./milvus.yaml}:/milvus/configs/milvus.yaml|# - ${MILVUS_CONFIG_FILE:-./milvus.yaml}:/milvus/configs/milvus.yaml|' tests/integration/vectordb.yaml
-          sed -i 's|# - \${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus|- \${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus|' tests/integration/vectordb.yaml
-          
-          # Restart Milvus without auth for subsequent tests
-          echo "Restarting Milvus without authentication for subsequent tests..."
-          docker compose -f tests/integration/vectordb.yaml up -d
-          sleep 30
-          
-          # Unset auth environment variables
-          echo "APP_VECTORSTORE_USERNAME=" >> $GITHUB_ENV
-          echo "APP_VECTORSTORE_PASSWORD=" >> $GITHUB_ENV
-          echo "VDB_AUTH_TOKEN=" >> $GITHUB_ENV
-          echo "MILVUS_ROOT_TOKEN=" >> $GITHUB_ENV
-      
-      # ========================================================================
-      # FAIL JOB IF ANY INTEGRATION TEST FAILED
-      # ========================================================================
-      # All test steps use continue-on-error so every suite runs; this step
-      # marks the job as failed if any of them failed.
-      - name: Fail job if any integration test failed
-        if: always() && (steps.basic-tests.outcome == 'failure' || steps.query-rewriter-tests.outcome == 'failure' || steps.reflection-tests.outcome == 'failure' || steps.guardrails-tests.outcome == 'failure' || steps.image-captioning-tests.outcome == 'failure' || steps.vlm-generation-tests.outcome == 'failure' || steps.multimodal-query-tests.outcome == 'failure' || steps.custom-prompt-tests.outcome == 'failure' || steps.library-usage-tests.outcome == 'failure' || steps.library-summarization-tests.outcome == 'failure' || steps.observability-tests.outcome == 'failure' || steps.milvus-vdb-auth-tests.outcome == 'failure')
-        run: |
-          echo "=== Failed integration test suites ==="
-          [ "${{ steps.basic-tests.outcome }}" = "failure" ] && echo "  - basic-tests"
-          [ "${{ steps.query-rewriter-tests.outcome }}" = "failure" ] && echo "  - query-rewriter-tests"
-          [ "${{ steps.reflection-tests.outcome }}" = "failure" ] && echo "  - reflection-tests"
-          [ "${{ steps.guardrails-tests.outcome }}" = "failure" ] && echo "  - guardrails-tests"
-          [ "${{ steps.image-captioning-tests.outcome }}" = "failure" ] && echo "  - image-captioning-tests"
-          [ "${{ steps.vlm-generation-tests.outcome }}" = "failure" ] && echo "  - vlm-generation-tests"
-          [ "${{ steps.multimodal-query-tests.outcome }}" = "failure" ] && echo "  - multimodal-query-tests"
-          [ "${{ steps.custom-prompt-tests.outcome }}" = "failure" ] && echo "  - custom-prompt-tests"
-          [ "${{ steps.library-usage-tests.outcome }}" = "failure" ] && echo "  - library-usage-tests"
-          [ "${{ steps.library-summarization-tests.outcome }}" = "failure" ] && echo "  - library-summarization-tests"
-          [ "${{ steps.observability-tests.outcome }}" = "failure" ] && echo "  - observability-tests"
-          [ "${{ steps.milvus-vdb-auth-tests.outcome }}" = "failure" ] && echo "  - milvus-vdb-auth-tests"
-          echo "One or more integration test suites failed. Failing job."
-          exit 1
-
-      # ========================================================================
-      # UPLOAD ALL LOGS
-      # ========================================================================
-      
-      - name: Sanitize branch name for artifacts
-        if: always()
-        id: sanitize
-        run: |
-          SANITIZED_REF="${GITHUB_REF_NAME//\//-}"
-          echo "ref_name=$SANITIZED_REF" >> $GITHUB_OUTPUT
-
-      - name: Upload all integration test logs
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: integration-tests-logs-${{ steps.sanitize.outputs.ref_name }}-${{ github.sha }}
-          path: logs/
-          retention-days: 7
-
-      # ========================================================================
-      # CLEANUP
-      # ========================================================================
-      
-      - name: Cleanup virtual environment
-        if: always()
-        run: |
-          echo "Cleaning up virtual environment..."
-          rm -rf venv
-          echo "Virtual environment cleanup completed"
-
-      - name: Cleanup Docker containers and volumes
-        if: always()
-        run: |
-          echo "Cleaning up integration test environment..."
-          # Bring down only the specific compose stacks used in this workflow,
-          # and clean up their containers, networks, and locally built images.
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down -v --remove-orphans --rmi local || true
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml down -v --remove-orphans --rmi local || true
-          docker compose -f tests/integration/vectordb.yaml down -v --remove-orphans --rmi local || true
-          echo "Cleanup completed for integration test Docker resources"
diff --git a/.github/workflows/publish-artifacts.yml b/.github/workflows/publish-artifacts.yml
deleted file mode 100644
index 2be3979e1..000000000
--- a/.github/workflows/publish-artifacts.yml
+++ /dev/null
@@ -1,276 +0,0 @@
-name: Publish Artifacts
-
-# Workflow control - runs nightly or on manual trigger
-on:
-  schedule:
-    # Run nightly at 6.30 PM UTC or 12 AM IST
-    - cron: '30 18 * * *'
-  workflow_dispatch:
-    inputs:
-      CONTAINER_TAG:
-        description: 'Custom tag for containers (optional)'
-        required: false
-        default: ''
-      ARTIFACTORY_VERSION:
-        description: 'Artifactory version (optional, defaults to auto-generated from get_version.sh)'
-        required: false
-        default: ''
-
-env:
-  RELEASE_TYPE: dev
-
-jobs:
-  # ============================================================================
-  # PUBLISH WHEEL
-  # ============================================================================
-  publish-wheel:
-    name: Build and Publish Python Wheel
-    runs-on: ubuntu-latest
-    container:
-      image: python:3.10
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set artifactory version
-        run: |
-          if [ -n "${{ github.event.inputs.ARTIFACTORY_VERSION }}" ]; then
-            echo "Using custom Artifactory version: ${{ github.event.inputs.ARTIFACTORY_VERSION }}"
-            echo "ARTIFACTORY_VERSION=${{ github.event.inputs.ARTIFACTORY_VERSION }}" >> $GITHUB_ENV
-          else
-            echo "Using version from get_version.sh..."
-            chmod +x ./ci/get_version.sh
-            DEFAULT_VERSION=$(./ci/get_version.sh)
-            echo "Generated default version: $DEFAULT_VERSION"
-            echo "ARTIFACTORY_VERSION=$DEFAULT_VERSION" >> $GITHUB_ENV
-          fi
-
-      - name: Install dependencies
-        run: |
-          pip install uv==0.8.12
-          apt-get update && apt-get install -y wget unzip
-
-      - name: Build wheel with version
-        run: |
-          echo "Building wheel with version: $ARTIFACTORY_VERSION"
-          sed -i "s#^version = \".*\"#version = \"$ARTIFACTORY_VERSION\"#" pyproject.toml
-          uv build
-          ls -la dist/
-
-      - name: Upload wheel artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: wheel-${{ env.ARTIFACTORY_VERSION }}
-          path: dist/*.whl
-          retention-days: 30
-
-      - name: Install NGC CLI
-        env:
-          NGC_API_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          echo "Installing NGC CLI..."
-          wget --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/4.9.10/files/ngccli_linux.zip -O ngccli_linux.zip
-          unzip -o ngccli_linux.zip
-          chmod u+x ngc-cli/ngc
-          
-          # Add NGC CLI to PATH for subsequent steps
-          echo "$(pwd)/ngc-cli" >> $GITHUB_PATH
-          
-          echo "NGC CLI installed successfully"
-      
-      - name: Publish wheel to NGC
-        env:
-          NGC_API_KEY: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-        run: |
-          # Find the wheel file
-          WHEEL_FILE=$(ls dist/*.whl | tail -n 1)
-          echo "Found wheel file: $WHEEL_FILE"
-          
-          # Extract just the filename for display
-          WHEEL_FILENAME=$(basename "$WHEEL_FILE")
-          echo "Wheel filename: $WHEEL_FILENAME"
-          
-          # Publish to NGC
-          echo "Publishing wheel to NGC: nvstaging/blueprint/nvidia_rag:$ARTIFACTORY_VERSION"
-          ngc registry resource upload-version \
-            "nvstaging/blueprint/nvidia_rag:$ARTIFACTORY_VERSION" \
-            --source "$WHEEL_FILE" \
-            --org nvstaging
-          
-          echo "Wheel published to NGC successfully"
-          echo "NGC Resource: nvstaging/blueprint/nvidia_rag:$ARTIFACTORY_VERSION"
-
-  # ============================================================================
-  # PUBLISH RAG SERVER CONTAINER
-  # ============================================================================
-  publish-rag-server:
-    name: Build and Publish RAG Server Container
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Determine TAG
-        id: tag
-        run: |
-          if [ -n "${{ github.event.inputs.CONTAINER_TAG }}" ]; then
-            echo "Using custom TAG from input: ${{ github.event.inputs.CONTAINER_TAG }}"
-            TAG="${{ github.event.inputs.CONTAINER_TAG }}"
-          else
-            echo "Using auto-generated version as TAG"
-            VERSION=$(./ci/get_version.sh)
-            TAG=$VERSION
-          fi
-          echo "TAG=$TAG" >> $GITHUB_ENV
-          echo "tag=$TAG" >> $GITHUB_OUTPUT
-          echo "Final TAG value: $TAG"
-
-      - name: Login to NGC Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: nvcr.io
-          username: '$oauthtoken'
-          password: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-
-      - name: Build and push RAG server container
-        env:
-          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
-        run: |
-          echo "Building rag-server container with tag ${TAG}..."
-          export TAG=${TAG}
-          export DOWNLOAD_LEGAL_COMPLIANCE=true
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml build rag-server
-
-          # Tag and push to NGC Container Registry
-          echo "Pushing rag-server to NGC Container Registry..."
-          docker push nvcr.io/nvstaging/blueprint/rag-server:$TAG
-          docker tag nvcr.io/nvstaging/blueprint/rag-server:$TAG nvcr.io/nvstaging/blueprint/rag-server:latest
-          docker push nvcr.io/nvstaging/blueprint/rag-server:latest
-          echo "RAG server container publishing completed successfully"
-
-      - name: Cleanup Docker images
-        if: always()
-        run: |
-          echo "Cleaning up rag-server Docker images..."
-          docker images | grep "rag-server" | awk '{print $3}' | xargs -r docker rmi -f || echo "No rag-server images to delete"
-          docker system prune -f || true
-
-  # ============================================================================
-  # PUBLISH INGESTOR SERVER CONTAINER
-  # ============================================================================
-  publish-ingestor-server:
-    name: Build and Publish Ingestor Server Container
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Determine TAG
-        id: tag
-        run: |
-          if [ -n "${{ github.event.inputs.CONTAINER_TAG }}" ]; then
-            echo "Using custom TAG from input: ${{ github.event.inputs.CONTAINER_TAG }}"
-            TAG="${{ github.event.inputs.CONTAINER_TAG }}"
-          else
-            echo "Using auto-generated version as TAG"
-            VERSION=$(./ci/get_version.sh)
-            TAG=$VERSION
-          fi
-          echo "TAG=$TAG" >> $GITHUB_ENV
-          echo "tag=$TAG" >> $GITHUB_OUTPUT
-          echo "Final TAG value: $TAG"
-
-      - name: Login to NGC Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: nvcr.io
-          username: '$oauthtoken'
-          password: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-
-      - name: Build and push ingestor server container
-        env:
-          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
-        run: |
-          echo "Building ingestor-server container with tag ${TAG}..."
-          export TAG=${TAG}
-          export DOWNLOAD_LEGAL_COMPLIANCE=true
-          docker compose -f deploy/compose/docker-compose-ingestor-server.yaml build ingestor-server
-
-          # Tag and push to NGC Container Registry
-          echo "Pushing ingestor-server to NGC Container Registry..."
-          docker push nvcr.io/nvstaging/blueprint/ingestor-server:$TAG
-          docker tag nvcr.io/nvstaging/blueprint/ingestor-server:$TAG nvcr.io/nvstaging/blueprint/ingestor-server:latest
-          docker push nvcr.io/nvstaging/blueprint/ingestor-server:latest
-          echo "Ingestor server container publishing completed successfully"
-
-      - name: Cleanup Docker images
-        if: always()
-        run: |
-          echo "Cleaning up ingestor-server Docker images..."
-          docker images | grep "ingestor-server" | awk '{print $3}' | xargs -r docker rmi -f || echo "No ingestor-server images to delete"
-          docker system prune -f || true
-
-  # ============================================================================
-  # PUBLISH RAG FRONTEND CONTAINER
-  # ============================================================================
-  publish-rag-frontend:
-    name: Build and Publish RAG Frontend Container
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Determine TAG
-        id: tag
-        run: |
-          if [ -n "${{ github.event.inputs.CONTAINER_TAG }}" ]; then
-            echo "Using custom TAG from input: ${{ github.event.inputs.CONTAINER_TAG }}"
-            TAG="${{ github.event.inputs.CONTAINER_TAG }}"
-          else
-            echo "Using auto-generated version as TAG"
-            VERSION=$(./ci/get_version.sh)
-            TAG=$VERSION
-          fi
-          echo "TAG=$TAG" >> $GITHUB_ENV
-          echo "tag=$TAG" >> $GITHUB_OUTPUT
-          echo "Final TAG value: $TAG"
-
-      - name: Login to NGC Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: nvcr.io
-          username: '$oauthtoken'
-          password: ${{ secrets.CI_NVSTAGING_BLUEPRINT_KEY }}
-
-      - name: Build and push RAG frontend container
-        env:
-          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
-        run: |
-          echo "Building rag-frontend container with tag ${TAG}..."
-          export TAG=${TAG}
-          export DOWNLOAD_LEGAL_COMPLIANCE=true
-          docker compose -f deploy/compose/docker-compose-rag-server.yaml build rag-frontend
-
-          # Tag and push to NGC Container Registry
-          echo "Pushing rag-frontend to NGC Container Registry..."
-          docker push nvcr.io/nvstaging/blueprint/rag-frontend:$TAG
-          docker tag nvcr.io/nvstaging/blueprint/rag-frontend:$TAG nvcr.io/nvstaging/blueprint/rag-frontend:latest
-          docker push nvcr.io/nvstaging/blueprint/rag-frontend:latest
-          echo "RAG frontend container publishing completed successfully"
-
-      - name: Cleanup Docker images
-        if: always()
-        run: |
-          echo "Cleaning up rag-frontend Docker images..."
-          docker images | grep "rag-frontend" | awk '{print $3}' | xargs -r docker rmi -f || echo "No rag-frontend images to delete"
-          docker system prune -f || true
-
diff --git a/.gitignore b/.gitignore
index 9dded62bf..3611412e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,4 +80,9 @@ coverage/
 cover/
 *.log
 tests/data/
+# Agent skills (installed via npx skills add)
+/.agents/
+/.claude/
+skills-lock.json
+
 # Workbench Project Layout
\ No newline at end of file
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 000000000..183677906
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,86 @@
+# NVIDIA RAG Blueprint
+
+Reference implementation for a Retrieval Augmented Generation pipeline. Python 3.11+ backend (FastAPI + LangChain), React/TypeScript frontend, deployable via Docker Compose or Helm.
+
+## Project structure
+
+```
+src/nvidia_rag/
+├── rag_server/        # RAG query/response server (FastAPI)
+├── ingestor_server/   # Document ingestion server (FastAPI)
+└── utils/             # Shared utilities
+frontend/              # React + TypeScript UI (pnpm)
+deploy/
+├── compose/           # Docker Compose files and env configs
+└── helm/              # Helm charts (standard + MIG-slicing)
+docs/                  # User-facing documentation (Sphinx, RST/MD)
+tests/
+├── unit/              # No network calls allowed
+└── integration/       # Network calls permitted
+notebooks/             # Jupyter notebooks for evaluation and examples
+```
+
+## Development commands
+
+### Backend (Python)
+
+```bash
+uv sync                              # Install all deps
+uv run pytest tests/unit/            # Unit tests
+uv run pytest tests/integration/     # Integration tests
+ruff check --fix src/                # Lint + autofix
+ruff format src/                     # Format
+pre-commit run --all-files           # Run all pre-commit hooks
+```
+
+### Frontend (TypeScript)
+
+```bash
+cd frontend
+pnpm install
+pnpm run dev                         # Dev server
+pnpm run lint                        # ESLint
+pnpm exec tsc --noEmit               # Type check
+pnpm run test:run                    # Tests
+```
+
+## Code conventions
+
+- **Python**: Ruff for linting and formatting (line-length 88, double quotes, space indent). Config in `pyproject.toml`.
+- **Type hints**: Required on all function signatures.
+- **Imports**: Sorted by isort via Ruff. No in-function imports.
+- **Tests**: Mirror source tree (`src/nvidia_rag/rag_server/server.py` → `tests/unit/rag_server/test_server.py`).
+- **Frontend**: ESLint + TypeScript strict mode. Function components with hooks.
+- **Env files**: `deploy/compose/nvdev.env` (NVIDIA-hosted NIMs) and `deploy/compose/.env` (self-hosted). These are the source of truth for Docker deployments — shell-only exports are lost on restart.
+
+## Deployment modes
+
+1. **Docker Compose** — `deploy/compose/` with env-file configs. Multiple profiles: standard, retrieval-only, NVIDIA-hosted.
+2. **Helm** — `deploy/helm/nvidia-blueprint-rag/` chart with `values.yaml`. Supports MIG GPU slicing via `deploy/helm/mig-slicing/`.
+3. **Library** — Import `nvidia_rag` as a Python package for custom pipelines.
+
+## Key files
+
+- `pyproject.toml` — All Python deps, ruff config, project metadata
+- `deploy/compose/nvdev.env` — Default env file for NVIDIA API Catalog deployments
+- `src/nvidia_rag/rag_server/prompt.yaml` — System prompt templates
+- `docs/support-matrix.md` — GPU requirements per deployment mode
+- `docs/service-port-gpu-reference.md` — Port mappings and GPU assignments
+
+## PR and commit guidelines
+
+- Target the `develop` branch, never `main`.
+- All commits must be signed off (DCO).
+- Run `pre-commit run --all-files` before submitting.
+- See `CONTRIBUTING.md` for full workflow.
+
+## Operations — `rag-blueprint` skill
+
+For any operational task — deploying, configuring, troubleshooting, or shutting down the RAG Blueprint — read and follow the skill at `.agents/skills/rag-blueprint/SKILL.md`.
+
+The skill handles:
+
+- **Deploy** — Docker Compose (standard, retrieval-only, NVIDIA-hosted), Helm, MIG-slicing, library mode
+- **Configure** — VLM, guardrails, query rewriting, ingestion, search & retrieval, models, observability, summarization, multimodal, MCP, evaluation, notebooks, UI, and more
+- **Troubleshoot** — Debug unhealthy services, container errors, GPU issues, connectivity failures
+- **Shutdown** — Stop, tear down, and clean up services
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 000000000..e16f0c9d6
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,84 @@
+# NVIDIA RAG Blueprint
+
+Reference implementation for a Retrieval Augmented Generation pipeline. Python 3.11+ backend (FastAPI + LangChain), React/TypeScript frontend, deployable via Docker Compose or Helm.
+
+## Project structure
+
+```
+src/nvidia_rag/
+├── rag_server/        # RAG query/response server (FastAPI)
+├── ingestor_server/   # Document ingestion server (FastAPI)
+└── utils/             # Shared utilities
+frontend/              # React + TypeScript UI (pnpm)
+deploy/
+├── compose/           # Docker Compose files and env configs
+└── helm/              # Helm charts (standard + MIG-slicing)
+docs/                  # User-facing documentation (Sphinx, RST/MD)
+tests/
+├── unit/              # No network calls allowed
+└── integration/       # Network calls permitted
+notebooks/             # Jupyter notebooks for evaluation and examples
+```
+
+## Development commands
+
+### Backend (Python)
+
+```bash
+uv sync                              # Install all deps
+uv run pytest tests/unit/            # Unit tests
+uv run pytest tests/integration/     # Integration tests
+ruff check --fix src/                # Lint + autofix
+ruff format src/                     # Format
+pre-commit run --all-files           # Run all pre-commit hooks
+```
+
+### Frontend (TypeScript)
+
+```bash
+cd frontend
+pnpm install
+pnpm run dev                         # Dev server
+pnpm run lint                        # ESLint
+pnpm exec tsc --noEmit               # Type check
+pnpm run test:run                    # Tests
+```
+
+## Code conventions
+
+- **Python**: Ruff for linting and formatting (line-length 88, double quotes, space indent). Config in `pyproject.toml`.
+- **Type hints**: Required on all function signatures.
+- **Imports**: Sorted by isort via Ruff. No in-function imports.
+- **Tests**: Mirror source tree (`src/nvidia_rag/rag_server/server.py` → `tests/unit/rag_server/test_server.py`).
+- **Frontend**: ESLint + TypeScript strict mode. Function components with hooks.
+- **Env files**: `deploy/compose/nvdev.env` (NVIDIA-hosted NIMs) and `deploy/compose/.env` (self-hosted). These are the source of truth for Docker deployments — shell-only exports are lost on restart.
+
+## Deployment modes
+
+1. **Docker Compose** — `deploy/compose/` with env-file configs. Multiple profiles: standard, retrieval-only, NVIDIA-hosted.
+2. **Helm** — `deploy/helm/nvidia-blueprint-rag/` chart with `values.yaml`. Supports MIG GPU slicing via `deploy/helm/mig-slicing/`.
+3. **Library** — Import `nvidia_rag` as a Python package for custom pipelines.
+
+## Key files
+
+- `pyproject.toml` — All Python deps, ruff config, project metadata
+- `deploy/compose/nvdev.env` — Default env file for NVIDIA API Catalog deployments
+- `src/nvidia_rag/rag_server/prompt.yaml` — System prompt templates
+- `docs/support-matrix.md` — GPU requirements per deployment mode
+- `docs/service-port-gpu-reference.md` — Port mappings and GPU assignments
+
+## PR and commit guidelines
+
+- Target the `develop` branch, never `main`.
+- All commits must be signed off (DCO).
+- Run `pre-commit run --all-files` before submitting.
+- See `CONTRIBUTING.md` for full workflow.
+
+## Operations — `/rag-blueprint` skill
+
+For any operational task, use the `rag-blueprint` skill (`.agents/skills/rag-blueprint/`).
+
+- **Deploy** — Docker Compose (standard, retrieval-only, NVIDIA-hosted), Helm, MIG-slicing, library mode
+- **Configure** — VLM, guardrails, query rewriting, ingestion, search & retrieval, models, observability, summarization, multimodal, MCP, evaluation, notebooks, UI, and more
+- **Troubleshoot** — Debug unhealthy services, container errors, GPU issues, connectivity failures
+- **Shutdown** — Stop, tear down, and clean up services
diff --git a/README.md b/README.md
index edea2e72a..c400dd410 100644
--- a/README.md
+++ b/README.md
@@ -105,9 +105,9 @@ This modular design ensures efficient query processing, accurate retrieval of in
 
     - [NVIDIA NIM llama-3_2-nv-embedqa-1b-v2](https://build.nvidia.com/nvidia/llama-3_2-nv-embedqa-1b-v2)
     - [NVIDIA NIM llama-3_2-nv-rerankqa-1b-v2](https://build.nvidia.com/nvidia/llama-3_2-nv-rerankqa-1b-v2)
-    - [NeMo Retriever Page Elements NIM](https://build.nvidia.com/nvidia/nemoretriever-page-elements-v3)
-    - [NeMo Retriever Table Structure NIM](https://build.nvidia.com/nvidia/nemoretriever-table-structure-v1)
-    - [NeMo Retriever Graphic Elements NIM](https://build.nvidia.com/nvidia/nemoretriever-graphic-elements-v1)
+    - [NeMo Retriever Page Elements NIM](https://build.nvidia.com/nvidia/nemotron-page-elements-v3)
+    - [NeMo Retriever Table Structure NIM](https://build.nvidia.com/nvidia/nemotron-table-structure-v1)
+    - [NeMo Retriever Graphic Elements NIM](https://build.nvidia.com/nvidia/nemotron-graphic-elements-v1)
     - [NeMo Retriever OCR NIM](https://build.nvidia.com/nvidia/nemoretriever-ocr)
 
 - Optional NIMs
@@ -162,6 +162,29 @@ The following is a step-by-step explanation of the workflow from the end-user pe
 
 
 
+## AI Agent Skill
+
+An agent skill is included that enables AI coding assistants (Claude Code, Cursor, etc.) to deploy, configure, troubleshoot, and manage the RAG Blueprint autonomously.
+
+### Install
+
+```bash
+npx skills add .
+```
+
+This installs the `rag-blueprint` skill from `skill-source/`. After installation, the agent handles requests like:
+
+- *"Deploy RAG on Docker with NVIDIA-hosted models"*
+- *"Enable VLM image captioning and restart the ingestor"*
+- *"Ingestion failed for 3 files, can you check why?"*
+- *"Switch from Docker to library mode"*
+- *"Shut down all RAG services"*
+
+> **Note:** If the agent doesn't pick up the skill automatically (e.g., for short or ambiguous queries), invoke it explicitly with `/rag-blueprint <your request>`.
+
+For skill architecture details, see [`skill-source/README.md`](skill-source/README.md).
+
+
 ## Get Started With NVIDIA RAG Blueprint
 
 The recommended way to get started is to deploy the NVIDIA RAG Blueprint
@@ -202,9 +225,9 @@ Use of the models in this blueprint is governed by the [NVIDIA AI Foundation Mod
 
 ## Terms of Use
 This blueprint is governed by the [NVIDIA Agreements | Enterprise Software | NVIDIA Software License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/) and the [NVIDIA Agreements | Enterprise Software | Product Specific Terms for AI Product](https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/). The models are governed by the [NVIDIA Agreements | Enterprise Software | NVIDIA Community Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/) and the [NVIDIA RAG dataset](./data/multimodal/) which is governed by the [NVIDIA Asset License Agreement](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/data/LICENSE.DATA).
-The following models that are built with Llama are governed by the Llama 3.2 Community License Agreement: nvidia/llama-3.2-nv-embedqa-1b-v2 and nvidia/llama-3.2-nv-rerankqa-1b-v2 and llama-3.2-nemoretriever-1b-vlm-embed-v1.
+The following models that are built with Llama are governed by the Llama 3.2 Community License Agreement: nvidia/llama-nemotron-embed-1b-v2, nvidia/llama-nemotron-rerank-1b-v2, and llama-3.2-nemoretriever-1b-vlm-embed-v1.
 
 ## Additional Information
 
-The [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/) for the llama-3.1-nemotron-nano-vl-8b-v1, llama-3.1-nemoguard-8b-content-safety and llama-3.1-nemoguard-8b-topic-control models. The [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/) for the nvidia/llama-3.2-nv-embedqa-1b-v2, nvidia/llama-3.2-nv-rerankqa-1b-v2 and llama-3.2-nemoretriever-1b-vlm-embed-v1 models. The [Llama 3.3 Community License Agreement](https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/LICENSE) for the llama-3.3-nemotron-super-49b-v1.5 models. Built with Llama. Apache 2.0 for NVIDIA Ingest and for the nemoretriever-page-elements-v2, nemoretriever-table-structure-v1, nemoretriever-graphic-elements-v1, paddleocr and nemoretriever-ocr-v1 models.
+The [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/) for the llama-3.1-nemotron-nano-vl-8b-v1, llama-3.1-nemoguard-8b-content-safety and llama-3.1-nemoguard-8b-topic-control models. The [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/) for the nvidia/llama-nemotron-embed-1b-v2, nvidia/llama-nemotron-rerank-1b-v2 and llama-3.2-nemoretriever-1b-vlm-embed-v1 models. The [Llama 3.3 Community License Agreement](https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/LICENSE) for the llama-3.3-nemotron-super-49b-v1.5 model. Built with Llama. Apache 2.0 for NVIDIA Ingest and for the nemoretriever-page-elements-v2, nemotron-table-structure-v1, nemotron-graphic-elements-v1, paddleocr and nemoretriever-ocr-v1 models.
 
diff --git a/VERSION b/VERSION
new file mode 100644
index 000000000..41a281954
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+v0.0.7
diff --git a/ci/publish_wheel.sh b/ci/publish_wheel.sh
index f59165fee..711abb462 100755
--- a/ci/publish_wheel.sh
+++ b/ci/publish_wheel.sh
@@ -25,8 +25,8 @@ if [ -n "$ARTIFACTORY_VERSION" ]; then
     echo "Using custom Artifactory version: $ARTIFACTORY_VERSION"
     ARTIFACTORY_VERSION_FINAL=$ARTIFACTORY_VERSION
 else
-    echo "Using default Artifactory version: 2.4.0.dev"
-    ARTIFACTORY_VERSION_FINAL="2.4.0.dev"
+    echo "Using default Artifactory version: 2.5.0.dev"
+    ARTIFACTORY_VERSION_FINAL="2.5.0.dev"
 fi
 
 # Build first wheel for GitLab Package Registry
diff --git a/deploy/compose/.env b/deploy/compose/.env
index 9f6ccf796..0c62f9144 100644
--- a/deploy/compose/.env
+++ b/deploy/compose/.env
@@ -22,8 +22,8 @@ export NVIDIA_API_KEY=${NGC_API_KEY}
 export APP_LLM_SERVERURL=nim-llm:8000
 export APP_FILTEREXPRESSIONGENERATOR_SERVERURL=nim-llm:8000
 export SUMMARY_LLM_SERVERURL=nim-llm:8000
-export APP_EMBEDDINGS_SERVERURL=nemoretriever-embedding-ms:8000/v1
-export APP_RANKING_SERVERURL=nemoretriever-ranking-ms:8000
+export APP_EMBEDDINGS_SERVERURL=nemotron-embedding-ms:8000/v1
+export APP_RANKING_SERVERURL=nemotron-ranking-ms:8000
 export OCR_GRPC_ENDPOINT=nemoretriever-ocr:8001
 export OCR_HTTP_ENDPOINT=http://nemoretriever-ocr:8000/v1/infer
 export OCR_INFER_PROTOCOL=grpc
@@ -50,11 +50,11 @@ export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=grpc
 # export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
 # export OCR_INFER_PROTOCOL=http
 # export OCR_MODEL_NAME=scene_text_ensemble
-# export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3
+# export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
 # export YOLOX_INFER_PROTOCOL=http
-# export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
+# export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
 # export YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
-# export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
+# export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
 # export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
 # export APP_QUERYREWRITER_SERVERURL=""
 # export APP_QUERYREWRITER_MODELNAME="nvidia/llama-3.3-nemotron-super-49b-v1.5"
@@ -87,4 +87,22 @@ export OCR_MS_GPU_ID=0
 # Paths
 # ==========================
 
-export PROMPT_CONFIG_FILE=${PWD}/src/nvidia_rag/rag_server/prompt.yaml
\ No newline at end of file
+export PROMPT_CONFIG_FILE=${PWD}/src/nvidia_rag/rag_server/prompt.yaml
+
+# ==========================
+# Oracle 26ai Vector Database (Default)
+# ==========================
+# Set APP_VECTORSTORE_NAME=oracle to use Oracle 26ai (default)
+# Set APP_VECTORSTORE_NAME=milvus to use Milvus instead
+
+export APP_VECTORSTORE_NAME=oracle
+export ORACLE_USER=rag_user
+export ORACLE_PASSWORD=
+export ORACLE_DSN=localhost:1521/FREEPDB1
+
+# Oracle Vector Index Configuration
+export ORACLE_VECTOR_INDEX_TYPE=IVF
+export ORACLE_DISTANCE_METRIC=COSINE
+
+# For hybrid search (vector + text), set:
+# export APP_VECTORSTORE_SEARCH_TYPE=hybrid
\ No newline at end of file
diff --git a/deploy/compose/docker-compose-ingestor-server.yaml b/deploy/compose/docker-compose-ingestor-server.yaml
index 964a9c02c..1d284d53d 100644
--- a/deploy/compose/docker-compose-ingestor-server.yaml
+++ b/deploy/compose/docker-compose-ingestor-server.yaml
@@ -3,7 +3,7 @@ services:
   # Main ingestor server which is responsible for ingestion
   ingestor-server:
     container_name: ingestor-server
-    image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/ingestor-server:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../
@@ -75,8 +75,8 @@ services:
 
       ##===Embedding Model specific configurations===
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000/v1"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-embedding-ms:8000/v1"}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-1b-v2}
       # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
       # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-vlm-embedding-ms:8000/v1"}
       # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-vl-1b-v2}
@@ -95,7 +95,8 @@ services:
       APP_NVINGEST_EXTRACTPAGEASIMAGE: ${APP_NVINGEST_EXTRACTPAGEASIMAGE:-False}
       APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY: ${APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY:-""} # Select from "image", "text_image"
       APP_NVINGEST_IMAGE_ELEMENTS_MODALITY: ${APP_NVINGEST_IMAGE_ELEMENTS_MODALITY:-""} # Select from "image"
-      APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemoretron_parse, None
+      APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemotron_parse, None
+      APP_NVINGEST_EXTRACTTABLESMETHOD: ${APP_NVINGEST_EXTRACTTABLESMETHOD:-yolox} # Select from yolox, nemotron_parse, None
       # Extract text by "page" only recommended for documents with pages like .pdf, .docx, etc.
       APP_NVINGEST_TEXTDEPTH: ${APP_NVINGEST_TEXTDEPTH:-page} # extract by "page" or "document"
 
@@ -168,7 +169,7 @@ services:
       - "6379:6379"
 
   nv-ingest-ms-runtime:
-    image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.1
+    image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.2
     # cpuset: "0-15" # Uncomment to restrict this container to CPU cores 0–15
     shm_size: 40gb # Should be at minimum 30% of assigned memory per Ray documentation
     volumes:
@@ -234,13 +235,13 @@ services:
       - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer}
       - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc}
       # build.nvidia.com hosted yolox-graphics-elements endpoints.
-      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
+      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
       #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
       - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001}
       - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer}
       - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc}
       # build.nvidia.com hosted  yolox-table-elements endpoints.
-      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
+      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
       #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
       - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001}
       - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer}
diff --git a/deploy/compose/docker-compose-rag-server.yaml b/deploy/compose/docker-compose-rag-server.yaml
index b3e20808f..dc04c5329 100644
--- a/deploy/compose/docker-compose-rag-server.yaml
+++ b/deploy/compose/docker-compose-rag-server.yaml
@@ -3,7 +3,7 @@ services:
   # Main orchestrator server which stiches together all calls to different services to fulfill the user request
   rag-server:
     container_name: rag-server
-    image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/rag-server:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../
@@ -74,11 +74,11 @@ services:
       LLM_MAX_TOKENS: ${LLM_MAX_TOKENS:-32768}
       LLM_TEMPERATURE: ${LLM_TEMPERATURE:-0}
       LLM_TOP_P: ${LLM_TOP_P:-1.0}
-      
-      # Enable/disable thinking/reasoning for nemotron-3-nano models (30b variant)
-      # Set to "true" to enable reasoning mode with reasoning_budget
-      # Set to "false" to disable reasoning and get direct answers
-      ENABLE_NEMOTRON_3_NANO_THINKING: ${ENABLE_NEMOTRON_3_NANO_THINKING:-true}
+
+      # Reasoning configuration (supported by Nemotron 3 and other reasoning models)
+      LLM_ENABLE_THINKING: ${LLM_ENABLE_THINKING:-false}
+      LLM_REASONING_BUDGET: ${LLM_REASONING_BUDGET:-0}
+      LLM_LOW_EFFORT: ${LLM_LOW_EFFORT:-false}
 
       ##===Query Rewriter Model specific configurations===
       APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"nvidia/llama-3.3-nemotron-super-49b-v1.5"}
@@ -94,8 +94,8 @@ services:
 
       ##===Embedding Model specific configurations===
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000/v1"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-embedding-ms:8000/v1"}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-1b-v2}
       APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048}
       # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
       # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-vlm-embedding-ms:8000/v1"}
@@ -103,8 +103,8 @@ services:
 
       ##===Reranking Model specific configurations===
       # url on which ranking model is hosted. If "", Nvidia hosted API is used
-      APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"}
-      APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"}
+      APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemotron-ranking-ms:8000"}
+      APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-nemotron-rerank-1b-v2"}
       ENABLE_RERANKER: ${ENABLE_RERANKER:-True}
       # Default score threshold for filtering documents by reranker relevance (0.0 to 1.0)
       RERANKER_SCORE_THRESHOLD: ${RERANKER_SCORE_THRESHOLD:-${RERANKER_CONFIDENCE_THRESHOLD:-0.0}}
@@ -211,7 +211,7 @@ services:
   # Sample UI container which interacts with APIs exposed by rag-server container
   rag-frontend:
     container_name: rag-frontend
-    image: nvcr.io/nvstaging/blueprint/rag-frontend:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/rag-frontend:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../frontend
diff --git a/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml b/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
index 1e014fc9f..17200db05 100644
--- a/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
+++ b/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
@@ -17,5 +17,7 @@ rails:
       - content safety check input $model=content_safety
       - topic safety check input $model=topic_control
   output:
+    streaming:
+      enabled: true
     flows:
       - content safety check output $model=content_safety
\ No newline at end of file
diff --git a/deploy/compose/nemotron3-super-cloud.env b/deploy/compose/nemotron3-super-cloud.env
new file mode 100644
index 000000000..468bd2fb7
--- /dev/null
+++ b/deploy/compose/nemotron3-super-cloud.env
@@ -0,0 +1,49 @@
+# ==============================================================================
+# Nemotron 3 Super - NVIDIA-hosted (cloud) endpoints
+# ==============================================================================
+# Self-contained cloud + Nemotron 3 Super. Source after .env so cloud endpoints
+# override on-prem defaults:  source deploy/compose/.env && source deploy/compose/nemotron3-super-cloud.env
+# No need to edit .env (uncomment/comment sections).
+# ==============================================================================
+
+# === Authentication ===
+export NVIDIA_API_KEY=${NGC_API_KEY}
+
+# === Embeddings, Ranking, OCR, YOLOX (cloud) ===
+export APP_EMBEDDINGS_SERVERURL=https://integrate.api.nvidia.com/v1
+export APP_RANKING_SERVERURL=https://integrate.api.nvidia.com/v1
+export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
+export OCR_INFER_PROTOCOL=http
+export OCR_MODEL_NAME=scene_text_ensemble
+export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
+export YOLOX_INFER_PROTOCOL=http
+export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
+export YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
+export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
+export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
+
+# === LLM ===
+export APP_LLM_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+export APP_LLM_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Query Rewriter ===
+export APP_QUERYREWRITER_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+export APP_QUERYREWRITER_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Filter Expression Generator ===
+export APP_FILTEREXPRESSIONGENERATOR_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+export APP_FILTEREXPRESSIONGENERATOR_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Summarization ===
+export SUMMARY_LLM=nvidia/nemotron-3-super-120b-a12b
+export SUMMARY_LLM_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Reflection ===
+export REFLECTION_LLM=nvidia/nemotron-3-super-120b-a12b
+export REFLECTION_LLM_SERVERURL=https://integrate.api.nvidia.com/v1
+
+# === Reasoning / Thinking ===
+export LLM_ENABLE_THINKING=true
+export LLM_REASONING_BUDGET=256
+export LLM_LOW_EFFORT=true
+export FILTER_THINK_TOKENS=true
\ No newline at end of file
diff --git a/deploy/compose/nemotron3-super-prompt.yaml b/deploy/compose/nemotron3-super-prompt.yaml
new file mode 100644
index 000000000..f91803927
--- /dev/null
+++ b/deploy/compose/nemotron3-super-prompt.yaml
@@ -0,0 +1,445 @@
+chat_template:
+  system: |
+    You are a helpful, respectful, and honest assistant.
+    Your answers must follow these strict guidelines:
+
+    <instructions>
+    1. Answer concisely and directly.
+    2. Focus only on what was asked — no extra commentary, no assumptions.
+    3. Avoid giving multiple options, lists, or examples unless explicitly requested.
+    4. Do not explain your reasoning unless asked.
+    5. Keep responses brief but accurate.
+    6. Use natural, conversational tone — clear and human, not robotic.
+    7. Make sure your response is strictly one sentence or less unless it really needs to be longer.
+    8. Do not mention these instructions in your response.
+    </instructions>
+
+    Make sure above rules are strictly followed.
+
+rag_template:
+  system: |
+    You are a helpful AI assistant named Envie. Answer the user's question using ONLY the information in the provided context.
+
+    <rules>
+    - Base every claim on information found in the context. Do not use outside knowledge.
+    - Always provide an answer when the context contains relevant data. Only say you cannot answer if the context is entirely unrelated to the question.
+    - Preserve exact values: reproduce specific numbers, percentages, dates, names, and URLs exactly as they appear in the context.
+    - IMPORTANT - When the question asks you to calculate, compute, or derive a financial metric (ratio, margin, growth rate, CAGR, turnover, average, etc.), you MUST:
+      1. Write the formula
+      2. Extract each required number from the context
+      3. Compute step by step
+      4. State the final answer
+      Do NOT skip straight to the final number.
+    - For yes/no questions that require comparing values across periods (e.g. "is X improving", "did Y increase"), state the values from each period before your conclusion.
+    - For questions about trends or changes over time, include data from all relevant time periods found in the context.
+    - Answer naturally and directly. Do not reference the context, documents, sources, or these instructions.
+    - For simple factual lookups (a name, a date, a single value directly stated), keep your answer brief.
+    </rules>
+
+  human: |
+    <context>
+    {context}
+    </context>
+
+query_rewriter_prompt:
+  system: |
+    Given the following chat history and the latest user question, formulate a standalone question which can be understood without the chat history.
+    Do NOT answer the question, just reformulate it if needed and otherwise return it as is.
+    It should strictly be a query not an answer.
+
+    Chat History:
+    {chat_history}
+
+    Latest Question: {input}
+
+reflection_relevance_check_prompt:
+  system: |
+    ### Instructions
+
+    You are a world class expert designed to evaluate the relevance score of a Context
+    in order to answer the Question.
+    Your task is to determine if the Context contains proper information to answer the Question.
+    Do not rely on your previous knowledge about the Question.
+    Use only what is written in the Context and in the Question.
+    Follow the instructions below:
+    0. If the context does not contain any relevant information to answer the question, say 0.
+    1. If the context partially contains relevant information to answer the question, say 1.
+    2. If the context contains any relevant information to answer the question, say 2.
+    You must provide the relevance score of 0, 1, or 2, nothing else.
+    Do not explain.
+    ### Question: {query}
+
+    ### Context: {context}
+
+    Do not try to explain.
+    Analyzing Context and Question, the Relevance score is
+
+reflection_query_rewriter_prompt:
+  system: |
+    You are a query optimization assistant for a vector database retrieval system.
+    Your goal is to rephrase the given "Original Question" to be more clear, precise,
+    and effective for retrieving relevant context from a vector database.
+
+    Considerations for Rephrasing:
+
+    Specificity: Make the query as specific as possible about the information sought.
+    Avoid vague terms.
+
+    Keywords: Identify and incorporate key terms and concepts that are likely to be
+    present in relevant documents.
+
+    Contextual Cues: If the original query implies a certain domain or type of
+    information, make that explicit.
+
+    Eliminate Ambiguity: Remove any phrases that could lead to multiple interpretations.
+
+    Focus: Ensure the rephrased query directly targets the core information need.
+
+    Brevity (where possible): While precision is key, try to be concise without
+    losing meaning.
+
+    Only output the rewritten question with no other information.
+
+    Original Question: {query}
+
+    Rewritten Question:
+
+reflection_groundedness_check_prompt:
+  system: |
+    ### Instruction
+
+    You are a world class expert designed to evaluate the groundedness of an assertion.
+    You will be provided with an assertion and a context.
+    Your task is to determine if the assertion is supported by the context.
+    Follow the instructions below:
+    A. If there is no context or no assertion or context is empty or assertion is empty, say 0.
+    B. If the assertion is not supported by the context, say 0.
+    C. If the assertion is partially supported by the context, say 1.
+    D. If the assertion is fully supported by the context, say 2.
+    You must provide a rating of 0, 1, or 2, nothing else.
+
+    ### Context:
+    <{context}>
+
+    ### Assertion:
+    <{response}>
+
+    Analyzing Context and Response, the Groundedness score is
+
+reflection_response_regeneration_prompt:
+  system: |
+    You are tasked with creating a new "Response" based solely on the provided 
+    "Context" and "Query". Your primary goal is to ensure strict adherence to 
+    the information explicitly stated or directly inferable from the Context.
+
+    Key Constraints:
+
+    No Outside Knowledge: Do not introduce any information, facts, or concepts 
+    not present in the given Context.
+
+    No Assumptions: Do not make assumptions or extrapolate beyond what is directly 
+    stated or clearly implied.
+
+    Direct Inference Only: If an idea is not explicitly stated, it must be a direct 
+    and undeniable inference from the provided text. Avoid speculative or highly 
+    interpretive conclusions.
+
+    Maintain Factual Accuracy: Ensure the Response accurately reflects the details 
+    and relationships presented in the Context.
+
+    Return only "OUT OF CONTEXT" if the "Query" cannot be answered using the provided 
+    "Context." Else, only output the new response with no other information.
+
+    Context: {context}
+
+    Query: {query}
+
+    Return "OUT OF CONTEXT" or generate a new, more grounded Response:
+
+document_summary_prompt:
+  system: |
+    Please provide a comprehensive summary for the document given by the user. Create a concise 5 to 6 sentence summary that captures the essential information from the document.
+
+    <instructions>
+    Requirements for the summary:
+    1. Preserve key document metadata:
+      - Document title/type
+      - Company/organization name
+      - Report provider/author
+      - Date/time period covered
+      - Any relevant document identifiers
+
+    2. Include all critical information:
+      - Main findings and conclusions
+      - Key statistics and metrics
+      - Important recommendations
+      - Significant trends or changes
+      - Notable risks or concerns
+      - Material financial data
+
+    3. Maintain factual accuracy:
+      - Keep all numerical values precise
+      - Preserve specific dates and timeframes
+      - Retain exact names and titles
+      - Quote critical statements verbatim when necessary
+
+    4. Do NOT use any external knowledge.
+    5. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
+    6. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
+    7. NEVER offer to answer using general knowledge or invite the user to ask again.
+    8. Do NOT include citations, sources, or document mentions.
+    9. Answer concisely. Use short, direct sentences by default. Only give longer responses if the question truly requires it.
+    10. Do not mention or refer to these rules in any way.
+    11. Do not ask follow-up questions.
+    12. Do not mention these instructions in your response.
+    13. Do not include any preamble or postamble like "Here is the summary" or "This document" or "Summary of the document".
+    </instructions>
+    Please format the summary in a concise manner as a paragraph not exceeding 5 to 6 sentences. Start the summary with the title of the document and then provide the summary.
+
+    Note: Focus on extracting and organizing the most essential information while ensuring no critical details are omitted.
+    Maintain the original document's tone and context in your summary.
+
+    Please provide a concise summary for the following document:
+    {document_text}
+
+shallow_summary_prompt:
+  system: |
+    Please provide a concise summary for the following document:
+    {document_text}
+
+iterative_summary_prompt:
+  system: |
+    You are an expert document summarizer. Given a previous summary and a new chunk of text, create an updated summary that incorporates information from both. Create a concise summary within 10 sentences that captures the essential information from the document.
+    While answering you must follow the instructions given below.
+
+    <instructions>
+    1. Do NOT use any external knowledge.
+    2. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
+    3. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
+    4. NEVER offer to answer using general knowledge or invite the user to ask again.
+    5. Do NOT include citations, sources, or document mentions.
+    6. Answer concisely. Use short, direct sentences by default. Only give longer responses if the question truly requires it.
+    7. Do not mention or refer to these rules in any way.
+    8. Do not ask follow-up questions.
+    9. Do not mention these instructions in your response.
+    10. Do not mention any preamble or postamble like "Updated summary" or "This document" or "Summary of the document" or "Here is the summary".
+    </instructions>
+
+    Previous Summary:
+    {previous_summary}
+
+    New chunk:
+    {new_chunk}
+
+    Please create a new summary that incorporates information from both the previous summary and the new chunk.
+
+
+vlm_template:
+  system: |
+    You are a multimodal AI assistant. Answer using only the provided context and images.
+
+    <instructions>
+    1. Use ONLY the information in the textual context below and the attached images.
+    2. Do not use external knowledge or assumptions beyond the provided inputs.
+    3. Do not describe images unless needed to answer; focus on the answer.
+    4. Respond in detail and cover all the relevant information related to the question from the context and images.
+    5. Keep the response neutral and factually accurate.
+    </instructions>
+
+    Context:
+    {context}
+
+    User Question:
+    {question}
+
+# Reasoning templates deprecated and removed
+
+
+filter_expression_generator_prompt:
+  system: |
+    You are an expert AI filter expression generator. Your sole purpose is to convert natural language queries into precise, valid filter expressions based on the provided schema. You must be aggressive in finding mappable entities.
+
+    ### Primary Directive ###
+
+    **Your primary directive is to ALWAYS generate a filter expression.** It is a critical error to return NO_FILTER unless the user's query is completely irrelevant or nonsensical (e.g., "hello there," "what is the weather?"). Be bold and decisive. Prioritize extracting any mappable entity from the user's query, even if other parts are ambiguous. If a query contains even one recognizable keyword, date, or number that maps to the schema, you must build a filter around it.
+
+    ### Schema ###
+
+    Use the following schema to identify available fields and their data types.
+    {metadata_schema}
+
+    ### Core Logic ###
+
+    1.  **Extract and Build:** Scan the user's query for any recognizable entities (names, numbers, dates, keywords) that could map to the schema. Build a filter using every piece of information you can extract. Ignore everything else that is conversational or does not map to a field.
+    2.  **Field Format:** The field format is always content_metadata["field_name"].
+    3.  **Operators:** Use uppercase logical operators: AND, OR, NOT. Use parentheses () to group expressions.
+
+    ### Operators & Data Types (Complete List) ###
+
+    1.  **String**: ==, !=, in, like
+        * Example: content_metadata["doc_type"] in ["report", "summary"]
+    2.  **Number**: ==, !=, >, >=, <, <=, in, between
+        * Example: content_metadata["page_count"] > 10
+    3.  **Datetime** (Format: YYYY-MM-DDTHH:MM:SS): ==, !=, >, >=, <, <=
+        * Example: content_metadata["created_at"] >= "2024-01-01T00:00:00"
+    4.  **Boolean**: ==, !=
+        * Example: content_metadata["is_public"] == true
+    5.  **Array**: array_contains, array_contains_any, array_contains_all, array_length
+        * Single value: array_contains(content_metadata["category"], "AI")
+        * Multiple values (any): array_contains_any(content_metadata["regions"], ["EMEA", "APAC"])
+        * Multiple values (all): array_contains_all(content_metadata["tags"], ["urgent", "review"])
+
+    ### Intelligent Mapping Examples ###
+
+    * **Query:** "Project X"
+        * **Action:** Recognizes "Project X" as a single mappable entity and builds a filter.
+        * **Output:** content_metadata["project"] == "Project X"
+    * **Query:** "approved"
+        * **Action:** Recognizes "approved" as a status and builds a filter just for that.
+        * **Output:** content_metadata["status"] == "approved"
+    * **Query:** "Find the latest financial reports for Project X"
+        * **Action:** Ignore "latest" as it's subjective. Extract "financial reports" and "Project X".
+        * **Output:** (content_metadata["doc_type"] == "financial_report" AND content_metadata["project"] == "Project X")
+    * **Query:** "I think I need the document from Q2 last year about compliance"
+        * **Action:** Ignore "I think I need". Extract "Q2 last year" (2024) and "compliance".
+        * **Output:** (content_metadata["created_at"] >= "2024-04-01T00:00:00" AND content_metadata["created_at"] < "2024-07-01T00:00:00" AND array_contains(content_metadata["tags"], "compliance"))
+
+    ### Your Task ###
+
+    Convert the following user query into a filter expression.
+    {user_request}
+
+    ### Response Format ###
+
+    Your response **MUST** be only the raw filter expression string and nothing else. Do not use explanations, comments, or markdown.
+
+    1.  **On Success:** The filter expression string.
+        * content_metadata["year"] == 2024
+
+    2.  **On Absolute Failure:** The exact text NO_FILTER.
+        * **Use this ONLY if the query is completely unrelated to the schema**, like "what is your name?" or "tell me a joke".
+
+    3.  **On Logical Conflict:** The exact text UNSUPPORTED.
+        * **Use this ONLY for impossible logic**, like "year is 2022 and year is 2023".
+
+query_decomposition_multiquery_prompt:
+  system: |
+    You are an AI assistant designed to break down a user's complex question into a list of simpler, focused subqueries. 
+    The purpose of this decomposition is to improve the accuracy of a retrieval-augmented generation (RAG) system.
+
+    <instructions>
+    1. Analyze the user's main question to identify its key components.
+    2. Decompose the question into 1-3 distinct, self-contained subqueries. 
+    3. If the original question is simple and already focused, return the query directly.
+    4. Each subquery should be a clear, direct question that, when answered, contributes to a comprehensive response to the original question.
+    5. Avoid creating redundant or overly broad subqueries. Focus on the core information needed to answer the original prompt.
+    </instructions>
+
+    Return only the subqueries as a numbered list, without any additional text.
+    Original question: {question}
+
+query_decompositions_query_rewriter_prompt:
+  system: |
+    You are an expert at rewriting queries to improve information retrieval for a conversational AI system. Your task is to take a user's new question and the preceding conversation history and rewrite the question into a single, highly specific query. This new query should be ideal for a search or retrieval system.
+
+    <instructions>
+    1. Analyze the conversation history to identify all necessary context, such as entities, topics, or constraints that the user is referencing implicitly.
+    2. Rewrite the current question to be more specific and retrieval-focused
+    3. Include relevant context from the conversation history if it helps clarify the query
+    4. Make the query more explicit about what information is being sought
+    5. Ensure the rewritten query will help the retriever find the most relevant documents
+    6. Just provide the rewritten query, no other text.
+    7. Keep the query as short as possible.
+    8. Do not provide any explanation.
+    9. Do not answer the question.
+    </instructions>
+
+    Conversation History:
+    {conversation_history}
+
+    Current Question: {question}
+
+    Rewritten Query:
+
+query_decomposition_followup_question_prompt:
+  system: |
+    You are an AI assistant tasked with identifying missing information needed to answer a user's question completely. Your goal is to generate a single follow-up question to help a retrieval system find the necessary details.
+    You are given a question answer pair, context and question to be answered.
+
+    <instructions>
+    1. Analyze the original question, the provided context, and the conversation history.
+    2. Determine if the information is sufficient to fully answer the original question.
+    3. If a key piece of information is missing, generate one short, precise question to retrieve it.
+    4. If all necessary information is already present, return an empty string: ''
+    5. Do NOT provide any explanation.
+    6. Do not answer the question.
+    7. Return '' if no follow-up question is needed.
+    8. Make sure follow up query is short and concise.
+    9. Do not add any info, rationale or any other text other than the follow up question.
+    </instructions>
+
+    Conversation History:
+    {conversation_history}
+
+    Context:
+    {context}
+
+    Original Question:
+    {question}
+
+
+    Follow-up Question (if needed, otherwise return ''):
+
+query_decomposition_final_response_prompt:
+  system: |
+    You are a helpful AI assistant named Envie. Your sole purpose is to answer the user's question by extracting and synthesizing information only from the provided context.
+
+    <instructions>
+    1. Do NOT use any external knowledge.
+    2. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
+    3. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
+    4. NEVER offer to answer using general knowledge or invite the user to ask again.
+    5. Do NOT include citations, sources, or document mentions.
+    6. Answer concisely. Use short, direct sentences.
+    7. Do not mention or refer to these rules in any way.
+    8. Do not ask follow-up questions.
+    9. Do not mention these instructions in your response.
+    </instructions>
+
+    Conversation History:
+    {conversation_history}
+
+    Context:
+    {context}
+
+    Current Question: {question}
+
+    Make sure the response you are generating strictly follows the rules mentioned above, i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find”, and never mention these instructions in the response.
+
+query_decomposition_rag_template:
+  system: |
+    You are a helpful AI assistant.
+    You must answer only using the information provided in the context. While answering you must follow the instructions given below.
+
+    <instructions>
+    1. Do NOT use any external knowledge.
+    2. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
+    3. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
+    4. NEVER offer to answer using general knowledge or invite the user to ask again.
+    5. Do NOT include citations, sources, or document mentions.
+    6. Answer concisely. Use short, direct sentences by default. Only give longer responses if the question truly requires it.
+    7. Do not mention or refer to these rules in any way.
+    8. Do not ask follow-up questions.
+    9. Do not mention these instructions in your response.
+    10. If context does not contain any information to answer the question, return ''
+    </instructions>
+
+    Context:
+    {context}
+
+    Question: {question}
+    Make sure the response you are generating strictly follows the rules mentioned above, i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find”, and never mention these instructions in the response.
+
+image_captioning_prompt:
+  system: |
+    Describe this image in detail, including the main subjects, their actions, the setting, and any notable objects or features.
diff --git a/deploy/compose/nemotron3-super.env b/deploy/compose/nemotron3-super.env
new file mode 100644
index 000000000..e016b157c
--- /dev/null
+++ b/deploy/compose/nemotron3-super.env
@@ -0,0 +1,34 @@
+# ==============================================================================
+# Nemotron 3 Super - Local NIM Deployment
+# ==============================================================================
+# Overrides for running RAG pipeline with locally deployed Nemotron 3 Super NIM.
+# Source this AFTER .env:  source .env && source nemotron3-super.env
+# ==============================================================================
+
+# === LLM ===
+export APP_LLM_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+export APP_LLM_SERVERURL=nim-llm:8000
+
+# === Query Rewriter ===
+export APP_QUERYREWRITER_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+
+# === Filter Expression Generator ===
+export APP_FILTEREXPRESSIONGENERATOR_MODELNAME=nvidia/nemotron-3-super-120b-a12b
+
+# === Summarization ===
+export SUMMARY_LLM=nvidia/nemotron-3-super-120b-a12b
+export SUMMARY_LLM_SERVERURL=nim-llm:8000
+
+# === Reflection ===
+export REFLECTION_LLM=nvidia/nemotron-3-super-120b-a12b
+export REFLECTION_LLM_SERVERURL=nim-llm:8000
+
+# === Reasoning / Thinking ===
+export LLM_ENABLE_THINKING=true
+export LLM_REASONING_BUDGET=256
+export LLM_LOW_EFFORT=true
+export FILTER_THINK_TOKENS=true
+
+# === LLM_MAX_TOKENS (for RTX 6000 Pro when using NIM_MAX_MODEL_LEN=32768) ===
+# Uncomment and set: 16256
+# export LLM_MAX_TOKENS=16256
diff --git a/deploy/compose/nims.yaml b/deploy/compose/nims.yaml
index f376d9a64..2bca3dce2 100644
--- a/deploy/compose/nims.yaml
+++ b/deploy/compose/nims.yaml
@@ -31,9 +31,9 @@ services:
       retries: 100
     profiles: ["", "rag"]
 
-  nemoretriever-embedding-ms:
-    container_name: nemoretriever-embedding-ms
-    image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.10.1
+  nemotron-embedding-ms:
+    container_name: nemotron-embedding-ms
+    image: nvcr.io/nim/nvidia/llama-nemotron-embed-1b-v2:1.13.0
     volumes:
     - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache
     ports:
@@ -91,9 +91,9 @@ services:
       start_period: 10m
     profiles: ["vlm-embed", "vlm-ingest"]
 
-  nemoretriever-ranking-ms:
-    container_name: nemoretriever-ranking-ms
-    image: nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:1.8.0
+  nemotron-ranking-ms:
+    container_name: nemotron-ranking-ms
+    image: nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2:1.10.0
     volumes:
     - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache
     ports:
@@ -108,6 +108,7 @@ services:
       interval: 10s
       timeout: 20s
       retries: 100
+    shm_size: 16GB
     deploy:
       resources:
         reservations:
@@ -119,7 +120,7 @@ services:
     profiles: ["", "rag", "vlm-generation"]
 
   page-elements:
-    image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-page-elements-v3}:${YOLOX_TAG:-1.7.0}
+    image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemotron-page-elements-v3}:${YOLOX_TAG:-1.8.0}
     shm_size: 16gb
     ports:
       - "8000:8000"
@@ -157,7 +158,7 @@ services:
     profiles: ["", "ingest", "vlm-ingest"]
 
   graphic-elements:
-    image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.6.0}
+    image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemotron-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.8.0}
     shm_size: 16gb
     ports:
       - "8003:8000"
@@ -183,7 +184,7 @@ services:
     profiles: ["", "ingest", "vlm-ingest"]
 
   table-structure:
-    image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.6.0}
+    image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemotron-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.8.0}
     shm_size: 16gb
     ports:
       - "8006:8000"
@@ -323,6 +324,7 @@ services:
       interval: 10s
       timeout: 20s
       retries: 100
+    shm_size: 16GB
     deploy:
       resources:
         reservations:
diff --git a/deploy/compose/nvdev.env b/deploy/compose/nvdev.env
index b92e4500b..d5a919153 100644
--- a/deploy/compose/nvdev.env
+++ b/deploy/compose/nvdev.env
@@ -20,24 +20,24 @@ export APP_LLM_MODELNAME=nvidia/llama-3.3-nemotron-super-49b-v1.5
 # export APP_LLM_MODELNAME=nvidia/nemotron-3-nano-30b-a3b
 # Note: For locally deployed nemotron-3-nano, use: nvidia/nemotron-3-nano
 export APP_FILTEREXPRESSIONGENERATOR_MODELNAME=nvidia/llama-3.3-nemotron-super-49b-v1.5
-export APP_EMBEDDINGS_MODELNAME=nvdev/nvidia/llama-3.2-nv-embedqa-1b-v2
+export APP_EMBEDDINGS_MODELNAME=nvidia/llama-nemotron-embed-1b-v2
 # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
 # export APP_EMBEDDINGS_MODELNAME=nvdev/nvidia/llama-nemotron-embed-vl-1b-v2
-export APP_RANKING_MODELNAME=nvidia/llama-3.2-nv-rerankqa-1b-v2
+export APP_RANKING_MODELNAME=nvidia/llama-nemotron-rerank-1b-v2
 export ENABLE_RERANKER=True
 export APP_EMBEDDINGS_SERVERURL=https://integrate.api.nvidia.com/v1
 export APP_LLM_SERVERURL=""
 export APP_FILTEREXPRESSIONGENERATOR_SERVERURL=""
 export APP_RANKING_SERVERURL=""
-# export APP_RANKING_SERVERURL=https://ai.api.nvidia.com/v1/nvdev/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking/v1
+# export APP_RANKING_SERVERURL=https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-nemotron-rerank-1b-v2/reranking
 export OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
 export OCR_INFER_PROTOCOL=http
 export OCR_MODEL_NAME=scene_text_ensemble
-export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3
+export YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
 export YOLOX_INFER_PROTOCOL=http
-export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvdev/nvidia/nemoretriever-graphic-elements-v1
+export YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
 export YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
-export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvdev/nvidia/nemoretriever-table-structure-v1
+export YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
 export YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
 export SUMMARY_LLM="nvidia/llama-3.3-nemotron-super-49b-v1.5"
 export SUMMARY_LLM_SERVERURL=""
diff --git a/deploy/compose/vectordb.yaml b/deploy/compose/vectordb.yaml
index 651f85ddc..3bd798a4b 100644
--- a/deploy/compose/vectordb.yaml
+++ b/deploy/compose/vectordb.yaml
@@ -78,7 +78,7 @@ services:
       interval: 30s
       timeout: 20s
       retries: 3
-    profiles: ["", "milvus", "elasticsearch", "minio"]
+    profiles: ["", "milvus", "elasticsearch", "minio", "oracle"]
 
   elasticsearch:
     container_name: elasticsearch
@@ -109,6 +109,29 @@ services:
       retries: 10
     profiles: ["elasticsearch"]
 
+  # Oracle 26ai Vector Database (CPU-based); only started when the "oracle" profile is active
+  # Uses Oracle Free tier container for development/testing (password falls back to "oracle123" if ORACLE_PASSWORD is unset or empty)
+  # For production, connect to your Oracle 26ai instance using ORACLE_DSN
+  oracle-26ai:
+    container_name: oracle-26ai
+    image: container-registry.oracle.com/database/free:latest  # NOTE(review): floating "latest" tag; consider pinning a release
+    environment:
+      - ORACLE_PWD=${ORACLE_PASSWORD:-oracle123}
+      - ORACLE_CHARACTERSET=AL32UTF8
+    ports:
+      - "1521:1521"  # port the healthcheck below connects to
+      - "5500:5500"
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/oracle:/opt/oracle/oradata  # host bind mount for /opt/oracle/oradata
+    healthcheck:
+      test: ["CMD", "bash", "-c", "echo 'SELECT 1 FROM DUAL;' | sqlplus -s system/${ORACLE_PASSWORD:-oracle123}@localhost:1521/FREEPDB1"]  # sqlplus login as system to FREEPDB1, same password as ORACLE_PWD
+      interval: 30s
+      timeout: 10s
+      retries: 10
+      start_period: 300s  # 5 minute grace period before failed checks count toward retries
+    shm_size: 1g
+    profiles: ["oracle"]
+
 networks:
   default:
     name: nvidia-rag
\ No newline at end of file
diff --git a/deploy/helm/mig-slicing/mig-config.yaml b/deploy/helm/mig-slicing/mig-config-h100.yaml
similarity index 100%
rename from deploy/helm/mig-slicing/mig-config.yaml
rename to deploy/helm/mig-slicing/mig-config-h100.yaml
diff --git a/deploy/helm/mig-slicing/mig-config-rtx6000.yaml b/deploy/helm/mig-slicing/mig-config-rtx6000.yaml
new file mode 100644
index 000000000..14272b497
--- /dev/null
+++ b/deploy/helm/mig-slicing/mig-config-rtx6000.yaml
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: ConfigMap  # MIG layouts for RTX 6000; slice names match those requested in values-mig-rtx6000.yaml
+metadata:
+  name: custom-mig-config
+data:
+  config.yaml: |
+    version: v1
+    mig-configs:
+      all-disabled:
+        - devices: all
+          mig-enabled: false
+
+      custom-rtx6000-4x1g24-2x1g24-1x2g48-1x4g96:
+        - devices: [0]
+          mig-enabled: true
+          mig-devices:
+            "1g.24gb": 4
+        - devices: [1]
+          mig-enabled: true
+          mig-devices:
+            "1g.24gb": 2
+            "2g.48gb": 1
+        - devices: [2]
+          mig-enabled: true
+          mig-devices:
+            "4g.96gb": 1
diff --git a/deploy/helm/mig-slicing/values-mig.yaml b/deploy/helm/mig-slicing/values-mig-h100.yaml
similarity index 100%
rename from deploy/helm/mig-slicing/values-mig.yaml
rename to deploy/helm/mig-slicing/values-mig-h100.yaml
diff --git a/deploy/helm/mig-slicing/values-mig-rtx6000.yaml b/deploy/helm/mig-slicing/values-mig-rtx6000.yaml
new file mode 100644
index 000000000..e7ae285da
--- /dev/null
+++ b/deploy/helm/mig-slicing/values-mig-rtx6000.yaml
@@ -0,0 +1,114 @@
+# MIG resource overrides for the RAG Blueprint on RTX 6000; slice names match mig-config-rtx6000.yaml
+# Overrides GPU requests/limits to use MIG slices, and also sets PVC storageClass and the LLM model profile
+
+# NV-Ingest configuration
+nv-ingest:
+  # Milvus - uses 1g.24gb MIG slice
+  milvus:
+    standalone:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"  # no whole-GPU request; the MIG slice on the next line is requested instead
+          nvidia.com/mig-1g.24gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+
+  # NV-Ingest NIM Operator overrides (one MIG slice per NIM)
+  nimOperator:
+    # Page Elements - uses 1g.24gb
+    page_elements:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+      storage:
+        pvc:
+          storageClass: ""  # NOTE(review): empty class name, presumably falls back to the cluster default; confirm
+
+    # Graphic Elements - uses 1g.24gb
+    graphic_elements:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+      storage:
+        pvc:
+          storageClass: ""
+
+    # Table Structure - uses 1g.24gb
+    table_structure:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-1g.24gb: 1
+      storage:
+        pvc:
+          storageClass: ""
+
+    # OCR - uses 2g.48gb (larger slice)
+    nemoretriever_ocr_v1:
+      resources:
+        limits:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-2g.48gb: 1
+        requests:
+          nvidia.com/gpu: "0"
+          nvidia.com/mig-2g.48gb: 1
+      storage:
+        pvc:
+          storageClass: ""
+# Main NIM Operator overrides for MIG (storage is a sibling of resources, as in values.yaml)
+nimOperator:
+  # LLM - uses 4g.96gb
+  nim-llm:
+    resources:
+      limits:
+        nvidia.com/gpu: "0"  # no whole-GPU request; the MIG slice on the next line is requested instead
+        nvidia.com/mig-4g.96gb: 1
+      requests:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-4g.96gb: 1
+    storage:
+      pvc:
+        storageClass: ""
+    model:
+      engine: tensorrt_llm
+      precision: "fp8"
+      qosProfile: "throughput"
+      tensorParallelism: "1"
+      gpus:
+        - product: "rtx6000_blackwell_sv"
+  # Embedding - uses 1g.24gb
+  nvidia-nim-llama-32-nv-embedqa-1b-v2:
+    resources:
+      limits:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-1g.24gb: 1
+      requests:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-1g.24gb: 1
+    storage:
+      pvc:
+        storageClass: ""
+  # Reranking - uses 1g.24gb
+  nvidia-nim-llama-32-nv-rerankqa-1b-v2:
+    resources:
+      limits:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-1g.24gb: 1
+      requests:
+        nvidia.com/gpu: "0"
+        nvidia.com/mig-1g.24gb: 1
+    storage:
+      pvc:
+        storageClass: ""
diff --git a/deploy/helm/nvidia-blueprint-rag/Chart.lock b/deploy/helm/nvidia-blueprint-rag/Chart.lock
index 7b479e4aa..723660bfd 100644
--- a/deploy/helm/nvidia-blueprint-rag/Chart.lock
+++ b/deploy/helm/nvidia-blueprint-rag/Chart.lock
@@ -1,7 +1,7 @@
 dependencies:
 - name: nv-ingest
   repository: https://helm.ngc.nvidia.com/nvidia/nemo-microservices
-  version: 26.1.1
+  version: 26.1.2
 - name: eck-elasticsearch
   repository: https://helm.elastic.co
   version: 0.18.0
@@ -14,5 +14,5 @@ dependencies:
 - name: kube-prometheus-stack
   repository: https://prometheus-community.github.io/helm-charts
   version: 76.3.0
-digest: sha256:7f85073bdf19922173b3372d9b5a877d6c2f783b431ce7a2f783308f67806c66
-generated: "2026-02-04T07:29:44.453434343Z"
+digest: sha256:a65037bbcb6fa587af3d15b949a32b059cf26d1102a2166d0e77daed29a0f520
+generated: "2026-03-02T16:48:31.702049307+05:30"
diff --git a/deploy/helm/nvidia-blueprint-rag/Chart.yaml b/deploy/helm/nvidia-blueprint-rag/Chart.yaml
index a8a459279..afe7a5cbd 100644
--- a/deploy/helm/nvidia-blueprint-rag/Chart.yaml
+++ b/deploy/helm/nvidia-blueprint-rag/Chart.yaml
@@ -1,10 +1,10 @@
 apiVersion: v2
-appVersion: v2.4.0
+appVersion: v2.5.0
 dependencies:
 - condition: nv-ingest.enabled
   name: nv-ingest
   repository: https://helm.ngc.nvidia.com/nvidia/nemo-microservices
-  version: 26.1.1
+  version: 26.1.2
 - condition: eck-elasticsearch.enabled
   name: eck-elasticsearch
   repository: https://helm.elastic.co
@@ -24,4 +24,4 @@ dependencies:
 description: An end to end Helm chart for the NVIDIA RAG Blueprint
 name: nvidia-blueprint-rag
 type: application
-version: v2.4.0
+version: v2.5.0
diff --git a/deploy/helm/nvidia-blueprint-rag/endpoints.md b/deploy/helm/nvidia-blueprint-rag/endpoints.md
index 7609053d5..e62b0d1fb 100644
--- a/deploy/helm/nvidia-blueprint-rag/endpoints.md
+++ b/deploy/helm/nvidia-blueprint-rag/endpoints.md
@@ -24,11 +24,11 @@ This document describes the configurable endpoints used by the RAG server and it
 
 ### Embedding Model
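-- **APP_EMBEDDINGS_SERVERURL**: URL for the embedding model service (default: "nemo-retriever-embedding-ms:8000")
+- **APP_EMBEDDINGS_SERVERURL**: URL for the embedding model service (default: "nemotron-embedding-ms:8000/v1")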
-- **APP_EMBEDDINGS_MODELNAME**: Name of the embedding model (default: "nvidia/llama-3.2-nv-embedqa-1b-v2")
+- **APP_EMBEDDINGS_MODELNAME**: Name of the embedding model (default: "nvidia/llama-nemotron-embed-1b-v2")
 
 ### Reranking Model
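-- **APP_RANKING_SERVERURL**: URL for the ranking model service (default: "nemo-retriever-reranking-ms:8000")
+- **APP_RANKING_SERVERURL**: URL for the ranking model service (default: "nemotron-ranking-ms:8000")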
-- **APP_RANKING_MODELNAME**: Name of the ranking model (default: "nvidia/llama-3.2-nv-rerankqa-1b-v2")
+- **APP_RANKING_MODELNAME**: Name of the ranking model (default: "nvidia/llama-nemotron-rerank-1b-v2")
 
 ### Reflection Model
 - **REFLECTION_LLM_SERVERURL**: URL for the reflection LLM service (default: "nim-llm:8000")
@@ -42,8 +42,8 @@ This document describes the configurable endpoints used by the RAG server and it
 
 ### Model Configuration
 - **NEXT_PUBLIC_MODEL_NAME**: Name of the LLM model used in the frontend (default: "nvidia/llama-3.3-nemotron-super-49b-v1.5")
-- **VITE_EMBEDDING_MODEL**: Name of the embedding model used in the frontend (default: "nvidia/llama-3.2-nv-embedqa-1b-v2")
-- **VITE_RERANKER_MODEL**: Name of the reranker model used in the frontend (default: "nvidia/llama-3.2-nv-rerankqa-1b-v2")
+- **VITE_EMBEDDING_MODEL**: Name of the embedding model used in the frontend (default: "nvidia/llama-nemotron-embed-1b-v2")
+- **VITE_RERANKER_MODEL**: Name of the reranker model used in the frontend (default: "nvidia/llama-nemotron-rerank-1b-v2")
 
 ## Monitoring and Tracing Endpoints
 
diff --git a/deploy/helm/nvidia-blueprint-rag/files/prompt.yaml b/deploy/helm/nvidia-blueprint-rag/files/prompt.yaml
index f82c83655..d73036509 100644
--- a/deploy/helm/nvidia-blueprint-rag/files/prompt.yaml
+++ b/deploy/helm/nvidia-blueprint-rag/files/prompt.yaml
@@ -487,6 +487,7 @@ query_decomposition_rag_template:
     Context:
     {context}
 
+    Question: {question}
     Make sure the response you are generating strictly follow the rules mentioned above i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find” and mention about the instruction in response.
 
 image_captioning_prompt:
diff --git a/deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml b/deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
new file mode 100644
index 000000000..d042a6c44
--- /dev/null
+++ b/deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
@@ -0,0 +1,25 @@
+# Override values for Nemotron 3 Super on RTX 6000 Pro only.
+# Use after nemotron3-super-values.yaml:
+#   -f deploy/helm/nvidia-blueprint-rag/values.yaml \
+#   -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml \
+#   -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
+# See docs/nemotron3-super-deployment.md. Requires host GRUB/reboot for RTX 6000 Pro.
+
+envVars:
+  LLM_MAX_TOKENS: "16256"  # use "1024" for non-reasoning mode
+
+nimOperator:
+  nim-llm:
+    env:  # Helm replaces lists wholesale, so the entries from nemotron3-super-values.yaml are repeated here
+      - name: NIM_HTTP_API_PORT
+        value: "8000"
+      - name: NIM_TRITON_LOG_VERBOSE
+        value: "1"
+      - name: NIM_SERVED_MODEL_NAME
+        value: "nvidia/nemotron-3-super-120b-a12b"
+      - name: NIM_MAX_MODEL_LEN
+        value: "32768"  # lower than the "131072" set in nemotron3-super-values.yaml
+      - name: NCCL_P2P_DISABLE
+        value: "1"
+      - name: NIM_KVCACHE_PERCENT
+        value: "0.9"
diff --git a/deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml b/deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml
new file mode 100644
index 000000000..710fff1fe
--- /dev/null
+++ b/deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml
@@ -0,0 +1,39 @@
+# Nemotron 3 Super LLM NIM overrides, applicable to all hardware.
+# Layer on the chart defaults: -f deploy/helm/nvidia-blueprint-rag/values.yaml -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml
+# On RTX 6000 Pro, additionally pass: -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
+# Details: docs/nemotron3-super-deployment.md.
+
+nimOperator:
+  nim-llm:
+    model:
+      tensorParallelism: "2"
+      precision: "fp8"
+      engine: vllm
+    resources:
+      requests:
+        nvidia.com/gpu: 2
+      limits:
+        nvidia.com/gpu: 2
+    image:
+      tag: "1.8.0"
+      pullPolicy: IfNotPresent
+      repository: nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b
+    env:
+      - name: NIM_HTTP_API_PORT
+        value: "8000"
+      - name: NIM_TRITON_LOG_VERBOSE
+        value: "1"
+      - name: NIM_SERVED_MODEL_NAME
+        value: "nvidia/nemotron-3-super-120b-a12b"
+      - name: NIM_MAX_MODEL_LEN
+        value: "131072"
+
+ingestor-server:
+  envVars:
+    SUMMARY_LLM: "nvidia/nemotron-3-super-120b-a12b"
+
+envVars:
+  REFLECTION_LLM: "nvidia/nemotron-3-super-120b-a12b"
+  APP_FILTEREXPRESSIONGENERATOR_MODELNAME: "nvidia/nemotron-3-super-120b-a12b"
+  APP_QUERYREWRITER_MODELNAME: "nvidia/nemotron-3-super-120b-a12b"
+  APP_LLM_MODELNAME: "nvidia/nemotron-3-super-120b-a12b"
diff --git a/deploy/helm/nvidia-blueprint-rag/templates/llm-nim.yaml b/deploy/helm/nvidia-blueprint-rag/templates/llm-nim.yaml
index 60f043973..f103a1a72 100644
--- a/deploy/helm/nvidia-blueprint-rag/templates/llm-nim.yaml
+++ b/deploy/helm/nvidia-blueprint-rag/templates/llm-nim.yaml
@@ -61,4 +61,8 @@ spec:
   {{- end }}
   expose:
 {{ toYaml $nimLlm.expose | nindent 4 }}
+  {{- with $nimLlm.startupProbe }}
+  startupProbe:
+{{ toYaml . | nindent 4 }}
+  {{- end }}
 {{- end }}
\ No newline at end of file
diff --git a/deploy/helm/nvidia-blueprint-rag/values.yaml b/deploy/helm/nvidia-blueprint-rag/values.yaml
index 47ef09b68..00e6914b0 100644
--- a/deploy/helm/nvidia-blueprint-rag/values.yaml
+++ b/deploy/helm/nvidia-blueprint-rag/values.yaml
@@ -56,8 +56,8 @@ apiKeysSecret:
 
 # -- RAG server container image
 image:
-  repository: nvcr.io/nvstaging/blueprint/rag-server
-  tag: "2.4.0"
+  repository: nvcr.io/nvidia/blueprint/rag-server
+  tag: "2.5.0"
   pullPolicy: Always
 
 # -- RAG server service configuration
@@ -160,15 +160,11 @@ envVars:
   # URL on which LLM model is hosted. If "", Nvidia hosted API is used
   APP_LLM_SERVERURL: "nim-llm:8000"
   # LLM model parameters
-  LLM_MAX_TOKENS: "32768"
+  # For Nemotron 3 Super on RTX 6000 Pro: set LLM_MAX_TOKENS below to "16256" (reasoning) or "1024" (non-reasoning)
+  LLM_MAX_TOKENS: "32768" # "16256"
   LLM_TEMPERATURE: "0"
   LLM_TOP_P: "1.0"
 
-  # Enable/disable thinking/reasoning for nemotron-3-nano models (30b variant)
-  # Set to "true" to enable reasoning mode with reasoning_budget
-  # Set to "false" to disable reasoning and get direct answers
-  ENABLE_NEMOTRON_3_NANO_THINKING: "true"
-
   ##===Query Rewriter Model specific configurations===
   APP_QUERYREWRITER_MODELNAME: "nvidia/llama-3.3-nemotron-super-49b-v1.5"
   # URL on which query rewriter model is hosted. If "", Nvidia hosted API is used
@@ -183,14 +179,14 @@ envVars:
 
   ##===Embedding Model specific configurations===
   # URL on which embedding model is hosted. If "", Nvidia hosted API is used
-  APP_EMBEDDINGS_SERVERURL: "nemoretriever-embedding-ms:8000/v1"
-  APP_EMBEDDINGS_MODELNAME: "nvidia/llama-3.2-nv-embedqa-1b-v2"
+  APP_EMBEDDINGS_SERVERURL: "nemotron-embedding-ms:8000/v1"
+  APP_EMBEDDINGS_MODELNAME: "nvidia/llama-nemotron-embed-1b-v2"
   APP_EMBEDDINGS_DIMENSIONS: "2048"
 
   ##===Reranking Model specific configurations===
   # URL on which ranking model is hosted. If "", Nvidia hosted API is used
-  APP_RANKING_SERVERURL: "nemoretriever-ranking-ms:8000"
-  APP_RANKING_MODELNAME: "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+  APP_RANKING_SERVERURL: "nemotron-ranking-ms:8000"
+  APP_RANKING_MODELNAME: "nvidia/llama-nemotron-rerank-1b-v2"
   ENABLE_RERANKER: "True"
   # Default score threshold for filtering documents by reranker relevance (0.0 to 1.0)
   RERANKER_SCORE_THRESHOLD: "0.0"
@@ -260,6 +256,11 @@ envVars:
   # Whether to filter content within <think></think> tags in model responses
   FILTER_THINK_TOKENS: "true"
 
+  # Reasoning configuration (supported by Nemotron 3 and other reasoning models)
+  LLM_ENABLE_THINKING: "false"
+  LLM_REASONING_BUDGET: "0"
+  LLM_LOW_EFFORT: "false"
+
   NEMO_GUARDRAILS_URL: "nemo-guardrails:7331"
 
   # enable iterative query decomposition
@@ -289,8 +290,8 @@ ingestor-server:
     password: ""
 
   image:
-    repository: nvcr.io/nvstaging/blueprint/ingestor-server
-    tag: "2.4.0"
+    repository: nvcr.io/nvidia/blueprint/ingestor-server
+    tag: "2.5.0"
     pullPolicy: Always
 
   # -- Service config for ingestor-server
@@ -349,8 +350,8 @@ ingestor-server:
     ## APP_EMBEDDINGS_APIKEY and SUMMARY_LLM_APIKEY are loaded from secrets automatically.
 
     # === Embeddings Configurations ===
-    APP_EMBEDDINGS_SERVERURL: "nemoretriever-embedding-ms:8000/v1"
-    APP_EMBEDDINGS_MODELNAME: "nvidia/llama-3.2-nv-embedqa-1b-v2"
+    APP_EMBEDDINGS_SERVERURL: "nemotron-embedding-ms:8000/v1"
+    APP_EMBEDDINGS_MODELNAME: "nvidia/llama-nemotron-embed-1b-v2"
     APP_EMBEDDINGS_DIMENSIONS: "2048"
 
     # === NV-Ingest Configurations ===
@@ -359,6 +360,7 @@ ingestor-server:
 
     # === NV-Ingest extraction configurations ===
     APP_NVINGEST_PDFEXTRACTMETHOD: "None"  # Method used for text extraction from "None", "pdfium", "nemotron_parse"
+    APP_NVINGEST_EXTRACTTABLESMETHOD: "yolox"  # Method for table extraction: "yolox", "nemotron_parse", or None
     APP_NVINGEST_EXTRACTTEXT: "True"  # Enable text extraction
     APP_NVINGEST_EXTRACTINFOGRAPHICS: "False"  # Enable infographic extraction
     APP_NVINGEST_EXTRACTTABLES: "True"  # Enable table extraction
@@ -452,9 +454,9 @@ frontend:
   replicaCount: 1
 
   image:
-    repository: nvcr.io/nvstaging/blueprint/rag-frontend
+    repository: nvcr.io/nvidia/blueprint/rag-frontend
     pullPolicy: IfNotPresent
-    tag: "2.4.0"
+    tag: "2.5.0"
 
   imagePullSecret:
     name: "ngc-secret"
@@ -657,11 +659,22 @@ nimOperator:
       repository: nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5
       pullPolicy: IfNotPresent
       tag: "1.14.0"
+# -- For Nemotron 3 Super: uncomment the block below and comment the image block above
+#    image:
+#      repository: nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b
+#      pullPolicy: IfNotPresent
+#      tag: "1.8.0"
     resources:
       limits:
         nvidia.com/gpu: 1
       requests:
         nvidia.com/gpu: 1
+# -- For Nemotron 3 Super (all hardware): uncomment the block below and comment the resources block above
+#    resources:
+#      limits:
+#        nvidia.com/gpu: 2
+#      requests:
+#        nvidia.com/gpu: 2
     nodeSelector: {}
     tolerations: []
     model:
@@ -672,6 +685,10 @@ nimOperator:
 #      tensorParallelism: "1"
 #      gpus:
 #        - product: "rtx6000_blackwell_sv"
+# -- For Nemotron 3 Super (all hardware): comment "engine: tensorrt_llm" above and uncomment the three lines below
+#      engine: vllm
+#      precision: "fp8"
+#      tensorParallelism: "2"
     storage:
       pvc:
         create: true
@@ -702,6 +719,15 @@ nimOperator:
         value: "1"
       - name: NIM_SERVED_MODEL_NAME
         value: "nvidia/llama-3.3-nemotron-super-49b-v1.5"
+      - name: NIM_MAX_MODEL_LEN
+        value: "131072"
+# -- For Nemotron 3 Super on RTX 6000 Pro: comment the NIM_MAX_MODEL_LEN entry above and uncomment the block below
+#      - name: NIM_MAX_MODEL_LEN
+#        value: "32768"
+#      - name: NCCL_P2P_DISABLE
+#        value: "1"
+#      - name: NIM_KVCACHE_PERCENT
+#        value: "0.9"
 #      - name: CUDA_VISIBLE_DEVICES
 #        value: "0"
     expose:
@@ -710,16 +736,27 @@ nimOperator:
         type: ClusterIP
         port: 8000
         grpcPort: 8001
+    startupProbe:
+      enabled: true
+      probe:
+        httpGet:
+          path: /v1/health/ready
+          port: 8000
+        initialDelaySeconds: 60
+        periodSeconds: 10
+        failureThreshold: 750
+        timeoutSeconds: 5
+
 # subsection: nvidia-nim-llama-32-nv-embedqa-1b-v2
 # NIM Text Embedding
   nvidia-nim-llama-32-nv-embedqa-1b-v2:
     enabled: true
     replicas: 1
     service:
-      name: "nemoretriever-embedding-ms"
+      name: "nemotron-embedding-ms"
     image:
-      repository: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2
-      tag: "1.10.1"
+      repository: nvcr.io/nim/nvidia/llama-nemotron-embed-1b-v2
+      tag: "1.13.0"
       pullPolicy: IfNotPresent
     resources:
       limits:
@@ -795,10 +832,10 @@ nimOperator:
     enabled: true
     replicas: 1
     service:
-      name: "nemoretriever-ranking-ms"
+      name: "nemotron-ranking-ms"
     image:
-      repository: nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2
-      tag: "1.8.0"
+      repository: nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2
+      tag: "1.10.0"
       pullPolicy: IfNotPresent
     resources:
       limits:
@@ -870,7 +907,7 @@ nv-ingest:
     create: false
   image:
     repository: "nvcr.io/nvidia/nemo-microservices/nv-ingest"
-    tag: "26.1.1"
+    tag: "26.1.2"
   resources:
     limits:
       nvidia.com/gpu: 0
@@ -896,8 +933,8 @@ nv-ingest:
     RAY_num_server_call_thread: "1"
     RAY_worker_num_grpc_internal_threads: "1"
     
-    EMBEDDING_NIM_ENDPOINT: "http://nemoretriever-embedding-ms:8000/v1"
-    EMBEDDING_NIM_MODEL_NAME: "nvidia/llama-3.2-nv-embedqa-1b-v2"
+    EMBEDDING_NIM_ENDPOINT: "http://nemotron-embedding-ms:8000/v1"
+    EMBEDDING_NIM_MODEL_NAME: "nvidia/llama-nemotron-embed-1b-v2"
     MESSAGE_CLIENT_HOST: "rag-redis-master"
     MESSAGE_CLIENT_PORT: 6379
     MESSAGE_CLIENT_TYPE: "redis"
@@ -1015,7 +1052,7 @@ nv-ingest:
       replicaCount: 1
       image:
         repository: nvcr.io/nim/nvidia/nemoretriever-ocr-v1
-        tag: "1.2.0"
+        tag: "1.2.1"
       imagePullSecrets:
         - name: ngc-secret
       env:
@@ -1049,8 +1086,8 @@ nv-ingest:
       tolerations: []
       replicaCount: 1
       image:
-        repository: nvcr.io/nim/nvidia/nemoretriever-graphic-elements-v1
-        tag: "1.6.0"
+        repository: nvcr.io/nim/nvidia/nemotron-graphic-elements-v1
+        tag: "1.8.0"
       env:
         - name: NIM_HTTP_API_PORT
           value: "8000"
@@ -1082,8 +1119,8 @@ nv-ingest:
       tolerations: []
       replicaCount: 1
       image:
-        repository: nvcr.io/nim/nvidia/nemoretriever-page-elements-v3
-        tag: "1.7.0"
+        repository: nvcr.io/nim/nvidia/nemotron-page-elements-v3
+        tag: "1.8.0"
       env:
         - name: NIM_HTTP_API_PORT
           value: "8000"
@@ -1133,8 +1170,8 @@ nv-ingest:
       tolerations: []
       replicaCount: 1
       image:
-        repository: nvcr.io/nim/nvidia/nemoretriever-table-structure-v1
-        tag: "1.6.0"
+        repository: nvcr.io/nim/nvidia/nemotron-table-structure-v1
+        tag: "1.8.0"
       env:
         - name: NIM_HTTP_API_PORT
           value: "8000"
diff --git a/deploy/workbench/README.md b/deploy/workbench/README.md
index 6d02a360e..179c32ec5 100644
--- a/deploy/workbench/README.md
+++ b/deploy/workbench/README.md
@@ -75,4 +75,4 @@ Use of the models in this blueprint is governed by the [NVIDIA AI Foundation Mod
 ## Terms of Use
 This blueprint is governed by the [NVIDIA Agreements | Enterprise Software | NVIDIA Software License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/) and the [NVIDIA Agreements | Enterprise Software | Product Specific Terms for AI Product](https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/). The models are governed by the [NVIDIA Agreements | Enterprise Software | NVIDIA Community Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/) and the [NVIDIA RAG dataset](https://github.com/NVIDIA-AI-Blueprints/rag/tree/v2.0.0/data/multimodal) which is governed by the [NVIDIA Asset License Agreement](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/data/LICENSE.DATA).
 
-The following models that are built with Llama are governed by the [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/): nvidia/llama-3.3-nemotron-super-49b-v1, nvidia/llama-3.2-nv-embedqa-1b-v2, and nvidia/llama-3.2-nv-rerankqa-1b-v2.
+The following models that are built with Llama are governed by the [Llama 3.2 Community License Agreement](https://www.llama.com/llama3_2/license/): nvidia/llama-3.3-nemotron-super-49b-v1, nvidia/llama-nemotron-embed-1b-v2, and nvidia/llama-nemotron-rerank-1b-v2.
diff --git a/deploy/workbench/compose.yaml b/deploy/workbench/compose.yaml
index 04cfdd2e2..91d4b3d28 100644
--- a/deploy/workbench/compose.yaml
+++ b/deploy/workbench/compose.yaml
@@ -28,9 +28,9 @@ services:
       retries: 100
     profiles: ["local"]
 
-  nemoretriever-embedding-ms:
-    container_name: nemoretriever-embedding-ms
-    image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.10.1
+  nemotron-embedding-ms:
+    container_name: nemotron-embedding-ms
+    image: nvcr.io/nim/nvidia/llama-nemotron-embed-1b-v2:1.13.0
     volumes:
     - ${MODEL_DIRECTORY:-/tmp}:/opt/nim/.cache
     ports:
@@ -58,9 +58,9 @@ services:
       start_period: 10m
     profiles: ["local"]
 
-  nemoretriever-ranking-ms:
-    container_name: nemoretriever-ranking-ms
-    image: nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:1.8.0
+  nemotron-ranking-ms:
+    container_name: nemotron-ranking-ms
+    image: nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2:1.10.0
     volumes:
     - ${MODEL_DIRECTORY:-/tmp}:/opt/nim/.cache
     ports:
@@ -86,7 +86,7 @@ services:
     profiles: ["local"]
 
   page-elements:
-    image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-page-elements-v3}:${YOLOX_TAG:-1.7.0}
+    image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemotron-page-elements-v3}:${YOLOX_TAG:-1.8.0}
     ports:
       - "8000:8000"
       - "8001:8001"
@@ -122,7 +122,7 @@ services:
     profiles: ["local"]
 
   graphic-elements:
-    image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.6.0}
+    image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemotron-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.8.0}
     ports:
       - "8003:8000"
       - "8004:8001"
@@ -147,7 +147,7 @@ services:
     profiles: ["local"]
 
   table-structure:
-    image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.6.0}
+    image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemotron-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.8.0}
     ports:
       - "8006:8000"
       - "8007:8001"
@@ -200,7 +200,7 @@ services:
   # Main ingestor server which is responsible for ingestion
   ingestor-server:
     container_name: ingestor-server
-    image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/ingestor-server:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../
@@ -256,8 +256,8 @@ services:
 
       ##===Embedding Model specific configurations===
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000/v1"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-embedding-ms:8000/v1"}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-1b-v2}
       APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048}
 
       ##===NV-Ingest Connection Configurations=======
@@ -333,7 +333,7 @@ services:
     profiles: ["ingest"]
 
   nv-ingest-ms-runtime:
-    image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.1
+    image: nvcr.io/nvidia/nemo-microservices/nv-ingest:26.1.2
     # cpuset: "0-15" # Uncomment to restrict this container to CPU cores 0–15
     shm_size: 40gb # Should be at minimum 30% of assigned memory per Ray documentation
     volumes:
@@ -399,20 +399,20 @@ services:
       - REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue
       # Self-hosted redis endpoints.
       # build.nvidia.com hosted yolox endpoints.
-      # - YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3
+      # - YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
       # - YOLOX_INFER_PROTOCOL=http
       - YOLOX_PAGE_IMAGE_FORMAT=JPEG
       - YOLOX_GRPC_ENDPOINT=${YOLOX_GRPC_ENDPOINT:-page-elements:8001}
       - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer}
       - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc}
       # build.nvidia.com hosted yolox-graphics-elements endpoints.
-      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
+      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
       #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
       - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001}
       - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer}
       - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc}
       # build.nvidia.com hosted  yolox-table-elements endpoints.
-      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
+      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
       #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
       - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001}
       - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer}
@@ -432,7 +432,7 @@ services:
   # Main orchestrator server which stiches together all calls to different services to fulfill the user request
   rag-server:
     container_name: rag-server
-    image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/rag-server:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../
@@ -495,13 +495,13 @@ services:
 
       ##===Embedding Model specific configurations===
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000/v1"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemotron-embedding-ms:8000/v1"}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-nemotron-embed-1b-v2}
 
       ##===Reranking Model specific configurations===
       # url on which ranking model is hosted. If "", Nvidia hosted API is used
-      APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"}
-      APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"}
+      APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemotron-ranking-ms:8000"}
+      APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-nemotron-rerank-1b-v2"}
       ENABLE_RERANKER: ${ENABLE_RERANKER:-True}
 
       ##===VLM Model specific configurations===
@@ -569,7 +569,7 @@ services:
   # Sample UI container which interacts with APIs exposed by rag-server container
   rag-frontend:
     container_name: rag-frontend
-    image: nvcr.io/nvstaging/blueprint/rag-frontend:${TAG:-2.4.0}
+    image: nvcr.io/nvidia/blueprint/rag-frontend:${TAG:-2.5.0}
     build:
       # Set context to repo's root directory
       context: ../../frontend
diff --git a/deploy/workbench/quickstart.ipynb b/deploy/workbench/quickstart.ipynb
index d9a15a71a..00c524aba 100644
--- a/deploy/workbench/quickstart.ipynb
+++ b/deploy/workbench/quickstart.ipynb
@@ -966,10 +966,10 @@
     "    \"enable_citations\": True,\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
     "    \"llm_endpoint\": \"nim-llm:8000\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
-    "    \"embedding_endpoint\": \"nemoretriever-embedding-ms:8000/v1\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"reranker_endpoint\": \"nemoretriever-ranking-ms:8000\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
+    "    \"embedding_endpoint\": \"nemotron-embedding-ms:8000/v1\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"reranker_endpoint\": \"nemotron-ranking-ms:8000\",\n",
     "    \"stop\": [],\n",
     "}\n",
     "\n",
@@ -1030,10 +1030,10 @@
     "    \"enable_citations\": True,\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
     "    \"llm_endpoint\": \"nim-llm:8000\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
-    "    \"embedding_endpoint\": \"nemoretriever-embedding-ms:8000/v1\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"reranker_endpoint\": \"nemoretriever-ranking-ms:8000\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
+    "    \"embedding_endpoint\": \"nemotron-embedding-ms:8000/v1\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"reranker_endpoint\": \"nemotron-ranking-ms:8000\",\n",
     "    \"stop\": [],\n",
     "}\n",
     "\n",
@@ -1175,10 +1175,10 @@
     "    ],\n",
     "    \"enable_query_rewriting\": False,\n",
     "    \"enable_reranker\": False,\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
-    "    \"embedding_endpoint\": \"nemoretriever-embedding-ms:8000/v1\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"reranker_endpoint\": \"nemoretriever-ranking-ms:8000\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
+    "    \"embedding_endpoint\": \"nemotron-embedding-ms:8000/v1\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"reranker_endpoint\": \"nemotron-ranking-ms:8000\",\n",
     "}\n",
     "\n",
     "\n",
@@ -1233,10 +1233,10 @@
     "    ],\n",
     "    \"enable_query_rewriting\": False,\n",
     "    \"enable_reranker\": True,\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
-    "    \"embedding_endpoint\": \"nemoretriever-embedding-ms:8000/v1\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"reranker_endpoint\": \"nemoretriever-ranking-ms:8000\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
+    "    \"embedding_endpoint\": \"nemotron-embedding-ms:8000/v1\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"reranker_endpoint\": \"nemotron-ranking-ms:8000\",\n",
     "}\n",
     "\n",
     "\n",
diff --git a/docs/accuracy-benchmarks.md b/docs/accuracy-benchmarks.md
new file mode 100644
index 000000000..bffa6cc33
--- /dev/null
+++ b/docs/accuracy-benchmarks.md
@@ -0,0 +1,123 @@
+<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0 -->
+# Benchmarking RAG Accuracy: Evaluating LLM Reasoning and VLM Integration
+
+In the fast-moving world of Retrieval-Augmented Generation (RAG), the gap between a “good” system and one that’s truly production-ready often depends on how effectively the pipeline manages complex reasoning and multimodal data. To measure these advancements, our team conducted extensive benchmarks across multiple configurations, examining the influence of LLM reasoning (“Think” mode) and Vision-Language Models (VLM).
+
+## Benchmarked Datasets
+
+Our analysis centered on seven major public datasets encompassing a broad range of challenges, from financial reasoning to intricate structural document parsing.
+
+| Dataset | Domain | Corpus Language | Main Modalities | # Pages | # Queries |
+|---|---|---|---|---|---|
+| [RAGBattlePacket](https://www.eyelevel.ai/post/most-accurate-rag) | Finance, Tax & Consulting | English | Text, Tables, Charts, Infographics | 1,141 | 92 |
+| [KG-RAG](https://github.com/docugami/KG-RAG-datasets/tree/main/sec-10-q/data/v1) | Finance (SEC 10-Q) | English | Text, Tables | 1,037 | 195 |
+| [FinanceBench](https://github.com/patronus-ai/financebench) | Finance (Public Equity) | English | Text, Tables | 54,057 | 150 |
+| [DC767](https://digitalcorpora.org/) | General (Gov, NGO, Health) | English | Text, Tables | 54,730 | 488 |
+| [HotPotQA](https://huggingface.co/datasets/hotpotqa/hotpot_qa) | Wikipedia-based question-answer pairs | English | Text | 2,673 (txt files) | 979 |
+| [Google Frames](https://huggingface.co/datasets/google/frames-benchmark) | History, Sports, Science, Animals, Health | English | Text | 31,708 | 824 |
+
+### [Vidore Dataset](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3#public-datasets)
+
+| Dataset | Domain | Corpus Language | Main Modalities | # Pages | # Queries (with translations) |
+|---|---|---|---|---|---|
+| French Public Company Annual Reports | Finance-FR | French | Text, Table, Charts | 2,384 | 1,920 |
+| U.S. Public Company Annual Reports | Finance-EN | English | Text, Table | 2,942 | 1,854 |
+| Computer Science Textbooks | Computer Science | English | Text, Infographic, Tables | 1,360 | 1,290 |
+| HR Reports from EU | HR | English | Text, Table, Charts | 1,110 | 1,908 |
+| French Governmental Energy Reports | Energy | French | Text, Charts | 2,229 | 1,848 |
+| USAF Technical Orders | Industrial | English | Text, Tables, Infographics, Images | 5,244 | 1,698 |
+| FDA Reports | Pharmaceuticals | English | Text, Charts, Images, Infographic, Tables | 2,313 | 2,184 |
+| French Physics Lectures | Physics | French | Text, Images, Infographics | 1,674 | 1,812 |
+
+
+## Evaluation Methodology
+
+Our primary evaluation metric is end-to-end RAG answer accuracy, measured using the [NVIDIA Answer Accuracy metric from RAGAS](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/nvidia_metrics/). Each response is rated on a 0–4 scale by an LLM judge, with scores normalized to a 0–1 range for reporting. We chose [mistralai/Mixtral-8x22B-Instruct-v0.1](https://build.nvidia.com/mistralai/mixtral-8x22b-instruct) as the LLM judge, guided by performance on the [Judge’s Verdict](https://huggingface.co/spaces/nvidia/judges-verdict) benchmark.
+
+> Full evaluation pipeline: [evaluation_01_ragas.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/evaluation_01_ragas.ipynb)
+
+- Metric: Accuracy, defined as the degree to which generated responses align with the ground truth answers.
+- Pipeline configuration: All experiments were run using the default configuration.
+- Generation models:
+  - LLM: nvidia/llama-3.3-nemotron-super-49b-v1.5
+  - VLM: nvidia/nemotron-nano-vl-12b-v2
+- Judge model: mistralai/Mixtral-8x22B-Instruct-v0.1
+
+## Configuration and Accuracy Results
+
+We tested four main configurations to evaluate how ["Reasoning" (Think On)](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/enable-nemotron-thinking.md) and ["Vision Language Model" (VLM)](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/vlm.md) features influence accuracy. In the VLM-based generation pipeline, image captioning was enabled during data ingestion. For text-only datasets, we excluded the VLM-based generation setup from evaluation.
+
+| Dataset | LLM (Reasoning Off) | LLM (Reasoning On) | VLM (Reasoning Off) | VLM (Reasoning On) |
+|---|---|---|---|---|
+| FinanceBench | 0.612 | 0.668 | 0.622 | 0.697 |
+| KG-RAG | 0.569 | 0.593 | 0.596 | 0.643 |
+| RAGBattlePacket | 0.812 | 0.818 | 0.867 | 0.842 |
+| DC767 | 0.906 | 0.899 | 0.907 | 0.897 |
+| HotPotQA | 0.672 | 0.676 | n/a | n/a |
+| Google Frames | 0.486 | 0.597 | n/a | n/a |
+
+The preceding table summarizes the accuracy scores for each dataset across our experimental configurations.
+
+### Vidore-V3 Results
+
+For the Vidore-v3 evaluation, we combined all domains into a single collection and then performed domain-specific evaluations.
+
+| Dataset subsets | LLM (Reasoning Off) | LLM (Reasoning On) | VLM (Reasoning Off) | VLM (Reasoning On) |
+|---|---|---|---|---|
+| Computer Science | 0.894 | 0.882 | 0.927 | 0.931 |
+| Energy | 0.751 | 0.765 | 0.802 | 0.824 |
+| Finance EN | 0.699 | 0.718 | 0.758 | 0.766 |
+| Pharmaceuticals | 0.759 | 0.775 | 0.849 | 0.858 |
+| HR | 0.726 | 0.735 | 0.767 | 0.804 |
+| Industrial | 0.677 | 0.674 | 0.733 | 0.758 |
+| Physics | 0.840 | 0.806 | 0.903 | 0.910 |
+| Finance FR | 0.639 | 0.647 | 0.683 | 0.687 |
+
+
+## Key Results
+
+The following sections describe the key results from our analysis.
+
+### The "Reasoning Dividend" in FinanceBench and KG-RAG
+
+For FinanceBench and KG-RAG datasets we have observed improved accuracy with reasoning on.
+
+Why it makes sense
+
+- FinanceBench is heavily table-centric—about 75% of queries involve tables—and many of these require mathematical operations or extracting data across multiple line items. Simple retrieval is not sufficient; the model must perform an explicit reasoning step to carry out the necessary arithmetic and cross-referencing to match the human-annotated ground truth.
+
+- KG-RAG requires temporal reasoning (for example, comparing Q3 2022 with Q1 2023). Without reasoning enabled, the model may retrieve the correct company but the wrong fiscal quarter. Turning Reasoning On lets the LLM check dates and periods before finalizing its answer.
+
+### The Multimodal Unlock: Decoding Visual Complexity in ViDoRe and RAGBattlePacket
+
+Across both the ViDoRe benchmark and RAGBattlePacket, we saw the best results when moving from a text-only LLM to a VLM. RAGBattlePacket reached its highest baseline accuracy (0.867) simply by enabling the VLM, and ViDoRe showed broad gains across nearly all of its diverse sub-domains.
+
+Why it makes sense
+
+- Preserving Spatial Layouts (ViDoRe): Sub-domains like Finance and Pharmaceuticals depend on rigid tables and charts that text-only pipelines often fail to capture. A VLM can directly “see” and preserve these structures, leading to higher accuracy on this benchmark.
+- Targeting Visual Queries (RAGBattlePacket): About 10% of RAGBattlePacket queries focus on charts, bar graphs, and customer journey diagrams, which standard pipelines often hallucinate on or ignore. A VLM can directly interpret these visuals, returning precise percentages and preserving the underlying structure.
+
+### Semantic Robustness in DC767
+
+This dataset showed the highest overall stability, maintaining roughly 0.90 or higher accuracy across almost all configurations.
+
+Why it makes sense
+
+Because the dataset is about 70% text-based prose, it relies heavily on high-quality embeddings and semantic search. Our core retriever is clearly optimized for dense text retrieval, as adding Vision or Reasoning produced only a marginal gain (about a 1.1% change). This suggests that our base RAG engine is already very strong for standard retriever-focused tasks.
+
+### Reasoning as the Catalyst in Google Frames
+
+This dataset demonstrated the true impact of active reasoning on complex, multi-hop queries. By turning reasoning on, the model achieved a massive leap in overall performance. This gain represents our most significant improvement driven purely by logical processing.
+
+Why it makes sense
+
+Google Frames targets complex queries that require synthesizing facts across multiple documents while tracking overlapping constraints. A standard LLM often struggles to keep all these parameters in mind in a single pass. Turning on reasoning enables the model to systematically decompose multi-step logic and verify dependencies, which is essential for accurate factual extraction.
+
+## Related Topics
+
+- [Evaluate Your NVIDIA RAG Blueprint System](evaluate.md)
+- [Enable Reasoning in Nemotron LLM Models](enable-nemotron-thinking.md)
+- [VLM-Based Inferencing in RAG](vlm.md)
+- [Image Captioning Support](image_captioning.md)
+- [Best Practices for Common Settings](accuracy_perf.md)
diff --git a/docs/accuracy_perf.md b/docs/accuracy_perf.md
index 8ee491fa6..c24f0c9a5 100644
--- a/docs/accuracy_perf.md
+++ b/docs/accuracy_perf.md
@@ -14,7 +14,7 @@ Change the setting if you want different behavior.
 | Name                 | Default    | Description         | Advantages           | Disadvantages            |
 |----------------------|------------|---------------------|----------------------|--------------------------|
 | `APP_NVINGEST_CHUNKOVERLAP` | `150` | Increase overlap to ensure smooth transitions between chunks. | - Larger overlap provides smoother transitions between chunks. <br/>  | - Might increase processing overhead. <br/> |
-| `APP_NVINGEST_CHUNKSIZE` | `512` | Increase chunk size for more context. | - Larger chunks retain more context, improving coherence. <br/> | - Larger chunks increase embedding size, slowing retrieval. <br/> - Longer chunks might increase latency due to larger prompt size. <br/> |
+| `APP_NVINGEST_CHUNKSIZE` | `512` | Increase chunk size for more context. | - Larger chunks retain more context, improving coherence. <br/> | - Larger chunks increase compute time for embedding creation. <br/> - Larger chunks can lead to longer retrieved context, increasing generation latency. <br/> - Very large chunks may dilute semantic focus, reducing embedding precision. <br/> |
 | `APP_NVINGEST_ENABLEPDFSPLITTER` | `true` | Set to `true` to perform chunk-based splitting of pdfs after the default page-level extraction occurs. Recommended for PDFs that are mostly text content. | - Provides more granular content segmentation. <br/> | - Can increase the number of chunks and slow down the ingestion process. <br/> |
 | `APP_NVINGEST_EXTRACTCHARTS` | `true` | Set to `true` to extract charts. | - Improves accuracy for documents that contain charts. <br/> | - Increases ingestion time. <br/> |
 | `APP_NVINGEST_EXTRACTIMAGES` | `false` | Set to `true` to enable image captioning during ingestion. For details, refer to [Image Captioning Support](image_captioning.md). | - Enhances multimodal retrieval accuracy for documents having images. <br/> | - Increased processing time during ingestion. <br/> - Requires additional GPU resources for VLM model deployment. <br/> |
@@ -30,14 +30,14 @@ Change the setting if you want different behavior.
 
 | Name                 | Default    | Description         | Advantages           | Disadvantages            |
 |----------------------|------------|---------------------|----------------------|--------------------------|
-| - `APP_LLM_MODELNAME` <br/> - `APP_EMBEDDINGS_MODELNAME` <br/> - `APP_RANKING_MODELNAME` <br/> | See description | The default models are the following: <br/>- `nvidia/llama-3.3-nemotron-super-49b-v1.5` <br/> - `nvidia/llama-3.2-nv-embedqa-1b-v2` <br/> - `nvidia/llama-3.2-nv-rerankqa-1b-v2` <br/><br/>You can use larger models.  For details, refer to [Change the Inference or Embedding Model](change-model.md). | - Higher accuracy with better reasoning and a larger context length. <br/> | - Slower response time. <br/> - Higher inference cost. <br/> - Higher GPU requirement. <br/>  |
+| - `APP_LLM_MODELNAME` <br/> - `APP_EMBEDDINGS_MODELNAME` <br/> - `APP_RANKING_MODELNAME` <br/> | See description | The default models are the following: <br/>- `nvidia/llama-3.3-nemotron-super-49b-v1.5` <br/> - `nvidia/llama-nemotron-embed-1b-v2` <br/> - `nvidia/llama-nemotron-rerank-1b-v2` <br/><br/>You can use larger models.  For details, refer to [Change the Inference or Embedding Model](change-model.md). | - Higher accuracy with better reasoning and a larger context length. <br/> | - Slower response time. <br/> - Higher inference cost. <br/> - Higher GPU requirement. <br/>  |
 | `APP_VECTORSTORE_SEARCHTYPE` | `dense` | Set to `hybrid` to enable hybrid search. For details, refer to [Hybrid Search Support](hybrid_search.md). | - Can provide better retrieval accuracy for domain-specific content. <br/> | - Can induce higher latency for large number of documents. <br/> |
 | `ENABLE_GUARDRAILS` | `false` | Set to `true` to enable NeMo Guardrails. For details, refer to [Nemo Guardrails Support](nemo-guardrails.md). | - Applies input/output constraints for better safety and consistency. <br/> | - Significant increased processing overhead for additional LLM calls. <br/> - Needs additional GPUs to deploy guardrails-specific models locally. <br/> |
 | `ENABLE_QUERYREWRITER` | `false` | Set to `true` to enable query rewriting.  For details, refer to [Multi-Turn Conversation Support](multiturn.md). | - Enhances retrieval accuracy for multi-turn scenarios by rephrasing the query. <br/> | - Adds an extra LLM call, increasing latency. <br/> |
 | `ENABLE_REFLECTION` | `false` | Set to `true` to enable self-reflection. For details, refer to [Self-Reflection Support](self-reflection.md). | - Can improve the response quality by refining intermediate retrieval and final LLM output. <br/> | - Significantly higher latency due to multiple iterations of LLM model call. <br/> - You might need to deploy a separate judge LLM model, increasing GPU requirement. <br/> |
 | `ENABLE_RERANKER`    | `true` | Set to `true` to use the reranking model.    | - Improves accuracy by selecting better documents for response generation. <br/> | - Increases latency due to additional processing. <br/> - Additional hardware requirements for self-hosted on premises deployment. <br/>   |
 | `ENABLE_VLM_INFERENCE` | `false`    | Set to `true` to use the Vision-Language Model (VLM) for response generation. For details, refer to [VLM for Generation](vlm.md).  | - Enables analysis of retrieved images alongside text for richer, multimodal responses. <br/> - Can process up to 4 images per citation. <br/> - Useful for document Q&A, visual search, and multimodal chatbots. <br/> | - Requires additional GPU resources for VLM model deployment. <br/> - Increases latency due to image processing. <br/> |
-| Reasoning in `llama-3.3-nemotron-super-49b-v1.5` | `/no_think` | Use `/think` to enable reasoning. For details, refer to [Enable Reasoning](enable-nemotron-thinking.md). | - Improves response quality through enhanced reasoning capabilities. <br/> - Yields more precise responses. The default model is verbose and works best with reasoning enabled. <br/> | - Can increase response latency due to additional thinking process. <br/> - Can increase token usage and computational overhead. <br/> |
+| `LLM_ENABLE_THINKING` | `false` | Set to `true` to enable reasoning for Nemotron 3 models. Use `LLM_REASONING_BUDGET` and `LLM_LOW_EFFORT` for fine-grained control. For Nemotron 1.5 models, use the `/think` system prompt instead. For details, refer to [Enable Reasoning](enable-nemotron-thinking.md). | - Improves response quality through enhanced reasoning capabilities. <br/> - Yields more precise responses. <br/> | - Can increase response latency due to additional thinking process. <br/> - Can increase token usage and computational overhead. <br/> |
 | `RERANKER_SCORE_THRESHOLD` | `0.0` | Filters out retrieved chunks if reranker relevance is lower than this threshold. We recommend that you set this value between `0.3` and `0.5` to balance quality and coverage. For details, refer to [Use the Python Package](python-client.md). | - Faster retrieval by processing fewer documents. <br/> - Can improve accuracy by excluding low-relevance documents. <br/> | - Requires `ENABLE_RERANKER` set to `true` for effective filtering. <br/> - Might filter out too many chunks if the threshold is set high, causing no response from the RAG server. <br/> |
 | `RERANKER TOP K` | 10 | Increase `reranker TOP K` to increase the probability of relevant context being part of the top-k contexts. | Increasing the value can improve accuracy. | Increasing the value can increase latency. |
 | `VDB TOP K` | 100 | Increase `VDB TOP K` to provide a larger candidate pool for reranking. | Increasing the value can improve accuracy. | Increasing the value can increase latency. |
diff --git a/docs/api-ingestor.md b/docs/api-ingestor.md
index 443a521a6..adeeb4cf0 100644
--- a/docs/api-ingestor.md
+++ b/docs/api-ingestor.md
@@ -8,7 +8,7 @@
 This documentation contains the OpenAPI reference for the ingestor server.
 
 :::{tip}
-To view this documentation on docs.nvidia.com, go to https://docs.nvidia.com/rag/latest/api-ingestor.html.
+To view this documentation on docs.nvidia.com, browse to [https://docs.nvidia.com/rag/latest/api-ingestor](https://docs.nvidia.com/rag/latest/api-ingestor.html).
 :::
 
 
@@ -41,7 +41,7 @@ The status response includes progress metrics updated after each batch completes
 For more granular progress updates during batch processing, use the `nv_ingest_status` object described below, which tracks individual document extraction progress and updates more frequently than the batch-level metrics.
 :::
 
-### NV-Ingest Extraction Status
+### Extraction status
 
 The `/status` endpoint response includes an `nv_ingest_status` object that provides real-time document extraction progress, updating more frequently than batch-level metrics. This is useful for monitoring individual document processing when polling the status endpoint:
 
@@ -53,7 +53,7 @@ The `/status` endpoint response includes an `nv_ingest_status` object that provi
 | Status | Description |
 |--------|-------------|
 | `not_started` | Document queued, extraction not yet initiated |
-| `submitted` | Document submitted to NV-Ingest for processing |
+| `submitted` | Document submitted to NeMo Retriever Library for processing |
 | `processing` | Document extraction is in progress |
 | `completed` | Document extraction completed successfully |
 | `failed` | Document extraction failed |
diff --git a/docs/api-rag.md b/docs/api-rag.md
index 366d44b0f..7a15d8890 100644
--- a/docs/api-rag.md
+++ b/docs/api-rag.md
@@ -8,8 +8,8 @@
 This documentation contains the OpenAPI reference for the RAG server.
 
 :::{tip}
-To view this documentation on docs.nvidia.com, go to https://docs.nvidia.com/rag/latest/api-rag.html.
+To view this documentation on docs.nvidia.com, browse to [https://docs.nvidia.com/rag/latest/api-rag](https://docs.nvidia.com/rag/latest/api-rag.html).
 :::
 
 
 :::{swagger-plugin} ../docs/api_reference/openapi_schema_rag_server.json
diff --git a/docs/api_reference/openapi_schema_rag_server.json b/docs/api_reference/openapi_schema_rag_server.json
index 63bbb4e33..5bcf2ec7d 100644
--- a/docs/api_reference/openapi_schema_rag_server.json
+++ b/docs/api_reference/openapi_schema_rag_server.json
@@ -707,7 +707,7 @@
             "maxLength": 256,
             "title": "Embedding Model",
             "description": "Name of the embedding model used for vectorization.",
-            "default": "nvdev/nvidia/llama-3.2-nv-embedqa-1b-v2"
+            "default": "nvdev/nvidia/llama-nemotron-embed-1b-v2"
           },
           "embedding_endpoint": {
             "type": "string",
@@ -721,7 +721,7 @@
             "maxLength": 256,
             "title": "Reranker Model",
             "description": "Name of the reranker model used for ranking results.",
-            "default": "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+            "default": "nvidia/llama-nemotron-rerank-1b-v2"
           },
           "reranker_endpoint": {
             "anyOf": [
@@ -1342,7 +1342,7 @@
             "maxLength": 256,
             "title": "Embedding Model",
             "description": "Name of the embedding model used for vectorization.",
-            "default": "nvdev/nvidia/llama-3.2-nv-embedqa-1b-v2"
+            "default": "nvdev/nvidia/llama-nemotron-embed-1b-v2"
           },
           "embedding_endpoint": {
             "anyOf": [
@@ -1363,7 +1363,7 @@
             "maxLength": 256,
             "title": "Reranker Model",
             "description": "Name of the reranker model used for ranking results.",
-            "default": "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+            "default": "nvidia/llama-nemotron-rerank-1b-v2"
           },
           "reranker_endpoint": {
             "anyOf": [
diff --git a/docs/audio_ingestion.md b/docs/audio_ingestion.md
index 55fcbc132..399ea7a64 100644
--- a/docs/audio_ingestion.md
+++ b/docs/audio_ingestion.md
@@ -132,7 +132,7 @@ When using Helm deployment, the Audio NIM service requires an additional GPU.
 
 The `APP_NVINGEST_SEGMENTAUDIO` environment variable controls whether audio segmentation is enabled during the ingestion process.
 
-When set to `True`, NV-Ingest will segment audio files based on commas and other punctuation marks, resulting in more granular audio chunks. This can improve downstream processing and retrieval accuracy for audio content. Note that splitting on captions will occur regardless of this setting; enabling `APP_NVINGEST_SEGMENTAUDIO` simply adds additional segmentation based on punctuation.
+When set to `True`, NeMo Retriever Library will segment audio files based on commas and other punctuation marks, resulting in more granular audio chunks. This can improve downstream processing and retrieval accuracy for audio content. Note that splitting on captions will occur regardless of this setting; enabling `APP_NVINGEST_SEGMENTAUDIO` simply adds additional segmentation based on punctuation.
 
 To enable audio segmentation, add the following export command to your environment configuration:
 
diff --git a/docs/change-model.md b/docs/change-model.md
index d0173462a..871d8f5f3 100644
--- a/docs/change-model.md
+++ b/docs/change-model.md
@@ -46,6 +46,10 @@ The `nemotron-3-nano-30b` model has different naming conventions depending on th
 
 Both names refer to the same underlying model. Use the appropriate name based on your deployment type.
 
+##### Nemotron 3 Super
+
+Nemotron 3 Super is a larger model with different GPU and environment requirements: local NIM deployment requires at least 2 GPUs (FP8 TP2), and you may need a dedicated prompt config and reasoning settings. For full deployment steps (Docker and Helm), see the [Nemotron 3 Super deployment guide](nemotron3-super-deployment.md).
+
 
 ### Change the Embedding Model
 
@@ -77,7 +81,7 @@ Always use same embedding model or model having same tokinizers for both ingesti
 
 ### Configure Embedding Dimensions
 
-The default embedding model (`nvidia/llama-3.2-nv-embedqa-1b-v2`) uses **2048 dimensions** by default. When changing to a different embedding model, you may need to update the dimensions to match the model's output.
+The default embedding model (`nvidia/llama-nemotron-embed-1b-v2`) uses **2048 dimensions** by default. When changing to a different embedding model, you may need to update the dimensions to match the model's output.
 
 **Important:** Some embedding models have **fixed output dimensions** and do not accept a `dimensions` parameter. For example, `nvidia/nv-embedqa-e5-v5` always outputs 1024-dimensional embeddings. If you use such a model without configuring the dimensions, you may encounter an error like:
 
@@ -124,13 +128,13 @@ You can specify the model for NVIDIA NIM containers to use in the [nims.yaml](..
        image: nvcr.io/nim/<image>:<tag>
        ...
 
-     nemoretriever-embedding-ms:
-       container_name: nemoretriever-embedding-ms
+     nemotron-embedding-ms:
+       container_name: nemotron-embedding-ms
        image: nvcr.io/nim/<image>:<tag>
 
 
-     nemoretriever-ranking-ms:
-       container_name: nemoretriever-ranking-ms
+     nemotron-ranking-ms:
+       container_name: nemotron-ranking-ms
        image: nvcr.io/nim/<image>:<tag>
    ```
 
@@ -173,11 +177,11 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
 
       # === Embeddings ===
       APP_EMBEDDINGS_MODELNAME: "<embedding-model-name>"
-      APP_EMBEDDINGS_SERVERURL: "nemoretriever-embedding-ms:8000/v1"
+      APP_EMBEDDINGS_SERVERURL: "nemotron-embedding-ms:8000/v1"
 
       # === Reranker ===
       APP_RANKING_MODELNAME: "<reranker-model-name>"
-      APP_RANKING_SERVERURL: "nemoretriever-ranking-ms:8000"
+      APP_RANKING_SERVERURL: "nemotron-ranking-ms:8000"
     ```
 
 3. Configure the NIM microservices that host those models. Replace `<image>:<tag>` with the image you selected (format `nvcr.io/nim/<image>:<tag>`) in [values.yaml](../deploy/helm/nvidia-blueprint-rag/values.yaml).
@@ -215,7 +219,7 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
       enabled: true
       replicas: 1
       service:
-        name: "nemoretriever-embedding-ms"
+        name: "nemotron-embedding-ms"
       image:
         # nvcr.io/nim/<image>:<tag>
         repository: nvcr.io/nim/<image>
@@ -237,7 +241,7 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
       enabled: true
       replicas: 1
       service:
-        name: "nemoretriever-ranking-ms"
+        name: "nemotron-ranking-ms"
       image:
         # nvcr.io/nim/<image>:<tag>
         repository: nvcr.io/nim/<image>
@@ -264,7 +268,19 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
     **If only the vLLM profile is available**
 
    When only a vLLM profile is available for a model, such as on H100 and RTX GPUs, you must use the vLLM engine. First [run the list-model-profiles command](model-profiles.md#list-available-profiles) to confirm which profiles are available and then apply the following configurations.
-
+    **For Nemotron Nano models (vLLM profile)**
+    
+    When deploying `nvidia/nvidia-nemotron-nano-9b-v2` or `nvidia/nemotron-3-nano`, check whether a `tensorrt_llm` profile is available for your model by running the following command.
+    
+    ```bash
+    # Change model name as needed
+    USERID=$(id -u) docker run --rm --gpus all \
+      nvcr.io/nim/nvidia/nvidia-nemotron-nano-9b-v2:latest \
+      list-model-profiles
+    ```
+    
+    If only the `vllm` profile is available, you must use the **vLLM engine** and add these specific configurations:
+    
     ```yaml
     nimOperator:
       nim-llm:
@@ -292,4 +308,5 @@ Use this procedure to change models when you are running self-hosted NVIDIA NIM
 - [Deploy with Docker (Self-Hosted Models)](deploy-docker-self-hosted.md)
 - [Deploy with Docker (NVIDIA-Hosted Models)](deploy-docker-nvidia-hosted.md)
 - [Deploy with Helm](deploy-helm.md)
+- [Nemotron 3 Super deployment (Docker and Helm)](nemotron3-super-deployment.md)
 - [Service-Specific API Keys](api-key.md#service-specific-api-keys)
diff --git a/docs/change-vectordb.md b/docs/change-vectordb.md
index a4dc993b8..36f4d4f9f 100644
--- a/docs/change-vectordb.md
+++ b/docs/change-vectordb.md
@@ -1001,7 +1001,7 @@ Update your [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) fil
 
 ### Disable Default Vector Database and Add Custom Helm Chart
 
-1. **Disable Milvus in the NV-Ingest configuration:**
+1. **Disable Milvus in the NeMo Retriever Library configuration:**
    ```yaml
    nv-ingest:
      enabled: true
diff --git a/docs/conf.py b/docs/conf.py
index f0ffa9e07..27aeca848 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025-2026, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,10 +23,10 @@
 import os
 import sys
 
-project = " NVIDIA-RAG-blueprint"
-copyright = "2025, NVIDIA Corporation"
+project = " NVIDIA RAG blueprint"
+copyright = "2025-%Y, NVIDIA Corporation"
 author = "NVIDIA Corporation"
-release = "2.4.0"
+release = "2.5.0"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -74,8 +74,7 @@
             "icon": "fa-brands fa-github",
         }
     ],
-    # Version switcher disabled: set "switcher": {"json_url": "...", "version_match": release}
-    # and ensure versions1.json is at the json_url path when using versioned doc deployments.
+    "switcher": {"json_url": "../versions1.json", "version_match": release},
     "extra_head": {
         """
     <script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" ></script>
@@ -88,6 +87,7 @@
     },
 }
 
+
 # Add any paths that contain custom static files (such as style sheets) here,
 html_css_files = ["swagger-nvidia.css"]
 
diff --git a/docs/continuous-ingestion-object-storage.md b/docs/continuous-ingestion-object-storage.md
new file mode 100644
index 000000000..4baa9daf7
--- /dev/null
+++ b/docs/continuous-ingestion-object-storage.md
@@ -0,0 +1,127 @@
+<!--
+  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+  SPDX-License-Identifier: Apache-2.0
+-->
+# Continuous Ingestion from Object Storage RAG Blueprint
+
+Continuous ingestion from object storage connects the [RAG blueprint](readme.md) to an object storage bucket. This enables an event-driven pipeline that automatically indexes documents. Continuous ingestion means that when you add documents to a storage bucket, the system detects new uploads, routes them for processing, and indexes their content—making all data immediately searchable and available for analysis through the [RAG Frontend](user-interface.md).
+
+## Hardware Requirements
+
+| Requirement | Details |
+|-------------|---------|
+| **GPU** | 2x RTX PRO 6000 Blackwell or 2x H100 |
+| **OS** | Ubuntu 22.04 or later |
+| **Docker** | Docker 24.0+ with Docker Compose v2 |
+| **NVIDIA Driver** | 570+ |
+| **NVIDIA Container Toolkit** | Required |
+
+
+## Overview
+
+You can create an event-driven continuous ingestion pipeline that works as follows:
+
+1. Upload documents to object storage.
+
+2. The system detects new uploads via storage events and routes them for processing.
+
+3. Content is automatically indexed into the RAG vector store.
+
+4. You can then query the ingested content through the RAG UI or API.
+
+Continuous ingestion supports documents such as PDF, DOCX, and other formats supported by the [ingestor](api-ingestor.md).
+
+## Architecture
+
+The continuous ingestion architecture features the following high-level flow:
+
+1. Object storage: Files are written to storage using a protocol that emits events (for example, MinIO configured with Kafka notifications).
+
+2. Event trigger: Upload events are published to a Kafka topic.
+
+3. Consumer: A Kafka consumer subscribes to the topic, retrieves the events, downloads the corresponding files from object storage, and routes them for processing.
+
+4. Document path: Files are passed to a file-based processing pipeline (such as the NeMo Retriever Library or ingestor-server) and then indexed in the vector database.
+
+The continuous ingestion architecture follows the end-to-end sequence described above and can be summarized as:
+
+- Document ingestion flow: (1) → (2) → (3) → file-based processing → VectorDB → RAG Agent.
+
+## Implementation Components
+
+The reference implementation includes the following components:
+
+- Object storage (MinIO): A bucket configured with Kafka notifications on put (and optionally delete) events.
+
+- Kafka: A broker and topic (for example, aidp-topic) used to publish storage event notifications.
+
+- Kafka consumer: A service that:
+
+  - Subscribes to the Kafka topic and consumes storage events.
+
+  - Downloads new objects from MinIO.
+
+  - Sends files to the RAG ingestor for indexing.
+
+The deployment is defined in `examples/rag_event_ingest/deploy/docker-compose.yaml`, which runs MinIO, Kafka, and the Kafka consumer on the same Docker network as the RAG stack (`nvidia-rag`).
+
+### Prerequisites
+
+- [Deploy the NVIDIA RAG Blueprint](deploy-docker-self-hosted.md) (NIMs, Milvus, ingestor-server, RAG server) so the consumer can reach the ingestor and the rest of the stack.
+- Ensure the `nvidia-rag` Docker network exists (created by the RAG deployment).
+- For the notebook, clone the repo, set `NGC_API_KEY`, and have the required hardware (see notebook for GPU and software requirements).
+
+### Option 1: Use the Notebook
+
+The notebook provides a guided walkthrough of the following steps:
+
+- Environment setup
+- NVIDIA RAG deployment
+- Continuous ingestion pipeline deployment (Kafka, MinIO, and consumer)
+- Testing document uploads with RAG queries
+- Cleanup
+
+To follow along, open and run: [rag_event_ingest.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_event_ingest.ipynb).
+
+### Option 2: Deploy the Example with Docker Compose
+
+From the repository root, after the RAG stack is up:
+
+```bash
+docker compose -f examples/rag_event_ingest/deploy/docker-compose.yaml up -d
+```
+
+This command launches the following components:
+
+- Kafka (with an optional Kafka UI available on port 8080)
+- MinIO (object storage and console using ports 9201 and 9211 in the example)
+- Kafka consumer — connects to the ingestor at `INGESTOR_SERVER_URL` (default: `http://ingestor-server:8082`) and uses `COLLECTION_NAME` (default: `aidp_bucket`)
+
+After deployment, upload documents and query ingested content as follows:
+
+1. Open the MinIO Console UI at `http://<host-ip>:9211/login`.
+2. Log in with the default credentials (`minioadmin` / `minioadmin`).
+3. Navigate to the `aidp-bucket` bucket and upload your documents (PDF, DOCX, etc.).
+4. The system automatically publishes upload events to Kafka, the consumer retrieves the files, and documents are sent to the ingestor for indexing into the `aidp_bucket` collection.
+5. Query the ingested content through the RAG Frontend UI at `http://<host-ip>:8090` (select the `aidp_bucket` collection) or via the RAG API at `http://<host-ip>:8081/generate`.
+
+### Key Environment Variables
+
+The following environment variables configure the Kafka consumer. For details, refer to `examples/rag_event_ingest/deploy/docker-compose.yaml`.
+
+Consumer environment variables
+
+| Variable | Description | Default Value |
+|----------|-------------|---------------|
+| `KAFKA_BOOTSTRAP_SERVERS` | Address of the Kafka broker(s). | `kafka:9092` |
+| `KAFKA_TOPIC` | Kafka topic used for object storage events. | `aidp-topic` |
+| `MINIO_ENDPOINT` | MinIO endpoint in `<host>:<port>` format. | `minio-source-1:9000` |
+| `INGESTOR_SERVER_URL` | Base URL for the RAG ingestor service. | `http://ingestor-server:8082` |
+| `COLLECTION_NAME` | Target RAG collection for content indexing. | `aidp_bucket` |
+
+## Reference
+
+- [RAG Blueprint deployment (Docker self-hosted)](deploy-docker-self-hosted.md)
+- [Ingestor API](api-ingestor.md)
+- [Notebook: Document continuous ingestion from object storage](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_event_ingest.ipynb)
+- [Example: `examples/rag_event_ingest/`](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/examples/rag_event_ingest/) — Kafka consumer and `deploy/docker-compose.yaml`
diff --git a/docs/custom-metadata.md b/docs/custom-metadata.md
index dba96e461..ce96b63ba 100644
--- a/docs/custom-metadata.md
+++ b/docs/custom-metadata.md
@@ -233,12 +233,12 @@ The system automatically manages certain metadata fields that are added to all c
 | Field Name | Type | Description | Auto-Populated | User Override |
 |------------|------|-------------|----------------|---------------|
 | **`filename`** | `string` | Name of the uploaded file | ✅ RAG system | ✅ Yes - define in schema |
-| **`page_number`** | `integer` | Page number where content appears (1-indexed) | ✅ nv-ingest | ✅ Yes - define in schema |
-| **`start_time`** | `integer` | Start timestamp in milliseconds for audio/video segments | ✅ nv-ingest | ✅ Yes - define in schema |
-| **`end_time`** | `integer` | End timestamp in milliseconds for audio/video segments | ✅ nv-ingest | ✅ Yes - define in schema |
+| **`page_number`** | `integer` | Page number where content appears (1-indexed) | ✅ NeMo Retriever Library | ✅ Yes - define in schema |
+| **`start_time`** | `integer` | Start timestamp in milliseconds for audio/video segments | ✅ NeMo Retriever Library | ✅ Yes - define in schema |
+| **`end_time`** | `integer` | End timestamp in milliseconds for audio/video segments | ✅ NeMo Retriever Library | ✅ Yes - define in schema |
 
 :::{note}
-The following field names are **reserved** by NV-Ingest and cannot be used in custom metadata schemas: `type`, `subtype`, and `location`. These fields are exclusively managed by NV-Ingest during document processing and attempting to use them will result in a validation error.
+The following field names are **reserved** by NeMo Retriever Library and cannot be used in custom metadata schemas: `type`, `subtype`, and `location`. These fields are exclusively managed by NeMo Retriever Library during document processing and attempting to use them will result in a validation error.
 :::
 
 #### System-Managed Field Behavior
@@ -246,7 +246,7 @@ The following field names are **reserved** by NV-Ingest and cannot be used in cu
 - **Auto-Addition**: These fields are automatically added to your collection schema if you don't define them
 - **Auto-Population**: 
   - `filename` is populated by the RAG system during ingestion
-  - `page_number`, `start_time`, `end_time` are extracted and populated by nv-ingest during document processing
+  - `page_number`, `start_time`, `end_time` are extracted and populated by NeMo Retriever Library during document processing
 - **User Override**: You can define any of these fields in your schema with custom properties (e.g., different description, constraints)
   - If you provide a definition, your definition takes priority
   - If you don't provide a definition, the system auto-adds them with default settings
@@ -258,7 +258,7 @@ The following field names are **reserved** by NV-Ingest and cannot be used in cu
 :::{note}
 **Example**: If you upload a multi-page PDF without defining `page_number` in your schema, the system will:
 1. Automatically add the `page_number` field to your collection schema
-2. nv-ingest will extract the page number from each chunk during processing
+2. NeMo Retriever Library extracts the page number from each chunk during processing
 3. The page number will be available for filtering (e.g., `content_metadata["page_number"] == 5`)
 4. The page number will appear in citations when generating responses
 :::
diff --git a/docs/debugging.md b/docs/debugging.md
index 66a580419..1fb5aea0d 100644
--- a/docs/debugging.md
+++ b/docs/debugging.md
@@ -33,7 +33,7 @@ docker logs -f nim-llm-ms
 watch -n 10 'du -sh ~/.cache/model-cache/'
 
 # Check specific container resource usage
-docker stats nim-llm-ms nemoretriever-embedding-ms nemoretriever-ranking-ms
+docker stats nim-llm-ms nemotron-embedding-ms nemotron-ranking-ms
 ```
 
 The expected timeline for Docker (Self-Hosted) deployment is the following:
@@ -124,12 +124,12 @@ docker ps | grep -E "(ingestor-server|nv-ingest|nemoretriever-embedding|milvus|r
    milvus-standalone                       Up 36 minutes (healthy)
    milvus-minio                            Up 35 minutes (healthy)
    milvus-etcd                             Up 35 minutes (healthy)
-   nemoretriever-ranking-ms                Up 38 minutes (healthy)
+   nemotron-ranking-ms                     Up 38 minutes (healthy)
    compose-page-elements-1                 Up 38 minutes
    compose-nemoretriever-ocr-1             Up 38 minutes
    compose-graphic-elements-1              Up 38 minutes
    compose-table-structure-1               Up 38 minutes
-   nemoretriever-embedding-ms              Up 38 minutes (healthy)
+   nemotron-embedding-ms                   Up 38 minutes (healthy)
    nim-llm-ms                              Up 38 minutes (healthy)
    ```
 
@@ -141,7 +141,7 @@ docker ps | grep -E "(ingestor-server|nv-ingest|nemoretriever-embedding|milvus|r
 # Check ingestor server health with all dependencies
 curl -X GET "http://localhost:8082/v1/health?check_dependencies=true" | jq
 
-# Verify NV-Ingest runtime is ready for processing
+# Verify NeMo Retriever Library runtime is ready for processing
 curl -X GET "http://localhost:7670/v1/health/ready"
 
 # Check embedding service is responding
@@ -219,11 +219,11 @@ Start by examining the logs of key ingestion services to identify the specific e
 # Check ingestor server logs for API errors
 docker logs ingestor-server --tail 100
 
-# Check NV-Ingest runtime logs for processing errors
+# Check NeMo Retriever Library runtime logs for processing errors
 docker logs nv-ingest-ms-runtime --tail 100
 
 # Check embedding service logs for model issues
-docker logs nemoretriever-embedding-ms --tail 100
+docker logs nemotron-embedding-ms --tail 100
 ```
 
 ### 2. Common Ingestion Problems and Solutions
@@ -245,15 +245,15 @@ docker logs milvus-standalone --tail 50
 **Embedding Service Issues:**
 ```bash
 # Check embedding service logs
-docker logs nemoretriever-embedding-ms --tail 100
+docker logs nemotron-embedding-ms --tail 100
 
 # Verify GPU availability and memory
 nvidia-smi
 ```
 
-**NV-Ingest Processing Errors:**
+**NeMo Retriever Library Processing Errors:**
 ```bash
-# Check NV-Ingest logs for processing errors
+# Check NeMo Retriever Library logs for processing errors
 docker logs nv-ingest-ms-runtime --tail 200 | grep -i error
 
 # Check Redis connectivity for task queue
@@ -288,7 +288,7 @@ docker logs rag-server --tail 100
 docker logs nim-llm-ms --tail 100
 
 # Check ranking service logs for reranking errors
-docker logs nemoretriever-ranking-ms --tail 100
+docker logs nemotron-ranking-ms --tail 100
 ```
 
 ### 2. Common Retrieval Problems and Solutions
diff --git a/docs/deploy-docker-nvidia-hosted.md b/docs/deploy-docker-nvidia-hosted.md
index 4487edff3..2aabd06ce 100644
--- a/docs/deploy-docker-nvidia-hosted.md
+++ b/docs/deploy-docker-nvidia-hosted.md
@@ -111,7 +111,7 @@ Use the following procedure to start all containers needed for this blueprint.
         ],
         "processing": [
             {
-                "service": "NV-Ingest",
+                "service": "NeMo Retriever Library",
                 "status": "healthy",
                 ...
             }
@@ -238,7 +238,7 @@ After the first time you deploy the RAG Blueprint successfully, you can consider
 
 - If you don't have a GPU available, you can switch to CPU-only Milvus by following the instructions in [milvus-configuration.md](./milvus-configuration.md).
 
-- If you have a requirement to build the NVIDIA Ingest runtime container from source, you can do it by following instructions [here](https://github.com/NVIDIA/nv-ingest).
+- If you have a requirement to build the NeMo Retriever Library runtime container from source, you can do it by following instructions [here](https://github.com/NVIDIA/NeMo-Retriever).
 
 
 
diff --git a/docs/deploy-docker-self-hosted.md b/docs/deploy-docker-self-hosted.md
index 0efe64ea6..4913be36a 100644
--- a/docs/deploy-docker-self-hosted.md
+++ b/docs/deploy-docker-self-hosted.md
@@ -110,7 +110,7 @@ Use the following procedure to start all containers needed for this blueprint.
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml up -d
    ```
 
-5. Check the status of the deployment by running the following code. Wait until all services are up and the `nemoretriever-ranking-ms`, `nemoretriever-embedding-ms` and `nim-llm-ms`  NIMs are in healthy state before proceeding further.
+5. Check the status of the deployment by running the following code. Wait until all services are up and the `nemotron-ranking-ms`, `nemotron-embedding-ms`, and `nim-llm-ms` NIMs are in a healthy state before proceeding further.
 
      ```bash
      watch -n 2 'docker ps --format "table {{.Names}}\t{{.Status}}"'
@@ -121,10 +121,10 @@ Use the following procedure to start all containers needed for this blueprint.
         NAMES                                   STATUS
 
         nim-llm-ms                    Up 4 minutes (healthy)
-        nemoretriever-ranking-ms      Up 4 minutes (healthy)
+        nemotron-ranking-ms           Up 4 minutes (healthy)
         compose-graphic-elements-1    Up 4 minutes
         compose-page-elements-1       Up 4 minutes
-        nemoretriever-embedding-ms    Up 4 minutes (healthy)
+        nemotron-embedding-ms         Up 4 minutes (healthy)
         compose-nemoretriever-ocr-1   Up 4 minutes
         compose-table-structure-1     Up 4 minutes
      ```
@@ -174,7 +174,7 @@ Use the following procedure to start all containers needed for this blueprint.
         ],
         "processing": [
             {
-                "service": "NV-Ingest",
+                "service": "NeMo Retriever Library",
                 "status": "healthy",
                 ...
             }
@@ -253,10 +253,10 @@ Use the following procedure to start all containers needed for this blueprint.
     340bc8210a0d   milvus-minio                     Up 3 minutes (healthy)
     0be702b87ad6   milvus-etcd                      Up 3 minutes (healthy)
     62eabf1d9f65   nim-llm-ms                       Up 10 minutes (healthy)
-    fe2751bfa734   nemoretriever-ranking-ms         Up 10 minutes (healthy)
+    fe2751bfa734   nemotron-ranking-ms              Up 10 minutes (healthy)
     7b5ddabf8be7   compose-graphic-elements-1       Up 10 minutes
     ecfaa5190302   compose-page-elements-1          Up 10 minutes
-    ea8c7fdf20d1   nemoretriever-embedding-ms       Up 10 minutes (healthy)
+    ea8c7fdf20d1   nemotron-embedding-ms            Up 10 minutes (healthy)
     6d62008a9b42   compose-nemoretriever-ocr-1      Up 10 minutes
     969b9f5c987c   compose-table-structure-1        Up 10 minutes
     ```
@@ -333,11 +333,11 @@ After the first time you deploy the RAG Blueprint successfully, you can consider
 - For improved accuracy, consider enabling reasoning mode. For details, refer to [Enable thinking](./enable-nemotron-thinking.md).
 
 
-- NeMo Retriever OCR is now the default OCR service. To use legacy Paddle OCR instead, refer to [OCR Configuration Guide](nemoretriever-ocr.md).
+- NeMo Retriever Library OCR is now the default OCR service. To use legacy Paddle OCR instead, refer to [OCR Configuration Guide](nemoretriever-ocr.md).
 
 - For advanced users who need direct filesystem access to extraction results, refer to [Ingestor Server Volume Mounting](mount-ingestor-volume.md).
 
-- A single NVIDIA A100-80GB or H100-80GB, B200 GPU can be used to start non-LLM NIMs (nemoretriever-embedding-ms, nemoretriever-ranking-ms, and ingestion services like page-elements, ocr, graphic-elements, and table-structure) for ingestion and RAG workflows. You can control which GPU is used for each service by setting these environment variables in `deploy/compose/.env` file before launching. For a complete list of all services and their default GPU assignments, see [Service Port and GPU Reference](service-port-gpu-reference.md).
+- A single NVIDIA A100-80GB, H100-80GB, or B200 GPU can be used to start non-LLM NIMs (nemotron-embedding-ms, nemotron-ranking-ms, and ingestion services like page-elements, ocr, graphic-elements, and table-structure) for ingestion and RAG workflows. You can control which GPU is used for each service by setting these environment variables in `deploy/compose/.env` file before launching. For a complete list of all services and their default GPU assignments, see [Service Port and GPU Reference](service-port-gpu-reference.md).
 
    ```bash
    EMBEDDING_MS_GPU_ID=0
diff --git a/docs/deploy-helm-from-repo.md b/docs/deploy-helm-from-repo.md
index 04c39829a..e57c9ea26 100644
--- a/docs/deploy-helm-from-repo.md
+++ b/docs/deploy-helm-from-repo.md
@@ -14,7 +14,7 @@ The following are the core services that you install:
 
 - RAG server
 - Ingestor server
-- NV-Ingest
+- NeMo Retriever Library
 
 
 ## Prerequisites
diff --git a/docs/deploy-helm.md b/docs/deploy-helm.md
index bf8e792c5..18940aaf6 100644
--- a/docs/deploy-helm.md
+++ b/docs/deploy-helm.md
@@ -14,7 +14,7 @@ The following are the core services that you install:
 
 - RAG server
 - Ingestor server
-- NV-Ingest
+- NeMo Retriever Library
 
 
 ## Prerequisites
@@ -37,7 +37,7 @@ Plan for additional space if you are enabling persistence for multiple services.
 
 4. Verify that you have Kubernetes v1.34.2 installed and running on Ubuntu 22.04/24.04. For more information, see [Kubernetes documentation](https://kubernetes.io/docs/setup/) and [NVIDIA Cloud Native Stack](https://github.com/NVIDIA/cloud-native-stack).
 
-5. Verify that you have installed Helm 3.  To install Helm 3 (and avoid Helm 4), follow the official Helm v3 installation instructions for your platform, for example by using the `get-helm-3` script described in the [Helm documentation](https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3).
+5. Verify that you have installed Helm 3.  To install Helm 3 (and avoid Helm 4), follow the official Helm v3 installation instructions for your platform, for example by using the `get-helm-3` script described in the [Helm documentation](https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3).
 
 6. Verify that you have a default storage class available in the cluster for PVC provisioning. One option is the local path provisioner by Rancher.   Refer to the [installation](https://github.com/rancher/local-path-provisioner?tab=readme-ov-file#installation) section of the README in the GitHub repository.
 
@@ -87,7 +87,7 @@ To deploy End-to-End RAG Server and Ingestor Server, use the following procedure
 2. Install the Helm chart by running the following command.
 
     ```sh
-    helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+    helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
     --username '$oauthtoken' \
     --password "${NGC_API_KEY}" \
     --set imagePullSecret.password=$NGC_API_KEY \
@@ -112,7 +112,7 @@ To deploy End-to-End RAG Server and Ingestor Server, use the following procedure
    
    Then install using the modified values.yaml:
    ```sh
-   helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+   helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
      --username '$oauthtoken' \
      --password "${NGC_API_KEY}" \
      --set imagePullSecret.password=$NGC_API_KEY \
@@ -125,6 +125,8 @@ To deploy End-to-End RAG Server and Ingestor Server, use the following procedure
    Refer to [NIM Model Profile Configuration](model-profiles.md) for using non-default NIM LLM profile.
    :::
 
+   For **Nemotron 3 Super** on Helm, see the [Nemotron 3 Super deployment guide](nemotron3-super-deployment.md#helm-deployment-nemotron-3-super).
+
 
 ## Verify a Deployment
 
@@ -146,11 +148,11 @@ To verify a deployment, use the following procedure.
     NAME                                                 READY   STATUS      RESTARTS   AGE
     ingestor-server-6cc886bcdf-6rfwm                     1/1     Running     0          54m
     milvus-standalone-7dd5db4755-ctqzg                   1/1     Running     0          54m
-    nemoretriever-embedding-ms-86f75c8f65-dfhd2          1/1     Running     0          39m
+    nemotron-embedding-ms-86f75c8f65-dfhd2               1/1     Running     0          39m
     nemoretriever-graphic-elements-v1-67d9d65bdc-ftbkw   1/1     Running     0          33m
     nemoretriever-ocr-v1-78f56cddb9-f4852                1/1     Running     0          40m
     nemoretriever-page-elements-v3-56ddcf9b4b-qsg82      1/1     Running     0          49m
-    nemoretriever-ranking-ms-5ff774889f-fwrlm            1/1     Running     0          40m
+    nemotron-ranking-ms-5ff774889f-fwrlm                 1/1     Running     0          40m
     nemoretriever-table-structure-v1-696c9f5665-l9sxn    1/1     Running     0          37m
     nim-llm-7cb9bdcc89-hwpkq                             1/1     Running     0          11m
     nim-llm-cache-job-77hpc                              0/1     Completed   0          94s
@@ -209,11 +211,11 @@ To verify a deployment, use the following procedure.
     NAME                                TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)              AGE
     ingestor-server                     ClusterIP   10.107.12.217    <none>        8082/TCP             54m
     milvus                              ClusterIP   10.99.110.203    <none>        19530/TCP,9091/TCP   54m
-    nemoretriever-embedding-ms          ClusterIP   10.104.99.15     <none>        8000/TCP,8001/TCP    54m
+    nemotron-embedding-ms               ClusterIP   10.104.99.15     <none>        8000/TCP,8001/TCP    54m
     nemoretriever-graphic-elements-v1   ClusterIP   10.96.115.45     <none>        8000/TCP,8001/TCP    54m
     nemoretriever-ocr-v1                ClusterIP   10.100.107.215   <none>        8000/TCP,8001/TCP    54m
     nemoretriever-page-elements-v3      ClusterIP   10.102.237.196   <none>        8000/TCP,8001/TCP    54m
-    nemoretriever-ranking-ms            ClusterIP   10.96.114.244    <none>        8000/TCP,8001/TCP    54m
+    nemotron-ranking-ms                 ClusterIP   10.96.114.244    <none>        8000/TCP,8001/TCP    54m
     nemoretriever-table-structure-v1    ClusterIP   10.107.227.139   <none>        8000/TCP,8001/TCP    54m
     nim-llm                             ClusterIP   10.104.60.155    <none>        8000/TCP,8001/TCP    54m
     rag-etcd                            ClusterIP   10.104.74.116    <none>        2379/TCP,2380/TCP    54m
@@ -250,7 +252,7 @@ Port-forwarding is provided as a quick method to try out the UI. However, large
 To change an existing deployment, after you modify the [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) file, run the following code.
 
 ```sh
-helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
 --username '$oauthtoken' \
 --password "${NGC_API_KEY}" \
 --set imagePullSecret.password=$NGC_API_KEY \
diff --git a/docs/enable-nemotron-thinking.md b/docs/enable-nemotron-thinking.md
index 182a09ea1..dc95b4285 100644
--- a/docs/enable-nemotron-thinking.md
+++ b/docs/enable-nemotron-thinking.md
@@ -19,9 +19,106 @@ This guide explains how to enable reasoning for different Nemotron models, each
 
 | Model | Control Method | Thinking Budget Parameters |
 |-------|----------------|----------------------------|
+| Nemotron 3 (Nano 30B, and others) | Environment variables | `LLM_ENABLE_THINKING`, `LLM_REASONING_BUDGET`, `LLM_LOW_EFFORT` |
 | Nemotron 1.5 | System prompts | None |
 | Nemotron-3-Nano 9B | System prompts | min/max thinking tokens |
-| Nemotron-3-Nano 30B | Environment variable | max thinking tokens only |
+
+## Enable Reasoning for Nemotron 3 Models
+
+Nemotron 3 models (such as `nvidia/nemotron-3-nano-30b-a3b`) use environment variables to control reasoning.
+
+Set the following environment variables on the RAG server container (via Docker Compose, Helm values, or shell export):
+
+**`LLM_ENABLE_THINKING`**
+: Enable or disable the reasoning phase. When `true`, the model emits reasoning tokens before the final answer. Default: `false`.
+
+**`LLM_REASONING_BUDGET`**
+: Maximum number of tokens allocated for reasoning. Only used when `LLM_ENABLE_THINKING` is `true`. Default: `0`.
+
+**`LLM_LOW_EFFORT`**
+: Low-effort reasoning mode for faster, cheaper responses with shorter reasoning. Only used when `LLM_ENABLE_THINKING` is `true`. Default: `false`.
+
+**`FILTER_THINK_TOKENS`**
+: Filter content between `<think>` and `</think>` tags in model responses. Keep `true` for production to return only the final answer. Set `false` to see the full reasoning process. Default: `true`.
+
+:::{important}
+**Disabling reasoning:** To disable reasoning, set **`LLM_ENABLE_THINKING=false`**. Setting `LLM_REASONING_BUDGET=0` alone does not disable reasoning: when the budget is `0`, the RAG pipeline does not pass it to the LLM, and the model uses its default reasoning behavior. Always set `LLM_ENABLE_THINKING=false` to turn reasoning off.
+:::
+
+## Configure Reasoning for Nemotron 3 Models
+
+Nemotron 3 models (such as `nvidia/nemotron-3-super-120b-a12b` and `nvidia/nemotron-3-nano-30b-a3b`) use environment variables to control reasoning.
+
+### Basic Configuration
+
+```bash
+export LLM_ENABLE_THINKING=true
+```
+
+### Configure Reasoning Budget (Optional)
+
+Limit the number of reasoning tokens to control latency and cost:
+
+```bash
+export LLM_ENABLE_THINKING=true
+export LLM_REASONING_BUDGET=8192
+```
+
+### Low-Effort Mode (Optional)
+
+For faster responses where deep reasoning is unnecessary:
+
+```bash
+export LLM_ENABLE_THINKING=true
+export LLM_LOW_EFFORT=true
+```
+
+### Configure Model Parameters
+
+After you enable reasoning, configure the model parameters for optimal reasoning performance:
+
+```bash
+export LLM_TEMPERATURE=0.6
+export LLM_TOP_P=0.95
+```
+
+### Nemotron-3-Nano 30B
+
+For `nvidia/nemotron-3-nano-30b-a3b`, reasoning is controlled with the same `LLM_ENABLE_THINKING` variable. The reasoning budget can be set with either `LLM_REASONING_BUDGET` or `LLM_MAX_THINKING_TOKENS`:
+
+```bash
+export LLM_ENABLE_THINKING=true
+export LLM_REASONING_BUDGET=8192
+```
+
+The 30B model also supports a maximum thinking token limit directly in API requests:
+
+```json
+{
+  "model": "nvidia/nemotron-3-nano-30b-a3b",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is the capital of France?"
+    }
+  ],
+  "max_thinking_tokens": 8192
+}
+```
+
+**Thinking budget parameters:**
+
+**`max_thinking_tokens`**
+: Maximum number of reasoning tokens allowed before generating the final answer.
+
+:::{important}
+The key differences for the 30B model are the following:
+
+- Uses only `max_thinking_tokens` (not `min_thinking_tokens`)
+- Reasoning is available in the model output's `reasoning_content` field (not wrapped in `<think>` tags)
+- The `reasoning_content` field is present in the model output but isn't exposed in the generate API response
+- No filtering is needed because reasoning is already separated from the final answer
+:::
 
 ## Enable Reasoning for Nemotron 1.5
 
@@ -81,7 +178,7 @@ export FILTER_THINK_TOKENS=false
 For most production use cases, keep `FILTER_THINK_TOKENS=true` (default) to provide cleaner responses to end users.
 :::
 
-## Enable Reasoning for Nemotron-3-Nano 9B
+## Enable Reasoning for Nemotron Nano 9B
 
 The `nvidia/nvidia-nemotron-nano-9b-v2` model uses system prompts to control reasoning similar to Nemotron 1.5. It also adds support for thinking budget parameters to control the extent of reasoning.
 
@@ -132,63 +229,6 @@ The key differences for the 9B model are the following:
 - No filtering is needed because reasoning is already separated from the final answer
 :::
 
-## Enable Reasoning for Nemotron-3-Nano 30B
-
-The `nvidia/nemotron-3-nano-30b-a3b` model uses a different approach for reasoning control. Instead of system prompts, you control reasoning through an environment variable.
-
-### Enable Reasoning Through an Environment Variable
-
-Set the environment variable to enable or disable reasoning:
-
-```bash
-# Enable reasoning (default)
-export ENABLE_NEMOTRON_3_NANO_THINKING=true
-
-# Disable reasoning
-export ENABLE_NEMOTRON_3_NANO_THINKING=false
-```
-
-### Configure Thinking Budget (Optional)
-
-The 30B model supports a maximum thinking token limit to control the reasoning phase:
-
-```json
-{
-  "model": "nvidia/nemotron-3-nano-30b-a3b",
-  "messages": [
-    {
-      "role": "user",
-      "content": "What is the capital of France?"
-    }
-  ],
-  "max_thinking_tokens": 8192
-}
-```
-
-**Thinking budget parameters:**
-
-**`max_thinking_tokens`**
-: Maximum number of reasoning tokens allowed before generating the final answer.
-
-:::{important}
-The key differences for the 30B model are the following:
-
-- Uses only `max_thinking_tokens` (not `min_thinking_tokens`)
-- Reasoning is available in the model output's `reasoning_content` field (not wrapped in `<think>` tags)
-- The `reasoning_content` field is present in the model output but isn't exposed in the generate API response
-- No filtering is needed because reasoning is already separated from the final answer
-:::
-
-### Model Naming
-
-Use the correct model name based on your deployment:
-
-**Locally deployed NIMs**
-: `nvidia/nemotron-3-nano`
-
-**NVIDIA-hosted models**
-: `nvidia/nemotron-3-nano-30b-a3b`
-
 ## Deploy with Reasoning Enabled
 
 After you configure reasoning settings in `prompt.yaml` or environment variables, redeploy your services:
@@ -220,6 +260,7 @@ Adjust the thinking budget based on your use case:
 
 - **Lower values (1024-4096)**: Faster responses for simpler questions
 - **Higher values (8192-16384)**: More thorough reasoning for complex queries
+- **Low-effort mode**: Use `LLM_LOW_EFFORT=true` for fast, low-cost reasoning when deep thought is not required
 :::
 
 ## Related Topics
diff --git a/docs/evaluate.md b/docs/evaluate.md
index 0d83e83b0..33d9ce758 100644
--- a/docs/evaluate.md
+++ b/docs/evaluate.md
@@ -36,3 +36,4 @@ For more information, refer to the notebook [Evaluate Your RAG Pipeline with Rag
 - [NVIDIA RAG Blueprint Documentation](readme.md)
 - [Get Started](deploy-docker-self-hosted.md)
 - [Notebooks](notebooks.md)
+- [RAG Accuracy Benchmarks](accuracy-benchmarks.md)
diff --git a/docs/index.md b/docs/index.md
index 941b2f9d8..956e9f809 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -81,6 +81,7 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 - Data Ingestion & Processing
 
     - [Audio Ingestion Support](audio_ingestion.md)
+    - [Continuous Ingestion from Object Storage](continuous-ingestion-object-storage.md)
     - [Custom Metadata Support](custom-metadata.md)
     - [File System Access to Extraction Results](mount-ingestor-volume.md)
     - [Multimodal Embedding Support (Early Access)](vlm-embed.md)
@@ -110,6 +111,7 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 - Evaluation
 
     - [Evaluate Your NVIDIA RAG Blueprint System](evaluate.md)
+    - [RAG Accuracy Benchmarks](accuracy-benchmarks.md)
 
 - Governance
 
@@ -141,7 +143,7 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 
 ## Blog Posts
 
-- [NVIDIA NeMo Retriever Delivers Accurate Multimodal PDF Data Extraction 15x Faster](https://developer.nvidia.com/blog/nvidia-nemo-retriever-delivers-accurate-multimodal-pdf-data-extraction-15x-faster/)
+- [NVIDIA NeMo Retriever Library Delivers Accurate Multimodal PDF Data Extraction 15x Faster](https://developer.nvidia.com/blog/nvidia-nemo-retriever-delivers-accurate-multimodal-pdf-data-extraction-15x-faster/)
 - [Finding the Best Chunking Strategy for Accurate AI Responses](https://developer.nvidia.com/blog/finding-the-best-chunking-strategy-for-accurate-ai-responses/)
 
 
@@ -211,13 +213,14 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
    :hidden:
 
    Audio Ingestion Support <audio_ingestion.md>
+   Continuous Ingestion from Object Storage <continuous-ingestion-object-storage.md>
    Custom metadata Support <custom-metadata.md>
    Data Catalog for Collections and Documents <data-catalog.md>
    File System Access to Results <mount-ingestor-volume.md>
    Multimodal Embedding Support (Early Access) <vlm-embed.md>
    OCR Configuration Guide <nemoretriever-ocr.md>
    Enhanced PDF Extraction <nemotron-parse-extraction.md>
-   Standalone NV-Ingest <nv-ingest-standalone.md>
+   Standalone NeMo Retriever Library <nv-ingest-standalone.md>
    Text-Only Ingestion <text_only_ingest.md>
    MCP Server Usage <mcp.md>
 ```
@@ -255,6 +258,7 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
    :hidden:
 
    Evaluate Your RAG System <evaluate.md>
+   RAG Accuracy Benchmarks <accuracy-benchmarks.md>
 ```
 
 
diff --git a/docs/mig-deployment.md b/docs/mig-deployment.md
index bc4793ab1..d2ee3cc5e 100644
--- a/docs/mig-deployment.md
+++ b/docs/mig-deployment.md
@@ -15,10 +15,10 @@ refer to the [MIG Supported Hardware List](https://docs.nvidia.com/datacenter/te
 
 Before you deploy, verify that you have the following:
 
-* A Kubernetes cluster with NVIDIA H100 GPUs
+* A Kubernetes cluster with NVIDIA H100 or RTX PRO 6000 GPUs
 
    :::{note}
-   This section showcases MIG support for `NVIDIA H100 80GB HBM3` GPU. The MIG profiles used in the `mig-config.yaml` are specific to this GPU.
+   This section showcases MIG support for the `NVIDIA H100 80GB HBM3` GPU. The MIG profiles used in `mig-config-h100.yaml` are specific to this GPU.
    Refer to the [MIG User Guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/) for MIG profiles of other GPU types.
    :::
 
@@ -35,9 +35,9 @@ For monitoring deployment progress, refer to [Deploy on Kubernetes with Helm](./
 
 3. Verify that you have the NGC CLI available on your client computer. You can download the CLI from <https://ngc.nvidia.com/setup/installers/cli>.
 
-4. Verify that you have Kubernetes v1.34.2 installed and running on Ubuntu 22.04/24.04. For more information, see [Kubernetes documentation](https://kubernetes.io/docs/setup/) and [NVIDIA Cloud Native Stack 17.0](https://github.com/NVIDIA/cloud-native-stack/tree/17.0).
+4. Verify that you have Kubernetes v1.34.2 installed and running on Ubuntu 22.04/24.04. For more information, see [Kubernetes documentation](https://kubernetes.io/docs/setup/) and [NVIDIA Cloud Native Stack 17.0](https://github.com/NVIDIA/cloud-native-stack/tree/25.12.0).
 
-5. Verify that you have installed Helm 3 or later (Helm v3.20.0 recommended). For installation instructions, see [Helm Installation](https://helm.sh/docs/intro/install).
+5. Verify that you have installed Helm 3. To install Helm 3 (and avoid Helm 4), follow the official Helm v3 installation instructions for your platform, for example by using the [`get-helm-3` installer script](https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3) from the Helm repository.
 
 6. Verify that you have a default storage class available in the cluster for PVC provisioning. One option is the local path provisioner by Rancher.   Refer to the [installation](https://github.com/rancher/local-path-provisioner?tab=readme-ov-file#installation) section of the README in the GitHub repository.
 
@@ -99,7 +99,7 @@ For monitoring deployment progress, refer to [Deploy on Kubernetes with Helm](./
 
 ## Step 2: Apply the MIG configuration
 
-Edit the MIG configuration file [`mig-config.yaml`](../deploy/helm/mig-slicing/mig-config.yaml) to adjust the slicing pattern as needed.
+Edit the MIG configuration file [`mig-config-h100.yaml`](../deploy/helm/mig-slicing/mig-config-h100.yaml) to adjust the slicing pattern as needed.
 The following example enables a custom configuration with mixed MIG slice sizes on the same GPU.
 
 
@@ -139,7 +139,7 @@ data:
 Apply the custom MIG configuration configMap to the node and update the ClusterPolicy, by running the following code.
 
 ```bash
-kubectl apply -n nvidia-gpu-operator -f mig-slicing/mig-config.yaml
+kubectl apply -n nvidia-gpu-operator -f mig-slicing/mig-config-h100.yaml
 kubectl patch clusterpolicies.nvidia.com/cluster-policy \
   --type='json' \
   -p='[{"op":"replace", "path":"/spec/migManager/config/name", "value":"custom-mig-config"}]'
@@ -151,6 +151,20 @@ Label the node with MIG configuration, by running the following code.
 kubectl label nodes <node-name> nvidia.com/mig.config=custom-7x1g10-2x1g20-1x3g40-1x7g80 --overwrite
 ```
 
+:::{important}
+**For NVIDIA RTX6000 Pro Deployments:**
+
+Use [`mig-config-rtx6000.yaml`](../deploy/helm/mig-slicing/mig-config-rtx6000.yaml) instead:
+
+```bash
+kubectl apply -n nvidia-gpu-operator -f mig-slicing/mig-config-rtx6000.yaml
+kubectl patch clusterpolicies.nvidia.com/cluster-policy \
+  --type='json' \
+  -p='[{"op":"replace", "path":"/spec/migManager/config/name", "value":"custom-mig-config"}]'
+kubectl label nodes <node-name> nvidia.com/mig.config=custom-rtx6000-4x1g24-2x1g24-1x2g48-1x4g96 --overwrite
+```
+:::
+
 Verify that the MIG configuration is successfully applied, by running the following code.
 
 ```bash
@@ -174,39 +188,26 @@ You should see output similar to the following.
 Run the following code to install the RAG Blueprint Helm Chart.
 
 ```bash
-helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
   --username '$oauthtoken' \
   --password "${NGC_API_KEY}" \
   --set imagePullSecret.password=$NGC_API_KEY \
   --set ngcApiSecret.password=$NGC_API_KEY \
-  -f mig-slicing/values-mig.yaml
+  -f mig-slicing/values-mig-h100.yaml
 ```
 
 :::{important}
 **For NVIDIA RTX6000 Pro Deployments:**
 
-If you are deploying on NVIDIA RTX6000 Pro GPUs (instead of H100 GPUs), you need to configure the NIM LLM model profile. The required configuration is already present but commented out in the [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) file.
-
-Uncomment and modify the following section under `nimOperator.nim-llm.model` in [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml):
-```yaml
-model:
-  engine: tensorrt_llm
-  precision: "fp8"
-  qosProfile: "throughput"
-  tensorParallelism: "1"
-  gpus:
-    - product: "rtx6000_blackwell_sv"
-```
+If you are deploying on NVIDIA RTX6000 Pro GPUs (instead of H100 GPUs), use [`values-mig-rtx6000.yaml`](../deploy/helm/mig-slicing/values-mig-rtx6000.yaml) and [`mig-config-rtx6000.yaml`](../deploy/helm/mig-slicing/mig-config-rtx6000.yaml), which include the RTX6000-specific MIG profiles and NIM LLM model configuration.
 
-Then install using the modified values.yaml along with MIG values:
 ```sh
-helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
   --username '$oauthtoken' \
   --password "${NGC_API_KEY}" \
   --set imagePullSecret.password=$NGC_API_KEY \
   --set ngcApiSecret.password=$NGC_API_KEY \
-  -f values.yaml \
-  -f mig-slicing/values-mig.yaml
+  -f mig-slicing/values-mig-rtx6000.yaml
 ```
 :::
 
@@ -235,14 +236,14 @@ You should see output similar to the following.
 Resource                                    Requested   Limit    Allocatable  Free
 nvidia.com/mig-1g.10gb                      (86%) 6.0   (86%) 6.0     7.0        1.0
 ├─ milvus-standalone-...                   1.0     1.0
-├─ nemoretriever-embedding-ms-...          1.0     1.0
+├─ nemotron-embedding-ms-...               1.0     1.0
 ├─ rag-nv-ingest-...                       1.0     1.0
 ├─ nemoretriever-graphic-elements-v1-...   1.0     1.0
 ├─ nemoretriever-page-elements-v3-...      1.0     1.0
 └─ nemoretriever-table-structure-v1-...    1.0     1.0
 
 nvidia.com/mig-1g.20gb                      (100%) 2.0  (100%) 2.0     2.0        0.0
-├─ nemoretriever-ranking-ms-...            1.0     1.0
+├─ nemotron-ranking-ms-...                 1.0     1.0
 └─ <other-workload>                        1.0     1.0
 
 nvidia.com/mig-3g.40gb                      (100%) 1.0  (100%) 1.0     1.0        0.0
@@ -303,7 +304,7 @@ GPU 3: NVIDIA H100 80GB HBM3 (UUID: ...)
 
 * Ensure you have the correct MIG strategy (`mixed`) configured.
 * Verify that `nvidia.com/mig.config.state` is `success` before deploying.
-* Customize `values-mig.yaml` to specify the correct MIG GPU resource requests for each pod.
+* Customize `values-mig-h100.yaml` or `values-mig-rtx6000.yaml` to specify the correct MIG GPU resource requests for each pod.
 
 
 
diff --git a/docs/migration-oracle-26ai.md b/docs/migration-oracle-26ai.md
new file mode 100644
index 000000000..f7c3615e2
--- /dev/null
+++ b/docs/migration-oracle-26ai.md
@@ -0,0 +1,330 @@
+<!--
+  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+  SPDX-License-Identifier: Apache-2.0
+-->
+
+# Migration Guide: Milvus to Oracle AI Database 26ai
+
+This guide covers migrating the NVIDIA RAG Blueprint from Milvus to Oracle AI Database 26ai as the vector database. Oracle 26ai is now the **default** vector store for this blueprint.
+
+## Overview
+
+Oracle AI Database 26ai is Oracle's next-generation AI-native database that integrates vector search capabilities directly into the database engine. Key benefits include:
+
+- **Native VECTOR data type** - Store and query vectors alongside relational data
+- **IVF and HNSW indexes** - CPU-optimized vector indexes for fast similarity search
+- **Hybrid search** - Combine vector similarity with Oracle Text full-text search
+- **No separate vector DB** - Unified database for all data types
+- **Enterprise features** - Built-in security, scalability, and high availability
+
+## Prerequisites
+
+Before migrating, ensure you have:
+
+1. **Oracle 26ai database instance** - Either:
+   - Oracle Cloud Autonomous Database with AI Vector Search
+   - On-premises Oracle 26ai installation
+   - Docker container for development (Oracle Free tier)
+
+2. **Database user with privileges**:
+   - CREATE TABLE
+   - CREATE INDEX
+   - UNLIMITED TABLESPACE (or appropriate quota)
+
+3. **Network connectivity** from RAG servers to Oracle database
+
+4. **Python environment** with Oracle dependencies
+
+## Step 1: Install Oracle Dependencies
+
+Install the Oracle optional dependencies:
+
+```bash
+# Using pip
+pip install nvidia_rag[oracle]
+
+# Using uv
+uv sync --extra oracle
+```
+
+This installs:
+- `oracledb>=2.0.0` - Oracle Database Python driver (thin client, no Oracle Client needed)
+- `langchain-community>=0.4` - LangChain integration with OracleVS
+
+## Step 2: Configure Environment Variables
+
+Set the following environment variables for Oracle connection:
+
+```bash
+# Vector store selection (oracle is now default)
+export APP_VECTORSTORE_NAME=oracle
+
+# Oracle connection credentials
+export ORACLE_USER=rag_user
+export ORACLE_PASSWORD=your_secure_password
+export ORACLE_DSN=hostname:1521/service_name
+
+# Optional: Vector index configuration
+export ORACLE_VECTOR_INDEX_TYPE=IVF    # IVF (default) or HNSW
+export ORACLE_DISTANCE_METRIC=COSINE   # COSINE (default), L2, DOT, MANHATTAN
+
+# Optional: Enable hybrid search (vector + text)
+export APP_VECTORSTORE_SEARCH_TYPE=hybrid
+```
+
+### Connection String Formats
+
+Oracle DSN can be specified in multiple formats:
+
+```bash
+# Easy Connect format
+export ORACLE_DSN=hostname:1521/service_name
+
+# Easy Connect Plus with options
+export ORACLE_DSN='hostname:1521/service_name?connect_timeout=30'
+
+# TNS alias (requires tnsnames.ora)
+export ORACLE_DSN=mydb_alias
+```
+
+## Step 3: Prepare Oracle Database
+
+Connect to your Oracle database as a DBA and create the RAG user:
+
+```sql
+-- Create user
+CREATE USER rag_user IDENTIFIED BY your_secure_password;
+
+-- Grant privileges
+GRANT CONNECT, RESOURCE TO rag_user;
+GRANT CREATE TABLE TO rag_user;
+GRANT CREATE INDEX TO rag_user;
+GRANT UNLIMITED TABLESPACE TO rag_user;
+
+-- For hybrid search (Oracle Text)
+GRANT CTXAPP TO rag_user;
+```
+
+### Verify Vector Search Support
+
+Ensure your Oracle 26ai instance has vector search enabled:
+
+```sql
+-- Check database version (26ai reports release 23.26.x in the banner)
+SELECT * FROM V$VERSION;
+
+-- Verify VECTOR data type is available
+SELECT VECTOR('[1.0, 2.0, 3.0]', 3, FLOAT32) FROM DUAL;
+```
+
+## Step 4: Deploy Services
+
+### Option A: Using Docker Compose (Development)
+
+Start the Oracle container for development:
+
+```bash
+# Set Oracle password
+export ORACLE_PASSWORD=oracle123
+
+# Start Oracle 26ai container
+docker compose -f deploy/compose/vectordb.yaml --profile oracle up -d
+
+# Wait for Oracle to be ready (first startup takes ~5 minutes)
+docker logs -f oracle-26ai
+```
+
+Then start the RAG services:
+
+```bash
+# Source environment
+source deploy/compose/.env
+
+# Start RAG and Ingestor servers
+docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
+```
+
+### Option B: External Oracle Database (Production)
+
+For production, connect to your existing Oracle 26ai instance:
+
+```bash
+# Set connection to your Oracle instance
+export ORACLE_USER=rag_user
+export ORACLE_PASSWORD=your_secure_password
+export ORACLE_DSN=your-oracle-host:1521/your_service
+
+# Source other environment variables
+source deploy/compose/.env
+
+# Start RAG services
+docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build
+```
+
+## Step 5: Re-ingest Documents
+
+**Important**: Documents must be re-ingested when switching vector databases. Data does not automatically migrate from Milvus to Oracle.
+
+1. Access the RAG UI at `http://localhost:8090`
+2. Create a new collection
+3. Upload your documents
+4. Verify ingestion via health check:
+
+```bash
+curl -X GET 'http://localhost:8082/v1/health?check_dependencies=true' -H 'accept: application/json'
+```
+
+## Configuration Reference
+
+### Environment Variables
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `APP_VECTORSTORE_NAME` | Vector store type | `oracle` |
+| `ORACLE_USER` | Database username | - |
+| `ORACLE_PASSWORD` | Database password | - |
+| `ORACLE_DSN` | Connection DSN | - |
+| `ORACLE_VECTOR_INDEX_TYPE` | Index type: `IVF` or `HNSW` | `IVF` |
+| `ORACLE_DISTANCE_METRIC` | Distance: `COSINE`, `L2`, `DOT`, `MANHATTAN` | `COSINE` |
+| `APP_VECTORSTORE_SEARCH_TYPE` | Search type: `dense` or `hybrid` | `dense` |
+
+### Vector Index Types
+
+**IVF (Inverted File Index)** - Default, recommended for CPU deployment:
+- Good balance of speed and accuracy
+- Lower memory usage
+- Best for large-scale deployments
+
+**HNSW (Hierarchical Navigable Small World)**:
+- Higher accuracy
+- Faster query time
+- Higher memory usage
+- Best for smaller datasets with high accuracy requirements
+
+### Distance Metrics
+
+| Metric | Use Case |
+|--------|----------|
+| `COSINE` | Default, best for normalized embeddings (most NLP models) |
+| `L2` | Euclidean distance, good for image embeddings |
+| `DOT` | Dot product, fast but requires normalized vectors |
+| `MANHATTAN` | L1 distance, robust to outliers |
+
+## Hybrid Search Configuration
+
+Hybrid search combines vector similarity search with Oracle Text full-text search for improved relevance:
+
+```bash
+export APP_VECTORSTORE_SEARCH_TYPE=hybrid
+```
+
+This creates both:
+- Vector index (IVF/HNSW) on the `vector` column
+- Oracle Text index (CONTEXT) on the `text` column
+
+Queries will combine semantic similarity with keyword matching.
+
+## Rollback to Milvus
+
+To switch back to Milvus:
+
+```bash
+# Change vector store
+export APP_VECTORSTORE_NAME=milvus
+
+# Start Milvus containers
+docker compose -f deploy/compose/vectordb.yaml up -d
+
+# Restart RAG services
+docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d
+```
+
+Note: You will need to re-ingest documents into Milvus.
+
+## Troubleshooting
+
+### Connection Errors
+
+**ORA-01017: invalid username/password; logon denied**
+- Verify `ORACLE_USER` and `ORACLE_PASSWORD` are correct
+- Check if the user exists and is not locked
+
+**ORA-12541: TNS:no listener**
+- Verify `ORACLE_DSN` format and hostname
+- Check network connectivity: `telnet hostname 1521`
+- Ensure Oracle listener is running
+
+**ORA-12514: TNS:listener does not currently know of service requested**
+- Verify the service name in `ORACLE_DSN`
+- List available services: `lsnrctl services`
+
+### Vector Operations
+
+**ORA-51801: VECTOR data type not supported**
+- Ensure you're using Oracle 26ai (not an older version)
+- Check that the `COMPATIBLE` parameter is set to 23.4.0 or higher
+
+**Vector dimension mismatch**
+- Ensure your embedding model dimension matches the collection
+- Default dimension is 2048 (for NeMo Retriever embeddings)
+- Check with: `SELECT VECTOR_DIMS(vector) FROM your_table WHERE ROWNUM = 1`
+
+### Performance Issues
+
+**Slow ingestion**
+- Increase batch size in `oracle_vdb.py`
+- Consider disabling indexes during bulk load, then rebuild
+- Use parallel DML: `ALTER SESSION ENABLE PARALLEL DML`
+
+**Slow queries**
+- Ensure vector index exists: `SELECT * FROM USER_INDEXES WHERE INDEX_TYPE = 'VECTOR'`
+- Rebuild index if needed: `ALTER INDEX idx_name REBUILD`
+- Refresh index statistics: `DBMS_STATS.GATHER_INDEX_STATS`
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                   NVIDIA RAG Blueprint                   │
+├─────────────────────────────────────────────────────────┤
+│  RAG Server / Ingestor Server                           │
+│       │                                                  │
+│       ▼                                                  │
+│  ┌─────────────────┐                                    │
+│  │   OracleVDB     │ ◄── VDBRagIngest base class        │
+│  │   (oracle_vdb)  │                                    │
+│  └────────┬────────┘                                    │
+│           │                                              │
+│           ▼                                              │
+│  ┌─────────────────┐    ┌─────────────────┐            │
+│  │   oracledb      │    │  LangChain      │            │
+│  │   (thin client) │    │  OracleVS       │            │
+│  └────────┬────────┘    └────────┬────────┘            │
+│           │                      │                      │
+│           └──────────┬───────────┘                      │
+│                      ▼                                   │
+│           ┌─────────────────────┐                       │
+│           │  Oracle 26ai DB     │                       │
+│           │  - VECTOR columns   │                       │
+│           │  - IVF indexes      │                       │
+│           │  - Oracle Text      │                       │
+│           └─────────────────────┘                       │
+└─────────────────────────────────────────────────────────┘
+```
+
+## Related Topics
+
+- [NVIDIA RAG Blueprint Documentation](readme.md)
+- [Change the Vector Database](change-vectordb.md)
+- [Best Practices for Common Settings](accuracy_perf.md)
+- [Troubleshoot](troubleshooting.md)
+
+## External Resources
+
+- [Oracle AI Vector Search Documentation](https://docs.oracle.com/en/database/oracle/oracle-database/26/vecse/)
+- [Oracle Database 26ai Release Notes](https://docs.oracle.com/en/database/oracle/oracle-database/26/)
+- [LangChain OracleVS Integration](https://python.langchain.com/docs/integrations/vectorstores/oracle)
+- [oracledb Python Driver](https://python-oracledb.readthedocs.io/)
\ No newline at end of file
diff --git a/docs/migration-oracle-v0.0.6-to-v0.0.7.md b/docs/migration-oracle-v0.0.6-to-v0.0.7.md
new file mode 100644
index 000000000..eb3b7cb25
--- /dev/null
+++ b/docs/migration-oracle-v0.0.6-to-v0.0.7.md
@@ -0,0 +1,121 @@
+# Migrating Oracle 26ai deployments from v0.0.6 to v0.0.7
+
+## What changed
+
+v0.0.7 fixes a long-standing bug where Oracle case-folded all collection
+names to UPPERCASE, regardless of the case the client supplied. After
+upgrade, new collections preserve their original casing (matching
+Milvus/Elasticsearch semantics).
+
+| Behavior | v0.0.6 | v0.0.7 |
+|---|---|---|
+| Client creates `s_session_uuid` | Stored as `S_SESSION_UUID` | Stored as `s_session_uuid` |
+| Client creates `MyCollection` | Stored as `MYCOLLECTION` | Stored as `MyCollection` |
+| `list_collections()` returns | All names UPPERCASE | Names exactly as stored |
+| Comparison `coll.name == client_input` | Case-mismatch → 404 | Exact match |
+
+## Behavior on upgrade
+
+**No data loss.** v0.0.7 retains backward-compatible lookup:
+`check_collection_exists()` and `create_collection()` detect both
+case-preserved (v0.0.7) and case-folded (v0.0.6) tables.
+
+**Existing v0.0.6 collections remain accessible** if you address them
+with the case Oracle stored — i.e. UPPERCASE. For example:
+
+```python
+# Pre-upgrade: client sent "biomedical_dataset" → stored as BIOMEDICAL_DATASET
+# Post-upgrade: still listed as BIOMEDICAL_DATASET in get_collection()
+# Client must call with BIOMEDICAL_DATASET to find it.
+```
+
+If you want to rename existing UPPERCASE tables to a mixed-case form, see
+[Optional rename](#optional-rename) below.
+
+## Cleanup before first v0.0.7 startup (none required)
+
+If you previously ran v0.0.6 against this Oracle database, the following
+system tables persist across container restarts and **do not need
+cleanup** — v0.0.7 detects them via `_table_exists_unquoted()` and
+short-circuits creation:
+
+- `METADATA_SCHEMA` (created by v0.0.6 unquoted DDL)
+- `DOCUMENT_INFO` (created by v0.0.6 unquoted DDL)
+
+If your deployment exhibited startup failures with
+`ORA-00955: name is already used by an existing object` during a v0.0.6 → v0.0.7
+transition **before** this fix landed, verify that both tables still exist:
+
+```sql
+SELECT table_name FROM user_tables
+WHERE table_name IN ('METADATA_SCHEMA', 'DOCUMENT_INFO');
+```
+
+Both should be present and accessible. v0.0.7 reuses them in place.
+
+## Idempotency guarantees in v0.0.7
+
+`create_collection()`, `create_metadata_schema_collection()`, and
+`create_document_info_collection()` are now safe to call repeatedly:
+
+1. They first check existence with both case-sensitive (quoted-DDL) and
+   case-folded (unquoted-DDL) lookups.
+2. The CREATE statement itself is wrapped in a try/except that swallows
+   `ORA-00955`. This handles races between the check and the CREATE,
+   and any cases the existence check missed.
+
+This means restart loops, parallel ingestion, and v0.0.6 → v0.0.7
+upgrades all converge cleanly without manual intervention.
+
+## Optional rename
+
+Use this procedure to convert existing UPPERCASE collections to mixed case. It is **not required**. Only do this if you want, e.g., the AIQ
+frontend to consistently see lowercase collection names.
+
+```sql
+-- Rename the table itself (Oracle preserves case on quoted ALTER ... RENAME)
+ALTER TABLE BIOMEDICAL_DATASET RENAME TO "biomedical_dataset";
+
+-- Update metadata table to point at the new name. Both metadata_schema
+-- and document_info store collection_name as VARCHAR2 data, so case is
+-- preserved end-to-end.
+UPDATE metadata_schema
+   SET collection_name = 'biomedical_dataset'
+ WHERE collection_name = 'BIOMEDICAL_DATASET';
+
+UPDATE document_info
+   SET collection_name = 'biomedical_dataset'
+ WHERE collection_name = 'BIOMEDICAL_DATASET';
+
+COMMIT;
+```
+
+Verify with:
+
+```sql
+SELECT table_name FROM user_tables WHERE table_name = 'biomedical_dataset';
+-- Should return one row with the lowercase name.
+```
+
+Repeat for each collection you want renamed.
+
+## Troubleshooting
+
+### `ORA-00955: name is already used by an existing object`
+v0.0.7 should never raise this on collection or system-table creation.
+If you see it on a different code path (e.g., a custom migration
+script), check whether the offending DDL uses quoted identifiers — Oracle
+treats `"MyName"` and `MYNAME` as distinct objects, and creating one
+when the other already exists triggers ORA-00955.
+
+### A collection I created appears as UPPERCASE in `list_collections()`
+That collection was created by v0.0.6 (or earlier) before this fix.
+Either continue addressing it as UPPERCASE, or follow the
+[Optional rename](#optional-rename) steps above.
+
+### Long collection names cause `ORA-00972: identifier is too long`
+v0.0.7 enforces Oracle's 128-character identifier limit at create time.
+Auxiliary objects (vector index, text index) use a deterministic SHA-256
+prefix when the verbatim derivation would exceed 128 chars, so the
+practical name limit on the **table** is 128 chars (full Oracle limit).
+Names exceeding 128 characters are rejected with `ValueError`.
diff --git a/docs/mount-ingestor-volume.md b/docs/mount-ingestor-volume.md
index a9bbb43c4..ff776e34d 100644
--- a/docs/mount-ingestor-volume.md
+++ b/docs/mount-ingestor-volume.md
@@ -4,7 +4,7 @@
 -->
 # Ingestor Server Volume Mounting for NVIDIA RAG Blueprint
 
-You can mount a host directory to access NV-Ingest extraction results directly from the filesystem when you use the [NVIDIA RAG Blueprint](readme.md). Designed for advanced developers who need programmatic access to raw extraction results for custom processing pipelines or external vector database integration.
+You can mount a host directory to access extraction results from NeMo Retriever Library directly from the filesystem when you use the [NVIDIA RAG Blueprint](readme.md). Designed for advanced developers who need programmatic access to raw extraction results for custom processing pipelines or external vector database integration.
 
 ## Configuration
 
diff --git a/docs/multi-collection-retrieval.md b/docs/multi-collection-retrieval.md
index cd80c337f..07d05be12 100644
--- a/docs/multi-collection-retrieval.md
+++ b/docs/multi-collection-retrieval.md
@@ -38,10 +38,10 @@ The reranker settings are configured in `deploy/compose/docker-compose-rag-serve
 export ENABLE_RERANKER=True
 
 # Set reranker model (default is already configured)
-export APP_RANKING_MODELNAME="nvidia/llama-3.2-nv-rerankqa-1b-v2"
+export APP_RANKING_MODELNAME="nvidia/llama-nemotron-rerank-1b-v2"
 
 # Reranker service URL (default is already configured)
-export APP_RANKING_SERVERURL="nemoretriever-ranking-ms:8000"
+export APP_RANKING_SERVERURL="nemotron-ranking-ms:8000"
 ```
 
 ### For Helm Deployment
@@ -54,7 +54,7 @@ envVars:
   ENABLE_RERANKER: "True"
   
   # Reranker model name (default is already configured)
-  APP_RANKING_MODELNAME: "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+  APP_RANKING_MODELNAME: "nvidia/llama-nemotron-rerank-1b-v2"
   
   # Reranker service URL (default is already configured)
   APP_RANKING_SERVERURL: "nemoretriever-reranking-ms:8000"
diff --git a/docs/nemoretriever-ocr.md b/docs/nemoretriever-ocr.md
index d8ef4f3c5..a76a7c113 100644
--- a/docs/nemoretriever-ocr.md
+++ b/docs/nemoretriever-ocr.md
@@ -11,17 +11,17 @@ This guide explains the OCR (Optical Character Recognition) services available i
 
 The NVIDIA RAG Blueprint supports two OCR services:
 
-1. **NeMo Retriever OCR** (Default) - High-performance OCR service offering 2x+ faster performance
+1. **NeMo Retriever Library OCR** (Default) - High-performance OCR service offering 2x+ faster performance
 2. **Paddle OCR** (Legacy) - General-purpose OCR service maintained for compatibility
 
 :::{tip}
-**NeMo Retriever OCR is now the default OCR service** and is recommended for all new deployments due to its superior performance and efficiency.
+**NeMo Retriever Library OCR is now the default OCR service** and is recommended for all new deployments due to its superior performance and efficiency.
 :::
 
 
-## NeMo Retriever OCR (Default)
+## NeMo Retriever Library OCR (Default)
 
-NeMo Retriever OCR is the default and recommended OCR service for the NVIDIA RAG Blueprint, providing:
+NeMo Retriever Library OCR is the default and recommended OCR service for the NVIDIA RAG Blueprint, providing:
 
 - **2x+ faster performance** compared to Paddle OCR
 - Optimized text extraction from documents and images
@@ -38,7 +38,7 @@ NeMo Retriever OCR is the default and recommended OCR service for the NVIDIA RAG
 
 ### Default Configuration
 
-By default, the NVIDIA RAG Blueprint is configured to use NeMo Retriever OCR with the following settings:
+By default, the NVIDIA RAG Blueprint is configured to use NeMo Retriever Library OCR with the following settings:
 
 | Variable | Default Value | Description |
 |----------|---------------|-------------|
@@ -49,11 +49,11 @@ By default, the NVIDIA RAG Blueprint is configured to use NeMo Retriever OCR wit
 
 ### Hardware Requirements
 
-For detailed hardware requirements and GPU support, refer to the [NeMo Retriever OCR Support Matrix](https://docs.nvidia.com/nim/ingestion/image-ocr/1.2.0/support-matrix.html).
+For detailed hardware requirements and GPU support, refer to the [NeMo Retriever Library OCR Support Matrix](https://docs.nvidia.com/nim/ingestion/image-ocr/1.2.0/support-matrix.html).
 
 ### Docker Configuration
 
-The NeMo Retriever OCR service is configured in the Docker Compose file with the following key settings:
+The NeMo Retriever Library OCR service is configured in the Docker Compose file with the following key settings:
 
 - **Image**: `nvcr.io/nim/nvidia/nemoretriever-ocr-v1:1.2.0`
 - **GPU Memory**: 8192 MB (default)
@@ -72,7 +72,7 @@ export OCR_OMP_NUM_THREADS=8  # Set OpenMP threads
 
 ## Paddle OCR (Legacy)
 
-Paddle OCR is maintained as a legacy option for compatibility with existing workflows. While still functional, it is recommended to migrate to NeMo Retriever OCR for better performance.
+Paddle OCR is maintained as a legacy option for compatibility with existing workflows. While still functional, it is recommended to migrate to NeMo Retriever Library OCR for better performance.
 
 ### When to Use Paddle OCR
 
@@ -83,8 +83,6 @@ Consider using Paddle OCR if you:
 
 ### Hardware Requirements
 
-For detailed hardware requirements, refer to the [Paddle OCR Support Matrix](https://docs.nvidia.com/nim/ingestion/table-extraction/latest/support-matrix.html#supported-hardware).
-
 ### Docker Configuration
 
 The Paddle OCR service configuration:
@@ -94,7 +92,7 @@ The Paddle OCR service configuration:
 - **Ports**: 8009 (HTTP), 8010 (gRPC), 8011 (Metrics)
 
 :::{note}
-**Legacy Service**: Paddle OCR is maintained as a legacy option. For new deployments, we recommend using the default NeMo Retriever OCR service for better performance.
+**Legacy Service**: Paddle OCR is maintained as a legacy option. For new deployments, we recommend using the default NeMo Retriever Library OCR service for better performance.
 :::
 
 
@@ -102,9 +100,9 @@ The Paddle OCR service configuration:
 
 ### Docker Compose Deployment
 
-#### Using NeMo Retriever OCR (Default)
+#### Using NeMo Retriever Library OCR (Default)
 
-NeMo Retriever OCR is deployed by default when you follow the standard deployment guide. No additional configuration is required.
+NeMo Retriever Library OCR is deployed by default when you follow the standard deployment guide. No additional configuration is required.
 
 1. **Prerequisites**: Follow the [deployment guide](deploy-docker-self-hosted.md) for standard setup.
 
@@ -114,7 +112,7 @@ NeMo Retriever OCR is deployed by default when you follow the standard deploymen
    ```
 
    :::{tip}
-   NeMo Retriever OCR is included in the default profile and will start automatically.
+   NeMo Retriever Library OCR is included in the default profile and will start automatically.
    :::
 
 3. **Verify Service Status**:
@@ -136,7 +134,7 @@ If you need to use Paddle OCR instead:
    export OCR_MODEL_NAME=paddle
    ```
 
-3. **Stop NeMo Retriever OCR if running**:
+3. **Stop NeMo Retriever Library OCR if running**:
    ```bash
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml down nemoretriever-ocr
    ```
@@ -146,7 +144,7 @@ If you need to use Paddle OCR instead:
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml --profile paddle up -d
    ```
 
-5. **Restart Ingestor Server and NV-Ingest Runtime**:
+5. **Restart Ingestor Server and NeMo Retriever Library Runtime**:
    ```bash
    docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d
    ```
@@ -156,9 +154,9 @@ If you need to use Paddle OCR instead:
 
 ### NVIDIA-Hosted Deployment
 
-#### Using NeMo Retriever OCR (Default)
+#### Using NeMo Retriever Library OCR (Default)
 
-Follow the standard [NVIDIA-hosted deployment guide](deploy-docker-nvidia-hosted.md) - NeMo Retriever OCR is the default configuration.
+Follow the standard [NVIDIA-hosted deployment guide](deploy-docker-nvidia-hosted.md) - NeMo Retriever Library OCR is the default configuration.
 
 #### Using Paddle OCR with NVIDIA-Hosted Deployment
 
@@ -178,13 +176,13 @@ Follow the standard [NVIDIA-hosted deployment guide](deploy-docker-nvidia-hosted
 
 ### Helm Deployment
 
-#### Using NeMo Retriever OCR (Default)
+#### Using NeMo Retriever Library OCR (Default)
 
-NeMo Retriever OCR is deployed by default with Helm installations. Follow the standard [Helm Deployment Guide](deploy-helm.md) - no additional OCR configuration is required.
+NeMo Retriever Library OCR is deployed by default with Helm installations. Follow the standard [Helm Deployment Guide](deploy-helm.md) - no additional OCR configuration is required.
 
 #### Using Paddle OCR with Helm
 
-To use Paddle OCR instead of the default NeMo Retriever OCR:
+To use Paddle OCR instead of the default NeMo Retriever Library OCR:
 
 Modify [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) to override the OCR service image:
 
@@ -216,7 +214,7 @@ For detailed Helm deployment instructions, see [Helm Deployment Guide](deploy-he
 
 ### Environment Variables
 
-| Variable | Description | NeMo Retriever Default | Paddle Default | Required |
+| Variable | Description | NeMo Retriever Library Default | Paddle Default | Required |
 |----------|-------------|------------------------|----------------|----------|
 | `OCR_GRPC_ENDPOINT` | gRPC endpoint for OCR service | `nemoretriever-ocr:8001` | `paddle:8001` | Yes (on-premises) |
 | `OCR_HTTP_ENDPOINT` | HTTP endpoint for OCR service | `http://nemoretriever-ocr:8000/v1/infer` | `http://paddle:8000/v1/infer` | Yes |
@@ -240,16 +238,16 @@ Replace `workstation_ip` with the actual IP address of the machine running the O
 
 ## Switching Between OCR Services
 
-### Migrating from Paddle OCR to NeMo Retriever OCR
+### Migrating from Paddle OCR to NeMo Retriever Library OCR
 
-To switch to the default NeMo Retriever OCR service:
+To switch to the default NeMo Retriever Library OCR service:
 
 1. **Stop Paddle OCR**:
    ```bash
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml down paddle
    ```
 
-2. **Configure NeMo Retriever OCR environment variables**:
+2. **Configure NeMo Retriever Library OCR environment variables**:
    ```bash
    export OCR_GRPC_ENDPOINT=nemoretriever-ocr:8001
    export OCR_HTTP_ENDPOINT=http://nemoretriever-ocr:8000/v1/infer
@@ -257,7 +255,7 @@ To switch to the default NeMo Retriever OCR service:
    export OCR_MODEL_NAME=scene_text_ensemble
    ```
 
-3. **Start NeMo Retriever OCR**:
+3. **Start NeMo Retriever Library OCR**:
    ```bash
    USERID=$(id -u) docker compose -f deploy/compose/nims.yaml up -d nemoretriever-ocr
    ```
@@ -267,14 +265,14 @@ To switch to the default NeMo Retriever OCR service:
    docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d
    ```
 
-### Migrating from NeMo Retriever OCR to Paddle OCR
+### Migrating from NeMo Retriever Library OCR to Paddle OCR
 
 Follow the steps in [Switching to Paddle OCR](#switching-to-paddle-ocr) above.
 
 
 ## Performance Comparison
 
-| Feature | NeMo Retriever OCR | Paddle OCR |
+| Feature | NeMo Retriever Library OCR | Paddle OCR |
 |---------|-------------------|------------|
 | **Performance** | 2x+ faster | Baseline |
 | **GPU Memory** | 8 GB (default) | 3 GB (default) |
@@ -299,13 +297,13 @@ Follow the steps in [Switching to Paddle OCR](#switching-to-paddle-ocr) above.
 
 3. **Performance Issues**
    - Consider increasing `OCR_CUDA_MEMORY_POOL_MB`
-   - Adjust `OCR_BATCH_SIZE` for NeMo Retriever OCR
+   - Adjust `OCR_BATCH_SIZE` for NeMo Retriever Library OCR
    - Verify GPU has sufficient memory
 
 ### Getting Logs
 
 ```bash
-# NeMo Retriever OCR logs
+# NeMo Retriever Library OCR logs
 docker logs nemoretriever-ocr
 
 # Paddle OCR logs
diff --git a/docs/nemotron-parse-extraction.md b/docs/nemotron-parse-extraction.md
index a23dca7f4..0e2fc0b11 100644
--- a/docs/nemotron-parse-extraction.md
+++ b/docs/nemotron-parse-extraction.md
@@ -62,7 +62,7 @@ When using NVIDIA hosted endpoints, you may encounter rate limiting with larger
 
 ## Using Helm
 
-To enable PDF extraction with Nemotron Parse using Helm, you need to enable the Nemotron Parse service and configure the ingestor-server to use it.
+To enable PDF extraction with Nemotron Parse using Helm, enable the Nemotron Parse service and configure the ingestor-server to use it.
 
 ### Prerequisites
 - Ensure you have sufficient GPU resources. Nemotron Parse requires a dedicated GPU.
@@ -71,7 +71,7 @@ To enable PDF extraction with Nemotron Parse using Helm, you need to enable the
 
 To deploy with Nemotron Parse enabled:
 
-Modify [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) to enable Nemotron Parse:
+Modify [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) to enable Nemotron Parse and configure the ingestor-server:
 
 ```yaml
 # Enable Nemotron Parse NIM
@@ -93,9 +93,136 @@ For detailed HELM deployment instructions, see [Helm Deployment Guide](deploy-he
 :::{note}
 **Key Configuration Changes:**
 - `nv-ingest.nimOperator.nemotron_parse.enabled=true` - Enables Nemotron Parse NIM
-- `ingestor-server.envVars.APP_NVINGEST_PDFEXTRACTMETHOD="nemotron_parse"` - Configures ingestor to use Nemotron Parse
+- `ingestor-server.envVars.APP_NVINGEST_PDFEXTRACTMETHOD="nemotron_parse"` - Configures ingestor to use Nemotron Parse for PDF extraction
 :::
 
+## Experimental: Nemotron-parse-only extraction
+
+:::{note}
+The steps in this section describe a nemotron-parse-only pipeline. For production use, the default pipeline (Nemotron Parse with page-elements and table-structure NIMs) is recommended for better accuracy.
+:::
+
+The **default** Nemotron Parse pipeline uses the **page-elements** and **table-structure** NIMs together with the Nemotron Parse NIM in the extraction pipeline. This combination provides better accuracy for PDF and table extraction. 
+To **experiment** with a nemotron-parse-only extraction pipeline (using only the Nemotron Parse NIM, without OCR, page-elements, graphic-elements, or table-structure NIMs), use the following steps.
+
+### Key configuration
+
+- **PDF extraction method** — Set `APP_NVINGEST_PDFEXTRACTMETHOD` to `nemotron_parse` so the ingestor uses Nemotron Parse for PDF text extraction.
+- **Table extraction method** — Set `APP_NVINGEST_EXTRACTTABLESMETHOD` to `nemotron_parse` so the ingestor uses Nemotron Parse for table extraction instead of the default YOLOX-based table NIMs. This is required for a nemotron-parse-only pipeline.
+- **nv-ingest health check** — Set `COMPONENTS_TO_READY_CHECK` to an empty string (`""`) in the **nv-ingest** service environment. By default, nv-ingest readiness waits for other ingest NIMs. With only Nemotron Parse running, the readiness probe would otherwise never pass. Emptying this value allows nv-ingest to become ready when only Nemotron Parse is available.
+
+### Using Docker Compose (nemotron-parse-only)
+
+#### On-prem models
+
+1. **Prerequisites**: Follow the [deployment guide](deploy-docker-self-hosted.md) up to and including the step labelled "Start all required NIMs."
+
+2. Start only the Nemotron Parse service (and any other non-ingest services your setup needs):
+   ```bash
+   USERID=$(id -u) docker compose --profile rag --profile nemotron-parse -f deploy/compose/nims.yaml up -d
+   ```
+   You can skip the OCR, page-elements, graphic-elements, and table-structure NIMs if you want a nemotron-parse-only pipeline.
+
+3. Configure the ingestor-server and nv-ingest for nemotron-parse-only. Set these environment variables:
+
+   **Ingestor-server** (ingestor-server environment):
+   ```bash
+   export APP_NVINGEST_PDFEXTRACTMETHOD=nemotron_parse
+   export APP_NVINGEST_EXTRACTTABLESMETHOD=nemotron_parse
+   ```
+
+   **nv-ingest** (nv-ingest service environment, e.g. in the compose file where nv-ingest runs):
+   ```bash
+   export COMPONENTS_TO_READY_CHECK=""
+   ```
+   This ensures the nv-ingest readiness probe passes when other ingest NIMs are not running.
+
+4. Deploy the ingestor-server and rag-server containers following the remaining steps in the deployment guide.
+
+5. Ingest PDFs using the [ingestion API usage notebook](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/ingestion_api_usage.ipynb).
+
+#### NVIDIA hosted API endpoints
+
+1. **Prerequisites**: Follow the [deployment guide](deploy-docker-nvidia-hosted.md) up to and including the step labelled "Start the vector db containers from the repo root."
+
+2. Export variables for the Nemotron Parse API:
+   ```bash
+   export NEMOTRON_PARSE_HTTP_ENDPOINT=https://integrate.api.nvidia.com/v1/chat/completions
+   export NEMOTRON_PARSE_MODEL_NAME=nvidia/nemotron-parse
+   export NEMOTRON_PARSE_INFER_PROTOCOL=http
+   ```
+
+3. Configure the ingestor-server and nv-ingest for nemotron-parse-only:
+
+   **Ingestor-server**:
+   ```bash
+   export APP_NVINGEST_PDFEXTRACTMETHOD=nemotron_parse
+   export APP_NVINGEST_EXTRACTTABLESMETHOD=nemotron_parse
+   ```
+
+   **nv-ingest** (so readiness passes without other NIMs):
+   ```bash
+   export COMPONENTS_TO_READY_CHECK=""
+   ```
+
+4. Deploy the ingestor-server and rag-server containers following the remaining steps in the deployment guide.
+
+5. Ingest PDFs using the [ingestion API usage notebook](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/ingestion_api_usage.ipynb).
+
+:::{note}
+When using NVIDIA hosted endpoints, you may encounter rate limiting with larger file ingestions (>10 files).
+:::
+
+### Using Helm (nemotron-parse-only)
+
+To run only Nemotron Parse for PDF and table extraction with Helm:
+
+1. **Prerequisites**: Ensure you have sufficient GPU resources. Nemotron Parse requires a dedicated GPU.
+
+2. Edit [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml):
+
+   - **Enable Nemotron Parse** and **disable the other ingest NIMs** under `nv-ingest.nimOperator`:
+
+   ```yaml
+   nv-ingest:
+     nimOperator:
+       nemotron_parse:
+         enabled: true
+       nemoretriever_ocr_v1:
+         enabled: false
+       graphic_elements:
+         enabled: false
+       page_elements:
+         enabled: false
+       table_structure:
+         enabled: false
+     envVars:
+       COMPONENTS_TO_READY_CHECK: ""
+   ```
+
+   - **Configure the ingestor-server** to use Nemotron Parse for both PDF and table extraction:
+
+   ```yaml
+   ingestor-server:
+     envVars:
+       APP_NVINGEST_PDFEXTRACTMETHOD: "nemotron_parse"
+       APP_NVINGEST_EXTRACTTABLESMETHOD: "nemotron_parse"
+   ```
+
+3. Apply the changes as described in [Change a Deployment](deploy-helm.md#change-a-deployment).
+
+4. For full Helm deployment steps, see the [Helm Deployment Guide](deploy-helm.md).
+
+**Summary of nemotron-parse-only Helm settings:**
+
+| Setting | Purpose |
+|---------|---------|
+| `nv-ingest.nimOperator.nemotron_parse.enabled: true` | Enable the Nemotron Parse NIM. |
+| `nv-ingest.nimOperator.<other_nims>.enabled: false` | Disable OCR, page-elements, graphic-elements, and table-structure NIMs. |
+| `nv-ingest.envVars.COMPONENTS_TO_READY_CHECK: ""` | nv-ingest health check: readiness passes without other NIMs. |
+| `ingestor-server.envVars.APP_NVINGEST_PDFEXTRACTMETHOD: "nemotron_parse"` | Use Nemotron Parse for PDF extraction. |
+| `ingestor-server.envVars.APP_NVINGEST_EXTRACTTABLESMETHOD: "nemotron_parse"` | Use Nemotron Parse for table extraction. |
+
 ## Limitations and Requirements
 
 When using Nemotron Parse for PDF extraction, consider the following:
@@ -105,7 +232,7 @@ When using Nemotron Parse for PDF extraction, consider the following:
 - The extraction quality may vary depending on the PDF structure and content.
 - Nemotron Parse is not supported on NVIDIA B200 GPUs or RTX Pro 6000 GPUs.
 
-For detailed information about hardware requirements and supported GPUs for all NeMo Retriever extraction NIMs, refer to the [Nemotron Parse Support Matrix](https://docs.nvidia.com/nim/vision-language-models/latest/support-matrix.html#nemotron-parse).
+For detailed information about hardware requirements and supported GPUs for extraction NIMs used by NeMo Retriever Library, refer to the [Nemotron Parse Support Matrix](https://docs.nvidia.com/nim/vision-language-models/latest/support-matrix.html#nemotron-parse).
 
 ## Available PDF Extraction Methods
 
@@ -115,6 +242,8 @@ The `APP_NVINGEST_PDFEXTRACTMETHOD` environment variable supports the following
 - `pdfium`: Uses the default PDFium-based extraction
 - `None`: Uses the default extraction method
 
+**Table extraction method:** The `APP_NVINGEST_EXTRACTTABLESMETHOD` environment variable controls how tables are extracted. Set it to `nemotron_parse` to use Nemotron Parse for table extraction (recommended for a nemotron-parse-only pipeline). The default is `yolox`, which uses the YOLOX-based table NIMs.
+
 :::{note}
 The Nemotron Parse service requires GPU resources and must run on a dedicated GPU. Make sure you have sufficient GPU resources available before enabling this feature.
 :::
diff --git a/docs/nemotron3-super-deployment.md b/docs/nemotron3-super-deployment.md
new file mode 100644
index 000000000..8b4295945
--- /dev/null
+++ b/docs/nemotron3-super-deployment.md
@@ -0,0 +1,180 @@
+# Using Nemotron-3-Super-120B-A12B LLM NIM
+
+[Nemotron-3-Super-120B-A12B](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b/modelcard) is a large language model (LLM) trained by NVIDIA, designed to deliver strong agentic, reasoning, and conversational capabilities. It is optimized for collaborative agents and high-volume workloads such as IT ticket automation. This LLM can considerably improve the accuracy of the RAG pipeline, especially with reasoning enabled. ([Model card](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b/modelcard))
+
+We recommend using the model in low-effort reasoning mode with a reasoning budget of 256 to balance accuracy and performance. You can switch to non-reasoning mode for maximum performance or use full reasoning mode for the best accuracy.
+
+## Hardware requirements
+
+For Docker and Kubernetes deployment, see the following:
+
+- **Docker (local NIM):** [Hardware Requirements (Docker)](support-matrix.md#hardware-requirements-docker)
+- **Kubernetes (Helm):** [Hardware Requirements (Kubernetes)](support-matrix.md#hardware-requirements-kubernetes)
+
+For [self-hosted local NIM](deploy-docker-self-hosted.md) deployment with `nemotron-3-super-120b-a12b`, you need one of the following:
+
+- 3 x H100
+- 3 x B200
+- 3 x RTX PRO 6000
+
+### Hardware Requirements (Kubernetes)
+
+To deploy with [Helm](deploy-helm.md) using `nemotron-3-super-120b-a12b`, you need one of the following:
+
+- 9 x H100-80GB
+- 9 x B200
+- 9 x RTX PRO 6000
+
+---
+
+## Start services using NVIDIA-hosted models
+
+No local GPU is needed for the LLM. The file `deploy/compose/nemotron3-super-cloud.env` sets all NVIDIA-hosted (cloud) endpoints and the `nemotron-3-super-120b-a12b` model.
+
+1. [Set your API key](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/api-key.md) and prompt config, then source the env files:
+
+```bash
+export NGC_API_KEY=<ngc-api-key>
+source deploy/compose/.env
+source deploy/compose/nemotron3-super-cloud.env
+export PROMPT_CONFIG_FILE=$(pwd)/deploy/compose/nemotron3-super-prompt.yaml
+```
+
+2. Follow [Start services using NVIDIA-hosted models](deploy-docker-nvidia-hosted.md#start-services-using-nvidia-hosted-models) to start the vectorstore, rag-server, and ingestor-server.
+
+---
+
+## Start services using self-hosted on-premises models
+
+1. Update `nims.yaml`
+
+   Edit `deploy/compose/nims.yaml` and change the `nim-llm` service image and GPU allocation:
+
+   ```yaml
+   nim-llm:
+     image: nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b:1.8.0
+     ...
+     user: "0"
+     environment:
+       NGC_API_KEY: ${NGC_API_KEY}
+       NIM_MAX_MODEL_LEN: "32768"  # required for TP2 profile
+       NIM_KVCACHE_PERCENT: "0.9"
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               device_ids: ['1','2']  # 2 GPUs for FP8 TP2
+               capabilities: [gpu]
+   ```
+
+   > Note: To deploy TP2 profiles, you must limit `NIM_MAX_MODEL_LEN` to 32768.
+
+   To confirm that a TP2 profile is available for your hardware, run:
+
+   ```bash
+   docker run -ti --rm --gpus all nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b:1.8.0 list-model-profiles
+   ```
+
+   Check the [model page](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b/modelcard) for more details.
+
+   > Note: For RTX 6000 Pro GPUs, additional NIM environment variables are required — see [RTX 6000 Pro](#rtx-6000-pro) below.
+
+2. Set nemotron-3-super specific environment variables.
+
+   Ensure the section **`Endpoints for using cloud NIMs`** in `deploy/compose/.env` is **commented out** (so that on-prem endpoints are used).
+
+   ```bash
+   source deploy/compose/.env
+   source deploy/compose/nemotron3-super.env
+   export PROMPT_CONFIG_FILE=$(pwd)/deploy/compose/nemotron3-super-prompt.yaml
+   export LLM_MAX_TOKENS=16256
+   ```
+
+   Follow [Start services using self-hosted on-premises models](deploy-docker-self-hosted.md#start-services-using-self-hosted-on-premises-models) to start the vectorstore, rag-server, NIMs, and ingestor-server.
+
+### RTX 6000 Pro
+
+> Note: To deploy TP2 profiles on RTX PRO 6000 Blackwell Server Edition, complete the following steps. You don't need these steps if you are using a TP4 or TP8 profile.
+
+1. Edit `/etc/default/grub` and set:
+
+   ```text
+   GRUB_CMDLINE_LINUX_DEFAULT="quiet splash iommu=pt"
+   ```
+
+2. Run:
+
+   ```bash
+   sudo update-grub2
+   sudo reboot
+   ```
+
+3. In `nims.yaml`, add under the `nim-llm` `environment:` block:
+
+   ```yaml
+   environment:
+     # In addition to the variables already set in step 1
+     NCCL_P2P_DISABLE: "1"
+   ```
+
+---
+
+## Helm deployment (`nemotron-3-super-120b-a12b`)
+
+From the repository root, run:
+
+```bash
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
+  --username '$oauthtoken' \
+  --password "${NGC_API_KEY}" \
+  --set imagePullSecret.password=$NGC_API_KEY \
+  --set ngcApiSecret.password=$NGC_API_KEY \
+  -f deploy/helm/nvidia-blueprint-rag/values.yaml \
+  -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml
+```
+
+The prompt file `deploy/compose/nemotron3-super-prompt.yaml` is tuned for `nemotron-3-super-120b-a12b`. To customize it, see [Prompt customization in Helm chart](prompt-customization.md#prompt-customization-in-helm-chart).
+
+**RTX 6000 Pro**
+
+> Note: To deploy TP2 profiles on RTX PRO 6000 Blackwell Server Edition, complete the following steps. You don't need these steps if you are using a TP4 or TP8 profile.
+
+1. Edit `/etc/default/grub` and set:
+
+   ```text
+   GRUB_CMDLINE_LINUX_DEFAULT="quiet splash iommu=pt"
+   ```
+
+2. Run:
+
+   ```bash
+   sudo update-grub2
+   sudo reboot
+   ```
+
+3. From the repository root, run:
+
+   ```bash
+   helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
+     --username '$oauthtoken' \
+     --password "${NGC_API_KEY}" \
+     --set imagePullSecret.password=$NGC_API_KEY \
+     --set ngcApiSecret.password=$NGC_API_KEY \
+     -f deploy/helm/nvidia-blueprint-rag/values.yaml \
+     -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-values.yaml \
+     -f deploy/helm/nvidia-blueprint-rag/nemotron3-super-rtx6000-values.yaml
+   ```
+
+---
+
+## Reasoning and non-reasoning mode
+
+To disable reasoning mode, set the following environment variables:
+
+```bash
+export LLM_ENABLE_THINKING=false
+export LLM_REASONING_BUDGET=0
+```
+
+For other options (e.g. full reasoning budget), see [Enable reasoning for Nemotron 3 models](enable-nemotron-thinking.md).
diff --git a/docs/notebooks.md b/docs/notebooks.md
index a88952f79..beff3dc94 100644
--- a/docs/notebooks.md
+++ b/docs/notebooks.md
@@ -101,7 +101,9 @@ Use the following notebooks to learn comprehensive Python client usage, metadata
 
 - [rag_library_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_usage.ipynb) – Demonstrates native usage of the NVIDIA RAG Python client, including environment setup, document ingestion, collection management, and querying. This notebook provides end-to-end API usage examples for interacting directly with the RAG system from Python, covering both ingestion and retrieval workflows.
 
-- [rag_library_lite_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_lite_usage.ipynb) – Demonstrates containerless deployment of the NVIDIA RAG Python package in lite mode. Uses Milvus Lite (embedded vector database) and NV-Ingest subprocess mode for a simplified setup without Docker containers. Leverages NVIDIA cloud APIs for embeddings, ranking, and LLM inference. **Note**: This mode does not support image/table/chart citations or document summarization.
+- [rag_library_lite_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_lite_usage.ipynb) – Demonstrates containerless deployment of the NVIDIA RAG Python package in lite mode. Uses Milvus Lite (embedded vector database) and NeMo Retriever Library subprocess mode for a simplified setup without Docker containers. Leverages NVIDIA cloud APIs for embeddings, ranking, and LLM inference. **Note**: This mode does not support image/table/chart citations or document summarization.
+
+- [langchain_nvidia_retriever.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/langchain_nvidia_retriever.ipynb) – Showcases **LangChain integration** with the NVIDIA RAG Blueprint. Run [ingestion_api_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/ingestion_api_usage.ipynb) first to ingest documents, then use `NVIDIARAGRetriever` for retrieval (sync/async), custom parameters, error handling, and optional RAG chaining with `ChatNVIDIA`.
 
 
 
@@ -125,54 +127,6 @@ Use the following notebook for cloud deployment scenarios.
 - [launchable.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/launchable.ipynb) – A deployment-ready notebook intended to run in a [Brev environment](https://console.brev.dev/environment/new). To learn more about Brev, refer to [Brev](https://docs.nvidia.com/brev/latest/about-brev.html). Follow the instructions for running Jupyter notebooks in a cloud-based environment based on the hardware requirements specified in the launchable.
 
 
-
-## Set Up the Notebook Environment
-
-To run a notebook, use the following procedure with [uv](https://docs.astral.sh/uv/) - a fast Python package manager.
-
-> **Note**: Python version **3.11 or higher** is required.
-
-1. Install uv (if not already installed):
-
-    ```bash
-    curl -LsSf https://astral.sh/uv/0.8.12/install.sh | sh
-    ```
-
-2. Create and activate a virtual environment:
-
-    ```bash
-    uv venv --python=python3.12
-    source .venv/bin/activate
-    ```
-
-3. Install JupyterLab:
-
-    ```bash
-    uv pip install jupyterlab
-    ```
-
-4. Start JupyterLab:
-
-    ```bash
-    jupyter lab --allow-root --ip=0.0.0.0 --NotebookApp.token='' --port=8889 --no-browser
-    ```
-
-### Set-up Notes
-- Ensure that API keys and credentials are correctly set up before you run a notebook.
-- Modify endpoints or request parameters as necessary to match your specific use case.
-- For the custom VDB operator notebook, ensure that Docker is available for running OpenSearch services.
-
-
-
-## Run a Notebook
-
-After you set up your notebook environment, to run a notebook, use the following procedure.
-
-1. Access JupyterLab by opening a browser and navigating to `http://<your-server-ip>:8889`.
-2. Navigate to the notebook and run the cells sequentially.
-
-
-
 ## Related Topics
 
 - [Get Started](deploy-docker-self-hosted.md)
diff --git a/docs/nv-ingest-standalone.md b/docs/nv-ingest-standalone.md
index f09a37970..14319ad94 100644
--- a/docs/nv-ingest-standalone.md
+++ b/docs/nv-ingest-standalone.md
@@ -3,19 +3,19 @@
   SPDX-License-Identifier: Apache-2.0
 -->
 
-# Deploy NV-Ingest Standalone for NVIDIA RAG Blueprint
+# Deploy NeMo Retriever Library Standalone for NVIDIA RAG Blueprint
 
-This guide explains how to deploy and use NV-Ingest as a standalone service for [NVIDIA RAG Blueprint](readme.md) without deploying the full ingestor server. This is useful when you want to ingest documents directly using Python scripts.
+This guide explains how to deploy and use NeMo Retriever Library as a standalone service for [NVIDIA RAG Blueprint](readme.md) without deploying the full ingestor server. This is useful when you want to ingest documents directly using Python scripts.
 
 For more details and advanced usage, refer to:
-- [NVIDIA/nv-ingest repository](https://github.com/NVIDIA/nv-ingest)
-- [Official NV-Ingest Quickstart Guide](https://github.com/NVIDIA/nv-ingest/blob/main/docs/docs/extraction/quickstart-guide.md)
+- [NeMo Retriever Library repository (NVIDIA/NeMo-Retriever)](https://github.com/NVIDIA/NeMo-Retriever)
+- [Official NeMo Retriever Library Quickstart Guide](https://docs.nvidia.com/nemo/retriever/)
 
 ## Limitations
 
-When using NV-Ingest in standalone mode, consider the following limitations:
+When using NeMo Retriever Library in standalone mode, consider the following limitations:
 
-1. **Citations Disabled**: The RAG server's citation feature will be disabled for documents ingested through standalone NV-Ingest. This is because the citation metadata requires additional processing that is handled by the full ingestor server.
+1. **Citations Disabled**: The RAG server's citation feature will be disabled for documents ingested through standalone NeMo Retriever Library. This is because the citation metadata requires additional processing that is handled by the full ingestor server.
 
 2. **No Web UI**: The standalone deployment does not include the web-based upload interface. All document ingestion must be done through Python scripts.
 
@@ -92,7 +92,7 @@ COLLECTION_NAME = "multimodal_data_nvingest"
 MILVUS_URI = "http://localhost:19530"
 MINIO_ENDPOINT = "localhost:9010"
 
-# Server Mode (Create NV-Ingest client)
+# Server Mode (Create NeMo Retriever Library client)
 client = NvIngestClient(
     message_client_hostname="localhost",
     message_client_port=7670
@@ -118,10 +118,10 @@ ingestor = ingestor.split(
             )
 
 ingestor = ingestor.embed(
-    # For self-hosted: "http://nemoretriever-embedding-ms:8000/v1"
+    # For self-hosted: "http://nemotron-embedding-ms:8000/v1"
     # For cloud (NVIDIA-hosted): "https://integrate.api.nvidia.com/v1"
-    endpoint_url="http://nemoretriever-embedding-ms:8000/v1",
-    model_name="nvidia/llama-3.2-nv-embedqa-1b-v2"
+    endpoint_url="http://nemotron-embedding-ms:8000/v1",
+    model_name="nvidia/llama-nemotron-embed-1b-v2"
 )
 
 ingestor = ingestor.vdb_upload(
diff --git a/docs/observability.md b/docs/observability.md
index 0c4bb2665..587c6a70e 100644
--- a/docs/observability.md
+++ b/docs/observability.md
@@ -45,13 +45,13 @@ Use the following procedure to enable observability with Docker.
 
 After tracing is enabled and the system is running, you can **view the traces** in **Zipkin** by opening:
 
-<p align="center">
-<img src="assets/zipkin_ui.png" width="750">
-</p>
+```{image} assets/zipkin_ui.png
+:width: 750px
+:align: center
+```
 
 Open the Zipkin UI at: **http://localhost:9411**
 
-
 ## View Metrics in Grafana
 
 As part of the tracing, the RAG service also exports metrics like API request counts, LLM prompt and completion token count and words per chunk.
@@ -104,11 +104,10 @@ After tracing is enabled and running, you can view inputs and outputs of differe
 
 3. Similarly, you can view inputs and outputs for sub stages within the workflows by clicking on a substage and finding the `traceloop.entity.input` and `traceloop.entity.ouput` rows.
 
-  <p align="center">
-  <img src="assets/zipkin_ui_labelled.png" width="750">
-  </p>
-
-
+```{image} assets/zipkin_ui_labelled.png
+:width: 750px
+:align: center
+```
 
 ## Enable Observability with Helm
 
diff --git a/docs/project.json b/docs/project.json
index 66344b5d0..9b67aad99 100644
--- a/docs/project.json
+++ b/docs/project.json
@@ -1,4 +1,4 @@
 {
     "name": "NVIDIA-RAG-blueprint",
-    "version": "2.4.0"
+    "version": "2.5.0"
 }
\ No newline at end of file
diff --git a/docs/python-client.md b/docs/python-client.md
index 73b432eaf..5c9bc33ea 100644
--- a/docs/python-client.md
+++ b/docs/python-client.md
@@ -155,12 +155,12 @@ Verify all containers are running and healthy.
 
 ```output
 NAMES                           STATUS
-nemoretriever-ranking-ms        Up ... (healthy)
+nemotron-ranking-ms             Up ... (healthy)
 compose-page-elements-1         Up ...
 compose-nemoretriever-ocr-1     Up ...
 compose-graphic-elements-1      Up ...
 compose-table-structure-1       Up ...
-nemoretriever-embedding-ms      Up ... (healthy)
+nemotron-embedding-ms           Up ... (healthy)
 nim-llm-ms                      Up ... (healthy)
 ```
 
@@ -170,32 +170,32 @@ nim-llm-ms                      Up ... (healthy)
 
 `DEPLOYMENT_MODE = "cloud"`
 
-2.  Configure NV-Ingest to use NVIDIA hosted cloud APIs using the following hosted models.
+2.  Configure NeMo Retriever Library to use NVIDIA hosted cloud APIs using the following hosted models.
 
 - os.environ["OCR_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr"
 
 - os.environ["OCR_INFER_PROTOCOL"] = "http"
 os.environ["YOLOX_HTTP_ENDPOINT"] = (
-    "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
+    "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3"
 )
 
 - os.environ["YOLOX_INFER_PROTOCOL"] = "http"
 
 - os.environ["YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT"] = (
-    "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1"
+    "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1"
 )
 
 - os.environ["YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL"] = "http"
 
 - os.environ["YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT"] = (
-    "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1"
+    "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1"
 )
 os.environ["YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL"] = "http"
 
 
-### Setup NVIDIA Ingest Runtime and Redis Service
+### Setup NeMo Retriever Library Runtime and Redis Service
 
-Use the following command to setup your NVIDIA Ingest Runtime and Redis Service.
+Use the following command to set up your NeMo Retriever Library Runtime and Redis Service.
 
 `docker compose -f ../deploy/compose/docker-compose-ingestor-server.yaml up nv-ingest-ms-runtime redis -d`
 
@@ -247,7 +247,7 @@ if DEPLOYMENT_MODE == "cloud":
     config_ingestor.llm.server_url = ""  # Empty uses NVIDIA API catalog
     config_ingestor.summarizer.server_url = ""  # Empty uses NVIDIA API catalog
 else:
-    config_ingestor.embeddings.server_url = "http://nemoretriever-embedding-ms:8000/v1"
+    config_ingestor.embeddings.server_url = "http://nemotron-embedding-ms:8000/v1"
 
 ingestor = NvidiaRAGIngestor(config=config_ingestor)
 ```
@@ -357,11 +357,11 @@ from nvidia_rag.utils.configuration import NvidiaRAGConfig
 #         "server_url": "",
 #     },
 #     "embeddings": {
-#         "model_name": "nvidia/llama-3.2-nv-embedqa-1b-v2",
+#         "model_name": "nvidia/llama-nemotron-embed-1b-v2",
 #         "server_url": "https://integrate.api.nvidia.com/v1",
 #     },
 #     "ranking": {
-#         "model_name": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
+#         "model_name": "nvidia/llama-nemotron-rerank-1b-v2",
 #         "server_url": "",
 #     },
 # })
diff --git a/docs/query_decomposition.md b/docs/query_decomposition.md
index 8f346f847..b6826d668 100644
--- a/docs/query_decomposition.md
+++ b/docs/query_decomposition.md
@@ -26,7 +26,7 @@ Each subquery is processed independently to gather comprehensive context, which
 
 ## Accuracy Improvement Example
 
-The following example that uses the [HotpotQA](https://hotpotqa.github.io/) dataset demonstrates the accuracy improvement from enabling query decomposition.
+The following example, which uses the [Google Frames](https://huggingface.co/datasets/google/frames-benchmark) benchmark, demonstrates the accuracy improvement from enabling query decomposition.
 
 ```text
 Query: I am thinking of a Ancient Roman City. The city was destroyed by volcanic eruption. The eruption occurred in the year 79 AD. The volcano was a stratovolcano. Where was the session held where it was decided that the city would be named a UNESCO world heritage site?
diff --git a/docs/readme.md b/docs/readme.md
index ec8c5cf8e..0248a4a00 100644
--- a/docs/readme.md
+++ b/docs/readme.md
@@ -147,5 +147,5 @@ After you deploy the RAG blueprint, you can customize it for your use cases.
 
 ## Blog Posts
 
-- [NVIDIA NeMo Retriever Delivers Accurate Multimodal PDF Data Extraction 15x Faster](https://developer.nvidia.com/blog/nvidia-nemo-retriever-delivers-accurate-multimodal-pdf-data-extraction-15x-faster/)
+- [NVIDIA NeMo Retriever Library Delivers Accurate Multimodal PDF Data Extraction 15x Faster](https://developer.nvidia.com/blog/nvidia-nemo-retriever-delivers-accurate-multimodal-pdf-data-extraction-15x-faster/)
 - [Finding the Best Chunking Strategy for Accurate AI Responses](https://developer.nvidia.com/blog/finding-the-best-chunking-strategy-for-accurate-ai-responses/)
diff --git a/docs/release-notes.md b/docs/release-notes.md
index 96c48a121..6dd8e6911 100644
--- a/docs/release-notes.md
+++ b/docs/release-notes.md
@@ -8,7 +8,40 @@ This documentation contains the release notes for [NVIDIA RAG Blueprint](readme.
 
 
 
-## Release 2.4.0 (26-02-TBD)
+## Release 2.5.0 (2026-03-17)
+
+This release introduces support for the Nemotron-super-3 model, updates NIMs to the latest versions, upgrades NV-Ingest, and adds continuous ingestion along with RTX 6000 MIG support.
+
+### Highlights
+
+This release includes the following key updates:
+
+- **Nemotron-super-3 model support.** You can now integrate the Nemotron-super-3 model by following the steps outlined in [Change the Inference or Embedding Model](change-model.md).
+- **NIMs updated to latest versions.** 
+  The following model updates are included:
+  - `nvidia/llama-3.2-nv-embedqa-1b-v2` → `nvidia/llama-nemotron-embed-1b-v2`
+  - `nvidia/llama-3.2-nv-rerankqa-1b-v2` → `nvidia/llama-nemotron-rerank-1b-v2`
+  - `nemoretriever-page-elements-v3` → `nemotron-page-elements-v3`
+  - `nemoretriever-graphic-elements-v1` → `nemotron-graphic-elements-v1`
+  - `nemoretriever-table-structure-v1` → `nemotron-table-structure-v1`
+  - `nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1` → `nvidia/llama-nemotron-embed-vl-1b-v2`
+- Updated NV-Ingest to [version 26.1.2](https://github.com/NVIDIA/NeMo-Retriever/releases/tag/26.1.2).
+- Added an example demonstrating the continuous ingestion pipeline. For more information, see [rag_event_ingest.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_event_ingest.ipynb).
+- **Added MIG support for RTX 6000.** For details, refer to [MIG Deployment](mig-deployment.md) and use `values-mig-rtx6000.yaml` and `mig-config-rtx6000.yaml`.
+- Added documentation for the experimental Nemotron-parse-only ingestion pipeline. This configuration allows you to perform extraction using only Nemotron Parse through NV-Ingest, without relying on OCR, page-elements, graphic-elements, or table-structure NIMs. For more information, refer to [nemotron-parse-extraction.md](nemotron-parse-extraction.md#experimental-nemotron-parse-only-extraction).
+- Several bug fixes, including frontend CVE resolutions, improved multimodal content concatenation for VLM embeddings, enhanced VDB serialization for high-concurrency parallel ingestion, and updates to observability and NeMo Guardrails configurations.
+- Added agentic skills support: the `rag-blueprint` skill enables AI coding assistants (Claude Code, Cursor, Codex, etc.) to deploy, configure, troubleshoot, and manage the RAG Blueprint autonomously. For details, refer to [RAG Blueprint Agent Skill](../skill-source/README.md).
+- Added [accuracy benchmark results](accuracy-benchmarks.md) across seven public datasets (RagBattlepacket, KG-RAG, FinanceBench, DC767, HotpotQA, Google Frames, and ViDoRe), comparing LLM and VLM configurations with reasoning on/off. Benchmarks use the NVIDIA Answer Accuracy metric from RAGAS.
+
+### Fixed Known Issues
+
+The following known issues have been resolved in this release:
+
+- Addressed frontend CVEs.
+
+- Resolved VDB indexing issues during high-concurrency batch parallel ingestion by implementing VDB serialization.
+
+## Release 2.4.0 (2026-02-20)
 
 This release adds new features to the RAG pipeline for supporting agent workflows and enhances generations with VLMs augmenting multimodal input.
 
@@ -16,10 +49,10 @@ This release adds new features to the RAG pipeline for supporting agent workflow
 
 This release contains the following key changes:
 
-- Updated NIMs and code to support  [NVIDIA Ingest 26.01 release](https://docs.nvidia.com/nemo/retriever/latest/extraction/releasenotes-nv-ingest/).
+- Updated NIMs and code to support [NeMo Retriever Library 26.01 release](https://docs.nvidia.com/nemo/retriever/latest/extraction/releasenotes-nv-ingest/).
 - Added support for non-NIM models including OpenAI, models hosted on AWS and Azure, OSS models, and others. Supported through service-specific API keys. For details, refer to [Get an API Key](api-key.md).
-- The RAG Blueprint now uses [nemoretriever-ocr-v1](https://build.nvidia.com/nvidia/nemoretriever-ocr-v1/modelcard) as the default OCR model. For details, refer to [NeMo Retriever OCR Configuration Guide](nemoretriever-ocr.md).
-- The Vision-Language Model (VLM) inference feature now uses the model [nemotron-nano-12b-v2-vl](https://build.nvidia.com/nvidia/nemotron-nano-12b-v2-vl/modelcard). For details, refer to [VLM for Generation](vlm.md).
+- The RAG Blueprint now uses [nemoretriever-ocr-v1](https://build.nvidia.com/nvidia/nemoretriever-ocr-v1/modelcard) as the default OCR model. For details, refer to [NeMo Retriever Library OCR Configuration Guide](nemoretriever-ocr.md).
+- Improved VLM-based generation support. The Vision-Language Model (VLM) inference feature now uses the model [nemotron-nano-12b-v2-vl](https://build.nvidia.com/nvidia/nemotron-nano-12b-v2-vl/modelcard). For details, refer to [VLM for Generation](vlm.md).
 - User interface improvements including catalog display, image and text query, and others. For details, refer to [User Interface](user-interface.md).
 - Added ingestion metrics endpoint support with OpenTelemetry (OTEL) for monitoring document uploads, elements ingested, and pages processed. For details, refer to [Observability](observability.md).
 - Support image and text as input query. For details, refer to [Multimodal Query Support](multimodal-query.md).
@@ -40,7 +73,7 @@ This release contains the following key changes:
   - Shallow summarization support
   - Easy model switches and dedicated configurations
   - Ease of prompt changes
-- Reserved field names `type`, `subtype`, and `location` for NV-Ingest exclusive use in metadata schemas.
+- Reserved field names `type`, `subtype`, and `location` for NeMo Retriever Library exclusive use in metadata schemas.
 - Added support for [rag_library_lite_usage.ipynb](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_lite_usage.ipynb) which demonstrates containerless deployment of the NVIDIA RAG Python package in lite mode.
 - Added example showcasing [NeMo Agent Toolkit integration](https://github.com/NVIDIA/NeMo-Agent-Toolkit) with NVIDIA RAG.
 - Added [weighted hybrid search](hybrid_search.md#weighted-hybrid-search) support with configurable weights.
@@ -77,7 +110,7 @@ The following are the known issues for the NVIDIA RAG Blueprint:
 - Optional features reflection and image captioning are not available in Helm-based deployment.
 - Currently, Helm-based deployment is not supported for [NeMo Guardrails](nemo-guardrails.md).
 - The Blueprint responses can have significant latency when using [NVIDIA API Catalog cloud hosted models](deploy-docker-nvidia-hosted.md).
-- The accuracy of the pipeline is optimized for certain file types like `.pdf`, `.txt`, `.docx`. The accuracy may be poor for other file types supported by NV-Ingest, since image captioning is disabled by default.
+- The accuracy of the pipeline is optimized for certain file types like `.pdf`, `.txt`, `.docx`. The accuracy may be poor for other file types supported by NeMo Retriever Library, since image captioning is disabled by default.
 - When updating model configurations in Kubernetes `values.yaml` (for example, changing from 70B to 8B models), the RAG UI automatically detects and displays the new model configuration from the backend. No container rebuilds are required - simply redeploy the Helm chart with updated values and refresh the UI to see the new model settings in the Settings panel.
 - The NeMo LLM microservice can take 5-6 minutes to start for every deployment.
 - B200 GPUs are not supported for the following advanced features. For these features, use H100 or A100 GPUs instead.
diff --git a/docs/retrieval-only-deployment.md b/docs/retrieval-only-deployment.md
index 3cfc5d30a..7f7f94475 100644
--- a/docs/retrieval-only-deployment.md
+++ b/docs/retrieval-only-deployment.md
@@ -88,11 +88,11 @@ Choose one of the following options based on your deployment preference.
 Instead of starting all NIMs, use the `text-embed` profile to start only the embedding and reranking services:
 
 ```bash
-USERID=$(id -u) docker compose -f deploy/compose/nims.yaml up -d nemoretriever-ranking-ms nemoretriever-embedding-ms
+USERID=$(id -u) docker compose -f deploy/compose/nims.yaml up -d nemotron-ranking-ms nemotron-embedding-ms
 ```
 
 :::{note}
-The `text-embed` profile starts only `nemoretriever-embedding-ms` and `nemoretriever-ranking-ms `, which is sufficient for retrieval operations. The LLM NIM (`nim-llm-ms`) is not started, saving significant GPU memory.
+The `text-embed` profile starts only `nemotron-embedding-ms` and `nemotron-ranking-ms`, which is sufficient for retrieval operations. The LLM NIM (`nim-llm-ms`) is not started, saving significant GPU memory.
 :::
 
 Wait for the services to become healthy:
@@ -105,8 +105,8 @@ Expected output:
 
 ```output
 NAMES                          STATUS
-nemoretriever-ranking-ms       Up 5 minutes (healthy)
-nemoretriever-embedding-ms     Up 5 minutes (healthy)
+nemotron-ranking-ms            Up 5 minutes (healthy)
+nemotron-embedding-ms          Up 5 minutes (healthy)
 ```
 
 #### Option B: NVIDIA-Hosted NIMs
@@ -308,7 +308,7 @@ This is expected behavior in retrieval-only mode. The `/generate` endpoint requi
 Check the embedding NIM logs:
 
 ```bash
-docker logs nemoretriever-embedding-ms
+docker logs nemotron-embedding-ms
 ```
 
 Ensure the model cache directory has proper permissions:
diff --git a/docs/service-port-gpu-reference.md b/docs/service-port-gpu-reference.md
index 648d1bd32..ed24b39f2 100644
--- a/docs/service-port-gpu-reference.md
+++ b/docs/service-port-gpu-reference.md
@@ -13,23 +13,23 @@ The following table provides a comprehensive reference of all services, their po
 | RAG Server | `rag-server` | 8081 | 8081 | N/A (CPU) | Main RAG API endpoint |
 | Ingestor Server | `ingestor-server` | 8082 | 8082 | N/A (CPU) | Document ingestion API |
 | RAG Frontend | `rag-frontend` | 8090 | 3000 | N/A (CPU) | Web UI |
-| NV-Ingest Runtime | `nv-ingest-ms-runtime` | 7670, 7671, 8265 | 7670, 7671, 8265 | N/A (CPU) | Main orchestrator (Ray dashboard: 8265) |
+| NeMo Retriever Library Runtime | `nv-ingest-ms-runtime` | 7670, 7671, 8265 | 7670, 7671, 8265 | N/A (CPU) | Main orchestrator (Ray dashboard: 8265) |
 
 ## NIM Microservices
 
 | Service | Container Name | Host Port(s) | Container Port(s) | Default GPU ID | Environment Variable | Notes |
 |---------|---------------|--------------|-------------------|----------------|---------------------|-------|
 | LLM | `nim-llm-ms` | 8999 | 8000 | 1 | `LLM_MS_GPU_ID` | Main language model |
-| Embedding | `nemoretriever-embedding-ms` | 9080 | 8000 | 0 | `EMBEDDING_MS_GPU_ID` | Text embeddings |
+| Embedding | `nemotron-embedding-ms` | 9080 | 8000 | 0 | `EMBEDDING_MS_GPU_ID` | Text embeddings |
 | VLM Embedding | `nemotron-vlm-embedding-ms` | 9081 | 8000 | 0 | `VLM_EMBEDDING_MS_GPU_ID` | Vision-language embeddings (opt-in, profile: vlm-embed) |
-| Ranking | `nemoretriever-ranking-ms` | 1976 | 8000 | 0 | `RANKING_MS_GPU_ID` | Reranking model |
+| Ranking | `nemotron-ranking-ms` | 1976 | 8000 | 0 | `RANKING_MS_GPU_ID` | Reranking model |
 | VLM | `nemo-vlm-microservice` | 1977 | 8000 | 5 | `VLM_MS_GPU_ID` | Vision-language model (opt-in, profile: vlm-only, vlm-generation) |
 | Nemotron Parse | `compose-nemotron-parse-1` | 8015, 8016, 8017 | 8000, 8001, 8002 | 1 | `NEMOTRON_PARSE_MS_GPU_ID` | PDF parsing (opt-in, profile: nemotron-parse) |
 | RIVA ASR | `compose-audio-1` | 8021, 8022 | 50051, 9000 | 0 | `AUDIO_MS_GPU_ID` | Audio speech recognition (opt-in, profile: audio) |
 | Page Elements | `compose-page-elements-1` | 8000, 8001, 8002 | 8000, 8001, 8002 | 0 | `YOLOX_MS_GPU_ID` | Object detection for pages |
 | Graphic Elements | `compose-graphic-elements-1` | 8003, 8004, 8005 | 8000, 8001, 8002 | 0 | `YOLOX_GRAPHICS_MS_GPU_ID` | Graphics detection |
 | Table Structure | `compose-table-structure-1` | 8006, 8007, 8008 | 8000, 8001, 8002 | 0 | `YOLOX_TABLE_MS_GPU_ID` | Table structure detection |
-| NeMo Retriever OCR | `compose-nemoretriever-ocr-1` | 8012, 8013, 8014 | 8000, 8001, 8002 | 0 | `OCR_MS_GPU_ID` | OCR service (default) |
+| NeMo Retriever Library OCR | `compose-nemoretriever-ocr-1` | 8012, 8013, 8014 | 8000, 8001, 8002 | 0 | `OCR_MS_GPU_ID` | OCR service (default) |
 
 ## Vector Database and Infrastructure
 
diff --git a/docs/support-matrix.md b/docs/support-matrix.md
index 6344822e1..07843a774 100644
--- a/docs/support-matrix.md
+++ b/docs/support-matrix.md
@@ -78,8 +78,7 @@ The following are requirements and recommendations for the individual components
 - **LLM NIM (llama-3.3-nemotron-super-49b-v1.5)** – Refer to the [Support Matrix](https://docs.nvidia.com/nim/large-language-models/latest/supported-models.html#llama-3-3-nemotron-super-49b-v1-5).
 - **Embedding NIM (Llama-3.2-NV-EmbedQA-1B-v2 )** – Refer to the [Support Matrix](https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html#llama-3-2-nv-embedqa-1b-v2).
 - **Reranking NIM (llama-3_2-nv-rerankqa-1b-v2 )**: Refer to the [Support Matrix](https://docs.nvidia.com/nim/nemo-retriever/text-reranking/latest/support-matrix.html#llama-3-2-nv-rerankqa-1b-v2).
-- **NeMo Retriever OCR (Default)**: Refer to the [Support Matrix](https://docs.nvidia.com/nim/ingestion/image-ocr/1.2.0/support-matrix.html).
-- **NVIDIA NIM for Image OCR (baidu/paddleocr - Legacy)**: Refer to the [Support Matrix](https://docs.nvidia.com/nim/ingestion/table-extraction/latest/support-matrix.html#supported-hardware).
+- **NeMo Retriever OCR (Default)**: Refer to the [Support Matrix](https://docs.nvidia.com/nim/ingestion/image-ocr/1.2.1/support-matrix.html).
 - **NVIDIA NIMs for Object Detection**:
   - NeMo Retriever Page Elements v3 [Support Matrix](https://docs.nvidia.com/nim/ingestion/object-detection/latest/support-matrix.html#nemo-retriever-page-elements-v3)
   - NeMo Retriever Graphic Elements v1 [Support Matrix](https://docs.nvidia.com/nim/ingestion/object-detection/latest/support-matrix.html#nemo-retriever-graphic-elements-v1)
diff --git a/docs/text_only_ingest.md b/docs/text_only_ingest.md
index 784c08978..c2a22afd6 100644
--- a/docs/text_only_ingest.md
+++ b/docs/text_only_ingest.md
@@ -19,7 +19,7 @@ You can enable text-only ingestion for the [NVIDIA RAG Blueprint](readme.md). Fo
    ```
 
    :::{important}
-   When disabling nv-ingest dependent services, you must set `COMPONENTS_TO_READY_CHECK=""` to ensure the nv-ingest container reaches ready state. Without this setting, nv-ingest will wait indefinitely for the disabled components.
+   When disabling NeMo Retriever Library dependent services, you must set `COMPONENTS_TO_READY_CHECK=""` to ensure the NeMo Retriever Library container reaches ready state. Without this setting, the NeMo Retriever Library container will wait indefinitely for the disabled components.
    :::
 
    Then deploy the ingestor-server:
@@ -43,8 +43,8 @@ You can enable text-only ingestion for the [NVIDIA RAG Blueprint](readme.md). Fo
    ```output
       NAMES                                   STATUS
 
-      nemoretriever-ranking-ms                Up 14 minutes (healthy)
-      nemoretriever-embedding-ms              Up 14 minutes (healthy)
+      nemotron-ranking-ms                     Up 14 minutes (healthy)
+      nemotron-embedding-ms                   Up 14 minutes (healthy)
       nim-llm-ms                              Up 14 minutes (healthy)
    ```
 
@@ -70,7 +70,7 @@ In case you are [interacting with cloud hosted models](deploy-docker-nvidia-host
    export APP_EMBEDDINGS_SERVERURL=""
    export APP_LLM_SERVERURL=""
    export APP_RANKING_SERVERURL=""
-   export YOLOX_HTTP_ENDPOINT="https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
+   export YOLOX_HTTP_ENDPOINT="https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3"
    export YOLOX_INFER_PROTOCOL="http"
    ```
 :::
@@ -113,7 +113,7 @@ Additionally, ensure that **table extraction**, **chart extraction**, and **imag
 2. Then use the modified [`values.yaml`](../deploy/helm/nvidia-blueprint-rag/values.yaml) file in your Helm upgrade command:
 
 ```bash
-helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0.tgz \
+helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0.tgz \
   --username '$oauthtoken' \
   --password "${NGC_API_KEY}" \
   --values deploy/helm/nvidia-blueprint-rag/values.yaml \
@@ -131,9 +131,9 @@ helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprin
 ```
 
 :::{important}
-**Disabling NV-Ingest Components for GPU Resource Management:**
+**Disabling NeMo Retriever Library Components for GPU Resource Management:**
 
-If you disable any nv-ingest dependent services (such as `table_structure`, `graphic_elements`, `nemoretriever_ocr_v1`, etc.) to free up GPU resources for customization, you must set the `COMPONENTS_TO_READY_CHECK` parameter to an empty string in the `nv-ingest.envVars` section of your [values.yaml](../deploy/helm/nvidia-blueprint-rag/values.yaml) file:
+If you disable any NeMo Retriever Library dependent services (such as `table_structure`, `graphic_elements`, `nemoretriever_ocr_v1`, etc.) to free up GPU resources for customization, you must set the `COMPONENTS_TO_READY_CHECK` parameter to an empty string in the `nv-ingest.envVars` section of your [values.yaml](../deploy/helm/nvidia-blueprint-rag/values.yaml) file:
 
 ```yaml
 nv-ingest:
@@ -141,6 +141,6 @@ nv-ingest:
     COMPONENTS_TO_READY_CHECK: ""
 ```
 
-This ensures the nv-ingest pod reaches ready state even when some dependent components are disabled. Without this setting, the nv-ingest pod will wait indefinitely for the disabled components to become ready.
+This ensures the NeMo Retriever Library pod reaches ready state even when some dependent components are disabled. Without this setting, the NeMo Retriever Library pod will wait indefinitely for the disabled components to become ready.
 
 :::
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 319056bf0..782176ed2 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -87,10 +87,10 @@ During first-time deployments, large models are downloaded without visible progr
 docker logs -f nim-llm-ms
 
 # Monitor embedding service
-docker logs -f nemoretriever-embedding-ms
+docker logs -f nemotron-embedding-ms
 
 # Monitor ranking service
-docker logs -f nemoretriever-ranking-ms
+docker logs -f nemotron-ranking-ms
 ```
 
 **Check disk usage to verify download progress:**
@@ -105,7 +105,7 @@ watch -n 10 'du -sh ~/.cache/model-cache/'
 **Check container stats:**
 ```bash
 # View resource usage and verify containers are active
-docker stats nim-llm-ms nemoretriever-embedding-ms nemoretriever-ranking-ms
+docker stats nim-llm-ms nemotron-embedding-ms nemotron-ranking-ms
 ```
 
 ### Kubernetes/Helm Deployments
@@ -340,7 +340,7 @@ If the above error related to dependency conflicts are seen while building conta
 We've integrated VDB and embedding creation directly into the pipeline with caching included for expediency.
 However, in a production environment, it's better to use a separately managed VDB service.
 
-NVIDIA offers optimized models and tools like NVIDIA NeMo Retriever ([build.nvidia.com/explore/retrieval](https://build.nvidia.com/explore/retrieval))
+NVIDIA offers optimized models and tools like NVIDIA NeMo Retriever Library ([build.nvidia.com/explore/retrieval](https://build.nvidia.com/explore/retrieval))
 and cuVS ([github.com/rapidsai/cuvs](https://github.com/rapidsai/cuvs)).
 
 
@@ -367,7 +367,7 @@ Adding this information may impact response accuracy, especially when partial in
 ## Helm Deployment Issues
 
 ### PVCs in Pending state (StorageClass issues)
-If NIM Cache PVCs (e.g., `nemoretriever-embedding-ms-cache-pvc`) remain in `Pending` state, check if they are requesting a `storageClassName: default` that does not exist.
+If NIM Cache PVCs (e.g., `nemotron-embedding-ms-cache-pvc`) remain in `Pending` state, check if they are requesting a `storageClassName: default` that does not exist.
 **Fix:** Ensure you have a default storage class. If using `local-path`, you can create an alias:
 ```yaml
 apiVersion: storage.k8s.io/v1
diff --git a/docs/versions1.json b/docs/versions1.json
index d0731c374..67cecce82 100644
--- a/docs/versions1.json
+++ b/docs/versions1.json
@@ -1,8 +1,8 @@
 [
     {
         "preferred": true,
-        "version": "2.4.0",
-        "url": "../2.4.0/"
+        "version": "2.5.0",
+        "url": "../2.5.0/"
     },
     {
         "version": "2.3.0",
diff --git a/docs/vlm-embed.md b/docs/vlm-embed.md
index 5b9913232..0bec55afe 100644
--- a/docs/vlm-embed.md
+++ b/docs/vlm-embed.md
@@ -153,8 +153,8 @@ To deploy the VLM embedding service with Helm, update the image and model settin
 nvidia-nim-llama-nemotron-embed-vl-1b-v2:
   enabled: true
   image:
-    repository: nvcr.io/nvidia/nemo-microservices/llama-3.2-nemoretriever-1b-vlm-embed-v1
-    tag: "1.7.0"
+    repository: nvcr.io/nim/nvidia/llama-nemotron-embed-vl-1b-v2
+    tag: "1.12.0"
 
 # Optional: disable the default text embedding NIM
 nvidia-nim-llama-32-nv-embedqa-1b-v2:
@@ -214,7 +214,7 @@ ingestor-server:
 
 nv-ingest:
   envVars:
-    # NV-Ingest runtime embedding target
+    # NeMo Retriever Library runtime embedding target
     EMBEDDING_NIM_ENDPOINT: "http://nemotron-vlm-embedding-ms:8000/v1"
     EMBEDDING_NIM_MODEL_NAME: "nvidia/llama-nemotron-embed-vl-1b-v2"
 ```
diff --git a/docs/vlm.md b/docs/vlm.md
index 4a61b2f52..64c6af176 100644
--- a/docs/vlm.md
+++ b/docs/vlm.md
@@ -124,7 +124,7 @@ Continue with [Deploy with Docker (NVIDIA-Hosted Models)](deploy-docker-nvidia-h
 ## Enable VLM with Helm
 
 :::{note}
-**GPU requirements for Helm**: VLM uses the same GPU normally assigned to LLM (GPU 1). With MIG slicing, assign a dedicated MIG slice to the VLM—see [mig-deployment.md](mig-deployment.md) and [values-mig.yaml](../deploy/helm/mig-slicing/values-mig.yaml). To run both VLM and LLM simultaneously, an additional GPU is required.
+**GPU requirements for Helm**: VLM uses the same GPU normally assigned to LLM (GPU 1). With MIG slicing, assign a dedicated MIG slice to the VLM—see [mig-deployment.md](mig-deployment.md) and [values-mig-h100.yaml](../deploy/helm/mig-slicing/values-mig-h100.yaml) or [values-mig-rtx6000.yaml](../deploy/helm/mig-slicing/values-mig-rtx6000.yaml). To run both VLM and LLM simultaneously, an additional GPU is required.
 :::
 
 1. In [values.yaml](../deploy/helm/nvidia-blueprint-rag/values.yaml), under the `rag-server` `envVars` section, set:
diff --git a/examples/README.md b/examples/README.md
index 0d56c4781..ba86d6765 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -8,6 +8,7 @@ This directory contains example integrations and extensions for NVIDIA RAG.
 |---------|-------------|---------------|
 | [rag_react_agent](./rag_react_agent/) | Integration with [NeMo Agent Toolkit (NAT)](https://github.com/NVIDIA/NeMo-Agent-Toolkit) providing RAG query and search capabilities for agent workflows | [README](./rag_react_agent/README.md) |
 | [nvidia_rag_mcp](./nvidia_rag_mcp/) | MCP (Model Context Protocol) server and client for exposing NVIDIA RAG capabilities to MCP-compatible applications | [Documentation](../docs/mcp.md) |
+| [rag_event_ingest](./rag_event_ingest/) | Automated document ingestion from object storage (MinIO) via Kafka | [Notebook](../notebooks/rag_event_ingest.ipynb) |
 
 ## rag_react_agent
 
@@ -27,3 +28,14 @@ This example provides an MCP server and client that exposes NVIDIA RAG and Inges
 - Manage collections and documents in the vector database
 
 See the [MCP documentation](../docs/mcp.md) for detailed setup and usage instructions.
+
+## rag_event_ingest
+
+This example deploys an event-driven ingestion pipeline that monitors MinIO object storage for new file uploads via Kafka events. Documents are automatically indexed through the RAG Ingestor and become queryable through the RAG Agent.
+
+Components:
+- **kafka_consumer/** - Event-driven consumer that routes files to RAG based on file type
+- **deploy/** - Docker Compose for Kafka, MinIO, and the consumer
+- **data/** - Sample documents for testing
+
+See the [notebook](../notebooks/rag_event_ingest.ipynb) for step-by-step deployment and testing.
diff --git a/examples/rag_event_ingest/data/documents/Seahawks-Patriots in Super Bowl LX_ What We Learned from Seattle's 29-13 win.pdf b/examples/rag_event_ingest/data/documents/Seahawks-Patriots in Super Bowl LX_ What We Learned from Seattle's 29-13 win.pdf
new file mode 100644
index 000000000..3d750564d
Binary files /dev/null and b/examples/rag_event_ingest/data/documents/Seahawks-Patriots in Super Bowl LX_ What We Learned from Seattle's 29-13 win.pdf differ
diff --git a/examples/rag_event_ingest/data/videos/Seattle Seahawks vs New England Patriots - Super Bowl LX Game Highlights.mp4 b/examples/rag_event_ingest/data/videos/Seattle Seahawks vs New England Patriots - Super Bowl LX Game Highlights.mp4
new file mode 100644
index 000000000..164dc505a
--- /dev/null
+++ b/examples/rag_event_ingest/data/videos/Seattle Seahawks vs New England Patriots - Super Bowl LX Game Highlights.mp4	
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:618e8d92f34e1a7c3b5ea139b49bce1cf1d00eb4f15fd1963ee53ea8302f6c70
+size 83123435
diff --git a/examples/rag_event_ingest/deploy/docker-compose.yaml b/examples/rag_event_ingest/deploy/docker-compose.yaml
new file mode 100644
index 000000000..05e5bbc0f
--- /dev/null
+++ b/examples/rag_event_ingest/deploy/docker-compose.yaml
@@ -0,0 +1,164 @@
+# AIDP - AI Data Pipeline Docker Compose
+# Event-driven document ingestion with Kafka + MinIO sources
+#
+# Usage:
+#   docker compose -f docker-compose.yaml up -d
+#
+# Prerequisites:
+#   - RAG stack running (from launchable.ipynb)
+#   - nvidia-rag network exists
+
+services:
+  # =============================================================================
+  # KAFKA STACK (KRaft - no Zookeeper needed)
+  # =============================================================================
+  # Single-node Kafka broker + controller (KRaft mode).
+  kafka:
+    image: apache/kafka:latest
+    container_name: kafka
+    restart: unless-stopped
+    ports:
+      # 9092 is advertised as kafka:9092 and is meant for clients on the
+      # nvidia-rag network; clients on the host should use the EXTERNAL
+      # listener on 9094 (advertised as ${HOST_IP:-localhost}:9094).
+      - "9092:9092"
+      - "9094:9094"
+    environment:
+      - KAFKA_NODE_ID=1
+      - KAFKA_PROCESS_ROLES=broker,controller
+      - KAFKA_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093,EXTERNAL://:9094
+      - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092,EXTERNAL://${HOST_IP:-localhost}:9094
+      - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT
+      - KAFKA_CONTROLLER_QUORUM_VOTERS=1@kafka:9093
+      - KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER
+      - KAFKA_INTER_BROKER_LISTENER_NAME=PLAINTEXT
+      # Replication settings of 1 because there is only one broker
+      - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
+      - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1
+      - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1
+      - KAFKA_AUTO_CREATE_TOPICS_ENABLE=true
+      - KAFKA_LOG_RETENTION_HOURS=168
+      # Point log.dirs at the kafka-data volume mounted below. Without this
+      # the image is expected to fall back to its default log directory
+      # (under /tmp), leaving the volume unused and topic data non-persistent.
+      - KAFKA_LOG_DIRS=/var/lib/kafka/data
+      - CLUSTER_ID=MkU3OEVBNTcwNTJENDM2Qk
+    volumes:
+      - kafka-data:/var/lib/kafka/data
+    networks:
+      - nvidia-rag
+    healthcheck:
+      # Healthy once the broker answers a metadata request on the internal listener
+      test: ["CMD-SHELL", "/opt/kafka/bin/kafka-topics.sh --bootstrap-server localhost:9092 --list || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+
+  # Web UI for browsing topics and messages on the broker (host port 8080)
+  kafka-ui:
+    container_name: aidp-kafka-ui
+    image: provectuslabs/kafka-ui:latest
+    networks:
+      - nvidia-rag
+    ports:
+      - "8080:8080"
+    environment:
+      - KAFKA_CLUSTERS_0_NAME=aidp-cluster
+      - KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS=kafka:9092
+    # Start only after the broker's healthcheck passes
+    depends_on:
+      kafka:
+        condition: service_healthy
+
+  # =============================================================================
+  # MINIO (Data Source)
+  # =============================================================================
+  minio-source-1:
+    image: minio/minio:RELEASE.2024-01-18T22-51-28Z
+    container_name: aidp-minio
+    command: server /data --console-address ":9001"
+    environment:
+      # Root credentials follow the same MINIO_ACCESS_KEY / MINIO_SECRET_KEY
+      # overrides that kafka-consumer and minio-mc use, so overriding them
+      # keeps every service in agreement (defaults are unchanged).
+      MINIO_ROOT_USER: "${MINIO_ACCESS_KEY:-minioadmin}"
+      MINIO_ROOT_PASSWORD: "${MINIO_SECRET_KEY:-minioadmin}"
+      # Kafka notification target "AIDP" (referenced by the ARN in minio-mc).
+      # The topic follows the KAFKA_TOPIC override read by kafka-consumer.
+      MINIO_NOTIFY_KAFKA_ENABLE_AIDP: "on"
+      MINIO_NOTIFY_KAFKA_BROKERS_AIDP: "kafka:9092"
+      MINIO_NOTIFY_KAFKA_TOPIC_AIDP: "${KAFKA_TOPIC:-aidp-topic}"
+    volumes:
+      - minio-data:/data
+    ports:
+      # host 9201 -> S3 API, host 9211 -> web console
+      - "9201:9000"
+      - "9211:9001"
+    networks:
+      - nvidia-rag
+    healthcheck:
+      test: ["CMD", "mc", "ready", "local"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # MinIO MC for bucket setup: creates aidp-bucket and subscribes it to the
+  # AIDP Kafka notification target, then idles so `docker exec` can run mc.
+  minio-mc:
+    image: minio/mc:latest
+    container_name: aidp-minio-mc
+    depends_on:
+      minio-source-1:
+        condition: service_healthy
+      kafka:
+        condition: service_healthy
+    # ${...} below is substituted by Compose before the shell runs, using the
+    # same variables and defaults as minio-source-1.
+    entrypoint: >
+      /bin/sh -c "
+      echo 'Waiting for MinIO...';
+      sleep 5;
+
+      echo 'Setting up MinIO...';
+      mc alias set minio http://minio-source-1:9000 ${MINIO_ACCESS_KEY:-minioadmin} ${MINIO_SECRET_KEY:-minioadmin};
+      mc mb --ignore-existing minio/aidp-bucket;
+      mc event add minio/aidp-bucket arn:minio:sqs::AIDP:kafka --event put,delete || true;
+
+      echo 'MinIO setup complete!';
+      echo 'Bucket: aidp-bucket on minio-source-1';
+
+      echo 'Keeping container alive for mc commands...';
+      tail -f /dev/null
+      "
+    networks:
+      - nvidia-rag
+
+  # =============================================================================
+  # KAFKA CONSUMER (Event-driven Ingestion)
+  # =============================================================================
+  # Consumes bucket notifications from Kafka and forwards files to the ingestor
+  kafka-consumer:
+    image: kafka-consumer:local
+    build:
+      context: ../kafka_consumer
+      dockerfile: Dockerfile
+    container_name: kafka-consumer
+    restart: unless-stopped
+    networks:
+      - nvidia-rag
+    depends_on:
+      minio-source-1:
+        condition: service_healthy
+      kafka:
+        condition: service_healthy
+    environment:
+      # Kafka
+      KAFKA_BOOTSTRAP_SERVERS: "kafka:9092"
+      KAFKA_TOPIC: "${KAFKA_TOPIC:-aidp-topic}"
+      CONSUMER_GROUP: "${CONSUMER_GROUP:-nvingest-consumer-group}"
+      # MinIO
+      MINIO_ENDPOINT: "minio-source-1:9000"
+      MINIO_ACCESS_KEY: "${MINIO_ACCESS_KEY:-minioadmin}"
+      MINIO_SECRET_KEY: "${MINIO_SECRET_KEY:-minioadmin}"
+      MINIO_SECURE: "false"
+      # RAG Ingestor
+      INGESTOR_SERVER_URL: "${INGESTOR_SERVER_URL:-http://ingestor-server:8082}"
+      COLLECTION_NAME: "${COLLECTION_NAME:-aidp_bucket}"
+      # Logging
+      LOG_LEVEL: "${LOG_LEVEL:-INFO}"
+
+# =============================================================================
+# VOLUMES
+# =============================================================================
+# Named volumes mounted by the kafka (kafka-data) and minio-source-1
+# (minio-data) services above.
+volumes:
+  kafka-data:
+    driver: local
+  minio-data:
+    driver: local
+
+# =============================================================================
+# NETWORKS
+# =============================================================================
+networks:
+  # Declared external: the network must already exist (see the prerequisites
+  # at the top of this file); this file does not create it.
+  nvidia-rag:
+    external: true
+    name: nvidia-rag
diff --git a/examples/rag_event_ingest/kafka_consumer/Dockerfile b/examples/rag_event_ingest/kafka_consumer/Dockerfile
new file mode 100644
index 000000000..d1ff4fc62
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/Dockerfile
@@ -0,0 +1,12 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Dependency layer: requirements.txt is copied and installed on its own so
+# this layer is reused until the requirements change
+COPY requirements.txt ./
+RUN pip install --no-cache-dir --requirement requirements.txt
+
+# Application source (destination is the WORKDIR, /app)
+COPY . .
+
+# -u keeps stdout/stderr unbuffered
+CMD ["python", "-u", "main.py"]
diff --git a/examples/rag_event_ingest/kafka_consumer/config/__init__.py b/examples/rag_event_ingest/kafka_consumer/config/__init__.py
new file mode 100644
index 000000000..6bd140441
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/config/__init__.py
@@ -0,0 +1,150 @@
+# config/__init__.py
+"""Configuration package for Kafka MinIO Consumer.
+
+Usage:
+    import config.settings as cfg
+    print(cfg.INGESTOR_SERVER_URL)
+    
+    from config.constants import DOCUMENT_EXTENSIONS, DEST_RAG
+"""
+
+# Settings (env vars)
+from .settings import (
+    # Kafka
+    KAFKA_BOOTSTRAP_SERVERS,
+    KAFKA_CONSUMER_GROUP,
+    KAFKA_TOPIC,
+    KAFKA_AUTO_OFFSET_RESET,
+    KAFKA_MAX_POLL_RECORDS,
+    KAFKA_MAX_POLL_INTERVAL_MS,
+    KAFKA_SESSION_TIMEOUT_MS,
+    KAFKA_HEARTBEAT_INTERVAL_MS,
+    # Services
+    INGESTOR_SERVER_URL,
+    INGESTOR_TIMEOUT,
+    # MinIO
+    MINIO_ENDPOINT,
+    MINIO_ACCESS_KEY,
+    MINIO_SECRET_KEY,
+    MINIO_SECURE,
+    MINIO_DEFAULT_COLLECTION,
+    MINIO_SOURCES,
+    # Features
+    ENABLE_IMAGE_PROCESSING,
+    ENABLE_AUDIO_PROCESSING,
+    # Collection
+    EMBEDDING_DIMENSION,
+    CHUNK_SIZE,
+    CHUNK_OVERLAP,
+    # Logging
+    LOG_LEVEL,
+    LOG_FORMAT,
+    # History
+    HISTORY_FILE,
+    # API Endpoints (configurable via env)
+    API_INGESTOR_DOCUMENTS,
+    API_INGESTOR_COLLECTIONS,
+    API_INGESTOR_COLLECTION,
+    API_INGESTOR_STATUS,
+)
+
+# Constants
+from .constants import (
+    # File extensions
+    DOCUMENT_EXTENSIONS,
+    IMAGE_EXTENSIONS,
+    AUDIO_EXTENSIONS,
+    SKIP_EXTENSIONS,
+    # Content types
+    CONTENT_TYPE_MAP,
+    DEFAULT_CONTENT_TYPE,
+    # Routing
+    DEST_RAG,
+    DEST_SKIP,
+    DEST_UNKNOWN,
+    # S3 Event fields
+    EVENT_NAME,
+    EVENT_RECORDS,
+    EVENT_S3,
+    EVENT_BUCKET,
+    EVENT_OBJECT,
+    EVENT_KEY,
+    EVENT_SIZE,
+    EVENT_ETAG,
+    EVENT_NAME_FIELD,
+    EVENT_FIRST_RECORD_INDEX,
+    EVENT_PREFIX_CREATED,
+    EVENT_PREFIX_REMOVED,
+    EVENT_TYPE_CREATE,
+    EVENT_TYPE_DELETE,
+    # Record field names (dataclass attributes)
+    FIELD_FILE_NAME,
+    FIELD_BUCKET,
+    FIELD_COLLECTION,
+    FIELD_STATUS,
+    FIELD_START_TIME,
+    FIELD_END_TIME,
+    FIELD_DURATION_SECONDS,
+    FIELD_ERROR_MESSAGE,
+    FIELD_TASK_ID,
+    # Record serialization output keys
+    RECORD_FILE_NAME,
+    RECORD_BUCKET,
+    RECORD_COLLECTION,
+    RECORD_START_TIME,
+    RECORD_END_TIME,
+    RECORD_DURATION,
+    RECORD_STATUS,
+    RECORD_ERROR,
+    RECORD_TASK_ID,
+    # Status
+    STATUS_PENDING,
+    STATUS_PROCESSING,
+    STATUS_FINISHED,
+    STATUS_FAILED,
+    STATUS_SKIPPED,
+    STATUS_DELETED,
+    STATUS_SUCCESS,
+    # Config keys (MinIO sources)
+    CFG_ENDPOINT,
+    CFG_ACCESS,
+    CFG_SECRET,
+    CFG_SECURE,
+    CFG_COLLECTION,
+    CFG_BUCKETS,
+    # API request fields (Ingestor)
+    FIELD_COLLECTION_NAME,
+    FIELD_BLOCKING,
+    FIELD_SPLIT_OPTIONS,
+    FIELD_CHUNK_SIZE,
+    FIELD_CHUNK_OVERLAP,
+    FIELD_GENERATE_SUMMARY,
+    FIELD_EMBEDDING_DIMENSION,
+    FIELD_TASK_ID,
+    # API response fields
+    RESP_MESSAGE,
+    RESP_ERROR,
+    RESP_COLLECTIONS,
+    RESP_TASK_ID,
+    RESP_STATE,
+    RESP_RESULT,
+    RESP_FAILED_DOCUMENTS,
+    RESP_VALIDATION_ERRORS,
+    # Timeouts
+    TIMEOUT_DEFAULT,
+    TIMEOUT_UPLOAD,
+    TIMEOUT_TASK_CHECK,
+    TIMEOUT_MAX_TASK_WAIT,
+    # Kafka defaults
+    KAFKA_DEFAULT_TOPIC,
+    KAFKA_DEFAULT_CONSUMER_GROUP,
+    KAFKA_DEFAULT_AUTO_OFFSET_RESET,
+    KAFKA_DEFAULT_MAX_POLL_RECORDS,
+    KAFKA_DEFAULT_MAX_POLL_INTERVAL_MS,
+    KAFKA_DEFAULT_SESSION_TIMEOUT_MS,
+    KAFKA_DEFAULT_HEARTBEAT_INTERVAL_MS,
+    # Collection defaults
+    COLLECTION_EMBEDDING_DIMENSION,
+    COLLECTION_CHUNK_SIZE,
+    COLLECTION_CHUNK_OVERLAP,
+)
diff --git a/examples/rag_event_ingest/kafka_consumer/config/constants.py b/examples/rag_event_ingest/kafka_consumer/config/constants.py
new file mode 100644
index 000000000..050cf70c4
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/config/constants.py
@@ -0,0 +1,296 @@
+# config/constants.py
+"""Static constants that don't change at runtime.
+
+For configurable values from environment, see settings.py
+"""
+
+# ==================== File Extensions ====================
+
+# Immutable sets of file-name suffixes, each including the leading dot.
+# Which set a file falls into is decided by the router (not in this module).
+DOCUMENT_EXTENSIONS = frozenset({
+    '.pdf', '.docx', '.doc', '.txt', '.md', '.rst',
+    '.html', '.htm', '.pptx', '.ppt', '.xlsx', '.xls',
+    '.csv', '.json', '.xml'
+})
+
+IMAGE_EXTENSIONS = frozenset({
+    '.jpg', '.jpeg', '.png', '.gif', 
+    '.webp', '.bmp', '.tiff', '.svg'
+})
+
+AUDIO_EXTENSIONS = frozenset({
+    '.mp3', '.wav', '.flac', '.aac', 
+    '.ogg', '.m4a', '.wma'
+})
+
+# NOTE(review): '.DS_Store', '.gitkeep' and '.gitignore' are complete file
+# names rather than suffixes, and '.DS_Store' is mixed-case. If the router
+# derives the extension with os.path.splitext / Path.suffix (both return an
+# empty extension for such dotfiles) or lower-cases it first, these entries
+# will never match - confirm against the router implementation.
+SKIP_EXTENSIONS = frozenset({
+    '.tmp', '.log', '.bak', '.swp', '.DS_Store',
+    '.gitkeep', '.gitignore'
+})
+
+
+# ==================== Content Types ====================
+
+# Maps a lowercase extension (with leading dot) to a MIME type. It covers the
+# document, image and audio extensions listed above; there are no video entries.
+# Presumably used as the Content-Type when a file is uploaded - confirm
+# against the caller.
+CONTENT_TYPE_MAP = {
+    # Documents
+    '.pdf': 'application/pdf',
+    '.txt': 'text/plain',
+    '.doc': 'application/msword',
+    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    '.html': 'text/html',
+    '.htm': 'text/html',
+    '.xml': 'application/xml',
+    '.json': 'application/json',
+    '.csv': 'text/csv',
+    '.md': 'text/markdown',
+    '.rst': 'text/x-rst',
+    '.ppt': 'application/vnd.ms-powerpoint',
+    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+    '.xls': 'application/vnd.ms-excel',
+    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    # Images
+    '.jpg': 'image/jpeg',
+    '.jpeg': 'image/jpeg',
+    '.png': 'image/png',
+    '.gif': 'image/gif',
+    '.webp': 'image/webp',
+    '.bmp': 'image/bmp',
+    '.tiff': 'image/tiff',
+    '.svg': 'image/svg+xml',
+    # Audio
+    '.mp3': 'audio/mpeg',
+    '.wav': 'audio/wav',
+    '.flac': 'audio/flac',
+    '.aac': 'audio/aac',
+    '.ogg': 'audio/ogg',
+    '.m4a': 'audio/mp4',
+    '.wma': 'audio/x-ms-wma',
+}
+
+# Generic binary type; presumably the fallback for extensions missing from
+# CONTENT_TYPE_MAP - confirm against the caller.
+DEFAULT_CONTENT_TYPE = 'application/octet-stream'
+
+
+# ==================== Routing ====================
+
+# Destinations
+# consumer.py reads the destination from route_info[KEY_DESTINATION] and uses
+# it as the key into its handler registry; DEST_SKIP short-circuits handling.
+DEST_RAG = 'rag'
+DEST_SKIP = 'skip'
+DEST_UNKNOWN = 'unknown'
+
+# Route result keys
+# Keys of the dict returned by the router's route() call.
+KEY_DESTINATION = 'destination'
+KEY_FILE_TYPE = 'file_type'
+KEY_EXTENSION = 'extension'
+KEY_REASON = 'reason'
+
+# File types
+FILE_TYPE_DOCUMENT = 'document'
+FILE_TYPE_IMAGE = 'image'
+FILE_TYPE_AUDIO = 'audio'
+FILE_TYPE_SKIP = 'skip'
+FILE_TYPE_UNKNOWN = 'unknown'
+
+# Config keys
+# Presumably keys of an optional router configuration dict - confirm against
+# the router implementation.
+CFG_DOCUMENT_EXTENSIONS = 'document_extensions'
+CFG_IMAGE_EXTENSIONS = 'image_extensions'
+CFG_AUDIO_EXTENSIONS = 'audio_extensions'
+CFG_SKIP_EXTENSIONS = 'skip_extensions'
+CFG_ENABLE_IMAGE_PROCESSING = 'enable_image_processing'
+CFG_ENABLE_AUDIO_PROCESSING = 'enable_audio_processing'
+
+
+# ==================== S3 Event Fields ====================
+
+# Kafka S3 event structure
+# Key names used to walk the notification JSON delivered through Kafka.
+# NOTE(review): presumably parsed in models/events.py as
+# Records[i].s3.bucket.name and Records[i].s3.object.{key,size,eTag} - confirm.
+EVENT_NAME = 'EventName'
+EVENT_RECORDS = 'Records'
+EVENT_FIRST_RECORD_INDEX = 0  # S3 events typically contain single record
+EVENT_S3 = 's3'
+EVENT_BUCKET = 'bucket'
+EVENT_OBJECT = 'object'
+EVENT_KEY = 'key'
+EVENT_SIZE = 'size'
+EVENT_ETAG = 'eTag'
+EVENT_NAME_FIELD = 'name'
+
+# Event type prefixes
+# Prefixes of the event name, e.g. an event name beginning 's3:ObjectCreated:'.
+EVENT_PREFIX_CREATED = 's3:ObjectCreated:'
+EVENT_PREFIX_REMOVED = 's3:ObjectRemoved:'
+
+# Event type values
+# Normalized event kinds; consumer.py compares event.event_type to 'delete'.
+EVENT_TYPE_CREATE = 'create'
+EVENT_TYPE_DELETE = 'delete'
+
+
+# ==================== Record Fields ====================
+
+# IngestionRecord field names (dataclass attributes)
+# These match the keyword arguments consumer.py passes to IngestionRecord(...).
+FIELD_FILE_NAME = 'file_name'
+FIELD_BUCKET = 'bucket'
+FIELD_COLLECTION = 'collection'
+FIELD_STATUS = 'status'
+FIELD_START_TIME = 'start_time'
+FIELD_END_TIME = 'end_time'
+FIELD_DURATION_SECONDS = 'duration_seconds'
+FIELD_ERROR_MESSAGE = 'error_message'
+FIELD_TASK_ID = 'task_id'
+
+# IngestionRecord serialization output keys
+# Aliases of the FIELD_* names above, so each serialized key currently equals
+# the attribute name.
+RECORD_FILE_NAME = FIELD_FILE_NAME
+RECORD_BUCKET = FIELD_BUCKET
+RECORD_COLLECTION = FIELD_COLLECTION
+RECORD_START_TIME = FIELD_START_TIME
+RECORD_END_TIME = FIELD_END_TIME
+RECORD_DURATION = FIELD_DURATION_SECONDS
+RECORD_STATUS = FIELD_STATUS
+RECORD_ERROR = FIELD_ERROR_MESSAGE
+RECORD_TASK_ID = FIELD_TASK_ID
+
+
+# ==================== Task Status ====================
+
+# Status strings stored in ingestion records. consumer.py treats 'SUCCESS',
+# 'DELETED' and 'SKIPPED' as successful outcomes in its summary log line.
+STATUS_PENDING = 'PENDING'
+STATUS_PROCESSING = 'PROCESSING'
+STATUS_FINISHED = 'FINISHED'
+STATUS_FAILED = 'FAILED'
+STATUS_SKIPPED = 'SKIPPED'
+STATUS_DELETED = 'DELETED'
+STATUS_SUCCESS = 'SUCCESS'
+
+
+# ==================== Config Keys ====================
+
+# MinIO/S3 source config keys
+# Presumably the keys of each entry in the MINIO_SOURCES JSON setting -
+# confirm against the storage service that parses it.
+CFG_ENDPOINT = 'endpoint'
+CFG_ACCESS = 'access'
+CFG_SECRET = 'secret'
+CFG_SECURE = 'secure'
+CFG_COLLECTION = 'collection'
+CFG_BUCKETS = 'buckets'
+
+
+# ==================== API Request Fields ====================
+
+# Ingestor request fields
+# Names of the fields sent to the ingestor API.
+FIELD_COLLECTION_NAME = 'collection_name'
+FIELD_BLOCKING = 'blocking'
+FIELD_SPLIT_OPTIONS = 'split_options'
+FIELD_CHUNK_SIZE = 'chunk_size'
+FIELD_CHUNK_OVERLAP = 'chunk_overlap'
+FIELD_GENERATE_SUMMARY = 'generate_summary'
+FIELD_EMBEDDING_DIMENSION = 'embedding_dimension'
+# The 'task_id' request field reuses FIELD_TASK_ID from the Record Fields
+# section above; it is not assigned a second time here so the name has a
+# single definition.
+
+
+# ==================== API Response Fields ====================
+
+# Common response fields
+# Key names looked up in JSON responses from the services this consumer calls.
+RESP_CONTENT = 'content'
+RESP_RESPONSE = 'response'
+RESP_TEXT = 'text'
+RESP_CHOICES = 'choices'
+RESP_MESSAGE = 'message'
+RESP_ERROR = 'error'
+
+# Ingestor response fields
+RESP_COLLECTIONS = 'collections'
+RESP_TASK_ID = 'task_id'
+RESP_STATE = 'state'
+RESP_RESULT = 'result'
+RESP_FAILED_DOCUMENTS = 'failed_documents'
+RESP_VALIDATION_ERRORS = 'validation_errors'
+
+
+# ==================== Timeouts (seconds) ====================
+
+TIMEOUT_DEFAULT = 30
+# Also the default for the INGESTOR_TIMEOUT setting (see settings.py)
+TIMEOUT_UPLOAD = 600
+TIMEOUT_TASK_CHECK = 30
+TIMEOUT_MAX_TASK_WAIT = 300
+
+
+# ==================== Kafka Defaults ====================
+
+# Fallbacks used by settings.py when the matching environment variable is
+# unset. The heartbeat interval is one third of the session timeout.
+KAFKA_DEFAULT_TOPIC = 'aidp-topic'
+KAFKA_DEFAULT_CONSUMER_GROUP = 'nvingest-consumer-group'
+KAFKA_DEFAULT_AUTO_OFFSET_RESET = 'earliest'
+KAFKA_DEFAULT_MAX_POLL_RECORDS = 1
+KAFKA_DEFAULT_MAX_POLL_INTERVAL_MS = 600000   # 10 min
+KAFKA_DEFAULT_SESSION_TIMEOUT_MS = 60000      # 60s
+KAFKA_DEFAULT_HEARTBEAT_INTERVAL_MS = 20000   # 20s
+
+
+# ==================== Collection Defaults ====================
+
+# Fallbacks for EMBEDDING_DIMENSION, CHUNK_SIZE and CHUNK_OVERLAP in settings.py.
+# NOTE(review): units of chunk size/overlap (tokens vs characters) are defined
+# by the ingestor, not here - confirm before tuning.
+COLLECTION_EMBEDDING_DIMENSION = 2048
+COLLECTION_CHUNK_SIZE = 512
+COLLECTION_CHUNK_OVERLAP = 150
+
+
+# ==================== Environment Variable Keys ====================
+
+# Names of the environment variables that settings.py reads. Each ENV_*
+# constant holds the variable name, not its value.
+
+# Kafka
+ENV_KAFKA_BOOTSTRAP_SERVERS = 'KAFKA_BOOTSTRAP_SERVERS'
+ENV_KAFKA_TOPIC = 'KAFKA_TOPIC'
+ENV_CONSUMER_GROUP = 'CONSUMER_GROUP'
+ENV_KAFKA_AUTO_OFFSET_RESET = 'KAFKA_AUTO_OFFSET_RESET'
+ENV_KAFKA_MAX_POLL_RECORDS = 'KAFKA_MAX_POLL_RECORDS'
+ENV_KAFKA_MAX_POLL_INTERVAL_MS = 'KAFKA_MAX_POLL_INTERVAL_MS'
+ENV_KAFKA_SESSION_TIMEOUT_MS = 'KAFKA_SESSION_TIMEOUT_MS'
+ENV_KAFKA_HEARTBEAT_INTERVAL_MS = 'KAFKA_HEARTBEAT_INTERVAL_MS'
+
+# Service URLs
+ENV_INGESTOR_SERVER_URL = 'INGESTOR_SERVER_URL'
+ENV_INGESTOR_TIMEOUT = 'INGESTOR_TIMEOUT'
+
+# API Endpoints
+ENV_API_INGESTOR_DOCUMENTS = 'API_INGESTOR_DOCUMENTS'
+ENV_API_INGESTOR_COLLECTIONS = 'API_INGESTOR_COLLECTIONS'
+ENV_API_INGESTOR_COLLECTION = 'API_INGESTOR_COLLECTION'
+ENV_API_INGESTOR_STATUS = 'API_INGESTOR_STATUS'
+
+# MinIO
+ENV_MINIO_ENDPOINT = 'MINIO_ENDPOINT'
+ENV_MINIO_ACCESS_KEY = 'MINIO_ACCESS_KEY'
+ENV_MINIO_SECRET_KEY = 'MINIO_SECRET_KEY'
+ENV_MINIO_SECURE = 'MINIO_SECURE'
+# Read into the MINIO_DEFAULT_COLLECTION setting (see settings.py)
+ENV_COLLECTION_NAME = 'COLLECTION_NAME'
+ENV_MINIO_SOURCES = 'MINIO_SOURCES'
+
+# Feature Flags
+ENV_ENABLE_IMAGE_PROCESSING = 'ENABLE_IMAGE_PROCESSING'
+ENV_ENABLE_AUDIO_PROCESSING = 'ENABLE_AUDIO_PROCESSING'
+
+# Collection Settings
+ENV_EMBEDDING_DIMENSION = 'EMBEDDING_DIMENSION'
+ENV_CHUNK_SIZE = 'CHUNK_SIZE'
+ENV_CHUNK_OVERLAP = 'CHUNK_OVERLAP'
+
+# Logging
+ENV_LOG_LEVEL = 'LOG_LEVEL'
+ENV_LOG_FORMAT = 'LOG_FORMAT'
+
+# History
+ENV_HISTORY_FILE = 'HISTORY_FILE'
+
+# ==================== API Endpoint Defaults ====================
+
+# Default URL paths; presumably appended to INGESTOR_SERVER_URL by the client -
+# confirm against the ingestor service wrapper.
+DEFAULT_API_INGESTOR_DOCUMENTS = '/v1/documents'
+DEFAULT_API_INGESTOR_COLLECTIONS = '/v1/collections'
+DEFAULT_API_INGESTOR_COLLECTION = '/v1/collection'
+DEFAULT_API_INGESTOR_STATUS = '/v1/status'
+
+
+# ==================== Logging Defaults ====================
+
+DEFAULT_LOG_LEVEL = 'INFO'
+DEFAULT_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+
+# ==================== History Defaults ====================
+
+# Same path as the default of KafkaEventConsumer's history_file parameter
+DEFAULT_HISTORY_FILE = '/tmp/ingestion_history.jsonl'
+
+
+# ==================== MinIO Defaults ====================
+
+# Used when COLLECTION_NAME is unset. NOTE(review): deploy/docker-compose.yaml
+# always sets COLLECTION_NAME (default 'aidp_bucket'), so this fallback only
+# applies when the consumer runs outside that compose file.
+DEFAULT_COLLECTION_NAME = 'multimodal_data'
diff --git a/examples/rag_event_ingest/kafka_consumer/config/settings.py b/examples/rag_event_ingest/kafka_consumer/config/settings.py
new file mode 100644
index 000000000..bbfe4824c
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/config/settings.py
@@ -0,0 +1,135 @@
+# config/settings.py
+"""Runtime settings loaded from environment variables."""
+
+import os
+
+from .constants import (
+    # Default values
+    KAFKA_DEFAULT_TOPIC,
+    KAFKA_DEFAULT_CONSUMER_GROUP,
+    KAFKA_DEFAULT_AUTO_OFFSET_RESET,
+    KAFKA_DEFAULT_MAX_POLL_RECORDS,
+    KAFKA_DEFAULT_MAX_POLL_INTERVAL_MS,
+    KAFKA_DEFAULT_SESSION_TIMEOUT_MS,
+    KAFKA_DEFAULT_HEARTBEAT_INTERVAL_MS,
+    TIMEOUT_UPLOAD,
+    COLLECTION_EMBEDDING_DIMENSION,
+    COLLECTION_CHUNK_SIZE,
+    COLLECTION_CHUNK_OVERLAP,
+    # API Endpoint defaults
+    DEFAULT_API_INGESTOR_DOCUMENTS,
+    DEFAULT_API_INGESTOR_COLLECTIONS,
+    DEFAULT_API_INGESTOR_COLLECTION,
+    DEFAULT_API_INGESTOR_STATUS,
+    # Logging defaults
+    DEFAULT_LOG_LEVEL,
+    DEFAULT_LOG_FORMAT,
+    # History defaults
+    DEFAULT_HISTORY_FILE,
+    # MinIO defaults
+    DEFAULT_COLLECTION_NAME,
+    # Environment variable keys
+    ENV_KAFKA_BOOTSTRAP_SERVERS,
+    ENV_KAFKA_TOPIC,
+    ENV_CONSUMER_GROUP,
+    ENV_KAFKA_AUTO_OFFSET_RESET,
+    ENV_KAFKA_MAX_POLL_RECORDS,
+    ENV_KAFKA_MAX_POLL_INTERVAL_MS,
+    ENV_KAFKA_SESSION_TIMEOUT_MS,
+    ENV_KAFKA_HEARTBEAT_INTERVAL_MS,
+    ENV_INGESTOR_SERVER_URL,
+    ENV_INGESTOR_TIMEOUT,
+    ENV_API_INGESTOR_DOCUMENTS,
+    ENV_API_INGESTOR_COLLECTIONS,
+    ENV_API_INGESTOR_COLLECTION,
+    ENV_API_INGESTOR_STATUS,
+    ENV_MINIO_ENDPOINT,
+    ENV_MINIO_ACCESS_KEY,
+    ENV_MINIO_SECRET_KEY,
+    ENV_MINIO_SECURE,
+    ENV_COLLECTION_NAME,
+    ENV_MINIO_SOURCES,
+    ENV_ENABLE_IMAGE_PROCESSING,
+    ENV_ENABLE_AUDIO_PROCESSING,
+    ENV_EMBEDDING_DIMENSION,
+    ENV_CHUNK_SIZE,
+    ENV_CHUNK_OVERLAP,
+    ENV_LOG_LEVEL,
+    ENV_LOG_FORMAT,
+    ENV_HISTORY_FILE,
+)
+
+
+# ==================== Helper Functions ====================
+
+def _get_bool(key: str, default: bool = False) -> bool:
+    """Read a boolean flag from an environment variable.
+
+    Args:
+        key: Name of the environment variable.
+        default: Value used when the variable is unset or blank.
+
+    Returns:
+        True when the value, ignoring case and surrounding whitespace, is one
+        of 'true', '1', 'yes' or 'on'; False for any other non-blank value.
+    """
+    raw = os.getenv(key)
+    # An unset or blank variable (e.g. "FLAG=" in an .env file) means "not
+    # configured", so honor the default instead of forcing False.
+    if raw is None or not raw.strip():
+        return bool(default)
+    return raw.strip().lower() in ('true', '1', 'yes', 'on')
+
+
+def _get_int(key: str, default: int) -> int:
+    """Read an integer from an environment variable.
+
+    Args:
+        key: Name of the environment variable.
+        default: Value used when the variable is unset, blank or not an integer.
+
+    Returns:
+        The parsed integer, or ``default`` when no valid value is configured.
+    """
+    raw = os.getenv(key)
+    if raw is None or not raw.strip():
+        return default
+    try:
+        return int(raw)
+    except ValueError:
+        # A typo such as KAFKA_MAX_POLL_RECORDS=1O used to be ignored silently;
+        # keep the fallback but make the misconfiguration visible.
+        import logging  # local import: this module otherwise only needs `os`
+        logging.getLogger(__name__).warning(
+            "Invalid integer %r for %s; using default %s", raw, key, default
+        )
+        return default
+
+
+# ==================== Kafka Settings ====================
+
+# All values below are read once, when this module is first imported.
+# Settings marked "Required" are read without a default, so they are None when
+# the variable is unset; nothing in this module validates them.
+KAFKA_BOOTSTRAP_SERVERS = os.getenv(ENV_KAFKA_BOOTSTRAP_SERVERS)  # Required
+KAFKA_CONSUMER_GROUP = os.getenv(ENV_CONSUMER_GROUP, KAFKA_DEFAULT_CONSUMER_GROUP)
+KAFKA_TOPIC = os.getenv(ENV_KAFKA_TOPIC, KAFKA_DEFAULT_TOPIC)
+KAFKA_AUTO_OFFSET_RESET = os.getenv(ENV_KAFKA_AUTO_OFFSET_RESET, KAFKA_DEFAULT_AUTO_OFFSET_RESET)
+KAFKA_MAX_POLL_RECORDS = _get_int(ENV_KAFKA_MAX_POLL_RECORDS, KAFKA_DEFAULT_MAX_POLL_RECORDS)
+KAFKA_MAX_POLL_INTERVAL_MS = _get_int(ENV_KAFKA_MAX_POLL_INTERVAL_MS, KAFKA_DEFAULT_MAX_POLL_INTERVAL_MS)
+KAFKA_SESSION_TIMEOUT_MS = _get_int(ENV_KAFKA_SESSION_TIMEOUT_MS, KAFKA_DEFAULT_SESSION_TIMEOUT_MS)
+KAFKA_HEARTBEAT_INTERVAL_MS = _get_int(ENV_KAFKA_HEARTBEAT_INTERVAL_MS, KAFKA_DEFAULT_HEARTBEAT_INTERVAL_MS)
+
+
+# ==================== Service URLs ====================
+
+INGESTOR_SERVER_URL = os.getenv(ENV_INGESTOR_SERVER_URL)  # Required
+# Defaults to TIMEOUT_UPLOAD (seconds)
+INGESTOR_TIMEOUT = _get_int(ENV_INGESTOR_TIMEOUT, TIMEOUT_UPLOAD)
+
+# API Endpoints - Ingestor Server
+API_INGESTOR_DOCUMENTS = os.getenv(ENV_API_INGESTOR_DOCUMENTS, DEFAULT_API_INGESTOR_DOCUMENTS)
+API_INGESTOR_COLLECTIONS = os.getenv(ENV_API_INGESTOR_COLLECTIONS, DEFAULT_API_INGESTOR_COLLECTIONS)
+API_INGESTOR_COLLECTION = os.getenv(ENV_API_INGESTOR_COLLECTION, DEFAULT_API_INGESTOR_COLLECTION)
+API_INGESTOR_STATUS = os.getenv(ENV_API_INGESTOR_STATUS, DEFAULT_API_INGESTOR_STATUS)
+
+
+# ==================== MinIO Settings ====================
+
+MINIO_ENDPOINT = os.getenv(ENV_MINIO_ENDPOINT)  # Required
+MINIO_ACCESS_KEY = os.getenv(ENV_MINIO_ACCESS_KEY)  # Required
+MINIO_SECRET_KEY = os.getenv(ENV_MINIO_SECRET_KEY)  # Required
+MINIO_SECURE = _get_bool(ENV_MINIO_SECURE, False)
+# Single collection for all buckets - matches RAG server's COLLECTION_NAME
+MINIO_DEFAULT_COLLECTION = os.getenv(ENV_COLLECTION_NAME, DEFAULT_COLLECTION_NAME)
+# Kept as the raw string (None when unset); it is not parsed in this module.
+MINIO_SOURCES = os.getenv(ENV_MINIO_SOURCES)  # JSON config for multi-source
+
+
+# ==================== Feature Flags ====================
+
+# Both flags are off unless explicitly enabled
+ENABLE_IMAGE_PROCESSING = _get_bool(ENV_ENABLE_IMAGE_PROCESSING, False)
+ENABLE_AUDIO_PROCESSING = _get_bool(ENV_ENABLE_AUDIO_PROCESSING, False)
+
+
+# ==================== Collection Settings ====================
+
+EMBEDDING_DIMENSION = _get_int(ENV_EMBEDDING_DIMENSION, COLLECTION_EMBEDDING_DIMENSION)
+CHUNK_SIZE = _get_int(ENV_CHUNK_SIZE, COLLECTION_CHUNK_SIZE)
+CHUNK_OVERLAP = _get_int(ENV_CHUNK_OVERLAP, COLLECTION_CHUNK_OVERLAP)
+
+
+# ==================== Logging Settings ====================
+
+LOG_LEVEL = os.getenv(ENV_LOG_LEVEL, DEFAULT_LOG_LEVEL)
+LOG_FORMAT = os.getenv(ENV_LOG_FORMAT, DEFAULT_LOG_FORMAT)
+
+
+# ==================== History Settings ====================
+
+HISTORY_FILE = os.getenv(ENV_HISTORY_FILE, DEFAULT_HISTORY_FILE)
+
diff --git a/examples/rag_event_ingest/kafka_consumer/consumer.py b/examples/rag_event_ingest/kafka_consumer/consumer.py
new file mode 100644
index 000000000..87a5c0538
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/consumer.py
@@ -0,0 +1,197 @@
+# consumer.py
+"""Kafka consumer for MinIO S3 events."""
+
+import json
+import logging
+from datetime import datetime
+from typing import Dict, Optional
+from kafka import KafkaConsumer
+
+import config.settings as cfg
+from pathlib import Path
+from config.constants import DEST_RAG, DEST_SKIP, STATUS_FAILED, KEY_DESTINATION, KEY_FILE_TYPE, KEY_REASON
+from router import FileRouter
+from models.events import S3Event, HandlerResult, IngestionRecord
+from handlers.base import BaseHandler
+from services.storage import ObjectStorage
+
+logger = logging.getLogger(__name__)
+
+
+class KafkaEventConsumer:
+    """Kafka consumer that routes MinIO events to handlers.
+
+    Each Kafka message is parsed into an S3Event. Create events go to the
+    handler selected by FileRouter; delete events go to the indexer of the
+    DEST_RAG handler. Every successfully parsed event is appended as one JSON
+    line to the history file.
+    """
+
+    def __init__(
+        self,
+        handlers: Dict[str, BaseHandler],
+        storage: ObjectStorage,
+        history_file: str = '/tmp/ingestion_history.jsonl'
+    ):
+        """Initialize Kafka consumer.
+
+        Args:
+            handlers: Handlers keyed by router destination (e.g. DEST_RAG).
+            storage: Object storage service; used here to resolve the
+                collection that belongs to a bucket.
+            history_file: Path of the JSONL file receiving one record per event.
+
+        Raises:
+            ValueError: If KAFKA_BOOTSTRAP_SERVERS is not configured.
+        """
+        self.handlers = handlers
+        self.storage = storage
+        self.history_file = history_file
+        self.router = FileRouter()
+
+        # The setting is read without a default, so report a missing value
+        # clearly instead of failing with AttributeError on None.split().
+        bootstrap_servers = [
+            server.strip()
+            for server in (cfg.KAFKA_BOOTSTRAP_SERVERS or '').split(',')
+            if server.strip()
+        ]
+        if not bootstrap_servers:
+            raise ValueError(
+                "KAFKA_BOOTSTRAP_SERVERS must be set to a comma-separated list of host:port"
+            )
+
+        logger.info(f"Connecting to Kafka: {cfg.KAFKA_BOOTSTRAP_SERVERS}")
+        logger.info(f"Consumer group: {cfg.KAFKA_CONSUMER_GROUP}")
+
+        self.kafka_consumer = KafkaConsumer(
+            cfg.KAFKA_TOPIC,
+            bootstrap_servers=bootstrap_servers,
+            value_deserializer=lambda m: json.loads(m.decode('utf-8')),
+            group_id=cfg.KAFKA_CONSUMER_GROUP,
+            auto_offset_reset=cfg.KAFKA_AUTO_OFFSET_RESET,
+            enable_auto_commit=True,
+            max_poll_records=cfg.KAFKA_MAX_POLL_RECORDS,
+            max_poll_interval_ms=cfg.KAFKA_MAX_POLL_INTERVAL_MS,
+            session_timeout_ms=cfg.KAFKA_SESSION_TIMEOUT_MS,
+            heartbeat_interval_ms=cfg.KAFKA_HEARTBEAT_INTERVAL_MS
+        )
+
+        logger.info("Kafka consumer initialized")
+        logger.info(f"Registered handlers: {list(self.handlers.keys())}")
+
+    def process_event(self, raw_event: dict) -> Optional[HandlerResult]:
+        """Process a single MinIO S3 event.
+
+        Args:
+            raw_event: Deserialized Kafka message value.
+
+        Returns:
+            The handler's result, a failed result when processing raised an
+            exception, or None when the message could not be parsed into an
+            S3Event.
+        """
+        start_time = datetime.now()
+        event: Optional[S3Event] = None
+        result: Optional[HandlerResult] = None
+
+        try:
+            logger.info(f"Received event: {json.dumps(raw_event, indent=2)}")
+
+            event = S3Event.from_kafka_message(
+                raw_event,
+                collection_resolver=self.storage.get_collection_for_bucket
+            )
+
+            if not event:
+                logger.warning("Invalid event format, skipping")
+                return None
+
+            logger.info(f"Processing: {event.bucket}/{event.key} ({event.size} bytes)")
+
+            if event.event_type == 'delete':
+                result = self._handle_delete(event)
+            else:
+                result = self._handle_create(event)
+
+            return result
+
+        except (json.JSONDecodeError, KeyError, ValueError) as e:
+            logger.error(f"Invalid event data: {e}")
+            result = HandlerResult.failed_result(str(e))
+            return result
+
+        except (IOError, OSError) as e:
+            logger.error(f"Storage error: {e}")
+            result = HandlerResult.failed_result(str(e))
+            return result
+
+        except Exception as e:
+            # Any other failure (e.g. raised inside a handler) must not escape:
+            # run() only catches KeyboardInterrupt, so one bad event would
+            # otherwise terminate the whole consumer loop.
+            logger.exception(f"Unexpected error while processing event: {e}")
+            result = HandlerResult.failed_result(str(e))
+            return result
+
+        finally:
+            # Record the outcome for every event that was parsed, including
+            # failures; unparseable messages have no event to record.
+            if event:
+                self._save_record(event, result, start_time)
+
+    def _handle_delete(self, event: S3Event) -> HandlerResult:
+        """Handle S3 delete event.
+
+        Deletion is delegated to the ``indexer`` attribute of the DEST_RAG
+        handler.
+
+        Args:
+            event: Parsed delete event.
+
+        Returns:
+            A 'DELETED' result on success, otherwise a failed result.
+        """
+        logger.info(f"🗑️  DELETE event for {event.key}")
+
+        doc_handler = self.handlers.get(DEST_RAG)
+        if not doc_handler or not hasattr(doc_handler, 'indexer'):
+            return HandlerResult.failed_result("Delete failed - no indexer available")
+
+        indexer = doc_handler.indexer
+        success = indexer.delete(event.key, event.collection)
+
+        if success:
+            logger.info(f"✓ Deleted {event.key} from Milvus")
+            return HandlerResult(success=True, status='DELETED')
+
+        return HandlerResult.failed_result("Delete failed")
+
+    def _handle_create(self, event: S3Event) -> HandlerResult:
+        """Handle S3 create event.
+
+        Args:
+            event: Parsed create event.
+
+        Returns:
+            A skipped result when the router says to skip the file, a failed
+            result when no handler is available, otherwise the handler's result.
+        """
+        route_info = self.router.route(event.key)
+        destination = route_info[KEY_DESTINATION]
+
+        logger.info(f"📁 {route_info[KEY_FILE_TYPE]} → {destination}")
+
+        if destination == DEST_SKIP:
+            reason = route_info.get(KEY_REASON, 'Skipped by router')
+            logger.info(f"⏭️  Skipping: {reason}")
+            return HandlerResult.skipped_result(reason)
+
+        # Destinations without a dedicated handler fall back to the RAG handler
+        handler = self.handlers.get(destination)
+        if not handler:
+            handler = self.handlers.get(DEST_RAG)
+
+        if not handler:
+            return HandlerResult.failed_result(f"No handler for {destination}")
+
+        return handler.handle(event)
+
+    def _save_record(self, event: S3Event, result: Optional[HandlerResult], start_time: datetime):
+        """Save ingestion record to history file.
+
+        Appends one JSON line to ``self.history_file`` and logs a one-line
+        summary. A missing ``result`` is recorded as STATUS_FAILED. Failure to
+        write the file is logged and does not raise.
+
+        Args:
+            event: The event that was processed.
+            result: Outcome of processing, or None if none was produced.
+            start_time: When processing of the event began.
+        """
+        end_time = datetime.now()
+        duration = (end_time - start_time).total_seconds()
+
+        record = IngestionRecord(
+            file_name=event.key,
+            bucket=event.bucket,
+            collection=event.collection,
+            status=result.status if result else STATUS_FAILED,
+            start_time=start_time,
+            end_time=end_time,
+            duration_seconds=duration,
+            error_message=result.error_message if result else None,
+            task_id=result.task_id if result else None
+        )
+
+        try:
+            with open(self.history_file, 'a', encoding='utf-8') as f:
+                f.write(json.dumps(record.to_dict()) + '\n')
+        except (IOError, OSError) as e:
+            logger.error(f"Failed to save history: {e}")
+
+        status_emoji = '✓' if record.status in ['SUCCESS', 'DELETED', 'SKIPPED'] else '✗'
+        logger.info(
+            f"{status_emoji} SUMMARY: {event.key} | "
+            f"Collection: {event.collection} | "
+            f"Duration: {duration:.2f}s | "
+            f"Status: {record.status}"
+        )
+
+    def run(self):
+        """Main consumer loop.
+
+        Processes messages until interrupted with KeyboardInterrupt; the Kafka
+        consumer is always closed on exit.
+        """
+        logger.info("Starting Kafka consumer loop...")
+        logger.info(f"Subscribed topics: {self.kafka_consumer.subscription()}")
+        logger.info("Waiting for messages...")
+
+        try:
+            message_count = 0
+            for message in self._poll_messages():
+                message_count += 1
+                logger.info(
+                    f"[{message_count}] Message from "
+                    f"partition {message.partition}, offset {message.offset}"
+                )
+                self.process_event(message.value)
+
+        except KeyboardInterrupt:
+            logger.info("Shutting down...")
+        finally:
+            self.kafka_consumer.close()
+            logger.info("Consumer closed")
+
+    def _poll_messages(self):
+        """Generator that yields messages from Kafka.
+
+        Polls with a 5 second timeout, forever; empty polls are skipped.
+        """
+        while True:
+            # max_records is left unset so the batch size follows the consumer's
+            # max_poll_records (cfg.KAFKA_MAX_POLL_RECORDS, default 1) instead
+            # of a hard-coded 1 that made that setting ineffective.
+            msg_pack = self.kafka_consumer.poll(timeout_ms=5000)
+
+            if not msg_pack:
+                logger.debug("No messages, continuing...")
+                continue
+
+            for messages in msg_pack.values():
+                yield from messages
diff --git a/examples/rag_event_ingest/kafka_consumer/handlers/__init__.py b/examples/rag_event_ingest/kafka_consumer/handlers/__init__.py
new file mode 100644
index 000000000..e6f3efcff
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/handlers/__init__.py
@@ -0,0 +1,5 @@
+# Handlers package
+from .base import BaseHandler
+from .document import DocumentHandler
+
+__all__ = ['BaseHandler', 'DocumentHandler']
diff --git a/examples/rag_event_ingest/kafka_consumer/handlers/base.py b/examples/rag_event_ingest/kafka_consumer/handlers/base.py
new file mode 100644
index 000000000..6745f2e09
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/handlers/base.py
@@ -0,0 +1,43 @@
+# handlers/base.py
+"""Base handler abstract class."""
+
+from abc import ABC, abstractmethod
+import logging
+
+from models.events import S3Event, HandlerResult
+
+logger = logging.getLogger(__name__)
+
+
+class BaseHandler(ABC):
+    """Interface every file handler implements, plus shared logging helpers."""
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Short handler name; the log helpers below use it as a line prefix."""
+
+    @abstractmethod
+    def handle(self, event: S3Event) -> HandlerResult:
+        """Process a single S3 event.
+
+        Args:
+            event: S3 event to process
+
+        Returns:
+            HandlerResult with success status and optional task_id
+        """
+
+    def log_start(self, event: S3Event):
+        """Log at info level that processing of ``event`` has started."""
+        target = f"{event.bucket}/{event.key}"
+        logger.info(f"[{self.name}] Processing {target}")
+
+    def log_success(self, event: S3Event, result: HandlerResult):
+        """Log at info level the status the handler finished with."""
+        outcome = f"✓ {event.key} → {result.status}"
+        logger.info(f"[{self.name}] {outcome}")
+
+    def log_failure(self, event: S3Event, result: HandlerResult):
+        """Log at error level the error message of a failed result."""
+        logger.error(f"[{self.name}] ✗ {event.key}: {result.error_message}")
diff --git a/examples/rag_event_ingest/kafka_consumer/handlers/document.py b/examples/rag_event_ingest/kafka_consumer/handlers/document.py
new file mode 100644
index 000000000..1df03946d
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/handlers/document.py
@@ -0,0 +1,89 @@
+# handlers/document.py
+"""Handler for document files (PDF, DOCX, TXT, etc.)."""
+
+import logging
+
+import requests
+
+from .base import BaseHandler
+from models.events import S3Event, HandlerResult
+from services.storage import ObjectStorage
+from services.document_indexer import DocumentIndexer
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentHandler(BaseHandler):
+    """Handler for document files - sends to RAG ingestor."""
+
+    def __init__(self, storage: ObjectStorage, indexer: DocumentIndexer):
+        """Initialize document handler.
+
+        Args:
+            storage: Object storage for file downloads
+            indexer: Document indexer for RAG pipeline
+        """
+        self.storage = storage
+        self.indexer = indexer
+
+    @property
+    def name(self) -> str:
+        return "DocumentHandler"
+
+    def handle(self, event: S3Event) -> HandlerResult:
+        """Re-index one document and report the outcome as a HandlerResult.
+
+        1. Delete existing entries (for updates)
+        2. Download from storage
+        3. Upload to indexer
+        4. Wait for completion
+
+        Args:
+            event: S3 event with document info
+
+        Returns:
+            HandlerResult with task_id for status tracking; any error raised
+            while processing is converted into a failed result.
+        """
+        self.log_start(event)
+        try:
+            # Step 1: Delete existing entries (handles updates); outcome is not checked
+            logger.info(f"🔄 Checking for existing entries of {event.key}...")
+            self.indexer.delete(event.key, event.collection)
+
+            # Step 2: Download from storage
+            logger.info("📥 Downloading from storage...")
+            file_data = self.storage.download(event.bucket, event.key)
+
+            # Step 3: Upload to indexer
+            logger.info("📤 Sending to indexer...")
+            task_id = self.indexer.upload(
+                file_data=file_data,
+                filename=event.key,
+                collection=event.collection
+            )
+            if not task_id:
+                result = HandlerResult.failed_result("Indexer upload failed")
+                self.log_failure(event, result)
+                return result
+
+            # Step 4: Wait for completion
+            logger.info(f"⏳ Waiting for indexing (task_id: {task_id})...")
+            success, message = self.indexer.check_status(task_id)
+            if success:
+                result = HandlerResult.success_result(task_id=task_id)
+                self.log_success(event, result)
+            else:
+                result = HandlerResult.failed_result(message, task_id=task_id)
+                self.log_failure(event, result)
+            return result
+        except requests.RequestException as e:
+            logger.error(f"Network error processing document: {e}")
+            return HandlerResult.failed_result(str(e))
+        except (IOError, OSError) as e:
+            logger.error(f"Storage error processing document: {e}")
+            return HandlerResult.failed_result(str(e))
+        except Exception as e:
+            # Storage client errors (e.g. minio's S3Error) are not OSError subclasses.
+            logger.exception(f"Unexpected error processing document: {e}")
+            return HandlerResult.failed_result(str(e))
diff --git a/examples/rag_event_ingest/kafka_consumer/main.py b/examples/rag_event_ingest/kafka_consumer/main.py
new file mode 100644
index 000000000..df4384d4f
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/main.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# main.py
+"""Entry point for Kafka MinIO consumer."""
+
+import logging
+
+import config.settings as cfg
+from config.constants import DEST_RAG
+from services import ObjectStorage, DocumentIndexer
+from handlers import DocumentHandler
+from consumer import KafkaEventConsumer
+
+# Upper-case the name: getattr(logging, "debug") is the debug() function, not a level.
+_log_level = getattr(logging, str(cfg.LOG_LEVEL).upper(), logging.INFO)
+# Anything that still is not an int level (e.g. "BASICCONFIG") falls back to INFO.
+logging.basicConfig(level=_log_level if isinstance(_log_level, int) else logging.INFO, format=cfg.LOG_FORMAT)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Wire up storage, indexer, handlers and consumer, then run the consumer loop."""
+    banner = "=" * 60
+    for line in (banner, "Starting Kafka MinIO Consumer", banner):
+        logger.info(line)
+
+    # External service clients shared by the handler and the consumer.
+    logger.info("Initializing services...")
+    object_store = ObjectStorage()
+    rag_indexer = DocumentIndexer(cfg.INGESTOR_SERVER_URL)
+
+    # Routing destination -> handler instance.
+    logger.info("Initializing handlers...")
+    document_handler = DocumentHandler(object_store, rag_indexer)
+    handler_map = {DEST_RAG: document_handler}
+
+    logger.info("Initializing Kafka consumer...")
+    event_consumer = KafkaEventConsumer(
+        handlers=handler_map,
+        storage=object_store,
+        history_file=cfg.HISTORY_FILE,
+    )
+    logger.info("Starting consumer loop...")
+    event_consumer.run()
+
+
+if __name__ == '__main__':  # Only start the consumer when run as a script, not on import.
+    main()
diff --git a/examples/rag_event_ingest/kafka_consumer/models/__init__.py b/examples/rag_event_ingest/kafka_consumer/models/__init__.py
new file mode 100644
index 000000000..2abce8a0d
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/models/__init__.py
@@ -0,0 +1,4 @@
+# Models package
+from .events import S3Event, HandlerResult, IngestionRecord
+
+__all__ = ['S3Event', 'HandlerResult', 'IngestionRecord']
diff --git a/examples/rag_event_ingest/kafka_consumer/models/events.py b/examples/rag_event_ingest/kafka_consumer/models/events.py
new file mode 100644
index 000000000..4baf7112f
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/models/events.py
@@ -0,0 +1,138 @@
+# models/events.py
+"""Data models for Kafka consumer events and results."""
+
+from dataclasses import dataclass, field, fields
+from datetime import datetime
+from typing import Any, Callable, ClassVar, Dict, Optional
+from urllib.parse import unquote_plus
+
+from config.constants import (
+    STATUS_SUCCESS,
+    STATUS_FAILED,
+    STATUS_SKIPPED,
+    # S3 Event fields
+    EVENT_NAME,
+    EVENT_RECORDS,
+    EVENT_FIRST_RECORD_INDEX,
+    EVENT_S3,
+    EVENT_BUCKET,
+    EVENT_OBJECT,
+    EVENT_KEY,
+    EVENT_SIZE,
+    EVENT_ETAG,
+    EVENT_NAME_FIELD,
+    EVENT_PREFIX_CREATED,
+    EVENT_PREFIX_REMOVED,
+    EVENT_TYPE_CREATE,
+    EVENT_TYPE_DELETE,
+    # Record field names (for transformers)
+    FIELD_START_TIME,
+    FIELD_END_TIME,
+    FIELD_DURATION_SECONDS,
+)
+
+
+@dataclass
+class S3Event:
+    """A single object-created / object-removed notification read from Kafka."""
+    bucket: str
+    key: str  # object key, already URL-decoded by from_kafka_message
+    size: int
+    etag: str
+    event_type: str  # EVENT_TYPE_CREATE or EVENT_TYPE_DELETE when built by from_kafka_message
+    collection: str = ''
+
+    @classmethod
+    def from_kafka_message(
+        cls,
+        event: Dict[str, Any],
+        collection_resolver: Callable[[str], str]
+    ) -> Optional['S3Event']:
+        """Build an S3Event from a raw Kafka message value.
+
+        Args:
+            event: Raw Kafka message value
+            collection_resolver: Function to resolve bucket -> collection name
+
+        Returns:
+            The parsed event, or None when the event name is missing, is neither
+            a create nor a remove event, or the message carries no records.
+        """
+        if EVENT_NAME not in event:
+            return None
+        name = event[EVENT_NAME]
+
+        # Classify by event-name prefix; anything else is ignored.
+        if name.startswith(EVENT_PREFIX_CREATED):
+            kind = EVENT_TYPE_CREATE
+        elif name.startswith(EVENT_PREFIX_REMOVED):
+            kind = EVENT_TYPE_DELETE
+        else:
+            return None
+
+        records = event.get(EVENT_RECORDS, [])
+        if not records:
+            return None
+
+        # Only the record at EVENT_FIRST_RECORD_INDEX is used.
+        s3_section = records[EVENT_FIRST_RECORD_INDEX][EVENT_S3]
+        bucket_name = s3_section[EVENT_BUCKET][EVENT_NAME_FIELD]
+        obj = s3_section[EVENT_OBJECT]
+        return cls(
+            bucket=bucket_name,
+            key=unquote_plus(obj[EVENT_KEY]),
+            size=obj.get(EVENT_SIZE, 0),
+            etag=obj.get(EVENT_ETAG, ''),
+            event_type=kind,
+            collection=collection_resolver(bucket_name),
+        )
+
+
+@dataclass
+class HandlerResult:
+    """Outcome of one handler run, with factory helpers for the common cases."""
+    success: bool
+    status: str  # SUCCESS, FAILED, SKIPPED, DELETED
+    error_message: Optional[str] = None  # failure text, or the reason when skipped
+    task_id: Optional[str] = None  # For RAG status tracking
+
+    @classmethod
+    def success_result(cls, task_id: Optional[str] = None) -> 'HandlerResult':
+        return cls(True, STATUS_SUCCESS, task_id=task_id)  # succeeded; no error message
+
+    @classmethod
+    def failed_result(cls, error: str, task_id: Optional[str] = None) -> 'HandlerResult':
+        return cls(False, STATUS_FAILED, error, task_id)  # failed; keeps the task id if one exists
+
+    @classmethod
+    def skipped_result(cls, reason: str) -> 'HandlerResult':
+        return cls(True, STATUS_SKIPPED, error_message=reason)  # a skip counts as success
+
+
+@dataclass
+class IngestionRecord:
+    """One row of ingestion history, serialisable through to_dict()."""
+    file_name: str
+    bucket: str
+    collection: str
+    status: str
+    start_time: datetime
+    end_time: datetime = field(default_factory=datetime.now)  # defaults to creation time
+    duration_seconds: float = 0.0
+    error_message: Optional[str] = None
+    task_id: Optional[str] = None
+
+    # Per-field converters applied by to_dict(); fields not listed are copied unchanged.
+    _TRANSFORMERS: ClassVar[Dict[str, Callable]] = {
+        FIELD_START_TIME: lambda moment: moment.isoformat(),
+        FIELD_END_TIME: lambda moment: moment.isoformat(),
+        FIELD_DURATION_SECONDS: lambda seconds: round(seconds, 2),
+    }
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a dict, passing values through any matching converter."""
+        def export(field_name: str) -> Any:
+            convert = self._TRANSFORMERS.get(field_name)
+            raw = getattr(self, field_name)
+            return convert(raw) if convert else raw
+        return {spec.name: export(spec.name) for spec in fields(self)}
diff --git a/examples/rag_event_ingest/kafka_consumer/requirements.txt b/examples/rag_event_ingest/kafka_consumer/requirements.txt
new file mode 100644
index 000000000..3f3818161
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/requirements.txt
@@ -0,0 +1,4 @@
+kafka-python==2.0.2
+minio==7.2.0
+requests==2.31.0
+requests-toolbelt==1.0.0
diff --git a/examples/rag_event_ingest/kafka_consumer/router.py b/examples/rag_event_ingest/kafka_consumer/router.py
new file mode 100644
index 000000000..41f5b8f23
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/router.py
@@ -0,0 +1,91 @@
+# router.py
+"""File routing module for MinIO event processing."""
+
+import logging
+from pathlib import Path
+from typing import Dict, Any, List, Set, Union
+
+from config.constants import (
+    DOCUMENT_EXTENSIONS,
+    IMAGE_EXTENSIONS,
+    AUDIO_EXTENSIONS,
+    SKIP_EXTENSIONS,
+    DEST_RAG,
+    DEST_SKIP,
+    KEY_DESTINATION,
+    KEY_FILE_TYPE,
+    KEY_EXTENSION,
+    KEY_REASON,
+    FILE_TYPE_DOCUMENT,
+    FILE_TYPE_IMAGE,
+    FILE_TYPE_AUDIO,
+    FILE_TYPE_SKIP,
+    FILE_TYPE_UNKNOWN,
+    CFG_DOCUMENT_EXTENSIONS,
+    CFG_IMAGE_EXTENSIONS,
+    CFG_AUDIO_EXTENSIONS,
+    CFG_SKIP_EXTENSIONS,
+    CFG_ENABLE_IMAGE_PROCESSING,
+    CFG_ENABLE_AUDIO_PROCESSING,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class FileRouter:
+    """Decides, from a file's extension, whether it goes to RAG ingestion or is skipped."""
+
+    def __init__(self, config: Union[Dict[str, Any], Any] = None):
+        """Set up extension sets and feature flags from a dict, a dataclass instance or None."""
+        if config is None:
+            config = {}
+        elif hasattr(config, '__dataclass_fields__'):
+            # Dataclass settings object: copy its attributes into the dict layout read below.
+            config = {
+                CFG_DOCUMENT_EXTENSIONS: config.document_extensions,
+                CFG_IMAGE_EXTENSIONS: config.image_extensions,
+                CFG_AUDIO_EXTENSIONS: config.audio_extensions,
+                CFG_SKIP_EXTENSIONS: config.skip_extensions,
+                CFG_ENABLE_IMAGE_PROCESSING: config.enable_image_processing,
+                CFG_ENABLE_AUDIO_PROCESSING: config.enable_audio_processing,
+            }
+        self.config = config
+        option = config.get  # missing keys fall back to the module defaults / disabled flags
+        self.document_extensions = self._to_set(option(CFG_DOCUMENT_EXTENSIONS, DOCUMENT_EXTENSIONS))
+        self.image_extensions = self._to_set(option(CFG_IMAGE_EXTENSIONS, IMAGE_EXTENSIONS))
+        self.audio_extensions = self._to_set(option(CFG_AUDIO_EXTENSIONS, AUDIO_EXTENSIONS))
+        self.skip_extensions = self._to_set(option(CFG_SKIP_EXTENSIONS, SKIP_EXTENSIONS))
+        self.enable_image_processing = option(CFG_ENABLE_IMAGE_PROCESSING, False)
+        self.enable_audio_processing = option(CFG_ENABLE_AUDIO_PROCESSING, False)
+        logger.info(f"FileRouter initialized - Documents: {len(self.document_extensions)} types")
+
+    @staticmethod
+    def _to_set(value: Union[List, Set, None]) -> Set[str]:
+        if isinstance(value, (list, tuple)):
+            return set(value)
+        return set() if value is None else value  # other containers are returned unchanged
+
+    def route(self, filename: str) -> dict:
+        """Return the routing decision (destination, file type, extension, optional reason)."""
+        ext = Path(filename).suffix.lower()
+        def decide(destination, file_type, reason=None):
+            decision = {KEY_DESTINATION: destination, KEY_FILE_TYPE: file_type, KEY_EXTENSION: ext}
+            if reason is not None:
+                decision[KEY_REASON] = reason
+            return decision
+        if ext in self.skip_extensions:
+            return decide(DEST_SKIP, FILE_TYPE_SKIP, 'File extension in skip list')
+        if ext in self.document_extensions:
+            return decide(DEST_RAG, FILE_TYPE_DOCUMENT)
+        if ext in self.image_extensions:
+            if not self.enable_image_processing:
+                return decide(DEST_SKIP, FILE_TYPE_IMAGE, 'Image processing not enabled')
+            return decide(DEST_RAG, FILE_TYPE_IMAGE)
+        if ext in self.audio_extensions:
+            if not self.enable_audio_processing:
+                return decide(DEST_SKIP, FILE_TYPE_AUDIO, 'Audio processing not enabled')
+            return decide(DEST_RAG, FILE_TYPE_AUDIO)
+        return decide(DEST_RAG, FILE_TYPE_UNKNOWN, 'Unknown extension, attempting RAG ingestion')
+
+    def is_document(self, filename: str) -> bool:
+        return Path(filename).suffix.lower() in self.document_extensions  # same lower-cased suffix test as route()
diff --git a/examples/rag_event_ingest/kafka_consumer/services/__init__.py b/examples/rag_event_ingest/kafka_consumer/services/__init__.py
new file mode 100644
index 000000000..db2c0e347
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/services/__init__.py
@@ -0,0 +1,7 @@
+# services/__init__.py
+"""External service clients."""
+
+from .storage import ObjectStorage
+from .document_indexer import DocumentIndexer
+
+__all__ = ['ObjectStorage', 'DocumentIndexer']
diff --git a/examples/rag_event_ingest/kafka_consumer/services/document_indexer.py b/examples/rag_event_ingest/kafka_consumer/services/document_indexer.py
new file mode 100644
index 000000000..ac60d41a2
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/services/document_indexer.py
@@ -0,0 +1,227 @@
+# services/document_indexer.py
+"""Document indexing service for RAG pipeline."""
+
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Optional, Tuple
+import requests
+
+from config import (
+    API_INGESTOR_DOCUMENTS,
+    API_INGESTOR_COLLECTIONS,
+    API_INGESTOR_COLLECTION,
+    API_INGESTOR_STATUS,
+    STATUS_PENDING,
+    STATUS_PROCESSING,
+    STATUS_FINISHED,
+    STATUS_FAILED,
+    TIMEOUT_DEFAULT,
+    TIMEOUT_MAX_TASK_WAIT,
+    COLLECTION_EMBEDDING_DIMENSION,
+    COLLECTION_CHUNK_SIZE,
+    COLLECTION_CHUNK_OVERLAP,
+    CONTENT_TYPE_MAP,
+    DEFAULT_CONTENT_TYPE,
+    FIELD_COLLECTION_NAME,
+    FIELD_BLOCKING,
+    FIELD_SPLIT_OPTIONS,
+    FIELD_CHUNK_SIZE,
+    FIELD_CHUNK_OVERLAP,
+    FIELD_GENERATE_SUMMARY,
+    FIELD_EMBEDDING_DIMENSION,
+    FIELD_TASK_ID,
+    RESP_COLLECTIONS,
+    RESP_TASK_ID,
+    RESP_STATE,
+    RESP_RESULT,
+    RESP_FAILED_DOCUMENTS,
+    RESP_VALIDATION_ERRORS,
+    RESP_MESSAGE,
+    RESP_ERROR,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentIndexer:
+    """Indexes documents in vector store for RAG retrieval."""
+
+    def __init__(self, base_url: str, timeout: int = 600):
+        """Store the ingestor base URL (trailing '/' stripped) and the upload timeout."""
+        self.base_url = base_url.rstrip('/')
+        self.timeout = timeout
+        # Collections already found or created, so later calls can skip the HTTP round trips.
+        self._created_collections: set = set()
+        logger.info(f"DocumentIndexer initialized: {self.base_url}")
+
+    def ensure_collection_exists(self, collection_name: str) -> bool:
+        """Create the collection on the ingestor unless it already exists.
+
+        Returns:
+            True when the collection is cached, listed by the server or was just
+            created; False when a request failed or creation was rejected.
+        """
+        if collection_name in self._created_collections:
+            return True
+        # Check if collection exists
+        try:
+            response = requests.get(f'{self.base_url}{API_INGESTOR_COLLECTIONS}', timeout=TIMEOUT_DEFAULT)
+        except requests.RequestException as e:
+            logger.error(f"Error checking collections: {e}")
+            return False
+
+        if response.status_code == 200:
+            collections = response.json().get(RESP_COLLECTIONS) or []
+            # NOTE(review): entries may be bare names or dicts keyed by FIELD_COLLECTION_NAME; accept both.
+            existing = [c.get(FIELD_COLLECTION_NAME) if isinstance(c, dict) else c for c in collections]
+            if collection_name in existing:
+                logger.info(f"Collection '{collection_name}' already exists")
+                self._created_collections.add(collection_name)
+                return True
+        # Create collection
+        logger.info(f"Creating collection '{collection_name}'...")
+        try:
+            create_response = requests.post(
+                f'{self.base_url}{API_INGESTOR_COLLECTION}',
+                json={
+                    FIELD_COLLECTION_NAME: collection_name,
+                    FIELD_EMBEDDING_DIMENSION: COLLECTION_EMBEDDING_DIMENSION,
+                    'metadata_schema': []
+                },
+                headers={'Content-Type': 'application/json'},
+                timeout=TIMEOUT_DEFAULT
+            )
+        except requests.RequestException as e:
+            logger.error(f"Error creating collection: {e}")
+            return False
+        if create_response.status_code in [200, 201]:
+            logger.info(f"✓ Collection '{collection_name}' created")
+            self._created_collections.add(collection_name)
+            return True
+        logger.error(f"Failed to create collection: {create_response.status_code}")
+        return False
+
+    def upload(
+        self,
+        file_data: bytes,
+        filename: str,
+        collection: str,
+        chunk_size: int = COLLECTION_CHUNK_SIZE,
+        chunk_overlap: int = COLLECTION_CHUNK_OVERLAP
+    ) -> Optional[str]:
+        """Upload a document to the ingestor (FIELD_BLOCKING is sent as False).
+
+        Returns the task id reported by the server, or None on any failure.
+        """
+        if not self.ensure_collection_exists(collection):
+            logger.error("Failed to ensure collection exists")
+            return None
+
+        content_type = self._get_content_type(filename)
+        files = {'documents': (filename, file_data, content_type)}
+        data_config = {
+            FIELD_COLLECTION_NAME: collection,
+            FIELD_BLOCKING: False,
+            FIELD_SPLIT_OPTIONS: {
+                FIELD_CHUNK_SIZE: chunk_size,
+                FIELD_CHUNK_OVERLAP: chunk_overlap
+            },
+            FIELD_GENERATE_SUMMARY: False
+        }
+        logger.info(f"Uploading to collection: {collection}")
+        try:
+            response = requests.post(
+                f'{self.base_url}{API_INGESTOR_DOCUMENTS}',
+                files=files,
+                data={'data': json.dumps(data_config)},
+                timeout=self.timeout
+            )
+        except requests.RequestException as e:
+            logger.error(f"Error uploading document: {e}")
+            return None
+
+        if response.status_code in [200, 201, 202]:
+            result = response.json()
+            task_id = result.get(RESP_TASK_ID)
+            if task_id:
+                logger.info(f"✓ File uploaded, task_id: {task_id}")
+                return task_id
+            logger.error("No task_id in response")
+            return None
+        logger.error(f"Upload failed: {response.status_code} - {response.text}")
+        return None
+
+    def check_status(self, task_id: str, max_wait: int = TIMEOUT_MAX_TASK_WAIT) -> Tuple[bool, str]:
+        """Poll the task about once per second until it finishes, fails or max_wait elapses.
+
+        Returns:
+            (True, message) on success, otherwise (False, reason).
+        """
+        start_time = time.time()
+        while time.time() - start_time < max_wait:
+            try:
+                response = requests.get(
+                    f'{self.base_url}{API_INGESTOR_STATUS}',
+                    params={FIELD_TASK_ID: task_id},
+                    timeout=TIMEOUT_DEFAULT
+                )
+            except requests.RequestException as e:
+                return False, str(e)
+
+            if response.status_code != 200:
+                return False, f"Status check failed: {response.status_code}"
+            result = response.json()
+            state = result.get(RESP_STATE, 'UNKNOWN')
+            if state == STATUS_FAILED:
+                return False, result.get(RESP_ERROR, 'Unknown error')
+            if state == STATUS_FINISHED:
+                return self._parse_finished_result(result)
+
+            if state in [STATUS_PENDING, STATUS_PROCESSING]:
+                elapsed = int(time.time() - start_time)
+                # Progress is logged only when the elapsed whole seconds are a multiple of 5.
+                if elapsed % 5 == 0:
+                    logger.info(f"Task {task_id}: {state} ({elapsed}s)")
+            time.sleep(1)
+
+        return False, f"Timeout after {max_wait}s"
+
+    def _parse_finished_result(self, result: dict) -> Tuple[bool, str]:
+        """Map a finished task payload to (ok, message); failed documents or validation errors mean failure."""
+        # `or {}`: guards against the result key being present with a null value.
+        task_result = result.get(RESP_RESULT) or {}
+        failed_docs = task_result.get(RESP_FAILED_DOCUMENTS, [])
+        validation_errors = task_result.get(RESP_VALIDATION_ERRORS, [])
+        if failed_docs or validation_errors:
+            return False, f"Failed: {failed_docs}, Errors: {validation_errors}"
+        return True, task_result.get(RESP_MESSAGE, 'Completed')
+
+    def delete(self, filename: str, collection: str) -> bool:
+        """Delete ``filename`` from ``collection``; True only for HTTP 200/201/204, False otherwise."""
+        logger.info(f"Deleting '{filename}' from '{collection}'")
+
+        try:
+            response = requests.delete(
+                f'{self.base_url}{API_INGESTOR_DOCUMENTS}',
+                params={FIELD_COLLECTION_NAME: collection},
+                json=[filename],
+                headers={'Content-Type': 'application/json'},
+                timeout=TIMEOUT_DEFAULT
+            )
+        except requests.RequestException as e:
+            logger.error(f"Error deleting document: {e}")
+            return False
+
+        if response.status_code in [200, 201, 204]:
+            logger.info(f"Deleted '{filename}'")
+            return True
+
+        logger.error(f"Delete failed: {response.status_code}")
+        return False
+
+    def _get_content_type(self, filename: str) -> str:
+        """Map the lower-cased file extension to a content type, defaulting to DEFAULT_CONTENT_TYPE."""
+        ext = Path(filename).suffix.lower()
+        return CONTENT_TYPE_MAP.get(ext, DEFAULT_CONTENT_TYPE)
diff --git a/examples/rag_event_ingest/kafka_consumer/services/storage.py b/examples/rag_event_ingest/kafka_consumer/services/storage.py
new file mode 100644
index 000000000..8f50a1f7b
--- /dev/null
+++ b/examples/rag_event_ingest/kafka_consumer/services/storage.py
@@ -0,0 +1,195 @@
+# services/storage.py
+"""S3-compatible object storage service."""
+
+import io
+import json
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Optional
+
+from minio import Minio
+from minio.error import S3Error
+
+from config import (
+    MINIO_ENDPOINT,
+    MINIO_ACCESS_KEY,
+    MINIO_SECRET_KEY,
+    MINIO_SECURE,
+    MINIO_DEFAULT_COLLECTION,
+    MINIO_SOURCES,
+    CFG_ENDPOINT,
+    CFG_ACCESS,
+    CFG_SECRET,
+    CFG_SECURE,
+    CFG_COLLECTION,
+    CFG_BUCKETS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Abstract Interface
+# =============================================================================
+
+class StorageBackend(ABC):
+    """Contract every object-storage backend must fulfil.
+
+    Subclass it to support another provider (Azure Blob, GCS, etc.).
+    """
+
+    @abstractmethod
+    def download(self, bucket: str, key: str) -> bytes:
+        """Return the content stored under ``key`` in ``bucket``."""
+        ...
+
+    @abstractmethod
+    def upload(self, bucket: str, key: str, data: bytes, content_type: Optional[str] = None) -> None:
+        """Store ``data`` under ``key`` in ``bucket``, optionally tagged with a content type."""
+        ...
+
+    @abstractmethod
+    def delete(self, bucket: str, key: str) -> None:
+        """Remove ``key`` from ``bucket``."""
+        ...
+
+    @abstractmethod
+    def exists(self, bucket: str, key: str) -> bool:
+        """Tell whether ``key`` is present in ``bucket``."""
+        ...
+
+
+# =============================================================================
+# S3 Implementation
+# =============================================================================
+
+class S3Backend(StorageBackend):
+    """Storage backend for S3-compatible services (MinIO, AWS S3, Wasabi, etc.)."""
+
+    def __init__(self, client: Minio):
+        self._client = client  # every operation below goes through this client
+
+    @classmethod
+    def create(cls, endpoint: str, access_key: str, secret_key: str, secure: bool = False) -> 'S3Backend':
+        """Build a backend around a newly constructed Minio client.
+
+        Args:
+            endpoint, access_key, secret_key: Connection settings passed to Minio
+            secure: Forwarded as the client's ``secure`` flag
+        """
+        minio_client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=secure)
+        logger.info(f"Created S3 client: {endpoint}")
+        return cls(minio_client)
+
+    def download(self, bucket: str, key: str) -> bytes:
+        """Read the whole object into memory; the response is closed and released either way."""
+        stream = self._client.get_object(bucket, key)
+        try:
+            payload = stream.read()
+        finally:
+            stream.close()
+            stream.release_conn()
+        logger.info(f"Downloaded {bucket}/{key} ({len(payload)} bytes)")
+        return payload
+
+    def upload(self, bucket: str, key: str, data: bytes, content_type: Optional[str] = None) -> None:
+        """Store ``data`` under ``key``; a missing content type becomes application/octet-stream."""
+        mime = content_type or 'application/octet-stream'
+        self._client.put_object(bucket, key, io.BytesIO(data), length=len(data), content_type=mime)
+        logger.info(f"Uploaded {bucket}/{key}")
+
+    def delete(self, bucket: str, key: str) -> None:
+        """Remove the object and log the deletion."""
+        self._client.remove_object(bucket, key)
+        logger.info(f"Deleted {bucket}/{key}")
+
+    def exists(self, bucket: str, key: str) -> bool:
+        """Return True if stat_object succeeds; any S3Error is reported as False."""
+        try:
+            self._client.stat_object(bucket, key)
+        except S3Error:
+            return False
+        return True
+
+
+# =============================================================================
+# Object Storage (Factory + Bucket Mapping)
+# =============================================================================
+
+class ObjectStorage:
+    """Facade over one or more S3 backends plus the bucket-to-collection mapping.
+
+    Which backends exist is decided by the module-level MINIO_* configuration.
+    """
+
+    def __init__(self):
+        """Create empty registries, then populate them from the configuration."""
+        self._backends: Dict[str, StorageBackend] = {}
+        self._bucket_to_backend: Dict[str, str] = {}
+        self._bucket_to_collection: Dict[str, str] = {}
+        self._default_collection = MINIO_DEFAULT_COLLECTION
+        self._configure()
+
+    def _configure(self):
+        """Use multi-source setup when MINIO_SOURCES is set, single-source otherwise."""
+        if not MINIO_SOURCES:
+            self._configure_single_source()
+            return
+        self._configure_multi_source(MINIO_SOURCES)
+
+    def _configure_single_source(self):
+        """Register a single backend, named 'default', built from the MINIO_* settings."""
+        logger.info(f"Single S3 mode: {MINIO_ENDPOINT}")
+        default_backend = S3Backend.create(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, MINIO_SECURE)
+        self._backends['default'] = default_backend
+
+    def _configure_multi_source(self, sources_json: str):
+        """Parse the JSON mapping of source name -> settings and configure each entry."""
+        for source_name, source_cfg in json.loads(sources_json).items():
+            self._configure_source(source_name, source_cfg)
+
+    def _configure_source(self, name: str, src: dict):
+        """Create the backend for one source and map its buckets to a collection.
+
+        Missing credentials fall back to the MINIO_* defaults; a missing collection
+        falls back to ``name`` with hyphens replaced by underscores.
+        """
+        endpoint = src[CFG_ENDPOINT]
+        logger.info(f"Configuring S3 source '{name}': {endpoint}")
+        self._backends[name] = S3Backend.create(
+            endpoint,
+            src.get(CFG_ACCESS, MINIO_ACCESS_KEY),
+            src.get(CFG_SECRET, MINIO_SECRET_KEY),
+            src.get(CFG_SECURE, False),
+        )
+        fallback_collection = name.replace('-', '_')
+        self._register_buckets(name, src.get(CFG_BUCKETS, []), src.get(CFG_COLLECTION, fallback_collection))
+
+    def _register_buckets(self, backend_name: str, buckets: list, collection: str):
+        """Point every bucket in ``buckets`` at ``backend_name`` and ``collection``."""
+        for bucket_name in buckets:
+            self._bucket_to_backend[bucket_name] = backend_name
+            self._bucket_to_collection[bucket_name] = collection
+            logger.info(f"  {bucket_name} → {collection}")
+
+    def _get_backend(self, bucket: str) -> StorageBackend:
+        """Backend mapped to ``bucket``; unmapped buckets use the first configured backend."""
+        if bucket not in self._bucket_to_backend:
+            return next(iter(self._backends.values()))
+        return self._backends[self._bucket_to_backend[bucket]]
+
+    def download(self, bucket: str, key: str) -> bytes:
+        """Download ``key`` via the backend responsible for ``bucket``."""
+        return self._get_backend(bucket).download(bucket, key)
+
+    def get_collection_for_bucket(self, bucket: str) -> str:
+        """Resolve the collection name for ``bucket``.
+
+        Priority:
+        1. Explicit bucket mapping registered from the sources configuration
+        2. The default collection (MINIO_DEFAULT_COLLECTION), when non-empty
+        3. Fallback: bucket name with hyphens → underscores
+        """
+        if bucket in self._bucket_to_collection:
+            return self._bucket_to_collection[bucket]
+        return self._default_collection or bucket.replace('-', '_')
diff --git a/examples/rag_react_agent/pyproject.toml b/examples/rag_react_agent/pyproject.toml
index fcebbcb3a..c4967a58a 100644
--- a/examples/rag_react_agent/pyproject.toml
+++ b/examples/rag_react_agent/pyproject.toml
@@ -20,7 +20,7 @@ dependencies = [
   # Keep package version constraints as open as possible to avoid conflicts with other packages. Always define a minimum
   # version when adding a new package. If unsure, default to using `~=` instead of `==`. Does not apply to nvidia-nat packages.
   # Keep sorted!!!
-  "langgraph>=1.0.7",  # Required for react_agent workflow
+  "langgraph>=1.0.8",  # Required for react_agent workflow
   "langchain_classic",
   "nvidia-nat>=1.5.0a0,<2.0",  # Allow pre-release versions
   "nvidia-nat-langchain>=1.5.0a0,<2.0",  # Allow pre-release versions
diff --git a/examples/rag_react_agent/uv.lock b/examples/rag_react_agent/uv.lock
index 0554ef787..16af3b48d 100644
--- a/examples/rag_react_agent/uv.lock
+++ b/examples/rag_react_agent/uv.lock
@@ -324,6 +324,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
 ]
 
+[[package]]
+name = "blinker"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
+]
+
 [[package]]
 name = "boto3"
 version = "1.40.61"
@@ -597,21 +606,20 @@ name = "datasets"
 version = "4.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "dill" },
-    { name = "filelock" },
-    { name = "fsspec", extra = ["http"] },
-    { name = "httpx" },
-    { name = "huggingface-hub" },
-    { name = "multiprocess" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
-    { name = "packaging" },
-    { name = "pandas" },
-    { name = "pyarrow" },
-    { name = "pyyaml" },
-    { name = "requests" },
-    { name = "tqdm" },
-    { name = "xxhash" },
+    { name = "dill", marker = "python_full_version >= '3.12'" },
+    { name = "filelock", marker = "python_full_version >= '3.12'" },
+    { name = "fsspec", extra = ["http"], marker = "python_full_version >= '3.12'" },
+    { name = "httpx", marker = "python_full_version >= '3.12'" },
+    { name = "huggingface-hub", marker = "python_full_version >= '3.12'" },
+    { name = "multiprocess", marker = "python_full_version >= '3.12'" },
+    { name = "numpy", marker = "python_full_version >= '3.12'" },
+    { name = "packaging", marker = "python_full_version >= '3.12'" },
+    { name = "pandas", marker = "python_full_version >= '3.12'" },
+    { name = "pyarrow", marker = "python_full_version >= '3.12'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.12'" },
+    { name = "requests", marker = "python_full_version >= '3.12'" },
+    { name = "tqdm", marker = "python_full_version >= '3.12'" },
+    { name = "xxhash", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/55/bf/bb927bde63d649296c83e883171ae77074717c1b80fe2868b328bd0dbcbb/datasets-4.5.0.tar.gz", hash = "sha256:00c698ce1c2452e646cc5fad47fef39d3fe78dd650a8a6eb205bb45eb63cd500", size = 588384, upload-time = "2026-01-14T18:27:54.297Z" }
 wheels = [
@@ -736,6 +744,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
 ]
 
+[[package]]
+name = "flask"
+version = "3.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "blinker" },
+    { name = "click" },
+    { name = "itsdangerous" },
+    { name = "jinja2" },
+    { name = "markupsafe" },
+    { name = "werkzeug" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" },
+]
+
 [[package]]
 name = "flatbuffers"
 version = "25.12.19"
@@ -828,7 +853,7 @@ wheels = [
 
 [package.optional-dependencies]
 http = [
-    { name = "aiohttp" },
+    { name = "aiohttp", marker = "python_full_version >= '3.12'" },
 ]
 
 [[package]]
@@ -1078,6 +1103,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" },
 ]
 
+[[package]]
+name = "itsdangerous"
+version = "2.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
+]
+
 [[package]]
 name = "jinja2"
 version = "3.1.6"
@@ -1235,18 +1269,17 @@ wheels = [
 
 [[package]]
 name = "langchain-aws"
-version = "1.0.0"
+version = "1.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "boto3" },
     { name = "langchain-core" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "pydantic" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/90/52/7e57fb7fc34c386625f66f0ab31da9cf2788b03ef15ae78ccd4c627b30cf/langchain_aws-1.0.0.tar.gz", hash = "sha256:597342bda0e7384e13590e9ab69c872ddcfbbf07d81ac6bb0f8a67970252212e", size = 214146, upload-time = "2025-10-17T19:06:49.001Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/52/1d/bb306951b1c394b7a27effb8eb6c9ee65dd77fcc4be7c20f76e3299a9e1e/langchain_aws-1.1.0.tar.gz", hash = "sha256:1e2f8570328eae4907c3cf7e900dc68d8034ddc865d9dc96823c9f9d8cccb901", size = 393899, upload-time = "2025-11-24T14:35:24.216Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/67/5d/5b3c07780a8eb4b916ffe504893896f87f318924c86dcbeb89562baa2d20/langchain_aws-1.0.0-py3-none-any.whl", hash = "sha256:68f6965b5030d0779b02e731ce1c910a5f4518bfe0e2ae82999a5342bc46dbd5", size = 150400, upload-time = "2025-10-17T19:06:47.926Z" },
+    { url = "https://files.pythonhosted.org/packages/26/33/91b8d2a7570657b371382b45054142c54165a51706990a5c1b4cc40c0e9a/langchain_aws-1.1.0-py3-none-any.whl", hash = "sha256:8ec074615b42839e035354063717374c32c63f5028ef5221ba073fd5f3ef5e37", size = 152432, upload-time = "2025-11-24T14:35:23.004Z" },
 ]
 
 [[package]]
@@ -1278,8 +1311,7 @@ dependencies = [
     { name = "langchain-classic" },
     { name = "langchain-core" },
     { name = "langsmith" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "pydantic-settings" },
     { name = "pyyaml" },
     { name = "requests" },
@@ -1293,7 +1325,7 @@ wheels = [
 
 [[package]]
 name = "langchain-core"
-version = "1.2.7"
+version = "1.2.17"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "jsonpatch" },
@@ -1305,9 +1337,23 @@ dependencies = [
     { name = "typing-extensions" },
     { name = "uuid-utils" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a2/0e/664d8d81b3493e09cbab72448d2f9d693d1fa5aa2bcc488602203a9b6da0/langchain_core-1.2.7.tar.gz", hash = "sha256:e1460639f96c352b4a41c375f25aeb8d16ffc1769499fb1c20503aad59305ced", size = 837039, upload-time = "2026-01-09T17:44:25.505Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1d/93/36226f593df52b871fc24d494c274f3a6b2ac76763a2806e7d35611634a1/langchain_core-1.2.17.tar.gz", hash = "sha256:54aa267f3311e347fb2e50951fe08e53761cebfb999ab80e6748d70525bbe872", size = 836130, upload-time = "2026-03-02T22:47:55.846Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6e/6f/34a9fba14d191a67f7e2ee3dbce3e9b86d2fa7310e2c7f2c713583481bd2/langchain_core-1.2.7-py3-none-any.whl", hash = "sha256:452f4fef7a3d883357b22600788d37e3d8854ef29da345b7ac7099f33c31828b", size = 490232, upload-time = "2026-01-09T17:44:24.236Z" },
+    { url = "https://files.pythonhosted.org/packages/be/90/073f33ab383a62908eca7ea699586dfea280e77182176e33199c80ddf22a/langchain_core-1.2.17-py3-none-any.whl", hash = "sha256:bf6bd6ce503874e9c2da1669a69383e967c3de1ea808921d19a9a6bff1a9fbbe", size = 502727, upload-time = "2026-03-02T22:47:54.537Z" },
+]
+
+[[package]]
+name = "langchain-huggingface"
+version = "1.2.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "huggingface-hub" },
+    { name = "langchain-core" },
+    { name = "tokenizers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/5b/4910551367de5c6ec246616fcc0ddb0bc6f9e5d353d4a22dcb5ab1f87e60/langchain_huggingface-1.2.1.tar.gz", hash = "sha256:33d52a30a56775380c6b4321b78136a410eb079132a80fe7120ddd4b954b4efa", size = 253106, upload-time = "2026-03-02T18:44:39.163Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/90/a1440bfa467a6dd9025ad80f3c239554de28aec49dacfb369fda92871556/langchain_huggingface-1.2.1-py3-none-any.whl", hash = "sha256:0930c216a457d2c8dc7b39a756c39c567f1d88593bfee2c3441f3ae718435f0f", size = 30924, upload-time = "2026-03-02T18:44:37.745Z" },
 ]
 
 [[package]]
@@ -1338,16 +1384,17 @@ wheels = [
 
 [[package]]
 name = "langchain-nvidia-ai-endpoints"
-version = "1.0.3"
+version = "1.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohttp" },
     { name = "filetype" },
     { name = "langchain-core" },
+    { name = "requests" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5a/9e/30814da280f7a79b168f83180f6a0396c166f86a566e56bb9877bf562611/langchain_nvidia_ai_endpoints-1.0.3.tar.gz", hash = "sha256:11c48fd24e4a9d4c86c65bcef943400f4e709497c93254c7dc97c43f68c2be89", size = 46526, upload-time = "2026-01-28T22:04:33.93Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/4b/e417af1b2b7f861f37e26bf4fa4b05cda4052002e3f84a966f0735baf94f/langchain_nvidia_ai_endpoints-1.2.0.tar.gz", hash = "sha256:4bd63b812707ea348a86539001aa9a89b3cba3ee56ade7379247a955e4bfd3eb", size = 53851, upload-time = "2026-03-10T17:55:08.127Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/67/04/c83f61106a245b74de11c1e075c1cc1e70462ece1dd9fc0584ad992a776d/langchain_nvidia_ai_endpoints-1.0.3-py3-none-any.whl", hash = "sha256:e5f170ad0a335637298bb90fb3df119793821e316355f61ab82f0106913eebbf", size = 50130, upload-time = "2026-01-28T22:04:33.065Z" },
+    { url = "https://files.pythonhosted.org/packages/66/e4/186f1a99e4d30bd91c8438d024dc73a71c8f7e0657c7acb6e79658aa19cf/langchain_nvidia_ai_endpoints-1.2.0-py3-none-any.whl", hash = "sha256:c8e075d5b3d31216374af0cfa9e690ab28ada3ebbde34dd6d36fe16a26d883cc", size = 58269, upload-time = "2026-03-10T17:55:06.339Z" },
 ]
 
 [[package]]
@@ -1393,7 +1440,7 @@ wheels = [
 
 [[package]]
 name = "langgraph"
-version = "1.0.7"
+version = "1.0.10"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain-core" },
@@ -1403,9 +1450,9 @@ dependencies = [
     { name = "pydantic" },
     { name = "xxhash" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/72/5b/f72655717c04e33d3b62f21b166dc063d192b53980e9e3be0e2a117f1c9f/langgraph-1.0.7.tar.gz", hash = "sha256:0cfdfee51e6e8cfe503ecc7367c73933437c505b03fa10a85c710975c8182d9a", size = 497098, upload-time = "2026-01-22T16:57:47.303Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/55/92/14df6fefba28c10caf1cb05aa5b8c7bf005838fe32a86d903b6c7cc4018d/langgraph-1.0.10.tar.gz", hash = "sha256:73bd10ee14a8020f31ef07e9cd4c1a70c35cc07b9c2b9cd637509a10d9d51e29", size = 511644, upload-time = "2026-02-27T21:04:38.743Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7e/0e/fe80144e3e4048e5d19ccdb91ac547c1a7dc3da8dbd1443e210048194c14/langgraph-1.0.7-py3-none-any.whl", hash = "sha256:9d68e8f8dd8f3de2fec45f9a06de05766d9b075b78fb03171779893b7a52c4d2", size = 157353, upload-time = "2026-01-22T16:57:45.997Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/60/260e0c04620a37ba8916b712766c341cc5fc685dabc6948c899494bbc2ae/langgraph-1.0.10-py3-none-any.whl", hash = "sha256:7c298bef4f6ea292fcf9824d6088fe41a6727e2904ad6066f240c4095af12247", size = 160920, upload-time = "2026-02-27T21:04:35.932Z" },
 ]
 
 [[package]]
@@ -1423,15 +1470,15 @@ wheels = [
 
 [[package]]
 name = "langgraph-prebuilt"
-version = "1.0.7"
+version = "1.0.8"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain-core" },
     { name = "langgraph-checkpoint" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a7/59/711aecd1a50999456850dc328f3cad72b4372d8218838d8d5326f80cb76f/langgraph_prebuilt-1.0.7.tar.gz", hash = "sha256:38e097e06de810de4d0e028ffc0e432bb56d1fb417620fb1dfdc76c5e03e4bf9", size = 163692, upload-time = "2026-01-22T16:45:22.801Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0d/06/dd61a5c2dce009d1b03b1d56f2a85b3127659fdddf5b3be5d8f1d60820fb/langgraph_prebuilt-1.0.8.tar.gz", hash = "sha256:0cd3cf5473ced8a6cd687cc5294e08d3de57529d8dd14fdc6ae4899549efcf69", size = 164442, upload-time = "2026-02-19T18:14:39.083Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/47/49/5e37abb3f38a17a3487634abc2a5da87c208cc1d14577eb8d7184b25c886/langgraph_prebuilt-1.0.7-py3-none-any.whl", hash = "sha256:e14923516504405bb5edc3977085bc9622c35476b50c1808544490e13871fe7c", size = 35324, upload-time = "2026-01-22T16:45:21.784Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/41/ec966424ad3f2ed3996d24079d3342c8cd6c0bd0653c12b2a917a685ec6c/langgraph_prebuilt-1.0.8-py3-none-any.whl", hash = "sha256:d16a731e591ba4470f3e313a319c7eee7dbc40895bcf15c821f985a3522a7ce0", size = 35648, upload-time = "2026-02-19T18:14:37.611Z" },
 ]
 
 [[package]]
@@ -1592,20 +1639,20 @@ name = "mcp"
 version = "1.26.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyio" },
-    { name = "httpx" },
-    { name = "httpx-sse" },
-    { name = "jsonschema" },
-    { name = "pydantic" },
-    { name = "pydantic-settings" },
-    { name = "pyjwt", extra = ["crypto"] },
-    { name = "python-multipart" },
-    { name = "pywin32", marker = "sys_platform == 'win32'" },
-    { name = "sse-starlette" },
-    { name = "starlette" },
-    { name = "typing-extensions" },
-    { name = "typing-inspection" },
-    { name = "uvicorn", marker = "sys_platform != 'emscripten'" },
+    { name = "anyio", marker = "python_full_version >= '3.12'" },
+    { name = "httpx", marker = "python_full_version >= '3.12'" },
+    { name = "httpx-sse", marker = "python_full_version >= '3.12'" },
+    { name = "jsonschema", marker = "python_full_version >= '3.12'" },
+    { name = "pydantic", marker = "python_full_version >= '3.12'" },
+    { name = "pydantic-settings", marker = "python_full_version >= '3.12'" },
+    { name = "pyjwt", extra = ["crypto"], marker = "python_full_version >= '3.12'" },
+    { name = "python-multipart", marker = "python_full_version >= '3.12'" },
+    { name = "pywin32", marker = "python_full_version >= '3.12' and sys_platform == 'win32'" },
+    { name = "sse-starlette", marker = "python_full_version >= '3.12'" },
+    { name = "starlette", marker = "python_full_version >= '3.12'" },
+    { name = "typing-extensions", marker = "python_full_version >= '3.12'" },
+    { name = "typing-inspection", marker = "python_full_version >= '3.12'" },
+    { name = "uvicorn", marker = "python_full_version >= '3.12' and sys_platform != 'emscripten'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fc/6d/62e76bbb8144d6ed86e202b5edd8a4cb631e7c8130f3f4893c3f90262b10/mcp-1.26.0.tar.gz", hash = "sha256:db6e2ef491eecc1a0d93711a76f28dec2e05999f93afd48795da1c1137142c66", size = 608005, upload-time = "2026-01-24T19:40:32.468Z" }
 wheels = [
@@ -1746,7 +1793,7 @@ name = "multiprocess"
 version = "0.70.18"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "dill" },
+    { name = "dill", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/72/fd/2ae3826f5be24c6ed87266bc4e59c46ea5b059a103f3d7e7eb76a52aeecb/multiprocess-0.70.18.tar.gz", hash = "sha256:f9597128e6b3e67b23956da07cf3d2e5cba79e2f4e0fba8d7903636663ec6d0d", size = 1798503, upload-time = "2025-04-17T03:11:27.742Z" }
 wheels = [
@@ -1770,6 +1817,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
 ]
 
+[[package]]
+name = "narwhals"
+version = "2.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/75/59/81d0f4cad21484083466f278e6b392addd9f4205b48d45b5c8771670ebf8/narwhals-2.17.0.tar.gz", hash = "sha256:ebd5bc95bcfa2f8e89a8ac09e2765a63055162837208e67b42d6eeb6651d5e67", size = 620306, upload-time = "2026-02-23T09:44:34.142Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4b/27/20770bd6bf8fbe1e16f848ba21da9df061f38d2e6483952c29d2bb5d1d8b/narwhals-2.17.0-py3-none-any.whl", hash = "sha256:2ac5307b7c2b275a7d66eeda906b8605e3d7a760951e188dcfff86e8ebe083dd", size = 444897, upload-time = "2026-02-23T09:44:32.006Z" },
+]
+
 [[package]]
 name = "nest-asyncio"
 version = "1.6.0"
@@ -1797,41 +1853,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
 ]
 
-[[package]]
-name = "numpy"
-version = "1.26.4"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.12'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" },
-    { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" },
-    { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" },
-    { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" },
-    { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" },
-    { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" },
-    { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" },
-    { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" },
-    { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" },
-    { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" },
-]
-
 [[package]]
 name = "numpy"
 version = "2.4.1"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.13'",
-    "python_full_version == '3.12.*'",
-]
 sdist = { url = "https://files.pythonhosted.org/packages/24/62/ae72ff66c0f1fd959925b4c11f8c2dea61f47f6acaea75a08512cdfe3fed/numpy-2.4.1.tar.gz", hash = "sha256:a1ceafc5042451a858231588a104093474c6a5c57dcc724841f5c888d237d690", size = 20721320, upload-time = "2026-01-10T06:44:59.619Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/a5/34/2b1bc18424f3ad9af577f6ce23600319968a70575bd7db31ce66731bbef9/numpy-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0cce2a669e3c8ba02ee563c7835f92c153cf02edff1ae05e1823f1dde21b16a5", size = 16944563, upload-time = "2026-01-10T06:42:14.615Z" },
@@ -1890,67 +1915,129 @@ wheels = [
 name = "nvidia-nat"
 version = "1.5.0a20260112"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.13'",
+    "python_full_version == '3.12.*'",
+]
+dependencies = [
+    { name = "aioboto3", marker = "python_full_version >= '3.12'" },
+    { name = "authlib", marker = "python_full_version >= '3.12'" },
+    { name = "click", marker = "python_full_version >= '3.12'" },
+    { name = "colorama", marker = "python_full_version >= '3.12'" },
+    { name = "datasets", marker = "python_full_version >= '3.12'" },
+    { name = "expandvars", marker = "python_full_version >= '3.12'" },
+    { name = "fastapi", marker = "python_full_version >= '3.12'" },
+    { name = "httpx", marker = "python_full_version >= '3.12'" },
+    { name = "jinja2", marker = "python_full_version >= '3.12'" },
+    { name = "jsonpath-ng", marker = "python_full_version >= '3.12'" },
+    { name = "mcp", marker = "python_full_version >= '3.12'" },
+    { name = "nest-asyncio2", marker = "python_full_version >= '3.12'" },
+    { name = "networkx", marker = "python_full_version >= '3.12'" },
+    { name = "numpy", marker = "python_full_version >= '3.12'" },
+    { name = "openinference-semantic-conventions", marker = "python_full_version >= '3.12'" },
+    { name = "openpyxl", marker = "python_full_version >= '3.12'" },
+    { name = "optuna", marker = "python_full_version >= '3.12'" },
+    { name = "pip", marker = "python_full_version >= '3.12'" },
+    { name = "pkce", marker = "python_full_version >= '3.12'" },
+    { name = "pkginfo", marker = "python_full_version >= '3.12'" },
+    { name = "platformdirs", marker = "python_full_version >= '3.12'" },
+    { name = "pydantic", marker = "python_full_version >= '3.12'" },
+    { name = "pymilvus", marker = "python_full_version >= '3.12'" },
+    { name = "python-dotenv", marker = "python_full_version >= '3.12'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.12'" },
+    { name = "ragas", marker = "python_full_version >= '3.12'" },
+    { name = "rich", marker = "python_full_version >= '3.12'" },
+    { name = "tabulate", marker = "python_full_version >= '3.12'" },
+    { name = "uvicorn", extra = ["standard"], marker = "python_full_version >= '3.12'" },
+    { name = "wikipedia", marker = "python_full_version >= '3.12'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/e0/c7426ed15d1eb528eb0c9135efb66da033b0a56b63f42d4099b2fe05fd24/nvidia_nat-1.5.0a20260112-py3-none-any.whl", hash = "sha256:3d05c948efe0e3ab58e3d7a58ab90510d1a1128eb678810e1ef62efc5dfc9681", size = 950027, upload-time = "2026-01-12T10:46:15.705Z" },
+]
+
+[[package]]
+name = "nvidia-nat"
+version = "1.5.0a20260223"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.12'",
+]
+dependencies = [
+    { name = "nvidia-nat-core", marker = "python_full_version < '3.12'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/07/7e/6e984de1473e8264d5cf6598d14f1c01f6dabf22f2fedda5f8e97140ae05/nvidia_nat-1.5.0a20260223-py3-none-any.whl", hash = "sha256:137461b310af90ed12e0496bac90ddb62297b00287707c80df48208437e2502a", size = 52704, upload-time = "2026-02-23T10:04:57.955Z" },
+]
+
+[[package]]
+name = "nvidia-nat-core"
+version = "1.5.0a20260223"
+source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aioboto3" },
     { name = "authlib" },
     { name = "click" },
     { name = "colorama" },
-    { name = "datasets" },
     { name = "expandvars" },
     { name = "fastapi" },
+    { name = "flask" },
     { name = "httpx" },
     { name = "jinja2" },
     { name = "jsonpath-ng" },
-    { name = "mcp" },
     { name = "nest-asyncio2" },
     { name = "networkx" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "openinference-semantic-conventions" },
-    { name = "openpyxl" },
     { name = "optuna" },
+    { name = "pandas" },
     { name = "pip" },
     { name = "pkce" },
     { name = "pkginfo" },
     { name = "platformdirs" },
+    { name = "plotly" },
     { name = "pydantic" },
+    { name = "pyjwt" },
     { name = "pymilvus" },
     { name = "python-dotenv" },
+    { name = "python-multipart" },
     { name = "pyyaml" },
-    { name = "ragas" },
     { name = "rich" },
     { name = "tabulate" },
+    { name = "urllib3" },
     { name = "uvicorn", extra = ["standard"] },
     { name = "wikipedia" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a1/e0/c7426ed15d1eb528eb0c9135efb66da033b0a56b63f42d4099b2fe05fd24/nvidia_nat-1.5.0a20260112-py3-none-any.whl", hash = "sha256:3d05c948efe0e3ab58e3d7a58ab90510d1a1128eb678810e1ef62efc5dfc9681", size = 950027, upload-time = "2026-01-12T10:46:15.705Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/23/b043caadf08a72e4eb2c95bb65fed5083e7bf40af48ea92305fabf3b2820/nvidia_nat_core-1.5.0a20260223-py3-none-any.whl", hash = "sha256:5262cae48d66efbd53f98134e7820759121a1b4398b339e1d14d307ed2195a21", size = 762259, upload-time = "2026-02-23T10:01:44.692Z" },
 ]
 
 [[package]]
 name = "nvidia-nat-langchain"
-version = "1.5.0a20260112"
+version = "1.5.0a20260223"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain" },
     { name = "langchain-aws" },
     { name = "langchain-classic" },
+    { name = "langchain-community" },
     { name = "langchain-core" },
+    { name = "langchain-huggingface" },
     { name = "langchain-litellm" },
     { name = "langchain-milvus" },
     { name = "langchain-nvidia-ai-endpoints" },
     { name = "langchain-openai" },
     { name = "langchain-tavily" },
     { name = "langgraph" },
-    { name = "nvidia-nat" },
+    { name = "nvidia-nat-core" },
+    { name = "openevals" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/84/2a/7a2cd2e7444ef03bdebb6c9637e63a9eee33da84e7c23baceb18f83f2250/nvidia_nat_langchain-1.5.0a20260112-py3-none-any.whl", hash = "sha256:cba64b0192d589f325cbbc2de60da8eb514efc27b49157b3eafca204ab989a55", size = 60925, upload-time = "2026-01-12T10:43:35.414Z" },
+    { url = "https://files.pythonhosted.org/packages/98/65/e565dc570ecfdf4c4ca34d0d873794d33fdf11a8889a9b2b1a78ad15b589/nvidia_nat_langchain-1.5.0a20260223-py3-none-any.whl", hash = "sha256:87c70294c1f38fcd09252a79dd5e9038aee73326678dd9d7519b8064b914d7e4", size = 160480, upload-time = "2026-02-23T10:06:08.62Z" },
 ]
 
 [[package]]
 name = "nvidia-rag"
-version = "2.4.0.dev0"
+version = "2.5.0.dev0"
 source = { editable = "../../" }
 dependencies = [
     { name = "anyio" },
@@ -2016,16 +2103,16 @@ requires-dist = [
     { name = "langchain-elasticsearch", marker = "extra == 'all'", specifier = ">=0.3" },
     { name = "langchain-elasticsearch", marker = "extra == 'elasticsearch'", specifier = ">=0.3" },
     { name = "langchain-milvus", specifier = ">=0.3.0" },
-    { name = "langchain-nvidia-ai-endpoints", specifier = ">=1.0.3" },
+    { name = "langchain-nvidia-ai-endpoints", specifier = ">=1.2.0" },
     { name = "langchain-openai", marker = "extra == 'all'", specifier = ">=0.2" },
     { name = "langchain-openai", marker = "extra == 'ingest'", specifier = ">=0.2" },
     { name = "langchain-openai", marker = "extra == 'rag'", specifier = ">=0.2" },
     { name = "lark", specifier = ">=1.2.2" },
     { name = "minio", specifier = ">=7.2,<8.0" },
-    { name = "nv-ingest-api", marker = "extra == 'all'", specifier = "==26.1.1" },
-    { name = "nv-ingest-api", marker = "extra == 'ingest'", specifier = "==26.1.1" },
-    { name = "nv-ingest-client", marker = "extra == 'all'", specifier = "==26.1.1" },
-    { name = "nv-ingest-client", marker = "extra == 'ingest'", specifier = "==26.1.1" },
+    { name = "nv-ingest-api", marker = "extra == 'all'", specifier = "==26.1.2" },
+    { name = "nv-ingest-api", marker = "extra == 'ingest'", specifier = "==26.1.2" },
+    { name = "nv-ingest-client", marker = "extra == 'all'", specifier = "==26.1.2" },
+    { name = "nv-ingest-client", marker = "extra == 'ingest'", specifier = "==26.1.2" },
     { name = "opentelemetry-api", marker = "extra == 'all'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-api", marker = "extra == 'ingest'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-api", marker = "extra == 'rag'", specifier = ">=1.29,<2.0" },
@@ -2094,8 +2181,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "coloredlogs" },
     { name = "flatbuffers" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "packaging" },
     { name = "protobuf" },
     { name = "sympy" },
@@ -2129,6 +2215,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b5/df/c306f7375d42bafb379934c2df4c2fa3964656c8c782bac75ee10c102818/openai-2.15.0-py3-none-any.whl", hash = "sha256:6ae23b932cd7230f7244e52954daa6602716d6b9bf235401a107af731baea6c3", size = 1067879, upload-time = "2026-01-09T22:10:06.446Z" },
 ]
 
+[[package]]
+name = "openevals"
+version = "0.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "langchain" },
+    { name = "langchain-openai" },
+    { name = "langsmith" },
+    { name = "rich" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d4/37/31e23ef661fa4c3c6a3c979afd884b30205512b4dde680b36d5909550500/openevals-0.1.3.tar.gz", hash = "sha256:9b00df1a7738464676aa887d4d950b77d3ef7024f6e8a54be3a83c82f485ea65", size = 100828, upload-time = "2025-12-18T04:09:03.034Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0d/68/162b0d273ffef5b0ad557ebccb790725bf94d78969702324dd5726828cf0/openevals-0.1.3-py3-none-any.whl", hash = "sha256:aed448df0cfdded732e24cda026eda065435a71ffb8c406a3ce73e590156d9f9", size = 67802, upload-time = "2025-12-18T04:09:01.59Z" },
+]
+
 [[package]]
 name = "openinference-semantic-conventions"
 version = "0.1.25"
@@ -2143,7 +2244,7 @@ name = "openpyxl"
 version = "3.1.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "et-xmlfile" },
+    { name = "et-xmlfile", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
 wheels = [
@@ -2392,8 +2493,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "alembic" },
     { name = "colorlog" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "packaging" },
     { name = "pyyaml" },
     { name = "sqlalchemy" },
@@ -2506,8 +2606,7 @@ name = "pandas"
 version = "2.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "python-dateutil" },
     { name = "pytz" },
     { name = "tzdata" },
@@ -2668,6 +2767,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl", hash = "sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31", size = 18731, upload-time = "2025-12-05T13:52:56.823Z" },
 ]
 
+[[package]]
+name = "plotly"
+version = "6.6.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "narwhals" },
+    { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/fb/41efe84970cfddefd4ccf025e2cbfafe780004555f583e93dba3dac2cdef/plotly-6.6.0.tar.gz", hash = "sha256:b897f15f3b02028d69f755f236be890ba950d0a42d7dfc619b44e2d8cea8748c", size = 7027956, upload-time = "2026-03-02T21:10:25.321Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/52/d2/c6e44dba74f17c6216ce1b56044a9b93a929f1c2d5bdaff892512b260f5e/plotly-6.6.0-py3-none-any.whl", hash = "sha256:8d6daf0f87412e0c0bfe72e809d615217ab57cc715899a1e5145135a7800d1d0", size = 9910315, upload-time = "2026-03-02T21:10:18.131Z" },
+]
+
 [[package]]
 name = "ply"
 version = "3.11"
@@ -2963,7 +3075,7 @@ wheels = [
 
 [package.optional-dependencies]
 crypto = [
-    { name = "cryptography" },
+    { name = "cryptography", marker = "python_full_version >= '3.12'" },
 ]
 
 [[package]]
@@ -2994,8 +3106,7 @@ name = "pymilvus-model"
 version = "0.3.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "onnxruntime" },
     { name = "protobuf" },
     { name = "scipy" },
@@ -3142,7 +3253,8 @@ source = { editable = "." }
 dependencies = [
     { name = "langchain-classic" },
     { name = "langgraph" },
-    { name = "nvidia-nat" },
+    { name = "nvidia-nat", version = "1.5.0a20260112", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "nvidia-nat", version = "1.5.0a20260223", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
     { name = "nvidia-nat-langchain" },
     { name = "nvidia-rag", extra = ["rag"] },
     { name = "transformers" },
@@ -3151,7 +3263,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
     { name = "langchain-classic" },
-    { name = "langgraph", specifier = ">=1.0.7" },
+    { name = "langgraph", specifier = ">=1.0.8" },
     { name = "nvidia-nat", specifier = ">=1.5.0a0,<2.0" },
     { name = "nvidia-nat-langchain", specifier = ">=1.5.0a0,<2.0" },
     { name = "nvidia-rag", extras = ["rag"], editable = "../../" },
@@ -3163,19 +3275,18 @@ name = "ragas"
 version = "0.2.15"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "appdirs" },
-    { name = "datasets" },
-    { name = "diskcache" },
-    { name = "langchain" },
-    { name = "langchain-community" },
-    { name = "langchain-core" },
-    { name = "langchain-openai" },
-    { name = "nest-asyncio" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
-    { name = "openai" },
-    { name = "pydantic" },
-    { name = "tiktoken" },
+    { name = "appdirs", marker = "python_full_version >= '3.12'" },
+    { name = "datasets", marker = "python_full_version >= '3.12'" },
+    { name = "diskcache", marker = "python_full_version >= '3.12'" },
+    { name = "langchain", marker = "python_full_version >= '3.12'" },
+    { name = "langchain-community", marker = "python_full_version >= '3.12'" },
+    { name = "langchain-core", marker = "python_full_version >= '3.12'" },
+    { name = "langchain-openai", marker = "python_full_version >= '3.12'" },
+    { name = "nest-asyncio", marker = "python_full_version >= '3.12'" },
+    { name = "numpy", marker = "python_full_version >= '3.12'" },
+    { name = "openai", marker = "python_full_version >= '3.12'" },
+    { name = "pydantic", marker = "python_full_version >= '3.12'" },
+    { name = "tiktoken", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6c/0f/04fddfa94744b1c3d8901aed8832a6b4193cc8e4886881f1bb88ff055350/ragas-0.2.15.tar.gz", hash = "sha256:2d0cd77b315a9c9c02ceb0a19ca8a48e82e1d02416587a2944ea51e6e327cd7b", size = 40867766, upload-time = "2025-04-24T16:39:28.734Z" }
 wheels = [
@@ -3438,8 +3549,7 @@ name = "scipy"
 version = "1.17.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/56/3e/9cca699f3486ce6bc12ff46dc2031f1ec8eb9ccc9a320fdaf925f1417426/scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e", size = 30396830, upload-time = "2026-01-10T21:34:23.009Z" }
 wheels = [
@@ -3569,7 +3679,7 @@ name = "sse-starlette"
 version = "3.0.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyio" },
+    { name = "anyio", marker = "python_full_version >= '3.12'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/db/3c/fa6517610dc641262b77cc7bf994ecd17465812c1b0585fe33e11be758ab/sse_starlette-3.0.3.tar.gz", hash = "sha256:88cfb08747e16200ea990c8ca876b03910a23b547ab3bd764c0d8eb81019b971", size = 21943, upload-time = "2025-10-30T18:44:20.117Z" }
 wheels = [
@@ -3703,8 +3813,7 @@ version = "5.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "huggingface-hub" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "numpy" },
     { name = "packaging" },
     { name = "pyyaml" },
     { name = "regex" },
@@ -3976,6 +4085,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
 ]
 
+[[package]]
+name = "werkzeug"
+version = "3.1.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/61/f1/ee81806690a87dab5f5653c1f146c92bc066d7f4cebc603ef88eb9e13957/werkzeug-3.1.6.tar.gz", hash = "sha256:210c6bede5a420a913956b4791a7f4d6843a43b6fcee4dfa08a65e93007d0d25", size = 864736, upload-time = "2026-02-19T15:17:18.884Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4d/ec/d58832f89ede95652fd01f4f24236af7d32b70cab2196dfcc2d2fd13c5c2/werkzeug-3.1.6-py3-none-any.whl", hash = "sha256:7ddf3357bb9564e407607f988f683d72038551200c704012bb9a4c523d42f131", size = 225166, upload-time = "2026-02-19T15:17:17.475Z" },
+]
+
 [[package]]
 name = "wikipedia"
 version = "1.4.0"
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
index 1c0a6961d..c6f3e3b01 100644
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -94,7 +94,7 @@ RUN if [ "$DOWNLOAD_LEGAL_COMPLIANCE" = "true" ] && [ -d /legal ]; then \
 
 # Production stage - NVIDIA distroless (pre-approved)
 # Updated to latest version to address CVE-2025-9230 (libssl3)
-FROM nvcr.io/nvidia/distroless/node:24-v3.1.3
+FROM nvcr.io/nvidia/distroless/node:24-v4.0.2
 
 # Copy built application and config for production preview
 WORKDIR /app/frontend
diff --git a/frontend/package.json b/frontend/package.json
index fe77b7146..bdb3500f5 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -51,5 +51,12 @@
   },
   "resolutions": {
     "@kui/foundations": "./src/assets/kui-foundations-react-external-0.504.1.tgz"
+  },
+  "pnpm": {
+    "overrides": {
+      "rollup": ">=4.59.0",
+      "minimatch@3.1.2": "3.1.4",
+      "minimatch@9.0.5": "9.0.7"
+    }
   }
 }
\ No newline at end of file
diff --git a/frontend/pnpm-lock.yaml b/frontend/pnpm-lock.yaml
index ea8207293..cc1a633fc 100644
--- a/frontend/pnpm-lock.yaml
+++ b/frontend/pnpm-lock.yaml
@@ -6,6 +6,9 @@ settings:
 
 overrides:
   '@kui/foundations': ./src/assets/kui-foundations-react-external-0.504.1.tgz
+  rollup: '>=4.59.0'
+  minimatch@3.1.2: 3.1.4
+  minimatch@9.0.5: 9.0.7
 
 importers:
 
@@ -1233,113 +1236,141 @@ packages:
   '@rolldown/pluginutils@1.0.0-beta.27':
     resolution: {integrity: sha512-+d0F4MKMCbeVUJwG96uQ4SgAznZNSq93I3V+9NHA4OpvqG8mRCpGdKmK8l/dl02h2CCDHwW2FqilnTyDcAnqjA==}
 
-  '@rollup/rollup-android-arm-eabi@4.53.3':
-    resolution: {integrity: sha512-mRSi+4cBjrRLoaal2PnqH82Wqyb+d3HsPUN/W+WslCXsZsyHa9ZeQQX/pQsZaVIWDkPcpV6jJ+3KLbTbgnwv8w==}
+  '@rollup/rollup-android-arm-eabi@4.59.0':
+    resolution: {integrity: sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg==}
     cpu: [arm]
     os: [android]
 
-  '@rollup/rollup-android-arm64@4.53.3':
-    resolution: {integrity: sha512-CbDGaMpdE9sh7sCmTrTUyllhrg65t6SwhjlMJsLr+J8YjFuPmCEjbBSx4Z/e4SmDyH3aB5hGaJUP2ltV/vcs4w==}
+  '@rollup/rollup-android-arm64@4.59.0':
+    resolution: {integrity: sha512-hZ+Zxj3SySm4A/DylsDKZAeVg0mvi++0PYVceVyX7hemkw7OreKdCvW2oQ3T1FMZvCaQXqOTHb8qmBShoqk69Q==}
     cpu: [arm64]
     os: [android]
 
-  '@rollup/rollup-darwin-arm64@4.53.3':
-    resolution: {integrity: sha512-Nr7SlQeqIBpOV6BHHGZgYBuSdanCXuw09hon14MGOLGmXAFYjx1wNvquVPmpZnl0tLjg25dEdr4IQ6GgyToCUA==}
+  '@rollup/rollup-darwin-arm64@4.59.0':
+    resolution: {integrity: sha512-W2Psnbh1J8ZJw0xKAd8zdNgF9HRLkdWwwdWqubSVk0pUuQkoHnv7rx4GiF9rT4t5DIZGAsConRE3AxCdJ4m8rg==}
     cpu: [arm64]
     os: [darwin]
 
-  '@rollup/rollup-darwin-x64@4.53.3':
-    resolution: {integrity: sha512-DZ8N4CSNfl965CmPktJ8oBnfYr3F8dTTNBQkRlffnUarJ2ohudQD17sZBa097J8xhQ26AwhHJ5mvUyQW8ddTsQ==}
+  '@rollup/rollup-darwin-x64@4.59.0':
+    resolution: {integrity: sha512-ZW2KkwlS4lwTv7ZVsYDiARfFCnSGhzYPdiOU4IM2fDbL+QGlyAbjgSFuqNRbSthybLbIJ915UtZBtmuLrQAT/w==}
     cpu: [x64]
     os: [darwin]
 
-  '@rollup/rollup-freebsd-arm64@4.53.3':
-    resolution: {integrity: sha512-yMTrCrK92aGyi7GuDNtGn2sNW+Gdb4vErx4t3Gv/Tr+1zRb8ax4z8GWVRfr3Jw8zJWvpGHNpss3vVlbF58DZ4w==}
+  '@rollup/rollup-freebsd-arm64@4.59.0':
+    resolution: {integrity: sha512-EsKaJ5ytAu9jI3lonzn3BgG8iRBjV4LxZexygcQbpiU0wU0ATxhNVEpXKfUa0pS05gTcSDMKpn3Sx+QB9RlTTA==}
     cpu: [arm64]
     os: [freebsd]
 
-  '@rollup/rollup-freebsd-x64@4.53.3':
-    resolution: {integrity: sha512-lMfF8X7QhdQzseM6XaX0vbno2m3hlyZFhwcndRMw8fbAGUGL3WFMBdK0hbUBIUYcEcMhVLr1SIamDeuLBnXS+Q==}
+  '@rollup/rollup-freebsd-x64@4.59.0':
+    resolution: {integrity: sha512-d3DuZi2KzTMjImrxoHIAODUZYoUUMsuUiY4SRRcJy6NJoZ6iIqWnJu9IScV9jXysyGMVuW+KNzZvBLOcpdl3Vg==}
     cpu: [x64]
     os: [freebsd]
 
-  '@rollup/rollup-linux-arm-gnueabihf@4.53.3':
-    resolution: {integrity: sha512-k9oD15soC/Ln6d2Wv/JOFPzZXIAIFLp6B+i14KhxAfnq76ajt0EhYc5YPeX6W1xJkAdItcVT+JhKl1QZh44/qw==}
+  '@rollup/rollup-linux-arm-gnueabihf@4.59.0':
+    resolution: {integrity: sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==}
     cpu: [arm]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-arm-musleabihf@4.53.3':
-    resolution: {integrity: sha512-vTNlKq+N6CK/8UktsrFuc+/7NlEYVxgaEgRXVUVK258Z5ymho29skzW1sutgYjqNnquGwVUObAaxae8rZ6YMhg==}
+  '@rollup/rollup-linux-arm-musleabihf@4.59.0':
+    resolution: {integrity: sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==}
     cpu: [arm]
     os: [linux]
+    libc: [musl]
 
-  '@rollup/rollup-linux-arm64-gnu@4.53.3':
-    resolution: {integrity: sha512-RGrFLWgMhSxRs/EWJMIFM1O5Mzuz3Xy3/mnxJp/5cVhZ2XoCAxJnmNsEyeMJtpK+wu0FJFWz+QF4mjCA7AUQ3w==}
+  '@rollup/rollup-linux-arm64-gnu@4.59.0':
+    resolution: {integrity: sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==}
     cpu: [arm64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-arm64-musl@4.53.3':
-    resolution: {integrity: sha512-kASyvfBEWYPEwe0Qv4nfu6pNkITLTb32p4yTgzFCocHnJLAHs+9LjUu9ONIhvfT/5lv4YS5muBHyuV84epBo/A==}
+  '@rollup/rollup-linux-arm64-musl@4.59.0':
+    resolution: {integrity: sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==}
     cpu: [arm64]
     os: [linux]
+    libc: [musl]
 
-  '@rollup/rollup-linux-loong64-gnu@4.53.3':
-    resolution: {integrity: sha512-JiuKcp2teLJwQ7vkJ95EwESWkNRFJD7TQgYmCnrPtlu50b4XvT5MOmurWNrCj3IFdyjBQ5p9vnrX4JM6I8OE7g==}
+  '@rollup/rollup-linux-loong64-gnu@4.59.0':
+    resolution: {integrity: sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==}
     cpu: [loong64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-ppc64-gnu@4.53.3':
-    resolution: {integrity: sha512-EoGSa8nd6d3T7zLuqdojxC20oBfNT8nexBbB/rkxgKj5T5vhpAQKKnD+h3UkoMuTyXkP5jTjK/ccNRmQrPNDuw==}
+  '@rollup/rollup-linux-loong64-musl@4.59.0':
+    resolution: {integrity: sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==}
+    cpu: [loong64]
+    os: [linux]
+    libc: [musl]
+
+  '@rollup/rollup-linux-ppc64-gnu@4.59.0':
+    resolution: {integrity: sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==}
     cpu: [ppc64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-riscv64-gnu@4.53.3':
-    resolution: {integrity: sha512-4s+Wped2IHXHPnAEbIB0YWBv7SDohqxobiiPA1FIWZpX+w9o2i4LezzH/NkFUl8LRci/8udci6cLq+jJQlh+0g==}
+  '@rollup/rollup-linux-ppc64-musl@4.59.0':
+    resolution: {integrity: sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==}
+    cpu: [ppc64]
+    os: [linux]
+    libc: [musl]
+
+  '@rollup/rollup-linux-riscv64-gnu@4.59.0':
+    resolution: {integrity: sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==}
     cpu: [riscv64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-riscv64-musl@4.53.3':
-    resolution: {integrity: sha512-68k2g7+0vs2u9CxDt5ktXTngsxOQkSEV/xBbwlqYcUrAVh6P9EgMZvFsnHy4SEiUl46Xf0IObWVbMvPrr2gw8A==}
+  '@rollup/rollup-linux-riscv64-musl@4.59.0':
+    resolution: {integrity: sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==}
     cpu: [riscv64]
     os: [linux]
+    libc: [musl]
 
-  '@rollup/rollup-linux-s390x-gnu@4.53.3':
-    resolution: {integrity: sha512-VYsFMpULAz87ZW6BVYw3I6sWesGpsP9OPcyKe8ofdg9LHxSbRMd7zrVrr5xi/3kMZtpWL/wC+UIJWJYVX5uTKg==}
+  '@rollup/rollup-linux-s390x-gnu@4.59.0':
+    resolution: {integrity: sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==}
     cpu: [s390x]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-x64-gnu@4.53.3':
-    resolution: {integrity: sha512-3EhFi1FU6YL8HTUJZ51imGJWEX//ajQPfqWLI3BQq4TlvHy4X0MOr5q3D2Zof/ka0d5FNdPwZXm3Yyib/UEd+w==}
+  '@rollup/rollup-linux-x64-gnu@4.59.0':
+    resolution: {integrity: sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==}
     cpu: [x64]
     os: [linux]
+    libc: [glibc]
 
-  '@rollup/rollup-linux-x64-musl@4.53.3':
-    resolution: {integrity: sha512-eoROhjcc6HbZCJr+tvVT8X4fW3/5g/WkGvvmwz/88sDtSJzO7r/blvoBDgISDiCjDRZmHpwud7h+6Q9JxFwq1Q==}
+  '@rollup/rollup-linux-x64-musl@4.59.0':
+    resolution: {integrity: sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==}
     cpu: [x64]
     os: [linux]
+    libc: [musl]
+
+  '@rollup/rollup-openbsd-x64@4.59.0':
+    resolution: {integrity: sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==}
+    cpu: [x64]
+    os: [openbsd]
 
-  '@rollup/rollup-openharmony-arm64@4.53.3':
-    resolution: {integrity: sha512-OueLAWgrNSPGAdUdIjSWXw+u/02BRTcnfw9PN41D2vq/JSEPnJnVuBgw18VkN8wcd4fjUs+jFHVM4t9+kBSNLw==}
+  '@rollup/rollup-openharmony-arm64@4.59.0':
+    resolution: {integrity: sha512-tt9KBJqaqp5i5HUZzoafHZX8b5Q2Fe7UjYERADll83O4fGqJ49O1FsL6LpdzVFQcpwvnyd0i+K/VSwu/o/nWlA==}
     cpu: [arm64]
     os: [openharmony]
 
-  '@rollup/rollup-win32-arm64-msvc@4.53.3':
-    resolution: {integrity: sha512-GOFuKpsxR/whszbF/bzydebLiXIHSgsEUp6M0JI8dWvi+fFa1TD6YQa4aSZHtpmh2/uAlj/Dy+nmby3TJ3pkTw==}
+  '@rollup/rollup-win32-arm64-msvc@4.59.0':
+    resolution: {integrity: sha512-V5B6mG7OrGTwnxaNUzZTDTjDS7F75PO1ae6MJYdiMu60sq0CqN5CVeVsbhPxalupvTX8gXVSU9gq+Rx1/hvu6A==}
     cpu: [arm64]
     os: [win32]
 
-  '@rollup/rollup-win32-ia32-msvc@4.53.3':
-    resolution: {integrity: sha512-iah+THLcBJdpfZ1TstDFbKNznlzoxa8fmnFYK4V67HvmuNYkVdAywJSoteUszvBQ9/HqN2+9AZghbajMsFT+oA==}
+  '@rollup/rollup-win32-ia32-msvc@4.59.0':
+    resolution: {integrity: sha512-UKFMHPuM9R0iBegwzKF4y0C4J9u8C6MEJgFuXTBerMk7EJ92GFVFYBfOZaSGLu6COf7FxpQNqhNS4c4icUPqxA==}
     cpu: [ia32]
     os: [win32]
 
-  '@rollup/rollup-win32-x64-gnu@4.53.3':
-    resolution: {integrity: sha512-J9QDiOIZlZLdcot5NXEepDkstocktoVjkaKUtqzgzpt2yWjGlbYiKyp05rWwk4nypbYUNoFAztEgixoLaSETkg==}
+  '@rollup/rollup-win32-x64-gnu@4.59.0':
+    resolution: {integrity: sha512-laBkYlSS1n2L8fSo1thDNGrCTQMmxjYY5G0WFWjFFYZkKPjsMBsgJfGf4TLxXrF6RyhI60L8TMOjBMvXiTcxeA==}
     cpu: [x64]
     os: [win32]
 
-  '@rollup/rollup-win32-x64-msvc@4.53.3':
-    resolution: {integrity: sha512-UhTd8u31dXadv0MopwGgNOBpUVROFKWVQgAg5N1ESyCz8AuBcMqm4AuTjrwgQKGDfoFuz02EuMRHQIw/frmYKQ==}
+  '@rollup/rollup-win32-x64-msvc@4.59.0':
+    resolution: {integrity: sha512-2HRCml6OztYXyJXAvdDXPKcawukWY2GpR5/nxKp4iBgiO3wcoEGkAaqctIbZcNB6KlUQBIqt8VYkNSj2397EfA==}
     cpu: [x64]
     os: [win32]
 
@@ -1381,24 +1412,28 @@ packages:
     engines: {node: '>= 10'}
     cpu: [arm64]
     os: [linux]
+    libc: [glibc]
 
   '@tailwindcss/oxide-linux-arm64-musl@4.1.17':
     resolution: {integrity: sha512-HvZLfGr42i5anKtIeQzxdkw/wPqIbpeZqe7vd3V9vI3RQxe3xU1fLjss0TjyhxWcBaipk7NYwSrwTwK1hJARMg==}
     engines: {node: '>= 10'}
     cpu: [arm64]
     os: [linux]
+    libc: [musl]
 
   '@tailwindcss/oxide-linux-x64-gnu@4.1.17':
     resolution: {integrity: sha512-M3XZuORCGB7VPOEDH+nzpJ21XPvK5PyjlkSFkFziNHGLc5d6g3di2McAAblmaSUNl8IOmzYwLx9NsE7bplNkwQ==}
     engines: {node: '>= 10'}
     cpu: [x64]
     os: [linux]
+    libc: [glibc]
 
   '@tailwindcss/oxide-linux-x64-musl@4.1.17':
     resolution: {integrity: sha512-k7f+pf9eXLEey4pBlw+8dgfJHY4PZ5qOUFDyNf7SI6lHjQ9Zt7+NcscjpwdCEbYi6FI5c2KDTDWyf2iHcCSyyQ==}
     engines: {node: '>= 10'}
     cpu: [x64]
     os: [linux]
+    libc: [musl]
 
   '@tailwindcss/oxide-wasm32-wasi@4.1.17':
     resolution: {integrity: sha512-cEytGqSSoy7zK4JRWiTCx43FsKP/zGr0CsuMawhH67ONlH+T79VteQeJQRO/X7L0juEUA8ZyuYikcRBf0vsxhg==}
@@ -1677,6 +1712,10 @@ packages:
   balanced-match@1.0.2:
     resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==}
 
+  balanced-match@4.0.4:
+    resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==}
+    engines: {node: 18 || 20 || >=22}
+
   baseline-browser-mapping@2.9.6:
     resolution: {integrity: sha512-v9BVVpOTLB59C9E7aSnmIF8h7qRsFpx+A2nugVMTszEOMcfjlZMsXRm4LF23I3Z9AJxc8ANpIvzbzONoX9VJlg==}
     hasBin: true
@@ -1684,8 +1723,9 @@ packages:
   brace-expansion@1.1.12:
     resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==}
 
-  brace-expansion@2.0.2:
-    resolution: {integrity: sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==}
+  brace-expansion@5.0.4:
+    resolution: {integrity: sha512-h+DEnpVvxmfVefa4jFbCf5HdH5YMDXRsmKflpf1pILZWRFlTbJpxeU55nJl4Smt5HQaGzg1o6RHFPJaOqnmBDg==}
+    engines: {node: 18 || 20 || >=22}
 
   browserslist@4.28.1:
     resolution: {integrity: sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==}
@@ -2135,24 +2175,28 @@ packages:
     engines: {node: '>= 12.0.0'}
     cpu: [arm64]
     os: [linux]
+    libc: [glibc]
 
   lightningcss-linux-arm64-musl@1.30.2:
     resolution: {integrity: sha512-5Vh9dGeblpTxWHpOx8iauV02popZDsCYMPIgiuw97OJ5uaDsL86cnqSFs5LZkG3ghHoX5isLgWzMs+eD1YzrnA==}
     engines: {node: '>= 12.0.0'}
     cpu: [arm64]
     os: [linux]
+    libc: [musl]
 
   lightningcss-linux-x64-gnu@1.30.2:
     resolution: {integrity: sha512-Cfd46gdmj1vQ+lR6VRTTadNHu6ALuw2pKR9lYq4FnhvgBc4zWY1EtZcAc6EffShbb1MFrIPfLDXD6Xprbnni4w==}
     engines: {node: '>= 12.0.0'}
     cpu: [x64]
     os: [linux]
+    libc: [glibc]
 
   lightningcss-linux-x64-musl@1.30.2:
     resolution: {integrity: sha512-XJaLUUFXb6/QG2lGIW6aIk6jKdtjtcffUT0NKvIqhSBY3hh9Ch+1LCeH80dR9q9LBjG3ewbDjnumefsLsP6aiA==}
     engines: {node: '>= 12.0.0'}
     cpu: [x64]
     os: [linux]
+    libc: [musl]
 
   lightningcss-win32-arm64-msvc@1.30.2:
     resolution: {integrity: sha512-FZn+vaj7zLv//D/192WFFVA0RgHawIcHqLX9xuWiQt7P0PtdFEVaxgF9rjM/IRYHQXNnk61/H/gb2Ei+kUQ4xQ==}
@@ -2214,11 +2258,11 @@ packages:
     resolution: {integrity: sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==}
     engines: {node: '>=4'}
 
-  minimatch@3.1.2:
-    resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==}
+  minimatch@3.1.4:
+    resolution: {integrity: sha512-twmL+S8+7yIsE9wsqgzU3E8/LumN3M3QELrBZ20OdmQ9jB2JvW5oZtBEmft84k/Gs5CG9mqtWc6Y9vW+JEzGxw==}
 
-  minimatch@9.0.5:
-    resolution: {integrity: sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==}
+  minimatch@9.0.7:
+    resolution: {integrity: sha512-MOwgjc8tfrpn5QQEvjijjmDVtMw2oL88ugTevzxQnzRLm6l3fVEF2gzU0kYeYYKD8C66+IdGX6peJ4MyUlUnPg==}
     engines: {node: '>=16 || 14 >=14.17'}
 
   minipass@7.1.2:
@@ -2400,8 +2444,8 @@ packages:
     resolution: {integrity: sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==}
     engines: {node: '>=4'}
 
-  rollup@4.53.3:
-    resolution: {integrity: sha512-w8GmOxZfBmKknvdXU1sdM9NHcoQejwF/4mNgj2JuEEdRaHwwF12K7e9eXn1nLZ07ad+du76mkVsyeb2rKGllsA==}
+  rollup@4.59.0:
+    resolution: {integrity: sha512-2oMpl67a3zCH9H79LeMcbDhXW/UmWG/y2zuqnF2jQq5uq9TbM9TVyXvA4+t+ne2IIkBdrLpAaRQAvo7YI/Yyeg==}
     engines: {node: '>=18.0.0', npm: '>=8.0.0'}
     hasBin: true
 
@@ -3031,7 +3075,7 @@ snapshots:
     dependencies:
       '@eslint/object-schema': 2.1.7
       debug: 4.4.3
-      minimatch: 3.1.2
+      minimatch: 3.1.4
     transitivePeerDependencies:
       - supports-color
 
@@ -3052,7 +3096,7 @@ snapshots:
       ignore: 5.3.2
       import-fresh: 3.3.1
       js-yaml: 4.1.1
-      minimatch: 3.1.2
+      minimatch: 3.1.4
       strip-json-comments: 3.1.1
     transitivePeerDependencies:
       - supports-color
@@ -3910,70 +3954,79 @@ snapshots:
 
   '@rolldown/pluginutils@1.0.0-beta.27': {}
 
-  '@rollup/rollup-android-arm-eabi@4.53.3':
+  '@rollup/rollup-android-arm-eabi@4.59.0':
+    optional: true
+
+  '@rollup/rollup-android-arm64@4.59.0':
+    optional: true
+
+  '@rollup/rollup-darwin-arm64@4.59.0':
     optional: true
 
-  '@rollup/rollup-android-arm64@4.53.3':
+  '@rollup/rollup-darwin-x64@4.59.0':
     optional: true
 
-  '@rollup/rollup-darwin-arm64@4.53.3':
+  '@rollup/rollup-freebsd-arm64@4.59.0':
     optional: true
 
-  '@rollup/rollup-darwin-x64@4.53.3':
+  '@rollup/rollup-freebsd-x64@4.59.0':
     optional: true
 
-  '@rollup/rollup-freebsd-arm64@4.53.3':
+  '@rollup/rollup-linux-arm-gnueabihf@4.59.0':
     optional: true
 
-  '@rollup/rollup-freebsd-x64@4.53.3':
+  '@rollup/rollup-linux-arm-musleabihf@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-arm-gnueabihf@4.53.3':
+  '@rollup/rollup-linux-arm64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-arm-musleabihf@4.53.3':
+  '@rollup/rollup-linux-arm64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-arm64-gnu@4.53.3':
+  '@rollup/rollup-linux-loong64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-arm64-musl@4.53.3':
+  '@rollup/rollup-linux-loong64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-loong64-gnu@4.53.3':
+  '@rollup/rollup-linux-ppc64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-ppc64-gnu@4.53.3':
+  '@rollup/rollup-linux-ppc64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-riscv64-gnu@4.53.3':
+  '@rollup/rollup-linux-riscv64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-riscv64-musl@4.53.3':
+  '@rollup/rollup-linux-riscv64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-s390x-gnu@4.53.3':
+  '@rollup/rollup-linux-s390x-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-x64-gnu@4.53.3':
+  '@rollup/rollup-linux-x64-gnu@4.59.0':
     optional: true
 
-  '@rollup/rollup-linux-x64-musl@4.53.3':
+  '@rollup/rollup-linux-x64-musl@4.59.0':
     optional: true
 
-  '@rollup/rollup-openharmony-arm64@4.53.3':
+  '@rollup/rollup-openbsd-x64@4.59.0':
     optional: true
 
-  '@rollup/rollup-win32-arm64-msvc@4.53.3':
+  '@rollup/rollup-openharmony-arm64@4.59.0':
     optional: true
 
-  '@rollup/rollup-win32-ia32-msvc@4.53.3':
+  '@rollup/rollup-win32-arm64-msvc@4.59.0':
     optional: true
 
-  '@rollup/rollup-win32-x64-gnu@4.53.3':
+  '@rollup/rollup-win32-ia32-msvc@4.59.0':
     optional: true
 
-  '@rollup/rollup-win32-x64-msvc@4.53.3':
+  '@rollup/rollup-win32-x64-gnu@4.59.0':
+    optional: true
+
+  '@rollup/rollup-win32-x64-msvc@4.59.0':
     optional: true
 
   '@tailwindcss/node@4.1.17':
@@ -4198,7 +4251,7 @@ snapshots:
       '@typescript-eslint/types': 8.49.0
       '@typescript-eslint/visitor-keys': 8.49.0
       debug: 4.4.3
-      minimatch: 9.0.5
+      minimatch: 9.0.7
       semver: 7.7.3
       tinyglobby: 0.2.15
       ts-api-utils: 2.1.0(typescript@5.8.3)
@@ -4355,6 +4408,8 @@ snapshots:
 
   balanced-match@1.0.2: {}
 
+  balanced-match@4.0.4: {}
+
   baseline-browser-mapping@2.9.6: {}
 
   brace-expansion@1.1.12:
@@ -4362,9 +4417,9 @@ snapshots:
       balanced-match: 1.0.2
       concat-map: 0.0.1
 
-  brace-expansion@2.0.2:
+  brace-expansion@5.0.4:
     dependencies:
-      balanced-match: 1.0.2
+      balanced-match: 4.0.4
 
   browserslist@4.28.1:
     dependencies:
@@ -4557,7 +4612,7 @@ snapshots:
       is-glob: 4.0.3
       json-stable-stringify-without-jsonify: 1.0.1
       lodash.merge: 4.6.2
-      minimatch: 3.1.2
+      minimatch: 3.1.4
       natural-compare: 1.4.0
       optionator: 0.9.4
     optionalDependencies:
@@ -4639,7 +4694,7 @@ snapshots:
     dependencies:
       foreground-child: 3.3.1
       jackspeak: 3.4.3
-      minimatch: 9.0.5
+      minimatch: 9.0.7
       minipass: 7.1.2
       package-json-from-dist: 1.0.1
       path-scurry: 1.11.1
@@ -4871,13 +4926,13 @@ snapshots:
 
   min-indent@1.0.1: {}
 
-  minimatch@3.1.2:
+  minimatch@3.1.4:
     dependencies:
       brace-expansion: 1.1.12
 
-  minimatch@9.0.5:
+  minimatch@9.0.7:
     dependencies:
-      brace-expansion: 2.0.2
+      brace-expansion: 5.0.4
 
   minipass@7.1.2: {}
 
@@ -5082,32 +5137,35 @@ snapshots:
 
   resolve-from@4.0.0: {}
 
-  rollup@4.53.3:
+  rollup@4.59.0:
     dependencies:
       '@types/estree': 1.0.8
     optionalDependencies:
-      '@rollup/rollup-android-arm-eabi': 4.53.3
-      '@rollup/rollup-android-arm64': 4.53.3
-      '@rollup/rollup-darwin-arm64': 4.53.3
-      '@rollup/rollup-darwin-x64': 4.53.3
-      '@rollup/rollup-freebsd-arm64': 4.53.3
-      '@rollup/rollup-freebsd-x64': 4.53.3
-      '@rollup/rollup-linux-arm-gnueabihf': 4.53.3
-      '@rollup/rollup-linux-arm-musleabihf': 4.53.3
-      '@rollup/rollup-linux-arm64-gnu': 4.53.3
-      '@rollup/rollup-linux-arm64-musl': 4.53.3
-      '@rollup/rollup-linux-loong64-gnu': 4.53.3
-      '@rollup/rollup-linux-ppc64-gnu': 4.53.3
-      '@rollup/rollup-linux-riscv64-gnu': 4.53.3
-      '@rollup/rollup-linux-riscv64-musl': 4.53.3
-      '@rollup/rollup-linux-s390x-gnu': 4.53.3
-      '@rollup/rollup-linux-x64-gnu': 4.53.3
-      '@rollup/rollup-linux-x64-musl': 4.53.3
-      '@rollup/rollup-openharmony-arm64': 4.53.3
-      '@rollup/rollup-win32-arm64-msvc': 4.53.3
-      '@rollup/rollup-win32-ia32-msvc': 4.53.3
-      '@rollup/rollup-win32-x64-gnu': 4.53.3
-      '@rollup/rollup-win32-x64-msvc': 4.53.3
+      '@rollup/rollup-android-arm-eabi': 4.59.0
+      '@rollup/rollup-android-arm64': 4.59.0
+      '@rollup/rollup-darwin-arm64': 4.59.0
+      '@rollup/rollup-darwin-x64': 4.59.0
+      '@rollup/rollup-freebsd-arm64': 4.59.0
+      '@rollup/rollup-freebsd-x64': 4.59.0
+      '@rollup/rollup-linux-arm-gnueabihf': 4.59.0
+      '@rollup/rollup-linux-arm-musleabihf': 4.59.0
+      '@rollup/rollup-linux-arm64-gnu': 4.59.0
+      '@rollup/rollup-linux-arm64-musl': 4.59.0
+      '@rollup/rollup-linux-loong64-gnu': 4.59.0
+      '@rollup/rollup-linux-loong64-musl': 4.59.0
+      '@rollup/rollup-linux-ppc64-gnu': 4.59.0
+      '@rollup/rollup-linux-ppc64-musl': 4.59.0
+      '@rollup/rollup-linux-riscv64-gnu': 4.59.0
+      '@rollup/rollup-linux-riscv64-musl': 4.59.0
+      '@rollup/rollup-linux-s390x-gnu': 4.59.0
+      '@rollup/rollup-linux-x64-gnu': 4.59.0
+      '@rollup/rollup-linux-x64-musl': 4.59.0
+      '@rollup/rollup-openbsd-x64': 4.59.0
+      '@rollup/rollup-openharmony-arm64': 4.59.0
+      '@rollup/rollup-win32-arm64-msvc': 4.59.0
+      '@rollup/rollup-win32-ia32-msvc': 4.59.0
+      '@rollup/rollup-win32-x64-gnu': 4.59.0
+      '@rollup/rollup-win32-x64-msvc': 4.59.0
       fsevents: 2.3.3
 
   rrweb-cssom@0.8.0: {}
@@ -5192,7 +5250,7 @@ snapshots:
     dependencies:
       '@istanbuljs/schema': 0.1.3
       glob: 10.5.0
-      minimatch: 9.0.5
+      minimatch: 9.0.7
 
   tinybench@2.9.0: {}
 
@@ -5308,7 +5366,7 @@ snapshots:
       fdir: 6.5.0(picomatch@4.0.3)
       picomatch: 4.0.3
       postcss: 8.5.6
-      rollup: 4.53.3
+      rollup: 4.59.0
       tinyglobby: 0.2.15
     optionalDependencies:
       '@types/node': 24.10.3
diff --git a/frontend/src/store/__tests__/useSettingsStore.test.tsx b/frontend/src/store/__tests__/useSettingsStore.test.tsx
index 0d8165473..541451f20 100644
--- a/frontend/src/store/__tests__/useSettingsStore.test.tsx
+++ b/frontend/src/store/__tests__/useSettingsStore.test.tsx
@@ -52,7 +52,7 @@ const mockHealthResponse: HealthResponse = {
       status: 'healthy',
       latency_ms: 40,
       error: null,
-      model: 'nvidia/llama-3.2-nv-rerankqa-1b-v2',
+      model: 'nvidia/llama-nemotron-rerank-1b-v2',
       message: null,
       http_status: 200
     }
@@ -121,7 +121,7 @@ describe('useHealthInitialization', () => {
       const state = useSettingsStore.getState();
       expect(state.model).toBe('meta/llama-3.1-8b-instruct');
       expect(state.embeddingModel).toBe('nvidia/nv-embedqa-e5-v5');
-      expect(state.rerankerModel).toBe('nvidia/llama-3.2-nv-rerankqa-1b-v2');
+      expect(state.rerankerModel).toBe('nvidia/llama-nemotron-rerank-1b-v2');
     });
 
     // Verify endpoints are also set
@@ -140,7 +140,7 @@ describe('useHealthInitialization', () => {
         llmEndpoint: 'http://llm:8000',
         embeddingModel: 'nvidia/nv-embedqa-e5-v5',
         embeddingEndpoint: 'http://embeddings:8001',
-        rerankerModel: 'nvidia/llama-3.2-nv-rerankqa-1b-v2',
+        rerankerModel: 'nvidia/llama-nemotron-rerank-1b-v2',
         rerankerEndpoint: 'http://reranker:8002'
       })
     );
@@ -167,7 +167,7 @@ describe('useHealthInitialization', () => {
       expect(state.model).toBe('user-selected-llm-model');
       expect(state.embeddingModel).toBe('user-selected-embedding-model');
       // Should still populate undefined fields
-      expect(state.rerankerModel).toBe('nvidia/llama-3.2-nv-rerankqa-1b-v2');
+      expect(state.rerankerModel).toBe('nvidia/llama-nemotron-rerank-1b-v2');
     });
   });
 
diff --git a/notebooks/.env_library b/notebooks/.env_library
index a0d998752..eb5b68eb5 100644
--- a/notebooks/.env_library
+++ b/notebooks/.env_library
@@ -16,8 +16,8 @@ export MINIO_ACCESSKEY=minioadmin
 export MINIO_SECRETKEY=minioadmin
 
 # === Embedding Model specific configurations ===
-export APP_EMBEDDINGS_SERVERURL=nemoretriever-embedding-ms:8000/v1
-export APP_EMBEDDINGS_MODELNAME=nvidia/llama-3.2-nv-embedqa-1b-v2
+export APP_EMBEDDINGS_SERVERURL=nemotron-embedding-ms:8000/v1
+export APP_EMBEDDINGS_MODELNAME=nvidia/llama-nemotron-embed-1b-v2
 export APP_EMBEDDINGS_DIMENSIONS=2048
 # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
 # export APP_EMBEDDINGS_SERVERURL=localhost:9081
@@ -85,7 +85,7 @@ export ENABLE_FILTER_GENERATOR=False
 
 # === Reranking Model specific configurations ===
 export APP_RANKING_SERVERURL=localhost:1976
-export APP_RANKING_MODELNAME="nvidia/llama-3.2-nv-rerankqa-1b-v2"
+export APP_RANKING_MODELNAME="nvidia/llama-nemotron-rerank-1b-v2"
 export ENABLE_RERANKER=True
 
 # === VLM Model specific configurations ===
diff --git a/notebooks/building_rag_vdb_operator.ipynb b/notebooks/building_rag_vdb_operator.ipynb
index fec360d84..1852f1d9c 100644
--- a/notebooks/building_rag_vdb_operator.ipynb
+++ b/notebooks/building_rag_vdb_operator.ipynb
@@ -360,12 +360,12 @@
     "Ensure all the below are running and healthy before proceeding further\n",
     "```output\n",
     "NAMES                           STATUS\n",
-    "nemoretriever-ranking-ms        Up ... (healthy)\n",
+    "nemotron-ranking-ms             Up ... (healthy)\n",
     "compose-page-elements-1         Up ...\n",
     "compose-nemoretriever-ocr-1     Up ...\n",
     "compose-graphic-elements-1      Up ...\n",
     "compose-table-structure-1       Up ...\n",
-    "nemoretriever-embedding-ms      Up ... (healthy)\n",
+    "nemotron-embedding-ms           Up ... (healthy)\n",
     "nim-llm-ms                      Up ... (healthy)\n",
     "```"
    ]
@@ -390,15 +390,15 @@
     "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
     "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
     ")\n",
     "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\""
    ]
@@ -2081,13 +2081,13 @@
     "\n",
     "# IMPORTANT: Two different embedding URLs are needed:\n",
     "# 1. config_ingestor.embeddings.server_url → Used by nv-ingest (runs in Docker)\n",
-    "#    Must use Docker network hostname: nemoretriever-embedding-ms:8000\n",
+    "#    Must use Docker network hostname: nemotron-embedding-ms:8000\n",
     "# 2. embedding_model for VDB operator → Used for queries (runs locally in notebook)\n",
     "#    Must use localhost: localhost:9080\n",
     "\n",
     "if DEPLOYMENT_MODE == \"on_prem\":\n",
     "    # nv-ingest runs inside Docker, needs Docker network hostname\n",
-    "    config_ingestor.embeddings.server_url = \"http://nemoretriever-embedding-ms:8000/v1\"\n",
+    "    config_ingestor.embeddings.server_url = \"http://nemotron-embedding-ms:8000/v1\"\n",
     "if DEPLOYMENT_MODE == \"cloud\":\n",
     "    config_ingestor.embeddings.server_url = \"https://integrate.api.nvidia.com/v1\"\n",
     "    config_ingestor.llm.server_url = \"\"  # Empty uses NVIDIA API catalog\n",
diff --git a/notebooks/config.yaml b/notebooks/config.yaml
index ae9551877..f5a8eb53b 100644
--- a/notebooks/config.yaml
+++ b/notebooks/config.yaml
@@ -55,14 +55,14 @@ filter_expression_generator:
 
 # Embedding Configuration
 embeddings:
-  model_name: "nvidia/llama-3.2-nv-embedqa-1b-v2"  # Model for generating text embeddings
+  model_name: "nvidia/llama-nemotron-embed-1b-v2"  # Model for generating text embeddings
   dimensions: 2048  # Dimensionality of the embedding vectors
   server_url: "http://localhost:9080/v1"  # URL endpoint for embedding service (on-prem NIM default)
   # api_key: ""  # Optional: API key for embeddings (overrides NVIDIA_API_KEY environment variable)
 
 # Ranking Configuration
 ranking:
-  model_name: "nvidia/llama-3.2-nv-rerankqa-1b-v2"  # Model for reranking retrieved documents
+  model_name: "nvidia/llama-nemotron-rerank-1b-v2"  # Model for reranking retrieved documents
   server_url: "http://localhost:1976"  # URL endpoint for reranking service (on-prem NIM default)
   enable_reranker: true  # Enable reranking of retrieved documents before generation
   # api_key: ""  # Optional: API key for reranking (overrides NVIDIA_API_KEY environment variable)
diff --git a/notebooks/evaluation_01_ragas.ipynb b/notebooks/evaluation_01_ragas.ipynb
index 4b50aceb0..d3f12933d 100644
--- a/notebooks/evaluation_01_ragas.ipynb
+++ b/notebooks/evaluation_01_ragas.ipynb
@@ -1,550 +1,559 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "f1de2541",
-   "metadata": {},
-   "source": [
-    "# Evaluate Your RAG Pipeline with Ragas: Answer Accuracy, Context Relevancy, and Groundedness\n",
-    "\n",
-    "In this notebook, we will evaluate our RAG system using three key metrics with the [Ragas](https://docs.ragas.io/en/stable/) library. \n",
-    "\n",
-    "Ragas provides a set of metrics that you can use to evaluate the performance of your LLM application. These metrics are designed to help you objectively measure the performance of your application. \n",
-    "\n",
-    "## Evaluation Metrics\n",
-    "\n",
-    "In this notebook, we will use the following three metrics, introduced to Ragas by NVIDIA:\n",
-    "- **Answer Accuracy** – Measures the agreement between a model’s response and a reference ground truth for a given question.\n",
-    "- **Context Relevancy** – Evaluates whether the retrieved contexts (chunks or passages) are pertinent to the user input. \n",
-    "- **Response Groundedness** – Measures how well a response is supported or \"grounded\" by the retrieved contexts. It assesses whether each claim in the response can be found, either wholly or partially, in the provided contexts.\n",
-    "\n",
-    "## Prerequisites\n",
-    "\n",
-    "This notebook assumes you are familiar with the RAG system and you have both `rag-server` and `ingestor-server` up and running. If you have not done that, you can refer to [Get Started](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md) to start the RAG server.\n",
-    "\n",
-    "## 1. Download Evaluation Documents\n",
-    "\n",
-    "First, let's download the FinanceBench dataset to evaluate our RAG system. This dataset includes PDF files with information and reports about publicly traded companies, as well as ground truth question and answer pairs.\n",
-    "\n",
-    "We'll clone the repository into our data directory in a subdirectory called `financebench`. The PDFs can be found in the `pdfs` subdirectory.\n"
-   ]
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "f1de2541",
+      "metadata": {},
+      "source": [
+        "# Evaluate Your RAG Pipeline with Ragas: Answer Accuracy, Context Relevancy, and Groundedness\n",
+        "\n",
+        "In this notebook, we will evaluate our RAG system using three key metrics with the [Ragas](https://docs.ragas.io/en/stable/) library. \n",
+        "\n",
+        "Ragas provides a set of metrics that you can use to evaluate the performance of your LLM application. These metrics are designed to help you objectively measure the performance of your application. \n",
+        "\n",
+        "## Evaluation Metrics\n",
+        "\n",
+        "In this notebook, we will use the following three metrics, introduced to Ragas by NVIDIA:\n",
+        "- **Answer Accuracy** – Measures the agreement between a model’s response and a reference ground truth for a given question.\n",
+        "- **Context Relevancy** – Evaluates whether the retrieved contexts (chunks or passages) are pertinent to the user input. \n",
+        "- **Response Groundedness** – Measures how well a response is supported or \"grounded\" by the retrieved contexts. It assesses whether each claim in the response can be found, either wholly or partially, in the provided contexts.\n",
+        "\n",
+        "## Prerequisites\n",
+        "\n",
+        "This notebook assumes you are familiar with the RAG system and you have both `rag-server` and `ingestor-server` up and running. If you have not done that, you can refer to [Get Started](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md) to start the RAG server.\n",
+        "\n",
+        "## 1. Download Evaluation Documents\n",
+        "\n",
+        "First, let's download the FinanceBench dataset to evaluate our RAG system. This dataset includes PDF files with information and reports about publicly traded companies, as well as ground truth question and answer pairs.\n",
+        "\n",
+        "We'll clone the repository into our data directory in a subdirectory called `financebench`. The PDFs can be found in the `pdfs` subdirectory.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d87b89d8",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "! git clone https://github.com/patronus-ai/financebench.git ../data/financebench"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "702a5f07",
+      "metadata": {},
+      "source": [
+        "## 2. Ingest Evaluation Documents\n",
+        "\n",
+        "For evaluation, we will use the FinanceBench dataset. In the data directory, we have the PDF files for the FinanceBench dataset, as well as the `financebench_open_source.jsonl` file, which includes ground truth question and answer pairs. \n",
+        "\n",
+        "Let's start by creating a collection called `financebench` and upload the relevant documents.\n",
+        "\n",
+        "This process is similar to the `ingestion_api_usage` notebook. First, we'll install the required packages and set up our API connections."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "0b88ef79",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Installing required Python packages\n",
+        "! pip install aiohttp langchain-nvidia-ai-endpoints ragas httpx"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "11bcb3fe",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import aiohttp\n",
+        "import os\n",
+        "import json\n",
+        "import glob\n",
+        "import httpx"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "fa7a4226",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "IPADDRESS = \"ingestor-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\" # Replace this with the correct IP address\n",
+        "INGESTOR_SERVER_PORT = \"8082\"\n",
+        "INGESTOR_BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"  # Replace with your server URL\n",
+        "\n",
+        "async def print_response(response):\n",
+        "    \"\"\"Helper to print API response.\"\"\"\n",
+        "    try:\n",
+        "        response_json = await response.json()\n",
+        "        print(json.dumps(response_json, indent=2))\n",
+        "    except aiohttp.ClientResponseError:\n",
+        "        print(await response.text())\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "47cc6774",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "async def create_collection(\n",
+        "    collection_name: str = None,\n",
+        "    metadata_schema: list = []\n",
+        "):\n",
+        "    \"\"\"Create a new collection in the vector database.\"\"\"\n",
+        "    data = {\n",
+        "        \"collection_name\": collection_name,\n",
+        "        \"metadata_schema\": metadata_schema\n",
+        "    }\n",
+        "\n",
+        "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.post(f\"{INGESTOR_BASE_URL}/v1/collection\", json=data, headers=HEADERS) as response:\n",
+        "                await print_response(response)\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            return 500, {\"error\": str(e)}\n",
+        "\n",
+        "# Create the financebench collection\n",
+        "await create_collection(\n",
+        "    collection_name=\"financebench\",\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "92418e23",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Get all PDF files from the financebench directory\n",
+        "FILEPATHS = glob.glob(os.path.join(\"../data/financebench/pdfs\", \"*.pdf\"))\n",
+        "\n",
+        "async def upload_documents(collection_name: str = \"\"):\n",
+        "    \"\"\"Upload documents to the specified collection.\"\"\"\n",
+        "    data = {\n",
+        "        \"collection_name\": collection_name,\n",
+        "        \"blocking\": False,  # If True, upload is blocking; else async. Status API not needed when blocking\n",
+        "        \"split_options\": {\n",
+        "            \"chunk_size\": 512,\n",
+        "            \"chunk_overlap\": 150\n",
+        "        },\n",
+        "        \"generate_summary\": False  # Set to True to optionally generate summaries for all documents after ingestion\n",
+        "    }\n",
+        "\n",
+        "    form_data = aiohttp.FormData()\n",
+        "    \n",
+        "    # Add all PDF files to the form data\n",
+        "    for file_path in FILEPATHS:\n",
+        "        form_data.add_field(\"documents\", open(file_path, \"rb\"), filename=os.path.basename(file_path), content_type=\"application/pdf\")\n",
+        "\n",
+        "    form_data.add_field(\"data\", json.dumps(data), content_type=\"application/json\")\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.post(f\"{INGESTOR_BASE_URL}/v1/documents\", data=form_data) as response: # Replace with session.patch for reingesting\n",
+        "                await print_response(response)\n",
+        "                # Return the response JSON for task_id extraction\n",
+        "                response_json = await response.json()\n",
+        "                return response_json\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            print(f\"Error uploading documents: {e}\")\n",
+        "            return None\n",
+        "\n",
+        "# Store the response and extract task_id\n",
+        "upload_response = await upload_documents(collection_name=\"financebench\")\n",
+        "task_id = upload_response.get(\"task_id\") if upload_response else None\n",
+        "print(f\"Extracted task_id: {task_id}\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "798b7771",
+      "metadata": {},
+      "source": [
+        "**⚠️ Note**: During the document ingestion process, two files (`INTEL_2023_8K_dated-2023-08-16.pdf` and `INTEL_2023_8K_dated-2023-02-10.pdf`) may fail to process due to formatting issues. This is expected and can be safely ignored, as it will not affect the evaluation methodology or results. The remaining documents in the dataset are sufficient for comprehensive evaluation."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "82b3e199",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# This might take a few minutes to complete depending on the number of documents uploaded\n",
+        "async def get_task_status(\n",
+        "    task_id: str\n",
+        "):\n",
+        "\n",
+        "    params = {\n",
+        "        \"task_id\": task_id,\n",
+        "    }\n",
+        "\n",
+        "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.get(f\"{INGESTOR_BASE_URL}/v1/status\", params=params, headers=HEADERS) as response:\n",
+        "                await print_response(response)\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            return 500, {\"error\": str(e)}\n",
+        "\n",
+        "# Use the extracted task_id from the upload_documents response\n",
+        "if task_id:\n",
+        "    await get_task_status(task_id=task_id)\n",
+        "else:\n",
+        "    print(\"No task_id available. Please run the upload_documents cell first.\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "8bb5edff",
+      "metadata": {},
+      "source": [
+        "## 3. Create Dataset for Ragas Evaluation\n",
+        "\n",
+        "In `data/financebench/data`, there is a file called `financebench_open_source.jsonl`. This file contains questions about the PDFs, as well as corresponding ground truth answers.\n",
+        "\n",
+        "For each ground-truth question and answer pair, we will:\n",
+        "1. Generate an answer from our RAG system\n",
+        "2. Retrieve the relevant document contexts\n",
+        "3. Create a dataset suitable for Ragas evaluation\n",
+        "\n",
+        "The answer and context retrieval from the RAG system is similar to the `retriever_api_usage` notebook.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "b96c09f1",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "IPADDRESS = \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\" #Replace this with the correct IP address\n",
+        "RAG_SERVER_PORT = \"8081\"\n",
+        "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"  # Replace with your server URL\n",
+        "\n",
+        "generate_url = f\"{RAG_BASE_URL}/v1/generate\"\n",
+        "\n",
+        "async def generate_answer(payload):\n",
+        "    \"\"\"Generate an answer using the RAG server.\"\"\"\n",
+        "    rag_response = \"\"\n",
+        "    citations = []\n",
+        "    is_first_token = True\n",
+        "\n",
+        "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
+        "        try:\n",
+        "            async with client.stream(\"POST\", url=generate_url, json=payload) as response:\n",
+        "                # Raise an exception for bad status codes like 4xx or 5xx\n",
+        "                response.raise_for_status()\n",
+        "\n",
+        "                # iterate over the response lines\n",
+        "                async for line in response.aiter_lines():\n",
+        "                    if line.startswith(\"data: \"):\n",
+        "                        json_str = line[6:].strip()\n",
+        "                        if not json_str:\n",
+        "                            continue\n",
+        "\n",
+        "                        try:\n",
+        "                            data = json.loads(json_str)\n",
+        "\n",
+        "                            # --- Extract the response from the RAG server ---\n",
+        "                            message = data.get(\"choices\", [{}])[0].get(\"message\", {}).get(\"content\", \"\")\n",
+        "                            if message:\n",
+        "                                rag_response += message\n",
+        "\n",
+        "                            # --- Extract the citations from the RAG server ---\n",
+        "                            if is_first_token and data.get(\"citations\"):\n",
+        "                                for result in data.get(\"citations\", {}).get(\"results\", []):\n",
+        "                                    description = result.get(\"metadata\", {}).get(\"description\")\n",
+        "                                    if description:\n",
+        "                                        citations.append(description)\n",
+        "                                is_first_token = False\n",
+        "\n",
+        "                            finish_reason = data.get(\"choices\", [{}])[0].get(\"finish_reason\")\n",
+        "                            if finish_reason == \"stop\":\n",
+        "                                return rag_response, citations\n",
+        "\n",
+        "                        except json.JSONDecodeError:\n",
+        "                            print(f\"Skipping malformed JSON line: {json_str}\")\n",
+        "                            continue\n",
+        "        \n",
+        "        except httpx.HTTPStatusError as e:\n",
+        "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
+        "        except httpx.RequestError as e:\n",
+        "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
+        "        except Exception as e:\n",
+        "            print(f\"An error occurred: {e}\")\n",
+        "\n",
+        "    return rag_response, citations\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "805c5744",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Load the question and ground-truth answer pairs from the FinanceBench dataset\n",
+        "with open('../data/financebench/data/financebench_open_source.jsonl', 'r') as file:\n",
+        "    gt_qa_pairs = [json.loads(line) for line in file]\n",
+        "\n",
+        "print(f\"Loaded {len(gt_qa_pairs)} question-answer pairs from FinanceBench dataset\")\n",
+        "\n",
+        "dataset = []\n",
+        "\n",
+        "# For the purposes of keeping this demo brief, we will only evaluate on 50 questions. \n",
+        "# You can increase this to the full dataset for more comprehensive results.\n",
+        "n = 50 \n",
+        "print(f\"Evaluating on {n} questions...\")\n",
+        "\n",
+        "for idx, qa_pair in enumerate(gt_qa_pairs[:n]):\n",
+        "    question = qa_pair['question']\n",
+        "    \n",
+        "    print(f\"Processing question {idx + 1}/{n}: {question[:100]}...\")\n",
+        "\n",
+        "    generate_payload = {\n",
+        "        \"messages\": [\n",
+        "            {\n",
+        "                \"role\": \"user\",\n",
+        "                \"content\": question\n",
+        "            }\n",
+        "        ],\n",
+        "        \"use_knowledge_base\": True,\n",
+        "        \"reranker_top_k\": 2,\n",
+        "        \"vdb_top_k\": 10,\n",
+        "        \"vdb_endpoint\": \"http://milvus:19530\",\n",
+        "        \"collection_names\": [\"financebench\"],\n",
+        "        \"enable_reranker\": True,\n",
+        "        \"enable_citations\": True,\n",
+        "        \"stop\": [],\n",
+        "        \"filter_expr\": ''\n",
+        "    }\n",
+        "    \n",
+        "    rag_answer, citations = await generate_answer(generate_payload)\n",
+        "\n",
+        "    dataset.append({\n",
+        "        \"user_input\": question,\n",
+        "        \"retrieved_contexts\": citations,\n",
+        "        \"response\": rag_answer,\n",
+        "        \"reference\": qa_pair['answer'],\n",
+        "    })\n",
+        "\n",
+        "print(f\"Created dataset with {len(dataset)} entries for evaluation\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "43e68742",
+      "metadata": {},
+      "source": [
+        "\n",
+        "## 4. Evaluate with Ragas\n",
+        "\n",
+        "In this example, we will use the NVIDIA hosted endpoint for our judge model. To use this endpoint, please provide your NVIDIA API Key below. \n",
+        "\n",
+        "### Rate Limiting Considerations\n",
+        "\n",
+        "When using the public endpoint for the Judge LLM, you will likely encounter rate limit errors. We can try to reduce the number of errors by adjusting the configuration, which we do below. \n",
+        "\n",
+        "Alternatively, you can use self-hosted NIM Microservices endpoints to avoid these errors altogether. If you're using a self-hosted NIM, you do not need to provide your API Key.\n",
+        "\n",
+        "### Getting Your NVIDIA API Key\n",
+        "\n",
+        "To generate an API Key:\n",
+        "1. Go to [build.nvidia.com](https://build.nvidia.com/)\n",
+        "2. Click the green \"Get API Key\" button in the top right corner\n",
+        "3. Paste your key below to save it as an environment variable\n",
+        "\n",
+        "### Self-Hosted Option\n",
+        "\n",
+        "To deploy the Judge LLM as a NIM on your own infrastructure, follow the instructions [here](https://build.nvidia.com/openai/gpt-oss-120b/deploy).\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "32df51d0",
+      "metadata": {},
+      "source": [
+        "Note: Mixtral 8x22b is the preferred model if you have the required compute available. You can deploy it by following the steps [here](https://build.nvidia.com/mistralai/mixtral-8x22b-instruct/deploy)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "31df3819",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "from getpass import getpass\n",
+        "# del os.environ['NVIDIA_API_KEY']  ## delete key and reset if needed\n",
+        "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+        "    print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n",
+        "else:\n",
+        "    candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n",
+        "    assert candidate_api_key.startswith(\"nvapi-\"), (\n",
+        "        f\"{candidate_api_key[:5]}... is not a valid key\"\n",
+        "    )\n",
+        "    os.environ[\"NVIDIA_API_KEY\"] = candidate_api_key"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "78fb75fe",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "\n",
+        "# Note: Models on build.nvidia.com are rate limited.\n",
+        "# To avoid rate-limit issues, either deploy the judge model locally (self-hosted NIM)\n",
+        "# or use any OpenAI-compatible LLM as the judge for evaluation.\n",
+        "from langchain_nvidia_ai_endpoints.chat_models import ChatNVIDIA\n",
+        "\n",
+        "# Initialize the judge LLM for evaluation\n",
+        "# You can use any other model by creating a ChatNVIDIA object with a different model id\n",
+        "llm = ChatNVIDIA(model=\"openai/gpt-oss-120b\") # For using NVIDIA hosted endpoint\n",
+        "# llm = ChatNVIDIA(model=\"openai/gpt-oss-120b\", base_url=\"http://0.0.0.0:8000/v1\") # If using self-hosted NIM"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "928a3c8a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Create the evaluation dataset from our collected data\n",
+        "from ragas import EvaluationDataset\n",
+        "\n",
+        "evaluation_dataset = EvaluationDataset.from_list(dataset)\n",
+        "print(f\"Created evaluation dataset with {len(evaluation_dataset)} samples\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "b3ec24f4",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Import the required metrics and evaluation components\n",
+        "from ragas.metrics import AnswerAccuracy, ContextRelevance, ResponseGroundedness\n",
+        "from ragas import evaluate\n",
+        "from ragas.llms import LangchainLLMWrapper\n",
+        "\n",
+        "# Wrap the LLM for use with Ragas\n",
+        "evaluator_llm = LangchainLLMWrapper(llm)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "9f2f4245",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from ragas.run_config import RunConfig\n",
+        "\n",
+        "custom_config = RunConfig(max_workers=1, max_wait=120)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "3a3571af",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Run the evaluation with our three metrics\n",
+        "print(\"Starting Ragas evaluation...\")\n",
+        "print(\"This may take several minutes depending on the dataset size.\")\n",
+        "\n",
+        "results = evaluate(\n",
+        "    dataset=evaluation_dataset,\n",
+        "    metrics=[AnswerAccuracy(), ContextRelevance(), ResponseGroundedness()],\n",
+        "    llm=evaluator_llm, \n",
+        "    run_config=custom_config\n",
+        ")\n",
+        "\n",
+        "print(\"Evaluation completed!\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "bac9dde6",
+      "metadata": {},
+      "source": [
+        "## 5. Analyze Results\n",
+        "\n",
+        "Finally, let's examine our evaluation results. We'll look at both the overall metrics and individual sample performance."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "4c90647f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "results"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "2da683a1",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Convert results to pandas DataFrame for detailed analysis of individual queries\n",
+        "results_df = results.to_pandas()\n",
+        "\n",
+        "import pandas as pd\n",
+        "\n",
+        "# 1. Set the option to display ALL columns, preventing the '...'\n",
+        "pd.set_option('display.max_columns', None)\n",
+        "\n",
+        "# 2. To prevent long text in cells from being cut off, you can set the column width\n",
+        "pd.set_option('display.max_colwidth', 80)\n",
+        "\n",
+        "results_df.head()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "evaluate",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.9"
+    }
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d87b89d8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! git clone https://github.com/patronus-ai/financebench.git ../data/financebench"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "702a5f07",
-   "metadata": {},
-   "source": [
-    "## 2. Ingest Evaluation Documents\n",
-    "\n",
-    "For evaluation, we will use the FinanceBench dataset. In the data directory, we have the PDF files for the FinanceBench dataset, as well as the `financebench_open_source.jsonl` file, which includes ground truth question and answer pairs. \n",
-    "\n",
-    "Let's start by creating a collection called `financebench` and upload the relevant documents.\n",
-    "\n",
-    "This process is similar to the `ingestion_api_usage` notebook. First, we'll install the required packages and set up our API connections."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0b88ef79",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Installing required Python packages\n",
-    "! pip install aiohttp langchain-nvidia-ai-endpoints ragas httpx"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "11bcb3fe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import aiohttp\n",
-    "import os\n",
-    "import json\n",
-    "import glob\n",
-    "import httpx"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fa7a4226",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "IPADDRESS = \"ingestor-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\" # Replace this with the correct IP address\n",
-    "INGESTOR_SERVER_PORT = \"8082\"\n",
-    "INGESTOR_BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"  # Replace with your server URL\n",
-    "\n",
-    "async def print_response(response):\n",
-    "    \"\"\"Helper to print API response.\"\"\"\n",
-    "    try:\n",
-    "        response_json = await response.json()\n",
-    "        print(json.dumps(response_json, indent=2))\n",
-    "    except aiohttp.ClientResponseError:\n",
-    "        print(await response.text())\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "47cc6774",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "async def create_collection(\n",
-    "    collection_name: str = None,\n",
-    "    metadata_schema: list = []\n",
-    "):\n",
-    "    \"\"\"Create a new collection in the vector database.\"\"\"\n",
-    "    data = {\n",
-    "        \"collection_name\": collection_name,\n",
-    "        \"metadata_schema\": metadata_schema\n",
-    "    }\n",
-    "\n",
-    "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.post(f\"{INGESTOR_BASE_URL}/v1/collection\", json=data, headers=HEADERS) as response:\n",
-    "                await print_response(response)\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            return 500, {\"error\": str(e)}\n",
-    "\n",
-    "# Create the financebench collection\n",
-    "await create_collection(\n",
-    "    collection_name=\"financebench\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "92418e23",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get all PDF files from the financebench directory\n",
-    "FILEPATHS = glob.glob(os.path.join(\"../data/financebench/pdfs\", \"*.pdf\"))\n",
-    "\n",
-    "async def upload_documents(collection_name: str = \"\"):\n",
-    "    \"\"\"Upload documents to the specified collection.\"\"\"\n",
-    "    data = {\n",
-    "        \"collection_name\": collection_name,\n",
-    "        \"blocking\": False,  # If True, upload is blocking; else async. Status API not needed when blocking\n",
-    "        \"split_options\": {\n",
-    "            \"chunk_size\": 512,\n",
-    "            \"chunk_overlap\": 150\n",
-    "        },\n",
-    "        \"generate_summary\": False  # Set to True to optionally generate summaries for all documents after ingestion\n",
-    "    }\n",
-    "\n",
-    "    form_data = aiohttp.FormData()\n",
-    "    \n",
-    "    # Add all PDF files to the form data\n",
-    "    for file_path in FILEPATHS:\n",
-    "        form_data.add_field(\"documents\", open(file_path, \"rb\"), filename=os.path.basename(file_path), content_type=\"application/pdf\")\n",
-    "\n",
-    "    form_data.add_field(\"data\", json.dumps(data), content_type=\"application/json\")\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.post(f\"{INGESTOR_BASE_URL}/v1/documents\", data=form_data) as response: # Replace with session.patch for reingesting\n",
-    "                await print_response(response)\n",
-    "                # Return the response JSON for task_id extraction\n",
-    "                response_json = await response.json()\n",
-    "                return response_json\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            print(f\"Error uploading documents: {e}\")\n",
-    "            return None\n",
-    "\n",
-    "# Store the response and extract task_id\n",
-    "upload_response = await upload_documents(collection_name=\"financebench\")\n",
-    "task_id = upload_response.get(\"task_id\") if upload_response else None\n",
-    "print(f\"Extracted task_id: {task_id}\")\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "798b7771",
-   "metadata": {},
-   "source": [
-    "**⚠️ Note**: During the document ingestion process, two files (`INTEL_2023_8K_dated-2023-08-16.pdf` and `INTEL_2023_8K_dated-2023-02-10.pdf`) may fail to process due to formatting issues. This is expected and can be safely ignored, as it will not affect the evaluation methodology or results. The remaining documents in the dataset are sufficient for comprehensive evaluation."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "82b3e199",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# This might take a few minutes to complete depending on the number of documents uploaded\n",
-    "async def get_task_status(\n",
-    "    task_id: str\n",
-    "):\n",
-    "\n",
-    "    params = {\n",
-    "        \"task_id\": task_id,\n",
-    "    }\n",
-    "\n",
-    "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.get(f\"{INGESTOR_BASE_URL}/v1/status\", params=params, headers=HEADERS) as response:\n",
-    "                await print_response(response)\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            return 500, {\"error\": str(e)}\n",
-    "\n",
-    "# Use the extracted task_id from the upload_documents response\n",
-    "if task_id:\n",
-    "    await get_task_status(task_id=task_id)\n",
-    "else:\n",
-    "    print(\"No task_id available. Please run the upload_documents cell first.\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8bb5edff",
-   "metadata": {},
-   "source": [
-    "## 3. Create Dataset for Ragas Evaluation\n",
-    "\n",
-    "In `data/financebench/data`, there is a file called `financebench_open_source.jsonl`. This file contains questions about the PDFs, as well as corresponding ground truth answers.\n",
-    "\n",
-    "For each ground-truth question and answer pair, we will:\n",
-    "1. Generate an answer from our RAG system\n",
-    "2. Retrieve the relevant document contexts\n",
-    "3. Create a dataset suitable for Ragas evaluation\n",
-    "\n",
-    "The answer and context retrieval from the RAG system is similar to the `retriever_api_usage` notebook.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b96c09f1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "IPADDRESS = \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\" #Replace this with the correct IP address\n",
-    "RAG_SERVER_PORT = \"8081\"\n",
-    "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"  # Replace with your server URL\n",
-    "\n",
-    "generate_url = f\"{RAG_BASE_URL}/v1/generate\"\n",
-    "\n",
-    "async def generate_answer(payload):\n",
-    "    \"\"\"Generate an answer using the RAG server.\"\"\"\n",
-    "    rag_response = \"\"\n",
-    "    citations = []\n",
-    "    is_first_token = True\n",
-    "\n",
-    "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
-    "        try:\n",
-    "            async with client.stream(\"POST\", url=generate_url, json=payload) as response:\n",
-    "                # Raise an exception for bad status codes like 4xx or 5xx\n",
-    "                response.raise_for_status()\n",
-    "\n",
-    "                # iterate over the response lines\n",
-    "                async for line in response.aiter_lines():\n",
-    "                    if line.startswith(\"data: \"):\n",
-    "                        json_str = line[6:].strip()\n",
-    "                        if not json_str:\n",
-    "                            continue\n",
-    "\n",
-    "                        try:\n",
-    "                            data = json.loads(json_str)\n",
-    "\n",
-    "                            # --- Extract the response from the RAG server ---\n",
-    "                            message = data.get(\"choices\", [{}])[0].get(\"message\", {}).get(\"content\", \"\")\n",
-    "                            if message:\n",
-    "                                rag_response += message\n",
-    "\n",
-    "                            # --- Extract the citations from the RAG server ---\n",
-    "                            if is_first_token and data.get(\"citations\"):\n",
-    "                                for result in data.get(\"citations\", {}).get(\"results\", []):\n",
-    "                                    description = result.get(\"metadata\", {}).get(\"description\")\n",
-    "                                    if description:\n",
-    "                                        citations.append(description)\n",
-    "                                is_first_token = False\n",
-    "\n",
-    "                            finish_reason = data.get(\"choices\", [{}])[0].get(\"finish_reason\")\n",
-    "                            if finish_reason == \"stop\":\n",
-    "                                return rag_response, citations\n",
-    "\n",
-    "                        except json.JSONDecodeError:\n",
-    "                            print(f\"Skipping malformed JSON line: {json_str}\")\n",
-    "                            continue\n",
-    "        \n",
-    "        except httpx.HTTPStatusError as e:\n",
-    "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
-    "        except httpx.RequestError as e:\n",
-    "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"An error occurred: {e}\")\n",
-    "\n",
-    "    return rag_response, citations\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "805c5744",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load the question and ground-truth answer pairs from the FinanceBench dataset\n",
-    "with open('../data/financebench/data/financebench_open_source.jsonl', 'r') as file:\n",
-    "    gt_qa_pairs = [json.loads(line) for line in file]\n",
-    "\n",
-    "print(f\"Loaded {len(gt_qa_pairs)} question-answer pairs from FinanceBench dataset\")\n",
-    "\n",
-    "dataset = []\n",
-    "\n",
-    "# For the purposes of keeping this demo brief, we will only evaluate on 50 questions. \n",
-    "# You can increase this to the full dataset for more comprehensive results.\n",
-    "n = 50 \n",
-    "print(f\"Evaluating on {n} questions...\")\n",
-    "\n",
-    "for idx, qa_pair in enumerate(gt_qa_pairs[:n]):\n",
-    "    question = qa_pair['question']\n",
-    "    \n",
-    "    print(f\"Processing question {idx + 1}/{n}: {question[:100]}...\")\n",
-    "\n",
-    "    generate_payload = {\n",
-    "        \"messages\": [\n",
-    "            {\n",
-    "                \"role\": \"user\",\n",
-    "                \"content\": question\n",
-    "            }\n",
-    "        ],\n",
-    "        \"use_knowledge_base\": True,\n",
-    "        \"reranker_top_k\": 2,\n",
-    "        \"vdb_top_k\": 10,\n",
-    "        \"vdb_endpoint\": \"http://milvus:19530\",\n",
-    "        \"collection_names\": [\"financebench\"],\n",
-    "        \"enable_reranker\": True,\n",
-    "        \"enable_citations\": True,\n",
-    "        \"stop\": [],\n",
-    "        \"filter_expr\": ''\n",
-    "    }\n",
-    "    \n",
-    "    rag_answer, citations = await generate_answer(generate_payload)\n",
-    "\n",
-    "    dataset.append({\n",
-    "        \"user_input\": question,\n",
-    "        \"retrieved_contexts\": citations,\n",
-    "        \"response\": rag_answer,\n",
-    "        \"reference\": qa_pair['answer'],\n",
-    "    })\n",
-    "\n",
-    "print(f\"Created dataset with {len(dataset)} entries for evaluation\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "43e68742",
-   "metadata": {},
-   "source": [
-    "\n",
-    "## 4. Evaluate with Ragas\n",
-    "\n",
-    "In this example, we will use the NVIDIA hosted endpoint for our judge model. To use this endpoint, please provide your NVIDIA API Key below. \n",
-    "\n",
-    "### Rate Limiting Considerations\n",
-    "\n",
-    "When using the public endpoint for the Judge LLM, you will likely encounter rate limit errors. We can try to reduce the number of errors by adjusting the configuration, which we do below. \n",
-    "\n",
-    "Alternatively, you can use self-hosted NIM Microservices endpoints to avoid these errors altogether. If you're using a self-hosted NIM, you do not need to provide your API Key.\n",
-    "\n",
-    "### Getting Your NVIDIA API Key\n",
-    "\n",
-    "To generate an API Key:\n",
-    "1. Go to [build.nvidia.com](https://build.nvidia.com/)\n",
-    "2. Click the green \"Get API Key\" button in the top right corner\n",
-    "3. Paste your key below to save it as an environment variable\n",
-    "\n",
-    "### Self-Hosted Option\n",
-    "\n",
-    "To deploy the Judge LLM as a NIM on your own infrastructure, follow the instructions [here](https://build.nvidia.com/mistralai/mixtral-8x22b-instruct/deploy).\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "31df3819",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "from getpass import getpass\n",
-    "# del os.environ['NVIDIA_API_KEY']  ## delete key and reset if needed\n",
-    "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
-    "    print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n",
-    "else:\n",
-    "    candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n",
-    "    assert candidate_api_key.startswith(\"nvapi-\"), (\n",
-    "        f\"{candidate_api_key[:5]}... is not a valid key\"\n",
-    "    )\n",
-    "    os.environ[\"NVIDIA_API_KEY\"] = candidate_api_key"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "78fb75fe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "# Note: Models on build.nvidia.com are rate limited.\n",
-    "# To avoid rate-limit issues, either deploy the judge model locally (self-hosted NIM)\n",
-    "# or use any OpenAI-compatible LLM as the judge for evaluation.\n",
-    "from langchain_nvidia_ai_endpoints.chat_models import ChatNVIDIA\n",
-    "\n",
-    "# Initialize the judge LLM for evaluation\n",
-    "# You can use any other model by creating Chat Model object\n",
-    "llm = ChatNVIDIA(model=\"openai/gpt-oss-120b\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "928a3c8a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create the evaluation dataset from our collected data\n",
-    "from ragas import EvaluationDataset\n",
-    "\n",
-    "evaluation_dataset = EvaluationDataset.from_list(dataset)\n",
-    "print(f\"Created evaluation dataset with {len(evaluation_dataset)} samples\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b3ec24f4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import the required metrics and evaluation components\n",
-    "from ragas.metrics import AnswerAccuracy, ContextRelevance, ResponseGroundedness\n",
-    "from ragas import evaluate\n",
-    "from ragas.llms import LangchainLLMWrapper\n",
-    "\n",
-    "# Wrap the LLM for use with Ragas\n",
-    "evaluator_llm = LangchainLLMWrapper(llm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9f2f4245",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from ragas.run_config import RunConfig\n",
-    "\n",
-    "custom_config = RunConfig(max_workers=1, max_wait=120)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3a3571af",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run the evaluation with our three metrics\n",
-    "print(\"Starting Ragas evaluation...\")\n",
-    "print(\"This may take several minutes depending on the dataset size.\")\n",
-    "\n",
-    "results = evaluate(\n",
-    "    dataset=evaluation_dataset,\n",
-    "    metrics=[AnswerAccuracy(), ContextRelevance(), ResponseGroundedness()],\n",
-    "    llm=evaluator_llm, \n",
-    "    run_config=custom_config\n",
-    ")\n",
-    "\n",
-    "print(\"Evaluation completed!\")\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "bac9dde6",
-   "metadata": {},
-   "source": [
-    "## 5. Analyze Results\n",
-    "\n",
-    "Finally, let's examine our evaluation results. We'll look at both the overall metrics and individual sample performance."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4c90647f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "results"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2da683a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Convert results to pandas DataFrame for detailed analysis of individual queries\n",
-    "results_df = results.to_pandas()\n",
-    "\n",
-    "import pandas as pd\n",
-    "\n",
-    "# 1. Set the option to display ALL columns, preventing the '...'\n",
-    "pd.set_option('display.max_columns', None)\n",
-    "\n",
-    "# 2. To prevent long text in cells from being cut off, you can set the column width\n",
-    "pd.set_option('display.max_colwidth', 80)\n",
-    "\n",
-    "results_df.head()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "evaluate",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+  "nbformat": 4,
+  "nbformat_minor": 5
 }
diff --git a/notebooks/gt_page_mapper.py b/notebooks/gt_page_mapper.py
index 82db0458c..3ff8f3476 100644
--- a/notebooks/gt_page_mapper.py
+++ b/notebooks/gt_page_mapper.py
@@ -71,10 +71,10 @@ def get_gt_file_pages_from_dataset(dataset_contexts, dataset_path):
                 try:
                     # Read PDF and compare each page with ground truth text
                     reader = PyPDF2.PdfReader(file_path)
-                    best_match_page_number = 0  # Default to first page
+                    best_match_page_number = 1  # Default to first page (1-indexed)
                     max_similarity_ratio = 0
-                    
-                    for page_num, page in enumerate(reader.pages):
+
+                    for page_num, page in enumerate(reader.pages, start=1):
                         # Extract text from current page
                         page_text = page.extract_text()
                         
diff --git a/notebooks/image_input.ipynb b/notebooks/image_input.ipynb
index 622ec849b..5698b7985 100644
--- a/notebooks/image_input.ipynb
+++ b/notebooks/image_input.ipynb
@@ -1,992 +1,992 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "e20e694c",
-   "metadata": {},
-   "source": [
-    "# Retriever API Usage with Multimodal Query Support\n",
-    "\n",
-    "This notebook demonstrates how to use the NVIDIA RAG retriever APIs with **multimodal queries** (text + images). You'll learn how to:\n",
-    "\n",
-    "- 🔍 Search for relevant documents using queries that contain images\n",
-    "- 🤖 Generate AI responses using the end-to-end RAG API with vision-language models (VLMs)\n",
-    "- 📊 Work with multimodal embeddings and vector databases\n",
-    "\n",
-    "**Use Case**: Query documents with images (e.g., \"What is the price of this item?\" + product image)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0152f1eb",
-   "metadata": {},
-   "source": [
-    "## 📦 Setting up the Dependencies\n",
-    "\n",
-    "This section will guide you through:\n",
-    "1. Configuring your NGC API key for accessing NVIDIA services\n",
-    "2. Deploying the Milvus vector database\n",
-    "3. Setting up NVIDIA NIMs (NVIDIA Inference Microservices) for embeddings and VLM\n",
-    "4. Starting the NVIDIA Ingest runtime for document processing\n",
-    "5. Launching the RAG server\n",
-    "\n",
-    "**Note**: This setup uses Docker Compose to orchestrate all services."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d77a630e",
-   "metadata": {},
-   "source": [
-    "### 0. Create a Virtual Environment (Recommended)\n",
-    "\n",
-    "Before running this notebook, create a virtual environment using `uv` to isolate dependencies:\n",
-    "\n",
-    "```bash\n",
-    "# Create a virtual environment\n",
-    "uv venv .venv\n",
-    "\n",
-    "# Activate the virtual environment\n",
-    "source .venv/bin/activate  # Linux/macOS\n",
-    "# .venv\\Scripts\\activate   # Windows\n",
-    "\n",
-    "# Install Jupyter Lab and ipykernel (if not already installed)\n",
-    "uv pip install jupyterlab ipykernel\n",
-    "\n",
-    "# Register the venv as a Jupyter kernel\n",
-    "python -m ipykernel install --user --name=.venv --display-name=\"Python (.venv)\"\n",
-    "```\n",
-    "\n",
-    "After setup, select the venv as the kernel for this notebook:\n",
-    "1. In Jupyter/VS Code/Cursor, click on the kernel selector (top right)\n",
-    "2. Choose **\".venv\"** or **\"Python (.venv)\"** as the kernel\n",
-    "\n",
-    "This ensures all packages installed via `uv pip install` in the notebook cells are installed into the isolated environment.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c39e628e",
-   "metadata": {},
-   "source": [
-    "### 1. Setup the Default Configurations\n",
-    "\n",
-    "Import necessary libraries for environment management."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c03780a7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install python-dotenv for environment variable management\n",
-    "! uv pip install python-dotenv\n",
-    "\n",
-    "import os\n",
-    "from getpass import getpass"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a19cef7",
-   "metadata": {},
-   "source": [
-    "Provide your NGC_API_KEY after executing the cell below. You can obtain a key by following steps [here](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/quickstart.md##obtain-an-api-key)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c1f7ffa3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Check if NGC_API_KEY is already set, otherwise prompt for it\n",
-    "# Uncomment the line below to reset your API key\n",
-    "# del os.environ['NGC_API_KEY']\n",
-    "\n",
-    "if os.environ.get(\"NGC_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
-    "    print(\"Valid NGC_API_KEY already in environment. Delete to reset\")\n",
-    "else:\n",
-    "    candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n",
-    "    assert candidate_api_key.startswith(\"nvapi-\"), (\n",
-    "        f\"{candidate_api_key[:5]}... is not a valid key\"\n",
-    "    )\n",
-    "    os.environ[\"NGC_API_KEY\"] = candidate_api_key"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "20ec8b61",
-   "metadata": {},
-   "source": [
-    "Login to nvcr.io which is needed for pulling the containers of dependencies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "03972882",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Login to NVIDIA Container Registry (nvcr.io) to pull required containers\n",
-    "!echo \"${NGC_API_KEY}\" | docker login nvcr.io -u '$oauthtoken' --password-stdin"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "84642fbb",
-   "metadata": {},
-   "source": [
-    "### 2. Setup the Milvus Vector Database\n",
-    "\n",
-    "Milvus is a high-performance vector database used to store and search multimodal embeddings.\n",
-    "\n",
-    "**Configuration Notes**:\n",
-    "- By default, Milvus uses GPU indexing for faster performance\n",
-    "- Ensure you have provided the correct GPU ID below\n",
-    "- If you don't have a GPU available, you can switch to CPU-only Milvus by following the instructions in [milvus-configuration.md](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/milvus-configuration.md)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8125f717",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Specify which GPU to use for Milvus (change if using a different GPU)\n",
-    "os.environ[\"VECTORSTORE_GPU_DEVICE_ID\"] = \"0\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3e2d3457",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Start Milvus vector database service\n",
-    "# This will run in the background (-d flag)\n",
-    "!docker compose -f ../deploy/compose/vectordb.yaml up -d"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "afe17557",
-   "metadata": {},
-   "source": [
-    "### 3. Setup NVIDIA Inference Microservices (NIMs)\n",
-    "\n",
-    "NIMs provide optimized inference for AI models. For multimodal RAG, we need:\n",
-    "- **VLM (Vision-Language Model)**: `nvidia/nemotron-nano-12b-v2-vl` for understanding images and generating responses\n",
-    "- **Embedding Model**: `llama-3.2-nemoretriever-1b-vlm-embed-v1` for creating multimodal embeddings"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "89a135eb",
-   "metadata": {},
-   "source": [
-    "#### Deploy On-Premise Models\n",
-    "\n",
-    "This section deploys NIMs locally using Docker. Models will be cached to avoid re-downloading."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1b3d2e5c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create the model cache directory\n",
-    "!mkdir -p ~/.cache/model-cache"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "390df52d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Set the MODEL_DIRECTORY environment variable to specify where models are cached\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"MODEL_DIRECTORY\"] = os.path.expanduser(\"~/.cache/model-cache\")\n",
-    "print(\"MODEL_DIRECTORY set to:\", os.environ[\"MODEL_DIRECTORY\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "62a9946a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Deploy NIMs with VLM and embedding profiles\n",
-    "# ⚠️ WARNING: This may take 10-20 minutes as models download (~10GB+)\n",
-    "# If the kernel times out, just rerun this cell - it will resume where it left off\n",
-    "# Select a free GPU for VLM Microservice\n",
-    "os.environ[\"VLM_MS_GPU_ID\"] = \"1\"\n",
-    "! USERID=$(id -u) docker compose --profile vlm-ingest --profile vlm-only -f ../deploy/compose/nims.yaml up -d"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e91f511a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Monitor the status of running containers\n",
-    "# Run this cell repeatedly to check if all services are healthy\n",
-    "# Look for STATUS showing \"healthy\" or \"Up\" for all containers\n",
-    "!docker ps"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cfb34a6a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Configure the model names and service URLs for the RAG pipeline\n",
-    "# These settings tell the RAG server which models and endpoints to use\n",
-    "\n",
-    "# VLM (Vision-Language Model) configuration\n",
-    "os.environ[\"APP_VLM_MODELNAME\"] = \"nvidia/nemotron-nano-12b-v2-vl\"\n",
-    "os.environ[\"APP_VLM_SERVERURL\"] = \"http://vlm-ms:8000/v1\"\n",
-    "\n",
-    "# Multimodal embedding model configuration\n",
-    "os.environ[\"APP_EMBEDDINGS_MODELNAME\"] = \"nvidia/llama-nemotron-embed-vl-1b-v2\"\n",
-    "os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"nemotron-vlm-embedding-ms:8000/v1\"\n",
-    "os.environ[\"ENABLE_VLM_INFERENCE\"] = \"true\"\n",
-    "os.environ[\"VLM_TO_LLM_FALLBACK\"] = \"false\"\n",
-    "os.environ[\"ENABLE_RERANKER\"] = \"false\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e62c7037",
-   "metadata": {},
-   "source": [
-    "#### Cloud based deployment\n",
-    "Using NVIDIA hosted cloud model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "82084d4d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "# OCR and document processing endpoints - cloud hosted\n",
-    "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
-    "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
-    "os.environ[\"OCR_MODEL_NAME\"] = \"scene_text_ensemble\"\n",
-    "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
-    "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
-    "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
-    "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
-    "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
-    "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\"\n",
-    "os.environ[\"APP_NVINGEST_CAPTIONENDPOINTURL\"] = \"https://integrate.api.nvidia.com/v1/chat/completions\"\n",
-    "\n",
-    "# VLM Model configuration - cloud hosted\n",
-    "os.environ[\"APP_VLM_MODELNAME\"] = \"nvidia/nemotron-nano-12b-v2-vl\"\n",
-    "os.environ[\"APP_VLM_SERVERURL\"] = \"https://integrate.api.nvidia.com/v1\"\n",
-    "os.environ[\"APP_LLM_SERVERURL\"] = \"\"\n",
-    "\n",
-    "# Multimodal embedding model configuration - cloud hosted\n",
-    "os.environ[\"APP_EMBEDDINGS_MODELNAME\"] = \"nvidia/llama-nemotron-embed-vl-1b-v2\"\n",
-    "os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"https://integrate.api.nvidia.com/v1\"\n",
-    "os.environ[\"ENABLE_VLM_INFERENCE\"] = \"true\"\n",
-    "os.environ[\"VLM_TO_LLM_FALLBACK\"] = \"false\"\n",
-    "os.environ[\"ENABLE_RERANKER\"] = \"false\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7cbcfa50",
-   "metadata": {},
-   "source": [
-    "### 4. Setup NVIDIA Ingest Runtime\n",
-    "\n",
-    "NVIDIA Ingest processes documents to extract text, images, and other elements. We'll configure it to:\n",
-    "- Extract images from documents\n",
-    "- Handle multimodal content"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a5e0d73f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Configure NVIDIA Ingest to extract and process images from documents\n",
-    "os.environ[\"APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY\"] = \"\"  # No special handling for structured elements\n",
-    "os.environ[\"APP_NVINGEST_IMAGE_ELEMENTS_MODALITY\"] = \"image\"  # Process image elements as images\n",
-    "os.environ[\"APP_NVINGEST_EXTRACTIMAGES\"] = \"True\"  # Extract images from documents\n",
-    "\n",
-    "# Start the ingestor server with Redis\n",
-    "! docker compose -f ../deploy/compose/docker-compose-ingestor-server.yaml up -d --build"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "da1bd9a3",
-   "metadata": {},
-   "source": [
-    "### 5. Setup the NVIDIA RAG Server\n",
-    "\n",
-    "The RAG server provides the main API endpoints for search and generation. It orchestrates all the components (embeddings, vector DB, VLM) to deliver intelligent responses."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "38ba7752",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Start the RAG server (accessible at localhost:8081)\n",
-    "os.environ[\"APP_RANKING_SERVERURL\"] = \"\"\n",
-    "! docker compose -f ../deploy/compose/docker-compose-rag-server.yaml up -d --build"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ce492ce3",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "## 📚 Document Ingestion Workflow\n",
-    "\n",
-    "Now that all services are running, let's ingest documents into a collection.\n",
-    "\n",
-    "### 6. Create a Collection\n",
-    "\n",
-    "A collection is a logical grouping of documents in the vector database. Think of it as a database table optimized for similarity search."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a8611aa1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install aiohttp for async HTTP requests\n",
-    "! uv pip install aiohttp\n",
-    "\n",
-    "# Configure the ingestor server URL\n",
-    "# Use \"ingestor-server\" when running in AI Workbench, otherwise \"localhost\"\n",
-    "IPADDRESS = (\n",
-    "    \"ingestor-server\"\n",
-    "    if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\"\n",
-    "    else \"localhost\"\n",
-    ")\n",
-    "INGESTOR_SERVER_PORT = \"8082\"\n",
-    "BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"\n",
-    "\n",
-    "async def print_response(response):\n",
-    "    \"\"\"Helper function to pretty-print API responses.\"\"\"\n",
-    "    try:\n",
-    "        response_json = await response.json()\n",
-    "        print(json.dumps(response_json, indent=2))\n",
-    "    except aiohttp.ClientResponseError:\n",
-    "        print(await response.text())\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "688bc70f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define a unique name for your collection\n",
-    "# Change this if you want to create a different collection\n",
-    "collection_name = \"multimodal_query\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "24378f6f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import aiohttp\n",
-    "import json\n",
-    "\n",
-    "\n",
-    "async def create_collection(\n",
-    "    collection_name: str | None = None,\n",
-    "    metadata_schema: list = [],\n",
-    "):\n",
-    "    \"\"\"\n",
-    "    Create a new collection in the vector database.\n",
-    "    \n",
-    "    Args:\n",
-    "        collection_name: Unique identifier for the collection\n",
-    "        metadata_schema: Optional schema for metadata fields\n",
-    "    \"\"\"\n",
-    "    data = {\n",
-    "        \"collection_name\": collection_name,\n",
-    "        \"metadata_schema\": metadata_schema,\n",
-    "    }\n",
-    "\n",
-    "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.post(\n",
-    "                f\"{BASE_URL}/v1/collection\", json=data, headers=HEADERS\n",
-    "            ) as response:\n",
-    "                await print_response(response)\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            return 500, {\"error\": str(e)}\n",
-    "\n",
-    "\n",
-    "# Create the collection\n",
-    "# The embedding dimension is 2048 for the multimodal embedding model we're using\n",
-    "await create_collection(\n",
-    "    collection_name=collection_name,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a29f4633",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Specify the documents to upload\n",
-    "# This PDF contains product images with pricing information\n",
-    "FILEPATHS = [\n",
-    "    \"../data/multimodal/product_catalog.pdf\",\n",
-    "]\n",
-    "\n",
-    "async def upload_documents(collection_name: str = \"\"):\n",
-    "    \"\"\"\n",
-    "    Upload and process documents into the collection.\n",
-    "    \n",
-    "    This will:\n",
-    "    1. Extract text and images from the PDFs\n",
-    "    2. Chunk the content for optimal retrieval\n",
-    "    3. Generate multimodal embeddings\n",
-    "    4. Store everything in the vector database\n",
-    "    \"\"\"\n",
-    "    data = {\n",
-    "        \"collection_name\": collection_name,\n",
-    "        \"blocking\": False,  # Async upload - use status API to check progress\n",
-    "        \"split_options\": {\n",
-    "            \"chunk_size\": 512,        # Characters per chunk\n",
-    "            \"chunk_overlap\": 150      # Overlap between chunks for context\n",
-    "        },\n",
-    "        \"generate_summary\": False  # Set to True to generate document summaries\n",
-    "    }\n",
-    "\n",
-    "    form_data = aiohttp.FormData()\n",
-    "    \n",
-    "    # Add all PDF files to the form data\n",
-    "    for file_path in FILEPATHS:\n",
-    "        form_data.add_field(\"documents\", open(file_path, \"rb\"), \n",
-    "                          filename=os.path.basename(file_path), \n",
-    "                          content_type=\"application/pdf\")\n",
-    "\n",
-    "    form_data.add_field(\"data\", json.dumps(data), content_type=\"application/json\")\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            # Use POST for new uploads, PATCH for re-ingesting existing documents\n",
-    "            async with session.post(f\"{BASE_URL}/v1/documents\", data=form_data) as response:\n",
-    "                await print_response(response)\n",
-    "                response_json = await response.json()\n",
-    "                return response_json\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            print(f\"Error uploading documents: {e}\")\n",
-    "            return None\n",
-    "\n",
-    "# Upload the documents and get the task ID for tracking progress\n",
-    "upload_response = await upload_documents(collection_name=collection_name)\n",
-    "task_id = upload_response.get(\"task_id\") if upload_response else None\n",
-    "print(f\"\\nTask ID for tracking: {task_id}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2234e059",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "async def get_task_status(task_id: str):\n",
-    "    \"\"\"\n",
-    "    Check the status of an asynchronous ingestion task.\n",
-    "    \n",
-    "    Possible statuses:\n",
-    "    - \"pending\": Task is queued\n",
-    "    - \"processing\": Currently processing documents\n",
-    "    - \"completed\": Successfully finished\n",
-    "    - \"failed\": Error occurred\n",
-    "    \"\"\"\n",
-    "    params = {\n",
-    "        \"task_id\": task_id,\n",
-    "    }\n",
-    "\n",
-    "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
-    "\n",
-    "    async with aiohttp.ClientSession() as session:\n",
-    "        try:\n",
-    "            async with session.get(\n",
-    "                f\"{BASE_URL}/v1/status\", params=params, headers=HEADERS\n",
-    "            ) as response:\n",
-    "                await print_response(response)\n",
-    "        except aiohttp.ClientError as e:\n",
-    "            return 500, {\"error\": str(e)}\n",
-    "\n",
-    "\n",
-    "# Check the ingestion status\n",
-    "# Run this cell multiple times until status shows \"completed\"\n",
-    "await get_task_status(\n",
-    "    task_id=[task_id]\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e2c1f1a8",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "## 🔍 Querying with Multimodal Inputs\n",
-    "\n",
-    "Now that documents are ingested, let's query them using both text and images!\n",
-    "\n",
-    "### 7. Using the Search and Generate APIs\n",
-    "\n",
-    "We'll demonstrate two approaches:\n",
-    "1. **Search API**: Find relevant documents without generating a response\n",
-    "2. **Generate API**: Get an AI-generated answer with citations"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3990ca33",
-   "metadata": {},
-   "source": [
-    "#### Prepare a Multimodal Query\n",
-    "\n",
-    "To query with an image, we need to:\n",
-    "1. Convert the image to base64 encoding\n",
-    "2. Format it according to the OpenAI vision API format\n",
-    "3. Combine it with a text prompt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "02dde830",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! uv pip install requests httpx\n",
-    "import base64\n",
-    "import requests\n",
-    "from IPython.display import Image, Markdown, display\n",
-    "\n",
-    "def get_base64_image(image_source: str) -> str:\n",
-    "    \"\"\"\n",
-    "    Convert an image to base64 encoding.\n",
-    "    \n",
-    "    Args:\n",
-    "        image_source: Local file path or URL to the image\n",
-    "        \n",
-    "    Returns:\n",
-    "        Base64 encoded string of the image\n",
-    "    \"\"\"\n",
-    "    if image_source.startswith(('http://', 'https://')):\n",
-    "        # Download image from URL\n",
-    "        response = requests.get(image_source)\n",
-    "        return base64.b64encode(response.content).decode()\n",
-    "    else:\n",
-    "        # Read local file\n",
-    "        with open(image_source, \"rb\") as image_file:\n",
-    "            return base64.b64encode(image_file.read()).decode()\n",
-    "\n",
-    "# Convert the query image to base64\n",
-    "# Try different images to test different queries:\n",
-    "image_b64 = get_base64_image(\"../data/multimodal/Creme_clutch_purse1-small.jpg\")\n",
-    "\n",
-    "# Display the query image for reference\n",
-    "query_image_path = \"../data/multimodal/Creme_clutch_purse1-small.jpg\"\n",
-    "print(\"📷 Query Image:\")\n",
-    "display(Image(filename=query_image_path, width=300))\n",
-    "\n",
-    "# Format as a data URL\n",
-    "image_input = f\"data:image/png;base64,{image_b64}\"\n",
-    "\n",
-    "# Create the multimodal query with text + image\n",
-    "# This follows the OpenAI vision API format\n",
-    "query_1 = \"What material is this made of?\"\n",
-    "image_query = [\n",
-    "    {\"type\": \"text\", \"text\": query_1},\n",
-    "    {\n",
-    "        \"type\": \"image_url\",\n",
-    "        \"image_url\": {\n",
-    "            \"url\": image_input,\n",
-    "            \"detail\": \"auto\"  # Let the model decide the appropriate detail level\n",
-    "        }\n",
-    "    }\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c311f9d0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import httpx\n",
-    "import json\n",
-    "from IPython.display import Image, Markdown, display\n",
-    "\n",
-    "RAG_BASE_URL = \"http://localhost:8081\"\n",
-    "\n",
-    "async def search_documents(payload):\n",
-    "    \"\"\"\n",
-    "    Search for relevant documents using a multimodal query.\n",
-    "    \n",
-    "    This performs similarity search in the vector database and optionally\n",
-    "    reranks results for better relevance.\n",
-    "    \"\"\"\n",
-    "    search_url = f\"{RAG_BASE_URL}/v1/search\"\n",
-    "    \n",
-    "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
-    "        try:\n",
-    "            response = await client.post(url=search_url, json=payload)\n",
-    "            response.raise_for_status()\n",
-    "            \n",
-    "            search_results = response.json()\n",
-    "            print(\"Search Results:\")\n",
-    "            \n",
-    "            # Display search results with nice formatting\n",
-    "            if \"results\" in search_results:\n",
-    "                for idx, result in enumerate(search_results[\"results\"]):\n",
-    "                    doc_type = result.get(\"document_type\", \"text\")\n",
-    "                    content = result.get(\"content\", \"\")\n",
-    "                    doc_name = result.get(\"document_name\", f\"Result {idx + 1}\")\n",
-    "                    score = result.get(\"score\", \"N/A\")\n",
-    "                    \n",
-    "                    display(Markdown(f\"**Result {idx + 1}: {doc_name} (Score: {score})**\"))\n",
-    "                    try:\n",
-    "                        if doc_type == \"image\":\n",
-    "                            # Display image results\n",
-    "                            image_bytes = base64.b64decode(content)\n",
-    "                            display(Image(data=image_bytes))\n",
-    "                        else:\n",
-    "                            # Display text results\n",
-    "                            display(Markdown(f\"```\\n{content}\\n```\"))\n",
-    "                    except Exception as e:\n",
-    "                        print(f\"Error displaying content: {e}\")\n",
-    "                        display(Markdown(f\"```\\n{content}\\n```\"))\n",
-    "            \n",
-    "            return search_results\n",
-    "            \n",
-    "        except httpx.HTTPStatusError as e:\n",
-    "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
-    "        except httpx.RequestError as e:\n",
-    "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"An error occurred: {e}\")\n",
-    "\n",
-    "# Configure the search parameters\n",
-    "search_payload = {\n",
-    "    \"query\": image_query,                      # Our multimodal query (text + image)\n",
-    "    \"messages\": [],                            # No conversation history\n",
-    "    \"use_knowledge_base\": True,                # Search the vector database\n",
-    "    \"collection_names\": [collection_name],     # Which collection to search\n",
-    "    \"vdb_top_k\": 5,                           # Retrieve top 5 results from vector DB\n",
-    "    \"vdb_endpoint\": \"http://milvus:19530\",    # Milvus connection string\n",
-    "    \"enable_reranker\": False,                  # Set to True for better relevance (slower)\n",
-    "    \"reranker_top_k\": 3,                      # If reranker enabled, return top 3\n",
-    "    \"filter_expr\": \"\",                        # Optional metadata filter\n",
-    "}\n",
-    "\n",
-    "# Execute the search\n",
-    "print(\"🔍 Searching for documents matching the query...\\n\")\n",
-    "search_result = await search_documents(search_payload)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bc5b1545",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import base64\n",
-    "import json\n",
-    "from IPython.display import Image, Markdown, display\n",
-    "\n",
-    "\n",
-    "async def print_streaming_response_and_citations(response_generator):\n",
-    "    \"\"\"\n",
-    "    Helper function to display streaming responses with citations.\n",
-    "    \n",
-    "    This function:\n",
-    "    1. Streams the AI-generated response token by token\n",
-    "    2. Extracts citations from the first chunk\n",
-    "    3. Displays citations (text or images) after the response completes\n",
-    "    \"\"\"\n",
-    "    first_chunk_data = None\n",
-    "    \n",
-    "    async for chunk in response_generator:\n",
-    "        # Parse Server-Sent Events (SSE) format\n",
-    "        if chunk.startswith(\"data: \"):\n",
-    "            chunk = chunk[len(\"data: \") :].strip()\n",
-    "        if not chunk:\n",
-    "            continue\n",
-    "            \n",
-    "        try:\n",
-    "            data = json.loads(chunk)\n",
-    "        except Exception as e:\n",
-    "            print(f\"JSON decode error: {e}\")\n",
-    "            continue\n",
-    "            \n",
-    "        choices = data.get(\"choices\", [])\n",
-    "        if not choices:\n",
-    "            continue\n",
-    "            \n",
-    "        # Save the first chunk with citations\n",
-    "        if first_chunk_data is None and data.get(\"citations\"):\n",
-    "            first_chunk_data = data\n",
-    "            \n",
-    "        # Print streaming text\n",
-    "        delta = choices[0].get(\"delta\", {})\n",
-    "        text = delta.get(\"content\")\n",
-    "        if not text:\n",
-    "            message = choices[0].get(\"message\", {})\n",
-    "            text = message.get(\"content\", \"\")\n",
-    "        print(text, end=\"\", flush=True)\n",
-    "        \n",
-    "    print()  # Newline after streaming\n",
-    "\n",
-    "    # Display citations after streaming is done\n",
-    "    if first_chunk_data and first_chunk_data.get(\"citations\"):\n",
-    "        print(\"\\n📚 Citations:\")\n",
-    "        citations = first_chunk_data[\"citations\"]\n",
-    "        for idx, citation in enumerate(citations.get(\"results\", [])):\n",
-    "            doc_type = citation.get(\"document_type\", \"text\")\n",
-    "            content = citation.get(\"content\", \"\")\n",
-    "            doc_name = citation.get(\"document_name\", f\"Citation {idx + 1}\")\n",
-    "            display(Markdown(f\"**Citation {idx + 1}: {doc_name}**\"))\n",
-    "            try:\n",
-    "                # Try to display as image\n",
-    "                image_bytes = base64.b64decode(content)\n",
-    "                display(Image(data=image_bytes))\n",
-    "            except Exception:\n",
-    "                # Fall back to text display\n",
-    "                display(Markdown(f\"```\\n{content}\\n```\"))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "31071359",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import httpx\n",
-    "\n",
-    "# Configure RAG server URL\n",
-    "IPADDRESS = \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\"\n",
-    "RAG_SERVER_PORT = \"8081\"\n",
-    "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"\n",
-    "generate_url = f\"{RAG_BASE_URL}/v1/generate\"\n",
-    "\n",
-    "async def generate_answer(payload):\n",
-    "    \"\"\"\n",
-    "    Generate an AI answer using the RAG pipeline.\n",
-    "    \n",
-    "    This function:\n",
-    "    1. Sends the query to the RAG server\n",
-    "    2. Retrieves relevant context from the vector database\n",
-    "    3. Streams the AI-generated response\n",
-    "    4. Displays citations (sources) used to generate the answer\n",
-    "    \"\"\"\n",
-    "    rag_response = \"\"\n",
-    "    citations = []\n",
-    "    is_first_token = True\n",
-    "\n",
-    "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
-    "        try:\n",
-    "            async with client.stream(\"POST\", url=generate_url, json=payload) as response:\n",
-    "                # Raise an exception for bad status codes like 4xx or 5xx\n",
-    "                response.raise_for_status()\n",
-    "\n",
-    "                # Iterate over the streaming response\n",
-    "                async for line in response.aiter_lines():\n",
-    "                    if line.startswith(\"data: \"):\n",
-    "                        json_str = line[6:].strip()\n",
-    "                        if not json_str:\n",
-    "                            continue\n",
-    "\n",
-    "                        try:\n",
-    "                            data = json.loads(json_str)\n",
-    "\n",
-    "                            # Extract and display the streaming response\n",
-    "                            message = data.get(\"choices\", [{}])[0].get(\"message\", {}).get(\"content\", \"\")\n",
-    "                            if message:\n",
-    "                                rag_response += message\n",
-    "\n",
-    "                            # Extract and display citations from the first chunk\n",
-    "                            if is_first_token and data.get(\"citations\"):\n",
-    "                                print(\"\\n📚 Citations:\")\n",
-    "                                citations = data[\"citations\"]\n",
-    "                                for idx, citation in enumerate(citations.get(\"results\", [])):\n",
-    "                                    doc_type = citation.get(\"document_type\", \"text\")\n",
-    "                                    content = citation.get(\"content\", \"\")\n",
-    "                                    doc_name = citation.get(\"document_name\", f\"Citation {idx + 1}\")\n",
-    "                                    display(Markdown(f\"**Citation {idx + 1}: {doc_name}**\"))\n",
-    "                                    try:\n",
-    "                                        # Display image citations\n",
-    "                                        image_bytes = base64.b64decode(content)\n",
-    "                                        display(Image(data=image_bytes))\n",
-    "                                    except Exception:\n",
-    "                                        # Display text citations\n",
-    "                                        display(Markdown(f\"```\\n{content}\\n```\"))\n",
-    "                                is_first_token = False\n",
-    "\n",
-    "                            # Check if streaming is complete\n",
-    "                            finish_reason = data.get(\"choices\", [{}])[0].get(\"finish_reason\")\n",
-    "                            if finish_reason == \"stop\":\n",
-    "                                return rag_response\n",
-    "\n",
-    "                        except json.JSONDecodeError:\n",
-    "                            print(f\"Skipping malformed JSON line: {json_str}\")\n",
-    "                            continue\n",
-    "        \n",
-    "        except httpx.HTTPStatusError as e:\n",
-    "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
-    "        except httpx.RequestError as e:\n",
-    "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"An error occurred: {e}\")\n",
-    "\n",
-    "    print(\"\\n✅ Response complete!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c3049696",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Format the query as a chat message\n",
-    "messages = [\n",
-    "    {\n",
-    "        \"role\": \"user\",\n",
-    "        \"content\": image_query  # Our multimodal query (text + image)\n",
-    "    }\n",
-    "]\n",
-    "\n",
-    "# Configure the generate API parameters\n",
-    "payload = {\n",
-    "    \"messages\": messages,                      # Chat conversation\n",
-    "    \"use_knowledge_base\": True,                # Enable RAG - use vector DB for context\n",
-    "    \"temperature\": 0.2,                        # Lower = more deterministic, higher = more creative\n",
-    "    \"top_p\": 0.7,                             # Nucleus sampling parameter\n",
-    "    \"max_tokens\": 1024,                       # Maximum response length\n",
-    "    \"reranker_top_k\": 2,                      # Keep top 2 results after reranking\n",
-    "    \"vdb_top_k\": 10,                          # Retrieve top 10 from vector DB initially\n",
-    "    \"vdb_endpoint\": \"http://milvus:19530\",    # Milvus connection\n",
-    "    \"collection_names\": [collection_name],     # Which collection to search\n",
-    "    \"enable_query_rewriting\": True,            # Improve query before searching\n",
-    "    \"enable_citations\": True,                  # Include source citations in response\n",
-    "    \"stop\": [],                               # Optional stop sequences\n",
-    "    \"filter_expr\": \"\",                        # Optional metadata filter    \n",
-    "}\n",
-    "\n",
-    "# Generate the answer with RAG\n",
-    "print(\"🤖 Generating answer with RAG...\\n\")\n",
-    "await generate_answer(payload)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "dcddc78d",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "## 🎉 Summary\n",
-    "\n",
-    "Congratulations! You've successfully:\n",
-    "\n",
-    "✅ **Set up the infrastructure**: Deployed Milvus vector DB, NVIDIA NIMs, and RAG services  \n",
-    "✅ **Ingested multimodal documents**: Uploaded PDFs with images and extracted their content  \n",
-    "✅ **Created multimodal queries**: Combined text and images in your search queries  \n",
-    "✅ **Retrieved relevant context**: Used semantic search to find matching documents  \n",
-    "✅ **Generated AI responses**: Got intelligent answers with source citations  \n",
-    "\n",
-    "### Next Steps\n",
-    "\n",
-    "- **Try different queries**: Change the query text or use different query images\n",
-    "- **Upload more documents**: Add more PDFs to enrich your knowledge base\n",
-    "- **Experiment with parameters**: Adjust `temperature`, `top_k`, reranker settings\n",
-    "- **Build applications**: Integrate these APIs into your own applications\n",
-    "\n",
-    "### Cleanup\n",
-    "\n",
-    "To stop all services and free up resources:\n",
-    "\n",
-    "```bash\n",
-    "cd ../deploy/compose\n",
-    "docker compose -f docker-compose-rag-server.yaml down\n",
-    "docker compose -f docker-compose-ingestor-server.yaml down\n",
-    "docker compose -f nims.yaml down\n",
-    "docker compose -f vectordb.yaml down\n",
-    "```\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "e20e694c",
+      "metadata": {},
+      "source": [
+        "# Retriever API Usage with Multimodal Query Support\n",
+        "\n",
+        "This notebook demonstrates how to use the NVIDIA RAG retriever APIs with **multimodal queries** (text + images). You'll learn how to:\n",
+        "\n",
+        "- 🔍 Search for relevant documents using queries that contain images\n",
+        "- 🤖 Generate AI responses using the end-to-end RAG API with vision-language models (VLMs)\n",
+        "- 📊 Work with multimodal embeddings and vector databases\n",
+        "\n",
+        "**Use Case**: Query documents with images (e.g., \"What is the price of this item?\" + product image)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "0152f1eb",
+      "metadata": {},
+      "source": [
+        "## 📦 Setting up the Dependencies\n",
+        "\n",
+        "This section will guide you through:\n",
+        "1. Configuring your NGC API key for accessing NVIDIA services\n",
+        "2. Deploying the Milvus vector database\n",
+        "3. Setting up NVIDIA NIMs (NVIDIA Inference Microservices) for embeddings and VLM\n",
+        "4. Starting the NVIDIA Ingest runtime for document processing\n",
+        "5. Launching the RAG server\n",
+        "\n",
+        "**Note**: This setup uses Docker Compose to orchestrate all services."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "d77a630e",
+      "metadata": {},
+      "source": [
+        "### 0. Create a Virtual Environment (Recommended)\n",
+        "\n",
+        "Before running this notebook, create a virtual environment using `uv` to isolate dependencies:\n",
+        "\n",
+        "```bash\n",
+        "# Create a virtual environment\n",
+        "uv venv .venv\n",
+        "\n",
+        "# Activate the virtual environment\n",
+        "source .venv/bin/activate  # Linux/macOS\n",
+        "# .venv\\Scripts\\activate   # Windows\n",
+        "\n",
+        "# Install Jupyter Lab and ipykernel (if not already installed)\n",
+        "uv pip install jupyterlab ipykernel\n",
+        "\n",
+        "# Register the venv as a Jupyter kernel\n",
+        "python -m ipykernel install --user --name=.venv --display-name=\"Python (.venv)\"\n",
+        "```\n",
+        "\n",
+        "After setup, select the venv as the kernel for this notebook:\n",
+        "1. In Jupyter/VS Code/Cursor, click on the kernel selector (top right)\n",
+        "2. Choose **\".venv\"** or **\"Python (.venv)\"** as the kernel\n",
+        "\n",
+        "This ensures all packages installed via `uv pip install` in the notebook cells are installed into the isolated environment.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "c39e628e",
+      "metadata": {},
+      "source": [
+        "### 1. Setup the Default Configurations\n",
+        "\n",
+        "Import necessary libraries for environment management."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c03780a7",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Install python-dotenv for environment variable management\n",
+        "! uv pip install python-dotenv\n",
+        "\n",
+        "import os\n",
+        "from getpass import getpass"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "8a19cef7",
+      "metadata": {},
+      "source": [
+        "Execute the cell below and provide your NGC_API_KEY when prompted. You can obtain a key by following the steps [here](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/api-key.md)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c1f7ffa3",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check if NGC_API_KEY is already set, otherwise prompt for it\n",
+        "# Uncomment the line below to reset your API key\n",
+        "# del os.environ['NGC_API_KEY']\n",
+        "\n",
+        "if os.environ.get(\"NGC_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+        "    print(\"Valid NGC_API_KEY already in environment. Delete to reset\")\n",
+        "else:\n",
+        "    candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n",
+        "    assert candidate_api_key.startswith(\"nvapi-\"), (\n",
+        "        f\"{candidate_api_key[:5]}... is not a valid key\"\n",
+        "    )\n",
+        "    os.environ[\"NGC_API_KEY\"] = candidate_api_key"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "20ec8b61",
+      "metadata": {},
+      "source": [
+        "Log in to nvcr.io, which is required to pull the containers for the dependencies."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "03972882",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Login to NVIDIA Container Registry (nvcr.io) to pull required containers\n",
+        "!echo \"${NGC_API_KEY}\" | docker login nvcr.io -u '$oauthtoken' --password-stdin"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "84642fbb",
+      "metadata": {},
+      "source": [
+        "### 2. Setup the Milvus Vector Database\n",
+        "\n",
+        "Milvus is a high-performance vector database used to store and search multimodal embeddings.\n",
+        "\n",
+        "**Configuration Notes**:\n",
+        "- By default, Milvus uses GPU indexing for faster performance\n",
+        "- Ensure you have provided the correct GPU ID below\n",
+        "- If you don't have a GPU available, you can switch to CPU-only Milvus by following the instructions in [milvus-configuration.md](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/milvus-configuration.md)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "8125f717",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Specify which GPU to use for Milvus (change if using a different GPU)\n",
+        "os.environ[\"VECTORSTORE_GPU_DEVICE_ID\"] = \"0\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "3e2d3457",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Start Milvus vector database service\n",
+        "# This will run in the background (-d flag)\n",
+        "!docker compose -f ../deploy/compose/vectordb.yaml up -d"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "afe17557",
+      "metadata": {},
+      "source": [
+        "### 3. Setup NVIDIA Inference Microservices (NIMs)\n",
+        "\n",
+        "NIMs provide optimized inference for AI models. For multimodal RAG, we need:\n",
+        "- **VLM (Vision-Language Model)**: `nvidia/nemotron-nano-12b-v2-vl` for understanding images and generating responses\n",
+        "- **Embedding Model**: `nvidia/llama-nemotron-embed-vl-1b-v2` for creating multimodal embeddings"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "89a135eb",
+      "metadata": {},
+      "source": [
+        "#### Deploy On-Premise Models\n",
+        "\n",
+        "This section deploys NIMs locally using Docker. Models will be cached to avoid re-downloading."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "1b3d2e5c",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Create the model cache directory\n",
+        "!mkdir -p ~/.cache/model-cache"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "390df52d",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Set the MODEL_DIRECTORY environment variable to specify where models are cached\n",
+        "import os\n",
+        "\n",
+        "os.environ[\"MODEL_DIRECTORY\"] = os.path.expanduser(\"~/.cache/model-cache\")\n",
+        "print(\"MODEL_DIRECTORY set to:\", os.environ[\"MODEL_DIRECTORY\"])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "62a9946a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Deploy NIMs with VLM and embedding profiles\n",
+        "# ⚠️ WARNING: This may take 10-20 minutes as models download (~10GB+)\n",
+        "# If the kernel times out, just rerun this cell - it will resume where it left off\n",
+        "# Select a free GPU for VLM Microservice\n",
+        "os.environ[\"VLM_MS_GPU_ID\"] = \"1\"\n",
+        "! USERID=$(id -u) docker compose --profile vlm-ingest --profile vlm-only -f ../deploy/compose/nims.yaml up -d"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "e91f511a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Monitor the status of running containers\n",
+        "# Run this cell repeatedly to check if all services are healthy\n",
+        "# Look for STATUS showing \"healthy\" or \"Up\" for all containers\n",
+        "!docker ps"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "cfb34a6a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Configure the model names and service URLs for the RAG pipeline\n",
+        "# These settings tell the RAG server which models and endpoints to use\n",
+        "\n",
+        "# VLM (Vision-Language Model) configuration\n",
+        "os.environ[\"APP_VLM_MODELNAME\"] = \"nvidia/nemotron-nano-12b-v2-vl\"\n",
+        "os.environ[\"APP_VLM_SERVERURL\"] = \"http://vlm-ms:8000/v1\"\n",
+        "\n",
+        "# Multimodal embedding model configuration\n",
+        "os.environ[\"APP_EMBEDDINGS_MODELNAME\"] = \"nvidia/llama-nemotron-embed-vl-1b-v2\"\n",
+        "os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"nemotron-vlm-embedding-ms:8000/v1\"\n",
+        "os.environ[\"ENABLE_VLM_INFERENCE\"] = \"true\"\n",
+        "os.environ[\"VLM_TO_LLM_FALLBACK\"] = \"false\"\n",
+        "os.environ[\"ENABLE_RERANKER\"] = \"false\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "e62c7037",
+      "metadata": {},
+      "source": [
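+        "#### Cloud-Based Deployment\n",
+        "\n",
+        "Alternatively, use NVIDIA-hosted cloud models instead of the on-premise NIMs. The cell below sets the same `APP_VLM_*` and `APP_EMBEDDINGS_*` variables as the on-premise configuration cell above, so running it afterwards overrides those values."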
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "82084d4d",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "\n",
+        "# OCR and document processing endpoints - cloud hosted\n",
+        "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
+        "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
+        "os.environ[\"OCR_MODEL_NAME\"] = \"scene_text_ensemble\"\n",
+        "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
+        "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
+        "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
+        "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
+        "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
+        "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\"\n",
+        "os.environ[\"APP_NVINGEST_CAPTIONENDPOINTURL\"] = \"https://integrate.api.nvidia.com/v1/chat/completions\"\n",
+        "\n",
+        "# VLM Model configuration - cloud hosted\n",
+        "os.environ[\"APP_VLM_MODELNAME\"] = \"nvidia/nemotron-nano-12b-v2-vl\"\n",
+        "os.environ[\"APP_VLM_SERVERURL\"] = \"https://integrate.api.nvidia.com/v1\"\n",
+        "os.environ[\"APP_LLM_SERVERURL\"] = \"\"\n",
+        "\n",
+        "# Multimodal embedding model configuration - cloud hosted\n",
+        "os.environ[\"APP_EMBEDDINGS_MODELNAME\"] = \"nvidia/llama-nemotron-embed-vl-1b-v2\"\n",
+        "os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"https://integrate.api.nvidia.com/v1\"\n",
+        "os.environ[\"ENABLE_VLM_INFERENCE\"] = \"true\"\n",
+        "os.environ[\"VLM_TO_LLM_FALLBACK\"] = \"false\"\n",
+        "os.environ[\"ENABLE_RERANKER\"] = \"false\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "7cbcfa50",
+      "metadata": {},
+      "source": [
+        "### 4. Setup NVIDIA Ingest Runtime\n",
+        "\n",
+        "NVIDIA Ingest processes documents to extract text, images, and other elements. We'll configure it to:\n",
+        "- Extract images from documents\n",
+        "- Handle multimodal content"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a5e0d73f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Configure NVIDIA Ingest to extract and process images from documents\n",
+        "os.environ[\"APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY\"] = \"\"  # No special handling for structured elements\n",
+        "os.environ[\"APP_NVINGEST_IMAGE_ELEMENTS_MODALITY\"] = \"image\"  # Process image elements as images\n",
+        "os.environ[\"APP_NVINGEST_EXTRACTIMAGES\"] = \"True\"  # Extract images from documents\n",
+        "\n",
+        "# Start the ingestor server with Redis\n",
+        "! docker compose -f ../deploy/compose/docker-compose-ingestor-server.yaml up -d --build"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "da1bd9a3",
+      "metadata": {},
+      "source": [
+        "### 5. Setup the NVIDIA RAG Server\n",
+        "\n",
+        "The RAG server provides the main API endpoints for search and generation. It orchestrates all the components (embeddings, vector DB, VLM) to deliver intelligent responses."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "38ba7752",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Start the RAG server (accessible at localhost:8081)\n",
+        "os.environ[\"APP_RANKING_SERVERURL\"] = \"\"\n",
+        "! docker compose -f ../deploy/compose/docker-compose-rag-server.yaml up -d --build"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "ce492ce3",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "\n",
+        "## 📚 Document Ingestion Workflow\n",
+        "\n",
+        "Now that all services are running, let's ingest documents into a collection.\n",
+        "\n",
+        "### 6. Create a Collection\n",
+        "\n",
+        "A collection is a logical grouping of documents in the vector database. Think of it as a database table optimized for similarity search."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a8611aa1",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Install aiohttp for async HTTP requests\n",
+        "! uv pip install aiohttp\n",
+        "\n",
+        "# Configure the ingestor server URL\n",
+        "# Use \"ingestor-server\" when running in AI Workbench, otherwise \"localhost\"\n",
+        "IPADDRESS = (\n",
+        "    \"ingestor-server\"\n",
+        "    if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\"\n",
+        "    else \"localhost\"\n",
+        ")\n",
+        "INGESTOR_SERVER_PORT = \"8082\"\n",
+        "BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"\n",
+        "\n",
+        "async def print_response(response):\n",
+        "    \"\"\"Helper function to pretty-print API responses.\"\"\"\n",
+        "    try:\n",
+        "        response_json = await response.json()\n",
+        "        print(json.dumps(response_json, indent=2))\n",
+        "    except aiohttp.ClientResponseError:\n",
+        "        print(await response.text())\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "688bc70f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Define a unique name for your collection\n",
+        "# Change this if you want to create a different collection\n",
+        "collection_name = \"multimodal_query\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "24378f6f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import aiohttp\n",
+        "import json\n",
+        "\n",
+        "\n",
+        "async def create_collection(\n",
+        "    collection_name: str | None = None,\n",
+        "    metadata_schema: list = [],\n",
+        "):\n",
+        "    \"\"\"\n",
+        "    Create a new collection in the vector database.\n",
+        "    \n",
+        "    Args:\n",
+        "        collection_name: Unique identifier for the collection\n",
+        "        metadata_schema: Optional schema for metadata fields\n",
+        "    \"\"\"\n",
+        "    data = {\n",
+        "        \"collection_name\": collection_name,\n",
+        "        \"metadata_schema\": metadata_schema,\n",
+        "    }\n",
+        "\n",
+        "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.post(\n",
+        "                f\"{BASE_URL}/v1/collection\", json=data, headers=HEADERS\n",
+        "            ) as response:\n",
+        "                await print_response(response)\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            return 500, {\"error\": str(e)}\n",
+        "\n",
+        "\n",
+        "# Create the collection\n",
+        "# The embedding dimension is 2048 for the multimodal embedding model we're using\n",
+        "await create_collection(\n",
+        "    collection_name=collection_name,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a29f4633",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Specify the documents to upload\n",
+        "# This PDF contains product images with pricing information\n",
+        "FILEPATHS = [\n",
+        "    \"../data/multimodal/product_catalog.pdf\",\n",
+        "]\n",
+        "\n",
+        "async def upload_documents(collection_name: str = \"\"):\n",
+        "    \"\"\"\n",
+        "    Upload and process documents into the collection.\n",
+        "    \n",
+        "    This will:\n",
+        "    1. Extract text and images from the PDFs\n",
+        "    2. Chunk the content for optimal retrieval\n",
+        "    3. Generate multimodal embeddings\n",
+        "    4. Store everything in the vector database\n",
+        "    \"\"\"\n",
+        "    data = {\n",
+        "        \"collection_name\": collection_name,\n",
+        "        \"blocking\": False,  # Async upload - use status API to check progress\n",
+        "        \"split_options\": {\n",
+        "            \"chunk_size\": 512,        # Characters per chunk\n",
+        "            \"chunk_overlap\": 150      # Overlap between chunks for context\n",
+        "        },\n",
+        "        \"generate_summary\": False  # Set to True to generate document summaries\n",
+        "    }\n",
+        "\n",
+        "    form_data = aiohttp.FormData()\n",
+        "    \n",
+        "    # Add all PDF files to the form data\n",
+        "    for file_path in FILEPATHS:\n",
+        "        form_data.add_field(\"documents\", open(file_path, \"rb\"), \n",
+        "                          filename=os.path.basename(file_path), \n",
+        "                          content_type=\"application/pdf\")\n",
+        "\n",
+        "    form_data.add_field(\"data\", json.dumps(data), content_type=\"application/json\")\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            # Use POST for new uploads, PATCH for re-ingesting existing documents\n",
+        "            async with session.post(f\"{BASE_URL}/v1/documents\", data=form_data) as response:\n",
+        "                await print_response(response)\n",
+        "                response_json = await response.json()\n",
+        "                return response_json\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            print(f\"Error uploading documents: {e}\")\n",
+        "            return None\n",
+        "\n",
+        "# Upload the documents and get the task ID for tracking progress\n",
+        "upload_response = await upload_documents(collection_name=collection_name)\n",
+        "task_id = upload_response.get(\"task_id\") if upload_response else None\n",
+        "print(f\"\\nTask ID for tracking: {task_id}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "2234e059",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "async def get_task_status(task_id: str):\n",
+        "    \"\"\"\n",
+        "    Check the status of an asynchronous ingestion task.\n",
+        "    \n",
+        "    Possible statuses:\n",
+        "    - \"pending\": Task is queued\n",
+        "    - \"processing\": Currently processing documents\n",
+        "    - \"completed\": Successfully finished\n",
+        "    - \"failed\": Error occurred\n",
+        "    \"\"\"\n",
+        "    params = {\n",
+        "        \"task_id\": task_id,\n",
+        "    }\n",
+        "\n",
+        "    HEADERS = {\"Content-Type\": \"application/json\"}\n",
+        "\n",
+        "    async with aiohttp.ClientSession() as session:\n",
+        "        try:\n",
+        "            async with session.get(\n",
+        "                f\"{BASE_URL}/v1/status\", params=params, headers=HEADERS\n",
+        "            ) as response:\n",
+        "                await print_response(response)\n",
+        "        except aiohttp.ClientError as e:\n",
+        "            return 500, {\"error\": str(e)}\n",
+        "\n",
+        "\n",
+        "# Check the ingestion status\n",
+        "# Run this cell multiple times until status shows \"completed\"\n",
+        "await get_task_status(\n",
+        "    task_id=[task_id]\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "e2c1f1a8",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "\n",
+        "## 🔍 Querying with Multimodal Inputs\n",
+        "\n",
+        "Now that documents are ingested, let's query them using both text and images!\n",
+        "\n",
+        "### 7. Using the Search and Generate APIs\n",
+        "\n",
+        "We'll demonstrate two approaches:\n",
+        "1. **Search API**: Find relevant documents without generating a response\n",
+        "2. **Generate API**: Get an AI-generated answer with citations"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "3990ca33",
+      "metadata": {},
+      "source": [
+        "#### Prepare a Multimodal Query\n",
+        "\n",
+        "To query with an image, we need to:\n",
+        "1. Convert the image to base64 encoding\n",
+        "2. Format it according to the OpenAI vision API format\n",
+        "3. Combine it with a text prompt"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "02dde830",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "! uv pip install requests httpx\n",
+        "import base64\n",
+        "import requests\n",
+        "from IPython.display import Image, Markdown, display\n",
+        "\n",
+        "def get_base64_image(image_source: str) -> str:\n",
+        "    \"\"\"\n",
+        "    Convert an image to base64 encoding.\n",
+        "    \n",
+        "    Args:\n",
+        "        image_source: Local file path or URL to the image\n",
+        "        \n",
+        "    Returns:\n",
+        "        Base64 encoded string of the image\n",
+        "    \"\"\"\n",
+        "    if image_source.startswith(('http://', 'https://')):\n",
+        "        # Download image from URL\n",
+        "        response = requests.get(image_source)\n",
+        "        return base64.b64encode(response.content).decode()\n",
+        "    else:\n",
+        "        # Read local file\n",
+        "        with open(image_source, \"rb\") as image_file:\n",
+        "            return base64.b64encode(image_file.read()).decode()\n",
+        "\n",
+        "# Convert the query image to base64\n",
+        "# Try different images to test different queries:\n",
+        "image_b64 = get_base64_image(\"../data/multimodal/Creme_clutch_purse1-small.jpg\")\n",
+        "\n",
+        "# Display the query image for reference\n",
+        "query_image_path = \"../data/multimodal/Creme_clutch_purse1-small.jpg\"\n",
+        "print(\"📷 Query Image:\")\n",
+        "display(Image(filename=query_image_path, width=300))\n",
+        "\n",
+        "# Format as a data URL\n",
+        "image_input = f\"data:image/png;base64,{image_b64}\"\n",
+        "\n",
+        "# Create the multimodal query with text + image\n",
+        "# This follows the OpenAI vision API format\n",
+        "query_1 = \"What material is this made of?\"\n",
+        "image_query = [\n",
+        "    {\"type\": \"text\", \"text\": query_1},\n",
+        "    {\n",
+        "        \"type\": \"image_url\",\n",
+        "        \"image_url\": {\n",
+        "            \"url\": image_input,\n",
+        "            \"detail\": \"auto\"  # Let the model decide the appropriate detail level\n",
+        "        }\n",
+        "    }\n",
+        "]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c311f9d0",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import httpx\n",
+        "import json\n",
+        "from IPython.display import Image, Markdown, display\n",
+        "\n",
+        "RAG_BASE_URL = \"http://localhost:8081\"\n",
+        "\n",
+        "async def search_documents(payload):\n",
+        "    \"\"\"\n",
+        "    Search for relevant documents using a multimodal query.\n",
+        "    \n",
+        "    This performs similarity search in the vector database and optionally\n",
+        "    reranks results for better relevance.\n",
+        "    \"\"\"\n",
+        "    search_url = f\"{RAG_BASE_URL}/v1/search\"\n",
+        "    \n",
+        "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
+        "        try:\n",
+        "            response = await client.post(url=search_url, json=payload)\n",
+        "            response.raise_for_status()\n",
+        "            \n",
+        "            search_results = response.json()\n",
+        "            print(\"Search Results:\")\n",
+        "            \n",
+        "            # Display search results with nice formatting\n",
+        "            if \"results\" in search_results:\n",
+        "                for idx, result in enumerate(search_results[\"results\"]):\n",
+        "                    doc_type = result.get(\"document_type\", \"text\")\n",
+        "                    content = result.get(\"content\", \"\")\n",
+        "                    doc_name = result.get(\"document_name\", f\"Result {idx + 1}\")\n",
+        "                    score = result.get(\"score\", \"N/A\")\n",
+        "                    \n",
+        "                    display(Markdown(f\"**Result {idx + 1}: {doc_name} (Score: {score})**\"))\n",
+        "                    try:\n",
+        "                        if doc_type == \"image\":\n",
+        "                            # Display image results\n",
+        "                            image_bytes = base64.b64decode(content)\n",
+        "                            display(Image(data=image_bytes))\n",
+        "                        else:\n",
+        "                            # Display text results\n",
+        "                            display(Markdown(f\"```\\n{content}\\n```\"))\n",
+        "                    except Exception as e:\n",
+        "                        print(f\"Error displaying content: {e}\")\n",
+        "                        display(Markdown(f\"```\\n{content}\\n```\"))\n",
+        "            \n",
+        "            return search_results\n",
+        "            \n",
+        "        except httpx.HTTPStatusError as e:\n",
+        "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
+        "        except httpx.RequestError as e:\n",
+        "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
+        "        except Exception as e:\n",
+        "            print(f\"An error occurred: {e}\")\n",
+        "\n",
+        "# Configure the search parameters\n",
+        "search_payload = {\n",
+        "    \"query\": image_query,                      # Our multimodal query (text + image)\n",
+        "    \"messages\": [],                            # No conversation history\n",
+        "    \"use_knowledge_base\": True,                # Search the vector database\n",
+        "    \"collection_names\": [collection_name],     # Which collection to search\n",
+        "    \"vdb_top_k\": 5,                           # Retrieve top 5 results from vector DB\n",
+        "    \"vdb_endpoint\": \"http://milvus:19530\",    # Milvus connection string\n",
+        "    \"enable_reranker\": False,                  # Set to True for better relevance (slower)\n",
+        "    \"reranker_top_k\": 3,                      # If reranker enabled, return top 3\n",
+        "    \"filter_expr\": \"\",                        # Optional metadata filter\n",
+        "}\n",
+        "\n",
+        "# Execute the search\n",
+        "print(\"🔍 Searching for documents matching the query...\\n\")\n",
+        "search_result = await search_documents(search_payload)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "bc5b1545",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import base64\n",
+        "import json\n",
+        "from IPython.display import Image, Markdown, display\n",
+        "\n",
+        "\n",
+        "async def print_streaming_response_and_citations(response_generator):\n",
+        "    \"\"\"\n",
+        "    Helper function to display streaming responses with citations.\n",
+        "    \n",
+        "    This function:\n",
+        "    1. Streams the AI-generated response token by token\n",
+        "    2. Extracts citations from the first chunk\n",
+        "    3. Displays citations (text or images) after the response completes\n",
+        "    \"\"\"\n",
+        "    first_chunk_data = None\n",
+        "    \n",
+        "    async for chunk in response_generator:\n",
+        "        # Parse Server-Sent Events (SSE) format\n",
+        "        if chunk.startswith(\"data: \"):\n",
+        "            chunk = chunk[len(\"data: \") :].strip()\n",
+        "        if not chunk:\n",
+        "            continue\n",
+        "            \n",
+        "        try:\n",
+        "            data = json.loads(chunk)\n",
+        "        except Exception as e:\n",
+        "            print(f\"JSON decode error: {e}\")\n",
+        "            continue\n",
+        "            \n",
+        "        choices = data.get(\"choices\", [])\n",
+        "        if not choices:\n",
+        "            continue\n",
+        "            \n",
+        "        # Save the first chunk with citations\n",
+        "        if first_chunk_data is None and data.get(\"citations\"):\n",
+        "            first_chunk_data = data\n",
+        "            \n",
+        "        # Print streaming text\n",
+        "        delta = choices[0].get(\"delta\", {})\n",
+        "        text = delta.get(\"content\")\n",
+        "        if not text:\n",
+        "            message = choices[0].get(\"message\", {})\n",
+        "            text = message.get(\"content\", \"\")\n",
+        "        print(text, end=\"\", flush=True)\n",
+        "        \n",
+        "    print()  # Newline after streaming\n",
+        "\n",
+        "    # Display citations after streaming is done\n",
+        "    if first_chunk_data and first_chunk_data.get(\"citations\"):\n",
+        "        print(\"\\n📚 Citations:\")\n",
+        "        citations = first_chunk_data[\"citations\"]\n",
+        "        for idx, citation in enumerate(citations.get(\"results\", [])):\n",
+        "            doc_type = citation.get(\"document_type\", \"text\")\n",
+        "            content = citation.get(\"content\", \"\")\n",
+        "            doc_name = citation.get(\"document_name\", f\"Citation {idx + 1}\")\n",
+        "            display(Markdown(f\"**Citation {idx + 1}: {doc_name}**\"))\n",
+        "            try:\n",
+        "                # Try to display as image\n",
+        "                image_bytes = base64.b64decode(content)\n",
+        "                display(Image(data=image_bytes))\n",
+        "            except Exception:\n",
+        "                # Fall back to text display\n",
+        "                display(Markdown(f\"```\\n{content}\\n```\"))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "31071359",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import httpx\n",
+        "\n",
+        "# Configure RAG server URL\n",
+        "IPADDRESS = \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\"\n",
+        "RAG_SERVER_PORT = \"8081\"\n",
+        "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"\n",
+        "generate_url = f\"{RAG_BASE_URL}/v1/generate\"\n",
+        "\n",
+        "async def generate_answer(payload):\n",
+        "    \"\"\"\n",
+        "    Generate an AI answer using the RAG pipeline.\n",
+        "    \n",
+        "    This function:\n",
+        "    1. Sends the query to the RAG server\n",
+        "    2. Retrieves relevant context from the vector database\n",
+        "    3. Streams the AI-generated response\n",
+        "    4. Displays citations (sources) used to generate the answer\n",
+        "    \"\"\"\n",
+        "    rag_response = \"\"\n",
+        "    citations = []\n",
+        "    is_first_token = True\n",
+        "\n",
+        "    async with httpx.AsyncClient(timeout=300.0) as client:\n",
+        "        try:\n",
+        "            async with client.stream(\"POST\", url=generate_url, json=payload) as response:\n",
+        "                # Raise an exception for bad status codes like 4xx or 5xx\n",
+        "                response.raise_for_status()\n",
+        "\n",
+        "                # Iterate over the streaming response\n",
+        "                async for line in response.aiter_lines():\n",
+        "                    if line.startswith(\"data: \"):\n",
+        "                        json_str = line[6:].strip()\n",
+        "                        if not json_str:\n",
+        "                            continue\n",
+        "\n",
+        "                        try:\n",
+        "                            data = json.loads(json_str)\n",
+        "\n",
+        "                            # Extract and accumulate the streamed response text\n",
+        "                            message = data.get(\"choices\", [{}])[0].get(\"message\", {}).get(\"content\", \"\")\n",
+        "                            if message:\n",
+        "                                rag_response += message\n",
+        "\n",
+        "                            # Extract and display citations from the first chunk\n",
+        "                            if is_first_token and data.get(\"citations\"):\n",
+        "                                print(\"\\n📚 Citations:\")\n",
+        "                                citations = data[\"citations\"]\n",
+        "                                for idx, citation in enumerate(citations.get(\"results\", [])):\n",
+        "                                    doc_type = citation.get(\"document_type\", \"text\")\n",
+        "                                    content = citation.get(\"content\", \"\")\n",
+        "                                    doc_name = citation.get(\"document_name\", f\"Citation {idx + 1}\")\n",
+        "                                    display(Markdown(f\"**Citation {idx + 1}: {doc_name}**\"))\n",
+        "                                    try:\n",
+        "                                        # Display image citations\n",
+        "                                        image_bytes = base64.b64decode(content)\n",
+        "                                        display(Image(data=image_bytes))\n",
+        "                                    except Exception:\n",
+        "                                        # Display text citations\n",
+        "                                        display(Markdown(f\"```\\n{content}\\n```\"))\n",
+        "                                is_first_token = False\n",
+        "\n",
+        "                            # Check if streaming is complete\n",
+        "                            finish_reason = data.get(\"choices\", [{}])[0].get(\"finish_reason\")\n",
+        "                            if finish_reason == \"stop\":\n",
+        "                                return rag_response\n",
+        "\n",
+        "                        except json.JSONDecodeError:\n",
+        "                            print(f\"Skipping malformed JSON line: {json_str}\")\n",
+        "                            continue\n",
+        "        \n",
+        "        except httpx.HTTPStatusError as e:\n",
+        "            print(f\"HTTP error occurred: {e.response.status_code} - {e.response.text}\")\n",
+        "        except httpx.RequestError as e:\n",
+        "            print(f\"An error occurred while requesting {e.request.url!r}: {e}\")\n",
+        "        except Exception as e:\n",
+        "            print(f\"An error occurred: {e}\")\n",
+        "\n",
+        "    print(\"\\n✅ Response complete!\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c3049696",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Format the query as a chat message\n",
+        "messages = [\n",
+        "    {\n",
+        "        \"role\": \"user\",\n",
+        "        \"content\": image_query  # Our multimodal query (text + image)\n",
+        "    }\n",
+        "]\n",
+        "\n",
+        "# Configure the generate API parameters\n",
+        "payload = {\n",
+        "    \"messages\": messages,                      # Chat conversation\n",
+        "    \"use_knowledge_base\": True,                # Enable RAG - use vector DB for context\n",
+        "    \"temperature\": 0.2,                        # Lower = more deterministic, higher = more creative\n",
+        "    \"top_p\": 0.7,                             # Nucleus sampling parameter\n",
+        "    \"max_tokens\": 1024,                       # Maximum response length\n",
+        "    \"reranker_top_k\": 2,                      # Keep top 2 results after reranking\n",
+        "    \"vdb_top_k\": 10,                          # Retrieve top 10 from vector DB initially\n",
+        "    \"vdb_endpoint\": \"http://milvus:19530\",    # Milvus connection\n",
+        "    \"collection_names\": [collection_name],     # Which collection to search\n",
+        "    \"enable_query_rewriting\": True,            # Improve query before searching\n",
+        "    \"enable_citations\": True,                  # Include source citations in response\n",
+        "    \"stop\": [],                               # Optional stop sequences\n",
+        "    \"filter_expr\": \"\",                        # Optional metadata filter    \n",
+        "}\n",
+        "\n",
+        "# Generate the answer with RAG\n",
+        "print(\"🤖 Generating answer with RAG...\\n\")\n",
+        "await generate_answer(payload)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "dcddc78d",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "\n",
+        "## 🎉 Summary\n",
+        "\n",
+        "Congratulations! You've successfully:\n",
+        "\n",
+        "✅ **Set up the infrastructure**: Deployed Milvus vector DB, NVIDIA NIMs, and RAG services  \n",
+        "✅ **Ingested multimodal documents**: Uploaded PDFs with images and extracted their content  \n",
+        "✅ **Created multimodal queries**: Combined text and images in your search queries  \n",
+        "✅ **Retrieved relevant context**: Used semantic search to find matching documents  \n",
+        "✅ **Generated AI responses**: Got intelligent answers with source citations  \n",
+        "\n",
+        "### Next Steps\n",
+        "\n",
+        "- **Try different queries**: Change the query text or use different query images\n",
+        "- **Upload more documents**: Add more PDFs to enrich your knowledge base\n",
+        "- **Experiment with parameters**: Adjust `temperature`, `vdb_top_k`, `reranker_top_k`, and other reranker settings\n",
+        "- **Build applications**: Integrate these APIs into your own applications\n",
+        "\n",
+        "### Cleanup\n",
+        "\n",
+        "To stop all services and free up resources:\n",
+        "\n",
+        "```bash\n",
+        "cd ../deploy/compose\n",
+        "docker compose -f docker-compose-rag-server.yaml down\n",
+        "docker compose -f docker-compose-ingestor-server.yaml down\n",
+        "docker compose -f nims.yaml down\n",
+        "docker compose -f vectordb.yaml down\n",
+        "```\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
 }
diff --git a/notebooks/langchain_nvidia_retriever.ipynb b/notebooks/langchain_nvidia_retriever.ipynb
new file mode 100644
index 000000000..02b05b6db
--- /dev/null
+++ b/notebooks/langchain_nvidia_retriever.ipynb
@@ -0,0 +1,315 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "303aa520",
+      "metadata": {},
+      "source": [
+        "# NVIDIARAGRetriever Connector – LangChain Integration\n",
+        "\n",
+        "**Motivation:** This notebook showcases the **LangChain integration** with the NVIDIA RAG Blueprint. The `NVIDIARAGRetriever` from `langchain-nvidia-ai-endpoints` connects to the NVIDIA RAG `/v1/search` endpoint and returns standard LangChain `Document` objects, enabling seamless use in chains, agents, and RAG pipelines without custom HTTP code.\n",
+        "\n",
+        "---\n",
+        "\n",
+        "## Prerequisite: Run Ingestion First\n",
+        "\n",
+        "**You must ingest documents before using this notebook.** Use the [ingestion_api_usage.ipynb](./ingestion_api_usage.ipynb) notebook:\n",
+        "\n",
+        "1. Open [ingestion_api_usage.ipynb](./ingestion_api_usage.ipynb).\n",
+        "2. Execute the following cells **in order** (top to bottom):\n",
+        "   - **1. Install Dependencies** – `pip install aiohttp`\n",
+        "   - **2. Setup Base Configuration** – ingestor URL (port 8082)\n",
+        "   - **3. Health Check** – verify ingestor is running\n",
+        "   - **4. Create collection** – creates `multimodal_data` collection\n",
+        "   - **4. Upload Document** – FILEPATHS cell, then `upload_documents` cell\n",
+        "   - **5. Get Task Status** – poll until state is `FINISHED`\n",
+        "3. When ingestion is complete, return here and run the cells below.\n",
+        "\n",
+        "Ensure the **RAG server** (port 8081) is running. See [Get Started](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "c7a2a7dd",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "## Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "e6fe7153",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!pip install langchain-nvidia-ai-endpoints langchain-core\n",
+        "\n",
+        "import os\n",
+        "\n",
+        "# RAG server URL (use collection from ingestion_api_usage.ipynb)\n",
+        "RAG_IPADDRESS = (\n",
+        "    \"rag-server\" if os.environ.get(\"AI_WORKBENCH\", \"false\") == \"true\" else \"localhost\"\n",
+        ")\n",
+        "RAG_BASE_URL = f\"http://{RAG_IPADDRESS}:8081\"\n",
+        "\n",
+        "# Collection from ingestion_api_usage.ipynb (default: multimodal_data)\n",
+        "COLLECTION_NAME = \"multimodal_data\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "640eee93",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "## Retrieval with NVIDIARAGRetriever\n",
+        "\n",
+        "The `NVIDIARAGRetriever` from `langchain-nvidia-ai-endpoints` connects to the NVIDIA RAG Blueprint `/v1/search` endpoint and returns LangChain `Document` objects. Use `COLLECTION_NAME` to match the collection you created in [ingestion_api_usage.ipynb](./ingestion_api_usage.ipynb)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 2.1 Basic Sync Retrieval"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "4b09138a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain_nvidia_ai_endpoints import NVIDIARAGRetriever\n",
+        "\n",
+        "retriever = NVIDIARAGRetriever(\n",
+        "    base_url=RAG_BASE_URL,\n",
+        "    collection_names=[COLLECTION_NAME],\n",
+        "    k=5,\n",
+        ")\n",
+        "\n",
+        "query = \"What are the main topics or products discussed?\"\n",
+        "docs = retriever.invoke(query)\n",
+        "\n",
+        "print(f\"Query: {query}\")\n",
+        "print(f\"Retrieved {len(docs)} documents:\\n\")\n",
+        "for i, doc in enumerate(docs, 1):\n",
+        "    content_preview = (doc.page_content or \"\")[:300] + \"...\" if len(doc.page_content or \"\") > 300 else (doc.page_content or \"\")\n",
+        "    score = doc.metadata.get(\"score\", \"N/A\")\n",
+        "    source = doc.metadata.get(\"document_name\", \"N/A\")\n",
+        "    print(f\"--- Document {i} ---\")\n",
+        "    print(f\"Score: {score} | Source: {source}\")\n",
+        "    print(f\"Content: {content_preview}\")\n",
+        "    print()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 2.2 Custom Retrieval Parameters"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "35d7f118",
+      "metadata": {},
+      "source": [
+        "For details on retrieval parameters, filter expressions, and metadata:\n",
+        "- [Custom metadata & filter expressions](../docs/custom-metadata.md) – `filter_expr` syntax (Milvus), metadata schema\n",
+        "- [Multi-turn & query rewriting](../docs/multiturn.md) – `enable_query_rewriting` for decontextualizing follow-up questions\n",
+        "- [Retriever API usage](./retriever_api_usage.ipynb) – Search endpoint payload parameters"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "6e5aa122",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "retriever_custom = NVIDIARAGRetriever(\n",
+        "    base_url=RAG_BASE_URL,\n",
+        "    collection_names=[COLLECTION_NAME],\n",
+        "    # Result counts\n",
+        "    k=3,  # Number of document chunks to return (0-25, maps to reranker_top_k)\n",
+        "    vdb_top_k=50,  # Top results from vector DB before reranking (0-400)\n",
+        "    # Feature toggles\n",
+        "    enable_reranker=True,  # Rerank results for relevance\n",
+        "    enable_query_rewriting=False,  # Rewrite query for better retrieval\n",
+        "    enable_filter_generator=False,  # Auto-generate filters from query\n",
+        "    enable_citations=True,  # Include image/table/chart citations in metadata\n",
+        "    # Filtering\n",
+        "    confidence_threshold=0.0,  # Min confidence (0.0-1.0, requires enable_reranker=True)\n",
+        "    filter_expr=None,  # Milvus filter expression, e.g. 'content_metadata[\"file_name\"] == \"doc.pdf\"'\n",
+        "    # Advanced\n",
+        "    vdb_endpoint=\"http://milvus:19530\",  # Vector DB endpoint (override if needed)\n",
+        "    messages=[],  # Conversation history for context-aware retrieval\n",
+        "    timeout=60.0,  # HTTP request timeout in seconds\n",
+        ")\n",
+        "\n",
+        "docs = retriever_custom.invoke(\"Summarize key information\")\n",
+        "print(f\"Retrieved {len(docs)} documents\")\n",
+        "for i, doc in enumerate(docs, 1):\n",
+        "    print(f\"  {i}. {doc.metadata.get('document_name', 'N/A')} (score: {doc.metadata.get('score', 'N/A')})\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 2.3 Async Retrieval"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "docs = await retriever.ainvoke(\"What features or benefits are described?\")\n",
+        "print(f\"Async retrieval: {len(docs)} documents\")\n",
+        "for i, doc in enumerate(docs[:3], 1):\n",
+        "    print(f\"  {i}. {doc.metadata.get('document_name', 'N/A')}\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 2.4 Error Handling"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "bfcc9f22",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain_nvidia_ai_endpoints.retrievers import (\n",
+        "    NVIDIARAGConnectionError,\n",
+        "    NVIDIARAGServerError,\n",
+        "    NVIDIARAGValidationError,\n",
+        ")\n",
+        "\n",
+        "try:\n",
+        "    bad_retriever = NVIDIARAGRetriever(\n",
+        "        base_url=\"http://invalid-host:8081\",\n",
+        "        collection_names=[COLLECTION_NAME],\n",
+        "    )\n",
+        "    bad_retriever.invoke(\"test\")\n",
+        "except NVIDIARAGConnectionError as e:\n",
+        "    print(f\"Connection error (expected): {e}\")\n",
+        "except NVIDIARAGValidationError as e:\n",
+        "    print(f\"Validation error: {e}\")\n",
+        "except NVIDIARAGServerError as e:\n",
+        "    print(f\"Server error ({e.status_code}): {e}\")\n",
+        "\n",
+        "print(\"\\nError handling works as expected.\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "3b5824c6",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "## RAG Chain (Optional)\n",
+        "\n",
+        "Chain `NVIDIARAGRetriever` with `ChatNVIDIA` for end-to-end question answering. Requires `NVIDIA_API_KEY` to call the NVIDIA API Catalog.\n",
+        "\n",
+        "**Get an API key:** See [Get an API Key](../docs/api-key.md) for instructions."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d82a5d54",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Set NVIDIA_API_KEY if not already set (see ../docs/api-key.md to get a key)\n",
+        "if not os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+        "    import getpass\n",
+        "    key = getpass.getpass(\"Enter your NVIDIA API key (nvapi-...): \")\n",
+        "    if key.startswith(\"nvapi-\"):\n",
+        "        os.environ[\"NVIDIA_API_KEY\"] = key\n",
+        "    else:\n",
+        "        print(\"NVIDIA_API_KEY not set. Set it to run the RAG chain. See [Get an API Key](../docs/api-key.md)\")\n",
+        "\n",
+        "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+        "    from langchain_core.output_parsers import StrOutputParser\n",
+        "    from langchain_core.prompts import ChatPromptTemplate\n",
+        "    from langchain_core.runnables import RunnablePassthrough\n",
+        "\n",
+        "    from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIARAGRetriever\n",
+        "\n",
+        "    retriever = NVIDIARAGRetriever(\n",
+        "        base_url=RAG_BASE_URL,\n",
+        "        collection_names=[COLLECTION_NAME],\n",
+        "        k=4,\n",
+        "    )\n",
+        "\n",
+        "    def format_docs(docs):\n",
+        "        return \"\\n\\n\".join(d.page_content for d in docs)\n",
+        "\n",
+        "    prompt = ChatPromptTemplate.from_messages([\n",
+        "        (\"system\", \"Answer based only on the context below.\\n\\n{context}\"),\n",
+        "        (\"human\", \"{question}\"),\n",
+        "    ])\n",
+        "\n",
+        "    # Model aligned with rag-server default (nvidia/llama-3.3-nemotron-super-49b-v1.5)\n",
+        "    llm = ChatNVIDIA(model=\"nvidia/llama-3.3-nemotron-super-49b-v1.5\")\n",
+        "    chain = (\n",
+        "        {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
+        "        | prompt\n",
+        "        | llm\n",
+        "        | StrOutputParser()\n",
+        "    )\n",
+        "\n",
+        "    answer = chain.invoke(\"What are the main topics or products?\")\n",
+        "    print(answer)\n",
+        "else:\n",
+        "    print(\"NVIDIA_API_KEY not set. Set it (e.g. os.environ['NVIDIA_API_KEY'] = 'nvapi-...') or run this cell again to be prompted. See [Get an API Key](../docs/api-key.md)\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "4c95fdc6",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "## Cleanup (Optional)\n",
+        "\n",
+        "To remove the collection and documents, use the delete cells in [ingestion_api_usage.ipynb](./ingestion_api_usage.ipynb) (sections 7 and 9)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Use ingestion_api_usage.ipynb sections 7 (Delete Documents) and 9 (Delete Collections)\n",
+        "# to remove the multimodal_data collection when finished."
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11.12"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
diff --git a/notebooks/launchable.ipynb b/notebooks/launchable.ipynb
index 8e5779af2..cccba6eba 100644
--- a/notebooks/launchable.ipynb
+++ b/notebooks/launchable.ipynb
@@ -127,6 +127,16 @@
     "RAG_BASE_URL = f\"http://{IPADDRESS}:{RAG_SERVER_PORT}\"\n",
     "INGESTOR_BASE_URL = f\"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}\"\n",
     "\n",
+    "# NIM services to deploy (excludes nim-llm and vlm-ms since we use NVIDIA-hosted endpoints)\n",
+    "NIM_SERVICES = (\n",
+    "    \"nemotron-embedding-ms \"\n",
+    "    \"nemotron-ranking-ms \"\n",
+    "    \"page-elements \"\n",
+    "    \"graphic-elements \"\n",
+    "    \"table-structure \"\n",
+    "    \"nemoretriever-ocr\"\n",
+    ")\n",
+    "\n",
     "\n",
     "# =============================================================================\n",
     "# DOCKER COMPOSE HELPERS\n",
@@ -792,7 +802,7 @@
     "            print(f\"   • {warn}\")\n",
     "    print(\"\\n\" + \"=\" * 70)\n",
     "    print(\"Please fix the above errors before continuing.\")\n",
-    "    print(\"See: https://docs.nvidia.com/ai-blueprints/rag/latest/support-matrix.html\")\n",
+    "    print(\"See: https://docs.nvidia.com/rag/latest/support-matrix.html\")\n",
     "    print(\"=\" * 70)\n",
     "elif warnings:\n",
     "    print(\"\\n✅ REQUIREMENTS MET (with warnings)\")\n",
@@ -914,7 +924,7 @@
     "import subprocess\n",
     "\n",
     "REPO_URL = \"https://github.com/NVIDIA-AI-Blueprints/rag.git\"\n",
-    "BRANCH = \"release-v2.4.0\"\n",
+    "BRANCH = \"release-v2.5.0\"\n",
     "#BRANCH = \"develop\"\n",
     "# Check if we're already in the rag repo (look for deploy/compose)\n",
     "if os.path.exists(\"deploy/compose\"):\n",
@@ -1295,10 +1305,10 @@
     "902445432dde   milvus-standalone                Up 3 minutes\n",
     "340bc8210a0d   milvus-minio                     Up 3 minutes (healthy)\n",
     "0be702b87ad6   milvus-etcd                      Up 3 minutes (healthy)\n",
-    "fe2751bfa734   nemoretriever-ranking-ms         Up 10 minutes (healthy)\n",
+    "fe2751bfa734   nemotron-ranking-ms              Up 4 seconds (healthy)\n",
     "7b5ddabf8be7   compose-graphic-elements-1       Up 10 minutes\n",
     "ecfaa5190302   compose-page-elements-1          Up 10 minutes\n",
-    "ea8c7fdf20d1   nemoretriever-embedding-ms       Up 10 minutes (healthy)\n",
+    "ea8c7fdf20d1   nemotron-embedding-ms            Up 4 seconds  (healthy)\n",
     "6d62008a9b42   compose-nemoretriever-ocr-1      Up 10 minutes\n",
     "969b9f5c987c   compose-table-structure-1        Up 10 minutes\n",
     "```\n",
@@ -2030,9 +2040,9 @@
     "\n",
     "## 📚 Additional Resources\n",
     "\n",
-    "- **Documentation**: https://docs.nvidia.com/ai-blueprints/rag/latest/\n",
+    "- **Documentation**: https://docs.nvidia.com/rag/latest/\n",
     "- **GitHub**: https://github.com/NVIDIA-AI-Blueprints/rag\n",
-    "- **Support Matrix**: https://docs.nvidia.com/ai-blueprints/rag/latest/support-matrix.html\n",
+    "- **Support Matrix**: https://docs.nvidia.com/rag/latest/support-matrix.html\n",
     "\n",
     "---\n",
     "\n",
@@ -2056,7 +2066,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.12"
+   "version": "3.12.13"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/nb_metadata.ipynb b/notebooks/nb_metadata.ipynb
index ad79dd7c7..39203d07b 100644
--- a/notebooks/nb_metadata.ipynb
+++ b/notebooks/nb_metadata.ipynb
@@ -951,8 +951,8 @@
     "  \"enable_reranker\": True,\n",
     "  \"enable_citations\": True,\n",
     "  \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "  \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "  \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "  \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "  \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "  # Provide url of the model endpoints if deployed elsewhere\n",
     "  # \"llm_endpoint\": \"\",\n",
     "  #\"embedding_endpoint\": \"\",\n",
@@ -1004,8 +1004,8 @@
     "  \"enable_reranker\": True,\n",
     "  \"enable_citations\": True,\n",
     "  \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "  \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "  \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "  \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "  \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "  # Provide url of the model endpoints if deployed elsewhere\n",
     "  # \"llm_endpoint\": \"\",\n",
     "  #\"embedding_endpoint\": \"\",\n",
@@ -1119,8 +1119,8 @@
     "    \"enable_citations\": True,\n",
     "    \"enable_filter_generator\": False,  # Disable to use manual complex filter\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    \"stop\": [],\n",
     "    \"filter_expr\": '(content_metadata[\"manufacturer\"] like \"%ford%\" and content_metadata[\"rating\"] > 4.0 and content_metadata[\"created_date\"] between \"2020-01-01\" and \"2024-12-31\" and content_metadata[\"is_public\"] == true) or (content_metadata[\"model\"] like \"%edge%\" and content_metadata[\"year\"] >= 2020 and content_metadata[\"tags\"] in [\"technology\", \"safety\", \"latest\"] and content_metadata[\"rating\"] >= 4.0)'\n",
     "}\n",
@@ -1186,8 +1186,8 @@
     "    \"enable_reranker\": True,\n",
     "    \"enable_citations\": True,\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    \"stop\": [],\n",
     "    \"filter_expr\": 'array_contains(content_metadata[\"tags\"], \"eco-friendly\")'\n",
     "}\n",
@@ -1257,8 +1257,8 @@
     "    \"enable_citations\": True,\n",
     "    \"enable_filter_generator\": True,  # 🎯 NEW FEATURE - Enable AI filter generation\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    \"stop\": [],\n",
     "    \"filter_expr\": \"\"  # Will be generated automatically by AI\n",
     "}\n",
@@ -1323,8 +1323,8 @@
     "  \"enable_reranker\": False,\n",
     "  \"enable_citations\": False,\n",
     "  \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "  \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "  \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "  \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "  \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "  # Provide url of the model endpoints if deployed elsewhere\n",
     "  # \"llm_endpoint\": \"\",\n",
     "  #\"embedding_endpoint\": \"\",\n",
@@ -1391,8 +1391,8 @@
     "    \"enable_citations\": True,\n",
     "    \"enable_filter_generator\": False,\n",
     "    \"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    \"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    \"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    \"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    \"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    \"stop\": [],\n",
     "    \"filter_expr\": 'content_metadata[\"nonexistent_field\"] == \"value\"'  # This will cause an error\n",
     "}\n",
diff --git a/notebooks/rag_event_ingest.ipynb b/notebooks/rag_event_ingest.ipynb
new file mode 100644
index 000000000..a38f976af
--- /dev/null
+++ b/notebooks/rag_event_ingest.ipynb
@@ -0,0 +1,793 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Document Continuous Ingestion from Object Storage\n",
+        "\n",
+        "## Purpose\n",
+        "\n",
+        "This notebook demonstrates an **automated document ingestion pipeline** that:\n",
+        "\n",
+        "1. Monitors emulated object storage for new uploads via Kafka events\n",
+        "2. Routes documents to appropriate AI services for indexing\n",
+        "3. Enables RAG Agent for semantic search and contextual Q&A over all ingested content\n",
+        "\n",
+        "## What Gets Deployed\n",
+        "\n",
+        "1. **NVIDIA RAG** - Document indexing, vector search, and AI-powered Q&A (NIMs, Milvus, Ingestor)\n",
+        "2. **Continuous Ingestion** - Event-driven ingestion pipeline (Kafka, MinIO, Consumer)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Prerequisites\n",
+        "\n",
+        "### Hardware\n",
+        "- **GPU**: 2x RTX PRO 6000 Blackwell or 2x H100\n",
+        "\n",
+        "#### Default GPU Assignment\n",
+        "\n",
+        "| GPU | Service |\n",
+        "|-----|---------|\n",
+        "| 0 | RAG NIMs (Embedding, Reranker) |\n",
+        "| 1 | RAG LLM NIM (Llama-3.3-Nemotron-Super-49B) |\n",
+        "\n",
+        "\n",
+        "### Software (must be pre-installed)\n",
+        "- Ubuntu 22.04 or later\n",
+        "- Docker 24.0+ with Docker Compose v2\n",
+        "- NVIDIA Driver 570+\n",
+        "- NVIDIA Container Toolkit\n",
+        "\n",
+        "### API Keys\n",
+        "\n",
+        "<table style=\"margin-left: 0;\">\n",
+        "<tr><th>Key</th><th>Purpose</th><th>How to Get</th></tr>\n",
+        "<tr><td><code>NGC_API_KEY</code></td><td>Docker login, NIM deployments</td><td><a href=\"https://org.ngc.nvidia.com/setup/api-keys\">NGC Portal</a> → Generate API Key</td></tr>\n",
+        "</table>\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Table of Contents\n",
+        "\n",
+        "<table style=\"margin-left: 0;\">\n",
+        "<tr><th>Section</th><th>Description</th></tr>\n",
+        "<tr><td><b>Setup</b></td><td>Clone repo, install deps, set API keys, load helpers</td></tr>\n",
+        "<tr><td><b>Deploy RAG</b></td><td>NIMs, Vector DB, Ingestor, RAG Server</td></tr>\n",
+        "<tr><td><b>Deploy Continuous Ingestion</b></td><td>Kafka, MinIO, Consumer</td></tr>\n",
+        "<tr><td><b>Testing</b></td><td>Upload documents, query RAG</td></tr>\n",
+        "<tr><td><b>Clean Up</b></td><td>Stop services, clean data</td></tr>\n",
+        "</table>\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## References\n",
+        "\n",
+        "- **RAG Blueprint**: [NVIDIA RAG Documentation](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md)\n",
+        "- **NIM**: [NVIDIA NIM Documentation](https://docs.nvidia.com/nim/index.html)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Setup\n",
+        "\n",
+        "Clone the repository, configure API keys, and load helper functions.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 1. Clone Repository\n",
+        "\n",
+        "Clone the RAG Blueprint repo to `~/rag`. This includes the consumer source code, deploy configs, and sample test data.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import subprocess, sys, os, shutil\n",
+        "\n",
+        "RAG_REPO_DIR = os.path.expanduser(\"~/rag\")\n",
+        "RAG_REPO_URL = \"https://github.com/NVIDIA-AI-Blueprints/rag.git\"\n",
+        "\n",
+        "# Ensure git-lfs is installed before any LFS operations\n",
+        "if not shutil.which(\"git-lfs\"):\n",
+        "    print(\"[INSTALLING] git-lfs...\")\n",
+        "    subprocess.run(\"sudo apt-get update && sudo apt-get install -y git-lfs && git lfs install\", shell=True, check=True)\n",
+        "else:\n",
+        "    print(\"[OK] git-lfs found\")\n",
+        "\n",
+        "# Clone the repository (skipped if the target directory already exists)\n",
+        "if not os.path.exists(RAG_REPO_DIR):\n",
+        "    subprocess.run(f\"git clone {RAG_REPO_URL} {RAG_REPO_DIR}\", shell=True, check=True)\n",
+        "else:\n",
+        "    print(f\"[OK] RAG repo already exists: {RAG_REPO_DIR}\")\n",
+        "subprocess.run(\"git lfs pull\", shell=True, cwd=RAG_REPO_DIR, check=True)\n",
+        "\n",
+        "# Verify\n",
+        "for path in [\"deploy/compose\", \"examples/rag_event_ingest/kafka_consumer\", \"examples/rag_event_ingest/data\"]:\n",
+        "    status = \"[OK]\" if os.path.exists(os.path.join(RAG_REPO_DIR, path)) else \"[MISSING]\"\n",
+        "    print(f\"  {status} {path}\")\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 2. Install Dependencies\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "! python3 -m ensurepip --upgrade"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Ensure pip is available (some minimal Python installs lack it)\n",
+        "subprocess.run([sys.executable, \"-m\", \"ensurepip\", \"--upgrade\"], capture_output=True)\n",
+        "\n",
+        "def check_install_system_pkg(cmd: str, install_cmd: str):\n",
+        "    if shutil.which(cmd):\n",
+        "        print(f\"  [OK] {cmd} found\")\n",
+        "        return True\n",
+        "    print(f\"  [INSTALLING] {cmd}...\")\n",
+        "    result = subprocess.run(install_cmd, shell=True, capture_output=True, text=True)\n",
+        "    if result.returncode == 0:\n",
+        "        print(f\"  [OK] {cmd} installed\")\n",
+        "        return True\n",
+        "    print(f\"  [ERROR] Failed to install {cmd}. Please install manually: {install_cmd}\")\n",
+        "    return False\n",
+        "\n",
+        "check_install_system_pkg(\"git\", \"sudo apt-get update && sudo apt-get install -y git\")\n",
+        "\n",
+        "# Install Python packages\n",
+        "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"minio\", \"aiohttp\", \"requests\", \"python-dotenv\", \"pyyaml\"])\n",
+        "print(\"[OK] Ready\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 3. Set API Keys\n",
+        "\n",
+        "Configure NGC API key for NIM deployments.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import getpass\n",
+        "\n",
+        "def set_api_key(env_var: str, prompt: str, required: bool = True):\n",
+        "    if os.environ.get(env_var):\n",
+        "        print(f\"  [OK] {env_var} already set ({os.environ[env_var][:10]}...)\")\n",
+        "        return True\n",
+        "    key = getpass.getpass(prompt)\n",
+        "    if key:\n",
+        "        os.environ[env_var] = key\n",
+        "        print(f\"  [OK] {env_var} set\")\n",
+        "        return True\n",
+        "    if required:\n",
+        "        print(f\"  [ERROR] {env_var} is required\")\n",
+        "        return False\n",
+        "    print(f\"  [SKIP] {env_var} (optional)\")\n",
+        "    return True\n",
+        "\n",
+        "set_api_key(\"NGC_API_KEY\", \"Enter NGC_API_KEY (starts with 'nvapi-'): \", required=True)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 4. Helper Functions\n",
+        "\n",
+        "Shared utilities for deployment, file upload, status checks, and RAG queries.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Install dependencies\n",
+        "import sys\n",
+        "!{sys.executable} -m pip install -q minio aiohttp requests python-dotenv"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os, sys, json, re, subprocess, time, socket, asyncio\n",
+        "import aiohttp, requests\n",
+        "from typing import List, Optional, Dict\n",
+        "\n",
+        "try:\n",
+        "    from minio import Minio\n",
+        "    from minio.error import S3Error\n",
+        "except ImportError:\n",
+        "    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"minio\"])\n",
+        "    from minio import Minio\n",
+        "    from minio.error import S3Error\n",
+        "\n",
+        "# =============================================================================\n",
+        "# CONFIGURATION\n",
+        "# =============================================================================\n",
+        "\n",
+        "# Paths relative to RAG repo root\n",
+        "RAG_REPO_DIR = os.path.expanduser(\"~/rag\")\n",
+        "EXAMPLE_DIR = os.path.join(RAG_REPO_DIR, \"examples/rag_event_ingest\")\n",
+        "AIDP_COMPOSE_FILE = os.path.join(EXAMPLE_DIR, \"deploy/docker-compose.yaml\")\n",
+        "DATA_DIR = os.path.join(EXAMPLE_DIR, \"data\")\n",
+        "RAG_SERVER_URL = \"http://localhost:8081\"\n",
+        "INGESTOR_URL = \"http://localhost:8082\"\n",
+        "\n",
+        "LOCAL_NIM_CACHE = os.path.expanduser(\"~/.cache/nim\")\n",
+        "\n",
+        "MINIO_ENDPOINT = \"localhost:9201\"\n",
+        "MINIO_ACCESS_KEY = \"minioadmin\"\n",
+        "MINIO_SECRET_KEY = \"minioadmin\"\n",
+        "MINIO_BUCKET = \"aidp-bucket\"\n",
+        "MINIO_COLLECTION = \"aidp_bucket\"\n",
+        "MINIO_CONSOLE_PORT = 9211\n",
+        "\n",
+        "# =============================================================================\n",
+        "# SHARED UTILITIES\n",
+        "# =============================================================================\n",
+        "\n",
+        "def run_command(cmd: str, capture: bool = False) -> Optional[str]:\n",
+        "    \"\"\"Execute a shell command and print it.\"\"\"\n",
+        "    print(f\"$ {cmd}\")\n",
+        "    result = subprocess.run(cmd, shell=True, capture_output=capture, text=True)\n",
+        "    return result.stdout if capture else None\n",
+        "\n",
+        "def get_host_ip() -> str:\n",
+        "    \"\"\"Get host IP address for external access URLs.\"\"\"\n",
+        "    try:\n",
+        "        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)\n",
+        "        s.connect((\"8.8.8.8\", 80))\n",
+        "        ip = s.getsockname()[0]\n",
+        "        s.close()\n",
+        "        return ip\n",
+        "    except OSError:\n",
+        "        return \"localhost\"\n",
+        "\n",
+        "def get_minio_client() -> Minio:\n",
+        "    \"\"\"Create MinIO client for AIDP bucket operations.\"\"\"\n",
+        "    return Minio(MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)\n",
+        "\n",
+        "def upload_file(local_path: str, object_name: Optional[str] = None) -> bool:\n",
+        "    \"\"\"Upload a local file to MinIO AIDP bucket.\"\"\"\n",
+        "    if not os.path.exists(local_path):\n",
+        "        print(f\"[ERROR] File not found: {local_path}\")\n",
+        "        return False\n",
+        "    obj = object_name or os.path.basename(local_path)\n",
+        "    try:\n",
+        "        client = get_minio_client()\n",
+        "        if not client.bucket_exists(MINIO_BUCKET):\n",
+        "            client.make_bucket(MINIO_BUCKET)\n",
+        "        client.fput_object(MINIO_BUCKET, obj, local_path)\n",
+        "        print(f\"[OK] Uploaded: {obj}\")\n",
+        "        return True\n",
+        "    except S3Error as e:\n",
+        "        print(f\"[ERROR] {e}\")\n",
+        "        return False\n",
+        "\n",
+        "def verify_file_in_storage(object_name: str, bucket: str = MINIO_BUCKET) -> bool:\n",
+        "    \"\"\"Check if a file exists in MinIO bucket and print verification status.\"\"\"\n",
+        "    try:\n",
+        "        client = get_minio_client()\n",
+        "        stat = client.stat_object(bucket, object_name)\n",
+        "        print(f\"[OK] File verified in storage:\")\n",
+        "        print(f\"  Bucket:   {bucket}\")\n",
+        "        print(f\"  Object:   {object_name}\")\n",
+        "        print(f\"  Size:     {stat.size:,} bytes\")\n",
+        "        print(f\"  Modified: {stat.last_modified}\")\n",
+        "        return True\n",
+        "    except S3Error as e:\n",
+        "        print(f\"[ERROR] File not found in storage: {object_name}\")\n",
+        "        print(f\"  Error: {e}\")\n",
+        "        return False\n",
+        "\n",
+        "def get_consumer_logs(lines: int = 30) -> None:\n",
+        "    \"\"\"Show recent Kafka consumer logs.\"\"\"\n",
+        "    run_command(f\"docker logs kafka-consumer --tail {lines}\")\n",
+        "\n",
+        "async def query_rag(question: str, collection: str = None) -> Optional[str]:\n",
+        "    \"\"\"Query RAG system and print the answer.\"\"\"\n",
+        "    coll = collection or MINIO_COLLECTION\n",
+        "    print(f\"Q: {question}\\nCollection: {coll}\\n\" + \"-\" * 40)\n",
+        "\n",
+        "    payload = {\n",
+        "        \"messages\": [{\"role\": \"user\", \"content\": question}],\n",
+        "        \"use_knowledge_base\": True,\n",
+        "        \"collection_names\": [coll],\n",
+        "    }\n",
+        "    try:\n",
+        "        async with aiohttp.ClientSession() as session:\n",
+        "            async with session.post(\n",
+        "                f\"{RAG_SERVER_URL}/generate\", json=payload,\n",
+        "                timeout=aiohttp.ClientTimeout(total=120),\n",
+        "            ) as resp:\n",
+        "                text = await resp.text()\n",
+        "                # Parse SSE response: extract content from each \"data: {...}\" line\n",
+        "                chunks = []\n",
+        "                for line in text.split(\"\\n\"):\n",
+        "                    if not line.startswith(\"data: \") or line[6:] == \"[DONE]\":\n",
+        "                        continue\n",
+        "                    try:\n",
+        "                        msg = json.loads(line[6:]).get(\"choices\", [{}])[0].get(\"message\", {})\n",
+        "                        if msg.get(\"content\"):\n",
+        "                            chunks.append(msg[\"content\"])\n",
+        "                    except json.JSONDecodeError:\n",
+        "                        pass\n",
+        "                answer = \"\".join(chunks)\n",
+        "                print(f\"Answer: {answer}\")\n",
+        "                return answer\n",
+        "    except aiohttp.ClientError as e:\n",
+        "        print(f\"[ERROR] {e}\")\n",
+        "        return None\n",
+        "\n",
+        "print(f\"[OK] Helpers loaded | Host IP: {get_host_ip()}\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Deploy NVIDIA RAG\n",
+        "\n",
+        "Deploy the NVIDIA RAG stack: NIMs (LLM, Embedding, Reranker), the Milvus vector database, the Ingestor server, and the RAG server.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "ngc_key = os.environ.get(\"NGC_API_KEY\")\n",
+        "if not ngc_key:\n",
+        "    raise RuntimeError(\"NGC_API_KEY not set! Run the API keys cell first.\")\n",
+        "\n",
+        "os.chdir(RAG_REPO_DIR)\n",
+        "\n",
+        "# Set env vars needed by docker compose\n",
+        "os.environ[\"NGC_API_KEY\"] = ngc_key\n",
+        "os.environ[\"USERID\"] = f\"{os.getuid()}:{os.getgid()}\"\n",
+        "os.environ[\"COLLECTION_NAME\"] = MINIO_COLLECTION\n",
+        "\n",
+        "# Load RAG .env defaults (MODEL_DIRECTORY, etc.)\n",
+        "from dotenv import load_dotenv\n",
+        "env_file = os.path.join(RAG_REPO_DIR, \"deploy/compose/.env\")\n",
+        "if os.path.exists(env_file):\n",
+        "    load_dotenv(env_file, override=False)\n",
+        "\n",
+        "# Login to nvcr.io\n",
+        "subprocess.run(f\"echo {ngc_key} | docker login nvcr.io -u '$oauthtoken' --password-stdin\",\n",
+        "               shell=True, capture_output=True, text=True, executable=\"/bin/bash\")\n",
+        "\n",
+        "# Deploy components\n",
+        "for label, compose_file in [\n",
+        "    (\"NIMs\",      \"deploy/compose/nims.yaml\"),\n",
+        "    (\"Vector DB\", \"deploy/compose/vectordb.yaml\"),\n",
+        "]:\n",
+        "    print(f\"Deploying {label}...\")\n",
+        "    run_command(f\"USERID=$(id -u) docker compose -f {compose_file} up -d\")\n",
+        "\n",
+        "print(\"Waiting 30s for Milvus...\")\n",
+        "time.sleep(30)\n",
+        "\n",
+        "for label, compose_file in [\n",
+        "    (\"Ingestor\", \"deploy/compose/docker-compose-ingestor-server.yaml\"),\n",
+        "    (\"RAG Server\", \"deploy/compose/docker-compose-rag-server.yaml\"),\n",
+        "]:\n",
+        "    print(f\"Deploying {label}...\")\n",
+        "    run_command(f\"docker compose -f {compose_file} up -d\")\n",
+        "\n",
+        "ip = get_host_ip()\n",
+        "print(f\"\\nRAG deployed: http://{ip}:8081 (server) | http://{ip}:8082 (ingestor) | http://{ip}:8090 (UI)\")\n",
+        "print(f\"COLLECTION_NAME: {MINIO_COLLECTION}\")\n",
+        "print(\"Wait ~10 minutes for NIMs to load models, then run the status check cell.\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Verify RAG services are healthy. Wait ~10 minutes for NIMs to load models.\n",
+        "\n",
+        "The deployment status should be:\n",
+        "```\n",
+        "NAMES                            STATUS\n",
+        "rag-frontend                     Up About a minute\n",
+        "rag-server                       Up About a minute\n",
+        "ingestor-server                  Up About a minute\n",
+        "milvus-standalone                Up 2 minutes (healthy)\n",
+        "milvus-etcd                      Up 2 minutes (healthy)\n",
+        "milvus-minio                     Up 2 minutes (healthy)\n",
+        "nim-llm-ms                       Up 2 minutes (healthy)\n",
+        "nemotron-embedding-ms            Up 2 minutes (healthy)\n",
+        "nemotron-ranking-ms              Up 2 minutes (healthy)\n",
+        "```\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check service status and print access URLs\n",
+        "print(\"Wait ~10 minutes for services to become healthy.\")\n",
+        "print(\"Run this cell again after waiting.\\n\")\n",
+        "\n",
+        "ip = get_host_ip()\n",
+        "for name, port, path in [\n",
+        "    (\"RAG Server\", 8081, \"/health\"), (\"Ingestor\", 8082, \"/health\"),\n",
+        "    (\"Frontend\", 8090, \"/\"), (\"Milvus\", 19530, \"/v1/vector/collections\"),\n",
+        "]:\n",
+        "    try:\n",
+        "        s = \"[OK]\" if requests.get(f\"http://localhost:{port}{path}\", timeout=10).status_code == 200 else \"[WARN]\"\n",
+        "    except requests.ConnectionError:\n",
+        "        s = \"[DOWN]\"\n",
+        "    except requests.Timeout:\n",
+        "        s = \"[TIMEOUT]\"\n",
+        "    print(f\"  {s} {name}: http://{ip}:{port}\")\n",
+        "run_command(\"docker ps --format 'table {{.Names}}\\t{{.Status}}' | grep -E '(rag|milvus|ingestor|nim|nemotron|NAMES)'\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Deploy Continuous Ingestion from emulated object storage\n",
+        "\n",
+        "Deploy the Continuous Ingestion pipeline: the Kafka message broker, MinIO object storage, and the Kafka consumer that automates ingestion.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 1. Deploy Services\n",
+        "\n",
+        "Deploy Kafka, MinIO, and the Kafka consumer."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Verify prerequisites\n",
+        "net_check = subprocess.run(\"docker network inspect nvidia-rag\", shell=True, capture_output=True)\n",
+        "if net_check.returncode != 0:\n",
+        "    raise RuntimeError(\"nvidia-rag network not found. Deploy RAG first.\")\n",
+        "\n",
+        "ngc_key = os.environ.get(\"NGC_API_KEY\", \"\")\n",
+        "if not ngc_key:\n",
+        "    raise RuntimeError(\"NGC_API_KEY not set!\")\n",
+        "\n",
+        "host_ip = get_host_ip()\n",
+        "\n",
+        "# Set environment variables for docker compose\n",
+        "os.environ[\"HOST_IP\"] = host_ip\n",
+        "\n",
+        "# Login + pull + build\n",
+        "subprocess.run(f\"echo {ngc_key} | docker login nvcr.io -u '$oauthtoken' --password-stdin\",\n",
+        "               shell=True, capture_output=True, text=True, executable=\"/bin/bash\")\n",
+        "\n",
+        "compose = f\"docker compose -f {AIDP_COMPOSE_FILE}\"\n",
+        "subprocess.run(f\"{compose} pull --ignore-pull-failures\", shell=True, capture_output=True, text=True, executable=\"/bin/bash\")\n",
+        "subprocess.run(f\"{compose} up -d --build\", shell=True, capture_output=True, text=True, executable=\"/bin/bash\")\n",
+        "\n",
+        "print(f\"Continuous Ingestion deployed:\")\n",
+        "print(f\"  Kafka UI:      http://{host_ip}:8080\")\n",
+        "print(f\"  MinIO Console: http://{host_ip}:{MINIO_CONSOLE_PORT}\")\n",
+        "print(f\"  Credentials:   minioadmin / minioadmin\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Verify continuous ingestion services are running.\n",
+        "\n",
+        "The deployment status should be:\n",
+        "```\n",
+        "NAMES                            STATUS\n",
+        "kafka-consumer                   Up About a minute\n",
+        "aidp-kafka-ui                    Up About a minute\n",
+        "aidp-minio-mc                    Up About a minute\n",
+        "aidp-minio                       Up About a minute (healthy)\n",
+        "kafka                            Up About a minute (healthy)\n",
+        "```\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check service status and print access URLs\n",
+        "ip = get_host_ip()\n",
+        "for name, port, path in [\n",
+        "    (\"Kafka UI\", 8080, \"/\"),\n",
+        "    (\"MinIO Console\", MINIO_CONSOLE_PORT, \"/\"),\n",
+        "]:\n",
+        "    try:\n",
+        "        s = \"[OK]\" if requests.get(f\"http://localhost:{port}{path}\", timeout=10).status_code == 200 else \"[WARN]\"\n",
+        "    except requests.ConnectionError:\n",
+        "        s = \"[DOWN]\"\n",
+        "    except requests.Timeout:\n",
+        "        s = \"[TIMEOUT]\"\n",
+        "    print(f\"  {s} {name}: http://{ip}:{port}\")\n",
+        "\n",
+        "# Check kafka-consumer container status\n",
+        "result = subprocess.run(\"docker inspect -f '{{.State.Status}}' kafka-consumer 2>/dev/null\",\n",
+        "                        shell=True, capture_output=True, text=True)\n",
+        "status = result.stdout.strip()\n",
+        "s = \"[OK]\" if status == \"running\" else \"[DOWN]\"\n",
+        "print(f\"  {s} Kafka Consumer: {status or 'not found'}\")\n",
+        "\n",
+        "run_command(\"docker ps --format 'table {{.Names}}\\t{{.Status}}' | grep -E '(kafka|minio|NAMES)'\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Testing\n",
+        "\n",
+        "Test the deployment by uploading documents, then querying via RAG.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 1. Document Upload\n",
+        "\n",
+        "Upload a PDF document to MinIO, which triggers automatic ingestion via the Kafka consumer.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 1.1 Upload to Storage\n",
+        "\n",
+        "Upload the document to MinIO object storage.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Sample documents are included in the repo under examples/rag_event_ingest/data/\n",
+        "pdf_path = os.path.join(DATA_DIR, \"documents\", \"Seahawks-Patriots in Super Bowl LX_ What We Learned from Seattle's 29-13 win.pdf\")\n",
+        "upload_file(pdf_path, \"Seahawks-Patriots_SuperBowl_LX_Analysis.pdf\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 1.2 Verify File in Object Storage"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Confirm that the uploaded document landed in the MinIO bucket. The output should list the object's metadata:\n",
+        "\n",
+        "```\n",
+        "[OK] File verified in storage:\n",
+        "  Bucket:   aidp-bucket\n",
+        "  Object:   Seahawks-Patriots_SuperBowl_LX_Analysis.pdf\n",
+        "  Size:     ... bytes\n",
+        "  Modified: ...\n",
+        "```\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Verify file landed in object storage\n",
+        "verify_file_in_storage(\"Seahawks-Patriots_SuperBowl_LX_Analysis.pdf\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 1.3 Verify Document Ingestion\n",
+        "\n",
+        "Check consumer logs to verify document processing status.\n",
+        "\n",
+        "The logs should show the document being picked up and successfully ingested:\n",
+        "```\n",
+        "services.document_indexer - INFO - Task ...: PENDING (0s)\n",
+        "services.document_indexer - INFO - Task ...: PENDING (5s)\n",
+        "handlers.base - INFO - [DocumentHandler] ✓ Seahawks-Patriots_SuperBowl_LX_Analysis.pdf → SUCCESS\n",
+        "consumer - INFO - ✓ SUMMARY: Seahawks-Patriots_SuperBowl_LX_Analysis.pdf | Collection: aidp_bucket | Duration: 12.76s | Status: SUCCESS\n",
+        "```"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check consumer logs for ingestion status\n",
+        "print(\"Waiting for document processing...\")\n",
+        "get_consumer_logs(50)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 1.4 Query Document via RAG\n",
+        "\n",
+        "You can query the ingested document either **programmatically** below or via the **RAG Frontend UI**.\n",
+        "\n",
+        "> **💡 RAG Frontend**: Open `http://<host-ip>:8090` in your browser for an interactive Q&A interface.\n",
+        "> Make sure to select the collection **`aidp_bucket`** in the UI.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Query the document\n",
+        "await query_rag(\"What was the final score and who won Super Bowl LX?\", MINIO_COLLECTION)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Ask another question about the document.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Query about key takeaways\n",
+        "await query_rag(\"What were the key lessons learned from Seattle's victory in Super Bowl LX?\", MINIO_COLLECTION)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Clean Up\n",
+        "\n",
+        "Stop all services and clean up ingested data.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 1. Stop RAG Deployment\n",
+        "\n",
+        "Stop all RAG services (NIMs, Milvus, Ingestor, RAG server).\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "os.chdir(RAG_REPO_DIR)\n",
+        "for f in [\n",
+        "    \"deploy/compose/docker-compose-rag-server.yaml\",\n",
+        "    \"deploy/compose/docker-compose-ingestor-server.yaml\",\n",
+        "    \"deploy/compose/vectordb.yaml\",\n",
+        "    \"deploy/compose/nims.yaml\",\n",
+        "]:\n",
+        "    run_command(f\"docker compose -f {f} down\")\n",
+        "print(\"[OK] RAG stopped\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 2. Stop Continuous Ingestion Deployment\n",
+        "\n",
+        "Stop the Continuous Ingestion services (Kafka, MinIO, Consumer).\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "run_command(f\"docker compose -f {AIDP_COMPOSE_FILE} down\")\n",
+        "print(\"[OK] Continuous ingestion stopped\")\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 4
+}
diff --git a/notebooks/rag_library_lite_usage.ipynb b/notebooks/rag_library_lite_usage.ipynb
index eb91c80b6..915eb00ca 100644
--- a/notebooks/rag_library_lite_usage.ipynb
+++ b/notebooks/rag_library_lite_usage.ipynb
@@ -36,7 +36,7 @@
     "\n",
     "Install nv-ingest library using below command - **OR** - Run the cell below if Jupyter notebook is started in the same environment:\n",
     "```bash\n",
-    "uv pip install nv-ingest==26.1.1\n",
+    "uv pip install nv-ingest==26.1.2\n",
     "```"
    ]
   },
@@ -71,7 +71,7 @@
     "# !uv pip install ../dist/nvidia_rag-*-py3-none-any.whl[all]\n",
     "\n",
     "# Install NV-Ingest library in the same environment to run NV-Ingest pipeline\n",
-    "!uv pip install nv-ingest==26.1.1"
+    "!uv pip install nv-ingest==26.1.2"
    ]
   },
   {
@@ -150,15 +150,15 @@
     "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
     "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
     ")\n",
     "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\""
    ]
@@ -292,7 +292,7 @@
     "\n",
     "config_ingestor = NvidiaRAGConfig.from_yaml(\"config.yaml\")\n",
     "# You can update the config object to use different models and endpoints like below\n",
-    "# config_ingestor.embeddings.model_name = \"nvidia/llama-3.2-nv-embedqa-1b-v2\"\n",
+    "# config_ingestor.embeddings.model_name = \"nvidia/llama-nemotron-embed-1b-v2\"\n",
     "# config_ingestor.embeddings.server_url = \"https://integrate.api.nvidia.com/v1\"\n",
     "\n",
     "# Set config for rag lite library mode\n",
diff --git a/notebooks/rag_library_usage.ipynb b/notebooks/rag_library_usage.ipynb
index 01f07fb4a..894e82620 100644
--- a/notebooks/rag_library_usage.ipynb
+++ b/notebooks/rag_library_usage.ipynb
@@ -75,7 +75,8 @@
    "outputs": [],
    "source": [
     "# Option A: Install from PyPI (recommended)\n",
-    "# Uncomment the line below to install from PyPI\n",
+    "# Uncomment the line below to install from PyPI.\n",
+    "# Note: This will require a restart of the kernel after installation if you are using this notebook in a JupyterLab session.\n",
     "# !uv pip install nvidia-rag[all]\n",
     "\n",
     "# Option B: Install from source in development mode (for contributors)\n",
@@ -307,12 +308,12 @@
     "Ensure all the below are running and healthy before proceeding further\n",
     "```output\n",
     "NAMES                           STATUS\n",
-    "nemoretriever-ranking-ms        Up ... (healthy)\n",
+    "nemotron-ranking-ms             Up ... (healthy)\n",
     "compose-page-elements-1         Up ...\n",
     "compose-nemoretriever-ocr-1     Up ...\n",
     "compose-graphic-elements-1      Up ...\n",
     "compose-table-structure-1       Up ...\n",
-    "nemoretriever-embedding-ms      Up ... (healthy)\n",
+    "nemotron-embedding-ms           Up ... (healthy)\n",
     "nim-llm-ms                      Up ... (healthy)\n",
     "```"
    ]
@@ -337,15 +338,15 @@
     "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
     "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
     ")\n",
     "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\""
    ]
@@ -439,7 +440,7 @@
     "    config_ingestor.llm.server_url = \"\"  # Empty uses NVIDIA API catalog\n",
     "    config_ingestor.summarizer.server_url = \"\"  # Empty uses NVIDIA API catalog\n",
     "else:\n",
-    "    config_ingestor.embeddings.server_url = \"http://nemoretriever-embedding-ms:8000/v1\"\n",
+    "    config_ingestor.embeddings.server_url = \"http://nemotron-embedding-ms:8000/v1\"\n",
     "ingestor = NvidiaRAGIngestor(config=config_ingestor)"
    ]
   },
@@ -624,11 +625,11 @@
     "#             \"server_url\": \"\",\n",
     "#         },\n",
     "#     \"embeddings\": {\n",
-    "#             \"model_name\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "#             \"model_name\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "#             \"server_url\": \"https://integrate.api.nvidia.com/v1\",\n",
     "#         },\n",
     "#     \"ranking\": {\n",
-    "#             \"model_name\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
+    "#             \"model_name\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
     "#             \"server_url\": \"https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking/v1\",\n",
     "#         },\n",
     "#     }\n",
diff --git a/notebooks/retriever_api_usage.ipynb b/notebooks/retriever_api_usage.ipynb
index 47466e3ed..9b52647ed 100644
--- a/notebooks/retriever_api_usage.ipynb
+++ b/notebooks/retriever_api_usage.ipynb
@@ -145,8 +145,8 @@
     "    \"filter_expr\": \"\",\n",
     "    # Override model endpoints and details if needed\n",
     "    #\"model\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
-    "    #\"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    #\"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    #\"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    #\"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    #\"llm_endpoint\": \"\",\n",
     "    #\"embedding_endpoint\": \"\",\n",
     "    #\"reranker_endpoint\": \"\",\n",
@@ -271,8 +271,8 @@
     "    \"enable_query_rewriting\": False,\n",
     "    \"enable_reranker\": True,\n",
     "    # Override model endpoints and details if needed\n",
-    "    #\"reranker_model\": \"nvidia/llama-3.2-nv-rerankqa-1b-v2\",\n",
-    "    #\"embedding_model\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n",
+    "    #\"reranker_model\": \"nvidia/llama-nemotron-rerank-1b-v2\",\n",
+    "    #\"embedding_model\": \"nvidia/llama-nemotron-embed-1b-v2\",\n",
     "    #\"embedding_endpoint\": \"\",\n",
     "    #\"reranker_endpoint\": \"\",\n",
     "}\n",
diff --git a/notebooks/summarization.ipynb b/notebooks/summarization.ipynb
index f824f8e7e..d0c7ef285 100644
--- a/notebooks/summarization.ipynb
+++ b/notebooks/summarization.ipynb
@@ -388,12 +388,12 @@
     "Ensure all the below are running and healthy before proceeding further\n",
     "```output\n",
     "NAMES                           STATUS\n",
-    "nemoretriever-ranking-ms        Up ... (healthy)\n",
+    "nemotron-ranking-ms             Up ... (healthy)\n",
     "compose-page-elements-1         Up ...\n",
     "compose-nemoretriever-ocr-1     Up ...\n",
     "compose-graphic-elements-1      Up ...\n",
     "compose-table-structure-1       Up ...\n",
-    "nemoretriever-embedding-ms      Up ... (healthy)\n",
+    "nemotron-embedding-ms           Up ... (healthy)\n",
     "nim-llm-ms                      Up ... (healthy)\n",
     "```"
    ]
@@ -419,15 +419,15 @@
     "os.environ[\"OCR_HTTP_ENDPOINT\"] = \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr\"\n",
     "os.environ[\"OCR_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3\"\n",
     ")\n",
     "os.environ[\"YOLOX_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL\"] = \"http\"\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT\"] = (\n",
-    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1\"\n",
+    "    \"https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1\"\n",
     ")\n",
     "os.environ[\"YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL\"] = \"http\""
    ]
@@ -548,8 +548,8 @@
     "    config.ranking.server_url = \"https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking/v1\"\n",
     "    config.summarizer.server_url = \"\"  # Empty uses NVIDIA API catalog\n",
     "else:\n",
-    "    config.embeddings.server_url = \"nemoretriever-embedding-ms:8000/v1\"\n",
-    "    config.ranking.server_url = \"nemoretriever-ranking-ms:8000\"\n",
+    "    config.embeddings.server_url = \"nemotron-embedding-ms:8000/v1\"\n",
+    "    config.ranking.server_url = \"nemotron-ranking-ms:8000\"\n",
     "    config.summarizer.server_url = \"nim-llm:8000\"\n",
     "    config.llm.server_url = \"nim-llm:8000\"\n",
     "\n",
@@ -967,7 +967,7 @@
     "else:\n",
     "    os.environ[\"SUMMARY_LLM_SERVERURL\"] = \"nim-llm:8000\"\n",
     "    os.environ[\"LLM_SERVER_URL\"] = \"nim-llm:8000\"\n",
-    "    os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"nemoretriever-embedding-ms:8000/v1\"\n",
+    "    os.environ[\"APP_EMBEDDINGS_SERVERURL\"] = \"nemotron-embedding-ms:8000/v1\"\n",
     "    print(\"✓ Configured for on-prem NIMs\")\n",
     "\n",
     "os.environ[\"LOGLEVEL\"] = \"INFO\"\n",
diff --git a/pyproject.toml b/pyproject.toml
index 09b647403..85b6d8df7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "nvidia_rag"
-version = "2.4.0.dev"
+version = "2.5.0.dev"
 description = "This blueprint serves as a reference solution for a foundational Retrieval Augmented Generation (RAG) pipeline."
 readme = "README.md"
 license = "Apache-2.0"
@@ -23,8 +23,7 @@ dependencies = [
     "langchain>=1.2.7",
     "langchain-community>=0.4",
     "langchain-milvus>=0.3.0",
-    "langchain-nvidia-ai-endpoints>=1.0.3",
-    "minio>=7.2,<8.0",
+    "langchain-nvidia-ai-endpoints>=1.2.0",
     "pdfplumber>=0.11.9",
     "pydantic>=2.11,<3.0",
     "pymilvus[milvus_lite]>=2.6.7,<3.0",
@@ -37,6 +36,7 @@ dependencies = [
     "protobuf>=6.33.5",
     "lark>=1.2.2",
     "python-dateutil>=2.9.0.post0",
+    "oracledb>=3.4.2",
 ]
 
 [project.optional-dependencies]
@@ -58,8 +58,8 @@ rag = [
 ]
 ingest = [
     # nv-ingest dependencies (required for ingestion operations)
-    "nv-ingest-api==26.1.1",
-    "nv-ingest-client==26.1.1",
+    "nv-ingest-api==26.1.2",
+    "nv-ingest-client==26.1.2",
     "tritonclient==2.57.0",
     # Other ingest dependencies
     "langchain-openai>=0.2",
@@ -80,8 +80,8 @@ ingest = [
 ]
 all = [
     # nv-ingest dependencies (required for ingestion operations)
-    "nv-ingest-api==26.1.1",
-    "nv-ingest-client==26.1.1",
+    "nv-ingest-api==26.1.2",
+    "nv-ingest-client==26.1.2",
     "tritonclient==2.57.0",
     # RAG + Ingest dependencies
     "langchain-openai>=0.2",
@@ -100,10 +100,19 @@ all = [
     "pyarrow>=21.0,<22.0",
     # Elasticsearch support
     "langchain-elasticsearch>=0.3",
+    # Oracle 26ai support
+    "oracledb>=3.4.2",
 ]
 elasticsearch = [
     "langchain-elasticsearch>=0.3",
 ]
+oracle = [
+    "oracledb>=3.4.2",
+    "langchain-community>=0.4",
+]
+minio = [
+    "minio>=7.2,<8.0",
+]
 
 [tool.uv.sources]
 nvidia-rag = { workspace = true }
diff --git a/skill-source/.agents/skills/rag-blueprint/SKILL.md b/skill-source/.agents/skills/rag-blueprint/SKILL.md
new file mode 100644
index 000000000..8f48d2858
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/SKILL.md
@@ -0,0 +1,136 @@
+---
+name: rag-blueprint
+description: "NVIDIA RAG Blueprint — deploy, configure, troubleshoot, and manage. Handles any RAG action: deploy, install, start, enable, disable, toggle, change, configure, troubleshoot, debug, fix, shutdown, stop, or tear down any RAG feature or service (VLM, guardrails, query rewriting, models, search, ingestion, observability, summarization, and more)."
+argument-hint: deploy RAG | enable feature | disable feature | configure | troubleshoot | shutdown
+allowed-tools: Bash(echo *), Bash(nvidia-smi *), Bash(curl *), Bash(docker ps *), Bash(docker exec *), Bash(docker info *), Bash(docker --version *), Bash(docker compose version *), Bash(docker logs *), Bash(docker system *), Bash(kubectl get *), Bash(kubectl describe *), Bash(kubectl version *), Bash(kubectl logs *), Bash(helm version *), Bash(helm list *), Bash(git rev-parse *), Bash(git describe *), Bash(git status *), Bash(python3 --version *), Bash(pip3 show *), Bash(df *), Bash(du *), Bash(cat /proc/*), Bash(cat /etc/os-release *), Bash(ss *), Bash(netstat *), Bash(ls *), Bash(grep *), Bash(lsof *), Bash(ps aux *), Read, Grep, Glob
+license: Apache-2.0
+metadata:
+  author: nvidia-rag-team
+  version: "1.0"
+---
+
+# NVIDIA RAG Blueprint
+
+## Autonomy Principles
+
+- Auto-detect everything: GPU, VRAM, drivers, Docker, CUDA, disk, OS, ports, existing services, NGC key, repo state.
+- If it can be checked with a command, check it — don't ask the user.
+- Ask only when user action is required: providing an API key, confirming data deletion, or choosing between equally valid options.
+- Once analysis is done, route to the correct workflow and execute.
+
+## Intent Detection
+
+Determine what the user wants and route immediately:
+
+| User Intent | Action |
+|-------------|--------|
+| Deploy, install, set up, start RAG | Read and follow `references/deploy.md` |
+| Configure, enable, change, toggle a feature | Use the **Configure** section below |
+| Troubleshoot, debug, fix, error, unhealthy | Read and follow `references/troubleshoot.md` |
+| Stop, shutdown, tear down, clean up | Read and follow `references/shutdown.md` |
+
+If the intent is ambiguous, infer from context (e.g., "RAG isn't working" → troubleshoot; "get RAG running" → deploy). Only ask if genuinely unclear.
+
+---
+
+## Configure
+
+Requires a running RAG deployment. If services are not running, deploy first via `references/deploy.md`.
+
+Match the user's request to a reference file, then read and follow it:
+
+| Feature Keywords | Reference |
+|-----------------|-----------|
+| VLM, VLM embeddings, image captioning | `references/configure/vlm.md` |
+| NeMo Guardrails | `references/configure/guardrails.md` |
+| Query rewriting, decomposition, multi-turn | `references/configure/query-and-conversation.md` |
+| Ingestion (text-only, audio, Nemotron Parse, OCR, batch CLI, NV-Ingest, volume mount, performance) | `references/configure/ingestion.md` |
+| Search, retrieval, hybrid search, multi-collection, metadata, filters, reranker, topK, accuracy/performance | `references/configure/search-and-retrieval.md` |
+| LLM/embedding/ranking model changes, vector DB, Milvus/Elasticsearch auth, service keys, model profiles, ports/GPU | `references/configure/models-and-infrastructure.md` |
+| Reasoning, self-reflection, prompts, generation params (tokens, temperature, citations), per-request LLM params | `references/configure/reasoning-and-generation.md` |
+| Summarization | `references/configure/summarization.md` |
+| Observability (tracing, Zipkin, Grafana, Prometheus) | `references/configure/observability.md` |
+| Multimodal query (image + text) | `references/configure/multimodal-query.md` |
+| Data catalog (collection/document metadata) | `references/configure/data-catalog.md` |
+| User interface (UI settings) | `references/configure/user-interface.md` |
+| API reference (endpoints, schemas) | `references/configure/api-reference.md` |
+| Evaluation (RAGAS metrics) | `references/configure/evaluation.md` |
+| MCP server & client, agent toolkit | `references/configure/mcp.md` |
+| Migration (version upgrades) | `references/configure/migration.md` |
+| Notebooks (setup and catalog) | `references/configure/notebooks.md` |
+
+### Configure Flow
+
+1. Match the user's request to a reference file from the table above.
+
+2. Detect what's running:
+   ```bash
+   echo "=== NIM ===" && docker ps --format '{{.Names}}' 2>/dev/null | grep -iE '(nim-llm|nemotron-embedding|nemotron-ranking|nemoretriever-embedding|nemoretriever-ranking|nemo-vlm|nemotron-vlm)' || echo "NO_LOCAL_NIMS"; echo "=== RAG ===" && docker ps --format '{{.Names}}' 2>/dev/null | grep -iE '(rag-server|ingestor-server|milvus)' || echo "NO_DOCKER_RAG"; echo "=== K8S ===" && kubectl get pods -n rag 2>/dev/null | head -5 | grep . || echo "NO_K8S"; echo "=== LIBRARY ===" && ps aux 2>/dev/null | grep -E '(nvidia_rag|uvicorn.*rag)' | grep -v grep || echo "NO_LIBRARY"
+   ```
+
+3. Use this table to determine platform, deployment type, and where config lives:
+
+   | Local NIMs running? | RAG services running? | Deployment Type | Config Location |
+   |---------------------|-----------------------|-----------------|-----------------|
+   | Yes (Docker) | Any | Self-hosted | `deploy/compose/.env` |
+   | No | Yes (Docker) | NVIDIA-hosted | `deploy/compose/nvdev.env` |
+   | Yes (K8s pods) | Any | Self-hosted | `values.yaml` (NIM sections) |
+   | No | Yes (K8s pods) | NVIDIA-hosted | `values.yaml` (envVars) |
+   | — | Library processes | Library mode | `notebooks/config.yaml` |
+   | No | No | Not running | Deploy first via `references/deploy.md` |
+
+   Tell the user what you detected and ask to confirm. Example: "I see local NIM containers running (nim-llm-ms, nemotron-embedding-ms) — this is a self-hosted deployment. Config file is `deploy/compose/.env`. Correct?"
+
+4. Check current feature state before changing anything — read the config location from step 3, then cross-check the live service:
+   - Docker: `docker exec rag-server env 2>/dev/null | grep -E "<VAR_NAME>"`
+   - Helm: `kubectl get pod -n rag -l app=rag-server -o jsonpath='{.items[0].spec.containers[0].env}' 2>/dev/null`
+
+   If the config file and live service disagree, tell the user the service has stale config and will need a restart.
+
+5. If the feature needs extra GPUs, check availability against hardware restrictions (see below):
+   ```bash
+   nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv,noheader 2>/dev/null || echo "NO_GPU"
+   ```
+
+6. Read the reference file and apply changes:
+   - **Docker**: edit the env file (uncomment to enable, re-comment to disable — the env file is the source of truth). Then restart the affected service:
+     ```
+     source <env-file> && docker compose -f deploy/compose/<compose-file> up -d
+     ```
+     | Service | Compose File |
+     |---------|-------------|
+     | rag-server | `docker-compose-rag-server.yaml` |
+     | ingestor-server | `docker-compose-ingestor-server.yaml` |
+     | milvus, etcd, minio | `vectordb.yaml` |
+     | NIM containers (LLM, embedding, ranking, VLM, OCR) | `nims.yaml` |
+     | guardrails | `docker-compose-nemo-guardrails.yaml` |
+     | observability (Grafana, Prometheus, Zipkin) | `observability.yaml` |
+   - **Helm**: edit `values.yaml`, then upgrade: `helm upgrade rag <chart> -n rag -f values.yaml`
+   - **Library**: edit `notebooks/config.yaml`, then restart the Python process
+
+7. Verify:
+   - Docker: `docker ps --format "table {{.Names}}\t{{.Status}}" | head -20; curl -s 'http://localhost:8081/v1/health?check_dependencies=true' 2>/dev/null | head -1`
+   - Helm: `kubectl get pods -n rag; kubectl rollout status deployment/rag-server -n rag --timeout=120s`
+   - Library: `curl -s http://localhost:8081/v1/health 2>/dev/null | head -1`
+
+8. If restart fails, read `references/troubleshoot.md`. If multiple features requested, repeat from step 1 for each.
+
+### When User Says "Configure" Without Specifics
+
+Run steps 2–3 above, then read the identified config file to list what's currently enabled:
+```bash
+grep -E "^(export )?(ENABLE_|APP_)" <config-file> 2>/dev/null | sort
+```
+Summarize what's running and enabled, then ask which feature to change.
+
+---
+
+## Hardware Restrictions
+
+Read `docs/support-matrix.md` for current GPU requirements per deployment mode.
+Read `docs/service-port-gpu-reference.md` for port mappings and GPU assignments.
+
+| GPU | Feature Restrictions |
+|-----|---------------------|
+| B200 | No VLM, No Guardrails, No Nemotron Parse. May need multi-GPU LLM (`LLM_MS_GPU_ID`). |
+| RTX PRO 6000 | No Nemotron Parse. No Audio on Helm. |
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/api-reference.md b/skill-source/.agents/skills/rag-blueprint/references/configure/api-reference.md
new file mode 100644
index 000000000..056d814ba
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/api-reference.md
@@ -0,0 +1,29 @@
+# API Reference
+
+## When to Use
+- User needs to call RAG or Ingestor APIs directly
+- User asks about endpoints, request/response formats, or task status tracking
+
+## Process
+1. Read `docs/api-rag.md` for RAG server endpoints (port 8081)
+2. Read `docs/api-ingestor.md` for Ingestor server endpoints (port 8082)
+3. Consult OpenAPI schemas for exact request/response shapes
+
+## Agent-Specific Notes
+- RAG Server runs on port 8081: `/v1/generate`, `/v1/search`, `/v1/health`, `/v1/configuration`, `/v1/metrics`, `/v1/summary`
+- Ingestor Server runs on port 8082: `/v1/documents`, `/v1/collection`, `/v1/collections`, `/v1/status`
+- `POST /v1/documents` returns a `task_id` — poll `GET /v1/status?task_id=<id>` for progress
+- Task states: `PENDING` → `FINISHED` or `FAILED` (also `UNKNOWN` if not found)
+- NV-Ingest extraction states: `not_started` → `submitted` → `processing` → `completed` or `failed`
+- Max file size: 400 MB per document
+- Full health check: `GET /v1/health?check_dependencies=true`
+
+## Notebooks
+- `notebooks/ingestion_api_usage.ipynb` — ingestion API usage examples
+- `notebooks/retriever_api_usage.ipynb` — RAG retriever API: search and query examples
+
+## Source Documentation
+- `docs/api-rag.md` -- RAG server API details
+- `docs/api-ingestor.md` -- Ingestor server API details
+- `docs/api_reference/openapi_schema_rag_server.json` -- RAG server OpenAPI schema
+- `docs/api_reference/openapi_schema_ingestor_server.json` -- Ingestor server OpenAPI schema
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/data-catalog.md b/skill-source/.agents/skills/rag-blueprint/references/configure/data-catalog.md
new file mode 100644
index 000000000..7b2d6c106
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/data-catalog.md
@@ -0,0 +1,36 @@
+# Data Catalog
+
+## When to Use
+- User wants to manage collection or document metadata for governance
+- User asks about tagging, ownership, or lifecycle status of collections
+- User wants to list or update collection metadata
+
+## Restrictions
+- None — available automatically after deployment, no additional configuration needed
+- Works with both Milvus and Elasticsearch (full feature parity)
+
+## Process
+1. Read `docs/data-catalog.md` for full API reference, field definitions, and examples
+2. All endpoints are on the ingestor server (port `8082`)
+3. Use PATCH endpoints for updates (merge updates — only provided fields change)
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Add governance metadata | `docs/data-catalog.md` | POST `/v1/collection` with description, tags, owner |
+| Update lifecycle status | `docs/data-catalog.md` | PATCH with `status: "Archived"` |
+| Track content types | `docs/data-catalog.md` | Read auto-populated `has_tables`, `has_images` metrics |
+| Filter during retrieval | See custom metadata docs | Use `metadata_schema` + `filter_expr` (not data catalog) |
+
+## Agent-Specific Notes
+- Auto-populated metrics (`number_of_files`, `last_indexed`, `has_tables`, etc.) are system-set — not user-editable
+- `date_created` and `last_updated` timestamps are automatic
+- PATCH is a merge update — omitted fields keep current values
+- Different from custom metadata: catalog = governance/discovery, custom metadata = retrieval filtering
+
+## Notebooks
+- `notebooks/ingestion_api_usage.ipynb` — ingestion and collection management examples
+
+## Source Documentation
+- `docs/data-catalog.md` — full API reference, catalog fields, auto-populated metrics, Python client examples
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/evaluation.md b/skill-source/.agents/skills/rag-blueprint/references/configure/evaluation.md
new file mode 100644
index 000000000..90f9c6206
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/evaluation.md
@@ -0,0 +1,26 @@
+# Evaluation
+
+## When to Use
+- User wants to measure RAG pipeline quality
+- User asks about accuracy, relevancy, groundedness, or recall metrics
+
+## Process
+1. Read `docs/evaluate.md` for full evaluation methodology and setup
+2. Choose the appropriate notebook based on metrics needed
+3. Run evaluation against the deployed RAG pipeline
+
+## Agent-Specific Notes
+- Uses RAGAS framework for all metrics
+- Answer Accuracy, Context Relevancy, and Groundedness are covered in one notebook
+- Recall is measured separately at top-k cutoffs (1, 3, 5, 10)
+
+## Notebooks
+| Notebook | Metrics |
+|----------|---------|
+| `notebooks/evaluation_01_ragas.ipynb` | Answer Accuracy, Context Relevancy, Groundedness |
+| `notebooks/evaluation_02_recall.ipynb` | Recall at top-k cutoffs |
+
+## Source Documentation
+- `docs/evaluate.md` -- full evaluation guide and metric definitions
+- [RAGAS documentation](https://docs.ragas.io/en/stable/)
+- [NVIDIA RAGAS metrics](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/nvidia_metrics/)
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/guardrails.md b/skill-source/.agents/skills/rag-blueprint/references/configure/guardrails.md
new file mode 100644
index 000000000..309a18611
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/guardrails.md
@@ -0,0 +1,30 @@
+# NeMo Guardrails
+
+## When to Use
+- User wants content safety, topic control, or jailbreak prevention
+- User asks to enable/disable guardrails
+
+## Restrictions
+- Not available on B200 GPUs
+- Requires 2 extra GPUs with 48GB+ each (H100, A100 SXM 80GB, or RTX PRO 6000)
+- Not supported in library mode or Helm deployments
+- Jailbreak detection model not yet available out-of-the-box
+
+## Process
+
+1. Detect the deployment mode (guardrails are Docker-only — not supported on Helm or library mode). Edit the active env file for Docker
+2. Read `docs/nemo-guardrails.md` for full setup and configuration
+3. Choose deployment mode: self-hosted (local NIMs) or cloud-hosted (NVIDIA API)
+4. For self-hosted: assign GPU IDs — read `docs/service-port-gpu-reference.md` for default GPU assignments and adjust for your system
+5. Verify all three services healthy: `nemo-guardrails-microservice`, content-safety NIM, topic-control NIM
+6. Enable in UI: Settings > Output Preferences > Guardrails toggle
+
+## Agent-Specific Notes
+- Cloud mode (`nemoguard_cloud` config) skips local NIM containers — only the microservice is needed
+- Per-request toggle via `enable_guardrails` in `/generate` body requires server-level `ENABLE_GUARDRAILS=true` first
+- Override guardrails URL with `NEMO_GUARDRAILS_URL` if running on a different host
+- Content-safety and topic-control models are trained on single-turn data — multi-turn conversations may get inconsistent safety classifications
+- Current guardrails only produce simple refusal responses ("I'm sorry. I can't respond to that.")
+
+## Source Documentation
+- `docs/nemo-guardrails.md` -- full setup, configuration, and customization of guardrail rules
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/ingestion.md b/skill-source/.agents/skills/rag-blueprint/references/configure/ingestion.md
new file mode 100644
index 000000000..ec5e6251f
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/ingestion.md
@@ -0,0 +1,53 @@
+# Ingestion: Text-Only, Audio, Nemotron Parse, OCR & Batch
+
+## When to Use
+User wants to configure ingestion mode (text-only, audio, Nemotron Parse), switch OCR engines, save extraction results to disk, use standalone NV-Ingest, tune ingestion performance, or run batch ingestion.
+
+## Restrictions
+- Nemotron Parse: not available on B200 or RTX PRO 6000 GPUs (requires H100 or A100 SXM 80GB)
+- Audio on Helm: not supported on RTX PRO 6000
+- Nemotron Parse GPU conflict: read `docs/service-port-gpu-reference.md` for default GPU assignments. Nemotron Parse defaults to the same GPU as LLM — reassign on limited-GPU systems
+
+## Process
+
+1. Detect the deployment mode (Docker self-hosted / NVIDIA-hosted / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read the relevant source doc for detailed configuration
+3. Apply the required env vars to the active config, restart ingestor (and NIM services if enabling new profiles)
+4. Verify: upload a test document and check ingestion status
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Text-only ingestion | `docs/text_only_ingest.md` | Set extract vars to False, set `COMPONENTS_TO_READY_CHECK=""` |
+| Audio ingestion | `docs/audio_ingestion.md` | Start audio NIM (`--profile audio`), set `AUDIO_MS_GPU_ID` |
+| Nemotron Parse | `docs/nemotron-parse-extraction.md` | `APP_NVINGEST_PDFEXTRACTMETHOD=nemotron_parse`, start NIM |
+| OCR config/switch | `docs/nemoretriever-ocr.md` | Switch between NeMo Retriever OCR and Paddle OCR |
+| Save to disk | `docs/mount-ingestor-volume.md` | `APP_NVINGEST_SAVETODISK=True`, mount volume |
+| Standalone NV-Ingest | `docs/nv-ingest-standalone.md` | Direct Python client, no full ingestor server |
+| Batch ingestion | See `scripts/batch_ingestion.py` | `python scripts/batch_ingestion.py --folder ... --collection-name ...` |
+| Tune performance | `docs/accuracy_perf.md` | Adjust chunk size, overlap, batch settings |
+| Summarization at ingest | `references/configure/summarization.md` | `generate_summary: true` in upload payload |
+
+## Agent-Specific Notes
+
+- Text-only mode: set `COMPONENTS_TO_READY_CHECK=""` in the active env file so NV-Ingest does not wait for disabled extraction services. If the compose file hardcodes `COMPONENTS_TO_READY_CHECK=ALL`, update it to `${COMPONENTS_TO_READY_CHECK:-ALL}` so the env var takes effect
+- Use `--profile rag` with nims.yaml to skip OCR/detection NIMs in text-only mode
+- Audio formats supported: `.mp3`, `.wav`, `.mp4`, `.avi`, `.mov`, `.mkv`
+- Riva ASR requires ~8GB VRAM
+- NeMo Retriever OCR is 2x+ faster than Paddle OCR but needs 8GB vs 3GB VRAM
+- Batch CLI: `pip install -r scripts/requirements.txt` first; idempotent (skips already-ingested files)
+- MIG deployments: reduce batch sizes for large bulk ingestion jobs
+
+## Notebooks
+- `notebooks/ingestion_api_usage.ipynb` — Ingestor API: collections, uploads, document management
+
+## Source Documentation
+- `docs/text_only_ingest.md` — Text-only ingestion (skip OCR/detection)
+- `docs/audio_ingestion.md` — Audio/video ingestion via ASR
+- `docs/nemotron-parse-extraction.md` — Nemotron Parse PDF extraction
+- `docs/nemoretriever-ocr.md` — OCR configuration and switching
+- `docs/mount-ingestor-volume.md` — Volume mount for extraction results
+- `docs/nv-ingest-standalone.md` — Standalone NV-Ingest without ingestor server
+- `docs/accuracy_perf.md` — Ingestion tuning settings (chunk size, overlap, batch params)
+- `docs/service-port-gpu-reference.md` — OCR port mappings and GPU assignments
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/mcp.md b/skill-source/.agents/skills/rag-blueprint/references/configure/mcp.md
new file mode 100644
index 000000000..0fc9516e8
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/mcp.md
@@ -0,0 +1,26 @@
+# MCP Server & Client
+
+## When to Use
+- User wants to expose RAG APIs as MCP tools for agentic workflows
+- User asks about MCP transport modes, NeMo Agent Toolkit integration, or ReAct agents
+
+## Process
+1. Read `docs/mcp.md` for full MCP server/client setup and configuration
+2. Choose transport mode: `sse`, `streamable_http`, or `stdio`
+3. Run MCP server from `examples/nvidia_rag_mcp/mcp_server.py`
+4. For agentic RAG, see ReAct agent example in `examples/rag_react_agent/`
+
+## Agent-Specific Notes
+- MCP wraps both RAG tools (`generate`, `search`, `get_summary`) and Ingestor tools (`create_collection`, `upload_documents`, etc.) via FastMCP
+- `stdio` transport does not require a running server — client spawns it directly
+- ReAct agent requires: Python 3.11+, `NVIDIA_API_KEY`, and data already ingested into Milvus
+- Configure Milvus endpoint in `examples/rag_react_agent/src/rag_react_agent/configs/config.yml` or via `APP_VECTORSTORE_URL`
+
+## Notebooks
+| Notebook | Description |
+|----------|-------------|
+| `notebooks/mcp_server_usage.ipynb` | End-to-end MCP workflow: collection creation, upload, RAG queries |
+| `notebooks/nat_mcp_integration.ipynb` | NeMo Agent Toolkit integration with RAG MCP server |
+
+## Source Documentation
+- `docs/mcp.md` -- full MCP server/client documentation and transport configuration
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/migration.md b/skill-source/.agents/skills/rag-blueprint/references/configure/migration.md
new file mode 100644
index 000000000..c56e020ee
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/migration.md
@@ -0,0 +1,35 @@
+# Migration Guide
+
+## When to Use
+- User is upgrading between RAG Blueprint versions
+- User encounters breaking API changes or deprecated endpoints after an update
+
+## Process
+1. Read `docs/migration_guide.md` for full version-by-version migration details
+2. Identify the user's current and target versions
+3. Apply changes sequentially for each version gap
+
+## Agent-Specific Notes
+
+### v2.2.0 → v2.3.0
+- New `confidence_threshold` field in `/generate` and `/search` (0.0–1.0, default 0.0)
+- New `summary_options` parameter with `page_filter`, `shallow_summary`, `summarization_strategy`
+- `SUMMARY_LLM_MAX_CHUNK_LENGTH` and `SUMMARY_CHUNK_OVERLAP` changed from character-based to token-based — divide old values by ~4
+
+### v2.1.0 → v2.2.0
+- Added `generate_summary` to `/documents`, new `GET /summary` endpoint
+- `POST /collection` (singular) replaces `POST /collections` for single collection creation
+- `collection_names: List[str]` replaces `collection_name: str` in `/generate` and `/search`
+
+### v2.0.0 → v2.1.0
+- `POST /documents` gained `blocking: bool` (default `True`); use `false` + `GET /status` for async
+
+### v1.0.0 → v2.0.0 (Breaking)
+- Single server split into RAG Server (port 8081) and Ingestion Server (port 8082)
+- Collections must be explicitly created before uploading documents
+- Default changed from cloud-hosted to on-prem models
+
+## Source Documentation
+- `docs/migration_guide.md` — Full migration guide with examples and env var changes
+- `docs/release-notes.md` — Release notes and version history
+- `docs/query-to-answer-pipeline.md` — Query-to-answer pipeline architecture overview
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/models-and-infrastructure.md b/skill-source/.agents/skills/rag-blueprint/references/configure/models-and-infrastructure.md
new file mode 100644
index 000000000..68add08bc
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/models-and-infrastructure.md
@@ -0,0 +1,68 @@
+# Models, Vector DB & Service API Keys
+
+## When to Use
+User wants to change LLM, embedding, or ranking models; switch vector DB (Milvus/Elasticsearch); configure Milvus auth, GPU mode, or custom endpoints; set service-specific API keys; or build a custom VDB operator.
+
+## Process
+
+Detect the deployment mode before making changes. Docker: edit the active env file. Helm: edit `values.yaml` under `nimOperator` and `envVars` sections. Library: edit `notebooks/config.yaml`.
+
+### Change Models (LLM, Embedding, Ranking)
+1. Read `docs/change-model.md` for full model change instructions
+2. Read `docs/model-profiles.md` for NIM profile selection and GPU-specific profiles
+3. Key env vars: `APP_LLM_MODELNAME`, `APP_EMBEDDINGS_MODELNAME`, `APP_RANKING_MODELNAME`
+4. Embedding model change requires re-ingesting all documents — update `APP_EMBEDDINGS_DIMENSIONS` to match
+5. Restart affected services (RAG server + ingestor for embedding changes)
+6. Verify via health endpoint
+
+### Switch Vector DB (Milvus to Elasticsearch)
+1. Read `docs/change-vectordb.md` for full setup (Docker and Helm)
+2. Key env vars: `APP_VECTORSTORE_URL`, `APP_VECTORSTORE_NAME`
+3. Data is not migrated — re-ingest all documents after switching
+4. Elasticsearch requires port 9200; check for conflicts
+
+### Milvus Configuration
+1. Read `docs/milvus-configuration.md` for indexing, GPU, auth, and tuning
+2. Read `docs/milvus-schema.md` for collection schema requirements
+3. CPU mode: set `APP_VECTORSTORE_ENABLEGPUSEARCH=False`, `APP_VECTORSTORE_ENABLEGPUINDEX=False`, change Milvus image to non-GPU
+4. Auth: download milvus.yaml, enable `authorizationEnabled`, set password before first deployment
+
+### API Keys
+1. Read `docs/api-key.md` for NGC API key setup and per-service keys
+2. Fallback order: service-specific key > `NVIDIA_API_KEY` > `NGC_API_KEY`
+3. Per-service keys: `APP_LLM_APIKEY`, `APP_EMBEDDINGS_APIKEY`, `APP_RANKING_APIKEY`, `APP_VLM_APIKEY`, etc.
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Change LLM | `docs/change-model.md` | Set `APP_LLM_MODELNAME`, restart RAG server |
+| Change embedding | `docs/change-model.md` | Set `APP_EMBEDDINGS_MODELNAME` + `APP_EMBEDDINGS_DIMENSIONS`, re-ingest |
+| Change reranker | `docs/change-model.md` | Set `APP_RANKING_MODELNAME`, restart RAG server |
+| Switch to Elasticsearch | `docs/change-vectordb.md` | Create data dir, start ES profile, set env vars, re-ingest |
+| Milvus auth | `docs/milvus-configuration.md` | Download config, enable auth, mount volume |
+| Milvus CPU mode | `docs/milvus-configuration.md` | Change image, disable GPU env vars |
+| Custom VDB | `docs/change-vectordb.md` | Implement `VDBRag`, register in `__init__.py` |
+| NIM profiles | `docs/model-profiles.md` | List profiles, set `NIM_MODEL_PROFILE` |
+| Service API keys | `docs/api-key.md` | Set per-service `*_APIKEY` vars |
+| Collection schema | `docs/milvus-schema.md` | Required fields: pk, vector, text, source, content_metadata |
+
+## Agent-Specific Notes
+
+- Nemotron-3-Nano naming: `nvidia/nemotron-3-nano-30b-a3b` (NVIDIA-hosted) vs `nvidia/nemotron-3-nano` (self-hosted NIM) — same model, different names
+- Helm model changes go in `values.yaml` under `nimOperator` and `envVars` sections
+- Custom VDB operator requires implementing `VDBRag` base class — see `docs/change-vectordb.md` "Custom Vector Database Operator" section
+- VDB auth tokens can be passed per-request via `Authorization: Bearer <token>` header
+- Milvus password persists in etcd volume — to change after deployment, must delete volumes (destroys data)
+
+## Notebooks
+- `notebooks/building_rag_vdb_operator.ipynb` — Custom VDB operator implementation (OpenSearch example)
+
+## Source Documentation
+- `docs/change-model.md` — Model changes (LLM, embedding, ranking, NIM images)
+- `docs/change-vectordb.md` — Vector DB switching, Elasticsearch setup, custom VDB operator
+- `docs/milvus-configuration.md` — Milvus indexing, GPU config, auth, tuning
+- `docs/milvus-schema.md` — Collection schema fields and requirements
+- `docs/model-profiles.md` — NIM profile definitions and selection
+- `docs/api-key.md` — NGC API key setup, per-service keys, fallback order
+- `docs/service-port-gpu-reference.md` — Port mappings and GPU assignments for all services
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/multimodal-query.md b/skill-source/.agents/skills/rag-blueprint/references/configure/multimodal-query.md
new file mode 100644
index 000000000..783b6c209
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/multimodal-query.md
@@ -0,0 +1,35 @@
+# Multimodal Query (Image + Text)
+
+## When to Use
+- User wants to query knowledge base with images and text together
+- User asks about VLM (Vision Language Model) deployment for RAG
+- User wants image-based document understanding or visual Q&A
+
+## Restrictions
+- Not available with Elasticsearch — Milvus only
+- Reranker must be disabled (`ENABLE_RERANKER=false`)
+- Summarization not supported (VLM replaces LLM)
+- On-prem: requires NVIDIA H100 or A100 SXM 80GB GPU
+- Single-page retrieval only — image queries return content from one page per document
+
+## Process
+1. Detect the deployment mode (Docker / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read `docs/multimodal-query.md` for full env var configuration and commands
+3. Choose variant: self-hosted (Docker), NVIDIA-hosted (cloud), or Helm
+4. Deploy VLM + VLM Embedding NIMs per source doc instructions
+5. Set VLM env vars in the active config and switch embedding model to VLM embedding
+6. Restart ingestor + RAG server (Docker: add `--build` flag) and verify
+
+## Agent-Specific Notes
+- Must select a collection before querying — queries without a collection return no results
+- First VLM deployment: model downloads take 10–20 min (~10GB+)
+- `VLM_MS_GPU_ID` — read `docs/service-port-gpu-reference.md` for the default GPU assignment and override if needed
+- Cloud rate limits apply for ingestion of >10 files
+- For Helm with MIG: ensure dedicated MIG slice is assigned to VLM
+- Image extraction must be enabled: `APP_NVINGEST_EXTRACTIMAGES=True`, `APP_NVINGEST_IMAGE_ELEMENTS_MODALITY=image`
+
+## Notebooks
+- `notebooks/image_input.ipynb` — end-to-end multimodal query examples, image upload, VLM querying
+
+## Source Documentation
+- `docs/multimodal-query.md` — full Docker/cloud/Helm configuration, env vars, API usage, limitations
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/notebooks.md b/skill-source/.agents/skills/rag-blueprint/references/configure/notebooks.md
new file mode 100644
index 000000000..544de03bb
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/notebooks.md
@@ -0,0 +1,50 @@
+# Notebooks
+
+## When to Use
+- Hands-on examples of NVIDIA RAG Blueprint features are needed
+- There are questions about Jupyter notebooks, tutorials, or code samples
+
+## Process
+1. Read `docs/notebooks.md` for full notebook descriptions and prerequisites.
+2. Set up the environment: virtualenv, `jupyterlab`, and `git lfs pull` for test data.
+3. Open JupyterLab at `http://<server-ip>:8889`.
+
+## Agent-Specific Notes
+- Git LFS is required because several notebooks rely on large data files (`git lfs install && git lfs pull`).
+- In Docker mode, deploy NVIDIA RAG Blueprint first, then run notebooks against the running services.
+- In library mode, use `rag_library_usage.ipynb` (full) or `rag_library_lite_usage.ipynb` (containerless).
+- The custom VDB operator notebook requires Docker for OpenSearch services.
+
+## Notebook Catalog
+
+### Beginner
+| Notebook                    | Topic                               |
+|-----------------------------|-------------------------------------|
+| `ingestion_api_usage.ipynb` | Document ingestion through the API  |
+| `retriever_api_usage.ipynb` | Search and retrieval API            |
+| `image_input.ipynb`         | Image upload and multimodal queries |
+
+### Intermediate
+| Notebook                       | Topic                                  |
+|--------------------------------|----------------------------------------|
+| `summarization.ipynb`          | Document summarization strategies      |
+| `evaluation_01_ragas.ipynb`    | RAGAS accuracy, relevancy, groundedness|
+| `evaluation_02_recall.ipynb`   | Recall at top-k cutoffs                |
+| `nb_metadata.ipynb`            | Custom metadata and filtered retrieval |
+| `rag_library_usage.ipynb`      | Full library mode end-to-end           |
+| `rag_library_lite_usage.ipynb` | Lite, containerless library mode       |
+
+### Advanced
+| Notebook                          | Topic                               |
+|-----------------------------------|-------------------------------------|
+| `building_rag_vdb_operator.ipynb` | Custom OpenSearch VDB operator      |
+| `mcp_server_usage.ipynb`          | MCP server with transport modes     |
+| `nat_mcp_integration.ipynb`       | NeMo Agent Toolkit plus MCP         |
+
+### Deployment
+| Notebook           | Topic                 |
+|--------------------|-----------------------|
+| `launchable.ipynb` | Brev cloud deployment |
+
+## Source Documentation
+- `docs/notebooks.md` — full notebook descriptions, setup, and prerequisites.
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/observability.md b/skill-source/.agents/skills/rag-blueprint/references/configure/observability.md
new file mode 100644
index 000000000..5b291d339
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/observability.md
@@ -0,0 +1,29 @@
+# Observability
+
+## When to Use
+- User wants tracing, metrics, or monitoring for the RAG pipeline
+- User asks about latency debugging, Zipkin, Grafana, or Prometheus
+
+## Process
+1. Detect the deployment mode. Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read `docs/observability.md` for full setup (Docker and Helm)
+3. Set `OPENTELEMETRY_CONFIG_FILE` and `APP_TRACING_ENABLED=True` in the active config
+4. Start observability stack and restart RAG server
+5. Import Grafana dashboard from `deploy/config/rag-metrics-dashboard.json`
+
+## Agent-Specific Notes
+- Library mode: set `OPENTELEMETRY_CONFIG_FILE` in the environment for tracing; the Docker-based Prometheus/Grafana stack is independent
+- Helm: Prometheus Operator CRDs must be installed before deploying with observability enabled
+- Default Grafana credentials: `admin` / `admin`
+- Zipkin spans cover: `query-rewriter`, `retriever`, `context-reranker`, `llm-stream`
+- Span I/O visible via `traceloop.entity.input` / `traceloop.entity.output` fields
+
+### Quick Latency Triage
+| Symptom | Check |
+|---------|-------|
+| Slow first token | `rag_ttft_ms` — compare retriever and reranker spans |
+| Slow full response | `llm_generation_time_ms` / `llm-stream` span |
+| Retrieval heavy | Compare `retrieval_time_ms` vs `context_reranker_time_ms` |
+
+## Source Documentation
+- `docs/observability.md` -- full Docker/Helm setup, env vars, metrics reference, and dashboard import
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/query-and-conversation.md b/skill-source/.agents/skills/rag-blueprint/references/configure/query-and-conversation.md
new file mode 100644
index 000000000..2f092cd97
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/query-and-conversation.md
@@ -0,0 +1,82 @@
+
+# Query Rewriting, Query Decomposition, and Multi-Turn
+
+Use these features when you want the system to understand follow-up questions, rewrite queries for better retrieval, or break complex questions into smaller parts.
+
+## When to use
+
+Use these settings when:
+
+- You want to enable multi-turn conversations or support follow-up questions.
+- You want query rewriting to improve retrieval accuracy.
+- You need complex multi-hop query decomposition.
+- You are configuring or debugging conversation history behavior.
+
+## Restrictions
+
+- Query rewriting and multi-turn both require `CONVERSATION_HISTORY > 0`. If it is set to 0, query rewriting has no effect.
+- Query decomposition works only when `use_knowledge_base=true` and with a single collection.
+- On Helm, query rewriting is supported only with an on-prem LLM, not with cloud-hosted models.
+
+## Dependencies
+
+`CONVERSATION_HISTORY` is shared by query rewriting and multi-turn, so changing it affects both behaviors.
+
+| Setting                 | Depends on                 | Side effect when changed                                  |
+|-------------------------|----------------------------|-----------------------------------------------------------|
+| `ENABLE_QUERYREWRITER`  | `CONVERSATION_HISTORY > 0` | Enabling requires conversation history; disabling has no side effects |
+| `CONVERSATION_HISTORY`  | —                          | Setting to `0` also effectively disables query rewriting  |
+
+## Process
+
+First detect the deployment mode.  
+- Docker: edit the active environment file.  
+- Helm: edit `values.yaml`.  
+- Library: edit `notebooks/config.yaml`.
+
+### Query rewriting
+
+1. Review `docs/multiturn.md` for full configuration details.
+2. To enable, set `ENABLE_QUERYREWRITER=True`. If `CONVERSATION_HISTORY` is `0`, set it to `5` or another positive value.
+3. To disable, unset or comment out `ENABLE_QUERYREWRITER`.
+4. Restart the RAG server.
+
+### Multi-turn
+
+1. Review `docs/multiturn.md` for configuration, retrieval strategies, and API usage.
+2. To enable, set `CONVERSATION_HISTORY > 0` and choose the retrieval strategy you want to use.
+3. To disable, set `CONVERSATION_HISTORY=0`.
+4. Restart the RAG server.
+
+### Query decomposition
+
+1. Review `docs/query_decomposition.md` for the decomposition algorithm, limitations, and examples.
+2. Set `ENABLE_QUERY_DECOMPOSITION=true` and `MAX_RECURSION_DEPTH=3` (or a different depth that fits your use case).
+3. Restart the RAG server.
+
+## Decision table
+
+| Goal                          | Source doc                 | Key settings                                              |
+|-------------------------------|----------------------------|-----------------------------------------------------------|
+| Multi-turn with best accuracy | `docs/multiturn.md`        | `CONVERSATION_HISTORY=5`, `ENABLE_QUERYREWRITER=True`    |
+| Multi-turn with low latency   | `docs/multiturn.md`        | `CONVERSATION_HISTORY=5`, `MULTITURN_RETRIEVER_SIMPLE=True` |
+| Complex multi-hop queries     | `docs/query_decomposition.md` | `ENABLE_QUERY_DECOMPOSITION=true`, `MAX_RECURSION_DEPTH=3` |
+| Disable multi-turn (default)  | —                          | `CONVERSATION_HISTORY=0`                                 |
+
+## Agent-specific notes
+
+- `MULTITURN_RETRIEVER_SIMPLE` only applies when query rewriting is disabled. If both are configured, query rewriting takes precedence.
+- You can toggle query rewriting per request by setting `enable_query_rewriting: true` in `POST /generate`, but `CONVERSATION_HISTORY` must still be greater than 0.
+- By default, multi-turn is disabled with `CONVERSATION_HISTORY=0`.
+- Query decomposition adds latency and is most useful for multi-hop queries that involve multiple entities or steps.
+- In library mode, configure these settings in `notebooks/config.yaml` instead of using environment variables.
+
+## Notebooks
+
+- `notebooks/retriever_api_usage.ipynb`: RAG retriever API usage with search and end-to-end query examples.
+
+## Source documentation
+
+- `docs/query_decomposition.md`: Decomposition algorithm details, when to use it, and recursion depth guidance.
+- `docs/multiturn.md`: Conversation history behavior, retrieval strategies, API usage, and Helm configuration.
+
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/reasoning-and-generation.md b/skill-source/.agents/skills/rag-blueprint/references/configure/reasoning-and-generation.md
new file mode 100644
index 000000000..d5bb24114
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/reasoning-and-generation.md
@@ -0,0 +1,57 @@
+# Reasoning, Self-Reflection & Prompt Customization
+
+## When to Use
+User wants to enable reasoning/thinking mode, configure self-reflection, customize prompts, adjust generation parameters (max tokens, temperature, citations), or understand thinking budget options.
+
+## Process
+1. Detect the deployment mode (Docker / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read the relevant source doc for the specific feature
+3. Apply env vars to the active config or edit prompt files, restart RAG server
+4. Prompt changes require `--build` flag (Docker); env var changes only need restart
+5. Verify: test with a query and check for reasoning output or changed behavior
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Enable reasoning (Nemotron 1.5) | `docs/enable-nemotron-thinking.md` | Edit `prompt.yaml`: `/no_think` → `/think`, set temperature |
+| Enable reasoning (Nano 30B) | `docs/enable-nemotron-thinking.md` | `ENABLE_NEMOTRON_3_NANO_THINKING=true` |
+| Self-reflection | `docs/self-reflection.md` | `ENABLE_REFLECTION=true`, set thresholds |
+| Prompt customization | `docs/prompt-customization.md` | `PROMPT_CONFIG_FILE=/path/to/custom.yaml` or edit prompt.yaml |
+| Generation parameters | `docs/llm-params.md` | `LLM_MAX_TOKENS`, `LLM_TEMPERATURE`, `ENABLE_CITATIONS` |
+| Per-request overrides | `docs/llm-params.md` | `temperature`, `top_p`, `max_tokens`, `stop` in API payload |
+
+## Agent-Specific Notes
+
+- Prompt changes need `--build` flag on restart; env var changes do not
+- Self-reflection: streaming not supported during groundedness checks
+- Self-reflection uses same LLM by default; override with `REFLECTION_LLM`, `REFLECTION_LLM_SERVERURL`, `REFLECTION_LLM_APIKEY`
+- Helm: only on-premises reflection is supported
+- GPU requirements for reflection: see `docs/self-reflection.md` for optimal GPU configurations
+- Debug reflection: set `LOGLEVEL=INFO` to observe iteration counts
+- `FILTER_THINK_TOKENS=false` to see full reasoning output (filtered by default)
+- 18 prompt templates available in `prompt.yaml` — custom file only overrides specified keys
+
+### Reasoning Model Comparison
+
+| Model | Control | Thinking Budget | Output Format |
+|-------|---------|-----------------|---------------|
+| Nemotron 1.5 | System prompt (`/think`) | None | `<think>` tags (filtered by default) |
+| Nemotron-3-Nano 9B | System prompt (`/think`) | `min_thinking_tokens` + `max_thinking_tokens` | `reasoning_content` field |
+| Nemotron-3-Nano 30B | `ENABLE_NEMOTRON_3_NANO_THINKING` env var | `max_thinking_tokens` only | `reasoning_content` field |
+
+### Thinking Budget Recommendations
+
+| Range | Use Case |
+|-------|----------|
+| 1024–4096 | Faster responses for simpler questions |
+| 8192–16384 | More thorough reasoning for complex queries |
+
+## Notebooks
+- `notebooks/retriever_api_usage.ipynb` — end-to-end query examples showing generation behavior
+
+## Source Documentation
+- `docs/enable-nemotron-thinking.md` — Reasoning mode for all Nemotron models
+- `docs/self-reflection.md` — Self-reflection configuration and thresholds
+- `docs/prompt-customization.md` — Prompt template catalog and customization
+- `docs/llm-params.md` — Generation parameters (temperature, max tokens, etc.)
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/search-and-retrieval.md b/skill-source/.agents/skills/rag-blueprint/references/configure/search-and-retrieval.md
new file mode 100644
index 000000000..b311c958d
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/search-and-retrieval.md
@@ -0,0 +1,67 @@
+# Search & Retrieval: Hybrid Search, Multi-Collection, Metadata & Profiles
+
+## When to Use
+User wants to enable hybrid search, query multiple collections, add custom metadata/filters, tune retrieval performance, configure reranker, enable natural language filter generation, or switch accuracy/performance profiles.
+
+## Process
+
+1. Detect the deployment mode (Docker / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read the relevant source doc for detailed configuration
+3. Apply the required env vars to the active config and restart affected services
+4. Verify via search/generate API call
+
+## Decision Table
+
+| Goal | Source Doc | Key Env Vars |
+|------|-----------|-------------|
+| Hybrid search | `docs/hybrid_search.md` | `APP_VECTORSTORE_SEARCHTYPE=hybrid` |
+| Multi-collection | `docs/multi-collection-retrieval.md` | `enable_reranker: True` in API payload |
+| Custom metadata | `docs/custom-metadata.md` | Metadata in upload payload, `vdb_filter_expression` in query |
+| Accuracy profile | `docs/accuracy_perf.md` | Copy values from `deploy/compose/accuracy_profile.env` into the active env file |
+| Performance profile | `docs/accuracy_perf.md` | Copy values from `deploy/compose/perf_profile.env` into the active env file |
+| Filter generation | `docs/custom-metadata.md` | `ENABLE_FILTER_GENERATOR=True` |
+
+## Agent-Specific Notes
+
+- Hybrid search requires re-ingesting — existing collections created with `dense` must be re-created
+- Multi-collection: limited to 5 collections per query; reranker is mandatory
+- Multi-collection not supported when `ENABLE_QUERY_DECOMPOSITION=true`
+- Elasticsearch RRF not supported in open-source version — must use `weighted` ranker
+- Ingestor must be restarted alongside RAG server when enabling hybrid search
+- `RERANKER_CONFIDENCE_THRESHOLD` is a legacy alias for `RERANKER_SCORE_THRESHOLD`
+- Recommended `RERANKER_SCORE_THRESHOLD` range: 0.3–0.5 (too high filters out too many chunks)
+
+### Advanced Tuning (not fully documented elsewhere)
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `APP_VECTORSTORE_INDEXTYPE` | `GPU_CAGRA` | Vector index type |
+| `APP_VECTORSTORE_EF` | `100` | Search accuracy/speed trade-off (must be >= `VECTOR_DB_TOPK`) |
+| `VECTOR_DB_TOPK` | `100` | Candidates from vector DB (input to reranker) |
+| `APP_RETRIEVER_TOPK` | `10` | Chunks sent to LLM prompt (after reranking) |
+| `ENABLE_RERANKER` | `True` | Toggle reranking model |
+| `RERANKER_SCORE_THRESHOLD` | `0.0` | Minimum reranker score (0.0–1.0) |
+| `COLLECTION_NAME` | `multimodal_data` | Default collection name |
+
+### Partial Filtering
+- Strict (default): fails if any collection doesn't support the filter
+- Flexible (`allow_partial_filtering: true` in config.yaml): succeeds if at least one collection supports it
+
+### VDB Filter Support
+
+| Feature | Milvus | Elasticsearch |
+|---------|--------|---------------|
+| NL filter generation | LLM-powered | Not supported (manual DSL) |
+| Filter syntax | String expressions | List of dicts (ES Query DSL) |
+| UI support | Full filtering interface | API only |
+
+## Notebooks
+- `notebooks/retriever_api_usage.ipynb` — RAG retriever API: search and end-to-end queries
+- `notebooks/nb_metadata.ipynb` — Metadata ingestion, filtering, and extraction from queries
+
+## Source Documentation
+- `docs/hybrid_search.md` — Hybrid dense + sparse search configuration
+- `docs/multi-collection-retrieval.md` — Multi-collection querying
+- `docs/custom-metadata.md` — Custom metadata schema, filtering expressions, filter generation
+- `docs/accuracy_perf.md` — Best practices for tuning ingestion/retrieval/generation settings
+- `docs/python-client.md` — Python library API for search and filtering
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/summarization.md b/skill-source/.agents/skills/rag-blueprint/references/configure/summarization.md
new file mode 100644
index 000000000..299c41e89
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/summarization.md
@@ -0,0 +1,40 @@
+# Document Summarization
+
+## When to Use
+- User wants to generate summaries during document ingestion
+- User asks about summarization strategies or options
+- User wants to check summary status or progress
+
+## Restrictions
+- Not supported in lite mode (containerless/library-only deployment)
+- Requires Redis for status tracking and rate limiting
+- Collection must exist before uploading with `generate_summary: true`
+
+## Process
+1. Detect the deployment mode. Docker: edit the active env file. Helm: configure under `ingestor-server.envVars` in `values.yaml`. Library: use the upload API parameters directly (no env vars needed)
+2. Read `docs/summarization.md` for full configuration, env vars, and prompt customization
+3. Set `generate_summary: true` in the upload payload (per-request, no global toggle)
+4. Optionally configure `summary_options`: strategy, shallow mode, page filter
+5. Retrieve summary via `GET /v1/summary?collection_name=...&file_name=...`
+
+## Decision Table
+
+| Goal | Strategy | Notes |
+|------|----------|-------|
+| Fastest overview | `"single"` + `shallow_summary=true` + `page_filter` | Quick text-only extraction |
+| Best quality | `null` (iterative, default) + `shallow_summary=false` | Sequential refinement |
+| Balanced | `"hierarchical"` + `shallow_summary=true` | Parallel tree-based |
+
+## Agent-Specific Notes
+- `CONVERSATION_HISTORY` prerequisite does not apply — that's for query rewriting only
+- `SUMMARY_LLM_SERVERURL=""` (empty) routes to NVIDIA cloud; `"nim-llm:8000"` for self-hosted
+- `SUMMARY_LLM_MAX_CHUNK_LENGTH` should be below the model's context window to leave room for prompt + output
+- Redis semaphore auto-resets on ingestor startup (prevents stale values from crashes)
+- If Redis is unavailable, summaries are still generated, but real-time status tracking is not available
+- Status entries have 24-hour TTL in Redis
+
+## Notebooks
+- `notebooks/summarization.ipynb` — complete examples for all strategies, status polling, library mode usage
+
+## Source Documentation
+- `docs/summarization.md` — env var reference, prompt customization, rate limiting, chunking details
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/user-interface.md b/skill-source/.agents/skills/rag-blueprint/references/configure/user-interface.md
new file mode 100644
index 000000000..7fbad8165
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/user-interface.md
@@ -0,0 +1,27 @@
+# User Interface
+
+## When to Use
+- User asks about the RAG UI, uploading documents, settings, or metadata filtering
+- User wants to configure features via the web interface
+
+## Restrictions
+- Sample/experimentation UI — not intended for production
+- 100-file limit per upload batch; use multiple batches or API for bulk uploads
+- 10 MB max per image attachment
+
+## Process
+1. Read `docs/user-interface.md` for full UI documentation
+2. Access at `http://localhost:8090` (or `http://<workstation-ip>:8090` for remote)
+3. Configure RAG settings and feature toggles via Settings panel
+4. Use Filter Bar above chat input for metadata-filtered queries
+
+## Agent-Specific Notes
+- VLM Inference must be enabled in Settings > Feature Toggles before image attachments work
+- ECONNRESET errors on multi-file uploads — recommend API for bulk operations
+- Document summaries generate asynchronously; UI shows "Generating summary..." until complete
+- Document count in UI may lag slightly after ingestion
+- Metadata filtering supports AND/OR logic between filters (toggle via logic button)
+- Custom metadata schema is set during collection creation via the Metadata Schema Editor
+
+## Source Documentation
+- `docs/user-interface.md` -- full UI documentation including settings, file types, metadata, and health monitoring
diff --git a/skill-source/.agents/skills/rag-blueprint/references/configure/vlm.md b/skill-source/.agents/skills/rag-blueprint/references/configure/vlm.md
new file mode 100644
index 000000000..d573cd0c2
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/configure/vlm.md
@@ -0,0 +1,56 @@
+# VLM, VLM Embeddings & Image Captioning
+
+## When to Use
+User wants image understanding, visual content analysis, VLM inference, multimodal embeddings, or image captioning during ingestion.
+
+## Restrictions
+- Not available on B200 GPUs — use H100, A100 SXM 80GB, or RTX PRO 6000
+- Requires extra GPU (GPU 1+ for 2-GPU systems, GPU 2+ for 3+ GPUs with fallback)
+- VLM embeddings: experimental, PDF-only, no summarization, no citations with page-as-image
+- Image captioning on Helm: on-prem only (modify `values.yaml` to enable)
+
+## Process
+1. Detect the deployment mode (Docker / Helm / Library). Docker: edit the active env file. Helm: edit `values.yaml`. Library: edit `notebooks/config.yaml`
+2. Read the relevant source doc for detailed steps:
+   - VLM generation: `docs/vlm.md`
+   - VLM embeddings: `docs/vlm-embed.md`
+   - Image captioning: `docs/image_captioning.md`
+3. Start VLM NIM (self-hosted) or configure cloud endpoint (NVIDIA-hosted)
+4. Set the required variables in the active config:
+   - Enabling: `ENABLE_VLM_INFERENCE=true` and `APP_NVINGEST_EXTRACTIMAGES=True`
+   - Disabling: re-comment those variables in the env file
+5. Restart affected services and verify with a health check + image-containing document query
+
+## Decision Table
+
+| Goal | Source Doc | Docker Profile | Notes |
+|------|-----------|---------------|-------|
+| VLM replaces LLM | `docs/vlm.md` | `--profile vlm-generation` | LLM not started; set `VLM_TO_LLM_FALLBACK=false` |
+| VLM + LLM fallback | `docs/vlm.md` | `--profile vlm-only` | Needs 3+ GPUs; both VLM and LLM running |
+| VLM embeddings | `docs/vlm-embed.md` | `--profile vlm-embed` | Experimental; requires re-ingestion |
+| Image captioning | `docs/image_captioning.md` | `--profile vlm-only` | Requires VLM NIM; Helm: on-prem only |
+| Multimodal query | `docs/multimodal-query.md` | (depends on VLM mode) | Image + text querying |
+
+## Agent-Specific Notes
+
+- `--profile vlm-generation` skips the LLM entirely — use `--profile vlm-only` for fallback mode
+- `VLM_TO_LLM_FALLBACK` defaults to `true`, but the `vlm-generation` profile does not start the LLM — set `VLM_TO_LLM_FALLBACK=false` when using that profile
+- Helm VLM: disable `nim-llm` and enable `nim-vlm` (VLM uses LLM's GPU allocation)
+- Helm fallback: keep both `nim-vlm` and `nim-llm` enabled, set `VLM_TO_LLM_FALLBACK: "true"`
+- VLM context window is limited — keep queries self-contained
+- Image captioning known issue: files without graphs/charts/tables/plots fail to ingest when captioning is enabled
+
+### Key Env Vars (always needed)
+- `ENABLE_VLM_INFERENCE=true` — master toggle
+- `APP_NVINGEST_EXTRACTIMAGES=True` — extract images during ingestion
+- `VLM_MS_GPU_ID=<gpu-id>` — self-hosted GPU assignment
+
+## Notebooks
+- `notebooks/image_input.ipynb` — Multimodal queries with VLM (text + image)
+
+## Source Documentation
+- `docs/vlm.md` — VLM generation (self-hosted, NVIDIA-hosted, Helm, Library)
+- `docs/vlm-embed.md` — VLM embeddings (experimental)
+- `docs/image_captioning.md` — Image captioning during ingestion
+- `docs/multimodal-query.md` — Image + text querying
+- `docs/service-port-gpu-reference.md` — default GPU assignments for VLM and other NIMs
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy.md b/skill-source/.agents/skills/rag-blueprint/references/deploy.md
new file mode 100644
index 000000000..96d712892
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy.md
@@ -0,0 +1,119 @@
+# RAG Blueprint Deployment
+
+## Phase 1: Environment Analysis
+
+Run this single command to collect all environment information at once:
+
+```bash
+echo "=== GPU ===" && nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader 2>/dev/null || echo "NO_GPU"; echo "=== VRAM ===" && nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END {print s "MB total"}' || echo "0MB total"; echo "=== DRIVER ===" && cat /proc/driver/nvidia/version 2>/dev/null | head -1 || echo "NO_DRIVER"; echo "=== CUDA ===" && nvcc --version 2>/dev/null | grep "release" || echo "NO_CUDA_TOOLKIT"; echo "=== DOCKER ===" && docker --version 2>/dev/null || echo "NO_DOCKER"; echo "=== COMPOSE ===" && docker compose version 2>/dev/null || echo "NO_COMPOSE"; echo "=== NVIDIA_TOOLKIT ===" && docker info 2>/dev/null | grep -i "runtimes.*nvidia" || echo "NO_NVIDIA_TOOLKIT"; echo "=== PYTHON ===" && python3 --version 2>/dev/null || echo "NO_PYTHON"; echo "=== DISK ===" && df -h --output=avail / | tail -1; echo "=== OS ===" && cat /etc/os-release 2>/dev/null | grep -E "^(NAME|VERSION)="; echo "=== NGC_KEY ===" && if [ -n "$NGC_API_KEY" ]; then echo "NGC_KEY_SET"; elif [ -n "$NVIDIA_API_KEY" ]; then echo "NVIDIA_KEY_SET"; elif grep -h "NGC_API_KEY=" deploy/compose/.env deploy/compose/nvdev.env 2>/dev/null | grep -qv "nvapi-your-key"; then echo "DOTENV_SET"; else echo "NOT_SET"; fi; echo "=== RUNNING ===" && docker ps --format "{{.Names}}" 2>/dev/null | grep -E "(rag-server|ingestor-server|nim-llm|milvus)" | head -10 || echo "NO_RUNNING_SERVICES"; echo "=== PORTS ===" && (ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null) | grep -E ":(8081|8082|8090|19530) " || echo "PORTS_FREE"; echo "=== REPO ===" && git rev-parse --show-toplevel 2>/dev/null && git describe --tags 2>/dev/null || echo "NO_GIT_REPO"; echo "=== CACHE ===" && du -sh ~/.cache/model-cache/ 2>/dev/null || echo "NO_CACHE"
+```
+
+Present a summary table:
+
+| Check | Result |
+|-------|--------|
+| GPU(s) | (list with VRAM, or NO_GPU) |
+| Total VRAM | (sum in MB/GB) |
+| NVIDIA Driver | (version or NO_DRIVER) |
+| CUDA Toolkit | (version or NO_CUDA_TOOLKIT) |
+| Docker | (version or NO_DOCKER) |
+| Docker Compose | (version or NO_COMPOSE) |
+| NVIDIA Container Toolkit | (detected or NO_NVIDIA_TOOLKIT) |
+| Python | (version or NO_PYTHON) |
+| Free disk | (value) |
+| OS | (name + version) |
+| NGC_API_KEY | NGC_KEY_SET / NVIDIA_KEY_SET / DOTENV_SET / NOT_SET |
+| Existing services | (list or none) |
+| Port availability | (free or list conflicts) |
+| Repo | (tag/branch or NO_GIT_REPO) |
+| Model cache | (size or NO_CACHE) |
+
+### Existing Services Warning
+
+If RAG services are already running, tell the user briefly: "Existing RAG services detected (list). Proceeding will restart them." Continue unless the user objects.
+
+If the user wants to **switch deployment modes** (e.g., NVIDIA-hosted → self-hosted, or Docker → library), shut down the existing deployment first via `references/shutdown.md`, then proceed with the new mode.
+
+If ports are occupied by non-RAG processes, tell the user which ports conflict and suggest stopping the conflicting process. This is a blocker.
+
+## Phase 2: NGC_API_KEY Handling
+
+Check in this order:
+
+1. If `NGC_API_KEY` is set in the shell environment → proceed.
+2. If `NVIDIA_API_KEY` is set (common in library mode) → proceed silently.
+3. If `NGC_API_KEY` is in `deploy/compose/.env` or `deploy/compose/nvdev.env` (and not the placeholder `nvapi-your-key`) → load it and proceed.
+4. If none found → tell the user: "NGC_API_KEY is required. Get one from https://org.ngc.nvidia.com/setup/api-keys and run: `export NGC_API_KEY=\"nvapi-...\"` — then tell me when done."
+5. After user confirms → re-check silently. If still not set, write placeholder to `.env` and tell the user to edit it.
+
+## Phase 3: Blocker Checks
+
+Automatically check and report all blockers at once (don't stop at the first one):
+
+Read `docs/support-matrix.md` for current minimum versions and disk requirements, then check:
+
+- **Docker Compose below minimum**: "Upgrade Docker Compose. See https://docs.docker.com/compose/install/linux/"
+- **NVIDIA Driver below minimum** (if self-hosted): "Upgrade NVIDIA driver. See `docs/support-matrix.md` for required version."
+- **NVIDIA Container Toolkit missing** (and self-hosted needed): "Install NVIDIA Container Toolkit. See https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html"
+- **Insufficient disk**: "Check `docs/support-matrix.md` for disk requirements per deployment mode."
+- **No Docker and no Python 3.11+**: "Install Docker or Python 3.11+ to proceed."
+
+List all blockers together so the user can fix them in one pass — don't make them fix one, re-run, fix another.
+
+## Phase 4: Route to Deployment Mode
+
+### User explicitly requests a mode
+- "library mode" / "lite mode" / "no docker" / "python mode" → read and follow `deploy/library.md`
+- "docker" / "self-hosted" / "local" → read and follow `deploy/docker.md` with mode **self-hosted**
+- "cloud" / "nvidia-hosted" / "hosted" → read and follow `deploy/docker.md` with mode **nvidia-hosted**
+- "retrieval only" / "search only" / "no LLM" → read and follow `deploy/docker.md` with mode **retrieval-only**
+- "kubernetes" / "k8s" / "helm" → read and follow `deploy/helm.md`
+- "workbench" / "ai workbench" → tell user to follow `deploy/workbench/README.md` (AI Workbench uses its own UI-driven workflow)
+
+### Docker is available (Docker + Compose detected)
+
+**Self-hosted eligible** — read `docs/support-matrix.md` ("Hardware Requirements (Docker)" section) for current GPU requirements. All of the following must also be true:
+- GPU count and type matches the Docker self-hosted requirements from the support matrix
+- ≥200 GB free disk (per `docs/support-matrix.md` "Disk Space Requirements")
+- NVIDIA Container Toolkit detected
+- NVIDIA driver meets minimum version from `docs/support-matrix.md` ("Driver Versions")
+
+If self-hosted eligible → read and follow `deploy/docker.md` with mode **self-hosted**
+
+**Otherwise with Docker** → read and follow `deploy/docker.md` with mode **nvidia-hosted**
+
+Tell the user WHY if they have some GPU but not enough:
+- "You have [X GPU] with [Y GB] VRAM. Self-hosted requires [requirements from docs/support-matrix.md]. Deploying with NVIDIA-hosted cloud NIMs instead — faster startup, no model download."
+
+### Docker is available but Compose is not
+
+Tell the user: "Docker is installed but Docker Compose is missing or below the minimum version (see `docs/support-matrix.md`). Install or upgrade it: https://docs.docker.com/compose/install/linux/ — or use library mode instead."
+
+If user chooses library mode → read and follow `deploy/library.md`
+
+### Docker is not available
+
+- Python 3.11+ available → read and follow `deploy/library.md` with mode **lite**
+- No Python → tell user to install Python 3.11+ or Docker
+
+## After Deployment
+
+Once deployment completes, verify health:
+
+```bash
+echo "=== RAG Server ===" && curl -s http://localhost:8081/v1/health?check_dependencies=true 2>/dev/null || echo "RAG_SERVER_NOT_READY"; echo "=== Ingestor ===" && curl -s http://localhost:8082/v1/health?check_dependencies=true 2>/dev/null || echo "INGESTOR_NOT_READY"
+```
+
+If healthy, tell the user:
+- "RAG Blueprint is running and healthy."
+- "Ask me to configure features like VLM, query rewriting, guardrails, etc."
+- "Ask me to shutdown when you're done."
+
+If unhealthy, read `references/troubleshoot.md` and diagnose. Match error output against known issues, fix, and retry. Escalate to the user only if the fix requires their action (API key, data deletion).
+
+## Notebooks
+- `notebooks/launchable.ipynb` — Cloud deployment via Brev (alternative to local deployment)
+
+## Source Documentation
+- `docs/support-matrix.md` — GPU requirements, driver versions, disk space, supported platforms
+- `docs/service-port-gpu-reference.md` — port mappings and GPU assignments for all services
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-nvidia-hosted.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-nvidia-hosted.md
new file mode 100644
index 000000000..f4c6ede07
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-nvidia-hosted.md
@@ -0,0 +1,38 @@
+# Docker Deployment (NVIDIA-Hosted NIMs)
+
+## When to Use
+- User wants fast deployment without local model downloads
+- User has no GPU or limited GPU
+- User asks about cloud-hosted or NVIDIA API deployment
+- User wants to avoid 15–30 min NIM startup time
+
+## Restrictions
+- Requires internet access (calls NVIDIA cloud APIs)
+- NVIDIA-hosted endpoints have rate limits — large ingestions (>10 files) may hit 429 errors
+- NGC_API_KEY required for cloud API access
+- Docker and Compose minimum versions per `docs/support-matrix.md`
+
+## Process
+1. Read `docs/deploy-docker-nvidia-hosted.md` for full commands and env configuration
+2. Use `deploy/compose/nvdev.env` — pre-configured for cloud endpoints. Source it before compose commands: `source deploy/compose/nvdev.env`
+3. Start vector DB → ingestor → RAG server + frontend (no NIM startup needed)
+4. Verify: `docker ps` shows containers; UI at `http://localhost:8090`
+
+## Decision Table
+
+| Goal | Key Action |
+|------|------------|
+| Standard cloud deployment | Use `nvdev.env` (pre-configured for cloud) |
+| Zero-GPU (no Milvus GPU) | Also switch Milvus image to CPU-only |
+| Large file ingestion | Reduce batch/concurrency settings to avoid 429s |
+| Maximum throughput | Use self-hosted deployment instead |
+
+## Agent-Specific Notes
+- First run: 5–10 min (image pulls only); subsequent: 1–2 min
+- No `nims.yaml` startup — all model inference is cloud-hosted
+- All subsequent configure/restart operations should source the same env file used for the initial deploy (`deploy/compose/nvdev.env`)
+- For zero-GPU: switch Milvus to CPU-only by changing the GPU image tag to the equivalent non-GPU tag and setting `APP_VECTORSTORE_ENABLEGPUSEARCH=False`. See `docs/deploy-docker-nvidia-hosted.md` for the current image tags
+- Rate limit mitigation for large ingestions: reduce `NV_INGEST_FILES_PER_BATCH`, `NV_INGEST_CONCURRENT_BATCHES`, `MAX_INGEST_PROCESS_WORKERS`, `NV_INGEST_MAX_UTIL` to minimum values
+
+## Source Documentation
+- `docs/deploy-docker-nvidia-hosted.md` — full step-by-step commands, env var blocks, CPU Milvus setup
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-retrieval-only.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-retrieval-only.md
new file mode 100644
index 000000000..fb4935cf8
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-retrieval-only.md
@@ -0,0 +1,37 @@
+# Retrieval-Only Deployment
+
+## When to Use
+- User wants search/retrieval without LLM generation
+- User asks to deploy only embedding + reranking services
+- User wants `/search` endpoint with an external LLM
+- User wants a lightweight, low-GPU deployment
+
+## Restrictions
+- `/generate` endpoint returns an error — no LLM is deployed
+- Self-hosted: 1 GPU, ~24 GB memory
+- NVIDIA-hosted: 0 GPUs (cloud embedding + reranking)
+
+## Process
+1. Read `docs/retrieval-only-deployment.md` for full commands, env vars, and API examples
+2. Choose variant: self-hosted (local NIMs), NVIDIA-hosted (cloud), or Helm
+3. For self-hosted: start only embedding + ranking NIMs, skip LLM
+4. For NVIDIA-hosted: set embedding/ranking server URLs to empty, skip NIM startup entirely
+5. For Helm: set `nimOperator.nim-llm.enabled=false`
+6. Start vector DB → ingestor → RAG server
+7. Verify health: `GET http://localhost:8081/v1/health?check_dependencies=true`
+
+## Decision Table
+
+| Goal | Variant | Key Difference |
+|------|---------|----------------|
+| Minimal GPU usage with local models | Self-hosted | 1 GPU, ~24 GB |
+| Zero GPU, cloud APIs | NVIDIA-hosted | Set server URLs to empty, skip NIM startup |
+| Kubernetes | Helm | Disable `nim-llm` in values.yaml |
+
+## Agent-Specific Notes
+- Permission errors on model cache → try `USERID=0` or `chmod -R 755 ~/.cache/model-cache`
+- Empty search results → verify documents ingested: `GET http://localhost:8082/v1/documents?collection_name=<name>`
+- Users can send `/search` results to their own external LLM for generation
+
+## Source Documentation
+- `docs/retrieval-only-deployment.md` — full deployment commands, API examples, search payload options
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-self-hosted.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-self-hosted.md
new file mode 100644
index 000000000..dc18ef649
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker-self-hosted.md
@@ -0,0 +1,49 @@
+# Docker Deployment (Self-Hosted NIMs)
+
+## When to Use
+- User wants full on-premises deployment with local NIM containers
+- User has supported GPUs and wants models running locally
+- User asks to deploy RAG Blueprint with Docker
+
+## Restrictions
+
+Read `docs/support-matrix.md` for current GPU requirements. Feature restrictions per GPU type:
+
+| GPU | Cannot Use |
+|-----|------------|
+| B200 | VLM, Guardrails, Nemotron Parse |
+| RTX PRO 6000 | Nemotron Parse |
+
+- Read `docs/support-matrix.md` for current minimum NVIDIA Driver, CUDA, Docker, and Compose versions
+- NVIDIA Container Toolkit required (`docker info` shows nvidia runtime)
+- Disk space per `docs/support-matrix.md` ("Disk Space Requirements")
+- If any prerequisite is missing, tell the user what to install before proceeding
+
+## Process
+1. Read `docs/deploy-docker-self-hosted.md` for full commands and env configuration
+2. Read `docs/support-matrix.md` for GPU compatibility and supported model combinations
+3. Verify container toolkit, prepare model cache directory, source `.env`
+4. Apply GPU-specific config per source docs
+5. Start NIMs → wait for healthy → start remaining services
+6. Verify: `docker ps` shows all containers healthy; UI at `http://localhost:8090`
+
+## Decision Table
+
+| Goal | Profile Flag | Notes |
+|------|-------------|-------|
+| Full deployment (default) | (none) | LLM + embedding + ranking + OCR + detection |
+| Text-only RAG (lighter) | `--profile rag` | Skip OCR/detection NIMs |
+| Ingestion workload only | `--profile ingest` | Embedding + OCR + detection |
+| VLM replaces LLM | `--profile vlm-generation` | Not on B200 |
+| Advanced PDF extraction | `--profile nemotron-parse` | Not on B200 or RTX PRO 6000 |
+
+## Agent-Specific Notes
+- First run: 15–30 min (model downloads ~100–150 GB, no progress bar); subsequent: 2–5 min
+- Monitor download progress: `du -sh ~/.cache/model-cache/`
+- Permission error on model cache → try `USERID=0` instead of `USERID=$(id -u)`
+- Cloud NIM section in `deploy/compose/.env` must be commented out for self-hosted
+- Rebuild after code changes: add `--build` flag to compose up commands
+
+## Source Documentation
+- `docs/deploy-docker-self-hosted.md` — full step-by-step commands, env vars, GPU assignments
+- `docs/support-matrix.md` — GPU compatibility, supported models, hardware requirements
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/docker.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker.md
new file mode 100644
index 000000000..eb31fb1a6
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/docker.md
@@ -0,0 +1,88 @@
+# RAG Docker Deployment
+
+## Determine Mode
+
+If routed here from the deploy workflow, the mode (self-hosted, nvidia-hosted, or retrieval-only) was already decided. Use it.
+
+If invoked directly without a mode, auto-detect:
+
+```bash
+echo "=== COMPOSE ===" && docker compose version 2>/dev/null || echo "NO_COMPOSE"; echo "=== GPU ===" && nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo "NO_GPU"; echo "=== DISK ===" && df -h --output=avail / | tail -1; echo "=== RUNNING ===" && docker ps --format "{{.Names}}" 2>/dev/null | grep -E "(rag-server|ingestor-server|nim-llm|milvus)" | head -10 || echo "NONE_RUNNING"
+```
+
+If NO_COMPOSE: stop and tell the user to install Docker Compose (see `docs/support-matrix.md` for minimum version).
+
+Read `docs/support-matrix.md` ("Hardware Requirements (Docker)" section) for current GPU requirements, then:
+- GPU count/type meets self-hosted requirements from the support matrix, and 200+ GB free disk → **self-hosted**
+- Otherwise (any GPU or no GPU), with ≥50 GB free disk → **nvidia-hosted**
+- User explicitly says "retrieval only" / "no LLM" / "search only" → **retrieval-only**
+
+Auto-route based on hardware. Only ask if two modes are equally valid and the user's intent is ambiguous.
+
+## Verify NGC_API_KEY
+
+Auto-check all possible locations before asking:
+
+```bash
+[ -n "$NGC_API_KEY" ] && echo "ENV_SET" || (grep -h "NGC_API_KEY=" deploy/compose/.env deploy/compose/nvdev.env 2>/dev/null | grep -qv "nvapi-your-key" && echo "DOTENV_SET" || echo "NOT_SET")
+```
+
+- **ENV_SET**: proceed silently.
+- **DOTENV_SET**: load the env file that contains the key and proceed.
+- **NOT_SET**: ask the user to provide it. This is the only thing to ask for.
+
+## Docker Login
+
+Auto-check if already logged in:
+
+```bash
+grep -q "nvcr.io" ~/.docker/config.json 2>/dev/null && echo "ALREADY_LOGGED_IN" || echo "NOT_LOGGED_IN"
+```
+
+If already logged in → proceed silently.
+
+If not logged in → tell the user to run this themselves (if the agent ran it, the expanded key would appear in agent logs):
+
+> Please run in your terminal: `echo "${NGC_API_KEY}" | docker login nvcr.io -u '$oauthtoken' --password-stdin`
+
+Wait for confirmation only if login was needed.
+
+## Deploy
+
+Based on the mode, read and follow the appropriate reference:
+
+- **Self-hosted**: read and follow `docker-self-hosted.md`
+- **NVIDIA-hosted**: read and follow `docker-nvidia-hosted.md`
+- **Retrieval-only**: read and follow `docker-retrieval-only.md`
+
+## Post-Deploy Verification
+
+Run health checks:
+
+```bash
+sleep 5; echo "=== RAG ===" && curl -s http://localhost:8081/v1/health?check_dependencies=true 2>/dev/null || echo "RAG_NOT_READY"; echo "=== INGESTOR ===" && curl -s http://localhost:8082/v1/health?check_dependencies=true 2>/dev/null || echo "INGESTOR_NOT_READY"; echo "=== CONTAINERS ===" && docker ps --format "table {{.Names}}\t{{.Status}}" 2>/dev/null | grep -E "(rag|milvus|nim|ingest)" | head -15
+```
+
+If services are still initializing, automatically poll every 30 seconds:
+- **NVIDIA-hosted**: poll until healthy or 5 minutes elapsed (no model downloads needed).
+- **Self-hosted**: poll until healthy or 30 minutes elapsed (first-run model downloads can take 15–30 min).
+- **Retrieval-only**: poll until healthy or 5 minutes elapsed.
+
+Show progress to the user during polling.
+
+## On Success
+
+Tell the user:
+- "RAG Blueprint is running and healthy. Open http://localhost:8090 to use the UI." (skip for retrieval-only)
+- "Ask me to configure features (VLM, query rewriting, guardrails, etc.)"
+- "Ask me to shutdown when you're done."
+
+## On Error
+
+1. Read the error output from the failed command.
+2. Read `references/troubleshoot.md` to match against common issues (port conflict, disk full, NGC auth, GPU OOM).
+3. Apply the fix and retry.
+4. If still failing, report the specific error to the user with the fix that was attempted.
+
+## Source Documentation
+- `docs/support-matrix.md` — GPU requirements, hardware compatibility, disk space
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-mig.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-mig.md
new file mode 100644
index 000000000..dd3dd8ce3
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-mig.md
@@ -0,0 +1,38 @@
+# MIG GPU Deployment
+
+## When to Use
+- User wants fine-grained GPU allocation on Kubernetes using MIG slices
+- User has H100 GPUs and wants to share them across RAG services
+- User asks about Multi-Instance GPU deployment
+
+## Restrictions
+- Requires H100 80GB HBM3 GPUs (MIG-compatible)
+- MIG profiles in this guide are specific to H100 80GB — other GPUs need different profiles
+- Requires cloned repository (MIG config files in `deploy/helm/`)
+- All standard Helm prerequisites apply (GPU Operator, NIM Operator, StorageClass)
+- Ingestion profile is scaled down with MIG — large bulk ingestion jobs may fail
+
+## Process
+1. Read `docs/mig-deployment.md` for full configuration, commands, and MIG slice definitions
+2. Enable MIG with mixed strategy on ClusterPolicy
+3. Apply MIG ConfigMap and label the node
+4. Verify node labels show `mig.config.state: "success"` before proceeding
+5. Install Helm chart with `-f mig-slicing/values-mig.yaml`
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Standard MIG on H100 | `docs/mig-deployment.md` | Apply MIG config, label node, install chart |
+| RTX PRO 6000 with MIG | `docs/mig-deployment.md` | Also uncomment model section in values.yaml |
+| Custom MIG profiles | NVIDIA MIG User Guide | Modify `mig-config.yaml` for different GPU types |
+
+## Agent-Specific Notes
+- Must wait for `mig.config.state: "success"` on the node before Helm install — if not present, wait and re-check
+- Default H100 MIG layout (see `docs/mig-deployment.md` for current GPU count and slice definitions): GPU 0 → small slices, GPU 1 → mixed slices, GPU 2 → full-GPU slice
+- LLM gets the largest slice (`7g.80gb`); embedding/Milvus/ingest share small slices
+- RTX PRO 6000 variant: uncomment model section in values.yaml, then use both `-f values.yaml -f mig-slicing/values-mig.yaml`
+- Uninstall follows standard Helm procedure (see Helm deployment docs)
+
+## Source Documentation
+- `docs/mig-deployment.md` — full MIG config, ClusterPolicy patches, node labeling, verification, Helm install commands
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-standard.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-standard.md
new file mode 100644
index 000000000..df2630660
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm-standard.md
@@ -0,0 +1,51 @@
+# Helm Deployment
+
+## When to Use
+- User wants to deploy RAG Blueprint on Kubernetes
+- User asks about Helm chart installation (from NGC or local repo)
+- User mentions Kubernetes, k8s, or Helm in deployment context
+
+## Restrictions
+
+Read `docs/support-matrix.md` for current Kubernetes, Helm, and OS version requirements.
+
+- Requires GPU Operator + NIM Operator pre-installed
+- Default StorageClass must be configured for PVC provisioning
+- Disk space per `docs/support-matrix.md`
+- NeMo Guardrails not available in Helm deployment
+- Image captioning: on-prem only (requires `values.yaml` changes; see `docs/image_captioning.md`)
+
+## Process
+
+### Option A: Deploy from NGC (Remote Chart)
+1. Read `docs/deploy-helm.md` for full commands and values
+2. Ensure prerequisites: GPU Operator, NIM Operator, StorageClass, NGC_API_KEY
+3. Install chart, monitor pods, port-forward frontend
+
+### Option B: Deploy from Repository (Local Chart)
+1. Read `docs/deploy-helm-from-repo.md` for full commands and repo setup
+2. Add required Helm repos, run `helm dependency update`, install from local path
+
+### RTX PRO 6000 Variant
+1. Uncomment model section under `nimOperator.nim-llm.model` in `values.yaml`
+2. See source docs for engine/precision/GPU settings
+
+## Decision Table
+
+| Goal | Option | Key Action |
+|------|--------|------------|
+| Quick deploy from published chart | NGC (Option A) | `helm upgrade --install` with NGC URL |
+| Customized chart | Local repo (Option B) | Clone, modify values, `helm dependency update` |
+| RTX PRO 6000 GPUs | Either option | Uncomment model section in values.yaml |
+| Retrieval-only (no LLM) | Either option | `--set nimOperator.nim-llm.enabled=false` |
+
+## Agent-Specific Notes
+- First deployment: 60–70 min (model cache download); subsequent: 10–15 min
+- Pods staying in `ContainerCreating`/`Init` for an extended time is normal during cache download
+- PVCs are not removed by `helm uninstall` — delete manually: `kubectl delete nimcache --all -n rag && kubectl delete pvc --all -n rag`
+- Port-forwarding may time out for large file ingestion — not suitable for bulk uploads
+- All configurable endpoints documented in `deploy/helm/nvidia-blueprint-rag/endpoints.md`
+
+## Source Documentation
+- `docs/deploy-helm.md` — NGC remote chart deployment, prerequisites, monitoring
+- `docs/deploy-helm-from-repo.md` — local chart deployment, repo setup, dependency management
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/helm.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm.md
new file mode 100644
index 000000000..b381f4dcc
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/helm.md
@@ -0,0 +1,103 @@
+# RAG Helm Deployment
+
+If routed here from the deploy workflow, proceed directly to Phase 1.
+
+## Phase 1: Prerequisites Check
+
+Run all checks at once:
+
+```bash
+echo "=== KUBECTL ===" && kubectl version --client 2>/dev/null || echo "NO_KUBECTL"; echo "=== HELM ===" && helm version --short 2>/dev/null || echo "NO_HELM"; echo "=== STORAGECLASS ===" && kubectl get storageclass 2>/dev/null || echo "NO_STORAGECLASS"; echo "=== NODES ===" && kubectl get nodes -o wide 2>/dev/null || echo "NO_CLUSTER_ACCESS"; echo "=== GPU_OPERATOR ===" && kubectl get pods -n gpu-operator 2>/dev/null | grep -i running || echo "NO_GPU_OPERATOR"; echo "=== NIM_OPERATOR ===" && kubectl get pods -n nim-operator 2>/dev/null | grep -i running || echo "NO_NIM_OPERATOR"; echo "=== NAMESPACE ===" && kubectl get namespace rag 2>/dev/null && echo "NAMESPACE_EXISTS" || echo "NO_NAMESPACE"; echo "=== HELM_RELEASE ===" && helm list -n rag 2>/dev/null | grep rag || echo "NO_EXISTING_RELEASE"; echo "=== PODS ===" && kubectl get pods -n rag 2>/dev/null | head -10 || echo "NO_PODS"; echo "=== NGC_KEY ===" && [ -n "$NGC_API_KEY" ] && echo "NGC_API_KEY SET" || echo "NGC_API_KEY NOT_SET"; echo "=== GPU_RESOURCES ===" && kubectl get nodes -o json 2>/dev/null | grep -o '"nvidia.com/gpu": "[0-9]*"' || echo "NO_GPU_RESOURCES"
+```
+
+Read `docs/support-matrix.md` for current Kubernetes, Helm, and OS version requirements.
+
+| Requirement | Check |
+|-------------|-------|
+| Kubernetes | Per `docs/support-matrix.md` |
+| Helm | Per `docs/support-matrix.md` |
+| NVIDIA GPU Operator | Installed and running |
+| NVIDIA NIM Operator | Installed and running |
+| Default StorageClass | Configured (e.g. local-path-provisioner) |
+| Disk space | ≥200 GB per node |
+| NGC_API_KEY | Set in environment |
+
+Report all missing prerequisites together so the user can fix everything in one pass.
+
+If NGC_API_KEY is NOT_SET: this is the one thing we must ask the user for.
+
+If an existing Helm release is detected: warn "Existing RAG Helm release found. Proceeding will upgrade it." Continue unless user objects.
+
+## Phase 2: Route to Reference
+
+Auto-detect the GPU variant from cluster nodes (not the local machine):
+
+```bash
+echo "=== GPU_LABELS ===" && kubectl get nodes -o json 2>/dev/null | grep -oE '"nvidia.com/gpu.product":\s*"[^"]*"' | sort -u || echo "NO_GPU_LABELS"; echo "=== MIG ===" && kubectl get nodes -o json 2>/dev/null | grep -oE '"nvidia.com/mig.strategy":\s*"[^"]*"' || echo "NO_MIG"
+```
+
+Determine the variant from the node GPU labels (`nvidia.com/gpu.product`, `nvidia.com/mig.strategy`) returned by the command above.
+
+Route based on detection:
+
+- **MIG enabled** → read and follow `helm-mig.md`
+- **RTX PRO 6000** → read and follow `helm-standard.md` (use the RTX values.yaml variant described there)
+- **Standard (everything else)** → read and follow `helm-standard.md`
+
+Ask the user only if the variant is genuinely ambiguous. Default to standard deployment.
+
+## Phase 3: Expected Timelines
+
+Set expectations with the user:
+
+| Scenario | Duration |
+|----------|----------|
+| First deployment | 60–70 min (NIM cache download ~40–50 min, NIMService init ~10–15 min, pod startup ~5–10 min) |
+| Subsequent deployments | 10–15 min (model caches already populated) |
+
+Pods remaining in `ContainerCreating` or `Init` state for extended periods is normal — models download in the background without progress indicators.
+
+## Phase 4: Verification
+
+After deployment completes, verify:
+
+```bash
+echo "=== PODS ===" && kubectl get pods -n rag; echo "=== NIMCACHE ===" && kubectl get nimcache -n rag; echo "=== NIMSERVICE ===" && kubectl get nimservice -n rag
+```
+
+Wait for all pods to reach `Running` status. Poll every 60 seconds for up to 70 minutes (first deployment involves model downloads). Show progress.
+
+Once pods are running, port-forward and verify health:
+
+```bash
+kubectl port-forward -n rag service/rag-server 8081:8081 --address 0.0.0.0 & kubectl port-forward -n rag service/rag-frontend 3000:3000 --address 0.0.0.0 & sleep 3 && curl -s http://localhost:8081/v1/health?check_dependencies=true 2>/dev/null || echo "RAG_NOT_READY"
+```
+
+## Phase 5: Uninstall
+
+If the user wants to tear down:
+
+```bash
+helm uninstall rag -n rag
+kubectl delete nimcache --all -n rag
+kubectl delete pvc --all -n rag
+```
+
+## On Success
+
+Tell the user:
+- "RAG Blueprint is running on Kubernetes. Access the UI at http://localhost:3000 (via port-forward)."
+- "Ask me to configure features (VLM, query rewriting, guardrails, etc.)"
+- "Ask me to shutdown when you're done."
+
+## On Error
+
+1. Check pod status and events: `kubectl describe pod <failing-pod> -n rag` and `kubectl get events -n rag --sort-by='.lastTimestamp' | tail -20`.
+2. Read pod logs: `kubectl logs <failing-pod> -n rag --tail 50`.
+3. Read `references/troubleshoot.md` to match against common issues (PVC pending, OOM, image pull failure, port conflict).
+4. Apply the fix and retry. If the fix requires data deletion (PVCs, namespace), confirm with user first.
+
+## Source Documentation
+- `docs/support-matrix.md` — Kubernetes/Helm version requirements, GPU compatibility
+- `docs/deploy-helm.md` — standard Helm deployment from NGC
+- `docs/deploy-helm-from-repo.md` — Helm deployment from local repo
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/library-full.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/library-full.md
new file mode 100644
index 000000000..1a386cb19
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/library-full.md
@@ -0,0 +1,43 @@
+# Library Mode (Full)
+
+## When to Use
+- User wants programmatic Python access to RAG via `nvidia_rag` package
+- User prefers code-level configuration over Docker-based servers
+- User asks about library mode, Python client, or `NvidiaRAG`/`NvidiaRAGIngestor`
+
+## Restrictions
+- Python 3.11+ (< 3.14)
+- Docker still required for backend services (Milvus, NV-Ingest, Redis, optionally NIMs)
+- Self-hosted NIMs require supported GPUs (see `docs/support-matrix.md`)
+
+## Process
+1. Read `docs/python-client.md` for full API reference, configuration, and backend setup
+2. Create virtual environment and install `nvidia-rag[all]`
+3. Start backend services via Docker (Milvus, NV-Ingest + Redis, optionally NIMs)
+4. Load config from `notebooks/config.yaml` using `NvidiaRAGConfig.from_yaml()`
+5. Create `NvidiaRAGIngestor` and `NvidiaRAG` instances
+6. Use `ingestor.create_collection()`, `ingestor.upload_documents()`, `rag.generate()`, `rag.search()`
+
+## Decision Table
+
+| Goal | Source Doc | Key Action |
+|------|-----------|------------|
+| Self-hosted (local GPUs) | `docs/python-client.md` | Start nims.yaml + set on-prem config |
+| Cloud (NVIDIA-hosted) | `docs/python-client.md` | Skip nims.yaml, override server URLs in config |
+| Custom prompts | `docs/python-client.md` | Pass `prompts=` to NvidiaRAG constructor |
+| Summarization | `docs/python-client.md` | `generate_summary=True` in upload_documents |
+
+## Agent-Specific Notes
+- Config file: `notebooks/config.yaml`; env file: `notebooks/.env_library`
+- Docker login is interactive — tell user to run `docker login nvcr.io` themselves
+- For cloud deployment: override `config.embeddings.server_url`, `config.llm.server_url`, etc. in code
+- Config changes take effect immediately (no container restart needed, unlike Docker mode)
+- Prompt customization via constructor: `NvidiaRAG(config=config, prompts="custom_prompts.yaml")`
+- `upload_documents()` is async — returns `task_id` for status polling
+- NV-Ingest cloud endpoints must be exported before starting NV-Ingest container
+
+## Notebooks
+- `notebooks/rag_library_usage.ipynb` — complete walkthrough: setup, ingestion, querying, search, summaries
+
+## Source Documentation
+- `docs/python-client.md` — full API reference, backend setup, configuration, cloud/self-hosted options
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/library-lite.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/library-lite.md
new file mode 100644
index 000000000..bf7da9041
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/library-lite.md
@@ -0,0 +1,37 @@
+# Library Mode (Lite / Containerless)
+
+## When to Use
+- Quick prototyping with zero infrastructure (no Docker, no GPU)
+- User wants the fastest path to try RAG
+- CI/CD pipelines needing lightweight RAG testing
+
+## Restrictions
+- No image/table/chart citations
+- No document summarization
+- Subject to NVIDIA API rate limits (cloud-hosted inference)
+- Requires Python 3.11+ (< 3.14), internet access, and `NGC_API_KEY`
+
+## Process
+1. Read `docs/python-client.md` for full library mode documentation
+2. Create virtualenv and install: `pip install "nvidia-rag[all]"`
+3. Ensure `NGC_API_KEY` is exported — maps to `NVIDIA_API_KEY` internally
+4. Run the lite notebook: `jupyter lab notebooks/rag_library_lite_usage.ipynb`
+
+## Agent-Specific Notes
+- `NVIDIA_API_KEY` (used by `nvidia_rag` package) must be set from `NGC_API_KEY`: `os.environ["NVIDIA_API_KEY"] = os.environ.get("NGC_API_KEY", "")`
+- Lite config lives in `notebooks/config.yaml`; override `server_url` for embeddings to the NVIDIA API Catalog endpoint (see `docs/python-client.md` for current URL), and set LLM/ranking URLs to empty string for cloud defaults
+- Milvus Lite runs embedded (no container), NV-Ingest runs as subprocess (no container)
+- Also install `python-dotenv jupyterlab` for notebook support
+
+## When Not to Use
+- Production workloads — use Docker or Kubernetes
+- Large-scale ingestion — rate limits apply
+- Need citations from images/tables/charts or document summarization
+
+## Notebooks
+| Notebook | Description |
+|----------|-------------|
+| `notebooks/rag_library_lite_usage.ipynb` | End-to-end lite mode: collection creation, ingestion, querying, search |
+
+## Source Documentation
+- `docs/python-client.md` — full library mode documentation (lite and full)
diff --git a/skill-source/.agents/skills/rag-blueprint/references/deploy/library.md b/skill-source/.agents/skills/rag-blueprint/references/deploy/library.md
new file mode 100644
index 000000000..a5162be7a
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/deploy/library.md
@@ -0,0 +1,54 @@
+# RAG Library Mode Setup
+
+## Determine Mode
+
+If routed here from the deploy workflow, the mode (full or lite) may already be decided. Use it.
+
+If invoked directly, auto-detect:
+
+```bash
+echo "=== DOCKER ===" && docker --version 2>/dev/null || echo "NO_DOCKER"; echo "=== GPU ===" && nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo "NO_GPU"; echo "=== PYTHON ===" && python3 --version 2>/dev/null || echo "NO_PYTHON"; echo "=== PKG_MANAGER ===" && which uv 2>/dev/null && echo "UV_AVAILABLE" || (which pip3 2>/dev/null && echo "PIP_AVAILABLE" || echo "NO_PKG_MANAGER"); echo "=== VENV ===" && ls -d .venv/ venv/ nvidia-rag-env/ 2>/dev/null | grep . || echo "NO_EXISTING_VENV"; echo "=== INSTALLED ===" && (pip3 show nvidia_rag 2>/dev/null || echo "NOT_INSTALLED") | head -3
+```
+
+- Docker available → **full** (Python API + Docker backend services)
+- No Docker or user explicitly says "lite" / "no docker" / "containerless" → **lite**
+
+Auto-route based on Docker availability. Only ask if both modes are equally valid.
+
+## Verify NGC_API_KEY
+
+Auto-check all locations:
+
+```bash
+if [ -n "$NGC_API_KEY" ]; then echo "NGC_KEY_SET"; elif [ -n "$NVIDIA_API_KEY" ]; then echo "NVIDIA_KEY_SET"; else echo "NOT_SET"; fi
+```
+
+If NOT_SET: ask the user. Otherwise proceed silently.
+
+## Deploy
+
+Based on the mode:
+
+- **Full**: read and follow `library-full.md`
+- **Lite**: read and follow `library-lite.md`
+
+## On Success
+
+Tell the user:
+- Which mode was set up and how to start using it (notebook or Python script)
+- "Ask me to configure features, change models, etc."
+- "Ask me to shut down backend services when done (if full mode)."
+
+## On Error
+
+1. Read the error output (pip install failure, import error, service connection error).
+2. Read `references/troubleshoot.md` to match against common issues.
+3. Common fixes to try:
+   - `pip install` failure → try `uv pip install` or check Python version ≥3.11.
+   - Import error → check if virtual environment is activated.
+   - Connection error to backend services → check Docker containers are running.
+4. Retry the failed step after fixing.
+5. If still failing, report the specific error to the user.
+
+## Source Documentation
+- `docs/python-client.md` — Python library API, installation, full and lite mode setup
diff --git a/skill-source/.agents/skills/rag-blueprint/references/shutdown.md b/skill-source/.agents/skills/rag-blueprint/references/shutdown.md
new file mode 100644
index 000000000..7407b63d8
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/shutdown.md
@@ -0,0 +1,128 @@
+# RAG Shutdown
+
+Stopping containers and processes does not require confirmation. Deleting data (volumes, cache, images) does.
+
+## Step 1: Detect What Is Running
+
+Detect all deployment modes — Docker, K8s, and library:
+
+```bash
+echo "=== DOCKER ===" && docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" 2>/dev/null || echo "NO_DOCKER"; echo "=== LIBRARY ===" && ps aux | grep -E "(nvidia_rag|uvicorn|jupyter)" | grep -v grep || echo "NO_LIBRARY_PROCESSES"; echo "=== K8S ===" && (kubectl get pods -n rag 2>/dev/null || echo "NO_K8S") | head -10; echo "=== HELM ===" && helm list -n rag 2>/dev/null | grep rag || echo "NO_HELM_RELEASE"
+```
+
+Based on what's detected, execute the appropriate shutdown path below. If multiple modes are active (e.g., Docker + library), stop all of them.
+
+## Step 2: Stop Services (Reverse Startup Order)
+
+Stop in this order — reverse of deployment. Only stop what is actually running (detected in Step 1).
+
+### 2a: Optional Services
+
+Stop these first if they are running:
+
+```bash
+docker compose -f deploy/compose/docker-compose-nemo-guardrails.yaml down 2>/dev/null; docker compose -f deploy/compose/observability.yaml down 2>/dev/null
+```
+
+### 2b: Application Services
+
+```bash
+docker compose -f deploy/compose/docker-compose-rag-server.yaml down; docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down
+```
+
+### 2c: Vector DB
+
+```bash
+docker compose -f deploy/compose/vectordb.yaml down
+```
+
+If using Elasticsearch instead of Milvus:
+```bash
+docker compose -f deploy/compose/vectordb.yaml --profile elasticsearch down
+```
+
+### 2d: NIMs (Self-Hosted Only)
+
+Only present if self-hosted deployment was used:
+
+```bash
+docker compose -f deploy/compose/nims.yaml down
+```
+
+This stops ALL NIM containers (LLM, embedding, ranking, OCR, detection, and any profile-specific NIMs like VLM, audio, nemotron-parse).
+
+### 2e: Library Mode Processes
+
+If library mode is active (detected Python processes):
+
+```bash
+pkill -f "nvidia_rag" 2>/dev/null; pkill -f "uvicorn.*rag" 2>/dev/null; docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down 2>/dev/null; docker compose -f deploy/compose/vectordb.yaml down 2>/dev/null
+```
+
+### 2f: Kubernetes (Helm) Deployment
+
+If K8s deployment was detected, use the release name and namespace from the `helm list` output in Step 1:
+
+```bash
+helm uninstall <release-name> -n <namespace> 2>/dev/null
+```
+
+To also clean up persistent data (only if user requests full cleanup):
+```bash
+kubectl delete nimcache --all -n <namespace> 2>/dev/null; kubectl delete pvc --all -n <namespace> 2>/dev/null
+```
+
+## Step 3: Verify Everything Stopped
+
+```bash
+echo "=== REMAINING ===" && docker ps --format "table {{.Names}}\t{{.Status}}" 2>/dev/null; echo "=== K8S ===" && (kubectl get pods -n rag 2>/dev/null || echo "NOT_K8S") | head -10; helm list -n rag 2>/dev/null || true
+```
+
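+If any RAG-related containers remain, force remove them. Review the matched names first: the pattern is a substring match, so it also catches unrelated containers whose names merely contain e.g. `rag` or `nim`: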
+```bash
+docker ps -a --format "{{.Names}}" | grep -E "(rag|milvus|nim|ingest|redis|nemo|grafana|prometheus|embedding|ranking|vlm|ocr|page-elements|graphic-elements|table-structure)" | xargs -r docker rm -f
+```
+
+If pods remain after `helm uninstall`, force delete:
+```bash
+kubectl delete pods --all -n rag --force --grace-period=0 2>/dev/null
+```
+
+## Step 4: Optional Cleanup
+
+Ask the user if they want to clean up data/volumes:
+
+- **Remove Docker volumes** (deletes ingested data, Milvus indices):
+  ```bash
+  docker volume prune -f
+  ```
+
+- **Remove model cache** (frees 100-200 GB for self-hosted):
+  ```bash
+  rm -rf ~/.cache/model-cache/
+  ```
+
+- **Remove Docker images** (frees disk space):
+  ```bash
+  docker images | grep -E "nvcr.io/nvidia|milvusdb" | awk '{print $3}' | xargs -r docker rmi
+  ```
+
+Only perform cleanup if the user explicitly requests it.
+
+## Quick One-Liner (All Docker Services)
+
+If the user wants a fast full teardown:
+
+```bash
+cd "$(git rev-parse --show-toplevel)" && \
+docker compose -f deploy/compose/docker-compose-nemo-guardrails.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/observability.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/docker-compose-rag-server.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/vectordb.yaml down 2>/dev/null; \
+docker compose -f deploy/compose/nims.yaml down 2>/dev/null; \
+echo "All RAG services stopped."
+```
+
+## Source Documentation
+- `docs/troubleshooting.md` — if services won't stop or containers hang
diff --git a/skill-source/.agents/skills/rag-blueprint/references/troubleshoot.md b/skill-source/.agents/skills/rag-blueprint/references/troubleshoot.md
new file mode 100644
index 000000000..bf0308f8b
--- /dev/null
+++ b/skill-source/.agents/skills/rag-blueprint/references/troubleshoot.md
@@ -0,0 +1,146 @@
+# RAG Troubleshooting
+
+## Auto-Triage: Run First
+
+Start with this diagnostic sweep:
+
+```bash
+echo "=== CONTAINERS ===" && docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" 2>/dev/null | grep -E "(rag|milvus|nim|ingest|redis|etcd|minio)" | head -20; echo "=== HEALTH ===" && curl -s "http://localhost:8081/v1/health?check_dependencies=true" 2>/dev/null || echo "RAG_UNREACHABLE"; curl -s "http://localhost:8082/v1/health?check_dependencies=true" 2>/dev/null || echo "INGESTOR_UNREACHABLE"; echo "=== LOGS ===" && for svc in rag-server ingestor-server nim-llm-ms nemoretriever-embedding-ms nemoretriever-ranking-ms; do echo "--- $svc ---"; docker logs --tail 20 "$svc" 2>&1 | grep -iE "(error|fail|exception|timeout|oom)" || echo "OK"; done; echo "=== GPU ===" && (nvidia-smi 2>/dev/null || echo "NO_GPU") | head -20; echo "=== DISK ===" && df -h / | tail -1; echo "=== DOCKER_DISK ===" && docker system df 2>/dev/null; echo "=== K8S ===" && (kubectl get pods -n rag 2>/dev/null || echo "NOT_K8S") | head -20
+```
+
+Analyze all output, then diagnose and fix. If Auto-Triage doesn't reveal the cause, dig deeper into the specific failing service's logs (`docker logs <service> --tail 100` or `kubectl logs <pod> -n rag --tail 100`).
+
+Confirm with the user before deleting data (volumes, collections, model cache), changing deployment mode, or modifying API keys.
+
+## Source Documentation for Detailed Diagnosis
+
+Read these docs to find specific issue descriptions, causes, and fixes:
+
+- `docs/troubleshooting.md` — primary reference: all common issues with detailed symptoms/fixes
+- `docs/debugging.md` — Pipeline debugging: monitoring deployment, verifying endpoints, tracing requests
+- `docs/service-port-gpu-reference.md` — Complete port/GPU mapping table for all services
+
+## Expected Deployment Times
+
+If user reports "deployment is taking too long," compare against these baselines:
+
+| Mode | First Run | Subsequent |
+|------|-----------|------------|
+| Docker (self-hosted) | 15–30 min (model downloads) | 2–5 min |
+| Docker (NVIDIA-hosted) | 5–10 min (no model downloads) | 1–2 min |
+| K8s/Helm | 60–70 min (NIM cache 40–50 min + init 10–15 min + pod startup 5–10 min) | 10–15 min |
+
+If deployment exceeds these times, check NIM container logs: `docker logs nim-llm-ms --tail 50` and model cache disk usage: `watch -n 10 'du -sh ~/.cache/model-cache/'`.
+
+## Service Health Endpoints
+
+Read `docs/service-port-gpu-reference.md` for the complete port/GPU mapping. Quick check:
+
+| Service | URL | Expected |
+|---------|-----|----------|
+| RAG Server | `http://localhost:8081/v1/health?check_dependencies=true` | `{"status":"healthy"}` |
+| Ingestor | `http://localhost:8082/v1/health?check_dependencies=true` | `{"status":"healthy"}` |
+| NV-Ingest | `http://localhost:7670/v1/health/ready` | 200 OK |
+| Embedding NIM | `http://localhost:9080/v1/health/ready` | 200 OK |
+| LLM NIM | `http://localhost:8999/v1/health/ready` | 200 OK |
+| Ranking NIM | `http://localhost:1976/v1/health/ready` | 200 OK |
+| Milvus | `http://localhost:9091/healthz` | 200 OK |
+
+## Kubernetes Monitoring Commands
+
+```bash
+kubectl get nimcache -n rag
+kubectl get pods -n rag
+kubectl logs -f <pod-name> -n rag
+kubectl get pvc -n rag
+kubectl get events -n rag --sort-by='.lastTimestamp'
+```
+
+Pods in `ContainerCreating` or `Init` state during model download are expected. Use `kubectl get nimcache -n rag -w` to watch download progress.
+
+## Enable Debug Logging
+
+```bash
+export LOGLEVEL=DEBUG
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --no-deps ingestor-server
+docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --no-deps rag-server
+```
+
+---
+
+## Symptom-to-Fix Quick Index
+
+Match the symptom from Auto-Triage output, then read `docs/troubleshooting.md` for the detailed fix. For pipeline debugging steps, read `docs/debugging.md`.
+
+| Symptom | Category | Quick Fix |
+|---------|----------|-----------|
+| NIM container stuck at `(health: starting)` >30min | NIM Startup | Check GPU memory, NGC auth, disk space. First-run model downloads are slow — wait and monitor cache size. |
+| Milvus unhealthy / search returns nothing | Milvus | Restart vectordb compose. Check etcd/MinIO. Port 19530 conflict. Corrupt data → `down -v` (destroys data). |
+| Document upload fails / ingestor health check fails | NV-Ingest | Check Redis, OCR NIMs. Rate limit (429) → reduce batch vars. Large PDFs → reduce batch size. |
+| Chat returns errors / /generate fails | RAG Server | Check LLM NIM health, embedding NIM, cloud API key. Verify `APP_LLM_MODELNAME` matches deployed NIM. |
+| DNS resolution failed for `<service>:<port>` | Networking | Service container not running. Check `docker ps`, restart missing service. |
+| Port already in use | Networking | `lsof -i :<port>` to find conflicting process. See port table above. |
+| GPU out of memory / `torch.OutOfMemoryError` | GPU | Kill other GPU processes, use `--profile rag` for fewer NIMs, or set correct `NIM_MODEL_PROFILE`. |
+| `nvidia-container-cli: unknown device` | GPU | GPU ID exceeds available GPUs. Run `nvidia-smi -L`, adjust `*_GPU_ID` vars to valid IDs. |
+| Disk full / insufficient space | Disk | `docker system prune -f`, remove unused images, check model cache size. |
+| `no configuration file provided: not found` | Docker Compose | Run from the repo root directory. |
+| `too many open files` | Docker Compose | Set `LimitNOFILE=65536` in containerd override, restart containerd. |
+| PVC stuck in Pending | Helm | Create missing StorageClass or update PVC. |
+| `ProvisioningFailed` access mode mismatch | Helm | Patch NIMCache to `ReadWriteOnce`. |
+| Ingestor OOMKilled | Helm | Increase memory limits in values.yaml. Set `SUMMARY_MAX_PARALLELIZATION=1`. |
+| Elasticsearch timeout during ingestion | Elasticsearch | Increase `ES_REQUEST_TIMEOUT` (default 600s). |
+| Hallucination / out-of-context responses | Quality | Add missing-info handling to prompt in `prompt.yaml`. |
+| Embedding dimensions mismatch | Models | Set `APP_EMBEDDINGS_DIMENSIONS` to match model output. Re-ingest. |
+| Hybrid/dense search type mismatch | Search | Align `APP_VECTORSTORE_SEARCHTYPE` on ingestor and rag-server. Re-ingest. |
+| Confidence threshold filtering all results | Search | Lower `RERANKER_SCORE_THRESHOLD` (range 0.0–1.0, default 0.0). |
+| OCR not starting / connection errors | OCR | Check GPU memory, NGC auth. Verify `OCR_GRPC_ENDPOINT`/`OCR_HTTP_ENDPOINT` match running service. |
+| NVIDIA API credits exhausted | Cloud | Contact NVIDIA representative for additional credits. |
+| Image-only PDFs not ingesting | Ingestion | Enable `APP_NVINGEST_EXTRACTINFOGRAPHICS`. Consider image captioning. |
+
+---
+
+## Troubleshooting Checklists
+
+### Ingestion Checklist
+- [ ] All required containers running (ingestor-server, nv-ingest-ms-runtime, milvus, redis)
+- [ ] Vector database accessible (`curl http://localhost:9091/healthz`)
+- [ ] Embedding service healthy (`curl http://localhost:9080/v1/health/ready`)
+- [ ] File format supported and size <= 400 MB
+- [ ] Sufficient disk space (`df -h /`)
+- [ ] GPU resources available (`nvidia-smi`)
+
+### Retrieval Checklist
+- [ ] RAG server running and healthy
+- [ ] LLM service accessible (`curl http://localhost:8999/v1/health/ready`)
+- [ ] Vector database contains data (collection exists with documents)
+- [ ] Collection name is correct
+- [ ] Query format is valid
+
+### Quality Checklist
+- [ ] Reranker is enabled and healthy
+- [ ] Top-K values are appropriate
+- [ ] Collection has sufficient relevant data
+- [ ] Query rewriting configured correctly
+- [ ] Prompt template appropriate for use case
+
+---
+
+## Full Reset
+
+Destroys all data (volumes, images, caches). Confirm with the user before running.
+
+If nothing else works and the user confirms:
+
+```bash
+cd "$(git rev-parse --show-toplevel)"
+docker compose -f deploy/compose/docker-compose-nemo-guardrails.yaml down 2>/dev/null
+docker compose -f deploy/compose/observability.yaml down 2>/dev/null
+docker compose -f deploy/compose/docker-compose-rag-server.yaml down 2>/dev/null
+docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down 2>/dev/null
+docker compose -f deploy/compose/vectordb.yaml down -v 2>/dev/null
+docker compose -f deploy/compose/nims.yaml down 2>/dev/null
+
+docker system prune -af --volumes
+```
+
+Then deploy fresh using the deploy workflow.
diff --git a/skill-source/README.md b/skill-source/README.md
new file mode 100644
index 000000000..f5a787677
--- /dev/null
+++ b/skill-source/README.md
@@ -0,0 +1,111 @@
+# RAG Blueprint Agent Skill
+
+A single agent skill that enables AI coding assistants (Claude Code, Cursor, Codex, etc.) to deploy, configure, troubleshoot, and manage the NVIDIA RAG Blueprint autonomously.
+
+## Installation
+
+```bash
+npx skills add .
+```
+
+Select **rag-blueprint** — it includes all capabilities (deploy, configure, shutdown, troubleshoot) in one skill.
+
+## Architecture: Skills = Process, Docs = Truth
+
+```
+SKILL.md           = ROUTER (intent detection, autonomy rules, configure routing table)
+Reference files    = WHAT/HOW (deployment workflows, feature playbooks, diagnostics)
+docs/*.md          = SOURCE OF TRUTH (never copied into skills)
+notebooks/*.ipynb  = RUNNABLE EXAMPLES (referenced from relevant skills)
+```
+
+The SKILL.md detects user intent and routes to the correct reference file. Reference files are concise playbooks that point to `docs/*.md` for detailed configuration — this prevents staleness from duplicated content.
+
+## Skill Structure
+
+```
+skill-source/.agents/skills/rag-blueprint/
+  SKILL.md                              ← Single entry point (intent router)
+  references/
+    deploy.md                           ← Deployment: env analysis, NGC key, routing
+    deploy/
+      docker.md                         ← Docker Compose deployment workflow
+      docker-self-hosted.md             ← Self-hosted NIMs (local GPU inference)
+      docker-nvidia-hosted.md           ← Cloud NIMs (NVIDIA API endpoints)
+      docker-retrieval-only.md          ← Search/retrieve only (no LLM)
+      helm.md                           ← Kubernetes / Helm deployment workflow
+      helm-standard.md                  ← Standard Helm chart deployment
+      helm-mig.md                       ← Multi-Instance GPU deployment
+      library.md                        ← Python library mode workflow
+      library-full.md                   ← Python API + Docker backend
+      library-lite.md                   ← Containerless (Milvus Lite + cloud APIs)
+    configure/
+      vlm.md                            ← VLM, VLM embeddings, image captioning
+      guardrails.md                     ← NeMo Guardrails
+      query-and-conversation.md         ← Query rewriting, decomposition, multi-turn
+      ingestion.md                      ← Text-only, audio, Nemotron Parse, OCR, batch CLI
+      search-and-retrieval.md           ← Hybrid search, multi-collection, metadata, filters
+      models-and-infrastructure.md      ← Model changes, vector DB, auth, API keys, profiles
+      reasoning-and-generation.md       ← Reasoning, self-reflection, prompts, generation params
+      summarization.md                  ← Document summarization during ingestion
+      observability.md                  ← Tracing, Zipkin, Grafana, Prometheus
+      multimodal-query.md               ← Image + text querying with VLM embeddings
+      data-catalog.md                   ← Collection/document metadata management
+      user-interface.md                 ← RAG UI settings and usage
+      api-reference.md                  ← REST API endpoints and schemas
+      evaluation.md                     ← RAGAS quality metrics
+      mcp.md                            ← MCP server & client tools
+      migration.md                      ← Version upgrade guide
+      notebooks.md                      ← Notebook environment and catalog
+    shutdown.md                         ← Stop and tear down services
+    troubleshoot.md                     ← Diagnose and fix common issues
+```
+
+## How It Works
+
+1. User says "deploy RAG" → SKILL.md routes to `references/deploy.md` → env analysis → routes to `deploy/docker.md`, `deploy/helm.md`, or `deploy/library.md`
+2. User says "enable VLM" → SKILL.md routes to `references/configure/vlm.md` → reads `docs/vlm.md` for detailed steps
+3. User says "RAG is broken" → SKILL.md routes to `references/troubleshoot.md` → auto-triage diagnostic sweep
+4. User says "stop RAG" → SKILL.md routes to `references/shutdown.md` → detects and stops all services
+
+## Supported Deployment Modes
+
+Read `docs/support-matrix.md` for current hardware requirements per mode.
+
+| Mode | Docker Required | Description |
+|------|-----------------|-------------|
+| Docker (self-hosted) | Yes | Full on-prem with local NIM inference |
+| Docker (NVIDIA-hosted) | Yes | Cloud APIs for model inference |
+| Docker (retrieval-only) | Yes | No LLM, search/retrieve only |
+| Helm / Kubernetes | No (K8s) | Production K8s with NIM Operator |
+| Library (full) | Yes (backend) | Python API with Docker backend services |
+| Library (lite) | No | Milvus Lite + cloud APIs, zero infrastructure |
+
+## NGC_API_KEY Handling
+
+Skills never expose the API key value to the LLM. The approach:
+
+1. Check if `NGC_API_KEY` is set: `[ -n "$NGC_API_KEY" ] && echo "SET" || echo "NOT_SET"`
+2. If not set, ask the user to run `export NGC_API_KEY="nvapi-your-key"` in the terminal
+3. For `docker login`, the user runs it themselves (the command expands the key)
+4. As a fallback, offer to write a placeholder to `deploy/compose/.env` for the user to replace
+
+## Notebook Integration
+
+All 13 notebooks are referenced from relevant reference files:
+
+| Notebook | Referenced In |
+|----------|--------------|
+| `ingestion_api_usage.ipynb` | `references/configure/ingestion.md` |
+| `retriever_api_usage.ipynb` | `references/configure/search-and-retrieval.md` |
+| `image_input.ipynb` | `references/configure/vlm.md`, `references/configure/multimodal-query.md` |
+| `summarization.ipynb` | `references/configure/summarization.md` |
+| `evaluation_01_ragas.ipynb` | `references/configure/evaluation.md` |
+| `evaluation_02_recall.ipynb` | `references/configure/evaluation.md` |
+| `nb_metadata.ipynb` | `references/configure/search-and-retrieval.md` |
+| `rag_library_usage.ipynb` | `references/deploy/library-full.md` |
+| `rag_library_lite_usage.ipynb` | `references/deploy/library-lite.md` |
+| `building_rag_vdb_operator.ipynb` | `references/configure/models-and-infrastructure.md` |
+| `mcp_server_usage.ipynb` | `references/configure/mcp.md` |
+| `nat_mcp_integration.ipynb` | `references/configure/mcp.md` |
+| `launchable.ipynb` | `SKILL.md` |
diff --git a/src/nvidia_rag/ingestor_server/Dockerfile b/src/nvidia_rag/ingestor_server/Dockerfile
index b484849ca..ed481b832 100644
--- a/src/nvidia_rag/ingestor_server/Dockerfile
+++ b/src/nvidia_rag/ingestor_server/Dockerfile
@@ -71,7 +71,7 @@ WORKDIR /workspace
 COPY --from=builder /build/pyproject.toml /workspace/
 COPY --from=builder /build/uv.lock /workspace/
 
-RUN uv sync --locked --no-install-project --no-dev --extra ingest --extra elasticsearch
+RUN uv sync --locked --no-install-project --no-dev --extra ingest --extra oracle --extra minio
 
 COPY --from=builder /build/dist/*.whl /workspace/
 
diff --git a/src/nvidia_rag/ingestor_server/health.py b/src/nvidia_rag/ingestor_server/health.py
index 55c3e0963..c08d418d3 100644
--- a/src/nvidia_rag/ingestor_server/health.py
+++ b/src/nvidia_rag/ingestor_server/health.py
@@ -378,16 +378,17 @@ async def check_all_services_health(
     task_management: list[TaskManagementHealthInfo] = []
 
     # MinIO health check
-    minio_endpoint = config.minio.endpoint
-    minio_access_key = config.minio.access_key.get_secret_value()
-    minio_secret_key = config.minio.secret_key.get_secret_value()
-    if minio_endpoint:
-        minio_result = await check_minio_health(
-            endpoint=minio_endpoint,
-            access_key=minio_access_key,
-            secret_key=minio_secret_key,
-        )
-        object_storage.append(minio_result)
+    if config.minio.enabled:
+        minio_endpoint = config.minio.endpoint
+        minio_access_key = config.minio.access_key.get_secret_value()
+        minio_secret_key = config.minio.secret_key.get_secret_value()
+        if minio_endpoint:
+            minio_result = await check_minio_health(
+                endpoint=minio_endpoint,
+                access_key=minio_access_key,
+                secret_key=minio_secret_key,
+            )
+            object_storage.append(minio_result)
 
     # Vector DB health check
     try:
diff --git a/src/nvidia_rag/ingestor_server/main.py b/src/nvidia_rag/ingestor_server/main.py
index f28750e38..9b047ff1f 100644
--- a/src/nvidia_rag/ingestor_server/main.py
+++ b/src/nvidia_rag/ingestor_server/main.py
@@ -94,6 +94,7 @@
 from nvidia_rag.utils.summary_status_handler import SUMMARY_STATUS_HANDLER
 from nvidia_rag.utils.vdb import DEFAULT_DOCUMENT_INFO_COLLECTION, _get_vdb_op
 from nvidia_rag.utils.vdb.vdb_base import VDBRag
+from nvidia_rag.utils.vdb.vdb_ingest_base import SerializedVDBWrapper
 
 # Initialize logger
 logger = logging.getLogger(__name__)
@@ -158,24 +159,28 @@ def __init__(
         )
 
         # Initialize MinIO operator - handle failures gracefully
-        try:
-            if self.mode == Mode.LITE:
-                raise ValueError("MinIO operations are not supported in RAG Lite mode")
-            self.minio_operator = get_minio_operator(config=self.config)
-            # Ensure default bucket exists (idempotent operation)
-            try:
-                self.minio_operator._make_bucket(bucket_name="a-bucket")
-                logger.debug("Ensured 'a-bucket' exists in MinIO")
-            except Exception as bucket_err:
-                # Log specific exception for debugging bucket creation issues
-                logger.debug("Could not ensure bucket exists: %s", bucket_err)
-        except Exception as e:
+        if not self.config.minio.enabled:
             self.minio_operator = None
-            # Error already logged in MinioOperator.__init__, just note it here
-            logger.debug(
-                "MinIO operator set to None due to initialization failure, reason: %s",
-                e,
-            )
+        else:
+            try:
+                if self.mode == Mode.LITE:
+                    raise ValueError("MinIO operations are not supported in RAG Lite mode")
+                self.minio_operator = get_minio_operator(config=self.config)
+                # Ensure default bucket exists (idempotent operation)
+                if self.minio_operator is not None:
+                    try:
+                        self.minio_operator._make_bucket(bucket_name="a-bucket")
+                        logger.debug("Ensured 'a-bucket' exists in MinIO")
+                    except Exception as bucket_err:
+                        # Log specific exception for debugging bucket creation issues
+                        logger.debug("Could not ensure bucket exists: %s", bucket_err)
+            except Exception as e:
+                self.minio_operator = None
+                # Error already logged in MinioOperator.__init__, just note it here
+                logger.debug(
+                    "MinIO operator set to None due to initialization failure, reason: %s",
+                    e,
+                )
 
         if self.vdb_op is not None:
             if not (isinstance(self.vdb_op, VDBRag) or isinstance(self.vdb_op, VDB)):
@@ -752,10 +757,13 @@ async def __build_ingestion_response(
         uploaded_documents = []
         for filepath in filepaths:
             if os.path.basename(filepath) not in failures_filepaths:
-                doc_type_counts, _, total_elements, raw_text_elements_size = (
-                    self._get_document_type_counts(
-                        [filename_to_result_map.get(os.path.basename(filepath), [])]
-                    )
+                (
+                    doc_type_counts,
+                    _,
+                    total_elements,
+                    raw_text_elements_size,
+                ) = self._get_document_type_counts(
+                    [filename_to_result_map.get(os.path.basename(filepath), [])]
                 )
 
                 document_info = create_document_metadata(
@@ -1767,6 +1775,8 @@ def delete_minio_metadata(docs_to_delete: list[str]) -> None:
                     logger.info(
                         f"Recalculated collection info for {collection_name} after document deletion"
                     )
+                    # Delete MinIO metadata for successfully deleted documents
+                    delete_minio_metadata(deleted_docs)
                 elif hasattr(vdb_op, "_es_connection"):
                     # Elasticsearch: Delete first, then add without aggregation
                     # Lazy import to avoid requiring elasticsearch when not used
@@ -1798,6 +1808,14 @@ def delete_minio_metadata(docs_to_delete: list[str]) -> None:
                     logger.info(
                         f"Recalculated collection info for {collection_name} after document deletion"
                     )
+                    # Delete MinIO metadata for successfully deleted documents
+                    delete_minio_metadata(deleted_docs)
+                elif hasattr(vdb_op, "_oracle_cs"):
+                    # Oracle 26ai: Replace collection info directly without aggregation
+                    vdb_op.set_collection_info(collection_name, updated_collection_info)
+                    logger.info(
+                        f"Recalculated collection info for {collection_name} after document deletion"
+                    )
                 else:
                     # Fallback: Use add_document_info (may cause double-aggregation, but better than nothing)
                     logger.warning(
@@ -1819,9 +1837,6 @@ def delete_minio_metadata(docs_to_delete: list[str]) -> None:
                     "documents": [],
                 }
 
-            # Delete MinIO metadata for successfully deleted documents
-            delete_minio_metadata(deleted_docs)
-
             # Build documents response with metadata and document_info from fetched data
             documents = []
             for doc_name in deleted_docs:
@@ -2265,6 +2280,13 @@ async def __run_nvingest_batched_ingestion(
                 logger.info(
                     f"Processing batches in parallel with concurrency: {state_manager.concurrent_batches}"
                 )
+
+                if vdb_op is not None and SerializedVDBWrapper is not None:
+                    vdb_op = SerializedVDBWrapper(vdb_op)
+                    logger.info(
+                        "VDB write serialization enabled — extraction runs in parallel, VDB writes are sequential"
+                    )
+
                 all_results = []
                 all_failures = []
                 tasks = []
@@ -2850,9 +2872,12 @@ def _log_result_info(
         Returns:
             dict[str, Any]: Document info with metrics
         """
-        doc_type_counts, total_documents, total_elements, raw_text_elements_size = (
-            self._get_document_type_counts(results)
-        )
+        (
+            doc_type_counts,
+            total_documents,
+            total_elements,
+            raw_text_elements_size,
+        ) = self._get_document_type_counts(results)
 
         document_info = {
             "doc_type_counts": doc_type_counts,
diff --git a/src/nvidia_rag/ingestor_server/nvingest.py b/src/nvidia_rag/ingestor_server/nvingest.py
index 55e218c53..0f02aa1bb 100644
--- a/src/nvidia_rag/ingestor_server/nvingest.py
+++ b/src/nvidia_rag/ingestor_server/nvingest.py
@@ -141,6 +141,8 @@ def get_nv_ingest_ingestor(
             "extract_audio_params": {"segment_audio": config.nv_ingest.segment_audio},
             "extract_page_as_image": config.nv_ingest.extract_page_as_image,
         }
+        if config.nv_ingest.extract_tables_method is not None:
+            extract_kwargs["extract_tables_method"] = config.nv_ingest.extract_tables_method
 
     if remove_extract_method or config.nv_ingest.pdf_extract_method is None:
         extract_kwargs.pop("extract_method", None)
diff --git a/src/nvidia_rag/rag_server/Dockerfile b/src/nvidia_rag/rag_server/Dockerfile
index af8a2452d..e5b0aaed4 100644
--- a/src/nvidia_rag/rag_server/Dockerfile
+++ b/src/nvidia_rag/rag_server/Dockerfile
@@ -9,7 +9,7 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install uv https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
-COPY --from=ghcr.io/astral-sh/uv:0.7.4 /uv /uvx /bin/
+COPY --from=ghcr.io/astral-sh/uv:0.8.12 /uv /uvx /bin/
 
 WORKDIR /build
 
@@ -32,10 +32,12 @@ RUN if [ "$DOWNLOAD_LEGAL_COMPLIANCE" = "true" ]; then \
     fi
 
 # Install required ubuntu packages for setting up python 3.13
+# Update all packages to address security vulnerabilities including CVE-2025-68973 (gpgv)
 RUN apt update && \
+    apt upgrade -y && \
     apt install -y curl software-properties-common libgl1 libglib2.0-0 libmagic1 file build-essential && \
     add-apt-repository ppa:deadsnakes/ppa && \
-    apt update && apt install -y python3.13 python3.13-dev && \
+    apt update && apt upgrade -y && apt install -y python3.13 python3.13-dev && \
     apt-get clean
 
 # Download ONLY sources for packages WE installed (conditional on DOWNLOAD_LEGAL_COMPLIANCE)
@@ -69,7 +71,7 @@ WORKDIR /workspace
 COPY --from=builder /build/pyproject.toml /workspace/
 COPY --from=builder /build/uv.lock /workspace/
 
-RUN uv sync --locked --no-install-project --no-dev --extra rag --extra elasticsearch
+RUN uv sync --locked --no-install-project --no-dev --extra rag --extra oracle --extra minio
 
 COPY --from=builder /build/dist/*.whl /workspace/
 
@@ -106,8 +108,10 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV DEBIAN_FRONTEND noninteractive
 
 # Install ONLY the essential runtime libraries (no build tools, no Python installation process)
+# Update all packages to address CVE-2025-68973 (gpgv) and other security vulnerabilities
 RUN apt update && \
-    apt install -y libexpat1 gpgv=2.2.27-3ubuntu2.5 && \
+    apt upgrade -y && \
+    apt install -y libexpat1 && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
diff --git a/src/nvidia_rag/rag_server/health.py b/src/nvidia_rag/rag_server/health.py
index b0e850e25..78a526ef4 100644
--- a/src/nvidia_rag/rag_server/health.py
+++ b/src/nvidia_rag/rag_server/health.py
@@ -225,16 +225,17 @@ async def check_all_services_health(
     nim: list[NIMServiceHealthInfo] = []
 
     # MinIO health check
-    minio_endpoint = config.minio.endpoint
-    minio_access_key = config.minio.access_key.get_secret_value()
-    minio_secret_key = config.minio.secret_key.get_secret_value()
-    if minio_endpoint:
-        minio_result = await check_minio_health(
-            endpoint=minio_endpoint,
-            access_key=minio_access_key,
-            secret_key=minio_secret_key,
-        )
-        object_storage.append(minio_result)
+    if config.minio.enabled:
+        minio_endpoint = config.minio.endpoint
+        minio_access_key = config.minio.access_key.get_secret_value()
+        minio_secret_key = config.minio.secret_key.get_secret_value()
+        if minio_endpoint:
+            minio_result = await check_minio_health(
+                endpoint=minio_endpoint,
+                access_key=minio_access_key,
+                secret_key=minio_secret_key,
+            )
+            object_storage.append(minio_result)
 
     # Vector DB health check
     try:
diff --git a/src/nvidia_rag/rag_server/main.py b/src/nvidia_rag/rag_server/main.py
index 044120926..55aaabede 100644
--- a/src/nvidia_rag/rag_server/main.py
+++ b/src/nvidia_rag/rag_server/main.py
@@ -262,7 +262,9 @@ def __init__(
         # Load prompts and other utilities
         self.prompts = get_prompts(prompts)
         self.vdb_top_k = int(self.config.retriever.vdb_top_k)
-        self.StreamingFilterThinkParser = get_streaming_filter_think_parser_async()
+        self.StreamingFilterThinkParser = get_streaming_filter_think_parser_async(
+            enable_thinking=self.config.llm.parameters.enable_thinking
+        )
 
         if self._init_errors:
             logger.warning(
@@ -1368,18 +1370,6 @@ def _handle_prompt_processing(
         conversation_history = []
         user_message = []
 
-        is_nemotron_v1 = str(model).endswith("llama-3.3-nemotron-super-49b-v1")
-
-        # Nemotron controls thinking using system prompt, if nemotron v1 model is used update system prompt to enable/disable think
-        if is_nemotron_v1:
-            logger.info("Nemotron v1 model detected, updating system prompt")
-            if os.environ.get("ENABLE_NEMOTRON_THINKING", "false").lower() == "true":
-                logger.info("Setting system prompt as detailed thinking on")
-                system_prompt = "detailed thinking on"
-            else:
-                logger.info("Setting system prompt as detailed thinking off")
-                system_prompt = "detailed thinking off"
-
         # Process chat history
         for message in chat_history:
             # Overwrite system message if provided in conversation history
@@ -1883,27 +1873,46 @@ def _build_retriever_query_from_content(self, content: Any) -> tuple[str, bool]:
             tuple[str, bool]: Query string that may include base64 image data for VLM embeddings
             bool: True if image URL is provided, False otherwise
         """
+        is_image_query = False
         if isinstance(content, str):
-            return content, False
+            return content, is_image_query
         elif isinstance(content, list):
-            # Build multimodal query with both text and base64 images
-            query_parts = []
-            for item in content:
-                if isinstance(item, dict):
-                    if item.get("type") == "text":
-                        text_content = item.get("text", "").strip()
-                        if text_content:
-                            query_parts.append(text_content)
-                    elif item.get("type") == "image_url":
-                        image_url = item.get("image_url", {}).get("url", "")
-                        if image_url:
-                            # If image URL is provided, return it as is
-                            return image_url, True
-            # If no image URL is provided, return the text content
-            return "\n\n".join(query_parts), False
+            # Build multimodal query with both text and base64 images.
+            
+            # Process text types first, then image_url types.
+            text_items = [
+                item for item in content
+                if isinstance(item, dict) and item.get("type") == "text"
+            ]
+            image_items = [
+                item for item in content
+                if isinstance(item, dict) and item.get("type") == "image_url"
+            ]
+            
+            # Extract text and image parts in separate lists
+            text_parts = []
+            image_parts = []
+            for item in text_items:
+                text_content = item.get("text", "").strip()
+                if text_content:
+                    text_parts.append(text_content)
+            for item in image_items:
+                image_url = item.get("image_url", {}).get("url", "")
+                if image_url:
+                    image_parts.append(image_url)
+                    is_image_query = True
+                    break # only one image is supported
+            
+            text_query = "\n\n".join(text_parts)
+            if image_parts:
+                image_str = " ".join(image_parts)
+                final_query = (text_query + " " + image_str) if text_query else image_str
+            else:
+                final_query = text_query
+            return final_query, is_image_query
         else:
             # Fallback for any other content type
-            return (str(content) if content is not None else ""), False
+            return (str(content) if content is not None else ""), is_image_query
 
     async def _rag_chain(
         self,
diff --git a/src/nvidia_rag/rag_server/prompt.yaml b/src/nvidia_rag/rag_server/prompt.yaml
index f82c83655..d73036509 100644
--- a/src/nvidia_rag/rag_server/prompt.yaml
+++ b/src/nvidia_rag/rag_server/prompt.yaml
@@ -487,6 +487,7 @@ query_decomposition_rag_template:
     Context:
     {context}
 
+    Question: {question}
     Make sure the response you are generating strictly follow the rules mentioned above i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find” and mention about the instruction in response.
 
 image_captioning_prompt:
diff --git a/src/nvidia_rag/rag_server/response_generator.py b/src/nvidia_rag/rag_server/response_generator.py
index b0ff06379..f3f3df7f5 100644
--- a/src/nvidia_rag/rag_server/response_generator.py
+++ b/src/nvidia_rag/rag_server/response_generator.py
@@ -92,16 +92,28 @@ def __init__(self, generator, status_code: int = 200):
 )
 
 MINIO_OPERATOR = None
+_MINIO_OPERATOR_INITIALIZED = False
 
 
 def get_minio_operator_instance():
-    """Lazy initialize the MinioOperator instance"""
-    global MINIO_OPERATOR
-    if MINIO_OPERATOR is None:
+    """Lazy initialize the MinioOperator instance. Returns None when MinIO is disabled."""
+    global MINIO_OPERATOR, _MINIO_OPERATOR_INITIALIZED
+    if not _MINIO_OPERATOR_INITIALIZED:
         MINIO_OPERATOR = get_minio_operator()
+        _MINIO_OPERATOR_INITIALIZED = True
     return MINIO_OPERATOR
 
 
+def _get_minio_payload(object_name: str | None) -> dict:
+    """Look up ``object_name`` in MinIO; yield ``{}`` for an empty name or when no operator exists."""
+    if object_name:
+        minio_client = get_minio_operator_instance()
+        if minio_client is not None:
+            return minio_client.get_payload(object_name=object_name)
+    # Either there was nothing to look up or the lazy getter returned None.
+    return {}
+
+
 class Usage(BaseModel):
     """Token usage information."""
 
@@ -782,7 +794,7 @@ def prepare_citations(
                 document_type = "text"
             else:
                 file_name = os.path.basename(
-                    doc.metadata.get("source").get("source_id")
+                    doc.metadata.get("source_id")
                 )
 
             if doc.metadata.get("content_metadata", {}).get("type") in [
@@ -826,9 +838,7 @@ def prepare_citations(
                             location=location,
                             metadata=doc.metadata,
                         )
-                        payload = get_minio_operator_instance().get_payload(
-                            object_name=unique_thumbnail_id
-                        )
+                        payload = _get_minio_payload(unique_thumbnail_id)
                         content = payload.get("content", "")
                         source_metadata = SourceMetadata(
                             page_number=page_number,
@@ -1018,9 +1028,7 @@ async def retrieve_summary(
             location=[],
         )
 
-        payload = get_minio_operator_instance().get_payload(
-            object_name=unique_thumbnail_id
-        )
+        payload = _get_minio_payload(unique_thumbnail_id)
 
         if payload:
             return {
@@ -1091,9 +1099,7 @@ async def _wait_for_summary_completion(
                 page_number=0,
                 location=[],
             )
-            payload = get_minio_operator_instance().get_payload(
-                object_name=unique_thumbnail_id
-            )
+            payload = _get_minio_payload(unique_thumbnail_id)
             if payload:
                 return {
                     "message": "Summary retrieved successfully.",
@@ -1117,9 +1123,7 @@ async def _wait_for_summary_completion(
                         page_number=0,
                         location=[],
                     )
-                    payload = get_minio_operator_instance().get_payload(
-                        object_name=unique_thumbnail_id
-                    )
+                    payload = _get_minio_payload(unique_thumbnail_id)
                     if payload:
                         return {
                             "message": "Summary retrieved successfully.",
diff --git a/src/nvidia_rag/rag_server/server.py b/src/nvidia_rag/rag_server/server.py
index dbd5698ed..b96a2f02c 100644
--- a/src/nvidia_rag/rag_server/server.py
+++ b/src/nvidia_rag/rag_server/server.py
@@ -581,6 +581,14 @@ class Prompt(BaseModel):
         le=1.0,
     )
 
+    @model_validator(mode="before")
+    @classmethod
+    def coerce_collection_name_singular(cls, values):
+        """Backward compatibility for AIRA <=1.2.x: move a lone ``collection_name`` into a one-item ``collection_names`` list."""
+        if isinstance(values, dict) and "collection_names" not in values and "collection_name" in values:
+            values.update(collection_names=[values.pop("collection_name")])
+        return values
+
     @model_validator(mode="after")
     def validate_confidence_threshold(cls, values):
         """Custom validator for confidence_threshold to provide better error messages."""
@@ -700,6 +708,14 @@ class DocumentSearch(BaseModel):
         default=CONFIG.enable_citations,
     )
 
+    @model_validator(mode="before")
+    @classmethod
+    def coerce_collection_name_singular(cls, values):
+        """Backward compatibility for AIRA <=1.2.x: move a lone ``collection_name`` into a one-item ``collection_names`` list."""
+        if isinstance(values, dict) and "collection_names" not in values and "collection_name" in values:
+            values.update(collection_names=[values.pop("collection_name")])
+        return values
+
     @model_validator(mode="after")
     def validate_confidence_threshold(cls, values):
         """Custom validator for confidence_threshold to provide better error messages."""
@@ -741,6 +757,26 @@ def validate_messages_structure(cls, values):
             raise ValueError("The last message must have role='user'")
         return values
 
+    @model_validator(mode="before")
+    @classmethod
+    def derive_query_from_messages(cls, data):
+        """Fill a missing 'query' from the newest user message with content; null or non-list 'messages' count as empty, so ValueError (not TypeError) is raised."""
+        if isinstance(data, dict) and "query" not in data:
+            messages = data["messages"] if isinstance(data.get("messages"), (list, tuple)) else []
+            for msg in reversed(messages):
+                if (
+                    isinstance(msg, dict)
+                    and msg.get("role") == "user"
+                    and msg.get("content")
+                ):
+                    data["query"] = msg["content"]
+                    break
+            else:
+                raise ValueError(
+                    "Either 'query' must be provided or 'messages' must contain at least one user message with content."
+                )
+        return data
+
 
 # Define the summary response model
 class SummaryResponse(BaseModel):
@@ -1596,6 +1632,11 @@ def sanitize_query_for_logging(query):
 
     request_data = {
         "query": sanitize_query_for_logging(data.query),
+        "messages": [
+            {"role": msg.role, "content": msg.content} for msg in data.messages
+        ]
+        if data.messages
+        else [],
         "reranker_top_k": data.reranker_top_k,
         "vdb_top_k": data.vdb_top_k,
         "collection_names": data.collection_names,
diff --git a/src/nvidia_rag/utils/configuration.py b/src/nvidia_rag/utils/configuration.py
index 9019b7a84..f8c61d44b 100644
--- a/src/nvidia_rag/utils/configuration.py
+++ b/src/nvidia_rag/utils/configuration.py
@@ -302,6 +302,20 @@ def normalize_pdf_extract_method(cls, v: Any) -> Any:
         env="APP_NVINGEST_TEXTDEPTH",
         description="Granularity level for text extraction (page, document)",
     )
+    extract_tables_method: str | None = Field(
+        default=None,
+        env="APP_NVINGEST_EXTRACTTABLESMETHOD",
+        description="Method for table/chart extraction in PDFs (e.g. yolox, nemotron_parse). If None, client default is used.",
+    )
+
+    @field_validator("extract_tables_method", mode="before")
+    @classmethod
+    def normalize_extract_tables_method(cls, v: Any) -> Any:
+        """Map the strings 'none', 'null' (any letter case) and '' to Python None; pass other values through."""
+        if not isinstance(v, str):
+            return v
+        return None if v.lower() in ("none", "null", "") else v
+
     tokenizer: str = Field(
         default="intfloat/e5-large-unsupervised",
         env="APP_NVINGEST_TOKENIZER",
@@ -412,15 +426,30 @@ class ModelParametersConfig(_ConfigBase):
         env="LLM_MIN_TOKENS",
         description="Minimum number of tokens to generate in response",
     )
+    enable_thinking: bool = Field(
+        default=False,
+        env="LLM_ENABLE_THINKING",
+        description="Enable reasoning/thinking mode. Model emits reasoning tokens before the final answer.",
+    )
+    reasoning_budget: int = Field(
+        default=0,
+        env="LLM_REASONING_BUDGET",
+        description="Token budget for reasoning (0 = no budget, model decides depth). Only used when enable_thinking is true.",
+    )
+    low_effort: bool = Field(
+        default=False,
+        env="LLM_LOW_EFFORT",
+        description="Low-effort reasoning mode for faster, cheaper responses with shorter reasoning. Only used when enable_thinking is true.",
+    )
     max_thinking_tokens: int = Field(
         default=0,
         env="LLM_MAX_THINKING_TOKENS",
-        description="Maximum thinking tokens to allocate for reasoning models (0 = disabled by default)",
+        description="Maximum thinking tokens for reasoning models. Used directly by nemotron-nano-9b-v2; for other models acts as an alternative to reasoning_budget (0 = disabled).",
     )
     min_thinking_tokens: int = Field(
         default=0,
         env="LLM_MIN_THINKING_TOKENS",
-        description="Minimum thinking tokens to allocate for reasoning models (0 = disabled by default)",
+        description="Minimum thinking tokens for reasoning models. Only used by nemotron-nano-9b-v2 (0 = disabled).",
     )
     ignore_eos: bool = Field(
         default=False,
@@ -502,6 +531,9 @@ def get_model_parameters(self) -> dict:
             "min_tokens": self.parameters.min_tokens,
             "ignore_eos": self.parameters.ignore_eos,
             "max_tokens": self.parameters.max_tokens,
+            "enable_thinking": self.parameters.enable_thinking,
+            "reasoning_budget": self.parameters.reasoning_budget,
+            "low_effort": self.parameters.low_effort,
             "min_thinking_tokens": self.parameters.min_thinking_tokens,
             "max_thinking_tokens": self.parameters.max_thinking_tokens,
             "temperature": self.parameters.temperature,
@@ -631,7 +663,7 @@ class EmbeddingConfig(_ConfigBase):
     """Embedding configuration."""
 
     model_name: str = Field(
-        default="nvidia/llama-3.2-nv-embedqa-1b-v2",
+        default="nvidia/llama-nemotron-embed-1b-v2",
         env="APP_EMBEDDINGS_MODELNAME",
         description="Model for generating text embeddings",
     )
@@ -671,7 +703,7 @@ class RankingConfig(_ConfigBase):
     """Ranking configuration."""
 
     model_name: str = Field(
-        default="nvidia/llama-3.2-nv-rerankqa-1b-v2",
+        default="nvidia/llama-nemotron-rerank-1b-v2",
         env="APP_RANKING_MODELNAME",
         description="Model for reranking retrieved documents",
     )
@@ -856,6 +888,11 @@ def normalize_url(cls, v: Any) -> Any:
 class MinioConfig(_ConfigBase):
     """Minio configuration."""
 
+    enabled: bool = Field(
+        default=True,
+        env="ENABLE_MINIO",
+        description="Enable MinIO object storage for multimodal citations. Set to False for deployments without Milvus (e.g. Oracle 26ai).",
+    )
     endpoint: str = Field(
         default="localhost:9010",
         env="MINIO_ENDPOINT",
diff --git a/src/nvidia_rag/utils/llm.py b/src/nvidia_rag/utils/llm.py
index 639c87d70..e1f226655 100644
--- a/src/nvidia_rag/utils/llm.py
+++ b/src/nvidia_rag/utils/llm.py
@@ -33,6 +33,7 @@
 import yaml
 from langchain_core.language_models.llms import LLM
 from langchain_core.language_models.chat_models import SimpleChatModel
+from langchain_core.messages import AIMessageChunk
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
 
 from nvidia_rag.rag_server.response_generator import APIError, ErrorCodeMapping
@@ -128,117 +129,112 @@ def _is_nvidia_endpoint(url: str | None) -> bool:
     return True
 
 
-def _bind_thinking_tokens_if_configured(
-    llm: LLM | SimpleChatModel, **kwargs
-) -> LLM | SimpleChatModel:
-    """
-    If min_thinking_tokens or max_thinking_tokens are > 0 in kwargs, bind them to the LLM.
-    
-    Supports multiple reasoning/thinking model variants:
-    
-    1. nvidia/nvidia-nemotron-nano-9b-v2:
-       - Uses min_thinking_tokens and max_thinking_tokens parameters
-       - Reasoning content is not available for this model
-    
-    2. nemotron-3-nano variants (nemotron-3-nano-30b-a3b, nvidia/nemotron-3-nano):
-       - Uses reasoning_budget parameter (mapped from max_thinking_tokens)
-       - reasoning_budget is ONLY set when enable_thinking is true
-       - Outputs reasoning in a separate 'reasoning_content' field (not in content)
-       - Does NOT use <think> tags
-       - Can be controlled via ENABLE_NEMOTRON_3_NANO_THINKING env var
-
-    Raises:
-        ValueError: If min_thinking_tokens or max_thinking_tokens is passed but model
-                    is not a supported Nemotron thinking model, or if any of these
-                    parameters have invalid values (0 or negative).
-    """
-    min_think = kwargs.get("min_thinking_tokens", None)
-    max_think = kwargs.get("max_thinking_tokens", None)
-    model = kwargs.get("model", None)
+def _is_nemotron_3(model: str | None) -> bool:
+    """Return True when the lower-cased model name contains 'nemotron-3'; False for None or ''."""
+    if model:
+        return "nemotron-3" in model.lower()
+    return False
 
-    # Validate model compatibility for thinking tokens
-    has_thinking_tokens = (min_think is not None and min_think > 0) or (
-        max_think is not None and max_think > 0
-    )
 
-    if not has_thinking_tokens:
-        return llm
+def _is_nemotron_3_nano(model: str | None) -> bool:
+    """Return True when the model name contains 'nemotron-3-nano', ignoring letter case; False for None or ''."""
+    if model:
+        lowered = model.lower()
+        return "nemotron-3-nano" in lowered
+    return False
 
-    # Check if model is a supported reasoning model (various name formats)
-    # Note: For locally hosted models, use "nvidia/nemotron-3-nano"
-    # For NVIDIA-hosted models, use "nvidia/nemotron-3-nano-30b-a3b"
-    is_nano_9b_v2 = model and "nvidia/nvidia-nemotron-nano-9b-v2" in model
-    is_nemotron_3_nano = model and (
-        "nemotron-3-nano" in model.lower() or 
-        "nvidia/nemotron-3-nano" in model or
-        "nemotron-3-nano-30b-a3b" in model
-    )
-    
-    if has_thinking_tokens and not (is_nano_9b_v2 or is_nemotron_3_nano):
-        raise ValueError(
-            "min_thinking_tokens and max_thinking_tokens are only supported for models "
-            "'nvidia/nvidia-nemotron-nano-9b-v2' and nemotron-3-nano variants "
-            "(e.g., 'nemotron-3-nano-30b-a3b', 'nvidia/nemotron-3-nano'), "
-            f"but got model '{model}'"
-        )
 
-    bind_args = {}
-    if is_nano_9b_v2:
-        # nvidia/nvidia-nemotron-nano-9b-v2: Uses thinking token parameters directly
-        if min_think is not None and min_think > 0:
-            bind_args["min_thinking_tokens"] = min_think
-        else:
-            raise ValueError(
-                f"min_thinking_tokens must be a positive integer, but got {min_think}"
-            )
-        if max_think is not None and max_think > 0:
-            bind_args["max_thinking_tokens"] = max_think
-        else:
-            raise ValueError(
-                f"max_thinking_tokens must be a positive integer, but got {max_think}"
-            )
-        logger.info(
-            "nvidia-nemotron-nano-9b-v2: Setting min_thinking_tokens=%d, max_thinking_tokens=%d",
-            min_think, max_think
+def _is_nemotron_nano_9b_v2(model: str | None) -> bool:
+    """Detect legacy Nemotron Nano 9B v2 by name, ignoring letter case; False for None or ''."""
+    if not model:
+        return False
+    return "nvidia/nvidia-nemotron-nano-9b-v2" in model.lower()
+
+
+def _resolve_enable_thinking(config: NvidiaRAGConfig | None = None, **kwargs) -> bool:
+    """Return True if config or kwargs['enable_thinking'] is truthy (a falsy source never overrides a truthy one); otherwise, if the deprecated ENABLE_NEMOTRON_3_NANO_THINKING env var is set, warn and return whether it equals 'true' case-insensitively; else False."""
+    if config is not None:
+        enable = config.llm.parameters.enable_thinking
+        if enable:
+            return True
+    enable = kwargs.get("enable_thinking", False)
+    if enable:
+        return True
+    deprecated = os.getenv("ENABLE_NEMOTRON_3_NANO_THINKING")
+    if deprecated is not None:
+        logger.warning(
+            "ENABLE_NEMOTRON_3_NANO_THINKING is deprecated, use LLM_ENABLE_THINKING instead"
         )
-    elif is_nemotron_3_nano:
-        enable_thinking = os.getenv("ENABLE_NEMOTRON_3_NANO_THINKING", "true").lower() == "true"
-        if not enable_thinking:
-            raise ValueError(
-                "ENABLE_NEMOTRON_3_NANO_THINKING must be set to 'true' to use reasoning budget"
-            )
+        return deprecated.lower() == "true"
+    return False
 
-        # For nemotron-3-nano variants, min_thinking_tokens is not supported
-        if min_think is not None and min_think > 0:
-            logger.warning(
-                "min_thinking_tokens is not supported for nemotron-3-nano variants, "
-                "only max_thinking_tokens (mapped to reasoning_budget or nvext) is supported"
-            )
 
-        if max_think is not None and max_think > 0:
-            # Check if llm_endpoint is provided (locally hosted model)
-            llm_endpoint = kwargs.get("llm_endpoint", None)
+def _bind_reasoning_config(
+    llm: LLM | SimpleChatModel, config: NvidiaRAGConfig | None = None, **kwargs
+) -> LLM | SimpleChatModel:
+    """
+    Bind reasoning parameters to the LLM based on model type and configuration.
+
+    Reads enable_thinking, reasoning_budget, and low_effort from the config
+    object (LLM_ENABLE_THINKING, LLM_REASONING_BUDGET, LLM_LOW_EFFORT env vars).
+    kwargs can still override these for backward compatibility.
+
+    Supports:
+    - Nemotron 3 variants: enable_thinking, reasoning_budget, low_effort via chat_template_kwargs
+    - Nemotron 3 Nano: enable_thinking + reasoning_budget (or nvext for local NIM)
+    - Nemotron Nano 9B v2: legacy min_thinking_tokens / max_thinking_tokens
+    - Other models: no reasoning features bound
+    """
+    model = kwargs.get("model", "")
+    enable_thinking = _resolve_enable_thinking(config=config, **kwargs)
+    params = config.llm.parameters if config is not None else None
+    reasoning_budget = kwargs.get("reasoning_budget") or (params.reasoning_budget if params else 0)
+    low_effort = kwargs.get("low_effort") or (params.low_effort if params else False)
+    min_think = kwargs.get("min_thinking_tokens") or (params.min_thinking_tokens if params else 0) or 0
+    max_think = kwargs.get("max_thinking_tokens") or (params.max_thinking_tokens if params else 0) or 0
+
+    # Check specific variants first, then fall through to the general nemotron-3 check
+
+    if _is_nemotron_3_nano(model):
+        llm = llm.bind(chat_template_kwargs={"enable_thinking": enable_thinking})
+        if enable_thinking and (reasoning_budget > 0 or max_think > 0):
+            budget = reasoning_budget if reasoning_budget > 0 else max_think
+            llm_endpoint = kwargs.get("llm_endpoint", "")
             if llm_endpoint:
-                # For locally hosted models, use nvext syntax
-                bind_args["nvext"] = {"max_thinking_tokens": max_think}
-                logger.info(
-                    "nemotron-3-nano (locally hosted): Setting max_thinking_tokens=%d via nvext",
-                    max_think
-                )
+                llm = llm.bind(nvext={"max_thinking_tokens": budget})
+                logger.info("nemotron-3-nano (local): enable_thinking=%s, nvext.max_thinking_tokens=%d", enable_thinking, budget)
             else:
-                # For API catalog models, use reasoning_budget
-                bind_args["reasoning_budget"] = max_think
-                logger.info(
-                    "nemotron-3-nano (API catalog): Setting reasoning_budget=%d",
-                    max_think
-                )
+                llm = llm.bind(reasoning_budget=budget)
+                logger.info("nemotron-3-nano (API): enable_thinking=%s, reasoning_budget=%d", enable_thinking, budget)
         else:
+            logger.info("nemotron-3-nano: enable_thinking=%s", enable_thinking)
+        return llm
+
+    if _is_nemotron_nano_9b_v2(model):
+        if min_think > 0 and max_think > 0:
+            llm = llm.bind(min_thinking_tokens=min_think, max_thinking_tokens=max_think)
+            logger.info("nemotron-nano-9b-v2: min_thinking_tokens=%d, max_thinking_tokens=%d", min_think, max_think)
+        elif min_think > 0 or max_think > 0:
             raise ValueError(
-                f"max_thinking_tokens must be a positive integer, but got {max_think}"
+                "nemotron-nano-9b-v2 requires both min_thinking_tokens and max_thinking_tokens "
+                f"to be positive, got min={min_think}, max={max_think}"
             )
+        return llm
+
+    if _is_nemotron_3(model):
+        template_kwargs: dict = {"enable_thinking": enable_thinking}
+        if enable_thinking and low_effort:
+            template_kwargs["low_effort"] = True
+        budget = reasoning_budget if reasoning_budget > 0 else max_think
+        if enable_thinking and budget > 0:
+            template_kwargs["reasoning_budget"] = budget
+        llm = llm.bind(chat_template_kwargs=template_kwargs)
+        logger.info(
+            "nemotron-3: enable_thinking=%s, reasoning_budget=%d, low_effort=%s",
+            enable_thinking, budget, low_effort,
+        )
+        return llm
 
-    if bind_args:
-        return llm.bind(**bind_args)
     return llm
 
 
@@ -289,16 +285,18 @@ def get_llm(config: NvidiaRAGConfig | None = None, **kwargs) -> LLM | SimpleChat
                     default_headers = {**NVIDIA_API_DEFAULT_HEADERS}
                     if api_key:
                         default_headers["X-Model-Authorization"] = api_key
-                    return ChatOpenAI(
-                        model_name=kwargs.get("model"),
-                        openai_api_base=f"{guardrails_url}/v1/guardrail",
-                        openai_api_key="dummy-value",
-                        default_headers=default_headers,
-                        temperature=kwargs.get("temperature", None),
-                        top_p=kwargs.get("top_p", None),
-                        max_tokens=kwargs.get("max_tokens", None),
-                        stop=kwargs.get("stop", []),
-                    )
+                    openai_kwargs = {
+                        "model_name": kwargs.get("model"),
+                        "openai_api_base": f"{guardrails_url}/v1/guardrail",
+                        "openai_api_key": "dummy-value",
+                        "default_headers": default_headers,
+                        "temperature": kwargs.get("temperature", None),
+                        "top_p": kwargs.get("top_p", None),
+                        "max_tokens": kwargs.get("max_tokens", None),
+                    }
+                    if kwargs.get("stop"):
+                        openai_kwargs["stop"] = kwargs["stop"]
+                    return ChatOpenAI(**openai_kwargs)
                 except (requests.RequestException, requests.ConnectionError) as e:
                     error_msg = f"Guardrails NIM unavailable at {guardrails_url}. Please verify the service is running and accessible."
                     logger.exception(
@@ -318,13 +316,15 @@ def get_llm(config: NvidiaRAGConfig | None = None, **kwargs) -> LLM | SimpleChat
 
             # Build kwargs dict, only including parameters that are set
             # For non-NVIDIA endpoints, exclude NVIDIA-specific parameters
+            # Do not pass stop=[] - some Nemotron 3 APIs reject empty stop arrays
             chat_nvidia_kwargs = {
                 "base_url": url,
                 "model": kwargs.get("model"),
                 "api_key": api_key,
-                "stop": kwargs.get("stop", []),
                 "default_headers": NVIDIA_API_DEFAULT_HEADERS,
             }
+            if kwargs.get("stop"):
+                chat_nvidia_kwargs["stop"] = kwargs["stop"]
             if kwargs.get("temperature") is not None:
                 chat_nvidia_kwargs["temperature"] = kwargs["temperature"]
             if kwargs.get("top_p") is not None:
@@ -342,15 +342,8 @@ def get_llm(config: NvidiaRAGConfig | None = None, **kwargs) -> LLM | SimpleChat
                     chat_nvidia_kwargs["model_kwargs"] = model_kwargs
 
             llm = ChatNVIDIA(**chat_nvidia_kwargs)
-            # Only bind thinking tokens for NVIDIA endpoints
             if is_nvidia:
-                llm = _bind_thinking_tokens_if_configured(llm, **kwargs)
-                # For nemotron-3-nano models, set enable_thinking from env var
-                model = kwargs.get("model")
-                if model and ("nemotron-3-nano" in model.lower() or "nvidia/nemotron-3-nano" in model or "nemotron-3-nano-30b-a3b" in model):
-                    enable_thinking = os.getenv("ENABLE_NEMOTRON_3_NANO_THINKING", "true").lower() == "true"
-                    llm = llm.bind(chat_template_kwargs={"enable_thinking": enable_thinking})
-                    logger.info("nemotron-3-nano: Setting enable_thinking=%s (from ENABLE_NEMOTRON_3_NANO_THINKING)", enable_thinking)
+                llm = _bind_reasoning_config(llm, config=config, **kwargs)
             return llm
 
         logger.debug("Using llm model %s from api catalog", kwargs.get("model"))
@@ -363,23 +356,20 @@ def get_llm(config: NvidiaRAGConfig | None = None, **kwargs) -> LLM | SimpleChat
         if kwargs.get("ignore_eos") is not None:
             model_kwargs["ignore_eos"] = kwargs["ignore_eos"]
 
-        llm = ChatNVIDIA(
-            model=kwargs.get("model"),
-            api_key=api_key,
-            temperature=kwargs.get("temperature", None),
-            top_p=kwargs.get("top_p", None),
-            max_completion_tokens=kwargs.get("max_tokens", None),
-            stop=kwargs.get("stop", []),
-            default_headers=NVIDIA_API_DEFAULT_HEADERS,
+        # Do not pass stop=[] - some Nemotron 3 APIs reject empty stop arrays
+        chat_nvidia_kwargs = {
+            "model": kwargs.get("model"),
+            "api_key": api_key,
+            "temperature": kwargs.get("temperature", None),
+            "top_p": kwargs.get("top_p", None),
+            "max_completion_tokens": kwargs.get("max_tokens", None),
+            "default_headers": NVIDIA_API_DEFAULT_HEADERS,
             **({"model_kwargs": model_kwargs} if model_kwargs else {}),
-        )
-        llm = _bind_thinking_tokens_if_configured(llm, **kwargs)
-        # For nemotron-3-nano models, set enable_thinking from env var
-        model = kwargs.get("model")
-        if model and ("nemotron-3-nano" in model.lower() or "nvidia/nemotron-3-nano" in model or "nemotron-3-nano-30b-a3b" in model):
-            enable_thinking = os.getenv("ENABLE_NEMOTRON_3_NANO_THINKING", "true").lower() == "true"
-            llm = llm.bind(chat_template_kwargs={"enable_thinking": enable_thinking})
-            logger.info("nemotron-3-nano: Setting enable_thinking=%s (from ENABLE_NEMOTRON_3_NANO_THINKING)", enable_thinking)
+        }
+        if kwargs.get("stop"):
+            chat_nvidia_kwargs["stop"] = kwargs["stop"]
+        llm = ChatNVIDIA(**chat_nvidia_kwargs)
+        llm = _bind_reasoning_config(llm, config=config, **kwargs)
         return llm
 
     raise RuntimeError(
@@ -450,6 +440,9 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
     This generator filters content between think tags in streaming LLM responses.
     It handles both complete tags in a single chunk and tags split across multiple tokens.
 
+    When DEBUG logging is enabled (i.e. LOGLEVEL=DEBUG), reasoning tokens are
+    logged from <think> block content or reasoning_content field.
+
     Args:
         chunks (Iterable[str]): Chunks from a streaming LLM response
 
@@ -474,12 +467,19 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
     match_position = 0
     buffer = ""
     output_buffer = ""
+    think_accumulator = ""
+    reasoning_content_accumulator = ""
     chunk_count = 0
 
     for chunk in chunks:
-        content = chunk.content
+        reasoning, content = extract_reasoning_and_content(chunk)
+        content = content or reasoning or ""  # always a str: it is appended to buffer below
         chunk_count += 1
 
+        # Accumulate reasoning tokens when DEBUG logging is enabled (e.g. reasoning_content from nemotron-3-nano)
+        if reasoning and logger.isEnabledFor(logging.DEBUG):
+            reasoning_content_accumulator += reasoning
+
         # Let's first check for full tags - this is the most reliable approach
         buffer += content
 
@@ -496,6 +496,10 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
 
         while state == IN_THINK and FULL_END_TAG in buffer:
             end_idx = buffer.find(FULL_END_TAG)
+            if logger.isEnabledFor(logging.DEBUG):
+                think_content = buffer[:end_idx]
+                if think_content:
+                    think_accumulator += think_content + "\n"
             # Discard everything up to and including end tag
             buffer = buffer[end_idx + len(FULL_END_TAG) :]
             content = buffer
@@ -543,10 +547,14 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
 
         elif state == IN_THINK:
             if content_stripped == END_TAG_PARTS[0].strip():
+                if logger.isEnabledFor(logging.DEBUG):  # keep think text preceding the end-tag start
+                    think_accumulator += buffer[: -len(content)] if content else buffer
                 state = MATCHING_END
                 match_position = 1
                 buffer = content  # Keep this token in buffer
             else:
+                if logger.isEnabledFor(logging.DEBUG):
+                    think_accumulator += buffer
                 buffer = ""  # Discard content inside think block
 
         elif state == MATCHING_END:
@@ -555,11 +563,15 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
                 match_position += 1
                 if match_position >= len(END_TAG_PARTS):
                     # Complete end tag matched
+                    if think_accumulator and logger.isEnabledFor(logging.DEBUG):
+                        think_accumulator += "\n"
                     state = NORMAL
                     match_position = 0
                     buffer = ""  # Clear buffer
             else:
                 # False match, revert to IN_THINK
+                if logger.isEnabledFor(logging.DEBUG):
+                    think_accumulator += buffer
                 state = IN_THINK
                 buffer = ""  # Discard content
 
@@ -581,6 +593,11 @@ def streaming_filter_think(chunks: Iterable[str]) -> Iterable[str]:
         if output_buffer:
             yield output_buffer
 
+    if think_accumulator and logger.isEnabledFor(logging.DEBUG):
+        logger.debug("Reasoning tokens (think): %s", think_accumulator.rstrip())
+    if reasoning_content_accumulator and logger.isEnabledFor(logging.DEBUG):
+        logger.debug("Reasoning tokens: %s", reasoning_content_accumulator)
+
     logger.info(
         "Finished streaming_filter_think processing after %d chunks", chunk_count
     )
@@ -611,14 +628,23 @@ def get_streaming_filter_think_parser():
         return RunnablePassthrough()
 
 
-async def streaming_filter_think_async(chunks):
+async def streaming_filter_think_async(chunks, enable_thinking: bool = False):
     """
     Async version of streaming_filter_think.
     This async generator filters content between think tags in streaming LLM responses.
     It handles both complete tags in a single chunk and tags split across multiple tokens.
 
+    When DEBUG logging is enabled (i.e. LOGLEVEL=DEBUG), reasoning tokens are
+    logged from <think> block content or reasoning_content field.
+
+    When enable_thinking is True and the model uses a separate reasoning_content field
+    (e.g. Nemotron 3), reasoning tokens are dropped and only content is forwarded.
+    The <think> tag filter still runs to handle models that embed reasoning in content.
+
     Args:
         chunks: Async iterable of chunks from a streaming LLM response
+        enable_thinking: When True, drop reasoning_content (genuine chain-of-thought).
+            When False, fall back to reasoning_content if content is empty (model quirk).
 
     Yields:
         str: Filtered content with think blocks removed
@@ -641,12 +667,19 @@ async def streaming_filter_think_async(chunks):
     match_position = 0
     buffer = ""
     output_buffer = ""
+    think_accumulator = ""
+    reasoning_content_accumulator = ""
     chunk_count = 0
 
     async for chunk in chunks:
-        content = chunk.content
+        reasoning, content = extract_reasoning_and_content(chunk)
+        content = (content if enable_thinking else (content or reasoning)) or ""  # always a str
         chunk_count += 1
 
+        # Accumulate reasoning when DEBUG logging is enabled (e.g. reasoning_content from nemotron-3-nano)
+        if reasoning and logger.isEnabledFor(logging.DEBUG):
+            reasoning_content_accumulator += reasoning
+
         # Let's first check for full tags - this is the most reliable approach
         buffer += content
 
@@ -663,6 +696,10 @@ async def streaming_filter_think_async(chunks):
 
         while state == IN_THINK and FULL_END_TAG in buffer:
             end_idx = buffer.find(FULL_END_TAG)
+            if logger.isEnabledFor(logging.DEBUG):
+                think_content = buffer[:end_idx]
+                if think_content:
+                    think_accumulator += think_content + "\n"
             # Discard everything up to and including end tag
             buffer = buffer[end_idx + len(FULL_END_TAG) :]
             content = buffer
@@ -710,10 +747,14 @@ async def streaming_filter_think_async(chunks):
 
         elif state == IN_THINK:
             if content_stripped == END_TAG_PARTS[0].strip():
+                if logger.isEnabledFor(logging.DEBUG):  # keep think text preceding the end-tag start
+                    think_accumulator += buffer[: -len(content)] if content else buffer
                 state = MATCHING_END
                 match_position = 1
                 buffer = content  # Keep this token in buffer
             else:
+                if logger.isEnabledFor(logging.DEBUG):
+                    think_accumulator += buffer
                 buffer = ""  # Discard content inside think block
 
         elif state == MATCHING_END:
@@ -722,11 +763,15 @@ async def streaming_filter_think_async(chunks):
                 match_position += 1
                 if match_position >= len(END_TAG_PARTS):
                     # Complete end tag matched
+                    if think_accumulator and logger.isEnabledFor(logging.DEBUG):
+                        think_accumulator += "\n"
                     state = NORMAL
                     match_position = 0
                     buffer = ""  # Clear buffer
             else:
                 # False match, revert to IN_THINK
+                if logger.isEnabledFor(logging.DEBUG):
+                    think_accumulator += buffer
                 state = IN_THINK
                 buffer = ""  # Discard content
 
@@ -748,32 +793,71 @@ async def streaming_filter_think_async(chunks):
         if output_buffer:
             yield output_buffer
 
+    if think_accumulator and logger.isEnabledFor(logging.DEBUG):
+        logger.debug("Reasoning tokens (think): %s", think_accumulator.rstrip())
+    if reasoning_content_accumulator and logger.isEnabledFor(logging.DEBUG):
+        logger.debug("Reasoning tokens: %s", reasoning_content_accumulator)
+
     logger.info(
         "Finished streaming_filter_think_async processing after %d chunks", chunk_count
     )
 
 
-def get_streaming_filter_think_parser_async():
+async def _content_fallback_async(chunks, enable_thinking: bool = False):
+    """
+    Pass through LLM chunks WITHOUT filtering thinking tokens.
+    Used when FILTER_THINK_TOKENS=false - the user wants to see everything.
+
+    - When enable_thinking=true: forwards both reasoning_content and content so
+      the user can see the chain-of-thought followed by the answer.
+    - When enable_thinking=false: falls back to reasoning_content if content is
+      empty (NIM quirk where the answer lands in reasoning_content).
+
+    Args:
+        chunks: Async iterable of LLM response chunks
+        enable_thinking: Whether the model is producing genuine reasoning tokens.
+    """
+    async for chunk in chunks:
+        reasoning, content = extract_reasoning_and_content(chunk)
+
+        if enable_thinking:
+            if reasoning:
+                yield AIMessageChunk(content=reasoning)
+            if content:
+                yield AIMessageChunk(content=content)
+        else:
+            text = content or reasoning
+            if text:
+                yield AIMessageChunk(content=text)
+
+
+def get_streaming_filter_think_parser_async(enable_thinking: bool = False):
     """
     Creates and returns an async RunnableGenerator for filtering think tokens.
 
     If FILTER_THINK_TOKENS environment variable is set to "true" (case-insensitive),
     returns a parser that filters out content between <think> and </think> tags.
-    Otherwise, returns a pass-through parser that doesn't modify the content.
+    Otherwise, returns a parser that normalizes content (content or reasoning_content)
+    so models like Nemotron 3 that put reply in reasoning_content still yield text.
+
+    Args:
+        enable_thinking: When True, reasoning_content is genuine chain-of-thought: it is
+            dropped when filtering is on and forwarded before content when it is off.
+            When False, reasoning_content is only a fallback for empty content.
 
     Returns:
-        RunnableGenerator: An async parser for filtering (or not filtering) think tokens
+        RunnableGenerator: An async parser for filtering or content normalization
     """
+    from functools import partial
     from langchain_core.runnables import RunnableGenerator, RunnablePassthrough
 
     # Check environment variable
     filter_enabled = os.getenv("FILTER_THINK_TOKENS", "true").lower() == "true"
 
     if filter_enabled:
-        logger.info("Think token filtering is enabled (async)")
-        return RunnableGenerator(streaming_filter_think_async)
+        logger.info("Think token filtering is enabled (async), enable_thinking=%s", enable_thinking)
+        return RunnableGenerator(partial(streaming_filter_think_async, enable_thinking=enable_thinking))
     else:
-        logger.info("Think token filtering is disabled (async)")
-        # If filtering is disabled, use a passthrough that passes content as-is
-        return RunnablePassthrough()
+        logger.info("Think token filtering is disabled (async), enable_thinking=%s", enable_thinking)
+        return RunnableGenerator(partial(_content_fallback_async, enable_thinking=enable_thinking))
         
\ No newline at end of file
diff --git a/src/nvidia_rag/utils/metadata_validation.py b/src/nvidia_rag/utils/metadata_validation.py
index e1bdef417..3ff404483 100644
--- a/src/nvidia_rag/utils/metadata_validation.py
+++ b/src/nvidia_rag/utils/metadata_validation.py
@@ -2164,7 +2164,10 @@ def comparison(self, args) -> str:
                         logger.debug(f"[comparison] Failed to normalize datetime: {e}")
                         value_val = str(value_token)
                 elif field_info and is_string_type(field_info.type):
-                    value_val = str(value_token).lower()
+                    if field_name == "filename":
+                        value_val = str(value_token)
+                    else:
+                        value_val = str(value_token).lower()
                 else:
                     value_val = str(value_token)
             else:
@@ -2677,15 +2680,15 @@ def _get_error_context(filter_expr: str, error: UnexpectedInput) -> str:
             error_msg = (
                 f"Syntax error at line {line_num}, column {col_num}: '{snippet}'"
             )
-            error_msg += "\n\nExamples of valid filter expressions:"
-            error_msg += "\n• content_metadata[\"title\"] == 'value'"
-            error_msg += "\n• content_metadata[\"title\"] = 'value'"
+            error_msg += "\n\nExamples of valid filter expressions (use double quotes for string values):"
+            error_msg += '\n• content_metadata["title"] == "value"'
+            error_msg += '\n• content_metadata["title"] = "value"'
             error_msg += '\n• content_metadata["rating"] > 5'
-            error_msg += "\n• content_metadata[\"category\"] like '%tech%'"
-            error_msg += "\n• content_metadata[\"tags\"] in ['important', 'urgent']"
-            error_msg += "\n• content_metadata[\"created_date\"] between '2024-01-01' and '2024-12-31'"
+            error_msg += '\n• content_metadata["category"] like "%tech%"'
+            error_msg += '\n• content_metadata["tags"] in ["important", "urgent"]'
+            error_msg += '\n• content_metadata["created_date"] between "2024-01-01" and "2024-12-31"'
             error_msg += '\n• content_metadata["is_public"] == true'
-            error_msg += '\n• content_metadata["file_size"] > 1000 and content_metadata["type"] == \'pdf\''
+            error_msg += '\n• content_metadata["file_size"] > 1000 and content_metadata["type"] == "pdf"'
 
             return error_msg
 
diff --git a/src/nvidia_rag/utils/minio_operator.py b/src/nvidia_rag/utils/minio_operator.py
index c1c3a4edd..7aa90188c 100644
--- a/src/nvidia_rag/utils/minio_operator.py
+++ b/src/nvidia_rag/utils/minio_operator.py
@@ -27,8 +27,15 @@
 import logging
 from io import BytesIO
 
-from minio import Minio
-from minio.commonconfig import SnowballObject
+try:
+    from minio import Minio
+    from minio.commonconfig import SnowballObject
+
+    _MINIO_AVAILABLE = True
+except ImportError:
+    Minio = None  # type: ignore[assignment,misc]
+    SnowballObject = None  # type: ignore[assignment,misc]
+    _MINIO_AVAILABLE = False
 
 from nvidia_rag.utils.configuration import NvidiaRAGConfig
 
@@ -47,6 +54,11 @@ def __init__(
         secret_key: str,
         default_bucket_name: str = DEFAULT_BUCKET_NAME,
     ):
+        if not _MINIO_AVAILABLE:
+            raise ImportError(
+                "minio package is not installed. Install with: pip install 'nvidia_rag[minio]'"
+            )
+        assert Minio is not None  # noqa: S101 — narrowing for type checker
         self.client = Minio(
             endpoint, access_key=access_key, secret_key=secret_key, secure=False
         )
@@ -88,6 +100,7 @@ def put_payloads_bulk(self, payloads: list[dict], object_names: list[str]):
         """Put list of dictionaries to S3 storage using minio client"""
         json_datas = [json.dumps(payload).encode("utf-8") for payload in payloads]
 
+        assert SnowballObject is not None  # noqa: S101 — narrowing for type checker
         snowball_objects = []
         for object_name, json_data in zip(object_names, json_datas, strict=False):
             snowball_objects.append(
@@ -134,20 +147,31 @@ def delete_payloads(self, object_names: list[str]) -> None:
 def get_minio_operator(
     default_bucket_name: str = DEFAULT_BUCKET_NAME,
     config: NvidiaRAGConfig | None = None,
-) -> MinioOperator:
+) -> MinioOperator | None:
     """
-    Prepares and return MinioOperator object
+    Prepares and returns a MinioOperator object, or None when MinIO is disabled or not installed.
 
     Args:
         default_bucket_name: Default bucket name
         config: NvidiaRAGConfig instance. If None, creates a new one.
 
     Returns:
-        - minio_operator: MinioOperator
+        - minio_operator: MinioOperator, or None if ENABLE_MINIO=false or package not installed
     """
     if config is None:
         config = NvidiaRAGConfig()
 
+    if not config.minio.enabled:
+        logger.info("MinIO is disabled (ENABLE_MINIO=false). Multimodal citations will be unavailable.")
+        return None
+
+    if not _MINIO_AVAILABLE:
+        logger.warning(
+            "minio package is not installed. Multimodal citations will be unavailable. "
+            "Install with: pip install 'nvidia_rag[minio]', or set ENABLE_MINIO=false to suppress this warning."
+        )
+        return None
+
     minio_operator = MinioOperator(
         endpoint=config.minio.endpoint,
         access_key=config.minio.access_key.get_secret_value(),
diff --git a/src/nvidia_rag/utils/observability/langchain_callback_handler.py b/src/nvidia_rag/utils/observability/langchain_callback_handler.py
index dcdd7ba23..a35a71bab 100644
--- a/src/nvidia_rag/utils/observability/langchain_callback_handler.py
+++ b/src/nvidia_rag/utils/observability/langchain_callback_handler.py
@@ -40,6 +40,17 @@
 
 from .otel_metrics import OtelMetrics
 
+# Hardcoded attribute keys (replacing deprecated SpanAttributes constants)
+GEN_AI_PROMPTS = "gen_ai.prompt"
+GEN_AI_COMPLETIONS = "gen_ai.completion"
+LLM_REQUEST_MODEL = "gen_ai.request.model"
+LLM_RESPONSE_MODEL = "gen_ai.response.model"
+# Missing in opentelemetry.semconv_ai SpanAttributes (use llm.* to match existing semconv)
+LLM_REQUEST_MAX_TOKENS = "llm.request.max_tokens"
+LLM_REQUEST_TEMPERATURE = "llm.request.temperature"
+LLM_REQUEST_TOP_P = "llm.request.top_p"
+LLM_SYSTEM = "llm.system"
+
 
 class Config:
     exception_logger = None
@@ -137,9 +148,9 @@ def _set_request_params(span, kwargs, span_holder: SpanHolder):
     else:
         model = "unknown"
 
-    span.set_attribute(SpanAttributes.LLM_REQUEST_MODEL, model)
+    span.set_attribute(LLM_REQUEST_MODEL, model)
     # response is not available for LLM requests (as opposed to chat)
-    span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, model)
+    span.set_attribute(LLM_RESPONSE_MODEL, model)
 
     if "invocation_params" in kwargs:
         params = (
@@ -150,13 +161,11 @@ def _set_request_params(span, kwargs, span_holder: SpanHolder):
 
     _set_span_attribute(
         span,
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS,
+        LLM_REQUEST_MAX_TOKENS,
         params.get("max_tokens") or params.get("max_new_tokens"),
     )
-    _set_span_attribute(
-        span, SpanAttributes.LLM_REQUEST_TEMPERATURE, params.get("temperature")
-    )
-    _set_span_attribute(span, SpanAttributes.LLM_REQUEST_TOP_P, params.get("top_p"))
+    _set_span_attribute(span, LLM_REQUEST_TEMPERATURE, params.get("temperature"))
+    _set_span_attribute(span, LLM_REQUEST_TOP_P, params.get("top_p"))
 
 
 def _set_llm_request(
@@ -171,11 +180,11 @@ def _set_llm_request(
     if should_send_prompts():
         for i, msg in enumerate(prompts):
             span.set_attribute(
-                f"{SpanAttributes.LLM_PROMPTS}.{i}.role",
+                f"{GEN_AI_PROMPTS}.{i}.role",
                 "user",
             )
             span.set_attribute(
-                f"{SpanAttributes.LLM_PROMPTS}.{i}.content",
+                f"{GEN_AI_PROMPTS}.{i}.content",
                 msg,
             )
 
@@ -207,18 +216,18 @@ def _set_chat_request(
         for message in messages:
             for msg in message:
                 span.set_attribute(
-                    f"{SpanAttributes.LLM_PROMPTS}.{i}.role",
+                    f"{GEN_AI_PROMPTS}.{i}.role",
                     _message_type_to_role(msg.type),
                 )
                 # if msg.content is string
                 if isinstance(msg.content, str):
                     span.set_attribute(
-                        f"{SpanAttributes.LLM_PROMPTS}.{i}.content",
+                        f"{GEN_AI_PROMPTS}.{i}.content",
                         msg.content,
                     )
                 else:
                     span.set_attribute(
-                        f"{SpanAttributes.LLM_PROMPTS}.{i}.content",
+                        f"{GEN_AI_PROMPTS}.{i}.content",
                         json.dumps(msg.content, cls=CallbackFilteredJSONEncoder),
                     )
                 i += 1
@@ -252,7 +261,7 @@ def _set_chat_response(span: Span, response: LLMResult) -> None:
                 )
                 total_tokens = input_tokens + output_tokens
 
-            prefix = f"{SpanAttributes.LLM_COMPLETIONS}.{i}"
+            prefix = f"{GEN_AI_COMPLETIONS}.{i}"
             if hasattr(generation, "text") and generation.text != "":
                 span.set_attribute(
                     f"{prefix}.content",
@@ -317,11 +326,11 @@ def _set_chat_response(span: Span, response: LLMResult) -> None:
 
     if input_tokens > 0 or output_tokens > 0 or total_tokens > 0:
         span.set_attribute(
-            SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
+            "gen_ai.usage.input_tokens",
             input_tokens,
         )
         span.set_attribute(
-            SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
+            "gen_ai.usage.output_tokens",
             output_tokens,
         )
         span.set_attribute(
@@ -462,7 +471,7 @@ def _create_llm_span(
             entity_path=entity_path,
             metadata=metadata,
         )
-        span.set_attribute(SpanAttributes.LLM_SYSTEM, "Langchain")
+        span.set_attribute(LLM_SYSTEM, "Langchain")
         span.set_attribute(SpanAttributes.LLM_REQUEST_TYPE, request_type.value)
 
         return span
@@ -650,10 +659,10 @@ def on_llm_end(
                 "model_name"
             ) or response.llm_output.get("model_id")
             if model_name is not None:
-                span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, model_name)
+                span.set_attribute(LLM_RESPONSE_MODEL, model_name)
 
                 if self.spans[run_id].request_model is None:
-                    span.set_attribute(SpanAttributes.LLM_REQUEST_MODEL, model_name)
+                    span.set_attribute(LLM_REQUEST_MODEL, model_name)
 
         token_usage = (response.llm_output or {}).get("token_usage") or (
             response.llm_output or {}
@@ -673,12 +682,8 @@ def on_llm_end(
                 prompt_tokens + completion_tokens
             )
 
-            _set_span_attribute(
-                span, SpanAttributes.LLM_USAGE_PROMPT_TOKENS, prompt_tokens
-            )
-            _set_span_attribute(
-                span, SpanAttributes.LLM_USAGE_COMPLETION_TOKENS, completion_tokens
-            )
+            _set_span_attribute(span, "gen_ai.usage.input_tokens", prompt_tokens)
+            _set_span_attribute(span, "gen_ai.usage.output_tokens", completion_tokens)
             _set_span_attribute(
                 span, SpanAttributes.LLM_USAGE_TOTAL_TOKENS, total_tokens
             )
diff --git a/src/nvidia_rag/utils/vdb/__init__.py b/src/nvidia_rag/utils/vdb/__init__.py
index f8895eb28..fa52b4908 100644
--- a/src/nvidia_rag/utils/vdb/__init__.py
+++ b/src/nvidia_rag/utils/vdb/__init__.py
@@ -63,7 +63,24 @@ def _get_vdb_op(
     )
 
     # Get VDBRag class object based on the configuration
-    if config.vector_store.name == "milvus":
+    # Oracle 26ai is the default vector store
+    if config.vector_store.name == "oracle":
+        from nvidia_rag.utils.vdb.oracle.oracle_vdb import OracleVDB
+        return OracleVDB(
+            collection_name=collection_name,
+            oracle_user=os.getenv("ORACLE_USER"),
+            oracle_password=os.getenv("ORACLE_PASSWORD"),
+            oracle_cs=os.getenv("ORACLE_CS"),
+            embedding_model=embedding_model,
+            config=config,
+            meta_dataframe=csv_file_path,
+            meta_source_field=meta_source_field,
+            meta_fields=meta_fields,
+            csv_file_path=csv_file_path,
+            hybrid=(config.vector_store.search_type == SearchType.HYBRID),
+        )
+
+    elif config.vector_store.name == "milvus":
         from nvidia_rag.utils.vdb.milvus.milvus_vdb import MilvusVDB
 
         return MilvusVDB(
diff --git a/src/nvidia_rag/utils/vdb/oracle/__init__.py b/src/nvidia_rag/utils/vdb/oracle/__init__.py
new file mode 100644
index 000000000..ac9d03237
--- /dev/null
+++ b/src/nvidia_rag/utils/vdb/oracle/__init__.py
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Oracle 26ai Vector Database module for NVIDIA RAG Blueprint."""
+
+from nvidia_rag.utils.vdb.oracle.oracle_vdb import OracleVDB
+
+__all__ = ["OracleVDB"]
\ No newline at end of file
diff --git a/src/nvidia_rag/utils/vdb/oracle/oracle_queries.py b/src/nvidia_rag/utils/vdb/oracle/oracle_queries.py
new file mode 100644
index 000000000..a0b9c1c6b
--- /dev/null
+++ b/src/nvidia_rag/utils/vdb/oracle/oracle_queries.py
@@ -0,0 +1,390 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Oracle 26ai SQL query utilities for vector database operations.
+Provides DDL and query functions for document and metadata management.
+
+Functions:
+1. create_vector_table_ddl: Generate DDL for vector collection tables
+2. create_vector_index_ddl: Generate DDL for an IVF or HNSW vector index
+3. create_metadata_schema_table_ddl: DDL for metadata schema storage
+4. create_document_info_table_ddl: DDL for document info storage
+5. get_unique_sources_query: Retrieve all unique document sources
+6. get_delete_docs_query: Delete documents by source value
+7. get_similarity_search_query: Vector similarity search query
+8. get_hybrid_search_query: Combined vector + text search query
+"""
+
+import hashlib
+from typing import Literal
+
+DistanceMetric = Literal["COSINE", "L2", "DOT", "MANHATTAN"]  # spliced verbatim into SQL text
+IndexType = Literal["IVF", "HNSW"]  # selects the CREATE VECTOR INDEX template
+
+# Oracle 12.2+ caps identifiers at 128 bytes. _derive_object_name() uses this
+# constant to decide when a derived name (index, etc.) must fall back to a
+# short hashed form.
+ORACLE_MAX_IDENTIFIER_LEN = 128
+
+
+def _derive_object_name(table_name: str, suffix: str) -> str:
+    """Derive a per-collection object name (index, etc.) that fits in 128 bytes.
+
+    Args:
+        table_name: Name of the parent collection table.
+        suffix: Text appended to the table name (e.g. "_vec_idx").
+
+    Returns:
+        f"{table_name}{suffix}" verbatim when its UTF-8 encoding fits within
+        ORACLE_MAX_IDENTIFIER_LEN, so the name stays readable next to its
+        table; otherwise f"nvr_{digest}{suffix}", where digest is the first 16
+        hex characters of SHA-256(table_name). The hashed form is deterministic,
+        so re-running CREATE yields the same name (ORA-00955 on repeat).
+    """
+    candidate = f"{table_name}{suffix}"
+    # Oracle's identifier limit is in bytes, not characters: a non-ASCII name
+    # can be under 128 characters yet too long for the database, so measure the
+    # UTF-8 form (AL32UTF8 assumed). For ASCII names this equals len(candidate).
+    if len(candidate.encode("utf-8")) <= ORACLE_MAX_IDENTIFIER_LEN:
+        return candidate
+    digest = hashlib.sha256(table_name.encode("utf-8")).hexdigest()[:16]
+    return f"nvr_{digest}{suffix}"
+
+
+def create_vector_table_ddl(
+    table_name: str,
+    dimension: int = 2048,
+) -> str:
+    """
+    Build the CREATE TABLE statement for a vector collection table.
+
+    Args:
+        table_name: Table name; emitted inside double quotes, unescaped
+        dimension: Element count of the FLOAT32 VECTOR column
+
+    Returns:
+        DDL text declaring id, text, vector, source, content_metadata, created_at
+    """
+    return f"""
+    CREATE TABLE "{table_name}" (
+        id RAW(16) DEFAULT SYS_GUID() PRIMARY KEY,
+        text CLOB,
+        vector VECTOR({dimension}, FLOAT32),
+        source VARCHAR2(4000),
+        content_metadata CLOB CHECK (content_metadata IS JSON),
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+    )
+    """
+
+
+def create_vector_index_ddl(
+    table_name: str,
+    index_type: IndexType = "IVF",
+    distance_metric: DistanceMetric = "COSINE",
+    ivf_neighbor_partitions: int = 100,
+    hnsw_m: int = 16,
+    hnsw_ef_construction: int = 200,
+) -> str:
+    """
+    Build the CREATE VECTOR INDEX statement for a table's "vector" column.
+
+    Args:
+        table_name: Indexed table; the index name is derived from it
+        index_type: "IVF" selects the IVF template; any other value yields HNSW
+        distance_metric: Inserted verbatim after WITH DISTANCE
+        ivf_neighbor_partitions: IVF neighbor_partitions value. NOTE(review):
+            Oracle's reference spells this NEIGHBOR PARTITIONS; confirm it parses
+        hnsw_m / hnsw_ef_construction: HNSW m and efConstruction values
+
+    Returns:
+        DDL statement string; both templates fix TARGET ACCURACY at 95
+    """
+    index_name = _derive_object_name(table_name, "_vec_idx")
+
+    if index_type == "IVF":
+        return f"""
+        CREATE VECTOR INDEX "{index_name}" ON "{table_name}"(vector)
+        ORGANIZATION NEIGHBOR PARTITIONS
+        WITH DISTANCE {distance_metric}
+        WITH TARGET ACCURACY 95
+        PARAMETERS (
+            type IVF,
+            neighbor_partitions {ivf_neighbor_partitions}
+        )
+        """
+    else:  # HNSW (also reached for any unrecognized index_type value)
+        return f"""
+        CREATE VECTOR INDEX "{index_name}" ON "{table_name}"(vector)
+        ORGANIZATION INMEMORY NEIGHBOR GRAPH
+        WITH DISTANCE {distance_metric}
+        WITH TARGET ACCURACY 95
+        PARAMETERS (
+            type HNSW,
+            m {hnsw_m},
+            efConstruction {hnsw_ef_construction}
+        )
+        """
+
+
+def create_text_index_ddl(table_name: str) -> str:
+    """
+    Build the CTXSYS.CONTEXT (Oracle Text) index DDL on the "text" column.
+
+    Args:
+        table_name: Table to index; the index name is derived from it
+
+    Returns:
+        CREATE INDEX statement whose parameters request SYNC (ON COMMIT)
+    """
+    index_name = _derive_object_name(table_name, "_text_idx")
+    return f"""
+    CREATE INDEX "{index_name}" ON "{table_name}"(text)
+    INDEXTYPE IS CTXSYS.CONTEXT
+    PARAMETERS ('SYNC (ON COMMIT)')
+    """
+
+
+def create_metadata_schema_table_ddl() -> str:
+    """Return CREATE TABLE DDL for metadata_schema: one JSON schema row per unique collection_name."""
+    return """
+    CREATE TABLE metadata_schema (
+        id RAW(16) DEFAULT SYS_GUID() PRIMARY KEY,
+        collection_name VARCHAR2(255) NOT NULL UNIQUE,
+        metadata_schema CLOB CHECK (metadata_schema IS JSON),
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+    )
+    """
+
+
+def create_document_info_table_ddl() -> str:
+    """Return CREATE TABLE DDL for document_info, unique per (collection_name, info_type, document_name)."""
+    return """
+    CREATE TABLE document_info (
+        id RAW(16) DEFAULT SYS_GUID() PRIMARY KEY,
+        collection_name VARCHAR2(255) NOT NULL,
+        info_type VARCHAR2(50) NOT NULL,
+        document_name VARCHAR2(4000) NOT NULL,
+        info_value CLOB CHECK (info_value IS JSON),
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        CONSTRAINT doc_info_unique UNIQUE (collection_name, info_type, document_name)
+    )
+    """
+
+
+def get_unique_sources_query(table_name: str) -> str:
+    """
+    Build a query returning one row per distinct non-NULL source.
+
+    Args:
+        table_name: Name of the collection table
+
+    Returns:
+        SQL selecting each source with the content_metadata of its newest row (by created_at)
+    """
+    return f"""
+    WITH unique_sources AS (
+        SELECT source, content_metadata,
+               ROW_NUMBER() OVER (PARTITION BY source ORDER BY created_at DESC) as rn
+        FROM "{table_name}"
+        WHERE source IS NOT NULL
+    )
+    SELECT source, content_metadata
+    FROM unique_sources
+    WHERE rn = 1
+    ORDER BY source
+    """
+
+
+def get_delete_docs_query(table_name: str) -> str:
+    """
+    Build the parameterized DELETE that removes a document's rows by source.
+
+    The source column normally holds a JSON object with a source_name field
+    (e.g. {"source_name": "/tmp/.../file.pdf"}), so the first predicate matches
+    the extracted source_name; the OR branch covers rows whose source was
+    stored as a plain string.
+
+    Args:
+        table_name: Name of the collection table
+
+    Returns:
+        SQL text using the :source_value bind variable in both predicates
+    """
+    return f"""
+    DELETE FROM "{table_name}"
+    WHERE JSON_VALUE(source, '$.source_name') = :source_value
+       OR source = :source_value
+    """
+
+
+def get_delete_metadata_schema_query() -> str:
+    """Return the DELETE for a collection's metadata_schema row (bind: :collection_name)."""
+    return """
+    DELETE FROM metadata_schema
+    WHERE collection_name = :collection_name
+    """
+
+
+def get_metadata_schema_query() -> str:
+    """Return the SELECT fetching metadata_schema for one collection (bind: :collection_name)."""
+    return """
+    SELECT metadata_schema
+    FROM metadata_schema
+    WHERE collection_name = :collection_name
+    """
+
+
+def get_delete_document_info_query() -> str:
+    """Return the DELETE for one document_info row (binds: :collection_name, :document_name, :info_type)."""
+    return """
+    DELETE FROM document_info
+    WHERE collection_name = :collection_name
+    AND document_name = :document_name
+    AND info_type = :info_type
+    """
+
+
+def get_delete_document_info_by_collection_query() -> str:
+    """Return the DELETE removing every document_info row of a collection (bind: :collection_name)."""
+    return """
+    DELETE FROM document_info
+    WHERE collection_name = :collection_name
+    """
+
+
+def get_document_info_query() -> str:
+    """Return the SELECT of info_value for one document (binds: :collection_name, :document_name, :info_type)."""
+    return """
+    SELECT info_value
+    FROM document_info
+    WHERE collection_name = :collection_name
+    AND document_name = :document_name
+    AND info_type = :info_type
+    """
+
+
+def get_collection_document_info_query() -> str:
+    """Return the SELECT of info_value by collection and info_type (binds: :collection_name, :info_type)."""
+    return """
+    SELECT info_value
+    FROM document_info
+    WHERE collection_name = :collection_name
+    AND info_type = :info_type
+    """
+
+
+def get_similarity_search_query(
+    table_name: str,
+    distance_metric: DistanceMetric = "COSINE",
+) -> str:
+    """
+    Build the vector similarity query, ordered by ascending distance.
+
+    Args:
+        table_name: Name of the collection table
+        distance_metric: Third argument passed to VECTOR_DISTANCE
+
+    Returns:
+        SQL query string with :query_vector and :top_k bind variables
+    """
+    return f"""
+    SELECT id, text, source, content_metadata,
+           VECTOR_DISTANCE(vector, :query_vector, {distance_metric}) as distance
+    FROM "{table_name}"
+    ORDER BY distance
+    FETCH FIRST :top_k ROWS ONLY
+    """
+
+
+def get_hybrid_search_query(
+    table_name: str,
+    distance_metric: DistanceMetric = "COSINE",
+    vector_weight: float = 0.7,
+    text_weight: float = 0.3,
+) -> str:
+    """
+    Generate hybrid search query combining vector similarity and text search.
+
+    Each candidate CTE is ordered before FETCH FIRST, so it keeps the nearest
+    vectors / highest-SCORE text hits rather than an arbitrary row subset.
+    Args:
+        table_name: Name of the collection table
+        distance_metric: Distance function for vector search
+        vector_weight, text_weight: Weights applied to the two scores
+    Returns:
+        SQL query string with :query_vector, :query_text, and :top_k bind variables
+    """
+    return f"""
+    WITH vector_results AS (
+        SELECT id, text, source, content_metadata,
+               VECTOR_DISTANCE(vector, :query_vector, {distance_metric}) as vec_distance
+        FROM "{table_name}"
+        ORDER BY vec_distance
+        FETCH FIRST :top_k * 2 ROWS ONLY
+    ),
+    text_results AS (
+        SELECT id, SCORE(1) as text_score
+        FROM "{table_name}"
+        WHERE CONTAINS(text, :query_text, 1) > 0
+        ORDER BY SCORE(1) DESC
+        FETCH FIRST :top_k * 2 ROWS ONLY
+    )
+    SELECT v.id, v.text, v.source, v.content_metadata,
+           ({vector_weight} * (1 / (1 + v.vec_distance)) + 
+            {text_weight} * COALESCE(t.text_score / 100, 0)) as combined_score
+    FROM vector_results v
+    LEFT JOIN text_results t ON v.id = t.id
+    ORDER BY combined_score DESC
+    FETCH FIRST :top_k ROWS ONLY
+    """
+
+
+def get_count_query(table_name: str) -> str:
+    """Return the COUNT(*) statement (column alias cnt) for a collection table."""
+    return 'SELECT COUNT(*) as cnt FROM "{}"'.format(table_name)
+
+
+def check_table_exists_query() -> str:
+    """Return a COUNT(*) query on user_tables for the :table_name bind.
+
+    The comparison is case-sensitive (no UPPER), so it matches tables created
+    with quoted identifiers, whose names keep their original casing.
+    """
+    return """
+    SELECT COUNT(*) as cnt
+    FROM user_tables
+    WHERE table_name = :table_name
+    """
+
+
+def drop_table_ddl(table_name: str) -> str:
+    """Return DROP TABLE DDL (CASCADE CONSTRAINTS PURGE) for the quoted table name."""
+    return 'DROP TABLE "{}" CASCADE CONSTRAINTS PURGE'.format(table_name)
+
+
+def get_all_collections_query() -> str:
+    """Return the query listing user tables, minus the two system tables and SYS%/DR$%/DBTOOLS% names."""
+    return """
+    SELECT table_name
+    FROM user_tables
+    WHERE table_name NOT IN ('METADATA_SCHEMA', 'DOCUMENT_INFO')
+    AND table_name NOT LIKE 'SYS%'
+    AND table_name NOT LIKE 'DR$%'
+    AND table_name NOT LIKE 'DBTOOLS%'
+    ORDER BY table_name
+    """
\ No newline at end of file
diff --git a/src/nvidia_rag/utils/vdb/oracle/oracle_vdb.py b/src/nvidia_rag/utils/vdb/oracle/oracle_vdb.py
new file mode 100644
index 000000000..501a22661
--- /dev/null
+++ b/src/nvidia_rag/utils/vdb/oracle/oracle_vdb.py
@@ -0,0 +1,1207 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Oracle 26ai Vector Database implementation for NVIDIA RAG Blueprint.
+
+This module provides the OracleVDB class which implements vector database
+operations using Oracle AI Database 26ai's native vector search capabilities.
+Supports CPU-based IVF indexes with optional hybrid search using Oracle Text.
+
+Key Features:
+- Native VECTOR data type support
+- IVF (Inverted File Index) for CPU-optimized search
+- Hybrid search combining vector similarity with Oracle Text
+- LangChain OracleVS integration for retrieval
+"""
+
+import json
+import logging
+import os
+import time
+from concurrent.futures import Future
+from typing import Any
+from array import array
+
+import oracledb
+from langchain_community.vectorstores.oraclevs import (
+    OracleVS,
+    _compare_version,
+    _get_connection,
+    _get_distance_function,
+)
+from langchain_community.vectorstores.utils import DistanceStrategy
+from langchain_core.documents import Document
+from langchain_core.runnables import RunnableAssign, RunnableLambda
+from opentelemetry import context as otel_context
+
+from nvidia_rag.rag_server.response_generator import APIError, ErrorCodeMapping
+from nvidia_rag.utils.common import (
+    get_current_timestamp,
+    perform_document_info_aggregation,
+)
+from nvidia_rag.utils.configuration import NvidiaRAGConfig, SearchType
+from nvidia_rag.utils.health_models import ServiceStatus
+from nvidia_rag.utils.vdb import (
+    DEFAULT_DOCUMENT_INFO_COLLECTION,
+    DEFAULT_METADATA_SCHEMA_COLLECTION,
+    SYSTEM_COLLECTIONS,
+)
+from nvidia_rag.utils.vdb.oracle.oracle_queries import (
+    check_table_exists_query,
+    create_document_info_table_ddl,
+    create_metadata_schema_table_ddl,
+    create_text_index_ddl,
+    create_vector_index_ddl,
+    create_vector_table_ddl,
+    drop_table_ddl,
+    get_all_collections_query,
+    get_collection_document_info_query,
+    get_count_query,
+    get_delete_document_info_by_collection_query,
+    get_delete_document_info_query,
+    get_delete_docs_query,
+    get_delete_metadata_schema_query,
+    get_document_info_query,
+    get_hybrid_search_query,
+    get_metadata_schema_query,
+    get_similarity_search_query,
+    get_unique_sources_query,
+)
+from nvidia_rag.utils.vdb.vdb_ingest_base import VDBRagIngest
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)  # NOTE(review): overrides the level inherited from the app's logging config — confirm intended
+
+
+class OracleVSCompat(OracleVS):
+    """Compatibility shim to align LangChain OracleVS with our schema and vector bind needs.
+
+    Bypasses LangChain OracleVS's __init__ table-creation logic, which uses
+    unquoted DDL (Oracle case-folds to UPPERCASE) and would create a shadow
+    uppercase table when our quoted-identifier table already exists. We
+    create the user table ourselves via create_vector_table_ddl with quoted
+    identifiers; LangChain only handles retrieval through our overridden
+    similarity_search_by_vector_with_relevance_scores.
+    """
+
+    def __init__(
+        self,
+        client: Any,
+        embedding_function: Any,
+        table_name: str,
+        distance_strategy: Any = None,
+        query: str | None = "What is a Oracle database",
+        params: dict[str, Any] | None = None,
+    ):
+        """Initialise retrieval state for an existing table; raises ValueError if no connection is available."""
+        # Replicate OracleVS.__init__ state setup WITHOUT calling _create_table:
+        # create_collection() already made the case-preserving quoted table.
+        connection = _get_connection(client)
+        if connection is None:
+            raise ValueError("Failed to acquire a connection.")
+
+        # Determine insert_mode the same way OracleVS does: thin vs thick driver
+        # and oracledb client version dictate whether vectors are bound as
+        # Python arrays or CLOB JSON.
+        self.insert_mode = "array"
+        if hasattr(connection, "thin") and connection.thin:
+            if oracledb.__version__ == "2.1.0":
+                raise Exception(
+                    "Oracle DB python thin client driver version 2.1.0 not supported"
+                )
+            elif _compare_version(oracledb.__version__, "2.2.0"):
+                self.insert_mode = "clob"
+        else:
+            try:
+                client_ver = ".".join(map(str, oracledb.clientversion()))
+                if _compare_version(client_ver, "23.4"):
+                    self.insert_mode = "clob"
+            except oracledb.Error:
+                # No thick client available; default to array mode
+                self.insert_mode = "array"
+
+        self.client = client
+        self.embedding_function = embedding_function
+        self.query = query
+        self.table_name = table_name
+        if distance_strategy is None:
+            distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE
+        self.distance_strategy = distance_strategy
+        self.params = params
+
+        # Cache embedding dimension to avoid re-embedding per query
+        self._embedding_dim = self.get_embedding_dimension()
+
+    @staticmethod
+    def _get_clob_value(lob_or_str) -> str:
+        """Read a CLOB/LOB column value to a plain string."""
+        if hasattr(lob_or_str, "read"):
+            return lob_or_str.read()
+        return lob_or_str or ""
+
+    def similarity_search_by_vector_with_relevance_scores(
+        self,
+        embedding,
+        k: int = 4,
+        filter=None,
+        **kwargs,
+    ):
+        """Return (Document, distance) pairs for the k nearest rows; filter is applied after the fetch."""
+        # Bind embedding as CLOB or array, then cast via TO_VECTOR to avoid ORA-00904 on bind name
+        if self.insert_mode == "clob":
+            embedding_arr = json.dumps(embedding)
+        else:
+            embedding_arr = array("f", embedding)
+
+        distance_fn = _get_distance_function(self.distance_strategy)
+        query = f"""
+            SELECT id,
+                   text,
+                   source,
+                   content_metadata,
+                   VECTOR_DISTANCE(
+                       vector,
+                       TO_VECTOR(:embedding, {self._embedding_dim}, FLOAT32),
+                       {distance_fn}
+                   ) AS distance
+            FROM "{self.table_name}"
+            ORDER BY distance
+            FETCH APPROX FIRST {int(k)} ROWS ONLY
+        """
+
+        docs_and_scores = []
+        connection = _get_connection(self.client)
+        if connection is None:
+            raise ValueError("Failed to acquire a connection.")
+
+        with connection.cursor() as cursor:
+            cursor.execute(query, embedding=embedding_arr)
+            results = cursor.fetchall()
+
+            for result in results:
+                # source normally holds a JSON object; tolerate plain-string rows
+                try:
+                    metadata = json.loads(result[2]) if result[2] else {}
+                except ValueError:
+                    metadata = None
+                if not isinstance(metadata, dict):
+                    metadata = {"source_name": result[2]}
+                # content_metadata may arrive as a dict, a LOB, or a plain str
+                content_metadata = result[3] or {}
+                if not isinstance(content_metadata, dict):
+                    raw_meta = self._get_clob_value(content_metadata)
+                    content_metadata = json.loads(raw_meta) if raw_meta else {}
+                metadata['content_metadata'] = content_metadata
+
+                # Apply filter if provided
+                if filter:
+                    logger.info(f'Filtering on :{filter}')
+                    if not all(metadata.get(key) in value for key, value in filter.items()):
+                        continue
+
+                doc = Document(
+                    page_content=(self._get_clob_value(result[1]) if result[1] is not None else ""),
+                    metadata=metadata,
+                )
+                docs_and_scores.append((doc, result[4]))
+
+        return docs_and_scores
+
+
+class OracleVDB(VDBRagIngest):
+    """
+    Oracle 26ai Vector Database implementation.
+
+    Provides vector storage and retrieval using Oracle AI Database 26ai's
+    native vector capabilities with IVF indexes optimized for CPU-based deployment.
+    """
+
+    def __init__(
+        self,
+        collection_name: str,
+        oracle_user: str | None = None,
+        oracle_password: str | None = None,
+        oracle_cs: str | None = None,
+        embedding_model: Any | None = None,
+        config: NvidiaRAGConfig | None = None,
+        meta_dataframe: Any | None = None,
+        meta_source_field: str | None = None,
+        meta_fields: list[str] | None = None,
+        csv_file_path: str | None = None,
+        index_type: str = "IVF",
+        distance_metric: str = "COSINE",
+        hybrid: bool = False,
+    ):
+        """
+        Initialize Oracle VDB connection.
+
+        Args:
+            collection_name: Name of the vector collection/table
+            oracle_user: Database username (or set ORACLE_USER env var)
+            oracle_password: Database password (or set ORACLE_PASSWORD env var)
+            oracle_cs: Connection string / DSN (or set ORACLE_CS env var)
+            embedding_model: Embedding model instance for retrieval
+            config: NvidiaRAGConfig instance
+            meta_dataframe: Metadata dataframe for custom metadata
+            meta_source_field: Source field name in metadata
+            meta_fields: List of metadata field names
+            csv_file_path: Path to CSV file for metadata
+            index_type: Vector index type (IVF or HNSW); ORACLE_VECTOR_INDEX_TYPE overrides it
+            distance_metric: COSINE, L2, DOT or MANHATTAN; ORACLE_DISTANCE_METRIC overrides it
+            hybrid: Enable hybrid search with Oracle Text
+
+        Raises:
+            ValueError: If the user, password or connection string is missing
+            APIError: SERVICE_UNAVAILABLE when pool creation or the probe query raises oracledb.Error
+        """
+        self.config = config or NvidiaRAGConfig()
+        self._collection_name = collection_name if collection_name else ""
+        self._embedding_model = embedding_model
+
+        # Connection parameters from args or environment
+        self._oracle_user = oracle_user or os.getenv("ORACLE_USER")
+        self._oracle_password = oracle_password or os.getenv("ORACLE_PASSWORD")
+        self._oracle_cs = oracle_cs or os.getenv("ORACLE_CS")
+
+        if not all([self._oracle_user, self._oracle_password, self._oracle_cs]):
+            raise ValueError(
+                "Oracle connection requires ORACLE_USER, ORACLE_PASSWORD, ORACLE_CS variables. "
+                "Set via parameters or environment variables."
+            )
+
+        # Vector index configuration
+        self._index_type = os.getenv("ORACLE_VECTOR_INDEX_TYPE", index_type).upper()
+        self._distance_metric = os.getenv("ORACLE_DISTANCE_METRIC", distance_metric).upper()
+        self._hybrid = hybrid or (self.config.vector_store.search_type == SearchType.HYBRID)
+
+        # Metadata fields for NV-Ingest
+        self.meta_dataframe = meta_dataframe
+        self.meta_source_field = meta_source_field
+        self.meta_fields = meta_fields
+        self.csv_file_path = csv_file_path
+
+        # System collection initialization flags
+        self._metadata_schema_initialized = False
+        self._document_info_initialized = False
+
+        # Initialize connection pool
+        try:
+            self._pool = oracledb.create_pool(
+                user=self._oracle_user,
+                password=self._oracle_password,
+                dsn=self._oracle_cs,
+                min=2,
+                max=10,
+                increment=1,
+            )
+            # Test connection
+            with self._pool.acquire() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute("SELECT 1 FROM DUAL")
+            logger.info(f"Connected to Oracle with connection {self._oracle_cs}")
+        except oracledb.Error as e:
+            logger.exception("Failed to connect to Oracle at %s: %s", self._oracle_cs, e)
+            raise APIError(
+                f"Oracle database is unavailable at {self._oracle_cs}. "
+                "Please verify Oracle is running and credentials are correct.",
+                ErrorCodeMapping.SERVICE_UNAVAILABLE,
+            ) from e
+
+    @property
+    def collection_name(self) -> str:
+        """Name of the collection table this instance operates on."""
+        return self._collection_name
+
+    @collection_name.setter
+    def collection_name(self, value: str) -> None:
+        """Store the new name, normalising any falsy value to an empty string."""
+        self._collection_name = value or ""
+
+    def _get_connection(self):
+        """Return the result of self._pool.acquire(); callers use it as a context manager."""
+        return self._pool.acquire()
+
+    def _table_exists(self, table_name: str) -> bool:
+        """Report whether user_tables has a table with exactly this name.
+
+        The lookup is case-sensitive, which suits collection tables created
+        with quoted identifiers (CREATE TABLE "name") because their names are
+        stored as written. Tables created unquoted are stored UPPERCASE by
+        Oracle; look those up with _table_exists_unquoted() instead.
+        """
+        sql = check_table_exists_query()
+        binds = {"table_name": table_name}
+        with self._get_connection() as conn, conn.cursor() as cursor:
+            cursor.execute(sql, binds)
+            # The COUNT(*) query yields a single row holding the match count
+            row = cursor.fetchone()
+            return row[0] > 0
+
+    def _table_exists_unquoted(self, table_name: str) -> bool:
+        """Report whether a table exists under the upper-cased form of table_name.
+
+        Tables created without quotes (the metadata_schema / document_info
+        system tables, legacy v0.0.6 collections) are stored UPPERCASE in
+        user_tables, so the bind value is wrapped in UPPER() before comparing.
+        """
+        # UPPER() mirrors Oracle's case-folding of unquoted identifiers
+        sql = "SELECT COUNT(*) FROM user_tables WHERE table_name = UPPER(:table_name)"
+        binds = {"table_name": table_name}
+        with self._get_connection() as conn, conn.cursor() as cursor:
+            cursor.execute(sql, binds)
+            row = cursor.fetchone()
+            # The COUNT(*) query yields a single row holding the match count
+            return row[0] > 0
+
+    @staticmethod
+    def _execute_ddl_idempotent(cursor: Any, ddl: str, kind: str = "object") -> bool:
+        """Run a CREATE statement, treating "already exists" as success.
+
+        Args:
+            cursor: Cursor on which ddl is executed.
+            ddl: The CREATE statement text.
+            kind: Label used in the log line when the object already exists.
+
+        Returns:
+            True if the statement ran, False on ORA-00955; other errors propagate.
+        """
+        try:
+            cursor.execute(ddl)
+        except oracledb.Error as exc:
+            details = exc.args[0] if exc.args else None
+            # 955 == ORA-00955: name is already used by an existing object
+            if getattr(details, "code", None) != 955:
+                raise
+            logger.info(f"{kind} already exists, skipping CREATE")
+            return False
+        return True
+
+    # -------------------------------------------------------------------------
+    # NV-Ingest VDB Operations
+    def _check_index_exists(self, index_name: str) -> bool:
+        """Return self._table_exists(index_name): here the "index" is the collection table itself."""
+        return self._table_exists(index_name)
+
+    def create_index(self):
+        """Create this instance's collection via create_collection(), sized by config.embeddings.dimensions."""
+        logger.info(f"Creating Oracle collection if not exists: {self._collection_name}")
+        self.create_collection(
+            self._collection_name,
+            dimension=self.config.embeddings.dimensions,
+        )
+
+    def write_to_index(self, records: list, **kwargs) -> None:
+        """
+        Insert NV-Ingest records into the collection table in batches of 100.
+
+        Args:
+            records: Raw records; passed through nv_ingest_client's
+                cleanup_records() before insertion.
+            **kwargs: Accepted but not used by this implementation.
+        """
+        try:
+            from nv_ingest_client.util.milvus import cleanup_records, pandas_file_reader
+        except ImportError as e:
+            raise ImportError(
+                "nv_ingest_client is required for write_to_index operation. "
+                "Install with: pip install nvidia-rag[ingest]"
+            ) from e
+
+        def _to_bind_row(record) -> dict:
+            """Map one cleaned record onto the INSERT statement's bind names."""
+            vector = record.get("vector", [])
+            source = record.get("source", "")
+            if isinstance(source, dict):
+                source = json.dumps(source)
+            content_metadata = record.get("content_metadata", {})
+            if not isinstance(content_metadata, str):
+                content_metadata = json.dumps(content_metadata)
+            # Bind vector using TO_VECTOR with dense float32 format
+            packed = array("f", vector)
+            return {
+                "text": record.get("text", ""),
+                "vector": packed,
+                "source": source,
+                "content_metadata": content_metadata,
+            }
+
+        # A string meta_dataframe (or, failing that, csv_file_path) names a file to load
+        meta_dataframe = self.meta_dataframe
+        if meta_dataframe is None and self.csv_file_path is not None:
+            meta_dataframe = pandas_file_reader(self.csv_file_path)
+        elif isinstance(meta_dataframe, str):
+            meta_dataframe = pandas_file_reader(meta_dataframe)
+
+        cleaned = cleanup_records(
+            records=records,
+            meta_dataframe=meta_dataframe,
+            meta_source_field=self.meta_source_field,
+            meta_fields=self.meta_fields,
+        )
+        total = len(cleaned)
+        step = 100
+        written = 0
+
+        logger.info(f"Starting Oracle ingestion for {total} records...")
+
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                insert_sql = f"""
+                INSERT INTO "{self._collection_name}" (text, vector, source, content_metadata)
+                VALUES (:text, TO_VECTOR(:vector, {self.config.embeddings.dimensions}, FLOAT32), :source, :content_metadata)
+                """
+
+                for offset in range(0, total, step):
+                    chunk = cleaned[offset:offset + step]
+                    payload = [_to_bind_row(rec) for rec in chunk]
+
+                    cursor.executemany(insert_sql, payload)
+                    conn.commit()
+
+                    written += len(chunk)
+                    # Progress line every 5 full batches, plus one at completion
+                    if written % (5 * step) == 0 or written == total:
+                        logger.info(f"Ingested {written}/{total} records into {self._collection_name}")
+
+        logger.info(f"Oracle ingestion completed. Total: {written} records")
+
+    def retrieval(self, queries: list, **kwargs) -> list[dict[str, Any]]:
+        """Batch retrieval entry point; not supported by this backend.
+
+        Args:
+            queries: Unused.
+            **kwargs: Unused.
+
+        Raises:
+            NotImplementedError: Always. The message points callers to
+                retrieval_langchain instead.
+        """
+        raise NotImplementedError("Use retrieval_langchain for Oracle retrieval")
+
+    def reindex(self, records: list, **kwargs) -> None:
+        """Reindex entry point; not supported by this backend.
+
+        Args:
+            records: Unused.
+            **kwargs: Unused.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Reindex not implemented for Oracle")
+
+    def run(self, records: list) -> None:
+        """Run the synchronous ingestion pipeline.
+
+        Calls create_index() and then write_to_index(records), in that order.
+
+        Args:
+            records: Records handed unchanged to write_to_index.
+        """
+        self.create_index()
+        self.write_to_index(records)
+
+    def run_async(self, records: list | Future) -> list:
+        """Create the index, then write records that may arrive as a Future.
+
+        Args:
+            records: Either the records themselves or a Future whose
+                result() yields them.
+
+        Returns:
+            The records that were written (the Future's result when a Future
+            was passed).
+        """
+        logger.info(f"Creating index - {self._collection_name}")
+        self.create_index()
+
+        # Blocks here until the Future completes; create_index() has already
+        # been called at this point.
+        if isinstance(records, Future):
+            records = records.result()
+
+        logger.info(f"Writing to index - {self._collection_name}")
+        self.write_to_index(records)
+        return records
+
+    # -------------------------------------------------------------------------
+    # VDBRag Collection Management
+    async def check_health(self) -> dict[str, Any]:
+        """Probe the Oracle database and report its health.
+
+        Opens a connection, runs two lightweight queries and measures the
+        elapsed wall-clock time.
+
+        Returns:
+            A dict with ``service``, ``url``, ``status`` and ``error``. On
+            success ``status`` is the HEALTHY value and ``latency_ms`` and
+            ``tables`` (row count of ``user_tables``) are added. On any
+            exception ``status`` is the ERROR value and ``error`` holds the
+            exception text.
+        """
+        report = {
+            "service": "Oracle 26ai",
+            "url": self._oracle_cs,
+            "status": ServiceStatus.UNKNOWN.value,
+            "error": None,
+        }
+
+        try:
+            began = time.time()
+            with self._get_connection() as conn, conn.cursor() as cursor:
+                cursor.execute("SELECT 1 FROM DUAL")
+                cursor.execute("SELECT COUNT(*) FROM user_tables")
+                n_tables = cursor.fetchone()[0]
+        except Exception as e:
+            report["status"] = ServiceStatus.ERROR.value
+            report["error"] = str(e)
+        else:
+            # Only reached when every statement above succeeded.
+            report["status"] = ServiceStatus.HEALTHY.value
+            report["latency_ms"] = round((time.time() - began) * 1000, 2)
+            report["tables"] = n_tables
+
+        return report
+
+    def create_collection(
+        self,
+        collection_name: str,
+        dimension: int = 2048,
+        collection_type: str = "text",
+    ) -> None:
+        """Create a new vector collection table and its indexes.
+
+        Idempotent: existence checks AND ORA-00955 swallowing handle both
+        the case where the table exists (skip) and the race where another
+        caller created it between the check and the CREATE.
+
+        Both legacy v0.0.6 uppercase tables (BIOMEDICAL_DATASET) and v0.0.7+
+        case-preserved tables (Test_MixedCase) are detected via the
+        case-insensitive existence check, preventing ORA-00955 collisions
+        on upgrade.
+
+        Args:
+            collection_name: Table name to create; used verbatim.
+            dimension: Vector dimension passed to the table DDL builder.
+            collection_type: Accepted for interface compatibility; not used
+                by this method.
+
+        Raises:
+            ValueError: If collection_name is longer than 128 characters.
+        """
+        table_name = collection_name
+
+        # Validate length: Oracle's hard limit is 128 chars for identifiers.
+        # We accept up to 128; index names that would exceed get hashed in
+        # _derive_object_name, but the table name itself can't be hashed
+        # without losing user-visible identity.
+        if len(table_name) > 128:
+            raise ValueError(
+                f"Collection name exceeds Oracle's 128-character identifier limit "
+                f"(got {len(table_name)} chars): {table_name[:64]}..."
+            )
+
+        # Detect both case-preserved (v0.0.7+) and case-folded (v0.0.6 legacy
+        # or system tables created without quoting) existing tables.
+        if self._table_exists(table_name) or self._table_exists_unquoted(table_name):
+            logger.info(f"Collection {table_name} already exists, skipping CREATE")
+            return
+
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                # Create table (idempotent on ORA-00955 in case of race)
+                created = self._execute_ddl_idempotent(
+                    cursor,
+                    create_vector_table_ddl(table_name, dimension),
+                    kind=f"table {table_name}",
+                )
+                if created:
+                    logger.info(f"Created table {table_name}")
+
+                # Create vector index (idempotent). Failures are logged as
+                # warnings and do not abort collection creation.
+                try:
+                    vector_index_created = self._execute_ddl_idempotent(
+                        cursor,
+                        create_vector_index_ddl(
+                            table_name,
+                            index_type=self._index_type,
+                            distance_metric=self._distance_metric,
+                        ),
+                        kind=f"vector index for {table_name}",
+                    )
+                    # Mirror the table path: only claim creation when the
+                    # helper reports that the DDL actually created something.
+                    if vector_index_created:
+                        logger.info(f"Created {self._index_type} vector index on {table_name}")
+                except oracledb.Error as e:
+                    logger.warning(f"Could not create vector index: {e}")
+
+                # Create text index for hybrid search (idempotent)
+                if self._hybrid:
+                    try:
+                        text_index_created = self._execute_ddl_idempotent(
+                            cursor,
+                            create_text_index_ddl(table_name),
+                            kind=f"text index for {table_name}",
+                        )
+                        if text_index_created:
+                            logger.info(f"Created text index on {table_name}")
+                    except oracledb.Error as e:
+                        logger.warning(f"Could not create text index: {e}")
+
+                conn.commit()
+
+    def check_collection_exists(self, collection_name: str) -> bool:
+        """Report whether a collection table exists under either naming scheme.
+
+        The case-preserved lookup (v0.0.7+ quoted-DDL tables) is tried first;
+        the case-folded lookup (v0.0.6 legacy unquoted-DDL tables) is only
+        consulted when the first one finds nothing. Clients upgrading from
+        v0.0.6 can therefore still find their legacy collections regardless
+        of which case they query with.
+        """
+        quoted_hit = self._table_exists(collection_name)
+        if quoted_hit:
+            return quoted_hit
+        return self._table_exists_unquoted(collection_name)
+
+    def get_collection(self) -> list[dict[str, Any]]:
+        """List every non-system collection together with its stored metadata.
+
+        Ensures the metadata-schema and document-info tables exist first,
+        then reports each table returned by the all-collections query except
+        those named in SYSTEM_COLLECTIONS (compared case-insensitively).
+
+        Returns:
+            One dict per collection with ``collection_name``,
+            ``num_entities``, ``metadata_schema`` and ``collection_info``.
+            ``collection_info`` merges the collection-level info with the
+            catalog info; catalog keys win on conflict.
+        """
+        self.create_metadata_schema_collection()
+        self.create_document_info_collection()
+
+        # Upper-cased once so the per-table check below is a plain lookup.
+        reserved_names = {name.upper() for name in SYSTEM_COLLECTIONS}
+        results = []
+
+        with self._get_connection() as conn, conn.cursor() as cursor:
+            cursor.execute(get_all_collections_query())
+
+            for (table_name,) in cursor.fetchall():
+                if table_name.upper() in reserved_names:
+                    continue
+
+                # Row count of the collection table.
+                cursor.execute(get_count_query(table_name))
+                num_entities = cursor.fetchone()[0]
+
+                schema = self.get_metadata_schema(table_name)
+
+                # Collection-wide entries are stored under document name "NA".
+                catalog_info = self.get_document_info(
+                    info_type="catalog",
+                    collection_name=table_name,
+                    document_name="NA",
+                )
+                metrics_info = self.get_document_info(
+                    info_type="collection",
+                    collection_name=table_name,
+                    document_name="NA",
+                )
+
+                results.append({
+                    "collection_name": table_name,
+                    "num_entities": num_entities,
+                    "metadata_schema": schema,
+                    "collection_info": {**metrics_info, **catalog_info},
+                })
+
+        return results
+
+    def delete_collections(self, collection_names: list[str]) -> dict[str, Any]:
+        """Drop the given collection tables and clean up their metadata.
+
+        Each name is processed independently; a failure for one name does
+        not stop the others. A name appears in exactly one of ``successful``
+        or ``failed``: once the table has been dropped the collection counts
+        as deleted, and a failure of the follow-up metadata cleanup is only
+        logged.
+
+        Args:
+            collection_names: Table names to drop; used verbatim.
+
+        Returns:
+            A summary dict with ``message``, ``successful`` (list of names),
+            ``failed`` (list of ``{"collection_name", "error_message"}``
+            dicts), ``total_success`` and ``total_failed``.
+        """
+        deleted = []
+        failed = []
+
+        for name in collection_names:
+            table_name = name
+            try:
+                # NOTE(review): only the case-preserved lookup is used here,
+                # whereas check_collection_exists also consults
+                # _table_exists_unquoted - confirm legacy tables are meant to
+                # be reported as "not found" by this method.
+                if not self._table_exists(table_name):
+                    failed.append({
+                        "collection_name": name,
+                        "error_message": f"Collection {name} not found.",
+                    })
+                    continue
+
+                with self._get_connection() as conn:
+                    with conn.cursor() as cursor:
+                        cursor.execute(drop_table_ddl(table_name))
+                        conn.commit()
+            except Exception as e:
+                failed.append({
+                    "collection_name": name,
+                    "error_message": str(e),
+                })
+                logger.exception(f"Failed to delete collection {name}")
+                continue
+
+            deleted.append(name)
+            logger.info(f"Deleted collection: {name}")
+
+            # Clean up metadata. The table is already gone, so an error here
+            # must not also report the collection as failed.
+            try:
+                self._delete_collection_metadata(table_name)
+            except Exception as e:
+                logger.warning(
+                    f"Deleted collection {name} but failed to clean up its metadata: {e}"
+                )
+
+        return {
+            "message": "Collection deletion completed.",
+            "successful": deleted,
+            "failed": failed,
+            "total_success": len(deleted),
+            "total_failed": len(failed),
+        }
+
+    def _delete_collection_metadata(self, collection_name: str) -> None:
+        """Delete metadata and document info for a collection.
+
+        Runs two DELETE statements on one connection. Each is attempted
+        independently: an exception from either is logged as a warning and
+        does not prevent the other from running or the final commit.
+
+        Args:
+            collection_name: Value bound to the ``collection_name``
+                parameter of both statements.
+        """
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                # Remove the collection's metadata-schema row(s).
+                try:
+                    cursor.execute(
+                        get_delete_metadata_schema_query(),
+                        {"collection_name": collection_name},
+                    )
+                except Exception as e:
+                    logger.warning(f"Error deleting metadata schema: {e}")
+
+                # Remove the collection's document-info row(s).
+                try:
+                    cursor.execute(
+                        get_delete_document_info_by_collection_query(),
+                        {"collection_name": collection_name},
+                    )
+                except Exception as e:
+                    logger.warning(f"Error deleting document info: {e}")
+
+                conn.commit()
+
+    # -------------------------------------------------------------------------
+    # Document Management
+    def get_documents(self, collection_name: str) -> list[dict[str, Any]]:
+        """List the distinct source documents stored in a collection.
+
+        Args:
+            collection_name: Table to read; used verbatim.
+
+        Returns:
+            One dict per usable source row with keys ``document_name`` (the
+            basename of the source's ``source_name``), ``metadata`` (one
+            entry per field of the collection's metadata schema, ``None``
+            when the row has no value for it) and ``document_info``.
+            Rows whose source cannot be parsed or carries no ``source_name``
+            are skipped with a warning instead of aborting the whole listing.
+        """
+        table_name = collection_name
+        metadata_schema = self.get_metadata_schema(table_name)
+
+        documents = []
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute(get_unique_sources_query(table_name))
+                for row in cursor:
+                    # The source column may arrive as a LOB, a JSON string or
+                    # an already-decoded value; normalise it with the same
+                    # helper the document_info readers use.
+                    try:
+                        source = self._read_clob_json(row[0])
+                    except (TypeError, ValueError) as e:
+                        logger.warning(
+                            f"Skipping row with unparseable source in {table_name}: {e}"
+                        )
+                        continue
+
+                    source_name = source.get("source_name") if isinstance(source, dict) else None
+                    if not source_name or not isinstance(source_name, str):
+                        logger.warning(f"Skipping row without source_name in {table_name}")
+                        continue
+                    document_name = os.path.basename(source_name)
+
+                    # Unreadable content metadata degrades to "no metadata"
+                    # rather than failing the listing.
+                    try:
+                        content_metadata = self._read_clob_json(row[1]) or {}
+                    except (TypeError, ValueError) as e:
+                        logger.warning(
+                            f"Ignoring unparseable content metadata for {document_name}: {e}"
+                        )
+                        content_metadata = {}
+                    if not isinstance(content_metadata, dict):
+                        content_metadata = {}
+
+                    # Project the row's metadata onto the declared schema fields.
+                    metadata_dict = {}
+                    for item in metadata_schema:
+                        field_name = item.get("name")
+                        metadata_dict[field_name] = content_metadata.get(field_name)
+
+                    doc_info = self.get_document_info(
+                        info_type="document",
+                        collection_name=table_name,
+                        document_name=document_name,
+                    )
+
+                    documents.append({
+                        "document_name": document_name,
+                        "metadata": metadata_dict,
+                        "document_info": doc_info,
+                    })
+
+        return documents
+
+    def delete_documents(
+        self,
+        collection_name: str,
+        source_values: list[str],
+        result_dict: dict[str, list[str]] | None = None,
+    ) -> bool:
+        """Delete the rows of a collection that match the given source values.
+
+        Args:
+            collection_name: Table to delete from; used verbatim.
+            source_values: Values bound, one at a time, to the
+                ``source_value`` parameter of the delete statement.
+            result_dict: Optional out-parameter. When given, its ``deleted``
+                and ``not_found`` entries are reset and filled with document
+                basenames. A basename is reported as deleted when rows were
+                removed or when it was listed by get_documents beforehand.
+
+        Returns:
+            Always True. Per-document failures are logged and, when
+            ``result_dict`` is given, recorded under ``not_found``.
+        """
+        report_requested = result_dict is not None
+        known_names = set()
+
+        if report_requested:
+            result_dict["deleted"] = []
+            result_dict["not_found"] = []
+            # Snapshot of document names before deleting; best effort only.
+            try:
+                known_names = {
+                    doc.get("document_name", "")
+                    for doc in self.get_documents(collection_name)
+                }
+            except Exception as e:
+                logger.warning(f"Failed to check existing documents: {e}")
+
+        with self._get_connection() as conn, conn.cursor() as cursor:
+            for source_value in source_values:
+                basename = os.path.basename(source_value)
+                try:
+                    cursor.execute(
+                        get_delete_docs_query(collection_name),
+                        {"source_value": source_value},
+                    )
+                    rows_removed = cursor.rowcount
+
+                    if report_requested:
+                        if rows_removed > 0 or basename in known_names:
+                            outcome = "deleted"
+                        else:
+                            outcome = "not_found"
+                        result_dict[outcome].append(basename)
+                except Exception as e:
+                    logger.warning(f"Failed to delete {source_value}: {e}")
+                    if report_requested:
+                        result_dict["not_found"].append(basename)
+
+            # One commit covers every delete issued above.
+            conn.commit()
+
+        return True
+
+    # -------------------------------------------------------------------------
+    # Metadata Schema Management
+    def create_metadata_schema_collection(self) -> None:
+        """Create metadata schema table if not exists.
+
+        Uses unquoted lookup since the DDL itself is unquoted — Oracle stores
+        it as METADATA_SCHEMA. CREATE is idempotent (swallows ORA-00955).
+
+        After the first call that completes, later calls on the same object
+        return immediately without touching the database.
+        """
+        # Fast path: this object has already ensured the table exists.
+        if self._metadata_schema_initialized:
+            return
+
+        if not self._table_exists_unquoted(DEFAULT_METADATA_SCHEMA_COLLECTION):
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    created = self._execute_ddl_idempotent(
+                        cursor,
+                        create_metadata_schema_table_ddl(),
+                        kind=DEFAULT_METADATA_SCHEMA_COLLECTION,
+                    )
+                    conn.commit()
+                    # Log only when the DDL helper reports a real creation.
+                    if created:
+                        logger.info(f"Created {DEFAULT_METADATA_SCHEMA_COLLECTION} table")
+
+        self._metadata_schema_initialized = True
+
+    def add_metadata_schema(
+        self,
+        collection_name: str,
+        metadata_schema: list[dict[str, Any]],
+    ) -> None:
+        """Add or update metadata schema for a collection.
+
+        Implemented as delete-then-insert on one connection, followed by a
+        single commit.
+
+        Args:
+            collection_name: Key under which the schema is stored.
+            metadata_schema: Schema definition; stored as its
+                ``json.dumps`` serialisation.
+        """
+        table_name = collection_name
+
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                # Delete existing
+                cursor.execute(
+                    get_delete_metadata_schema_query(),
+                    {"collection_name": table_name},
+                )
+
+                # Insert new
+                cursor.execute(
+                    """
+                    INSERT INTO metadata_schema (collection_name, metadata_schema)
+                    VALUES (:collection_name, :metadata_schema)
+                    """,
+                    {
+                        "collection_name": table_name,
+                        "metadata_schema": json.dumps(metadata_schema),
+                    },
+                )
+                conn.commit()
+
+        logger.info(f"Added metadata schema for {table_name}")
+
+    def get_metadata_schema(self, collection_name: str) -> list[dict[str, Any]]:
+        """Get metadata schema for a collection.
+
+        The schema is written by add_metadata_schema as a JSON string, so the
+        stored value is decoded here: a LOB is read while the connection is
+        still open, and a string is parsed with ``json.loads``. A value that
+        the driver already returns decoded is passed through unchanged.
+
+        Args:
+            collection_name: Key the schema was stored under.
+
+        Returns:
+            The stored schema, or an empty list when no row exists or the
+            stored value is empty.
+
+        Raises:
+            json.JSONDecodeError: If the stored string is not valid JSON.
+        """
+        table_name = collection_name
+
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute(
+                    get_metadata_schema_query(),
+                    {"collection_name": table_name},
+                )
+                row = cursor.fetchone()
+                if row and row[0]:
+                    schema = row[0]
+                    # LOB values must be read before the connection closes.
+                    if hasattr(schema, "read"):
+                        schema = schema.read()
+                    if isinstance(schema, (str, bytes, bytearray)):
+                        schema = json.loads(schema)
+                    if schema:
+                        return schema
+
+        logger.info(f"No metadata schema found for {table_name}")
+        return []
+
+    # -------------------------------------------------------------------------
+    # Document Info Management
+    def create_document_info_collection(self) -> None:
+        """Create document info table if not exists.
+
+        Uses unquoted lookup since the DDL itself is unquoted — Oracle stores
+        it as DOCUMENT_INFO. CREATE is idempotent (swallows ORA-00955).
+
+        After the first call that completes, later calls on the same object
+        return immediately without touching the database.
+        """
+        # Fast path: this object has already ensured the table exists.
+        if self._document_info_initialized:
+            return
+
+        if not self._table_exists_unquoted(DEFAULT_DOCUMENT_INFO_COLLECTION):
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    created = self._execute_ddl_idempotent(
+                        cursor,
+                        create_document_info_table_ddl(),
+                        kind=DEFAULT_DOCUMENT_INFO_COLLECTION,
+                    )
+                    conn.commit()
+                    # Log only when the DDL helper reports a real creation.
+                    if created:
+                        logger.info(f"Created {DEFAULT_DOCUMENT_INFO_COLLECTION} table")
+
+        self._document_info_initialized = True
+
+    def add_document_info(
+        self,
+        info_type: str,
+        collection_name: str,
+        document_name: str,
+        info_value: dict[str, Any],
+    ) -> None:
+        """Add or replace one document_info entry.
+
+        The entry identified by (collection_name, document_name, info_type)
+        is deleted and re-inserted on one connection, followed by a single
+        commit.
+
+        Args:
+            info_type: Entry type. When it equals ``"collection"`` the new
+                value is first combined with the stored collection entry via
+                _get_aggregated_document_info.
+            collection_name: Collection the entry belongs to.
+            document_name: Document the entry belongs to.
+            info_value: Payload; stored as its ``json.dumps`` serialisation.
+        """
+        table_name = collection_name
+
+        # Aggregate collection info
+        if info_type == "collection":
+            info_value = self._get_aggregated_document_info(table_name, info_value)
+
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                # Delete existing
+                cursor.execute(
+                    get_delete_document_info_query(),
+                    {
+                        "collection_name": table_name,
+                        "document_name": document_name,
+                        "info_type": info_type,
+                    },
+                )
+
+                # Insert new
+                cursor.execute(
+                    """
+                    INSERT INTO document_info (collection_name, info_type, document_name, info_value)
+                    VALUES (:collection_name, :info_type, :document_name, :info_value)
+                    """,
+                    {
+                        "collection_name": table_name,
+                        "info_type": info_type,
+                        "document_name": document_name,
+                        "info_value": json.dumps(info_value),
+                    },
+                )
+                conn.commit()
+
+        logger.info(f"Added document info for {table_name}/{document_name}")
+
+    def set_collection_info(
+        self,
+        collection_name: str,
+        info_value: dict[str, Any],
+    ) -> None:
+        """Directly replace the collection-level document_info entry without aggregation.
+
+        Unlike add_document_info, this bypasses _get_aggregated_document_info so the
+        caller's pre-computed value is stored as-is.  Use this after document deletion
+        when the caller has already recalculated the correct aggregated state.
+
+        Args:
+            collection_name: Collection whose entry is replaced.
+            info_value: Payload; stored as its ``json.dumps`` serialisation.
+        """
+        table_name = collection_name
+
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                # Remove the current collection-level entry (document "NA").
+                cursor.execute(
+                    get_delete_document_info_query(),
+                    {
+                        "collection_name": table_name,
+                        "document_name": "NA",
+                        "info_type": "collection",
+                    },
+                )
+                # Store the caller's value under the same key.
+                cursor.execute(
+                    """
+                    INSERT INTO document_info (collection_name, info_type, document_name, info_value)
+                    VALUES (:collection_name, :info_type, :document_name, :info_value)
+                    """,
+                    {
+                        "collection_name": table_name,
+                        "info_type": "collection",
+                        "document_name": "NA",
+                        "info_value": json.dumps(info_value),
+                    },
+                )
+                conn.commit()
+
+        logger.info(f"Set collection info for {table_name}")
+
+    @staticmethod
+    def _read_clob_json(value) -> dict[str, Any]:
+        """Normalise a CLOB column value to a dict.
+
+        oracledb returns CLOB columns as LOB objects; read() converts them to
+        str, then json.loads converts to dict.  If the value is already a dict
+        (e.g. when Oracle returns native JSON), it is returned as-is.
+
+        Empty inputs never leak through as ``None``: ``None``, an empty or
+        whitespace-only string, and JSON ``null`` all yield ``{}``.
+
+        Args:
+            value: A LOB-like object exposing ``read()``, a JSON string, an
+                already-decoded value, or ``None``.
+
+        Returns:
+            The decoded value, or ``{}`` when the input is empty or falsy.
+
+        Raises:
+            json.JSONDecodeError: If a non-blank string is not valid JSON.
+        """
+        if hasattr(value, "read"):
+            value = value.read()
+        if isinstance(value, str):
+            # A blank string carries no JSON document; json.loads would raise.
+            value = json.loads(value) if value.strip() else None
+        return value or {}
+
+    def _get_aggregated_document_info(
+        self,
+        collection_name: str,
+        info_value: dict[str, Any],
+    ) -> dict[str, Any]:
+        """Combine a new collection-level info value with the stored one.
+
+        Best effort: any exception while reading or combining is logged as a
+        warning and the new value is returned unchanged.
+
+        Args:
+            collection_name: Collection whose stored entry is looked up.
+            info_value: Newly supplied collection-level info.
+
+        Returns:
+            The result of perform_document_info_aggregation(existing,
+            info_value) when a stored entry exists, otherwise ``info_value``.
+        """
+        try:
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(
+                        get_collection_document_info_query(),
+                        {"collection_name": collection_name, "info_type": "collection"},
+                    )
+                    row = cursor.fetchone()
+                    if row and row[0]:
+                        # Decode inside the connection scope so a LOB can be read.
+                        existing = self._read_clob_json(row[0])
+                        return perform_document_info_aggregation(existing, info_value)
+        except Exception as e:
+            logger.warning(f"Error getting aggregated info: {e}")
+
+        return info_value
+
+    def get_document_info(
+        self,
+        info_type: str,
+        collection_name: str,
+        document_name: str,
+    ) -> dict[str, Any]:
+        """Fetch one document_info entry.
+
+        Args:
+            info_type: Entry type bound to the query's ``info_type``.
+            collection_name: Collection the entry belongs to.
+            document_name: Document the entry belongs to.
+
+        Returns:
+            The stored value decoded by _read_clob_json, or ``{}`` when no
+            row is found or its first column is empty.
+        """
+        table_name = collection_name
+
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute(
+                    get_document_info_query(),
+                    {
+                        "collection_name": table_name,
+                        "document_name": document_name,
+                        "info_type": info_type,
+                    },
+                )
+                row = cursor.fetchone()
+                if row and row[0]:
+                    # Decode inside the connection scope so a LOB can be read.
+                    return self._read_clob_json(row[0])
+
+        return {}
+
+    def get_catalog_metadata(self, collection_name: str) -> dict[str, Any]:
+        """Get catalog metadata for a collection.
+
+        Thin wrapper over get_document_info with ``info_type="catalog"`` and
+        ``document_name="NA"``.
+
+        Args:
+            collection_name: Collection to look up.
+
+        Returns:
+            Whatever get_document_info returns for that key.
+        """
+        return self.get_document_info(
+            info_type="catalog",
+            collection_name=collection_name,
+            document_name="NA",
+        )
+
+    def update_catalog_metadata(
+        self,
+        collection_name: str,
+        updates: dict[str, Any],
+    ) -> None:
+        """Merge updates into a collection's catalog metadata and store it.
+
+        Keys in ``updates`` override the stored ones, and ``last_updated``
+        is always set to the current timestamp.
+
+        Args:
+            collection_name: Collection whose catalog entry is updated.
+            updates: Keys and values to overlay on the stored entry.
+        """
+        current = self.get_catalog_metadata(collection_name)
+
+        # Later entries win: stored values, then updates, then the timestamp.
+        new_value = {
+            **current,
+            **updates,
+            "last_updated": get_current_timestamp(),
+        }
+
+        self.add_document_info(
+            info_type="catalog",
+            collection_name=collection_name,
+            document_name="NA",
+            info_value=new_value,
+        )
+
+    def get_document_catalog_metadata(
+        self,
+        collection_name: str,
+        document_name: str,
+    ) -> dict[str, Any]:
+        """Get catalog metadata for a document.
+
+        Args:
+            collection_name: Collection the document belongs to.
+            document_name: Document to look up.
+
+        Returns:
+            A dict with exactly two keys taken from the document's stored
+            info: ``description`` (default ``""``) and ``tags`` (default
+            ``[]``).
+        """
+        doc_info = self.get_document_info(
+            info_type="document",
+            collection_name=collection_name,
+            document_name=document_name,
+        )
+        return {
+            "description": doc_info.get("description", ""),
+            "tags": doc_info.get("tags", []),
+        }
+
+    def update_document_catalog_metadata(
+        self,
+        collection_name: str,
+        document_name: str,
+        updates: dict[str, Any],
+    ) -> None:
+        """Overwrite a document's description and/or tags and store the result.
+
+        Only the ``description`` and ``tags`` keys of ``updates`` are
+        applied; every other key is ignored. All other stored fields of the
+        document's info are kept.
+
+        Args:
+            collection_name: Collection the document belongs to.
+            document_name: Document whose info is updated.
+            updates: New values; keys other than the two above are ignored.
+        """
+        doc_info = self.get_document_info(
+            info_type="document",
+            collection_name=collection_name,
+            document_name=document_name,
+        )
+
+        doc_info.update(
+            {field: updates[field] for field in ("description", "tags") if field in updates}
+        )
+
+        self.add_document_info(
+            info_type="document",
+            collection_name=collection_name,
+            document_name=document_name,
+            info_value=doc_info,
+        )
+
+    # -------------------------------------------------------------------------
+    # Retrieval Operations
+    def get_langchain_vectorstore(self, collection_name: str) -> OracleVS:
+        """Build a LangChain vector store bound to one collection table.
+
+        A new oracledb connection is opened on every call and handed to the
+        returned store; this method does not close it.
+
+        Args:
+            collection_name: Table the store reads from; used verbatim.
+
+        Returns:
+            An OracleVSCompat instance configured with this object's
+            embedding model and distance strategy.
+        """
+        table_name = collection_name
+
+        # Map distance metric
+        # The lookup is by exact key; any other value of
+        # self._distance_metric falls back to COSINE.
+        distance_map = {
+            "COSINE": DistanceStrategy.COSINE,
+            "L2": DistanceStrategy.EUCLIDEAN_DISTANCE,
+            "DOT": DistanceStrategy.DOT_PRODUCT,
+        }
+        distance_strategy = distance_map.get(self._distance_metric, DistanceStrategy.COSINE)
+
+        # Create connection for OracleVS
+        # NOTE(review): opened directly rather than through
+        # self._get_connection(), and not closed here - confirm who owns it.
+        conn = oracledb.connect(
+            user=self._oracle_user,
+            password=self._oracle_password,
+            dsn=self._oracle_cs
+        )
+
+        return OracleVSCompat(
+            client=conn,
+            embedding_function=self._embedding_model,
+            table_name=table_name,
+            distance_strategy=distance_strategy,
+        )
+
+    def retrieval_langchain(
+        self,
+        query: str,
+        collection_name: str,
+        top_k: int = 10,
+        filter_expr: str | list[dict[str, Any]] = "",
+        vectorstore: OracleVS | None = None,
+        otel_ctx: Any | None = None,
+    ) -> list[Document]:
+        """Perform semantic search and return documents.
+
+        Args:
+            query: Query text handed to the retriever.
+            collection_name: Collection to search; also stamped onto every
+                returned document's metadata.
+            top_k: Number of results requested from the retriever.
+            filter_expr: Accepted for interface compatibility; not used by
+                this method.
+            vectorstore: Store to search. When omitted, one is created via
+                get_langchain_vectorstore and its connection is closed again
+                before this method returns. A store passed in by the caller
+                is never closed here.
+            otel_ctx: Optional OpenTelemetry context attached for the
+                duration of the call.
+
+        Returns:
+            The retrieved documents, each with ``collection_name`` added to
+            its metadata.
+
+        Raises:
+            APIError: Wrapping any exception raised during retrieval, with
+                the SERVICE_UNAVAILABLE error code.
+        """
+        table_name = collection_name
+
+        logger.info(
+            "Oracle Retrieval: Retrieving from %s, search type: %s",
+            table_name,
+            "hybrid" if self._hybrid else "vector",
+        )
+
+        # Remember whether the store (and the connection behind it) was
+        # created here, so only that one is cleaned up afterwards.
+        owns_vectorstore = vectorstore is None
+        if owns_vectorstore:
+            vectorstore = self.get_langchain_vectorstore(collection_name)
+
+        token = otel_context.attach(otel_ctx) if otel_ctx is not None else None
+
+        try:
+            start_time = time.time()
+
+            logger.info("  [Embedding] Generating query embedding...")
+            retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
+
+            retriever_lambda = RunnableLambda(lambda x: retriever.invoke(x))
+            retriever_chain = {"context": retriever_lambda} | RunnableAssign(
+                {"context": lambda inp: inp["context"]}
+            )
+
+            logger.info("  [VDB Search] Performing vector similarity search...")
+            result = retriever_chain.invoke(query, config={"run_name": "retriever"})
+            docs = result.get("context", [])
+
+            latency = time.time() - start_time
+            logger.info("  [VDB Search] Retrieved %d documents in %.4fs", len(docs), latency)
+
+            return self._add_collection_name_to_retrieved_docs(docs, collection_name)
+
+        except Exception as e:
+            logger.exception("Error in retrieval_langchain: %s", e)
+            raise APIError(
+                f"Oracle retrieval failed: {e}",
+                ErrorCodeMapping.SERVICE_UNAVAILABLE,
+            ) from e
+        finally:
+            if token is not None:
+                otel_context.detach(token)
+            if owns_vectorstore:
+                # get_langchain_vectorstore passes the connection as `client`.
+                # NOTE(review): assumes the store exposes it under the same
+                # attribute name; when it does not, nothing is closed.
+                owned_conn = getattr(vectorstore, "client", None)
+                if owned_conn is not None:
+                    try:
+                        owned_conn.close()
+                    except Exception as close_err:
+                        logger.debug(
+                            "Ignoring error while closing Oracle connection: %s",
+                            close_err,
+                        )
+
+    @staticmethod
+    def _add_collection_name_to_retrieved_docs(
+        docs: list[Document],
+        collection_name: str,
+    ) -> list[Document]:
+        """Stamp each document's metadata with the collection it came from.
+
+        The documents are modified in place and the same list is returned.
+        """
+        tag = {"collection_name": collection_name}
+        for retrieved in docs:
+            retrieved.metadata.update(tag)
+        return docs
diff --git a/src/nvidia_rag/utils/vdb/vdb_ingest_base.py b/src/nvidia_rag/utils/vdb/vdb_ingest_base.py
index 190d69ad4..bbbb61911 100644
--- a/src/nvidia_rag/utils/vdb/vdb_ingest_base.py
+++ b/src/nvidia_rag/utils/vdb/vdb_ingest_base.py
@@ -14,7 +14,8 @@
 # limitations under the License.
 
 """
-This module provides VDBRagIngest, a VDBRag subclass with nv_ingest support.
+This module provides VDBRagIngest, a VDBRag subclass with nv_ingest support,
+and SerializedVDBWrapper for serializing concurrent VDB write operations.
 
 VDBRagIngest combines VDBRag (pure abstract base) with VDB from nv_ingest_client,
 providing full ingestion capabilities. This class should be used by ingestor_server
@@ -25,6 +26,7 @@
 """
 
 import logging
+import threading
 
 from nvidia_rag.utils.vdb.vdb_base import VDBRag
 
@@ -52,6 +54,40 @@ class VDBRagIngest(VDBRag, VDB):
 
         pass
 
+    class SerializedVDBWrapper:
+        """Wraps a VDB op to serialize write operations while keeping reads parallel.
+
+        When multiple batches run extraction concurrently, their VDB writes can
+        overlap and cause indexing timeouts (e.g., GPU_CAGRA JIT compilation takes
+        longer than the client's patience window). This wrapper uses a threading
+        lock to ensure only one batch writes to the VDB at a time.
+
+        ``run_async``, ``run``, ``write_to_index`` and ``create_index`` are
+        executed under the lock; every other attribute is forwarded to the
+        wrapped op without locking. The lock is not reentrant, so the wrapped
+        op must not call back into these four wrapper methods.
+        """
+
+        def __init__(self, vdb_op):
+            """Wrap ``vdb_op`` and create the lock that guards its writes."""
+            self._vdb_op = vdb_op
+            self._write_lock = threading.Lock()
+
+        def run_async(self, records):
+            """Call the wrapped ``run_async`` while holding the write lock."""
+            # NOTE(review): the lock is held for the whole wrapped call,
+            # including any time the wrapped op spends waiting on its input.
+            # Confirm that is intended.
+            with self._write_lock:
+                return self._vdb_op.run_async(records)
+
+        def run(self, records):
+            """Call the wrapped ``run`` while holding the write lock."""
+            with self._write_lock:
+                return self._vdb_op.run(records)
+
+        def write_to_index(self, records, **kwargs):
+            """Call the wrapped ``write_to_index`` while holding the write lock."""
+            with self._write_lock:
+                return self._vdb_op.write_to_index(records, **kwargs)
+
+        def create_index(self, **kwargs):
+            """Call the wrapped ``create_index`` while holding the write lock."""
+            with self._write_lock:
+                return self._vdb_op.create_index(**kwargs)
+
+        def __getattr__(self, name):
+            """Forward any attribute not defined on the wrapper to the wrapped op."""
+            # __getattr__ only runs when normal lookup fails. If the wrapper's
+            # own attributes are missing (instance built without __init__, as
+            # copy/pickle do), forwarding would look up _vdb_op again and
+            # recurse until RecursionError; fail with AttributeError instead.
+            if name in ("_vdb_op", "_write_lock"):
+                raise AttributeError(name)
+            return getattr(self._vdb_op, name)
+
+    VDB.register(SerializedVDBWrapper)
+
 except ImportError:
     logger.warning(
         "Optional nv_ingest_client module not installed. "
@@ -59,4 +95,4 @@ class VDBRagIngest(VDBRag, VDB):
     )
     # Fallback: VDBRagIngest is just VDBRag without nv_ingest support
     VDBRagIngest = VDBRag
-
+    SerializedVDBWrapper = None
diff --git a/tests/integration/notebook_test_config.yaml b/tests/integration/notebook_test_config.yaml
new file mode 100644
index 000000000..e0bcc3464
--- /dev/null
+++ b/tests/integration/notebook_test_config.yaml
@@ -0,0 +1,118 @@
+# NVIDIA RAG Configuration
+# This file contains configurable parameters with their values
+# You can override any of these values, and they take precedence over environment variables
+
+# Vector Store Configuration
+vector_store:
+  name: "milvus"  # Name of the vector store backend (e.g., milvus, elasticsearch)
+  url: "http://localhost:19530"  # URL endpoint for the vector store service
+  index_type: "GPU_CAGRA"  # Type of vector index (e.g., GPU_CAGRA, IVF_FLAT)
+  search_type: "dense"  # Type of search to perform (dense, hybrid)
+  enable_gpu_index: true  # Enable GPU acceleration for index building
+  enable_gpu_search: true  # Enable GPU acceleration for search operations
+  default_collection_name: "test_native"  # Default collection/index name for storing vectors
+
+# NV-Ingest Configuration
+nv_ingest:
+  message_client_hostname: "localhost"  # Hostname for NV-Ingest message client
+  message_client_port: 7670  # Port for NV-Ingest message client
+  extract_text: true  # Enable text extraction from documents
+  extract_infographics: false  # Enable infographic extraction from documents
+  extract_tables: true  # Enable table extraction from documents
+  extract_charts: true  # Enable chart extraction from documents
+  extract_images: false  # Enable image extraction from documents
+  pdf_extract_method: null  # Method to use for PDF extraction
+  text_depth: "page"  # Granularity level for text extraction (page, document)
+  chunk_size: 512  # Maximum size of text chunks in tokens
+  chunk_overlap: 150  # Number of overlapping tokens between chunks
+  caption_model_name: "nvidia/nemotron-nano-12b-v2-vl"  # Model name for generating image captions
+  caption_endpoint_url: "http://localhost:1977/v1/chat/completions"  # API endpoint for caption generation service
+  enable_pdf_splitter: true  # Enable PDF page splitting during ingestion
+
+# LLM Configuration
+llm:
+  server_url: "http://localhost:8999"  # URL endpoint for the LLM inference service (on-prem NIM default)
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Name of the language model to use for generation
+  # api_key: ""  # Optional: API key for LLM service (overrides NVIDIA_API_KEY environment variable)
+  parameters:
+    max_tokens: 32768  # Maximum number of tokens to generate in response
+    temperature: 0.0  # Sampling temperature for controlling randomness (0.0 = deterministic)
+    top_p: 1.0  # Nucleus sampling threshold for token selection
+
+# Query Rewriter Configuration
+query_rewriter:
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Model for rewriting user queries to improve retrieval
+  server_url: "localhost:8999"  # URL endpoint for query rewriter service
+  enable_query_rewriter: false  # Enable automatic query rewriting before retrieval
+  # api_key: ""  # Optional: API key for query rewriter (overrides NVIDIA_API_KEY environment variable)
+
+# Filter Expression Generator Configuration
+filter_expression_generator:
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Model for generating metadata filter expressions from queries
+  server_url: "localhost:8999"  # URL endpoint for filter expression generator service
+  enable_filter_generator: false  # Enable automatic filter expression generation from natural language
+  # api_key: ""  # Optional: API key for filter generator (overrides NVIDIA_API_KEY environment variable)
+
+# Embedding Configuration
+embeddings:
+  model_name: "nvidia/llama-nemotron-embed-1b-v2"  # Model for generating text embeddings
+  dimensions: 2048  # Dimensionality of the embedding vectors
+  server_url: "http://localhost:9080/v1"  # URL endpoint for embedding service (on-prem NIM default)
+  # api_key: ""  # Optional: API key for embeddings (overrides NVIDIA_API_KEY environment variable)
+
+# Ranking Configuration
+ranking:
+  model_name: "nvidia/llama-nemotron-rerank-1b-v2"  # Model for reranking retrieved documents
+  server_url: "http://localhost:1976"  # URL endpoint for reranking service (on-prem NIM default)
+  enable_reranker: true  # Enable reranking of retrieved documents before generation
+  # api_key: ""  # Optional: API key for reranking (overrides NVIDIA_API_KEY environment variable)
+
+# Retriever Configuration
+retriever:
+  top_k: 10  # Number of top documents to return after retrieval and reranking
+  vdb_top_k: 100  # Number of documents to retrieve from vector database before reranking
+  score_threshold: 0.25  # Minimum similarity score threshold for retrieved documents
+
+# Tracing Configuration
+tracing:
+  enabled: false  # Enable distributed tracing and metrics collection
+  otlp_http_endpoint: "http://localhost:4318/v1/traces"  # OpenTelemetry HTTP endpoint for traces
+  otlp_grpc_endpoint: "grpc://localhost:4317"  # OpenTelemetry gRPC endpoint for traces
+
+# Vision-Language Model Configuration
+vlm:
+  server_url: "http://localhost:1977/v1"  # URL endpoint for Vision-Language Model service
+  model_name: "nvidia/nemotron-nano-12b-v2-vl"  # Vision-Language Model for processing images and text
+  # api_key: ""  # Optional: API key for VLM service (overrides NVIDIA_API_KEY environment variable)
+
+# MinIO Configuration
+minio:
+  endpoint: "localhost:9010"  # MinIO object storage endpoint
+  access_key: "minioadmin"  # MinIO access key for authentication
+  secret_key: "minioadmin"  # MinIO secret key for authentication
+
+# Summarizer Configuration
+summarizer:
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Model for generating document summaries
+  server_url: "localhost:8999"  # URL endpoint for summarization service
+  max_chunk_length: 50000  # Maximum character length for chunks to summarize
+  chunk_overlap: 200  # Character overlap between chunks during summarization
+  temperature: 0.0  # Sampling temperature for summary generation
+  top_p: 1.0  # Nucleus sampling threshold for summary generation
+  # api_key: ""  # Optional: API key for summarization (overrides NVIDIA_API_KEY environment variable)
+
+# Reflection Configuration
+reflection:
+  enable_reflection: false  # Enable self-reflection to improve answer quality
+  max_loops: 3  # Maximum number of reflection iterations
+  model_name: "nvidia/llama-3.3-nemotron-super-49b-v1.5"  # Model for reflection and quality assessment
+  server_url: ""  # URL endpoint for reflection service
+  context_relevance_threshold: 1  # Minimum relevance score for context to be considered useful
+  response_groundedness_threshold: 1  # Minimum groundedness score for response to be considered factual
+  # api_key: ""  # Optional: API key for reflection (overrides NVIDIA_API_KEY environment variable)
+
+# Top-level Configuration Flags
+enable_guardrails: false  # Enable safety guardrails for input/output filtering
+enable_citations: true  # Include source citations in generated responses
+enable_vlm_inference: false  # Enable Vision-Language Model for multimodal queries
+temp_dir: "./tmp-data/"  # Temporary directory for file processing and storage
diff --git a/tests/integration/test_cases/library_usage.py b/tests/integration/test_cases/library_usage.py
index d67ef4942..c562201ca 100644
--- a/tests/integration/test_cases/library_usage.py
+++ b/tests/integration/test_cases/library_usage.py
@@ -49,8 +49,7 @@ def _get_config(self):
         """Get or create shared config object with common settings"""
         if self._config is None:
             from nvidia_rag.utils.configuration import NvidiaRAGConfig
-            
-            config_path = Path(__file__).parent.parent.parent.parent / "notebooks" / "config.yaml"
+            config_path = Path(__file__).parent.parent.parent.parent / "tests" / "integration" / "notebook_test_config.yaml"
             self._config = NvidiaRAGConfig.from_yaml(str(config_path))
             
             # Common configuration for all library tests
diff --git a/tests/integration/test_cases/multimodal_query.py b/tests/integration/test_cases/multimodal_query.py
index b888b8c7c..b38925211 100644
--- a/tests/integration/test_cases/multimodal_query.py
+++ b/tests/integration/test_cases/multimodal_query.py
@@ -95,7 +95,7 @@
 
     2. Deploy or upgrade the chart:
 
-        helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvstaging/blueprint/charts/nvidia-blueprint-rag-v2.4.0-rc1.tgz \\
+        helm upgrade --install rag -n rag https://helm.ngc.nvidia.com/nvidia/blueprint/charts/nvidia-blueprint-rag-v2.5.0-rc1.tgz \\
           --username '$oauthtoken' \\
           --password "${NGC_API_KEY}" \\
           --set imagePullSecret.password=$NGC_API_KEY \\
diff --git a/tests/unit/test_compose_helm_parity/env_parity_exemptions.yaml b/tests/unit/test_compose_helm_parity/env_parity_exemptions.yaml
index 80cea693c..ed9f45c06 100644
--- a/tests/unit/test_compose_helm_parity/env_parity_exemptions.yaml
+++ b/tests/unit/test_compose_helm_parity/env_parity_exemptions.yaml
@@ -53,8 +53,8 @@ ngcApiKeyPresenceExemptions:
   perService:
     nims.yaml:
       nim-llm: true
-      nemoretriever-embedding-ms: true
-      nemoretriever-ranking-ms: true
+      nemotron-embedding-ms: true
+      nemotron-ranking-ms: true
       vlm-ms: true
 
 
diff --git a/tests/unit/test_compose_helm_parity/test_compose_helm_parity.py b/tests/unit/test_compose_helm_parity/test_compose_helm_parity.py
index 713e50a6c..15a36bbbf 100644
--- a/tests/unit/test_compose_helm_parity/test_compose_helm_parity.py
+++ b/tests/unit/test_compose_helm_parity/test_compose_helm_parity.py
@@ -300,7 +300,7 @@ def test_compose_helm_image_and_env_parity():
                     "ngcAPIKey",
                 ],
             },
-            "nemoretriever-embedding-ms": {
+            "nemotron-embedding-ms": {
                 "values_image_repo_path": [
                     "nimOperator",
                     "nvidia-nim-llama-32-nv-embedqa-1b-v2",
@@ -320,7 +320,7 @@ def test_compose_helm_image_and_env_parity():
                     "ngcAPIKey",
                 ],
             },
-            "nemoretriever-ranking-ms": {
+            "nemotron-ranking-ms": {
                 "values_image_repo_path": [
                     "nimOperator",
                     "nvidia-nim-llama-32-nv-rerankqa-1b-v2",
diff --git a/tests/unit/test_metadata_validation/test_filter_validator.py b/tests/unit/test_metadata_validation/test_filter_validator.py
index 0bf174973..3f6b34759 100644
--- a/tests/unit/test_metadata_validation/test_filter_validator.py
+++ b/tests/unit/test_metadata_validation/test_filter_validator.py
@@ -3222,6 +3222,30 @@ def test_string_basic_operations(self, mock_config, string_schema):
         assert result["status"] is True
         assert "error_message" not in result
 
+    def test_filename_preserves_case_other_string_lowercased(self, mock_config):
+        """Filename filter preserves case; other string fields are lowercased for matching."""
+        schema = MetadataSchema(
+            schema=[
+                MetadataField(name="filename", type="string", required=False),
+                MetadataField(name="title", type="string", required=False),
+            ]
+        )
+        parser = FilterExpressionParser(schema, mock_config)
+
+        # Filename must preserve case (ingestion stores original case).
+        result = parser.process_filter_expression(
+            'content_metadata["filename"] == "Report.PDF"'
+        )
+        assert result["status"] is True
+        assert '"Report.PDF"' in result["processed_expression"]
+
+        # Other string fields are normalized to lowercase.
+        result = parser.process_filter_expression(
+            'content_metadata["title"] == "Technical"'
+        )
+        assert result["status"] is True
+        assert '"technical"' in result["processed_expression"]
+
     def test_string_like_operations(self, mock_config, string_schema):
         """Test string LIKE operations."""
         parser = FilterExpressionParser(string_schema, mock_config)
diff --git a/tests/unit/test_observability/test_langchain_callback_handler.py b/tests/unit/test_observability/test_langchain_callback_handler.py
index 054f5d18d..80511caab 100644
--- a/tests/unit/test_observability/test_langchain_callback_handler.py
+++ b/tests/unit/test_observability/test_langchain_callback_handler.py
@@ -6,6 +6,12 @@
 import pytest
 from langchain_core.messages import AIMessageChunk
 from langchain_core.outputs import Generation, LLMResult
+from opentelemetry.semconv_ai import (
+    LLMRequestTypeValues,
+    SpanAttributes,
+    SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY,
+    TraceloopSpanKindValues,
+)
 
 
 class SpanMock:
@@ -58,7 +64,9 @@ def handler():
 
 
 def test_on_chat_model_start_sets_input_words_and_prompts(handler):
-    from nvidia_rag.utils.observability.langchain_callback_handler import SpanAttributes
+    from nvidia_rag.utils.observability.langchain_callback_handler import (
+        GEN_AI_PROMPTS,
+    )
 
     run_id = uuid4()
     messages = [
@@ -78,44 +86,8 @@ def test_on_chat_model_start_sets_input_words_and_prompts(handler):
     assert handler.total_input_words == 4
     # span should be created and attributes recorded
     assert run_id in handler.spans
-    span = handler.spans[run_id].span
-    # Check that at least one prompt attribute key prefix was used
-    prompt_prefix = f"{SpanAttributes.LLM_PROMPTS}."
-    prompt_keys = [k for k, _ in span.attributes if k.startswith(prompt_prefix)]
-    assert len(prompt_keys) >= 2
-
-
-def test_on_llm_start_and_end_sets_token_usage_and_ends_span(handler):
-    run_id = uuid4()
 
-    handler.on_llm_start(
-        serialized={"kwargs": {"name": "llm"}},
-        prompts=["What is ML?"],
-        run_id=run_id,
-    )
 
-    # Build a minimal valid LLMResult
-    gen = Generation(
-        text="Answer",
-        generation_info={"finish_reason": "stop"},
-    )
-
-    llm_result = LLMResult(
-        generations=[[gen]],
-        llm_output={
-            "model_name": "test-model",
-            "usage": {"prompt_tokens": 5, "completion_tokens": 7, "total_tokens": 12},
-        },
-    )
-
-    handler.on_llm_end(response=llm_result, run_id=run_id)
-
-    span = handler.spans[run_id].span
-    # Verify span ended
-    assert span.ended is True
-    # Verify some token usage attributes were set from llm_output usage
-    attr_keys = [k for k, _ in span.attributes]
-    assert any("usage" in k.lower() for k in attr_keys)
 
 
 def test_on_chain_end_updates_avg_words_per_chunk(handler):
@@ -156,3 +128,76 @@ def test_on_chain_end_updates_llm_tokens(handler):
 
     # Expect update_llm_tokens called with input words from chat (3) and output words (2)
     assert handler.metrics.token_calls[-1] == (3, 2)
+
+
+# SpanAttributes from opentelemetry.semconv_ai still used in langchain_callback_handler.py.
+# Missing/deprecated ones (LLM_REQUEST_MODEL, LLM_RESPONSE_MODEL, LLM_REQUEST_MAX_TOKENS,
+# LLM_REQUEST_TEMPERATURE, LLM_REQUEST_TOP_P, LLM_SYSTEM) are hardcoded in the handler.
+SPAN_ATTRIBUTES_USED = [
+    "LLM_REQUEST_FUNCTIONS",
+    "LLM_USAGE_TOTAL_TOKENS",
+    "TRACELOOP_WORKFLOW_NAME",
+    "TRACELOOP_ENTITY_PATH",
+    "TRACELOOP_SPAN_KIND",
+    "TRACELOOP_ENTITY_NAME",
+    "LLM_REQUEST_TYPE",
+    "TRACELOOP_ENTITY_INPUT",
+    "TRACELOOP_ENTITY_OUTPUT",
+]
+
+
+def test_semconv_ai_span_attributes_exist_and_not_deprecated():
+    """Ensure all SpanAttributes used in langchain_callback_handler exist and are non-empty strings."""
+    for attr_name in SPAN_ATTRIBUTES_USED:
+        assert hasattr(
+            SpanAttributes, attr_name
+        ), f"SpanAttributes.{attr_name} is missing or was removed from opentelemetry.semconv_ai"
+        value = getattr(SpanAttributes, attr_name)
+        assert isinstance(
+            value, str
+        ), f"SpanAttributes.{attr_name} should be a string, got {type(value).__name__}"
+        assert (
+            len(value) > 0
+        ), f"SpanAttributes.{attr_name} is empty (possibly deprecated or placeholder)"
+
+
+def test_semconv_ai_llm_request_type_values_used():
+    """Ensure LLMRequestTypeValues used in the handler (CHAT, COMPLETION) exist."""
+    assert hasattr(LLMRequestTypeValues, "CHAT")
+    assert hasattr(LLMRequestTypeValues, "COMPLETION")
+    assert isinstance(LLMRequestTypeValues.CHAT.value, str)
+    assert isinstance(LLMRequestTypeValues.COMPLETION.value, str)
+
+
+def test_semconv_ai_traceloop_span_kind_values_used():
+    """Ensure TraceloopSpanKindValues used in the handler (WORKFLOW, TASK, TOOL) exist."""
+    for kind in ("WORKFLOW", "TASK", "TOOL"):
+        assert hasattr(
+            TraceloopSpanKindValues, kind
+        ), f"TraceloopSpanKindValues.{kind} is missing"
+        assert isinstance(getattr(TraceloopSpanKindValues, kind).value, str)
+
+
+def test_langchain_callback_handler_imports_and_constants():
+    """Import the handler module and verify hardcoded attribute constants and semconv_ai key."""
+    from nvidia_rag.utils.observability.langchain_callback_handler import (
+        GEN_AI_COMPLETIONS,
+        GEN_AI_PROMPTS,
+        LLM_REQUEST_MAX_TOKENS,
+        LLM_REQUEST_MODEL,
+        LLM_REQUEST_TEMPERATURE,
+        LLM_REQUEST_TOP_P,
+        LLM_RESPONSE_MODEL,
+        LLM_SYSTEM,
+    )
+
+    assert GEN_AI_PROMPTS == "gen_ai.prompt"
+    assert GEN_AI_COMPLETIONS == "gen_ai.completion"
+    assert LLM_REQUEST_MODEL == "gen_ai.request.model"
+    assert LLM_RESPONSE_MODEL == "gen_ai.response.model"
+    assert LLM_REQUEST_MAX_TOKENS == "llm.request.max_tokens"
+    assert LLM_REQUEST_TEMPERATURE == "llm.request.temperature"
+    assert LLM_REQUEST_TOP_P == "llm.request.top_p"
+    assert LLM_SYSTEM == "llm.system"
+    assert SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY is not None
+    assert isinstance(SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY, str)
diff --git a/tests/unit/test_rag_server/test_query_rewriting.py b/tests/unit/test_rag_server/test_query_rewriting.py
index 129d82124..78e010474 100644
--- a/tests/unit/test_rag_server/test_query_rewriting.py
+++ b/tests/unit/test_rag_server/test_query_rewriting.py
@@ -17,6 +17,8 @@
 Test suite for query rewriting functionality in the RAG server.
 """
 
+from unittest.mock import AsyncMock, patch
+
 import pytest
 
 
@@ -64,6 +66,7 @@ class DummyVDB:
     """A minimal VDB stub used via monkeypatch on __prepare_vdb_op."""
 
     last_query = None
+    last_retrieval_method = None
 
     def check_collection_exists(self, collection_name: str) -> bool:
         return True
@@ -77,6 +80,15 @@ def get_metadata_schema(self, collection_name: str):
     def retrieval_langchain(self, query, collection_name, vectorstore=None, top_k=None, filter_expr="", otel_ctx=None):
         """Sync method - called in ThreadPoolExecutor or directly."""
         DummyVDB.last_query = query
+        DummyVDB.last_retrieval_method = "langchain"
+        return []
+
+    def retrieval_image_langchain(
+        self, query, collection_name, vectorstore=None, top_k=None, reranker_top_k=None
+    ):
+        """Called when query contains images (multimodal)."""
+        DummyVDB.last_query = query
+        DummyVDB.last_retrieval_method = "image"
         return []
 
 
@@ -253,6 +265,43 @@ async def test_search_combines_history_when_multiturn_enabled(monkeypatch):
     assert fake_vdb.last_query == "What is RAG?. How does it work?"
 
 
+@pytest.mark.asyncio
+async def test_search_skips_query_rewriter_for_image_query(monkeypatch):
+    """When query is multimodal with image, query rewriting is skipped and retrieval_image_langchain is used."""
+    from nvidia_rag.rag_server.main import NvidiaRAG
+
+    monkeypatch.setenv("CONVERSATION_HISTORY", "5")
+    monkeypatch.setenv("ENABLE_REFLECTION", "false")
+
+    fake_vdb = DummyVDB()
+    rag = NvidiaRAG()
+    monkeypatch.setattr(NvidiaRAG, "_prepare_vdb_op", lambda self, **kw: fake_vdb)
+
+    multimodal_query = [
+        {"type": "text", "text": "What is in this image?"},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
+    ]
+    messages = [
+        {"role": "user", "content": "Previous question"},
+        {"role": "assistant", "content": "Previous answer"},
+    ]
+
+    await rag.search(
+        query=multimodal_query,
+        messages=messages,
+        collection_names=["test"],
+        enable_query_rewriting=True,
+        enable_reranker=False,
+        filter_expr="",
+    )
+
+    # Assert: query rewriting skipped - last_query is text + image URL (no "REWRITTEN(...)")
+    assert fake_vdb.last_query == "What is in this image? data:image/png;base64,x"
+    assert "REWRITTEN" not in str(fake_vdb.last_query)
+    # Assert: retrieval_image_langchain was used (not retrieval_langchain)
+    assert fake_vdb.last_retrieval_method == "image"
+
+
 @pytest.mark.asyncio
 async def test_generate_uses_query_rewriter_when_enabled(monkeypatch):
     """Test that query rewriting is used in generate when enabled with conversation history."""
@@ -319,6 +368,55 @@ async def test_generate_uses_only_current_query_when_history_disabled(monkeypatc
     assert fake_vdb.last_query == "How does it work?"
 
 
+@pytest.mark.asyncio
+async def test_generate_skips_query_rewriter_for_image_query(monkeypatch):
+    """When messages contain multimodal content with image, query rewriting is skipped."""
+    from nvidia_rag.rag_server.main import NvidiaRAG
+
+    monkeypatch.setenv("CONVERSATION_HISTORY", "5")
+    monkeypatch.setenv("ENABLE_REFLECTION", "false")
+    monkeypatch.setenv("MULTITURN_RETRIEVER_SIMPLE", "False")
+
+    fake_vdb = DummyVDB()
+    rag = NvidiaRAG()
+    monkeypatch.setattr(NvidiaRAG, "_prepare_vdb_op", lambda self, **kw: fake_vdb)
+
+    messages = [
+        {"role": "user", "content": "What is RAG?"},
+        {"role": "assistant", "content": "A retrieval-augmented framework."},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this image?"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
+            ],
+        },
+    ]
+
+    async def _stream(*a, **k):
+        yield "ok"
+
+    with patch("nvidia_rag.rag_server.main.VLM") as mock_vlm_class:
+        mock_vlm_instance = mock_vlm_class.return_value
+        mock_vlm_instance.stream_with_messages = _stream
+
+        stream = await rag.generate(
+            messages=messages,
+            use_knowledge_base=True,
+            collection_names=["test"],
+            enable_query_rewriting=True,
+            enable_reranker=False,
+            enable_vlm_inference=True,
+            filter_expr="",
+        )
+
+    # Assert: query rewriting skipped - last_query is text + image URL (no "REWRITTEN(...)")
+    assert fake_vdb.last_query == "What is in this image? data:image/png;base64,x"
+    assert "REWRITTEN" not in str(fake_vdb.last_query)
+    # Assert: retrieval_image_langchain was used
+    assert fake_vdb.last_retrieval_method == "image"
+
+
 @pytest.mark.asyncio
 async def test_generate_combines_history_when_multiturn_enabled(monkeypatch):
     """Test that when multiturn_retrieval_simple is True, history is concatenated."""
@@ -353,4 +451,3 @@ async def test_generate_combines_history_when_multiturn_enabled(monkeypatch):
     # last previous user query is combined with current retriever_query
     # Expected concatenation: "What is RAG?. How does it work?"
     assert fake_vdb.last_query == "What is RAG?. How does it work?"
-
diff --git a/tests/unit/test_rag_server/test_rag_main_advanced_features.py b/tests/unit/test_rag_server/test_rag_main_advanced_features.py
index 4331586a7..30f4ace20 100644
--- a/tests/unit/test_rag_server/test_rag_main_advanced_features.py
+++ b/tests/unit/test_rag_server/test_rag_main_advanced_features.py
@@ -1192,26 +1192,20 @@ def test_handle_prompt_processing_basic(self):
         assert user_message == [("user", "Test human prompt")]
 
     def test_handle_prompt_processing_with_nemotron_v1_model(self):
-        """Test prompt processing with Nemotron v1 model."""
+        """Test prompt processing with Nemotron v1 model uses the chat_template system prompt."""
         rag = NvidiaRAG()
 
         chat_history = []
 
-        # Mock instance attribute
-        mock_prompts = Mock()
         with patch.dict(os.environ, {"ENABLE_NEMOTRON_THINKING": "true"}):
-            mock_prompts.get.return_value = {
-                "system": "Test system prompt",
-                "human": "Test human prompt",
-            }
-
             result = rag._handle_prompt_processing(
                 chat_history, "llama-3.3-nemotron-super-49b-v1", "chat_template"
             )
 
             assert len(result) == 3
             system_message, conversation_history, user_message = result
-            assert system_message == [("system", "detailed thinking on")]
+            expected_system = rag.prompts.get("chat_template", {}).get("system", "")
+            assert system_message == [("system", expected_system)]
 
     def test_handle_prompt_processing_with_system_message_in_history(self):
         """Test prompt processing with system message in chat history."""
diff --git a/tests/unit/test_rag_server/test_rag_main_core_components.py b/tests/unit/test_rag_server/test_rag_main_core_components.py
index cc14c7c08..d4567399c 100644
--- a/tests/unit/test_rag_server/test_rag_main_core_components.py
+++ b/tests/unit/test_rag_server/test_rag_main_core_components.py
@@ -416,8 +416,8 @@ def test_build_retriever_query_from_multimodal_list(self):
         ]
 
         result = rag._build_retriever_query_from_content(content)
-        # When image_url is present, the method returns the image URL
-        assert result == ("http://example.com/image.jpg", True)
+        # Text parts joined with \n\n first, then image URL with space separator
+        assert result == ("Hello\n\nworld http://example.com/image.jpg", True)
 
     def test_build_retriever_query_from_list_without_text(self):
         """Test building retriever query from list without text items."""
@@ -428,6 +428,7 @@ def test_build_retriever_query_from_list_without_text(self):
         ]
 
         result = rag._build_retriever_query_from_content(content)
+        # Image-only: no text, so final query is just the image URL (no leading space)
         assert result == ("http://example.com/image.jpg", True)
 
     def test_build_retriever_query_from_other_type(self):
diff --git a/tests/unit/test_rag_server/test_rag_main_integration.py b/tests/unit/test_rag_server/test_rag_main_integration.py
index 3e3cd8be6..9de59b630 100644
--- a/tests/unit/test_rag_server/test_rag_main_integration.py
+++ b/tests/unit/test_rag_server/test_rag_main_integration.py
@@ -445,8 +445,8 @@ def test_build_retriever_query_from_content_multimodal(self):
         ]
 
         result = rag._build_retriever_query_from_content(content)
-        # When image_url is present, the method returns the image URL
-        assert result == ("http://example.com/image.jpg", True)
+        # Text parts joined with \n\n first, then image URL with space separator
+        assert result == ("Hello\n\nworld http://example.com/image.jpg", True)
 
     def test_print_conversation_history(self):
         """Test __print_conversation_history method."""
diff --git a/tests/unit/test_rag_server/test_self_reflection.py b/tests/unit/test_rag_server/test_self_reflection.py
index f7a6d54c6..6acf3b0e0 100644
--- a/tests/unit/test_rag_server/test_self_reflection.py
+++ b/tests/unit/test_rag_server/test_self_reflection.py
@@ -142,7 +142,7 @@ async def test_check_context_relevance(mocker):
         and structured prompts for consistent, reproducible reflection results.
     """
     # Set up a local ranker for reranking documents
-    local_ranker = get_ranking_model(model="nvidia/llama-3.2-nv-rerankqa-1b-v2", url="")
+    local_ranker = get_ranking_model(model="nvidia/llama-nemotron-rerank-1b-v2", url="")
 
     # Create a mock VDBRag object
     mock_vdb_op = mocker.MagicMock(spec=VDBRag)
diff --git a/tests/unit/test_utils/test_configuration.py b/tests/unit/test_utils/test_configuration.py
index 3525664b7..04c81ba9d 100644
--- a/tests/unit/test_utils/test_configuration.py
+++ b/tests/unit/test_utils/test_configuration.py
@@ -23,8 +23,6 @@
 
 import pytest
 import yaml
-from pydantic import SecretStr, ValidationError
-
 from nvidia_rag.utils.configuration import (
     EmbeddingConfig,
     FilterExpressionGeneratorConfig,
@@ -44,6 +42,7 @@
     VectorStoreConfig,
     VLMConfig,
 )
+from pydantic import SecretStr, ValidationError
 
 
 class TestVectorStoreConfig:
@@ -135,14 +134,16 @@ def test_get_model_parameters_default(self):
         config = LLMConfig()
         params = config.get_model_parameters()
 
-        # Default model contains "llama-3.3-nemotron-super-49b" so it triggers nemotron logic
         expected = {
             "min_tokens": 0,
             "ignore_eos": False,
             "max_tokens": 32768,
+            "enable_thinking": False,
+            "reasoning_budget": 0,
+            "low_effort": False,
             "min_thinking_tokens": 0,
             "max_thinking_tokens": 0,
-            "temperature": 0,
+            "temperature": 0.0,
             "top_p": 1.0,
         }
         assert params == expected
@@ -152,14 +153,16 @@ def test_get_model_parameters_generic(self):
         config = LLMConfig(model_name="meta/llama-3.1-8b-instruct")
         params = config.get_model_parameters()
 
-        # Generic model should use the base parameter values
         expected = {
             "min_tokens": 0,
             "ignore_eos": False,
             "max_tokens": 32768,
+            "enable_thinking": False,
+            "reasoning_budget": 0,
+            "low_effort": False,
             "min_thinking_tokens": 0,
             "max_thinking_tokens": 0,
-            "temperature": 0,
+            "temperature": 0.0,
             "top_p": 1.0,
         }
         assert params == expected
@@ -197,7 +200,7 @@ def test_default_values(self):
         """Test default configuration values."""
         config = EmbeddingConfig()
 
-        assert config.model_name == "nvidia/llama-3.2-nv-embedqa-1b-v2"
+        assert config.model_name == "nvidia/llama-nemotron-embed-1b-v2"
         assert config.model_engine == "nvidia-ai-endpoints"
         assert config.dimensions == 2048
         assert config.server_url == ""
@@ -210,7 +213,7 @@ def test_default_values(self):
         """Test default configuration values."""
         config = RankingConfig()
 
-        assert config.model_name == "nvidia/llama-3.2-nv-rerankqa-1b-v2"
+        assert config.model_name == "nvidia/llama-nemotron-rerank-1b-v2"
         assert config.model_engine == "nvidia-ai-endpoints"
         assert config.server_url == ""
         assert config.enable_reranker is True
diff --git a/tests/unit/test_utils/test_llm.py b/tests/unit/test_utils/test_llm.py
index f691edb9d..c59cabf43 100644
--- a/tests/unit/test_utils/test_llm.py
+++ b/tests/unit/test_utils/test_llm.py
@@ -240,7 +240,6 @@ def test_get_llm_nvidia_endpoints_with_url(self, mock_chatnvidia, mock_sanitize)
                 base_url="http://test-url:8000",
                 model="test-model",
                 api_key="test-api-key",
-                stop=[],
                 default_headers={"source": "rag-blueprint"},
                 temperature=0.7,
                 top_p=0.9,
@@ -272,7 +271,6 @@ def test_get_llm_nvidia_endpoints_api_catalog(self, mock_chatnvidia, mock_saniti
                 temperature=None,
                 top_p=None,
                 max_completion_tokens=None,
-                stop=[],
                 default_headers={"source": "rag-blueprint"},
             )
 
@@ -325,7 +323,6 @@ def test_get_llm_with_guardrails_success(
                     temperature=0.7,
                     top_p=None,
                     max_tokens=None,
-                    stop=[],
                 )
 
     @patch("requests.get")
@@ -419,7 +416,6 @@ def test_get_llm_none_parameters(self, mock_sanitize):
                     temperature=None,
                     top_p=None,
                     max_completion_tokens=None,
-                    stop=[],
                     default_headers={"source": "rag-blueprint"},
                     model_kwargs={"ignore_eos": False},
                 )
@@ -429,9 +425,10 @@ class TestStreamingFilterThink:
     """Test cases for streaming_filter_think function."""
 
     def create_mock_chunk(self, content):
-        """Helper to create mock chunk with content attribute."""
+        """Helper to create mock chunk with content and additional_kwargs (so 'in' works)."""
         chunk = Mock()
         chunk.content = content
+        chunk.additional_kwargs = {}
         return chunk
 
     def test_streaming_filter_think_no_tags(self):
@@ -683,7 +680,6 @@ def test_llm_creation_with_all_parameters(self, mock_chatnvidia):
                     base_url="http://test:8000",
                     model="meta/llama-3.1-8b-instruct",
                     api_key="test-api-key",
-                    stop=[],
                     default_headers={"source": "rag-blueprint"},
                     temperature=0.7,
                     top_p=0.9,
@@ -826,67 +822,83 @@ def test_streaming_filter_complete_workflow(self):
         assert result == expected
 
     def create_mock_chunk(self, content):
-        """Helper to create mock chunk with content attribute."""
+        """Helper to create mock chunk with content and additional_kwargs (so 'in' works)."""
         chunk = Mock()
         chunk.content = content
+        chunk.additional_kwargs = {}
         return chunk
 
 
-class TestThinkingBudgetNemotron3Nano30B:
-    """Tests for thinking budget behavior with nvidia/nemotron-3-nano-30b-a3b."""
+class TestBindReasoningConfigNemotron3Nano:
+    """Tests for _bind_reasoning_config with nemotron-3-nano models."""
 
-    @patch.dict(os.environ, {"ENABLE_NEMOTRON_3_NANO_THINKING": "true"})
-    def test_bind_thinking_tokens_for_nemotron_30b_maps_reasoning_budget(self):
-        """max_thinking_tokens for nemotron-3-nano-30b-a3b maps to reasoning_budget."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+    @patch.dict(os.environ, {"LLM_ENABLE_THINKING": "true"})
+    def test_bind_reasoning_config_nemotron_3_nano_with_budget(self):
+        """With enable_thinking and reasoning_budget set, nemotron-3-nano binds chat_template_kwargs.enable_thinking=True (the reasoning_budget binding is not asserted here)."""
+        from nvidia_rag.utils.llm import _bind_reasoning_config
 
         mock_llm = Mock()
-        bound_llm = _bind_thinking_tokens_if_configured(
+        mock_llm.bind.return_value = mock_llm  # chained .bind() calls keep returning the same mock
+        config = Mock()
+        config.llm.parameters.enable_thinking = True
+        config.llm.parameters.reasoning_budget = 8192
+        config.llm.parameters.low_effort = False
+        config.llm.parameters.min_thinking_tokens = 0
+        config.llm.parameters.max_thinking_tokens = 0
+
+        bound_llm = _bind_reasoning_config(  # NOTE(review): bound_llm is not read again in this test
             mock_llm,
+            config=config,
             model="nvidia/nemotron-3-nano-30b-a3b",
-            max_thinking_tokens=8192,
         )
 
-        mock_llm.bind.assert_called_once_with(
-            reasoning_budget=8192,
+        calls = mock_llm.bind.call_args_list  # every bind() invocation recorded on the mock
+        assert any(
+            call.kwargs.get("chat_template_kwargs", {}).get("enable_thinking") is True
+            for call in calls
         )
-        assert bound_llm is mock_llm.bind.return_value
 
-    def test_min_thinking_tokens_alone_raises_for_nemotron_30b(self):
-        """min_thinking_tokens alone raises ValueError for nemotron-3-nano-30b-a3b (max_thinking_tokens required)."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+    def test_bind_reasoning_config_unsupported_model_returns_original(self):
+        """Unsupported model returns original LLM without binding."""
+        from nvidia_rag.utils.llm import _bind_reasoning_config
 
         mock_llm = Mock()
-        with pytest.raises(ValueError, match="max_thinking_tokens must be a positive integer"):
-            _bind_thinking_tokens_if_configured(
-                mock_llm,
-                model="nvidia/nemotron-3-nano-30b-a3b",
-                min_thinking_tokens=1,
-            )
-
-    def test_thinking_tokens_unsupported_model_raises(self):
-        """Using thinking tokens with unsupported model raises ValueError."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+        config = Mock()
+        config.llm.parameters.enable_thinking = False
+        config.llm.parameters.reasoning_budget = 0
+        config.llm.parameters.low_effort = False
+        config.llm.parameters.min_thinking_tokens = 0
+        config.llm.parameters.max_thinking_tokens = 0
+
+        bound_llm = _bind_reasoning_config(
+            mock_llm,
+            config=config,
+            model="meta/llama-3.1-8b-instruct",
+        )
 
-        mock_llm = Mock()
-        with pytest.raises(ValueError):
-            _bind_thinking_tokens_if_configured(
-                mock_llm,
-                model="meta/llama-3.1-8b-instruct",
-                max_thinking_tokens=10,
-            )
+        mock_llm.bind.assert_not_called()
+        assert bound_llm is mock_llm
 
 
-class TestThinkingBudgetNemotronNano9B:
-    """Tests for thinking budget behavior with nvidia/nvidia-nemotron-nano-9b-v2."""
+class TestBindReasoningConfigNemotronNano9B:
+    """Tests for _bind_reasoning_config with nvidia/nvidia-nemotron-nano-9b-v2."""
 
-    def test_bind_thinking_tokens_for_nano_9b_binds_min_and_max(self):
+    def test_bind_reasoning_config_nano_9b_binds_min_and_max(self):
         """Both min_thinking_tokens and max_thinking_tokens bind for nano-9b."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+        from nvidia_rag.utils.llm import _bind_reasoning_config
 
         mock_llm = Mock()
-        bound_llm = _bind_thinking_tokens_if_configured(
+        mock_llm.bind.return_value = mock_llm
+        config = Mock()
+        config.llm.parameters.enable_thinking = False
+        config.llm.parameters.reasoning_budget = 0
+        config.llm.parameters.low_effort = False
+        config.llm.parameters.min_thinking_tokens = 1
+        config.llm.parameters.max_thinking_tokens = 8192
+
+        bound_llm = _bind_reasoning_config(
             mock_llm,
+            config=config,
             model="nvidia/nvidia-nemotron-nano-9b-v2",
             min_thinking_tokens=1,
             max_thinking_tokens=8192,
@@ -898,13 +910,21 @@ def test_bind_thinking_tokens_for_nano_9b_binds_min_and_max(self):
         )
         assert bound_llm is mock_llm.bind.return_value
 
-    def test_no_thinking_tokens_for_nano_9b_returns_original_llm(self):
+    def test_bind_reasoning_config_nano_9b_no_tokens_returns_original(self):
         """If no thinking tokens are provided, nano-9b returns original LLM."""
-        from nvidia_rag.utils.llm import _bind_thinking_tokens_if_configured
+        from nvidia_rag.utils.llm import _bind_reasoning_config
 
         mock_llm = Mock()
-        bound_llm = _bind_thinking_tokens_if_configured(
+        config = Mock()
+        config.llm.parameters.enable_thinking = False
+        config.llm.parameters.reasoning_budget = 0
+        config.llm.parameters.low_effort = False
+        config.llm.parameters.min_thinking_tokens = 0
+        config.llm.parameters.max_thinking_tokens = 0
+
+        bound_llm = _bind_reasoning_config(
             mock_llm,
+            config=config,
             model="nvidia/nvidia-nemotron-nano-9b-v2",
         )
 
diff --git a/tests/unit/test_utils/test_reranker.py b/tests/unit/test_utils/test_reranker.py
index f012a2a5c..ac60c67fe 100644
--- a/tests/unit/test_utils/test_reranker.py
+++ b/tests/unit/test_utils/test_reranker.py
@@ -66,11 +66,11 @@ def test_get_ranking_model_nvidia_endpoints_with_model_name(
         mock_nvidia_rerank.return_value = mock_reranker
 
         result = _get_ranking_model(
-            "nvidia/llama-3.2-nv-rerankqa-1b-v2", "", 10, config=mock_config
+            "nvidia/llama-nemotron-rerank-1b-v2", "", 10, config=mock_config
         )
 
         mock_nvidia_rerank.assert_called_once_with(
-            model="nvidia/llama-3.2-nv-rerankqa-1b-v2",
+            model="nvidia/llama-nemotron-rerank-1b-v2",
             api_key="test-api-key",
             top_n=10,
             truncate="END",
@@ -297,7 +297,7 @@ def test_complete_ranking_workflow_with_url(
 
             # Test the workflow
             model = get_ranking_model(
-                "nvidia/llama-3.2-nv-rerankqa-1b-v2", "rerank-service:8080", 10
+                "nvidia/llama-nemotron-rerank-1b-v2", "rerank-service:8080", 10
             )
 
             # Test that the model can be used
@@ -325,7 +325,7 @@ def test_complete_ranking_workflow_api_catalog(
             mock_get_model.return_value = mock_reranker
 
             # Test the workflow
-            model = get_ranking_model("nvidia/llama-3.2-nv-rerankqa-1b-v2", "", 5)
+            model = get_ranking_model("nvidia/llama-nemotron-rerank-1b-v2", "", 5)
 
             # Test that the model can be used
             documents = ["doc1", "doc2"]
diff --git a/tests/unit/test_utils/test_vdb/test_oracle_queries.py b/tests/unit/test_utils/test_vdb/test_oracle_queries.py
new file mode 100644
index 000000000..eb2e45705
--- /dev/null
+++ b/tests/unit/test_utils/test_vdb/test_oracle_queries.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for Oracle SQL query generation, focused on case preservation."""
+
+import pytest
+
+from nvidia_rag.utils.vdb.oracle.oracle_queries import (
+    check_table_exists_query,
+    create_text_index_ddl,
+    create_vector_index_ddl,
+    create_vector_table_ddl,
+    drop_table_ddl,
+    get_count_query,
+    get_delete_docs_query,
+    get_hybrid_search_query,
+    get_similarity_search_query,
+    get_unique_sources_query,
+)
+
+
+@pytest.mark.parametrize(
+    "name",
+    [
+        "Financial_Dataset",  # mixed case
+        "s_fdbe0520aaa1234",  # lowercase session UUID
+        "UPPERCASE_NAME",  # all uppercase (regression)
+        "MyCollection",  # PascalCase
+        "with-dashes",  # special chars valid in quoted identifiers
+    ],
+)
+class TestQuotedIdentifiers:
+    """Every DDL/DML builder must double-quote the table name so Oracle preserves its case; each test runs once per parametrized ``name``."""
+
+    def test_create_vector_table_quotes_name(self, name):
+        ddl = create_vector_table_ddl(name)
+        assert f'CREATE TABLE "{name}"' in ddl
+        # The unquoted form (which Oracle would case-fold) must NOT appear
+        assert f"CREATE TABLE {name} " not in ddl
+
+    def test_create_vector_index_quotes_both_index_and_table(self, name):
+        ddl = create_vector_index_ddl(name, index_type="IVF")  # IVF variant; HNSW is checked separately below
+        assert f'CREATE VECTOR INDEX "{name}_vec_idx"' in ddl
+        assert f'ON "{name}"(vector)' in ddl
+
+    def test_create_vector_index_hnsw_quotes_both(self, name):
+        ddl = create_vector_index_ddl(name, index_type="HNSW")  # same expectations as the IVF case
+        assert f'CREATE VECTOR INDEX "{name}_vec_idx"' in ddl
+        assert f'ON "{name}"(vector)' in ddl
+
+    def test_create_text_index_quotes_both(self, name):
+        ddl = create_text_index_ddl(name)
+        assert f'CREATE INDEX "{name}_text_idx"' in ddl
+        assert f'ON "{name}"(text)' in ddl
+
+    def test_drop_table_quotes_name(self, name):
+        assert drop_table_ddl(name) == f'DROP TABLE "{name}" CASCADE CONSTRAINTS PURGE'  # whole statement is pinned
+
+    def test_count_query_quotes_name(self, name):
+        assert get_count_query(name) == f'SELECT COUNT(*) as cnt FROM "{name}"'  # whole statement is pinned
+
+    def test_unique_sources_query_quotes_name(self, name):
+        sql = get_unique_sources_query(name)
+        assert f'FROM "{name}"' in sql
+
+    def test_delete_docs_query_quotes_name(self, name):
+        sql = get_delete_docs_query(name)
+        assert f'DELETE FROM "{name}"' in sql
+
+    def test_similarity_search_quotes_name(self, name):
+        sql = get_similarity_search_query(name, distance_metric="COSINE")
+        assert f'FROM "{name}"' in sql
+
+    def test_hybrid_search_quotes_name_in_both_ctes(self, name):
+        sql = get_hybrid_search_query(name)
+        # Both vector_results and text_results CTEs query the user table, so exactly two quoted FROMs are expected
+        assert sql.count(f'FROM "{name}"') == 2
+
+
+def test_check_table_exists_uses_exact_case_match():
+    """The table-existence query must compare against :table_name as bound, with no UPPER()
+    wrapper — quoted identifiers are case-sensitive, so the lookup needs the exact case."""
+    sql = check_table_exists_query()
+    assert "UPPER(:table_name)" not in sql
+    assert ":table_name" in sql  # implied by the stricter WHERE-clause check below
+    assert "WHERE table_name = :table_name" in sql
+
+
+class TestObjectNameDerivation:
+    """Names derived from a collection name plus a suffix must fit Oracle's 128-char limit."""
+
+    def test_short_name_unchanged(self):
+        from nvidia_rag.utils.vdb.oracle.oracle_queries import _derive_object_name
+
+        assert _derive_object_name("MyCollection", "_vec_idx") == "MyCollection_vec_idx"  # plain name + suffix
+
+    def test_short_lowercase_unchanged(self):
+        from nvidia_rag.utils.vdb.oracle.oracle_queries import _derive_object_name
+
+        assert _derive_object_name("s_fdbe123", "_text_idx") == "s_fdbe123_text_idx"  # case is not altered
+
+    def test_long_name_hashed(self):
+        """Over-long results are replaced by an "nvr_"-prefixed name (presumably hash-based) that keeps the suffix."""
+        from nvidia_rag.utils.vdb.oracle.oracle_queries import _derive_object_name
+
+        # 130-char name + 8-char suffix = 138 chars, so verbatim concatenation would exceed the limit
+        long_name = "C" * 130
+        derived = _derive_object_name(long_name, "_vec_idx")
+        assert len(derived) <= 128
+        assert derived.startswith("nvr_")
+        assert derived.endswith("_vec_idx")
+
+    def test_long_name_deterministic(self):
+        """The same input must always produce the same derived name (deterministic)."""
+        from nvidia_rag.utils.vdb.oracle.oracle_queries import _derive_object_name
+
+        long_name = "C" * 200
+        a = _derive_object_name(long_name, "_vec_idx")
+        b = _derive_object_name(long_name, "_vec_idx")
+        assert a == b
+
+    def test_long_name_collision_resistant(self):
+        """Two different long names must yield different derived names (single-pair sanity check)."""
+        from nvidia_rag.utils.vdb.oracle.oracle_queries import _derive_object_name
+
+        a = _derive_object_name("A" * 200, "_vec_idx")
+        b = _derive_object_name("B" * 200, "_vec_idx")
+        assert a != b
+
+
+def test_derived_index_used_in_long_name_ddl():
+    """When the table name is long, the emitted DDL uses the derived "nvr_" index name, not name + suffix."""
+    from nvidia_rag.utils.vdb.oracle.oracle_queries import (  # NOTE(review): both names are already imported at module level
+        create_text_index_ddl,
+        create_vector_index_ddl,
+    )
+
+    long_name = "X" * 200
+    ddl = create_vector_index_ddl(long_name, "IVF")
+    # Verbatim concatenation ("X" * 200 + "_vec_idx") must NOT appear
+    assert f"{long_name}_vec_idx" not in ddl
+    # The quoted, "nvr_"-prefixed derived name must appear instead
+    assert '"nvr_' in ddl
+
+    txt_ddl = create_text_index_ddl(long_name)
+    assert f"{long_name}_text_idx" not in txt_ddl
+    assert '"nvr_' in txt_ddl
diff --git a/tests/unit/test_utils/test_vdb/test_vdb_ingest_base.py b/tests/unit/test_utils/test_vdb/test_vdb_ingest_base.py
new file mode 100644
index 000000000..a13877d05
--- /dev/null
+++ b/tests/unit/test_utils/test_vdb/test_vdb_ingest_base.py
@@ -0,0 +1,129 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for SerializedVDBWrapper from vdb_ingest_base module."""
+
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from unittest.mock import MagicMock
+
+import pytest
+from nvidia_rag.utils.vdb.vdb_ingest_base import SerializedVDBWrapper
+
+
+@pytest.fixture
+def mock_vdb_op():
+    """Return a MagicMock VDB op whose methods each return a distinct sentinel string."""
+    op = MagicMock()
+    op.run_async.return_value = "run_async_result"
+    op.run.return_value = "run_result"
+    op.write_to_index.return_value = "write_result"
+    op.create_index.return_value = "index_result"
+    op.some_read_method.return_value = "read_result"  # stands in for a method the wrapper does not override
+    return op
+
+
+@pytest.fixture
+def wrapper(mock_vdb_op):
+    """Return a SerializedVDBWrapper wrapping the ``mock_vdb_op`` fixture."""
+    return SerializedVDBWrapper(mock_vdb_op)
+
+
+@pytest.mark.skipif(
+    SerializedVDBWrapper is None,  # presumably exported as None when nv_ingest_client is missing — TODO confirm
+    reason="nv_ingest_client not installed",
+)
+class TestSerializedVDBWrapper:
+    """Test cases for SerializedVDBWrapper: delegation, isinstance, serialization and error propagation."""
+
+    def test_run_async_delegates_to_wrapped_op(self, wrapper, mock_vdb_op):
+        """Test that run_async delegates to the wrapped VDB op."""
+        records = [{"data": "test"}]
+        result = wrapper.run_async(records)
+
+        mock_vdb_op.run_async.assert_called_once_with(records)
+        assert result == "run_async_result"
+
+    def test_run_delegates_to_wrapped_op(self, wrapper, mock_vdb_op):
+        """Test that run delegates to the wrapped VDB op."""
+        records = [{"data": "test"}]
+        result = wrapper.run(records)
+
+        mock_vdb_op.run.assert_called_once_with(records)
+        assert result == "run_result"
+
+    def test_write_to_index_delegates_with_kwargs(self, wrapper, mock_vdb_op):
+        """Test that write_to_index passes kwargs to the wrapped VDB op."""
+        records = [{"data": "test"}]
+        result = wrapper.write_to_index(records, collection_name="test")
+
+        mock_vdb_op.write_to_index.assert_called_once_with(
+            records, collection_name="test"
+        )
+        assert result == "write_result"
+
+    def test_create_index_delegates_with_kwargs(self, wrapper, mock_vdb_op):
+        """Test that create_index passes kwargs to the wrapped VDB op."""
+        result = wrapper.create_index(collection_name="test")
+
+        mock_vdb_op.create_index.assert_called_once_with(collection_name="test")
+        assert result == "index_result"
+
+    def test_getattr_delegates_non_overridden_methods(self, wrapper, mock_vdb_op):
+        """Test that non-write methods pass through to the wrapped VDB op."""
+        result = wrapper.some_read_method()
+        mock_vdb_op.some_read_method.assert_called_once()
+        assert result == "read_result"
+
+    def test_isinstance_check_with_vdb(self, wrapper):
+        """Test that wrapper passes isinstance check for VDB (was a real bug)."""
+        from nv_ingest_client.util.vdb.adt_vdb import VDB
+
+        assert isinstance(wrapper, VDB)
+
+    def test_write_methods_are_serialized(self, mock_vdb_op):
+        """Test that overlapping run() calls on one wrapper execute one after another."""
+        execution_log = []
+        lock_held = threading.Event()  # set once batch_1 is executing inside the wrapped run()
+
+        def locked_write(records):
+            batch = records[0]
+            execution_log.append(f"acquired_{batch}")
+            if batch == "batch_1":
+                lock_held.set()  # let the main thread submit batch_2 now
+                threading.Event().wait(0.1)  # ~0.1s pause (a fresh Event is never set) so an unserialized batch_2 would interleave
+            execution_log.append(f"released_{batch}")
+            return "done"
+
+        mock_vdb_op.run.side_effect = locked_write
+        wrapper = SerializedVDBWrapper(mock_vdb_op)
+
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            f1 = executor.submit(wrapper.run, ["batch_1"])
+            lock_held.wait(timeout=2)  # submit batch_2 only after batch_1 has started (or after 2s)
+            f2 = executor.submit(wrapper.run, ["batch_2"])
+            f1.result(timeout=5)
+            f2.result(timeout=5)
+
+        assert execution_log.index("released_batch_1") < execution_log.index(  # batch_1 must finish before batch_2 begins
+            "acquired_batch_2"
+        )
+
+    def test_wrapper_propagates_exceptions(self, wrapper, mock_vdb_op):
+        """Test that exceptions from the wrapped op propagate to the caller of the wrapper."""
+        mock_vdb_op.run.side_effect = ValueError("indexing failed")
+
+        with pytest.raises(ValueError, match="indexing failed"):
+            wrapper.run([{"data": "test"}])
diff --git a/uv.lock b/uv.lock
index dbbf310f2..a1d8a8973 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.11, <3.14"
 resolution-markers = [
     "python_full_version >= '3.13'",
@@ -804,7 +804,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/cb/48e964c452ca2b92175a9b2dca037a553036cb053ba69e284650ce755f13/greenlet-3.3.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e29f3018580e8412d6aaf5641bb7745d38c85228dacf51a73bd4e26ddf2a6a8e", size = 274908, upload-time = "2025-12-04T14:23:26.435Z" },
     { url = "https://files.pythonhosted.org/packages/28/da/38d7bff4d0277b594ec557f479d65272a893f1f2a716cad91efeb8680953/greenlet-3.3.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a687205fb22794e838f947e2194c0566d3812966b41c78709554aa883183fb62", size = 577113, upload-time = "2025-12-04T14:50:05.493Z" },
     { url = "https://files.pythonhosted.org/packages/3c/f2/89c5eb0faddc3ff014f1c04467d67dee0d1d334ab81fadbf3744847f8a8a/greenlet-3.3.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4243050a88ba61842186cb9e63c7dfa677ec146160b0efd73b855a3d9c7fcf32", size = 590338, upload-time = "2025-12-04T14:57:41.136Z" },
-    { url = "https://files.pythonhosted.org/packages/80/d7/db0a5085035d05134f8c089643da2b44cc9b80647c39e93129c5ef170d8f/greenlet-3.3.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:670d0f94cd302d81796e37299bcd04b95d62403883b24225c6b5271466612f45", size = 601098, upload-time = "2025-12-04T15:07:11.898Z" },
     { url = "https://files.pythonhosted.org/packages/dc/a6/e959a127b630a58e23529972dbc868c107f9d583b5a9f878fb858c46bc1a/greenlet-3.3.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6cb3a8ec3db4a3b0eb8a3c25436c2d49e3505821802074969db017b87bc6a948", size = 590206, upload-time = "2025-12-04T14:26:01.254Z" },
     { url = "https://files.pythonhosted.org/packages/48/60/29035719feb91798693023608447283b266b12efc576ed013dd9442364bb/greenlet-3.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2de5a0b09eab81fc6a382791b995b1ccf2b172a9fec934747a7a23d2ff291794", size = 1550668, upload-time = "2025-12-04T15:04:22.439Z" },
     { url = "https://files.pythonhosted.org/packages/0a/5f/783a23754b691bfa86bd72c3033aa107490deac9b2ef190837b860996c9f/greenlet-3.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4449a736606bd30f27f8e1ff4678ee193bc47f6ca810d705981cfffd6ce0d8c5", size = 1615483, upload-time = "2025-12-04T14:27:28.083Z" },
@@ -812,7 +811,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" },
     { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" },
     { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" },
-    { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" },
     { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" },
     { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" },
     { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" },
@@ -820,7 +818,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" },
     { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" },
     { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" },
-    { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" },
     { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" },
     { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" },
     { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" },
@@ -1309,16 +1306,17 @@ wheels = [
 
 [[package]]
 name = "langchain-nvidia-ai-endpoints"
-version = "1.0.3"
+version = "1.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohttp" },
     { name = "filetype" },
     { name = "langchain-core" },
+    { name = "requests" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5a/9e/30814da280f7a79b168f83180f6a0396c166f86a566e56bb9877bf562611/langchain_nvidia_ai_endpoints-1.0.3.tar.gz", hash = "sha256:11c48fd24e4a9d4c86c65bcef943400f4e709497c93254c7dc97c43f68c2be89", size = 46526, upload-time = "2026-01-28T22:04:33.93Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/4b/e417af1b2b7f861f37e26bf4fa4b05cda4052002e3f84a966f0735baf94f/langchain_nvidia_ai_endpoints-1.2.0.tar.gz", hash = "sha256:4bd63b812707ea348a86539001aa9a89b3cba3ee56ade7379247a955e4bfd3eb", size = 53851, upload-time = "2026-03-10T17:55:08.127Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/67/04/c83f61106a245b74de11c1e075c1cc1e70462ece1dd9fc0584ad992a776d/langchain_nvidia_ai_endpoints-1.0.3-py3-none-any.whl", hash = "sha256:e5f170ad0a335637298bb90fb3df119793821e316355f61ab82f0106913eebbf", size = 50130, upload-time = "2026-01-28T22:04:33.065Z" },
+    { url = "https://files.pythonhosted.org/packages/66/e4/186f1a99e4d30bd91c8438d024dc73a71c8f7e0657c7acb6e79658aa19cf/langchain_nvidia_ai_endpoints-1.2.0-py3-none-any.whl", hash = "sha256:c8e075d5b3d31216374af0cfa9e690ab28ada3ebbde34dd6d36fe16a26d883cc", size = 58269, upload-time = "2026-03-10T17:55:06.339Z" },
 ]
 
 [[package]]
@@ -1349,7 +1347,7 @@ wheels = [
 
 [[package]]
 name = "langgraph"
-version = "1.0.7"
+version = "1.0.10"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain-core" },
@@ -1359,9 +1357,9 @@ dependencies = [
     { name = "pydantic" },
     { name = "xxhash" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/72/5b/f72655717c04e33d3b62f21b166dc063d192b53980e9e3be0e2a117f1c9f/langgraph-1.0.7.tar.gz", hash = "sha256:0cfdfee51e6e8cfe503ecc7367c73933437c505b03fa10a85c710975c8182d9a", size = 497098, upload-time = "2026-01-22T16:57:47.303Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/55/92/14df6fefba28c10caf1cb05aa5b8c7bf005838fe32a86d903b6c7cc4018d/langgraph-1.0.10.tar.gz", hash = "sha256:73bd10ee14a8020f31ef07e9cd4c1a70c35cc07b9c2b9cd637509a10d9d51e29", size = 511644, upload-time = "2026-02-27T21:04:38.743Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7e/0e/fe80144e3e4048e5d19ccdb91ac547c1a7dc3da8dbd1443e210048194c14/langgraph-1.0.7-py3-none-any.whl", hash = "sha256:9d68e8f8dd8f3de2fec45f9a06de05766d9b075b78fb03171779893b7a52c4d2", size = 157353, upload-time = "2026-01-22T16:57:45.997Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/60/260e0c04620a37ba8916b712766c341cc5fc685dabc6948c899494bbc2ae/langgraph-1.0.10-py3-none-any.whl", hash = "sha256:7c298bef4f6ea292fcf9824d6088fe41a6727e2904ad6066f240c4095af12247", size = 160920, upload-time = "2026-02-27T21:04:35.932Z" },
 ]
 
 [[package]]
@@ -1379,15 +1377,15 @@ wheels = [
 
 [[package]]
 name = "langgraph-prebuilt"
-version = "1.0.7"
+version = "1.0.8"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "langchain-core" },
     { name = "langgraph-checkpoint" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a7/59/711aecd1a50999456850dc328f3cad72b4372d8218838d8d5326f80cb76f/langgraph_prebuilt-1.0.7.tar.gz", hash = "sha256:38e097e06de810de4d0e028ffc0e432bb56d1fb417620fb1dfdc76c5e03e4bf9", size = 163692, upload-time = "2026-01-22T16:45:22.801Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0d/06/dd61a5c2dce009d1b03b1d56f2a85b3127659fdddf5b3be5d8f1d60820fb/langgraph_prebuilt-1.0.8.tar.gz", hash = "sha256:0cd3cf5473ced8a6cd687cc5294e08d3de57529d8dd14fdc6ae4899549efcf69", size = 164442, upload-time = "2026-02-19T18:14:39.083Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/47/49/5e37abb3f38a17a3487634abc2a5da87c208cc1d14577eb8d7184b25c886/langgraph_prebuilt-1.0.7-py3-none-any.whl", hash = "sha256:e14923516504405bb5edc3977085bc9622c35476b50c1808544490e13871fe7c", size = 35324, upload-time = "2026-01-22T16:45:21.784Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/41/ec966424ad3f2ed3996d24079d3342c8cd6c0bd0653c12b2a917a685ec6c/langgraph_prebuilt-1.0.8-py3-none-any.whl", hash = "sha256:d16a731e591ba4470f3e313a319c7eee7dbc40895bcf15c821f985a3522a7ce0", size = 35648, upload-time = "2026-02-19T18:14:37.611Z" },
 ]
 
 [[package]]
@@ -1753,7 +1751,7 @@ wheels = [
 
 [[package]]
 name = "nv-ingest-api"
-version = "26.1.1"
+version = "26.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "backoff" },
@@ -1767,14 +1765,14 @@ dependencies = [
     { name = "tritonclient" },
     { name = "universal-pathlib" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/41/eb/e0469e918d617175e1d3bcf952f0ca8e9b7756fce7817d5386ac4ddca154/nv_ingest_api-26.1.1.tar.gz", hash = "sha256:063d51f1d560bf03d7a595ff3ecebac1bffae45607cf6bd01e4fa8ca2265a884", size = 259532, upload-time = "2026-01-13T23:44:11.112Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f3/bd/e6e885cca94b89723468d4c32d52d30ccc0235ebe2f1db33b0605402d6b8/nv_ingest_api-26.1.2.tar.gz", hash = "sha256:fea08f9bda064938a5876f1610ef0b92c6a1e4943130c564f329b0c87efa3daf", size = 259604, upload-time = "2026-01-21T14:06:27.092Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/42/51/cd93750a1c5797d8d12e843bb13645b0930e4490f96ca49f458aeb641018/nv_ingest_api-26.1.1-py3-none-any.whl", hash = "sha256:e4f8b860765cedba72622782692e2ffc69a100fd61956e8b8a81a47e6c852d66", size = 357481, upload-time = "2026-01-13T23:44:07.943Z" },
+    { url = "https://files.pythonhosted.org/packages/78/66/21e30e658578b7e5ab30857b99e9a0a5c91728ffdca13dadc3d3dba58b98/nv_ingest_api-26.1.2-py3-none-any.whl", hash = "sha256:8e7539a6b7d52afd821c0030e3197cfffddc011d26a8b093cf7b5ffa8addf02d", size = 357537, upload-time = "2026-01-21T14:06:24.321Z" },
 ]
 
 [[package]]
 name = "nv-ingest-client"
-version = "26.1.1"
+version = "26.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "build" },
@@ -1789,14 +1787,14 @@ dependencies = [
     { name = "setuptools" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/cb/c4/ae5e2b00a8ffdfc1a3cf660ded68c188140ca433b22446adbb72ccfc455d/nv_ingest_client-26.1.1.tar.gz", hash = "sha256:26d6844eac946b4fdb8da2f5f1e77feb22b52ac21e6d772cf6b1c8c21cef4bb8", size = 126865, upload-time = "2026-01-13T23:44:15.615Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/62/386bc4a336b91df9c65afc18d905bb1fe3dd44ef1ce038895a701cba6035/nv_ingest_client-26.1.2.tar.gz", hash = "sha256:7ea4a35d4e7051031c273eb2b15170a0555462b702602e5c4fdce947bd39d446", size = 126061, upload-time = "2026-01-21T14:06:31.836Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/64/ea/042ad6d8ddfa887667159af8b473835bfcc5b9f51ba6e82ff14078e59a3e/nv_ingest_client-26.1.1-py3-none-any.whl", hash = "sha256:df9906c7021e6a1ae64140fbe7a345679b1cfad7dfa486442e38ca75d64f2b39", size = 147197, upload-time = "2026-01-13T23:44:12.761Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/a0/6f082f42ba1ba4e3de751deee98099114405489389d82b840024fa26512e/nv_ingest_client-26.1.2-py3-none-any.whl", hash = "sha256:49ec81c2e2470509fc527d778d886caebe05a7f3012918eef2fd67922ea9b8b4", size = 146091, upload-time = "2026-01-21T14:06:28.586Z" },
 ]
 
 [[package]]
 name = "nvidia-rag"
-version = "2.4.0.dev0"
+version = "2.5.0.dev0"
 source = { virtual = "." }
 dependencies = [
     { name = "anyio" },
@@ -1811,7 +1809,7 @@ dependencies = [
     { name = "langchain-milvus" },
     { name = "langchain-nvidia-ai-endpoints" },
     { name = "lark" },
-    { name = "minio" },
+    { name = "oracledb" },
     { name = "pdfplumber" },
     { name = "protobuf" },
     { name = "pydantic" },
@@ -1840,6 +1838,7 @@ all = [
     { name = "opentelemetry-instrumentation-milvus" },
     { name = "opentelemetry-processor-baggage" },
     { name = "opentelemetry-sdk" },
+    { name = "oracledb" },
     { name = "overrides" },
     { name = "pyarrow" },
     { name = "tqdm" },
@@ -1868,6 +1867,13 @@ ingest = [
     { name = "tqdm" },
     { name = "tritonclient" },
 ]
+minio = [
+    { name = "minio" },
+]
+oracle = [
+    { name = "langchain-community" },
+    { name = "oracledb" },
+]
 rag = [
     { name = "azure-core" },
     { name = "azure-storage-blob" },
@@ -1914,20 +1920,21 @@ requires-dist = [
     { name = "httpx-sse", specifier = ">=0.4.3" },
     { name = "langchain", specifier = ">=1.2.7" },
     { name = "langchain-community", specifier = ">=0.4" },
+    { name = "langchain-community", marker = "extra == 'oracle'", specifier = ">=0.4" },
     { name = "langchain-core", specifier = ">=1.0.0" },
     { name = "langchain-elasticsearch", marker = "extra == 'all'", specifier = ">=0.3" },
     { name = "langchain-elasticsearch", marker = "extra == 'elasticsearch'", specifier = ">=0.3" },
     { name = "langchain-milvus", specifier = ">=0.3.0" },
-    { name = "langchain-nvidia-ai-endpoints", specifier = ">=1.0.3" },
+    { name = "langchain-nvidia-ai-endpoints", specifier = ">=1.2.0" },
     { name = "langchain-openai", marker = "extra == 'all'", specifier = ">=0.2" },
     { name = "langchain-openai", marker = "extra == 'ingest'", specifier = ">=0.2" },
     { name = "langchain-openai", marker = "extra == 'rag'", specifier = ">=0.2" },
     { name = "lark", specifier = ">=1.2.2" },
-    { name = "minio", specifier = ">=7.2,<8.0" },
-    { name = "nv-ingest-api", marker = "extra == 'all'", specifier = "==26.1.1" },
-    { name = "nv-ingest-api", marker = "extra == 'ingest'", specifier = "==26.1.1" },
-    { name = "nv-ingest-client", marker = "extra == 'all'", specifier = "==26.1.1" },
-    { name = "nv-ingest-client", marker = "extra == 'ingest'", specifier = "==26.1.1" },
+    { name = "minio", marker = "extra == 'minio'", specifier = ">=7.2,<8.0" },
+    { name = "nv-ingest-api", marker = "extra == 'all'", specifier = "==26.1.2" },
+    { name = "nv-ingest-api", marker = "extra == 'ingest'", specifier = "==26.1.2" },
+    { name = "nv-ingest-client", marker = "extra == 'all'", specifier = "==26.1.2" },
+    { name = "nv-ingest-client", marker = "extra == 'ingest'", specifier = "==26.1.2" },
     { name = "opentelemetry-api", marker = "extra == 'all'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-api", marker = "extra == 'ingest'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-api", marker = "extra == 'rag'", specifier = ">=1.29,<2.0" },
@@ -1953,6 +1960,9 @@ requires-dist = [
     { name = "opentelemetry-sdk", marker = "extra == 'ingest'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-sdk", marker = "extra == 'rag'", specifier = ">=1.29,<2.0" },
     { name = "opentelemetry-sdk-extension-prometheus-multiprocess", marker = "extra == 'rag'", specifier = ">=1.0,<2.0" },
+    { name = "oracledb", specifier = ">=3.4.2" },
+    { name = "oracledb", marker = "extra == 'all'", specifier = ">=3.4.2" },
+    { name = "oracledb", marker = "extra == 'oracle'", specifier = ">=3.4.2" },
     { name = "overrides", marker = "extra == 'all'", specifier = ">=7.7,<8.0" },
     { name = "overrides", marker = "extra == 'ingest'", specifier = ">=7.7,<8.0" },
     { name = "pdfplumber", specifier = ">=0.11.9" },
@@ -1975,7 +1985,7 @@ requires-dist = [
     { name = "tritonclient", marker = "extra == 'ingest'", specifier = "==2.57.0" },
     { name = "uvicorn", extras = ["standard"], specifier = ">=0.32,<1.0" },
 ]
-provides-extras = ["rag", "ingest", "all", "elasticsearch"]
+provides-extras = ["rag", "ingest", "all", "elasticsearch", "oracle", "minio"]
 
 [package.metadata.requires-dev]
 docs = [
@@ -2288,6 +2298,33 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/16/5c/d3f1733665f7cd582ef0842fb1d2ed0bc1fba10875160593342d22bba375/opentelemetry_util_http-0.60b1-py3-none-any.whl", hash = "sha256:66381ba28550c91bee14dcba8979ace443444af1ed609226634596b4b0faf199", size = 8947, upload-time = "2025-12-11T13:36:37.151Z" },
 ]
 
+[[package]]
+name = "oracledb"
+version = "3.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cryptography" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f7/02/70a872d1a4a739b4f7371ab8d3d5ed8c6e57e142e2503531aafcb220893c/oracledb-3.4.2.tar.gz", hash = "sha256:46e0f2278ff1fe83fbc33a3b93c72d429323ec7eed47bc9484e217776cd437e5", size = 855467, upload-time = "2026-01-28T17:25:39.91Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/64/80/be263b668ba32b258d07c85f7bfb6967a9677e016c299207b28734f04c4b/oracledb-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b8e4b8a852251cef09038b75f30fce1227010835f4e19cfbd436027acba2697c", size = 4228552, upload-time = "2026-01-28T17:25:54.844Z" },
+    { url = "https://files.pythonhosted.org/packages/91/bc/e832a649529da7c60409a81be41f3213b4c7ffda4fe424222b2145e8d43c/oracledb-3.4.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1617a1db020346883455af005efbefd51be2c4d797e43b1b38455a19f8526b48", size = 2421924, upload-time = "2026-01-28T17:25:56.984Z" },
+    { url = "https://files.pythonhosted.org/packages/86/21/d867c37e493a63b5521bd248110ad5b97b18253d64a30703e3e8f3d9631e/oracledb-3.4.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed78d7e7079a778062744ccf42141ce4806818c3f4dd6463e4a7edd561c9f86", size = 2599301, upload-time = "2026-01-28T17:25:58.529Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/de/9b1843ea27f7791449652d7f340f042c3053336d2c11caf29e59bab86189/oracledb-3.4.2-cp311-cp311-win32.whl", hash = "sha256:0e16fe3d057e0c41a23ad2ae95bfa002401690773376d476be608f79ac74bf05", size = 1492890, upload-time = "2026-01-28T17:26:00.662Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/10/cbc8afa2db0cec80530858d3e4574f9734fae8c0b7f1df261398aa026c5f/oracledb-3.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:f93cae08e8ed20f2d5b777a8602a71f9418389c661d2c937e84d94863e7e7011", size = 1843355, upload-time = "2026-01-28T17:26:02.637Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/81/2e6154f34b71cd93b4946c73ea13b69d54b8d45a5f6bbffe271793240d21/oracledb-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a7396664e592881225ba66385ee83ce339d864f39003d6e4ca31a894a7e7c552", size = 4220806, upload-time = "2026-01-28T17:26:04.322Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/a9/a1d59aaac77d8f727156ec6a3b03399917c90b7da4f02d057f92e5601f56/oracledb-3.4.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f04a2d62073407672f114d02529921de0677c6883ed7c64d8d1a3c04caa3238", size = 2233795, upload-time = "2026-01-28T17:26:05.877Z" },
+    { url = "https://files.pythonhosted.org/packages/94/ec/8c4a38020cd251572bd406ddcbde98ca052ec94b5684f9aa9ef1ddfcc68c/oracledb-3.4.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8d75e4f879b908be66cce05ba6c05791a5dbb4a15e39abc01aa25c8a2492bd9", size = 2424756, upload-time = "2026-01-28T17:26:07.35Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/7d/c251c2a8567151ccfcfbe3467ea9a60fb5480dc4719342e2e6b7a9679e5d/oracledb-3.4.2-cp312-cp312-win32.whl", hash = "sha256:31b7ee83c23d0439778303de8a675717f805f7e8edb5556d48c4d8343bcf14f5", size = 1453486, upload-time = "2026-01-28T17:26:08.869Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/78/c939f3c16fb39400c4734d5a3340db5659ba4e9dce23032d7b33ccfd3fe5/oracledb-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:ac25a0448fc830fb7029ad50cd136cdbfcd06975d53967e269772cc5cb8c203a", size = 1794445, upload-time = "2026-01-28T17:26:10.66Z" },
+    { url = "https://files.pythonhosted.org/packages/22/68/f7126f5d911c295b57720c6b1a0609a5a2667b4546946433552a4de46333/oracledb-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:643c25d301a289a371e37fcedb59e5fa5e54fb321708e5c12821c4b55bdd8a4d", size = 4205176, upload-time = "2026-01-28T17:26:12.463Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/93/2fced60f92dc82e66980a8a3ba5c1ea48110bf1dd81d030edb69d88f992e/oracledb-3.4.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55397e7eb43bb7017c03a981c736c25724182f5210951181dfe3fab0e5d457fb", size = 2231298, upload-time = "2026-01-28T17:26:14.497Z" },
+    { url = "https://files.pythonhosted.org/packages/75/a7/4dd286f3a6348d786fef9e6ab2e6c9b74ca9195d9a756f2a67e45743cdf0/oracledb-3.4.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b26a10f9c790bd141ffc8af68520803ed4a44a9258bf7d1eea9bfdd36bd6df7f", size = 2439430, upload-time = "2026-01-28T17:26:16.044Z" },
+    { url = "https://files.pythonhosted.org/packages/19/28/94bc753e5e969c60ee5d9c914e2b4ef79999eaca8e91bcab2fbf0586b80b/oracledb-3.4.2-cp313-cp313-win32.whl", hash = "sha256:b974caec2c330c22bbe765705a5ac7d98ec3022811dec2042d561a3c65cb991b", size = 1458209, upload-time = "2026-01-28T17:26:17.652Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/2b/593a9b2d4c12c9de3289e67d84fe023336d99f36ba51442a5a0f5ce6acf7/oracledb-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:3df8eee1410d25360599968b1625b000f10c5ae0e47274031a7842a9dc418890", size = 1793558, upload-time = "2026-01-28T17:26:19.914Z" },
+]
+
 [[package]]
 name = "orjson"
 version = "3.11.5"
diff --git a/variables.env b/variables.env
index 790e1ad7c..b9003e7b5 100644
--- a/variables.env
+++ b/variables.env
@@ -15,8 +15,8 @@ DOCKER_VOLUME_DIRECTORY=vectordb
 
 # ==== Endpoints for using on-prem NIMs ====
 APP_LLM_SERVERURL=nim-llm:8000
-APP_EMBEDDINGS_SERVERURL=nemoretriever-embedding-ms:8000/v1
-APP_RANKING_SERVERURL=nemoretriever-ranking-ms:8000
+APP_EMBEDDINGS_SERVERURL=nemotron-embedding-ms:8000/v1
+APP_RANKING_SERVERURL=nemotron-ranking-ms:8000
 OCR_GRPC_ENDPOINT=nemoretriever-ocr:8001
 OCR_HTTP_ENDPOINT=http://nemoretriever-ocr:8000/v1/infer
 OCR_INFER_PROTOCOL=grpc
@@ -35,11 +35,11 @@ YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=grpc
 # OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr
 # OCR_INFER_PROTOCOL=http
 # OCR_MODEL_NAME=scene_text_ensemble
-# YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3
+# YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3
 # YOLOX_INFER_PROTOCOL=http
-# YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
+# YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1
 # YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
-# YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
+# YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1
 # YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http