From 612a39e281351d9cbca777c8005ef31494ee6510 Mon Sep 17 00:00:00 2001 From: Prajwal Raymond Moras Date: Sat, 21 Feb 2026 20:07:18 +0530 Subject: [PATCH 1/3] Add advanced local RAG notebook with ChromaDB, FastEmbed, and Ollama --- index.toml | 6 + .../advanced_local_rag_chroma_ollama.ipynb | 789 ++++++++++++++++++ 2 files changed, 795 insertions(+) create mode 100644 notebooks/advanced_local_rag_chroma_ollama.ipynb diff --git a/index.toml b/index.toml index 6809dc6..01d00e5 100644 --- a/index.toml +++ b/index.toml @@ -368,3 +368,9 @@ title = "LinkedIn, Company Intelligence & Lead Enrichment with Haystack, MongoDB notebook = "ai_sales_research_assistant.ipynb" new = true topics = ["RAG", "Web-QA"] + +[[cookbook]] +title = "Advanced Local RAG with ChromaDB, FastEmbed, and Ollama" +notebook = "advanced_local_rag_chroma_ollama.ipynb" +new = true +topics = ["RAG", "Advanced Retrieval", "Vector Databases"] diff --git a/notebooks/advanced_local_rag_chroma_ollama.ipynb b/notebooks/advanced_local_rag_chroma_ollama.ipynb new file mode 100644 index 0000000..33baeb5 --- /dev/null +++ b/notebooks/advanced_local_rag_chroma_ollama.ipynb @@ -0,0 +1,789 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bc10761b", + "metadata": {}, + "source": [ + "# Advanced Local RAG with ChromaDB + FastEmbed + Ollama (Haystack)\n", + "\n", + "This notebook demonstrates a production-style **fully local** RAG pipeline:\n", + "- **Vector DB:** ChromaDB (persistent)\n", + "- **Embeddings:** FastEmbed (BAAI/bge-small-en-v1.5)\n", + "- **Generator:** Ollama (llama3.2:3b)\n", + "- **Advanced retrieval:** Query decomposition + reranking\n", + "- **Evaluation:** Hit-Rate and MRR\n", + "\n", + "No cloud API key required." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "install", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install -qU haystack-ai chroma-haystack fastembed-haystack ollama-haystack sentence-transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "imports", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Imports loaded\n" + ] + } + ], + "source": [ + "import json\n", + "import logging\n", + "import re\n", + "from pathlib import Path\n", + "from typing import List, Dict\n", + "\n", + "from haystack import Document, Pipeline\n", + "from haystack.components.builders import PromptBuilder\n", + "from haystack.components.rankers import SentenceTransformersSimilarityRanker\n", + "from haystack_integrations.components.embedders.fastembed import (\n", + " FastembedDocumentEmbedder,\n", + " FastembedTextEmbedder,\n", + ")\n", + "from haystack_integrations.components.generators.ollama import OllamaGenerator\n", + "from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever\n", + "from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s | %(levelname)s | %(message)s\")\n", + "logger = logging.getLogger(\"advanced_local_rag\")\n", + "\n", + "print(\"✓ Imports loaded\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "corpus", + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "✓ Corpus: 10 docs | Eval: 5 questions\n" + ] + } + ], + "source": [ + "corpus = [\n", + " Document(content=\"Alignment means AI goals and behaviors should match human intentions and values.\"),\n", + " Document(content=\"Robustness is reliable performance under distribution shift, adversarial input, and edge cases.\"),\n", + " Document(content=\"Interpretability helps humans understand why a model produced an output.\"),\n", + " Document(content=\"Corrigibility means systems can be corrected, interrupted, or shut down by operators.\"),\n", + " Document(content=\"RLHF uses human preferences to train a reward model and fine-tune language models.\"),\n", + " Document(content=\"Constitutional AI uses explicit principles to critique and revise model outputs.\"),\n", + " Document(content=\"Red teaming stress-tests models by probing unsafe or harmful behaviors.\"),\n", + " Document(content=\"Reward hacking occurs when models exploit loopholes in reward signals.\"),\n", + " Document(content=\"Deceptive alignment is when a model appears aligned during training but pursues different goals later.\"),\n", + " Document(content=\"Distributional shift can break models when deployment data differs from training data.\"),\n", + "]\n", + "\n", + "gold_qa = [\n", + " {\"id\": \"q1\", \"question\": \"What is alignment in AI safety?\", \"keywords\": [\"goals\", \"behaviors\", \"human\", \"intentions\"]},\n", + " {\"id\": \"q2\", \"question\": \"How does RLHF work?\", \"keywords\": [\"human preferences\", \"reward model\", \"fine-tune\"]},\n", + " {\"id\": \"q3\", \"question\": \"What is reward hacking?\", \"keywords\": [\"loopholes\", \"reward signals\", \"exploit\"]},\n", + " {\"id\": \"q4\", \"question\": \"What is deceptive alignment?\", \"keywords\": [\"appears aligned\", \"training\", \"different goals\"]},\n", + " {\"id\": \"q5\", \"question\": \"Why is robustness important?\", \"keywords\": [\"reliable\", \"distribution 
shift\", \"adversarial\"]},\n", + "]\n", + "\n", + "print(f\"✓ Corpus: {len(corpus)} docs | Eval: {len(gold_qa)} questions\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "index", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Calculating embeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 407.51it/s]\n", + "2026-02-21 19:59:00,776 | INFO | Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Indexed 10 documents into ChromaDB\n" + ] + } + ], + "source": [ + "document_store = ChromaDocumentStore(\n", + " persist_path=\"chroma_db_advanced_local_rag\",\n", + " collection_name=\"advanced_local_rag_demo\",\n", + ")\n", + "\n", + "doc_embedder = FastembedDocumentEmbedder(model=\"BAAI/bge-small-en-v1.5\", prefix=\"passage:\")\n", + "doc_embedder.warm_up()\n", + "\n", + "embedded_docs = doc_embedder.run(documents=corpus)[\"documents\"]\n", + "document_store.write_documents(embedded_docs)\n", + "\n", + "print(f\"✓ Indexed {len(embedded_docs)} documents into ChromaDB\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "decomposer", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Query decomposer ready\n" + ] + } + ], + "source": [ + "decomposer = OllamaGenerator(\n", + " model=\"llama3.2:3b\",\n", + " url=\"http://localhost:11434\",\n", + " generation_kwargs={\"temperature\": 0.0},\n", + ")\n", + "\n", + "def decompose_query(query: str) -> List[str]:\n", + " prompt = f\"\"\"You are a query planner. Split this query into 2-4 focused sub-queries. 
Return ONLY a JSON array of strings.\\n\\nQuery: {query}\"\"\"\n", + " out = decomposer.run(prompt=prompt)\n", + " text = out[\"replies\"][0].strip()\n", + " \n", + " try:\n", + " parsed = json.loads(text)\n", + " if isinstance(parsed, list) and all(isinstance(x, str) for x in parsed):\n", + " return parsed\n", + " except Exception:\n", + " pass\n", + " \n", + " quoted = re.findall(r'\"([^\"]+)\"', text)\n", + " if quoted:\n", + " return quoted[:4]\n", + " \n", + " lines = [ln.strip(\"-• \").strip() for ln in text.splitlines() if ln.strip()]\n", + " return lines[:4] if lines else [query]\n", + "\n", + "print(\"✓ Query decomposer ready\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "pipeline", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-02-21 19:59:01,238 | INFO | HTTP Request: HEAD https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2/resolve/main/config.json \"HTTP/1.1 307 Temporary Redirect\"\n", + "2026-02-21 19:59:01,477 | INFO | HTTP Request: HEAD https://huggingface.co/cross-encoder/ms-marco-MiniLM-L6-v2/resolve/main/config.json \"HTTP/1.1 307 Temporary Redirect\"\n", + "2026-02-21 19:59:01,563 | INFO | HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/cross-encoder/ms-marco-MiniLM-L6-v2/c5ee24cb16019beea0893ab7796b1df96625c6b8/config.json \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9086467256d8400b8660476a4ca5538c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading weights: 0%| | 0/105 [00:00 Date: Sat, 21 Feb 2026 23:51:52 +0530 Subject: [PATCH 2/3] Add ChromaDB runtime artifacts to .gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 13b82e5..ecc02d3 100644 --- a/.gitignore +++ b/.gitignore @@ -202,4 +202,7 @@ __marimo__/ # Streamlit .streamlit/secrets.toml -.DS_Store \ No newline at 
end of file +.DS_Store + +# ChromaDB runtime artifacts +notebooks/chroma_db_*/ \ No newline at end of file From 87159f204fd681a66d5a494f60360b52902d2a82 Mon Sep 17 00:00:00 2001 From: Prajwal Raymond Moras Date: Tue, 24 Feb 2026 17:31:56 +0530 Subject: [PATCH 3/3] Add rich narrative markdown between cells per maintainer feedback --- .../advanced_local_rag_chroma_ollama.ipynb | 224 ++++++++++++++++-- 1 file changed, 204 insertions(+), 20 deletions(-) diff --git a/notebooks/advanced_local_rag_chroma_ollama.ipynb b/notebooks/advanced_local_rag_chroma_ollama.ipynb index 33baeb5..827ef91 100644 --- a/notebooks/advanced_local_rag_chroma_ollama.ipynb +++ b/notebooks/advanced_local_rag_chroma_ollama.ipynb @@ -5,21 +5,84 @@ "id": "bc10761b", "metadata": {}, "source": [ - "# Advanced Local RAG with ChromaDB + FastEmbed + Ollama (Haystack)\n", + "# Advanced Local RAG with ChromaDB, FastEmbed, and Ollama\n", "\n", - "This notebook demonstrates a production-style **fully local** RAG pipeline:\n", - "- **Vector DB:** ChromaDB (persistent)\n", - "- **Embeddings:** FastEmbed (BAAI/bge-small-en-v1.5)\n", - "- **Generator:** Ollama (llama3.2:3b)\n", - "- **Advanced retrieval:** Query decomposition + reranking\n", - "- **Evaluation:** Hit-Rate and MRR\n", + "> **No API key required.** Every component in this pipeline runs on your local machine — no cloud, no cost, no data leaving your environment.\n", "\n", - "No cloud API key required." + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/advanced_local_rag_chroma_ollama.ipynb)\n", + "\n", + "## Overview\n", + "\n", + "Retrieval-Augmented Generation (RAG) grounds an LLM's answers in a specific document corpus, reducing hallucinations and making responses traceable to sources. 
This notebook goes beyond a basic RAG setup by introducing two advanced techniques:\n", + "\n", + "- **Query decomposition** — complex questions are broken into focused sub-queries, each independently retrieved and answered, then synthesized into a final response.\n", + "- **Cross-encoder reranking** — after an initial vector search, a more powerful cross-encoder model re-scores the retrieved chunks to surface the most relevant ones.\n", + "\n", + "Here is the full system architecture we will build:\n", + "\n", + "```\n", + "User Query\n", + " │\n", + " ├──► OllamaGenerator (llama3.2:3b)\n", + " │ decomposes query into sub-queries\n", + " │\n", + " └──► For each sub-query:\n", + " │\n", + " ▼\n", + " FastembedTextEmbedder (BAAI/bge-small-en-v1.5)\n", + " │ embeds the sub-query\n", + " ▼\n", + " ChromaEmbeddingRetriever (top-k=8)\n", + " │ vector similarity search\n", + " ▼\n", + " SentenceTransformersSimilarityRanker (cross-encoder, top-k=3)\n", + " │ re-scores chunks for precision\n", + " ▼\n", + " OllamaGenerator (llama3.2:3b)\n", + " │ generates a grounded sub-answer\n", + " ▼\n", + " Final synthesis pass → Coherent answer\n", + "```\n", + "\n", + "## What you will learn\n", + "\n", + "| Step | Concept |\n", + "|------|---------|\n", + "| 1–2 | Install Haystack and its local integrations; import components and configure logging |\n", + "| 3 | Define a corpus and evaluation set |\n", + "| 4 | Embed and index documents into a persistent ChromaDB store |\n", + "| 5 | Decompose a complex query into focused sub-queries |\n", + "| 6 | Build a Haystack pipeline with retrieval, reranking, and generation |\n", + "| 7 | Run the full pipeline and synthesize a final answer |\n", + "| 8 | Evaluate retrieval quality with Hit-Rate@5 and MRR@5 |\n", + "| 9 | Explore failure modes: out-of-domain queries and hallucination resistance |\n", + "\n", + "## Prerequisites\n", + "\n", + "1. **Ollama** installed and running — [download here](https://ollama.com/download)\n", + "2.
The following model pulled locally:\n", + " ```bash\n", + " ollama pull llama3.2:3b\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "id": "5d55026e", + "metadata": {}, + "source": [ + "## Step 1 — Install dependencies\n", + "\n", + "We need Haystack's core library plus four local integrations:\n", + "- **`chroma-haystack`** — ChromaDB document store and retriever\n", + "- **`fastembed-haystack`** — fast, local document and query embedders using ONNX-optimized models\n", + "- **`ollama-haystack`** — Haystack wrapper for locally running Ollama LLMs\n", + "- **`sentence-transformers`** — used by the cross-encoder reranker" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "install", "metadata": {}, "outputs": [ @@ -37,6 +100,16 @@ "!pip install -qU haystack-ai chroma-haystack fastembed-haystack ollama-haystack sentence-transformers" ] }, + { + "cell_type": "markdown", + "id": "036e9b1b", + "metadata": {}, + "source": [ + "## Step 2 — Imports and logging\n", + "\n", + "We import the core Haystack components and integrations we will use throughout the notebook, then set up structured logging so each step prints a timestamped status line." + ] + }, { "cell_type": "code", "execution_count": 2, @@ -75,9 +148,23 @@ "print(\"✓ Imports loaded\")" ] }, + { + "cell_type": "markdown", + "id": "bd396995", + "metadata": {}, + "source": [ + "## Step 3 — Define the corpus and evaluation set\n", + "\n", + "A RAG pipeline is only as good as its data. We define two things here:\n", + "\n", + "**Corpus** — 10 short documents covering AI safety concepts. Each document becomes a single chunk in our vector store. In a real project, replace this with your own documents loaded from files, URLs, or a database.\n", + "\n", + "**Gold Q&A set** — 5 question-keyword pairs used for retrieval evaluation in Step 8. For each question, we define keywords that *must* appear in a retrieved document to count as a \"hit\". 
This evaluation approach is LLM-free and deterministic — runs in seconds and is safe for CI pipelines." + ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "corpus", "metadata": {}, "outputs": [ @@ -114,6 +201,22 @@ "print(f\"✓ Corpus: {len(corpus)} docs | Eval: {len(gold_qa)} questions\")" ] }, + { + "cell_type": "markdown", + "id": "1a2a6d97", + "metadata": {}, + "source": [ + "## Step 4 — Embed and index documents into ChromaDB\n", + "\n", + "Before we can retrieve anything, we need to convert our text documents into vector embeddings and store them in a vector database.\n", + "\n", + "**Why FastEmbed?** FastEmbed uses ONNX-optimized models that run efficiently on CPU — no GPU required. The `BAAI/bge-small-en-v1.5` model produces 384-dimensional embeddings and is one of the top-performing small embedding models on the MTEB benchmark.\n", + "\n", + "**Why ChromaDB with persistence?** The `persist_path` argument tells ChromaDB to write the collection to disk, so the indexed vectors survive kernel restarts. Note that the cell below still embeds and writes the corpus every time it runs; to reuse a previously built index, skip the embedding step when `document_store.count_documents()` is already non-zero.\n", + "\n", + "Note the `prefix=\"passage:\"` argument — the embedder prepends this string to every document before embedding, and the query embedder in Step 6 uses a matching `query:` prefix. What matters is that the same convention is applied at indexing and query time. (The `passage:`/`query:` convention originates with E5-style models; the BGE model card instead recommends an instruction prefix on queries only, so check the model card before tuning prefixes or swapping embedding models.)" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -151,9 +254,27 @@ "print(f\"✓ Indexed {len(embedded_docs)} documents into ChromaDB\")" ] }, + { + "cell_type": "markdown", + "id": "5c312435", + "metadata": {}, + "source": [ + "## Step 5 — Query decomposition\n", + "\n", + "Complex questions often span multiple concepts. A simple RAG pipeline retrieves chunks for the query as a whole, which can miss relevant documents that address only one aspect.\n", + "\n", + "**Query decomposition** solves this by breaking a complex question into 2–4 focused sub-queries, each targeting a specific concept.
For example:\n", + "\n", + "> *\"Explain alignment, RLHF, and why reward hacking is dangerous\"*\n", + "\n", + "...gets decomposed into: `\"Alignment\"`, `\"RLHF\"`, `\"Reward Hacking\"`.\n", + "\n", + "Each sub-query is independently retrieved and answered, then all sub-answers are synthesized into one coherent final response. We use `OllamaGenerator` with `llama3.2:3b` and include a fallback regex parser in case the model doesn't return valid JSON." + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "decomposer", "metadata": {}, "outputs": [ @@ -176,24 +297,45 @@ " prompt = f\"\"\"You are a query planner. Split this query into 2-4 focused sub-queries. Return ONLY a JSON array of strings.\\n\\nQuery: {query}\"\"\"\n", " out = decomposer.run(prompt=prompt)\n", " text = out[\"replies\"][0].strip()\n", - " \n", + "\n", " try:\n", " parsed = json.loads(text)\n", " if isinstance(parsed, list) and all(isinstance(x, str) for x in parsed):\n", " return parsed\n", " except Exception:\n", " pass\n", - " \n", + "\n", " quoted = re.findall(r'\"([^\"]+)\"', text)\n", " if quoted:\n", " return quoted[:4]\n", - " \n", + "\n", " lines = [ln.strip(\"-• \").strip() for ln in text.splitlines() if ln.strip()]\n", " return lines[:4] if lines else [query]\n", "\n", "print(\"✓ Query decomposer ready\")" ] }, + { + "cell_type": "markdown", + "id": "643f73dd", + "metadata": {}, + "source": [ + "## Step 6 — Build the Haystack RAG pipeline\n", + "\n", + "Now we assemble the full retrieval-generation pipeline using Haystack's `Pipeline` abstraction. Each component is connected in a directed graph:\n", + "\n", + "```\n", + "query_embedder → retriever → ranker → prompt_builder → generator\n", + "```\n", + "\n", + "Key components:\n", + "- **`FastembedTextEmbedder`** — embeds the user query with the same model used during indexing. 
The `query:` prefix mirrors the `passage:` prefix applied to documents at indexing time, so queries and documents are embedded under the same convention.\n", + "- **`ChromaEmbeddingRetriever`** — vector similarity search with `top_k=8`, casting a wide net.\n", + "- **`SentenceTransformersSimilarityRanker`** — a cross-encoder that re-scores every (query, chunk) pair and keeps the top 3. Unlike the bi-encoder, a cross-encoder sees both texts together, making it significantly more accurate — ideal as a second-stage filter.\n", + "- **`PromptBuilder`** — assembles the final prompt by injecting the ranked chunks as context.\n", + "- **`OllamaGenerator`** — generates the final answer with `temperature=0.0` for deterministic, grounded responses." + ] + }, { "cell_type": "code", "execution_count": 6, @@ -290,9 +432,21 @@ "print(\"✓ RAG pipeline ready\")" ] }, + { + "cell_type": "markdown", + "id": "ea97d4f3", + "metadata": {}, + "source": [ + "## Step 7 — Run the full pipeline with query decomposition\n", + "\n", + "We now put it all together. The pipeline runs once per sub-query, collecting a generated answer for each. Finally, a synthesis call combines all sub-answers into a single coherent response.\n", + "\n", + "This approach provides better coverage than a single retrieval pass because each sub-query targets a specific concept, reducing the chance that relevant documents are missed due to the original question being too broad." + ] + }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "query", "metadata": {}, "outputs": [ @@ -432,14 +586,11 @@ "for i, sq in enumerate(sub_queries, 1):\n", " print(f\"{i}.
{sq}\")\n", "\n", - "# Run the full pipeline for each sub-query — answer is generated per sub-query\n", - "# then we do a final synthesis pass\n", "answers = []\n", "for sq in sub_queries:\n", " out = rag.run({\"query_embedder\": {\"text\": sq}, \"ranker\": {\"query\": sq}, \"prompt_builder\": {\"question\": sq}})\n", " answers.append(out[\"generator\"][\"replies\"][0])\n", "\n", - "# Final synthesis: ask the LLM to combine the sub-answers\n", "synthesis_prompt = f\"\"\"You are a careful assistant. Synthesize these answers into one coherent response.\n", "\n", "Question: {user_query}\n", @@ -454,6 +605,23 @@ "print(final_answer)" ] }, + { + "cell_type": "markdown", + "id": "b40f1985", + "metadata": {}, + "source": [ + "## Step 8 — Evaluate retrieval quality\n", + "\n", + "Good answers depend on good retrieval. We measure retrieval quality using two standard metrics:\n", + "\n", + "- **Hit-Rate@k** — the fraction of questions where at least one of the top-k retrieved chunks contains the answer. A hit-rate of 100% means every question had a relevant chunk in the retrieved set.\n", + "- **MRR (Mean Reciprocal Rank)** — measures *where* the first relevant chunk appears. A chunk ranked #1 scores 1.0; ranked #2 scores 0.5. Higher MRR means relevant chunks appear at the top.\n", + "\n", + "We call the embedder, retriever, and ranker components directly (bypassing the generator) to keep evaluation fast and LLM-free.\n", + "\n", + "> **Why this matters:** Always evaluate retrieval independently from generation. A poor retriever cannot be compensated by a better LLM." + ] + }, { "cell_type": "code", "execution_count": 9, @@ -609,9 +777,25 @@ "print(f\"MRR@5: {mrr:.4f}\")" ] }, + { + "cell_type": "markdown", + "id": "5cb83bdb", + "metadata": {}, + "source": [ + "## Step 9 — Explore failure modes\n", + "\n", + "Understanding where a RAG pipeline breaks is just as important as knowing where it works. We test three failure scenarios:\n", + "\n", + "1. 
**Empty query** — no meaningful semantic content; retrieval returns irrelevant chunks and the LLM is forced to work with bad context.\n", + "2. **Out-of-domain query** — the document corpus covers AI safety; asking about an unrelated topic (e.g., FIFA) should return a \"not enough context\" response, not a hallucinated answer.\n", + "3. **Prompt injection attempt** — asking the model to \"ignore context and fabricate\" tests whether the system prompt's grounding instruction holds.\n", + "\n", + "These three spot checks illustrate how a grounding instruction in the prompt (`\"Answer using only the provided context\"`) can reduce hallucination, but they are not a systematic evaluation — a small local model can still be steered off-context, so treat the instruction as a first line of defense rather than a complete guardrail." + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "failure", "metadata": {}, "outputs": [ @@ -767,7 +951,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" },