From c444e8d8890d611ace5b6ad691bc609ad14b8a3f Mon Sep 17 00:00:00 2001 From: manishpanda01 Date: Fri, 27 Mar 2026 10:55:36 +0000 Subject: [PATCH 1/2] feat: add interview preparation RAG notebook --- notebooks/interview_preparation_rag.ipynb | 521 ++++++++++++++++++++++ 1 file changed, 521 insertions(+) create mode 100644 notebooks/interview_preparation_rag.ipynb diff --git a/notebooks/interview_preparation_rag.ipynb b/notebooks/interview_preparation_rag.ipynb new file mode 100644 index 0000000..e2ec46f --- /dev/null +++ b/notebooks/interview_preparation_rag.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Domain-Specific RAG for Interview Preparation\n", + "\n", + "This notebook shows how to build a Retrieval-Augmented Generation (RAG)\n", + "pipeline with Haystack for interview preparation. We create a small\n", + "domain-specific knowledge base with machine learning interview notes,\n", + "index it with FastEmbed, retrieve relevant passages, and use an LLM to\n", + "generate grounded answers.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install dependencies\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: fastembed-haystack in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (2.1.0)\n", + "Requirement already satisfied: qdrant-haystack in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (10.3.0)\n", + "Requirement already satisfied: fastembed>=0.4.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed-haystack) (0.8.0)\n", + "Requirement already satisfied: haystack-ai>=2.22.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed-haystack) (2.26.1)\n", + "Requirement already 
satisfied: qdrant-client>=1.12.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from qdrant-haystack) (1.17.1)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.20 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (1.8.0)\n", + "Requirement already satisfied: loguru<0.8.0,>=0.7.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (0.7.3)\n", + "Requirement already satisfied: mmh3<6.0.0,>=4.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (5.2.1)\n", + "Requirement already satisfied: numpy>=1.26 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (2.4.3)\n", + "Requirement already satisfied: onnxruntime!=1.20.0,!=1.24.0,!=1.24.1,>=1.17.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (1.24.4)\n", + "Requirement already satisfied: pillow<13.0,>=10.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (12.1.1)\n", + "Requirement already satisfied: py-rust-stemmers<0.2.0,>=0.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (0.1.5)\n", + "Requirement already satisfied: requests<3.0,>=2.31 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (2.33.0)\n", + "Requirement already satisfied: tokenizers<1.0,>=0.15 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (0.22.2)\n", + "Requirement already satisfied: tqdm<5.0,>=4.66 in 
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastembed>=0.4.2->fastembed-haystack) (4.67.3)\n", + "Requirement already satisfied: docstring-parser in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (0.17.0)\n", + "Requirement already satisfied: filetype in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (1.2.0)\n", + "Requirement already satisfied: haystack-experimental in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (0.19.0)\n", + "Requirement already satisfied: jinja2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (3.1.6)\n", + "Requirement already satisfied: jsonschema in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (4.26.0)\n", + "Requirement already satisfied: lazy-imports in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (1.2.0)\n", + "Requirement already satisfied: markupsafe in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (3.0.3)\n", + "Requirement already satisfied: more-itertools in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (10.8.0)\n", + "Requirement already satisfied: networkx in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (3.6.1)\n", + "Requirement already satisfied: openai>=1.99.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from 
haystack-ai>=2.22.0->fastembed-haystack) (2.30.0)\n", + "Requirement already satisfied: posthog!=3.12.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (7.9.12)\n", + "Requirement already satisfied: pydantic in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (2.12.5)\n", + "Requirement already satisfied: python-dateutil in /Users/manishpanda01/Library/Python/3.12/lib/python/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (2.9.0.post0)\n", + "Requirement already satisfied: pyyaml in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (6.0.3)\n", + "Requirement already satisfied: tenacity!=8.4.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (9.1.4)\n", + "Requirement already satisfied: typing-extensions>=4.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from haystack-ai>=2.22.0->fastembed-haystack) (4.15.0)\n", + "Requirement already satisfied: grpcio>=1.41.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from qdrant-client>=1.12.0->qdrant-haystack) (1.78.0)\n", + "Requirement already satisfied: httpx>=0.20.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (0.28.1)\n", + "Requirement already satisfied: portalocker<4.0,>=2.7.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from qdrant-client>=1.12.0->qdrant-haystack) (3.2.0)\n", + "Requirement already satisfied: protobuf>=3.20.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from qdrant-client>=1.12.0->qdrant-haystack) (7.34.1)\n", + 
"Requirement already satisfied: urllib3<3,>=1.26.14 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from qdrant-client>=1.12.0->qdrant-haystack) (2.6.3)\n", + "Requirement already satisfied: anyio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (4.13.0)\n", + "Requirement already satisfied: certifi in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (2026.2.25)\n", + "Requirement already satisfied: httpcore==1.* in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (1.0.9)\n", + "Requirement already satisfied: idna in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (3.11)\n", + "Requirement already satisfied: h11>=0.16 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpcore==1.*->httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (0.16.0)\n", + "Requirement already satisfied: h2<5,>=3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (4.3.0)\n", + "Requirement already satisfied: filelock>=3.10.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (3.25.2)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (2026.2.0)\n", + "Requirement already satisfied: 
hf-xet<2.0.0,>=1.4.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (1.4.2)\n", + "Requirement already satisfied: packaging>=20.9 in /Users/manishpanda01/Library/Python/3.12/lib/python/site-packages (from huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (26.0)\n", + "Requirement already satisfied: typer in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (0.24.1)\n", + "Requirement already satisfied: flatbuffers in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime!=1.20.0,!=1.24.0,!=1.24.1,>=1.17.0->fastembed>=0.4.2->fastembed-haystack) (25.12.19)\n", + "Requirement already satisfied: sympy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime!=1.20.0,!=1.24.0,!=1.24.1,>=1.17.0->fastembed>=0.4.2->fastembed-haystack) (1.14.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai>=1.99.2->haystack-ai>=2.22.0->fastembed-haystack) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai>=1.99.2->haystack-ai>=2.22.0->fastembed-haystack) (0.13.0)\n", + "Requirement already satisfied: sniffio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai>=1.99.2->haystack-ai>=2.22.0->fastembed-haystack) (1.3.1)\n", + "Requirement already satisfied: six>=1.5 in /Users/manishpanda01/Library/Python/3.12/lib/python/site-packages (from posthog!=3.12.0->haystack-ai>=2.22.0->fastembed-haystack) (1.17.0)\n", + "Requirement already satisfied: backoff>=1.10.0 in 
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from posthog!=3.12.0->haystack-ai>=2.22.0->fastembed-haystack) (2.2.1)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic->haystack-ai>=2.22.0->fastembed-haystack) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic->haystack-ai>=2.22.0->fastembed-haystack) (2.41.5)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic->haystack-ai>=2.22.0->fastembed-haystack) (0.4.2)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0,>=2.31->fastembed>=0.4.2->fastembed-haystack) (3.4.6)\n", + "Requirement already satisfied: attrs>=22.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jsonschema->haystack-ai>=2.22.0->fastembed-haystack) (26.1.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jsonschema->haystack-ai>=2.22.0->fastembed-haystack) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jsonschema->haystack-ai>=2.22.0->fastembed-haystack) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.25.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jsonschema->haystack-ai>=2.22.0->fastembed-haystack) (0.30.0)\n", + "Requirement already satisfied: hyperframe<7,>=6.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from 
h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (6.1.0)\n", + "Requirement already satisfied: hpack<5,>=4.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client>=1.12.0->qdrant-haystack) (4.1.0)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sympy->onnxruntime!=1.20.0,!=1.24.0,!=1.24.1,>=1.17.0->fastembed>=0.4.2->fastembed-haystack) (1.3.0)\n", + "Requirement already satisfied: click>=8.2.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer->huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (8.3.1)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer->huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (1.5.4)\n", + "Requirement already satisfied: rich>=12.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer->huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (14.3.3)\n", + "Requirement already satisfied: annotated-doc>=0.0.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer->huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (0.0.4)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from rich>=12.3.0->typer->huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (4.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/manishpanda01/Library/Python/3.12/lib/python/site-packages (from rich>=12.3.0->typer->huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (2.19.2)\n", + "Requirement already satisfied: mdurl~=0.1 in 
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer->huggingface-hub<2.0,>=0.20->fastembed>=0.4.2->fastembed-haystack) (0.1.2)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "\u001b[33m WARNING: The script nltk is installed in '/Library/Frameworks/Python.framework/Versions/3.12/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install fastembed-haystack qdrant-haystack\n", + "%pip install \"nltk>=3.9.1\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create interview-preparation documents\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from haystack.dataclasses import Document\n", + "\n", + "raw_docs = [\n", +
" Document(\n", + " content=(\n", + " \"Bias-variance tradeoff is the balance between underfitting and overfitting. \"\n", + " \"High bias leads to overly simple models that miss patterns. \"\n", + " \"High variance leads to models that fit noise in the training data.\"\n", + " ),\n", + " meta={\"title\": \"Bias-Variance Tradeoff\", \"category\": \"ml_fundamentals\"},\n", + " ),\n", + " Document(\n", + " content=(\n", + " \"Precision measures how many predicted positive examples are actually positive. \"\n", + " \"Recall measures how many actual positive examples were correctly identified. \"\n", + " \"Precision is important when false positives are costly. \"\n", + " \"Recall is important when false negatives are costly.\"\n", + " ),\n", + " meta={\"title\": \"Precision and Recall\", \"category\": \"evaluation\"},\n", + " ),\n", + " Document(\n", + " content=(\n", + " \"Overfitting happens when a model learns noise and specific details from the training data \"\n", + " \"instead of general patterns. Common mitigation strategies include regularization, \"\n", + " \"cross-validation, simplifying the model, early stopping, and collecting more data.\"\n", + " ),\n", + " meta={\"title\": \"Overfitting\", \"category\": \"modeling\"},\n", + " ),\n", + " Document(\n", + " content=(\n", + " \"In machine learning interviews, candidates are often asked to explain how they would \"\n", + " \"deploy a model to production. A strong answer should mention model serving, monitoring, \"\n", + " \"logging, latency, scalability, retraining, and rollback plans.\"\n", + " ),\n", + " meta={\"title\": \"Model Deployment Interview Answer\", \"category\": \"ml_system_design\"},\n", + " ),\n", + " Document(\n", + " content=(\n", + " \"Behavioral interview answers are often structured with the STAR method: \"\n", + " \"Situation, Task, Action, Result. 
This helps candidates give concise and evidence-based responses.\n", + " ),\n", + " meta={\"title\": \"STAR Method\", \"category\": \"behavioral\"},\n", + " ),\n", + "]\n", + "\n", + "len(raw_docs)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean, split, and index documents in Qdrant\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", + "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", + "from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder\n", + "from haystack.document_stores.types import DuplicatePolicy\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "document_store = QdrantDocumentStore(\n", + " \":memory:\",\n", + " embedding_dim=384,\n", + " recreate_index=True,\n", + " return_embedding=True,\n", + " wait_result_from_api=True,\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Clean each note with DocumentCleaner's defaults, then split it into chunks of up to three period-delimited sentences.\n", + "\n", + "cleaner = DocumentCleaner()\n", + "splitter = DocumentSplitter(split_by=\"period\", split_length=3)\n", + "\n", + "cleaned_docs = cleaner.run(raw_docs)[\"documents\"]\n", + "split_docs = splitter.run(cleaned_docs)[\"documents\"]\n", + "\n", + "len(split_docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Embed and write documents\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 5 files: 100%|██████████| 5/5 [00:20<00:00, 
4.14s/it]\n", + "Calculating embeddings: 100%|██████████| 6/6 [00:00<00:00, 98.06it/s]\n", + "100it [00:00, 50588.64it/s] \n" + ] + }, + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "document_embedder = FastembedDocumentEmbedder(\n", + " model=\"BAAI/bge-small-en-v1.5\",\n", + " parallel=0,\n", + " meta_fields_to_embed=[\"title\", \"category\"],\n", + ")\n", + "\n", + "documents_with_embeddings = document_embedder.run(split_docs)[\"documents\"]\n", + "document_store.write_documents(documents_with_embeddings, policy=DuplicatePolicy.OVERWRITE)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the RAG pipeline\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "from haystack import Pipeline\n", + "from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever\n", + "from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder\n", + "from haystack.components.builders import ChatPromptBuilder\n", + "from haystack.components.generators.chat import HuggingFaceAPIChatGenerator\n", + "from haystack.dataclasses import ChatMessage\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from getpass import getpass\n", + "import os\n", + "\n", + "if \"HF_API_TOKEN\" not in os.environ:  # reuse a token that is already exported\n", + "    os.environ[\"HF_API_TOKEN\"] = getpass(\"Enter your Hugging Face token: \")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "generator = HuggingFaceAPIChatGenerator(\n", + " api_type=\"serverless_inference_api\",\n", + " api_params={\"model\": \"Qwen/Qwen2.5-7B-Instruct\", \"provider\": \"together\"},\n", + " generation_kwargs={\"max_tokens\": 300},\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": 
[ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ChatPromptBuilder has 2 prompt variables, but `required_variables` is not set. By default, all prompt variables are treated as optional, which may lead to unintended behavior in multi-branch pipelines. To avoid unexpected execution, ensure that variables intended to be required are explicitly set in `required_variables`.\n" + ] + } + ], + "source": [ + "template = [\n", + " ChatMessage.from_user(\n", + " \"\"\"\n", + "You are an interview preparation assistant.\n", + "Answer the question using only the information contained in the documents.\n", + "If the answer cannot be inferred from the documents, say \\\"I don't know.\\\"\n", + "\n", + "Documents:\n", + "{% for doc in documents %}\n", + "- {{ doc.content }}\n", + "{% endfor %}\n", + "\n", + "Question: {{ question }}\n", + "Answer:\n", + "\"\"\"\n", + " )\n", + "]\n", + "\n", + "prompt_builder = ChatPromptBuilder(template=template)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "🚅 Components\n", + " - text_embedder: FastembedTextEmbedder\n", + " - retriever: QdrantEmbeddingRetriever\n", + " - prompt_builder: ChatPromptBuilder\n", + " - generator: HuggingFaceAPIChatGenerator\n", + "🛤️ Connections\n", + " - text_embedder.embedding -> retriever.query_embedding (list[float])\n", + " - retriever.documents -> prompt_builder.documents (list[Document])\n", + " - prompt_builder.prompt -> generator.messages (list[ChatMessage])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query_pipeline = Pipeline()\n", + "query_pipeline.add_component(\n", + " \"text_embedder\",\n", + " FastembedTextEmbedder(model=\"BAAI/bge-small-en-v1.5\", parallel=0, prefix=\"query:\"),\n", + ")\n", + "query_pipeline.add_component(\n", + " \"retriever\",\n", + " QdrantEmbeddingRetriever(document_store=document_store, 
top_k=3),\n", + ")\n", + "query_pipeline.add_component(\"prompt_builder\", prompt_builder)\n", + "query_pipeline.add_component(\"generator\", generator)\n", + "\n", + "query_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n", + "query_pipeline.connect(\"retriever.documents\", \"prompt_builder.documents\")\n", + "query_pipeline.connect(\"prompt_builder\", \"generator\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ask interview-preparation questions\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00, 12.54it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overfitting happens when a model learns noise and specific details from the training data instead of general patterns. To mitigate overfitting, common strategies include regularization, cross-validation, simplifying the model, early stopping, and collecting more data.\n" + ] + } + ], + "source": [ + "question = \"How should I explain overfitting in an interview?\"\n", + "\n", + "results = query_pipeline.run(\n", + " {\n", + " \"text_embedder\": {\"text\": question},\n", + " \"prompt_builder\": {\"question\": question},\n", + " }\n", + ")\n", + "\n", + "for reply in results[\"generator\"][\"replies\"]:\n", + " print(reply.text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00, 25.82it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The STAR method is a structured approach used in behavioral interview answers. It stands for Situation, Task, Action, and Result. 
This method helps candidates provide concise and evidence-based responses.\n" + ] + } + ], + "source": [ + "question = \"What is the STAR method?\"\n", + "\n", + "results = query_pipeline.run(\n", + " {\n", + " \"text_embedder\": {\"text\": question},\n", + " \"prompt_builder\": {\"question\": question},\n", + " }\n", + ")\n", + "\n", + "for reply in results[\"generator\"][\"replies\"]:\n", + " print(reply.text)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 233cc2985c271c165cbaf288b9d5e0e716652c4f Mon Sep 17 00:00:00 2001 From: manishpanda01 Date: Fri, 27 Mar 2026 11:01:14 +0000 Subject: [PATCH 2/2] chore: add notebook entry to index.toml --- index.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/index.toml b/index.toml index 6809dc6..e9555d1 100644 --- a/index.toml +++ b/index.toml @@ -368,3 +368,9 @@ title = "LinkedIn, Company Intelligence & Lead Enrichment with Haystack, MongoDB notebook = "ai_sales_research_assistant.ipynb" new = true topics = ["RAG", "Web-QA"] + +[[notebooks]] +title = "Interview Preparation RAG Pipeline" +notebook = "interview_preparation_rag.ipynb" +new = true +topics = ["RAG"]