diff --git a/notebooks/cross_lingual_hybrid_retrieval.ipynb b/notebooks/cross_lingual_hybrid_retrieval.ipynb new file mode 100644 index 0000000..edee994 --- /dev/null +++ b/notebooks/cross_lingual_hybrid_retrieval.ipynb @@ -0,0 +1,7020 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ce9c5149", + "metadata": {}, + "source": [ + "# Cross-lingual Hybrid Retrieval with Haystack\n", + "\n", + "When building search systems for multilingual content, a common challenge arises: **keyword-based retrieval (BM25) only matches documents in the same language as the query**, while **dense retrieval with multilingual embeddings can bridge languages but may miss exact term matches**.\n", + "\n", + "This cookbook demonstrates how to build a **hybrid retrieval pipeline** in Haystack that combines BM25 and multilingual dense embeddings to handle cross-lingual search effectively. We'll work with a mixed Chinese-English document collection and show how hybrid retrieval outperforms either method alone.\n", + "\n", + "**What you'll learn:**\n", + "- How BM25 fails on cross-lingual queries\n", + "- How multilingual dense embeddings enable cross-lingual retrieval\n", + "- How to combine both with Haystack's `DocumentJoiner` using Reciprocal Rank Fusion\n", + "- How to build a complete cross-lingual RAG pipeline with a generator\n", + "\n", + "> 💡 **Real-world motivation:** In e-commerce platforms serving multiple markets, product descriptions may exist in different languages. A user searching in English should still find relevant Chinese-language product pages, and vice versa. This pattern applies broadly to multilingual knowledge bases, academic literature, and enterprise document search." + ] + }, + { + "cell_type": "markdown", + "id": "f258b477", + "metadata": {}, + "source": [ + "## Install Dependencies\n", + "\n", + "We'll use Haystack's in-memory document store (no external database needed) with `sentence-transformers` for multilingual embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "784cd52d", + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-24T14:17:21.742972Z", + "iopub.status.busy": "2026-02-24T14:17:21.742628Z", + "iopub.status.idle": "2026-02-24T14:17:22.145654Z", + "shell.execute_reply": "2026-02-24T14:17:22.144984Z" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "pip install -q haystack-ai \"sentence-transformers>=3.0.0\"" + ] + }, + { + "cell_type": "markdown", + "id": "7034f30c", + "metadata": {}, + "source": [ + "## Prepare a Multilingual Document Collection\n", + "\n", + "Let's create a small but realistic dataset of documents in both English and Chinese. These cover overlapping topics — renewable energy, AI policy, and urban planning — so we can test whether retrieval works **across language boundaries**." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0036f66a", + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-24T14:17:22.147444Z", + "iopub.status.busy": "2026-02-24T14:17:22.147324Z", + "iopub.status.idle": "2026-02-24T14:17:22.977940Z", + "shell.execute_reply": "2026-02-24T14:17:22.977481Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total documents: 10\n", + "English: 5\n", + "Chinese: 5\n" + ] + } + ], + "source": [ + "from haystack import Document\n", + "\n", + "documents = [\n", + " # English documents\n", + " Document(\n", + " content=\"Solar panel efficiency has improved significantly in recent years. \"\n", + " \"Modern photovoltaic cells can convert over 22% of sunlight into electricity, \"\n", + " \"making rooftop solar installations increasingly cost-effective for homeowners.\",\n", + " meta={\"language\": \"en\", \"topic\": \"renewable_energy\"}\n", + " ),\n", + " Document(\n", + " content=\"The European Union has set ambitious carbon neutrality targets for 2050. \"\n", + " \"Key policies include the Emissions Trading System, renewable energy mandates, \"\n", + " \"and substantial funding for green hydrogen research.\",\n", + " meta={\"language\": \"en\", \"topic\": \"climate_policy\"}\n", + " ),\n", + " Document(\n", + " content=\"Large language models are transforming how developers write code. \"\n", + " \"AI-assisted programming tools can suggest entire functions, detect bugs, \"\n", + " \"and explain complex codebases, significantly boosting productivity.\",\n", + " meta={\"language\": \"en\", \"topic\": \"ai_programming\"}\n", + " ),\n", + " Document(\n", + " content=\"Urban green spaces play a crucial role in reducing heat island effects. \"\n", + " \"Cities that invest in parks, green roofs, and tree-lined streets see measurable \"\n", + " \"improvements in air quality and residents' mental health.\",\n", + " meta={\"language\": \"en\", \"topic\": \"urban_planning\"}\n", + " ),\n", + " Document(\n", + " content=\"Wind power capacity reached record levels globally in 2024. \"\n", + " \"Offshore wind farms in particular are becoming major contributors \"\n", + " \"to national energy grids across Europe and East Asia.\",\n", + " meta={\"language\": \"en\", \"topic\": \"renewable_energy\"}\n", + " ),\n", + " # Chinese documents\n", + " Document(\n", + " content=\"中国的碳中和目标要求在2060年前实现净零排放。\"\n", + " \"主要措施包括大规模发展光伏发电、推进电动汽车普及、\"\n", + " \"以及建立全国性的碳排放交易市场。\",\n", + " meta={\"language\": \"zh\", \"topic\": \"climate_policy\"}\n", + " ),\n", + " Document(\n", + " content=\"深度学习技术在自然语言处理领域取得了重大突破。\"\n", + " \"基于Transformer架构的大语言模型能够理解上下文语义,\"\n", + " \"在机器翻译、文本摘要和代码生成等任务上表现优异。\",\n", + " meta={\"language\": \"zh\", \"topic\": \"ai_programming\"}\n", + " ),\n", + " Document(\n", + " content=\"城市热岛效应是现代都市面临的重要环境问题。\"\n", + " \"通过增加城市绿化面积、推广绿色屋顶和透水路面,\"\n", + " \"可以有效降低城区温度并改善居民生活环境。\",\n", + " meta={\"language\": \"zh\", \"topic\": \"urban_planning\"}\n", + " ),\n", + " Document(\n", + " content=\"新型钙钛矿太阳能电池的转换效率已突破25%。\"\n", + " \"与传统硅基电池相比,钙钛矿电池制造成本更低,\"\n", + " \"柔性基底的特性使其可以应用于建筑外墙和便携设备。\",\n", + " meta={\"language\": \"zh\", \"topic\": \"renewable_energy\"}\n", + " ),\n", + " Document(\n", + " content=\"检索增强生成(RAG)技术通过结合外部知识库来提升大模型的准确性。\"\n", + " \"系统首先从文档集合中检索相关段落,然后将检索结果作为上下文\"\n", + " \"输入给生成模型,从而减少幻觉问题并提供可追溯的信息来源。\",\n", + " meta={\"language\": \"zh\", \"topic\": \"rag\"}\n", + " ),\n", + "]\n", + "\n", + "print(f\"Total documents: {len(documents)}\")\n", + "print(f\"English: {sum(1 for d in documents if d.meta['language'] == 'en')}\")\n", + "print(f\"Chinese: {sum(1 for d in documents if d.meta['language'] == 'zh')}\")" + ] + }, + { + "cell_type": "markdown", + "id": "db1fd927", + "metadata": {}, + "source": [ + "## Approach 1: BM25 Retrieval (Keyword-based)\n", + "\n", + "Let's first try BM25, the classic keyword-matching algorithm. BM25 works by matching exact terms between the query and documents.\n", + "\n", + "BM25 relies on **whitespace tokenization** to split text into terms. This works well for space-delimited languages like English, but presents a fundamental limitation for languages like Chinese, Japanese, or Thai where words are not separated by spaces. Haystack's `InMemoryBM25Retriever` uses whitespace-based tokenization, so:\n", + "\n", + "- **English → English**: Works normally\n", + "- **English → Chinese**: No matches (different scripts)\n", + "- **Chinese → Chinese**: Poor results (entire sentences become single \"tokens\" without word segmentation)\n", + "\n", + "Let's verify this limitation:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "61701b4b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-24T14:17:22.979291Z", + "iopub.status.busy": "2026-02-24T14:17:22.979152Z", + "iopub.status.idle": "2026-02-24T14:17:22.986536Z", + "shell.execute_reply": "2026-02-24T14:17:22.986206Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query: 'carbon neutrality policies and emission targets'\n", + "Results found: 5\n", + "\n", + " [1] (lang=en, score=9.3893)\n", + " The European Union has set ambitious carbon neutrality targets for 2050. Key pol...\n", + "\n", + " [2] (lang=en, score=6.1257)\n", + " Urban green spaces play a crucial role in reducing heat island effects. Cities t...\n", + "\n", + " [3] (lang=en, score=5.9469)\n", + " Large language models are transforming how developers write code. AI-assisted pr...\n", + "\n", + " [4] (lang=en, score=5.9372)\n", + " Wind power capacity reached record levels globally in 2024. Offshore wind farms ...\n", + "\n", + " [5] (lang=en, score=5.5397)\n", + " Solar panel efficiency has improved significantly in recent years. Modern photov...\n", + "\n" + ] + } + ], + "source": [ + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "from haystack.components.retrievers.in_memory import InMemoryBM25Retriever\n", + "\n", + "# Create document store and write documents\n", + "bm25_store = InMemoryDocumentStore()\n", + "bm25_store.write_documents(documents)\n", + "\n", + "bm25_retriever = InMemoryBM25Retriever(document_store=bm25_store, top_k=5)\n", + "\n", + "# Test: English query about carbon neutrality\n", + "query = \"carbon neutrality policies and emission targets\"\n", + "results = bm25_retriever.run(query=query)\n", + "\n", + "print(f\"Query: '{query}'\")\n", + "print(f\"Results found: {len(results['documents'])}\\n\")\n", + "for i, doc in enumerate(results[\"documents\"]):\n", + " print(f\" [{i+1}] (lang={doc.meta['language']}, score={doc.score:.4f})\")\n", + " print(f\" {doc.content[:80]}...\\n\")\n", + "\n", + "if len(results[\"documents\"]) == 0:\n", + " print(\" (No results — BM25 could not match any documents)\")" + ] + }, + { + "cell_type": "markdown", + "id": "4cfd623a", + "metadata": {}, + "source": [ + "As expected, BM25 can only find documents that share the same script and vocabulary as the query. The Chinese document about China's carbon neutrality goals (碳中和) is completely invisible to English keyword matching.\n", + "\n", + "> ⚠️ **Note on Chinese BM25:** Haystack's `InMemoryDocumentStore` uses whitespace-based tokenization. Since Chinese text has no spaces between words, BM25 treats entire Chinese sentences as single tokens, leading to poor recall even for Chinese-to-Chinese queries. In production systems, you would use a document store with proper CJK tokenization (e.g., Elasticsearch with `ik_analyzer` or OpenSearch with CJK plugins)." + ] + }, + { + "cell_type": "markdown", + "id": "d9c93a84", + "metadata": {}, + "source": [ + "## Approach 2: Dense Retrieval (Multilingual Embeddings)\n", + "\n", + "Now let's use a **multilingual embedding model** that maps both Chinese and English text into a shared semantic vector space. We'll use `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`, which supports 50+ languages.\n", + "\n", + "The key insight: semantically similar content in different languages gets mapped to nearby vectors, enabling cross-lingual retrieval without any tokenization workarounds." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "89ac58c5", + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-24T14:17:22.987790Z", + "iopub.status.busy": "2026-02-24T14:17:22.987710Z", + "iopub.status.idle": "2026-02-24T14:17:28.767367Z", + "shell.execute_reply": "2026-02-24T14:17:28.766920Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f737c99bcd144113aebe1c7199acf44b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading weights: 0%| | 0/199 [00:00\n", + "🚅 Components\n", + " - text_embedder: SentenceTransformersTextEmbedder\n", + " - bm25_retriever: InMemoryBM25Retriever\n", + " - embedding_retriever: InMemoryEmbeddingRetriever\n", + " - joiner: DocumentJoiner\n", + "🛤️ Connections\n", + " - text_embedder.embedding -> embedding_retriever.query_embedding (list[float])\n", + " - bm25_retriever.documents -> joiner.documents (list[Document])\n", + " - embedding_retriever.documents -> joiner.documents (list[Document])\n", + "\n" + ] + } + ], + "source": [ + "from haystack import Pipeline\n", + "from haystack.components.joiners import DocumentJoiner\n", + "from haystack.components.retrievers.in_memory import (\n", + " InMemoryBM25Retriever,\n", + " InMemoryEmbeddingRetriever,\n", + ")\n", + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", + "\n", + "# dense_store already has embeddings, and InMemoryDocumentStore supports BM25 natively\n", + "\n", + "hybrid_pipeline = Pipeline()\n", + "\n", + "# Components\n", + "hybrid_pipeline.add_component(\n", + " \"text_embedder\",\n", + " SentenceTransformersTextEmbedder(model=EMBEDDING_MODEL)\n", + ")\n", + "hybrid_pipeline.add_component(\n", + " \"bm25_retriever\",\n", + " InMemoryBM25Retriever(document_store=dense_store, top_k=5)\n", + ")\n", + "hybrid_pipeline.add_component(\n", + " \"embedding_retriever\",\n", + " InMemoryEmbeddingRetriever(document_store=dense_store, top_k=5)\n", + ")\n", + "hybrid_pipeline.add_component(\n", + " \"joiner\",\n", + " DocumentJoiner(join_mode=\"reciprocal_rank_fusion\", top_k=5)\n", + ")\n", + "\n", + "# Connections\n", + "hybrid_pipeline.connect(\"text_embedder.embedding\", \"embedding_retriever.query_embedding\")\n", + "hybrid_pipeline.connect(\"bm25_retriever.documents\", \"joiner.documents\")\n", + "hybrid_pipeline.connect(\"embedding_retriever.documents\", \"joiner.documents\")\n", + "\n", + "print(hybrid_pipeline)" + ] + }, + { + "cell_type": "markdown", + "id": "7ed703e9", + "metadata": {}, + "source": [ + "Now let's run the hybrid pipeline with our cross-lingual test queries:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "479a2e9e", + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-24T14:17:28.822897Z", + "iopub.status.busy": "2026-02-24T14:17:28.822817Z", + "iopub.status.idle": "2026-02-24T14:17:29.610005Z", + "shell.execute_reply": "2026-02-24T14:17:29.609437Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "25caa95ec18448d8a03303e4a589f460", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00\n", + "🚅 Components\n", + " - text_embedder: SentenceTransformersTextEmbedder\n", + " - bm25_retriever: InMemoryBM25Retriever\n", + " - embedding_retriever: InMemoryEmbeddingRetriever\n", + " - joiner: DocumentJoiner\n", + " - prompt_builder: ChatPromptBuilder\n", + " - llm: HuggingFaceLocalChatGenerator\n", + "🛤️ Connections\n", + " - text_embedder.embedding -> embedding_retriever.query_embedding (list[float])\n", + " - bm25_retriever.documents -> joiner.documents (list[Document])\n", + " - embedding_retriever.documents -> joiner.documents (list[Document])\n", + " - joiner.documents -> prompt_builder.documents (list[Document])\n", + " - prompt_builder.prompt -> llm.messages (list[ChatMessage])\n", + "\n" + ] + } + ], + "source": [ + "from haystack.components.builders import ChatPromptBuilder\n", + "from haystack.components.generators.chat import HuggingFaceLocalChatGenerator\n", + "from haystack.dataclasses import ChatMessage\n", + "\n", + "rag_pipeline = Pipeline()\n", + "\n", + "# Retrieval components (same as before)\n", + "rag_pipeline.add_component(\n", + " \"text_embedder\",\n", + " SentenceTransformersTextEmbedder(model=EMBEDDING_MODEL)\n", + ")\n", + "rag_pipeline.add_component(\n", + " \"bm25_retriever\",\n", + " InMemoryBM25Retriever(document_store=dense_store, top_k=5)\n", + ")\n", + "rag_pipeline.add_component(\n", + " \"embedding_retriever\",\n", + " InMemoryEmbeddingRetriever(document_store=dense_store, top_k=5)\n", + ")\n", + "rag_pipeline.add_component(\n", + " \"joiner\",\n", + " DocumentJoiner(join_mode=\"reciprocal_rank_fusion\", top_k=3)\n", + ")\n", + "\n", + "# Generation components\n", + "template = [\n", + " ChatMessage.from_system(\n", + " \"You are a helpful multilingual assistant. Answer the question based on the \"\n", + " \"provided context documents, which may be in English or Chinese. Synthesize \"\n", + " \"information from all relevant documents regardless of their language. \"\n", + " \"Answer in the same language as the question.\"\n", + " ),\n", + " ChatMessage.from_user(\n", + " \"Context:\\n\"\n", + " \"{% for doc in documents %}\\n\"\n", + " \"[{{ doc.meta.language | upper }}] {{ doc.content }}\\n\"\n", + " \"{% endfor %}\\n\\n\"\n", + " \"Question: {{ query }}\\n\"\n", + " ),\n", + "]\n", + "\n", + "rag_pipeline.add_component(\n", + " \"prompt_builder\",\n", + " ChatPromptBuilder(template=template)\n", + ")\n", + "rag_pipeline.add_component(\n", + " \"llm\",\n", + " HuggingFaceLocalChatGenerator(model=\"Qwen/Qwen3-0.6B\")\n", + ")\n", + "\n", + "# Connect retrieval\n", + "rag_pipeline.connect(\"text_embedder.embedding\", \"embedding_retriever.query_embedding\")\n", + "rag_pipeline.connect(\"bm25_retriever.documents\", \"joiner.documents\")\n", + "rag_pipeline.connect(\"embedding_retriever.documents\", \"joiner.documents\")\n", + "\n", + "# Connect generation\n", + "rag_pipeline.connect(\"joiner.documents\", \"prompt_builder.documents\")\n", + "rag_pipeline.connect(\"prompt_builder\", \"llm\")\n", + "print(rag_pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "fa200f65", + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-24T14:17:30.350738Z", + "iopub.status.busy": "2026-02-24T14:17:30.350595Z", + "iopub.status.idle": "2026-02-24T14:17:34.594228Z", + "shell.execute_reply": "2026-02-24T14:17:34.593667Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "29735207967a4bd28d5f479d95a7d530", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading weights: 0%| | 0/311 [00:00