diff --git a/evaluation/bo767_recall.ipynb b/evaluation/bo767_recall.ipynb index 10deca351b..fc89808d35 100644 --- a/evaluation/bo767_recall.ipynb +++ b/evaluation/bo767_recall.ipynb @@ -5,28 +5,15 @@ "id": "3c4c7d5f-51fb-4879-8fd3-d304165ffd38", "metadata": {}, "source": [ - "# Evaluate bo767 retrieval recall accuracy with NV-Ingest and Milvus" + "# Evaluate bo767 retrieval recall accuracy with NeMo Retriever" ] }, { "cell_type": "markdown", - "id": "3a453802-83f4-4fa5-95f2-b663dfeec59b", + "id": "f6f80e87", "metadata": {}, "source": [ - "In this notebook, we'll use NV-ingest and LlamaIndex to get the end-to-end recall accuracy of a retrieval pipeline made up of NV-Ingest's extraction and embedding tasks and a Milvus vector database (VDB)." - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "1c174e25-ffdf-4764-bad5-e3be8cb00943", - "metadata": {}, - "outputs": [], - "source": [ - "from pymilvus import MilvusClient\n", - "\n", - "milvus_client = MilvusClient(\"http://localhost:19530\")\n", - "milvus_client.drop_collection(collection_name='bo767')" + "To download the bo767 PDF corpus, please refer to [digital_corpora_download.ipynb](https://github.com/NVIDIA/NeMo-Retriever/blob/main/evaluation/digital_corpora_download.ipynb)" ] }, { @@ -34,7 +21,7 @@ "id": "0d3116aa-2992-4798-bae6-42a4e3cac58f", "metadata": {}, "source": [ - "## Ingestion" + "### CLI" ] }, { @@ -44,350 +31,75 @@ "metadata": {}, "outputs": [], "source": [ - "from nv_ingest_client.client import Ingestor\n", - "\n", - "ingestor = (\n", - " Ingestor()\n", - " .files(\"../data/nv-ingest/bo767/*.pdf\")\n", - " .extract(\n", - " extract_text=True,\n", - " extract_tables=True,\n", - " extract_charts=True,\n", - " extract_images=False,\n", - " text_depth=\"page\",\n", - " ).embed()\n", - " .vdb_upload(\n", - " collection_name=\"bo767\",\n", - " stream=False,\n", - " )\n", - ")\n", - "\n", - "results = ingestor.ingest(show_progress=True)" + "%%bash\n", + "retriever pipeline run path/to/bo767/pdfs \\\n", + " --vdb-kwargs-json '{\"uri\":\"../lancedb\",\"table_name\":\"bo767\"}' \\\n", + " --evaluation-mode beir \\\n", + " --beir-dataset-name bo767 \\\n", + " --quiet" ] }, { - "cell_type": "code", - "execution_count": 37, + "cell_type": "markdown", "id": "e25582b6-005b-47d2-8b47-b0823422bda9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "767" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "len(results)" + "### Python" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "9f1dba08-c468-425f-9eb3-48fe568b67c7", "metadata": {}, "outputs": [], "source": [ - "# Optional: save results\n", - "import pickle\n", + "from pathlib import Path\n", "\n", - "with open('bo767_results.pkl', 'wb') as f:\n", - " pickle.dump(results, f)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "f4cd3db7-c8a4-478e-9b48-3c8fffe4d32c", - "metadata": {}, - "outputs": [], - "source": [ - "# Optional: load results\n", - "import pickle\n", + "from nemo_retriever import create_ingestor\n", + "from nemo_retriever.model import VL_EMBED_MODEL\n", + "from nemo_retriever.params import EmbedParams, VdbUploadParams\n", + "from nemo_retriever.recall.beir import BeirConfig, evaluate_lancedb_beir, resolve_beir_dataset_options\n", "\n", - "with open('bo767_results.pkl', 'rb') as f:\n", - " results = pickle.load(f)" - ] - }, - { - "cell_type": "markdown", - "id": "5bb4ae87-f04d-42ce-aa70-579b9b461172", - "metadata": {}, - "source": [ - "## Recall" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "2667d436-0e13-4539-b679-2c922b6069a5", - "metadata": {}, - "outputs": [], - "source": [ - "from nv_ingest_client.util.milvus import nvingest_retrieval\n", - "from collections import defaultdict\n", - "import os\n", - "import numpy as np\n", + "input_path = str(Path(\"path/to/bo767/pdfs\").resolve())\n", + "lancedb_uri = str(Path(\"../lancedb\").resolve())\n", + "table_name = \"bo767\"\n", "\n", - "def get_recall_scores(query_df, collection_name):\n", - " hits = defaultdict(list)\n", - " all_answers = nvingest_retrieval(\n", - " query_df[\"query\"].to_list(),\n", - " collection_name,\n", - " hybrid=False,\n", - " milvus_uri=\"http://localhost:19530\",\n", - " embedding_endpoint=\"http://localhost:8012/v1\",\n", - " model_name=\"nvidia/llama-nemotron-embed-1b-v2\",\n", - " top_k=10,\n", - " gpu_search=False,\n", + "result = (\n", + " create_ingestor(run_mode=\"batch\")\n", + " .files(input_path)\n", + " .extract()\n", + " .embed(EmbedParams(model_name=VL_EMBED_MODEL))\n", + " .vdb_upload(\n", + " VdbUploadParams(\n", + " vdb_op=\"lancedb\",\n", + " vdb_kwargs={\"uri\": lancedb_uri, \"table_name\": table_name, \"overwrite\": True},\n", + " )\n", " )\n", + " .ingest()\n", + ")\n", "\n", - " for i in range(len(query_df)):\n", - " expected_pdf_page = query_df['pdf_page'][i]\n", - " retrieved_answers = all_answers[i]\n", - " retrieved_pdfs = [os.path.basename(result['entity']['source']['source_id']).split('.')[0] for result in retrieved_answers]\n", - " retrieved_pages = [str(result['entity']['content_metadata']['page_number']) for result in retrieved_answers]\n", - " retrieved_pdf_pages = [f\"{pdf}_{page}\" for pdf, page in zip(retrieved_pdfs, retrieved_pages)] \n", + "beir = resolve_beir_dataset_options(dataset_name=\"bo767\")\n", "\n", - " for k in [1, 3, 5, 10]:\n", - " hits[k].append(expected_pdf_page in retrieved_pdf_pages[:k])\n", - " \n", - " for k in hits:\n", - " print(f' - Recall @{k}: {np.mean(hits[k]) :.3f}')" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "50cf7163-668a-47e4-904e-10fc8fae594c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
| \n", - " | query | \n", - "page | \n", - "modality | \n", - "pdf_page | \n", - "|
|---|---|---|---|---|---|
| 0 | \n", - "How much was the ARtillery Intelligence projec... | \n", - "1102434 | \n", - "19 | \n", - "text | \n", - "1102434_20 | \n", - "
| 1 | \n", - "How much revenue of AR advertising is expected... | \n", - "1102434 | \n", - "3 | \n", - "text | \n", - "1102434_4 | \n", - "
| 2 | \n", - "What types of statistics were utilized by Rein... | \n", - "1096078 | \n", - "3 | \n", - "text | \n", - "1096078_4 | \n", - "
| 3 | \n", - "What was the maximum amount requested for cond... | \n", - "1054125 | \n", - "1 | \n", - "text | \n", - "1054125_2 | \n", - "
| 4 | \n", - "What is the median household income for the Ci... | \n", - "1246906 | \n", - "7 | \n", - "text | \n", - "1246906_8 | \n", - "
| ... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
| 986 | \n", - "After the 2008 recession, what percentage of p... | \n", - "2384395 | \n", - "6 | \n", - "chart | \n", - "2384395_7 | \n", - "
| 987 | \n", - "what were the top 3 major religious groups in ... | \n", - "2392676 | \n", - "5 | \n", - "chart | \n", - "2392676_6 | \n", - "
| 988 | \n", - "What percentage of people in the world identif... | \n", - "2392676 | \n", - "5 | \n", - "chart | \n", - "2392676_6 | \n", - "
| 989 | \n", - "Between 2003 and 2019, has the household mortg... | \n", - "2410699 | \n", - "189 | \n", - "chart | \n", - "2410699_190 | \n", - "
| 990 | \n", - "When did the total household mortgage debt in ... | \n", - "2410699 | \n", - "189 | \n", - "chart | \n", - "2410699_190 | \n", - "
991 rows × 5 columns
\n", - "