diff --git a/evaluation/bo767_recall.ipynb b/evaluation/bo767_recall.ipynb index 10deca351b..fc89808d35 100644 --- a/evaluation/bo767_recall.ipynb +++ b/evaluation/bo767_recall.ipynb @@ -5,28 +5,15 @@ "id": "3c4c7d5f-51fb-4879-8fd3-d304165ffd38", "metadata": {}, "source": [ - "# Evaluate bo767 retrieval recall accuracy with NV-Ingest and Milvus" + "# Evaluate bo767 retrieval recall accuracy with NeMo Retriever" ] }, { "cell_type": "markdown", - "id": "3a453802-83f4-4fa5-95f2-b663dfeec59b", + "id": "f6f80e87", "metadata": {}, "source": [ - "In this notebook, we'll use NV-ingest and LlamaIndex to get the end-to-end recall accuracy of a retrieval pipeline made up of NV-Ingest's extraction and embedding tasks and a Milvus vector database (VDB)." - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "1c174e25-ffdf-4764-bad5-e3be8cb00943", - "metadata": {}, - "outputs": [], - "source": [ - "from pymilvus import MilvusClient\n", - "\n", - "milvus_client = MilvusClient(\"http://localhost:19530\")\n", - "milvus_client.drop_collection(collection_name='bo767')" + "To download the bo767 PDF corpus, please refer to [digital_corpora_download.ipynb](https://github.com/NVIDIA/NeMo-Retriever/blob/main/evaluation/digital_corpora_download.ipynb). The cells below ingest those PDFs into a LanceDB table and report BEIR retrieval metrics, using either the CLI or the Python API; replace `path/to/bo767/pdfs` with the directory that holds the downloaded PDFs." ] }, { @@ -34,7 +21,7 @@ "id": "0d3116aa-2992-4798-bae6-42a4e3cac58f", "metadata": {}, "source": [ - "## Ingestion" + "### CLI" ] }, { @@ -44,350 +31,75 @@ "metadata": {}, "outputs": [], "source": [ - "from nv_ingest_client.client import Ingestor\n", - "\n", - "ingestor = (\n", - " Ingestor()\n", - " .files(\"../data/nv-ingest/bo767/*.pdf\")\n", - " .extract(\n", - " extract_text=True,\n", - " extract_tables=True,\n", - " extract_charts=True,\n", - " extract_images=False,\n", - " text_depth=\"page\",\n", - " ).embed()\n", - " .vdb_upload(\n", - " collection_name=\"bo767\",\n", - " stream=False,\n", - " )\n", - ")\n", - "\n", - "results = ingestor.ingest(show_progress=True)" + "%%bash\n", + "retriever pipeline run 
path/to/bo767/pdfs \\\n", + " --vdb-kwargs-json '{\"uri\":\"../lancedb\",\"table_name\":\"bo767\"}' \\\n", + " --evaluation-mode beir \\\n", + " --beir-dataset-name bo767 \\\n", + " --quiet" ] }, { - "cell_type": "code", - "execution_count": 37, + "cell_type": "markdown", "id": "e25582b6-005b-47d2-8b47-b0823422bda9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "767" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "len(results)" + "### Python" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "9f1dba08-c468-425f-9eb3-48fe568b67c7", "metadata": {}, "outputs": [], "source": [ - "# Optional: save results\n", - "import pickle\n", + "from pathlib import Path\n", "\n", - "with open('bo767_results.pkl', 'wb') as f:\n", - " pickle.dump(results, f)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "f4cd3db7-c8a4-478e-9b48-3c8fffe4d32c", - "metadata": {}, - "outputs": [], - "source": [ - "# Optional: load results\n", - "import pickle\n", + "from nemo_retriever import create_ingestor\n", + "from nemo_retriever.model import VL_EMBED_MODEL\n", + "from nemo_retriever.params import EmbedParams, VdbUploadParams\n", + "from nemo_retriever.recall.beir import BeirConfig, evaluate_lancedb_beir, resolve_beir_dataset_options\n", "\n", - "with open('bo767_results.pkl', 'rb') as f:\n", - " results = pickle.load(f)" - ] - }, - { - "cell_type": "markdown", - "id": "5bb4ae87-f04d-42ce-aa70-579b9b461172", - "metadata": {}, - "source": [ - "## Recall" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "2667d436-0e13-4539-b679-2c922b6069a5", - "metadata": {}, - "outputs": [], - "source": [ - "from nv_ingest_client.util.milvus import nvingest_retrieval\n", - "from collections import defaultdict\n", - "import os\n", - "import numpy as np\n", + "input_path = str(Path(\"path/to/bo767/pdfs\").resolve())\n", + "lancedb_uri = 
str(Path(\"../lancedb\").resolve())\n", + "table_name = \"bo767\"\n", "\n", - "def get_recall_scores(query_df, collection_name):\n", - " hits = defaultdict(list)\n", - " all_answers = nvingest_retrieval(\n", - " query_df[\"query\"].to_list(),\n", - " collection_name,\n", - " hybrid=False,\n", - " milvus_uri=\"http://localhost:19530\",\n", - " embedding_endpoint=\"http://localhost:8012/v1\",\n", - " model_name=\"nvidia/llama-nemotron-embed-1b-v2\",\n", - " top_k=10,\n", - " gpu_search=False,\n", + "result = (\n", + " create_ingestor(run_mode=\"batch\")\n", + " .files(input_path)\n", + " .extract()\n", + " .embed(EmbedParams(model_name=VL_EMBED_MODEL))\n", + " .vdb_upload(\n", + " VdbUploadParams(\n", + " vdb_op=\"lancedb\",\n", + " vdb_kwargs={\"uri\": lancedb_uri, \"table_name\": table_name, \"overwrite\": True},\n", + " )\n", " )\n", + " .ingest()\n", + ")\n", "\n", - " for i in range(len(query_df)):\n", - " expected_pdf_page = query_df['pdf_page'][i]\n", - " retrieved_answers = all_answers[i]\n", - " retrieved_pdfs = [os.path.basename(result['entity']['source']['source_id']).split('.')[0] for result in retrieved_answers]\n", - " retrieved_pages = [str(result['entity']['content_metadata']['page_number']) for result in retrieved_answers]\n", - " retrieved_pdf_pages = [f\"{pdf}_{page}\" for pdf, page in zip(retrieved_pdfs, retrieved_pages)] \n", + "beir = resolve_beir_dataset_options(dataset_name=\"bo767\")\n", "\n", - " for k in [1, 3, 5, 10]:\n", - " hits[k].append(expected_pdf_page in retrieved_pdf_pages[:k])\n", - " \n", - " for k in hits:\n", - " print(f' - Recall @{k}: {np.mean(hits[k]) :.3f}')" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "50cf7163-668a-47e4-904e-10fc8fae594c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
querypdfpagemodalitypdf_page
0How much was the ARtillery Intelligence projec...110243419text1102434_20
1How much revenue of AR advertising is expected...11024343text1102434_4
2What types of statistics were utilized by Rein...10960783text1096078_4
3What was the maximum amount requested for cond...10541251text1054125_2
4What is the median household income for the Ci...12469067text1246906_8
..................
986After the 2008 recession, what percentage of p...23843956chart2384395_7
987what were the top 3 major religious groups in ...23926765chart2392676_6
988What percentage of people in the world identif...23926765chart2392676_6
989Between 2003 and 2019, has the household mortg...2410699189chart2410699_190
990When did the total household mortgage debt in ...2410699189chart2410699_190
\n", - "

991 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " query pdf page \\\n", - "0 How much was the ARtillery Intelligence projec... 1102434 19 \n", - "1 How much revenue of AR advertising is expected... 1102434 3 \n", - "2 What types of statistics were utilized by Rein... 1096078 3 \n", - "3 What was the maximum amount requested for cond... 1054125 1 \n", - "4 What is the median household income for the Ci... 1246906 7 \n", - ".. ... ... ... \n", - "986 After the 2008 recession, what percentage of p... 2384395 6 \n", - "987 what were the top 3 major religious groups in ... 2392676 5 \n", - "988 What percentage of people in the world identif... 2392676 5 \n", - "989 Between 2003 and 2019, has the household mortg... 2410699 189 \n", - "990 When did the total household mortgage debt in ... 2410699 189 \n", - "\n", - " modality pdf_page \n", - "0 text 1102434_20 \n", - "1 text 1102434_4 \n", - "2 text 1096078_4 \n", - "3 text 1054125_2 \n", - "4 text 1246906_8 \n", - ".. ... ... \n", - "986 chart 2384395_7 \n", - "987 chart 2392676_6 \n", - "988 chart 2392676_6 \n", - "989 chart 2410699_190 \n", - "990 chart 2410699_190 \n", - "\n", - "[991 rows x 5 columns]" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_query = pd.read_csv('../data/bo767_query_gt.csv')\n", - "df_query" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "c5798557-53ba-4aac-801a-aa65a1701814", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " - Recall @1: 0.582\n", - " - Recall @3: 0.796\n", - " - Recall @5: 0.854\n", - " - Recall @10: 0.903\n" - ] - } - ], - "source": [ - "get_recall_scores(df_query, \"bo767\")" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "f5c710f4-ce5a-4308-a0aa-f7231bccd82e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "text\n", - " - Recall @1: 0.611\n", - " - Recall @3: 0.816\n", - " - Recall 
@5: 0.865\n", - " - Recall @10: 0.916\n", - "table\n", - " - Recall @1: 0.455\n", - " - Recall @3: 0.719\n", - " - Recall @5: 0.796\n", - " - Recall @10: 0.860\n", - "chart\n", - " - Recall @1: 0.642\n", - " - Recall @3: 0.828\n", - " - Recall @5: 0.884\n", - " - Recall @10: 0.918\n" - ] - } - ], - "source": [ - "for modality in df_query.modality.unique():\n", - " print(modality)\n", - " get_recall_scores(df_query.query(f\"modality=='{modality}'\").reset_index(drop=True), \"bo767\")" + "dataset, raw_hits, run, metrics = evaluate_lancedb_beir(\n", + " BeirConfig(\n", + " lancedb_uri=lancedb_uri,\n", + " lancedb_table=table_name,\n", + " embedding_model=VL_EMBED_MODEL,\n", + " loader=beir.loader,\n", + " dataset_name=beir.dataset_name,\n", + " doc_id_field=beir.doc_id_field,\n", + " ks=beir.ks,\n", + " )\n", + ")\n", + "\n", + "metrics" ] }, { "cell_type": "code", "execution_count": null, - "id": "d44cea39-5b3c-4292-80a4-06066942fb6a", + "id": "249a4852", "metadata": {}, "outputs": [], "source": []