From a064a814a5a6afc79be84c23f9ac5e45d41f739d Mon Sep 17 00:00:00 2001 From: JMPonce Date: Wed, 29 Apr 2026 10:51:28 +0200 Subject: [PATCH 1/3] feat(eval): regenerate testset references from KG + ChromaDB The original references in data/evaluation/testset.json were hand-written from textbook knowledge and didn't match what's actually loaded in FAERS 2024Q3+Q4 / DailyMed. This caused ContextRecall ~ 0 in RAGAS not because the system was wrong, but because RAGAS measured ref-vs-data mismatch. This commit adds: - scripts/regenerate_references.py: per-question generators that query Neo4j (counts, interactions, outcomes, categories) or ChromaDB (label text) to produce data-grounded references. Loads .env.aura before importing pharmagraphrag so pydantic-settings picks up Aura creds. - scripts/test_aura_connection.py: smoke test against Aura. - data/evaluation/testset_v2.json: 25 regenerated refs. Originals preserved under 'original_reference' for traceability. - .gitignore: exclude .env.aura and .env.* (multi-env credentials) Notable findings: - Warfarin has no DrugCategory in the KG (BELONGS_TO missing). The KG also lacks any 'Anticoagulant' category. Refs for q13/q15 surface this honestly rather than hide the data gap. - Some DailyMed sections (e.g. mechanism_of_action for OTC omeprazole) are not present as substantive chunks. In that case fmt_label_search falls back to the semantically closest chunks from other sections (see q19, which is built from [warnings] and [indications_and_usage] chunks); it returns an explicit "no substantive passage" message only when no retrieved chunk has at least 150 characters of cleaned text. 
--- .gitignore | 2 + data/evaluation/testset_v2.json | 280 +++++++++++++++++++++ scripts/regenerate_references.py | 407 +++++++++++++++++++++++++++++++ scripts/test_aura_connection.py | 45 ++++ 4 files changed, 734 insertions(+) create mode 100644 data/evaluation/testset_v2.json create mode 100644 scripts/regenerate_references.py create mode 100644 scripts/test_aura_connection.py diff --git a/.gitignore b/.gitignore index 6e38604..66992b1 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,8 @@ data/chroma/ .env .env.local .env.production +.env.aura +.env.* # === Credentials / secrets === *.credentials diff --git a/data/evaluation/testset_v2.json b/data/evaluation/testset_v2.json new file mode 100644 index 0000000..a190564 --- /dev/null +++ b/data/evaluation/testset_v2.json @@ -0,0 +1,280 @@ +{ + "metadata": { + "version": "1.0.0", + "description": "Curated evaluation testset for PharmaGraphRAG. Questions span drug interactions, adverse events, outcomes, categories, and comparisons. Ground truth derived from FAERS data and DailyMed drug labels loaded in the knowledge graph.", + "created": "2025-05-17", + "author": "Jose Maria Ponce Bernabe", + "question_types": [ + "drug_info", + "interaction", + "adverse_event", + "outcome", + "category", + "comparison", + "multi_drug", + "label_search" + ], + "regenerated_at": "2026-04-29", + "regeneration_note": "References regenerated from actual KG (Neo4j Aura) + ChromaDB content to fix the mismatch between hand-written textbook references and the data the system retrieves. Original references preserved under 'original_reference'." 
+ }, + "samples": [ + { + "id": "q01", + "question": "What are the most common adverse events reported for aspirin?", + "reference": "The most frequently reported adverse events for Aspirin in FAERS are dyspnoea (489 reports), drug ineffective (394 reports), vomiting (374 reports), off label use (337 reports), pain (333 reports).", + "question_type": "drug_info", + "expected_tools": [ + "search_drug_info" + ], + "original_reference": "Common adverse events for aspirin include gastrointestinal haemorrhage, drug interaction, nausea, and platelet aggregation decreased, based on FAERS reports." + }, + { + "id": "q02", + "question": "Does warfarin interact with aspirin?", + "reference": "Yes, Warfarin and Aspirin have a documented interaction (source: DailyMed). Antiplatelet Agents aspirin, cilostazol, clopidogrel, dipyridamole, prasugrel, ticlopidine Non-steroidal Anti-Inflammatory Agents celecoxib, diclofenac, diflunisal, fenoprofen, ibuprofen, indomethacin, ketoprofen,…", + "question_type": "interaction", + "expected_tools": [ + "search_drug_info", + "list_drug_interactions" + ], + "original_reference": "Yes, warfarin interacts with aspirin. Co-administration increases the risk of bleeding. This interaction is documented in DailyMed drug labels." + }, + { + "id": "q03", + "question": "What drugs are known to cause hepatotoxicity?", + "reference": "Drugs reported in FAERS as causing HEPATOTOXICITY include Acetaminophen (69 reports), Methotrexate (69 reports), Prednisone (66 reports), Carboplatin (53 reports), Cyclophosphamide (40 reports).", + "question_type": "adverse_event", + "expected_tools": [ + "find_drugs_for_adverse_event" + ], + "original_reference": "Drugs reported to cause hepatotoxicity in FAERS include methotrexate, acetaminophen, and several others with varying report counts." 
+ }, + { + "id": "q04", + "question": "What are the clinical outcomes associated with metformin use?", + "reference": "Clinical outcomes documented for Metformin in FAERS include Other Serious (1900 reports), Hospitalization (1502 reports), Life-Threatening (610 reports), Death (380 reports), Disability (95 reports).", + "question_type": "outcome", + "expected_tools": [ + "search_drug_info", + "get_drug_outcomes" + ], + "original_reference": "Clinical outcomes for metformin in FAERS include hospitalisation, other serious outcomes, and death, with hospitalisation being the most frequently reported." + }, + { + "id": "q05", + "question": "Which drug category does ibuprofen belong to?", + "reference": "Ibuprofen belongs to the Nonsteroidal Anti-inflammatory Drug [EPC] category.", + "question_type": "category", + "expected_tools": [ + "search_drug_info" + ], + "original_reference": "Ibuprofen belongs to the Nonsteroidal Anti-inflammatory Drugs (NSAIDs) category." + }, + { + "id": "q06", + "question": "Compare the adverse event profiles of aspirin and ibuprofen.", + "reference": "Aspirin (Nonsteroidal Anti-inflammatory Drug [EPC]) and Ibuprofen (Nonsteroidal Anti-inflammatory Drug [EPC]) share adverse events such as drug ineffective, dyspnoea, fatigue, headache, nausea, based on FAERS data in the knowledge graph.", + "question_type": "comparison", + "expected_tools": [ + "compare_drugs" + ], + "original_reference": "Both aspirin and ibuprofen are NSAIDs and share common adverse events like gastrointestinal haemorrhage and nausea. Aspirin has more reports of platelet aggregation decreased, while ibuprofen has more reports of renal impairment." + }, + { + "id": "q07", + "question": "What does the drug label say about metformin interactions?", + "reference": "Based on the DailyMed label for Metformin: [drug_interactions] ) Drugs that reduce metformin clearance (such as ranolazine, vandetanib, dolutegravir, and cimetidine) may increase the accumulation of metformin. 
Consider the benefits and risks of concomitant use. ( 7 ) Alcohol can potentiate the effect of metformin on… [clinical_pharmacology] Therefore, the propensity of sitagliptin to be involved in clinically meaningful drug-drug interactions mediated by plasma protein binding displacement is very low. In Vivo Assessment of Drug Interactions Effects of Sitagliptin on Other Drugs In clinical…", + "question_type": "label_search", + "expected_tools": [ + "search_drug_labels" + ], + "original_reference": "According to the DailyMed drug label, metformin may interact with cationic drugs that are eliminated by renal tubular secretion, carbonic anhydrase inhibitors, and alcohol. These interactions can affect metformin clearance or increase the risk of lactic acidosis." + }, + { + "id": "q08", + "question": "What are the side effects of lisinopril?", + "reference": "The most frequently reported adverse events for Lisinopril in FAERS are angioedema (217 reports), headache (186 reports), fatigue (166 reports), nausea (138 reports), pain (138 reports).", + "question_type": "drug_info", + "expected_tools": [ + "search_drug_info" + ], + "original_reference": "Adverse events reported for lisinopril in FAERS include cough, dizziness, hypotension, renal impairment, and hyperkalaemia." + }, + { + "id": "q09", + "question": "Which drugs interact with methotrexate?", + "reference": "Drugs documented as interacting with Methotrexate include Aspirin, Celecoxib, Furosemide, Hydroxychloroquine, Meloxicam, Naproxen.", + "question_type": "interaction", + "expected_tools": [ + "list_drug_interactions", + "search_drug_info" + ], + "original_reference": "Methotrexate interacts with NSAIDs, trimethoprim, and other drugs that can reduce its clearance, increasing toxicity risk. Interactions are documented in both FAERS co-occurrence data and DailyMed labels." 
+ }, + { + "id": "q10", + "question": "What drugs cause rhabdomyolysis?", + "reference": "Drugs reported in FAERS as causing RHABDOMYOLYSIS include Atorvastatin (154 reports), Rosuvastatin (95 reports), Levetiracetam (64 reports), Acetaminophen (42 reports), Quetiapine (38 reports).", + "question_type": "adverse_event", + "expected_tools": [ + "find_drugs_for_adverse_event" + ], + "original_reference": "Drugs reported to cause rhabdomyolysis in FAERS include statins (atorvastatin, simvastatin, rosuvastatin), and other medications. Statins are the most commonly reported drug class." + }, + { + "id": "q11", + "question": "Is atorvastatin associated with death outcomes in FAERS?", + "reference": "Clinical outcomes documented for Atorvastatin in FAERS include Other Serious (2261 reports), Hospitalization (1078 reports), Disability (203 reports), Death (153 reports), Life-Threatening (150 reports).", + "question_type": "outcome", + "expected_tools": [ + "search_drug_info", + "get_drug_outcomes" + ], + "original_reference": "Yes, atorvastatin has death reported as one of its clinical outcomes in FAERS data, along with hospitalisation and other serious outcomes." + }, + { + "id": "q12", + "question": "What are the warnings for omeprazole according to its drug label?", + "reference": "Based on the DailyMed label for Omeprazole: [warnings] Acid reducers may interact with certain prescription drugs. Stop use and ask doctor if: your heartburn continues or worsens you need to take this product for more than 14 days you need to take more than 1 course of treatment every 4 months you get diarrhea… [warnings] Warnings Allergy alert: do not use if you are allergic to omeprazole omeprazole may cause severe skin reactions. Symptoms may include: skin reddening blisters rash If an allergic reaction occurs, stop use and seek medical help right away. 
Do not use if you…", + "question_type": "label_search", + "expected_tools": [ + "search_drug_labels" + ], + "original_reference": "Omeprazole warnings include risk of Clostridium difficile-associated diarrhea, bone fracture risk with long-term use, hypomagnesemia, and potential interactions with clopidogrel reducing its antiplatelet effect." + }, + { + "id": "q13", + "question": "Which drugs are in the same category as warfarin?", + "reference": "Warfarin has no DrugCategory assigned in the knowledge graph, so no peer drugs can be retrieved by category.", + "question_type": "category", + "expected_tools": [ + "search_drug_info", + "find_drugs_by_category" + ], + "original_reference": "Warfarin belongs to the anticoagulant category. Other drugs in this category include heparin and related anticoagulants found in the knowledge graph." + }, + { + "id": "q14", + "question": "What are the most common adverse events across all drugs in the database?", + "reference": "The most frequently reported adverse events across all drugs in FAERS are off label use (211,371 total reports), drug ineffective (170,463 total reports), fatigue (134,450 total reports), pain (119,009 total reports), nausea (119,000 total reports), headache (110,159 total reports).", + "question_type": "adverse_event", + "expected_tools": [ + "search_adverse_events" + ], + "original_reference": "The most commonly reported adverse events across FAERS include nausea, drug ineffective, headache, fatigue, diarrhoea, and dizziness." 
+ }, + { + "id": "q15", + "question": "Compare warfarin and apixaban in terms of safety.", + "reference": "Warfarin (no assigned category) and Apixaban (Factor Xa Inhibitor [EPC]) share adverse events such as anaemia, cerebrovascular accident, condition aggravated, drug ineffective, off label use, based on FAERS data in the knowledge graph.", + "question_type": "comparison", + "expected_tools": [ + "compare_drugs" + ], + "original_reference": "Both warfarin and apixaban are anticoagulants. Warfarin has more drug interactions documented and requires INR monitoring. Both can cause haemorrhage, but their adverse event profiles differ in frequency and severity according to FAERS data." + }, + { + "id": "q16", + "question": "Can metformin and lisinopril be taken together?", + "reference": "No direct INTERACTS_WITH relationship between Metformin and Lisinopril is documented in the knowledge graph.", + "question_type": "interaction", + "expected_tools": [ + "search_drug_info", + "list_drug_interactions" + ], + "original_reference": "Based on the knowledge graph data, metformin and lisinopril do not have a documented direct interaction. However, both can affect renal function, and monitoring is advisable." + }, + { + "id": "q17", + "question": "What are the contraindications for aspirin?", + "reference": "Based on the DailyMed label for Aspirin: [warnings] Warnings Reye's syndrome : Children and teenagers who have or are recovering from chicken pox or flu-like symptoms should not use this product. When using this product, if changes in behavior with nausea and vomiting occur, consult a doctor because these… [indications_and_usage] Uses for the temporary relief of minor aches and pains or as recommended by your doctor. Because of its delayed action, this product will not provide fast relief of headaches or other symptoms needing immediate relief. 
ask your doctor about other uses for…", + "question_type": "label_search", + "expected_tools": [ + "search_drug_labels" + ], + "original_reference": "According to the DailyMed label, aspirin is contraindicated in patients with known allergy to NSAIDs, patients with asthma, rhinitis, and nasal polyps syndrome, and in children and teenagers with viral infections due to Reye's syndrome risk." + }, + { + "id": "q18", + "question": "Search for drugs whose name contains 'statin'.", + "reference": "Drugs whose name contains 'STATIN' include Atorvastatin, Atorvastatin Calcium, Atorvastatin Calcium Trihydrate, Atorvastatin Calcium\\Ezetimibe, Chloramphenicol\\Hydrocortisone Acetate\\Metronidazole\\Nystatin, Cilastatin Sodium\\Imipenem, Endostatin, Ezetimibe\\Rosuvastatin, Ezetimibe\\Simvastatin, Fenofibrate\\Pravastatin.", + "question_type": "drug_info", + "expected_tools": [ + "search_drugs_by_name" + ], + "original_reference": "Drugs matching 'statin' include atorvastatin, simvastatin, rosuvastatin, pravastatin, and lovastatin, among others found in the knowledge graph." + }, + { + "id": "q19", + "question": "What is the mechanism of action of omeprazole?", + "reference": "Based on the DailyMed label for Omeprazole: [warnings] Acid reducers may interact with certain prescription drugs. 
Stop use and ask doctor if: your heartburn continues or worsens you need to take this product for more than 14 days you need to take more than 1 course of treatment every 4 months you get diarrhea… [indications_and_usage] Use(s) treats frequent heartburn (occurs 2 or more days a week) not intended for immediate relief of heartburn; this drug may take 1 to 4 days for full effect", + "question_type": "label_search", + "expected_tools": [ + "search_drug_labels" + ], + "original_reference": "According to the drug label, omeprazole is a proton pump inhibitor (PPI) that suppresses gastric acid secretion by specific inhibition of the H+/K+-ATPase enzyme system at the secretory surface of the gastric parietal cell." + }, + { + "id": "q20", + "question": "Which adverse events are shared by both aspirin and warfarin?", + "reference": "Adverse events reported in FAERS for both Aspirin and Warfarin include cerebrovascular accident, drug ineffective, fatigue, haematochezia, headache, nausea, off label use, pain.", + "question_type": "comparison", + "expected_tools": [ + "compare_drugs", + "search_drug_info" + ], + "original_reference": "Both aspirin and warfarin share adverse events related to bleeding, including gastrointestinal haemorrhage and haemorrhage. This is expected given their anticoagulant and antiplatelet mechanisms." + }, + { + "id": "q21", + "question": "How many adverse event reports does ibuprofen have for nausea?", + "reference": "FAERS reports 217 cases of NAUSEA associated with Ibuprofen.", + "question_type": "drug_info", + "expected_tools": [ + "search_drug_info" + ], + "original_reference": "Ibuprofen has FAERS reports linking it to nausea as an adverse event, with a specific report count available in the knowledge graph's CAUSES relationship." 
+ }, + { + "id": "q22", + "question": "What drugs should be avoided with warfarin?", + "reference": "Drugs documented as interacting with Warfarin include Allopurinol, Alprazolam, Amlodipine, Apixaban, Aripiprazole, Aspirin, Atorvastatin, Azithromycin.", + "question_type": "interaction", + "expected_tools": [ + "search_drug_info", + "list_drug_interactions" + ], + "original_reference": "Drugs that interact with warfarin include aspirin, NSAIDs (ibuprofen), certain antibiotics, and other anticoagulants. These interactions can increase bleeding risk." + }, + { + "id": "q23", + "question": "Tell me about the adverse event profile of prednisone.", + "reference": "The most frequently reported adverse events for Prednisone in FAERS are off label use (2832 reports), drug ineffective (2434 reports), pain (1627 reports), fatigue (1594 reports), dyspnoea (1514 reports).", + "question_type": "drug_info", + "expected_tools": [ + "search_drug_info" + ], + "original_reference": "Prednisone adverse events in FAERS include weight increased, insomnia, immunosuppression, hyperglycaemia, and osteoporosis among others." + }, + { + "id": "q24", + "question": "What is the dosage information for metformin?", + "reference": "Based on the DailyMed label for Metformin: [dosage_and_administration] DOSAGE AND ADMINISTRATION Take ZITUVIMET orally twice daily with meals. ( 2.1 ) Individualize the dosage of ZITUVIMET on the basis of the patient's current regimen, effectiveness, and tolerability. ( 2.1 ) The maximum recommended daily dose is 100 mg of… [dosage_and_administration] The maximum recommended daily dose is 100 mg of sitagliptin and 2,000 mg of metformin hydrochloride (HCl). Do not split or divide ZITUVIMET tablets. 
The recommended starting dose in patients not currently treated with metformin is 50 mg sitagliptin and 500 mg…", + "question_type": "label_search", + "expected_tools": [ + "search_drug_labels" + ], + "original_reference": "According to the DailyMed drug label, metformin is typically started at 500mg twice daily or 850mg once daily with meals, with gradual dose increases. Maximum recommended daily dose is 2550mg." + }, + { + "id": "q25", + "question": "Which drugs cause both nausea and headache as adverse events?", + "reference": "Drugs that report both NAUSEA and HEADACHE as adverse events, ranked by combined report count, include Prednisone (2386 combined reports), Rituximab (2306 combined reports), Acetaminophen (2233 combined reports), Methotrexate (2224 combined reports), Actemra (1993 combined reports), Humira (1911 combined reports), Orencia (1888 combined reports), Dupixent (1852 combined reports).", + "question_type": "multi_drug", + "expected_tools": [ + "find_drugs_for_adverse_event" + ], + "original_reference": "Multiple drugs in FAERS are associated with both nausea and headache, including common medications like ibuprofen, metformin, and lisinopril. These are among the most frequently reported adverse events across many drug classes." + } + ] +} \ No newline at end of file diff --git a/scripts/regenerate_references.py b/scripts/regenerate_references.py new file mode 100644 index 0000000..61945b8 --- /dev/null +++ b/scripts/regenerate_references.py @@ -0,0 +1,407 @@ +"""Regenerate testset references using actual data from the KG and vector store. + +Produces a new testset (testset_v2.json) where each reference is grounded in +what is actually loaded in Neo4j Aura (CAUSES, INTERACTS_WITH, HAS_OUTCOME, +BELONGS_TO) and ChromaDB (DailyMed labels), rather than hand-written from +textbook knowledge. + +This eliminates the ContextRecall=0 problem caused by mismatch between the +original references and the data the system retrieves. 
+ +Loads .env.aura explicitly (does not touch the local .env). Read-only. + +Usage: + uv run python scripts/regenerate_references.py \ + --testset data/evaluation/testset.json \ + --output data/evaluation/testset_v2.json +""" + +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + +from dotenv import dotenv_values + +ROOT = Path(__file__).resolve().parent.parent + +# Load Aura credentials BEFORE importing pharmagraphrag (pydantic-settings cache) +aura_env = dotenv_values(ROOT / ".env.aura") +os.environ["NEO4J_URI"] = aura_env["NEO4J_URI"] +os.environ["NEO4J_USER"] = aura_env.get("NEO4J_USER") or aura_env["NEO4J_USERNAME"] +os.environ["NEO4J_PASSWORD"] = aura_env["NEO4J_PASSWORD"] + +# Now safe to import; settings will pick up the Aura values +import re # noqa: E402 + +from pharmagraphrag.graph import queries # noqa: E402 +from pharmagraphrag.vectorstore import store # noqa: E402 + +# ---------- helpers --------------------------------------------------------- + + +def _clean_snippet(text: str, max_chars: int = 240) -> str: + """Strip ChromaDB metadata headers and partial leading words.""" + # Drop our own header markers like "Drug: METFORMIN | Section: Drug Interactions" + text = re.sub(r"Drug:\s*[A-Z0-9 .,\\/-]+\s*\|\s*Section:[^\n]*", " ", text) + text = re.sub(r"\s+", " ", text).strip() + # If the snippet starts mid-word (lowercase letter following nothing), advance to a sentence start + if text and text[0].islower(): + m = re.search(r"(?<=[.!?])\s+[A-Z]", text) + if m: + text = text[m.start() + 1 :].lstrip() + else: + # Or jump to the first capital letter + m2 = re.search(r"[A-Z]", text) + if m2: + text = text[m2.start() :] + if len(text) > max_chars: + text = text[:max_chars].rsplit(" ", 1)[0] + "…" + return text + + +def fmt_top_ae(drug: str, n: int = 5) -> str: + events = queries.get_drug_adverse_events(drug, limit=n) + if not events: + return f"No adverse events were found for {drug.title()} in FAERS." 
+ items = ", ".join(f"{e['adverse_event'].lower()} ({e['report_count']} reports)" for e in events) + return f"The most frequently reported adverse events for {drug.title()} in FAERS are {items}." + + +def fmt_drugs_for_ae(ae: str, n: int = 5) -> str: + drugs = queries.get_adverse_event_drugs(ae, limit=n) + if drugs: + items = ", ".join(f"{d['drug_name'].title()} ({d['report_count']} reports)" for d in drugs) + return f"Drugs reported in FAERS as causing {ae.upper()} include {items}." + # Fallback to fuzzy substring match on AE names + matches = queries.search_adverse_events(ae, limit=5) + if not matches: + return f"No drugs were found in FAERS for the adverse event '{ae}' or related MedDRA terms." + items = ", ".join(f"{m['name'].lower()} ({m['total_reports']} reports)" for m in matches) + return f"The exact MedDRA term '{ae.upper()}' was not found, but related terms include {items}." + + +def fmt_interaction(drug_a: str, drug_b: str) -> str: + inters = queries.get_drug_interactions(drug_a) + match = next( + (i for i in inters if i["interacting_drug"].upper() == drug_b.upper()), + None, + ) + if match: + desc = _clean_snippet(match.get("description") or "", max_chars=220) + src = (match.get("source") or "").strip() + if desc: + return ( + f"Yes, {drug_a.title()} and {drug_b.title()} have a documented " + f"interaction (source: {src or 'DailyMed'}). {desc}" + ) + return ( + f"Yes, {drug_a.title()} and {drug_b.title()} have a documented " + f"interaction in {src or 'DailyMed labels'}." + ) + return ( + f"No direct INTERACTS_WITH relationship between {drug_a.title()} and " + f"{drug_b.title()} is documented in the knowledge graph." + ) + + +def fmt_outcomes(drug: str) -> str: + outs = queries.get_drug_outcomes(drug) + if not outs: + return f"No clinical outcomes are documented for {drug.title()}." 
+ items = ", ".join( + f"{o['outcome_description'] or o['outcome_code']} ({o['report_count']} reports)" + for o in outs[:5] + ) + return f"Clinical outcomes documented for {drug.title()} in FAERS include {items}." + + +def fmt_category(drug: str) -> str: + cats = queries.get_drug_category(drug) + if not cats: + return ( + f"{drug.title()} has no DrugCategory node assigned in the knowledge graph " + f"(BELONGS_TO relationship is missing for this drug)." + ) + return f"{drug.title()} belongs to the {', '.join(cats)} category." + + +def fmt_drugs_in_same_category(drug: str) -> str: + cats = queries.get_drug_category(drug) + if not cats: + return ( + f"{drug.title()} has no DrugCategory assigned in the knowledge graph, " + f"so no peer drugs can be retrieved by category." + ) + cat = cats[0] + peers = queries.get_drugs_by_category(cat, limit=15) + names = sorted( + {p["drug_name"].title() for p in peers if p["drug_name"].upper() != drug.upper()} + ) + if not names: + return f"{drug.title()} is the only drug in the {cat} category in this knowledge graph." + return f"{drug.title()} belongs to the {cat} category, alongside {', '.join(names[:8])}." + + +def fmt_top_interactions(drug: str, n: int = 6) -> str: + inters = queries.get_drug_interactions(drug) + if not inters: + return f"No drug interactions are documented for {drug.title()}." + names = sorted({i["interacting_drug"].title() for i in inters})[:n] + return f"Drugs documented as interacting with {drug.title()} include {', '.join(names)}." 
+ + +def fmt_compare(drug_a: str, drug_b: str) -> str: + cats_a = queries.get_drug_category(drug_a) or ["no assigned category"] + cats_b = queries.get_drug_category(drug_b) or ["no assigned category"] + aes_a = {e["adverse_event"] for e in queries.get_drug_adverse_events(drug_a, limit=20)} + aes_b = {e["adverse_event"] for e in queries.get_drug_adverse_events(drug_b, limit=20)} + shared = sorted(aes_a & aes_b)[:5] + return ( + f"{drug_a.title()} ({cats_a[0]}) and {drug_b.title()} ({cats_b[0]}) " + f"share adverse events such as " + f"{', '.join(s.lower() for s in shared) if shared else 'none in the top 20 reports'}, " + f"based on FAERS data in the knowledge graph." + ) + + +def fmt_shared_ae(drug_a: str, drug_b: str) -> str: + aes_a = {e["adverse_event"] for e in queries.get_drug_adverse_events(drug_a, limit=30)} + aes_b = {e["adverse_event"] for e in queries.get_drug_adverse_events(drug_b, limit=30)} + shared = sorted(aes_a & aes_b) + if not shared: + return f"{drug_a.title()} and {drug_b.title()} share no top-30 adverse events in FAERS." + return ( + f"Adverse events reported in FAERS for both {drug_a.title()} and {drug_b.title()} " + f"include {', '.join(s.lower() for s in shared[:8])}." + ) + + +def fmt_drug_search(pattern: str) -> str: + matches = queries.search_drugs(pattern, limit=12) + if not matches: + return f"No drugs whose name contains '{pattern}' were found in the knowledge graph." + return ( + f"Drugs whose name contains '{pattern}' include " + f"{', '.join(m.title() for m in matches[:10])}." 
+ ) + + +def fmt_top_ae_overall(n: int = 6) -> str: + drv = queries._get_driver() + with drv.session() as s: + rows = s.run( + """ + MATCH (:Drug)-[r:CAUSES]->(ae:AdverseEvent) + RETURN ae.name AS name, sum(r.report_count) AS total + ORDER BY total DESC + LIMIT $n + """, + n=n, + ).data() + items = ", ".join(f"{r['name'].lower()} ({r['total']:,} total reports)" for r in rows) + return f"The most frequently reported adverse events across all drugs in FAERS are {items}." + + +def fmt_drugs_with_two_aes(ae1: str, ae2: str, n: int = 8) -> str: + drv = queries._get_driver() + with drv.session() as s: + rows = s.run( + """ + MATCH (d:Drug)-[r1:CAUSES]->(a:AdverseEvent) + WHERE toUpper(a.name) = toUpper($ae1) + WITH d, r1.report_count AS c1 + MATCH (d)-[r2:CAUSES]->(b:AdverseEvent) + WHERE toUpper(b.name) = toUpper($ae2) + WITH d.name AS name, c1 + r2.report_count AS total + RETURN name, total + ORDER BY total DESC + LIMIT $n + """, + ae1=ae1, + ae2=ae2, + n=n, + ).data() + if not rows: + return f"No drugs were found in FAERS that report both {ae1.upper()} and {ae2.upper()}." + items = ", ".join(f"{r['name'].title()} ({r['total']} combined reports)" for r in rows) + return ( + f"Drugs that report both {ae1.upper()} and {ae2.upper()} as adverse events, " + f"ranked by combined report count, include {items}." + ) + + +def fmt_ae_count(drug: str, ae: str) -> str: + drv = queries._get_driver() + with drv.session() as s: + rec = s.run( + """ + MATCH (d:Drug)-[r:CAUSES]->(a:AdverseEvent) + WHERE toUpper(d.name) = toUpper($drug) AND toUpper(a.name) = toUpper($ae) + RETURN r.report_count AS c + """, + drug=drug, + ae=ae, + ).single() + if not rec: + return f"No FAERS reports link {drug.title()} to {ae.upper()} in the knowledge graph." + return f"FAERS reports {rec['c']} cases of {ae.upper()} associated with {drug.title()}." + + +def fmt_label_search( + query: str, + drug_filter: str | None = None, + preferred_sections: tuple[str, ...] 
| None = None, +) -> str: + """Use ChromaDB to find what the DailyMed label actually says about a topic. + + Picks the best chunk by (a) preferred sections matching the question intent, + (b) semantic relevance from ChromaDB, and cleans the snippet. + """ + where = {"drug_name": drug_filter.upper()} if drug_filter else None + results = store.search(query, n_results=8, where=where) + if not results: + return f"No DailyMed label content matched '{query}'" + ( + f" for {drug_filter.title()}." if drug_filter else "." + ) + if preferred_sections: + # Tie-break only among top-4 by semantic distance — avoid promoting + # short/irrelevant chunks just because their section label matches. + top = sorted(results, key=lambda r: r.get("distance", 1.0))[:4] + ranked = sorted( + top, + key=lambda r: ( + 0 if r["metadata"].get("section") in preferred_sections else 1, + r.get("distance", 1.0), + ), + ) + else: + ranked = sorted(results, key=lambda r: r.get("distance", 1.0)) + # Pre-clean every candidate so we can filter by useful-content length + cleaned = [] + for r in ranked: + c = _clean_snippet(r["text"], max_chars=260) + if len(c) >= 150: + cleaned.append((r, c)) + chosen = cleaned[:2] + name = drug_filter.title() if drug_filter else "the drug" + if not chosen: + sections_in_results = sorted({r["metadata"].get("section", "?") for r in results}) + return ( + f"The DailyMed label chunks indexed for {name} do not contain a substantive " + f"passage matching '{query}'. Available sections in the indexed label " + f"include: {', '.join(sections_in_results)}." 
# ---------- per-id mapping --------------------------------------------------

# Maps each testset sample id to a zero-argument callable that builds that
# sample's reference. Wrapping every call in a lambda defers the work of the
# fmt_* helpers (defined earlier in this file) until main() invokes the entry.
REGENERATORS = {
    "q01": lambda: fmt_top_ae("ASPIRIN", n=5),
    "q02": lambda: fmt_interaction("WARFARIN", "ASPIRIN"),
    "q03": lambda: fmt_drugs_for_ae("HEPATOTOXICITY", n=5),
    "q04": lambda: fmt_outcomes("METFORMIN"),
    "q05": lambda: fmt_category("IBUPROFEN"),
    "q06": lambda: fmt_compare("ASPIRIN", "IBUPROFEN"),
    "q07": lambda: fmt_label_search(
        "drug interactions", "METFORMIN", preferred_sections=("drug_interactions",)
    ),
    "q08": lambda: fmt_top_ae("LISINOPRIL", n=5),
    "q09": lambda: fmt_top_interactions("METHOTREXATE"),
    "q10": lambda: fmt_drugs_for_ae("RHABDOMYOLYSIS", n=5),
    "q11": lambda: fmt_outcomes("ATORVASTATIN"),
    "q12": lambda: fmt_label_search(
        "warnings and precautions",
        "OMEPRAZOLE",
        preferred_sections=("warnings_and_cautions", "warnings", "boxed_warning"),
    ),
    "q13": lambda: fmt_drugs_in_same_category("WARFARIN"),
    "q14": lambda: fmt_top_ae_overall(n=6),
    "q15": lambda: fmt_compare("WARFARIN", "APIXABAN"),
    "q16": lambda: fmt_interaction("METFORMIN", "LISINOPRIL"),
    "q17": lambda: fmt_label_search(
        "contraindications and do not use",
        "ASPIRIN",
        preferred_sections=("contraindications", "warnings", "boxed_warning"),
    ),
    "q18": lambda: fmt_drug_search("STATIN"),
    "q19": lambda: fmt_label_search(
        "mechanism of action proton pump inhibitor",
        "OMEPRAZOLE",
        preferred_sections=("mechanism_of_action", "clinical_pharmacology", "pharmacodynamics"),
    ),
    "q20": lambda: fmt_shared_ae("ASPIRIN", "WARFARIN"),
    "q21": lambda: fmt_ae_count("IBUPROFEN", "NAUSEA"),
    "q22": lambda: fmt_top_interactions("WARFARIN", n=8),
    "q23": lambda: fmt_top_ae("PREDNISONE", n=5),
    "q24": lambda: fmt_label_search(
        "dosage and administration recommended dose",
        "METFORMIN",
        preferred_sections=("dosage_and_administration", "indications_and_usage"),
    ),
    "q25": lambda: fmt_drugs_with_two_aes("NAUSEA", "HEADACHE", n=8),
}


# ---------- main ------------------------------------------------------------


def regenerate_samples(samples, regenerators):
    """Rewrite the ``reference`` of each sample in place and report what happened.

    Args:
        samples: list of sample dicts; each must have ``"id"`` and ``"question"``
            and may have ``"reference"`` / ``"original_reference"``.
        regenerators: mapping of sample id to a zero-argument callable that
            returns the new reference text.

    Returns:
        A ``(skipped, failed)`` tuple: ``skipped`` lists the ids that have no
        regenerator, ``failed`` lists ``(id, "ExcType: message")`` pairs for
        regenerators that raised.
    """
    skipped = []
    failed = []
    for sample in samples:
        sid = sample["id"]
        regen = regenerators.get(sid)
        if regen is None:
            skipped.append(sid)
            continue
        old_ref = sample.get("reference", "")
        try:
            new_ref = regen()
        except Exception as exc:  # pragma: no cover - diagnostic
            # Best-effort run: keep going, but never store the failure text as
            # the ground-truth reference. Record it under its own key instead.
            error = f"{type(exc).__name__}: {exc}"
            sample["regeneration_error"] = error
            failed.append((sid, error))
            print(f"--- {sid}: {sample['question']}")
            print(f"FAILED: {error} (reference left unchanged)\n")
            continue
        # Drop an error marker left behind by an earlier, failed run.
        sample.pop("regeneration_error", None)
        # setdefault keeps the first (hand-written) reference when the input
        # file has already been regenerated once.
        sample.setdefault("original_reference", old_ref)
        sample["reference"] = new_ref
        print(f"--- {sid}: {sample['question']}")
        print(f"OLD: {old_ref}")
        print(f"NEW: {new_ref}\n")
    return skipped, failed


def main() -> None:
    """Regenerate the testset references and write the updated testset.

    Reads the JSON file named by ``--testset``, rewrites the reference of every
    sample that has an entry in ``REGENERATORS``, stamps the metadata with the
    regeneration date and note, and writes the result to ``--output``. Both
    paths are resolved against ``ROOT``.
    """
    from datetime import date  # local import: only used for the metadata stamp

    ap = argparse.ArgumentParser()
    ap.add_argument("--testset", default="data/evaluation/testset.json")
    ap.add_argument("--output", default="data/evaluation/testset_v2.json")
    args = ap.parse_args()

    in_path = ROOT / args.testset
    out_path = ROOT / args.output

    with in_path.open(encoding="utf-8") as f:
        data = json.load(f)

    samples = data["samples"]
    print(f"Regenerating references for {len(samples)} samples...\n")

    skipped, failed = regenerate_samples(samples, REGENERATORS)

    metadata = data.setdefault("metadata", {})
    # Stamp the actual run date instead of a literal that goes stale on re-runs.
    metadata["regenerated_at"] = date.today().isoformat()
    metadata["regeneration_note"] = (
        "References regenerated from actual KG (Neo4j Aura) + ChromaDB content "
        "to fix the mismatch between hand-written textbook references and the data "
        "the system retrieves. Original references preserved under "
        "'original_reference'."
    )

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\nWrote {out_path}")
    if skipped:
        print(f"Skipped (no regenerator): {skipped}")
    if failed:
        print(f"Failed (reference left unchanged): {[sid for sid, _ in failed]}")


if __name__ == "__main__":
    main()
Nodes={n:,} Relationships={r:,}") + print(f"Labels: {labels}") + print(f"Relationship types: {rels}") +except Exception as exc: + print(f"FAILED: {type(exc).__name__}: {exc}") + sys.exit(2) From 5591b2c30b6475f21d5e5570f13b7e40512b06a2 Mon Sep 17 00:00:00 2001 From: JMPonce Date: Wed, 29 Apr 2026 14:30:42 +0200 Subject: [PATCH 2/3] docs(eval): add SUMMARY.md for testset_v2 full run (classic+agent+multi) --- data/evaluation/results/v2_full/SUMMARY.md | 71 ++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 data/evaluation/results/v2_full/SUMMARY.md diff --git a/data/evaluation/results/v2_full/SUMMARY.md b/data/evaluation/results/v2_full/SUMMARY.md new file mode 100644 index 0000000..64e0a64 --- /dev/null +++ b/data/evaluation/results/v2_full/SUMMARY.md @@ -0,0 +1,71 @@ +# RAGAS evaluation — testset_v2 (data-grounded references) + +Run date: 2026-04-29 +Testset: `data/evaluation/testset_v2.json` (25 questions, references regenerated from Neo4j Aura + ChromaDB). +LLM (system + judge): defaults per mode (classic=`gemini-2.5-flash`, agent=`gemini-2.5-flash-lite`, multi supervisor=`gemini-2.5-pro` + sub-agents=`gemini-2.5-flash-lite`). RAGAS judge: `gemini-2.5-flash`. +API: local FastAPI on `127.0.0.1:8000` against Neo4j Aura (11,900 nodes / 381,359 rels) and local ChromaDB (5,654 chunks). 
+ +## RAGAS metrics (n=25) + +| Metric | Classic | Agent | Multi | +| --- | --- | --- | --- | +| AnswerCorrectness | 0.544 | **0.610** | 0.551 | +| AnswerRelevancy | 0.695 | **0.735** | 0.708 | +| ContextPrecision | **0.760** | 0.207 | 0.165 | +| ContextRecall | 0.504 | **0.692** | 0.431 | +| Faithfulness | **0.910** | 0.680 | 0.792 | +| latency_ms (avg) | 5,688 | 11,225 | 19,788 | +| total runtime (s) | 2,341 | 4,025 | ~3,800 | + +## Agent tool selection (custom metric) + +| Mode | Precision | Recall | F1 | Goal accuracy | +| --- | --- | --- | --- | --- | +| Agent | 0.269 | 0.960 | 0.383 | **100% (25/25)** | +| Multi | 0.000 | 0.000 | 0.000 | **100% (25/25)** | + +Multi P/R/F1 = 0 is an artifact: the supervisor delegates via `ask_*_expert` wrappers; the underlying tool calls live inside sub-agents and are not exposed to the eval harness. Goal accuracy still tracks correctly. + +## vs prior baseline (n=3, 2026-04-21) + +| Metric | Old Classic (n=3) | New Classic (n=25) | Δ | +| --- | --- | --- | --- | +| AnswerCorrectness | 0.49 | 0.544 | +0.054 | +| AnswerRelevancy | 0.84 | 0.695 | -0.145 | +| ContextPrecision | 0.83 | 0.760 | -0.07 | +| ContextRecall | **0.22** | **0.504** | **+0.284 (+130%)** | +| Faithfulness | 0.94 | 0.910 | -0.03 | + +The old baseline ran only 3 questions (`--limit 3`), so the deltas mix two effects: the data-grounded references (intended) and the larger sample (the n=3 baseline is itself a high-variance estimate). The headline finding holds: **ContextRecall jumps from 0.22 to 0.504 once references match what the system actually retrieves.** + +## Known issues + +- **RAGAS judge token-limit failures**: ~10 `max_tokens exceeded` warnings across the three runs (q06, q08, q12, q15 are recurrent). These tank the averages because a failed metric returns NaN and the runner does not impute a value. Could be mitigated by raising `max_tokens` on the judge or by switching to `gemini-2.5-pro` for judging. +- **Warfarin has no `DrugCategory`** in the KG. Surfaced honestly in q13/q15 references rather than masked. 
Real ingestion gap: only `Factor Xa Inhibitor [EPC]` exists for apixaban/rivaroxaban; no `Anticoagulant`, `Coumarin`, or `Vitamin K Antagonist` category nodes were created. +- **Agent precision low (0.27)**: the agent calls extra tools beyond the ground-truth set. Recall is high (0.96) and goal accuracy is 100%, so the answers users get are unaffected; the extra calls only add cost and latency. + +## Files + +- `ragas_classic.csv`, `ragas_agent.csv`, `ragas_multi.csv` — per-sample RAGAS scores (gitignored, regenerable). +- `agent_tools_agent.csv`, `agent_tools_multi.csv` — per-sample tool selection metrics (gitignored). +- `classic_log.txt`, `agent_log.txt`, `multi_log.txt` — full execution logs (gitignored). +- This `SUMMARY.md` is the persistent record. + +## Reproduce + +```bash +# 1. Set env +export GEMINI_API_KEY=... +# Also create a .env.aura file with the Neo4j Aura credentials + +# 2. Start API against Aura +NEO4J_URI=... NEO4J_USER=... NEO4J_PASSWORD=... \ + uv run uvicorn pharmagraphrag.api.main:app --host 127.0.0.1 --port 8000 + +# 3. Run each mode +uv run python scripts/run_evaluation.py --mode classic \ + --testset data/evaluation/testset_v2.json \ + --api-url http://127.0.0.1:8000 \ + --output-dir data/evaluation/results/v2_full +# Repeat with --mode agent and --mode multi +``` From 7e95076d5303e5134a8d6d314b1bc735aea80cdf Mon Sep 17 00:00:00 2001 From: JMPonce Date: Wed, 29 Apr 2026 18:44:25 +0200 Subject: [PATCH 3/3] fix(scripts): validate .env.aura keys with clear error message Addresses Copilot review on PR #5: raise SystemExit with explicit list of missing keys instead of cryptic KeyError when .env.aura is incomplete. 
# The Aura credentials have to be exported BEFORE pharmagraphrag is imported
# (pydantic-settings cache), so they are read and validated here first.
aura_env_path = ROOT / ".env.aura"
aura_env = dotenv_values(aura_env_path)

neo4j_uri = aura_env.get("NEO4J_URI")
# Either spelling of the user key is accepted; NEO4J_USER wins when both are set.
neo4j_user = aura_env.get("NEO4J_USER") or aura_env.get("NEO4J_USERNAME")
neo4j_password = aura_env.get("NEO4J_PASSWORD")

# Collect the label of every setting that is absent or empty so that a single
# error message can name all of them at once.
missing = [
    label
    for label, value in zip(
        ("NEO4J_URI", "NEO4J_USER or NEO4J_USERNAME", "NEO4J_PASSWORD"),
        (neo4j_uri, neo4j_user, neo4j_password),
    )
    if not value
]
if missing:
    raise SystemExit(
        f"Missing required Neo4j Aura settings in {aura_env_path}: {', '.join(missing)}. "
        "Create or update .env.aura with the connection values."
    )

os.environ.update(
    NEO4J_URI=neo4j_uri,
    NEO4J_USER=neo4j_user,
    NEO4J_PASSWORD=neo4j_password,
)

# Now safe to import; settings will pick up the Aura values
import re  # noqa: E402