Use drug pivot table to compute interactions between drugs in JSON

NetBUG · NetBUG · commit e2a9a7a8a4cc · 2025-07-24T14:30:30.000+04:00
diff --git a/app/pipeline.py b/app/pipeline.py
@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
 
 import csv
-from typing import List, Optional
+import json
 from loguru import logger
+from typing import List, Optional
 
 from app.clients import query_llama
 from app.prompts import DETERMINE_TASK_PROMPT, GENERAL_PROMPT, \
         DENY_PROMPT, TASKS, GRAPH_NEEDED, FIND_SUBSTANCES_PROMPT, GRAPH_PROMPT
 from app.gpraph import run_subgraph_builder
+from app.substance_mapper import create_json_for_llm
 
 entities_file = "data/entity_name_mapping.json"
 substances_file = "data/drugbank/drugbank_vocabulary.csv"
@@ -119,9 +121,10 @@ def process_pipeline(query: str, history: List[str]=[], graph: Optional[object]=
                 logger.info(f"Found substances by LLM: {substances}. Try to find in the DrugBank vocabulary and bulding a graph")
                 response['graph'] = run_subgraph_builder(substances)
                 logger.info(f"Subgraph built with {len(response['graph'].vs)} vertices and {len(response['graph'].es)} edges.")
-
-            if response['graph']:
-                prompt = f"{GENERAL_PROMPT}\n\n{TASKS[discovered_class]}\n{GRAPH_PROMPT}\n{response['graph']}\nTask: {query}"
+                supplemental_json = create_json_for_llm(substances)
+                # logger.debug(json.dumps(supplemental_json, indent=4))
+                if supplemental_json and len(supplemental_json) > 2:
+                    prompt = f"{GENERAL_PROMPT}\n\n{TASKS[discovered_class]}\n{GRAPH_PROMPT}\n{supplemental_json}\nTask: {query}"
 
         else:
             prompt = f"{GENERAL_PROMPT}\n\n{TASKS[discovered_class]}\nQuery: {query}"
diff --git a/app/prompts.py b/app/prompts.py
@@ -46,4 +46,4 @@
 FIND_SUBSTANCES_PROMPT = """Find any substances in the query below.
 Return a list of original words from text separated by ',' without spaces after ','. Do not separate one substance with ',' if it takes more than one word."""
 
-GRAPH_PROMPT = "Additional information below includes a subgraph displaying relations between those substances in iGraph format. Use it to form the response as a ground truth"
+# Introduces the supplemental JSON that the pipeline prompt places on the next line.
+GRAPH_PROMPT = "Additional information below includes a subgraph displaying relations between those substances in JSON format. Use it to form the response as a ground truth. Do not mention the word JSON in the output"
diff --git a/app/substance_mapper.py b/app/substance_mapper.py
@@ -0,0 +1,88 @@
+import json
+from loguru import logger
+import time
+
+import pandas as pd
+import numpy as np
+
+
+# Labels that process_mapping() assigns to the gene_pathway_* and
+# gene_function_* keys.  create_json_for_llm() drops the columns carrying
+# these names from the table it returns.
+columns_pathway_function = [
+    'gene_pathways_activated_by_drug',
+    'gene_pathways_inhibited_by_drug',
+    'molecular_function_activated_by_drug',
+    'molecular_function_inhibited_by_drug'
+]
+
+def load_json_file(filepath):
+    """
+    Read the UTF-8 encoded JSON file at ``filepath`` and return its parsed content.
+
+    The result is whatever ``json.load`` produces for the document (a dict,
+    a list, or a scalar).  Errors raised by ``open`` or ``json.load`` propagate
+    to the caller.
+    """
+    with open(filepath, mode='r', encoding='utf-8') as handle:
+        return json.load(handle)
+
+def process_mapping(ent_mapper):
+    """
+    Build the name mapping that is applied to the drug pivot table.
+
+    Every value of ``ent_mapper`` is reduced to the text after its last ':'
+    (a value without ':' is kept whole); keys are left unchanged.  The nine
+    relation keys below are then set to fixed descriptive labels, replacing
+    any entry with the same key that came from ``ent_mapper``.
+
+    Returns a new dict; ``ent_mapper`` itself is not modified.
+    """
+    relation_labels = {
+        'drug_disease_minus': 'disease_associated_with_drug',
+        'drug_disease_plus': 'disease_cured_by_drug',
+        'drug_gene_minus': 'genes_inhibited_or_suppressed_by_drug',
+        'drug_gene_plus': 'genes_enhanced_or_activated_by_drug',
+        'drug_side_effect_plus': 'side_effects_assosiated_with_drug',
+        'gene_pathway_plus': 'gene_pathways_activated_by_drug',
+        'gene_pathway_minus': 'gene_pathways_inhibited_by_drug',
+        'gene_function_plus': 'molecular_function_activated_by_drug',
+        'gene_function_minus': 'molecular_function_inhibited_by_drug',
+    }
+    short_names = {
+        key: full_name.split(':')[-1] for key, full_name in ent_mapper.items()
+    }
+    # The fixed labels go last so they win over same-named keys from ent_mapper.
+    return {**short_names, **relation_labels}
+
+def map_cell(cell, mapper):
+    """
+    Translate one DataFrame cell through the ``mapper`` dict.
+
+    - list / numpy array: returns a new list in which each element is replaced
+      by its ``mapper`` entry, or kept as is when it has none
+    - ``None`` / NaN (as reported by ``pd.isna``): returned unchanged
+    - anything else: its ``mapper`` entry, or the value itself when it has none
+    """
+    # Containers are singled out first so pd.isna() never sees a list or array.
+    if not isinstance(cell, (list, np.ndarray)):
+        if cell is None or pd.isna(cell):
+            logger.debug("Missing value encountered, leaving unchanged")
+            return cell
+
+        translated = mapper.get(cell, cell)
+        if translated != cell:
+            logger.debug(f"Scalar mapped: {cell} -> {translated}")
+        return translated
+
+    translated_items = []
+    for element in cell:
+        translated_items.append(mapper.get(element, element))
+    logger.debug(f"List/array mapped: {cell} -> {translated_items}")
+    return translated_items
+
+
+def map_dataframe(df, entity_mapper):
+    """
+    Return a copy of ``df`` with cells, row labels and column names translated.
+
+    - cells: each one goes through ``map_cell`` with ``entity_mapper``
+    - row index: each label is replaced by its ``entity_mapper`` entry (or kept)
+      and then lower-cased, so the resulting labels must be strings
+    - columns: renamed with ``entity_mapper``; names without an entry are kept
+
+    ``df`` itself is not modified.
+    """
+    def translate(value):
+        return map_cell(value, entity_mapper)
+
+    # 1) translate every cell
+    translated = df.map(translate)
+    logger.info("Finished mapping cell values")
+
+    # 2) translate and lower-case the row labels
+    logger.info("Mapping DataFrame index")
+    translated.index = [
+        entity_mapper.get(label, label).lower() for label in translated.index
+    ]
+
+    # 3) translate the column names
+    logger.info("Mapping DataFrame columns")
+    return translated.rename(columns=entity_mapper)
+
+# Import-time setup: the tables below are built once, when this module is first
+# imported, and the elapsed wall-clock time is logged afterwards.
+start = time.time()
+# Pivot table read in pandas' "table" orientation; its "compound" column
+# becomes the row index.
+drug_pivot = pd.read_json("data/drug_pivot_full.json", orient="table").set_index("compound")
+# Entity-name mapping with prefixes stripped, plus the fixed relation labels.
+ent_mapper_new = process_mapping(load_json_file('data/entity_name_mapping.json'))
+# Pivot table with cells, row labels (lower-cased) and column names translated.
+drug_pivot_mapped = map_dataframe(drug_pivot, ent_mapper_new)
+logger.info(f"Loaded substance mapping graph in {time.time() - start}s...")
+
+def create_json_for_llm(compounds: list, drug_pivot=drug_pivot_mapped, mapper=ent_mapper_new) -> dict:
+    """
+    Collect the pivot-table rows for ``compounds`` as a JSON-ready dict.
+
+    Args:
+        compounds: row labels to look up in ``drug_pivot``.  Labels are matched
+            exactly as given; note that ``map_dataframe`` lower-cases the index
+            of the default table.
+        drug_pivot: table to read from; defaults to the module-level mapped
+            pivot table.
+        mapper: kept for backward compatibility; not used by this function.
+
+    Returns:
+        ``{column: {compound: value}}`` as produced by ``DataFrame.to_dict()``,
+        limited to columns that have at least one non-missing value for the
+        selected rows and excluding the ``columns_pathway_function`` columns.
+        ``{}`` when a label is not in the table or the lookup fails for any
+        other reason.
+    """
+    try:
+        selected = drug_pivot.loc[compounds].dropna(axis=1, how='all')
+        # dropna() may already have removed an all-NaN pathway/function column.
+        # errors='ignore' keeps drop() from raising KeyError in that case, which
+        # used to turn an otherwise valid result into {}.
+        selected = selected.drop(columns=columns_pathway_function, errors='ignore')
+        return selected.to_dict()
+    except KeyError as exc:
+        logger.warning(f"Compound(s) not found in drug pivot table: {exc}")
+        return {}
+    except Exception:
+        # Still best-effort for the caller, but KeyboardInterrupt/SystemExit are
+        # no longer trapped and the failure leaves a trace in the log.
+        logger.exception("Failed to build supplemental JSON for compounds")
+        return {}
+