|
| 1 | +import json |
| 2 | +from loguru import logger |
| 3 | +import time |
| 4 | + |
| 5 | +import pandas as pd |
| 6 | +import numpy as np |
| 7 | + |
| 8 | + |
# Column labels that create_json_for_llm removes from its result. They are the
# same names process_mapping assigns to the gene_pathway_* and gene_function_*
# keys of the entity mapper.
columns_pathway_function = [
    'gene_pathways_activated_by_drug',
    'gene_pathways_inhibited_by_drug',
    'molecular_function_activated_by_drug',
    'molecular_function_inhibited_by_drug'
]
| 15 | + |
def load_json_file(filepath):
    """
    Read the UTF-8 encoded JSON document at *filepath* and return the
    parsed Python object (a dict or a list, depending on the document).
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
| 23 | + |
def process_mapping(ent_mapper):
    """
    Build a new mapper from *ent_mapper* without modifying the input.

    Every value is reduced to the text after its last ':' (values without a
    ':' are kept whole). A fixed set of relation keys is then set to
    descriptive labels, overriding any value those keys had in the input.
    """
    relation_labels = {
        'drug_disease_minus': 'disease_associated_with_drug',
        'drug_disease_plus': 'disease_cured_by_drug',
        'drug_gene_minus': 'genes_inhibited_or_suppressed_by_drug',
        'drug_gene_plus': 'genes_enhanced_or_activated_by_drug',
        'drug_side_effect_plus': 'side_effects_assosiated_with_drug',
        'gene_pathway_plus': 'gene_pathways_activated_by_drug',
        'gene_pathway_minus': 'gene_pathways_inhibited_by_drug',
        'gene_function_plus': 'molecular_function_activated_by_drug',
        'gene_function_minus': 'molecular_function_inhibited_by_drug',
    }

    shortened = {}
    for key, qualified_name in ent_mapper.items():
        # rpartition yields the part after the last ':' (or the whole string).
        shortened[key] = qualified_name.rpartition(':')[2]

    shortened.update(relation_labels)
    return shortened
| 36 | + |
def _lookup(value, mapper):
    """Return mapper[value] when *value* is a key of *mapper*, else *value* itself."""
    try:
        return mapper.get(value, value)
    except TypeError:
        # Unhashable values (dicts, nested lists, arrays) can never be mapper
        # keys, so they are passed through untouched instead of crashing.
        return value


def map_cell(cell, mapper):
    """
    Map a cell value (which can be NaN, list, numpy array, or scalar) using mapper dict.

    - If list/array: map each element, keep original if not in mapper.
      The result is always a plain list.
    - If None/NaN: leave as is.
    - Else: map if in mapper, else leave. Unhashable values (e.g. dicts)
      are returned unchanged.

    Args:
        cell: The value to translate.
        mapper: Dict from original names to replacement names.

    Returns:
        The translated value, or *cell* itself when nothing applies.
    """
    # 1) handle lists and numpy arrays first
    if isinstance(cell, (list, np.ndarray)):
        mapped = [_lookup(item, mapper) for item in cell]
        # Brace-style arguments let loguru skip formatting when the debug
        # level is disabled; this function runs once per DataFrame cell.
        logger.debug("List/array mapped: {} -> {}", cell, mapped)
        return mapped

    # 2) handle missing scalars; pd.isna is only asked about true scalars so
    #    it cannot return an array whose truth value is ambiguous
    if cell is None or (pd.api.types.is_scalar(cell) and pd.isna(cell)):
        logger.debug("Missing value encountered, leaving unchanged")
        return cell

    # 3) scalar mapping
    new_val = _lookup(cell, mapper)
    # The identity test short-circuits for unmapped (and unhashable) values,
    # so `!=` is only evaluated when the mapper actually supplied a value.
    if new_val is not cell and new_val != cell:
        logger.debug("Scalar mapped: {} -> {}", cell, new_val)
    return new_val
| 60 | + |
| 61 | + |
def map_dataframe(df, entity_mapper):
    """
    Return a relabelled copy of *df* using *entity_mapper*.

    Three things are translated through the mapper:
      1. every cell value (via map_cell),
      2. every row index label, which is additionally lower-cased when the
         resulting label is a string,
      3. every column label.

    Args:
        df: DataFrame to translate.
        entity_mapper: Dict from original names to replacement names.

    Returns:
        The translated DataFrame.
    """
    # 1) map all cells
    df_mapped = df.map(lambda value: map_cell(value, entity_mapper))
    logger.info("Finished mapping cell values")

    # 2) map row index
    logger.info("Mapping DataFrame index")
    new_index = []
    for label in df_mapped.index:
        mapped_label = entity_mapper.get(label, label)
        # Only strings can be lower-cased; non-string labels (NaN, ints, ...)
        # are kept as they are instead of raising AttributeError.
        if isinstance(mapped_label, str):
            mapped_label = mapped_label.lower()
        new_index.append(mapped_label)
    df_mapped.index = new_index

    # 3) map column labels
    logger.info("Mapping DataFrame columns")
    df_mapped.rename(columns=entity_mapper, inplace=True)
    return df_mapped
| 75 | + |
# Module-level bootstrap: the statements below run once, when this module is
# imported. Both data paths are relative, so they are resolved against the
# current working directory.
start = time.time()
# Read the pivot table (JSON "table" orient) and index it by its "compound" column.
drug_pivot = pd.read_json("data/drug_pivot_full.json", orient="table").set_index("compound")
# Load the raw entity-name mapping and convert it with process_mapping.
ent_mapper_new = process_mapping(load_json_file('data/entity_name_mapping.json'))
# Relabelled pivot table; create_json_for_llm uses it as its default drug_pivot.
drug_pivot_mapped = map_dataframe(drug_pivot, ent_mapper_new)
logger.info(f"Loaded substance mapping graph in {time.time() - start}s...")
| 81 | + |
def _resolve_compound_label(label, index):
    """Return *label*, or its lower-cased form when only that form is in *index*."""
    if isinstance(label, str) and label not in index:
        lowered = label.lower()
        if lowered in index:
            return lowered
    return label


def create_json_for_llm(compounds: list, drug_pivot=drug_pivot_mapped, mapper=ent_mapper_new) -> dict:
    """
    Build a nested dict describing the requested compounds.

    The rows for *compounds* are selected from *drug_pivot*, columns that are
    empty for all selected rows are removed, and the columns listed in
    columns_pathway_function are removed as well.

    Args:
        compounds: List of row labels to select. A label that is not in the
            index is retried in lower case, because map_dataframe lower-cases
            the index of the default frame.
        drug_pivot: Frame to select from; defaults to the module-level
            drug_pivot_mapped.
        mapper: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        ``DataFrame.to_dict()`` of the selection, i.e.
        ``{column: {compound: value}}``. An empty dict is returned when the
        selection fails, e.g. because a compound is not in the index.
    """
    try:
        labels = [_resolve_compound_label(name, drug_pivot.index) for name in compounds]
        selected = drug_pivot.loc[labels]
        # errors='ignore': dropna may already have removed a pathway/function
        # column that is empty for these compounds; that must not discard the
        # whole result.
        trimmed = selected.dropna(axis=1, how='all').drop(
            columns=columns_pathway_function, errors='ignore'
        )
        return trimmed.to_dict()
    except (KeyError, TypeError, ValueError) as exc:
        # Best-effort contract: lookup problems yield an empty result, but
        # they are logged, and KeyboardInterrupt/SystemExit are not swallowed.
        logger.warning("Could not build LLM JSON for compounds {!r}: {!r}", compounds, exc)
        return {}
| 88 | + |
0 commit comments