From de398edafd83317edb2e132661df466988af18c5 Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Mon, 16 Mar 2026 15:43:12 +0100 Subject: [PATCH 01/15] deleted last previous mapping file of open_targets --- .../adapters/open_targets_diseases.yaml | 30 ------------------- 1 file changed, 30 deletions(-) delete mode 100644 oncodashkb/adapters/open_targets_diseases.yaml diff --git a/oncodashkb/adapters/open_targets_diseases.yaml b/oncodashkb/adapters/open_targets_diseases.yaml deleted file mode 100644 index e290608..0000000 --- a/oncodashkb/adapters/open_targets_diseases.yaml +++ /dev/null @@ -1,30 +0,0 @@ -row: - rowIndex: - to_subject: id -transformers: - - map: - columns: - - id - to_object: disease - via_relation: disease_to_id - - map: - columns: - - name - to_property: - - name - for_objects: - - disease - - map: - columns: - - description - to_property: - - description - for_objects: - - disease - - map: - columns: - - code - to_property: - - code - for_objects: - - disease From 8f7530dbc101c75fd90cb6e0225767809f0ab85c Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Mon, 16 Mar 2026 16:07:20 +0100 Subject: [PATCH 02/15] remove unnecessary files --- oncodashkb/adapters/Ensembl_genes.conf | 67 ---------------------- oncodashkb/adapters/Hugo_Symbol_genes.conf | 9 --- 2 files changed, 76 deletions(-) delete mode 100644 oncodashkb/adapters/Ensembl_genes.conf delete mode 100644 oncodashkb/adapters/Hugo_Symbol_genes.conf diff --git a/oncodashkb/adapters/Ensembl_genes.conf b/oncodashkb/adapters/Ensembl_genes.conf deleted file mode 100644 index c595cc8..0000000 --- a/oncodashkb/adapters/Ensembl_genes.conf +++ /dev/null @@ -1,67 +0,0 @@ -'ENSG00000100311', - 'ENSG00000140538', - 'ENSG00000101972', - 'ENSG00000107485', - 'ENSG00000141510', - 'ENSG00000171456', - 'ENSG00000136997', - 'ENSG00000099956', - 'ENSG00000157168', - 'ENSG00000104884', - 'ENSG00000112679', - 'ENSG00000169032', - 'ENSG00000115524', - 'ENSG00000187266', - 'ENSG00000119772', - 'ENSG00000139083', - 
'ENSG00000172175', - 'ENSG00000113916', - 'ENSG00000171094', - 'ENSG00000121879', - 'ENSG00000141736', - 'ENSG00000109670', - 'ENSG00000073282', - 'ENSG00000127528', - 'ENSG00000133703', - 'ENSG00000138376', - 'ENSG00000066468', - 'ENSG00000179218', - 'ENSG00000156531', - 'ENSG00000183765', - 'ENSG00000149311', - 'ENSG00000169249', - 'ENSG00000120217', - 'ENSG00000245848', - 'ENSG00000096968', - 'ENSG00000023445', - 'ENSG00000105976', - 'ENSG00000068078', - 'ENSG00000147889', - 'ENSG00000178573', - 'ENSG00000182054', - 'ENSG00000139163', - 'ENSG00000097007', - 'ENSG00000174775', - 'ENSG00000012048', - 'ENSG00000157764', - 'ENSG00000100393', - 'ENSG00000157873', - 'ENSG00000168685', - 'ENSG00000183337', - 'ENSG00000085224', - 'ENSG00000071564', - 'ENSG00000105397', - 'ENSG00000152217', - 'ENSG00000185920', - 'ENSG00000106462', - 'ENSG00000205755', - 'ENSG00000197646', - 'ENSG00000091831', - 'ENSG00000292363', - 'ENSG00000148400', - 'ENSG00000135679', - 'ENSG00000138413', - 'ENSG00000171791', - 'ENSG00000077782', - 'ENSG00000137265', - 'ENSG00000187741' \ No newline at end of file diff --git a/oncodashkb/adapters/Hugo_Symbol_genes.conf b/oncodashkb/adapters/Hugo_Symbol_genes.conf deleted file mode 100644 index 32e8631..0000000 --- a/oncodashkb/adapters/Hugo_Symbol_genes.conf +++ /dev/null @@ -1,9 +0,0 @@ -'MET', 'BRAF', 'EZH2', 'CDKN2A', 'ETV6', 'ETNK1', 'KRAS', 'NTRK3', -'IDH2', 'MAF', 'BRCA1', 'TP53', 'BCOR', 'FGFR1', 'MYC', 'JAK2', -'CD274', 'PDCD1LG2', 'PIK3CA', 'BCL6', 'TP63', 'IL7R', 'MDM2', -'SETBP1', 'FBXW7', 'ABL1', 'MAP2K1', 'TYK2', 'EPOR', 'ERCC2', -'SMARCB1', 'CHEK2', 'PDGFB', 'EP300', 'STAG2', 'PHF6', 'FGFR2', -'FGFR3', 'NRG1', 'GATA3', 'HRAS', 'ERBB2', 'BCL2', 'TCF3', 'CEBPA', -'CRLF2', 'ZRSR2', 'NOTCH1', 'TNFRSF14', 'BARD1', 'ESR1', 'PTCH1', -'FANCA', 'KLF2', 'MALT1', 'CALR', 'DNMT3A', 'ALK', 'SF3B1', 'IDH1', -'DUSP22', 'IRF4', 'BIRC3', 'ATM', 'ASXL1', 'ATRX' \ No newline at end of file From ba5e83b856725ea0383d1ad9f31cc38b8da313ba Mon Sep 17 00:00:00 
2001 From: Matthieu NAJM Date: Mon, 16 Mar 2026 16:31:38 +0100 Subject: [PATCH 03/15] remove old integration of gene ontology --- README.md | 53 ------- config/schema.yaml | 25 +--- oncodashkb/adapters/README.md | 29 ---- oncodashkb/adapters/gene_ontology.py | 139 ------------------ oncodashkb/adapters/gene_ontology.yaml | 36 ----- .../adapters/gene_ontology_reverse.yaml | 36 ----- weave.py | 73 --------- 7 files changed, 6 insertions(+), 385 deletions(-) delete mode 100644 oncodashkb/adapters/README.md delete mode 100644 oncodashkb/adapters/gene_ontology.py delete mode 100644 oncodashkb/adapters/gene_ontology.yaml delete mode 100644 oncodashkb/adapters/gene_ontology_reverse.yaml diff --git a/README.md b/README.md index 7e78fb4..73ab27f 100644 --- a/README.md +++ b/README.md @@ -224,59 +224,6 @@ the data that you want to integrate. ./weave.py –oncokb /path_to_file/test_genomics_oncokbannotation.csv ``` - -### Gene Ontology adapter - -**Gene Ontology** is one of the biggest biomedical databases. The described -adapter helps to integrate the data about the molecular function of the gene -product, as well as the biological process in which these genes are involved. - -- Molecular function: GO annotations that have relation type `enabled` - or `contributes_to`. -- Biological process: GO annotations that have relation type `involved_in`. - -**To integrate the data, three files are necessary:** -- `--gene_ontology` option for GO annotations in GAF format [Download GO annotations](http://current.geneontology.org/products/pages/downloads.html) -- `--gene_ontology_owl` option for GO ontology in OWL format [Download GO ontology](https://geneontology.org/docs/download-ontology/) -- `--gene_ontology_genes` option for the list of genes for which we want to - integrate the GO annotations (example in adapters/Hugo_Symbol_genes.conf file, - by default = list of genes from OncoKB database). 
- -**Example of use:** - -``` sh -./weave.py --gene_ontology /path_to_file/goa_human.gaf --gene_ontology_owl /path_to_file/go.owl --gene_ontology_genes /path_to_file/Hugo_Symbol_genes.conf -``` - -If you want to integrate annotations with another type of relations, you can -modify the `adapters/gene_ontology.py` file by adding the next code in the -**class Gene_ontology** (example for the `involved_in` edge type): - -``` python -# Create new columns that depends on edge type. -df['GO_involved_in'] = None - -# Cut df to include only edge type that we have chosen and annotations -# for genes from OncoKB. -df = df[((df['Qualifier'].isin(['enables', 'involved_in', 'contributes_to'])) & - (df['DB_Object_Symbol'].isin(included_genes)))] -``` -Also, you need to add code in `separate_edges_types` method: - -``` sh -# Function to copy GO_term to related column for future ontoweaver mapping -# based on Qualifier column (relation type). - def separate_edges_types(row): - if row['Qualifier'] == 'enables': - row['GO_enables'] = row['GO_term'] - elif row['Qualifier'] == 'involved_in': - row['GO_involved_in'] = row['GO_term'] -``` - -Finally, you need to specify the node and edge types in the `gene_ontology.yaml` -for `GO_involved_in` column. 
- - ### Open Targets adapter Open Targets is a public database that aims to systematically identify and diff --git a/config/schema.yaml b/config/schema.yaml index e6ef1d1..a28150b 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -302,10 +302,6 @@ gene status affects gene: ### GO -- TO BE FIXED -# annotation: -# is_a: named thing -# represented_as: node -# label_in_input: annotation biological process: is_a: named thing @@ -314,20 +310,6 @@ biological process: properties: data_source: str -# annotation for gene: -# is_a: association -# represented_as: edge -# label_in_input: annotation_for_gene -# source: annotation -# target: gene - -# involved in: -# is_a: association -# represented_as: edge -# label_in_input: involved_in -# source: annotation -# target: biological process - gene to biological process: is_a: association represented_as: edge @@ -345,9 +327,10 @@ biological process to gene: source: biological process target: gene properties: - # edglelabel: str data_source: str +### FUNCTIONAL PROTEIN PROTEIN INTERACTIONS + undirected molecular interaction: is_a: pairwise molecular interaction represented_as: edge @@ -447,6 +430,8 @@ inhibition: extra_attrs: str evidences: str +### TRASNCRIPT TO GENE RELATIONSHIP + transcript to gene relationship: # is_a: transcript to gene relationship represented_as: edge @@ -454,6 +439,8 @@ transcript to gene relationship: properties: data_source: str +### DRUG HAS TARGET + drug has target: is_a: drug to gene association represented_as: edge diff --git a/oncodashkb/adapters/README.md b/oncodashkb/adapters/README.md deleted file mode 100644 index b79f2c3..0000000 --- a/oncodashkb/adapters/README.md +++ /dev/null @@ -1,29 +0,0 @@ -## Gene Ontology Data Preparation - -**Gene Ontology** (GO) is one of the biggest biomedical databases for the annotation of genes and their products across different species. 
To integrate the data in the Semantic Knowledge Graph (SKG), we use the `GO Annotations file` for Homo Sapiens in `GAF format` [Download page](https://geneontology.org/docs/download-go-annotations/) . Each line in GAF file represents **one annotation** for a gene product and contains **17 columns** (you can read a detailed description of each column [here](https://geneontology.org/docs/go-annotation-file-gaf-format-2.2/])). - -Compared to the integration of the CGI and OncoKB databases, where each column represents a concrete data type from Biolink ontology, the GO annotations file contains data type for each annotation (row) in the column 'Qualifier'. For further details regarding different types of relationships, please refer to the following [link](https://wiki.geneontology.org/Annotation_Relations). - -To solve the issue concerning data types represented in one column and to make the integrated data in the SKG more clear and easy to understand, the following steps were implemented in the GO adapter: -- [Download](https://geneontology.org/docs/download-ontology/) the **GO ontology OWL file** to create a dictionary that can map **GO_ID** to **GO_term** cause there is only a **GO_ID** column in the GAF file. -- Create a new column **GO_term** using a dictionary and `create_id_term_dict` method. -- For the chosen type of the relation from the **column 'Qualifier'** (in our case, `enables`, `involved_in`, `contributes_to` relation types) create an additional column (in our case, `GO_enables`, `GO_involved_in`, `GO_contributes_to` columns) and copy the **GO_term** in the related column (see illustration below) - -![Schema_columns_GO_adapter](https://github.com/kgaydukova/oncodashkb/assets/23275374/37b23c98-17b6-45bd-ab34-bc4d7fdf72f9) - -- Declare data type and relation type in the mapping file `gene_ontology.yaml` for each synthetic additional column (`GO_enables`, `GO_involved_in`, `GO_contributes_to`). - -```yaml -subject: annotation # Type for each entry (e.g. 
line). - -columns: - GO_enables: - to_object: molecular_function - via_relation: enables - GO_involved_in: - to_object: biological_process - via_relation: involved_in - GO_contributes_to: - to_object: molecular_function - via_relation: contributes_to -``` diff --git a/oncodashkb/adapters/gene_ontology.py b/oncodashkb/adapters/gene_ontology.py deleted file mode 100644 index 4b6b7af..0000000 --- a/oncodashkb/adapters/gene_ontology.py +++ /dev/null @@ -1,139 +0,0 @@ -import types as pytypes -import logging -import ontoweaver - -from typing import Optional -from collections.abc import Iterable - -import pandas as pd - -from owlready2 import get_ontology - - -class Gene_ontology(ontoweaver.tabular.PandasAdapter): - - def __init__(self, - df: pd.DataFrame, - ontology: str, - genes_list: str, - config: dict, - type_affix=ontoweaver.base.TypeAffixes.none - ): - - # logging.info(" | | In Gene_ontology adapter init") - self.ontology = ontology - self.genes_list = genes_list - assert self.genes_list != None - - # define column names based on the GAF specification - columns = ['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB_Reference', 'Evidence_Code', - 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'Taxon', 'Date', - 'Assigned_By', 'Annotation_Extension', 'Gene_Product_Form_ID'] - - # assign column names to the DataFrame - df.columns = columns - - # create dict with GO_id:GO_term - logging.info(" | | Load GO taxonomy") - dict_go_plus = self.create_id_term_dict() - - # logging.info(" | | Sanitize keys") - # DELETE ; and , from terms (values in dictionary) to avoid future errors in CSV for neo4j import - for key in dict_go_plus.keys(): - if ',' in dict_go_plus[key]: - dict_go_plus[key] = dict_go_plus[key].replace(',', '') - if ';' in dict_go_plus[key]: - dict_go_plus[key] = dict_go_plus[key].replace(';', '') - if '\'' in dict_go_plus[key]: - dict_go_plus[key] = dict_go_plus[key].replace('\'', '') - - # logging.info(" | | 
Expand data") - # create additional column with GO terms (mapped from GO_id) - df['GO_term'] = df['GO_ID'].map(lambda go_id: dict_go_plus[go_id]) - - # create new columns that depends on edge type - df['GO_involved_in'] = None - df['GO_enables'] = None - df['GO_contributes_to'] = None - - ''' - List of genes the annotation for which we will integrate from Gene Ontology data, - Reading from Hugo_Symbol_genes.conf file - By default = genes from OncoKB database - ''' - # logging.info(" | | Read genes list") - included_genes = self.read_genes_list() - assert len(included_genes) > 0 - - # logging.info(" | | Filter out useless edges") - # cut df to include only edge type that we have chosen and annotations for genes from OncoKB - df = df[((df['Qualifier'].isin(['enables', 'involved_in', 'contributes_to'])) & - (df['DB_Object_Symbol'].isin(included_genes)))] - assert len(df) > 0 - - # add the GO_term in GO_involved_in, GO_enables, GO_contributes_to columns depending on the edge type in - # Qualifier column - # logging.info(" | | Separate edge types") - df = df.apply(self.separate_edges_types, axis=1) - assert len(df) > 0 - - # Default mapping as a simple config. - # logging.info(" | | Parse data") - from . import types - parser = ontoweaver.tabular.YamlParser(config, types) - mapping = parser() - - # logging.info(" | | Declare types") - # Declare types defined in the config. 
- super().__init__( - df, - *mapping, - ) - - logging.info(" | | Done Gene_ontology init") - - # function to create a dictionary with GO_id:GO_term for gene ontology, input - OWL file, output - dictionary - def create_id_term_dict(self): - dict_id_term = {} - - logging.debug(f"Load ontology: {self.ontology}") - - ont = get_ontology(self.ontology).load() - - # iterate through all classes in the ontology - for cls in ont.classes(): - # get the class ID and label (term) - class_id = cls.iri # read class_id like http://purl.obolibrary.org/obo/GO_0003674' - class_label = cls.label.first() if cls.label else cls.name - - # make the same key as we have in GO annotation files - class_id_key = class_id.replace("http://purl.obolibrary.org/obo/GO_", "GO:") - # add to dictionary like GO:0003674': 'molecular_function' - dict_id_term[class_id_key] = class_label - - return dict_id_term - - def read_genes_list(self): - - # print(self.genes_list=='o') - - with open(self.genes_list, 'r') as file: - - content = file.read() - genes = content.replace('\n', '').split(',') - genes = [gene.strip().strip("'") for gene in genes] - genes = list(filter(None, genes)) - - return genes - - # function to copy GO_term to related column for future ontoweaver mapping based on Qualifier column (relation type) - - @staticmethod - def separate_edges_types(row): - if row['Qualifier'] == 'enables': - row['GO_enables'] = row['GO_term'] - elif row['Qualifier'] == 'involved_in': - row['GO_involved_in'] = row['GO_term'] - elif row['Qualifier'] == 'contributes_to': - row['GO_contributes_to'] = row['GO_term'] - return row diff --git a/oncodashkb/adapters/gene_ontology.yaml b/oncodashkb/adapters/gene_ontology.yaml deleted file mode 100644 index d8d4184..0000000 --- a/oncodashkb/adapters/gene_ontology.yaml +++ /dev/null @@ -1,36 +0,0 @@ -row: - map: - column: DB_Object_Symbol - to_subject: gene -transformers: - # - map: - # column: DB_Object_Symbol - # to_object: gene - # via_relation: annotation_for_gene - # - 
map: - # column: GO_enables - # to_object: molecular_function - # via_relation: enables - # - map: - # columns: GO_involved_in - # to_object: biological_process - # via_relation: involved_in - # - map: - # columns: GO_contributes_to - # to_object: molecular_function - # via_relation: contributes_to - - map: - columns: GO_involved_in - to_object: biological_process - via_relation: gene_to_biological_process - # - map: - # column: GO_contributes_to - # from_subject: gene - # to_object: molecular_function - # via_relation: gene_to_molecular_function - - string: - value: " " - to_property: edgelabel - for_objects: gene_to_biological_process -metadata: - - data_source: gene_ontology diff --git a/oncodashkb/adapters/gene_ontology_reverse.yaml b/oncodashkb/adapters/gene_ontology_reverse.yaml deleted file mode 100644 index 1c72430..0000000 --- a/oncodashkb/adapters/gene_ontology_reverse.yaml +++ /dev/null @@ -1,36 +0,0 @@ -row: - map: - column: GO_involved_in - to_subject: biological_process -transformers: - # - map: - # column: DB_Object_Symbol - # to_object: gene - # via_relation: annotation_for_gene - # - map: - # column: GO_enables - # to_object: molecular_function - # via_relation: enables - # - map: - # columns: GO_involved_in - # to_object: biological_process - # via_relation: involved_in - # - map: - # columns: GO_contributes_to - # to_object: molecular_function - # via_relation: contributes_to - - map: - columns: DB_Object_Symbol - to_object: gene - via_relation: biological_process_to_gene - # - map: - # column: GO_contributes_to - # from_subject: gene - # to_object: molecular_function - # via_relation: gene_to_molecular_function - - string: - value: " " - to_property: edgelabel - for_objects: biological_process_to_gene -metadata: - - data_source: gene_ontology diff --git a/weave.py b/weave.py index 89fc028..cd0190b 100755 --- a/weave.py +++ b/weave.py @@ -14,7 +14,6 @@ import biocypher import ontoweaver -# import oncodashkb.adapters as od from alive_progress 
import alive_bar error_codes = { @@ -129,39 +128,6 @@ def process_OT(directory, name): return local_nodes, local_edges - -def process_GO(name): - logging.info(f" | Weave {name} data...") - # Table input data. - logging.info(f" | | Load {name} data...") - df = progress_read(asked.gene_ontology[0], sep='\t', comment='!', header=None, dtype={15: str}, hint=969214) - - logging.info(f" | | Read {name} mapping...") - # Extraction mapping configuration. - try: - with open(f"./oncodashkb/adapters/{name}.yaml") as fd: - conf = yaml.full_load(fd) - except Exception as e: - logging.error(e) - sys.exit(error_codes["CannotAccessFile"]) - - logging.info(f" | | Preprocess {name} data...") - manager = od.gene_ontology.Gene_ontology(df, asked.gene_ontology_owl, asked.gene_ontology_genes, conf) - - logging.info(f" | | Transform {name} data...") - local_nodes = [] - local_edges = [] - # Use manager.df because Gene_ontology does filter the input dataframe - with alive_bar(len(manager.df), file=sys.stderr) as progress: - for n,e in manager(): - local_nodes += n - local_edges += e - progress() - - return local_nodes, local_edges - - - if __name__ == "__main__": # TODO add adapter for parquet, one for csv and one that automatically checks filetype. 
@@ -205,18 +171,6 @@ def process_GO(name): parser.add_argument("-c", "--cgi", metavar="CSV", nargs="+", help="Extract from a CGI CSV file.") - parser.add_argument("-g", "--gene-ontology", metavar="CSV", nargs="+", - help="Extract from a Gene_Ontology_Annotation GAF file.") - - parser.add_argument("-n", "--gene-ontology-owl", metavar="OWL", - help="Download Gene_Ontology owl file.") - - parser.add_argument("-G", "--gene-ontology-genes", metavar="TXT", - help="List of genes for which we integrate Gene Ontology annotations (by default genes from OncoKB).") - - parser.add_argument("-r", "--gene-ontology-reverse", action='store_true', - help="Extract from a Gene_Ontology_Annotation GAF file.") - parser.add_argument("-s", "--separator", metavar="STRING", default=", ", help="Separator in exported data files.") @@ -270,10 +224,6 @@ def process_GO(name): "open_targets_drug_mechanism_of_action", "open_targets_drug_molecule", "cgi", - "gene_ontology", - "gene_ontology_owl", - "gene_ontology_genes", - "gene_ontology_reverse", ] opt_total = 0 for opt in all_options: @@ -446,29 +396,6 @@ def process_GO(name): edges += local_edges logging.info(f"Done adapter {opt_loaded}/{opt_total}") - ## GeneOntology - - ### GO - if asked.gene_ontology: - opt_loaded += 1 - logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") - local_nodes, local_edges = process_GO("gene_ontology") - logging.info(f" | Save data...") - nodes += local_nodes - edges += local_edges - logging.info(f"OK, wove Gene Ontology data: {len(local_nodes)} nodes, {len(local_edges)} edges.") - logging.info(f"Done adapter {opt_loaded}/{opt_total}") - - ### GO reversed - if asked.gene_ontology_reverse: - opt_loaded += 1 - logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") - local_nodes, local_edges = process_GO("gene_ontology_reverse") - nodes += local_nodes - edges += local_edges - logging.info(f"OK, reverse-wove Gene Ontology: {len(local_nodes)} nodes, {len(local_edges)} edges.") - 
logging.info(f"Done adapter {opt_loaded}/{opt_total}") - ################################################### # Map the data not requiring special loadings. # ################################################### From 51988fe855e22b9efdd48b1fc0e4de743162b9c7 Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Mon, 16 Mar 2026 18:27:14 +0100 Subject: [PATCH 04/15] remove last marks of old integration of open targets and gene ontology --- make.sh | 4 ---- oncodashkb/adapters/__init__.py | 9 --------- 2 files changed, 13 deletions(-) delete mode 100644 oncodashkb/adapters/__init__.py diff --git a/make.sh b/make.sh index 8e5d64a..68eb7da 100755 --- a/make.sh +++ b/make.sh @@ -90,11 +90,7 @@ cmd="uv run python3 ${py_args} $script_dir/weave.py \ --open-targets-target $data_dir/OT/target/ ${weave_args}" # \ # --clinical $data_dir/DECIDER/clinical/clinical_export.xlsx \ - # --gene_ontology_genes $data_dir/DECIDER/$data_version/OncoKB_gene_symbols.conf \ # --oncokb $data_dir/DECIDER/$data_version/treatments.csv \ - # --gene_ontology $data_dir/GO/goa_human.gaf.gz \ - # --gene_ontology_owl $data_dir/GO/go.owl \ - # --gene_ontology_reverse echo "Weaving command:" >&2 diff --git a/oncodashkb/adapters/__init__.py b/oncodashkb/adapters/__init__.py deleted file mode 100644 index 582dac2..0000000 --- a/oncodashkb/adapters/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ - -# from . import types -from . import gene_ontology -from . import open_targets -from . import open_targets_evidences -from . import open_targets_drugs -from . 
import open_targets_diseases -__all__ = ['types', 'gene_ontology', 'open_targets', 'open_targets_evidences', 'open_targets_drugs', 'open_targets_diseases'] - From 41e3f3988a520323a98b4146039640afff024ea6 Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Tue, 17 Mar 2026 17:21:22 +0100 Subject: [PATCH 05/15] feat: integration of the structural variants adapter --- make.sh | 1 + oncodashkb/adapters/structural_variants.yaml | 115 +++++++++++++++++++ weave.py | 57 ++++++++- 3 files changed, 170 insertions(+), 3 deletions(-) create mode 100644 oncodashkb/adapters/structural_variants.yaml diff --git a/make.sh b/make.sh index 68eb7da..a519657 100755 --- a/make.sh +++ b/make.sh @@ -84,6 +84,7 @@ cmd="uv run python3 ${py_args} $script_dir/weave.py \ --short-mutations-external $decider_dir/short_mutations_external.csv \ --copy-number-amplifications-local $decider_dir/cnas_local.csv \ --copy-number-amplifications-external $decider_dir/cnas_external.csv \ + --structural-variants $data_dir/DECIDER/$data_version/structural_variants.xlsx \ --omnipath-networks $data_dir/omnipath_networks/omnipath_webservice_interactions__latest.tsv.gz \ --open-targets-drug-molecule $data_dir/OT/drug_molecule/ --open-targets-drug_mechanism_of_action $data_dir/OT/drug_mechanism_of_action/ diff --git a/oncodashkb/adapters/structural_variants.yaml b/oncodashkb/adapters/structural_variants.yaml new file mode 100644 index 0000000..1b96645 --- /dev/null +++ b/oncodashkb/adapters/structural_variants.yaml @@ -0,0 +1,115 @@ +row: + translate: + column: patient + to_subject: patient + translations_file: ./data/DECIDER/clinical/clinical_export.xlsx + translate_from: Patient card::Patient cohort code_Patient Card + translate_to: Patient card::Publication code + index_col: 0 + usecols: [0,1,2] +transformers: + # Nodes + ## Samples + - translate_sample_ids : + column: sample + to_object: sample + via_relation: patient_carries_sample + translations_file: ./data/DECIDER/clinical/clinical_export.xlsx + 
translate_from: Patient card::Patient cohort code_Patient Card + translate_to: Patient card::Publication code + index_col: 0 + usecols: [0,1,2] + ## Alterations + - cat_format: + columns: + - primary_gene + - effect + from_subject: sample + to_object: alteration + via_relation: sample_carries_alteration + format_string: "{primary_gene}:{effect}" + ## Gene status + - translate_cat_format: + columns: + - primary_gene + - Gene_type + from_subject: alteration + to_object: gene_status + format_string: "{primary_gene}:{Gene_type}" + via_relation: alteration_causes_gene_status + column_to_translate: + - primary_gene + translations_file: data/HGNC/hgnc_complete_set.txt + translate_from: symbol + translate_to: ensembl_gene_id + sep: "\t" + ## Genes + - translate: + column: primary_gene + from_subject: gene_status + to_object: gene + via_relation: gene_status_affects_gene + translations_file: data/HGNC/hgnc_complete_set.txt + translate_from: symbol + translate_to: ensembl_gene_id + sep: "\t" + # Properties + ## Alterations + - translate_cat_format: + columns: + - primary_gene + - effect + to_property: ensembl_id_alteration + for_object: alteration + format_string: "{primary_gene}:{effect}" + column_to_translate: + - primary_gene + translations_file: data/HGNC/hgnc_complete_set.txt + translate_from: symbol + translate_to: ensembl_gene_id + sep: "\t" + ## Gene status + - cat_format: + columns: + - primary_gene + - Gene_type + to_property: gene_symbol_gene_status + for_object: gene_status + format_string: "{primary_gene}:{Gene_type}" + - map: + columns: Gene_type + to_property: gene_role + for_object: gene_status + ## Genes + - map: + column: primary_gene + to_property: gene_symbol + for_object: gene + ## Alterations + - map: + column: pathogenic + to_property: oncogenic + for_object: alteration + - map: + column: effect + to_property: consequence + for_object: alteration + - map: + column: Homogeneous + to_property: homogenous + for_object: alteration + - map: + column: 
expressed + to_property: expressed + for_object: alteration + - string: + value: " " + to_property: edgelabel + for_objects: + - patient_carries_sample + - sample_carries_alteration + - alteration_causes_gene_status + - gene_status_affects_gene + +metadata: + - data_source: structural_variants_placeholder \ No newline at end of file diff --git a/weave.py b/weave.py index cd0190b..38dcbda 100755 --- a/weave.py +++ b/weave.py @@ -34,8 +34,9 @@ ontoweaver.transformer.register(OmniPath_directed) # Importing custom transformer for translating sample ids with publication code and registering it. -from oncodashkb.transformers.specific_translate_transformers import translate_sample_ids +from oncodashkb.transformers.specific_translate_transformers import translate_sample_ids, translate_cat_format ontoweaver.transformer.register(translate_sample_ids) +ontoweaver.transformer.register(translate_cat_format) # Importing OpenTargets custom transformer and registering it. from oncodashkb.transformers.ot_transformers import access_proteins, urls_to_prop @@ -153,6 +154,9 @@ def process_OT(directory, name): parser.add_argument("-cnae", "--copy-number-amplifications-external", metavar="CSV", nargs="+", help="Extract from a CSV file with copy number amplifications' external annotations.") + parser.add_argument("-sv", "--structural-variants", metavar="CSV", nargs="+", + help="Extract from a CSV file with short mutations' local annotations.") + parser.add_argument("-o", "--oncokb", metavar="CSV", nargs="+", help="Extract from an OncoKB CSV file.") @@ -218,6 +222,7 @@ def process_OT(directory, name): "short_mutations_external", "copy_number_amplifications_local", "copy_number_amplifications_external", + "structural_variants", "oncokb", "omnipath_networks", "open_targets_target", @@ -282,6 +287,53 @@ def process_OT(directory, name): edges += local_edges logging.info(f"Done adapter {opt_loaded}/{opt_total}") + if asked.structural_variants: + opt_loaded += 1 + logging.info(f"########## 
Adapter #{opt_loaded}/{opt_total} ##########") + data_file = asked.structural_variants[0] + mapping_file = "./oncodashkb/adapters/structural_variants.yaml" + + # logging.info(f"Weave structural variants...") + logging.info(f" | Weave `{data_file}:{mapping_file}`...") + logging.info(f" | | Load data `{data_file}`...") + table = pd.read_excel(data_file) + + table = table.rename(columns={"Gene.type":"Gene_type"}) + + try: + with open(mapping_file) as fd: + ymapping = yaml.full_load(fd) + except Exception as e: + logging.error(e) + sys.exit(error_codes["CannotAccessFile"]) + + logging.info(f" | | Process {mapping_file}...") + + yparser = ontoweaver.mapping.YamlParser(ymapping) + mapping = yparser() + + adapter = ontoweaver.tabular.PandasAdapter( + table, + *mapping, + type_affix="suffix", + type_affix_sep=":", + raise_errors = True + ) + + local_nodes = [] + local_edges = [] + with alive_bar(len(table), file=sys.stderr) as progress: + for n,e in adapter(): + # NOTE: here, n & e are ontoweaver.base.Element, not BioCypher tuples. 
+ local_nodes += n + local_edges += e + progress() + + logging.info(f" | | OK, wove: {len(local_nodes)} nodes, {len(local_edges)} edges.") + nodes += local_nodes + edges += local_edges + logging.info(f"Done adapter {opt_loaded}/{opt_total}") + if asked.omnipath_networks: opt_loaded += 1 logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") @@ -414,9 +466,8 @@ def process_OT(directory, name): "short_mutations_external", "copy_number_amplifications_local", "copy_number_amplifications_external", + # "structural_variants", "oncokb", - # "omnipath_networks", - # "ot-" "cgi", ] for name in direct_mappings: From cf5e37299cf6c675958cf61ecd27a0ffc9339c5a Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Wed, 18 Mar 2026 19:15:41 +0100 Subject: [PATCH 06/15] feat: distinguished structural variants, short mutations and copy number amplifications --- config/schema.yaml | 69 +++++++++++++++---- .../copy_number_amplifications_external.yaml | 26 +++---- .../copy_number_amplifications_local.yaml | 8 +-- .../adapters/short_mutations_external.yaml | 26 +++---- .../adapters/short_mutations_local.yaml | 21 +++--- oncodashkb/adapters/structural_variants.yaml | 22 +++--- 6 files changed, 105 insertions(+), 67 deletions(-) diff --git a/config/schema.yaml b/config/schema.yaml index a28150b..bd0f079 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -4,12 +4,11 @@ # Defined in alphabetical order -alteration: +short mutation: is_a: sequence variant represented_as: node - label_in_input: alteration + label_in_input: short_mutation properties: - gene_symbol_alteration: str citation_PM_ids: str consequence: str homogenous: str @@ -23,7 +22,47 @@ alteration: refCount: int64 altCount: int64 expressed: bool - ensembl_id_alteration: str + # ensembl_id_alteration: str + +copy number amplification: + is_a: sequence variant + represented_as: node + label_in_input: copy_number_amplification + properties: + citation_PM_ids: str + consequence: str + homogenous: str + 
mutation_effect_description: str + data_source: str + oncogenic: str + reference_genome: str + tumor_type: str + tumor_type_summary: str + variant_summary: str + refCount: int64 + altCount: int64 + expressed: bool + # ensembl_id_alteration: str + +structural variant: + is_a: sequence variant + represented_as: node + label_in_input: structural_variant + properties: + citation_PM_ids: str + consequence: str + homogenous: str + mutation_effect_description: str + data_source: str + oncogenic: str + reference_genome: str + tumor_type: str + tumor_type_summary: str + variant_summary: str + refCount: int64 + altCount: int64 + expressed: bool + # ensembl_id_alteration: str disease: represented_as: node @@ -212,7 +251,7 @@ protein: ### CARRIES -# To allow queries for patient carrying samples, and samples carrying alterations, +# To allow queries for patient carrying samples, and samples carrying variants, # without mixing with "effects" causes. carries: is_a: causes @@ -232,12 +271,12 @@ patient carries sample: data_source: str edglelabel: str -sample carries alteration: +sample carries variant: is_a: carries represented_as: edge - label_in_input: sample_carries_alteration + label_in_input: sample_carries_variant source: sample - target: alteration + target: sequence variant properties: data_source: str edglelabel: str @@ -246,16 +285,16 @@ sample carries alteration: # A gene is linked to its gene status (gain or loss of function), # which are represented as nodes, so as to allow a causal path -# to go through alteration -> gene status -> transcript activity. +# to go through variant -> gene status -> transcript activity. # Hence, outcomes have at least two instances: # - Gene:GoF, and # - Gene:LoF. 
-alteration causes gene status: +variant causes gene status: is_a: causes represented_as: edge - label_in_input: alteration_causes_gene_status - source: alteration + label_in_input: variant_causes_gene_status + source: sequence variant target: gene status properties: data_source: str @@ -267,11 +306,11 @@ alteration causes gene status: # as predictive markers for treatment response, # based on clinical evidence categorized by evidence levels. -alteration biomarker for drug: +variant biomarker for drug: is_a: biomarker for represented_as: edge - label_in_input: alteration_biomarker_for_drug - source: alteration + label_in_input: variant_biomarker_for_drug + source: sequence variant target: drug properties: data_source: str diff --git a/oncodashkb/adapters/copy_number_amplifications_external.yaml b/oncodashkb/adapters/copy_number_amplifications_external.yaml index f0ef571..5fee1b5 100644 --- a/oncodashkb/adapters/copy_number_amplifications_external.yaml +++ b/oncodashkb/adapters/copy_number_amplifications_external.yaml @@ -29,18 +29,18 @@ transformers: - hugoSymbol - alteration from_subject: sample - to_object: alteration - via_relation: sample_carries_alteration + to_object: copy_number_amplification + via_relation: sample_carries_variant format_string: "{hugoSymbol}:{alteration}" ## Gene status - cat_format: columns: - ensembl_id - gene_role - from_subject: alteration + from_subject: copy_number_amplification to_object: gene_status format_string: "{ensembl_id}:{gene_role}" - via_relation: alteration_causes_gene_status + via_relation: variant_causes_gene_status # column_to_translate: # - hugoSymbol # translations_file: data/HGNC/hgnc_complete_set.txt @@ -64,7 +64,7 @@ transformers: - ensembl_id - alteration to_property: ensembl_id_alteration - for_object: alteration + for_object: copy_number_amplification format_string: "{ensembl_id}:{alteration}" # column_to_translate: # - hugoSymbol @@ -89,19 +89,19 @@ transformers: - map: column: tumorType to_property: tumor_type 
- for_object: alteration + for_object: copy_number_amplification - map: column: oncogenic to_property: oncogenic - for_object: alteration + for_object: copy_number_amplification - replace: column: mutationEffectDescription to_property: mutation_effect_description - for_object: alteration + for_object: copy_number_amplification - map: column: citationPMids to_property: citation_PM_ids - for_object: alteration + for_object: copy_number_amplification - replace: column: geneSummary to_property: gene_summary @@ -109,18 +109,18 @@ transformers: - map: column: variantSummary to_property: variant_summary - for_object: alteration + for_object: copy_number_amplification - map: column: tumorTypeSummary to_property: tumor_type_summary - for_object: alteration + for_object: copy_number_amplification - string: value: " " to_property: edgelabel for_objects: - patient_carries_sample - - sample_carries_alteration - - alteration_causes_gene_status + - sample_carries_variant + - variant_causes_gene_status - gene_status_affects_gene metadata: diff --git a/oncodashkb/adapters/copy_number_amplifications_local.yaml b/oncodashkb/adapters/copy_number_amplifications_local.yaml index 87a4fb4..d7a46f8 100644 --- a/oncodashkb/adapters/copy_number_amplifications_local.yaml +++ b/oncodashkb/adapters/copy_number_amplifications_local.yaml @@ -22,16 +22,16 @@ transformers: - hugoSymbol - alteration from_subject: sample - to_object: alteration + to_object: copy_number_amplification format_string: "{hugoSymbol}:{alteration}" - via_relation: sample_carries_alteration + via_relation: sample_carries_variant - map: column: referenceGenome to_property: reference_genome - for_object: alteration + for_object: copy_number_amplification - map: column: tumorType to_property: tumor_type - for_object: alteration + for_object: copy_number_amplification metadata: - data_source: copy_number_amplifications_local \ No newline at end of file diff --git a/oncodashkb/adapters/short_mutations_external.yaml 
b/oncodashkb/adapters/short_mutations_external.yaml index b4e4bfc..626f981 100644 --- a/oncodashkb/adapters/short_mutations_external.yaml +++ b/oncodashkb/adapters/short_mutations_external.yaml @@ -23,17 +23,17 @@ transformers: - map: column: alteration from_subject: sample - to_object: alteration - via_relation: sample_carries_alteration + to_object: short_mutation + via_relation: sample_carries_variant ## Gene Stauts - cat_format: columns: - ensembl_id - gene_role - from_subject: alteration + from_subject: short_mutation to_object: gene_status format_string: "{ensembl_id}:{gene_role}" - via_relation: alteration_causes_gene_status + via_relation: variant_causes_gene_status ## Genes - map: column: ensembl_id @@ -63,23 +63,23 @@ transformers: - map: column: tumorType to_property: tumor_type - for_object: alteration + for_object: short_mutation - map: column: consequence to_property: consequence - for_object: alteration + for_object: short_mutation - map: column: oncogenic to_property: oncogenic - for_object: alteration + for_object: short_mutation - replace: column: mutationEffectDescription to_property: mutation_effect_description - for_object: alteration + for_object: short_mutation - map: column: citationPMids to_property: citation_PM_ids - for_object: alteration + for_object: short_mutation - replace: column: geneSummary to_property: gene_summary @@ -87,18 +87,18 @@ transformers: - map: column: variantSummary to_property: variant_summary - for_object: alteration + for_object: short_mutation - map: column: tumorTypeSummary to_property: tumor_type_summary - for_object: alteration + for_object: short_mutation - string: value: " " to_property: edgelabel for_objects: - patient_carries_sample - - sample_carries_alteration - - alteration_causes_gene_status + - sample_carries_variant + - variant_causes_gene_status - gene_status_affects_gene metadata: - data_source: short_mutations_external \ No newline at end of file diff --git 
a/oncodashkb/adapters/short_mutations_local.yaml b/oncodashkb/adapters/short_mutations_local.yaml index 31c4c31..dfa514f 100644 --- a/oncodashkb/adapters/short_mutations_local.yaml +++ b/oncodashkb/adapters/short_mutations_local.yaml @@ -20,42 +20,41 @@ transformers: - map: column: alteration from_subject: sample - to_object: alteration - via_relation: sample_carries_alteration + to_object: short_mutation + via_relation: sample_carries_variant - map: column: referenceGenome to_property: reference_genome - for_object: alteration + for_object: short_mutation - map: column: tumorType to_property: tumor_type - for_object: alteration + for_object: short_mutation - map: column: consequence to_property: consequence - for_object: alteration + for_object: short_mutation - map: column: homogenous to_property: homogenous - for_object: alteration + for_object: short_mutation - map: column: refCount to_property: refCount - for_object: alteration + for_object: short_mutation - map: column: altCount to_property: altCount - for_object: alteration + for_object: short_mutation - map: column: expressed to_property: expressed - for_object: alteration + for_object: short_mutation - string: value: " " to_property: edgelabel for_objects: - patient_carries_sample - - sample_carries_alteration - - alteration_affects_gene + - sample_carries_variant metadata: - data_source: short_mutations_local \ No newline at end of file diff --git a/oncodashkb/adapters/structural_variants.yaml b/oncodashkb/adapters/structural_variants.yaml index 1b96645..754ce68 100644 --- a/oncodashkb/adapters/structural_variants.yaml +++ b/oncodashkb/adapters/structural_variants.yaml @@ -25,18 +25,18 @@ transformers: - primary_gene - effect from_subject: sample - to_object: alteration - via_relation: sample_carries_alteration + to_object: structural_variant + via_relation: sample_carries_variant format_string: "{primary_gene}:{effect}" ## Gene status - translate_cat_format: columns: - primary_gene - Gene_type - 
from_subject: alteration + from_subject: structural_variant to_object: gene_status format_string: "{primary_gene}:{Gene_type}" - via_relation: alteration_causes_gene_status + via_relation: variant_causes_gene_status column_to_translate: - primary_gene translations_file: data/HGNC/hgnc_complete_set.txt @@ -60,7 +60,7 @@ transformers: - primary_gene - effect to_property: ensembl_id_alteration - for_object: alteration + for_object: structural_variant format_string: "{primary_gene}:{effect}" column_to_translate: - primary_gene @@ -89,26 +89,26 @@ transformers: - map: column: pathogenic to_property: oncogenic - for_object: alteration + for_object: structural_variant - map: column: effect to_property: consequence - for_object: alteration + for_object: structural_variant - map: column: Homogeneous to_property: homogenous - for_object: alteration + for_object: structural_variant - map: column: expressed to_property: expressed - for_object: alteration + for_object: structural_variant - string: value: " " to_property: edgelabel for_objects: - patient_carries_sample - - sample_carries_alteration - - alteration_causes_gene_status + - sample_carries_variant + - variant_causes_gene_status - gene_status_affects_gene metadata: From 932db67d57fc72e9a331a221b9e3e7e8db27cd71 Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Thu, 19 Mar 2026 16:05:10 +0100 Subject: [PATCH 07/15] first integration of cgi annotations --- config/schema.yaml | 26 ++++++++-- make.sh | 3 +- oncodashkb/adapters/cgi.yaml | 96 +++++++++++++++++++++++------------- weave.py | 49 +++++++++++++++++- 4 files changed, 135 insertions(+), 39 deletions(-) diff --git a/config/schema.yaml b/config/schema.yaml index bd0f079..45d5e38 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -241,6 +241,12 @@ protein: ncbi_tax_id: str data_source: str +# Treatment + +treatment: + represented_as: node + input_label: treatment + ######################## # EDGES ######################## @@ -306,13 +312,17 @@ variant causes 
gene status: # as predictive markers for treatment response, # based on clinical evidence categorized by evidence levels. -variant biomarker for drug: - is_a: biomarker for +variant biomarker for treatment: + is_a: sequence variant modulates treatment association represented_as: edge - label_in_input: variant_biomarker_for_drug + label_in_input: variant_biomarker_for_treatment source: sequence variant - target: drug + target: treatment properties: + level_of_evidence: str + cgi_level: str + citations: str + tumorType: str data_source: str edglelabel: str @@ -486,3 +496,11 @@ drug has target: label_in_input: drug_has_target properties: data_source: str + +treatment has part drug: + is_a: association + represented_as: edge + label_in_input: treatment_has_part_drug + properties: + data_source: str + diff --git a/make.sh b/make.sh index a519657..9356eaf 100755 --- a/make.sh +++ b/make.sh @@ -85,10 +85,11 @@ cmd="uv run python3 ${py_args} $script_dir/weave.py \ --copy-number-amplifications-local $decider_dir/cnas_local.csv \ --copy-number-amplifications-external $decider_dir/cnas_external.csv \ --structural-variants $data_dir/DECIDER/$data_version/structural_variants.xlsx \ - --omnipath-networks $data_dir/omnipath_networks/omnipath_webservice_interactions__latest.tsv.gz \ + --omnipath-networks $data_dir/omnipath_networks/omnipath_webservice_interactions__latest.tsv.gz \ --open-targets-drug-molecule $data_dir/OT/drug_molecule/ --open-targets-drug_mechanism_of_action $data_dir/OT/drug_mechanism_of_action/ --open-targets-target $data_dir/OT/target/ + --cgi $data_dir/DECIDER/$data_version/treatments_cgi.csv \ ${weave_args}" # \ # --clinical $data_dir/DECIDER/clinical/clinical_export.xlsx \ # --oncokb $data_dir/DECIDER/$data_version/treatments.csv \ diff --git a/oncodashkb/adapters/cgi.yaml b/oncodashkb/adapters/cgi.yaml index 35b2ad3..8c47412 100644 --- a/oncodashkb/adapters/cgi.yaml +++ b/oncodashkb/adapters/cgi.yaml @@ -1,39 +1,69 @@ row: - rowIndex: - to_subject: variant 
+ map: + id_from_column: alteration + match_type_from_column: alteration_type + match: + - SNV: + to_subject: short_mutation + - CNA: + to_subject: copy_number_amplification transformers: + - replace: + column: treatment + to_object: treatment + via_relation: variant_biomarker_for_treatment + forbidden: ';' + substitue: ',' + - split_translate: + column: treatment + from_subject: treatment + to_object: drug + via_relation: treatment_has_part_drug + separator: "[,|;|+]" + translations_file: ./data/OT/drug_molecule/part-00000-871f412e-aec4-4d33-a50d-feee532ddcd2-c000.snappy.parquet + translate_from: name + translate_to: id - map: - columns: - - patient_id - to_object: patient - via_relation: patient_has_variant + column: level_of_evidence + to_property: level_of_evidence + for_object: variant_biomarker_for_treatment - map: - columns: - - gene - to_object: gene_hugo - via_relation: variant_in_gene - - split: - columns: - - sample - to_object: sample - via_relation: variant_in_sample - separator: ";" + column: cgi_level + to_property: cgi_level + for_object: variant_biomarker_for_treatment - map: - columns: - - transcript - from_subject: gene_hugo - to_object: transcript - via_relation: transcript_to_gene_relationship + column: citations + to_property: citations + for_object: variant_biomarker_for_treatment - map: - columns: - - oncogenic_summary - from_subject: variant - to_object: disease - via_relation: variant_to_disease - - map: - columns: - - consequence - to_property: - - consequence - for_objects: - - variant + column: tumorType + to_property: tumorType + for_object: variant_biomarker_for_treatment + # separator: "+" + # node type: DRUG + # upper + # split for ;,+ + # remove inside parenthesis + # translate to CHEMBLid + # node type: DRUG CATEGORY + # upper + # split for ;,+ + # remove inside parenthesis + # match if inhibitor blablabla : drug category + # - map: + # column: treatment + # from_subject: treatment + # to_object: drug + # - replace: + # columns: + 
# - treatment + # to_object: drug + # via_relation: variant_biomarker_for_drug + # substitute: "_" + # - string: + # value: "." + # to_property: edgelabel + # for_objects: + # - variant_biomarker_for_drug +metadata: + - data_source: cgi_annotation diff --git a/weave.py b/weave.py index 38dcbda..51e1bb6 100755 --- a/weave.py +++ b/weave.py @@ -334,6 +334,53 @@ def process_OT(directory, name): edges += local_edges logging.info(f"Done adapter {opt_loaded}/{opt_total}") + if asked.cgi: + opt_loaded += 1 + logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") + data_file = asked.cgi[0] + mapping_file = "./oncodashkb/adapters/cgi.yaml" + + # logging.info(f"Weave structural variants...") + logging.info(f" | Weave `{data_file}:{mapping_file}`...") + logging.info(f" | | Load data `{data_file}`...") + table = progress_read(data_file, hint=72648) + + table["treatment"] = table.treatment.str.upper().str.replace(r'\([^()]*\)', '', regex=True) + + try: + with open(mapping_file) as fd: + ymapping = yaml.full_load(fd) + except Exception as e: + logging.error(e) + sys.exit(error_codes["CannotAccessFile"]) + + logging.info(f" | | Process {mapping_file}...") + + yparser = ontoweaver.mapping.YamlParser(ymapping) + mapping = yparser() + + adapter = ontoweaver.tabular.PandasAdapter( + table, + *mapping, + type_affix="suffix", + type_affix_sep=":", + raise_errors = True + ) + + local_nodes = [] + local_edges = [] + with alive_bar(len(table), file=sys.stderr) as progress: + for n,e in adapter(): + # NOTE: here, n & e are ontoweaver.base.Element, not BioCypher tuples. 
+ local_nodes += n + local_edges += e + progress() + + logging.info(f" | | OK, wove: {len(local_nodes)} nodes, {len(local_edges)} edges.") + nodes += local_nodes + edges += local_edges + logging.info(f"Done adapter {opt_loaded}/{opt_total}") + if asked.omnipath_networks: opt_loaded += 1 logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") @@ -468,7 +515,7 @@ def process_OT(directory, name): "copy_number_amplifications_external", # "structural_variants", "oncokb", - "cgi", + # "cgi", ] for name in direct_mappings: option = getattr(asked, name) From f4f771bef807f907193747a21cadcd2ed0ea2747 Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Thu, 19 Mar 2026 17:54:28 +0100 Subject: [PATCH 08/15] added gene_role as property for CNA --- oncodashkb/adapters/copy_number_amplifications_external.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/oncodashkb/adapters/copy_number_amplifications_external.yaml b/oncodashkb/adapters/copy_number_amplifications_external.yaml index 5fee1b5..074f1f3 100644 --- a/oncodashkb/adapters/copy_number_amplifications_external.yaml +++ b/oncodashkb/adapters/copy_number_amplifications_external.yaml @@ -80,6 +80,10 @@ transformers: to_property: gene_symbol_gene_status for_object: gene_status format_string: "{hugoSymbol}:{gene_role}" + - map: + columns: gene_role + to_property: gene_role + for_object: gene_status ## Genes - map: column: hugoSymbol From b03c67267f58cbb361bfec863761997a296e65a7 Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Thu, 19 Mar 2026 17:55:31 +0100 Subject: [PATCH 09/15] fix: starting the neo4j database when no second argument is given --- make.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/make.sh b/make.sh index 9356eaf..5aafc7b 100755 --- a/make.sh +++ b/make.sh @@ -65,7 +65,7 @@ echo "Activate virtual environment..." 
>&2 source $(dirname $(uv python find))/activate -if [[ "$2" == "config/neo4j.yaml" ]] ; then +if [[ "$CONFIG" == "config/neo4j.yaml" ]] ; then echo "Stop Neo4j server..." >&2 neo_version=$(neo4j-admin --version | cut -d. -f 1) if [[ "$neo_version" -eq 4 ]]; then From efc25b3693cdd419bd6484a4aa24839fa3d6bc23 Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Thu, 19 Mar 2026 17:56:04 +0100 Subject: [PATCH 10/15] changing effect to mutation when defining structural variant id --- oncodashkb/adapters/structural_variants.yaml | 12 ++++++++---- weave.py | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/oncodashkb/adapters/structural_variants.yaml b/oncodashkb/adapters/structural_variants.yaml index 754ce68..278ec62 100644 --- a/oncodashkb/adapters/structural_variants.yaml +++ b/oncodashkb/adapters/structural_variants.yaml @@ -23,11 +23,11 @@ transformers: - cat_format: columns: - primary_gene - - effect + - mutation from_subject: sample to_object: structural_variant via_relation: sample_carries_variant - format_string: "{primary_gene}:{effect}" + format_string: "{primary_gene}:{mutation}" ## Gene status - translate_cat_format: columns: @@ -55,13 +55,17 @@ transformers: sep: "\t" # Properties ## Alterations + - map: + column: effect + to_property: consequence + for_object: structural_variant - translate_cat_format: columns: - primary_gene - - effect + - mutation to_property: ensembl_id_alteration for_object: structural_variant - format_string: "{primary_gene}:{effect}" + format_string: "{primary_gene}:{mutation}" column_to_translate: - primary_gene translations_file: data/HGNC/hgnc_complete_set.txt diff --git a/weave.py b/weave.py index 51e1bb6..1044015 100755 --- a/weave.py +++ b/weave.py @@ -299,6 +299,7 @@ def process_OT(directory, name): table = pd.read_excel(data_file) table = table.rename(columns={"Gene.type":"Gene_type"}) + table["mutation"] = table.mutation.str.replace(r';', ',', regex=True) try: with open(mapping_file) as fd: From 
d8ecc54be3c30ad0977f5e41decacf54b74e7fae Mon Sep 17 00:00:00 2001 From: Matthieu NAJM Date: Mon, 23 Mar 2026 10:28:05 +0100 Subject: [PATCH 11/15] upgrade ontoweaver and set up for debugging omnipath adapter --- make.sh | 20 ++++++++++---------- pyproject.toml | 5 ++--- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/make.sh b/make.sh index 5aafc7b..5cf3e70 100755 --- a/make.sh +++ b/make.sh @@ -79,18 +79,18 @@ fi echo "Weave data..." >&2 cmd="uv run python3 ${py_args} $script_dir/weave.py \ + --omnipath-networks $data_dir/omnipath_networks/subset_omnipath_networks_different_type_entity_type_source_and_entity_type_target_shorter.tsv \ --config $CONFIG \ - --short-mutations-local $decider_dir/short_mutations_local.csv \ - --short-mutations-external $decider_dir/short_mutations_external.csv \ - --copy-number-amplifications-local $decider_dir/cnas_local.csv \ - --copy-number-amplifications-external $decider_dir/cnas_external.csv \ - --structural-variants $data_dir/DECIDER/$data_version/structural_variants.xlsx \ - --omnipath-networks $data_dir/omnipath_networks/omnipath_webservice_interactions__latest.tsv.gz \ - --open-targets-drug-molecule $data_dir/OT/drug_molecule/ - --open-targets-drug_mechanism_of_action $data_dir/OT/drug_mechanism_of_action/ - --open-targets-target $data_dir/OT/target/ - --cgi $data_dir/DECIDER/$data_version/treatments_cgi.csv \ ${weave_args}" # \ + # --copy-number-amplifications-external $decider_dir/cnas_external.csv \ + # --short-mutations-local $decider_dir/short_mutations_local.csv \ + # --short-mutations-external $decider_dir/short_mutations_external.csv \ + # --copy-number-amplifications-local $decider_dir/cnas_local.csv \ + # --structural-variants $decider_dir/structural_variants.xlsx \ + # --open-targets-drug-molecule $data_dir/OT/drug_molecule/ + # --open-targets-drug_mechanism_of_action $data_dir/OT/drug_mechanism_of_action/ + # --open-targets-target $data_dir/OT/target/ + # --cgi $decider_dir/treatments_cgi.csv \ # 
--clinical $data_dir/DECIDER/clinical/clinical_export.xlsx \ # --oncokb $data_dir/DECIDER/$data_version/treatments.csv \ diff --git a/pyproject.toml b/pyproject.toml index c0b074c..98bffa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "matplotlib>=3.10.0,<4.0", "polars>=1.22.0,<2.0", "seaborn>=0.13.2,<0.14", - "ontoweaver>=1.3.0,<1.4.0", + "ontoweaver>=1.4.0,<1.5.0", "openpyxl>=3.1.5", "pyarrow<21.0.0", "fastparquet<2026.3.0", @@ -32,5 +32,4 @@ dependencies = [ dev = [ "pre-commit>=4.5.0", "pytest>=8.4.1", -] - +] \ No newline at end of file From d0c20f18bc7063d5ee4d9c58d916d19f628b8123 Mon Sep 17 00:00:00 2001 From: Claire Laudy Date: Tue, 31 Mar 2026 15:14:05 +0200 Subject: [PATCH 12/15] fix(config for BPN): adds parameters to the biopathnet config file + fixes the pyproject for MacOs/ARM computers. --- config/biopathnet.yaml | 4 +++- make.sh | 2 ++ pyproject.toml | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/config/biopathnet.yaml b/config/biopathnet.yaml index 6c3cc50..18a7c40 100644 --- a/config/biopathnet.yaml +++ b/config/biopathnet.yaml @@ -9,9 +9,11 @@ biocypher: root_node: entity biopathnet: - file_format: txt + file_format: txt:bn entity_types_file_stem: entity_types entity_names_file_stem: entity_names background_graph_file_stem: brg skg_file_stem: skg + targeted_relation: "(alteration, variant_biomarker_for_treatment, drug)" + include_properties: False diff --git a/make.sh b/make.sh index 8e5d64a..3c267b7 100755 --- a/make.sh +++ b/make.sh @@ -78,6 +78,8 @@ fi echo "Weave data..." 
>&2 +echo "CONFIG = $CONFIG" >&2 + cmd="uv run python3 ${py_args} $script_dir/weave.py \ --config $CONFIG \ --short-mutations-local $decider_dir/short_mutations_local.csv \ diff --git a/pyproject.toml b/pyproject.toml index c0b074c..9060199 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "seaborn>=0.13.2,<0.14", "ontoweaver>=1.3.0,<1.4.0", "openpyxl>=3.1.5", - "pyarrow<21.0.0", + "pyarrow>20.0.0", "fastparquet<2026.3.0", ] From ef7744f014b90363fa63c0a9606404ed1dd281ec Mon Sep 17 00:00:00 2001 From: Claire Laudy Date: Tue, 31 Mar 2026 20:06:30 +0200 Subject: [PATCH 13/15] feat(fetch all datasources): Adds all the datasources to the make.sh script. --- make.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/make.sh b/make.sh index 2d5df70..fec908a 100755 --- a/make.sh +++ b/make.sh @@ -81,17 +81,17 @@ echo "Weave data..." >&2 echo "CONFIG = $CONFIG" >&2 cmd="uv run python3 ${py_args} $script_dir/weave.py \ - --omnipath-networks $data_dir/omnipath_networks/subset_omnipath_networks_different_type_entity_type_source_and_entity_type_target_shorter.tsv \ + --omnipath-networks $data_dir/omnipath_networks/omnipath_networks_different_type_entity_type_source_and_entity_type_target_shorter.tsv \ + --copy-number-amplifications-external $decider_dir/cnas_external.csv \ + --short-mutations-local $decider_dir/short_mutations_local.csv \ + --short-mutations-external $decider_dir/short_mutations_external.csv \ + --copy-number-amplifications-local $decider_dir/cnas_local.csv \ + --structural-variants $decider_dir/structural_variants.xlsx \ + --open-targets-drug-molecule $data_dir/OT/drug_molecule/ + --open-targets-drug_mechanism_of_action $data_dir/OT/drug_mechanism_of_action/ + --open-targets-target $data_dir/OT/target/ --config $CONFIG \ ${weave_args}" # \ - # --copy-number-amplifications-external $decider_dir/cnas_external.csv \ - # --short-mutations-local $decider_dir/short_mutations_local.csv \ - # 
--short-mutations-external $decider_dir/short_mutations_external.csv \ - # --copy-number-amplifications-local $decider_dir/cnas_local.csv \ - # --structural-variants $decider_dir/structural_variants.xlsx \ - # --open-targets-drug-molecule $data_dir/OT/drug_molecule/ - # --open-targets-drug_mechanism_of_action $data_dir/OT/drug_mechanism_of_action/ - # --open-targets-target $data_dir/OT/target/ # --cgi $decider_dir/treatments_cgi.csv \ # --clinical $data_dir/DECIDER/clinical/clinical_export.xlsx \ # --oncokb $data_dir/DECIDER/$data_version/treatments.csv \ From 71a55d86ec55df6926dcca7c37f92ca1857f69b2 Mon Sep 17 00:00:00 2001 From: Claire Laudy Date: Wed, 1 Apr 2026 17:36:32 +0200 Subject: [PATCH 14/15] fix(config & make): Fixes the biopathnet config file and make.sh script to enable the export of oncodashkb into a BioPathNet set of input files. --- config/biopathnet.yaml | 2 +- make.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/biopathnet.yaml b/config/biopathnet.yaml index 18a7c40..7d87237 100644 --- a/config/biopathnet.yaml +++ b/config/biopathnet.yaml @@ -14,6 +14,6 @@ biopathnet: entity_names_file_stem: entity_names background_graph_file_stem: brg skg_file_stem: skg - targeted_relation: "(alteration, variant_biomarker_for_treatment, drug)" + targeted_relation: "(alteration, variant biomarker for treatment, drug)" include_properties: False diff --git a/make.sh b/make.sh index fec908a..a7b5ad3 100755 --- a/make.sh +++ b/make.sh @@ -81,18 +81,18 @@ echo "Weave data..." 
>&2 echo "CONFIG = $CONFIG" >&2 cmd="uv run python3 ${py_args} $script_dir/weave.py \ - --omnipath-networks $data_dir/omnipath_networks/omnipath_networks_different_type_entity_type_source_and_entity_type_target_shorter.tsv \ --copy-number-amplifications-external $decider_dir/cnas_external.csv \ --short-mutations-local $decider_dir/short_mutations_local.csv \ --short-mutations-external $decider_dir/short_mutations_external.csv \ --copy-number-amplifications-local $decider_dir/cnas_local.csv \ - --structural-variants $decider_dir/structural_variants.xlsx \ --open-targets-drug-molecule $data_dir/OT/drug_molecule/ --open-targets-drug_mechanism_of_action $data_dir/OT/drug_mechanism_of_action/ --open-targets-target $data_dir/OT/target/ + --cgi $decider_dir/treatments_cgi.csv \ --config $CONFIG \ ${weave_args}" # \ - # --cgi $decider_dir/treatments_cgi.csv \ + # --omnipath-networks $data_dir/omnipath_networks/omnipath_networks_different_type_entity_type_source_and_entity_type_target_shorter.tsv \ + # --structural-variants $decider_dir/structural_variants.xlsx \ # --clinical $data_dir/DECIDER/clinical/clinical_export.xlsx \ # --oncokb $data_dir/DECIDER/$data_version/treatments.csv \ From 9453d91917204b1abcb630eed992edce4b4b9ab3 Mon Sep 17 00:00:00 2001 From: Claire Laudy Date: Fri, 3 Apr 2026 10:53:56 +0200 Subject: [PATCH 15/15] feat(multiple export back-ends): Adds the possibility to export the SKG to several back-ends. 
--- config/owl.yaml | 16 ++++++++++++++++ weave.py | 38 ++++++++++++++++++++++---------------- 2 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 config/owl.yaml diff --git a/config/owl.yaml b/config/owl.yaml new file mode 100644 index 0000000..60a2b17 --- /dev/null +++ b/config/owl.yaml @@ -0,0 +1,16 @@ +biocypher: + debug: false + offline: true + dbms: owl + + # Ontology configuration + head_ontology: + url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl + root_node: entity + +owl: + edge_model: ObjectProperty + file_format: turtle + labels_order: "Ascending" # Default: From more specific to more generic. + node_labels_order: "Ascending" # Default: use labels_order. + edge_labels_order: "Leaves" diff --git a/weave.py b/weave.py index 1044015..278730e 100755 --- a/weave.py +++ b/weave.py @@ -136,7 +136,8 @@ def process_OT(directory, name): parser = argparse.ArgumentParser( description=usage) - parser.add_argument("-C", "--config", metavar="FILE", default="config/neo4j.yaml", + parser.add_argument("-C", "--config", metavar="FILE", default=["config/neo4j.yaml"], + action="append", help="The BioCypher configuration to load [default: config/neo4j.yaml].") parser.add_argument("-i", "--clinical", metavar="CSV", nargs="+", @@ -197,10 +198,6 @@ def process_OT(directory, name): help="Set the verbose level (default: %(default)s).") asked = parser.parse_args() - bc = biocypher.BioCypher( - biocypher_config_path = asked.config, - schema_config_path = "config/schema.yaml" - ) logging.basicConfig() logging.getLogger().setLevel(asked.verbose) @@ -670,17 +667,26 @@ def process_OT(directory, name): # Export the final SKG. 
################################################### - logging.info(f"Write the final SKG into files...") - if fnodes: - bc.write_nodes(n.as_tuple() for n in fnodes) - if fedges: - bc.write_edges(e.as_tuple() for e in fedges) - #bc.summary() - import_file = bc.write_import_call() - logging.info(f"OK, wrote files.") - - # Print on stdout for other scripts to get. - print(import_file) + configs = asked.config + + for config in configs: + logging.info(f"Write the final SKG into {config} files...") + + bc = biocypher.BioCypher( + biocypher_config_path = config, + schema_config_path = "config/schema.yaml" + ) + + if fnodes: + bc.write_nodes(n.as_tuple() for n in fnodes) + if fedges: + bc.write_edges(e.as_tuple() for e in fedges) + #bc.summary() + import_file = bc.write_import_call() + logging.info(f"OK, wrote files.") + + # Print on stdout for other scripts to get. + print(import_file) if asked.import_script_run: shell = os.environ["SHELL"]