diff --git a/README.md b/README.md index 7e78fb4..73ab27f 100644 --- a/README.md +++ b/README.md @@ -224,59 +224,6 @@ the data that you want to integrate. ./weave.py –oncokb /path_to_file/test_genomics_oncokbannotation.csv ``` - -### Gene Ontology adapter - -**Gene Ontology** is one of the biggest biomedical databases. The described -adapter helps to integrate the data about the molecular function of the gene -product, as well as the biological process in which these genes are involved. - -- Molecular function: GO annotations that have relation type `enabled` - or `contributes_to`. -- Biological process: GO annotations that have relation type `involved_in`. - -**To integrate the data, three files are necessary:** -- `--gene_ontology` option for GO annotations in GAF format [Download GO annotations](http://current.geneontology.org/products/pages/downloads.html) -- `--gene_ontology_owl` option for GO ontology in OWL format [Download GO ontology](https://geneontology.org/docs/download-ontology/) -- `--gene_ontology_genes` option for the list of genes for which we want to - integrate the GO annotations (example in adapters/Hugo_Symbol_genes.conf file, - by default = list of genes from OncoKB database). - -**Example of use:** - -``` sh -./weave.py --gene_ontology /path_to_file/goa_human.gaf --gene_ontology_owl /path_to_file/go.owl --gene_ontology_genes /path_to_file/Hugo_Symbol_genes.conf -``` - -If you want to integrate annotations with another type of relations, you can -modify the `adapters/gene_ontology.py` file by adding the next code in the -**class Gene_ontology** (example for the `involved_in` edge type): - -``` python -# Create new columns that depends on edge type. -df['GO_involved_in'] = None - -# Cut df to include only edge type that we have chosen and annotations -# for genes from OncoKB. 
-df = df[((df['Qualifier'].isin(['enables', 'involved_in', 'contributes_to'])) & - (df['DB_Object_Symbol'].isin(included_genes)))] -``` -Also, you need to add code in `separate_edges_types` method: - -``` sh -# Function to copy GO_term to related column for future ontoweaver mapping -# based on Qualifier column (relation type). - def separate_edges_types(row): - if row['Qualifier'] == 'enables': - row['GO_enables'] = row['GO_term'] - elif row['Qualifier'] == 'involved_in': - row['GO_involved_in'] = row['GO_term'] -``` - -Finally, you need to specify the node and edge types in the `gene_ontology.yaml` -for `GO_involved_in` column. - - ### Open Targets adapter Open Targets is a public database that aims to systematically identify and diff --git a/config/biopathnet.yaml b/config/biopathnet.yaml index 6c3cc50..7d87237 100644 --- a/config/biopathnet.yaml +++ b/config/biopathnet.yaml @@ -9,9 +9,11 @@ biocypher: root_node: entity biopathnet: - file_format: txt + file_format: txt:bn entity_types_file_stem: entity_types entity_names_file_stem: entity_names background_graph_file_stem: brg skg_file_stem: skg + targeted_relation: "(alteration, variant biomarker for treatment, drug)" + include_properties: False diff --git a/config/owl.yaml b/config/owl.yaml new file mode 100644 index 0000000..60a2b17 --- /dev/null +++ b/config/owl.yaml @@ -0,0 +1,16 @@ +biocypher: + debug: false + offline: true + dbms: owl + + # Ontology configuration + head_ontology: + url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl + root_node: entity + +owl: + edge_model: ObjectProperty + file_format: turtle + labels_order: "Ascending" # Default: From more specific to more generic. + node_labels_order: "Ascending" # Default: use labels_order. 
+ edge_labels_order: "Leaves" diff --git a/config/schema.yaml b/config/schema.yaml index e6ef1d1..45d5e38 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -4,12 +4,11 @@ # Defined in alphabetical order -alteration: +short mutation: is_a: sequence variant represented_as: node - label_in_input: alteration + label_in_input: short_mutation properties: - gene_symbol_alteration: str citation_PM_ids: str consequence: str homogenous: str @@ -23,7 +22,47 @@ alteration: refCount: int64 altCount: int64 expressed: bool - ensembl_id_alteration: str + # ensembl_id_alteration: str + +copy number amplification: + is_a: sequence variant + represented_as: node + label_in_input: copy_number_amplification + properties: + citation_PM_ids: str + consequence: str + homogenous: str + mutation_effect_description: str + data_source: str + oncogenic: str + reference_genome: str + tumor_type: str + tumor_type_summary: str + variant_summary: str + refCount: int64 + altCount: int64 + expressed: bool + # ensembl_id_alteration: str + +structural variant: + is_a: sequence variant + represented_as: node + label_in_input: structural_variant + properties: + citation_PM_ids: str + consequence: str + homogenous: str + mutation_effect_description: str + data_source: str + oncogenic: str + reference_genome: str + tumor_type: str + tumor_type_summary: str + variant_summary: str + refCount: int64 + altCount: int64 + expressed: bool + # ensembl_id_alteration: str disease: represented_as: node @@ -202,6 +241,12 @@ protein: ncbi_tax_id: str data_source: str +# Treatment + +treatment: + represented_as: node + input_label: treatment + ######################## # EDGES ######################## @@ -212,7 +257,7 @@ protein: ### CARRIES -# To allow queries for patient carrying samples, and samples carrying alterations, +# To allow queries for patient carrying samples, and samples carrying variants, # without mixing with "effects" causes. 
carries: is_a: causes @@ -232,12 +277,12 @@ patient carries sample: data_source: str edglelabel: str -sample carries alteration: +sample carries variant: is_a: carries represented_as: edge - label_in_input: sample_carries_alteration + label_in_input: sample_carries_variant source: sample - target: alteration + target: sequence variant properties: data_source: str edglelabel: str @@ -246,16 +291,16 @@ sample carries alteration: # A gene is linked to its gene status (gain or loss of function), # which are represented as nodes, so as to allow a causal path -# to go through alteration -> gene status -> transcript activity. +# to go through variant -> gene status -> transcript activity. # Hence, outcomes have at least two instances: # - Gene:GoF, and # - Gene:LoF. -alteration causes gene status: +variant causes gene status: is_a: causes represented_as: edge - label_in_input: alteration_causes_gene_status - source: alteration + label_in_input: variant_causes_gene_status + source: sequence variant target: gene status properties: data_source: str @@ -267,13 +312,17 @@ alteration causes gene status: # as predictive markers for treatment response, # based on clinical evidence categorized by evidence levels. 
-alteration biomarker for drug: - is_a: biomarker for +variant biomarker for treatment: + is_a: sequence variant modulates treatment association represented_as: edge - label_in_input: alteration_biomarker_for_drug - source: alteration - target: drug - properties: + label_in_input: variant_biomarker_for_treatment + source: sequence variant + target: treatment + properties: + level_of_evidence: str + cgi_level: str + citations: str + tumorType: str data_source: str edglelabel: str @@ -302,10 +351,6 @@ gene status affects gene: ### GO -- TO BE FIXED -# annotation: -# is_a: named thing -# represented_as: node -# label_in_input: annotation biological process: is_a: named thing @@ -314,20 +359,6 @@ biological process: properties: data_source: str -# annotation for gene: -# is_a: association -# represented_as: edge -# label_in_input: annotation_for_gene -# source: annotation -# target: gene - -# involved in: -# is_a: association -# represented_as: edge -# label_in_input: involved_in -# source: annotation -# target: biological process - gene to biological process: is_a: association represented_as: edge @@ -345,9 +376,10 @@ biological process to gene: source: biological process target: gene properties: - # edglelabel: str data_source: str +### FUNCTIONAL PROTEIN PROTEIN INTERACTIONS + undirected molecular interaction: is_a: pairwise molecular interaction represented_as: edge @@ -447,6 +479,8 @@ inhibition: extra_attrs: str evidences: str +### TRANSCRIPT TO GENE RELATIONSHIP + transcript to gene relationship: # is_a: transcript to gene relationship represented_as: edge @@ -454,9 +488,19 @@ transcript to gene relationship: properties: data_source: str +### DRUG HAS TARGET + drug has target: is_a: drug to gene association represented_as: edge label_in_input: drug_has_target properties: data_source: str + +treatment has part drug: + is_a: association + represented_as: edge + label_in_input: treatment_has_part_drug + properties: + data_source: str + diff --git a/make.sh 
b/make.sh index 8e5d64a..a7b5ad3 100755 --- a/make.sh +++ b/make.sh @@ -65,7 +65,7 @@ echo "Activate virtual environment..." >&2 source $(dirname $(uv python find))/activate -if [[ "$2" == "config/neo4j.yaml" ]] ; then +if [[ "$CONFIG" == "config/neo4j.yaml" ]] ; then echo "Stop Neo4j server..." >&2 neo_version=$(neo4j-admin --version | cut -d. -f 1) if [[ "$neo_version" -eq 4 ]]; then @@ -78,23 +78,23 @@ fi echo "Weave data..." >&2 +echo "CONFIG = $CONFIG" >&2 + cmd="uv run python3 ${py_args} $script_dir/weave.py \ - --config $CONFIG \ + --copy-number-amplifications-external $decider_dir/cnas_external.csv \ --short-mutations-local $decider_dir/short_mutations_local.csv \ --short-mutations-external $decider_dir/short_mutations_external.csv \ --copy-number-amplifications-local $decider_dir/cnas_local.csv \ - --copy-number-amplifications-external $decider_dir/cnas_external.csv \ - --omnipath-networks $data_dir/omnipath_networks/omnipath_webservice_interactions__latest.tsv.gz \ --open-targets-drug-molecule $data_dir/OT/drug_molecule/ --open-targets-drug_mechanism_of_action $data_dir/OT/drug_mechanism_of_action/ --open-targets-target $data_dir/OT/target/ + --cgi $decider_dir/treatments_cgi.csv \ + --config $CONFIG \ ${weave_args}" # \ + # --omnipath-networks $data_dir/omnipath_networks/omnipath_networks_different_type_entity_type_source_and_entity_type_target_shorter.tsv \ + # --structural-variants $decider_dir/structural_variants.xlsx \ # --clinical $data_dir/DECIDER/clinical/clinical_export.xlsx \ - # --gene_ontology_genes $data_dir/DECIDER/$data_version/OncoKB_gene_symbols.conf \ # --oncokb $data_dir/DECIDER/$data_version/treatments.csv \ - # --gene_ontology $data_dir/GO/goa_human.gaf.gz \ - # --gene_ontology_owl $data_dir/GO/go.owl \ - # --gene_ontology_reverse echo "Weaving command:" >&2 diff --git a/oncodashkb/adapters/Ensembl_genes.conf b/oncodashkb/adapters/Ensembl_genes.conf deleted file mode 100644 index c595cc8..0000000 --- 
a/oncodashkb/adapters/Ensembl_genes.conf +++ /dev/null @@ -1,67 +0,0 @@ -'ENSG00000100311', - 'ENSG00000140538', - 'ENSG00000101972', - 'ENSG00000107485', - 'ENSG00000141510', - 'ENSG00000171456', - 'ENSG00000136997', - 'ENSG00000099956', - 'ENSG00000157168', - 'ENSG00000104884', - 'ENSG00000112679', - 'ENSG00000169032', - 'ENSG00000115524', - 'ENSG00000187266', - 'ENSG00000119772', - 'ENSG00000139083', - 'ENSG00000172175', - 'ENSG00000113916', - 'ENSG00000171094', - 'ENSG00000121879', - 'ENSG00000141736', - 'ENSG00000109670', - 'ENSG00000073282', - 'ENSG00000127528', - 'ENSG00000133703', - 'ENSG00000138376', - 'ENSG00000066468', - 'ENSG00000179218', - 'ENSG00000156531', - 'ENSG00000183765', - 'ENSG00000149311', - 'ENSG00000169249', - 'ENSG00000120217', - 'ENSG00000245848', - 'ENSG00000096968', - 'ENSG00000023445', - 'ENSG00000105976', - 'ENSG00000068078', - 'ENSG00000147889', - 'ENSG00000178573', - 'ENSG00000182054', - 'ENSG00000139163', - 'ENSG00000097007', - 'ENSG00000174775', - 'ENSG00000012048', - 'ENSG00000157764', - 'ENSG00000100393', - 'ENSG00000157873', - 'ENSG00000168685', - 'ENSG00000183337', - 'ENSG00000085224', - 'ENSG00000071564', - 'ENSG00000105397', - 'ENSG00000152217', - 'ENSG00000185920', - 'ENSG00000106462', - 'ENSG00000205755', - 'ENSG00000197646', - 'ENSG00000091831', - 'ENSG00000292363', - 'ENSG00000148400', - 'ENSG00000135679', - 'ENSG00000138413', - 'ENSG00000171791', - 'ENSG00000077782', - 'ENSG00000137265', - 'ENSG00000187741' \ No newline at end of file diff --git a/oncodashkb/adapters/Hugo_Symbol_genes.conf b/oncodashkb/adapters/Hugo_Symbol_genes.conf deleted file mode 100644 index 32e8631..0000000 --- a/oncodashkb/adapters/Hugo_Symbol_genes.conf +++ /dev/null @@ -1,9 +0,0 @@ -'MET', 'BRAF', 'EZH2', 'CDKN2A', 'ETV6', 'ETNK1', 'KRAS', 'NTRK3', -'IDH2', 'MAF', 'BRCA1', 'TP53', 'BCOR', 'FGFR1', 'MYC', 'JAK2', -'CD274', 'PDCD1LG2', 'PIK3CA', 'BCL6', 'TP63', 'IL7R', 'MDM2', -'SETBP1', 'FBXW7', 'ABL1', 'MAP2K1', 'TYK2', 'EPOR', 'ERCC2', 
-'SMARCB1', 'CHEK2', 'PDGFB', 'EP300', 'STAG2', 'PHF6', 'FGFR2', -'FGFR3', 'NRG1', 'GATA3', 'HRAS', 'ERBB2', 'BCL2', 'TCF3', 'CEBPA', -'CRLF2', 'ZRSR2', 'NOTCH1', 'TNFRSF14', 'BARD1', 'ESR1', 'PTCH1', -'FANCA', 'KLF2', 'MALT1', 'CALR', 'DNMT3A', 'ALK', 'SF3B1', 'IDH1', -'DUSP22', 'IRF4', 'BIRC3', 'ATM', 'ASXL1', 'ATRX' \ No newline at end of file diff --git a/oncodashkb/adapters/README.md b/oncodashkb/adapters/README.md deleted file mode 100644 index b79f2c3..0000000 --- a/oncodashkb/adapters/README.md +++ /dev/null @@ -1,29 +0,0 @@ -## Gene Ontology Data Preparation - -**Gene Ontology** (GO) is one of the biggest biomedical databases for the annotation of genes and their products across different species. To integrate the data in the Semantic Knowledge Graph (SKG), we use the `GO Annotations file` for Homo Sapiens in `GAF format` [Download page](https://geneontology.org/docs/download-go-annotations/) . Each line in GAF file represents **one annotation** for a gene product and contains **17 columns** (you can read a detailed description of each column [here](https://geneontology.org/docs/go-annotation-file-gaf-format-2.2/])). - -Compared to the integration of the CGI and OncoKB databases, where each column represents a concrete data type from Biolink ontology, the GO annotations file contains data type for each annotation (row) in the column 'Qualifier'. For further details regarding different types of relationships, please refer to the following [link](https://wiki.geneontology.org/Annotation_Relations). - -To solve the issue concerning data types represented in one column and to make the integrated data in the SKG more clear and easy to understand, the following steps were implemented in the GO adapter: -- [Download](https://geneontology.org/docs/download-ontology/) the **GO ontology OWL file** to create a dictionary that can map **GO_ID** to **GO_term** cause there is only a **GO_ID** column in the GAF file. 
-- Create a new column **GO_term** using a dictionary and `create_id_term_dict` method. -- For the chosen type of the relation from the **column 'Qualifier'** (in our case, `enables`, `involved_in`, `contributes_to` relation types) create an additional column (in our case, `GO_enables`, `GO_involved_in`, `GO_contributes_to` columns) and copy the **GO_term** in the related column (see illustration below) - -![Schema_columns_GO_adapter](https://github.com/kgaydukova/oncodashkb/assets/23275374/37b23c98-17b6-45bd-ab34-bc4d7fdf72f9) - -- Declare data type and relation type in the mapping file `gene_ontology.yaml` for each synthetic additional column (`GO_enables`, `GO_involved_in`, `GO_contributes_to`). - -```yaml -subject: annotation # Type for each entry (e.g. line). - -columns: - GO_enables: - to_object: molecular_function - via_relation: enables - GO_involved_in: - to_object: biological_process - via_relation: involved_in - GO_contributes_to: - to_object: molecular_function - via_relation: contributes_to -``` diff --git a/oncodashkb/adapters/__init__.py b/oncodashkb/adapters/__init__.py deleted file mode 100644 index 582dac2..0000000 --- a/oncodashkb/adapters/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ - -# from . import types -from . import gene_ontology -from . import open_targets -from . import open_targets_evidences -from . import open_targets_drugs -from . 
import open_targets_diseases -__all__ = ['types', 'gene_ontology', 'open_targets', 'open_targets_evidences', 'open_targets_drugs', 'open_targets_diseases'] - diff --git a/oncodashkb/adapters/cgi.yaml b/oncodashkb/adapters/cgi.yaml index 35b2ad3..8c47412 100644 --- a/oncodashkb/adapters/cgi.yaml +++ b/oncodashkb/adapters/cgi.yaml @@ -1,39 +1,69 @@ row: - rowIndex: - to_subject: variant + map: + id_from_column: alteration + match_type_from_column: alteration_type + match: + - SNV: + to_subject: short_mutation + - CNA: + to_subject: copy_number_amplification transformers: + - replace: + column: treatment + to_object: treatment + via_relation: variant_biomarker_for_treatment + forbidden: ';' + substitue: ',' + - split_translate: + column: treatment + from_subject: treatment + to_object: drug + via_relation: treatment_has_part_drug + separator: "[,|;|+]" + translations_file: ./data/OT/drug_molecule/part-00000-871f412e-aec4-4d33-a50d-feee532ddcd2-c000.snappy.parquet + translate_from: name + translate_to: id - map: - columns: - - patient_id - to_object: patient - via_relation: patient_has_variant + column: level_of_evidence + to_property: level_of_evidence + for_object: variant_biomarker_for_treatment - map: - columns: - - gene - to_object: gene_hugo - via_relation: variant_in_gene - - split: - columns: - - sample - to_object: sample - via_relation: variant_in_sample - separator: ";" + column: cgi_level + to_property: cgi_level + for_object: variant_biomarker_for_treatment - map: - columns: - - transcript - from_subject: gene_hugo - to_object: transcript - via_relation: transcript_to_gene_relationship + column: citations + to_property: citations + for_object: variant_biomarker_for_treatment - map: - columns: - - oncogenic_summary - from_subject: variant - to_object: disease - via_relation: variant_to_disease - - map: - columns: - - consequence - to_property: - - consequence - for_objects: - - variant + column: tumorType + to_property: tumorType + for_object: 
variant_biomarker_for_treatment + # separator: "+" + # node type: DRUG + # upper + # split for ;,+ + # remove inside parenthesis + # translate to CHEMBLid + # node type: DRUG CATEGORY + # upper + # split for ;,+ + # remove inside parenthesis + # match if inhibitor blablabla : drug category + # - map: + # column: treatment + # from_subject: treatment + # to_object: drug + # - replace: + # columns: + # - treatment + # to_object: drug + # via_relation: variant_biomarker_for_drug + # substitute: "_" + # - string: + # value: "." + # to_property: edgelabel + # for_objects: + # - variant_biomarker_for_drug +metadata: + - data_source: cgi_annotation diff --git a/oncodashkb/adapters/copy_number_amplifications_external.yaml b/oncodashkb/adapters/copy_number_amplifications_external.yaml index f0ef571..074f1f3 100644 --- a/oncodashkb/adapters/copy_number_amplifications_external.yaml +++ b/oncodashkb/adapters/copy_number_amplifications_external.yaml @@ -29,18 +29,18 @@ transformers: - hugoSymbol - alteration from_subject: sample - to_object: alteration - via_relation: sample_carries_alteration + to_object: copy_number_amplification + via_relation: sample_carries_variant format_string: "{hugoSymbol}:{alteration}" ## Gene status - cat_format: columns: - ensembl_id - gene_role - from_subject: alteration + from_subject: copy_number_amplification to_object: gene_status format_string: "{ensembl_id}:{gene_role}" - via_relation: alteration_causes_gene_status + via_relation: variant_causes_gene_status # column_to_translate: # - hugoSymbol # translations_file: data/HGNC/hgnc_complete_set.txt @@ -64,7 +64,7 @@ transformers: - ensembl_id - alteration to_property: ensembl_id_alteration - for_object: alteration + for_object: copy_number_amplification format_string: "{ensembl_id}:{alteration}" # column_to_translate: # - hugoSymbol @@ -80,6 +80,10 @@ transformers: to_property: gene_symbol_gene_status for_object: gene_status format_string: "{hugoSymbol}:{gene_role}" + - map: + columns: 
gene_role + to_property: gene_role + for_object: gene_status ## Genes - map: column: hugoSymbol @@ -89,19 +93,19 @@ transformers: - map: column: tumorType to_property: tumor_type - for_object: alteration + for_object: copy_number_amplification - map: column: oncogenic to_property: oncogenic - for_object: alteration + for_object: copy_number_amplification - replace: column: mutationEffectDescription to_property: mutation_effect_description - for_object: alteration + for_object: copy_number_amplification - map: column: citationPMids to_property: citation_PM_ids - for_object: alteration + for_object: copy_number_amplification - replace: column: geneSummary to_property: gene_summary @@ -109,18 +113,18 @@ transformers: - map: column: variantSummary to_property: variant_summary - for_object: alteration + for_object: copy_number_amplification - map: column: tumorTypeSummary to_property: tumor_type_summary - for_object: alteration + for_object: copy_number_amplification - string: value: " " to_property: edgelabel for_objects: - patient_carries_sample - - sample_carries_alteration - - alteration_causes_gene_status + - sample_carries_variant + - variant_causes_gene_status - gene_status_affects_gene metadata: diff --git a/oncodashkb/adapters/copy_number_amplifications_local.yaml b/oncodashkb/adapters/copy_number_amplifications_local.yaml index 87a4fb4..d7a46f8 100644 --- a/oncodashkb/adapters/copy_number_amplifications_local.yaml +++ b/oncodashkb/adapters/copy_number_amplifications_local.yaml @@ -22,16 +22,16 @@ transformers: - hugoSymbol - alteration from_subject: sample - to_object: alteration + to_object: copy_number_amplification format_string: "{hugoSymbol}:{alteration}" - via_relation: sample_carries_alteration + via_relation: sample_carries_variant - map: column: referenceGenome to_property: reference_genome - for_object: alteration + for_object: copy_number_amplification - map: column: tumorType to_property: tumor_type - for_object: alteration + for_object: 
copy_number_amplification metadata: - data_source: copy_number_amplifications_local \ No newline at end of file diff --git a/oncodashkb/adapters/gene_ontology.py b/oncodashkb/adapters/gene_ontology.py deleted file mode 100644 index 4b6b7af..0000000 --- a/oncodashkb/adapters/gene_ontology.py +++ /dev/null @@ -1,139 +0,0 @@ -import types as pytypes -import logging -import ontoweaver - -from typing import Optional -from collections.abc import Iterable - -import pandas as pd - -from owlready2 import get_ontology - - -class Gene_ontology(ontoweaver.tabular.PandasAdapter): - - def __init__(self, - df: pd.DataFrame, - ontology: str, - genes_list: str, - config: dict, - type_affix=ontoweaver.base.TypeAffixes.none - ): - - # logging.info(" | | In Gene_ontology adapter init") - self.ontology = ontology - self.genes_list = genes_list - assert self.genes_list != None - - # define column names based on the GAF specification - columns = ['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB_Reference', 'Evidence_Code', - 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'Taxon', 'Date', - 'Assigned_By', 'Annotation_Extension', 'Gene_Product_Form_ID'] - - # assign column names to the DataFrame - df.columns = columns - - # create dict with GO_id:GO_term - logging.info(" | | Load GO taxonomy") - dict_go_plus = self.create_id_term_dict() - - # logging.info(" | | Sanitize keys") - # DELETE ; and , from terms (values in dictionary) to avoid future errors in CSV for neo4j import - for key in dict_go_plus.keys(): - if ',' in dict_go_plus[key]: - dict_go_plus[key] = dict_go_plus[key].replace(',', '') - if ';' in dict_go_plus[key]: - dict_go_plus[key] = dict_go_plus[key].replace(';', '') - if '\'' in dict_go_plus[key]: - dict_go_plus[key] = dict_go_plus[key].replace('\'', '') - - # logging.info(" | | Expand data") - # create additional column with GO terms (mapped from GO_id) - df['GO_term'] = df['GO_ID'].map(lambda go_id: 
dict_go_plus[go_id]) - - # create new columns that depends on edge type - df['GO_involved_in'] = None - df['GO_enables'] = None - df['GO_contributes_to'] = None - - ''' - List of genes the annotation for which we will integrate from Gene Ontology data, - Reading from Hugo_Symbol_genes.conf file - By default = genes from OncoKB database - ''' - # logging.info(" | | Read genes list") - included_genes = self.read_genes_list() - assert len(included_genes) > 0 - - # logging.info(" | | Filter out useless edges") - # cut df to include only edge type that we have chosen and annotations for genes from OncoKB - df = df[((df['Qualifier'].isin(['enables', 'involved_in', 'contributes_to'])) & - (df['DB_Object_Symbol'].isin(included_genes)))] - assert len(df) > 0 - - # add the GO_term in GO_involved_in, GO_enables, GO_contributes_to columns depending on the edge type in - # Qualifier column - # logging.info(" | | Separate edge types") - df = df.apply(self.separate_edges_types, axis=1) - assert len(df) > 0 - - # Default mapping as a simple config. - # logging.info(" | | Parse data") - from . import types - parser = ontoweaver.tabular.YamlParser(config, types) - mapping = parser() - - # logging.info(" | | Declare types") - # Declare types defined in the config. 
- super().__init__( - df, - *mapping, - ) - - logging.info(" | | Done Gene_ontology init") - - # function to create a dictionary with GO_id:GO_term for gene ontology, input - OWL file, output - dictionary - def create_id_term_dict(self): - dict_id_term = {} - - logging.debug(f"Load ontology: {self.ontology}") - - ont = get_ontology(self.ontology).load() - - # iterate through all classes in the ontology - for cls in ont.classes(): - # get the class ID and label (term) - class_id = cls.iri # read class_id like http://purl.obolibrary.org/obo/GO_0003674' - class_label = cls.label.first() if cls.label else cls.name - - # make the same key as we have in GO annotation files - class_id_key = class_id.replace("http://purl.obolibrary.org/obo/GO_", "GO:") - # add to dictionary like GO:0003674': 'molecular_function' - dict_id_term[class_id_key] = class_label - - return dict_id_term - - def read_genes_list(self): - - # print(self.genes_list=='o') - - with open(self.genes_list, 'r') as file: - - content = file.read() - genes = content.replace('\n', '').split(',') - genes = [gene.strip().strip("'") for gene in genes] - genes = list(filter(None, genes)) - - return genes - - # function to copy GO_term to related column for future ontoweaver mapping based on Qualifier column (relation type) - - @staticmethod - def separate_edges_types(row): - if row['Qualifier'] == 'enables': - row['GO_enables'] = row['GO_term'] - elif row['Qualifier'] == 'involved_in': - row['GO_involved_in'] = row['GO_term'] - elif row['Qualifier'] == 'contributes_to': - row['GO_contributes_to'] = row['GO_term'] - return row diff --git a/oncodashkb/adapters/gene_ontology.yaml b/oncodashkb/adapters/gene_ontology.yaml deleted file mode 100644 index d8d4184..0000000 --- a/oncodashkb/adapters/gene_ontology.yaml +++ /dev/null @@ -1,36 +0,0 @@ -row: - map: - column: DB_Object_Symbol - to_subject: gene -transformers: - # - map: - # column: DB_Object_Symbol - # to_object: gene - # via_relation: annotation_for_gene - # - 
map: - # column: GO_enables - # to_object: molecular_function - # via_relation: enables - # - map: - # columns: GO_involved_in - # to_object: biological_process - # via_relation: involved_in - # - map: - # columns: GO_contributes_to - # to_object: molecular_function - # via_relation: contributes_to - - map: - columns: GO_involved_in - to_object: biological_process - via_relation: gene_to_biological_process - # - map: - # column: GO_contributes_to - # from_subject: gene - # to_object: molecular_function - # via_relation: gene_to_molecular_function - - string: - value: " " - to_property: edgelabel - for_objects: gene_to_biological_process -metadata: - - data_source: gene_ontology diff --git a/oncodashkb/adapters/gene_ontology_reverse.yaml b/oncodashkb/adapters/gene_ontology_reverse.yaml deleted file mode 100644 index 1c72430..0000000 --- a/oncodashkb/adapters/gene_ontology_reverse.yaml +++ /dev/null @@ -1,36 +0,0 @@ -row: - map: - column: GO_involved_in - to_subject: biological_process -transformers: - # - map: - # column: DB_Object_Symbol - # to_object: gene - # via_relation: annotation_for_gene - # - map: - # column: GO_enables - # to_object: molecular_function - # via_relation: enables - # - map: - # columns: GO_involved_in - # to_object: biological_process - # via_relation: involved_in - # - map: - # columns: GO_contributes_to - # to_object: molecular_function - # via_relation: contributes_to - - map: - columns: DB_Object_Symbol - to_object: gene - via_relation: biological_process_to_gene - # - map: - # column: GO_contributes_to - # from_subject: gene - # to_object: molecular_function - # via_relation: gene_to_molecular_function - - string: - value: " " - to_property: edgelabel - for_objects: biological_process_to_gene -metadata: - - data_source: gene_ontology diff --git a/oncodashkb/adapters/open_targets_diseases.yaml b/oncodashkb/adapters/open_targets_diseases.yaml deleted file mode 100644 index e290608..0000000 --- 
a/oncodashkb/adapters/open_targets_diseases.yaml +++ /dev/null @@ -1,30 +0,0 @@ -row: - rowIndex: - to_subject: id -transformers: - - map: - columns: - - id - to_object: disease - via_relation: disease_to_id - - map: - columns: - - name - to_property: - - name - for_objects: - - disease - - map: - columns: - - description - to_property: - - description - for_objects: - - disease - - map: - columns: - - code - to_property: - - code - for_objects: - - disease diff --git a/oncodashkb/adapters/short_mutations_external.yaml b/oncodashkb/adapters/short_mutations_external.yaml index b4e4bfc..626f981 100644 --- a/oncodashkb/adapters/short_mutations_external.yaml +++ b/oncodashkb/adapters/short_mutations_external.yaml @@ -23,17 +23,17 @@ transformers: - map: column: alteration from_subject: sample - to_object: alteration - via_relation: sample_carries_alteration + to_object: short_mutation + via_relation: sample_carries_variant ## Gene Stauts - cat_format: columns: - ensembl_id - gene_role - from_subject: alteration + from_subject: short_mutation to_object: gene_status format_string: "{ensembl_id}:{gene_role}" - via_relation: alteration_causes_gene_status + via_relation: variant_causes_gene_status ## Genes - map: column: ensembl_id @@ -63,23 +63,23 @@ transformers: - map: column: tumorType to_property: tumor_type - for_object: alteration + for_object: short_mutation - map: column: consequence to_property: consequence - for_object: alteration + for_object: short_mutation - map: column: oncogenic to_property: oncogenic - for_object: alteration + for_object: short_mutation - replace: column: mutationEffectDescription to_property: mutation_effect_description - for_object: alteration + for_object: short_mutation - map: column: citationPMids to_property: citation_PM_ids - for_object: alteration + for_object: short_mutation - replace: column: geneSummary to_property: gene_summary @@ -87,18 +87,18 @@ transformers: - map: column: variantSummary to_property: variant_summary - 
for_object: alteration + for_object: short_mutation - map: column: tumorTypeSummary to_property: tumor_type_summary - for_object: alteration + for_object: short_mutation - string: value: " " to_property: edgelabel for_objects: - patient_carries_sample - - sample_carries_alteration - - alteration_causes_gene_status + - sample_carries_variant + - variant_causes_gene_status - gene_status_affects_gene metadata: - data_source: short_mutations_external \ No newline at end of file diff --git a/oncodashkb/adapters/short_mutations_local.yaml b/oncodashkb/adapters/short_mutations_local.yaml index 31c4c31..dfa514f 100644 --- a/oncodashkb/adapters/short_mutations_local.yaml +++ b/oncodashkb/adapters/short_mutations_local.yaml @@ -20,42 +20,41 @@ transformers: - map: column: alteration from_subject: sample - to_object: alteration - via_relation: sample_carries_alteration + to_object: short_mutation + via_relation: sample_carries_variant - map: column: referenceGenome to_property: reference_genome - for_object: alteration + for_object: short_mutation - map: column: tumorType to_property: tumor_type - for_object: alteration + for_object: short_mutation - map: column: consequence to_property: consequence - for_object: alteration + for_object: short_mutation - map: column: homogenous to_property: homogenous - for_object: alteration + for_object: short_mutation - map: column: refCount to_property: refCount - for_object: alteration + for_object: short_mutation - map: column: altCount to_property: altCount - for_object: alteration + for_object: short_mutation - map: column: expressed to_property: expressed - for_object: alteration + for_object: short_mutation - string: value: " " to_property: edgelabel for_objects: - patient_carries_sample - - sample_carries_alteration - - alteration_affects_gene + - sample_carries_variant metadata: - data_source: short_mutations_local \ No newline at end of file diff --git a/oncodashkb/adapters/structural_variants.yaml 
b/oncodashkb/adapters/structural_variants.yaml new file mode 100644 index 0000000..278ec62 --- /dev/null +++ b/oncodashkb/adapters/structural_variants.yaml @@ -0,0 +1,119 @@ +row: + translate: + column: patient + to_subject: patient + translations_file: ./data/DECIDER/clinical/clinical_export.xlsx + translate_from: Patient card::Patient cohort code_Patient Card + translate_to: Patient card::Publication code + index_col: 0 + usecols: [0,1,2] +transformers: + # Nodes + ## Samples + - translate_sample_ids : + column: sample + to_object: sample + via_relation: patient_carries_sample + translations_file: ./data/DECIDER/clinical/clinical_export.xlsx + translate_from: Patient card::Patient cohort code_Patient Card + translate_to: Patient card::Publication code + index_col: 0 + usecols: [0,1,2] + ## Alterations + - cat_format: + columns: + - primary_gene + - mutation + from_subject: sample + to_object: structural_variant + via_relation: sample_carries_variant + format_string: "{primary_gene}:{mutation}" + ## Gene status + - translate_cat_format: + columns: + - primary_gene + - Gene_type + from_subject: structural_variant + to_object: gene_status + format_string: "{primary_gene}:{Gene_type}" + via_relation: variant_causes_gene_status + column_to_translate: + - primary_gene + translations_file: data/HGNC/hgnc_complete_set.txt + translate_from: symbol + translate_to: ensembl_gene_id + sep: "\t" + ## Genes + - translate: + column: primary_gene + from_subject: gene_status + to_object: gene + via_relation: gene_status_affects_gene + translations_file: data/HGNC/hgnc_complete_set.txt + translate_from: symbol + translate_to: ensembl_gene_id + sep: "\t" + # Properties + ## Alterations + - map: + column: effect + to_property: consequence + for_object: structural_variant + - translate_cat_format: + columns: + - primary_gene + - mutation + to_property: ensembl_id_alteration + for_object: structural_variant + format_string: "{primary_gene}:{mutation}" + column_to_translate: + - 
primary_gene + translations_file: data/HGNC/hgnc_complete_set.txt + translate_from: symbol + translate_to: ensembl_gene_id + sep: "\t" + ## Gene status + - cat_format: + columns: + - primary_gene + - Gene_type + to_property: gene_symbol_gene_status + for_object: gene_status + format_string: "{primary_gene}:{Gene_type}" + - map: + columns: Gene_type + to_property: gene_role + for_object: gene_status + ## Genes + - map: + column: primary_gene + to_property: gene_symbol + for_object: gene + ## Alterations + - map: + column: pathogenic + to_property: oncogenic + for_object: structural_variant + - map: + column: effect + to_property: consequence + for_object: structural_variant + - map: + column: Homogeneous + to_property: homogenous + for_object: structural_variant + - map: + column: expressed + to_property: expressed + for_object: structural_variant + - string: + value: " " + to_property: edgelabel + for_objects: + - patient_carries_sample + - sample_carries_variant + - variant_causes_gene_status + - gene_status_affects_gene + +metadata: + - data_source: structural_variants_placeholder \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c0b074c..8fe1506 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,9 +22,9 @@ dependencies = [ "matplotlib>=3.10.0,<4.0", "polars>=1.22.0,<2.0", "seaborn>=0.13.2,<0.14", - "ontoweaver>=1.3.0,<1.4.0", + "ontoweaver>=1.4.0,<1.5.0", "openpyxl>=3.1.5", - "pyarrow<21.0.0", + "pyarrow>20.0.0", "fastparquet<2026.3.0", ] @@ -32,5 +32,4 @@ dependencies = [ dev = [ "pre-commit>=4.5.0", "pytest>=8.4.1", -] - +] \ No newline at end of file diff --git a/weave.py b/weave.py index 89fc028..278730e 100755 --- a/weave.py +++ b/weave.py @@ -14,7 +14,6 @@ import biocypher import ontoweaver -# import oncodashkb.adapters as od from alive_progress import alive_bar error_codes = { @@ -35,8 +34,9 @@ ontoweaver.transformer.register(OmniPath_directed) # Importing custom transformer for translating sample ids with publication 
code and registering it. -from oncodashkb.transformers.specific_translate_transformers import translate_sample_ids +from oncodashkb.transformers.specific_translate_transformers import translate_sample_ids, translate_cat_format ontoweaver.transformer.register(translate_sample_ids) +ontoweaver.transformer.register(translate_cat_format) # Importing OpenTargets custom transformer and registering it. from oncodashkb.transformers.ot_transformers import access_proteins, urls_to_prop @@ -129,39 +129,6 @@ def process_OT(directory, name): return local_nodes, local_edges - -def process_GO(name): - logging.info(f" | Weave {name} data...") - # Table input data. - logging.info(f" | | Load {name} data...") - df = progress_read(asked.gene_ontology[0], sep='\t', comment='!', header=None, dtype={15: str}, hint=969214) - - logging.info(f" | | Read {name} mapping...") - # Extraction mapping configuration. - try: - with open(f"./oncodashkb/adapters/{name}.yaml") as fd: - conf = yaml.full_load(fd) - except Exception as e: - logging.error(e) - sys.exit(error_codes["CannotAccessFile"]) - - logging.info(f" | | Preprocess {name} data...") - manager = od.gene_ontology.Gene_ontology(df, asked.gene_ontology_owl, asked.gene_ontology_genes, conf) - - logging.info(f" | | Transform {name} data...") - local_nodes = [] - local_edges = [] - # Use manager.df because Gene_ontology does filter the input dataframe - with alive_bar(len(manager.df), file=sys.stderr) as progress: - for n,e in manager(): - local_nodes += n - local_edges += e - progress() - - return local_nodes, local_edges - - - if __name__ == "__main__": # TODO add adapter for parquet, one for csv and one that automatically checks filetype. 
@@ -169,7 +136,8 @@ def process_GO(name): parser = argparse.ArgumentParser( description=usage) - parser.add_argument("-C", "--config", metavar="FILE", default="config/neo4j.yaml", + parser.add_argument("-C", "--config", metavar="FILE", default=["config/neo4j.yaml"], + action="append", help="The BioCypher configuration to load [default: config/neo4j.yaml].") parser.add_argument("-i", "--clinical", metavar="CSV", nargs="+", @@ -187,6 +155,9 @@ def process_GO(name): parser.add_argument("-cnae", "--copy-number-amplifications-external", metavar="CSV", nargs="+", help="Extract from a CSV file with copy number amplifications' external annotations.") + parser.add_argument("-sv", "--structural-variants", metavar="CSV", nargs="+", + help="Extract from an Excel file with structural variants' annotations.") + parser.add_argument("-o", "--oncokb", metavar="CSV", nargs="+", help="Extract from an OncoKB CSV file.") @@ -205,18 +176,6 @@ def process_GO(name): parser.add_argument("-c", "--cgi", metavar="CSV", nargs="+", help="Extract from a CGI CSV file.") - parser.add_argument("-g", "--gene-ontology", metavar="CSV", nargs="+", - help="Extract from a Gene_Ontology_Annotation GAF file.") - - parser.add_argument("-n", "--gene-ontology-owl", metavar="OWL", - help="Download Gene_Ontology owl file.") - - parser.add_argument("-G", "--gene-ontology-genes", metavar="TXT", - help="List of genes for which we integrate Gene Ontology annotations (by default genes from OncoKB).") - - parser.add_argument("-r", "--gene-ontology-reverse", action='store_true', - help="Extract from a Gene_Ontology_Annotation GAF file.") - parser.add_argument("-s", "--separator", metavar="STRING", default=", ", help="Separator in exported data files.") @@ -239,10 +198,6 @@ def process_GO(name): help="Set the verbose level (default: %(default)s).") asked = parser.parse_args() - bc = biocypher.BioCypher( - biocypher_config_path = asked.config, - schema_config_path = "config/schema.yaml" - ) logging.basicConfig()
logging.getLogger().setLevel(asked.verbose) @@ -264,16 +219,13 @@ def process_GO(name): "short_mutations_external", "copy_number_amplifications_local", "copy_number_amplifications_external", + "structural_variants", "oncokb", "omnipath_networks", "open_targets_target", "open_targets_drug_mechanism_of_action", "open_targets_drug_molecule", "cgi", - "gene_ontology", - "gene_ontology_owl", - "gene_ontology_genes", - "gene_ontology_reverse", ] opt_total = 0 for opt in all_options: @@ -332,6 +284,101 @@ def process_GO(name): edges += local_edges logging.info(f"Done adapter {opt_loaded}/{opt_total}") + if asked.structural_variants: + opt_loaded += 1 + logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") + data_file = asked.structural_variants[0] + mapping_file = "./oncodashkb/adapters/structural_variants.yaml" + + # logging.info(f"Weave structural variants...") + logging.info(f" | Weave `{data_file}:{mapping_file}`...") + logging.info(f" | | Load data `{data_file}`...") + table = pd.read_excel(data_file) + + table = table.rename(columns={"Gene.type":"Gene_type"}) + table["mutation"] = table.mutation.str.replace(r';', ',', regex=True) + + try: + with open(mapping_file) as fd: + ymapping = yaml.full_load(fd) + except Exception as e: + logging.error(e) + sys.exit(error_codes["CannotAccessFile"]) + + logging.info(f" | | Process {mapping_file}...") + + yparser = ontoweaver.mapping.YamlParser(ymapping) + mapping = yparser() + + adapter = ontoweaver.tabular.PandasAdapter( + table, + *mapping, + type_affix="suffix", + type_affix_sep=":", + raise_errors = True + ) + + local_nodes = [] + local_edges = [] + with alive_bar(len(table), file=sys.stderr) as progress: + for n,e in adapter(): + # NOTE: here, n & e are ontoweaver.base.Element, not BioCypher tuples. 
+ local_nodes += n + local_edges += e + progress() + + logging.info(f" | | OK, wove: {len(local_nodes)} nodes, {len(local_edges)} edges.") + nodes += local_nodes + edges += local_edges + logging.info(f"Done adapter {opt_loaded}/{opt_total}") + + if asked.cgi: + opt_loaded += 1 + logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") + data_file = asked.cgi[0] + mapping_file = "./oncodashkb/adapters/cgi.yaml" + + # logging.info(f"Weave CGI...") + logging.info(f" | Weave `{data_file}:{mapping_file}`...") + logging.info(f" | | Load data `{data_file}`...") + table = progress_read(data_file, hint=72648) + + table["treatment"] = table.treatment.str.upper().str.replace(r'\([^()]*\)', '', regex=True) + + try: + with open(mapping_file) as fd: + ymapping = yaml.full_load(fd) + except Exception as e: + logging.error(e) + sys.exit(error_codes["CannotAccessFile"]) + + logging.info(f" | | Process {mapping_file}...") + + yparser = ontoweaver.mapping.YamlParser(ymapping) + mapping = yparser() + + adapter = ontoweaver.tabular.PandasAdapter( + table, + *mapping, + type_affix="suffix", + type_affix_sep=":", + raise_errors = True + ) + + local_nodes = [] + local_edges = [] + with alive_bar(len(table), file=sys.stderr) as progress: + for n,e in adapter(): + # NOTE: here, n & e are ontoweaver.base.Element, not BioCypher tuples.
+ local_nodes += n + local_edges += e + progress() + + logging.info(f" | | OK, wove: {len(local_nodes)} nodes, {len(local_edges)} edges.") + nodes += local_nodes + edges += local_edges + logging.info(f"Done adapter {opt_loaded}/{opt_total}") + if asked.omnipath_networks: opt_loaded += 1 logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") @@ -446,29 +493,6 @@ def process_GO(name): edges += local_edges logging.info(f"Done adapter {opt_loaded}/{opt_total}") - ## GeneOntology - - ### GO - if asked.gene_ontology: - opt_loaded += 1 - logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") - local_nodes, local_edges = process_GO("gene_ontology") - logging.info(f" | Save data...") - nodes += local_nodes - edges += local_edges - logging.info(f"OK, wove Gene Ontology data: {len(local_nodes)} nodes, {len(local_edges)} edges.") - logging.info(f"Done adapter {opt_loaded}/{opt_total}") - - ### GO reversed - if asked.gene_ontology_reverse: - opt_loaded += 1 - logging.info(f"########## Adapter #{opt_loaded}/{opt_total} ##########") - local_nodes, local_edges = process_GO("gene_ontology_reverse") - nodes += local_nodes - edges += local_edges - logging.info(f"OK, reverse-wove Gene Ontology: {len(local_nodes)} nodes, {len(local_edges)} edges.") - logging.info(f"Done adapter {opt_loaded}/{opt_total}") - ################################################### # Map the data not requiring special loadings. # ################################################### @@ -487,10 +511,9 @@ def process_GO(name): "short_mutations_external", "copy_number_amplifications_local", "copy_number_amplifications_external", + # "structural_variants", "oncokb", - # "omnipath_networks", - # "ot-" - "cgi", + # "cgi", ] for name in direct_mappings: option = getattr(asked, name) @@ -644,17 +667,26 @@ def process_GO(name): # Export the final SKG. 
################################################### - logging.info(f"Write the final SKG into files...") - if fnodes: - bc.write_nodes(n.as_tuple() for n in fnodes) - if fedges: - bc.write_edges(e.as_tuple() for e in fedges) - #bc.summary() - import_file = bc.write_import_call() - logging.info(f"OK, wrote files.") - - # Print on stdout for other scripts to get. - print(import_file) + configs = asked.config + + for config in configs: + logging.info(f"Write the final SKG into {config} files...") + + bc = biocypher.BioCypher( + biocypher_config_path = config, + schema_config_path = "config/schema.yaml" + ) + + if fnodes: + bc.write_nodes(n.as_tuple() for n in fnodes) + if fedges: + bc.write_edges(e.as_tuple() for e in fedges) + #bc.summary() + import_file = bc.write_import_call() + logging.info(f"OK, wrote files.") + + # Print on stdout for other scripts to get. + print(import_file) if asked.import_script_run: shell = os.environ["SHELL"]