def get_cancer_gene_flags(
    conn: GraphKBConnection,
    flags: bool = False,
    ignore_cache: bool = False,
) -> Union[List[Record], Dict[str, List[Record]]]:
    """
    Return all cancer genes, optionally grouped by flag.

    Statements are fetched for every relevance term returned by
    ``conn.get_related_terms(terms=CANCER_GENE, subgraphType='children')`` and
    then narrowed in Python to statements whose subject is a gene Feature and
    whose source is OncoKB or TSO500.

    Flag definitions:
        cancerGene: every statement kept by the filter described above
        oncogenic: relevance ONCOGENE, source OncoKB only
        tumourSuppressive: relevance TUMOUR_SUPPRESSIVE, source OncoKB only

    Args:
        conn: the graphkb connection object
        flags: if True, return the gene records grouped by flag instead of a flat list
        ignore_cache: passed through unchanged to ``conn.query``

    Returns:
        flags=False (default): list of gene records, unique by 'name' (the
        first record seen for a name is kept) and sorted by 'displayName'
            [gene, gene, ...]

        flags=True: dict of flag name to list of gene records; duplicates are
        not removed
            {
                'cancerGene': [gene, gene, ...],
                'oncogenic': [gene, gene, ...],
                'tumourSuppressive': [gene, gene, ...],
            }
    """
    # relevance term names resolved from CANCER_GENE through the 'children' subgraph
    relevance_terms = conn.get_related_terms(
        terms=CANCER_GENE,
        subgraphType='children',
    )
    statements = cast(
        List[Statement],
        conn.query(
            {
                'target': 'Statement',
                'filters': {
                    'relevance': {'target': 'Vocabulary', 'filters': {'name': relevance_terms}}
                },
                'returnProperties': [
                    'source.name',
                    'relevance.name',
                    *[f'subject.{prop}' for prop in GENE_RETURN_PROPERTIES],
                ],
            },
            ignore_cache=ignore_cache,
        ),
    )

    # post-query filtering (noted as faster by the original author)
    cancer_gene_sources = (ONCOKB_SOURCE_NAME, TSO500_SOURCE_NAME)
    cancer_gene_stmts = [
        stmt
        for stmt in statements
        if stmt['subject']['@class'] == 'Feature'
        and stmt['subject']['biotype'] == 'gene'
        and stmt['source']['name'] in cancer_gene_sources
    ]

    if not flags:
        # Unique by name, sorted by displayName (iProbe requirement per the original comment).
        # dict.setdefault keeps the first record seen for each name; dicts preserve
        # insertion order, so the stable sort sees records in statement order.
        unique_by_name: Dict[str, Any] = {}
        for stmt in cancer_gene_stmts:
            unique_by_name.setdefault(stmt['subject']['name'], stmt['subject'])
        return cast(
            List[Record],
            sorted(unique_by_name.values(), key=lambda gene: gene['displayName']),
        )

    def _oncokb_subjects(relevance_name: str) -> list:
        # subjects of OncoKB statements carrying exactly this relevance name
        return [
            stmt['subject']
            for stmt in cancer_gene_stmts
            if stmt['relevance']['name'] == relevance_name
            and stmt['source']['name'] == ONCOKB_SOURCE_NAME
        ]

    # Dict of flags, each with its list of associated gene records; duplicates are ok
    return {
        'cancerGene': [stmt['subject'] for stmt in cancer_gene_stmts],
        'oncogenic': _oncokb_subjects(ONCOGENE),
        'tumourSuppressive': _oncokb_subjects(TUMOUR_SUPPRESSIVE),
    }
GraphKBConnection, + relevance: Union[str, list[str]], + sources: Union[str, list[str]], + ignore_cache: bool = False, ) -> List[Ontology]: statements = cast( List[Statement], @@ -57,6 +165,7 @@ def _get_tumourigenesis_genes_list( return [gene for gene in genes.values()] +@deprecated('fuctionality replaced by get_cancer_gene_flags') def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]: """Get the list of oncogenes stored in GraphKB derived from OncoKB. @@ -66,9 +175,10 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]: Returns: gene (Feature) records """ - return _get_tumourigenesis_genes_list(conn, ONCOGENE, [ONCOKB_SOURCE_NAME]) + return _get_tumourigenesis_genes_list(conn, ONCOGENE, ONCOKB_SOURCE_NAME) +@deprecated('fuctionality replaced by get_cancer_gene_flags') def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]: """Get the list of tumour supressor genes stored in GraphKB derived from OncoKB. @@ -78,11 +188,14 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]: Returns: gene (Feature) records """ - return _get_tumourigenesis_genes_list(conn, TUMOUR_SUPPRESSIVE, [ONCOKB_SOURCE_NAME]) + return _get_tumourigenesis_genes_list(conn, TUMOUR_SUPPRESSIVE, ONCOKB_SOURCE_NAME) +@deprecated('fuctionality replaced by get_cancer_gene_flags') def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]: - """Get the list of cancer genes stored in GraphKB derived from OncoKB & TSO500. + """ + Get the list of cancer genes stored in GraphKB derived from OncoKB & TSO500. + Cancer genes include oncogenes, tumour supressor genes and other cancer genes. 
Args: conn: the graphkb connection object @@ -90,8 +203,12 @@ def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]: Returns: gene (Feature) records """ + cancer_gene_terms = conn.get_related_terms( + terms=CANCER_GENE, + subgraphType='children', + ) return _get_tumourigenesis_genes_list( - conn, CANCER_GENE, [ONCOKB_SOURCE_NAME, TSO500_SOURCE_NAME] + conn, cancer_gene_terms, [ONCOKB_SOURCE_NAME, TSO500_SOURCE_NAME] ) @@ -513,12 +630,12 @@ def get_gene_information( # PositionalVariant without a reference2 implies a smallMutation type gene_flags['knownSmallMutation'].add(condition['reference1']) # type: ignore - logger.info('fetching oncogenes list') - gene_flags['oncogene'] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn)) - logger.info('fetching tumour supressors list') - gene_flags['tumourSuppressor'] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn)) - logger.info('fetching cancerGeneListMatch list') - gene_flags['cancerGeneListMatch'] = convert_to_rid_set(get_cancer_genes(graphkb_conn)) + # cancer gene flags + logger.info('fetching cancer genes') + cancer_gene_flags = get_cancer_gene_flags(graphkb_conn, flags=True) + gene_flags['oncogene'] = convert_to_rid_set(cancer_gene_flags['oncogenic']) + gene_flags['tumourSuppressor'] = convert_to_rid_set(cancer_gene_flags['tumourSuppressive']) + gene_flags['cancerGeneListMatch'] = convert_to_rid_set(cancer_gene_flags['cancerGene']) logger.info('fetching therapeutic associated genes lists') gene_flags['therapeuticAssociated'] = convert_to_rid_set( @@ -527,8 +644,14 @@ def get_gene_information( logger.info(f'Setting gene_info flags on {len(gene_names)} genes') result: List[IprGene] = [] + EQUIVALENT_CACHE = {} for gene_name in gene_names: - equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name)) + if gene_name not in EQUIVALENT_CACHE: + EQUIVALENT_CACHE[gene_name] = convert_to_rid_set( + get_equivalent_features(graphkb_conn, gene_name) + ) + equivalent = 
def get_related_records(
    self,
    base: Union[str, list[str]],
    ontology: str,
    subgraphType: str,
    returnProperties: Union[list[str], None] = None,
) -> Dict:
    """
    Given some base node RIDs, an ontology class and a subgraph type,
    leverage the subgraphs route to return the related nodes.

    Args:
        base: the base node RID(s) to start the graph traversal from; a single
            string is wrapped in a list before being sent
        ontology: the ontology class to traverse (used in the route path)
        subgraphType: the type of traversal. See options in API specs
        returnProperties: additional record properties to return; defaults to
            an empty list when omitted

    Returns:
        the 'nodes' mapping found under result -> g in the response
    """
    # None stands in for "no extra properties" so that no list object is
    # shared between calls through the function's default value
    properties = [] if returnProperties is None else returnProperties
    related = self.post(
        uri=f'/subgraphs/{ontology}',
        data={
            'base': base if isinstance(base, list) else [base],
            'subgraphType': subgraphType,
            'returnProperties': properties,
        },
    )
    return related['result']['g']['nodes']


def get_related_terms(
    self,
    terms: Union[str, list[str]],
    ontology: str = 'Vocabulary',
    subgraphType: str = 'similar',
) -> list[str]:
    """
    Given some base term name(s), an ontology class and a subgraph type,
    leverage the subgraphs route to return the list of related term name(s)

    Args:
        terms: the base term name(s) to start the graph traversal from
        ontology: the ontology class to traverse
        subgraphType: the type of traversal

    Returns:
        list of related term name(s), one per node returned by the traversal
    """
    # resolve the term name(s) to record IDs, which is what the subgraphs route takes as base
    rids = convert_to_rid_list(self.query({'target': ontology, 'filters': {'name': terms}}))
    nodes = self.get_related_records(
        base=rids,
        ontology=ontology,
        subgraphType=subgraphType,
    )
    return [node['name'] for node in nodes.values()]
@pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding tests that depend on oncokb data')
def test_cancer_gene_flags(conn):
    """Check get_cancer_gene_flags in both return modes against the canonical gene lists."""
    all_cancer_genes = [*CANONICAL_OTHER_CG, *CANONICAL_TS, *CANONICAL_ONCOGENES]

    # wo/ flags: a flat list of gene records
    result = get_cancer_gene_flags(conn)
    names = {row['name'] for row in result}  # built once, not once per gene
    for gene in all_cancer_genes:
        assert gene in names

    # w/ flags: gene records grouped under each flag
    result = get_cancer_gene_flags(conn, flags=True)
    cancer_gene_names = {row['name'] for row in result['cancerGene']}
    oncogenic_names = {row['name'] for row in result['oncogenic']}
    tumour_suppressive_names = {row['name'] for row in result['tumourSuppressive']}

    for gene in all_cancer_genes:
        assert gene in cancer_gene_names
    for gene in CANONICAL_TS:
        assert gene in tumour_suppressive_names
        assert gene not in oncogenic_names
    for gene in CANONICAL_ONCOGENES:
        assert gene in oncogenic_names
        assert gene not in tumour_suppressive_names
    for gene in CANONICAL_OTHER_CG:
        assert gene not in oncogenic_names
        assert gene not in tumour_suppressive_names
def test_get_related_records(self, conn):
    """Traverse 'similar' Vocabulary nodes from 'missense' and check a known display name is returned."""
    missense_query = {'target': 'Vocabulary', 'filters': {'name': 'missense'}}
    start_rids = util.convert_to_rid_list(conn.query(missense_query))
    related_nodes = conn.get_related_records(
        base=start_rids,
        ontology='Vocabulary',
        subgraphType='similar',
        returnProperties=['displayName'],
    )
    display_names = [node['displayName'] for node in related_nodes.values()]
    assert 'missense mutation' in display_names


def test_get_related_terms(self, conn):
    """Exercise get_related_terms with its defaults and with ontology/subgraphType overridden."""
    # with defaults
    similar_to_missense = conn.get_related_terms(terms='missense')
    assert 'missense mutation' in similar_to_missense

    # overriding ontology & subgraphType defaults
    parents_of_solid_tumors = conn.get_related_terms(
        terms='all solid tumors',
        ontology='Disease',
        subgraphType='parents',
    )
    assert 'cancer' in parents_of_solid_tumors