diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 607c0c6e..7804a8a0 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -28,15 +28,10 @@ jobs:
       run: |
        python -m pip install --upgrade pip setuptools
        pip install -e .[test]  # coverage reports need -e to capture properly
-    - name: Lint with flake8
+    - name: Check with ruff
      run: |
-        pip install flake8
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 pori_python --count --select=E9,F63,F7,F82 --show-source --statistics
-    - name: Check with black
-      run: |
-        pip install black
-        black --check -S -l 100 pori_python tests
+        pip install ruff
+        ruff format --check pori_python tests
    - name: Full Tests with pytest
      run: |
        pip list
@@ -46,6 +41,7 @@ jobs:
        IPR_PASS: ${{ secrets.IPR_TEST_PASSWORD }}
        GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
        GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
+        GRAPHKB_URL: ${{ secrets.GKB_TEST_URL }}
        # SDEV-3381 - Turn off integration tests temporarily, till efficiency is increased
        # turn on integration tests for one python version only
        EXCLUDE_INTEGRATION_TESTS: ${{ matrix.python-version != '3.11' }}
diff --git a/.github/workflows/quick-pytest.yml b/.github/workflows/quick-pytest.yml
index d4f70223..aef2b56d 100644
--- a/.github/workflows/quick-pytest.yml
+++ b/.github/workflows/quick-pytest.yml
@@ -25,22 +25,19 @@ jobs:
      run: |
        python -m pip install --upgrade pip setuptools
        pip install -e .[test]  # coverage reports need -e to capture properly
-    - name: Lint with flake8
+    - name: Check with ruff
      run: |
-        pip install flake8
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 pori_python --count --select=E9,F63,F7,F82 --show-source --statistics
-    - name: Check with black
-      run: |
-        pip install black
-        black --check -S -l 100 pori_python tests
+        pip install ruff
+        ruff format --check pori_python tests
    - name: Short Tests with pytest
      run: pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov ipr --cov-report term --cov-report xml
      env:
        IPR_USER: ${{ secrets.IPR_TEST_USER }}
        IPR_PASS: ${{ secrets.IPR_TEST_PASSWORD }}
+        IPR_URL: ${{ secrets.IPR_TEST_URL }}
        GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
        GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
+        GRAPHKB_URL: ${{ secrets.GKB_TEST_URL }}
        EXCLUDE_INTEGRATION_TESTS: 1
        # EXCLUDE_INTEGRATION_TESTS: ${{ matrix.python-version != '3.11' }}
-      if: github.event_name != 'pull_request'
\ No newline at end of file
+      if: github.event_name != 'pull_request'
diff --git a/README.md b/README.md
index 44d015ec..192ab2a3 100644
--- a/README.md
+++ b/README.md
@@ -46,11 +46,9 @@ pip install -e .[dev]
 
 Run the tests:
 
-Export usernames, passwords, and set test options.
+Export usernames, passwords, and test options.
 
-Note that IPR tests will try to use the BCGSC production GraphKB API by default.
-If you want to test interaction with a different instance, you will need to
-set the GraphKB variables.
+IPR_URL and GRAPHKB_URL values must also be set.
 
 Set EXCLUDE vars to 1 if you don't want to run these tests.
 ONCOKB and BCGSC tests are enabled by default.
@@ -67,11 +65,12 @@ export EXCLUDE_ONCOKB_TESTS=1
 ```
 
 If you want to run tests that upload reports to a live IPR instance,
-specify the url of the IPR API you want to use and set the test var to 1.
+specify the url of the IPR API you want to use and set the test var
+INCLUDE_UPLOAD_TESTS to 1.
 These tests are disabled by default.
 The created reports are deleted by default.
 If you want to keep them,
-set DELETE_UPLOAD_TEST_REPORTS to 0 in the env.
+set DELETE_UPLOAD_TEST_REPORTS to 0.
 
 ```bash
 export IPR_TEST_URL='http://localhost:8081/api'
@@ -84,14 +83,16 @@ pytest tests
 ```
 
 ### JSON Validate and Upload to IPR
+An IPR_URL must be provided either as an environment variable or a command-line argument.
+
 If you only want to validate the json content, use
 ```bash
-ipr --password $IPR_PASS -c 'path/to/content.json' --validate_json
+ipr --password $IPR_PASS -c 'path/to/content.json' --validate_json --ipr_url $IPR_URL
 ```
 
 If you only want to upload the json directly to ipr and skip all the preprocessing, use
 ```bash
-ipr --password $IPR_PASS -c 'path/to/content.json' --upload_json
+ipr --password $IPR_PASS -c 'path/to/content.json' --upload_json --ipr_url $IPR_URL
 ```
 
 ## Documentation
diff --git a/pori_python/graphkb/__init__.py b/pori_python/graphkb/__init__.py
index a6fdd663..acce57aa 100644
--- a/pori_python/graphkb/__init__.py
+++ b/pori_python/graphkb/__init__.py
@@ -1,2 +1 @@
-from .constants import DEFAULT_URL  # noqa: F401
 from .util import GraphKBConnection, logger  # noqa: F401
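With the `DEFAULT_URL` export gone, callers must supply the API location themselves. A minimal connection sketch under that assumption (the URL and credentials below are placeholders, based on the updated `GraphKBConnection` constructor in `util.py` later in this diff):

```python
from pori_python.graphkb import GraphKBConnection

# Placeholder URL and credentials, not a real endpoint or account.
# Alternatively, export GRAPHKB_URL before importing the package and omit `url`.
conn = GraphKBConnection(
    url='http://localhost:8080/api',
    username='myuser',
    password='mypass',  # login is attempted immediately when both are provided
)
```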
diff --git a/pori_python/graphkb/constants.py b/pori_python/graphkb/constants.py
index 4861d70c..fe22f4a0 100644
--- a/pori_python/graphkb/constants.py
+++ b/pori_python/graphkb/constants.py
@@ -4,113 +4,108 @@ from pori_python.types import CategoryBaseTermMapping
 
 DEFAULT_LIMIT = 1000
-GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
-GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
-GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
-DEFAULT_URL = GKB_BASE_URL
-PREFERRED_GENE_SOURCE = "#39:5"  # HGNC
-PREFERRED_GENE_SOURCE_NAME = "HGNC"
+PREFERRED_GENE_SOURCE_NAME = 'HGNC'
 
-BASE_RETURN_PROPERTIES = ["@rid", "@class"]
+BASE_RETURN_PROPERTIES = ['@rid', '@class']
 
 GENERIC_RETURN_PROPERTIES = [
-    "name",
-    "sourceId",
-    "sourceIdVersion",
-    "source.name",
-    "source.@rid",
-    "displayName",
-    "deprecated",
+    'name',
+    'sourceId',
+    'sourceIdVersion',
+    'source.name',
+    'source.@rid',
+    'displayName',
+    'deprecated',
 ] + BASE_RETURN_PROPERTIES
 
-GENE_RETURN_PROPERTIES = ["biotype"] + GENERIC_RETURN_PROPERTIES
+GENE_RETURN_PROPERTIES = ['biotype'] + GENERIC_RETURN_PROPERTIES
 
 VARIANT_RETURN_PROPERTIES = (
     BASE_RETURN_PROPERTIES
-    + [f"type.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"reference1.{p}" for p in GENE_RETURN_PROPERTIES]
-    + [f"reference2.{p}" for p in GENE_RETURN_PROPERTIES]
-    + ["zygosity", "germline", "displayName"]
+    + [f'type.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'reference1.{p}' for p in GENE_RETURN_PROPERTIES]
+    + [f'reference2.{p}' for p in GENE_RETURN_PROPERTIES]
+    + ['zygosity', 'germline', 'displayName']
 )
 
 POS_VARIANT_RETURN_PROPERTIES = VARIANT_RETURN_PROPERTIES + [
-    "break1Start",
-    "break1End",
-    "break2Start",
-    "break2End",
-    "break1Repr",
-    "break2Repr",
-    "refSeq",
-    "untemplatedSeq",
-    "untemplatedSeqSize",
-    "truncation",
-    "assembly",
+    'break1Start',
+    'break1End',
+    'break2Start',
+    'break2End',
+    'break1Repr',
+    'break2Repr',
+    'refSeq',
+    'untemplatedSeq',
+    'untemplatedSeqSize',
+    'truncation',
+    'assembly',
 ]
 
 STATEMENT_RETURN_PROPERTIES = (
     BASE_RETURN_PROPERTIES
-    + ["displayNameTemplate", "sourceId", "source.name", "source.displayName"]
-    + [f"conditions.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"subject.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"evidence.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"relevance.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"evidenceLevel.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + ["reviewStatus"]
+    + ['displayNameTemplate', 'sourceId', 'source.name', 'source.displayName']
+    + [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + ['reviewStatus']
 )
 
-ONCOKB_SOURCE_NAME = "oncokb"
-TSO500_SOURCE_NAME = "tso500"
-ONCOGENE = "oncogenic"
-TUMOUR_SUPPRESSIVE = "tumour suppressive"
-CANCER_GENE = "cancer gene"
-FUSION_NAMES = ["structural variant", "fusion"]
+ONCOKB_SOURCE_NAME = 'oncokb'
+TSO500_SOURCE_NAME = 'tso500'
+ONCOGENE = 'oncogenic'
+TUMOUR_SUPPRESSIVE = 'tumour suppressive'
+CANCER_GENE = 'cancer gene'
+FUSION_NAMES = ['structural variant', 'fusion']
 
-GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ["cancer genome interpreter", "civic"]
-GSC_PHARMACOGENOMIC_SOURCE_DISPLAYNAME_EXCLUDE_LIST = ["CGI", "CIViC"]
+GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ['cancer genome interpreter', 'civic']
+GSC_PHARMACOGENOMIC_SOURCE_DISPLAYNAME_EXCLUDE_LIST = ['CGI', 'CIViC']
 
-BASE_THERAPEUTIC_TERMS = ["therapeutic efficacy", "eligibility"]
+BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
 # the order here is the order these are applied, the first category matched is returned
 RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
-    ("therapeutic", BASE_THERAPEUTIC_TERMS),
-    ("diagnostic", ["diagnostic indicator"]),
-    ("prognostic", ["prognostic indicator"]),
-    ("pharmacogenomic", ["metabolism", "toxicity", "dosage"]),
-    ("cancer predisposition", ["pathogenic"]),
-    ("biological", ["functional effect", "tumourigenesis", "predisposing"]),
+    ('therapeutic', BASE_THERAPEUTIC_TERMS),
+    ('diagnostic', ['diagnostic indicator']),
+    ('prognostic', ['prognostic indicator']),
+    ('pharmacogenomic', ['metabolism', 'toxicity', 'dosage']),
+    ('cancer predisposition', ['pathogenic']),
+    ('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
 ]
-FAILED_REVIEW_STATUS = "failed"
+FAILED_REVIEW_STATUS = 'failed'
 
-CHROMOSOMES_HG38 = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
-CHROMOSOMES_HG19 = [str(i) for i in range(1, 23)] + ["x", "y", "mt"]
+CHROMOSOMES_HG38 = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
+CHROMOSOMES_HG19 = [str(i) for i in range(1, 23)] + ['x', 'y', 'mt']
 CHROMOSOMES = CHROMOSOMES_HG38 + CHROMOSOMES_HG19
 
-AMBIGUOUS_AA = ["x", "?", "X"]
+AMBIGUOUS_AA = ['x', '?', 'X']
 AA_3to1_MAPPING = {
-    "Ala": "A",
-    "Arg": "R",
-    "Asn": "N",
-    "Asp": "D",
-    "Asx": "B",
-    "Cys": "C",
-    "Glu": "E",
-    "Gln": "Q",
-    "Glx": "Z",
-    "Gly": "G",
-    "His": "H",
-    "Ile": "I",
-    "Leu": "L",
-    "Lys": "K",
-    "Met": "M",
-    "Phe": "F",
-    "Pro": "P",
-    "Ser": "S",
-    "Thr": "T",
-    "Trp": "W",
-    "Tyr": "Y",
-    "Val": "V",
-    "Ter": "*",
+    'Ala': 'A',
+    'Arg': 'R',
+    'Asn': 'N',
+    'Asp': 'D',
+    'Asx': 'B',
+    'Cys': 'C',
+    'Glu': 'E',
+    'Gln': 'Q',
+    'Glx': 'Z',
+    'Gly': 'G',
+    'His': 'H',
+    'Ile': 'I',
+    'Leu': 'L',
+    'Lys': 'K',
+    'Met': 'M',
+    'Phe': 'F',
+    'Pro': 'P',
+    'Ser': 'S',
+    'Thr': 'T',
+    'Trp': 'W',
+    'Tyr': 'Y',
+    'Val': 'V',
+    'Ter': '*',
 }
@@ -132,89 +127,89 @@ def __getitem__(self, key):
 
 INPUT_COPY_CATEGORIES = IterableNamespace(
-    AMP="amplification",
-    ANY_GAIN="copy gain",
-    ANY_LOSS="copy loss",
-    DEEP="deep deletion",
-    GAIN="low level copy gain",
-    LOSS="shallow deletion",
+    AMP='amplification',
+    ANY_GAIN='copy gain',
+    ANY_LOSS='copy loss',
+    DEEP='deep deletion',
+    GAIN='low level copy gain',
+    LOSS='shallow deletion',
 )
 INPUT_EXPRESSION_CATEGORIES = IterableNamespace(
-    UP="increased expression", DOWN="reduced expression"
+    UP='increased expression', DOWN='reduced expression'
 )
 
 # From: https://github.com/bcgsc/pori_graphkb_parser/blob/ae3738842a4c208ab30f58c08ae987594d632504/src/constants.ts#L33-L80
 TYPES_TO_NOTATION: Dict[str, str] = {
-    "acetylation": "ac",
-    "copy gain": "copygain",
-    "copy loss": "copyloss",
-    "deletion": "del",
-    "duplication": "dup",
-    "extension": "ext",
-    "frameshift": "fs",
-    "fusion": "fusion",
-    "indel": "delins",
-    "insertion": "ins",
-    "inversion": "inv",
-    "inverted translocation": "itrans",
-    "methylation": "me",
-    "missense mutation": "mis",
-    "mutation": "mut",
-    "nonsense mutation": ">",
-    "phosphorylation": "phos",
-    "splice-site": "spl",
-    "substitution": ">",
-    "translocation": "trans",
-    "truncating frameshift mutation": "fs",
-    "ubiquitination": "ub",
+    'acetylation': 'ac',
+    'copy gain': 'copygain',
+    'copy loss': 'copyloss',
+    'deletion': 'del',
+    'duplication': 'dup',
+    'extension': 'ext',
+    'frameshift': 'fs',
+    'fusion': 'fusion',
+    'indel': 'delins',
+    'insertion': 'ins',
+    'inversion': 'inv',
+    'inverted translocation': 'itrans',
+    'methylation': 'me',
+    'missense mutation': 'mis',
+    'mutation': 'mut',
+    'nonsense mutation': '>',
+    'phosphorylation': 'phos',
+    'splice-site': 'spl',
+    'substitution': '>',
+    'translocation': 'trans',
+    'truncating frameshift mutation': 'fs',
+    'ubiquitination': 'ub',
     # deprecated forms and aliases
-    "frameshift mutation": "fs",
-    "frameshift truncation": "fs",
-    "missense variant": "mis",
-    "truncating frameshift": "fs",
-    "missense": "mis",
-    "mutations": "mut",
-    "nonsense": ">",
+    'frameshift mutation': 'fs',
+    'frameshift truncation': 'fs',
+    'missense variant': 'mis',
+    'truncating frameshift': 'fs',
+    'missense': 'mis',
+    'mutations': 'mut',
+    'nonsense': '>',
 }
 
 # For match.type_screening() [KBDEV-1056]
-DEFAULT_NON_STRUCTURAL_VARIANT_TYPE = "mutation"
+DEFAULT_NON_STRUCTURAL_VARIANT_TYPE = 'mutation'
 STRUCTURAL_VARIANT_SIZE_THRESHOLD = 48  # bp
 STRUCTURAL_VARIANT_TYPES = [
-    "structural variant",
-    "insertion",
-    "in-frame insertion",
-    "deletion",
-    "deletion polymorphism",
-    "in-frame deletion",
-    "translocation",
-    "inverted translocation",
-    "inversion",
-    "indel",
-    "fusion",
-    "out-of-frame fusion",
-    "oncogenic fusion",
-    "in-frame fusion",
-    "disruptive fusion",
-    "duplication",
-    "internal duplication",
-    "tandem duplication",
-    "internal tandem duplication",
-    "itd",
-    "domain duplication",
-    "kinase domain duplication",
-    "copy variant",
-    "copy number variation",
-    "copy number variant",
-    "copy loss",
-    "copy number loss",
-    "shallow deletion",
-    "deep deletion",
-    "gene deletion",
-    "copy gain",
-    "copy number gain",
-    "low level copy gain",
-    "amplification",
-    "focal amplification",
-    "rearrangement",
+    'structural variant',
+    'insertion',
+    'in-frame insertion',
+    'deletion',
+    'deletion polymorphism',
+    'in-frame deletion',
+    'translocation',
+    'inverted translocation',
+    'inversion',
+    'indel',
+    'fusion',
+    'out-of-frame fusion',
+    'oncogenic fusion',
+    'in-frame fusion',
+    'disruptive fusion',
+    'duplication',
+    'internal duplication',
+    'tandem duplication',
+    'internal tandem duplication',
+    'itd',
+    'domain duplication',
+    'kinase domain duplication',
+    'copy variant',
+    'copy number variation',
+    'copy number variant',
+    'copy loss',
+    'copy number loss',
+    'shallow deletion',
+    'deep deletion',
+    'gene deletion',
+    'copy gain',
+    'copy number gain',
+    'low level copy gain',
+    'amplification',
+    'focal amplification',
+    'rearrangement',
 ]
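For reference, the category namespaces and the amino-acid map above behave as follows; a self-contained sketch whose assertions mirror the values defined in this file:

```python
from pori_python.graphkb.constants import (
    AA_3to1_MAPPING,
    INPUT_COPY_CATEGORIES,
    INPUT_EXPRESSION_CATEGORIES,
)

# IterableNamespace supports attribute access and .values(), which is how
# match.py validates its category inputs against these namespaces.
assert INPUT_COPY_CATEGORIES.AMP == 'amplification'
assert 'deep deletion' in INPUT_COPY_CATEGORIES.values()
assert 'reduced expression' in INPUT_EXPRESSION_CATEGORIES.values()
assert AA_3to1_MAPPING['Ter'] == '*'
```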
diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index e61e3cf5..09da3ed7 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -17,7 +17,6 @@
     GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
     ONCOGENE,
     ONCOKB_SOURCE_NAME,
-    PREFERRED_GENE_SOURCE,
     PREFERRED_GENE_SOURCE_NAME,
     RELEVANCE_BASE_TERMS,
     TSO500_SOURCE_NAME,
@@ -35,14 +34,14 @@ def _get_tumourigenesis_genes_list(
         List[Statement],
         conn.query(
             {
-                "target": "Statement",
-                "filters": {
-                    "AND": [
-                        {"source": {"target": "Source", "filters": {"name": sources}}},
-                        {"relevance": {"target": "Vocabulary", "filters": {"name": relevance}}},
+                'target': 'Statement',
+                'filters': {
+                    'AND': [
+                        {'source': {'target': 'Source', 'filters': {'name': sources}}},
+                        {'relevance': {'target': 'Vocabulary', 'filters': {'name': relevance}}},
                     ]
                 },
-                "returnProperties": [f"subject.{prop}" for prop in GENE_RETURN_PROPERTIES],
+                'returnProperties': [f'subject.{prop}' for prop in GENE_RETURN_PROPERTIES],
             },
             ignore_cache=ignore_cache,
         ),
@@ -51,9 +50,9 @@ def _get_tumourigenesis_genes_list(
 
     genes: Dict[str, Ontology] = {}
     for statement in statements:
-        if statement["subject"].get("biotype", "") == "gene":
-            record_id = statement["subject"]["@rid"]
-            genes[record_id] = statement["subject"]
+        if statement['subject'].get('biotype', '') == 'gene':
+            record_id = statement['subject']['@rid']
+            genes[record_id] = statement['subject']
 
     return [gene for gene in genes.values()]
 
@@ -101,35 +100,35 @@ def get_therapeutic_associated_genes(graphkb_conn: GraphKBConnection) -> List[On
     therapeutic_relevance = get_terms_set(graphkb_conn, BASE_THERAPEUTIC_TERMS)
     statements = graphkb_conn.query(
         {
-            "target": "Statement",
-            "filters": {"relevance": sorted(list(therapeutic_relevance))},
-            "returnProperties": ["reviewStatus"]
-            + [f"conditions.{prop}" for prop in GENE_RETURN_PROPERTIES]
+            'target': 'Statement',
+            'filters': {'relevance': sorted(list(therapeutic_relevance))},
+            'returnProperties': ['reviewStatus']
+            + [f'conditions.{prop}' for prop in GENE_RETURN_PROPERTIES]
             + [
-                f"conditions.reference{ref}.{prop}"
+                f'conditions.reference{ref}.{prop}'
                 for prop in GENE_RETURN_PROPERTIES
-                for ref in ("1", "2")
+                for ref in ('1', '2')
             ],
         }
     )
     genes: List[Ontology] = []
     for statement in statements:
         statement = cast(Statement, statement)
-        if statement["reviewStatus"] == "failed":
+        if statement['reviewStatus'] == 'failed':
             continue
-        for condition in statement["conditions"]:
-            if condition["@class"] == "Feature":
+        for condition in statement['conditions']:
+            if condition['@class'] == 'Feature':
                 genes.append(condition)
-            elif condition["@class"].endswith("Variant"):
+            elif condition['@class'].endswith('Variant'):
                 cond = cast(Variant, condition)
-                if cond["reference1"] and cond["reference1"]["@class"] == "Feature":
-                    genes.append(cond["reference1"])
-                if cond["reference2"] and cond["reference2"]["@class"] == "Feature":
-                    genes.append(cond["reference2"])
+                if cond['reference1'] and cond['reference1']['@class'] == 'Feature':
+                    genes.append(cond['reference1'])
+                if cond['reference2'] and cond['reference2']['@class'] == 'Feature':
+                    genes.append(cond['reference2'])
     unique_genes: List[Ontology] = []
     for gene in genes:
-        if not gene.get("deprecated", False):
-            if gene["@rid"] not in [g["@rid"] for g in unique_genes]:
+        if not gene.get('deprecated', False):
+            if gene['@rid'] not in [g['@rid'] for g in unique_genes]:
                 unique_genes.append(gene)
     return unique_genes
@@ -153,16 +152,16 @@ def get_genes_from_variant_types(
     variant_filters: List[Dict[str, Any]] = []
     if types:
         variant_filters.append(
-            {"type": {"target": "Vocabulary", "filters": {"name": types, "operator": "IN"}}}
+            {'type': {'target': 'Vocabulary', 'filters': {'name': types, 'operator': 'IN'}}}
         )
 
     variants = cast(
         List[Variant],
         conn.query(
             {
-                "target": "Variant",
-                "filters": variant_filters,
-                "returnProperties": ["reference1", "reference2"],
+                'target': 'Variant',
+                'filters': variant_filters,
+                'returnProperties': ['reference1', 'reference2'],
             },
             ignore_cache=ignore_cache,
         ),
@@ -170,23 +169,23 @@ def get_genes_from_variant_types(
 
     genes = set()
     for variant in variants:
-        genes.add(variant["reference1"])
-        if variant["reference2"]:
-            genes.add(variant["reference2"])
+        genes.add(variant['reference1'])
+        if variant['reference2']:
+            genes.add(variant['reference2'])
     if not genes:
         return []
 
-    gene_filters: List[Dict[str, Any]] = [{"biotype": "gene"}]
+    gene_filters: List[Dict[str, Any]] = [{'biotype': 'gene'}]
     if source_record_ids:
-        gene_filters.append({"source": source_record_ids, "operator": "IN"})
+        gene_filters.append({'source': source_record_ids, 'operator': 'IN'})
 
     result = cast(
         List[Ontology],
         conn.query(
             {
-                "target": list(genes),
-                "returnProperties": GENE_RETURN_PROPERTIES,
-                "filters": gene_filters,
+                'target': list(genes),
+                'returnProperties': GENE_RETURN_PROPERTIES,
+                'filters': gene_filters,
             },
             ignore_cache=ignore_cache,
         ),
@@ -210,10 +209,10 @@ def get_preferred_gene_source_rid(
         return preferred_source_name
     result = conn.query(
         {
-            "target": {"target": "Source", "filters": {"name": preferred_source_name}},
-            "queryType": "similarTo",
+            'target': {'target': 'Source', 'filters': {'name': preferred_source_name}},
+            'queryType': 'similarTo',
         }
-    )[0]["@rid"]
+    )[0]['@rid']
     return result
 
 
@@ -235,29 +234,29 @@ def get_preferred_gene_name(
     """
     source_rid = get_preferred_gene_source_rid(conn, source)
     if gene_name in CHROMOSOMES:
-        logger.error(f"{gene_name} assumed to be a chromosome, not gene")
-        return ""
+        logger.error(f'{gene_name} assumed to be a chromosome, not gene')
+        return ''
     eq = get_equivalent_features(conn=conn, gene_name=gene_name)
-    genes = [m for m in eq if m.get("biotype") == "gene" and not m.get("deprecated")]
+    genes = [m for m in eq if m.get('biotype') == 'gene' and not m.get('deprecated')]
     if not genes:
-        logger.error(f"No genes found for: {gene_name}")
-        return ""
+        logger.error(f'No genes found for: {gene_name}')
+        return ''
     if source_rid:
-        source_filtered_genes = [m for m in genes if m.get("source") == source_rid]
+        source_filtered_genes = [m for m in genes if m.get('source') == source_rid]
         if not source_filtered_genes:
-            logger.error(f"No data from source {source_rid} for {gene_name}")
+            logger.error(f'No data from source {source_rid} for {gene_name}')
         else:
             genes = source_filtered_genes
 
-    gene_names = [g["displayName"] for g in genes if g]
+    gene_names = [g['displayName'] for g in genes if g]
     if len(gene_names) > 1:
         logger.error(
-            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
+            f'Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}'
         )
     return gene_names[0]
 
 
-@deprecated("Use get_gene_linked_cancer_predisposition_info instead")
+@deprecated('Use get_gene_linked_cancer_predisposition_info instead')
 def get_cancer_predisposition_info(
     conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE_NAME
 ) -> Tuple[List[str], Dict[str, str]]:
@@ -267,7 +266,7 @@ def get_cancer_predisposition_info(
 
 
 def get_gene_linked_cancer_predisposition_info(
-    conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE
+    conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE_NAME
 ) -> Tuple[List[str], Dict[str, Tuple[str, List[str]]]]:
     """
     Return two lists from GraphKB, one of cancer predisposition genes and one of associated variants.
@@ -275,16 +274,15 @@ def get_gene_linked_cancer_predisposition_info(
     GERO-272 - criteria for what counts as a "cancer predisposition" variant
 
     In short:
-    * Statement 'source' is 'CGL'
+    * Statement 'source' is 'CGL' (not related to the preferred gene source)
     * Statement 'relevance' is 'pathogenic'
     * gene is gotten from any associated 'PositionalVariant' records
 
     Example: https://graphkb.bcgsc.ca/view/Statement/155:11616
-
-
     Returns:
         genes: list of cancer predisposition genes
+            (using names from the source specified in this function's arguments)
         variants: dictionary mapping pharmacogenomic variant IDs to variant display names
     """
     genes = set()
@@ -293,51 +291,51 @@ def get_gene_linked_cancer_predisposition_info(
     variants: Dict[str, Tuple[str, List[str]]] = {}
 
     terms: dict = {term: lst for term, lst in RELEVANCE_BASE_TERMS}
-    relevance_rids = list(get_terms_set(conn, terms.get("cancer predisposition", [])))
+    relevance_rids = list(get_terms_set(conn, terms.get('cancer predisposition', [])))
     source_rid = get_preferred_gene_source_rid(conn, source)
 
     predisp_statements = [
         cast(Statement, record)
         for record in conn.query(
             {
-                "target": "Statement",
-                "filters": {
-                    "AND": [
+                'target': 'Statement',
+                'filters': {
+                    'AND': [
                         {
-                            "evidence": {
-                                "target": "Source",
-                                "filters": {"@rid": get_rid(conn, "Source", "CGL")},
+                            'evidence': {
+                                'target': 'Source',
+                                'filters': {'@rid': get_rid(conn, 'Source', 'CGL')},
                             }
                         },
                         {
-                            "relevance": {
-                                "target": "Vocabulary",
-                                "filters": {"@rid": relevance_rids},
+                            'relevance': {
+                                'target': 'Vocabulary',
+                                'filters': {'@rid': relevance_rids},
                             }
                         },
                     ]
                 },
-                "returnProperties": [
-                    "conditions.@class",
-                    "conditions.@rid",
-                    "conditions.displayName",
-                    "conditions.reference1.biotype",
-                    "conditions.reference1.displayName",
-                    "conditions.reference2.biotype",
-                    "conditions.reference2.displayName",
+                'returnProperties': [
+                    'conditions.@class',
+                    'conditions.@rid',
+                    'conditions.displayName',
+                    'conditions.reference1.biotype',
+                    'conditions.reference1.displayName',
+                    'conditions.reference2.biotype',
+                    'conditions.reference2.displayName',
                 ],
             },
             ignore_cache=False,
         )
     ]
     for record in predisp_statements:
-        for condition in record["conditions"]:
-            if condition["@class"] == "PositionalVariant":
+        for condition in record['conditions']:
+            if condition['@class'] == 'PositionalVariant':
                 assoc_gene_list: List[str] = []
-                for reference in ["reference1", "reference2"]:
-                    name = (condition.get(reference) or {}).get("displayName", "")  # type: ignore
-                    biotype = (condition.get(reference) or {}).get("biotype", "")  # type: ignore
-                    if name and biotype == "gene":
+                for reference in ['reference1', 'reference2']:
+                    name = (condition.get(reference) or {}).get('displayName', '')  # type: ignore
+                    biotype = (condition.get(reference) or {}).get('biotype', '')  # type: ignore
+                    if name and biotype == 'gene':
                         genes.add(name)
                         assoc_gene_list.append(name)
                     elif name:
@@ -348,9 +346,9 @@ def get_gene_linked_cancer_predisposition_info(
                     else:
                         non_genes.add((name, biotype))
                         logger.error(
-                            f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}"
+                            f'Non-gene cancer predisposition {biotype}: {name} for {condition["displayName"]}'
                         )
-                variants[condition["@rid"]] = (condition["displayName"], assoc_gene_list)
+                variants[condition['@rid']] = (condition['displayName'], assoc_gene_list)
 
     for gene, name, biotype in infer_genes:
         logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
@@ -362,7 +360,7 @@ def get_gene_linked_cancer_predisposition_info(
     return sorted(genes), variants
 
 
-@deprecated("Use get_gene_linked_pharmacogenomic_info instead")
+@deprecated('Use get_gene_linked_pharmacogenomic_info instead')
 def get_pharmacogenomic_info(
     conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE_NAME
 ) -> Tuple[List[str], Dict[str, str]]:
@@ -372,7 +370,7 @@ def get_pharmacogenomic_info(
 
 
 def get_gene_linked_pharmacogenomic_info(
-    conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE
+    conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE_NAME
 ) -> Tuple[List[str], Dict[str, Tuple[str, List[str]]]]:
     """
     Return two lists from GraphKB, one of pharmacogenomic genes and one of associated variants.
@@ -395,39 +393,39 @@ def get_gene_linked_pharmacogenomic_info(
     infer_genes = set()
     variants: Dict[str, Tuple] = {}
 
-    relevance_rids = list(get_terms_set(conn, "pharmacogenomic"))
+    relevance_rids = list(get_terms_set(conn, 'pharmacogenomic'))
    source_rid = get_preferred_gene_source_rid(conn, source)
 
     for record in conn.query(
         {
-            "target": "Statement",
-            "filters": [
-                {"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}}}
+            'target': 'Statement',
+            'filters': [
+                {'relevance': {'target': 'Vocabulary', 'filters': {'@rid': relevance_rids}}}
             ],
-            "returnProperties": [
-                "conditions.@class",
-                "conditions.@rid",
-                "conditions.displayName",
-                "conditions.reference1.biotype",
-                "conditions.reference1.displayName",
-                "conditions.reference2.biotype",
-                "conditions.reference2.displayName",
-                "source.name",
+            'returnProperties': [
+                'conditions.@class',
+                'conditions.@rid',
+                'conditions.displayName',
+                'conditions.reference1.biotype',
+                'conditions.reference1.displayName',
+                'conditions.reference2.biotype',
+                'conditions.reference2.displayName',
+                'source.name',
             ],
         },
         ignore_cache=False,
     ):
-        if record["source"]:  # type: ignore
-            if record["source"]["name"].lower() in GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST:  # type: ignore
+        if record['source']:  # type: ignore
+            if record['source']['name'].lower() in GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST:  # type: ignore
                 continue
 
-        for condition in record["conditions"]:  # type: ignore
-            if condition["@class"] == "PositionalVariant":
+        for condition in record['conditions']:  # type: ignore
+            if condition['@class'] == 'PositionalVariant':
                 assoc_gene_list = []
-                for reference in ["reference1", "reference2"]:
-                    name = (condition.get(reference) or {}).get("displayName", "")
-                    biotype = (condition.get(reference) or {}).get("biotype", "")
-                    if name and biotype == "gene":
+                for reference in ['reference1', 'reference2']:
+                    name = (condition.get(reference) or {}).get('displayName', '')
+                    biotype = (condition.get(reference) or {}).get('biotype', '')
+                    if name and biotype == 'gene':
                         genes.add(name)
                         assoc_gene_list.append(name)
                     elif name:
@@ -438,9 +436,9 @@ def get_gene_linked_pharmacogenomic_info(
                     else:
                         non_genes.add((name, biotype))
                         logger.error(
-                            f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}"
+                            f'Non-gene pharmacogenomic {biotype}: {name} for {condition["displayName"]}'
                         )
-                variants[condition["@rid"]] = (condition["displayName"], assoc_gene_list)
+                variants[condition['@rid']] = (condition['displayName'], assoc_gene_list)
     for gene, name, biotype in infer_genes:
         logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
         genes.add(gene)
@@ -452,7 +450,7 @@ def get_gene_linked_pharmacogenomic_info(
 
 
 def convert_to_rid_set(records: List[Record] | List[Ontology]) -> Set[str]:
-    return {r["@rid"] for r in records}
+    return {r['@rid'] for r in records}
 
 
 def get_gene_information(
@@ -479,59 +477,59 @@ def get_gene_information(
            'name': 'TERT',
            'oncogene': True}]
     """
-    logger.info("fetching variant related genes list")
+    logger.info('fetching variant related genes list')
     # For query speed, only fetch the minimum needed details
     ret_props = [
-        "conditions.@rid",
-        "conditions.@class",
-        "conditions.reference1",
-        "conditions.reference2",
-        "reviewStatus",
+        'conditions.@rid',
+        'conditions.@class',
+        'conditions.reference1',
+        'conditions.reference2',
+        'reviewStatus',
     ]
-    body: Dict[str, Any] = {"target": "Statement", "returnProperties": ret_props}
+    body: Dict[str, Any] = {'target': 'Statement', 'returnProperties': ret_props}
 
     gene_names = sorted(set(gene_names))
     statements = graphkb_conn.query(body)
-    statements = [s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS]
+    statements = [s for s in statements if s.get('reviewStatus') != FAILED_REVIEW_STATUS]
 
     gene_flags: Dict[str, Set[str]] = {
-        "kbStatementRelated": set(),
-        "knownFusionPartner": set(),
-        "knownSmallMutation": set(),
+        'kbStatementRelated': set(),
+        'knownFusionPartner': set(),
+        'knownSmallMutation': set(),
     }
 
     for statement in statements:
         statement = cast(Statement, statement)
-        for condition in statement["conditions"]:
+        for condition in statement['conditions']:
             # ignore types, as there can be various types of conditions
-            if condition.get("reference1"):
-                gene_flags["kbStatementRelated"].add(condition["reference1"])  # type: ignore
-            if condition.get("reference2"):
+            if condition.get('reference1'):
+                gene_flags['kbStatementRelated'].add(condition['reference1'])  # type: ignore
+            if condition.get('reference2'):
                 # Having a reference2 implies the event is a fusion
-                gene_flags["kbStatementRelated"].add(condition["reference2"])  # type: ignore
-                gene_flags["knownFusionPartner"].add(condition["reference1"])  # type: ignore
-                gene_flags["knownFusionPartner"].add(condition["reference2"])  # type: ignore
-            elif condition["@class"] == "PositionalVariant":
+                gene_flags['kbStatementRelated'].add(condition['reference2'])  # type: ignore
+                gene_flags['knownFusionPartner'].add(condition['reference1'])  # type: ignore
+                gene_flags['knownFusionPartner'].add(condition['reference2'])  # type: ignore
+            elif condition['@class'] == 'PositionalVariant':
                 # PositionalVariant without a reference2 implies a smallMutation type
-                gene_flags["knownSmallMutation"].add(condition["reference1"])  # type: ignore
+                gene_flags['knownSmallMutation'].add(condition['reference1'])  # type: ignore
 
-    logger.info("fetching oncogenes list")
-    gene_flags["oncogene"] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
-    logger.info("fetching tumour supressors list")
-    gene_flags["tumourSuppressor"] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn))
-    logger.info("fetching cancerGeneListMatch list")
-    gene_flags["cancerGeneListMatch"] = convert_to_rid_set(get_cancer_genes(graphkb_conn))
+    logger.info('fetching oncogenes list')
+    gene_flags['oncogene'] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
+    logger.info('fetching tumour suppressors list')
+    gene_flags['tumourSuppressor'] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn))
+    logger.info('fetching cancerGeneListMatch list')
+    gene_flags['cancerGeneListMatch'] = convert_to_rid_set(get_cancer_genes(graphkb_conn))
 
-    logger.info("fetching therapeutic associated genes lists")
-    gene_flags["therapeuticAssociated"] = convert_to_rid_set(
+    logger.info('fetching therapeutic associated genes lists')
+    gene_flags['therapeuticAssociated'] = convert_to_rid_set(
         get_therapeutic_associated_genes(graphkb_conn)
     )
 
-    logger.info(f"Setting gene_info flags on {len(gene_names)} genes")
+    logger.info(f'Setting gene_info flags on {len(gene_names)} genes')
     result: List[IprGene] = []
     for gene_name in gene_names:
         equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name))
-        row: Dict[str, str | bool] = {"name": gene_name}
+        row: Dict[str, str | bool] = {'name': gene_name}
         flagged = False
         for flag in gene_flags:
             # make smaller JSON to upload since all default to false already
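For orientation, a usage sketch of the annotation helpers above (gene symbols, URL, and credentials are placeholders; a reachable GraphKB instance is assumed):

```python
from pori_python.graphkb import GraphKBConnection
from pori_python.graphkb.genes import get_gene_information, get_preferred_gene_name

conn = GraphKBConnection(url='http://localhost:8080/api', username='myuser', password='mypass')

# resolve a symbol to its preferred display name (HGNC source by default)
name = get_preferred_gene_name(conn, 'ERBB2')

# collect oncogene/tumour-suppressor/fusion-partner flags for a gene list
info = get_gene_information(conn, ['KRAS', 'TERT'])
```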
diff --git a/pori_python/graphkb/match.py b/pori_python/graphkb/match.py
index 0c791383..29c8cf32 100644
--- a/pori_python/graphkb/match.py
+++ b/pori_python/graphkb/match.py
@@ -46,8 +46,8 @@ def get_equivalent_features(
     gene_name: str,
     ignore_cache: bool = False,
     is_source_id: bool = False,
-    source: str = "",
-    source_id_version: str = "",
+    source: str = '',
+    source_id_version: str = '',
 ) -> List[Ontology]:
     """Match an equivalent list of features given some input feature name (or ID).
 
@@ -76,36 +76,36 @@ def get_equivalent_features(
         return cast(
             List[Ontology],
             conn.query(
-                {"target": [gene_name], "queryType": "similarTo"}, ignore_cache=ignore_cache
+                {'target': [gene_name], 'queryType': 'similarTo'}, ignore_cache=ignore_cache
             ),
         )
 
     filters: List[Dict] = []
     if source:
-        filters.append({"source": {"target": "Source", "filters": {"name": source}}})
+        filters.append({'source': {'target': 'Source', 'filters': {'name': source}}})
 
-    if gene_name.count(".") == 1 and gene_name.split(".")[-1].isnumeric():
+    if gene_name.count('.') == 1 and gene_name.split('.')[-1].isnumeric():
         # eg. ENSG00000133703.11 or NM_033360.4
         logger.debug(
-            f"Assuming {gene_name} has a .version_format - ignoring the version for equivalent features"
+            f'Assuming {gene_name} has a .version_format - ignoring the version for equivalent features'
         )
-        gene_name = gene_name.split(".")[0]
+        gene_name = gene_name.split('.')[0]
 
     if is_source_id or source_id_version:
-        filters.append({"sourceId": gene_name})
+        filters.append({'sourceId': gene_name})
         if source_id_version:
             filters.append(
-                {"OR": [{"sourceIdVersion": source_id_version}, {"sourceIdVersion": None}]}
+                {'OR': [{'sourceIdVersion': source_id_version}, {'sourceIdVersion': None}]}
             )
     elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache:
         return []
     else:
-        filters.append({"OR": [{"sourceId": gene_name}, {"name": gene_name}]})
+        filters.append({'OR': [{'sourceId': gene_name}, {'name': gene_name}]})
 
     return cast(
         List[Ontology],
         conn.query(
-            {"target": {"target": "Feature", "filters": filters}, "queryType": "similarTo"},
+            {'target': {'target': 'Feature', 'filters': filters}, 'queryType': 'similarTo'},
             ignore_cache=ignore_cache,
         ),
     )
@@ -118,24 +118,24 @@ def cache_missing_features(conn: GraphKBConnection) -> None:
     """
     genes = cast(
         List[Ontology],
-        conn.query({"target": "Feature", "returnProperties": ["name", "sourceId"], "neighbors": 0}),
+        conn.query({'target': 'Feature', 'returnProperties': ['name', 'sourceId'], 'neighbors': 0}),
     )
     for gene in genes:
-        if gene["name"]:
-            FEATURES_CACHE.add(gene["name"].lower())
-        if gene["sourceId"]:
-            FEATURES_CACHE.add(gene["sourceId"].lower())
+        if gene['name']:
+            FEATURES_CACHE.add(gene['name'].lower())
+        if gene['sourceId']:
+            FEATURES_CACHE.add(gene['sourceId'].lower())
 
 
 def match_category_variant(
     conn: GraphKBConnection,
     reference_name: str,
     category: str,
-    root_exclude_term: str = "",
-    gene_source: str = "",
+    root_exclude_term: str = '',
+    gene_source: str = '',
     gene_is_source_id: bool = False,
     ignore_cache: bool = False,
-    reference_class: str = "Feature",
+    reference_class: str = 'Feature',
 ) -> List[Variant]:
     """
     Returns a list of variants matching the input variant
@@ -155,7 +155,7 @@ def match_category_variant(
     """
     # disambiguate the reference to find all equivalent representations
     references: List[str] = []
-    if reference_class == "Feature":
+    if reference_class == 'Feature':
         references = convert_to_rid_list(
             get_equivalent_features(
                 conn,
@@ -167,14 +167,14 @@ def match_category_variant(
         )
         if not references:
             raise FeatureNotFoundError(
-                f"unable to find the gene ({reference_name}) or any equivalent representations"
+                f'unable to find the gene ({reference_name}) or any equivalent representations'
             )
 
-    if reference_class == "Signature":
+    if reference_class == 'Signature':
         references = convert_to_rid_list(
             get_equivalent_terms(
                 conn,
                 reference_name.lower(),
-                ontology_class="Signature",
+                ontology_class='Signature',
                 ignore_cache=ignore_cache,
             )
         )
@@ -185,24 +185,24 @@ def match_category_variant(
     )
 
     if not types:
-        raise ValueError(f"unable to find the term/category ({category}) or any equivalent")
+        raise ValueError(f'unable to find the term/category ({category}) or any equivalent')
 
     # find the variant list
     return cast(
         List[Variant],
         conn.query(
             {
-                "target": {
-                    "target": "CategoryVariant",
-                    "filters": [
-                        {"reference1": references, "operator": "IN"},
-                        {"type": types, "operator": "IN"},
+                'target': {
+                    'target': 'CategoryVariant',
+                    'filters': [
+                        {'reference1': references, 'operator': 'IN'},
+                        {'type': types, 'operator': 'IN'},
                     ],
                 },
-                "queryType": "similarTo",
-                "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf", "GeneralizationOf"],
-                "treeEdges": ["Infers"],
-                "returnProperties": VARIANT_RETURN_PROPERTIES,
+                'queryType': 'similarTo',
+                'edges': ['AliasOf', 'DeprecatedBy', 'CrossReferenceOf', 'GeneralizationOf'],
+                'treeEdges': ['Infers'],
+                'returnProperties': VARIANT_RETURN_PROPERTIES,
             },
             ignore_cache=ignore_cache,
         ),
@@ -228,14 +228,14 @@ def match_copy_variant(
         List of variant records from GraphKB which match the input
     """
     if category not in INPUT_COPY_CATEGORIES.values():
-        raise ValueError(f"not a valid copy variant input category ({category})")
+        raise ValueError(f'not a valid copy variant input category ({category})')
 
     result = match_category_variant(
-        conn, gene_name, category, root_exclude_term="structural variant", **kwargs
+        conn, gene_name, category, root_exclude_term='structural variant', **kwargs
     )
 
     if drop_homozygous:
-        return [row for row in result if row["zygosity"] != "homozygous"]
+        return [row for row in result if row['zygosity'] != 'homozygous']
 
     return result
 
@@ -243,10 +243,10 @@ def match_expression_variant(
     conn: GraphKBConnection, gene_name: str, category: str, **kwargs
 ) -> List[Variant]:
     if category not in INPUT_EXPRESSION_CATEGORIES.values():
-        raise ValueError(f"not a valid expression variant input category ({category})")
+        raise ValueError(f'not a valid expression variant input category ({category})')
 
     return match_category_variant(
-        conn, gene_name, category, root_exclude_term="biological", **kwargs
+        conn, gene_name, category, root_exclude_term='biological', **kwargs
     )
 
 
@@ -270,19 +270,19 @@ def positions_overlap(
     Returns:
         bool: True if the positions overlap
     """
-    if pos_record.get("@class", "") == "CytobandPosition":
+    if pos_record.get('@class', '') == 'CytobandPosition':
         raise NotImplementedError(
-            "Position comparison for cytoband coordinates is not yet implemented"
+            'Position comparison for cytoband coordinates is not yet implemented'
         )
 
-    pos = pos_record.get("pos", None)
+    pos = pos_record.get('pos', None)
     if pos is None:
         return True
 
-    start = range_start.get("pos", None)
+    start = range_start.get('pos', None)
 
     if range_end:
-        end = range_end.get("pos", None)
+        end = range_end.get('pos', None)
 
         if start is not None and pos < start:
             return False
@@ -315,15 +315,15 @@ def equivalent_types(
 
     # Convert rid to displayName if needed
     if looks_like_rid(type1):
-        type1 = conn.get_records_by_id([type1])[0]["displayName"]
+        type1 = conn.get_records_by_id([type1])[0]['displayName']
     if looks_like_rid(type2):
-        type2 = conn.get_records_by_id([type2])[0]["displayName"]
+        type2 = conn.get_records_by_id([type2])[0]['displayName']
 
     # Get type terms from observed variant
     terms1 = []
     if strict:
         try:
-            terms1.append(get_term_by_name(conn, type1)["@rid"])
+            terms1.append(get_term_by_name(conn, type1)['@rid'])
         except Exception:
             pass
     else:
@@ -375,12 +375,12 @@ def compare_positional_variants(
     # For break1, check if positions are overlaping between the variant and the reference.
     # Continue only if True.
     if not positions_overlap(
-        cast(BasicPosition, variant["break1Start"]),
-        cast(BasicPosition, reference_variant["break1Start"]),
+        cast(BasicPosition, variant['break1Start']),
+        cast(BasicPosition, reference_variant['break1Start']),
         (
             None
-            if "break1End" not in reference_variant
-            else cast(BasicPosition, reference_variant["break1End"])
+            if 'break1End' not in reference_variant
+            else cast(BasicPosition, reference_variant['break1End'])
         ),
     ):
         return False
@@ -388,16 +388,16 @@ def compare_positional_variants(
     # For break2, check if positions are overlaping between the variant and the reference.
     # Continue only if True or no break2.
     # TODO: check for variant without break2 but reference_variant with one.
-    if variant.get("break2Start"):
-        if not reference_variant.get("break2Start"):
+    if variant.get('break2Start'):
+        if not reference_variant.get('break2Start'):
             return False
         if not positions_overlap(
-            cast(BasicPosition, variant["break2Start"]),
-            cast(BasicPosition, reference_variant["break2Start"]),
+            cast(BasicPosition, variant['break2Start']),
+            cast(BasicPosition, reference_variant['break2Start']),
             (
                 None
-                if "break2End" not in reference_variant
-                else cast(BasicPosition, reference_variant["break2End"])
+                if 'break2End' not in reference_variant
+                else cast(BasicPosition, reference_variant['break2End'])
             ),
         ):
             return False
@@ -405,47 +405,47 @@ def compare_positional_variants(
     # If both variants have untemplated sequence,
     # check for size and content.
     if (
-        variant.get("untemplatedSeq", None) is not None
-        and reference_variant.get("untemplatedSeq", None) is not None
+        variant.get('untemplatedSeq', None) is not None
+        and reference_variant.get('untemplatedSeq', None) is not None
     ):
         if (
-            variant.get("untemplatedSeqSize", None) is not None
-            and reference_variant.get("untemplatedSeqSize", None) is not None
+            variant.get('untemplatedSeqSize', None) is not None
+            and reference_variant.get('untemplatedSeqSize', None) is not None
         ):
-            if variant["untemplatedSeqSize"] != reference_variant["untemplatedSeqSize"]:
+            if variant['untemplatedSeqSize'] != reference_variant['untemplatedSeqSize']:
                 return False
 
         if (
-            reference_variant["untemplatedSeq"] is not None
-            and variant["untemplatedSeq"] is not None
+            reference_variant['untemplatedSeq'] is not None
+            and variant['untemplatedSeq'] is not None
         ):
             if (
-                reference_variant["untemplatedSeq"] not in AMBIGUOUS_AA
-                and variant["untemplatedSeq"] not in AMBIGUOUS_AA
+                reference_variant['untemplatedSeq'] not in AMBIGUOUS_AA
+                and variant['untemplatedSeq'] not in AMBIGUOUS_AA
             ):
-                if reference_variant["untemplatedSeq"].lower() != variant["untemplatedSeq"].lower():
+                if reference_variant['untemplatedSeq'].lower() != variant['untemplatedSeq'].lower():
                     return False
-            elif len(variant["untemplatedSeq"]) != len(reference_variant["untemplatedSeq"]):
+            elif len(variant['untemplatedSeq']) != len(reference_variant['untemplatedSeq']):
                 return False
 
     # If both variants have a reference sequence,
     # check if they are the same.
     if (
-        variant.get("refSeq", None) is not None
-        and reference_variant.get("refSeq", None) is not None
+        variant.get('refSeq', None) is not None
+        and reference_variant.get('refSeq', None) is not None
     ):
         if (
-            reference_variant["refSeq"] not in AMBIGUOUS_AA
-            and variant["refSeq"] not in AMBIGUOUS_AA
+            reference_variant['refSeq'] not in AMBIGUOUS_AA
+            and variant['refSeq'] not in AMBIGUOUS_AA
         ):
-            if reference_variant["refSeq"].lower() != variant["refSeq"].lower():  # type: ignore
+            if reference_variant['refSeq'].lower() != variant['refSeq'].lower():  # type: ignore
                 return False
-        elif len(variant["refSeq"]) != len(reference_variant["refSeq"]):  # type: ignore
+        elif len(variant['refSeq']) != len(reference_variant['refSeq']):  # type: ignore
             return False
 
     # Equivalent types
-    if variant.get("type") and reference_variant.get("type"):
-        if not equivalent_types(conn, variant["type"], reference_variant["type"]):
+    if variant.get('type') and reference_variant.get('type'):
+        if not equivalent_types(conn, variant['type'], reference_variant['type']):
             return False
 
     return True
@@ -500,38 +500,38 @@ def type_screening(
 
     # Will use either hardcoded type list or an updated list from the API
     if updateStructuralTypes:
-        rids = list(get_terms_set(conn, ["structural variant"]))
+        rids = list(get_terms_set(conn, ['structural variant']))
         records = conn.get_records_by_id(rids)
-        structuralVariantTypes = [el["name"] for el in records]
+        structuralVariantTypes = [el['name'] for el in records]
 
     # Unambiguous non-structural variation type
-    if parsed["type"] not in structuralVariantTypes:
-        return parsed["type"]
+    if parsed['type'] not in structuralVariantTypes:
+        return parsed['type']
 
     # Unambiguous structural variation type
-    if parsed["type"] in ["fusion", "translocation"]:
-        return parsed["type"]
-    if parsed.get("reference2", None):
-        return parsed["type"]
-    prefix = parsed.get("prefix", "g")
-    if prefix == "y":  # Assuming all variations using cytoband coordiantes meet the size threshold
-        return parsed["type"]
+    if parsed['type'] in ['fusion', 'translocation']:
+        return parsed['type']
+    if parsed.get('reference2', None):
+        return parsed['type']
+    prefix = parsed.get('prefix', 'g')
+    if prefix == 'y':  # Assuming all variations using cytoband coordinates meet the size threshold
+        return parsed['type']
 
     # When size cannot be determined: exonic and intronic coordinates
     # e.g. "MET:e.14del" meaning "Any deletion occuring at the 14th exon"
-    if prefix in ["e", "i"]:  # Assuming they don't meet the size threshold
+    if prefix in ['e', 'i']:  # Assuming they don't meet the size threshold
         return default_type
 
     # When size is given
-    if (parsed.get("untemplatedSeqSize") or 0) >= threshold:
-        return parsed["type"]
+    if (parsed.get('untemplatedSeqSize') or 0) >= threshold:
+        return parsed['type']
 
     # When size needs to be computed from positions
-    pos_start: int = parsed.get("break1Start", {}).get("pos", 1)  # type: ignore
-    pos_end: int = parsed.get("break2Start", {}).get("pos", pos_start)  # type: ignore
-    pos_size = 3 if prefix == "p" else 1
+    pos_start: int = parsed.get('break1Start', {}).get('pos', 1)  # type: ignore
+    pos_end: int = parsed.get('break2Start', {}).get('pos', pos_start)  # type: ignore
+    pos_size = 3 if prefix == 'p' else 1
     if ((pos_end - pos_start) + 1) * pos_size >= threshold:
-        return parsed["type"]
+        return parsed['type']
 
     # Default
     return default_type
@@ -543,7 +543,7 @@ def match_positional_variant(
     reference1: Optional[str] = None,
     reference2: Optional[str] = None,
     gene_is_source_id: bool = False,
-    gene_source: str = "",
+    gene_source: str = '',
     ignore_cache: bool = False,
     updateStructuralTypes: bool = False,
 ) -> List[Variant]:
@@ -590,21 +590,21 @@ def match_positional_variant(
     # parse the representation
     parsed = conn.parse(variant_string, not (reference1 or reference2))
 
-    if "break1End" in parsed or "break2End" in parsed:  # uncertain position
+    if 'break1End' in parsed or 'break2End' in parsed:  # uncertain position
         raise NotImplementedError(
-            f"Matching does not support uncertain positions ({variant_string}) as input"
+            f'Matching does not support uncertain positions ({variant_string}) as input'
         )
 
     if reference2 and not reference1:
-        raise ValueError("cannot specify reference2 without reference1")
+        raise ValueError('cannot specify reference2 without reference1')
 
     # disambiguate the gene name
     if reference1:
         gene1 = reference1
-        if "reference1" in parsed:
+        if 'reference1' in parsed:
             raise ValueError(
-                "Cannot specify reference1 explicitly as well as in the variant notation"
+                'Cannot specify reference1 explicitly as well as in the variant notation'
             )
     else:
-        gene1 = parsed["reference1"]
+        gene1 = parsed['reference1']
 
     gene1_features = get_equivalent_features(
         conn, gene1, source=gene_source, is_source_id=gene_is_source_id, ignore_cache=ignore_cache
@@ -613,7 +613,7 @@ def match_positional_variant(
 
     if not features:
         raise FeatureNotFoundError(
-            f"unable to find the gene ({gene1}) or any equivalent representations"
+            f'unable to find the gene ({gene1}) or any equivalent representations'
         )
 
     secondary_features = None
@@ -621,20 +621,20 @@ def match_positional_variant(
     gene2: Optional[str] = None
     if reference2:
         gene2 = reference2
-        if "reference2" in parsed:
+        if 'reference2' in parsed:
             raise ValueError(
-                "Cannot specify reference2 explicitly as well as in the variant notation"
+                'Cannot specify reference2 explicitly as well as in the variant notation'
             )
-        elif "reference1" in parsed:
+        elif 'reference1' in parsed:
             raise ValueError(
-                "variant notation cannot contain features when explicit features are given"
+                'variant notation cannot contain features when explicit features are given'
             )
     elif (
-        "reference2" in parsed
-        and parsed.get("reference2", "?") != "?"
-        and parsed["reference2"] is not None
+        'reference2' in parsed
+        and parsed.get('reference2', '?') != '?'
+        and parsed['reference2'] is not None
     ):
-        gene2 = parsed["reference2"]
+        gene2 = parsed['reference2']
 
     if gene2:
         gene2_features = get_equivalent_features(
@@ -647,14 +647,14 @@ def match_positional_variant(
         secondary_features = convert_to_rid_list(gene2_features)
         if not secondary_features:
             raise FeatureNotFoundError(
-                f"unable to find the gene ({gene2}) or any equivalent representations"
+                f'unable to find the gene ({gene2}) or any equivalent representations'
             )
 
     # match the existing mutations (positional)
     query_filters = [
-        {"reference1": features},
-        {"reference2": secondary_features},
-        {"break1Start.@class": parsed["break1Start"]["@class"]},
+        {'reference1': features},
+        {'reference2': secondary_features},
+        {'break1Start.@class': parsed['break1Start']['@class']},
     ]
 
     filtered_similarOnly: List[Record] = []  # For post filter match use
@@ -663,7 +663,7 @@ def match_positional_variant(
     for row in cast(
         List[Record],
         conn.query(
-            {"target": "PositionalVariant", "filters": query_filters}, ignore_cache=ignore_cache
+            {'target': 'PositionalVariant', 'filters': query_filters}, ignore_cache=ignore_cache
         ),
     ):
         # TODO: Check if variant and reference_variant should be interchanged
@@ -688,11 +688,11 @@ def match_positional_variant(
     matches.extend(
         conn.query(
             {
-                "target": convert_to_rid_list(filtered_similarOnly),
-                "queryType": "similarTo",
-                "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf", "GeneralizationOf"],
-                "treeEdges": ["Infers"],
-                "returnProperties": POS_VARIANT_RETURN_PROPERTIES,
+                'target': convert_to_rid_list(filtered_similarOnly),
+                'queryType': 'similarTo',
+                'edges': ['AliasOf', 'DeprecatedBy', 'CrossReferenceOf', 'GeneralizationOf'],
+                'treeEdges': ['Infers'],
+                'returnProperties': POS_VARIANT_RETURN_PROPERTIES,
             },
             ignore_cache=ignore_cache,
         )
@@ -705,7 +705,7 @@ def match_positional_variant(
     variant_types_details = get_equivalent_terms(
         conn,
         screened_type,
-        root_exclude_term="mutation" if secondary_features else "",
+        root_exclude_term='mutation' if secondary_features else '',
         ignore_cache=ignore_cache,
     )
 
@@ -714,18 +714,18 @@ def match_positional_variant(
     matches.extend(
         conn.query(
             {
-                "target": {
-                    "target": "CategoryVariant",
-                    "filters": [
-                        {"reference1": features},
-                        {"type": types},
-                        {"reference2": secondary_features},
+                'target': {
+                    'target': 'CategoryVariant',
+                    'filters': [
+                        {'reference1': features},
+                        {'type': types},
+                        {'reference2': secondary_features},
                     ],
                 },
-                "queryType": "similarTo",
-                "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf"],
-                "treeEdges": ["Infers"],
-                "returnProperties": POS_VARIANT_RETURN_PROPERTIES,
+                'queryType': 'similarTo',
+                'edges': ['AliasOf', 'DeprecatedBy', 'CrossReferenceOf'],
+                'treeEdges': ['Infers'],
+                'returnProperties': POS_VARIANT_RETURN_PROPERTIES,
             },
             ignore_cache=ignore_cache,
         )
@@ -739,18 +739,18 @@ def cat_variant_query(
     matches.extend(
         conn.query(
             {
-                "target": {
-                    "target": "CategoryVariant",
-                    "filters": [
-                        {"reference1": cat_features},
-                        {"type": cat_types},
-                        {"reference2": cat_secondary_features},
+                'target': {
+                    'target': 'CategoryVariant',
+                    'filters': [
+                        {'reference1': cat_features},
+                        {'type': cat_types},
+                        {'reference2': cat_secondary_features},
                     ],
                 },
-                "queryType": "similarTo",
-                "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf"],
-                "treeEdges": [],
-                "returnProperties": VARIANT_RETURN_PROPERTIES,
+                'queryType': 'similarTo',
+                'edges': ['AliasOf', 'DeprecatedBy', 'CrossReferenceOf'],
+                'treeEdges': [],
+                'returnProperties': VARIANT_RETURN_PROPERTIES,
             },
             ignore_cache=ignore_cache,
         )
@@ -768,10 +768,10 @@ def cat_variant_query(
     matches.extend(
         conn.query(
             {
-                "target": convert_to_rid_list(filtered_similarAndGeneric),
-                "queryType": "descendants",
-                "edges": [],
-                "returnProperties": POS_VARIANT_RETURN_PROPERTIES,
+                'target': convert_to_rid_list(filtered_similarAndGeneric),
+                'queryType': 'descendants',
+                'edges': [],
+                'returnProperties': POS_VARIANT_RETURN_PROPERTIES,
             },
            ignore_cache=ignore_cache,
        )
@@ -779,6 +779,6 @@ def cat_variant_query(
 
     result: Dict[str, Variant] = {}
     for row in matches:
-        result[row["@rid"]] = cast(Variant, row)
+        result[row['@rid']] = cast(Variant, row)
 
     return list(result.values())
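A matching sketch tying the functions above together (the gene and variant notation are illustrative placeholders; a live GraphKB API is required):

```python
from pori_python.graphkb import GraphKBConnection
from pori_python.graphkb.match import match_copy_variant, match_positional_variant

conn = GraphKBConnection(url='http://localhost:8080/api', username='myuser', password='mypass')

# category matching: the category must be one of INPUT_COPY_CATEGORIES.values()
amplifications = match_copy_variant(conn, 'KRAS', 'amplification')

# positional matching from HGVS-like shorthand notation
point_mutations = match_positional_variant(conn, 'KRAS:p.G12D')
```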
diff --git a/pori_python/graphkb/statement.py b/pori_python/graphkb/statement.py
index 24246b91..3f077ee1 100644
--- a/pori_python/graphkb/statement.py
+++ b/pori_python/graphkb/statement.py
@@ -20,7 +20,7 @@ def categorize_relevance(
         term_set = get_terms_set(graphkb_conn, base_terms)
         if relevance_rid in term_set:
             return category
-    return ""
+    return ''
 
 
 def get_statements_from_variants(
@@ -38,11 +38,11 @@
     """
     statements = graphkb_conn.query(
         {
-            "target": "Statement",
-            "filters": {"conditions": convert_to_rid_list(variants), "operator": "CONTAINSANY"},
-            "returnProperties": STATEMENT_RETURN_PROPERTIES,
+            'target': 'Statement',
+            'filters': {'conditions': convert_to_rid_list(variants), 'operator': 'CONTAINSANY'},
+            'returnProperties': STATEMENT_RETURN_PROPERTIES,
         }
     )
     if not failed_review:
-        statements = [s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS]
+        statements = [s for s in statements if s.get('reviewStatus') != FAILED_REVIEW_STATUS]
     return [cast(Statement, s) for s in statements]
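And the downstream lookup these statement helpers support, as a sketch (same placeholder URL, credentials, and variant assumptions as the earlier examples):

```python
from pori_python.graphkb import GraphKBConnection
from pori_python.graphkb.match import match_positional_variant
from pori_python.graphkb.statement import get_statements_from_variants

conn = GraphKBConnection(url='http://localhost:8080/api', username='myuser', password='mypass')
variants = match_positional_variant(conn, 'KRAS:p.G12D')

# statements whose review failed are filtered out unless failed_review=True
statements = get_statements_from_variants(conn, variants)
```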
diff --git a/pori_python/graphkb/util.py b/pori_python/graphkb/util.py
index 2ff8620c..23c28963 100644
--- a/pori_python/graphkb/util.py
+++ b/pori_python/graphkb/util.py
@@ -14,14 +14,14 @@
 
 from pori_python.types import ParsedVariant, PositionalVariant, Record
 
-from .constants import DEFAULT_LIMIT, DEFAULT_URL, TYPES_TO_NOTATION, AA_3to1_MAPPING
+from .constants import DEFAULT_LIMIT, TYPES_TO_NOTATION, AA_3to1_MAPPING
 
 QUERY_CACHE: Dict[Any, Any] = {}
 
 # name the logger after the package to make it simple to disable for packages using this one as a dependency
 # https://stackoverflow.com/questions/11029717/how-do-i-disable-log-messages-from-the-requests-library
-logger = logging.getLogger("graphkb")
+logger = logging.getLogger('graphkb')
 
 
 def convert_to_rid_list(records: Iterable[Record]) -> List[str]:
@@ -31,7 +31,7 @@ def convert_to_rid_list(records: Iterable[Record]) -> List[str]:
         if isinstance(record, str):
             result.append(record)  # assume an @rid string
         else:
-            result.append(record["@rid"])
+            result.append(record['@rid'])
     return result
 
 
@@ -41,7 +41,7 @@ class FeatureNotFoundError(Exception):
 
 def looks_like_rid(rid: str) -> bool:
     """Check if an input string looks like a GraphKB ID."""
-    if re.match(r"^#-?\d+:-?\d+$", rid):
+    if re.match(r'^#-?\d+:-?\d+$', rid):
         return True
     return False
 
@@ -50,15 +50,15 @@ def convert_aa_3to1(three_letter_notation: str) -> str:
     """Convert an Input string from 3 letter AA notation to 1 letter AA notation."""
     result = []
 
-    if ":" in three_letter_notation:
+    if ':' in three_letter_notation:
         # do not include the feature/gene in replacements
-        pos = three_letter_notation.index(":")
+        pos = three_letter_notation.index(':')
         result.append(three_letter_notation[: pos + 1])
         three_letter_notation = three_letter_notation[pos + 1 :]
 
     last_match_end = 0  # exclusive interval [ )
 
-    for match in re.finditer(r"[A-Z][a-z][a-z]", three_letter_notation):
+    for match in re.finditer(r'[A-Z][a-z][a-z]', three_letter_notation):
         # add the in-between string
         result.append(three_letter_notation[last_match_end : match.start()])
         text = three_letter_notation[match.start() : match.end()]
@@ -66,7 +66,7 @@ def convert_aa_3to1(three_letter_notation: str) -> str:
         last_match_end = match.end()
 
     result.append(three_letter_notation[last_match_end:])
-    return "".join(result)
+    return ''.join(result)
 
 
 def join_url(base_url: str, *parts) -> str:
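A quick behavioural check for the notation helpers above; these are pure functions, so the sketch runs without a server:

```python
from pori_python.graphkb.util import convert_aa_3to1, looks_like_rid

# the feature prefix before ':' is preserved; AA codes map via AA_3to1_MAPPING
assert convert_aa_3to1('KRAS:p.Gly12Asp') == 'KRAS:p.G12D'

assert looks_like_rid('#39:5')
assert not looks_like_rid('KRAS')
```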
Args: @@ -151,7 +155,7 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: # don't want to use a read timeout if the request is not idempotent # otherwise you may wind up making unintended changes timeout = None - if endpoint in ["query", "parse"]: + if endpoint in ['query', 'parse']: timeout = (connect_timeout, read_timeout) start_time = datetime.now() @@ -170,7 +174,6 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: if attempt > 0: time.sleep(2) # wait between retries try: - if need_refresh_login: self.refresh_login() need_refresh_login = False @@ -180,7 +183,7 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: method, url, headers=self.headers, timeout=timeout, **kwargs ) if resp.status_code == 401 or resp.status_code == 403: - logger.debug(f"/{endpoint} - {resp.status_code} - retrying") + logger.debug(f'/{endpoint} - {resp.status_code} - retrying') # try to re-login if the token expired need_refresh_login = True continue @@ -188,14 +191,14 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: break except (requests.exceptions.ConnectionError, OSError) as err: if attempt < len(attempts) - 1: - logger.debug(f"/{endpoint} - {str(err)} - retrying") + logger.debug(f'/{endpoint} - {str(err)} - retrying') continue raise err except Exception as err2: raise err2 timing = millis_interval(start_time, datetime.now()) - logger.debug(f"/{endpoint} - {resp.status_code} - {timing} ms") # type: ignore + logger.debug(f'/{endpoint} - {resp.status_code} - {timing} ms') # type: ignore try: resp.raise_for_status() @@ -203,7 +206,7 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: # try to get more error details message = str(err) try: - message += " " + resp.json()["message"] + message += ' ' + resp.json()['message'] except Exception: pass @@ -213,7 +216,7 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: def post(self, uri: str, data: Dict = {}, **kwargs) -> Dict: """Convenience method for making post requests.""" - return self.request(uri, method="POST", data=json.dumps(data), **kwargs) + return self.request(uri, method='POST', data=json.dumps(data), **kwargs) def login_demo(self) -> None: """ @@ -223,26 +226,26 @@ def login_demo(self) -> None: 2. get a second token from the GraphKB API using keyCloakToken; self.login() """ url_parts = urlsplit(self.url) - base_url = f"{url_parts.scheme}://{url_parts.netloc}" + base_url = f'{url_parts.scheme}://{url_parts.netloc}' try: resp = requests.request( - url=f"{base_url}/auth/realms/PORI/protocol/openid-connect/token", - method="POST", + url=f'{base_url}/auth/realms/PORI/protocol/openid-connect/token', + method='POST', data={ - "client_id": "GraphKB", - "grant_type": "password", - "password": self.password, - "username": self.username, + 'client_id': 'GraphKB', + 'grant_type': 'password', + 'password': self.password, + 'username': self.username, }, ) except Exception as err: - logger.debug(f"unable to fetch a token from KeyCloak: {err}") + logger.debug(f'unable to fetch a token from KeyCloak: {err}') raise err resp.raise_for_status() content = resp.json() - self.token_kc = content["access_token"] + self.token_kc = content['access_token'] def login(self, username: str, password: str, pori_demo: bool = False) -> None: self.username = username @@ -251,7 +254,8 @@ def login(self, username: str, password: str, pori_demo: bool = False) -> None: read_timeout = 61 # KBDEV-1328. Alt. 
GraphKB login for GSC's PORI online demo - if pori_demo or "pori-demo" in self.url: + if pori_demo or 'pori-demo' in self.url: + logger.warning('using demo login') self.login_demo() # use requests package directly to avoid recursion loop on login failure @@ -262,29 +266,29 @@ try: self.request_count += 1 resp = requests.request( - url=f"{self.url}/token", - method="POST", + url=f'{self.url}/token', + method='POST', headers=self.headers, timeout=(connect_timeout, read_timeout), data=json.dumps( # KBDEV-1328. Alt. GraphKB login for GSC's PORI online demo - {"keyCloakToken": self.token_kc} + {'keyCloakToken': self.token_kc} if self.token_kc - else {"username": username, "password": password} + else {'username': username, 'password': password} ), ) break except (requests.exceptions.ConnectionError, OSError) as err: if attempt < len(attempts) - 1: - logger.debug(f"/login - {str(err)} - retrying") + logger.debug(f'/login - {str(err)} - retrying') continue raise err except Exception as err2: raise err2 resp.raise_for_status() content = resp.json() - self.token = content["kbToken"] - self.headers["Authorization"] = self.token + self.token = content['kbToken'] + self.headers['Authorization'] = self.token def refresh_login(self) -> None: self.login(self.username, self.password) @@ -306,7 +310,7 @@ def query( Query GraphKB """ result: List[Record] = [] - hash_code = "" + hash_code = '' if not ignore_cache and paginate: hash_code = cache_key(request_body) @@ -314,8 +318,8 @@ def query( return self.cache[hash_code] while True: - content = self.post("query", data={**request_body, "limit": limit, "skip": len(result)}) - records = content["result"] + content = self.post('query', data={**request_body, 'limit': limit, 'skip': len(result)}) + records = content['result'] result.extend(records) if len(records) < limit or not paginate: break @@ -326,17 +330,17 @@ def parse(self, hgvs_string: str, requireFeatures: bool = False) -> ParsedVariant: content = self.post( - "parse", data={"content": hgvs_string, "requireFeatures": requireFeatures} + 'parse', data={'content': hgvs_string, 'requireFeatures': requireFeatures} ) - return cast(ParsedVariant, content["result"]) + return cast(ParsedVariant, content['result']) def get_records_by_id(self, record_ids: List[str]) -> List[Record]: if not record_ids: return [] - result = self.query({"target": record_ids}) + result = self.query({'target': record_ids}) if len(record_ids) != len(result): raise AssertionError( - f"The number of Ids given ({len(record_ids)}) does not match the number of records fetched ({len(result)})" + f'The number of Ids given ({len(record_ids)}) does not match the number of records fetched ({len(result)})' ) return result @@ -345,9 +349,9 @@ def get_record_by_id(self, record_id: str) -> Record: return result[0] def get_source(self, name: str) -> Record: - source = self.query({"target": "Source", "filters": {"name": name}}) + source = self.query({'target': 'Source', 'filters': {'name': name}}) if len(source) != 1: - raise AssertionError(f"Unable to unqiuely identify source with name {name}") + raise AssertionError(f'Unable to uniquely identify source with name {name}') return source[0] @@ -367,27 +371,27 @@ def get_rid(conn: GraphKBConnection, target: str, name: str) -> str: AssertionError: if the term was not found or more than 1 match was found (expected to be unique) """ result = conn.query( - {"target": target, "filters": {"name": name}, "returnProperties": 
["@rid"]}, + {'target': target, 'filters': {'name': name}, 'returnProperties': ['@rid']}, ignore_cache=False, ) assert len(result) == 1, f"unable to find unique '{target}' ID for '{name}'" - return result[0]["@rid"] + return result[0]['@rid'] def stripParentheses(breakRepr: str) -> str: - match = re.search(r"^([a-z])\.\((.+)\)$", breakRepr) + match = re.search(r'^([a-z])\.\((.+)\)$', breakRepr) if match: - return f"{match.group(1)}.{match.group(2)}" + return f'{match.group(1)}.{match.group(2)}' return breakRepr def stripRefSeq(breakRepr: str) -> str: # 1 leading RefSeq - match = re.search(r"^([a-z])\.([A-Z]*|\?)([0-9]*[A-Z]*)$", breakRepr) + match = re.search(r'^([a-z])\.([A-Z]*|\?)([0-9]*[A-Z]*)$', breakRepr) if match: - return f"{match.group(1)}.{match.group(3)}" + return f'{match.group(1)}.{match.group(3)}' # TODO: Deal with cases like "p.?889_?890", "chr4:g.55593604_55593605delGGinsTT", ... @@ -395,27 +399,27 @@ def stripRefSeq(breakRepr: str) -> str: def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = True) -> str: - match = re.search(r"^(.*)(\:)(.*)$", displayName) + match = re.search(r'^(.*)(\:)(.*)$', displayName) if match and not withRef: if withRefSeq: return match.group(3) displayName = match.group(2) + match.group(3) - match = re.search(r"^(.*\:)([a-z]\.)(.*)$", displayName) + match = re.search(r'^(.*\:)([a-z]\.)(.*)$', displayName) if match and not withRefSeq: - ref: str = match.group(1) if match.group(1) != ":" else "" + ref: str = match.group(1) if match.group(1) != ':' else '' prefix: str = match.group(2) rest: str = match.group(3) new_matches: Union[bool, object] = True # refSeq before position while new_matches: - new_matches = re.search(r"(.*)([A-Z]|\?)([0-9]+)(.*)", rest) + new_matches = re.search(r'(.*)([A-Z]|\?)([0-9]+)(.*)', rest) if new_matches: rest = new_matches.group(1) + new_matches.group(3) + new_matches.group(4) # refSeq before '>' - new_matches = re.search(r"^([0-9]*)([A-Z]*|\?)(\>)(.*)$", rest) + new_matches = re.search(r'^([0-9]*)([A-Z]*|\?)(\>)(.*)$', rest) if new_matches: rest = new_matches.group(1) + new_matches.group(3) + new_matches.group(4) @@ -442,18 +446,18 @@ def stringifyVariant( str: The string representation """ - displayName: str = variant.get("displayName") or "" # type: ignore + displayName: str = variant.get('displayName') or '' # type: ignore # If variant is a PositionalVariant (i.e. variant with a displayName) and # we already have the appropriate string representation, # then return it right away - if displayName != "" and (withRef and withRefSeq): + if displayName != '' and (withRef and withRefSeq): return displayName # If variant is a PositionalVariant (i.e. variant with a displayName) and # we DO NOT have the appropriate string representation, # then strip unwanted features, then return it right away - if displayName != "": + if displayName != '': return stripDisplayName(displayName, withRef, withRefSeq) # If variant is a ParsedVariant (i.e. 
variant without a displayName yet), @@ -464,106 +468,106 @@ def stringifyVariant( result: List[str] = [] # Extracting parsed values into individual variables - break1Repr: str = str(parsed.get("break1Repr", "")) - break2Repr: str = str(parsed.get("break2Repr", "")) - multiFeature: bool = bool(parsed.get("multiFeature")) - noFeatures: bool = bool(parsed.get("noFeatures")) - notationType: str = str(parsed.get("notationType", "")) - reference1: str = "" - if ref1 := parsed.get("reference1"): + break1Repr: str = str(parsed.get('break1Repr', '')) + break2Repr: str = str(parsed.get('break2Repr', '')) + multiFeature: bool = bool(parsed.get('multiFeature')) + noFeatures: bool = bool(parsed.get('noFeatures')) + notationType: str = str(parsed.get('notationType', '')) + reference1: str = '' + if ref1 := parsed.get('reference1'): if isinstance(ref1, str): reference1 = ref1 else: - reference1 = ref1.get("displayName", str(ref1)) - reference2: str = "" - if ref2 := parsed.get("reference2"): + reference1 = ref1.get('displayName', str(ref1)) + reference2: str = '' + if ref2 := parsed.get('reference2'): if isinstance(ref2, str): reference2 = ref2 else: - reference2 = ref2.get("displayName", str(ref2)) - refSeq: str = parsed.get("refSeq") or "" - truncation: int = parsed.get("truncation") or 0 # type: ignore - variantType: str = parsed.get("type", "") - untemplatedSeq: str = parsed.get("untemplatedSeq") or "" - untemplatedSeqSize: int = parsed.get("untemplatedSeqSize") or 0 + reference2 = ref2.get('displayName', str(ref2)) + refSeq: str = parsed.get('refSeq') or '' + truncation: int = parsed.get('truncation') or 0 # type: ignore + variantType: str = parsed.get('type', '') + untemplatedSeq: str = parsed.get('untemplatedSeq') or '' + untemplatedSeqSize: int = parsed.get('untemplatedSeqSize') or 0 # formating notationType if not notationType: - notationType = TYPES_TO_NOTATION.get(variantType, re.sub(r"\s", "-", variantType)) + notationType = TYPES_TO_NOTATION.get(variantType, re.sub(r'\s', '-', variantType)) # If multiFeature - if multiFeature or (reference2 != "" and reference1 != reference2): + if multiFeature or (reference2 != '' and reference1 != reference2): if withRef and not noFeatures: - result.append(f"({reference1}:{reference2})") + result.append(f'({reference1}:{reference2})') result.append(notationType) if withRefSeq: break1Repr_noParentheses = stripParentheses(break1Repr) break2Repr_noParentheses = stripParentheses(break2Repr) - result.append(f"({break1Repr_noParentheses},{break2Repr_noParentheses})") + result.append(f'({break1Repr_noParentheses},{break2Repr_noParentheses})') else: break1Repr_noParentheses_noRefSeq = stripRefSeq(stripParentheses(break1Repr)) break2Repr_noParentheses_noRefSeq = stripRefSeq(stripParentheses(break2Repr)) result.append( - f"({break1Repr_noParentheses_noRefSeq},{break2Repr_noParentheses_noRefSeq})" + f'({break1Repr_noParentheses_noRefSeq},{break2Repr_noParentheses_noRefSeq})' ) - if untemplatedSeq != "": + if untemplatedSeq != '': result.append(untemplatedSeq) elif untemplatedSeqSize: result.append(str(untemplatedSeqSize)) - return "".join(result) + return ''.join(result) # Continuous notation... 
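    # e.g. a parsed missense like KRAS:p.G12D is reassembled piecewise below: 'KRAS:' (reference) + 'p.G12' (break repr) + 'D' (untemplatedSeq)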
# Reference if withRef and not noFeatures: - result.append(f"{reference1}:") + result.append(f'{reference1}:') # BreakRep if withRefSeq: result.append(break1Repr) - if break2Repr != "": - result.append(f"_{break2Repr[2:]}") + if break2Repr != '': + result.append(f'_{break2Repr[2:]}') else: result.append(stripRefSeq(break1Repr)) - if break2Repr != "": - result.append(f"_{stripRefSeq(break2Repr)[2:]}") + if break2Repr != '': + result.append(f'_{stripRefSeq(break2Repr)[2:]}') # refSeq, truncation, notationType, untemplatedSeq, untemplatedSeqSize - if any(i in notationType for i in ["ext", "fs"]) or ( - notationType == ">" and break1Repr.startswith("p.") + if any(i in notationType for i in ['ext', 'fs']) or ( + notationType == '>' and break1Repr.startswith('p.') ): result.append(untemplatedSeq) - if notationType == "mis" and break1Repr.startswith("p."): + if notationType == 'mis' and break1Repr.startswith('p.'): result.append(untemplatedSeq) - elif notationType != ">": - if notationType == "delins": + elif notationType != '>': + if notationType == 'delins': if withRefSeq: - result.append(f"del{refSeq}ins") + result.append(f'del{refSeq}ins') else: - result.append("delins") + result.append('delins') else: result.append(notationType) if truncation and truncation != 1: if truncation < 0: result.append(str(truncation)) else: - result.append(f"*{truncation}") - if any(i in notationType for i in ["dup", "del", "inv"]): + result.append(f'*{truncation}') + if any(i in notationType for i in ['dup', 'del', 'inv']): if withRefSeq: result.append(refSeq) - if any(i in notationType for i in ["ins", "delins"]): - if untemplatedSeq != "": + if any(i in notationType for i in ['ins', 'delins']): + if untemplatedSeq != '': result.append(untemplatedSeq) elif untemplatedSeqSize: result.append(str(untemplatedSeqSize)) - elif not break1Repr.startswith("p."): + elif not break1Repr.startswith('p.'): if withRefSeq: - refSeq = refSeq if refSeq != "" else "?" + refSeq = refSeq if refSeq != '' else '?' else: - refSeq = "" - untemplatedSeq = untemplatedSeq if untemplatedSeq != "" else "?" - result.append(f"{refSeq}{notationType}{untemplatedSeq}") + refSeq = '' + untemplatedSeq = untemplatedSeq if untemplatedSeq != '' else '?' 
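+            # substitution fallback for non-protein '>' notation, e.g. appends 'A>T' (or '?>T' when refSeq is unknown)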
+ result.append(f'{refSeq}{notationType}{untemplatedSeq}') # TODO: Deal with more complexes cases like 'MED12:p.(?34_?68)mut' - return "".join(result) + return ''.join(result) diff --git a/pori_python/graphkb/vocab.py b/pori_python/graphkb/vocab.py index 26033e75..e9242a7a 100644 --- a/pori_python/graphkb/vocab.py +++ b/pori_python/graphkb/vocab.py @@ -7,14 +7,14 @@ def query_by_name(ontology_class: str, base_term_name: str) -> Dict: - return {"target": ontology_class, "filters": {"name": base_term_name}} + return {'target': ontology_class, 'filters': {'name': base_term_name}} def get_equivalent_terms( conn: GraphKBConnection, base_term_name: str, - root_exclude_term: str = "", - ontology_class: str = "Vocabulary", + root_exclude_term: str = '', + ontology_class: str = 'Vocabulary', ignore_cache: bool = False, build_base_query: Callable = query_by_name, ) -> List[Ontology]: @@ -32,10 +32,10 @@ def get_equivalent_terms( List[Ontology], conn.query( { - "target": {"target": base_records, "queryType": "descendants"}, - "queryType": "similarTo", - "treeEdges": [], - "returnProperties": ["sourceId", "sourceIdVersion", "deprecated", "name", "@rid"], + 'target': {'target': base_records, 'queryType': 'descendants'}, + 'queryType': 'similarTo', + 'treeEdges': [], + 'returnProperties': ['sourceId', 'sourceIdVersion', 'deprecated', 'name', '@rid'], }, ignore_cache=ignore_cache, ), @@ -51,30 +51,30 @@ def get_equivalent_terms( convert_to_rid_list( conn.query( { - "target": {"target": root_records, "queryType": "descendants"}, - "queryType": "similarTo", - "treeEdges": [], - "returnProperties": [ - "sourceId", - "sourceIdVersion", - "deprecated", - "name", - "@rid", + 'target': {'target': root_records, 'queryType': 'descendants'}, + 'queryType': 'similarTo', + 'treeEdges': [], + 'returnProperties': [ + 'sourceId', + 'sourceIdVersion', + 'deprecated', + 'name', + '@rid', ], }, ignore_cache=ignore_cache, ) ) ) - return [term for term in base_term_parents if term["@rid"] not in exclude] + return [term for term in base_term_parents if term['@rid'] not in exclude] return base_term_parents def get_term_tree( conn: GraphKBConnection, base_term_name: str, - root_exclude_term: str = "", - ontology_class: str = "Vocabulary", + root_exclude_term: str = '', + ontology_class: str = 'Vocabulary', include_superclasses: bool = True, ignore_cache: bool = False, build_base_query: Callable = query_by_name, @@ -102,10 +102,10 @@ def get_term_tree( List[Ontology], conn.query( { - "target": {"target": base_records, "queryType": "ancestors"}, - "queryType": "similarTo", - "treeEdges": [], - "returnProperties": ["sourceId", "sourceIdVersion", "deprecated", "name", "@rid"], + 'target': {'target': base_records, 'queryType': 'ancestors'}, + 'queryType': 'similarTo', + 'treeEdges': [], + 'returnProperties': ['sourceId', 'sourceIdVersion', 'deprecated', 'name', '@rid'], }, ignore_cache=ignore_cache, ), @@ -126,7 +126,7 @@ def get_term_tree( terms = {} # merge the two lists for term in child_terms + parent_terms: - terms[term["@rid"]] = term + terms[term['@rid']] = term return list(terms.values()) @@ -134,7 +134,7 @@ def get_term_tree( def get_term_by_name( conn: GraphKBConnection, name: str, - ontology_class: str = "Vocabulary", + ontology_class: str = 'Vocabulary', ignore_cache: bool = False, **kwargs, ) -> Ontology: @@ -156,15 +156,15 @@ def get_term_by_name( """ result = conn.query( { - "target": ontology_class, - "filters": {"name": name}, - "returnProperties": [ - "sourceId", - "sourceIdVersion", - "deprecated", - "name", - 
"@rid", - "@class", + 'target': ontology_class, + 'filters': {'name': name}, + 'returnProperties': [ + 'sourceId', + 'sourceIdVersion', + 'deprecated', + 'name', + '@rid', + '@class', ], }, ignore_cache=ignore_cache, @@ -172,7 +172,7 @@ def get_term_by_name( ) if len(result) != 1: - raise AssertionError(f"unable to find term ({name}) by name") + raise AssertionError(f'unable to find term ({name}) by name') return cast(Ontology, result[0]) diff --git a/pori_python/ipr/annotate.py b/pori_python/ipr/annotate.py index 72ae7626..cd6478a3 100644 --- a/pori_python/ipr/annotate.py +++ b/pori_python/ipr/annotate.py @@ -43,16 +43,16 @@ def get_second_pass_variants( # second-pass matching all_inferred_matches: Dict[str, Variant] = {} inferred_variants = { - (s["subject"]["@rid"], s["relevance"]["name"]) + (s['subject']['@rid'], s['relevance']['name']) for s in statements - if s["subject"] and s["subject"]["@class"] in ("Feature", "Signature") + if s['subject'] and s['subject']['@class'] in ('Feature', 'Signature') } for reference1, variant_type in inferred_variants: variants = gkb_match.match_category_variant(graphkb_conn, reference1, variant_type) for variant in variants: - all_inferred_matches[variant["@rid"]] = variant + all_inferred_matches[variant['@rid']] = variant inferred_matches: List[Variant] = list(all_inferred_matches.values()) return inferred_matches @@ -70,7 +70,7 @@ def get_ipr_statements_from_variants( rows = [] statements = get_statements_from_variants(graphkb_conn, matches) - existing_statements = {s["@rid"] for s in statements} + existing_statements = {s['@rid'] for s in statements} for ipr_row in convert_statements_to_alterations( graphkb_conn, statements, disease_matches, convert_to_rid_set(matches) @@ -83,7 +83,7 @@ def get_ipr_statements_from_variants( inferred_statements = [ s for s in get_statements_from_variants(graphkb_conn, inferred_matches) - if s["@rid"] not in existing_statements # do not duplicate if non-inferred match + if s['@rid'] not in existing_statements # do not duplicate if non-inferred match ] for ipr_row in convert_statements_to_alterations( @@ -92,7 +92,7 @@ def get_ipr_statements_from_variants( disease_matches, convert_to_rid_set(inferred_matches), ): - ipr_row["kbData"]["inferred"] = True + ipr_row['kbData']['inferred'] = True rows.append(ipr_row) return rows @@ -118,35 +118,35 @@ def annotate_expression_variants( skipped = 0 alterations = [] problem_genes = set() - logger.info(f"Starting annotation of {len(variants)} expression category_variants") + logger.info(f'Starting annotation of {len(variants)} expression category_variants') iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - gene = row["gene"] - variant = row["variant"] + gene = row['gene'] + variant = row['variant'] if not variant: skipped += 1 - logger.debug(f"Skipping malformed Expression {gene}: {row}") + logger.debug(f'Skipping malformed Expression {gene}: {row}') continue try: matches = gkb_match.match_expression_variant(graphkb_conn, gene, variant) for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_matches): - ipr_row["variant"] = row["key"] - ipr_row["variantType"] = row.get("variantType", "exp") + ipr_row['variant'] = row['key'] + ipr_row['variantType'] = row.get('variantType', 'exp') alterations.append(ipr_row) except FeatureNotFoundError as err: problem_genes.add(gene) - logger.debug(f"Unrecognized gene ({gene} {variant}): {err}") + logger.debug(f'Unrecognized gene ({gene} {variant}): {err}') except ValueError as err: - 
logger.error(f"failed to match variants ({gene} {variant}): {err}") + logger.error(f'failed to match variants ({gene} {variant}): {err}') if skipped: - logger.info(f"skipped matching {skipped} expression information rows") + logger.info(f'skipped matching {skipped} expression information rows') if problem_genes: - logger.error(f"gene finding failures for expression {sorted(problem_genes)}") - logger.error(f"gene finding falure for {len(problem_genes)} expression genes") + logger.error(f'gene finding failures for expression {sorted(problem_genes)}') + logger.error(f'gene finding falure for {len(problem_genes)} expression genes') logger.info( - f"matched {len(variants)} expression variants to {len(alterations)} graphkb annotations" + f'matched {len(variants)} expression variants to {len(alterations)} graphkb annotations' ) return alterations @@ -172,11 +172,11 @@ def annotate_copy_variants( alterations = [] problem_genes = set() - logger.info(f"Starting annotation of {len(variants)} copy category_variants") + logger.info(f'Starting annotation of {len(variants)} copy category_variants') iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - gene = row["gene"] - variant = row["variant"] + gene = row['gene'] + variant = row['variant'] if variant not in REPORTED_COPY_VARIANTS: # https://www.bcgsc.ca/jira/browse/GERO-77 @@ -186,24 +186,24 @@ def annotate_copy_variants( try: matches = gkb_match.match_copy_variant(graphkb_conn, gene, variant) for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_matches): - ipr_row["variant"] = row["key"] - ipr_row["variantType"] = row.get("variantType", "cnv") + ipr_row['variant'] = row['key'] + ipr_row['variantType'] = row.get('variantType', 'cnv') alterations.append(ipr_row) except FeatureNotFoundError as err: problem_genes.add(gene) - logger.debug(f"Unrecognized gene ({gene} {variant}): {err}") + logger.debug(f'Unrecognized gene ({gene} {variant}): {err}') except ValueError as err: - logger.error(f"failed to match variants ({gene} {variant}): {err}") + logger.error(f'failed to match variants ({gene} {variant}): {err}') if skipped: logger.info( - f"skipped matching {skipped} copy number variants not in {REPORTED_COPY_VARIANTS}" + f'skipped matching {skipped} copy number variants not in {REPORTED_COPY_VARIANTS}' ) if problem_genes: - logger.error(f"gene finding failures for copy variants {sorted(problem_genes)}") - logger.error(f"gene finding failure for {len(problem_genes)} copy variant genes") + logger.error(f'gene finding failures for copy variants {sorted(problem_genes)}') + logger.error(f'gene finding failure for {len(problem_genes)} copy variant genes') logger.info( - f"matched {len(variants)} copy category variants to {len(alterations)} graphkb annotations" + f'matched {len(variants)} copy category variants to {len(alterations)} graphkb annotations' ) return alterations @@ -226,14 +226,14 @@ def annotate_positional_variants( Returns: Hashable list of kbMatches records for IPR """ - VARIANT_KEYS = ("variant", "hgvsProtein", "hgvsCds", "hgvsGenomic") + VARIANT_KEYS = ('variant', 'hgvsProtein', 'hgvsCds', 'hgvsGenomic') errors = 0 alterations: List[Hashabledict] = [] problem_genes = set() iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - if not row.get("gene") and (not row.get("gene1") or not row.get("gene2")): + if not row.get('gene') and (not row.get('gene1') or not row.get('gene2')): # 
https://www.bcgsc.ca/jira/browse/GERO-56?focusedCommentId=1234791&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-1234791 # should not match single gene SVs continue @@ -250,15 +250,15 @@ def annotate_positional_variants( # DEVSU-1885 - fix malformed single deletion described as substitution of blank # eg. deletion described as substitution with nothing: 'chr1:g.150951027T>' if ( - variant[-1] == ">" - and "g." in variant + variant[-1] == '>' + and 'g.' in variant and variant[-2].isalpha() and variant[-3].isnumeric() ): logger.warning( - f"Assuming malformed deletion variant {variant} is {variant[:-2] + 'del'}" + f'Assuming malformed deletion variant {variant} is {variant[:-2] + "del"}' ) - variant = variant[:-2] + "del" + variant = variant[:-2] + 'del' matches = gkb_match.match_positional_variant(graphkb_conn, variant) else: raise parse_err @@ -268,42 +268,42 @@ def annotate_positional_variants( matches, disease_matches, ): - ipr_row["variant"] = row["key"] - ipr_row["variantType"] = row.get( - "variantType", "mut" if row.get("gene") else "sv" + ipr_row['variant'] = row['key'] + ipr_row['variantType'] = row.get( + 'variantType', 'mut' if row.get('gene') else 'sv' ) alterations.append(Hashabledict(ipr_row)) except FeatureNotFoundError as err: - logger.debug(f"failed to match positional variants ({variant}): {err}") + logger.debug(f'failed to match positional variants ({variant}): {err}') errors += 1 - if "gene" in row: - problem_genes.add(row["gene"]) - elif "gene1" in row and f"({row['gene1']})" in str(err): - problem_genes.add(row["gene1"]) - elif "gene2" in row and f"({row['gene2']})" in str(err): - problem_genes.add(row["gene2"]) - elif "gene1" in row and "gene2" in row: - problem_genes.add(row["gene1"]) - problem_genes.add(row["gene2"]) + if 'gene' in row: + problem_genes.add(row['gene']) + elif 'gene1' in row and f'({row["gene1"]})' in str(err): + problem_genes.add(row['gene1']) + elif 'gene2' in row and f'({row["gene2"]})' in str(err): + problem_genes.add(row['gene2']) + elif 'gene1' in row and 'gene2' in row: + problem_genes.add(row['gene1']) + problem_genes.add(row['gene2']) else: raise err except HTTPError as err: errors += 1 - logger.error(f"failed to match positional variants ({variant}): {err}") + logger.error(f'failed to match positional variants ({variant}): {err}') if problem_genes: - logger.error(f"gene finding failures for {sorted(problem_genes)}") - logger.error(f"{len(problem_genes)} gene finding failures for positional variants") + logger.error(f'gene finding failures for {sorted(problem_genes)}') + logger.error(f'{len(problem_genes)} gene finding failures for positional variants') if errors: - logger.error(f"skipped {errors} positional variants due to errors") + logger.error(f'skipped {errors} positional variants due to errors') # drop duplicates alterations = list(set(alterations)) - variant_types = ", ".join(sorted(set([alt["variantType"] for alt in alterations]))) + variant_types = ', '.join(sorted(set([alt['variantType'] for alt in alterations]))) logger.info( - f"matched {len(variants)} {variant_types} positional variants to {len(alterations)} graphkb annotations" + f'matched {len(variants)} {variant_types} positional variants to {len(alterations)} graphkb annotations' ) return alterations @@ -336,30 +336,30 @@ def annotate_signature_variants( # Matching signature variant to GKB Variants matched_variants: List[Variant] = gkb_match.match_category_variant( graphkb_conn, - variant["signatureName"], - variant["variantTypeName"], - 
reference_class="Signature", + variant['signatureName'], + variant['variantTypeName'], + reference_class='Signature', ) # KBDEV-1246 # Keep support for 'high mutation burden' until statement datafix if ( - variant["signatureName"] == TMB_SIGNATURE - and TMB_SIGNATURE != "high mutation burden" + variant['signatureName'] == TMB_SIGNATURE + and TMB_SIGNATURE != 'high mutation burden' ): matched_variants.extend( gkb_match.match_category_variant( graphkb_conn, - "high mutation burden", - variant["variantTypeName"], - reference_class="Signature", + 'high mutation burden', + variant['variantTypeName'], + reference_class='Signature', ) ) # Matching GKB Variants to GKB Statements for ipr_row in get_ipr_statements_from_variants( graphkb_conn, matched_variants, disease_matches ): - ipr_row["variant"] = variant["key"] - ipr_row["variantType"] = "sigv" + ipr_row['variant'] = variant['key'] + ipr_row['variantType'] = 'sigv' alterations.append(Hashabledict(ipr_row)) except ValueError as err: @@ -369,7 +369,7 @@ def annotate_signature_variants( alterations = list(set(alterations)) logger.info( - f"matched {len(variants)} signature category variants to {len(alterations)} graphkb annotations" + f'matched {len(variants)} signature category variants to {len(alterations)} graphkb annotations' ) return alterations @@ -401,25 +401,25 @@ def annotate_variants( gkb_matches: List[Hashabledict] = [] # MATCHING SIGNATURE CATEGORY VARIANTS - logger.info(f"annotating {len(signature_variants)} signatures") + logger.info(f'annotating {len(signature_variants)} signatures') gkb_matches.extend( annotate_signature_variants( graphkb_conn, disease_matches, signature_variants, show_progress=interactive ) ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') # MATCHING SMALL MUTATIONS - logger.info(f"annotating {len(small_mutations)} small mutations") + logger.info(f'annotating {len(small_mutations)} small mutations') gkb_matches.extend( annotate_positional_variants( graphkb_conn, small_mutations, disease_matches, show_progress=interactive ) ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') # MATCHING STRUCTURAL VARIANTS - logger.info(f"annotating {len(structural_variants)} structural variants") + logger.info(f'annotating {len(structural_variants)} structural variants') gkb_matches.extend( annotate_positional_variants( graphkb_conn, @@ -428,10 +428,10 @@ def annotate_variants( show_progress=interactive, ) ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') # MATCHING COPY VARIANTS - logger.info(f"annotating {len(copy_variants)} copy variants") + logger.info(f'annotating {len(copy_variants)} copy variants') gkb_matches.extend( [ Hashabledict(copy_var) @@ -440,10 +440,10 @@ def annotate_variants( ) ] ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') # MATCHING EXPRESSION VARIANTS - logger.info(f"annotating {len(expression_variants)} expression variants") + logger.info(f'annotating {len(expression_variants)} expression variants') gkb_matches.extend( [ Hashabledict(exp_var) @@ -455,6 +455,6 @@ def annotate_variants( ) ] ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') return gkb_matches diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py index f2652fdd..70eaf26c 100644 --- a/pori_python/ipr/connection.py +++ 
b/pori_python/ipr/connection.py @@ -6,7 +6,6 @@ import zlib from typing import Dict, List -from .constants import DEFAULT_URL from .util import logger IMAGE_MAX = 20 # cannot upload more than 20 images at a time @@ -17,21 +16,21 @@ def __init__( self, username: str, password: str, - url: str = os.environ.get("IPR_URL", DEFAULT_URL), + url: str = os.environ.get('IPR_URL'), ): self.token = None self.url = url self.username = username self.password = password self.headers = { - "Accept": "application/json", - "Content-Type": "application/json", - "Content-Encoding": "deflate", + 'Accept': 'application/json', + 'Content-Type': 'application/json', + 'Content-Encoding': 'deflate', } self.cache: Dict[str, List[Dict]] = {} self.request_count = 0 - def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: + def request(self, endpoint: str, method: str = 'GET', **kwargs) -> Dict: """Request wrapper to handle adding common headers and logging Args: @@ -41,9 +40,9 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: Returns: dict: the json response as a python dict """ - url = f"{self.url}/{endpoint}" + url = f'{self.url}/{endpoint}' self.request_count += 1 - kwargs_header = kwargs.pop("headers", None) + kwargs_header = kwargs.pop('headers', None) if kwargs_header: headers = json.loads(kwargs_header) else: @@ -57,21 +56,21 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: # try to get more error details message = str(err) try: - message += " " + resp.json()["error"]["message"] + message += ' ' + resp.json()['error']['message'] except Exception: pass raise requests.exceptions.HTTPError(message) if resp.status_code == 204: # TODO: address this in api - return {"status_code": 204} + return {'status_code': 204} return resp.json() def post(self, uri: str, data: Dict = {}, **kwargs) -> Dict: """Convenience method for making post requests""" return self.request( uri, - method="POST", - data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), + method='POST', + data=zlib.compress(json.dumps(data, allow_nan=False).encode('utf-8')), **kwargs, ) @@ -79,8 +78,8 @@ def get(self, uri: str, data: Dict = {}, **kwargs) -> Dict: """Convenience method for making get requests""" return self.request( uri, - method="GET", - data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), + method='GET', + data=zlib.compress(json.dumps(data, allow_nan=False).encode('utf-8')), **kwargs, ) @@ -88,9 +87,9 @@ def delete(self, uri: str, data: Dict = {}, **kwargs) -> Dict: """Convenience method for making delete requests""" return self.request( uri, - method="DELETE", - data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), - headers=json.dumps({"Accept": "*/*"}), + method='DELETE', + data=zlib.compress(json.dumps(data, allow_nan=False).encode('utf-8')), + headers=json.dumps({'Accept': '*/*'}), **kwargs, ) @@ -106,83 +105,83 @@ def upload_report( # or 'report'. jobStatus is no longer available once the report is successfully # uploaded. 
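        # rough flow below: make sure the named project exists, POST the content to reports-async, then poll reports-async/{ident} until the response carries a 'report' field or its jobStatus signals failure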
- projects = self.get("project") - project_names = [item["name"] for item in projects] + projects = self.get('project') + project_names = [item['name'] for item in projects] # if project is not exist, create one - if content["project"] not in project_names: + if content['project'] not in project_names: logger.info( - f"Project not found - attempting to create project {content['project']}" + f'Project not found - attempting to create project {content["project"]}' ) try: - self.post("project", {"name": content["project"]}) + self.post('project', {'name': content['project']}) except Exception as err: - raise Exception(f"Project creation failed due to {err}") + raise Exception(f'Project creation failed due to {err}') if ignore_extra_fields: - initial_result = self.post("reports-async?ignore_extra_fields=true", content) + initial_result = self.post('reports-async?ignore_extra_fields=true', content) else: - initial_result = self.post("reports-async", content) + initial_result = self.post('reports-async', content) - report_id = initial_result["ident"] + report_id = initial_result['ident'] def check_status_result(result): - if result.get("report", False): - return "upload complete" - if result.get("jobStatus", False) and result["jobStatus"].get("state", False): - return result["jobStatus"]["state"] + if result.get('report', False): + return 'upload complete' + if result.get('jobStatus', False) and result['jobStatus'].get('state', False): + return result['jobStatus']['state'] raise Exception( - "async report get returned with no report or jobStatus, or unexpected jobStatus type" + 'async report get returned with no report or jobStatus, or unexpected jobStatus type' ) def check_status(interval: int = 5, num_attempts: int = 5): for i in range(num_attempts): - logger.info(f"checking report loading status in {interval} seconds") + logger.info(f'checking report loading status in {interval} seconds') time.sleep(interval) - current_status = self.get(f"reports-async/{report_id}") + current_status = self.get(f'reports-async/{report_id}') check_result = check_status_result(current_status) - if check_result == "upload complete": + if check_result == 'upload complete': return current_status - if check_result == "failed": + if check_result == 'failed': raise Exception( - f"async report upload failed with reason: {current_status.get('jobStatus', {}).get('failedReason', 'Unknown')}" + f'async report upload failed with reason: {current_status.get("jobStatus", {}).get("failedReason", "Unknown")}' ) if check_result not in [ - "active", - "ready", - "waiting", - "completed", + 'active', + 'ready', + 'waiting', + 'completed', ]: - raise Exception(f"async report upload in unexpected state: {check_result}") + raise Exception(f'async report upload in unexpected state: {check_result}') return current_status current_status = check_status() check_result = check_status_result(current_status) - if check_result in ["active", "waiting"]: + if check_result in ['active', 'waiting']: current_status = check_status(interval=30) check_result = check_status_result(current_status) - if check_result in ["active", "waiting"]: + if check_result in ['active', 'waiting']: current_status = check_status(interval=60, num_attempts=mins_to_wait) check_result = check_status_result(current_status) - if check_result in ["active", "waiting"]: + if check_result in ['active', 'waiting']: raise Exception( - f"async report upload taking longer than expected: {current_status}" + f'async report upload taking longer than expected: {current_status}' ) 
return current_status else: if ignore_extra_fields: - return self.post("reports?ignore_extra_fields=true", content) + return self.post('reports?ignore_extra_fields=true', content) else: - return self.post("reports", content) + return self.post('reports', content) def set_analyst_comments(self, report_id: str, data: Dict) -> Dict: """ @@ -193,9 +192,9 @@ def set_analyst_comments(self, report_id: str, data: Dict) -> Dict: Pending: https://www.bcgsc.ca/jira/browse/DEVSU-1177 """ return self.request( - f"/reports/{report_id}/summary/analyst-comments", - method="PUT", - data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), + f'/reports/{report_id}/summary/analyst-comments', + method='PUT', + data=zlib.compress(json.dumps(data, allow_nan=False).encode('utf-8')), ) def post_images(self, report_id: str, files: Dict[str, str], data: Dict[str, str] = {}) -> None: @@ -212,18 +211,18 @@ def post_images(self, report_id: str, files: Dict[str, str], data: Dict[str, str if not os.path.exists(path): raise FileNotFoundError(path) current_files[key] = path - open_files = {k: open(f, "rb") for (k, f) in current_files.items()} + open_files = {k: open(f, 'rb') for (k, f) in current_files.items()} try: resp = self.request( - f"reports/{report_id}/image", - method="POST", + f'reports/{report_id}/image', + method='POST', data=data, files=open_files, headers=json.dumps({}), ) for status in resp: - if status.get("upload") != "successful": - image_errors.add(status["key"]) + if status.get('upload') != 'successful': + image_errors.add(status['key']) finally: for handler in open_files.values(): handler.close() @@ -235,12 +234,12 @@ def get_spec(self) -> Dict: """ Get the current IPR spec, for the purposes of current report upload fields """ - return self.request("/spec.json", method="GET") + return self.request('/spec.json', method='GET') def validate_json(self, content: Dict) -> Dict: """ Validate the provided json schema """ - result = self.post("reports/schema", content) - logger.info(f"{result['message']}") + result = self.post('reports/schema', content) + logger.info(f'{result["message"]}') return result diff --git a/pori_python/ipr/constants.py b/pori_python/ipr/constants.py index 6f3958c3..35c2a547 100644 --- a/pori_python/ipr/constants.py +++ b/pori_python/ipr/constants.py @@ -1,28 +1,40 @@ -DEFAULT_URL = "https://iprstaging-api.bcgsc.ca/api" -GERMLINE_BASE_TERMS = ("pharmacogenomic", "cancer predisposition") # based on graphkb.constants -VARIANT_CLASSES = {"Variant", "CategoryVariant", "PositionalVariant", "CatalogueVariant"} +GERMLINE_BASE_TERMS = ('pharmacogenomic', 'cancer predisposition') # based on graphkb.constants +VARIANT_CLASSES = {'Variant', 'CategoryVariant', 'PositionalVariant', 'CatalogueVariant'} # all possible values for review status are: ['pending', 'not required', 'passed', 'failed', 'initial'] -FAILED_REVIEW_STATUS = "failed" +FAILED_REVIEW_STATUS = 'failed' # Signatures -COSMIC_SIGNATURE_VARIANT_TYPE = "high signature" -HLA_SIGNATURE_VARIANT_TYPE = "signature present" -TMB_SIGNATURE = "mutation burden" +COSMIC_SIGNATURE_VARIANT_TYPE = 'high signature' +HLA_SIGNATURE_VARIANT_TYPE = 'signature present' +TMB_SIGNATURE = 'mutation burden' TMB_SIGNATURE_HIGH_THRESHOLD = ( 10.0 # genomic mutations per mb - https://www.bcgsc.ca/jira/browse/GERO-296 ) -TMB_SIGNATURE_VARIANT_TYPE = "high signature" +TMB_SIGNATURE_VARIANT_TYPE = 'high signature' # Mapping micro-satellite from pipeline terms to GraphKB terms MSI_MAPPING = { - "microsatellite instability": { # MSI - "displayName": 
"microsatellite instability high signature", - "signatureName": "microsatellite instability", - "variantTypeName": "high signature", + 'microsatellite instability': { # MSI + 'displayName': 'microsatellite instability high signature', + 'signatureName': 'microsatellite instability', + 'variantTypeName': 'high signature', }, - "microsatellite stable": { # MSS - "displayName": "microsatellite stable signature present", - "signatureName": "microsatellite stable", - "variantTypeName": "signature present", + 'microsatellite stable': { # MSS + 'displayName': 'microsatellite stable signature present', + 'signatureName': 'microsatellite stable', + 'variantTypeName': 'signature present', + }, +} +# Mapping hrd from pipeline terms to GraphKB terms +HRD_MAPPING = { + 'homologous recombination deficiency strong signature': { + 'displayName': 'homologous recombination deficiency strong signature', + 'signatureName': 'homologous recombination deficiency', + 'variantTypeName': 'strong signature', + }, + 'homologous recombination deficiency moderate signature': { + 'displayName': 'homologous recombination deficiency moderate signature', + 'signatureName': 'homologous recombination deficiency', + 'variantTypeName': 'moderate signature', }, } diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json index c2df68e9..a9790129 100644 --- a/pori_python/ipr/content.spec.json +++ b/pori_python/ipr/content.spec.json @@ -541,6 +541,20 @@ }, "type": "array" }, + "hrd": { + "properties": { + "kbCategory": { + "type": "string" + }, + "score": { + "type": "number" + } + }, + "required": [ + "score" + ], + "type": "object" + }, "images": { "items": { "example": { diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index dcff908b..01976603 100644 --- a/pori_python/ipr/inputs.py +++ b/pori_python/ipr/inputs.py @@ -9,7 +9,6 @@ import os import pandas as pd import re -from Bio.Data.IUPACData import protein_letters_3to1 from numpy import nan from typing import Any, Callable, Dict, Iterable, List, Set, Tuple, cast @@ -25,149 +24,149 @@ from .constants import ( COSMIC_SIGNATURE_VARIANT_TYPE, - DEFAULT_URL, HLA_SIGNATURE_VARIANT_TYPE, MSI_MAPPING, + HRD_MAPPING, TMB_SIGNATURE, TMB_SIGNATURE_VARIANT_TYPE, ) -from .util import hash_key, logger, pandas_falsy +from .util import hash_key, logger, pandas_falsy, protein_letters_3to1 -protein_letters_3to1.setdefault("Ter", "*") +protein_letters_3to1.setdefault('Ter', '*') -SPECIFICATION = os.path.join(os.path.dirname(__file__), "content.spec.json") +SPECIFICATION = os.path.join(os.path.dirname(__file__), 'content.spec.json') # content in the local specification should match the values in IPR_API_SPEC_JSON_URL -IPR_API_SPEC_JSON_URL = f'{os.environ.get("IPR_URL", DEFAULT_URL)}/spec.json' +IPR_API_SPEC_JSON_URL = f'{os.environ.get("IPR_URL")}/spec.json' # TODO: GERO-307 - use SPECIFICATION json to derive the variant required and optional details defined below # 'cnvState' is for display -COPY_REQ = ["gene", "kbCategory"] -COPY_KEY = ["gene"] +COPY_REQ = ['gene', 'kbCategory'] +COPY_KEY = ['gene'] COPY_OPTIONAL = [ - "cnvState", - "copyChange", - "lohState", # Loss of Heterzygosity state - informative detail to analyst - "chromosomeBand", - "chromosome", - "chr", # expect only one of chromosome or chr - "start", - "end", - "size", - "log2Cna", - "cna", - "comments", - "library", - "germline", + 'cnvState', + 'copyChange', + 'lohState', # Loss of Heterzygosity state - informative detail to analyst + 'chromosomeBand', + 'chromosome', + 'chr', # expect 
only one of chromosome or chr + 'start', + 'end', + 'size', + 'log2Cna', + 'cna', + 'comments', + 'library', + 'germline', ] -SMALL_MUT_REQ = ["gene", "proteinChange"] +SMALL_MUT_REQ = ['gene', 'proteinChange'] # alternate details in the key, can distinguish / subtype events. SMALL_MUT_KEY = SMALL_MUT_REQ + [ - "altSeq", - "chromosome", - "endPosition", - "refSeq", - "startPosition", - "transcript", + 'altSeq', + 'chromosome', + 'endPosition', + 'refSeq', + 'startPosition', + 'transcript', ] SMALL_MUT_OPTIONAL = [ - "altSeq", - "comments", - "chromosome", - "endPosition", - "germline", - "hgvsCds", - "hgvsGenomic", - "hgvsProtein", - "library", - "ncbiBuild", - "normalAltCount", - "normalDepth", - "normalRefCount", - "refSeq", - "rnaAltCount", - "rnaDepth", - "rnaRefCount", - "startPosition", - "transcript", - "tumourAltCount", - "tumourAltCopies", - "tumourDepth", - "tumourRefCount", - "tumourRefCopies", - "zygosity", + 'altSeq', + 'comments', + 'chromosome', + 'endPosition', + 'germline', + 'hgvsCds', + 'hgvsGenomic', + 'hgvsProtein', + 'library', + 'ncbiBuild', + 'normalAltCount', + 'normalDepth', + 'normalRefCount', + 'refSeq', + 'rnaAltCount', + 'rnaDepth', + 'rnaRefCount', + 'startPosition', + 'transcript', + 'tumourAltCount', + 'tumourAltCopies', + 'tumourDepth', + 'tumourRefCount', + 'tumourRefCopies', + 'zygosity', ] -EXP_REQ = ["gene", "kbCategory"] -EXP_KEY = ["gene"] +EXP_REQ = ['gene', 'kbCategory'] +EXP_KEY = ['gene'] EXP_OPTIONAL = [ - "biopsySiteFoldChange", - "biopsySitePercentile", - "biopsySiteQC", - "biopsySiteZScore", - "biopsySitekIQR", - "comments", - "diseaseFoldChange", - "diseasekIQR", - "diseasePercentile", - "diseaseQC", - "diseaseZScore", - "expressionState", - "histogramImage", - "library", - "primarySiteFoldChange", - "primarySitekIQR", - "primarySitePercentile", - "primarySiteQC", - "primarySiteZScore", - "internalPancancerFoldChange", - "internalPancancerkIQR", - "internalPancancerPercentile", - "internalPancancerQC", - "internalPancancerZScore", - "rnaReads", - "rpkm", - "tpm", + 'biopsySiteFoldChange', + 'biopsySitePercentile', + 'biopsySiteQC', + 'biopsySiteZScore', + 'biopsySitekIQR', + 'comments', + 'diseaseFoldChange', + 'diseasekIQR', + 'diseasePercentile', + 'diseaseQC', + 'diseaseZScore', + 'expressionState', + 'histogramImage', + 'library', + 'primarySiteFoldChange', + 'primarySitekIQR', + 'primarySitePercentile', + 'primarySiteQC', + 'primarySiteZScore', + 'internalPancancerFoldChange', + 'internalPancancerkIQR', + 'internalPancancerPercentile', + 'internalPancancerQC', + 'internalPancancerZScore', + 'rnaReads', + 'rpkm', + 'tpm', ] SV_REQ = [ - "eventType", - "breakpoint", - "gene1", # prev: nterm_hugo - "gene2", # prev: cterm_hugo - "exon1", # n-terminal - "exon2", # c-terminal + 'eventType', + 'breakpoint', + 'gene1', # prev: nterm_hugo + 'gene2', # prev: cterm_hugo + 'exon1', # n-terminal + 'exon2', # c-terminal ] SV_KEY = SV_REQ[:] SV_OPTIONAL = [ - "ctermTranscript", - "ntermTranscript", - "ctermGene", # combined hugo ensembl form - "ntermGene", # combined hugo ensembl form - "detectedIn", - "conventionalName", - "svg", - "svgTitle", - "name", - "frame", - "omicSupport", - "highQuality", - "comments", - "library", - "rnaAltCount", - "rnaDepth", - "tumourAltCount", - "tumourDepth", - "germline", - "mavis_product_id", + 'ctermTranscript', + 'ntermTranscript', + 'ctermGene', # combined hugo ensembl form + 'ntermGene', # combined hugo ensembl form + 'detectedIn', + 'conventionalName', + 'svg', + 'svgTitle', + 'name', + 'frame', + 'omicSupport', 
+ 'highQuality', + 'comments', + 'library', + 'rnaAltCount', + 'rnaDepth', + 'tumourAltCount', + 'tumourDepth', + 'germline', + 'mavis_product_id', ] -SIGV_REQ = ["signatureName", "variantTypeName"] -SIGV_COSMIC = ["signature"] # 1st element used as signatureName key -SIGV_HLA = ["a1", "a2", "b1", "b2", "c1", "c2"] -SIGV_OPTIONAL = ["displayName"] +SIGV_REQ = ['signatureName', 'variantTypeName'] +SIGV_COSMIC = ['signature'] # 1st element used as signatureName key +SIGV_HLA = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2'] +SIGV_OPTIONAL = ['displayName'] SIGV_KEY = SIGV_REQ[:] @@ -192,7 +191,7 @@ def validate_variant_rows( Returns: the rows from the tab file as dictionaries """ - header = required + optional + ["key"] + header = required + optional + ['key'] result = [] keys = set() @@ -202,18 +201,18 @@ def validate_variant_rows( if not header_validated: for req_col in required: if req_col not in row: - raise ValueError(f"header missing required column ({req_col})") + raise ValueError(f'header missing required column ({req_col})') header_validated = True row_key = hash_key(row_to_key(row)) if row_key in keys: - raise ValueError(f"duplicate row key ({row_key}) from ({row_to_key(row)})") - row["key"] = row_key + raise ValueError(f'duplicate row key ({row_key}) from ({row_to_key(row)})') + row['key'] = row_key keys.add(row_key) for k, v in row.items(): if v is pd.NA: - row[k] = "" + row[k] = '' - result.append(cast(IprVariant, {col: row.get(col, "") for col in header})) + result.append(cast(IprVariant, {col: row.get(col, '') for col in header})) return result @@ -225,43 +224,42 @@ def preprocess_copy_variants(rows: Iterable[Dict]) -> List[IprCopyVariant]: """ # default map for display - concise names display_name_mapping = { - INPUT_COPY_CATEGORIES.DEEP: "deep deletion", - INPUT_COPY_CATEGORIES.AMP: "amplification", - INPUT_COPY_CATEGORIES.GAIN: "copy gain", - INPUT_COPY_CATEGORIES.LOSS: "copy loss", + INPUT_COPY_CATEGORIES.DEEP: 'deep deletion', + INPUT_COPY_CATEGORIES.AMP: 'amplification', + INPUT_COPY_CATEGORIES.GAIN: 'copy gain', + INPUT_COPY_CATEGORIES.LOSS: 'copy loss', } display_name_mapping.update(dict([(v, v) for v in display_name_mapping.values()])) def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(["cnv"] + [row[key] for key in COPY_KEY]) + return tuple(['cnv'] + [row[key] for key in COPY_KEY]) result = validate_variant_rows(rows, COPY_REQ, COPY_OPTIONAL, row_key) ret_list = [cast(IprCopyVariant, var) for var in result] for row in ret_list: - - kb_cat = row.get("kbCategory") - kb_cat = "" if pd.isnull(kb_cat) else str(kb_cat) + kb_cat = row.get('kbCategory') + kb_cat = '' if pd.isnull(kb_cat) else str(kb_cat) if kb_cat: if kb_cat not in INPUT_COPY_CATEGORIES.values(): - raise ValueError(f"invalid copy variant kbCategory value ({kb_cat})") - if not row.get("cnvState"): # apply default short display name - row["cnvState"] = display_name_mapping[kb_cat] - row["variant"] = kb_cat - row["variantType"] = "cnv" - chrband = row.get("chromosomeBand", False) - chrom = row.pop("chromosome", False) + raise ValueError(f'invalid copy variant kbCategory value ({kb_cat})') + if not row.get('cnvState'): # apply default short display name + row['cnvState'] = display_name_mapping[kb_cat] + row['variant'] = kb_cat + row['variantType'] = 'cnv' + chrband = row.get('chromosomeBand', False) + chrom = row.pop('chromosome', False) if not chrom: - chrom = row.pop("chr", False) + chrom = row.pop('chr', False) # remove chr if it was not used for chrom - row.pop("chr", False) + row.pop('chr', False) if chrom: # 
check that chr isn't already in the chrband; # this regex from https://vrs.ga4gh.org/en/1.2/terms_and_model.html#id25 - if chrband and (re.match(r"^cen|[pq](ter|([1-9][0-9]*(\.[1-9][0-9]*)?))$", chrband)): + if chrband and (re.match(r'^cen|[pq](ter|([1-9][0-9]*(\.[1-9][0-9]*)?))$', chrband)): if isinstance(chrom, int): chrom = str(chrom) - chrom = chrom.strip("chr") - row["chromosomeBand"] = chrom + row["chromosomeBand"] + chrom = chrom.strip('chr') + row['chromosomeBand'] = chrom + row['chromosomeBand'] return ret_list @@ -274,28 +272,28 @@ def preprocess_small_mutations(rows: Iterable[Dict]) -> List[IprSmallMutationVar def row_key(row: IprSmallMutationVariant) -> Tuple[str, ...]: key_vals = [] - for kval in [row.get(key, "") for key in SMALL_MUT_KEY]: - key_vals.append(str(kval) if pd.notnull(kval) else "") - return tuple(["small mutation"] + key_vals) + for kval in [row.get(key, '') for key in SMALL_MUT_KEY]: + key_vals.append(str(kval) if pd.notnull(kval) else '') + return tuple(['small mutation'] + key_vals) result = validate_variant_rows(rows, SMALL_MUT_REQ, SMALL_MUT_OPTIONAL, row_key) if not result: return [] def pick_variant(row: IprSmallMutationVariant) -> str: - protein_change = row.get("proteinChange") + protein_change = row.get('proteinChange') if not pandas_falsy(protein_change): for longAA, shortAA in protein_letters_3to1.items(): protein_change = str(protein_change).replace(longAA, shortAA) - hgvsp = "{}:{}".format(row["gene"], protein_change) + hgvsp = '{}:{}'.format(row['gene'], protein_change) return hgvsp - for field in ["hgvsProtein", "hgvsCds", "hgvsGenomic"]: + for field in ['hgvsProtein', 'hgvsCds', 'hgvsGenomic']: if not pandas_falsy(row.get(field)): return str(row.get(field)) raise ValueError( - "Variant field cannot be empty. Must include proteinChange or one of the hgvs fields (hgvsProtein, hgvsCds, hgvsGenomic) to build the variant string" + 'Variant field cannot be empty. 
Must include proteinChange or one of the hgvs fields (hgvsProtein, hgvsCds, hgvsGenomic) to build the variant string' ) # 'location' and 'refAlt' are not currently used for matching; still optional and allowed blank @@ -304,21 +302,21 @@ def pick_variant(row: IprSmallMutationVariant) -> str: # for row in result: def convert_sm(row: IprVariant) -> IprSmallMutationVariant: ret = cast(IprSmallMutationVariant, row) - ret["variant"] = pick_variant(ret) - ret["variantType"] = "mut" + ret['variant'] = pick_variant(ret) + ret['variantType'] = 'mut' - if ret.get("startPosition") and not ret.get("endPosition"): - ret["endPosition"] = ret["startPosition"] + if ret.get('startPosition') and not ret.get('endPosition'): + ret['endPosition'] = ret['startPosition'] # default depth to alt + ref if not given - for sample_type in ("normal", "rna", "tumour"): + for sample_type in ('normal', 'rna', 'tumour'): if ( - ret.get(f"{sample_type}RefCount") - and ret.get(f"{sample_type}AltCount") - and not ret.get(f"{sample_type}Depth") + ret.get(f'{sample_type}RefCount') + and ret.get(f'{sample_type}AltCount') + and not ret.get(f'{sample_type}Depth') ): - ret[f"{sample_type}Depth"] = ( # type: ignore - ret[f"{sample_type}RefCount"] + ret[f"{sample_type}AltCount"] # type: ignore + ret[f'{sample_type}Depth'] = ( # type: ignore + ret[f'{sample_type}RefCount'] + ret[f'{sample_type}AltCount'] # type: ignore ) return ret @@ -334,65 +332,65 @@ def preprocess_expression_variants(rows: Iterable[Dict]) -> List[IprExprVariant] """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(["expression"] + [row[key] for key in EXP_KEY]) + return tuple(['expression'] + [row[key] for key in EXP_KEY]) variants = validate_variant_rows(rows, EXP_REQ, EXP_OPTIONAL, row_key) result = [cast(IprExprVariant, var) for var in variants] float_columns = [ col for col in EXP_REQ + EXP_OPTIONAL - if col.endswith("kIQR") - or col.endswith("Percentile") - or col.endswith("FoldChange") - or col.endswith("QC") - or col.endswith("ZScore") - or col in ["tpm", "rpkm"] + if col.endswith('kIQR') + or col.endswith('Percentile') + or col.endswith('FoldChange') + or col.endswith('QC') + or col.endswith('ZScore') + or col in ['tpm', 'rpkm'] ] errors = [] for row in result: - row["variant"] = row["kbCategory"] - if not row["expressionState"] and row["kbCategory"]: - row["expressionState"] = row["kbCategory"] + row['variant'] = row['kbCategory'] + if not row['expressionState'] and row['kbCategory']: + row['expressionState'] = row['kbCategory'] - if row["variant"] and not pd.isnull(row["variant"]): - if row["variant"] not in INPUT_EXPRESSION_CATEGORIES.values(): + if row['variant'] and not pd.isnull(row['variant']): + if row['variant'] not in INPUT_EXPRESSION_CATEGORIES.values(): err_msg = f"{row['gene']} variant '{row['variant']}' not in {INPUT_EXPRESSION_CATEGORIES.values()}" errors.append(err_msg) logger.error(err_msg) - row["variantType"] = "exp" + row['variantType'] = 'exp' for col in float_columns: - if row.get(col) in ["inf", "+inf", "-inf"]: - row[col] = row[col].replace("inf", "Infinity") # type: ignore + if row.get(col) in ['inf', '+inf', '-inf']: + row[col] = row[col].replace('inf', 'Infinity') # type: ignore # check images exist - if row["histogramImage"] and not os.path.exists(row["histogramImage"]): + if row['histogramImage'] and not os.path.exists(row['histogramImage']): raise FileNotFoundError(f'missing image ({row["histogramImage"]})') if errors: - raise ValueError(f"{len(errors)} Invalid expression variants in file") + raise 
ValueError(f'{len(errors)} Invalid expression variants in file') return result def create_graphkb_sv_notation(row: IprFusionVariant) -> str: """Generate GKB/IPR fusion style notation from a structural variant.""" - gene1 = row["gene1"] or "?" - gene2 = row["gene2"] or "?" - exon1 = str(row["exon1"]) if row["exon1"] else "?" - exon2 = str(row["exon2"]) if row["exon2"] else "?" - if not row["gene1"]: + gene1 = row['gene1'] or '?' + gene2 = row['gene2'] or '?' + exon1 = str(row['exon1']) if row['exon1'] else '?' + exon2 = str(row['exon2']) if row['exon2'] else '?' + if not row['gene1']: gene1, gene2 = gene2, gene1 exon1, exon2 = exon2, exon1 - if gene1 == "?": + if gene1 == '?': raise ValueError( f'both genes cannot be blank for a structural variant {row["key"]}. At least 1 gene must be entered' ) # force exons to integer repr string - exon1 = exon1[:-2] if exon1.endswith(".0") else exon1 - exon2 = exon2[:-2] if exon2.endswith(".0") else exon2 - return f"({gene1},{gene2}):fusion(e.{exon1},e.{exon2})" + exon1 = exon1[:-2] if exon1.endswith('.0') else exon1 + exon2 = exon2[:-2] if exon2.endswith('.0') else exon2 + return f'({gene1},{gene2}):fusion(e.{exon1},e.{exon2})' def preprocess_structural_variants(rows: Iterable[Dict]) -> List[IprFusionVariant]: @@ -402,21 +400,21 @@ def preprocess_structural_variants(rows: Iterable[Dict]) -> List[IprFusionVarian """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(["sv"] + [row[key] for key in SV_KEY]) + return tuple(['sv'] + [row[key] for key in SV_KEY]) variants = validate_variant_rows(rows, SV_REQ, SV_OPTIONAL, row_key) result = [cast(IprFusionVariant, var) for var in variants] # genes are optional for structural variants for row in result: - row["variant"] = create_graphkb_sv_notation(row) - row["variantType"] = "sv" + row['variant'] = create_graphkb_sv_notation(row) + row['variantType'] = 'sv' # check and load the svg file where applicable - if row["svg"] and not pd.isnull(row["svg"]): - if not os.path.exists(row["svg"]): - raise FileNotFoundError(row["svg"]) - with open(row["svg"], "r") as fh: - row["svg"] = fh.read() + if row['svg'] and not pd.isnull(row['svg']): + if not os.path.exists(row['svg']): + raise FileNotFoundError(row['svg']) + with open(row['svg'], 'r') as fh: + row['svg'] = fh.read() return result @@ -428,15 +426,15 @@ def preprocess_signature_variants(rows: Iterable[Dict]) -> List[IprSignatureVari """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(["sigv"] + [row[key] for key in SIGV_KEY]) + return tuple(['sigv'] + [row[key] for key in SIGV_KEY]) variants = validate_variant_rows(rows, SIGV_REQ, SIGV_OPTIONAL, row_key) result = [cast(IprSignatureVariant, var) for var in variants] # Adding additional required properties for row in result: - row["variant"] = row["displayName"] - row["variantType"] = "sigv" + row['variant'] = row['displayName'] + row['variantType'] = 'sigv' return result @@ -448,9 +446,9 @@ def preprocess_cosmic(rows: Iterable[Dict]) -> Iterable[Dict]: """ return [ { - "displayName": f"{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}", - "signatureName": signature, - "variantTypeName": COSMIC_SIGNATURE_VARIANT_TYPE, + 'displayName': f'{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}', + 'signatureName': signature, + 'variantTypeName': COSMIC_SIGNATURE_VARIANT_TYPE, } for signature in rows ] @@ -465,21 +463,21 @@ def preprocess_hla(rows: Iterable[Dict]) -> Iterable[Dict]: for k, v in row.items(): if k not in SIGV_HLA: continue - hla.add(f"HLA-{v}") # 2nd level, e.g. 
'HLA-A*02:01' - hla.add(f"HLA-{v.split(':')[0]}") # 1st level, e.g. 'HLA-A*02' + hla.add(f'HLA-{v}') # 2nd level, e.g. 'HLA-A*02:01' + hla.add(f'HLA-{v.split(":")[0]}') # 1st level, e.g. 'HLA-A*02' return [ { - "displayName": f"{signature} {HLA_SIGNATURE_VARIANT_TYPE}", - "signatureName": signature, - "variantTypeName": HLA_SIGNATURE_VARIANT_TYPE, + 'displayName': f'{signature} {HLA_SIGNATURE_VARIANT_TYPE}', + 'signatureName': signature, + 'variantTypeName': HLA_SIGNATURE_VARIANT_TYPE, } for signature in hla ] def preprocess_tmb( - tmb_high: float, tmburMutationBurden: Dict = {}, genomeTmb: float | str = "" + tmb_high: float, tmburMutationBurden: Dict = {}, genomeTmb: float | str = '' ) -> Iterable[Dict]: """ Process tumour mutation burden (tmb) input(s) into preformatted signature input. @@ -493,15 +491,15 @@ def preprocess_tmb( if tmburMutationBurden: try: tmbur_tmb_val = float( - tmburMutationBurden["genomeIndelTmb"] + tmburMutationBurden["genomeSnvTmb"] + tmburMutationBurden['genomeIndelTmb'] + tmburMutationBurden['genomeSnvTmb'] ) if not genomeTmb and not isinstance(genomeTmb, float): logger.error( - "backwards compatibility: deriving genomeTmb from tmburMutationBurden genomeIndelTmb + genomeSnvTmb" + 'backwards compatibility: deriving genomeTmb from tmburMutationBurden genomeIndelTmb + genomeSnvTmb' ) tmb_val = tmbur_tmb_val except Exception as err: - logger.error(f"tmburMutationBurden parsing failure: {err}") + logger.error(f'tmburMutationBurden parsing failure: {err}') # genomeTmb # SDEV-4811 - mutation burden is now expected to be uploaded in genomeTmb as mutations/megabase @@ -512,19 +510,19 @@ def preprocess_tmb( tmb_val = float(genomeTmb) if tmburMutationBurden and tmbur_tmb_val != tmb_val: logger.warning( - f"genomeTmb given {tmb_val} does not match tmburMutationBurden TMB {tmbur_tmb_val}" + f'genomeTmb given {tmb_val} does not match tmburMutationBurden TMB {tmbur_tmb_val}' ) except TypeError as err: - logger.error(f"genomeTmb parsing failure {genomeTmb}: {err}") + logger.error(f'genomeTmb parsing failure {genomeTmb}: {err}') # comparing tmb_val to threshold # Signature CategoryVariant created only if threshold met if tmb_val >= tmb_high: return [ { - "displayName": f"{TMB_SIGNATURE} {TMB_SIGNATURE_VARIANT_TYPE}", - "signatureName": TMB_SIGNATURE, - "variantTypeName": TMB_SIGNATURE_VARIANT_TYPE, + 'displayName': f'{TMB_SIGNATURE} {TMB_SIGNATURE_VARIANT_TYPE}', + 'signatureName': TMB_SIGNATURE, + 'variantTypeName': TMB_SIGNATURE_VARIANT_TYPE, } ] return [] @@ -536,17 +534,16 @@ def preprocess_msi(msi: Any) -> Iterable[Dict]: Both msi & mss get mapped to corresponding GraphKB Signature CategoryVariants. """ if msi: - # MSI category is given from upstream (only one msi variant per library) if isinstance(msi, list): # msi is given as a list of one dict - msi_cat = msi[0].get("kbCategory", "") + msi_cat = msi[0].get('kbCategory', '') elif isinstance(msi, str): # msi is given as a string msi_cat = msi else: # msi is given as a dict; uncaught error if not. - msi_cat = msi.get("kbCategory", "") + msi_cat = msi.get('kbCategory', '') msi_variant = MSI_MAPPING.get(msi_cat, None) @@ -557,6 +554,23 @@ def preprocess_msi(msi: Any) -> Iterable[Dict]: return [] +def preprocess_hrd(hrd: Any) -> Iterable[Dict]: + """ + Process hrd input into preformatted signature input. + HRD gets mapped to corresponding GraphKB Signature CategoryVariants.
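+ + Example (hypothetical kbCategory value; the real keys come from HRD_MAPPING): + preprocess_hrd({'kbCategory': 'hrd deficient'}) yields the single mapped + CategoryVariant, while an unmapped category or falsy input yields [].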
+ """ + if hrd: + hrd_cat = hrd.get('kbCategory', '') + + hrd_variant = HRD_MAPPING.get(hrd_cat, None) + + # Signature CategoryVariant created either for msi or mss + if hrd_variant: + return [hrd_variant] + + return [] + + def check_variant_links( small_mutations: List[IprSmallMutationVariant], expression_variants: List[IprExprVariant], @@ -580,67 +594,67 @@ def check_variant_links( missing_information_genes = set() missing_information_errors = set() - copy_variant_genes = {variant["gene"] for variant in copy_variants} - expression_variant_genes = {variant["gene"] for variant in expression_variants} + copy_variant_genes = {variant['gene'] for variant in copy_variants} + expression_variant_genes = {variant['gene'] for variant in expression_variants} genes_with_variants = set() # filter excess copy variants variant: IprCopyVariant | IprExprVariant | IprFusionVariant | IprSmallMutationVariant for variant in copy_variants: - gene = variant["gene"] + gene = variant['gene'] if not gene: - logger.error("copy_variant data cannot be applied to an empty genename") - elif variant["variant"]: + logger.error('copy_variant data cannot be applied to an empty genename') + elif variant['variant']: genes_with_variants.add(gene) if expression_variant_genes and gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a copy variant but is missing expression information" + f'gene ({gene}) has a copy variant but is missing expression information' ) for variant in expression_variants: - gene = variant["gene"] + gene = variant['gene'] if not gene: - logger.error("expression_variant data cannot be applied to an empty genename") - elif variant["variant"]: + logger.error('expression_variant data cannot be applied to an empty genename') + elif variant['variant']: genes_with_variants.add(gene) if copy_variant_genes and gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has an expression variant but is missing copy number information" + f'gene ({gene}) has an expression variant but is missing copy number information' ) for variant in small_mutations: - gene = variant["gene"] + gene = variant['gene'] if not gene: - logger.error("small_mutation data cannot be applied to an empty genename") + logger.error('small_mutation data cannot be applied to an empty genename') continue if copy_variant_genes and gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a small mutation but is missing copy number information" + f'gene ({gene}) has a small mutation but is missing copy number information' ) if expression_variant_genes and gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a small mutation but is missing expression information" + f'gene ({gene}) has a small mutation but is missing expression information' ) genes_with_variants.add(gene) for variant in structural_variants: - for gene in [variant["gene1"], variant["gene2"]]: + for gene in [variant['gene1'], variant['gene2']]: if gene: # genes are optional for structural variants if gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a structural variant but is missing copy number information" + f'gene ({gene}) has a structural variant but is missing copy number information' ) if gene not in expression_variant_genes: 
missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a structural variant but is missing expression information" + f'gene ({gene}) has a structural variant but is missing expression information' ) genes_with_variants.add(gene) @@ -648,7 +662,7 @@ def check_variant_links( for err_msg in sorted(missing_information_errors): logger.debug(err_msg) link_err_msg = ( - f"Missing information variant links on {len(missing_information_genes)} genes" + f'Missing information variant links on {len(missing_information_genes)} genes' ) logger.warning(link_err_msg) return genes_with_variants @@ -659,91 +673,91 @@ def check_comparators(content: Dict, expresssionVariants: List[IprExprVariant] = Given the optional content dictionary, check that based on the analyses present the correct/sufficient comparators have also been specified """ - mutation_burden = "mutationBurden" - comparator_roles = {c["analysisRole"] for c in content.get("comparators", [])} + mutation_burden = 'mutationBurden' + comparator_roles = {c['analysisRole'] for c in content.get('comparators', [])} - for image in content.get("images", []): - key = image["key"] + for image in content.get('images', []): + key = image['key'] if key.startswith(mutation_burden): - comp_type = key.split(".")[-1] - role = f"mutation burden ({comp_type})" + comp_type = key.split('.')[-1] + role = f'mutation burden ({comp_type})' if role in comparator_roles: continue - if "_sv." in key: - sv_role = f"mutation burden SV ({comp_type})" + if '_sv.' in key: + sv_role = f'mutation burden SV ({comp_type})' if sv_role in comparator_roles: continue - raise ValueError(f"missing required comparator definition ({role})") + raise ValueError(f'missing required comparator definition ({role})') if expresssionVariants: - required_comparators = {"expression (disease)"} + required_comparators = {'expression (disease)'} def all_none(row: IprExprVariant, columns: List[str]) -> bool: - return all([row.get(col) is None or row.get(col) == "" for col in columns]) + return all([row.get(col) is None or row.get(col) == '' for col in columns]) for exp in expresssionVariants: if not all_none( exp, [ - "primarySitekIQR", - "primarySitePercentile", - "primarySiteZScore", - "primarySiteFoldChange", + 'primarySitekIQR', + 'primarySitePercentile', + 'primarySiteZScore', + 'primarySiteFoldChange', ], ): - required_comparators.add("expression (primary site)") + required_comparators.add('expression (primary site)') if not all_none( exp, [ - "biopsySitekIQR", - "biopsySitePercentile", - "biopsySiteZScore", - "biopsySiteFoldChange", + 'biopsySitekIQR', + 'biopsySitePercentile', + 'biopsySiteZScore', + 'biopsySiteFoldChange', ], ): - required_comparators.add("expression (biopsy site)") + required_comparators.add('expression (biopsy site)') if not all_none( exp, [ - "internalPancancerkIQR", - "internalPancancerPercentile", - "internalPancancerZScore", - "internalPancancerFoldChange", + 'internalPancancerkIQR', + 'internalPancancerPercentile', + 'internalPancancerZScore', + 'internalPancancerFoldChange', ], ): - required_comparators.add("expression (internal pancancer cohort)") + required_comparators.add('expression (internal pancancer cohort)') if required_comparators - comparator_roles: - missing = "; ".join(sorted(list(required_comparators - comparator_roles))) - raise ValueError(f"missing required comparator definitions ({missing})") + missing = '; '.join(sorted(list(required_comparators - comparator_roles))) + raise ValueError(f'missing required comparator 
definitions ({missing})') def extend_with_default(validator_class): # https://python-jsonschema.readthedocs.io/en/latest/faq/#why-doesn-t-my-schema-s-default-property-set-the-default-on-my-instance - validate_properties = validator_class.VALIDATORS["properties"] + validate_properties = validator_class.VALIDATORS['properties'] def set_defaults(validator, properties, instance, schema): for property, subschema in properties.items(): - if "default" in subschema: - instance.setdefault(property, subschema["default"]) + if 'default' in subschema: + instance.setdefault(property, subschema['default']) for error in validate_properties(validator, properties, instance, schema): yield error def check_null(checker, instance): return ( - validator_class.TYPE_CHECKER.is_type(instance, "null") + validator_class.TYPE_CHECKER.is_type(instance, 'null') or pd.isnull(instance) - or instance == "" + or instance == '' ) - type_checker = validator_class.TYPE_CHECKER.redefine("null", check_null) + type_checker = validator_class.TYPE_CHECKER.redefine('null', check_null) return jsonschema.validators.extend( validator_class, - validators={"properties": set_defaults}, + validators={'properties': set_defaults}, type_checker=type_checker, ) @@ -758,7 +772,7 @@ def validate_report_content(content: Dict, schema_file: str = SPECIFICATION) -> Adds defaults as recommended by: https://python-jsonschema.readthedocs.io/en/latest/faq/#why-doesn-t-my-schema-s-default-property-set-the-default-on-my-instance """ - with open(schema_file, "r") as fh: + with open(schema_file, 'r') as fh: schema = json.load(fh) return DefaultValidatingDraft7Validator(schema).validate(content) diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py index 8487dc1d..06d9efbd 100644 --- a/pori_python/ipr/ipr.py +++ b/pori_python/ipr/ipr.py @@ -35,12 +35,12 @@ def display_evidence_levels(statement: Statement) -> str: result = [] - for evidence_level in statement.get("evidenceLevel", []) or []: + for evidence_level in statement.get('evidenceLevel', []) or []: if isinstance(evidence_level, str): result.append(evidence_level) - elif "displayName" in evidence_level: - result.append(evidence_level["displayName"]) - return ";".join(sorted(result)) + elif 'displayName' in evidence_level: + result.append(evidence_level['displayName']) + return ';'.join(sorted(result)) def filter_structural_variants( @@ -52,9 +52,9 @@ Filter structural variants to remove non-high quality events unless they are matched/annotated or they involve a gene that is a known fusion partner """ - matched_svs = {match["variant"] for match in kb_matches if match["variantType"] == "sv"} + matched_svs = {match['variant'] for match in kb_matches if match['variantType'] == 'sv'} fusion_genes = { - gene["name"] for gene in gene_annotations if gene.get("knownFusionPartner", False) + gene['name'] for gene in gene_annotations if gene.get('knownFusionPartner', False) } result = [] for structural_variant in structural_variants: if any( [ - structural_variant["highQuality"], - structural_variant["key"] in matched_svs, - structural_variant["gene1"] in fusion_genes, - structural_variant["gene2"] in fusion_genes, + structural_variant['highQuality'], + structural_variant['key'] in matched_svs, + structural_variant['gene1'] in fusion_genes, + structural_variant['gene2'] in fusion_genes, ] ): result.append(structural_variant) @@ -83,22 +83,22 @@ def get_evidencelevel_mapping(graphkb_conn: GraphKBConnection) -> Dict[str,
str] """ # Get all EvidenceLevel from GraphKB # Note: not specifying any returnProperties allows for retrieving in/out_CrossReferenceOf - evidence_levels = graphkb_conn.query({"target": "EvidenceLevel"}) + evidence_levels = graphkb_conn.query({'target': 'EvidenceLevel'}) # Map EvidenceLevel RIDs to list of incoming CrossReferenceOf evidence_levels_mapping = dict( - map(lambda d: (d["@rid"], d.get("in_CrossReferenceOf", [])), evidence_levels) + map(lambda d: (d['@rid'], d.get('in_CrossReferenceOf', [])), evidence_levels) ) # Filter IPR EvidenceLevel and map each outgoing CrossReferenceOf to displayName - ipr_source_rid = graphkb_conn.get_source("ipr")["@rid"] - ipr_evidence_levels = filter(lambda d: d.get("source") == ipr_source_rid, evidence_levels) + ipr_source_rid = graphkb_conn.get_source('ipr')['@rid'] + ipr_evidence_levels = filter(lambda d: d.get('source') == ipr_source_rid, evidence_levels) cross_references_mapping: Dict[str, str] = dict() ipr_rids_to_displayname: Dict[str, str] = dict() for level in ipr_evidence_levels: - d = map(lambda i: (i, level["displayName"]), level.get("out_CrossReferenceOf", [])) # type: ignore + d = map(lambda i: (i, level['displayName']), level.get('out_CrossReferenceOf', [])) # type: ignore cross_references_mapping.update(d) - ipr_rids_to_displayname[level["@rid"]] = level["displayName"] # type: ignore + ipr_rids_to_displayname[level['@rid']] = level['displayName'] # type: ignore # Update EvidenceLevel mapping to corresponding IPR EvidenceLevel displayName def link_refs(refs) -> Tuple[str, str]: @@ -107,10 +107,10 @@ def link_refs(refs) -> Tuple[str, str]: return (refs[0], cross_references_mapping[rid]) if refs[0] in ipr_rids_to_displayname: # self-referencing IPR levels return (refs[0], ipr_rids_to_displayname[refs[0]]) - return (refs[0], "") + return (refs[0], '') evidence_levels_mapping = dict(map(link_refs, evidence_levels_mapping.items())) - evidence_levels_mapping[""] = "" + evidence_levels_mapping[''] = '' return evidence_levels_mapping # type: ignore @@ -142,11 +142,11 @@ def convert_statements_to_alterations( rows = [] ev_map = get_evidencelevel_mapping(graphkb_conn) # GERO-318 - add all IPR-A evidence equivalents to the approvedTherapy flag - approved = set([ev for (ev, ipr) in ev_map.items() if ipr == "IPR-A"]) + approved = set([ev for (ev, ipr) in ev_map.items() if ipr == 'IPR-A']) # get the recruitment status for any trial associated with a statement clinical_trials = [ - s["subject"]["@rid"] for s in statements if s["subject"]["@class"] == "ClinicalTrial" + s['subject']['@rid'] for s in statements if s['subject']['@class'] == 'ClinicalTrial' ] recruitment_statuses = {} if clinical_trials: @@ -154,76 +154,79 @@ for rid in clinical_trials: query_result = graphkb_conn.query( { - "target": {"target": "ClinicalTrial", "filters": {"@rid": rid}}, - "returnProperties": ["@rid", "recruitmentStatus"], + 'target': {'target': 'ClinicalTrial', 'filters': {'@rid': rid}}, + 'returnProperties': ['@rid', 'recruitmentStatus'], } ) if query_result: - recruitment_statuses[rid] = query_result[0]["recruitmentStatus"] # type: ignore + recruitment_statuses[rid] = query_result[0]['recruitmentStatus'] # type: ignore for statement in statements: variants = [ - cast(Variant, c) for c in statement["conditions"] if c["@class"] in VARIANT_CLASSES + cast(Variant, c) for c in statement['conditions'] if c['@class'] in VARIANT_CLASSES ] - diseases = [c for c in statement["conditions"] if c["@class"] == "Disease"] - disease_match =
len(diseases) == 1 and diseases[0]["@rid"] in disease_matches - pmid = ";".join([e["displayName"] for e in statement["evidence"]]) + diseases = [c for c in statement['conditions'] if c['@class'] == 'Disease'] + disease_match = len(diseases) == 1 and diseases[0]['@rid'] in disease_matches + reference = ';'.join([e['displayName'] for e in statement['evidence']]) + + if statement['relevance']['name'] == 'eligibility': + reference = ';'.join([e['sourceId'] for e in statement['evidence']]) ipr_section = gkb_statement.categorize_relevance( - graphkb_conn, statement["relevance"]["@rid"] + graphkb_conn, statement['relevance']['@rid'] ) approved_therapy = False - if ipr_section == "therapeutic": - for level in statement["evidenceLevel"] or []: - if level["@rid"] in approved: + if ipr_section == 'therapeutic': + for level in statement['evidenceLevel'] or []: + if level['@rid'] in approved: approved_therapy = True break - if ipr_section == "prognostic" and not disease_match: + if ipr_section == 'prognostic' and not disease_match: continue # GERO-72 / GERO-196 evidence_level_str = display_evidence_levels(statement) - evidence_levels = statement.get("evidenceLevel") or [] - ipr_evidence_levels = [ev_map[el.get("@rid", "")] for el in evidence_levels if el] - ipr_evidence_levels_str = ";".join(sorted(set([el for el in ipr_evidence_levels]))) + evidence_levels = statement.get('evidenceLevel') or [] + ipr_evidence_levels = [ev_map[el.get('@rid', '')] for el in evidence_levels if el] + ipr_evidence_levels_str = ';'.join(sorted(set([el for el in ipr_evidence_levels]))) for variant in variants: - if variant["@rid"] not in variant_matches: + if variant['@rid'] not in variant_matches: continue row = KbMatch( { - "approvedTherapy": approved_therapy or False, - "category": ipr_section or "unknown", - "context": ( - statement["subject"]["displayName"] if statement["subject"] else "" + 'approvedTherapy': approved_therapy or False, + 'category': ipr_section or 'unknown', + 'context': ( + statement['subject']['displayName'] if statement['subject'] else '' ), - "kbContextId": (statement["subject"]["@rid"] if statement["subject"] else ""), - "disease": ";".join(sorted(d.get("displayName", "") for d in diseases)), - "evidenceLevel": evidence_level_str or "", - "iprEvidenceLevel": ipr_evidence_levels_str or "", - "kbStatementId": statement["@rid"], - "kbVariant": str(variant.get("displayName", "")) or "", - "variant": str(variant.get("displayName", "")) or "", - "variantType": "", - "kbVariantId": variant["@rid"], - "matchedCancer": disease_match, - "reference": pmid, - "relevance": statement["relevance"]["displayName"], - "kbRelevanceId": statement["relevance"]["@rid"], - "externalSource": ( - str(statement["source"].get("displayName", "")) - if statement["source"] - else "" + 'kbContextId': (statement['subject']['@rid'] if statement['subject'] else ''), + 'disease': ';'.join(sorted(d.get('displayName', '') for d in diseases)), + 'evidenceLevel': evidence_level_str or '', + 'iprEvidenceLevel': ipr_evidence_levels_str or '', + 'kbStatementId': statement['@rid'], + 'kbVariant': str(variant.get('displayName', '')) or '', + 'variant': str(variant.get('displayName', '')) or '', + 'variantType': '', + 'kbVariantId': variant['@rid'], + 'matchedCancer': disease_match, + 'reference': reference, + 'relevance': statement['relevance']['displayName'], + 'kbRelevanceId': statement['relevance']['@rid'], + 'externalSource': ( + str(statement['source'].get('displayName', '')) + if statement['source'] + else '' ), - "requiredKbMatches": 
[item["@rid"] for item in variants], - "externalStatementId": statement.get("sourceId", "") or "", - "reviewStatus": statement.get("reviewStatus", "") or "", - "kbData": {}, + 'requiredKbMatches': [item['@rid'] for item in variants], + 'externalStatementId': statement.get('sourceId', '') or '', + 'reviewStatus': statement.get('reviewStatus', '') or '', + 'kbData': {}, } ) - if statement["relevance"]["name"] == "eligibility": - row["kbData"]["recruitment_status"] = recruitment_statuses.get( - row["kbContextId"], "not found" + if statement['relevance']['name'] == 'eligibility': + row['kbData']['recruitment_status'] = recruitment_statuses.get( + row['kbContextId'], 'not found' ) rows.append(row) return rows @@ -246,83 +249,99 @@ def select_expression_plots( """ selected_variants = { - (match["variantType"], match["variant"]) + (match['variantType'], match['variant']) for match in kb_matches - if match["category"] == "therapeutic" + if match['category'] == 'therapeutic' } images_by_gene: Dict[str, ImageDefinition] = {} selected_genes = set() for variant in all_variants: - if (variant["variantType"], variant["key"]) in selected_variants: - for key in ["gene", "gene1", "gene2"]: + if (variant['variantType'], variant['key']) in selected_variants: + for key in ['gene', 'gene1', 'gene2']: gene = variant.get(key) if gene: selected_genes.add(str(gene)) - gene = str(variant.get("gene", "")) - hist = str(variant.get("histogramImage", "")) + gene = str(variant.get('gene', '')) + hist = str(variant.get('histogramImage', '')) if hist: - images_by_gene[gene] = ImageDefinition({"key": f"expDensity.{gene}", "path": hist}) + images_by_gene[gene] = ImageDefinition({'key': f'expDensity.{gene}', 'path': hist}) return [images_by_gene[gene] for gene in selected_genes if gene in images_by_gene] def create_key_alterations( - kb_matches: List[Hashabledict], all_variants: Sequence[IprVariant] + kb_matches: List[Hashabledict], + all_variants: Sequence[IprVariant], + included_kb_matches: List[KbVariantMatch], ) -> Tuple[List[Dict], Dict]: """Create the list of significant variants matched by the KB. This list of matches is also used to create the variant counts. 
+ + kb_matches: the full list of matched kb objects found for the reported variants + all_variants: the full list of all reported variants, matched or unmatched + included_kb_matches: the kb variant matches allowed into the key alterations table. + When partially matched statements are allowed this covers every matched + kb variant; otherwise only kb variants that are conditions of at least + one fully satisfied statement condition set are included, and any kb + variant outside every fully satisfied condition set is excluded. """ alterations = [] type_mapping = { - "mut": "smallMutations", - "cnv": "CNVs", - "sv": "SVs", - "exp": "expressionOutliers", + 'mut': 'smallMutations', + 'cnv': 'CNVs', + 'sv': 'SVs', + 'exp': 'expressionOutliers', } counts: Dict[str, Set] = {v: set() for v in type_mapping.values()} skipped_variant_types = [] + + included_kbvariant_ids = list(set([item['kbVariantId'] for item in included_kb_matches])) + for kb_match in kb_matches: - variant_type = kb_match["variantType"] - variant_key = kb_match["variant"] - if kb_match["category"] == "unknown": + if kb_match['kbVariantId'] not in included_kbvariant_ids: + continue + variant_type = kb_match['variantType'] + variant_key = kb_match['variant'] + if kb_match['category'] == 'unknown': continue if variant_type not in type_mapping.keys(): if variant_type not in skipped_variant_types: skipped_variant_types.append(variant_type) logger.warning( - f"No summary key alterations for {variant_type}. Skipping {variant_key}" + f'No summary key alterations for {variant_type}. Skipping {variant_key}' ) continue try: variant = find_variant(all_variants, variant_type, variant_key) except KeyError as err: logger.error(err) - logger.error(f"No variant match found for {variant_key}") + logger.error(f'No variant match found for {variant_key}') continue counts[type_mapping[variant_type]].add(variant_key) - if variant_type == "exp": - alterations.append(f'{variant.get("gene","")} ({variant.get("expressionState")})') - elif variant_type == "cnv": - alterations.append(f'{variant.get("gene","")} ({variant.get("cnvState")})') + if variant_type == 'exp': + alterations.append(f'{variant.get("gene", "")} ({variant.get("expressionState")})') + elif variant_type == 'cnv': + alterations.append(f'{variant.get("gene", "")} ({variant.get("cnvState")})') # only show germline if relevant - elif kb_match["category"] in GERMLINE_BASE_TERMS and variant.get("germline"): - alterations.append(f"germline {variant['variant']}") + elif kb_match['category'] in GERMLINE_BASE_TERMS and variant.get('germline'): + alterations.append(f'germline {variant["variant"]}') else: - alterations.append(variant["variant"]) + alterations.append(variant['variant']) counted_variants = set.union(*counts.values()) - counts["variantsUnknown"] = set() + counts['variantsUnknown'] = set() # count the un-matched variants for variant in all_variants: - if variant["variant"] and variant["key"] not in counted_variants: - counts["variantsUnknown"].add(variant["key"]) + if variant['variant'] and variant['key'] not in counted_variants: + counts['variantsUnknown'].add(variant['key']) return ( - [{"geneVariant": alt} for alt in set(alterations)], + [{'geneVariant': alt} for alt in set(alterations)], {k: len(v) for k, v in counts.items()}, ) @@ -347,44 +366,44 @@ def germline_kb_matches( filtered list of kb_matches """ ret_list = [] - germ_alts = [alt for alt in kb_matches if alt["category"] in GERMLINE_BASE_TERMS] + germ_alts = [alt for alt in
kb_matches if alt['category'] in GERMLINE_BASE_TERMS] somatic_alts = [alt for alt in kb_matches if alt not in germ_alts] if germ_alts: - logger.info(f"checking germline status of {GERMLINE_BASE_TERMS}") + logger.info(f'checking germline status of {GERMLINE_BASE_TERMS}') for alt in germ_alts: - var_list = [v for v in all_variants if v["key"] == alt["variant"]] - germline_var_list = [v for v in var_list if v.get("germline")] - unknown_var_list = [v for v in var_list if "germline" not in v] + var_list = [v for v in all_variants if v['key'] == alt['variant']] + germline_var_list = [v for v in var_list if v.get('germline')] + unknown_var_list = [v for v in var_list if 'germline' not in v] if germline_var_list: logger.debug( - f"germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'germline kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) ret_list.append(alt) elif unknown_var_list: logger.warning( - f"germline no data fail for: {alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'germline no data fail for: {alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) if not assume_somatic: logger.debug( - f"Keeping unverified match to germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'Keeping unverified match to germline kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) ret_list.append(alt) else: logger.debug( - f"Dropping unverified match to germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'Dropping unverified match to germline kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) else: logger.debug( - f"Dropping somatic match to germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'Dropping somatic match to germline kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) if somatic_alts: # Remove any matches to germline events for alt in somatic_alts: - var_list = [v for v in all_variants if v["key"] == alt["variant"]] - somatic_var_list = [v for v in var_list if not v.get("germline", not assume_somatic)] + var_list = [v for v in all_variants if v['key'] == alt['variant']] + somatic_var_list = [v for v in var_list if not v.get('germline', not assume_somatic)] if var_list and not somatic_var_list: logger.debug( - f"Dropping germline match to somatic statement kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'Dropping germline match to somatic statement kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) elif somatic_var_list: ret_list.append(alt) # match to somatic variant @@ -397,7 +416,7 @@ def germline_kb_matches( def multi_variant_filtering( graphkb_conn: GraphKBConnection, gkb_matches: List[KbMatch], - excludedTypes: List[str] = ["wildtype"], + excludedTypes: List[str] = ['wildtype'], ) -> List[KbMatch]: """Filters out GraphKB matches that don't match to all required variants on multi-variant statements @@ -417,42 +436,42 @@ def multi_variant_filtering( filtered list of KbMatch statements """ # All matching statements & variants (GKB RIDs) - matching_statement_rids = {match["kbStatementId"] for match in gkb_matches} - matching_variant_rids = {match["kbVariantId"] for match in gkb_matches} + matching_statement_rids = {match['kbStatementId'] for match in gkb_matches} + matching_variant_rids = {match['kbVariantId'] for match in gkb_matches} # Get condition details on all
matching statements res = graphkb_conn.post( - uri="query", + uri='query', data={ - "target": "Statement", - "filters": { - "@rid": list(matching_statement_rids), - "operator": "IN", + 'target': 'Statement', + 'filters': { + '@rid': list(matching_statement_rids), + 'operator': 'IN', }, - "history": True, - "returnProperties": [ - "@rid", - "conditions.@rid", - "conditions.@class", - "conditions.type", + 'history': True, + 'returnProperties': [ + '@rid', + 'conditions.@rid', + 'conditions.@class', + 'conditions.type', ], }, ) - statements = res["result"] + statements = res['result'] # Get set of excluded Vocabulary RIDs for variant types excluded = {} - if len(excludedTypes) != 0 and excludedTypes[0] != "": + if len(excludedTypes) != 0 and excludedTypes[0] != '': excluded = gkb_vocab.get_terms_set(graphkb_conn, excludedTypes) # Mapping statements to their conditional variants # (discarding non-variant conditions & variant conditions from excluded types) statement_to_variants = {} for statement in statements: - statement_to_variants[statement["@rid"]] = { - el["@rid"] - for el in statement["conditions"] - if (el["@class"] in VARIANT_CLASSES and el.get("type", "") not in excluded) + statement_to_variants[statement['@rid']] = { + el['@rid'] + for el in statement['conditions'] + if (el['@class'] in VARIANT_CLASSES and el.get('type', '') not in excluded) } # Set of statements with complete matching @@ -464,7 +483,7 @@ def multi_variant_filtering( # Filtering out incomplete matches of gkb_matches return [ - match for match in gkb_matches if match["kbStatementId"] in complete_matching_statements + match for match in gkb_matches if match['kbStatementId'] in complete_matching_statements ] @@ -483,10 +502,10 @@ def get_kb_variants( for item in gkb_matches: kbv = KbVariantMatch( { - "kbVariant": item["kbVariant"], - "variant": item["variant"], - "variantType": item["variantType"], - "kbVariantId": item["kbVariantId"], + 'kbVariant': item['kbVariant'], + 'variant': item['variant'], + 'variantType': item['variantType'], + 'kbVariantId': item['kbVariantId'], } ) kbVariants[str(kbv)] = kbv @@ -509,7 +528,7 @@ def get_kb_matched_statements( kbs_keys = KbMatchedStatement.__annotations__.keys() for item in gkb_matches: stmt = copy(item) - stmt["requiredKbMatches"].sort() + stmt['requiredKbMatches'].sort() kbs = KbMatchedStatement({key: val for (key, val) in stmt.items() if key in kbs_keys}) dict_key = str(kbs) kbMatchedStatements[dict_key] = kbs @@ -568,20 +587,20 @@ def get_kb_statement_matched_conditions( kbMatchedStatementConditions = {} for kbStmt in kbMatchedStatements: - stmts = [item for item in gkb_matches if item["kbStatementId"] == kbStmt["kbStatementId"]] + stmts = [item for item in gkb_matches if item['kbStatementId'] == kbStmt['kbStatementId']] requirements = {} - for requirement in stmts[0]["requiredKbMatches"]: + for requirement in stmts[0]['requiredKbMatches']: if not requirements.get(requirement, False): # only use explicit variant/statement links reqlist = [ { - "kbVariantId": requirement, - "observedVariantKey": item["variant"], + 'kbVariantId': requirement, + 'observedVariantKey': item['variant'], } for item in gkb_matches if ( - item["kbVariantId"] == requirement - and item["kbStatementId"] == kbStmt["kbStatementId"] + item['kbVariantId'] == requirement + and item['kbStatementId'] == kbStmt['kbStatementId'] ) ] requirements[requirement] = reqlist @@ -592,18 +611,18 @@ def get_kb_statement_matched_conditions( variantConditionSets = list(product(*requirements.values())) conditionSets = [
- {"kbStatementId": kbStmt["kbStatementId"], "matchedConditions": item} + {'kbStatementId': kbStmt['kbStatementId'], 'matchedConditions': item} for item in variantConditionSets ] for conditionSet in conditionSets: matchedConditions = sorted( - conditionSet["matchedConditions"], - key=lambda x: (x["kbVariantId"], x["observedVariantKey"]), + conditionSet['matchedConditions'], + key=lambda x: (x['kbVariantId'], x['observedVariantKey']), ) kbmc = KbMatchedStatementConditionSet( { - "kbStatementId": conditionSet["kbStatementId"], - "matchedConditions": matchedConditions, + 'kbStatementId': conditionSet['kbStatementId'], + 'matchedConditions': matchedConditions, } ) key = str( @@ -622,10 +641,24 @@ def get_kb_matches_sections( kb_statement_matched_conditions = get_kb_statement_matched_conditions( gkb_matches, allow_partial_matches ) + + if not allow_partial_matches: + # remove kb_matches that are not part of any fully matched condition set + unique_kb_variant_ids = list( + set( + [ + item['kbVariantId'] + for conditionSet in kb_statement_matched_conditions + for item in conditionSet['matchedConditions'] + ] + ) + ) + kb_variants = [item for item in kb_variants if item['kbVariantId'] in unique_kb_variant_ids] + return { - "kbMatches": kb_variants, - "kbMatchedStatements": kb_matched_statements, - "kbStatementMatchedConditions": kb_statement_matched_conditions, + 'kbMatches': kb_variants, + 'kbMatchedStatements': kb_matched_statements, + 'kbStatementMatchedConditions': kb_statement_matched_conditions, } @@ -634,12 +667,12 @@ def get_kb_disease_matches( kb_disease_match: Optional[str] = None, verbose: bool = True, useSubgraphsRoute: bool = True, -) -> list[str]: +) -> list[Dict]: disease_matches = [] if not kb_disease_match: - kb_disease_match = "cancer" + kb_disease_match = 'cancer' if verbose: logger.warning(f"No disease provided; will use '{kb_disease_match}'") @@ -657,20 +690,20 @@ def get_kb_disease_matches( base_records = gkb_util.convert_to_rid_list( graphkb_conn.query( gkb_vocab.query_by_name( - "Disease", + 'Disease', kb_disease_match, ) ) ) if base_records: response = graphkb_conn.post( - "/subgraphs/Disease", + '/subgraphs/Disease', { - "subgraphType": "tree", - "base": base_records, + 'subgraphType': 'tree', + 'base': base_records, }, ) - disease_matches = list(response["result"]["g"]["nodes"].keys()) + disease_matches = list(response['result']['g']['nodes'].values()) except Exception: if verbose: @@ -681,20 +714,15 @@ def get_kb_disease_matches( # Traversal depth is limited if not useSubgraphsRoute: if verbose: - logger.info(f"Matching disease ({kb_disease_match}) to graphkb using get_term_tree()") - disease_matches = list( - { - r["@rid"] - for r in gkb_vocab.get_term_tree( - graphkb_conn, - kb_disease_match, - ontology_class="Disease", - ) - } + logger.info(f'Matching disease ({kb_disease_match}) to graphkb using get_term_tree()') + disease_matches = gkb_vocab.get_term_tree( + graphkb_conn, + kb_disease_match, + ontology_class='Disease', ) if not disease_matches: - msg = f"failed to match disease ({kb_disease_match}) to graphkb" + msg = f'failed to match disease ({kb_disease_match}) to graphkb' if verbose: logger.error(msg) raise ValueError(msg) diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py index 63a028fe..cbb7c128 100644 --- a/pori_python/ipr/main.py +++ b/pori_python/ipr/main.py @@ -23,7 +23,7 @@ from .annotate import annotate_variants from .connection import IprConnection -from .constants import DEFAULT_URL, TMB_SIGNATURE_HIGH_THRESHOLD +from .constants import 
TMB_SIGNATURE_HIGH_THRESHOLD from .inputs import ( check_comparators, check_variant_links, @@ -32,6 +32,7 @@ preprocess_expression_variants, preprocess_hla, preprocess_msi, + preprocess_hrd, preprocess_signature_variants, preprocess_small_mutations, preprocess_structural_variants, @@ -53,19 +54,19 @@ CACHE_GENE_MINIMUM = 5000 RENAMED_GENE_PROPERTIES = { # old_name: new_name - "cancerRelated": "kbStatementRelated", - "cancerGene": "cancerGeneListMatch", + 'cancerRelated': 'kbStatementRelated', + 'cancerGene': 'cancerGeneListMatch', } def file_path(path: str) -> str: if not os.path.exists(path): - raise argparse.ArgumentTypeError(f"{repr(path)} is not a valid filename. does not exist") + raise argparse.ArgumentTypeError(f'{repr(path)} is not a valid filename. does not exist') return path def timestamp() -> str: - return datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") + return datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S') def command_interface() -> None: @@ -73,92 +74,92 @@ def command_interface() -> None: Parsed arguments are used to call the ipr_report() function. """ parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) - req = parser.add_argument_group("required arguments") - (req if not os.environ.get("USER") else parser).add_argument( - "--username", - required=not os.environ.get("USER"), - default=os.environ.get("USER"), - help="username to use connecting to graphkb/ipr", + req = parser.add_argument_group('required arguments') + (req if not os.environ.get('USER') else parser).add_argument( + '--username', + required=not os.environ.get('USER'), + default=os.environ.get('USER'), + help='username to use connecting to graphkb/ipr', ) - req.add_argument("--password", required=True, help="password to use connecting to graphkb/ipr") + req.add_argument('--password', required=True, help='password to use connecting to graphkb/ipr') req.add_argument( - "-c", "--content", required=True, type=file_path, help="Report Content as JSON" + '-c', '--content', required=True, type=file_path, help='Report Content as JSON' ) - parser.add_argument("--ipr_url", default=os.environ.get("IPR_URL", DEFAULT_URL)) + parser.add_argument('--ipr_url', default=os.environ.get('IPR_URL')) parser.add_argument( - "--graphkb_username", - help="username to use connecting to graphkb if different from ipr", + '--graphkb_username', + help='username to use connecting to graphkb if different from ipr', ) parser.add_argument( - "--graphkb_password", - help="password to use connecting to graphkb if different from ipr", + '--graphkb_password', + help='password to use connecting to graphkb if different from ipr', ) - parser.add_argument("--graphkb_url", default=os.environ.get("GRAPHKB_URL", None)) - parser.add_argument("--log_level", default="info", choices=LOG_LEVELS.keys()) + parser.add_argument('--graphkb_url', default=os.environ.get('GRAPHKB_URL', None)) + parser.add_argument('--log_level', default='info', choices=LOG_LEVELS.keys()) parser.add_argument( - "--therapeutics", + '--therapeutics', default=False, - help="Generate therapeutic options", - action="store_true", + help='Generate therapeutic options', + action='store_true', ) parser.add_argument( - "--skip_comments", + '--skip_comments', default=False, - action="store_true", - help="Turn off generating the analyst comments section of the report", + action='store_true', + help='Turn off generating the analyst comments section of the report', ) parser.add_argument( - "-o", - "--output_json_path", - default=f"pori_python_report_{timestamp()}.json", 
- help="path to a JSON to output the report upload body", + '-o', + '--output_json_path', + default=f'pori_python_report_{timestamp()}.json', + help='path to a JSON to output the report upload body', ) parser.add_argument( - "-w", - "--always_write_output_json", - action="store_true", - help="Write to output_json_path on successful IPR uploads instead of just when the upload fails", + '-w', + '--always_write_output_json', + action='store_true', + help='Write to output_json_path on successful IPR uploads instead of just when the upload fails', ) parser.add_argument( - "--async_upload", + '--async_upload', default=False, - action="store_true", - help="True if reports-async ipr endpoint should be used instead of basic reports", + action='store_true', + help='True if reports-async ipr endpoint should be used instead of basic reports', ) parser.add_argument( - "--mins_to_wait", + '--mins_to_wait', default=5, - action="store", - help="is using reports-async, number of minutes to wait before throwing error", + action='store', + help='is using reports-async, number of minutes to wait before throwing error', ) parser.add_argument( - "--allow_partial_matches", + '--allow_partial_matches', default=False, - action="store_true", - help="True to include matches to multivariant statements where not all variants are present", + action='store_true', + help='True to include matches to multivariant statements where not all variants are present', ) parser.add_argument( - "--upload_json", + '--upload_json', default=False, - action="store_true", - help="True to skip all the preprocessing and just submit a json to ipr", + action='store_true', + help='True to skip all the preprocessing and just submit a json to ipr', ) parser.add_argument( - "--validate_json", + '--validate_json', default=False, - action="store_true", - help="True if only need to validate the json", + action='store_true', + help='True if only need to validate the json', ) parser.add_argument( - "--ignore_extra_fields", + '--ignore_extra_fields', default=False, - action="store_true", - help="True if ignore extra fields in json", + action='store_true', + help='True if ignore extra fields in json', ) args = parser.parse_args() - with open(args.content, "r") as fh: + with open(args.content, 'r') as fh: content = json.load(fh) ipr_report( @@ -191,38 +192,38 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict """ if ( ipr_spec - and "components" in ipr_spec.keys() - and "schemas" in ipr_spec["components"].keys() - and "genesCreate" in ipr_spec["components"]["schemas"].keys() - and "properties" in ipr_spec["components"]["schemas"]["genesCreate"].keys() + and 'components' in ipr_spec.keys() + and 'schemas' in ipr_spec['components'].keys() + and 'genesCreate' in ipr_spec['components']['schemas'].keys() + and 'properties' in ipr_spec['components']['schemas']['genesCreate'].keys() ): - genes_spec = ipr_spec["components"]["schemas"]["genesCreate"]["properties"].keys() + genes_spec = ipr_spec['components']['schemas']['genesCreate']['properties'].keys() # check what ipr report upload expects and adjust contents to match for old_name, new_name in RENAMED_GENE_PROPERTIES.items(): if old_name in genes_spec: logger.warning( - f"Legacy IPR - Renaming property {new_name} to {old_name} for compatibility to ipr_spec" + f'Legacy IPR - Renaming property {new_name} to {old_name} for compatibility to ipr_spec' ) - for gene in upload_content["genes"]: + for gene in upload_content['genes']: if new_name in gene: gene[old_name] = gene[new_name] 
gene.pop(new_name) else: outdate_properties = 0 - for gene in upload_content["genes"]: + for gene in upload_content['genes']: if old_name in gene: gene[new_name] = gene[old_name] gene.pop(old_name) outdate_properties += 1 if outdate_properties: logger.warning( - f"Renamed property {old_name} to {new_name} on {outdate_properties} genes for ipr_spec" + f'Renamed property {old_name} to {new_name} on {outdate_properties} genes for ipr_spec' ) # remove any unhandled incompatible keys removed_keys: Dict[str, int] = {} - for gene in upload_content["genes"]: + for gene in upload_content['genes']: unsupported_keys = [key for key in gene.keys() if key not in genes_spec] for key in unsupported_keys: if key in removed_keys: @@ -233,23 +234,23 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict for key, count in removed_keys.items(): logger.warning(f"IPR unsupported property '{key}' removed from {count} genes.") - drop_columns = ["variant", "variantType", "histogramImage"] + drop_columns = ['variant', 'variantType', 'histogramImage'] # DEVSU-2034 - use a 'displayName' VARIANT_LIST_KEYS = [ - "expressionVariants", - "smallMutations", - "copyVariants", - "structuralVariants", - "probeResults", - "signatureVariants", + 'expressionVariants', + 'smallMutations', + 'copyVariants', + 'structuralVariants', + 'probeResults', + 'signatureVariants', ] for variant_list_section in VARIANT_LIST_KEYS: for variant in upload_content.get(variant_list_section, []): - if not variant.get("displayName"): - variant["displayName"] = ( - variant.get("variant") or variant.get("kbCategory") or variant.get("key", "") + if not variant.get('displayName'): + variant['displayName'] = ( + variant.get('variant') or variant.get('kbCategory') or variant.get('key', '') ) - if variant_list_section == "probeResults": + if variant_list_section == 'probeResults': # currently probeResults will error if they do NOT have a 'variant' column. # smallMutations will error if they DO have a 'variant' column. continue @@ -257,29 +258,29 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict if col in variant: del variant[col] # tmburMutationBurden is a single value, not list - if upload_content.get("tmburMutationBurden"): - if not upload_content["tmburMutationBurden"].get("displayName"): - upload_content["tmburMutationBurden"]["displayName"] = upload_content[ - "tmburMutationBurden" - ].get("kbCategory", "") + if upload_content.get('tmburMutationBurden'): + if not upload_content['tmburMutationBurden'].get('displayName'): + upload_content['tmburMutationBurden']['displayName'] = upload_content[ + 'tmburMutationBurden' + ].get('kbCategory', '') # TODO: check this is still necessary - for row in upload_content["kbMatches"]: - if "kbContextId" in row: - del row["kbContextId"] - if "kbRelevanceId" in row: - del row["kbRelevanceId"] - if "requiredKbMatches" in row: - del row["requiredKbMatches"] - - for row in upload_content["kbMatchedStatements"]: - if "kbContextId" in row: - del row["kbContextId"] - if "kbRelevanceId" in row: - del row["kbRelevanceId"] + for row in upload_content['kbMatches']: + if 'kbContextId' in row: + del row['kbContextId'] + if 'kbRelevanceId' in row: + del row['kbRelevanceId'] + if 'requiredKbMatches' in row: + del row['requiredKbMatches'] + + for row in upload_content['kbMatchedStatements']: + if 'kbContextId' in row: + del row['kbContextId'] + if 'kbRelevanceId' in row: + del row['kbRelevanceId'] # Removing cosmicSignatures. 
Temporary - upload_content.pop("cosmicSignatures", None) + upload_content.pop('cosmicSignatures', None) return upload_content @@ -293,15 +294,15 @@ def ipr_report( username: str, password: str, content: Dict, - ipr_url: str = DEFAULT_URL, - log_level: str = "info", - output_json_path: str = "", + ipr_url: str = '', + log_level: str = 'info', + output_json_path: str = '', always_write_output_json: bool = False, ipr_upload: bool = True, interactive: bool = False, - graphkb_username: str = "", - graphkb_password: str = "", - graphkb_url: str = "", + graphkb_username: str = '', + graphkb_password: str = '', + graphkb_url: str = '', generate_therapeutics: bool = False, generate_comments: bool = True, match_germline: bool = False, @@ -323,7 +324,7 @@ def ipr_report( Args: username: the username for connecting to GraphKB and IPR password: the password for connecting to GraphKB and IPR - ipr_url: base URL to use in connecting to IPR + ipr_url: base URL to use in connecting to IPR (eg. https://ipr-api.bcgsc.ca/api) log_level: the logging level content: report content output_json_path: path to a JSON file to output the report upload body. @@ -352,18 +353,27 @@ def ipr_report( # set the default logging configuration logging.basicConfig( level=LOG_LEVELS[log_level], - format="%(asctime)s %(name)s %(levelname)s %(message)s", - datefmt="%m-%d-%y %H:%M:%S", + format='%(asctime)s %(name)s %(levelname)s %(message)s', + datefmt='%m-%d-%y %H:%M:%S', ) # IPR CONNECTION - ipr_conn = IprConnection(username, password, ipr_url) + ipr_url = ipr_url if ipr_url else os.environ.get('IPR_URL', '') + ipr_conn = None + if ipr_url: + ipr_conn = IprConnection(username, password, ipr_url) + else: + logger.warning('No ipr_url given') if validate_json: + if not ipr_conn: + raise ValueError('ipr_url required to validate json') ipr_result = ipr_conn.validate_json(content) return ipr_result if upload_json: + if not ipr_conn: + raise ValueError('ipr_url required to upload json') ipr_result = ipr_conn.upload_report( content, mins_to_wait, async_upload, ignore_extra_fields ) @@ -373,31 +383,32 @@ def ipr_report( try: validate_report_content(content) except jsonschema.exceptions.ValidationError as err: - logger.error("Failed schema check - report variants may be corrupted or unmatched.") - logger.error(f"Failed schema check: {err}") + logger.error('Failed schema check - report variants may be corrupted or unmatched.') + logger.error(f'Failed schema check: {err}') # INPUT VARIANTS VALIDATION & PREPROCESSING (OBSERVED BIOMARKERS) signature_variants: List[IprSignatureVariant] = preprocess_signature_variants( [ - *preprocess_cosmic(content.get("cosmicSignatures", [])), # includes dMMR - *preprocess_hla(content.get("hlaTypes", [])), + *preprocess_cosmic(content.get('cosmicSignatures', [])), # includes dMMR + *preprocess_hla(content.get('hlaTypes', [])), *preprocess_tmb( tmb_high, - content.get("tmburMutationBurden", {}), # old tmb pipeline - content.get("genomeTmb", ""), # newer tmb pipeline + content.get('tmburMutationBurden', {}), # old tmb pipeline + content.get('genomeTmb', ''), # newer tmb pipeline ), - *preprocess_msi(content.get("msi", None)), + *preprocess_msi(content.get('msi', None)), + *preprocess_hrd(content.get('hrd', None)), ] ) small_mutations: List[IprSmallMutationVariant] = preprocess_small_mutations( - content.get("smallMutations", []) + content.get('smallMutations', []) ) structural_variants: List[IprFusionVariant] = preprocess_structural_variants( - content.get("structuralVariants", []) + 
content.get('structuralVariants', []) ) - copy_variants: List[IprCopyVariant] = preprocess_copy_variants(content.get("copyVariants", [])) + copy_variants: List[IprCopyVariant] = preprocess_copy_variants(content.get('copyVariants', [])) expression_variants: List[IprExprVariant] = preprocess_expression_variants( - content.get("expressionVariants", []) + content.get('expressionVariants', []) ) # Additional checks if expression_variants: @@ -408,30 +419,29 @@ def ipr_report( ) # GKB CONNECTION - if graphkb_url: - logger.info(f"connecting to graphkb: {graphkb_url}") - graphkb_conn = GraphKBConnection(graphkb_url) - else: - graphkb_conn = GraphKBConnection() - - gkb_user = graphkb_username if graphkb_username else username - gkb_pass = graphkb_password if graphkb_password else password + graphkb_conn = GraphKBConnection(graphkb_url) if graphkb_url else GraphKBConnection() + logger.info(f'connecting to graphkb: {graphkb_conn.url}') - graphkb_conn.login(gkb_user, gkb_pass) + graphkb_conn.login( + graphkb_username if graphkb_username else username, + graphkb_password if graphkb_password else password, + ) # DISEASE # Disease term from bioapps; expected OncoTree term - kb_disease_match: str = content["kbDiseaseMatch"] + kb_disease_match: str = content['kbDiseaseMatch'] # Matching disease RIDs from GraphKB using term tree # (Will raise uncatched error if no match) - disease_matches: list[str] = get_kb_disease_matches(graphkb_conn, kb_disease_match) + disease_match_records: list[Dict] = get_kb_disease_matches(graphkb_conn, kb_disease_match) + disease_match_rids: list[str] = [item['@rid'] for item in disease_match_records] + disease_match_names: list[str] = [item['name'] for item in disease_match_records] # GKB MATCHING (AKA ANNOTATION) gkb_matches: List[Hashabledict] = annotate_variants( graphkb_conn=graphkb_conn, interactive=interactive, - disease_matches=disease_matches, + disease_matches=disease_match_rids, # Variants, per type: signature_variants=signature_variants, small_mutations=small_mutations, @@ -458,53 +468,53 @@ def ipr_report( ] num_removed = org_len - len(gkb_matches) if num_removed: - logger.info(f"Removing {num_removed} germline events without medical matches.") + logger.info(f'Removing {num_removed} germline events without medical matches.') if custom_kb_match_filter: - logger.info(f"custom_kb_match_filter on {len(gkb_matches)} variants") + logger.info(f'custom_kb_match_filter on {len(gkb_matches)} variants') gkb_matches = [Hashabledict(match) for match in custom_kb_match_filter(gkb_matches)] - logger.info(f"\t custom_kb_match_filter left {len(gkb_matches)} variants") - - # KEY ALTERATIONS - key_alterations, variant_counts = create_key_alterations(gkb_matches, all_variants) + logger.info(f'\t custom_kb_match_filter left {len(gkb_matches)} variants') # GENE INFORMATION - logger.info("fetching gene annotations") + logger.info('fetching gene annotations') gene_information = get_gene_information(graphkb_conn, sorted(genes_with_variants)) # THERAPEUTIC OPTIONS if generate_therapeutics: - logger.info("generating therapeutic options") + logger.info('generating therapeutic options') targets = create_therapeutic_options(graphkb_conn, gkb_matches, all_variants) else: targets = [] # ANALYST COMMENTS - logger.info("generating analyst comments") + logger.info('generating analyst comments') comments_list = [] if generate_comments: graphkb_comments = auto_analyst_comments( graphkb_conn, gkb_matches, - disease_matches=set(disease_matches), + disease_matches=set(disease_match_rids), 
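+ # matching here is by disease RID; the parallel disease_match_names list feeds the IPR variant-text lookup below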
variants=all_variants, ) comments_list.append(graphkb_comments) if include_ipr_variant_text: + if not ipr_conn: + raise ValueError('ipr_url required to include ipr variant text') ipr_comments = get_ipr_analyst_comments( ipr_conn, gkb_matches, disease_name=kb_disease_match, - project_name=content["project"], - report_type=content["template"], + disease_match_names=disease_match_names, + project_name=content['project'], + report_type=content['template'], include_nonspecific_disease=include_nonspecific_disease, include_nonspecific_project=include_nonspecific_project, include_nonspecific_template=include_nonspecific_template, ) comments_list.append(ipr_comments) - comments = {"comments": "\n".join(comments_list)} + comments = {'comments': '\n'.join(comments_list)} # REFORMATTING KBMATCHES # kbMatches -> kbMatches, kbMatchedStatements & kbStatementMatchedConditions @@ -512,50 +522,64 @@ def ipr_report( gkb_matches, allow_partial_matches=allow_partial_matches ) + # KEY ALTERATIONS + key_alterations, variant_counts = create_key_alterations( + gkb_matches, all_variants, kb_matched_sections['kbMatches'] + ) + # OUTPUT CONTENT # thread safe deep-copy the original content output = json.loads(json.dumps(content)) - output.update(kb_matched_sections) output.update( { - "copyVariants": [ - trim_empty_values(c) for c in copy_variants if c["gene"] in genes_with_variants + 'copyVariants': [ + trim_empty_values(c) for c in copy_variants if c['gene'] in genes_with_variants ], - "smallMutations": [trim_empty_values(s) for s in small_mutations], - "expressionVariants": [ + 'smallMutations': [trim_empty_values(s) for s in small_mutations], + 'expressionVariants': [ trim_empty_values(e) for e in expression_variants - if e["gene"] in genes_with_variants + if e['gene'] in genes_with_variants ], - "kbDiseaseMatch": kb_disease_match, - "kbUrl": graphkb_conn.url, - "kbVersion": timestamp(), - "structuralVariants": [ + 'kbDiseaseMatch': kb_disease_match, + 'kbUrl': graphkb_conn.url, + 'kbVersion': timestamp(), + 'structuralVariants': [ trim_empty_values(s) for s in filter_structural_variants( structural_variants, gkb_matches, gene_information ) ], - "signatureVariants": [trim_empty_values(s) for s in signature_variants], - "genes": gene_information, - "genomicAlterationsIdentified": key_alterations, - "variantCounts": variant_counts, - "analystComments": comments, - "therapeuticTarget": targets, + 'signatureVariants': [trim_empty_values(s) for s in signature_variants], + 'genes': gene_information, + 'genomicAlterationsIdentified': key_alterations, + 'variantCounts': variant_counts, + 'analystComments': comments, + 'therapeuticTarget': targets, } ) - output.setdefault("images", []).extend(select_expression_plots(gkb_matches, all_variants)) + output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants)) + + # if input includes hrdScore field, that is ok to pass to db + # but prefer the 'hrd' field if it exists + if output.get('hrd'): + if output.get('hrd').get('score'): + output['hrdScore'] = output['hrd']['score'] + output.pop('hrd') # kbmatches have already been made - ipr_spec = ipr_conn.get_spec() - output = clean_unsupported_content(output, ipr_spec) ipr_result = {} upload_error = None # UPLOAD TO IPR + if ipr_upload: + if not ipr_conn: + raise ValueError('ipr_url required to upload report') + ipr_spec = ipr_conn.get_spec() + output = clean_unsupported_content(output, ipr_spec) try: - logger.info(f"Uploading to IPR {ipr_conn.url}") + logger.info(f'Uploading to IPR {ipr_conn.url}') 
ipr_result = ipr_conn.upload_report( output, mins_to_wait, async_upload, ignore_extra_fields ) @@ -563,16 +587,16 @@ def ipr_report( output.update(ipr_result) except Exception as err: upload_error = err - logger.error(f"ipr_conn.upload_report failed: {err}", exc_info=True) + logger.error(f'ipr_conn.upload_report failed: {err}', exc_info=True) # SAVE TO JSON FILE if always_write_output_json: - logger.info(f"Writing IPR upload json to: {output_json_path}") - with open(output_json_path, "w") as fh: + logger.info(f'Writing IPR upload json to: {output_json_path}') + with open(output_json_path, 'w') as fh: fh.write(json.dumps(output)) - logger.info(f"made {graphkb_conn.request_count} requests to graphkb") - logger.info(f"average load {int(graphkb_conn.load or 0)} req/s") + logger.info(f'made {graphkb_conn.request_count} requests to graphkb') + logger.info(f'average load {int(graphkb_conn.load or 0)} req/s') if upload_error: raise upload_error return output diff --git a/pori_python/ipr/summary.py b/pori_python/ipr/summary.py index c6d63b20..cfe60868 100644 --- a/pori_python/ipr/summary.py +++ b/pori_python/ipr/summary.py @@ -28,10 +28,10 @@ logger, ) -OTHER_DISEASES = "other disease types" -ENTREZ_GENE_URL = "https://www.ncbi.nlm.nih.gov/gene" +OTHER_DISEASES = 'other disease types' +ENTREZ_GENE_URL = 'https://www.ncbi.nlm.nih.gov/gene' # TODO: https://www.bcgsc.ca/jira/browse/DEVSU-1181 -GRAPHKB_GUI = "https://graphkb.bcgsc.ca" +GRAPHKB_GUI = 'https://graphkb.bcgsc.ca' def filter_by_record_class( @@ -45,17 +45,17 @@ def check(name: str) -> bool: else: return name in record_classes - return [rec for rec in record_list if check(rec["@class"])] + return [rec for rec in record_list if check(rec['@class'])] def natural_join(word_list: List[str]) -> str: if len(word_list) > 1: - return ", ".join(word_list[:-1]) + ", and " + word_list[-1] - return "".join(word_list) + return ', '.join(word_list[:-1]) + ', and ' + word_list[-1] + return ''.join(word_list) def get_displayname(rec: Record) -> str: - ret_val = rec.get("displayName", rec["@rid"]) + ret_val = rec.get('displayName', rec['@rid']) return str(ret_val) @@ -66,26 +66,26 @@ def natural_join_records( return natural_join(word_list) -def create_graphkb_link(record_ids: List[str], record_class: str = "Statement") -> str: +def create_graphkb_link(record_ids: List[str], record_class: str = 'Statement') -> str: """ Create a link for a set of statements to the GraphKB client """ record_ids = sorted(list(set(record_ids))) if len(record_ids) == 1: return f'{GRAPHKB_GUI}/view/{record_class}/{record_ids[0].replace("#", "")}' - complex_param = base64.b64encode(json.dumps({"target": record_ids}).encode("utf-8")) - search_params = {"complex": complex_param, "@class": record_class} - return f"{GRAPHKB_GUI}/data/table?{urlencode(search_params)}" + complex_param = base64.b64encode(json.dumps({'target': record_ids}).encode('utf-8')) + search_params = {'complex': complex_param, '@class': record_class} + return f'{GRAPHKB_GUI}/data/table?{urlencode(search_params)}' def merge_diseases( diseases: List[Ontology] | List[Record], disease_matches: Set[str] = set() ) -> str: if len(convert_to_rid_set(diseases) - disease_matches) >= 2 and all( - [d["@class"] == "Disease" for d in diseases] + [d['@class'] == 'Disease' for d in diseases] ): words = sorted( - list(set([get_displayname(s) for s in diseases if s["@rid"] in disease_matches])) + list(set([get_displayname(s) for s in diseases if s['@rid'] in disease_matches])) ) words.append(OTHER_DISEASES) return natural_join(words) 
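# Illustrative aside (a sketch, not part of the patch): natural_join, shown unchanged in the
# summary.py hunk above, joins display names with an Oxford comma, and merge_diseases relies on
# it when collapsing extra disease matches into the OTHER_DISEASES placeholder. Assuming the
# definitions exactly as shown:
#
#   natural_join(['lung cancer', 'colorectal cancer', 'melanoma'])
#   # -> 'lung cancer, colorectal cancer, and melanoma'
#   natural_join(['melanoma'])  # -> 'melanoma'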
@@ -105,54 +105,54 @@ def substitute_sentence_template( """Create the filled-in sentence template for a given template and list of substitutions which may be the result of the aggregation of 1 or more statements. """ - disease_conditions = filter_by_record_class(conditions, "Disease") + disease_conditions = filter_by_record_class(conditions, 'Disease') variant_conditions = filter_by_record_class( - conditions, "CategoryVariant", "CatalogueVariant", "PositionalVariant" + conditions, 'CategoryVariant', 'CatalogueVariant', 'PositionalVariant' ) other_conditions = filter_by_record_class( conditions, - "CategoryVariant", - "CatalogueVariant", - "PositionalVariant", - "Disease", + 'CategoryVariant', + 'CatalogueVariant', + 'PositionalVariant', + 'Disease', exclude=True, ) - result = template.replace(r"{relevance}", relevance["displayName"]) + result = template.replace(r'{relevance}', relevance['displayName']) - if r"{subject}" in template: + if r'{subject}' in template: # remove subject from the conditions replacements subjects_ids = convert_to_rid_set(subjects) disease_conditions = [ - cast(Ontology, d) for d in disease_conditions if d["@rid"] not in subjects_ids + cast(Ontology, d) for d in disease_conditions if d['@rid'] not in subjects_ids ] variant_conditions = [ - cast(Ontology, d) for d in variant_conditions if d["@rid"] not in subjects_ids + cast(Ontology, d) for d in variant_conditions if d['@rid'] not in subjects_ids ] - other_conditions = [d for d in other_conditions if d["@rid"] not in subjects_ids] + other_conditions = [d for d in other_conditions if d['@rid'] not in subjects_ids] - result = result.replace(r"{subject}", merge_diseases(subjects, disease_matches)) + result = result.replace(r'{subject}', merge_diseases(subjects, disease_matches)) - if r"{conditions:disease}" in template: + if r'{conditions:disease}' in template: result = result.replace( - r"{conditions:disease}", merge_diseases(disease_conditions, disease_matches) + r'{conditions:disease}', merge_diseases(disease_conditions, disease_matches) ) else: other_conditions.extend(disease_conditions) - if r"{conditions:variant}" in template: - result = result.replace(r"{conditions:variant}", natural_join_records(variant_conditions)) + if r'{conditions:variant}' in template: + result = result.replace(r'{conditions:variant}', natural_join_records(variant_conditions)) else: other_conditions.extend(variant_conditions) - result = result.replace(r"{conditions}", natural_join_records(other_conditions)) + result = result.replace(r'{conditions}', natural_join_records(other_conditions)) - link_url = create_graphkb_link(statement_rids) if statement_rids else "" + link_url = create_graphkb_link(statement_rids) if statement_rids else '' - if r"{evidence}" in template: - evidence_str = ", ".join(sorted(list({e["displayName"] for e in evidence}))) + if r'{evidence}' in template: + evidence_str = ', '.join(sorted(list({e['displayName'] for e in evidence}))) if link_url: evidence_str = f'{evidence_str}' - result = result.replace(r"{evidence}", evidence_str) + result = result.replace(r'{evidence}', evidence_str) return result @@ -170,18 +170,18 @@ def aggregate_statements( def generate_key(statement: Statement) -> Tuple: result = [ - cond.get("displayName", cond["@rid"]) - for cond in filter_by_record_class(statement["conditions"], "Disease", exclude=True) - if cond["@rid"] != statement["subject"]["@rid"] + cond.get('displayName', cond['@rid']) + for cond in filter_by_record_class(statement['conditions'], 'Disease', exclude=True) + if 
cond['@rid'] != statement['subject']['@rid'] ] - if statement.get("subject", {}).get("@class", "Disease") != "Disease": - subject = statement["subject"] - if subject["@class"] == "Therapy": - alt = get_preferred_drug_representation(graphkb_conn, subject["@rid"]) - statement["subject"] = alt - result.append(statement["subject"]["displayName"]) - result.append(statement["relevance"]["displayName"]) - result.append(statement["displayNameTemplate"]) + if statement.get('subject', {}).get('@class', 'Disease') != 'Disease': + subject = statement['subject'] + if subject['@class'] == 'Therapy': + alt = get_preferred_drug_representation(graphkb_conn, subject['@rid']) + statement['subject'] = alt + result.append(statement['subject']['displayName']) + result.append(statement['relevance']['displayName']) + result.append(statement['displayNameTemplate']) return tuple(sorted(set(result))) for statement in statements: @@ -193,12 +193,12 @@ def generate_key(statement: Statement) -> Tuple: conditions = [] subjects = [] evidence = [] - relevance = group[0]["relevance"] - template = group[0]["displayNameTemplate"] + relevance = group[0]['relevance'] + template = group[0]['displayNameTemplate'] for statement in group: - conditions.extend(statement["conditions"]) - evidence.extend(statement["evidence"]) - subjects.append(statement["subject"]) + conditions.extend(statement['conditions']) + evidence.extend(statement['evidence']) + subjects.append(statement['subject']) sentence = substitute_sentence_template( template, @@ -211,35 +211,35 @@ def generate_key(statement: Statement) -> Tuple: ) for statement in group: - result[statement["@rid"]] = sentence + result[statement['@rid']] = sentence return result def display_variant(variant: IprVariant) -> str: """Short, human readable variant description string.""" - gene = variant.get("gene", "") - if not gene and "gene1" in variant and "gene2" in variant: + gene = variant.get('gene', '') + if not gene and 'gene1' in variant and 'gene2' in variant: gene = f'({variant.get("gene1", "")},{variant.get("gene2", "")})' - if variant.get("kbCategory"): + if variant.get('kbCategory'): return f'{variant.get("kbCategory")} of {gene}' # Special display of IprFusionVariant with exons - if variant.get("exon1") or variant.get("exon2"): + if variant.get('exon1') or variant.get('exon2'): return create_graphkb_sv_notation(variant) # type: ignore # Use chosen legacy 'proteinChange' or an hgvs description of lowest detail. hgvs = variant.get( - "proteinChange", - variant.get("hgvsProtein", variant.get("hgvsCds", variant.get("hgvsGenomic", ""))), + 'proteinChange', + variant.get('hgvsProtein', variant.get('hgvsCds', variant.get('hgvsGenomic', ''))), ) if gene and hgvs: - return f"{gene}:{hgvs}" - elif variant.get("variant"): - return str(variant.get("variant")) + return f'{gene}:{hgvs}' + elif variant.get('variant'): + return str(variant.get('variant')) - raise ValueError(f"Unable to form display_variant of {variant}") + raise ValueError(f'Unable to form display_variant of {variant}') def display_variants(gene_name: str, variants: List[IprVariant]) -> str: @@ -247,11 +247,11 @@ def display_variants(gene_name: str, variants: List[IprVariant]) -> str: variants_text = natural_join(result) if len(result) > 1: return ( - f"Multiple variants of the gene {gene_name} were observed in this case: {variants_text}" + f'Multiple variants of the gene {gene_name} were observed in this case: {variants_text}' ) elif result: - return f"{variants_text[0].upper()}{variants_text[1:]} was observed in this case." 
- return "" + return f'{variants_text[0].upper()}{variants_text[1:]} was observed in this case.' + return '' def create_section_html( @@ -264,33 +264,33 @@ def create_section_html( """ Generate HTML for a gene section of the comments """ - output = [f"

{gene_name}

"] + output = [f'

{gene_name}

'] sentence_categories: Dict[str, str] = {} for statement_id, sentence in sentences_by_statement_id.items(): - relevance = statements[statement_id]["relevance"]["@rid"] + relevance = statements[statement_id]['relevance']['@rid'] category = categorize_relevance( graphkb_conn, relevance, - RELEVANCE_BASE_TERMS + [("resistance", ["no sensitivity"])], + RELEVANCE_BASE_TERMS + [('resistance', ['no sensitivity'])], ) sentence_categories[sentence] = category # get the entrez gene descriptive hugo name genes = graphkb_conn.query( { - "target": "Feature", - "filters": { - "AND": [ + 'target': 'Feature', + 'filters': { + 'AND': [ { - "source": { - "target": "Source", - "filters": {"name": "entrez gene"}, + 'source': { + 'target': 'Source', + 'filters': {'name': 'entrez gene'}, } }, - {"name": gene_name}, - {"biotype": "gene"}, + {'name': gene_name}, + {'biotype': 'gene'}, ] }, } @@ -300,10 +300,10 @@ def create_section_html( variants_text = display_variants(gene_name, exp_variants) if not variants_text: # exclude sections where they are not linked to an experimental variant. this can occur when there are co-occurent statements collected - return "" - if genes and genes[0].get("description", ""): - description = ". ".join(genes[0]["description"].split(". ")[:2]) # type: ignore - sourceId = genes[0].get("sourceId", "") + return '' + if genes and genes[0].get('description', ''): + description = '. '.join(genes[0]['description'].split('. ')[:2]) # type: ignore + sourceId = genes[0].get('sourceId', '') output.append( f""" @@ -319,27 +319,27 @@ def create_section_html( sentences_used: Set[str] = set() for section in [ - {s for (s, v) in sentence_categories.items() if v == "diagnostic"}, - {s for (s, v) in sentence_categories.items() if v == "biological"}, - {s for (s, v) in sentence_categories.items() if v in ["therapeutic", "prognostic"]}, + {s for (s, v) in sentence_categories.items() if v == 'diagnostic'}, + {s for (s, v) in sentence_categories.items() if v == 'biological'}, + {s for (s, v) in sentence_categories.items() if v in ['therapeutic', 'prognostic']}, { s for (s, v) in sentence_categories.items() if v not in [ - "diagnostic", - "biological", - "therapeutic", - "prognostic", - "resistance", + 'diagnostic', + 'biological', + 'therapeutic', + 'prognostic', + 'resistance', ] }, - {s for (s, v) in sentence_categories.items() if v == "resistance"}, + {s for (s, v) in sentence_categories.items() if v == 'resistance'}, ]: - content = ". ".join(sorted(list(section - sentences_used))) + content = '. '.join(sorted(list(section - sentences_used))) sentences_used.update(section) - output.append(f"
<p>{content}</p>") - return "\n".join(output) + output.append(f'<p>{content}</p>
') + return '\n'.join(output) def section_statements_by_genes( @@ -349,16 +349,16 @@ def section_statements_by_genes( genes: Dict[str, Set[str]] = {} for statement in statements: - for condition in statement["conditions"]: - if condition.get("biotype", "") == "gene": - gene = get_preferred_gene_name(graphkb_conn, condition["@rid"]) - genes.setdefault(gene, set()).add(statement["@rid"]) + for condition in statement['conditions']: + if condition.get('biotype', '') == 'gene': + gene = get_preferred_gene_name(graphkb_conn, condition['@rid']) + genes.setdefault(gene, set()).add(statement['@rid']) else: - for cond_ref_key in ("reference1", "reference2"): + for cond_ref_key in ('reference1', 'reference2'): cond_ref_gene = condition.get(cond_ref_key) if cond_ref_gene: gene = get_preferred_gene_name(graphkb_conn, str(cond_ref_gene)) - genes.setdefault(gene, set()).add(statement["@rid"]) + genes.setdefault(gene, set()).add(statement['@rid']) return genes @@ -372,12 +372,12 @@ def prep_single_ipr_variant_comment(variant_text): Returns: section: html-formatted string """ - cancer_type = ",".join(variant_text["cancerType"]) + cancer_type = ','.join(variant_text['cancerType']) if not cancer_type: - cancer_type = "no specific cancer types" - cancer_type = f" ({cancer_type})" - section = [f"
<h3>{variant_text['variantName']}{cancer_type}</h3>"] - section.append(f"<p>{variant_text['text']}</p>") + cancer_type = 'no specific cancer types' + cancer_type = f' ({cancer_type})' + section = [f'<h3>{variant_text["variantName"]}{cancer_type}</h3>'] + section.append(f'<p>{variant_text["text"]}</p>
') return section @@ -385,6 +385,7 @@ def get_ipr_analyst_comments( ipr_conn: IprConnection, matches: Sequence[KbMatch] | Sequence[Hashabledict], disease_name: str, + disease_match_names: list[str], project_name: str, report_type: str, include_nonspecific_disease: bool = False, @@ -403,6 +404,7 @@ ipr_conn: connection to the ipr db matches: list of kbmatches which will be included in the report disease_name: str, eg 'colorectal cancer' + disease_match_names: list[str] of names considered to be equivalent to the disease name project_name: str, eg TEST or pog report_type: str, eg genomic or rapid include_nonspecific_disease: bool - true if variant texts that don't explicitly @@ -414,48 +416,58 @@ Returns: html-formatted string """ - output_header = "
<h3>The comments below were automatically drawn from curated text stored in IPR for variant matches in this report, and have not been manually reviewed</h3>" - no_comments_found_output = "No comments found in IPR for variants in this report" + output_header = '<h3>The comments below were automatically drawn from curated text stored in IPR for variant matches in this report, and have not been manually reviewed</h3>
' + no_comments_found_output = 'No comments found in IPR for variants in this report' output = [] # get the list of variants to check for custom text for - match_set = list(set([item["kbVariant"] for item in matches])) + match_set = list(set([item['kbVariant'] for item in matches])) + + disease_match_set = set([disease_name.lower()] + [item.lower() for item in disease_match_names]) for variant in match_set: data = { - "variantName": variant, + 'variantName': variant, } itemlist: list[dict] = [] - itemlist = ipr_conn.get("variant-text", data=data) # type: ignore + itemlist = ipr_conn.get('variant-text', data=data) # type: ignore if itemlist: project_matches = [ item for item in itemlist - if "project" in item.keys() and item["project"]["name"] == project_name + if 'project' in item.keys() and item['project']['name'] == project_name ] if project_matches: itemlist = project_matches elif include_nonspecific_project: - itemlist = [item for item in itemlist if "project" not in item.keys()] + itemlist = [item for item in itemlist if 'project' not in item.keys()] else: itemlist = [] template_matches = [ item for item in itemlist - if "template" in item.keys() and item["template"]["name"] == report_type + if 'template' in item.keys() and item['template']['name'] == report_type ] if template_matches: itemlist = template_matches elif include_nonspecific_template: - itemlist = [item for item in itemlist if "template" not in item.keys()] + itemlist = [item for item in itemlist if 'template' not in item.keys()] else: itemlist = [] - disease_matches = [item for item in itemlist if disease_name in item["cancerType"]] + disease_matches = [ + item + for item in itemlist + if len( + set([ct.lower() for ct in item['cancerType']]).intersection(disease_match_set) + ) + > 0 + ] + if disease_matches: itemlist = disease_matches elif include_nonspecific_disease: - itemlist = [item for item in itemlist if not item["cancerType"]] + itemlist = [item for item in itemlist if not item['cancerType']] else: itemlist = [] @@ -466,7 +478,7 @@ def get_ipr_analyst_comments( if not output: return no_comments_found_output output.insert(0, output_header) - return "\n".join(output) + return '\n'.join(output) def auto_analyst_comments( @@ -478,12 +490,12 @@ def auto_analyst_comments( """Given a list of GraphKB matches, generate a text summary to add to the report.""" templates: Dict[str, List[Statement]] = {} statements: Dict[str, Statement] = {} - variants_by_keys = {v["key"]: v for v in variants} + variants_by_keys = {v['key']: v for v in variants} variant_keys_by_statement_ids: Dict[str, Set[str]] = {} for match in matches: - rid = match["kbStatementId"] - exp_variant = match["variant"] + rid = match['kbStatementId'] + exp_variant = match['variant'] variant_keys_by_statement_ids.setdefault(rid, set()).add(exp_variant) exp_variants_by_statements: Dict[str, List[IprVariant]] = {} @@ -491,16 +503,16 @@ def auto_analyst_comments( try: exp_variants_by_statements[rid] = [variants_by_keys[key] for key in keys] except KeyError as err: - logger.warning(f"No specific variant matched for {rid}:{keys} - {err}") + logger.warning(f'No specific variant matched for {rid}:{keys} - {err}') exp_variants_by_statements[rid] = [] # get details for statements for match in matches: - rid = match["kbStatementId"].replace("#", "") - result = graphkb_conn.request(f"/statements/{rid}?neighbors=1")["result"] + rid = match['kbStatementId'].replace('#', '') + result = graphkb_conn.request(f'/statements/{rid}?neighbors=1')['result'] - 
templates.setdefault(result["displayNameTemplate"], []).append(result) - statements[result["@rid"]] = result + templates.setdefault(result['displayNameTemplate'], []).append(result) + statements[result['@rid']] = result # aggregate similar sentences sentences = {} @@ -511,7 +523,7 @@ def auto_analyst_comments( statements_by_genes = section_statements_by_genes(graphkb_conn, list(statements.values())) output: List[str] = [ - "
<h3>The comments below were automatically generated from matches to GraphKB and have not been manually reviewed</h3>" + '<h3>The comments below were automatically generated from matches to GraphKB and have not been manually reviewed</h3>
' ] for section, statement_rids in sorted( @@ -520,7 +532,7 @@ def auto_analyst_comments( exp_variants = {} for variant_list in [exp_variants_by_statements[r] for r in statement_rids]: for variant in variant_list: - exp_variants[variant["key"]] = variant + exp_variants[variant['key']] = variant output.append( create_section_html( @@ -532,4 +544,4 @@ def auto_analyst_comments( ) ) - return "\n".join(output) + return '\n'.join(output) diff --git a/pori_python/ipr/therapeutic_options.py b/pori_python/ipr/therapeutic_options.py index 9ca8c3be..7dc38cee 100644 --- a/pori_python/ipr/therapeutic_options.py +++ b/pori_python/ipr/therapeutic_options.py @@ -27,57 +27,57 @@ def create_therapeutic_options( Generate therapeutic options summary from the list of kb-matches """ options: List[Dict[str, Any]] = [] - resistance_markers = get_terms_set(graphkb_conn, ["no sensitivity"]) + resistance_markers = get_terms_set(graphkb_conn, ['no sensitivity']) for match in kb_matches: - row_type = "therapeutic" - if match["category"] != "therapeutic" or match["relevance"] == "eligibility": + row_type = 'therapeutic' + if match['category'] != 'therapeutic' or match['relevance'] == 'eligibility': continue - if match["kbRelevanceId"] in resistance_markers: - row_type = "chemoresistance" - variant = find_variant(variants, match["variantType"], match["variant"]) - drug = get_preferred_drug_representation(graphkb_conn, match["kbContextId"]) + if match['kbRelevanceId'] in resistance_markers: + row_type = 'chemoresistance' + variant = find_variant(variants, match['variantType'], match['variant']) + drug = get_preferred_drug_representation(graphkb_conn, match['kbContextId']) gene, variant_string = create_variant_name_tuple(variant) options.append( { - "gene": gene, - "type": row_type, - "therapy": drug["displayName"], - "therapyGraphkbId": drug["@rid"], - "context": match["relevance"], - "contextGraphkbId": match["kbRelevanceId"], - "variantGraphkbId": match["kbVariantId"], - "variant": variant_string, - "evidenceLevel": match["evidenceLevel"], - "kbStatementIds": match["kbStatementId"], - "notes": "", + 'gene': gene, + 'type': row_type, + 'therapy': drug['displayName'], + 'therapyGraphkbId': drug['@rid'], + 'context': match['relevance'], + 'contextGraphkbId': match['kbRelevanceId'], + 'variantGraphkbId': match['kbVariantId'], + 'variant': variant_string, + 'evidenceLevel': match['evidenceLevel'], + 'kbStatementIds': match['kbStatementId'], + 'notes': '', } ) if not options: return options options_df = pandas.DataFrame.from_records(options) - def delimited_list(inputs: List, delimiter: str = " / ") -> str: + def delimited_list(inputs: List, delimiter: str = ' / ') -> str: return delimiter.join(sorted(list({i for i in inputs if i}))) - options_df = options_df.groupby(["gene", "type", "therapy", "variant"]).agg( + options_df = options_df.groupby(['gene', 'type', 'therapy', 'variant']).agg( { - "evidenceLevel": delimited_list, - "context": delimited_list, - "notes": lambda x: delimited_list(x, " "), + 'evidenceLevel': delimited_list, + 'context': delimited_list, + 'notes': lambda x: delimited_list(x, ' '), } ) options_df = options_df.reset_index() - options = options_df.to_dict("records") # type: ignore + options = options_df.to_dict('records') # type: ignore therapeutic_rank = 0 chemoresistance_rank = 0 for option in options: - if option["type"] == "therapeutic": - option["rank"] = therapeutic_rank + if option['type'] == 'therapeutic': + option['rank'] = therapeutic_rank therapeutic_rank += 1 else: - option["rank"] = 
chemoresistance_rank + option['rank'] = chemoresistance_rank chemoresistance_rank += 1 return options diff --git a/pori_python/ipr/util.py b/pori_python/ipr/util.py index d2aca074..69ac7024 100644 --- a/pori_python/ipr/util.py +++ b/pori_python/ipr/util.py @@ -12,12 +12,12 @@ GENE_NEIGHBORS_MAX = 3 # name the logger after the package to make it simple to disable for packages using this one as a dependency -logger = logging.getLogger("ipr") +logger = logging.getLogger('ipr') LOG_LEVELS = { - "info": logging.INFO, - "debug": logging.DEBUG, - "warn": logging.WARN, - "error": logging.ERROR, + 'info': logging.INFO, + 'debug': logging.DEBUG, + 'warn': logging.WARN, + 'error': logging.ERROR, } @@ -31,17 +31,17 @@ def get_terms_set(graphkb_conn: GraphKBConnection, base_terms: List[str]) -> Set def hash_key(key: Tuple[str]) -> str: - body = json.dumps({"key": key}, sort_keys=True) - hash_code = hashlib.md5(body.encode("utf-8")).hexdigest() + body = json.dumps({'key': key}, sort_keys=True) + hash_code = hashlib.md5(body.encode('utf-8')).hexdigest() return hash_code def convert_to_rid_set(records: Sequence[Record]) -> Set[str]: - return {r["@rid"] for r in records} + return {r['@rid'] for r in records} -def trim_empty_values(obj: IprVariant, empty_values: Sequence = ("", None, nan)): - blacklist = ("gene1", "gene2") # allow null for sv genes +def trim_empty_values(obj: IprVariant, empty_values: Sequence = ('', None, nan)): + blacklist = ('gene1', 'gene2') # allow null for sv genes keys = list(obj.keys()) for key in keys: @@ -55,19 +55,19 @@ def create_variant_name_tuple(variant: IprVariant) -> Tuple[str, str]: Given an IPR variant row, create the variant representation to be used as the name of the variant """ - variant_type = variant["variantType"] - gene = str(variant.get("gene", variant.get("gene1", ""))) - if variant_type == "exp": - return (gene, str(variant.get("expressionState", ""))) - elif variant_type == "cnv": - return (gene, str(variant.get("cnvState", ""))) + variant_type = variant['variantType'] + gene = str(variant.get('gene', variant.get('gene1', ''))) + if variant_type == 'exp': + return (gene, str(variant.get('expressionState', ''))) + elif variant_type == 'cnv': + return (gene, str(variant.get('cnvState', ''))) variant_split = ( - variant["variant"].split(":", 1)[1] if ":" in variant["variant"] else variant["variant"] + variant['variant'].split(':', 1)[1] if ':' in variant['variant'] else variant['variant'] ) - gene2 = str(variant.get("gene2", "")) + gene2 = str(variant.get('gene2', '')) if gene and gene2: - gene = f"{gene}, {gene2}" + gene = f'{gene}, {gene2}' elif gene2: gene = gene2 @@ -81,28 +81,28 @@ def find_variant( Find a variant in a list of variants by its key and type """ for variant in all_variants: - if variant["key"] == variant_key and variant["variantType"] == variant_type: + if variant['key'] == variant_key and variant['variantType'] == variant_type: return variant - raise KeyError(f"expected variant ({variant_key}, {variant_type}) does not exist") + raise KeyError(f'expected variant ({variant_key}, {variant_type}) does not exist') def generate_ontology_preference_key(record: Ontology, sources_sort: Dict[str, int] = {}) -> Tuple: """Generate a tuple key for comparing preferred ontology terms.""" return ( - record.get("name") == record.get("sourceId"), - record.get("deprecated", False), - record.get("alias", False), - bool(record.get("dependency", "")), - sources_sort.get(str(record.get("source")), 99999), - record["sourceId"], - record.get("sourceIdVersion", ""), - 
record["name"], + record.get('name') == record.get('sourceId'), + record.get('deprecated', False), + record.get('alias', False), + bool(record.get('dependency', '')), + sources_sort.get(str(record.get('source')), 99999), + record['sourceId'], + record.get('sourceIdVersion', ''), + record['name'], ) def get_alternatives(graphkb_conn: GraphKBConnection, record_id: str) -> List[Ontology]: rec_list = graphkb_conn.query( - {"target": [record_id], "queryType": "similarTo", "treeEdges": []} + {'target': [record_id], 'queryType': 'similarTo', 'treeEdges': []} ) return [cast(Ontology, rec) for rec in rec_list] @@ -115,8 +115,8 @@ def get_preferred_drug_representation( """ source_preference = { - r["@rid"]: r["sort"] # type: ignore - for r in graphkb_conn.query({"target": "Source", "returnProperties": ["sort", "@rid"]}) + r['@rid']: r['sort'] # type: ignore + for r in graphkb_conn.query({'target': 'Source', 'returnProperties': ['sort', '@rid']}) } drugs = sorted( get_alternatives(graphkb_conn, drug_record_id), @@ -130,44 +130,78 @@ def get_preferred_gene_name( ) -> str: """Given some Feature record ID return the preferred gene name.""" record = graphkb_conn.get_record_by_id(record_id) - biotype = record.get("biotype", "") + biotype = record.get('biotype', '') genes = [] - expanded_gene_names = graphkb_conn.query({"target": [record_id], "neighbors": neighbors}) - assert len(expanded_gene_names) == 1, "get_preferred_gene_name should have single result" + expanded_gene_names = graphkb_conn.query({'target': [record_id], 'neighbors': neighbors}) + assert len(expanded_gene_names) == 1, 'get_preferred_gene_name should have single result' expanded: Dict[str, List] = expanded_gene_names[0] # type: ignore - if biotype != "gene": - for edge in expanded.get("out_ElementOf", []): - target = edge["in"] - if target.get("biotype") == "gene": + if biotype != 'gene': + for edge in expanded.get('out_ElementOf', []): + target = edge['in'] + if target.get('biotype') == 'gene': genes.append(target) for edge_type in [ - "out_AliasOf", - "in_AliasOf", - "in_DeprecatedBy", - "out_CrossReferenceOf", - "in_CrossReferenceOf", + 'out_AliasOf', + 'in_AliasOf', + 'in_DeprecatedBy', + 'out_CrossReferenceOf', + 'in_CrossReferenceOf', ]: - target_name = "out" if edge_type.startswith("in") else "in" + target_name = 'out' if edge_type.startswith('in') else 'in' for edge in expanded.get(edge_type, []): target = edge[target_name] - if target.get("biotype") == "gene": + if target.get('biotype') == 'gene': genes.append(target) genes = sorted( genes, key=lambda gene: ( - gene["deprecated"], - bool(gene["dependency"]), - "_" in gene["name"], - gene["name"].startswith("ens"), + gene['deprecated'], + bool(gene['dependency']), + '_' in gene['name'], + gene['name'].startswith('ens'), ), ) if genes: - return genes[0]["displayName"] + return genes[0]['displayName'] # fallback to the input displayName - return str(record.get("displayName", "")) + return str(record.get('displayName', '')) def pandas_falsy(field: Any) -> bool: """Check if a field is python falsy or pandas null.""" return bool(pd.isnull(field) or not field) + + +# the below is copied from +# https://github.com/biopython/biopython/blob/master/Bio/Data/IUPACData.py +# to allow us to remove otherwise unnecessary biopython dependency + +protein_letters_1to3 = { + 'A': 'Ala', + 'C': 'Cys', + 'D': 'Asp', + 'E': 'Glu', + 'F': 'Phe', + 'G': 'Gly', + 'H': 'His', + 'I': 'Ile', + 'K': 'Lys', + 'L': 'Leu', + 'M': 'Met', + 'N': 'Asn', + 'P': 'Pro', + 'Q': 'Gln', + 'R': 'Arg', + 'S': 'Ser', + 'T': 
'Thr', + 'V': 'Val', + 'W': 'Trp', + 'Y': 'Tyr', +} +protein_letters_1to3_extended = { + **protein_letters_1to3, + **{'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle', 'U': 'Sec', 'O': 'Pyl'}, +} + +protein_letters_3to1 = {value: key for key, value in protein_letters_1to3.items()} diff --git a/pori_python/types.py b/pori_python/types.py index faec8f29..dd1ab7e5 100644 --- a/pori_python/types.py +++ b/pori_python/types.py @@ -5,8 +5,8 @@ # TODO: Can constants in inputs.py like COPY_REQ, SMALL_MUT_REQ, just be replaced by types? CategoryBaseTermMapping = List[Tuple[str, List[str]]] -Record = TypedDict("Record", {"@rid": str, "@class": str, "name": str}) -EmbeddedRecord = TypedDict("EmbeddedRecord", {"@class": str}) +Record = TypedDict('Record', {'@rid': str, '@class': str, 'name': str}) +EmbeddedRecord = TypedDict('EmbeddedRecord', {'@class': str}) class DisplayedRecord(Record): diff --git a/pyproject.toml b/pyproject.toml index 9e5a9848..6213fb7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1 +1,7 @@ build-backend = "setuptools.build_meta" + +[tool.ruff] +line-length = 100 + +[tool.ruff.format] +quote-style = "single" diff --git a/setup.cfg b/setup.cfg index e466a485..e508812e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,7 @@ known_standard_library = requests [metadata] name = pori_python -version = 1.2.2 +version = 1.3.0 url = https://github.com/bcgsc/pori_python author_email = dat@bcgsc.ca maintainer_email = dat@bcgsc.ca @@ -31,7 +31,6 @@ python_requires = >=3.9 dependency_links = [] include_package_data = true install_requires = - biopython jsonschema pandas>=1.1.0 requests @@ -53,10 +52,7 @@ dev = mkdocs-material mkdocs-redirects markdown-refdocs - flake8 - black - flake8-annotations - isort + ruff mypy [options.package_data] diff --git a/tests/test_graphkb/data.py b/tests/test_graphkb/data.py index d628ff20..955e3911 100644 --- a/tests/test_graphkb/data.py +++ b/tests/test_graphkb/data.py @@ -7,72 +7,72 @@ # structuralVariants = { # Unambiguous structural variations - "(FGFR3,BRCA2):fusion(g.1234567,g.1234567)": { - "matches": { - "displayName": ["FGFR3 fusion", "FGFR3 rearrangement"], - "type": ["fusion", "rearrangement"], + '(FGFR3,BRCA2):fusion(g.1234567,g.1234567)': { + 'matches': { + 'displayName': ['FGFR3 fusion', 'FGFR3 rearrangement'], + 'type': ['fusion', 'rearrangement'], } }, # ambiguous structural variations -> structural - "FGFR3:c.1200_1300dup": { - "matches": { - "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], - "type": ["mutation", "rearrangement"], + 'FGFR3:c.1200_1300dup': { + 'matches': { + 'displayName': ['FGFR3 mutation', 'FGFR3 rearrangement'], + 'type': ['mutation', 'rearrangement'], } }, - "FGFR3:c.1200_1201insACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT": { - "matches": { - "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], - "type": ["mutation", "rearrangement"], + 'FGFR3:c.1200_1201insACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT': { + 'matches': { + 'displayName': ['FGFR3 mutation', 'FGFR3 rearrangement'], + 'type': ['mutation', 'rearrangement'], } }, - "FGFR3:g.5000_5100del": { - "matches": { - "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], - "type": ["mutation", "rearrangement"], + 'FGFR3:g.5000_5100del': { + 'matches': { + 'displayName': ['FGFR3 mutation', 'FGFR3 rearrangement'], + 'type': ['mutation', 'rearrangement'], } }, - "FGFR3:c.1200_1300delinsA": { - "matches": { - "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], - "type": ["mutation", "rearrangement"], + 'FGFR3:c.1200_1300delinsA': 
{ + 'matches': { + 'displayName': ['FGFR3 mutation', 'FGFR3 rearrangement'], + 'type': ['mutation', 'rearrangement'], } }, - "FGFR3:c.1200delinsACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT": { - "matches": { - "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], - "type": ["mutation", "rearrangement"], + 'FGFR3:c.1200delinsACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT': { + 'matches': { + 'displayName': ['FGFR3 mutation', 'FGFR3 rearrangement'], + 'type': ['mutation', 'rearrangement'], } }, # ambiguous structural variations -> non-structural - "FGFR3:c.1200dup": { - "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, - "does_not_matches": {"displayName": ["FGFR3 rearrangement"], "type": ["rearrangement"]}, + 'FGFR3:c.1200dup': { + 'matches': {'displayName': ['FGFR3 mutation'], 'type': ['mutation']}, + 'does_not_matches': {'displayName': ['FGFR3 rearrangement'], 'type': ['rearrangement']}, }, - "FGFR3:c.1200_1201insA": { - "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, - "does_not_matches": {"displayName": ["FGFR3 rearrangement"], "type": ["rearrangement"]}, + 'FGFR3:c.1200_1201insA': { + 'matches': {'displayName': ['FGFR3 mutation'], 'type': ['mutation']}, + 'does_not_matches': {'displayName': ['FGFR3 rearrangement'], 'type': ['rearrangement']}, }, - "FGFR3:g.5000del": { - "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, - "does_not_matches": {"displayName": ["FGFR3 rearrangement"], "type": ["rearrangement"]}, + 'FGFR3:g.5000del': { + 'matches': {'displayName': ['FGFR3 mutation'], 'type': ['mutation']}, + 'does_not_matches': {'displayName': ['FGFR3 rearrangement'], 'type': ['rearrangement']}, }, - "FGFR3:c.1200delinsA": { - "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, - "does_not_matches": {"displayName": ["FGFR3 rearrangement"], "type": ["rearrangement"]}, + 'FGFR3:c.1200delinsA': { + 'matches': {'displayName': ['FGFR3 mutation'], 'type': ['mutation']}, + 'does_not_matches': {'displayName': ['FGFR3 rearrangement'], 'type': ['rearrangement']}, }, - "STK11:e.1_100del": { - "matches": {"displayName": ["STK11 mutation"], "type": ["mutation"]}, - "does_not_matches": {"displayName": ["STK11 deletion"], "type": ["deletion"]}, + 'STK11:e.1_100del': { + 'matches': {'displayName': ['STK11 mutation'], 'type': ['mutation']}, + 'does_not_matches': {'displayName': ['STK11 deletion'], 'type': ['deletion']}, }, - "STK11:i.1_100del": { - "matches": {"displayName": ["STK11 mutation"], "type": ["mutation"]}, - "does_not_matches": {"displayName": ["STK11 deletion"], "type": ["deletion"]}, + 'STK11:i.1_100del': { + 'matches': {'displayName': ['STK11 mutation'], 'type': ['mutation']}, + 'does_not_matches': {'displayName': ['STK11 deletion'], 'type': ['deletion']}, }, # non-structural variations - "FGFR3:c.1200C>A": { - "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, - "does_not_matches": {"displayName": ["FGFR3 rearrangement"], "type": ["rearrangement"]}, + 'FGFR3:c.1200C>A': { + 'matches': {'displayName': ['FGFR3 mutation'], 'type': ['mutation']}, + 'does_not_matches': {'displayName': ['FGFR3 rearrangement'], 'type': ['rearrangement']}, }, } @@ -81,118 +81,118 @@ # pos 1: expected equivalences ensemblProteinSample = [ ( - "EGFR", + 'EGFR', [ - "EGFR", - "ERBB", - "ENSG00000146648", - "ENSG00000146648.17", - "ENST00000275493", - "ENST00000275493.6", - "NM_001346897", - "NM_001346897.2", - "NP_001333826", - "NP_001333826.1", + 'EGFR', + 'ERBB', + 'ENSG00000146648', + 
'ENSG00000146648.17', + 'ENST00000275493', + 'ENST00000275493.6', + 'NM_001346897', + 'NM_001346897.2', + 'NP_001333826', + 'NP_001333826.1', ], ), ( - "NM_001346897", + 'NM_001346897', [ - "EGFR", - "ERBB", - "ENSG00000146648", - "ENSG00000146648.17", - "NM_001346897", - "NM_001346897.2", - "NP_001333826", - "NP_001333826.1", + 'EGFR', + 'ERBB', + 'ENSG00000146648', + 'ENSG00000146648.17', + 'NM_001346897', + 'NM_001346897.2', + 'NP_001333826', + 'NP_001333826.1', ], ), ( - "NM_001346897.2", + 'NM_001346897.2', [ - "EGFR", - "ERBB", - "ENSG00000146648", - "ENSG00000146648.17", - "NM_001346897", - "NM_001346897.2", - "NP_001333826", - "NP_001333826.1", + 'EGFR', + 'ERBB', + 'ENSG00000146648', + 'ENSG00000146648.17', + 'NM_001346897', + 'NM_001346897.2', + 'NP_001333826', + 'NP_001333826.1', ], ), ( - "NP_001333826", + 'NP_001333826', [ - "EGFR", - "ERBB", - "ENSG00000146648", # Warn: Versionized ENSG won't be returned due to API limitations - "NM_001346897", - "NM_001346897.2", - "NP_001333826", - "NP_001333826.1", + 'EGFR', + 'ERBB', + 'ENSG00000146648', # Warn: Versionized ENSG won't be returned due to API limitations + 'NM_001346897', + 'NM_001346897.2', + 'NP_001333826', + 'NP_001333826.1', ], ), ( - "NP_001333826.1", + 'NP_001333826.1', [ - "EGFR", - "ERBB", - "ENSG00000146648", # Warn: Versionized ENSG won't be returned due to API limitations - "NM_001346897", - "NM_001346897.2", - "NP_001333826", - "NP_001333826.1", + 'EGFR', + 'ERBB', + 'ENSG00000146648', # Warn: Versionized ENSG won't be returned due to API limitations + 'NM_001346897', + 'NM_001346897.2', + 'NP_001333826', + 'NP_001333826.1', ], ), ( - "ENSG00000146648", + 'ENSG00000146648', [ - "EGFR", - "ERBB", - "ENSG00000146648", - "ENSG00000146648.17", - "ENST00000275493", - "ENST00000275493.6", - "NM_001346897", - "NM_001346897.2", - "NP_001333826", # Warn: Versionized NP won't be returned due to API limitations + 'EGFR', + 'ERBB', + 'ENSG00000146648', + 'ENSG00000146648.17', + 'ENST00000275493', + 'ENST00000275493.6', + 'NM_001346897', + 'NM_001346897.2', + 'NP_001333826', # Warn: Versionized NP won't be returned due to API limitations ], ), ( - "ENSG00000146648.17", + 'ENSG00000146648.17', [ - "EGFR", - "ERBB", - "ENSG00000146648", - "ENSG00000146648.17", - "ENST00000275493", - "ENST00000275493.6", - "NM_001346897", - "NM_001346897.2", - "NP_001333826", # Warn: Versionized NP won't be returned due to API limitations + 'EGFR', + 'ERBB', + 'ENSG00000146648', + 'ENSG00000146648.17', + 'ENST00000275493', + 'ENST00000275493.6', + 'NM_001346897', + 'NM_001346897.2', + 'NP_001333826', # Warn: Versionized NP won't be returned due to API limitations ], ), ( - "ENST00000275493", + 'ENST00000275493', [ - "EGFR", - "ERBB", - "ENSG00000146648", - "ENSG00000146648.17", - "ENST00000275493", - "ENST00000275493.6", + 'EGFR', + 'ERBB', + 'ENSG00000146648', + 'ENSG00000146648.17', + 'ENST00000275493', + 'ENST00000275493.6', ], ), ( - "ENST00000275493.6", + 'ENST00000275493.6', [ - "EGFR", - "ERBB", - "ENSG00000146648", - "ENSG00000146648.17", - "ENST00000275493", - "ENST00000275493.6", + 'EGFR', + 'ERBB', + 'ENSG00000146648', + 'ENSG00000146648.17', + 'ENST00000275493', + 'ENST00000275493.6', ], ), ] diff --git a/tests/test_graphkb/test_genes.py b/tests/test_graphkb/test_genes.py index fd97ffcd..90efe5d4 100644 --- a/tests/test_graphkb/test_genes.py +++ b/tests/test_graphkb/test_genes.py @@ -21,100 +21,100 @@ ) from pori_python.graphkb.util import get_rid -EXCLUDE_INTEGRATION_TESTS = os.environ.get("EXCLUDE_INTEGRATION_TESTS") == "1" 
-EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" -EXCLUDE_ONCOKB_TESTS = os.environ.get("EXCLUDE_ONCOKB_TESTS") == "1" +EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1' +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' +EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1' -CANONICAL_ONCOGENES = ["kras", "nras", "alk"] -CANONICAL_TS = ["cdkn2a", "tp53"] -CANONICAL_CG = ["alb"] -CANONICAL_FUSION_GENES = ["alk", "ewsr1", "fli1"] -CANONICAL_STRUCTURAL_VARIANT_GENES = ["brca1", "dpyd", "pten"] -CANNONICAL_THERAPY_GENES = ["erbb2", "brca2", "egfr"] +CANONICAL_ONCOGENES = ['kras', 'nras', 'alk'] +CANONICAL_TS = ['cdkn2a', 'tp53'] +CANONICAL_CG = ['alb'] +CANONICAL_FUSION_GENES = ['alk', 'ewsr1', 'fli1'] +CANONICAL_STRUCTURAL_VARIANT_GENES = ['brca1', 'dpyd', 'pten'] +CANNONICAL_THERAPY_GENES = ['erbb2', 'brca2', 'egfr'] PHARMACOGENOMIC_INITIAL_GENES = [ - "ACYP2", - "CEP72", + 'ACYP2', + 'CEP72', # 'CYP26B1', # defined as hgvsGenomic chr2:g.233760235_233760235nc_000002.12:g.233760235ta[7]>ta[8] - "DPYD", - "NUDT15", - "RARG", - "SLC28A3", - "TPMT", - "UGT1A6", + 'DPYD', + 'NUDT15', + 'RARG', + 'SLC28A3', + 'TPMT', + 'UGT1A6', ] CANCER_PREDISP_INITIAL_GENES = [ - "AKT1", - "APC", - "ATM", - "AXIN2", - "BAP1", - "BLM", - "BMPR1A", - "BRCA1", - "BRCA2", - "BRIP1", - "CBL", - "CDH1", - "CDK4", - "CDKN2A", - "CHEK2", - "DICER1", - "EGFR", - "EPCAM", - "ETV6", - "EZH2", - "FH", - "FLCN", - "GATA2", - "HRAS", - "KIT", - "MEN1", - "MET", - "MLH1", - "MSH2", - "MSH6", - "MUTYH", - "NBN", - "NF1", - "PALB2", - "PDGFRA", - "PMS2", - "PTCH1", - "PTEN", - "PTPN11", - "RAD51C", - "RAD51D", - "RB1", - "RET", - "RUNX1", - "SDHA", - "SDHB", - "SDHC", - "SDHD", - "SMAD4", - "SMARCA4", - "STK11", - "TP53", - "TSC1", - "TSC2", - "VHL", - "WT1", + 'AKT1', + 'APC', + 'ATM', + 'AXIN2', + 'BAP1', + 'BLM', + 'BMPR1A', + 'BRCA1', + 'BRCA2', + 'BRIP1', + 'CBL', + 'CDH1', + 'CDK4', + 'CDKN2A', + 'CHEK2', + 'DICER1', + 'EGFR', + 'EPCAM', + 'ETV6', + 'EZH2', + 'FH', + 'FLCN', + 'GATA2', + 'HRAS', + 'KIT', + 'MEN1', + 'MET', + 'MLH1', + 'MSH2', + 'MSH6', + 'MUTYH', + 'NBN', + 'NF1', + 'PALB2', + 'PDGFRA', + 'PMS2', + 'PTCH1', + 'PTEN', + 'PTPN11', + 'RAD51C', + 'RAD51D', + 'RB1', + 'RET', + 'RUNX1', + 'SDHA', + 'SDHB', + 'SDHC', + 'SDHD', + 'SMAD4', + 'SMARCA4', + 'STK11', + 'TP53', + 'TSC1', + 'TSC2', + 'VHL', + 'WT1', ] -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def conn(): conn = GraphKBConnection() - conn.login(os.environ["GRAPHKB_USER"], os.environ["GRAPHKB_PASS"]) + conn.login(os.environ['GRAPHKB_USER'], os.environ['GRAPHKB_PASS']) return conn -@pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason="excluding tests that depend on oncokb data") +@pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding tests that depend on oncokb data') def test_oncogene(conn): result = get_oncokb_oncogenes(conn) - names = {row["name"] for row in result} + names = {row['name'] for row in result} for gene in CANONICAL_ONCOGENES: assert gene in names for gene in CANONICAL_TS: @@ -123,10 +123,10 @@ def test_oncogene(conn): assert gene not in names -@pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason="excluding tests that depend on oncokb data") +@pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding tests that depend on oncokb data') def test_tumour_supressors(conn): result = get_oncokb_tumour_supressors(conn) - names = {row["name"] for row in result} + names = {row['name'] for row in result} for gene in CANONICAL_TS: assert gene in names 
for gene in CANONICAL_ONCOGENES: @@ -135,13 +135,13 @@ def test_tumour_supressors(conn): assert gene not in names -@pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason="excluding tests that depend on oncokb data") +@pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding tests that depend on oncokb data') @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="excluding BCGSC-specific tests (tso500 not available)" + EXCLUDE_BCGSC_TESTS, reason='excluding BCGSC-specific tests (tso500 not available)' ) def test_cancer_genes(conn): result = get_cancer_genes(conn) - names = {row["name"] for row in result} + names = {row['name'] for row in result} for gene in CANONICAL_CG: assert gene in names for gene in CANONICAL_TS: @@ -151,104 +151,104 @@ def test_cancer_genes(conn): @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="excluding BCGSC-specific tests (requires CGL loader))" + EXCLUDE_BCGSC_TESTS, reason='excluding BCGSC-specific tests (requires CGL loader))' ) def test_get_pharmacogenomic_info(conn): genes, matches = get_pharmacogenomic_info(conn) for gene in PHARMACOGENOMIC_INITIAL_GENES: - assert gene in genes, f"{gene} not found in get_pharmacogenomic_info" + assert gene in genes, f'{gene} not found in get_pharmacogenomic_info' for rid, variant_display in matches.items(): if variant_display.startswith(gene): break else: # no break called # failing on this version of the func; addressed in 'new' version - if gene == "ACYP2": + if gene == 'ACYP2': continue - assert False, f"No rid found for a pharmacogenomic with {gene}" + assert False, f'No rid found for a pharmacogenomic with {gene}' @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="excluding BCGSC-specific tests (requires CGL loader))" + EXCLUDE_BCGSC_TESTS, reason='excluding BCGSC-specific tests (requires CGL loader))' ) def test_get_gene_linked_pharmacogenomic_info(conn): genes, matches = get_gene_linked_pharmacogenomic_info(conn) for gene in PHARMACOGENOMIC_INITIAL_GENES: - assert gene in genes, f"{gene} not found in get_pharmacogenomic_info" + assert gene in genes, f'{gene} not found in get_pharmacogenomic_info' for rid, variant_info in matches.items(): variant_gene_assoc = variant_info[1] if gene in variant_gene_assoc: break else: # no break called - assert False, f"No rid found for a pharmacogenomic with {gene}" + assert False, f'No rid found for a pharmacogenomic with {gene}' -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="excluding BCGSC-specific tests (requires CGL loader))" + EXCLUDE_BCGSC_TESTS, reason='excluding BCGSC-specific tests (requires CGL loader))' ) def test_get_cancer_predisposition_info(conn): genes, matches = get_cancer_predisposition_info(conn) for gene in CANCER_PREDISP_INITIAL_GENES: - assert gene in genes, f"{gene} not found in get_cancer_predisposition_info" + assert gene in genes, f'{gene} not found in get_cancer_predisposition_info' -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="excluding BCGSC-specific tests (requires CGL loader))" + EXCLUDE_BCGSC_TESTS, reason='excluding BCGSC-specific tests (requires CGL loader))' ) def test_get_gene_linked_cancer_predisposition_info(conn): genes, matches = 
get_gene_linked_cancer_predisposition_info(conn) for gene in CANCER_PREDISP_INITIAL_GENES: - assert gene in genes, f"{gene} not found in get_cancer_predisposition_info" + assert gene in genes, f'{gene} not found in get_cancer_predisposition_info' @pytest.mark.parametrize( - "alt_rep", ("NM_033360.4", "NM_033360", "ENSG00000133703.11", "ENSG00000133703") + 'alt_rep', ('NM_033360.4', 'NM_033360', 'ENSG00000133703.11', 'ENSG00000133703') ) def test_get_preferred_gene_name_kras(alt_rep, conn): gene_name = get_preferred_gene_name(conn, alt_rep) - assert ( - "KRAS" == gene_name - ), f"Expected KRAS as preferred gene name for {alt_rep}, not '{gene_name}'" + assert 'KRAS' == gene_name, ( + f"Expected KRAS as preferred gene name for {alt_rep}, not '{gene_name}'" + ) @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="excluding BCGSC-specific tests (requires CGL loader))" + EXCLUDE_BCGSC_TESTS, reason='excluding BCGSC-specific tests (requires CGL loader))' ) -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') def test_find_genes_by_variant_type_structural_variant(conn): - result = get_genes_from_variant_types(conn, ["structural variant"]) - names = {row["name"] for row in result} + result = get_genes_from_variant_types(conn, ['structural variant']) + names = {row['name'] for row in result} for gene in CANONICAL_STRUCTURAL_VARIANT_GENES: - assert gene in names, f"{gene} was not identified as a structural variant gene." + assert gene in names, f'{gene} was not identified as a structural variant gene.' -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') def test_find_no_genes_by_variant_type_with_nonmatching_source_record_id(conn): - refseq_id = get_rid(conn, target="source", name="refseq") + refseq_id = get_rid(conn, target='source', name='refseq') result = get_genes_from_variant_types( - conn, ["structural variant"], source_record_ids=[refseq_id] + conn, ['structural variant'], source_record_ids=[refseq_id] ) assert not result -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') def test_get_therapeutic_associated_genes(conn): gene_list = get_therapeutic_associated_genes(graphkb_conn=conn) - assert gene_list, "No get_therapeutic_associated_genes found" - assert ( - len(gene_list) > 300 - ), f"Expected over 300 get_therapeutic_associated_genes but found {len(gene_list)}" - names = {row["name"] for row in gene_list} + assert gene_list, 'No get_therapeutic_associated_genes found' + assert len(gene_list) > 300, ( + f'Expected over 300 get_therapeutic_associated_genes but found {len(gene_list)}' + ) + names = {row['name'] for row in gene_list} for gene in CANNONICAL_THERAPY_GENES + CANONICAL_ONCOGENES + CANONICAL_TS: - assert gene in names, f"{gene} not found by get_therapeutic_associated_genes" + assert gene in names, f'{gene} not found by get_therapeutic_associated_genes' @pytest.mark.skipif( EXCLUDE_BCGSC_TESTS, - reason="excluding BCGSC-specific tests (requires oncokb and other loaders))", + reason='excluding BCGSC-specific tests (requires oncokb and other loaders))', ) -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") 
+@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') def test_get_gene_information(conn): gene_info = get_gene_information( conn, @@ -258,36 +258,36 @@ def test_get_gene_information(conn): + CANONICAL_FUSION_GENES + CANONICAL_STRUCTURAL_VARIANT_GENES + CANNONICAL_THERAPY_GENES - + ["notagenename"], + + ['notagenename'], ) assert gene_info - nongene_flagged = [g["name"] for g in gene_info if g["name"] == "notagenename"] - assert not nongene_flagged, f"Improper gene category: {nongene_flagged}" + nongene_flagged = [g['name'] for g in gene_info if g['name'] == 'notagenename'] + assert not nongene_flagged, f'Improper gene category: {nongene_flagged}' for gene in CANONICAL_ONCOGENES: - assert gene in [ - g["name"] for g in gene_info if g.get("oncogene") - ], f"Missed oncogene {gene}" + assert gene in [g['name'] for g in gene_info if g.get('oncogene')], ( + f'Missed oncogene {gene}' + ) for gene in CANONICAL_TS: - assert gene in [ - g["name"] for g in gene_info if g.get("tumourSuppressor") - ], f"Missed 'tumourSuppressor' {gene}" + assert gene in [g['name'] for g in gene_info if g.get('tumourSuppressor')], ( + f"Missed 'tumourSuppressor' {gene}" + ) for gene in CANONICAL_FUSION_GENES: - assert gene in [ - g["name"] for g in gene_info if g.get("knownFusionPartner") - ], f"Missed knownFusionPartner {gene}" + assert gene in [g['name'] for g in gene_info if g.get('knownFusionPartner')], ( + f'Missed knownFusionPartner {gene}' + ) for gene in CANONICAL_STRUCTURAL_VARIANT_GENES: - assert gene in [ - g["name"] for g in gene_info if g.get("knownSmallMutation") - ], f"Missed knownSmallMutation {gene}" + assert gene in [g['name'] for g in gene_info if g.get('knownSmallMutation')], ( + f'Missed knownSmallMutation {gene}' + ) for gene in CANNONICAL_THERAPY_GENES: - assert gene in [ - g["name"] for g in gene_info if g.get("therapeuticAssociated") - ], f"Missed therapeuticAssociated {gene}" + assert gene in [g['name'] for g in gene_info if g.get('therapeuticAssociated')], ( + f'Missed therapeuticAssociated {gene}' + ) for gene in ( CANONICAL_ONCOGENES @@ -296,11 +296,11 @@ def test_get_gene_information(conn): + CANONICAL_STRUCTURAL_VARIANT_GENES + CANNONICAL_THERAPY_GENES ): - assert gene in [ - g["name"] for g in gene_info if g.get("kbStatementRelated") - ], f"Missed kbStatementRelated {gene}" + assert gene in [g['name'] for g in gene_info if g.get('kbStatementRelated')], ( + f'Missed kbStatementRelated {gene}' + ) for gene in CANONICAL_CG: - assert gene in [ - g["name"] for g in gene_info if g.get("cancerGeneListMatch") - ], f"Missed cancerGeneListMatch {gene}" + assert gene in [g['name'] for g in gene_info if g.get('cancerGeneListMatch')], ( + f'Missed cancerGeneListMatch {gene}' + ) diff --git a/tests/test_graphkb/test_graphkb.py b/tests/test_graphkb/test_graphkb.py index 5a7e48fe..88158ac0 100644 --- a/tests/test_graphkb/test_graphkb.py +++ b/tests/test_graphkb/test_graphkb.py @@ -7,26 +7,26 @@ def test_login_ok(): conn = GraphKBConnection() - conn.login(os.environ["GRAPHKB_USER"], os.environ["GRAPHKB_PASS"]) + conn.login(os.environ['GRAPHKB_USER'], os.environ['GRAPHKB_PASS']) assert conn.token is not None -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def conn(): conn = GraphKBConnection() - conn.login(os.environ["GRAPHKB_USER"], os.environ["GRAPHKB_PASS"]) + conn.login(os.environ['GRAPHKB_USER'], os.environ['GRAPHKB_PASS']) return conn class TestPaginate: - @mock.patch("pori_python.graphkb.GraphKBConnection.request") + 
@mock.patch('pori_python.graphkb.GraphKBConnection.request') def test_does_not_paginate_when_false(self, graphkb_request, conn): - graphkb_request.side_effect = [{"result": [1, 2, 3]}, {"result": [4, 5]}] + graphkb_request.side_effect = [{'result': [1, 2, 3]}, {'result': [4, 5]}] result = conn.query({}, paginate=False, limit=3) assert result == [1, 2, 3] - @mock.patch("pori_python.graphkb.GraphKBConnection.request") + @mock.patch('pori_python.graphkb.GraphKBConnection.request') def test_paginates_by_default(self, graphkb_request, conn): - graphkb_request.side_effect = [{"result": [1, 2, 3]}, {"result": [4, 5]}] + graphkb_request.side_effect = [{'result': [1, 2, 3]}, {'result': [4, 5]}] result = conn.query({}, paginate=True, limit=3) assert result == [1, 2, 3, 4, 5] diff --git a/tests/test_graphkb/test_match.py b/tests/test_graphkb/test_match.py index d4ee5679..7a34b900 100644 --- a/tests/test_graphkb/test_match.py +++ b/tests/test_graphkb/test_match.py @@ -15,53 +15,53 @@ # Test datasets from .data import ensemblProteinSample, structuralVariants -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" -EXCLUDE_INTEGRATION_TESTS = os.environ.get("EXCLUDE_INTEGRATION_TESTS") == "1" +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' +EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1' -INCREASE_PREFIXES = ["up", "increase", "over", "gain", "amp"] -DECREASE_PREFIXES = ["down", "decrease", "reduce", "under", "loss", "delet"] -GENERAL_MUTATION = "mutation" +INCREASE_PREFIXES = ['up', 'increase', 'over', 'gain', 'amp'] +DECREASE_PREFIXES = ['down', 'decrease', 'reduce', 'under', 'loss', 'delet'] +GENERAL_MUTATION = 'mutation' def has_prefix(word: str, prefixes: List[str]) -> bool: for prefix in prefixes: - if re.search(r"\b" + prefix, word): + if re.search(r'\b' + prefix, word): return True return False -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def conn() -> GraphKBConnection: conn = GraphKBConnection() - conn.login(os.environ["GRAPHKB_USER"], os.environ["GRAPHKB_PASS"]) + conn.login(os.environ['GRAPHKB_USER'], os.environ['GRAPHKB_PASS']) return conn -@pytest.fixture(scope="class") +@pytest.fixture(scope='class') def kras(conn): - return [f["displayName"] for f in match.get_equivalent_features(conn, "kras")] + return [f['displayName'] for f in match.get_equivalent_features(conn, 'kras')] """ version found in the db for ENSG00000133703 will vary depending on which version of ensembl was loaded. checking for any . 
version """ -kras_ensg_version = r"ENSG00000133703\..*" +kras_ensg_version = r'ENSG00000133703\..*' class TestGetEquivalentFeatures: def test_kras_has_self(self, kras): - assert "KRAS" in kras + assert 'KRAS' in kras def test_expands_aliases(self, kras): - assert "KRAS2" in kras + assert 'KRAS2' in kras def test_expands_elements(self, kras): - assert "NM_033360" in kras - assert "ENST00000311936" in kras + assert 'NM_033360' in kras + assert 'ENST00000311936' in kras def test_expands_generalizations(self, kras): - assert "NM_033360.4" in kras + assert 'NM_033360.4' in kras ensg_version_found = False for item in kras: if re.match(kras_ensg_version, item): @@ -69,47 +69,47 @@ def test_expands_generalizations(self, kras): assert ensg_version_found def test_expands_generalizations_kras(self, kras): - assert "NM_033360.4" in kras - assert "NM_033360" in kras + assert 'NM_033360.4' in kras + assert 'NM_033360' in kras ensg_version_found = False for item in kras: if re.match(kras_ensg_version, item): ensg_version_found = True assert ensg_version_found - assert "ENSG00000133703" in kras + assert 'ENSG00000133703' in kras @pytest.mark.parametrize( - "alt_rep", ("NM_033360.4", "NM_033360", "ENSG00000133703.11", "ENSG00000133703") + 'alt_rep', ('NM_033360.4', 'NM_033360', 'ENSG00000133703.11', 'ENSG00000133703') ) def test_expands_generalizations_refseq(self, alt_rep, conn): - kras = [f["displayName"] for f in match.get_equivalent_features(conn, alt_rep)] - assert "NM_033360.4" in kras - assert "NM_033360" in kras + kras = [f['displayName'] for f in match.get_equivalent_features(conn, alt_rep)] + assert 'NM_033360.4' in kras + assert 'NM_033360' in kras ensg_version_found = False for item in kras: if re.match(kras_ensg_version, item): ensg_version_found = True assert ensg_version_found - assert "ENSG00000133703" in kras + assert 'ENSG00000133703' in kras def test_checks_by_source_id_kras(self, conn): kras = [ - f["displayName"] + f['displayName'] for f in match.get_equivalent_features( - conn, "nm_033360", source="refseq", source_id_version="4", is_source_id=True + conn, 'nm_033360', source='refseq', source_id_version='4', is_source_id=True ) ] - assert "KRAS" in kras + assert 'KRAS' in kras # KBDEV-1163 # Testing if the addition of Ensembl protein Features are limiting results # returned by get_equivalent_features() since SimilatTo queryType queries # aren't traversing the graph to it's whole depth. 
- @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding data-specific test") + @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding data-specific test') def test_ensembl_protein(self, conn): for feature, expected in ensemblProteinSample: equivalent_features = match.get_equivalent_features(conn, feature) - equivalent_features = [el["displayName"] for el in equivalent_features] + equivalent_features = [el['displayName'] for el in equivalent_features] for equivalent_feature in expected: assert equivalent_feature in equivalent_features @@ -117,46 +117,46 @@ def test_ensembl_protein(self, conn): class TestMatchCopyVariant: def test_bad_category(self, conn): with pytest.raises(ValueError): - match.match_copy_variant(conn, "kras", "not a copy number") + match.match_copy_variant(conn, 'kras', 'not a copy number') def test_bad_gene_name(self, conn): with pytest.raises(FeatureNotFoundError): - match.match_copy_variant(conn, "not a real gene name", match.INPUT_COPY_CATEGORIES.AMP) + match.match_copy_variant(conn, 'not a real gene name', match.INPUT_COPY_CATEGORIES.AMP) @pytest.mark.skipif( EXCLUDE_BCGSC_TESTS, - reason="excluding BCGSC-specific tests - no copy loss variants in other data", + reason='excluding BCGSC-specific tests - no copy loss variants in other data', ) def test_known_loss(self, conn): - matches = match.match_copy_variant(conn, "CDKN2A", match.INPUT_COPY_CATEGORIES.ANY_LOSS) + matches = match.match_copy_variant(conn, 'CDKN2A', match.INPUT_COPY_CATEGORIES.ANY_LOSS) assert matches - types_selected = {record["type"]["name"] for record in matches} - zygositys = {record["zygosity"] for record in matches} + types_selected = {record['type']['name'] for record in matches} + zygositys = {record['zygosity'] for record in matches} assert match.INPUT_COPY_CATEGORIES.ANY_LOSS in types_selected assert match.INPUT_COPY_CATEGORIES.AMP not in types_selected assert GENERAL_MUTATION not in types_selected - assert "homozygous" in zygositys + assert 'homozygous' in zygositys for variant_type in types_selected: assert not has_prefix(variant_type, INCREASE_PREFIXES) @pytest.mark.skipif( EXCLUDE_BCGSC_TESTS, - reason="excluding BCGSC-specific tests - no copy loss variants in other data", + reason='excluding BCGSC-specific tests - no copy loss variants in other data', ) def test_known_loss_zygosity_filtered(self, conn): matches = match.match_copy_variant( - conn, "CDKN2A", match.INPUT_COPY_CATEGORIES.ANY_LOSS, True + conn, 'CDKN2A', match.INPUT_COPY_CATEGORIES.ANY_LOSS, True ) assert matches - types_selected = {record["type"]["name"] for record in matches} - zygositys = {record["zygosity"] for record in matches} + types_selected = {record['type']['name'] for record in matches} + zygositys = {record['zygosity'] for record in matches} - assert "homozygous" not in zygositys + assert 'homozygous' not in zygositys assert GENERAL_MUTATION not in types_selected assert match.INPUT_COPY_CATEGORIES.ANY_LOSS in types_selected @@ -166,10 +166,10 @@ def test_known_loss_zygosity_filtered(self, conn): assert not has_prefix(variant_type, INCREASE_PREFIXES) def test_known_gain(self, conn): - matches = match.match_copy_variant(conn, "KRAS", "copy gain") + matches = match.match_copy_variant(conn, 'KRAS', 'copy gain') assert matches - types_selected = {record["type"]["name"] for record in matches} + types_selected = {record['type']['name'] for record in matches} assert GENERAL_MUTATION not in types_selected assert match.INPUT_COPY_CATEGORIES.AMP in types_selected @@ -179,12 +179,12 @@ def 
test_known_gain(self, conn): assert not has_prefix(variant_type, DECREASE_PREFIXES) @pytest.mark.skipif( - EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" + EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests' ) def test_low_gain_excludes_amplification(self, conn): - matches = match.match_copy_variant(conn, "KRAS", match.INPUT_COPY_CATEGORIES.GAIN) + matches = match.match_copy_variant(conn, 'KRAS', match.INPUT_COPY_CATEGORIES.GAIN) - types_selected = {record["type"]["name"] for record in matches} + types_selected = {record['type']['name'] for record in matches} assert match.INPUT_COPY_CATEGORIES.AMP not in types_selected assert match.INPUT_COPY_CATEGORIES.LOSS not in types_selected @@ -194,49 +194,49 @@ def test_low_gain_excludes_amplification(self, conn): assert not has_prefix(variant_type, DECREASE_PREFIXES) -@pytest.mark.parametrize("pos1,pos2_start,pos2_end", [[3, 2, 5], [2, None, 5], [3, 2, None]]) +@pytest.mark.parametrize('pos1,pos2_start,pos2_end', [[3, 2, 5], [2, None, 5], [3, 2, None]]) def test_range_overlap(pos1, pos2_start, pos2_end): - assert match.positions_overlap({"pos": pos1}, {"pos": pos2_start}, {"pos": pos2_end}) + assert match.positions_overlap({'pos': pos1}, {'pos': pos2_start}, {'pos': pos2_end}) @pytest.mark.parametrize( - "pos1,pos2_start,pos2_end", + 'pos1,pos2_start,pos2_end', [[2, 4, 5], [5, 2, 3], [10, None, 9], [10, 11, None], [1, 2, 2], [2, 1, 1]], ) def test_range_not_overlap(pos1, pos2_start, pos2_end): - assert not match.positions_overlap({"pos": pos1}, {"pos": pos2_start}, {"pos": pos2_end}) + assert not match.positions_overlap({'pos': pos1}, {'pos': pos2_start}, {'pos': pos2_end}) -@pytest.mark.parametrize("pos1", [None, 1]) -@pytest.mark.parametrize("pos2", [None, 1]) +@pytest.mark.parametrize('pos1', [None, 1]) +@pytest.mark.parametrize('pos2', [None, 1]) def test_position_match(pos1, pos2): - assert match.positions_overlap({"pos": pos1}, {"pos": pos2}) + assert match.positions_overlap({'pos': pos1}, {'pos': pos2}) class TestMatchExpressionVariant: def test_bad_category(self, conn): with pytest.raises(ValueError): - match.match_expression_variant(conn, "PTEN", "not a expression category") + match.match_expression_variant(conn, 'PTEN', 'not a expression category') def test_bad_gene_name(self, conn): with pytest.raises(FeatureNotFoundError): match.match_expression_variant( - conn, "not a real gene name", match.INPUT_EXPRESSION_CATEGORIES.UP + conn, 'not a real gene name', match.INPUT_EXPRESSION_CATEGORIES.UP ) @pytest.mark.skipif( - EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" + EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests' ) @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="variants in db for this test are from IPRKB and ESMO" + EXCLUDE_BCGSC_TESTS, reason='variants in db for this test are from IPRKB and ESMO' ) def test_known_reduced_expression(self, conn): matches = match.match_expression_variant( - conn, "PTEN", match.INPUT_EXPRESSION_CATEGORIES.DOWN + conn, 'PTEN', match.INPUT_EXPRESSION_CATEGORIES.DOWN ) assert matches - types_selected = {record["type"]["name"] for record in matches} + types_selected = {record['type']['name'] for record in matches} assert match.INPUT_EXPRESSION_CATEGORIES.UP not in types_selected assert GENERAL_MUTATION not in types_selected @@ -245,16 +245,16 @@ def test_known_reduced_expression(self, conn): assert not has_prefix(variant_type, INCREASE_PREFIXES) @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, 
reason="excluding BCGSC-specific tests - no applicable variants" + EXCLUDE_BCGSC_TESTS, reason='excluding BCGSC-specific tests - no applicable variants' ) def test_known_reduced_expression_gene_id(self, conn): - gene_id = conn.query({"target": "Feature", "filters": [{"name": "PTEN"}]})[0]["@rid"] + gene_id = conn.query({'target': 'Feature', 'filters': [{'name': 'PTEN'}]})[0]['@rid'] matches = match.match_expression_variant( conn, gene_id, match.INPUT_EXPRESSION_CATEGORIES.DOWN ) assert matches - types_selected = {record["type"]["name"] for record in matches} + types_selected = {record['type']['name'] for record in matches} assert match.INPUT_EXPRESSION_CATEGORIES.UP not in types_selected assert GENERAL_MUTATION not in types_selected @@ -263,13 +263,13 @@ def test_known_reduced_expression_gene_id(self, conn): assert not has_prefix(variant_type, INCREASE_PREFIXES) @pytest.mark.skipif( - EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" + EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests' ) def test_known_increased_expression(self, conn): - matches = match.match_expression_variant(conn, "CA9", match.INPUT_EXPRESSION_CATEGORIES.UP) + matches = match.match_expression_variant(conn, 'CA9', match.INPUT_EXPRESSION_CATEGORIES.UP) assert matches - types_selected = {record["type"]["name"] for record in matches} + types_selected = {record['type']['name'] for record in matches} assert match.INPUT_EXPRESSION_CATEGORIES.UP not in types_selected assert GENERAL_MUTATION not in types_selected @@ -281,115 +281,115 @@ def test_known_increased_expression(self, conn): class TestComparePositionalVariants: def test_nonspecific_altseq(self): assert match.compare_positional_variants( - conn, {"break1Start": {"pos": 1}}, {"break1Start": {"pos": 1}} + conn, {'break1Start': {'pos': 1}}, {'break1Start': {'pos': 1}} ) # null matches anything assert match.compare_positional_variants( - conn, {"break1Start": {"pos": 1}, "untemplatedSeq": "T"}, {"break1Start": {"pos": 1}} + conn, {'break1Start': {'pos': 1}, 'untemplatedSeq': 'T'}, {'break1Start': {'pos': 1}} ) assert match.compare_positional_variants( - conn, {"break1Start": {"pos": 1}}, {"break1Start": {"pos": 1}, "untemplatedSeq": "T"} + conn, {'break1Start': {'pos': 1}}, {'break1Start': {'pos': 1}, 'untemplatedSeq': 'T'} ) - @pytest.mark.parametrize("seq1", ["T", "X", "?"]) - @pytest.mark.parametrize("seq2", ["T", "X", "?"]) + @pytest.mark.parametrize('seq1', ['T', 'X', '?']) + @pytest.mark.parametrize('seq2', ['T', 'X', '?']) def test_ambiguous_altseq(self, seq1, seq2): # ambiguous AA matches anything the same length assert match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "untemplatedSeq": seq1}, - {"break1Start": {"pos": 1}, "untemplatedSeq": seq2}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': seq1}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': seq2}, ) def test_altseq_length_mismatch(self): assert not match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "untemplatedSeq": "??"}, - {"break1Start": {"pos": 1}, "untemplatedSeq": "T"}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': '??'}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': 'T'}, ) assert not match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "untemplatedSeq": "?"}, - {"break1Start": {"pos": 1}, "untemplatedSeq": "TT"}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': '?'}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': 'TT'}, ) def test_nonspecific_refseq(self): # null matches anything assert 
match.compare_positional_variants( - conn, {"break1Start": {"pos": 1}, "refSeq": "T"}, {"break1Start": {"pos": 1}} + conn, {'break1Start': {'pos': 1}, 'refSeq': 'T'}, {'break1Start': {'pos': 1}} ) assert match.compare_positional_variants( - conn, {"break1Start": {"pos": 1}}, {"break1Start": {"pos": 1}, "refSeq": "T"} + conn, {'break1Start': {'pos': 1}}, {'break1Start': {'pos': 1}, 'refSeq': 'T'} ) - @pytest.mark.parametrize("seq1", ["T", "X", "?"]) - @pytest.mark.parametrize("seq2", ["T", "X", "?"]) + @pytest.mark.parametrize('seq1', ['T', 'X', '?']) + @pytest.mark.parametrize('seq2', ['T', 'X', '?']) def test_ambiguous_refseq(self, seq1, seq2): # ambiguous AA matches anything the same length assert match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "refSeq": seq1}, - {"break1Start": {"pos": 1}, "refSeq": seq2}, + {'break1Start': {'pos': 1}, 'refSeq': seq1}, + {'break1Start': {'pos': 1}, 'refSeq': seq2}, ) def test_refseq_length_mismatch(self): assert not match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "refSeq": "??"}, - {"break1Start": {"pos": 1}, "refSeq": "T"}, + {'break1Start': {'pos': 1}, 'refSeq': '??'}, + {'break1Start': {'pos': 1}, 'refSeq': 'T'}, ) assert not match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "refSeq": "?"}, - {"break1Start": {"pos": 1}, "refSeq": "TT"}, + {'break1Start': {'pos': 1}, 'refSeq': '?'}, + {'break1Start': {'pos': 1}, 'refSeq': 'TT'}, ) def test_diff_altseq(self): assert not match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "untemplatedSeq": "M"}, - {"break1Start": {"pos": 1}, "untemplatedSeq": "R"}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': 'M'}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': 'R'}, ) def test_same_altseq_matches(self): assert match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "untemplatedSeq": "R"}, - {"break1Start": {"pos": 1}, "untemplatedSeq": "R"}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': 'R'}, + {'break1Start': {'pos': 1}, 'untemplatedSeq': 'R'}, ) def test_diff_refseq(self): assert not match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "refSeq": "M"}, - {"break1Start": {"pos": 1}, "refSeq": "R"}, + {'break1Start': {'pos': 1}, 'refSeq': 'M'}, + {'break1Start': {'pos': 1}, 'refSeq': 'R'}, ) def test_same_refseq_matches(self): assert match.compare_positional_variants( conn, - {"break1Start": {"pos": 1}, "refSeq": "R"}, - {"break1Start": {"pos": 1}, "refSeq": "R"}, + {'break1Start': {'pos': 1}, 'refSeq': 'R'}, + {'break1Start': {'pos': 1}, 'refSeq': 'R'}, ) def test_range_vs_sub(self): sub = { - "break1Repr": "p.G776", - "break1Start": {"@Class": "ProteinPosition", "pos": 776, "refAA": "G"}, - "break2Repr": "p.V777", - "break2Start": {"@Class": "ProteinPosition", "pos": 777, "refAA": "V"}, - "reference1": "ERBB2", - "type": "insertion", - "untemplatedSeq": "YVMA", - "untemplatedSeqSize": 4, + 'break1Repr': 'p.G776', + 'break1Start': {'@Class': 'ProteinPosition', 'pos': 776, 'refAA': 'G'}, + 'break2Repr': 'p.V777', + 'break2Start': {'@Class': 'ProteinPosition', 'pos': 777, 'refAA': 'V'}, + 'reference1': 'ERBB2', + 'type': 'insertion', + 'untemplatedSeq': 'YVMA', + 'untemplatedSeqSize': 4, } range_variant = { - "break1Repr": "p.G776", - "break1Start": {"@Class": "ProteinPosition", "pos": 776, "refAA": "G"}, - "break2Repr": "p.?776", - "break2Start": None, - "refSeq": "G", - "untemplatedSeq": "VV", + 'break1Repr': 'p.G776', + 'break1Start': {'@Class': 'ProteinPosition', 'pos': 776, 'refAA': 'G'}, + 
'break2Repr': 'p.?776', + 'break2Start': None, + 'refSeq': 'G', + 'untemplatedSeq': 'VV', } assert not match.compare_positional_variants(conn, sub, range_variant) assert not match.compare_positional_variants(conn, range_variant, sub) @@ -398,64 +398,64 @@ def test_range_vs_sub(self): class TestMatchPositionalVariant: def test_error_on_duplicate_reference1(self, conn): with pytest.raises(ValueError): - match.match_positional_variant(conn, "KRAS:p.G12D", "#123:34") + match.match_positional_variant(conn, 'KRAS:p.G12D', '#123:34') def test_error_on_bad_reference2(self, conn): with pytest.raises(ValueError): - match.match_positional_variant(conn, "KRAS:p.G12D", reference2="#123:34") + match.match_positional_variant(conn, 'KRAS:p.G12D', reference2='#123:34') def test_error_on_duplicate_reference2(self, conn): with pytest.raises(ValueError): match.match_positional_variant( - conn, "(BCR,ABL1):fusion(e.13,e.3)", reference2="#123:34" + conn, '(BCR,ABL1):fusion(e.13,e.3)', reference2='#123:34' ) def test_uncertain_position_not_supported(self, conn): with pytest.raises(NotImplementedError): - match.match_positional_variant(conn, "(BCR,ABL1):fusion(e.13_24,e.3)") + match.match_positional_variant(conn, '(BCR,ABL1):fusion(e.13_24,e.3)') def test_bad_gene_name(self, conn): with pytest.raises(FeatureNotFoundError): - match.match_positional_variant(conn, "ME-AS-A-GENE:p.G12D") + match.match_positional_variant(conn, 'ME-AS-A-GENE:p.G12D') def test_bad_gene2_name(self, conn): with pytest.raises(FeatureNotFoundError): - match.match_positional_variant(conn, "(BCR,ME-AS-A-GENE):fusion(e.13,e.3)") + match.match_positional_variant(conn, '(BCR,ME-AS-A-GENE):fusion(e.13,e.3)') def test_match_explicit_reference1(self, conn): - reference1 = conn.query({"target": "Feature", "filters": {"name": "KRAS"}})[0]["@rid"] - matches = match.match_positional_variant(conn, "p.G12D", reference1=reference1) + reference1 = conn.query({'target': 'Feature', 'filters': {'name': 'KRAS'}})[0]['@rid'] + matches = match.match_positional_variant(conn, 'p.G12D', reference1=reference1) assert matches @pytest.mark.skipif( - EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" + EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests' ) def test_match_explicit_references(self, conn): - reference1 = conn.query({"target": "Feature", "filters": {"name": "BCR"}})[0]["@rid"] - reference2 = conn.query({"target": "Feature", "filters": {"name": "ABL1"}})[0]["@rid"] + reference1 = conn.query({'target': 'Feature', 'filters': {'name': 'BCR'}})[0]['@rid'] + reference2 = conn.query({'target': 'Feature', 'filters': {'name': 'ABL1'}})[0]['@rid'] matches = match.match_positional_variant( - conn, "fusion(e.13,e.3)", reference1=reference1, reference2=reference2 + conn, 'fusion(e.13,e.3)', reference1=reference1, reference2=reference2 ) assert matches @pytest.mark.skipif( - EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" + EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests' ) @pytest.mark.parametrize( - "known_variant,related_variants,unrelated_variants", + 'known_variant,related_variants,unrelated_variants', [ - ["KRAS:p.G12D", ["KRAS:p.G12X", "chr12:g.25398284C>T"], ["KRAS:p.G12V"]], - ["KRAS:p.G13D", ["KRAS:p.?13mut"], []], - ["chr12:g.25398284C>T", ["KRAS:p.G12D"], ["KRAS:p.G12V"]], - ["EGFR:p.E746_S752delinsI", ["EGFR mutation"], ["EGFR copy variant"]], + ['KRAS:p.G12D', ['KRAS:p.G12X', 'chr12:g.25398284C>T'], ['KRAS:p.G12V']], + ['KRAS:p.G13D', 
['KRAS:p.?13mut'], []], + ['chr12:g.25398284C>T', ['KRAS:p.G12D'], ['KRAS:p.G12V']], + ['EGFR:p.E746_S752delinsI', ['EGFR mutation'], ['EGFR copy variant']], ], ) @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="TODO: fix loader for vars ending in X, p.?, copy variant" + EXCLUDE_BCGSC_TESTS, reason='TODO: fix loader for vars ending in X, p.?, copy variant' ) def test_known_variants(self, conn, known_variant, related_variants, unrelated_variants): matches = match.match_positional_variant(conn, known_variant) - names = {m["displayName"] for m in matches} + names = {m['displayName'] for m in matches} assert matches assert known_variant in names for variant in related_variants: @@ -464,103 +464,103 @@ def test_known_variants(self, conn, known_variant, related_variants, unrelated_v assert variant not in names @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="TODO: add nonIPRKB fusion tests; source for these is IPRKB" + EXCLUDE_BCGSC_TESTS, reason='TODO: add nonIPRKB fusion tests; source for these is IPRKB' ) @pytest.mark.parametrize( - "known_variant,related_variants", + 'known_variant,related_variants', [ - ["(BCR,ABL1):fusion(e.13,e.3)", ["BCR and ABL1 fusion"]], - ["(ATP1B1,NRG1):fusion(e.2,e.2)", ["NRG1 fusion", "ATP1B1 and NRG1 fusion"]], + ['(BCR,ABL1):fusion(e.13,e.3)', ['BCR and ABL1 fusion']], + ['(ATP1B1,NRG1):fusion(e.2,e.2)', ['NRG1 fusion', 'ATP1B1 and NRG1 fusion']], ], ) def test_known_fusions(self, conn, known_variant, related_variants): matches = match.match_positional_variant(conn, known_variant) - types_selected = [m["type"]["name"] for m in matches] + types_selected = [m['type']['name'] for m in matches] assert GENERAL_MUTATION not in types_selected - names = {m["displayName"] for m in matches} + names = {m['displayName'] for m in matches} assert matches assert known_variant in names for variant in related_variants: assert variant in names def test_known_fusion_single_gene_no_match(self, conn): - known = "(TERT,?):fusion(e.1,e.?)" + known = '(TERT,?):fusion(e.1,e.?)' matches = match.match_positional_variant(conn, known) assert not matches def test_novel_specific_matches_general(self, conn): - novel_specific = "CDKN2A:p.T18888888888888888888M" + novel_specific = 'CDKN2A:p.T18888888888888888888M' matches = match.match_positional_variant(conn, novel_specific) - names = {m["displayName"] for m in matches} + names = {m['displayName'] for m in matches} assert matches assert novel_specific not in names - assert "CDKN2A mutation" in names + assert 'CDKN2A mutation' in names @pytest.mark.skipif( - EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" + EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests' ) def test_genomic_coordinates(self, conn): - genomic = "X:g.100611165A>T" + genomic = 'X:g.100611165A>T' x = match.match_positional_variant(conn, genomic) assert x != [] # no assert b/c checking for no error rather than the result (but also want to confirm some result returned) @pytest.mark.skipif( - EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" + EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests' ) - @pytest.mark.skipif(EXCLUDE_BCGSC_TESTS, reason="source for this variant is IPRKB") + @pytest.mark.skipif(EXCLUDE_BCGSC_TESTS, reason='source for this variant is IPRKB') def test_tert_promoter(self, conn): - assert match.match_positional_variant(conn, "TERT:c.-124C>T") + assert match.match_positional_variant(conn, 'TERT:c.-124C>T') def test_wildtype_match_error(self, conn): - 
for gkb_match in match.match_positional_variant(conn, "TP53:p.E285K"):
-            assert (
-                "wildtype" not in gkb_match["displayName"]
-            ), f"TP53:p.E285K should not match {gkb_match['displayName']}"
+        for gkb_match in match.match_positional_variant(conn, 'TP53:p.E285K'):
+            assert 'wildtype' not in gkb_match['displayName'], (
+                f'TP53:p.E285K should not match {gkb_match["displayName"]}'
+            )

    @pytest.mark.skipif(
-        True, reason="GERO-303 - technically incorrect notation for GSC backwards compatibility."
+        True, reason='GERO-303 - technically incorrect notation for GSC backwards compatibility.'
    )
    def test_tert_promoter_leading_one_alt_notation(self, conn):
        # GERO-303 - technically this format is incorrect.
-        assert match.match_positional_variant(conn, "TERT:c.1-124C>T")
+        assert match.match_positional_variant(conn, 'TERT:c.1-124C>T')

    def test_missense_is_not_nonsense(self, conn):
        """GERO-299 - nonsense mutation creates a stop codon and is usually more severe."""
        # equivalent TP53 notations
-        genomic = "chr17:g.7674252C>T"
-        cds = "ENST00000269305:c.711G>A"
-        protein = "TP53:p.M237I"
+        genomic = 'chr17:g.7674252C>T'
+        cds = 'ENST00000269305:c.711G>A'
+        protein = 'TP53:p.M237I'
        for mut in (protein, genomic, cds):
            matches = match.match_positional_variant(conn, mut)
-            nonsense = [m for m in matches if "nonsense" in m["displayName"]]
-            assert (
-                not nonsense
-            ), f"Missense {mut} is not a nonsense variant: {((m['displayName'], m['@rid']) for m in nonsense)}"
+            nonsense = [m for m in matches if 'nonsense' in m['displayName']]
+            assert not nonsense, (
+                f'Missense {mut} is not a nonsense variant: {[(m["displayName"], m["@rid"]) for m in nonsense]}'
+            )

-    @pytest.mark.skipif(EXCLUDE_BCGSC_TESTS, reason="TODO: missing record for FGFR3 rearrangement")
+    @pytest.mark.skipif(EXCLUDE_BCGSC_TESTS, reason='TODO: missing record for FGFR3 rearrangement')
    def test_structural_variants(self, conn):
        """KBDEV-1056"""
        for variant_string, expected in structuralVariants.items():
            print(variant_string)
            # Querying matches for variant_string
            m = match.match_positional_variant(conn, variant_string)
-            MatchingDisplayNames = [el["displayName"] for el in m]
-            MatchingTypes = [el["type"]["name"] for el in m]
+            MatchingDisplayNames = [el['displayName'] for el in m]
+            MatchingTypes = [el['type']['name'] for el in m]
            # Match
-            for displayName in expected.get("matches", {}).get("displayName", []):
+            for displayName in expected.get('matches', {}).get('displayName', []):
                assert displayName in MatchingDisplayNames
-            for type in expected.get("matches", {}).get("type", []):
+            for type in expected.get('matches', {}).get('type', []):
                assert type in MatchingTypes
            # Does not match
            for displayName in MatchingDisplayNames:
-                assert displayName not in expected.get("does_not_matches", {}).get(
-                    "displayName", []
+                assert displayName not in expected.get('does_not_matches', {}).get(
+                    'displayName', []
                )
            for type in MatchingTypes:
-                assert type not in expected.get("does_not_matches", {}).get("type", [])
+                assert type not in expected.get('does_not_matches', {}).get('type', [])


class TestCacheMissingFeatures:
@@ -568,14 +568,14 @@ def test_filling_cache(self):
        mock_conn = MagicMock(
            query=MagicMock(
                return_value=[
-                    {"name": "bob", "sourceId": "alice"},
-                    {"name": "KRAS", "sourceId": "1234"},
+                    {'name': 'bob', 'sourceId': 'alice'},
+                    {'name': 'KRAS', 'sourceId': '1234'},
                ]
            )
        )
        match.cache_missing_features(mock_conn)
-        assert "kras" in match.FEATURES_CACHE
-        assert "alice" in match.FEATURES_CACHE
+        assert 'kras' in match.FEATURES_CACHE
+        assert 'alice'
in match.FEATURES_CACHE match.FEATURES_CACHE = None @@ -583,9 +583,9 @@ class TestTypeScreening: # Types as class variables default_type = DEFAULT_NON_STRUCTURAL_VARIANT_TYPE threshold = STRUCTURAL_VARIANT_SIZE_THRESHOLD - unambiguous_structural = ["fusion", "translocation"] - ambiguous_structural = ["duplication", "deletion", "insertion", "indel"] - non_structural = ["substitution", "missense", "nonsense", "frameshift", "truncating"] + unambiguous_structural = ['fusion', 'translocation'] + ambiguous_structural = ['duplication', 'deletion', 'insertion', 'indel'] + non_structural = ['substitution', 'missense', 'nonsense', 'frameshift', 'truncating'] def test_type_screening_update(self, conn, monkeypatch): # Monkey-patching get_terms_set() @@ -594,44 +594,44 @@ def mock_get_terms_set(graphkb_conn, base_terms): called = True return set() - monkeypatch.setattr("pori_python.graphkb.match.get_terms_set", mock_get_terms_set) + monkeypatch.setattr('pori_python.graphkb.match.get_terms_set', mock_get_terms_set) # Assert get_terms_set() has been called called = False - pori_python.graphkb.match.type_screening(conn, {"type": ""}, updateStructuralTypes=True) + pori_python.graphkb.match.type_screening(conn, {'type': ''}, updateStructuralTypes=True) assert called # Assert get_terms_set() has not been called (default behavior) called = False - pori_python.graphkb.match.type_screening(conn, {"type": ""}) + pori_python.graphkb.match.type_screening(conn, {'type': ''}) assert not called def test_type_screening_non_structural(self, conn): for type in TestTypeScreening.non_structural: # type substitution and alike - assert match.type_screening(conn, {"type": type}) == type + assert match.type_screening(conn, {'type': type}) == type def test_type_screening_structural(self, conn): for type in TestTypeScreening.unambiguous_structural: # type fusion and alike - assert match.type_screening(conn, {"type": type}) == type + assert match.type_screening(conn, {'type': type}) == type for type in TestTypeScreening.ambiguous_structural: # w/ reference2 - assert match.type_screening(conn, {"type": type, "reference2": "#123:45"}) == type + assert match.type_screening(conn, {'type': type, 'reference2': '#123:45'}) == type # w/ cytoband coordinates - assert match.type_screening(conn, {"type": type, "prefix": "y"}) == type + assert match.type_screening(conn, {'type': type, 'prefix': 'y'}) == type def test_type_screening_structural_ambiguous_size(self, conn): for type in TestTypeScreening.ambiguous_structural: # coordinate system with ambiguous size - for prefix in ["e", "i"]: + for prefix in ['e', 'i']: assert ( match.type_screening( conn, { - "type": type, - "break2Start": {"pos": TestTypeScreening.threshold}, - "prefix": prefix, + 'type': type, + 'break2Start': {'pos': TestTypeScreening.threshold}, + 'prefix': prefix, }, ) == TestTypeScreening.default_type @@ -642,14 +642,14 @@ def test_type_screening_structural_untemplatedSeqSize(self, conn): # Variation length too small (< threshold) assert ( match.type_screening( - conn, {"type": type, "untemplatedSeqSize": TestTypeScreening.threshold - 1} + conn, {'type': type, 'untemplatedSeqSize': TestTypeScreening.threshold - 1} ) == TestTypeScreening.default_type ) # Variation length big enough (>= threshold) assert ( match.type_screening( - conn, {"type": type, "untemplatedSeqSize": TestTypeScreening.threshold} + conn, {'type': type, 'untemplatedSeqSize': TestTypeScreening.threshold} ) == type ) @@ -658,32 +658,32 @@ def test_type_screening_structural_positions(self, conn): for type 
in TestTypeScreening.ambiguous_structural: # Variation length too small (< threshold) for opt in [ - {"break2Start": {"pos": TestTypeScreening.threshold - 1}}, - {"break2Start": {"pos": TestTypeScreening.threshold - 1}, "prefix": "c"}, - {"break2Start": {"pos": TestTypeScreening.threshold - 1}, "prefix": "g"}, - {"break2Start": {"pos": TestTypeScreening.threshold - 1}, "prefix": "n"}, - {"break2Start": {"pos": TestTypeScreening.threshold - 1}, "prefix": "r"}, - {"break2Start": {"pos": int(TestTypeScreening.threshold / 3) - 1}, "prefix": "p"}, + {'break2Start': {'pos': TestTypeScreening.threshold - 1}}, + {'break2Start': {'pos': TestTypeScreening.threshold - 1}, 'prefix': 'c'}, + {'break2Start': {'pos': TestTypeScreening.threshold - 1}, 'prefix': 'g'}, + {'break2Start': {'pos': TestTypeScreening.threshold - 1}, 'prefix': 'n'}, + {'break2Start': {'pos': TestTypeScreening.threshold - 1}, 'prefix': 'r'}, + {'break2Start': {'pos': int(TestTypeScreening.threshold / 3) - 1}, 'prefix': 'p'}, { - "break1Start": {"pos": 1 + 99}, - "break2Start": {"pos": TestTypeScreening.threshold + 99 - 1}, + 'break1Start': {'pos': 1 + 99}, + 'break2Start': {'pos': TestTypeScreening.threshold + 99 - 1}, }, ]: assert ( - match.type_screening(conn, {"type": type, **opt}) + match.type_screening(conn, {'type': type, **opt}) == TestTypeScreening.default_type ) # Variation length big enough (>= threshold) for opt in [ - {"break2Start": {"pos": TestTypeScreening.threshold}}, - {"break2Start": {"pos": TestTypeScreening.threshold}, "prefix": "c"}, - {"break2Start": {"pos": TestTypeScreening.threshold}, "prefix": "g"}, - {"break2Start": {"pos": TestTypeScreening.threshold}, "prefix": "n"}, - {"break2Start": {"pos": TestTypeScreening.threshold}, "prefix": "r"}, - {"break2Start": {"pos": int(TestTypeScreening.threshold / 3) + 1}, "prefix": "p"}, + {'break2Start': {'pos': TestTypeScreening.threshold}}, + {'break2Start': {'pos': TestTypeScreening.threshold}, 'prefix': 'c'}, + {'break2Start': {'pos': TestTypeScreening.threshold}, 'prefix': 'g'}, + {'break2Start': {'pos': TestTypeScreening.threshold}, 'prefix': 'n'}, + {'break2Start': {'pos': TestTypeScreening.threshold}, 'prefix': 'r'}, + {'break2Start': {'pos': int(TestTypeScreening.threshold / 3) + 1}, 'prefix': 'p'}, { - "break1Start": {"pos": 1 + 99}, - "break2Start": {"pos": TestTypeScreening.threshold + 99}, + 'break1Start': {'pos': 1 + 99}, + 'break2Start': {'pos': TestTypeScreening.threshold + 99}, }, ]: - assert match.type_screening(conn, {"type": type, **opt}) == type + assert match.type_screening(conn, {'type': type, **opt}) == type diff --git a/tests/test_graphkb/test_statement.py b/tests/test_graphkb/test_statement.py index fcf0bef5..89faafdd 100644 --- a/tests/test_graphkb/test_statement.py +++ b/tests/test_graphkb/test_statement.py @@ -4,36 +4,36 @@ from pori_python.graphkb import GraphKBConnection, statement -EXCLUDE_INTEGRATION_TESTS = os.environ.get("EXCLUDE_INTEGRATION_TESTS") == "1" -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" +EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1' +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def conn() -> GraphKBConnection: conn = GraphKBConnection() - conn.login(os.environ["GRAPHKB_USER"], os.environ["GRAPHKB_PASS"]) + conn.login(os.environ['GRAPHKB_USER'], os.environ['GRAPHKB_PASS']) return conn @pytest.fixture() def graphkb_conn(): def make_rid_list(*values): - return [{"@rid": v} for v in 
values] + return [{'@rid': v} for v in values] def term_tree_calls(*final_values): # this function makes 2 calls to conn.query here - sets = [["fake"], final_values] + sets = [['fake'], final_values] return [make_rid_list(*s) for s in sets] return_values = [ - *term_tree_calls("1"), # therapeutic - *term_tree_calls("2"), # therapeutic (2nd base term) - *term_tree_calls("3"), # diagnostic - *term_tree_calls("4"), # prognostic - *term_tree_calls("5"), # pharmacogenomic ['metabolism'] - *term_tree_calls("6"), # pharmacogenomic ['toxicity'] - *term_tree_calls("7"), # pharmacogenomic ['dosage'] - *term_tree_calls("8"), # cancer predisposition + *term_tree_calls('1'), # therapeutic + *term_tree_calls('2'), # therapeutic (2nd base term) + *term_tree_calls('3'), # diagnostic + *term_tree_calls('4'), # prognostic + *term_tree_calls('5'), # pharmacogenomic ['metabolism'] + *term_tree_calls('6'), # pharmacogenomic ['toxicity'] + *term_tree_calls('7'), # pharmacogenomic ['dosage'] + *term_tree_calls('8'), # cancer predisposition *term_tree_calls(), # biological *term_tree_calls(), # biological (2nd base term) *term_tree_calls(), # biological (3rd base term) @@ -46,55 +46,55 @@ def term_tree_calls(*final_values): class TestCategorizeRelevance: def test_default_categories(self, graphkb_conn): - category = statement.categorize_relevance(graphkb_conn, "1") - assert category == "therapeutic" + category = statement.categorize_relevance(graphkb_conn, '1') + assert category == 'therapeutic' def test_first_match_returns(self, graphkb_conn): - category = statement.categorize_relevance(graphkb_conn, "2") - assert category == "therapeutic" + category = statement.categorize_relevance(graphkb_conn, '2') + assert category == 'therapeutic' def test_second_category(self, graphkb_conn): - category = statement.categorize_relevance(graphkb_conn, "3") - assert category == "diagnostic" + category = statement.categorize_relevance(graphkb_conn, '3') + assert category == 'diagnostic' def test_third_category(self, graphkb_conn): - category = statement.categorize_relevance(graphkb_conn, "4") - assert category == "prognostic" + category = statement.categorize_relevance(graphkb_conn, '4') + assert category == 'prognostic' def test_fourth_category(self, graphkb_conn): - category = statement.categorize_relevance(graphkb_conn, "5") - assert category == "pharmacogenomic" + category = statement.categorize_relevance(graphkb_conn, '5') + assert category == 'pharmacogenomic' def test_fifth_category(self, graphkb_conn): - category = statement.categorize_relevance(graphkb_conn, "6") - assert category == "pharmacogenomic" + category = statement.categorize_relevance(graphkb_conn, '6') + assert category == 'pharmacogenomic' def test_predisposition_category(self, graphkb_conn): - category = statement.categorize_relevance(graphkb_conn, "8") - assert category == "cancer predisposition" + category = statement.categorize_relevance(graphkb_conn, '8') + assert category == 'cancer predisposition' def test_no_match(self, graphkb_conn): - category = statement.categorize_relevance(graphkb_conn, "x") - assert category == "" + category = statement.categorize_relevance(graphkb_conn, 'x') + assert category == '' def test_custom_categories(self, graphkb_conn): category = statement.categorize_relevance( - graphkb_conn, "x", [("blargh", ["some", "blargh"])] + graphkb_conn, 'x', [('blargh', ['some', 'blargh'])] ) - assert category == "" + assert category == '' category = statement.categorize_relevance( - graphkb_conn, "1", [("blargh", ["some", "blargh"])] + 
graphkb_conn, '1', [('blargh', ['some', 'blargh'])] ) - assert category == "blargh" + assert category == 'blargh' @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="db-specific rid; requires Inferred Functional Annotation source" + EXCLUDE_BCGSC_TESTS, reason='db-specific rid; requires Inferred Functional Annotation source' ) -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') class TestStatementMatch: def test_truncating_categories(self, conn): # noqa - pytest fixture, not redefinition - variant = {"@class": "CategoryVariant", "@rid": "#161:429", "displayName": "RB1 truncating"} + variant = {'@class': 'CategoryVariant', '@rid': '#161:429', 'displayName': 'RB1 truncating'} statements = statement.get_statements_from_variants(conn, [variant]) assert statements diff --git a/tests/test_graphkb/test_util.py b/tests/test_graphkb/test_util.py index 319df576..36760b2a 100644 --- a/tests/test_graphkb/test_util.py +++ b/tests/test_graphkb/test_util.py @@ -3,7 +3,7 @@ from pori_python.graphkb import GraphKBConnection, util -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' class OntologyTerm: @@ -13,34 +13,34 @@ def __init__(self, name, sourceId, displayName): self.displayName = displayName -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def conn() -> GraphKBConnection: conn = GraphKBConnection() - conn.login(os.environ["GRAPHKB_USER"], os.environ["GRAPHKB_PASS"]) + conn.login(os.environ['GRAPHKB_USER'], os.environ['GRAPHKB_PASS']) return conn class TestLooksLikeRid: - @pytest.mark.parametrize("rid", ["#3:4", "#50:04", "#-3:4", "#-3:-4", "#3:-4"]) + @pytest.mark.parametrize('rid', ['#3:4', '#50:04', '#-3:4', '#-3:-4', '#3:-4']) def test_valid(self, rid): assert util.looks_like_rid(rid) - @pytest.mark.parametrize("rid", ["-3:4", "KRAS"]) + @pytest.mark.parametrize('rid', ['-3:4', 'KRAS']) def test_invalid(self, rid): assert not util.looks_like_rid(rid) @pytest.mark.parametrize( - "input,result", + 'input,result', [ - ["GP5:p.Leu113His", "GP5:p.L113H"], - ["GP5:p.Lys113His", "GP5:p.K113H"], - ["CDK11A:p.Arg536Gln", "CDK11A:p.R536Q"], - ["APC:p.Cys1405*", "APC:p.C1405*"], - ["ApcTer:p.Cys1405*", "ApcTer:p.C1405*"], - ["GP5:p.Leu113_His114insLys", "GP5:p.L113_H114insK"], - ["NP_003997.1:p.Lys23_Val25del", "NP_003997.1:p.K23_V25del"], - ["LRG_199p1:p.Val7del", "LRG_199p1:p.V7del"], + ['GP5:p.Leu113His', 'GP5:p.L113H'], + ['GP5:p.Lys113His', 'GP5:p.K113H'], + ['CDK11A:p.Arg536Gln', 'CDK11A:p.R536Q'], + ['APC:p.Cys1405*', 'APC:p.C1405*'], + ['ApcTer:p.Cys1405*', 'ApcTer:p.C1405*'], + ['GP5:p.Leu113_His114insLys', 'GP5:p.L113_H114insK'], + ['NP_003997.1:p.Lys23_Val25del', 'NP_003997.1:p.K23_V25del'], + ['LRG_199p1:p.Val7del', 'LRG_199p1:p.V7del'], ], ) def test_convert_aa_3to1(input, result): @@ -49,12 +49,12 @@ def test_convert_aa_3to1(input, result): class TestStripParentheses: @pytest.mark.parametrize( - "breakRepr,StrippedBreakRepr", + 'breakRepr,StrippedBreakRepr', [ - ["p.(E2015_Q2114)", "p.E2015_Q2114"], - ["p.(?572_?630)", "p.?572_?630"], - ["g.178916854", "g.178916854"], - ["e.10", "e.10"], + ['p.(E2015_Q2114)', 'p.E2015_Q2114'], + ['p.(?572_?630)', 'p.?572_?630'], + ['g.178916854', 'g.178916854'], + ['e.10', 'e.10'], ], ) def test_stripParentheses(self, breakRepr, StrippedBreakRepr): @@ -63,10 +63,10 @@ def test_stripParentheses(self, breakRepr, 
StrippedBreakRepr): class TestStripRefSeq: @pytest.mark.parametrize( - "breakRepr,StrippedBreakRepr", + 'breakRepr,StrippedBreakRepr', [ - ["p.L2209", "p.2209"], - ["p.?891", "p.891"], + ['p.L2209', 'p.2209'], + ['p.?891', 'p.891'], # TODO: ['p.?572_?630', 'p.572_630'], ], ) @@ -76,31 +76,31 @@ def test_stripRefSeq(self, breakRepr, StrippedBreakRepr): class TestStripDisplayName: @pytest.mark.parametrize( - "opt,stripDisplayName", + 'opt,stripDisplayName', [ - [{"displayName": "ABL1:p.T315I", "withRef": True, "withRefSeq": True}, "ABL1:p.T315I"], - [{"displayName": "ABL1:p.T315I", "withRef": False, "withRefSeq": True}, "p.T315I"], - [{"displayName": "ABL1:p.T315I", "withRef": True, "withRefSeq": False}, "ABL1:p.315I"], - [{"displayName": "ABL1:p.T315I", "withRef": False, "withRefSeq": False}, "p.315I"], + [{'displayName': 'ABL1:p.T315I', 'withRef': True, 'withRefSeq': True}, 'ABL1:p.T315I'], + [{'displayName': 'ABL1:p.T315I', 'withRef': False, 'withRefSeq': True}, 'p.T315I'], + [{'displayName': 'ABL1:p.T315I', 'withRef': True, 'withRefSeq': False}, 'ABL1:p.315I'], + [{'displayName': 'ABL1:p.T315I', 'withRef': False, 'withRefSeq': False}, 'p.315I'], [ - {"displayName": "chr3:g.41266125C>T", "withRef": False, "withRefSeq": False}, - "g.41266125>T", + {'displayName': 'chr3:g.41266125C>T', 'withRef': False, 'withRefSeq': False}, + 'g.41266125>T', ], [ { - "displayName": "chrX:g.99662504_99662505insG", - "withRef": False, - "withRefSeq": False, + 'displayName': 'chrX:g.99662504_99662505insG', + 'withRef': False, + 'withRefSeq': False, }, - "g.99662504_99662505insG", + 'g.99662504_99662505insG', ], [ { - "displayName": "chrX:g.99662504_99662505dup", - "withRef": False, - "withRefSeq": False, + 'displayName': 'chrX:g.99662504_99662505dup', + 'withRef': False, + 'withRefSeq': False, }, - "g.99662504_99662505dup", + 'g.99662504_99662505dup', ], # TODO: [{'displayName': 'VHL:c.330_331delCAinsTT', 'withRef': False, 'withRefSeq': False}, 'c.330_331delinsTT'], # TODO: [{'displayName': 'VHL:c.464-2G>A', 'withRef': False, 'withRefSeq': False}, 'c.464-2>A'], @@ -112,40 +112,40 @@ def test_stripDisplayName(self, opt, stripDisplayName): class TestStringifyVariant: @pytest.mark.parametrize( - "hgvs_string,opt,stringifiedVariant", + 'hgvs_string,opt,stringifiedVariant', [ - ["VHL:c.345C>G", {"withRef": True, "withRefSeq": True}, "VHL:c.345C>G"], - ["VHL:c.345C>G", {"withRef": False, "withRefSeq": True}, "c.345C>G"], - ["VHL:c.345C>G", {"withRef": True, "withRefSeq": False}, "VHL:c.345>G"], - ["VHL:c.345C>G", {"withRef": False, "withRefSeq": False}, "c.345>G"], + ['VHL:c.345C>G', {'withRef': True, 'withRefSeq': True}, 'VHL:c.345C>G'], + ['VHL:c.345C>G', {'withRef': False, 'withRefSeq': True}, 'c.345C>G'], + ['VHL:c.345C>G', {'withRef': True, 'withRefSeq': False}, 'VHL:c.345>G'], + ['VHL:c.345C>G', {'withRef': False, 'withRefSeq': False}, 'c.345>G'], [ - "(LMNA,NTRK1):fusion(e.10,e.12)", - {"withRef": False, "withRefSeq": False}, - "fusion(e.10,e.12)", + '(LMNA,NTRK1):fusion(e.10,e.12)', + {'withRef': False, 'withRefSeq': False}, + 'fusion(e.10,e.12)', ], - ["ABCA12:p.N1671Ifs*4", {"withRef": False, "withRefSeq": False}, "p.1671Ifs*4"], - ["x:y.p22.33copyloss", {"withRef": False, "withRefSeq": False}, "y.p22.33copyloss"], + ['ABCA12:p.N1671Ifs*4', {'withRef': False, 'withRefSeq': False}, 'p.1671Ifs*4'], + ['x:y.p22.33copyloss', {'withRef': False, 'withRefSeq': False}, 'y.p22.33copyloss'], # TODO: ['MED12:p.(?34_?68)mut', {'withRef': False, 'withRefSeq': False}, 'p.(34_68)mut'], # TODO: 
['FLT3:p.(?572_?630)_(?572_?630)ins', {'withRef': False, 'withRefSeq': False}, 'p.(572_630)_(572_630)ins'],
        ],
    )
    def test_stringifyVariant_parsed(self, conn, hgvs_string, opt, stringifiedVariant):
-        opt["variant"] = conn.parse(hgvs_string)
+        opt['variant'] = conn.parse(hgvs_string)
        assert util.stringifyVariant(**opt) == stringifiedVariant

    # Based on the assumption that these variants are in the database.
    # The createdAt date helps avoid errors if that assumption turns out to be false.
    @pytest.mark.parametrize(
-        "rid,createdAt,stringifiedVariant",
+        'rid,createdAt,stringifiedVariant',
        [
-            ["#157:0", 1565627324397, "p.315I"],
-            ["#157:79", 1565627683602, "p.776_777insVGC"],
-            ["#158:35317", 1652734056311, "c.1>G"],
+            ['#157:0', 1565627324397, 'p.315I'],
+            ['#157:79', 1565627683602, 'p.776_777insVGC'],
+            ['#158:35317', 1652734056311, 'c.1>G'],
        ],
    )
-    @pytest.mark.skipif(EXCLUDE_BCGSC_TESTS, reason="db-dependent rids")
+    @pytest.mark.skipif(EXCLUDE_BCGSC_TESTS, reason='db-dependent rids')
    def test_stringifyVariant_positional(self, conn, rid, createdAt, stringifiedVariant):
-        opt = {"withRef": False, "withRefSeq": False}
+        opt = {'withRef': False, 'withRefSeq': False}
        variant = conn.get_record_by_id(rid)
-        if variant and variant.get("createdAt", None) == createdAt:
+        if variant and variant.get('createdAt', None) == createdAt:
            assert util.stringifyVariant(variant=variant, **opt) == stringifiedVariant
diff --git a/tests/test_graphkb/test_vocab.py b/tests/test_graphkb/test_vocab.py
index fc8497f7..ff64b7a7 100644
--- a/tests/test_graphkb/test_vocab.py
+++ b/tests/test_graphkb/test_vocab.py
@@ -7,79 +7,79 @@
from pori_python.graphkb import GraphKBConnection, genes, vocab

-BASE_EXPRESSION = "expression variant"
-BASE_INCREASED_EXPRESSION = "increased expression"
-BASE_REDUCED_EXPRESSION = "reduced expression"
+BASE_EXPRESSION = 'expression variant'
+BASE_INCREASED_EXPRESSION = 'increased expression'
+BASE_REDUCED_EXPRESSION = 'reduced expression'


-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def conn():
    conn = GraphKBConnection()
-    conn.login(os.environ["GRAPHKB_USER"], os.environ["GRAPHKB_PASS"])
+    conn.login(os.environ['GRAPHKB_USER'], os.environ['GRAPHKB_PASS'])
    return conn


def test_expression_vocabulary(conn):
    result = vocab.get_term_tree(conn, BASE_EXPRESSION)
-    names = [row["name"] for row in result]
+    names = [row['name'] for row in result]
    assert BASE_EXPRESSION in names
-    assert "increased rna expression" in names
+    assert 'increased rna expression' in names


def test_indel_vocabulary(conn):
-    result = vocab.get_term_tree(conn, "indel")
+    result = vocab.get_term_tree(conn, 'indel')

-    names = {row["name"] for row in result}
-    assert "indel" in names
-    assert "copy variant" not in names
-    assert "copy number variant" not in names
+    names = {row['name'] for row in result}
+    assert 'indel' in names
+    assert 'copy variant' not in names
+    assert 'copy number variant' not in names


def test_expression_up(conn):
    result = vocab.get_term_tree(conn, BASE_INCREASED_EXPRESSION)
-    names = [row["name"] for row in result]
+    names = [row['name'] for row in result]
    assert BASE_EXPRESSION in names
    assert BASE_INCREASED_EXPRESSION in names
-    assert "increased rna expression" in names
-    assert "reduced rna expression" not in names
+    assert 'increased rna expression' in names
+    assert 'reduced rna expression' not in names
    assert BASE_REDUCED_EXPRESSION not in names


def test_expression_down(conn):
    result = vocab.get_term_tree(conn, BASE_REDUCED_EXPRESSION)
-    names = [row["name"] for row in result]
+    names =
[row['name'] for row in result] assert BASE_EXPRESSION in names assert BASE_REDUCED_EXPRESSION in names assert BASE_INCREASED_EXPRESSION not in names - assert "increased rna expression" not in names - assert "reduced rna expression" in names + assert 'increased rna expression' not in names + assert 'reduced rna expression' in names class TestGetEquivalentTerms: def test_gain_excludes_amplification(self, conn): - result = vocab.get_equivalent_terms(conn, "copy gain") - names = {row["name"] for row in result} - assert "copy gain" in names - assert "amplification" not in names + result = vocab.get_equivalent_terms(conn, 'copy gain') + names = {row['name'] for row in result} + assert 'copy gain' in names + assert 'amplification' not in names def test_amplification_includes_gain(self, conn): - result = vocab.get_equivalent_terms(conn, "amplification") - names = {row["name"] for row in result} - assert "copy gain" in names - assert "amplification" in names + result = vocab.get_equivalent_terms(conn, 'amplification') + names = {row['name'] for row in result} + assert 'copy gain' in names + assert 'amplification' in names def test_oncogenic(conn): result = vocab.get_term_by_name(conn, genes.ONCOGENE) - assert result["name"] == genes.ONCOGENE + assert result['name'] == genes.ONCOGENE def test_get_terms_set(conn): - terms = vocab.get_terms_set(conn, ["copy variant"]) + terms = vocab.get_terms_set(conn, ['copy variant']) assert terms - more_terms = vocab.get_terms_set(conn, ["copy variant", "expression variant"]) + more_terms = vocab.get_terms_set(conn, ['copy variant', 'expression variant']) assert more_terms assert len(more_terms) > len(terms) diff --git a/tests/test_ipr/constants.py b/tests/test_ipr/constants.py index b4edeab3..8b211ff9 100644 --- a/tests/test_ipr/constants.py +++ b/tests/test_ipr/constants.py @@ -1,3 +1,3 @@ import os -EXCLUDE_INTEGRATION_TESTS = os.environ.get("EXCLUDE_INTEGRATION_TESTS") == "1" +EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1' diff --git a/tests/test_ipr/test_annotate.py b/tests/test_ipr/test_annotate.py index 0695debe..43de4216 100644 --- a/tests/test_ipr/test_annotate.py +++ b/tests/test_ipr/test_annotate.py @@ -15,81 +15,81 @@ from .test_ipr import DISEASE_RIDS -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' # TP53 examples from https://www.bcgsc.ca/jira/browse/SDEV-3122 # Mutations are actually identical but on alternate transcripts. 
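These TP53 examples encode one genomic change under several notations, so annotating any single representation should recover essentially the same knowledgebase matches; the dict defining them follows the sketch below. A rough illustration of that cross-check (illustrative only: the import path for annotate_positional_variants is an assumption here, and graphkb_conn/DISEASE_RIDS are the fixtures used by the tests further down):

```python
# Illustrative only: each notation of the same underlying mutation should
# annotate to (near) identical kbVariant sets. The import path below is
# assumed, mirroring how this test module calls annotate_positional_variants.
from pori_python.ipr.annotate import annotate_positional_variants  # assumed path


def kb_variant_names(graphkb_conn, ipr_variant, disease_rids):
    """Collect the kbVariant names matched for one IPR variant record."""
    matches = annotate_positional_variants(graphkb_conn, [ipr_variant], disease_rids)
    return {m['kbVariant'] for m in matches}


# e.g. kb_variant_names(graphkb_conn, TP53_MUT_DICT['pref'], DISEASE_RIDS) and
# kb_variant_names(graphkb_conn, TP53_MUT_DICT['intersect'], DISEASE_RIDS)
# should overlap almost completely.
```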
TP53_MUT_DICT = { - "pref": IprSmallMutationVariant( # type: ignore + 'pref': IprSmallMutationVariant( # type: ignore { - "key": "SDEV-3122_preferred", - "gene": "TP53", - "hgvsGenomic": "chr17:g.7674252C>T", - "hgvsCds": "ENST00000269305:c.711G>A", - "hgvsProtein": "TP53:p.M237I", + 'key': 'SDEV-3122_preferred', + 'gene': 'TP53', + 'hgvsGenomic': 'chr17:g.7674252C>T', + 'hgvsCds': 'ENST00000269305:c.711G>A', + 'hgvsProtein': 'TP53:p.M237I', } ), - "intersect": IprSmallMutationVariant( # type: ignore + 'intersect': IprSmallMutationVariant( # type: ignore { - "key": "SDEV-3122_alt", - "gene": "TP53", - "hgvsGenomic": "chr17:g.7674252C>T", - "hgvsCds": "ENST00000610292:c.594G>A", - "hgvsProtein": "TP53:p.M198I", + 'key': 'SDEV-3122_alt', + 'gene': 'TP53', + 'hgvsGenomic': 'chr17:g.7674252C>T', + 'hgvsCds': 'ENST00000610292:c.594G>A', + 'hgvsProtein': 'TP53:p.M198I', } ), - "prot_only": IprSmallMutationVariant( # type: ignore - {"key": "prot_only", "gene": "TP53", "hgvsProtein": "TP53:p.M237I"} + 'prot_only': IprSmallMutationVariant( # type: ignore + {'key': 'prot_only', 'gene': 'TP53', 'hgvsProtein': 'TP53:p.M237I'} ), - "cds_only": IprSmallMutationVariant( # type: ignore - {"key": "cds_only", "gene": "TP53", "hgvsCds": "ENST00000269305:c.711G>A"} + 'cds_only': IprSmallMutationVariant( # type: ignore + {'key': 'cds_only', 'gene': 'TP53', 'hgvsCds': 'ENST00000269305:c.711G>A'} ), - "genome_only": IprSmallMutationVariant( # type: ignore - {"key": "genome_only", "gene": "TP53", "hgvsGenomic": "chr17:g.7674252C>T"} + 'genome_only': IprSmallMutationVariant( # type: ignore + {'key': 'genome_only', 'gene': 'TP53', 'hgvsGenomic': 'chr17:g.7674252C>T'} ), } KBDEV1231_TP53_ERR_MATCH_WT = { - "altSeq": "", - "chromosome": "chr17", - "comments": "", - "endPosition": "", - "gene": "TP53", - "germline": False, - "hgvsCds": "ENST00000269305:c.853G>A", - "hgvsGenomic": "chr17:g.7673767C>T", - "hgvsProtein": "TP53:p.E285K", - "key": "c23a7b0387335e7a5ed6c1081a1822ae", - "library": "F145233;F145265", - "ncbiBuild": "GRCh38", - "normalAltCount": "", - "normalDepth": "", - "normalRefCount": "", - "proteinChange": "p.E285K", - "refSeq": "", - "rnaAltCount": 311, - "rnaDepth": 370, - "rnaRefCount": 59, - "startPosition": "", - "transcript": "ENST00000269305", - "tumourAltCopies": "", - "tumourAltCount": 64, - "tumourDepth": 100, - "tumourRefCopies": "", - "tumourRefCount": 36, - "variant": "TP53:p.E285K", - "variantType": "mut", - "zygosity": "", + 'altSeq': '', + 'chromosome': 'chr17', + 'comments': '', + 'endPosition': '', + 'gene': 'TP53', + 'germline': False, + 'hgvsCds': 'ENST00000269305:c.853G>A', + 'hgvsGenomic': 'chr17:g.7673767C>T', + 'hgvsProtein': 'TP53:p.E285K', + 'key': 'c23a7b0387335e7a5ed6c1081a1822ae', + 'library': 'F145233;F145265', + 'ncbiBuild': 'GRCh38', + 'normalAltCount': '', + 'normalDepth': '', + 'normalRefCount': '', + 'proteinChange': 'p.E285K', + 'refSeq': '', + 'rnaAltCount': 311, + 'rnaDepth': 370, + 'rnaRefCount': 59, + 'startPosition': '', + 'transcript': 'ENST00000269305', + 'tumourAltCopies': '', + 'tumourAltCount': 64, + 'tumourDepth': 100, + 'tumourRefCopies': '', + 'tumourRefCount': 36, + 'variant': 'TP53:p.E285K', + 'variantType': 'mut', + 'zygosity': '', } -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def graphkb_conn(): - username = os.environ.get("GRAPHKB_USER", os.environ["IPR_USER"]) - password = os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]) - graphkb_url = os.environ.get("GRAPHKB_URL", False) + username = os.environ.get('GRAPHKB_USER', 
os.environ['IPR_USER'])
+    password = os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS'])
+    graphkb_url = os.environ.get('GRAPHKB_URL', False)
     if graphkb_url:
         graphkb_conn = GraphKBConnection(graphkb_url)
     else:
@@ -99,77 +99,77 @@ def graphkb_conn():
 
 
 @pytest.mark.skipif(
-    EXCLUDE_BCGSC_TESTS, reason="excluding tests that depend on BCGSC-specific data"
+    EXCLUDE_BCGSC_TESTS, reason='excluding tests that depend on BCGSC-specific data'
 )
 class TestAnnotation:
     def test_annotate_nonsense_vs_missense(self, graphkb_conn):
         """Verify missense (point mutation) is not mistaken for a nonsense (stop codon) mutation."""
-        for key in ("prot_only", "cds_only", "genome_only", "pref"):
+        for key in ('prot_only', 'cds_only', 'genome_only', 'pref'):
             matched = annotate_positional_variants(graphkb_conn, [TP53_MUT_DICT[key]], DISEASE_RIDS)
             # nonsense - stop codon - should not match. This is missense not nonsense (#164:933).
-            nonsense = [a for a in matched if a["kbVariant"] == "TP53 nonsense"]
-            assert not nonsense, f"nonsense matched to {key}: {TP53_MUT_DICT[key]}"
-            assert matched, f"should have matched in {key}: {TP53_MUT_DICT[key]}"
+            nonsense = [a for a in matched if a['kbVariant'] == 'TP53 nonsense']
+            assert not nonsense, f'nonsense matched to {key}: {TP53_MUT_DICT[key]}'
+            assert matched, f'should have matched in {key}: {TP53_MUT_DICT[key]}'
 
     def test_annotate_nonsense_vs_missense_protein(self, graphkb_conn):
         """Verify missense (point mutation) is not mistaken for a nonsense (stop codon) mutation."""
-        for key in ("prot_only", "pref"):
+        for key in ('prot_only', 'pref'):
             matched = annotate_positional_variants(graphkb_conn, [TP53_MUT_DICT[key]], DISEASE_RIDS)
             # nonsense - stop codon - should not match. This is missense not nonsense (#164:933).
-            nonsense = [a for a in matched if "nonsense" in a["kbVariant"]]
-            assert not nonsense, f"nonsense matched to {key}: {TP53_MUT_DICT[key]}"
-            assert matched, f"should have matched in {key}: {TP53_MUT_DICT[key]}"
+            nonsense = [a for a in matched if 'nonsense' in a['kbVariant']]
+            assert not nonsense, f'nonsense matched to {key}: {TP53_MUT_DICT[key]}'
+            assert matched, f'should have matched in {key}: {TP53_MUT_DICT[key]}'
 
     def test_annotate_signature_variants_cosmic(self, graphkb_conn):
         """Test Cosmic Signature CVs with known GKB statements"""
-        signature = "SBS10B"
+        signature = 'SBS10B'
         cosmic = annotate_signature_variants(
             graphkb_conn,
             DISEASE_RIDS,
             preprocess_signature_variants(
                 [
                     {
-                        "displayName": f"{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}",
-                        "signatureName": signature,
-                        "variantTypeName": COSMIC_SIGNATURE_VARIANT_TYPE,
+                        'displayName': f'{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}',
+                        'signatureName': signature,
+                        'variantTypeName': COSMIC_SIGNATURE_VARIANT_TYPE,
                     }
                 ]
             ),
         )
         assert len(cosmic) != 0
 
-    @pytest.mark.skip(reason="no GKB statement for dMMR Signature CVs yet")
+    @pytest.mark.skip(reason='no GKB statement for dMMR Signature CVs yet')
     def test_annotate_signature_variants_dmmr(self, graphkb_conn):
         """Test dMMR (from Cosmic) Signature CVs with known GKB statements"""
-        signature = "DMMR"
+        signature = 'DMMR'
         dmmr = annotate_signature_variants(
             graphkb_conn,
             DISEASE_RIDS,
             preprocess_signature_variants(
                 [
                     {
-                        "displayName": f"{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}",
-                        "signatureName": signature,
-                        "variantTypeName": COSMIC_SIGNATURE_VARIANT_TYPE,
+                        'displayName': f'{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}',
+                        'signatureName': signature,
+                        'variantTypeName': COSMIC_SIGNATURE_VARIANT_TYPE,
                     }
                 ]
             ),
         )
         assert len(dmmr) != 0
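# Note: the cosmic, dMMR, HLA, TMB and MSI tests in this class share one shape:
# build a single {displayName, signatureName, variantTypeName} record, run it
# through preprocess_signature_variants, then annotate against the disease RIDs.
# A condensed sketch; the helper name is ours, and the two callables are
# injected as parameters because their module paths are not shown in this diff:
def annotate_one_signature(conn, disease_rids, name, variant_type, preprocess, annotate):
    record = {
        'displayName': f'{name} {variant_type}',
        'signatureName': name,
        'variantTypeName': variant_type,
    }
    return annotate(conn, disease_rids, preprocess([record]))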
-    @pytest.mark.skip(reason="no GKB statement for HLA Signature CVs yet")
+    @pytest.mark.skip(reason='no GKB statement for HLA Signature CVs yet')
     def test_annotate_signature_variants_hla(self, graphkb_conn):
         """Test HLA Signature CVs with known GKB statements"""
-        signature = "HLA-A*02:01"
+        signature = 'HLA-A*02:01'
         hla = annotate_signature_variants(
             graphkb_conn,
             DISEASE_RIDS,
             preprocess_signature_variants(
                 [
                     {
-                        "displayName": f"{signature} {HLA_SIGNATURE_VARIANT_TYPE}",
-                        "signatureName": signature,
-                        "variantTypeName": HLA_SIGNATURE_VARIANT_TYPE,
+                        'displayName': f'{signature} {HLA_SIGNATURE_VARIANT_TYPE}',
+                        'signatureName': signature,
+                        'variantTypeName': HLA_SIGNATURE_VARIANT_TYPE,
                     }
                 ]
             ),
@@ -184,9 +184,9 @@ def test_annotate_signature_variants_tmb(self, graphkb_conn):
             preprocess_signature_variants(
                 [
                     {
-                        "displayName": f"{TMB_SIGNATURE} {TMB_SIGNATURE_VARIANT_TYPE}",
-                        "signatureName": TMB_SIGNATURE,
-                        "variantTypeName": TMB_SIGNATURE_VARIANT_TYPE,
+                        'displayName': f'{TMB_SIGNATURE} {TMB_SIGNATURE_VARIANT_TYPE}',
+                        'signatureName': TMB_SIGNATURE,
+                        'variantTypeName': TMB_SIGNATURE_VARIANT_TYPE,
                     }
                 ]
             ),
@@ -199,39 +199,39 @@ def test_annotate_signature_variants_msi(self, graphkb_conn):
         msi = annotate_signature_variants(
             graphkb_conn,
             DISEASE_RIDS,
-            preprocess_signature_variants([MSI_MAPPING.get("microsatellite instability")]),
+            preprocess_signature_variants([MSI_MAPPING.get('microsatellite instability')]),
         )
         assert len(msi) != 0
 
     def test_annotate_structural_variants_tp53(self, graphkb_conn):
         """Verify alternate TP53 variants match."""
-        ref_key = "prot_only"
+        ref_key = 'prot_only'
         pref = annotate_positional_variants(graphkb_conn, [TP53_MUT_DICT[ref_key]], DISEASE_RIDS)
         # GERO-299 - nonsense - stop codon - should not match. This is missense not nonsense (#164:933).
-        nonsense = [a for a in pref if a["kbVariant"] == "TP53 nonsense"]
+        nonsense = [a for a in pref if a['kbVariant'] == 'TP53 nonsense']
         assert not nonsense
-        pref_vars = set([m["kbVariant"] for m in pref])
-        assert pref_vars, f"No matches to {TP53_MUT_DICT[pref]}"
+        pref_vars = set([m['kbVariant'] for m in pref])
+        assert pref_vars, f'No matches to {TP53_MUT_DICT[ref_key]}'
         print(pref_vars)
         for key, alt_rep in TP53_MUT_DICT.items():
             if key == ref_key:
                 continue
-            if key in ("cds_only", "genome_only"):
+            if key in ('cds_only', 'genome_only'):
                 # KBDEV-1259. Temporarily disabled until issue resolution.
                 continue
             alt = annotate_positional_variants(graphkb_conn, [alt_rep], DISEASE_RIDS)
-            alt_vars = set([m["kbVariant"] for m in alt])
+            alt_vars = set([m['kbVariant'] for m in alt])
             diff = pref_vars.symmetric_difference(alt_vars)
             missing = pref_vars.difference(alt_vars)
             known_issues = set()
-            if key == "genome_only":
+            if key == 'genome_only':
                 # genome_only matched to more precise type 'TP53 deleterious mutation' but not 'TP53 mutation'
-                known_issues.add("TP53 mutation")
+                known_issues.add('TP53 mutation')
                 missing = pref_vars.difference(alt_vars).difference(known_issues)
             print(alt_vars)
-            assert not missing, f"{key} missing{missing}: {diff}"
+            assert not missing, f'{key} missing {missing}: {diff}'
 
     def test_wt_not_matched(self, graphkb_conn):
         """Verify wildtypes are not matched to mutations."""
@@ -239,5 +239,5 @@ def test_wt_not_matched(self, graphkb_conn):
             graphkb_conn, [KBDEV1231_TP53_ERR_MATCH_WT], DISEASE_RIDS
         )
         # KBDEV-1231 - wildtype - should not match.
A mutation is not wildtype - wt_matches = sorted(set([m["kbVariant"] for m in matches if "wildtype" in m["kbVariant"]])) + wt_matches = sorted(set([m['kbVariant'] for m in matches if 'wildtype' in m['kbVariant']])) assert not wt_matches, f"Mutation 'TP53:p.E285K' should NOT match {wt_matches}" diff --git a/tests/test_ipr/test_connection.py b/tests/test_ipr/test_connection.py index 611afb25..d83ac79a 100644 --- a/tests/test_ipr/test_connection.py +++ b/tests/test_ipr/test_connection.py @@ -4,37 +4,37 @@ from pori_python.ipr.connection import IprConnection -IMAGE_DIR = os.path.join(os.path.dirname(__file__), "../../docs/images") +IMAGE_DIR = os.path.join(os.path.dirname(__file__), '../../docs/images') class TestPostImages: def test_no_images_ok(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{"upload": "successful"}], raise_for_status=lambda: None + json=lambda: [{'upload': 'successful'}], raise_for_status=lambda: None ) return m - with mock.patch("pori_python.ipr.connection.requests.request", request): - conn = IprConnection("user", "pass") - result = conn.post_images("report_id", files={}, data={}) + with mock.patch('pori_python.ipr.connection.requests.request', request): + conn = IprConnection('user', 'pass') + result = conn.post_images('report_id', files={}, data={}) assert result is None def test_images_load_ok(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{"upload": "successful"}], raise_for_status=lambda: None + json=lambda: [{'upload': 'successful'}], raise_for_status=lambda: None ) return m - with mock.patch("pori_python.ipr.connection.requests.request", request): - conn = IprConnection("user", "pass") + with mock.patch('pori_python.ipr.connection.requests.request', request): + conn = IprConnection('user', 'pass') result = conn.post_images( - "report_id", + 'report_id', files={ - "expression.correlation": os.path.join(IMAGE_DIR, "expression_correlation.png"), - "mixcr.circos_trb_vj_gene_usage": os.path.join( - IMAGE_DIR, "mixcr.circos_trb_vj_gene_usage.png" + 'expression.correlation': os.path.join(IMAGE_DIR, 'expression_correlation.png'), + 'mixcr.circos_trb_vj_gene_usage': os.path.join( + IMAGE_DIR, 'mixcr.circos_trb_vj_gene_usage.png' ), }, data={}, @@ -44,54 +44,54 @@ def request(*args, **kwargs): def test_images_with_data_load_ok(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{"upload": "successful"}], raise_for_status=lambda: None + json=lambda: [{'upload': 'successful'}], raise_for_status=lambda: None ) return m - with mock.patch("pori_python.ipr.connection.requests.request", request): - conn = IprConnection("user", "pass") + with mock.patch('pori_python.ipr.connection.requests.request', request): + conn = IprConnection('user', 'pass') result = conn.post_images( - "report_id", + 'report_id', files={ - "expression.correlation": os.path.join(IMAGE_DIR, "expression_correlation.png"), - "mixcr.circos_trb_vj_gene_usage": os.path.join( - IMAGE_DIR, "mixcr.circos_trb_vj_gene_usage.png" + 'expression.correlation': os.path.join(IMAGE_DIR, 'expression_correlation.png'), + 'mixcr.circos_trb_vj_gene_usage': os.path.join( + IMAGE_DIR, 'mixcr.circos_trb_vj_gene_usage.png' ), }, - data={"expression.correlation.title": "this is a title"}, + data={'expression.correlation.title': 'this is a title'}, ) assert result is None def test_bad_file(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{"upload": "successful"}], raise_for_status=lambda: None + json=lambda: [{'upload': 'successful'}], 
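# Note: every test in TestPostImages stubs the network the same way; reduced
# here to a runnable, self-contained sketch (URL and payload are illustrative):
from unittest import mock
import requests

def fake_request(*args, **kwargs):
    return mock.MagicMock(
        json=lambda: [{'upload': 'successful'}],
        raise_for_status=lambda: None,
    )

with mock.patch('requests.request', fake_request):
    response = requests.request('POST', 'https://example.org/api/reports')
    assert response.json()[0]['upload'] == 'successful'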
raise_for_status=lambda: None ) return m - with mock.patch("pori_python.ipr.connection.requests.request", request): - conn = IprConnection("user", "pass") + with mock.patch('pori_python.ipr.connection.requests.request', request): + conn = IprConnection('user', 'pass') with pytest.raises(FileNotFoundError): conn.post_images( - "report_id", files={"expression.correlation": "thing/that/does/not/exist.png"} + 'report_id', files={'expression.correlation': 'thing/that/does/not/exist.png'} ) def test_failed_image_load(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{"upload": "anything else", "key": "thing"}], + json=lambda: [{'upload': 'anything else', 'key': 'thing'}], raise_for_status=lambda: None, ) return m - with mock.patch("pori_python.ipr.connection.requests.request", request): - conn = IprConnection("user", "pass") + with mock.patch('pori_python.ipr.connection.requests.request', request): + conn = IprConnection('user', 'pass') with pytest.raises(ValueError): conn.post_images( - "report_id", + 'report_id', { - "expression.correlation": os.path.join( - IMAGE_DIR, "expression_correlation.png" + 'expression.correlation': os.path.join( + IMAGE_DIR, 'expression_correlation.png' ) }, ) diff --git a/tests/test_ipr/test_data/expression.short.tab b/tests/test_ipr/test_data/expression.short.tab index a10cb803..0d286531 100644 --- a/tests/test_ipr/test_data/expression.short.tab +++ b/tests/test_ipr/test_data/expression.short.tab @@ -25,3 +25,4 @@ KLHL25 27.39 outlier_high increased expression 100.0 6.97 46.01 100.0 210.66 8. ZNRF3-IT1 0.2 no_category 6.75 0.12 34 -1.816 2.225 -2.505 1.744 26 0.623 -1.272 0.2 0.3 0.4 0.5 PTP4A3 1.33 high_percentile increased expression 100.0 9.74 99.62 100.0 32.2 6.09 -2.217 87 1.863 2.013 0.286 -0.767 66 -0.091 -1.824 0.2 0.3 0.4 0.5 ERBB2 0.05 no_category 67.0 0.62 51.71 100.0 1.01 4.8 -0.551 61 -2.009 -2.216 0.147 -2.385 86 2.299 -0.953 0.2 0.3 0.4 0.5 +DPYD 1.12 increased rna expression increased expression 97.0 4.75 73.38 100.0 50.19 0.99 -0.193 92 2.786 0.537 2.113 1.888 31 0.915 0.861 0.2 0.3 0.4 0.5 diff --git a/tests/test_ipr/test_inputs.py b/tests/test_ipr/test_inputs.py index 61e9e0f9..56e0c4d4 100644 --- a/tests/test_ipr/test_inputs.py +++ b/tests/test_ipr/test_inputs.py @@ -8,6 +8,7 @@ from pori_python.graphkb.match import INPUT_COPY_CATEGORIES from pori_python.ipr.constants import ( MSI_MAPPING, + HRD_MAPPING, TMB_SIGNATURE, TMB_SIGNATURE_HIGH_THRESHOLD, ) @@ -21,6 +22,7 @@ preprocess_expression_variants, preprocess_hla, preprocess_msi, + preprocess_hrd, preprocess_signature_variants, preprocess_small_mutations, preprocess_structural_variants, @@ -30,25 +32,28 @@ from pori_python.ipr.util import logger from pori_python.types import IprFusionVariant, IprGeneVariant -DATA_DIR = os.path.join(os.path.dirname(__file__), "test_data") -NON_EMPTY_STRING_NULLS = ["", None, np.nan, pd.NA] -EXPECTED_COSMIC = {"DBS9", "DBS11", "ID2", "ID7", "ID10", "SBS2", "SBS5", "DMMR"} +DATA_DIR = os.path.join(os.path.dirname(__file__), 'test_data') +NON_EMPTY_STRING_NULLS = ['', None, np.nan, pd.NA] +EXPECTED_COSMIC = {'DBS9', 'DBS11', 'ID2', 'ID7', 'ID10', 'SBS2', 'SBS5', 'DMMR'} EXPECTED_HLA = { - "HLA-A*02:01", - "HLA-A*02", - "HLA-A*30:01", - "HLA-A*30", - "HLA-B*27:01", - "HLA-B*27", - "HLA-B*15:01", - "HLA-B*15", - "HLA-C*03:03", - "HLA-C*03", - "HLA-C*06:02", - "HLA-C*06", + 'HLA-A*02:01', + 'HLA-A*02', + 'HLA-A*30:01', + 'HLA-A*30', + 'HLA-B*27:01', + 'HLA-B*27', + 'HLA-B*15:01', + 'HLA-B*15', + 'HLA-C*03:03', + 'HLA-C*03', + 'HLA-C*06:02', + 
'HLA-C*06', } EXPECTED_TMB = {TMB_SIGNATURE} -EXPECTED_MSI = {MSI_MAPPING.get("microsatellite instability")["signatureName"]} +EXPECTED_MSI = {MSI_MAPPING.get('microsatellite instability')['signatureName']} +EXPECTED_HRD = { + HRD_MAPPING.get('homologous recombination deficiency strong signature')['signatureName'] +} def read_data_file(filename): @@ -58,220 +63,235 @@ def read_data_file(filename): class TestPreProcessSmallMutations: def test_load_test_file(self) -> None: records = preprocess_small_mutations( - pd.read_csv(os.path.join(DATA_DIR, "small_mutations.tab"), sep="\t").to_dict("records") + pd.read_csv(os.path.join(DATA_DIR, 'small_mutations.tab'), sep='\t').to_dict('records') ) assert records assert len(records) == 2614 def test_maintains_optional_fields(self): original = { - "gene": "A1BG", - "proteinChange": "p.V460M", - "zygosity": "het", - "tumourAltCount": 42, - "tumourRefCount": 48, - "hgvsProtein": "", - "transcript": "ENST1000", - "hgvsCds": "", - "hgvsGenomic": "", - "key": "02fe85a3477784b5ac0f8ecffb300d10", - "variant": "blargh", - "chromosome": "2", - "startPosition": 1234, + 'gene': 'A1BG', + 'proteinChange': 'p.V460M', + 'zygosity': 'het', + 'tumourAltCount': 42, + 'tumourRefCount': 48, + 'hgvsProtein': '', + 'transcript': 'ENST1000', + 'hgvsCds': '', + 'hgvsGenomic': '', + 'key': '02fe85a3477784b5ac0f8ecffb300d10', + 'variant': 'blargh', + 'chromosome': '2', + 'startPosition': 1234, } records = preprocess_small_mutations([original]) record = records[0] - assert record["variantType"] == "mut" + assert record['variantType'] == 'mut' for col in original: assert col in record - assert record["variant"] == "A1BG:p.V460M" - assert "endPosition" in record - assert record["endPosition"] == record["startPosition"] - assert "tumourDepth" in record - assert record["tumourDepth"] == 90 + assert record['variant'] == 'A1BG:p.V460M' + assert 'endPosition' in record + assert record['endPosition'] == record['startPosition'] + assert 'tumourDepth' in record + assert record['tumourDepth'] == 90 def test_null(self): original = { - "gene": "A1BG", - "proteinChange": "p.V460M", - "tumourAltCount": 42, - "tumourRefCount": 48, - "startPosition": 1234, + 'gene': 'A1BG', + 'proteinChange': 'p.V460M', + 'tumourAltCount': 42, + 'tumourRefCount': 48, + 'startPosition': 1234, } # Make sure TEST_KEYS are appropriate. # For some fields, like 'ref' and 'alt', NA is _not_ equivalent to a null string. 
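# Note: the equivalence being tested can be stated compactly. For count and
# position fields, '', None, np.nan and pd.NA are all treated as missing;
# is_nullish is our illustrative helper, not the library's implementation:
import numpy as np
import pandas as pd

def is_nullish(value):
    return pd.isna(value) or value == ''

assert all(is_nullish(v) for v in ['', None, np.nan, pd.NA])
assert not is_nullish(0) and not is_nullish('0')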
- TEST_KEYS = ["startPosition", "endPosition", "tumourAltCount", "tumourRefCount"] + TEST_KEYS = ['startPosition', 'endPosition', 'tumourAltCount', 'tumourRefCount'] for key in TEST_KEYS: for null in NON_EMPTY_STRING_NULLS: small_mut = original.copy() small_mut[key] = null records = preprocess_small_mutations([small_mut]) record = records[0] - assert record["variantType"] == "mut" + assert record['variantType'] == 'mut' for col in original: assert col in record - assert record["variant"] == "A1BG:p.V460M" - assert "endPosition" in record + assert record['variant'] == 'A1BG:p.V460M' + assert 'endPosition' in record def test_load_small_mutations_probe(self) -> None: records = preprocess_small_mutations( - pd.read_csv(os.path.join(DATA_DIR, "small_mutations_probe.tab"), sep="\t").to_dict( - "records" + pd.read_csv(os.path.join(DATA_DIR, 'small_mutations_probe.tab'), sep='\t').to_dict( + 'records' ) ) assert records assert len(records) == 4 - assert records[0]["variantType"] == "mut" - assert "variant" in records[0] + assert records[0]['variantType'] == 'mut' + assert 'variant' in records[0] class TestPreProcessCopyVariants: def test_load_copy_variants(self) -> None: records = preprocess_copy_variants( - pd.read_csv(os.path.join(DATA_DIR, "copy_variants.tab"), sep="\t").to_dict("records") + pd.read_csv(os.path.join(DATA_DIR, 'copy_variants.tab'), sep='\t').to_dict('records') ) assert records assert len(records) == 4603 - assert records[0]["variantType"] == "cnv" - assert "variant" in records[0] + assert records[0]['variantType'] == 'cnv' + assert 'variant' in records[0] def test_add_chr_to_chrband(self) -> None: - df1 = pd.read_csv(os.path.join(DATA_DIR, "copy_variants.tab"), sep="\t") - df1 = df1.to_dict("records") + df1 = pd.read_csv(os.path.join(DATA_DIR, 'copy_variants.tab'), sep='\t') + df1 = df1.to_dict('records') records = preprocess_copy_variants(df1) assert records assert len(records) == 4603 - assert records[0]["chromosomeBand"] == "1q22.1" - assert "chromosome" not in records[0] + assert records[0]['chromosomeBand'] == '1q22.1' + assert 'chromosome' not in records[0] def test_add_int_chr_to_chrband(self) -> None: - df1 = pd.read_csv(os.path.join(DATA_DIR, "copy_variants.tab"), sep="\t") - df1["chromosome"] = df1["chromosome"].apply(lambda x: x.split("chr")[1]) - df1 = df1.to_dict("records") + df1 = pd.read_csv(os.path.join(DATA_DIR, 'copy_variants.tab'), sep='\t') + df1['chromosome'] = df1['chromosome'].apply(lambda x: x.split('chr')[1]) + df1 = df1.to_dict('records') records = preprocess_copy_variants(df1) assert records assert len(records) == 4603 - assert records[0]["chromosomeBand"] == "1q22.1" - assert "chromosome" not in records[0] + assert records[0]['chromosomeBand'] == '1q22.1' + assert 'chromosome' not in records[0] def test_add_chr_to_chrband_if_chromosome_not_present(self) -> None: - df1 = pd.read_csv(os.path.join(DATA_DIR, "copy_variants.tab"), sep="\t") - df1["chr"] = df1["chromosome"].copy() - df1.drop("chromosome", axis=1, inplace=True) - df1 = df1.to_dict("records") + df1 = pd.read_csv(os.path.join(DATA_DIR, 'copy_variants.tab'), sep='\t') + df1['chr'] = df1['chromosome'].copy() + df1.drop('chromosome', axis=1, inplace=True) + df1 = df1.to_dict('records') records = preprocess_copy_variants(df1) assert records assert len(records) == 4603 - assert records[0]["chromosomeBand"] == "1q22.1" - assert "chr" not in records[0] - assert "chromosome" not in records[0] + assert records[0]['chromosomeBand'] == '1q22.1' + assert 'chr' not in records[0] + assert 'chromosome' not 
in records[0] def test_do_not_add_chr_if_chr_already_in_chrband(self) -> None: - df1 = pd.read_csv(os.path.join(DATA_DIR, "copy_variants.tab"), sep="\t") + df1 = pd.read_csv(os.path.join(DATA_DIR, 'copy_variants.tab'), sep='\t') df2 = df1.copy() - df1["chromosomeBand"] = df1["chromosomeBand"].apply(lambda x: "chr99" + x) - df1 = df1.to_dict("records") - df2["chromosomeBand"] = df2["chromosomeBand"].apply(lambda x: "99" + x) - df2 = df2.to_dict("records") + df1['chromosomeBand'] = df1['chromosomeBand'].apply(lambda x: 'chr99' + x) + df1 = df1.to_dict('records') + df2['chromosomeBand'] = df2['chromosomeBand'].apply(lambda x: '99' + x) + df2 = df2.to_dict('records') records = preprocess_copy_variants(df1) assert records assert len(records) == 4603 - assert records[0]["chromosomeBand"] == "chr99q22.1" - assert "chr" not in records[0] # make sure these cols are still getting removed - assert "chromosome" not in records[0] + assert records[0]['chromosomeBand'] == 'chr99q22.1' + assert 'chr' not in records[0] # make sure these cols are still getting removed + assert 'chromosome' not in records[0] records2 = preprocess_copy_variants(df2) - assert records2[0]["chromosomeBand"] == "99q22.1" + assert records2[0]['chromosomeBand'] == '99q22.1' def test_no_error_if_chr_column_not_present(self) -> None: - df1 = pd.read_csv(os.path.join(DATA_DIR, "copy_variants.tab"), sep="\t") - df1.drop("chromosome", axis=1, inplace=True) - df1 = df1.to_dict("records") + df1 = pd.read_csv(os.path.join(DATA_DIR, 'copy_variants.tab'), sep='\t') + df1.drop('chromosome', axis=1, inplace=True) + df1 = df1.to_dict('records') records = preprocess_copy_variants(df1) assert records assert len(records) == 4603 - assert records[0]["chromosomeBand"] == "q22.1" + assert records[0]['chromosomeBand'] == 'q22.1' def test_null(self): for kb_cat in list(INPUT_COPY_CATEGORIES.values()) + NON_EMPTY_STRING_NULLS: - original = {"gene": "ERBB2", "kbCategory": kb_cat} + original = {'gene': 'ERBB2', 'kbCategory': kb_cat} for key in COPY_OPTIONAL: for null in NON_EMPTY_STRING_NULLS: copy_var = original.copy() copy_var[key] = null records = preprocess_copy_variants([copy_var]) record = records[0] - assert record["variantType"] == "cnv" + assert record['variantType'] == 'cnv' class TestPreProcessSignatureVariants: - # Preprocessing records from file cosmic = preprocess_cosmic( [ - r["signature"] - for r in pd.read_csv(os.path.join(DATA_DIR, "cosmic_variants.tab"), sep="\t").to_dict( - "records" + r['signature'] + for r in pd.read_csv(os.path.join(DATA_DIR, 'cosmic_variants.tab'), sep='\t').to_dict( + 'records' ) ] ) hla = preprocess_hla( - pd.read_csv(os.path.join(DATA_DIR, "hla_variants.tab"), sep="\t").to_dict("records") + pd.read_csv(os.path.join(DATA_DIR, 'hla_variants.tab'), sep='\t').to_dict('records') ) tmb = preprocess_tmb( tmb_high=TMB_SIGNATURE_HIGH_THRESHOLD, tmburMutationBurden=pd.read_csv( - os.path.join(DATA_DIR, "tmburMutationBurden.tab"), sep="\t" - ).to_dict("records"), - genomeTmb="11.430000000000001", + os.path.join(DATA_DIR, 'tmburMutationBurden.tab'), sep='\t' + ).to_dict('records'), + genomeTmb='11.430000000000001', ) msi = preprocess_msi( [ { - "score": 27.55, - "kbCategory": "microsatellite instability", - "key": "microsatellite instability", + 'score': 27.55, + 'kbCategory': 'microsatellite instability', + 'key': 'microsatellite instability', } ] ) + hrd = preprocess_hrd( + { + 'score': 9999, + 'kbCategory': 'homologous recombination deficiency strong signature', + 'key': 'homologous recombination deficiency strong 
signature', + } + ) # tests on preprocessed records def test_preprocess_cosmic(self) -> None: assert self.cosmic assert len(self.cosmic) == len(EXPECTED_COSMIC) - assert "variantTypeName" in self.cosmic[0] - assert "displayName" in self.cosmic[0] + assert 'variantTypeName' in self.cosmic[0] + assert 'displayName' in self.cosmic[0] - signatureNames = {r.get("signatureName", "") for r in self.cosmic} + signatureNames = {r.get('signatureName', '') for r in self.cosmic} assert len(EXPECTED_COSMIC.symmetric_difference(signatureNames)) == 0 def test_preprocess_hla(self) -> None: assert self.hla assert len(self.hla) == len(EXPECTED_HLA) - assert "variantTypeName" in self.hla[0] - assert "displayName" in self.hla[0] + assert 'variantTypeName' in self.hla[0] + assert 'displayName' in self.hla[0] - signatureNames = {r.get("signatureName", "") for r in self.hla} + signatureNames = {r.get('signatureName', '') for r in self.hla} assert len(EXPECTED_HLA.symmetric_difference(signatureNames)) == 0 def test_preprocess_tmb(self) -> None: assert self.tmb assert len(self.tmb) == len(EXPECTED_TMB) - assert "variantTypeName" in self.tmb[0] - assert "displayName" in self.tmb[0] + assert 'variantTypeName' in self.tmb[0] + assert 'displayName' in self.tmb[0] - signatureNames = {r.get("signatureName", "") for r in self.tmb} + signatureNames = {r.get('signatureName', '') for r in self.tmb} assert len(EXPECTED_TMB.symmetric_difference(signatureNames)) == 0 def test_preprocess_msi(self) -> None: assert self.msi assert len(self.msi) == len(EXPECTED_MSI) - assert "variantTypeName" in self.msi[0] - assert "displayName" in self.msi[0] + assert 'variantTypeName' in self.msi[0] + assert 'displayName' in self.msi[0] - signatureNames = {r.get("signatureName", "") for r in self.msi} + signatureNames = {r.get('signatureName', '') for r in self.msi} assert len(EXPECTED_MSI.symmetric_difference(signatureNames)) == 0 + def test_preprocess_hrd(self) -> None: + assert self.hrd + assert len(self.hrd) == len(EXPECTED_HRD) + assert 'variantTypeName' in self.hrd[0] + assert 'displayName' in self.hrd[0] + + signatureNames = {r.get('signatureName', '') for r in self.hrd} + assert len(EXPECTED_HRD.symmetric_difference(signatureNames)) == 0 + def test_preprocess_signature_variants(self) -> None: records = preprocess_signature_variants( [ @@ -285,100 +305,100 @@ def test_preprocess_signature_variants(self) -> None: assert len(records) == ( len(EXPECTED_COSMIC) + len(EXPECTED_HLA) + len(EXPECTED_TMB) + len(EXPECTED_MSI) ) - assert "key" in records[0] + assert 'key' in records[0] def test_load_structural_variants() -> None: records = preprocess_structural_variants( - pd.read_csv(os.path.join(DATA_DIR, "fusions.tab"), sep="\t").to_dict("records") + pd.read_csv(os.path.join(DATA_DIR, 'fusions.tab'), sep='\t').to_dict('records') ) assert records assert len(records) == 7 - assert records[0]["variantType"] == "sv" - assert "variant" in records[0] + assert records[0]['variantType'] == 'sv' + assert 'variant' in records[0] def test_load_expression_variants() -> None: records = preprocess_expression_variants( - pd.read_csv(os.path.join(DATA_DIR, "expression.tab"), sep="\t").to_dict("records") + pd.read_csv(os.path.join(DATA_DIR, 'expression.tab'), sep='\t').to_dict('records') ) assert records assert len(records) == 4603 - assert records[0]["variantType"] == "exp" - assert "variant" in records[0] + assert records[0]['variantType'] == 'exp' + assert 'variant' in records[0] class TestCheckVariantLinks: def test_sm_missing_copy_empty_ok(self) -> None: 
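# Note: taken together, the tests in this class pin check_variant_links down
# to roughly the behaviour sketched here (a simplified re-implementation for
# illustration only, not the pori_python code):
def sketch_check_variant_links(small_mutations, copy_variants, expression_variants):
    copy_genes = {v['gene'] for v in copy_variants}
    exp_genes = {v['gene'] for v in expression_variants}
    genes = {v['gene'] for v in small_mutations}
    for gene in genes:
        # an empty category is fine; a populated one missing the gene is logged
        if (copy_genes and gene not in copy_genes) or (exp_genes and gene not in exp_genes):
            print(f'debug: incomplete variant coverage for {gene}')  # logger.debug in the real code
    return genes

assert sketch_check_variant_links(
    [{'gene': 'KRAS'}], [], [{'gene': 'KRAS', 'variant': ''}]
) == {'KRAS'}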
genes = check_variant_links( - small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore + small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore copy_variants=[], - expression_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore + expression_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore structural_variants=[], ) - assert genes == {"KRAS"} + assert genes == {'KRAS'} def test_sm_missing_exp_empty_ok(self) -> None: genes = check_variant_links( - small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore - copy_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore + small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore + copy_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore expression_variants=[], structural_variants=[], ) - assert genes == {"KRAS"} + assert genes == {'KRAS'} def test_sm_missing_copy(self) -> None: - with mock.patch.object(logger, "debug") as mock_debug: + with mock.patch.object(logger, 'debug') as mock_debug: check_variant_links( - small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore - copy_variants=[IprGeneVariant({"gene": "CDK", "variant": ""})], # type: ignore - expression_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore + small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore + copy_variants=[IprGeneVariant({'gene': 'CDK', 'variant': ''})], # type: ignore + expression_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore structural_variants=[], ) assert mock_debug.called def test_sm_missing_exp(self) -> None: - with mock.patch.object(logger, "debug") as mock_debug: + with mock.patch.object(logger, 'debug') as mock_debug: check_variant_links( - small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore - copy_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore - expression_variants=[IprGeneVariant({"gene": "CDK", "variant": ""})], # type: ignore + small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore + copy_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore + expression_variants=[IprGeneVariant({'gene': 'CDK', 'variant': ''})], # type: ignore structural_variants=[], ) assert mock_debug.called def test_with_valid_inputs(self) -> None: genes = check_variant_links( - small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore + small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore copy_variants=[ - IprGeneVariant({"gene": "KRAS", "variant": ""}), # type: ignore - IprGeneVariant({"gene": "CDK", "variant": ""}), # type: ignore + IprGeneVariant({'gene': 'KRAS', 'variant': ''}), # type: ignore + IprGeneVariant({'gene': 'CDK', 'variant': ''}), # type: ignore ], - expression_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore + expression_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore structural_variants=[], ) - assert genes == {"KRAS"} + assert genes == {'KRAS'} def test_copy_missing_exp(self) -> None: - with mock.patch.object(logger, "debug") as mock_debug: + with mock.patch.object(logger, 'debug') as mock_debug: check_variant_links( small_mutations=[], copy_variants=[ - IprGeneVariant({"gene": "BRAF", "variant": "copy gain"}), # type: ignore - IprGeneVariant({"gene": "KRAS", "variant": ""}), # type: ignore + IprGeneVariant({'gene': 'BRAF', 'variant': 'copy gain'}), # type: ignore + IprGeneVariant({'gene': 'KRAS', 'variant': ''}), # 
type: ignore ], - expression_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore + expression_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore structural_variants=[], ) assert mock_debug.called def test_exp_missing_copy(self) -> None: - with mock.patch.object(logger, "debug") as mock_debug: + with mock.patch.object(logger, 'debug') as mock_debug: check_variant_links( small_mutations=[], - copy_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore + copy_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore expression_variants=[ - IprGeneVariant({"gene": "BRAF", "variant": "increased expression"}) # type: ignore + IprGeneVariant({'gene': 'BRAF', 'variant': 'increased expression'}) # type: ignore ], structural_variants=[], ) @@ -388,67 +408,67 @@ def test_exp_missing_copy(self) -> None: class TestCreateGraphkbSvNotation: def test_both_genes_and_exons(self) -> None: notation = create_graphkb_sv_notation( - IprFusionVariant({"gene1": "A", "gene2": "B", "exon1": 1, "exon2": 2}) # type: ignore + IprFusionVariant({'gene1': 'A', 'gene2': 'B', 'exon1': 1, 'exon2': 2}) # type: ignore ) - assert notation == "(A,B):fusion(e.1,e.2)" + assert notation == '(A,B):fusion(e.1,e.2)' def test_one_exon_missing(self) -> None: notation = create_graphkb_sv_notation( - IprFusionVariant({"gene1": "A", "gene2": "B", "exon1": "", "exon2": 2}) # type: ignore + IprFusionVariant({'gene1': 'A', 'gene2': 'B', 'exon1': '', 'exon2': 2}) # type: ignore ) - assert notation == "(A,B):fusion(e.?,e.2)" + assert notation == '(A,B):fusion(e.?,e.2)' def test_one_gene_missing(self) -> None: notation = create_graphkb_sv_notation( - IprFusionVariant({"gene1": "A", "gene2": "", "exon1": 1, "exon2": 2}) # type: ignore + IprFusionVariant({'gene1': 'A', 'gene2': '', 'exon1': 1, 'exon2': 2}) # type: ignore ) - assert notation == "(A,?):fusion(e.1,e.2)" + assert notation == '(A,?):fusion(e.1,e.2)' def test_first_gene_missing(self) -> None: notation = create_graphkb_sv_notation( - IprFusionVariant({"gene1": "", "gene2": "B", "exon1": 1, "exon2": 2}) # type: ignore + IprFusionVariant({'gene1': '', 'gene2': 'B', 'exon1': 1, 'exon2': 2}) # type: ignore ) - assert notation == "(B,?):fusion(e.2,e.1)" + assert notation == '(B,?):fusion(e.2,e.1)' def test_no_genes_error(self) -> None: with pytest.raises(ValueError): create_graphkb_sv_notation( - IprFusionVariant({"gene1": "", "gene2": "", "exon1": 1, "exon2": 2, "key": "x"}) # type: ignore + IprFusionVariant({'gene1': '', 'gene2': '', 'exon1': 1, 'exon2': 2, 'key': 'x'}) # type: ignore ) class TestCheckComparators: def test_missing_disease_expression_error(self): - content = {"comparators": [{"analysisRole": "expression (primary site)"}]} + content = {'comparators': [{'analysisRole': 'expression (primary site)'}]} variants = [{}] with pytest.raises(ValueError): check_comparators(content, variants) def test_missing_primary_expression_error(self): - content = {"comparators": [{"analysisRole": "expression (disease)"}]} - variants = [{"primarySiteFoldChange": 1}] + content = {'comparators': [{'analysisRole': 'expression (disease)'}]} + variants = [{'primarySiteFoldChange': 1}] with pytest.raises(ValueError): check_comparators(content, variants) def test_missing_biopsy_expression_error(self): - content = {"comparators": [{"analysisRole": "expression (disease)"}]} - variants = [{"biopsySitePercentile": 1}] + content = {'comparators': [{'analysisRole': 'expression (disease)'}]} + variants = [{'biopsySitePercentile': 
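# Note: the notation rules asserted in TestCreateGraphkbSvNotation, summarized
# as data (keys are (gene1, gene2, exon1, exon2); outputs copied from the tests):
FUSION_NOTATION_CASES = {
    ('A', 'B', 1, 2): '(A,B):fusion(e.1,e.2)',  # both genes and exons known
    ('A', 'B', '', 2): '(A,B):fusion(e.?,e.2)',  # a missing exon becomes 'e.?'
    ('A', '', 1, 2): '(A,?):fusion(e.1,e.2)',  # a missing gene becomes '?'
    ('', 'B', 1, 2): '(B,?):fusion(e.2,e.1)',  # gene1 missing: pair is flipped
}
# With both genes missing the function raises ValueError instead.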
1}] with pytest.raises(ValueError): check_comparators(content, variants) def test_expression_not_required_without_variants(self): - content = {"comparators": []} + content = {'comparators': []} variants = [] assert check_comparators(content, variants) is None def test_missing_mutation_burden(self): content = { - "comparators": [{"analysisRole": "mutation burden (secondary)"}], - "images": [{"key": "mutationBurden.density_snv.primary"}], + 'comparators': [{'analysisRole': 'mutation burden (secondary)'}], + 'images': [{'key': 'mutationBurden.density_snv.primary'}], } variants = [] @@ -456,8 +476,8 @@ def test_missing_mutation_burden(self): check_comparators(content, variants) -@pytest.mark.parametrize("example_name", ["no_variants", "sm_and_exp", "sm_only"]) +@pytest.mark.parametrize('example_name', ['no_variants', 'sm_and_exp', 'sm_only']) def test_valid_json_inputs(example_name: str): - with open(os.path.join(DATA_DIR, "json_examples", f"{example_name}.json"), "r") as fh: + with open(os.path.join(DATA_DIR, 'json_examples', f'{example_name}.json'), 'r') as fh: content = json.load(fh) validate_report_content(content) diff --git a/tests/test_ipr/test_ipr.py b/tests/test_ipr/test_ipr.py index adadcf9e..3e9b01a3 100644 --- a/tests/test_ipr/test_ipr.py +++ b/tests/test_ipr/test_ipr.py @@ -10,196 +10,198 @@ get_kb_matched_statements, get_kb_statement_matched_conditions, get_kb_variants, + get_kb_matches_sections, + create_key_alterations, ) from pori_python.types import Statement -DISEASE_RIDS = ["#138:12", "#138:13"] -APPROVED_EVIDENCE_RIDS = ["approved1", "approved2"] +DISEASE_RIDS = ['#138:12', '#138:13'] +APPROVED_EVIDENCE_RIDS = ['approved1', 'approved2'] GERMLINE_VARIANTS = [ { - "key": "1", - "germline": True, - "hgvsCds": "SLC28A3:c.1381C>T", - "hgvsGenomic": "chr9:g.84286011G>A", - "hgvsProtein": "SLC28A3:p.L461L", - "ncbiBuild": "GRCh38", - "normalAltCount": 37, - "normalDepth": 37, - "normalRefCount": 0, - "proteinChange": "p.L461L", - "rnaAltCount": "", - "rnaDepth": "", - "rnaRefCount": "", - "startPosition": 84286011, - "transcript": "ENST00000376238", - "tumourAltCount": "", - "tumourDepth": "", - "tumourRefCount": "", - "variant": "SLC28A3:p.L461L", - "variantType": "mut", - "zygosity": "", + 'key': '1', + 'germline': True, + 'hgvsCds': 'SLC28A3:c.1381C>T', + 'hgvsGenomic': 'chr9:g.84286011G>A', + 'hgvsProtein': 'SLC28A3:p.L461L', + 'ncbiBuild': 'GRCh38', + 'normalAltCount': 37, + 'normalDepth': 37, + 'normalRefCount': 0, + 'proteinChange': 'p.L461L', + 'rnaAltCount': '', + 'rnaDepth': '', + 'rnaRefCount': '', + 'startPosition': 84286011, + 'transcript': 'ENST00000376238', + 'tumourAltCount': '', + 'tumourDepth': '', + 'tumourRefCount': '', + 'variant': 'SLC28A3:p.L461L', + 'variantType': 'mut', + 'zygosity': '', }, { - "key": "2", - "germline": True, - "hgvsCds": "BRCA1:c.4837A>", - "hgvsGenomic": "chr17:g.43071077T>C", - "hgvsProtein": "BRCA1:p.S1613G", - "normalAltCount": 33, - "normalDepth": 33, - "normalRefCount": 0, - "tumourAltCount": 37, - "tumourDepth": 37, - "tumourRefCount": 0, + 'key': '2', + 'germline': True, + 'hgvsCds': 'BRCA1:c.4837A>', + 'hgvsGenomic': 'chr17:g.43071077T>C', + 'hgvsProtein': 'BRCA1:p.S1613G', + 'normalAltCount': 33, + 'normalDepth': 33, + 'normalRefCount': 0, + 'tumourAltCount': 37, + 'tumourDepth': 37, + 'tumourRefCount': 0, }, ] SOMATIC_VARIANTS = [ { - "key": "1", - "gene": "SLC28A3", - "germline": False, - "hgvsCds": "SLC28A3:c.1381C>T", - "hgvsGenomic": "chr9:g.84286011G>A", - "hgvsProtein": "SLC28A3:p.L461L", - "ncbiBuild": "GRCh38", - 
"normalAltCount": 0, - "normalDepth": 37, - "normalRefCount": 37, - "tumourAltCount": 37, - "tumourDepth": 37, - "tumourRefCount": 0, - "variant": "SLC28A3:p.L461L", - "variantType": "mut", - "zygosity": "", + 'key': '1', + 'gene': 'SLC28A3', + 'germline': False, + 'hgvsCds': 'SLC28A3:c.1381C>T', + 'hgvsGenomic': 'chr9:g.84286011G>A', + 'hgvsProtein': 'SLC28A3:p.L461L', + 'ncbiBuild': 'GRCh38', + 'normalAltCount': 0, + 'normalDepth': 37, + 'normalRefCount': 37, + 'tumourAltCount': 37, + 'tumourDepth': 37, + 'tumourRefCount': 0, + 'variant': 'SLC28A3:p.L461L', + 'variantType': 'mut', + 'zygosity': '', }, { - "key": "2", - "germline": False, - "hgvsCds": "BRCA1:c.4837A>", - "hgvsGenomic": "chr17:g.43071077T>C", - "hgvsProtein": "BRCA1:p.S1613G", - "normalAltCount": 1, - "normalDepth": 33, - "normalRefCount": 32, - "tumourAltCount": 37, - "tumourDepth": 37, - "tumourRefCount": 0, + 'key': '2', + 'germline': False, + 'hgvsCds': 'BRCA1:c.4837A>', + 'hgvsGenomic': 'chr17:g.43071077T>C', + 'hgvsProtein': 'BRCA1:p.S1613G', + 'normalAltCount': 1, + 'normalDepth': 33, + 'normalRefCount': 32, + 'tumourAltCount': 37, + 'tumourDepth': 37, + 'tumourRefCount': 0, }, ] GERMLINE_KB_MATCHES = [ { - "variant": "1", - "approvedTherapy": False, - "category": "pharmacogenomic", - "context": "anthracyclines", - "kbContextId": "#122:20944", - "kbRelevanceId": "#147:38", - "kbStatementId": "#154:13387", - "kbVariant": "SLC28A3:c.1381C>T", - "kbVariantId": "#159:5426", - "matchedCancer": False, - "reference": "PMID: 27197003", - "relevance": "decreased toxicity", - "reviewStatus": "initial", + 'variant': '1', + 'approvedTherapy': False, + 'category': 'pharmacogenomic', + 'context': 'anthracyclines', + 'kbContextId': '#122:20944', + 'kbRelevanceId': '#147:38', + 'kbStatementId': '#154:13387', + 'kbVariant': 'SLC28A3:c.1381C>T', + 'kbVariantId': '#159:5426', + 'matchedCancer': False, + 'reference': 'PMID: 27197003', + 'relevance': 'decreased toxicity', + 'reviewStatus': 'initial', }, { - "variant": "2", - "approvedTherapy": True, - "category": "cancer predisposition", - "kbContextId": "#135:8764", - "kbRelevanceId": "#147:32", - "kbStatementId": "#155:13511", - "kbVariant": "BRCA1 mutation", - "kbVariantId": "#161:938", - "matchedCancer": False, - "reference": "MOAlmanac FDA-56", - "relevance": "therapy", - "reviewStatus": None, + 'variant': '2', + 'approvedTherapy': True, + 'category': 'cancer predisposition', + 'kbContextId': '#135:8764', + 'kbRelevanceId': '#147:32', + 'kbStatementId': '#155:13511', + 'kbVariant': 'BRCA1 mutation', + 'kbVariantId': '#161:938', + 'matchedCancer': False, + 'reference': 'MOAlmanac FDA-56', + 'relevance': 'therapy', + 'reviewStatus': None, }, ] SOMATIC_KB_MATCHES = [ { - "variant": "1", - "approvedTherapy": False, - "category": "prognostic", - "kbContextId": "somatic_test", - "kbRelevanceId": "#147:38", - "kbStatementId": "#154:13387", - "kbVariant": "SLC28A3:c.1381C>T", - "kbVariantId": "#159:5426", - "relevance": "prognostic", - "reviewStatus": "initial", + 'variant': '1', + 'approvedTherapy': False, + 'category': 'prognostic', + 'kbContextId': 'somatic_test', + 'kbRelevanceId': '#147:38', + 'kbStatementId': '#154:13387', + 'kbVariant': 'SLC28A3:c.1381C>T', + 'kbVariantId': '#159:5426', + 'relevance': 'prognostic', + 'reviewStatus': 'initial', }, { - "variant": "2", - "approvedTherapy": True, - "category": "therapy", - "kbContextId": "#135:8764", - "kbRelevanceId": "#147:32", - "kbStatementId": "#155:13511", - "kbVariant": "BRCA1 mutation", - "kbVariantId": "#161:938", - 
"matchedCancer": False, - "reference": "MOAlmanac FDA-56", - "relevance": "therapy", - "reviewStatus": None, + 'variant': '2', + 'approvedTherapy': True, + 'category': 'therapy', + 'kbContextId': '#135:8764', + 'kbRelevanceId': '#147:32', + 'kbStatementId': '#155:13511', + 'kbVariant': 'BRCA1 mutation', + 'kbVariantId': '#161:938', + 'matchedCancer': False, + 'reference': 'MOAlmanac FDA-56', + 'relevance': 'therapy', + 'reviewStatus': None, }, ] KB_MATCHES_STATEMENTS = [ { - "@rid": SOMATIC_KB_MATCHES[0]["kbStatementId"], - "conditions": [ + '@rid': SOMATIC_KB_MATCHES[0]['kbStatementId'], + 'conditions': [ { - "@class": "PositionalVariant", - "@rid": SOMATIC_KB_MATCHES[0]["kbVariantId"], + '@class': 'PositionalVariant', + '@rid': SOMATIC_KB_MATCHES[0]['kbVariantId'], }, - {"@class": "CategoryVariant", "@rid": SOMATIC_KB_MATCHES[1]["kbVariantId"]}, - {"@class": "Disease", "@rid": ""}, + {'@class': 'CategoryVariant', '@rid': SOMATIC_KB_MATCHES[1]['kbVariantId']}, + {'@class': 'Disease', '@rid': ''}, ], }, { - "@rid": SOMATIC_KB_MATCHES[1]["kbStatementId"], - "conditions": [ - {"@class": "CategoryVariant", "@rid": SOMATIC_KB_MATCHES[1]["kbVariantId"]}, - {"@class": "PositionalVariant", "@rid": "157:0", "type": "#999:99"}, + '@rid': SOMATIC_KB_MATCHES[1]['kbStatementId'], + 'conditions': [ + {'@class': 'CategoryVariant', '@rid': SOMATIC_KB_MATCHES[1]['kbVariantId']}, + {'@class': 'PositionalVariant', '@rid': '157:0', 'type': '#999:99'}, ], }, ] -def base_graphkb_statement(disease_id: str = "disease", relevance_rid: str = "other") -> Statement: +def base_graphkb_statement(disease_id: str = 'disease', relevance_rid: str = 'other') -> Statement: statement = Statement( # type: ignore { - "conditions": [ + 'conditions': [ { - "@class": "Disease", - "@rid": disease_id, - "displayName": "disease_display_name", + '@class': 'Disease', + '@rid': disease_id, + 'displayName': 'disease_display_name', }, { - "@class": "CategoryVariant", - "@rid": "variant_rid", - "displayName": "KRAS increased expression", + '@class': 'CategoryVariant', + '@rid': 'variant_rid', + 'displayName': 'KRAS increased expression', }, ], - "evidence": [], - "subject": { - "@class": "dummy_value", - "@rid": "101:010", - "displayName": "dummy_display_name", + 'evidence': [{'displayName': 'pmid12345', 'sourceId': 'nct12345'}], + 'subject': { + '@class': 'dummy_value', + '@rid': '101:010', + 'displayName': 'dummy_display_name', }, - "source": None, - "sourceId": None, - "relevance": { - "@rid": relevance_rid, - "displayName": "relevance_display_name", - "name": "relevance_name", + 'source': None, + 'sourceId': None, + 'relevance': { + '@rid': relevance_rid, + 'displayName': 'relevance_display_name', + 'name': 'relevance_name', }, - "@rid": "statement_rid", + '@rid': 'statement_rid', } ) return statement @@ -209,25 +211,25 @@ def base_graphkb_statement(disease_id: str = "disease", relevance_rid: str = "ot def graphkb_conn(): # Mock for the 'query' method query_mock = Mock() - query_return_values = [[{"@rid": v} for v in APPROVED_EVIDENCE_RIDS]] - query_index = {"value": -1} # Mutable index for closure + query_return_values = [[{'@rid': v} for v in APPROVED_EVIDENCE_RIDS]] + query_index = {'value': -1} # Mutable index for closure def query_side_effect(*args, **kwargs): if args: # for TestGetKbDiseaseMatches - return [{"@rid": "#123:45"}] - query_index["value"] += 1 - idx = query_index["value"] + return [{'@rid': '#123:45', 'name': 'name_for_#123:45'}] + query_index['value'] += 1 + idx = query_index['value'] return query_return_values[idx] 
if idx < len(query_return_values) else [] query_mock.side_effect = query_side_effect # Mock for the 'post' method - post_mock = Mock(return_value={"result": KB_MATCHES_STATEMENTS}) + post_mock = Mock(return_value={'result': KB_MATCHES_STATEMENTS}) # 'get_source' remains a plain function def mock_get_source(source): - return {"@rid": 0} + return {'@rid': 0} # Create the connection mock with attributes conn = Mock() @@ -243,8 +245,8 @@ def mock_get_source(source): @pytest.fixture(autouse=True) def mock_get_term_tree(monkeypatch): - mock_func = Mock(return_value=[{"@rid": d} for d in DISEASE_RIDS]) - monkeypatch.setattr(gkb_vocab, "get_term_tree", mock_func) + mock_func = Mock(return_value=[{'@rid': d, 'name': 'name_of_' + d} for d in DISEASE_RIDS]) + monkeypatch.setattr(gkb_vocab, 'get_term_tree', mock_func) yield mock_func mock_func.reset_mock() @@ -252,9 +254,9 @@ def mock_get_term_tree(monkeypatch): @pytest.fixture(autouse=True) def get_terms_set(monkeypatch): def mock_func(*pos, **kwargs): - return {"#999:99"} + return {'#999:99'} - monkeypatch.setattr(gkb_vocab, "get_terms_set", mock_func) + monkeypatch.setattr(gkb_vocab, 'get_terms_set', mock_func) @pytest.fixture(autouse=True) @@ -262,22 +264,22 @@ def mock_categorize_relevance(monkeypatch): def mock_func(_, relevance_id): return relevance_id - monkeypatch.setattr(gkb_statement, "categorize_relevance", mock_func) + monkeypatch.setattr(gkb_statement, 'categorize_relevance', mock_func) class TestGetKbDiseaseMatches: def test_get_kb_disease_matches_default_disease(self, graphkb_conn) -> None: get_kb_disease_matches(graphkb_conn) # default to 'cancer' assert graphkb_conn.post.called - assert graphkb_conn.post.call_args_list[0].args[0] == "/subgraphs/Disease" + assert graphkb_conn.post.call_args_list[0].args[0] == '/subgraphs/Disease' def test_get_kb_disease_matches_disease_with_subgraphs(self, graphkb_conn) -> None: - get_kb_disease_matches(graphkb_conn, "Breast Cancer") + get_kb_disease_matches(graphkb_conn, 'Breast Cancer') assert graphkb_conn.post.called - assert graphkb_conn.post.call_args_list[0].args[0] == "/subgraphs/Disease" + assert graphkb_conn.post.call_args_list[0].args[0] == '/subgraphs/Disease' def test_get_kb_disease_matches_get_term_tree(self, graphkb_conn) -> None: - get_kb_disease_matches(graphkb_conn, "Breast Cancer", useSubgraphsRoute=False) + get_kb_disease_matches(graphkb_conn, 'Breast Cancer', useSubgraphsRoute=False) assert not graphkb_conn.post.called @@ -285,210 +287,238 @@ class TestConvertStatementsToAlterations: def test_disease_match(self, graphkb_conn) -> None: statement = base_graphkb_statement(DISEASE_RIDS[0]) result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, {"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 1 row = result[0] - assert row["kbVariantId"] == "variant_rid" - assert row["kbStatementId"] == "statement_rid" - assert row["matchedCancer"] - assert row["kbVariant"] == "KRAS increased expression" - assert row["relevance"] == "relevance_display_name" + assert row['kbVariantId'] == 'variant_rid' + assert row['kbStatementId'] == 'statement_rid' + assert row['matchedCancer'] + assert row['kbVariant'] == 'KRAS increased expression' + assert row['relevance'] == 'relevance_display_name' def test_no_disease_match(self, graphkb_conn) -> None: - statement = base_graphkb_statement("other") + statement = base_graphkb_statement('other') result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, 
{"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 1 row = result[0] - assert not row["matchedCancer"] + assert not row['matchedCancer'] def test_multiple_disease_not_match(self, graphkb_conn) -> None: - statement = base_graphkb_statement("disease") - statement["conditions"].append( - {"@class": "Disease", "@rid": "other", "displayName": "disease_display_name"} # type: ignore + statement = base_graphkb_statement('disease') + statement['conditions'].append( + {'@class': 'Disease', '@rid': 'other', 'displayName': 'disease_display_name'} # type: ignore ) result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, {"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 1 row = result[0] - assert not row["matchedCancer"] + assert not row['matchedCancer'] def test_biological(self, graphkb_conn) -> None: statement = base_graphkb_statement() - statement["relevance"]["@rid"] = "biological" + statement['relevance']['@rid'] = 'biological' result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, {"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 1 row = result[0] - assert row["category"] == "biological" + assert row['category'] == 'biological' def test_prognostic_no_disease_match(self, graphkb_conn) -> None: statement = base_graphkb_statement() - statement["relevance"]["@rid"] = "prognostic" + statement['relevance']['@rid'] = 'prognostic' result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, {"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 0 def test_prognostic_disease_match(self, graphkb_conn) -> None: statement = base_graphkb_statement(DISEASE_RIDS[0]) - statement["relevance"]["@rid"] = "prognostic" + statement['relevance']['@rid'] = 'prognostic' result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, {"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 1 row = result[0] - assert row["category"] == "prognostic" + assert row['category'] == 'prognostic' def test_diagnostic(self, graphkb_conn) -> None: statement = base_graphkb_statement() - statement["relevance"]["@rid"] = "diagnostic" + statement['relevance']['@rid'] = 'diagnostic' result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, {"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 1 row = result[0] - assert row["category"] == "diagnostic" + assert row['category'] == 'diagnostic' - @patch("pori_python.ipr.ipr.get_evidencelevel_mapping") + def test_reference_from_displayname_for_noneligibility_stmts(self, graphkb_conn) -> None: + statement = base_graphkb_statement() + + result = convert_statements_to_alterations( + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} + ) + assert len(result) == 1 + row = result[0] + assert row['reference'] == 'pmid12345' + + def test_reference_from_sourceid_for_eligibility_stmts(self, graphkb_conn) -> None: + statement = base_graphkb_statement() + statement['relevance']['name'] = 'eligibility' + + result = convert_statements_to_alterations( + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} + ) + assert len(result) == 1 + row = result[0] + assert row['reference'] == 'nct12345' + + @patch('pori_python.ipr.ipr.get_evidencelevel_mapping') def test_unapproved_therapeutic(self, 
mock_get_evidencelevel_mapping, graphkb_conn) -> None: - mock_get_evidencelevel_mapping.return_value = {"other": "test"} + mock_get_evidencelevel_mapping.return_value = {'other': 'test'} statement = base_graphkb_statement() - statement["relevance"]["@rid"] = "therapeutic" - statement["evidenceLevel"] = [{"@rid": "other", "displayName": "level"}] # type: ignore + statement['relevance']['@rid'] = 'therapeutic' + statement['evidenceLevel'] = [{'@rid': 'other', 'displayName': 'level'}] # type: ignore result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, {"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 1 row = result[0] - assert row["category"] == "therapeutic" + assert row['category'] == 'therapeutic' - @patch("pori_python.ipr.ipr.get_evidencelevel_mapping") + @patch('pori_python.ipr.ipr.get_evidencelevel_mapping') def test_approved_therapeutic(self, mock_get_evidencelevel_mapping, graphkb_conn) -> None: - mock_get_evidencelevel_mapping.return_value = {APPROVED_EVIDENCE_RIDS[0]: "test"} + mock_get_evidencelevel_mapping.return_value = {APPROVED_EVIDENCE_RIDS[0]: 'test'} statement = base_graphkb_statement() - statement["relevance"]["@rid"] = "therapeutic" - statement["evidenceLevel"] = [{"@rid": APPROVED_EVIDENCE_RIDS[0], "displayName": "level"}] # type: ignore + statement['relevance']['@rid'] = 'therapeutic' + statement['evidenceLevel'] = [{'@rid': APPROVED_EVIDENCE_RIDS[0], 'displayName': 'level'}] # type: ignore result = convert_statements_to_alterations( - graphkb_conn, [statement], DISEASE_RIDS, {"variant_rid"} + graphkb_conn, [statement], DISEASE_RIDS, {'variant_rid'} ) assert len(result) == 1 row = result[0] - assert row["category"] == "therapeutic" + assert row['category'] == 'therapeutic' class TestKbmatchFilters: def test_germline_kb_matches(self): assert len(germline_kb_matches(GERMLINE_KB_MATCHES, GERMLINE_VARIANTS)) == len( GERMLINE_KB_MATCHES - ), "Germline variant not matched to germline KB statement." - assert not germline_kb_matches( - GERMLINE_KB_MATCHES, SOMATIC_VARIANTS - ), "Somatic variant matched to KB germline statement." + ), 'Germline variant not matched to germline KB statement.' + assert not germline_kb_matches(GERMLINE_KB_MATCHES, SOMATIC_VARIANTS), ( + 'Somatic variant matched to KB germline statement.' + ) assert len(germline_kb_matches(SOMATIC_KB_MATCHES, SOMATIC_VARIANTS)) == len( SOMATIC_KB_MATCHES - ), "Somatic variant not matched to somatic KB statement." - assert not germline_kb_matches( - SOMATIC_KB_MATCHES, GERMLINE_VARIANTS - ), "Germline variant matched to KB somatic statement." + ), 'Somatic variant not matched to somatic KB statement.' + assert not germline_kb_matches(SOMATIC_KB_MATCHES, GERMLINE_VARIANTS), ( + 'Germline variant matched to KB somatic statement.' 
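# Note: the four assertions in test_germline_kb_matches amount to this rule
# (our paraphrase, not library documentation): a match survives only when the
# statement's germline-ness agrees with the observed variant's germline flag.
def keeps(statement_is_germline, variant_is_germline):
    return statement_is_germline == variant_is_germline

assert keeps(True, True) and keeps(False, False)
assert not keeps(True, False) and not keeps(False, True)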
+ ) GKB_MATCHES = [ { - "variant": "1", - "approvedTherapy": False, - "category": "prognostic", - "kbContextId": "somatic_test", - "kbRelevanceId": "#147:38", - "kbStatementId": "#154:13387", - "requiredKbMatches": ["#159:5426"], - "kbVariant": "SLC28A3:c.1381C>T", - "kbVariantId": "#159:5426", - "relevance": "prognostic", - "variantType": "mut", - "reviewStatus": "initial", + 'variant': '1', + 'approvedTherapy': False, + 'category': 'prognostic', + 'kbContextId': 'somatic_test', + 'kbRelevanceId': '#147:38', + 'kbStatementId': '#154:13387', + 'requiredKbMatches': ['#159:5426'], + 'kbVariant': 'SLC28A3:c.1381C>T', + 'kbVariantId': '#159:5426', + 'relevance': 'prognostic', + 'variantType': 'mut', + 'reviewStatus': 'initial', }, { - "variant": "2", - "approvedTherapy": True, - "category": "therapy", - "kbContextId": "#135:8764", - "kbRelevanceId": "#147:32", - "kbStatementId": "#155:13511", - "requiredKbMatches": ["#161:938"], - "kbVariant": "BRCA1 mutation", - "kbVariantId": "#161:938", - "matchedCancer": False, - "reference": "MOAlmanac FDA-56", - "relevance": "therapy", - "variantType": "mut", - "reviewStatus": None, + 'variant': '2', + 'approvedTherapy': True, + 'category': 'therapy', + 'kbContextId': '#135:8764', + 'kbRelevanceId': '#147:32', + 'kbStatementId': '#155:13511', + 'requiredKbMatches': ['#161:938'], + 'kbVariant': 'BRCA1 mutation', + 'kbVariantId': '#161:938', + 'matchedCancer': False, + 'reference': 'MOAlmanac FDA-56', + 'relevance': 'therapy', + 'variantType': 'mut', + 'reviewStatus': None, }, { - "variant": "3", - "approvedTherapy": True, - "category": "therapy", - "kbContextId": "#135:8764", - "kbRelevanceId": "#147:32", - "kbStatementId": "#155:13511", - "requiredKbMatches": ["#161:938"], - "kbVariant": "BRCA1 mutation", - "kbVariantId": "#161:938", - "matchedCancer": False, - "reference": "MOAlmanac FDA-56", - "relevance": "therapy", - "variantType": "mut", - "reviewStatus": None, + 'variant': '3', + 'approvedTherapy': True, + 'category': 'therapy', + 'kbContextId': '#135:8764', + 'kbRelevanceId': '#147:32', + 'kbStatementId': '#155:13511', + 'requiredKbMatches': ['#161:938'], + 'kbVariant': 'BRCA1 mutation', + 'kbVariantId': '#161:938', + 'matchedCancer': False, + 'reference': 'MOAlmanac FDA-56', + 'relevance': 'therapy', + 'variantType': 'mut', + 'reviewStatus': None, }, { - "variant": "4", - "approvedTherapy": True, - "category": "therapy", - "kbContextId": "#135:8764", - "kbRelevanceId": "#147:32", - "kbStatementId": "#155:13511", - "requiredKbMatches": ["#159:5426", "#161:938"], - "kbVariant": "BRCA1 mutation", - "kbVariantId": "#161:938", - "matchedCancer": False, - "reference": "MOAlmanac FDA-56", - "relevance": "therapy", - "variantType": "mut", - "reviewStatus": None, + 'variant': '4', + 'approvedTherapy': True, + 'category': 'therapy', + 'kbContextId': '#135:8764', + 'kbRelevanceId': '#147:32', + 'kbStatementId': '#155:13511', + 'requiredKbMatches': ['#159:54261', '#161:9381'], + 'kbVariant': 'BRCA1 mutation', + 'kbVariantId': '#161:9381', + 'matchedCancer': False, + 'reference': 'MOAlmanac FDA-56', + 'relevance': 'therapy', + 'variantType': 'mut', + 'reviewStatus': None, }, ] +ALL_VARIANTS = [ + {'variant': 'var1', 'key': '1', 'variantType': 'mut'}, + {'variant': 'var2', 'key': '2', 'variantType': 'mut'}, + {'variant': 'var3', 'key': '3', 'variantType': 'mut'}, + {'variant': 'var4', 'key': '4', 'variantType': 'mut'}, +] + BASIC_GKB_MATCH = { - "approvedTherapy": False, - "category": "test", - "context": "test", - "kbContextId": "#124:24761", - "disease": 
"test", - "evidenceLevel": "test", - "iprEvidenceLevel": "test", - "matchedCancer": False, - "reference": "test", - "relevance": "test", - "kbRelevanceId": "#148:31", - "externalSource": "", - "externalStatementId": "", - "reviewStatus": "passed", - "kbData": {}, + 'approvedTherapy': False, + 'category': 'test', + 'context': 'test', + 'kbContextId': '#124:24761', + 'disease': 'test', + 'evidenceLevel': 'test', + 'iprEvidenceLevel': 'test', + 'matchedCancer': False, + 'reference': 'test', + 'relevance': 'test', + 'kbRelevanceId': '#148:31', + 'externalSource': '', + 'externalStatementId': '', + 'reviewStatus': 'passed', + 'kbData': {}, } @@ -503,13 +533,13 @@ def create_gkb_matches(input_fields): def get_condition_set_string_rep(condition_set): for item in condition_set: - item["observedKeysStrs"] = [ - "-".join([elem["kbVariantId"], elem["observedVariantKey"]]) - for elem in item["matchedConditions"] + item['observedKeysStrs'] = [ + '-'.join([elem['kbVariantId'], elem['observedVariantKey']]) + for elem in item['matchedConditions'] ] - item["observedKeysStrs"].sort() - item["observedKeysStr"] = ",".join(item["observedKeysStrs"]) - condition_set = [f"{item['kbStatementId']},{item['observedKeysStr']}" for item in condition_set] + item['observedKeysStrs'].sort() + item['observedKeysStr'] = ','.join(item['observedKeysStrs']) + condition_set = [f'{item["kbStatementId"]},{item["observedKeysStr"]}' for item in condition_set] condition_set.sort() return condition_set @@ -517,132 +547,132 @@ def get_condition_set_string_rep(condition_set): class TestKbMatchSectionPrep: def test_matched_variant_pairs_extracted_only_once_for_multiple_statements(self): input_fields = [ - {"variant": "A", "kbVariantId": "test1", "kbStatementId": "test1"}, + {'variant': 'A', 'kbVariantId': 'test1', 'kbStatementId': 'test1'}, { - "variant": "A", - "kbVariantId": "test1", - "kbStatementId": "test2", + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'test2', }, # diff statement ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" - item["requiredKbMatches"] = ["test1", "test2"] + item['variantType'] = 'test' + item['kbVariant'] = 'test' + item['requiredKbMatches'] = ['test1', 'test2'] gkb_matches = create_gkb_matches(input_fields) kb_variants = get_kb_variants(gkb_matches) - found_variants = [f"{item['variant']},{item['kbVariantId']}" for item in kb_variants] + found_variants = [f'{item["variant"]},{item["kbVariantId"]}' for item in kb_variants] found_variants.sort() - assert found_variants == ["A,test1"] + assert found_variants == ['A,test1'] def test_all_distinct_observed_and_matched_variant_pairs_extracted(self): input_fields = [ - {"variant": "A", "kbVariantId": "test1", "kbStatementId": "test1"}, - {"variant": "A", "kbVariantId": "test2", "kbStatementId": "test1"}, - {"variant": "B", "kbVariantId": "test1", "kbStatementId": "test1"}, - {"variant": "B", "kbVariantId": "test1", "kbStatementId": "test1"}, + {'variant': 'A', 'kbVariantId': 'test1', 'kbStatementId': 'test1'}, + {'variant': 'A', 'kbVariantId': 'test2', 'kbStatementId': 'test1'}, + {'variant': 'B', 'kbVariantId': 'test1', 'kbStatementId': 'test1'}, + {'variant': 'B', 'kbVariantId': 'test1', 'kbStatementId': 'test1'}, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" - item["requiredKbMatches"] = ["test1", "test2"] + item['variantType'] = 'test' + item['kbVariant'] = 'test' + 
item['requiredKbMatches'] = ['test1', 'test2'] gkb_matches = create_gkb_matches(input_fields) kb_variants = get_kb_variants(gkb_matches) - found_variants = [f"{item['variant']},{item['kbVariantId']}" for item in kb_variants] + found_variants = [f'{item["variant"]},{item["kbVariantId"]}' for item in kb_variants] found_variants.sort() - assert found_variants == ["A,test1", "A,test2", "B,test1"] + assert found_variants == ['A,test1', 'A,test2', 'B,test1'] def test_statements_extracted_only_once(self): input_fields = [ - {"variant": "A", "kbVariantId": "test1", "kbStatementId": "X"}, - {"variant": "A", "kbVariantId": "test2", "kbStatementId": "X"}, - {"variant": "B", "kbVariantId": "test1", "kbStatementId": "X"}, - {"variant": "B", "kbVariantId": "test2", "kbStatementId": "X"}, - {"variant": "C", "kbVariantId": "test1", "kbStatementId": "Y"}, + {'variant': 'A', 'kbVariantId': 'test1', 'kbStatementId': 'X'}, + {'variant': 'A', 'kbVariantId': 'test2', 'kbStatementId': 'X'}, + {'variant': 'B', 'kbVariantId': 'test1', 'kbStatementId': 'X'}, + {'variant': 'B', 'kbVariantId': 'test2', 'kbStatementId': 'X'}, + {'variant': 'C', 'kbVariantId': 'test1', 'kbStatementId': 'Y'}, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" - item["requiredKbMatches"] = ["test1", "test2"] + item['variantType'] = 'test' + item['kbVariant'] = 'test' + item['requiredKbMatches'] = ['test1', 'test2'] gkb_matches = create_gkb_matches(input_fields) kb_stmts = get_kb_matched_statements(gkb_matches) - kb_stmts = [item["kbStatementId"] for item in kb_stmts] + kb_stmts = [item['kbStatementId'] for item in kb_stmts] kb_stmts.sort() - assert kb_stmts == ["X", "Y"] + assert kb_stmts == ['X', 'Y'] def test_singlevar_statements_with_multiple_satisfying_condition_sets(self): input_fields = [ - {"variant": "A", "kbVariantId": "test1", "kbStatementId": "X"}, - {"variant": "B", "kbVariantId": "test1", "kbStatementId": "X"}, - {"variant": "C", "kbVariantId": "test1", "kbStatementId": "X"}, + {'variant': 'A', 'kbVariantId': 'test1', 'kbStatementId': 'X'}, + {'variant': 'B', 'kbVariantId': 'test1', 'kbStatementId': 'X'}, + {'variant': 'C', 'kbVariantId': 'test1', 'kbStatementId': 'X'}, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" - item["requiredKbMatches"] = ["test1"] + item['variantType'] = 'test' + item['kbVariant'] = 'test' + item['requiredKbMatches'] = ['test1'] gkb_matches = create_gkb_matches(input_fields) kbcs = get_kb_statement_matched_conditions(gkb_matches) kbcs_string_rep = get_condition_set_string_rep(kbcs) - assert kbcs_string_rep == ["X,test1-A", "X,test1-B", "X,test1-C"] + assert kbcs_string_rep == ['X,test1-A', 'X,test1-B', 'X,test1-C'] def test_multivar_statements_with_multiple_satisfying_condition_sets(self): input_fields = [ - {"variant": "A", "kbVariantId": "test1", "kbStatementId": "X"}, - {"variant": "B", "kbVariantId": "test2", "kbStatementId": "X"}, - {"variant": "C", "kbVariantId": "test2", "kbStatementId": "X"}, + {'variant': 'A', 'kbVariantId': 'test1', 'kbStatementId': 'X'}, + {'variant': 'B', 'kbVariantId': 'test2', 'kbStatementId': 'X'}, + {'variant': 'C', 'kbVariantId': 'test2', 'kbStatementId': 'X'}, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" - item["requiredKbMatches"] = ["test1", "test2"] + item['variantType'] = 'test' + item['kbVariant'] = 'test' + 
item['requiredKbMatches'] = ['test1', 'test2'] gkb_matches = create_gkb_matches(input_fields) kbcs = get_kb_statement_matched_conditions(gkb_matches) kbcs_string_rep = get_condition_set_string_rep(kbcs) - assert kbcs_string_rep == ["X,test1-A,test2-B", "X,test1-A,test2-C"] + assert kbcs_string_rep == ['X,test1-A,test2-B', 'X,test1-A,test2-C'] def test_do_not_infer_possible_matches(self): """edge case - when infer_possible_matches is false, do not allow var/kbvar pairs to satisfy conditions for statements they are not explicitly linked to in the input""" input_fields = [ - {"variant": "A", "kbVariantId": "test1", "kbStatementId": "X"}, - {"variant": "B", "kbVariantId": "test1", "kbStatementId": "Y"}, + {'variant': 'A', 'kbVariantId': 'test1', 'kbStatementId': 'X'}, + {'variant': 'B', 'kbVariantId': 'test1', 'kbStatementId': 'Y'}, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" - item["requiredKbMatches"] = ["test1"] + item['variantType'] = 'test' + item['kbVariant'] = 'test' + item['requiredKbMatches'] = ['test1'] gkb_matches = create_gkb_matches(input_fields) kbcs = get_kb_statement_matched_conditions(gkb_matches) kbcs_string_rep = get_condition_set_string_rep(kbcs) - assert kbcs_string_rep == ["X,test1-A", "Y,test1-B"] + assert kbcs_string_rep == ['X,test1-A', 'Y,test1-B'] def test_no_dupes_when_requiredKbMatches_not_sorted(self): input_fields = [ { - "variant": "A", - "kbVariantId": "test1", - "requiredKbMatches": ["test1", "test2"], + 'variant': 'A', + 'kbVariantId': 'test1', + 'requiredKbMatches': ['test1', 'test2'], }, { - "variant": "B", - "kbVariantId": "test2", - "requiredKbMatches": ["test1", "test2"], + 'variant': 'B', + 'kbVariantId': 'test2', + 'requiredKbMatches': ['test1', 'test2'], }, { - "variant": "A", - "kbVariantId": "test1", - "requiredKbMatches": ["test2", "test1"], + 'variant': 'A', + 'kbVariantId': 'test1', + 'requiredKbMatches': ['test2', 'test1'], }, { - "variant": "B", - "kbVariantId": "test2", - "requiredKbMatches": ["test2", "test1"], + 'variant': 'B', + 'kbVariantId': 'test2', + 'requiredKbMatches': ['test2', 'test1'], }, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" - item["kbStatementId"] = "X" + item['variantType'] = 'test' + item['kbVariant'] = 'test' + item['kbStatementId'] = 'X' gkb_matches = create_gkb_matches(input_fields) stmts = get_kb_matched_statements(gkb_matches) kbcs = get_kb_statement_matched_conditions(gkb_matches) @@ -654,33 +684,36 @@ def test_partial_matches_omitted(self): are omitted when allow_partial_matches=False""" input_fields = [ { - "variant": "A", - "kbVariantId": "test1", - "kbStatementId": "X", - "requiredKbMatches": ["test1", "test2"], + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'X', + 'requiredKbMatches': ['test1', 'test2'], }, { - "variant": "B", - "kbVariantId": "test2", - "kbStatementId": "X", - "requiredKbMatches": ["test1", "test2"], + 'variant': 'B', + 'kbVariantId': 'test2', + 'kbStatementId': 'X', + 'requiredKbMatches': ['test1', 'test2'], }, { - "variant": "A", - "kbVariantId": "test1", - "kbStatementId": "Y", - "requiredKbMatches": ["test1", "test3"], + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'Y', + 'requiredKbMatches': ['test1', 'test3'], }, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" + item['variantType'] = 'test' + 
item['kbVariant'] = 'test' + gkb_matches = create_gkb_matches(input_fields) - stmts = get_kb_matched_statements(gkb_matches) - kbcs = get_kb_statement_matched_conditions(gkb_matches) + sections = get_kb_matches_sections(gkb_matches, allow_partial_matches=False) + + stmts = sections['kbMatchedStatements'] + kbcs = sections['kbStatementMatchedConditions'] assert len(stmts) == 2 assert len(kbcs) == 1 # X only - assert kbcs[0]["kbStatementId"] == "X" + assert kbcs[0]['kbStatementId'] == 'X' def test_partial_matches_omitted_even_when_var_used_elsewhere(self): """edge case - @@ -693,68 +726,133 @@ def test_partial_matches_omitted_even_when_var_used_elsewhere(self): satisfied for statement Z""" input_fields = [ { - "variant": "A", - "kbVariantId": "test1", - "kbStatementId": "X", - "requiredKbMatches": ["test1", "test2"], + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'X', + 'requiredKbMatches': ['test1', 'test2'], }, { - "variant": "B", - "kbVariantId": "test2", - "kbStatementId": "X", - "requiredKbMatches": ["test1", "test2"], + 'variant': 'B', + 'kbVariantId': 'test2', + 'kbStatementId': 'X', + 'requiredKbMatches': ['test1', 'test2'], }, { - "variant": "A", - "kbVariantId": "test1", - "kbStatementId": "Y", - "requiredKbMatches": ["test1", "test3"], + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'Y', + 'requiredKbMatches': ['test1', 'test3'], }, { - "variant": "C", - "kbVariantId": "test3", - "kbStatementId": "Z", - "requiredKbMatches": ["test3"], + 'variant': 'C', + 'kbVariantId': 'test3', + 'kbStatementId': 'Z', + 'requiredKbMatches': ['test3'], }, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" + item['variantType'] = 'test' + item['kbVariant'] = 'test' gkb_matches = create_gkb_matches(input_fields) stmts = get_kb_matched_statements(gkb_matches) kbcs = get_kb_statement_matched_conditions(gkb_matches) assert len(stmts) == 3 assert len(kbcs) == 2 # X and Z but not Y - assert "Y" not in [item["kbStatementId"] for item in kbcs] + assert 'Y' not in [item['kbStatementId'] for item in kbcs] + + def test_kbvariants_removed_from_set_when_not_part_of_full_conditionset_match(self): + """When there is a variant that fulfills one part of a statement's condition set, + but isn't part of any fully satisfied condition set, + the kbvariant record should be removed from the kbvariants list + """ + input_fields = [ + { + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'X', + 'requiredKbMatches': ['test1', 'test2', 'test3'], + }, + { + 'variant': 'B', + 'kbVariantId': 'test2', + 'kbStatementId': 'X', + 'requiredKbMatches': ['test1', 'test2', 'test3'], + }, + { + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'Y', + 'requiredKbMatches': ['test4', 'test1'], + }, + { + 'variant': 'D', + 'kbVariantId': 'test4', + 'kbStatementId': 'Y', + 'requiredKbMatches': ['test4', 'test1'], + }, + ] + for item in input_fields: # we don't care about these for this test + item['variantType'] = 'test' + item['kbVariant'] = 'test' + gkb_matches = create_gkb_matches(input_fields) + sections1 = get_kb_matches_sections(gkb_matches, allow_partial_matches=False) + kbcs1 = sections1['kbStatementMatchedConditions'] + kbvars1 = sections1['kbMatches'] + assert len(kbcs1) == 1 # only fully matched condition sets included + assert len(kbvars1) == 2 # therefore, kbvars associated with stmt X are pruned + + sections2 = get_kb_matches_sections(gkb_matches, allow_partial_matches=True) + kbcs2 = 
sections2['kbStatementMatchedConditions'] + kbvars2 = sections2['kbMatches'] + assert len(kbcs2) == 2 # all condition sets included + assert len(kbvars2) == 3 # therefore, no pruning def test_partial_matches_included(self): """check statements that are only partially supported are included when allow_partial_matches=True""" input_fields = [ { - "variant": "A", - "kbVariantId": "test1", - "kbStatementId": "X", - "requiredKbMatches": ["test1", "test2"], + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'X', + 'requiredKbMatches': ['test1', 'test2'], }, { - "variant": "B", - "kbVariantId": "test2", - "kbStatementId": "X", - "requiredKbMatches": ["test1", "test2"], + 'variant': 'B', + 'kbVariantId': 'test2', + 'kbStatementId': 'X', + 'requiredKbMatches': ['test1', 'test2'], }, { - "variant": "A", - "kbVariantId": "test1", - "kbStatementId": "Y", - "requiredKbMatches": ["test1", "test3"], + 'variant': 'A', + 'kbVariantId': 'test1', + 'kbStatementId': 'Y', + 'requiredKbMatches': ['test1', 'test3'], }, ] for item in input_fields: # we don't care about these for this test - item["variantType"] = "test" - item["kbVariant"] = "test" + item['variantType'] = 'test' + item['kbVariant'] = 'test' gkb_matches = create_gkb_matches(input_fields) stmts = get_kb_matched_statements(gkb_matches) kbcs = get_kb_statement_matched_conditions(gkb_matches, allow_partial_matches=True) assert len(stmts) == 2 # X and Y assert len(kbcs) == 2 + + def test_create_key_alterations_includes_only_pruned_kbmatches(self): + gkb_matches = create_gkb_matches(GKB_MATCHES) + + sections1 = get_kb_matches_sections(gkb_matches, allow_partial_matches=False) + key_alts1, counts1 = create_key_alterations( + gkb_matches, ALL_VARIANTS, sections1['kbMatches'] + ) + + sections2 = get_kb_matches_sections(gkb_matches, allow_partial_matches=True) + key_alts2, counts2 = create_key_alterations( + gkb_matches, ALL_VARIANTS, sections2['kbMatches'] + ) + + # check partial-match-only variants are not included in key alterations when + # partial matches is false + assert len(key_alts1) == 3 + assert len(key_alts2) == 4 diff --git a/tests/test_ipr/test_main.py b/tests/test_ipr/test_main.py index f4e9e788..8fe585cd 100644 --- a/tests/test_ipr/test_main.py +++ b/tests/test_ipr/test_main.py @@ -12,98 +12,108 @@ from .constants import EXCLUDE_INTEGRATION_TESTS -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" -EXCLUDE_ONCOKB_TESTS = os.environ.get("EXCLUDE_ONCOKB_TESTS") == "1" +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' +EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1' def get_test_spec(): - ipr_spec = {"components": {"schemas": {"genesCreate": {"properties": {}}}}} + ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}} ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__ for key in ipr_gene_keys: - ipr_spec["components"]["schemas"]["genesCreate"]["properties"][key] = "" + ipr_spec['components']['schemas']['genesCreate']['properties'][key] = '' return ipr_spec def get_test_file(name: str) -> str: - return os.path.join(os.path.dirname(__file__), "test_data", name) + return os.path.join(os.path.dirname(__file__), 'test_data', name) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def report_upload_content(tmp_path_factory) -> Dict: mock = MagicMock() - json_file = tmp_path_factory.mktemp("inputs") / "content.json" + json_file = tmp_path_factory.mktemp('inputs') / 'content.json' json_file.write_text( json.dumps( { - "blargh": 
"some fake content", - "comparators": [ - {"analysisRole": "expression (disease)", "name": "1"}, - {"analysisRole": "expression (primary site)", "name": "2"}, - {"analysisRole": "expression (biopsy site)", "name": "3"}, + 'blargh': 'some fake content', + 'comparators': [ + {'analysisRole': 'expression (disease)', 'name': '1'}, + {'analysisRole': 'expression (primary site)', 'name': '2'}, + {'analysisRole': 'expression (biopsy site)', 'name': '3'}, { - "analysisRole": "expression (internal pancancer cohort)", - "name": "4", + 'analysisRole': 'expression (internal pancancer cohort)', + 'name': '4', }, ], - "patientId": "PATIENT001", - "project": "TEST", - "expressionVariants": json.loads( - pd.read_csv(get_test_file("expression.short.tab"), sep="\t").to_json( - orient="records" + 'patientId': 'PATIENT001', + 'project': 'TEST', + 'expressionVariants': json.loads( + pd.read_csv(get_test_file('expression.short.tab'), sep='\t').to_json( + orient='records' ) ), - "smallMutations": json.loads( - pd.read_csv(get_test_file("small_mutations.short.tab"), sep="\t").to_json( - orient="records" + 'smallMutations': json.loads( + pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t').to_json( + orient='records' ) ), - "copyVariants": json.loads( - pd.read_csv(get_test_file("copy_variants.short.tab"), sep="\t").to_json( - orient="records" + 'copyVariants': json.loads( + pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t').to_json( + orient='records' ) ), - "structuralVariants": json.loads( - pd.read_csv(get_test_file("fusions.tab"), sep="\t").to_json(orient="records") + 'structuralVariants': json.loads( + pd.read_csv(get_test_file('fusions.tab'), sep='\t').to_json(orient='records') ), - "kbDiseaseMatch": "colorectal cancer", + 'kbDiseaseMatch': 'colorectal cancer', + 'msi': [ + { + 'score': 1000.0, + 'kbCategory': 'microsatellite instability', + } + ], + 'hrd': { + 'score': 9999.0, + 'kbCategory': 'homologous recombination deficiency strong signature', + }, }, allow_nan=False, ) ) def side_effect_function(*args, **kwargs): - if "templates" in args[0]: - return [{"name": "genomic", "ident": "001"}] - elif args[0] == "project": - return [{"name": "TEST", "ident": "001"}] + if 'templates' in args[0]: + return [{'name': 'genomic', 'ident': '001'}] + elif args[0] == 'project': + return [{'name': 'TEST', 'ident': '001'}] else: return [] with patch.object( sys, - "argv", + 'argv', [ - "ipr", - "--username", - os.environ.get("IPR_USER", os.environ["USER"]), - "--password", - os.environ["IPR_PASS"], - "--ipr_url", - "http://fake.url.ca", - "--graphkb_username", - os.environ.get("GRAPHKB_USER", os.environ["USER"]), - "--graphkb_password", - os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]), - "--graphkb_url", - os.environ.get("GRAPHKB_URL", False), - "--content", + 'ipr', + '--username', + os.environ.get('IPR_USER', os.environ['USER']), + '--password', + os.environ['IPR_PASS'], + '--ipr_url', + 'http://fake.url.ca', + '--graphkb_username', + os.environ.get('GRAPHKB_USER', os.environ['USER']), + '--graphkb_password', + os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']), + '--graphkb_url', + os.environ.get('GRAPHKB_URL', False), + '--content', str(json_file), - "--therapeutics", + '--therapeutics', ], ): - with patch.object(IprConnection, "upload_report", new=mock): - with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): - with patch.object(IprConnection, "get", side_effect=side_effect_function): + with patch.object(IprConnection, 'upload_report', new=mock): + with 
patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): + with patch.object(IprConnection, 'get', side_effect=side_effect_function): command_interface() assert mock.called @@ -112,57 +122,57 @@ def side_effect_function(*args, **kwargs): return report_content -@pytest.mark.skip(reason="KBDEV-1308; taking too long, getting canceled after reaching max delay") -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skip(reason='KBDEV-1308; taking too long, getting canceled after reaching max delay') +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') class TestCreateReport: def test_main_sections_present(self, report_upload_content: Dict) -> None: sections = set(report_upload_content.keys()) for section in [ - "structuralVariants", - "expressionVariants", - "copyVariants", - "smallMutations", - "kbMatches", - "genes", + 'structuralVariants', + 'expressionVariants', + 'copyVariants', + 'smallMutations', + 'kbMatches', + 'genes', ]: assert section in sections def test_kept_low_quality_fusion(self, report_upload_content: Dict) -> None: - fusions = [(sv["gene1"], sv["gene2"]) for sv in report_upload_content["structuralVariants"]] + fusions = [(sv['gene1'], sv['gene2']) for sv in report_upload_content['structuralVariants']] if ( EXCLUDE_BCGSC_TESTS ): # may be missing statements assoc with SUZ12 if no access to bcgsc data - assert ("SARM1", "CDKL2") in fusions + assert ('SARM1', 'CDKL2') in fusions else: - assert ("SARM1", "SUZ12") in fusions + assert ('SARM1', 'SUZ12') in fusions def test_pass_through_content_added(self, report_upload_content: Dict) -> None: # check the passthorough content was added - assert "blargh" in report_upload_content + assert 'blargh' in report_upload_content def test_found_fusion_partner_gene(self, report_upload_content: Dict) -> None: - genes = report_upload_content["genes"] + genes = report_upload_content['genes'] # eg, A1BG - assert any([g.get("knownFusionPartner", False) for g in genes]) + assert any([g.get('knownFusionPartner', False) for g in genes]) - @pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason="excluding tests that depend on oncokb data") + @pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding tests that depend on oncokb data') def test_found_oncogene(self, report_upload_content: Dict) -> None: - genes = report_upload_content["genes"] + genes = report_upload_content['genes'] # eg, ZBTB20 - assert any([g.get("oncogene", False) for g in genes]) + assert any([g.get('oncogene', False) for g in genes]) - @pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason="excluding tests that depend on oncokb data)") + @pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding tests that depend on oncokb data)') def test_found_tumour_supressor(self, report_upload_content: Dict) -> None: - genes = report_upload_content["genes"] + genes = report_upload_content['genes'] # eg, ZNRF3 - assert any([g.get("tumourSuppressor", False) for g in genes]) + assert any([g.get('tumourSuppressor', False) for g in genes]) def test_found_kb_statement_related_gene(self, report_upload_content: Dict) -> None: - genes = report_upload_content["genes"] - assert any([g.get("kbStatementRelated", False) for g in genes]) + genes = report_upload_content['genes'] + assert any([g.get('kbStatementRelated', False) for g in genes]) - @pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason="excluding tests that depend on oncokb data") + @pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding 
tests that depend on oncokb data') def test_found_cancer_gene_list_match_gene(self, report_upload_content: Dict) -> None: - genes = report_upload_content["genes"] - assert any([g.get("cancerGeneListMatch", False) for g in genes]) + genes = report_upload_content['genes'] + assert any([g.get('cancerGeneListMatch', False) for g in genes]) diff --git a/tests/test_ipr/test_probe.py b/tests/test_ipr/test_probe.py index 4801c54a..43ead9f1 100644 --- a/tests/test_ipr/test_probe.py +++ b/tests/test_ipr/test_probe.py @@ -9,50 +9,50 @@ from .constants import EXCLUDE_INTEGRATION_TESTS -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' def get_test_file(name: str) -> str: - return os.path.join(os.path.dirname(__file__), "test_data", name) + return os.path.join(os.path.dirname(__file__), 'test_data', name) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def probe_upload_content() -> Dict: mock = MagicMock() def side_effect_function(*args, **kwargs): - if "templates" in args[0]: - return [{"name": "genomic", "ident": "001"}] - elif args[0] == "project": - return [{"name": "TEST", "ident": "001"}] + if 'templates' in args[0]: + return [{'name': 'genomic', 'ident': '001'}] + elif args[0] == 'project': + return [{'name': 'TEST', 'ident': '001'}] else: return [] - with patch.object(IprConnection, "upload_report", new=mock): - with patch.object(IprConnection, "get_spec", return_value={}): - with patch.object(IprConnection, "get", side_effect=side_effect_function): + with patch.object(IprConnection, 'upload_report', new=mock): + with patch.object(IprConnection, 'get_spec', return_value={}): + with patch.object(IprConnection, 'get', side_effect=side_effect_function): create_report( content={ - "patientId": "PATIENT001", - "project": "TEST", - "smallMutations": pd.read_csv( - get_test_file("small_mutations_probe.tab"), - sep="\t", - dtype={"chromosome": "string"}, - ).to_dict("records"), - "structuralVariants": pd.read_csv( - get_test_file("fusions.tab"), sep="\t" - ).to_dict("records"), - "blargh": "some fake content", - "kbDiseaseMatch": "colorectal cancer", + 'patientId': 'PATIENT001', + 'project': 'TEST', + 'smallMutations': pd.read_csv( + get_test_file('small_mutations_probe.tab'), + sep='\t', + dtype={'chromosome': 'string'}, + ).to_dict('records'), + 'structuralVariants': pd.read_csv( + get_test_file('fusions.tab'), sep='\t' + ).to_dict('records'), + 'blargh': 'some fake content', + 'kbDiseaseMatch': 'colorectal cancer', }, - username=os.environ["IPR_USER"], - password=os.environ["IPR_PASS"], - log_level="info", - ipr_url="http://fake.url.ca", - graphkb_username=os.environ.get("GRAPHKB_USER", os.environ["IPR_USER"]), - graphkb_password=os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]), - graphkb_url=os.environ.get("GRAPHKB_URL", False), + username=os.environ['IPR_USER'], + password=os.environ['IPR_PASS'], + log_level='info', + ipr_url='http://fake.url.ca', + graphkb_username=os.environ.get('GRAPHKB_USER', os.environ['IPR_USER']), + graphkb_password=os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']), + graphkb_url=os.environ.get('GRAPHKB_URL', False), ) assert mock.called @@ -61,22 +61,22 @@ def side_effect_function(*args, **kwargs): return report_content -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') class TestCreateReport: def 
test_found_probe_small_mutations(self, probe_upload_content: Dict) -> None: - assert probe_upload_content["smallMutations"] + assert probe_upload_content['smallMutations'] @pytest.mark.skipif( - EXCLUDE_BCGSC_TESTS, reason="excluding tests that depend on BCGSC-specific data" + EXCLUDE_BCGSC_TESTS, reason='excluding tests that depend on BCGSC-specific data' ) def test_found_probe_small_mutations_match(self, probe_upload_content: Dict) -> None: # verify each probe had a KB match - for sm_probe in probe_upload_content["smallMutations"]: + for sm_probe in probe_upload_content['smallMutations']: match_list = [ kb_match - for kb_match in probe_upload_content["kbMatches"] - if kb_match["variant"] == sm_probe["key"] + for kb_match in probe_upload_content['kbMatches'] + if kb_match['variant'] == sm_probe['key'] ] - assert ( - match_list - ), f"probe match failure: {sm_probe['gene']} {sm_probe['proteinChange']} key: {sm_probe['proteinChange']}" + assert match_list, ( + f'probe match failure: {sm_probe["gene"]} {sm_probe["proteinChange"]} key: {sm_probe["proteinChange"]}' + ) diff --git a/tests/test_ipr/test_summary.py b/tests/test_ipr/test_summary.py index 083683ef..248a99f2 100644 --- a/tests/test_ipr/test_summary.py +++ b/tests/test_ipr/test_summary.py @@ -16,14 +16,14 @@ def test_prefers_non_alias(self): side_effect=[ [], [ - {"sourceId": "1", "alias": False, "source": "source", "name": "name"}, - {"sourceId": "2", "alias": True, "source": "source", "name": "name"}, + {'sourceId': '1', 'alias': False, 'source': 'source', 'name': 'name'}, + {'sourceId': '2', 'alias': True, 'source': 'source', 'name': 'name'}, ], ] ) ) - rec = get_preferred_drug_representation(api, "anything") - assert rec["sourceId"] == "1" + rec = get_preferred_drug_representation(api, 'anything') + assert rec['sourceId'] == '1' def test_prefers_non_deprecated(self): api = MagicMock( @@ -31,29 +31,29 @@ def test_prefers_non_deprecated(self): side_effect=[ [], [ - {"sourceId": "1", "deprecated": True, "source": "source", "name": "name"}, - {"sourceId": "2", "deprecated": False, "source": "source", "name": "name"}, + {'sourceId': '1', 'deprecated': True, 'source': 'source', 'name': 'name'}, + {'sourceId': '2', 'deprecated': False, 'source': 'source', 'name': 'name'}, ], ] ) ) - rec = get_preferred_drug_representation(api, "anything") - assert rec["sourceId"] == "2" + rec = get_preferred_drug_representation(api, 'anything') + assert rec['sourceId'] == '2' def test_prefers_lower_sort_source(self): api = MagicMock( query=MagicMock( side_effect=[ - [{"@rid": "source2", "sort": 0}, {"@rid": "source1", "sort": 1}], + [{'@rid': 'source2', 'sort': 0}, {'@rid': 'source1', 'sort': 1}], [ - {"sourceId": "1", "deprecated": False, "source": "source1", "name": "name"}, - {"sourceId": "2", "deprecated": False, "source": "source2", "name": "name"}, + {'sourceId': '1', 'deprecated': False, 'source': 'source1', 'name': 'name'}, + {'sourceId': '2', 'deprecated': False, 'source': 'source2', 'name': 'name'}, ], ] ) ) - rec = get_preferred_drug_representation(api, "anything") - assert rec["sourceId"] == "2" + rec = get_preferred_drug_representation(api, 'anything') + assert rec['sourceId'] == '2' def test_prefers_newer_version(self): api = MagicMock( @@ -62,46 +62,46 @@ def test_prefers_newer_version(self): [], [ { - "sourceId": "2", - "deprecated": True, - "source": "source", - "name": "name", - "sourceIdVersion": "1", + 'sourceId': '2', + 'deprecated': True, + 'source': 'source', + 'name': 'name', + 'sourceIdVersion': '1', }, { - "sourceId": "2", - 
"deprecated": True, - "source": "source", - "name": "name", - "sourceIdVersion": "2", + 'sourceId': '2', + 'deprecated': True, + 'source': 'source', + 'name': 'name', + 'sourceIdVersion': '2', }, ], ] ) ) - rec = get_preferred_drug_representation(api, "anything") - assert rec["sourceIdVersion"] == "1" + rec = get_preferred_drug_representation(api, 'anything') + assert rec['sourceIdVersion'] == '1' class TestSubstituteSentenceTemplate: def test_multiple_diseases_no_matches(self): - template = "{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})" - relevance = {"displayName": "senitivity"} - disease_matches = {"1"} + template = '{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})' + relevance = {'displayName': 'senitivity'} + disease_matches = {'1'} diseases = [ - {"@class": "Disease", "@rid": "2", "displayName": "disease 1"}, - {"@class": "Disease", "@rid": "3", "displayName": "disease 2"}, + {'@class': 'Disease', '@rid': '2', 'displayName': 'disease 1'}, + {'@class': 'Disease', '@rid': '3', 'displayName': 'disease 2'}, ] variants = [ { - "@class": "CategoryVariant", - "displayName": "KRAS increased RNA expression", - "@rid": "4", + '@class': 'CategoryVariant', + 'displayName': 'KRAS increased RNA expression', + '@rid': '4', } ] - subjects = [{"@class": "Therapy", "displayName": "some drug", "@rid": "5"}] + subjects = [{'@class': 'Therapy', 'displayName': 'some drug', '@rid': '5'}] sentence = substitute_sentence_template( - template, diseases + variants, subjects, relevance, [], ["6", "7"], disease_matches + template, diseases + variants, subjects, relevance, [], ['6', '7'], disease_matches ) assert ( sentence @@ -109,24 +109,24 @@ def test_multiple_diseases_no_matches(self): ) def test_multiple_diseases_some_matches(self): - template = "{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})" - relevance = {"displayName": "senitivity"} - disease_matches = {"1"} + template = '{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})' + relevance = {'displayName': 'senitivity'} + disease_matches = {'1'} diseases = [ - {"@class": "Disease", "@rid": "2", "displayName": "disease 2"}, - {"@class": "Disease", "@rid": "1", "displayName": "disease 1"}, - {"@class": "Disease", "@rid": "3", "displayName": "disease 3"}, + {'@class': 'Disease', '@rid': '2', 'displayName': 'disease 2'}, + {'@class': 'Disease', '@rid': '1', 'displayName': 'disease 1'}, + {'@class': 'Disease', '@rid': '3', 'displayName': 'disease 3'}, ] variants = [ { - "@class": "CategoryVariant", - "displayName": "KRAS increased RNA expression", - "@rid": "4", + '@class': 'CategoryVariant', + 'displayName': 'KRAS increased RNA expression', + '@rid': '4', } ] - subjects = [{"@class": "Therapy", "displayName": "some drug", "@rid": "5"}] + subjects = [{'@class': 'Therapy', 'displayName': 'some drug', '@rid': '5'}] sentence = substitute_sentence_template( - template, diseases + variants, subjects, relevance, [], ["6", "7"], disease_matches + template, diseases + variants, subjects, relevance, [], ['6', '7'], disease_matches ) assert ( sentence @@ -134,24 +134,24 @@ def test_multiple_diseases_some_matches(self): ) def test_multiple_diseases_only_matches(self): - template = "{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})" - relevance = {"displayName": "senitivity"} - disease_matches = {"1", "2", "3"} + 
template = '{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})' + relevance = {'displayName': 'senitivity'} + disease_matches = {'1', '2', '3'} diseases = [ - {"@class": "Disease", "@rid": "2", "displayName": "disease 2"}, - {"@class": "Disease", "@rid": "1", "displayName": "disease 1"}, - {"@class": "Disease", "@rid": "3", "displayName": "disease 3"}, + {'@class': 'Disease', '@rid': '2', 'displayName': 'disease 2'}, + {'@class': 'Disease', '@rid': '1', 'displayName': 'disease 1'}, + {'@class': 'Disease', '@rid': '3', 'displayName': 'disease 3'}, ] variants = [ { - "@class": "CategoryVariant", - "displayName": "KRAS increased RNA expression", - "@rid": "4", + '@class': 'CategoryVariant', + 'displayName': 'KRAS increased RNA expression', + '@rid': '4', } ] - subjects = [{"@class": "Therapy", "displayName": "some drug", "@rid": "5"}] + subjects = [{'@class': 'Therapy', 'displayName': 'some drug', '@rid': '5'}] sentence = substitute_sentence_template( - template, diseases + variants, subjects, relevance, [], ["6", "7"], disease_matches + template, diseases + variants, subjects, relevance, [], ['6', '7'], disease_matches ) assert ( sentence @@ -162,74 +162,76 @@ def test_multiple_diseases_only_matches(self): mock_ipr_results = [ [ { - "text": "

no cancerType",
-            "variantName": "ERBB2 amplification",
-            "cancerType": [],
-            "template": {"name": "test3"},
-            "project": {"name": "test2"},
+            'text': 'no cancerType',
+            'variantName': 'ERBB2 amplification',
+            'cancerType': [],
+            'template': {'name': 'test3'},
+            'project': {'name': 'test2'},
         },
         {
-            "text": "normal",
-            "variantName": "ERBB2 amplification",
-            "cancerType": ["test1", "test"],
-            "template": {"name": "test3"},
-            "project": {"name": "test2"},
+            'text': 'normal',
+            'variantName': 'ERBB2 amplification',
+            'cancerType': ['test1', 'test'],
+            'template': {'name': 'test3'},
+            'project': {'name': 'test2'},
         },
         {
-            "text": "no project",
-            "variantName": "ERBB2 amplification",
-            "cancerType": ["test1", "test"],
-            "template": {"name": "test3"},
+            'text': 'no project',
+            'variantName': 'ERBB2 amplification',
+            'cancerType': ['test1', 'test'],
+            'template': {'name': 'test3'},
         },
         {
-            "text": "no template",
-            "variantName": "ERBB2 amplification",
-            "cancerType": ["test1", "test"],
-            "project": {"name": "test2"},
+            'text': 'no template',
+            'variantName': 'ERBB2 amplification',
+            'cancerType': ['test1', 'test'],
+            'project': {'name': 'test2'},
         },
     ],
     [
         {
-            "text": "normal, second variant",
-            "variantName": "second variant",
-            "cancerType": ["test1", "test"],
-            "template": {"name": "test3"},
-            "project": {"name": "test2"},
+            'text': 'normal, second variant',
+            'variantName': 'second variant',
+            'cancerType': ['test1', 'test'],
+            'template': {'name': 'test3'},
+            'project': {'name': 'test2'},
         },
     ],
 ]

-no_comments_found_output = "No comments found in IPR for variants in this report"
+no_comments_found_output = 'No comments found in IPR for variants in this report'


 class TestVariantTextFromIPR:
     def test_gets_fully_matched_output_when_possible(self):
         ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
-        matches = [{"kbVariant": "ERBB2 amplification"}]
+        matches = [{'kbVariant': 'ERBB2 amplification'}]
         ipr_summary = get_ipr_analyst_comments(
             ipr_conn,
             matches=matches,
-            disease_name="test1",
-            project_name="test2",
-            report_type="test3",
+            disease_name='test1',
+            disease_match_names=[],
+            project_name='test2',
+            report_type='test3',
             include_nonspecific_project=False,
             include_nonspecific_disease=True,
             include_nonspecific_template=True,
         )
-        summary_lines = ipr_summary.split("\n")
-        assert summary_lines[1] == "ERBB2 amplification (test1,test)"
-        assert summary_lines[2] == "normal"
+        summary_lines = ipr_summary.split('\n')
+        assert summary_lines[1] == 'ERBB2 amplification (test1,test)'
+        assert summary_lines[2] == 'normal'
         assert len(summary_lines) == 3
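The assertions above pin down the contract these tests assume for get_ipr_analyst_comments: the summary is a '\n'-joined block with a title line at index 0, then one variant header line and one comment-text line per surviving IPR comment. A minimal sketch of that assembly step, inferred only from the expectations in this file (the helper name and title line are hypothetical, not the actual pori_python implementation):

```python
from typing import Dict, List


def render_comment_section(comments: List[Dict]) -> str:
    # Hypothetical title line; the tests only ever inspect summary_lines[1:].
    lines = ['IPR comments matched for this report']
    for comment in comments:
        # An empty cancerType list renders as a nonspecific header, matching
        # test_includes_nonspecific_disease_matches_when_specified below.
        cancer_types = ','.join(comment['cancerType']) or 'no specific cancer types'
        lines.append(f'{comment["variantName"]} ({cancer_types})')
        lines.append(comment['text'])
    return '\n'.join(lines)
```

With the single fully matched record above, summary_lines[1:] comes out as ['ERBB2 amplification (test1,test)', 'normal'], which is exactly what the first test asserts.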

     def test_omits_nonspecific_project_matches_when_specified(self):
         ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
-        matches = [{"kbVariant": "ERBB2 amplification"}]
+        matches = [{'kbVariant': 'ERBB2 amplification'}]
         ipr_summary = get_ipr_analyst_comments(
             ipr_conn,
             matches=matches,
-            disease_name="test1",
-            project_name="notfound",
-            report_type="test3",
+            disease_name='test1',
+            disease_match_names=[],
+            project_name='notfound',
+            report_type='test3',
             include_nonspecific_project=False,
             include_nonspecific_disease=True,
             include_nonspecific_template=True,
@@ -238,13 +240,14 @@ def test_omits_nonspecific_template_matches_when_specified(self):
         ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
-        matches = [{"kbVariant": "ERBB2 amplification"}]
+        matches = [{'kbVariant': 'ERBB2 amplification'}]
         ipr_summary = get_ipr_analyst_comments(
             ipr_conn,
             matches=matches,
-            disease_name="test1",
-            project_name="test2",
-            report_type="notfound",
+            disease_name='test1',
+            disease_match_names=[],
+            project_name='test2',
+            report_type='notfound',
             include_nonspecific_project=True,
             include_nonspecific_disease=True,
             include_nonspecific_template=False,
@@ -253,13 +256,14 @@ def test_omits_nonspecific_disease_matches_when_specified(self):
         ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
-        matches = [{"kbVariant": "ERBB2 amplification"}]
+        matches = [{'kbVariant': 'ERBB2 amplification'}]
         ipr_summary = get_ipr_analyst_comments(
             ipr_conn,
             matches=matches,
-            disease_name="notfound",
-            project_name="test2",
-            report_type="test3",
+            disease_name='notfound',
+            disease_match_names=[],
+            project_name='test2',
+            report_type='test3',
             include_nonspecific_project=True,
             include_nonspecific_disease=False,
             include_nonspecific_template=True,
@@ -268,86 +272,110 @@ def test_includes_nonspecific_project_matches_when_specified(self):
         ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
-        matches = [{"kbVariant": "ERBB2 amplification"}]
+        matches = [{'kbVariant': 'ERBB2 amplification'}]
         ipr_summary = get_ipr_analyst_comments(
             ipr_conn,
             matches=matches,
-            disease_name="test1",
-            project_name="notfound",
-            report_type="test3",
+            disease_name='test1',
+            disease_match_names=[],
+            project_name='notfound',
+            report_type='test3',
             include_nonspecific_project=True,
             include_nonspecific_disease=False,
             include_nonspecific_template=False,
         )
-        summary_lines = ipr_summary.split("\n")
-        assert summary_lines[2] == "no project"
+        summary_lines = ipr_summary.split('\n')
+        assert summary_lines[2] == 'no project'
         assert len(summary_lines) == 3

     def test_includes_nonspecific_template_matches_when_specified(self):
         ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
-        matches = [{"kbVariant": "ERBB2 amplification"}]
+        matches = [{'kbVariant': 'ERBB2 amplification'}]
         ipr_summary = get_ipr_analyst_comments(
             ipr_conn,
             matches=matches,
-            disease_name="test1",
-            project_name="test2",
-            report_type="notfound",
+            disease_name='test1',
+            disease_match_names=[],
+            project_name='test2',
+            report_type='notfound',
             include_nonspecific_project=False,
             include_nonspecific_disease=False,
             include_nonspecific_template=True,
         )
-        summary_lines = ipr_summary.split("\n")
-        assert summary_lines[2] == "no template"
+        summary_lines = ipr_summary.split('\n')
+        assert summary_lines[2] == 'no template'
         assert len(summary_lines) == 3
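Taken together, the omit/include tests imply a two-stage selection rule: comments matching disease, project, and report type exactly win outright, and the include_nonspecific_* flags only admit comments whose corresponding field is absent or empty. A sketch of that rule as these tests constrain it; this is a hypothetical reimplementation for illustration only, and it deliberately ignores disease_match_names, which the real filter in pori_python.ipr.summary also handles:

```python
from typing import Dict, List


def select_comments(
    comments: List[Dict],
    disease_name: str,
    project_name: str,
    report_type: str,
    include_nonspecific_disease: bool,
    include_nonspecific_project: bool,
    include_nonspecific_template: bool,
) -> List[Dict]:
    def fully_specific(comment: Dict) -> bool:
        return (
            disease_name in comment.get('cancerType', [])
            and comment.get('project', {}).get('name') == project_name
            and comment.get('template', {}).get('name') == report_type
        )

    def admissible(comment: Dict) -> bool:
        disease_ok = disease_name in comment.get('cancerType', []) or (
            not comment.get('cancerType') and include_nonspecific_disease
        )
        project_ok = comment.get('project', {}).get('name') == project_name or (
            'project' not in comment and include_nonspecific_project
        )
        template_ok = comment.get('template', {}).get('name') == report_type or (
            'template' not in comment and include_nonspecific_template
        )
        return disease_ok and project_ok and template_ok

    # Fully specific matches take precedence; the flags only matter as a fallback.
    exact = [comment for comment in comments if fully_specific(comment)]
    return exact if exact else [comment for comment in comments if admissible(comment)]
```

Under this rule the 'normal' record is the only survivor of a fully matched query, and each omit test yields an empty selection, consistent with no_comments_found_output.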

     def test_includes_nonspecific_disease_matches_when_specified(self):
         ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
-        matches = [{"kbVariant": "ERBB2 amplification"}]
+        matches = [{'kbVariant': 'ERBB2 amplification'}]
         ipr_summary = get_ipr_analyst_comments(
             ipr_conn,
             matches=matches,
-            disease_name="notfound",
-            project_name="test2",
-            report_type="test3",
+            disease_name='notfound',
+            disease_match_names=[],
+            project_name='test2',
+            report_type='test3',
             include_nonspecific_project=False,
             include_nonspecific_disease=True,
             include_nonspecific_template=False,
         )
-        summary_lines = ipr_summary.split("\n")
-        assert summary_lines[1] == "ERBB2 amplification (no specific cancer types)"
-        assert summary_lines[2] == "no cancerType"
+        summary_lines = ipr_summary.split('\n')
+        assert summary_lines[1] == 'ERBB2 amplification (no specific cancer types)'
+        assert summary_lines[2] == 'no cancerType'
+        assert len(summary_lines) == 3
+
+    def test_includes_all_graphkb_disease_matches(self):
+        ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
+        matches = [{'kbVariant': 'ERBB2 amplification'}]
+        ipr_summary = get_ipr_analyst_comments(
+            ipr_conn,
+            matches=matches,
+            disease_name='notfound',
+            disease_match_names=['TEST1'],
+            project_name='test2',
+            report_type='test3',
+            include_nonspecific_project=False,
+            include_nonspecific_disease=False,
+            include_nonspecific_template=False,
+        )
+        summary_lines = ipr_summary.split('\n')
+        assert summary_lines[1] == 'ERBB2 amplification (test1,test)'
+        assert summary_lines[2] == 'normal'
         assert len(summary_lines) == 3
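The remaining tests lean on MagicMock's side_effect semantics: each element of the copied mock_ipr_results list is consumed by one ipr_conn.get call, so one IPR lookup is made per matched variant, in order. A self-contained illustration of that mock pattern (the URI string is a placeholder, not the real endpoint):

```python
from copy import copy
from unittest.mock import MagicMock

results = [['payload for variant 1'], ['payload for variant 2']]
conn = MagicMock(get=MagicMock(side_effect=copy(results)))

# Successive calls return successive elements, regardless of the arguments.
assert conn.get('variant-text?variantName=...') == ['payload for variant 1']
assert conn.get('variant-text?variantName=...') == ['payload for variant 2']
# A third call would raise StopIteration once the iterator is exhausted,
# which is why each test builds its mock from a fresh copy of the results.
```

This is also why test_prepare_section_for_multiple_variants below notes that it relies on matches being processed in a fixed order.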

     def test_prepare_section_for_multiple_variants(self):
         ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results)))
         # NB this test relies on matches being processed in this order
-        matches = [{"kbVariant": "ERBB2 amplification"}, {"kbVariant": "second variant"}]
+        matches = [{'kbVariant': 'ERBB2 amplification'}, {'kbVariant': 'second variant'}]
         ipr_summary = get_ipr_analyst_comments(
             ipr_conn,
             matches=matches,
-            disease_name="test1",
-            project_name="test2",
-            report_type="test3",
+            disease_name='test1',
+            disease_match_names=[],
+            project_name='test2',
+            report_type='test3',
             include_nonspecific_project=False,
             include_nonspecific_disease=False,
             include_nonspecific_template=False,
         )
-        summary_lines = ipr_summary.split("\n")
+        summary_lines = ipr_summary.split('\n')
         assert len(summary_lines) == 5
         assert (
-            "\n".join(summary_lines[1:])
-            == "ERBB2 amplification (test1,test)\nnormal\nsecond variant (test1,test)\nnormal, second variant"
+            '\n'.join(summary_lines[1:])
+            == 'ERBB2 amplification (test1,test)\nnormal\nsecond variant (test1,test)\nnormal, second variant
' ) def test_empty_section_when_no_variant_match(self): ipr_conn = MagicMock(get=MagicMock(side_effect=[[], []])) - matches = [{"kbVariant": "notfound1"}, {"kbVariant": "notfound2"}] + matches = [{'kbVariant': 'notfound1'}, {'kbVariant': 'notfound2'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="test1", - project_name="test2", - report_type="test3", + disease_name='test1', + disease_match_names=[], + project_name='test2', + report_type='test3', include_nonspecific_project=False, include_nonspecific_disease=False, include_nonspecific_template=False, diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py index 95f7fd7a..79568f75 100644 --- a/tests/test_ipr/test_upload.py +++ b/tests/test_ipr/test_upload.py @@ -13,87 +13,97 @@ from .constants import EXCLUDE_INTEGRATION_TESTS -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" -EXCLUDE_ONCOKB_TESTS = os.environ.get("EXCLUDE_ONCOKB_TESTS") == "1" -INCLUDE_UPLOAD_TESTS = os.environ.get("INCLUDE_UPLOAD_TESTS", "0") == "1" -DELETE_UPLOAD_TEST_REPORTS = os.environ.get("DELETE_UPLOAD_TEST_REPORTS", "1") == "1" +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' +EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1' +INCLUDE_UPLOAD_TESTS = os.environ.get('INCLUDE_UPLOAD_TESTS', '0') == '1' +DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1' def get_test_spec(): - ipr_spec = {"components": {"schemas": {"genesCreate": {"properties": {}}}}} + ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}} ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__ for key in ipr_gene_keys: - ipr_spec["components"]["schemas"]["genesCreate"]["properties"][key] = "" + ipr_spec['components']['schemas']['genesCreate']['properties'][key] = '' return ipr_spec def get_test_file(name: str) -> str: - return os.path.join(os.path.dirname(__file__), "test_data", name) + return os.path.join(os.path.dirname(__file__), 'test_data', name) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def loaded_reports(tmp_path_factory) -> Generator: - json_file = tmp_path_factory.mktemp("inputs") / "content.json" - async_json_file = tmp_path_factory.mktemp("inputs") / "async_content.json" - patient_id = f"TEST_{str(uuid.uuid4())}" - async_patient_id = f"TEST_ASYNC_{str(uuid.uuid4())}" + json_file = tmp_path_factory.mktemp('inputs') / 'content.json' + async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json' + patient_id = f'TEST_{str(uuid.uuid4())}' + async_patient_id = f'TEST_ASYNC_{str(uuid.uuid4())}' json_contents = { - "comparators": [ - {"analysisRole": "expression (disease)", "name": "1"}, - {"analysisRole": "expression (primary site)", "name": "2"}, - {"analysisRole": "expression (biopsy site)", "name": "3"}, + 'comparators': [ + {'analysisRole': 'expression (disease)', 'name': '1'}, + {'analysisRole': 'expression (primary site)', 'name': '2'}, + {'analysisRole': 'expression (biopsy site)', 'name': '3'}, { - "analysisRole": "expression (internal pancancer cohort)", - "name": "4", + 'analysisRole': 'expression (internal pancancer cohort)', + 'name': '4', }, ], - "patientId": patient_id, - "project": "TEST", - "sampleInfo": [ + 'patientId': patient_id, + 'project': 'TEST', + 'sampleInfo': [ { - "sample": "Constitutional", - "biopsySite": "Normal tissue", - "sampleName": "SAMPLE1-PB", - "primarySite": "Blood-Peripheral", - "collectionDate": "11-11-11", + 'sample': 'Constitutional', + 
'biopsySite': 'Normal tissue', + 'sampleName': 'SAMPLE1-PB', + 'primarySite': 'Blood-Peripheral', + 'collectionDate': '11-11-11', }, { - "sample": "Tumour", - "pathoTc": "90%", - "biopsySite": "hepatic", - "sampleName": "SAMPLE2-FF-1", - "primarySite": "Vena Cava-Hepatic", - "collectionDate": "12-12-12", + 'sample': 'Tumour', + 'pathoTc': '90%', + 'biopsySite': 'hepatic', + 'sampleName': 'SAMPLE2-FF-1', + 'primarySite': 'Vena Cava-Hepatic', + 'collectionDate': '12-12-12', }, ], - "expressionVariants": json.loads( - pd.read_csv(get_test_file("expression.short.tab"), sep="\t").to_json(orient="records") + 'msi': [ + { + 'score': 1000.0, + 'kbCategory': 'microsatellite instability', + } + ], + 'hrd': { + 'score': 9999.0, + 'kbCategory': 'homologous recombination deficiency strong signature', + }, + 'expressionVariants': json.loads( + pd.read_csv(get_test_file('expression.short.tab'), sep='\t').to_json(orient='records') ), - "smallMutations": json.loads( - pd.read_csv(get_test_file("small_mutations.short.tab"), sep="\t").to_json( - orient="records" + 'smallMutations': json.loads( + pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t').to_json( + orient='records' ) ), - "copyVariants": json.loads( - pd.read_csv(get_test_file("copy_variants.short.tab"), sep="\t").to_json( - orient="records" + 'copyVariants': json.loads( + pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t').to_json( + orient='records' ) ), - "structuralVariants": json.loads( - pd.read_csv(get_test_file("fusions.tab"), sep="\t").to_json(orient="records") + 'structuralVariants': json.loads( + pd.read_csv(get_test_file('fusions.tab'), sep='\t').to_json(orient='records') ), - "kbDiseaseMatch": "colorectal cancer", - "cosmicSignatures": pd.read_csv( - get_test_file("cosmic_variants.tab"), sep="\t" + 'kbDiseaseMatch': 'colorectal cancer', + 'cosmicSignatures': pd.read_csv( + get_test_file('cosmic_variants.tab'), sep='\t' ).signature.tolist(), - "hlaTypes": json.loads( - pd.read_csv(get_test_file("hla_variants.tab"), sep="\t").to_json(orient="records") + 'hlaTypes': json.loads( + pd.read_csv(get_test_file('hla_variants.tab'), sep='\t').to_json(orient='records') ), - "images": [ + 'images': [ { - "key": "cnvLoh.circos", - "path": "test/testData/images/cnvLoh.png", - "caption": "Test adding a caption to an image", + 'key': 'cnvLoh.circos', + 'path': 'test/testData/images/cnvLoh.png', + 'caption': 'Test adding a caption to an image', } ], } @@ -105,7 +115,7 @@ def loaded_reports(tmp_path_factory) -> Generator: ) ) - json_contents["patientId"] = async_patient_id + json_contents['patientId'] = async_patient_id async_json_file.write_text( json.dumps( json_contents, @@ -114,61 +124,61 @@ def loaded_reports(tmp_path_factory) -> Generator: ) argslist = [ - "ipr", - "--username", - os.environ.get("IPR_USER", os.environ["USER"]), - "--password", - os.environ["IPR_PASS"], - "--graphkb_username", - os.environ.get("GRAPHKB_USER", os.environ.get("IPR_USER", os.environ["USER"])), - "--graphkb_password", - os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]), - "--ipr_url", - os.environ["IPR_TEST_URL"], - "--graphkb_url", - os.environ.get("GRAPHKB_URL", False), - "--therapeutics", - "--allow_partial_matches", + 'ipr', + '--username', + os.environ.get('IPR_USER', os.environ['USER']), + '--password', + os.environ['IPR_PASS'], + '--graphkb_username', + os.environ.get('GRAPHKB_USER', os.environ.get('IPR_USER', os.environ['USER'])), + '--graphkb_password', + os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']), + '--ipr_url', + 
os.environ['IPR_TEST_URL'], + '--graphkb_url', + os.environ.get('GRAPHKB_URL', False), + '--therapeutics', + '--allow_partial_matches', ] sync_argslist = argslist.copy() - sync_argslist.extend(["--content", str(json_file)]) - with patch.object(sys, "argv", sync_argslist): - with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): + sync_argslist.extend(['--content', str(json_file)]) + with patch.object(sys, 'argv', sync_argslist): + with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): command_interface() async_argslist = argslist.copy() - async_argslist.extend(["--content", str(async_json_file), "--async_upload"]) - with patch.object(sys, "argv", async_argslist): - with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): + async_argslist.extend(['--content', str(async_json_file), '--async_upload']) + with patch.object(sys, 'argv', async_argslist): + with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): command_interface() ipr_conn = IprConnection( - username=os.environ.get("IPR_USER", os.environ["USER"]), - password=os.environ["IPR_PASS"], - url=os.environ["IPR_TEST_URL"], + username=os.environ.get('IPR_USER', os.environ['USER']), + password=os.environ['IPR_PASS'], + url=os.environ['IPR_TEST_URL'], ) - loaded_report = ipr_conn.get(uri=f"reports?searchText={patient_id}") - async_loaded_report = ipr_conn.get(uri=f"reports?searchText={async_patient_id}") + loaded_report = ipr_conn.get(uri=f'reports?searchText={patient_id}') + async_loaded_report = ipr_conn.get(uri=f'reports?searchText={async_patient_id}') loaded_reports_result = { - "sync": (patient_id, loaded_report), - "async": (async_patient_id, async_loaded_report), + 'sync': (patient_id, loaded_report), + 'async': (async_patient_id, async_loaded_report), } yield loaded_reports_result if DELETE_UPLOAD_TEST_REPORTS: - ipr_conn.delete(uri=f"reports/{loaded_report['reports'][0]['ident']}") - ipr_conn.delete(uri=f"reports/{async_loaded_report['reports'][0]['ident']}") + ipr_conn.delete(uri=f'reports/{loaded_report["reports"][0]["ident"]}') + ipr_conn.delete(uri=f'reports/{async_loaded_report["reports"][0]["ident"]}') def get_section(loaded_report, section_name): - ident = loaded_report[1]["reports"][0]["ident"] + ident = loaded_report[1]['reports'][0]['ident'] ipr_conn = IprConnection( - username=os.environ.get("IPR_USER", os.environ["USER"]), - password=os.environ["IPR_PASS"], - url=os.environ["IPR_TEST_URL"], + username=os.environ.get('IPR_USER', os.environ['USER']), + password=os.environ['IPR_PASS'], + url=os.environ['IPR_TEST_URL'], ) - return ipr_conn.get(uri=f"reports/{ident}/{section_name}") + return ipr_conn.get(uri=f'reports/{ident}/{section_name}') def stringify_sorted(obj): @@ -181,7 +191,7 @@ def stringify_sorted(obj): obj.sort() return str(obj) elif isinstance(obj, dict): - for key in ("ident", "updatedAt", "createdAt", "deletedAt"): + for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt'): obj.pop(key, None) keys = obj.keys() for key in keys: @@ -197,50 +207,50 @@ def stringify_sorted(obj): @pytest.mark.skipif( - not INCLUDE_UPLOAD_TESTS, reason="excluding tests of upload to live ipr instance" + not INCLUDE_UPLOAD_TESTS, reason='excluding tests of upload to live ipr instance' ) -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') class TestCreateReport: def test_patient_id_loaded_once(self, 
loaded_reports) -> None: - sync_patient_id = loaded_reports["sync"][0] - assert loaded_reports["sync"][1]["total"] == 1 - assert loaded_reports["sync"][1]["reports"][0]["patientId"] == sync_patient_id - async_patient_id = loaded_reports["async"][0] - assert loaded_reports["async"][1]["total"] == 1 - assert loaded_reports["async"][1]["reports"][0]["patientId"] == async_patient_id + sync_patient_id = loaded_reports['sync'][0] + assert loaded_reports['sync'][1]['total'] == 1 + assert loaded_reports['sync'][1]['reports'][0]['patientId'] == sync_patient_id + async_patient_id = loaded_reports['async'][0] + assert loaded_reports['async'][1]['total'] == 1 + assert loaded_reports['async'][1]['reports'][0]['patientId'] == async_patient_id def test_expression_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "expression-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert "PTP4A3" in [item["gene"]["name"] for item in kbmatched] - async_section = get_section(loaded_reports["async"], "expression-variants") + section = get_section(loaded_reports['sync'], 'expression-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert 'PTP4A3' in [item['gene']['name'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'expression-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_structural_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "structural-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert "(EWSR1,FLI1):fusion(e.7,e.4)" in [item["displayName"] for item in kbmatched] - async_section = get_section(loaded_reports["async"], "structural-variants") + section = get_section(loaded_reports['sync'], 'structural-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert '(EWSR1,FLI1):fusion(e.7,e.4)' in [item['displayName'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'structural-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_small_mutations_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "small-mutations") - kbmatched = [item for item in section if item["kbMatches"]] - assert "FGFR2:p.R421C" in [item["displayName"] for item in kbmatched] - assert "CDKN2A:p.T18M" in [item["displayName"] for item in kbmatched] - async_section = get_section(loaded_reports["async"], "small-mutations") + section = get_section(loaded_reports['sync'], 'small-mutations') + kbmatched = [item for item in section if item['kbMatches']] + assert 'FGFR2:p.R421C' in [item['displayName'] for item in kbmatched] + assert 'CDKN2A:p.T18M' in [item['displayName'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'small-mutations') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_copy_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "copy-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert ("ERBB2", "amplification") in [ - (item["gene"]["name"], item["displayName"]) for item in kbmatched + section = get_section(loaded_reports['sync'], 'copy-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert ('ERBB2', 
'amplification') in [ + (item['gene']['name'], item['displayName']) for item in kbmatched ] - async_section = get_section(loaded_reports["async"], "copy-variants") + async_section = get_section(loaded_reports['async'], 'copy-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync @@ -255,58 +265,58 @@ def test_copy_variants_loaded(self, loaded_reports) -> None: # assert compare_sections(section, async_section) def test_kb_matches_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "kb-matches") + section = get_section(loaded_reports['sync'], 'kb-matches') observed_and_matched = set( - [(item["kbVariant"], item["variant"]["displayName"]) for item in section] + [(item['kbVariant'], item['variant']['displayName']) for item in section] ) for pair in [ - ("ERBB2 amplification", "amplification"), - ("FGFR2 mutation", "FGFR2:p.R421C"), - ("PTP4A3 overexpression", "increased expression"), - ("EWSR1 and FLI1 fusion", "(EWSR1,FLI1):fusion(e.7,e.4)"), - ("CDKN2A mutation", "CDKN2A:p.T18M"), + ('ERBB2 amplification', 'amplification'), + ('FGFR2 mutation', 'FGFR2:p.R421C'), + ('PTP4A3 overexpression', 'increased expression'), + ('EWSR1 and FLI1 fusion', '(EWSR1,FLI1):fusion(e.7,e.4)'), + ('CDKN2A mutation', 'CDKN2A:p.T18M'), ]: assert pair in observed_and_matched - async_section = get_section(loaded_reports["async"], "kb-matches") + async_section = get_section(loaded_reports['async'], 'kb-matches') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_therapeutic_targets_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "therapeutic-targets") - therapeutic_target_genes = set([item["gene"] for item in section]) - for gene in ["CDKN2A", "ERBB2", "FGFR2", "PTP4A3"]: + section = get_section(loaded_reports['sync'], 'therapeutic-targets') + therapeutic_target_genes = set([item['gene'] for item in section]) + for gene in ['CDKN2A', 'ERBB2', 'FGFR2', 'PTP4A3']: assert gene in therapeutic_target_genes - async_section = get_section(loaded_reports["async"], "therapeutic-targets") + async_section = get_section(loaded_reports['async'], 'therapeutic-targets') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_genomic_alterations_identified_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "summary/genomic-alterations-identified") - variants = set([item["geneVariant"] for item in section]) + section = get_section(loaded_reports['sync'], 'summary/genomic-alterations-identified') + variants = set([item['geneVariant'] for item in section]) for variant in [ - "FGFR2:p.R421C", - "PTP4A3 (high_percentile)", - "ERBB2 (Amplification)", - "(EWSR1,FLI1):fusion(e.7,e.4)", - "CDKN2A:p.T18M", + 'FGFR2:p.R421C', + 'PTP4A3 (high_percentile)', + 'ERBB2 (Amplification)', + '(EWSR1,FLI1):fusion(e.7,e.4)', + 'CDKN2A:p.T18M', ]: assert variant in variants async_section = get_section( - loaded_reports["async"], "summary/genomic-alterations-identified" + loaded_reports['async'], 'summary/genomic-alterations-identified' ) async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_analyst_comments_loaded(self, loaded_reports) -> None: - sync_section = get_section(loaded_reports["sync"], "summary/analyst-comments") - assert sync_section["comments"] - async_section = 
get_section(loaded_reports["async"], "summary/analyst-comments") - assert async_section["comments"] - assert sync_section["comments"] == async_section["comments"] + sync_section = get_section(loaded_reports['sync'], 'summary/analyst-comments') + assert sync_section['comments'] + async_section = get_section(loaded_reports['async'], 'summary/analyst-comments') + assert async_section['comments'] + assert sync_section['comments'] == async_section['comments'] def test_sample_info_loaded(self, loaded_reports) -> None: - sync_section = get_section(loaded_reports["sync"], "sample-info") - async_section = get_section(loaded_reports["async"], "sample-info") + sync_section = get_section(loaded_reports['sync'], 'sample-info') + async_section = get_section(loaded_reports['async'], 'sample-info') async_equals_sync = stringify_sorted(sync_section) == stringify_sorted(async_section) assert async_equals_sync @@ -322,31 +332,31 @@ def test_multivariant_multiconditionset_statements_loaded(self, loaded_reports) are met. This is also a test of multiple condition sets since there are two variants in the test data that satisfy one of the conditions (the APC mutation).""" - section = get_section(loaded_reports["sync"], "kb-matches/kb-matched-statements") - multivariant_stmts = [item for item in section if item["reference"] == "pmid:27302369"] + section = get_section(loaded_reports['sync'], 'kb-matches/kb-matched-statements') + multivariant_stmts = [item for item in section if item['reference'] == 'pmid:27302369'] # if this statement is entered more than once there may be multiple sets of records to # check, so to make sure the count checks work, go stmt_id by stmt_id: - stmt_ids = list(set([item["kbStatementId"] for item in multivariant_stmts])) + stmt_ids = list(set([item['kbStatementId'] for item in multivariant_stmts])) for stmt_id in stmt_ids: - stmts = [item for item in multivariant_stmts if item["kbStatementId"] == stmt_id] + stmts = [item for item in multivariant_stmts if item['kbStatementId'] == stmt_id] - # we expect two stmts, one for each condition set - assert len(stmts) == 2 + # we expect three stmts, one for each condition set + assert len(stmts) == 3 # we expect each condition set to have two kb variants in it # we expect the two kb variants to be the same in each stmt - assert len(stmts[0]["kbMatches"]) == 2 - assert len(stmts[1]["kbMatches"]) == 2 - kbmatches1 = [item["kbVariant"] for item in stmts[0]["kbMatches"]] - kbmatches2 = [item["kbVariant"] for item in stmts[1]["kbMatches"]] + assert len(stmts[0]['kbMatches']) == 2 + assert len(stmts[1]['kbMatches']) == 2 + kbmatches1 = [item['kbVariant'] for item in stmts[0]['kbMatches']] + kbmatches2 = [item['kbVariant'] for item in stmts[1]['kbMatches']] kbmatches1.sort() kbmatches2.sort() - assert kbmatches1 == kbmatches2 == ["APC mutation", "KRAS mutation"] + assert kbmatches1 == kbmatches2 == ['APC mutation', 'KRAS mutation'] # we expect the two stmts to have different observed variant sets - observedVariants1 = [item["variant"]["ident"] for item in stmts[0]["kbMatches"]] - observedVariants2 = [item["variant"]["ident"] for item in stmts[1]["kbMatches"]] + observedVariants1 = [item['variant']['ident'] for item in stmts[0]['kbMatches']] + observedVariants2 = [item['variant']['ident'] for item in stmts[1]['kbMatches']] observedVariants1.sort() observedVariants2.sort() assert observedVariants1 != observedVariants2 diff --git a/tests/test_ipr/test_util.py b/tests/test_ipr/test_util.py index 70318818..bbae6d98 100644 --- a/tests/test_ipr/test_util.py +++ 
b/tests/test_ipr/test_util.py @@ -4,8 +4,8 @@ @pytest.mark.parametrize( - "input,output_keys", - [[{"key": 0}, ["key"]], [{"key": None}, []], [{"key": ""}, []], [{"gene1": None}, ["gene1"]]], + 'input,output_keys', + [[{'key': 0}, ['key']], [{'key': None}, []], [{'key': ''}, []], [{'gene1': None}, ['gene1']]], ) def test_trim_empty_values(input, output_keys): modified_object = trim_empty_values(input) @@ -13,17 +13,17 @@ def test_trim_empty_values(input, output_keys): @pytest.mark.parametrize( - "variant,result", + 'variant,result', [ [ - {"variantType": "exp", "gene": "GENE", "expressionState": "increased expression"}, - "increased expression", + {'variantType': 'exp', 'gene': 'GENE', 'expressionState': 'increased expression'}, + 'increased expression', ], - [{"variantType": "cnv", "gene": "GENE", "cnvState": "amplification"}, "amplification"], - [{"variantType": "other", "gene2": "GENE", "variant": "GENE:anything"}, "anything"], + [{'variantType': 'cnv', 'gene': 'GENE', 'cnvState': 'amplification'}, 'amplification'], + [{'variantType': 'other', 'gene2': 'GENE', 'variant': 'GENE:anything'}, 'anything'], ], ) def test_create_variant_name_tuple(variant, result): gene, name = create_variant_name_tuple(variant) assert name == result - assert gene == "GENE" + assert gene == 'GENE'
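The sync/async equivalence assertions in `tests/test_ipr/test_ipr_upload.py` all funnel through `stringify_sorted`, which normalizes a report section before comparison: server-generated keys that legitimately differ between two uploads are dropped, list order is ignored, and the result is stringified. Only a fragment of the helper appears in this diff, so the following is a sketch of its overall shape rather than the repository's exact implementation:

```python
def stringify_sorted(obj):
    """Deterministic string form of obj, ignoring volatile keys and list order.

    A sketch inferred from the fragment visible in the diff; the recursion
    details are assumptions, not the library's actual code.
    """
    if isinstance(obj, list):
        # stringify elements before sorting so mixed-type lists still compare
        return str(sorted(stringify_sorted(item) for item in obj))
    if isinstance(obj, dict):
        # bookkeeping fields differ between any two uploads of the same content
        for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt'):
            obj.pop(key, None)
        return str({key: stringify_sorted(value) for key, value in sorted(obj.items())})
    return str(obj)
```

With this normalization, `stringify_sorted(sync_section) == stringify_sorted(async_section)` holds exactly when the stable content of the two sections matches, regardless of upload mode.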
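`test_multivariant_multiconditionset_statements_loaded` leans on condition-set expansion: a statement with several variant conditions yields one kb-matched-statement row per combination of observed variants that jointly satisfies all of its conditions. The toy model below is illustrative only; the variant identifiers are hypothetical and this is not pori_python's matching code. It shows how the expected row count tracks the number of observed variants per condition, e.g. a third observed variant satisfying the APC condition would produce the three rows the updated assertion expects.

```python
from itertools import product

# Hypothetical observed matches per statement condition: two observed variants
# satisfy the APC condition, one satisfies the KRAS condition.
observed_matches = {
    'APC mutation': ['apc_obs_1', 'apc_obs_2'],
    'KRAS mutation': ['kras_obs_1'],
}

# One row per condition set: every combination that covers all conditions.
condition_sets = [
    dict(zip(observed_matches, combo))
    for combo in product(*observed_matches.values())
]

print(len(condition_sets))  # 2; appending 'apc_obs_3' above would make it 3
```

Each condition set carries the same pair of kb variants ('APC mutation', 'KRAS mutation') but a different combination of observed variants, which is exactly what the test asserts about the `kbMatches` of each returned statement row.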
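The parametrized cases in `tests/test_ipr/test_util.py` pin down the contract of `create_variant_name_tuple`: expression variants report their `expressionState`, copy variants their `cnvState`, anything else falls back to the text after the `GENE:` prefix of its `variant` string, and the gene name may come from either `gene` or `gene2`. A hypothetical reimplementation inferred purely from those expectations (the real function in pori_python may differ, e.g. in error handling) looks like:

```python
def create_variant_name_tuple(variant: dict) -> tuple:
    """Sketch inferred from the test table; not the library's actual code."""
    gene = variant.get('gene') or variant.get('gene2')
    if variant.get('variantType') == 'exp':
        return (gene, variant['expressionState'])
    if variant.get('variantType') == 'cnv':
        return (gene, variant['cnvState'])
    # fall back to whatever follows the '<gene>:' prefix in the variant notation
    return (gene, variant['variant'].split(':', 1)[1])
```

All three parametrized cases pass against this sketch, including the `gene2` fallback where the tuple's gene component is still 'GENE'.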