diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 607c0c6e..7804a8a0 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -28,15 +28,10 @@ jobs:
         run: |
           python -m pip install --upgrade pip setuptools
           pip install -e .[test]  # coverage reports need -e to capture properly
-      - name: Lint with flake8
+      - name: Check with ruff
         run: |
-          pip install flake8
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 pori_python --count --select=E9,F63,F7,F82 --show-source --statistics
-      - name: Check with black
-        run: |
-          pip install black
-          black --check -S -l 100 pori_python tests
+          pip install ruff
+          ruff format --check pori_python tests
       - name: Full Tests with pytest
         run: |
           pip list
@@ -46,6 +41,7 @@ jobs:
           IPR_PASS: ${{ secrets.IPR_TEST_PASSWORD }}
           GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
           GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
+          GRAPHKB_URL: ${{ secrets.GKB_TEST_URL }}
           # SDEV-3381 - Turn off integration tests temporarily, till efficiency is increased
           # turn on integration tests for one python version only
           EXCLUDE_INTEGRATION_TESTS: ${{ matrix.python-version != '3.11' }}
diff --git a/.github/workflows/quick-pytest.yml b/.github/workflows/quick-pytest.yml
index d4f70223..aef2b56d 100644
--- a/.github/workflows/quick-pytest.yml
+++ b/.github/workflows/quick-pytest.yml
@@ -25,22 +25,19 @@ jobs:
         run: |
           python -m pip install --upgrade pip setuptools
           pip install -e .[test]  # coverage reports need -e to capture properly
-      - name: Lint with flake8
+      - name: Check with ruff
        run: |
-          pip install flake8
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 pori_python --count --select=E9,F63,F7,F82 --show-source --statistics
-      - name: Check with black
-        run: |
-          pip install black
-          black --check -S -l 100 pori_python tests
+          pip install ruff
+          ruff format --check pori_python tests
       - name: Short Tests with pytest
         run: pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov ipr --cov-report term --cov-report xml
         env:
           IPR_USER: ${{ secrets.IPR_TEST_USER }}
           IPR_PASS: ${{ secrets.IPR_TEST_PASSWORD }}
+          IPR_URL: ${{ secrets.IPR_TEST_URL }}
           GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
           GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
+          GRAPHKB_URL: ${{ secrets.GKB_TEST_URL }}
           EXCLUDE_INTEGRATION_TESTS: 1
           # EXCLUDE_INTEGRATION_TESTS: ${{ matrix.python-version != '3.11' }}
-        if: github.event_name != 'pull_request'
\ No newline at end of file
+        if: github.event_name != 'pull_request'
diff --git a/README.md b/README.md
index 44d015ec..192ab2a3 100644
--- a/README.md
+++ b/README.md
@@ -46,11 +46,9 @@ pip install -e .[dev]
 
 Run the tests:
 
-Export usernames, passwords, and set test options.
+Export usernames, passwords, and test options.
 
-Note that IPR tests will try to use the BCGSC production GraphKB API by default.
-If you want to test interaction with a different instance, you will need to
-set the GraphKB variables.
+IPR_URL and GRAPHKB_URL values must also be set.
 
 Set EXCLUDE vars to 1 if you don't want to run these tests.
 ONCOKB and BCGSC tests are enabled by default.
@@ -67,11 +65,12 @@ export EXCLUDE_ONCOKB_TESTS=1
 ```
 
 If you want to run tests that upload reports to a live IPR instance,
-specify the url of the IPR API you want to use and set the test var to 1.
+specify the url of the IPR API you want to use and set the test var
+INCLUDE_UPLOAD_TESTS to 1.
 These tests are disabled by default.
 The created reports are deleted by default.
 If you want to keep them,
-set DELETE_UPLOAD_TEST_REPORTS to 0 in the env.
+set DELETE_UPLOAD_TEST_REPORTS to 0.
 
 ```bash
 export IPR_TEST_URL='http://localhost:8081/api'
@@ -84,14 +83,16 @@ pytest tests
 ```
 
 ### JSON Validate and Upload to IPR
+An IPR_URL must be provided either as an environment variable or a command-line argument.
+
 If you only want to validate the json content, use
 ```bash
-ipr --password $IPR_PASS -c 'path/to/content.json' --validate_json
+ipr --password $IPR_PASS -c 'path/to/content.json' --validate_json --ipr_url $IPR_URL
 ```
 
 If you only want to upload the json directly to ipr and skip all the preprocessing, use
 ```bash
-ipr --password $IPR_PASS -c 'path/to/content.json' --upload_json
+ipr --password $IPR_PASS -c 'path/to/content.json' --upload_json --ipr_url $IPR_URL
 ```
 
 ## Documentation
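With the hardcoded BCGSC defaults gone, a missing URL now fails fast instead of silently hitting production. A minimal pre-flight check along these lines can save a confusing traceback — a sketch, not a script that ships with this repo:

```python
import os

# IPR_URL and GRAPHKB_URL no longer fall back to BCGSC production defaults,
# so verify both are exported before invoking pytest.
missing = [name for name in ('IPR_URL', 'GRAPHKB_URL') if not os.environ.get(name)]
if missing:
    raise SystemExit(f"missing required environment variables: {', '.join(missing)}")
```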
diff --git a/pori_python/graphkb/__init__.py b/pori_python/graphkb/__init__.py
index a6fdd663..acce57aa 100644
--- a/pori_python/graphkb/__init__.py
+++ b/pori_python/graphkb/__init__.py
@@ -1,2 +1 @@
-from .constants import DEFAULT_URL  # noqa: F401
 from .util import GraphKBConnection, logger  # noqa: F401
diff --git a/pori_python/graphkb/constants.py b/pori_python/graphkb/constants.py
index 4861d70c..fe22f4a0 100644
--- a/pori_python/graphkb/constants.py
+++ b/pori_python/graphkb/constants.py
@@ -4,113 +4,108 @@ from pori_python.types import CategoryBaseTermMapping
 
 DEFAULT_LIMIT = 1000
-GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
-GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
-GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
-DEFAULT_URL = GKB_BASE_URL
-PREFERRED_GENE_SOURCE = "#39:5"  # HGNC
-PREFERRED_GENE_SOURCE_NAME = "HGNC"
+PREFERRED_GENE_SOURCE_NAME = 'HGNC'
 
-BASE_RETURN_PROPERTIES = ["@rid", "@class"]
+BASE_RETURN_PROPERTIES = ['@rid', '@class']
 
 GENERIC_RETURN_PROPERTIES = [
-    "name",
-    "sourceId",
-    "sourceIdVersion",
-    "source.name",
-    "source.@rid",
-    "displayName",
-    "deprecated",
+    'name',
+    'sourceId',
+    'sourceIdVersion',
+    'source.name',
+    'source.@rid',
+    'displayName',
+    'deprecated',
 ] + BASE_RETURN_PROPERTIES
 
-GENE_RETURN_PROPERTIES = ["biotype"] + GENERIC_RETURN_PROPERTIES
+GENE_RETURN_PROPERTIES = ['biotype'] + GENERIC_RETURN_PROPERTIES
 
 VARIANT_RETURN_PROPERTIES = (
     BASE_RETURN_PROPERTIES
-    + [f"type.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"reference1.{p}" for p in GENE_RETURN_PROPERTIES]
-    + [f"reference2.{p}" for p in GENE_RETURN_PROPERTIES]
-    + ["zygosity", "germline", "displayName"]
+    + [f'type.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'reference1.{p}' for p in GENE_RETURN_PROPERTIES]
+    + [f'reference2.{p}' for p in GENE_RETURN_PROPERTIES]
+    + ['zygosity', 'germline', 'displayName']
 )
 
 POS_VARIANT_RETURN_PROPERTIES = VARIANT_RETURN_PROPERTIES + [
-    "break1Start",
-    "break1End",
-    "break2Start",
-    "break2End",
-    "break1Repr",
-    "break2Repr",
-    "refSeq",
-    "untemplatedSeq",
-    "untemplatedSeqSize",
-    "truncation",
-    "assembly",
+    'break1Start',
+    'break1End',
+    'break2Start',
+    'break2End',
+    'break1Repr',
+    'break2Repr',
+    'refSeq',
+    'untemplatedSeq',
+    'untemplatedSeqSize',
+    'truncation',
+    'assembly',
 ]
 
 STATEMENT_RETURN_PROPERTIES = (
     BASE_RETURN_PROPERTIES
-    + ["displayNameTemplate", "sourceId", "source.name", "source.displayName"]
-    + [f"conditions.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"subject.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"evidence.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"relevance.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + [f"evidenceLevel.{p}" for p in GENERIC_RETURN_PROPERTIES]
-    + ["reviewStatus"]
+    + ['displayNameTemplate', 'sourceId', 'source.name', 'source.displayName']
+    + [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES]
+    + ['reviewStatus']
 )
 
-ONCOKB_SOURCE_NAME = "oncokb"
-TSO500_SOURCE_NAME = "tso500"
-ONCOGENE = "oncogenic"
-TUMOUR_SUPPRESSIVE = "tumour suppressive"
-CANCER_GENE = "cancer gene"
-FUSION_NAMES = ["structural variant", "fusion"]
+ONCOKB_SOURCE_NAME = 'oncokb'
+TSO500_SOURCE_NAME = 'tso500'
+ONCOGENE = 'oncogenic'
+TUMOUR_SUPPRESSIVE = 'tumour suppressive'
+CANCER_GENE = 'cancer gene'
+FUSION_NAMES = ['structural variant', 'fusion']
 
-GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ["cancer genome interpreter", "civic"]
-GSC_PHARMACOGENOMIC_SOURCE_DISPLAYNAME_EXCLUDE_LIST = ["CGI", "CIViC"]
+GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ['cancer genome interpreter', 'civic']
+GSC_PHARMACOGENOMIC_SOURCE_DISPLAYNAME_EXCLUDE_LIST = ['CGI', 'CIViC']
 
-BASE_THERAPEUTIC_TERMS = ["therapeutic efficacy", "eligibility"]
+BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
 # the order here is the order these are applied, the first category matched is returned
 RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
-    ("therapeutic", BASE_THERAPEUTIC_TERMS),
-    ("diagnostic", ["diagnostic indicator"]),
-    ("prognostic", ["prognostic indicator"]),
-    ("pharmacogenomic", ["metabolism", "toxicity", "dosage"]),
-    ("cancer predisposition", ["pathogenic"]),
-    ("biological", ["functional effect", "tumourigenesis", "predisposing"]),
+    ('therapeutic', BASE_THERAPEUTIC_TERMS),
+    ('diagnostic', ['diagnostic indicator']),
+    ('prognostic', ['prognostic indicator']),
+    ('pharmacogenomic', ['metabolism', 'toxicity', 'dosage']),
+    ('cancer predisposition', ['pathogenic']),
+    ('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
 ]
 
-FAILED_REVIEW_STATUS = "failed"
+FAILED_REVIEW_STATUS = 'failed'
 
-CHROMOSOMES_HG38 = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
-CHROMOSOMES_HG19 = [str(i) for i in range(1, 23)] + ["x", "y", "mt"]
+CHROMOSOMES_HG38 = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
+CHROMOSOMES_HG19 = [str(i) for i in range(1, 23)] + ['x', 'y', 'mt']
 CHROMOSOMES = CHROMOSOMES_HG38 + CHROMOSOMES_HG19
 
-AMBIGUOUS_AA = ["x", "?", "X"]
+AMBIGUOUS_AA = ['x', '?', 'X']
 AA_3to1_MAPPING = {
-    "Ala": "A",
-    "Arg": "R",
-    "Asn": "N",
-    "Asp": "D",
-    "Asx": "B",
-    "Cys": "C",
-    "Glu": "E",
-    "Gln": "Q",
-    "Glx": "Z",
-    "Gly": "G",
-    "His": "H",
-    "Ile": "I",
-    "Leu": "L",
-    "Lys": "K",
-    "Met": "M",
-    "Phe": "F",
-    "Pro": "P",
-    "Ser": "S",
-    "Thr": "T",
-    "Trp": "W",
-    "Tyr": "Y",
-    "Val": "V",
-    "Ter": "*",
+    'Ala': 'A',
+    'Arg': 'R',
+    'Asn': 'N',
+    'Asp': 'D',
+    'Asx': 'B',
+    'Cys': 'C',
+    'Glu': 'E',
+    'Gln': 'Q',
+    'Glx': 'Z',
+    'Gly': 'G',
+    'His': 'H',
+    'Ile': 'I',
+    'Leu': 'L',
+    'Lys': 'K',
+    'Met': 'M',
+    'Phe': 'F',
+    'Pro': 'P',
+    'Ser': 'S',
+    'Thr': 'T',
+    'Trp': 'W',
+    'Tyr': 'Y',
+    'Val': 'V',
+    'Ter': '*',
 }
@@ -132,89 +127,89 @@ def __getitem__(self, key):
 
 
 INPUT_COPY_CATEGORIES = IterableNamespace(
-    AMP="amplification",
-    ANY_GAIN="copy gain",
-    ANY_LOSS="copy loss",
-    DEEP="deep deletion",
-    GAIN="low level copy gain",
-    LOSS="shallow deletion",
+    AMP='amplification',
+    ANY_GAIN='copy gain',
+    ANY_LOSS='copy loss',
+    DEEP='deep deletion',
+    GAIN='low level copy gain',
+    LOSS='shallow deletion',
 )
 
 INPUT_EXPRESSION_CATEGORIES = IterableNamespace(
-    UP="increased expression", DOWN="reduced expression"
+    UP='increased expression', DOWN='reduced expression'
 )
 
 # From: https://github.com/bcgsc/pori_graphkb_parser/blob/ae3738842a4c208ab30f58c08ae987594d632504/src/constants.ts#L33-L80
 TYPES_TO_NOTATION: Dict[str, str] = {
-    "acetylation": "ac",
-    "copy gain": "copygain",
-    "copy loss": "copyloss",
-    "deletion": "del",
-    "duplication": "dup",
-    "extension": "ext",
-    "frameshift": "fs",
-    "fusion": "fusion",
-    "indel": "delins",
-    "insertion": "ins",
-    "inversion": "inv",
-    "inverted translocation": "itrans",
-    "methylation": "me",
-    "missense mutation": "mis",
-    "mutation": "mut",
-    "nonsense mutation": ">",
-    "phosphorylation": "phos",
-    "splice-site": "spl",
-    "substitution": ">",
-    "translocation": "trans",
-    "truncating frameshift mutation": "fs",
-    "ubiquitination": "ub",
+    'acetylation': 'ac',
+    'copy gain': 'copygain',
+    'copy loss': 'copyloss',
+    'deletion': 'del',
+    'duplication': 'dup',
+    'extension': 'ext',
+    'frameshift': 'fs',
+    'fusion': 'fusion',
+    'indel': 'delins',
+    'insertion': 'ins',
+    'inversion': 'inv',
+    'inverted translocation': 'itrans',
+    'methylation': 'me',
+    'missense mutation': 'mis',
+    'mutation': 'mut',
+    'nonsense mutation': '>',
+    'phosphorylation': 'phos',
+    'splice-site': 'spl',
+    'substitution': '>',
+    'translocation': 'trans',
+    'truncating frameshift mutation': 'fs',
+    'ubiquitination': 'ub',
     # deprecated forms and aliases
-    "frameshift mutation": "fs",
-    "frameshift truncation": "fs",
-    "missense variant": "mis",
-    "truncating frameshift": "fs",
-    "missense": "mis",
-    "mutations": "mut",
-    "nonsense": ">",
+    'frameshift mutation': 'fs',
+    'frameshift truncation': 'fs',
+    'missense variant': 'mis',
+    'truncating frameshift': 'fs',
+    'missense': 'mis',
+    'mutations': 'mut',
+    'nonsense': '>',
 }
 
 # For match.type_screening() [KBDEV-1056]
-DEFAULT_NON_STRUCTURAL_VARIANT_TYPE = "mutation"
+DEFAULT_NON_STRUCTURAL_VARIANT_TYPE = 'mutation'
 STRUCTURAL_VARIANT_SIZE_THRESHOLD = 48  # bp
 STRUCTURAL_VARIANT_TYPES = [
-    "structural variant",
-    "insertion",
-    "in-frame insertion",
-    "deletion",
-    "deletion polymorphism",
-    "in-frame deletion",
-    "translocation",
-    "inverted translocation",
-    "inversion",
-    "indel",
-    "fusion",
-    "out-of-frame fusion",
-    "oncogenic fusion",
-    "in-frame fusion",
-    "disruptive fusion",
-    "duplication",
-    "internal duplication",
-    "tandem duplication",
-    "internal tandem duplication",
-    "itd",
-    "domain duplication",
-    "kinase domain duplication",
-    "copy variant",
-    "copy number variation",
-    "copy number variant",
-    "copy loss",
-    "copy number loss",
-    "shallow deletion",
-    "deep deletion",
-    "gene deletion",
-    "copy gain",
-    "copy number gain",
-    "low level copy gain",
-    "amplification",
-    "focal amplification",
-    "rearrangement",
+    'structural variant',
+    'insertion',
+    'in-frame insertion',
+    'deletion',
+    'deletion polymorphism',
+    'in-frame deletion',
+    'translocation',
+    'inverted translocation',
+    'inversion',
+    'indel',
+    'fusion',
+    'out-of-frame fusion',
+    'oncogenic fusion',
+    'in-frame fusion',
+    'disruptive fusion',
+    'duplication',
+    'internal duplication',
+    'tandem duplication',
+    'internal tandem duplication',
+    'itd',
+    'domain duplication',
+    'kinase domain duplication',
+    'copy variant',
+    'copy number variation',
+    'copy number variant',
+    'copy loss',
+    'copy number loss',
+    'shallow deletion',
+    'deep deletion',
+    'gene deletion',
+    'copy gain',
+    'copy number gain',
+    'low level copy gain',
+    'amplification',
+    'focal amplification',
+    'rearrangement',
 ]
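The ordering comment on RELEVANCE_BASE_TERMS above is load-bearing: consumers resolve a relevance to the first matching category. A simplified illustration of that contract (the real consumers, e.g. statement.categorize_relevance, first expand each base-term list into a full term set through the API):

```python
from pori_python.graphkb.constants import RELEVANCE_BASE_TERMS


def first_matching_category(relevance_name: str) -> str:
    # Walk the category/base-term pairs in declaration order; the first hit wins.
    for category, base_terms in RELEVANCE_BASE_TERMS:
        if relevance_name in base_terms:
            return category
    return ''


# 'pathogenic' is listed under 'cancer predisposition', which is checked
# before the catch-all 'biological' category, so that category is returned.
assert first_matching_category('pathogenic') == 'cancer predisposition'
```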
diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index e61e3cf5..09da3ed7 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -17,7 +17,6 @@
     GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
     ONCOGENE,
     ONCOKB_SOURCE_NAME,
-    PREFERRED_GENE_SOURCE,
     PREFERRED_GENE_SOURCE_NAME,
     RELEVANCE_BASE_TERMS,
     TSO500_SOURCE_NAME,
@@ -35,14 +34,14 @@ def _get_tumourigenesis_genes_list(
         List[Statement],
         conn.query(
             {
-                "target": "Statement",
-                "filters": {
-                    "AND": [
-                        {"source": {"target": "Source", "filters": {"name": sources}}},
-                        {"relevance": {"target": "Vocabulary", "filters": {"name": relevance}}},
+                'target': 'Statement',
+                'filters': {
+                    'AND': [
+                        {'source': {'target': 'Source', 'filters': {'name': sources}}},
+                        {'relevance': {'target': 'Vocabulary', 'filters': {'name': relevance}}},
                     ]
                 },
-                "returnProperties": [f"subject.{prop}" for prop in GENE_RETURN_PROPERTIES],
+                'returnProperties': [f'subject.{prop}' for prop in GENE_RETURN_PROPERTIES],
             },
             ignore_cache=ignore_cache,
         ),
@@ -51,9 +50,9 @@ def _get_tumourigenesis_genes_list(
 
     genes: Dict[str, Ontology] = {}
     for statement in statements:
-        if statement["subject"].get("biotype", "") == "gene":
-            record_id = statement["subject"]["@rid"]
-            genes[record_id] = statement["subject"]
+        if statement['subject'].get('biotype', '') == 'gene':
+            record_id = statement['subject']['@rid']
+            genes[record_id] = statement['subject']
 
     return [gene for gene in genes.values()]
 
@@ -101,35 +100,35 @@ def get_therapeutic_associated_genes(graphkb_conn: GraphKBConnection) -> List[On
     therapeutic_relevance = get_terms_set(graphkb_conn, BASE_THERAPEUTIC_TERMS)
     statements = graphkb_conn.query(
         {
-            "target": "Statement",
-            "filters": {"relevance": sorted(list(therapeutic_relevance))},
-            "returnProperties": ["reviewStatus"]
-            + [f"conditions.{prop}" for prop in GENE_RETURN_PROPERTIES]
+            'target': 'Statement',
+            'filters': {'relevance': sorted(list(therapeutic_relevance))},
+            'returnProperties': ['reviewStatus']
+            + [f'conditions.{prop}' for prop in GENE_RETURN_PROPERTIES]
             + [
-                f"conditions.reference{ref}.{prop}"
+                f'conditions.reference{ref}.{prop}'
                 for prop in GENE_RETURN_PROPERTIES
-                for ref in ("1", "2")
+                for ref in ('1', '2')
             ],
         }
     )
     genes: List[Ontology] = []
     for statement in statements:
         statement = cast(Statement, statement)
-        if statement["reviewStatus"] == "failed":
+        if statement['reviewStatus'] == 'failed':
             continue
-        for condition in statement["conditions"]:
-            if condition["@class"] == "Feature":
+        for condition in statement['conditions']:
+            if condition['@class'] == 'Feature':
                 genes.append(condition)
-            elif condition["@class"].endswith("Variant"):
+            elif condition['@class'].endswith('Variant'):
                 cond = cast(Variant, condition)
-                if cond["reference1"] and cond["reference1"]["@class"] == "Feature":
-                    genes.append(cond["reference1"])
-                if cond["reference2"] and cond["reference2"]["@class"] == "Feature":
-                    genes.append(cond["reference2"])
+                if cond['reference1'] and cond['reference1']['@class'] == 'Feature':
+                    genes.append(cond['reference1'])
+                if cond['reference2'] and cond['reference2']['@class'] == 'Feature':
+                    genes.append(cond['reference2'])
     unique_genes: List[Ontology] = []
     for gene in genes:
-        if not gene.get("deprecated", False):
-            if gene["@rid"] not in [g["@rid"] for g in unique_genes]:
+        if not gene.get('deprecated', False):
+            if gene['@rid'] not in [g['@rid'] for g in unique_genes]:
                 unique_genes.append(gene)
     return unique_genes
@@ -153,16 +152,16 @@ def get_genes_from_variant_types(
     variant_filters: List[Dict[str, Any]] = []
     if types:
         variant_filters.append(
-            {"type": {"target": "Vocabulary", "filters": {"name": types, "operator": "IN"}}}
+            {'type': {'target': 'Vocabulary', 'filters': {'name': types, 'operator': 'IN'}}}
         )
 
     variants = cast(
         List[Variant],
         conn.query(
             {
-                "target": "Variant",
-                "filters": variant_filters,
-                "returnProperties": ["reference1", "reference2"],
+                'target': 'Variant',
+                'filters': variant_filters,
+                'returnProperties': ['reference1', 'reference2'],
             },
             ignore_cache=ignore_cache,
         ),
@@ -170,23 +169,23 @@ def get_genes_from_variant_types(
     genes = set()
     for variant in variants:
-        genes.add(variant["reference1"])
-        if variant["reference2"]:
-            genes.add(variant["reference2"])
+        genes.add(variant['reference1'])
+        if variant['reference2']:
+            genes.add(variant['reference2'])
 
     if not genes:
         return []
 
-    gene_filters: List[Dict[str, Any]] = [{"biotype": "gene"}]
+    gene_filters: List[Dict[str, Any]] = [{'biotype': 'gene'}]
     if source_record_ids:
-        gene_filters.append({"source": source_record_ids, "operator": "IN"})
+        gene_filters.append({'source': source_record_ids, 'operator': 'IN'})
 
     result = cast(
         List[Ontology],
         conn.query(
             {
-                "target": list(genes),
-                "returnProperties": GENE_RETURN_PROPERTIES,
-                "filters": gene_filters,
+                'target': list(genes),
+                'returnProperties': GENE_RETURN_PROPERTIES,
+                'filters': gene_filters,
             },
             ignore_cache=ignore_cache,
         ),
@@ -210,10 +209,10 @@ def get_preferred_gene_source_rid(
         return preferred_source_name
     result = conn.query(
         {
-            "target": {"target": "Source", "filters": {"name": preferred_source_name}},
-            "queryType": "similarTo",
+            'target': {'target': 'Source', 'filters': {'name': preferred_source_name}},
+            'queryType': 'similarTo',
         }
-    )[0]["@rid"]
+    )[0]['@rid']
     return result
 
 
@@ -235,29 +234,29 @@ def get_preferred_gene_name(
     """
     source_rid = get_preferred_gene_source_rid(conn, source)
     if gene_name in CHROMOSOMES:
-        logger.error(f"{gene_name} assumed to be a chromosome, not gene")
-        return ""
+        logger.error(f'{gene_name} assumed to be a chromosome, not a gene')
+        return ''
     eq = get_equivalent_features(conn=conn, gene_name=gene_name)
-    genes = [m for m in eq if m.get("biotype") == "gene" and not m.get("deprecated")]
+    genes = [m for m in eq if m.get('biotype') == 'gene' and not m.get('deprecated')]
     if not genes:
-        logger.error(f"No genes found for: {gene_name}")
-        return ""
+        logger.error(f'No genes found for: {gene_name}')
+        return ''
     if source_rid:
-        source_filtered_genes = [m for m in genes if m.get("source") == source_rid]
+        source_filtered_genes = [m for m in genes if m.get('source') == source_rid]
         if not source_filtered_genes:
-            logger.error(f"No data from source {source_rid} for {gene_name}")
+            logger.error(f'No data from source {source_rid} for {gene_name}')
         else:
             genes = source_filtered_genes
 
-    gene_names = [g["displayName"] for g in genes if g]
+    gene_names = [g['displayName'] for g in genes if g]
     if len(gene_names) > 1:
         logger.error(
-            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
+            f'Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}'
         )
     return gene_names[0]
 
 
-@deprecated("Use get_gene_linked_cancer_predisposition_info instead")
+@deprecated('Use get_gene_linked_cancer_predisposition_info instead')
 def get_cancer_predisposition_info(
     conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE_NAME
 ) -> Tuple[List[str], Dict[str, str]]:
@@ -267,7 +266,7 @@ def get_cancer_predisposition_info(
 
 
 def get_gene_linked_cancer_predisposition_info(
-    conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE
+    conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE_NAME
 ) -> Tuple[List[str], Dict[str, Tuple[str, List[str]]]]:
     """
     Return two lists from GraphKB, one of cancer predisposition genes and one of associated variants.
@@ -275,16 +274,15 @@ def get_gene_linked_cancer_predisposition_info(
 
     GERO-272 - criteria for what counts as a "cancer predisposition" variant
 
    In short:
-    * Statement 'source' is 'CGL'
+    * Statement 'source' is 'CGL' (not related to the preferred gene source)
     * Statement 'relevance' is 'pathogenic'
     * gene is gotten from any associated 'PositionalVariant' records
 
     Example: https://graphkb.bcgsc.ca/view/Statement/155:11616
-
-
     Returns:
         genes: list of cancer predisposition genes
+            (using names from the source specified in this function's arguments)
         variants: dictionary mapping pharmacogenomic variant IDs to variant display names
     """
     genes = set()
@@ -293,51 +291,51 @@ def get_gene_linked_cancer_predisposition_info(
     variants: Dict[str, Tuple[str, List[str]]] = {}
 
     terms: dict = {term: lst for term, lst in RELEVANCE_BASE_TERMS}
-    relevance_rids = list(get_terms_set(conn, terms.get("cancer predisposition", [])))
+    relevance_rids = list(get_terms_set(conn, terms.get('cancer predisposition', [])))
     source_rid = get_preferred_gene_source_rid(conn, source)
 
     predisp_statements = [
         cast(Statement, record)
         for record in conn.query(
             {
-                "target": "Statement",
-                "filters": {
-                    "AND": [
+                'target': 'Statement',
+                'filters': {
+                    'AND': [
                         {
-                            "evidence": {
-                                "target": "Source",
-                                "filters": {"@rid": get_rid(conn, "Source", "CGL")},
+                            'evidence': {
+                                'target': 'Source',
+                                'filters': {'@rid': get_rid(conn, 'Source', 'CGL')},
                             }
                         },
                        {
-                            "relevance": {
-                                "target": "Vocabulary",
-                                "filters": {"@rid": relevance_rids},
+                            'relevance': {
+                                'target': 'Vocabulary',
+                                'filters': {'@rid': relevance_rids},
                             }
                         },
                     ]
                 },
-                "returnProperties": [
-                    "conditions.@class",
-                    "conditions.@rid",
-                    "conditions.displayName",
-                    "conditions.reference1.biotype",
-                    "conditions.reference1.displayName",
-                    "conditions.reference2.biotype",
-                    "conditions.reference2.displayName",
+                'returnProperties': [
+                    'conditions.@class',
+                    'conditions.@rid',
+                    'conditions.displayName',
+                    'conditions.reference1.biotype',
+                    'conditions.reference1.displayName',
+                    'conditions.reference2.biotype',
+                    'conditions.reference2.displayName',
                 ],
             },
             ignore_cache=False,
         )
     ]
     for record in predisp_statements:
-        for condition in record["conditions"]:
-            if condition["@class"] == "PositionalVariant":
+        for condition in record['conditions']:
+            if condition['@class'] == 'PositionalVariant':
                 assoc_gene_list: List[str] = []
-                for reference in ["reference1", "reference2"]:
-                    name = (condition.get(reference) or {}).get("displayName", "")  # type: ignore
-                    biotype = (condition.get(reference) or {}).get("biotype", "")  # type: ignore
-                    if name and biotype == "gene":
+                for reference in ['reference1', 'reference2']:
+                    name = (condition.get(reference) or {}).get('displayName', '')  # type: ignore
+                    biotype = (condition.get(reference) or {}).get('biotype', '')  # type: ignore
+                    if name and biotype == 'gene':
                         genes.add(name)
                         assoc_gene_list.append(name)
                     elif name:
@@ -348,9 +346,9 @@ def get_gene_linked_cancer_predisposition_info(
                     else:
                         non_genes.add((name, biotype))
                         logger.error(
-                            f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}"
+                            f'Non-gene cancer predisposition {biotype}: {name} for {condition["displayName"]}'
                         )
-                variants[condition["@rid"]] = (condition["displayName"], assoc_gene_list)
+                variants[condition['@rid']] = (condition['displayName'], assoc_gene_list)
 
     for gene, name, biotype in infer_genes:
         logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
@@ -362,7 +360,7 @@ def get_gene_linked_cancer_predisposition_info(
     return sorted(genes), variants
 
 
-@deprecated("Use get_gene_linked_pharmacogenomic_info instead")
+@deprecated('Use get_gene_linked_pharmacogenomic_info instead')
 def get_pharmacogenomic_info(
     conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE_NAME
 ) -> Tuple[List[str], Dict[str, str]]:
@@ -372,7 +370,7 @@ def get_pharmacogenomic_info(
 
 
 def get_gene_linked_pharmacogenomic_info(
-    conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE
+    conn: GraphKBConnection, source: str = PREFERRED_GENE_SOURCE_NAME
 ) -> Tuple[List[str], Dict[str, Tuple[str, List[str]]]]:
     """
     Return two lists from GraphKB, one of pharmacogenomic genes and one of associated variants.
@@ -395,39 +393,39 @@ def get_gene_linked_pharmacogenomic_info(
     infer_genes = set()
     variants: Dict[str, Tuple] = {}
 
-    relevance_rids = list(get_terms_set(conn, "pharmacogenomic"))
+    relevance_rids = list(get_terms_set(conn, 'pharmacogenomic'))
     source_rid = get_preferred_gene_source_rid(conn, source)
     for record in conn.query(
         {
-            "target": "Statement",
-            "filters": [
-                {"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}}}
+            'target': 'Statement',
+            'filters': [
+                {'relevance': {'target': 'Vocabulary', 'filters': {'@rid': relevance_rids}}}
             ],
-            "returnProperties": [
-                "conditions.@class",
-                "conditions.@rid",
-                "conditions.displayName",
-                "conditions.reference1.biotype",
-                "conditions.reference1.displayName",
-                "conditions.reference2.biotype",
-                "conditions.reference2.displayName",
-                "source.name",
+            'returnProperties': [
+                'conditions.@class',
+                'conditions.@rid',
+                'conditions.displayName',
+                'conditions.reference1.biotype',
+                'conditions.reference1.displayName',
+                'conditions.reference2.biotype',
+                'conditions.reference2.displayName',
+                'source.name',
             ],
        },
         ignore_cache=False,
     ):
-        if record["source"]:  # type: ignore
-            if record["source"]["name"].lower() in GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST:  # type: ignore
+        if record['source']:  # type: ignore
+            if record['source']['name'].lower() in GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST:  # type: ignore
                 continue
 
-        for condition in record["conditions"]:  # type: ignore
-            if condition["@class"] == "PositionalVariant":
+        for condition in record['conditions']:  # type: ignore
+            if condition['@class'] == 'PositionalVariant':
                 assoc_gene_list = []
-                for reference in ["reference1", "reference2"]:
-                    name = (condition.get(reference) or {}).get("displayName", "")
-                    biotype = (condition.get(reference) or {}).get("biotype", "")
-                    if name and biotype == "gene":
+                for reference in ['reference1', 'reference2']:
+                    name = (condition.get(reference) or {}).get('displayName', '')
+                    biotype = (condition.get(reference) or {}).get('biotype', '')
+                    if name and biotype == 'gene':
                         genes.add(name)
                         assoc_gene_list.append(name)
                     elif name:
@@ -438,9 +436,9 @@ def get_gene_linked_pharmacogenomic_info(
                     else:
                         non_genes.add((name, biotype))
                         logger.error(
-                            f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}"
+                            f'Non-gene pharmacogenomic {biotype}: {name} for {condition["displayName"]}'
                        )
-                variants[condition["@rid"]] = (condition["displayName"], assoc_gene_list)
+                variants[condition['@rid']] = (condition['displayName'], assoc_gene_list)
     for gene, name, biotype in infer_genes:
         logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
         genes.add(gene)
@@ -452,7 +450,7 @@ def get_gene_linked_pharmacogenomic_info(
 
 
 def convert_to_rid_set(records: List[Record] | List[Ontology]) -> Set[str]:
-    return {r["@rid"] for r in records}
+    return {r['@rid'] for r in records}
 
 
 def get_gene_information(
@@ -479,59 +477,59 @@
         'name': 'TERT', 'oncogene': True}]
     """
-    logger.info("fetching variant related genes list")
+    logger.info('fetching variant related genes list')
     # For query speed, only fetch the minimum needed details
     ret_props = [
-        "conditions.@rid",
-        "conditions.@class",
-        "conditions.reference1",
-        "conditions.reference2",
-        "reviewStatus",
+        'conditions.@rid',
+        'conditions.@class',
+        'conditions.reference1',
+        'conditions.reference2',
+        'reviewStatus',
    ]
-    body: Dict[str, Any] = {"target": "Statement", "returnProperties": ret_props}
+    body: Dict[str, Any] = {'target': 'Statement', 'returnProperties': ret_props}
 
     gene_names = sorted(set(gene_names))
     statements = graphkb_conn.query(body)
-    statements = [s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS]
+    statements = [s for s in statements if s.get('reviewStatus') != FAILED_REVIEW_STATUS]
 
     gene_flags: Dict[str, Set[str]] = {
-        "kbStatementRelated": set(),
-        "knownFusionPartner": set(),
-        "knownSmallMutation": set(),
+        'kbStatementRelated': set(),
+        'knownFusionPartner': set(),
+        'knownSmallMutation': set(),
     }
 
     for statement in statements:
         statement = cast(Statement, statement)
-        for condition in statement["conditions"]:
+        for condition in statement['conditions']:
             # ignore types, as there can be various types of conditions
-            if condition.get("reference1"):
-                gene_flags["kbStatementRelated"].add(condition["reference1"])  # type: ignore
-            if condition.get("reference2"):
+            if condition.get('reference1'):
+                gene_flags['kbStatementRelated'].add(condition['reference1'])  # type: ignore
+            if condition.get('reference2'):
                 # Having a reference2 implies the event is a fusion
-                gene_flags["kbStatementRelated"].add(condition["reference2"])  # type: ignore
-                gene_flags["knownFusionPartner"].add(condition["reference1"])  # type: ignore
-                gene_flags["knownFusionPartner"].add(condition["reference2"])  # type: ignore
-            elif condition["@class"] == "PositionalVariant":
+                gene_flags['kbStatementRelated'].add(condition['reference2'])  # type: ignore
+                gene_flags['knownFusionPartner'].add(condition['reference1'])  # type: ignore
+                gene_flags['knownFusionPartner'].add(condition['reference2'])  # type: ignore
+            elif condition['@class'] == 'PositionalVariant':
                 # PositionalVariant without a reference2 implies a smallMutation type
-                gene_flags["knownSmallMutation"].add(condition["reference1"])  # type: ignore
+                gene_flags['knownSmallMutation'].add(condition['reference1'])  # type: ignore
 
-    logger.info("fetching oncogenes list")
-    gene_flags["oncogene"] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
-    logger.info("fetching tumour supressors list")
-    gene_flags["tumourSuppressor"] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn))
-    logger.info("fetching cancerGeneListMatch list")
-    gene_flags["cancerGeneListMatch"] = convert_to_rid_set(get_cancer_genes(graphkb_conn))
+    logger.info('fetching oncogenes list')
+    gene_flags['oncogene'] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
+    logger.info('fetching tumour suppressors list')
+    gene_flags['tumourSuppressor'] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn))
+    logger.info('fetching cancerGeneListMatch list')
+    gene_flags['cancerGeneListMatch'] = convert_to_rid_set(get_cancer_genes(graphkb_conn))
 
-    logger.info("fetching therapeutic associated genes lists")
-    gene_flags["therapeuticAssociated"] = convert_to_rid_set(
+    logger.info('fetching therapeutic associated genes lists')
+    gene_flags['therapeuticAssociated'] = convert_to_rid_set(
         get_therapeutic_associated_genes(graphkb_conn)
     )
 
-    logger.info(f"Setting gene_info flags on {len(gene_names)} genes")
+    logger.info(f'Setting gene_info flags on {len(gene_names)} genes')
     result: List[IprGene] = []
     for gene_name in gene_names:
         equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name))
-        row: Dict[str, str | bool] = {"name": gene_name}
+        row: Dict[str, str | bool] = {'name': gene_name}
         flagged = False
         for flag in gene_flags:
             # make smaller JSON to upload since all default to false already
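The public entry point for the flag assembly above is unchanged by the quote migration; a usage sketch (connection details below are placeholders):

```python
from pori_python.graphkb import GraphKBConnection
from pori_python.graphkb.genes import get_gene_information

conn = GraphKBConnection(url='https://your-graphkb-instance/api')  # placeholder URL
conn.login(username='user', password='pass')  # placeholder credentials

# Rows only carry the flags that are true for that gene,
# e.g. [{'name': 'TERT', 'oncogene': True}] per the docstring example.
for row in get_gene_information(conn, ['TERT']):
    print(row)
```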
diff --git a/pori_python/graphkb/match.py b/pori_python/graphkb/match.py
index 0c791383..29c8cf32 100644
--- a/pori_python/graphkb/match.py
+++ b/pori_python/graphkb/match.py
@@ -46,8 +46,8 @@ def get_equivalent_features(
     gene_name: str,
     ignore_cache: bool = False,
     is_source_id: bool = False,
-    source: str = "",
-    source_id_version: str = "",
+    source: str = '',
+    source_id_version: str = '',
 ) -> List[Ontology]:
     """Match an equivalent list of features given some input feature name (or ID).
 
@@ -76,36 +76,36 @@ def get_equivalent_features(
         return cast(
             List[Ontology],
             conn.query(
-                {"target": [gene_name], "queryType": "similarTo"}, ignore_cache=ignore_cache
+                {'target': [gene_name], 'queryType': 'similarTo'}, ignore_cache=ignore_cache
             ),
         )
 
     filters: List[Dict] = []
     if source:
-        filters.append({"source": {"target": "Source", "filters": {"name": source}}})
+        filters.append({'source': {'target': 'Source', 'filters': {'name': source}}})
 
-    if gene_name.count(".") == 1 and gene_name.split(".")[-1].isnumeric():
+    if gene_name.count('.') == 1 and gene_name.split('.')[-1].isnumeric():
         # eg. ENSG00000133703.11 or NM_033360.4
         logger.debug(
-            f"Assuming {gene_name} has a .version_format - ignoring the version for equivalent features"
+            f'Assuming {gene_name} has a .version_format - ignoring the version for equivalent features'
         )
-        gene_name = gene_name.split(".")[0]
+        gene_name = gene_name.split('.')[0]
 
     if is_source_id or source_id_version:
-        filters.append({"sourceId": gene_name})
+        filters.append({'sourceId': gene_name})
         if source_id_version:
             filters.append(
-                {"OR": [{"sourceIdVersion": source_id_version}, {"sourceIdVersion": None}]}
+                {'OR': [{'sourceIdVersion': source_id_version}, {'sourceIdVersion': None}]}
             )
     elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache:
         return []
     else:
-        filters.append({"OR": [{"sourceId": gene_name}, {"name": gene_name}]})
+        filters.append({'OR': [{'sourceId': gene_name}, {'name': gene_name}]})
 
     return cast(
         List[Ontology],
         conn.query(
-            {"target": {"target": "Feature", "filters": filters}, "queryType": "similarTo"},
+            {'target': {'target': 'Feature', 'filters': filters}, 'queryType': 'similarTo'},
             ignore_cache=ignore_cache,
         ),
     )
@@ -118,24 +118,24 @@ def cache_missing_features(conn: GraphKBConnection) -> None:
     """
     genes = cast(
         List[Ontology],
-        conn.query({"target": "Feature", "returnProperties": ["name", "sourceId"], "neighbors": 0}),
+        conn.query({'target': 'Feature', 'returnProperties': ['name', 'sourceId'], 'neighbors': 0}),
     )
     for gene in genes:
-        if gene["name"]:
-            FEATURES_CACHE.add(gene["name"].lower())
-        if gene["sourceId"]:
-            FEATURES_CACHE.add(gene["sourceId"].lower())
+        if gene['name']:
+            FEATURES_CACHE.add(gene['name'].lower())
+        if gene['sourceId']:
+            FEATURES_CACHE.add(gene['sourceId'].lower())
 
 
 def match_category_variant(
     conn: GraphKBConnection,
     reference_name: str,
     category: str,
-    root_exclude_term: str = "",
-    gene_source: str = "",
+    root_exclude_term: str = '',
+    gene_source: str = '',
     gene_is_source_id: bool = False,
     ignore_cache: bool = False,
-    reference_class: str = "Feature",
+    reference_class: str = 'Feature',
 ) -> List[Variant]:
     """
     Returns a list of variants matching the input variant
@@ -155,7 +155,7 @@ def match_category_variant(
     """
     # disambiguate the reference to find all equivalent representations
     references: List[str] = []
-    if reference_class == "Feature":
+    if reference_class == 'Feature':
         references = convert_to_rid_list(
             get_equivalent_features(
                 conn,
@@ -167,14 +167,14 @@ def match_category_variant(
         )
         if not references:
             raise FeatureNotFoundError(
-                f"unable to find the gene ({reference_name}) or any equivalent representations"
+                f'unable to find the gene ({reference_name}) or any equivalent representations'
             )
-    if reference_class == "Signature":
+    if reference_class == 'Signature':
         references = convert_to_rid_list(
             get_equivalent_terms(
                 conn,
                 reference_name.lower(),
-                ontology_class="Signature",
+                ontology_class='Signature',
                 ignore_cache=ignore_cache,
             )
         )
@@ -185,24 +185,24 @@ def match_category_variant(
     )
 
     if not types:
-        raise ValueError(f"unable to find the term/category ({category}) or any equivalent")
+        raise ValueError(f'unable to find the term/category ({category}) or any equivalent')
 
     # find the variant list
     return cast(
         List[Variant],
         conn.query(
             {
-                "target": {
-                    "target": "CategoryVariant",
-                    "filters": [
-                        {"reference1": references, "operator": "IN"},
-                        {"type": types, "operator": "IN"},
+                'target': {
+                    'target': 'CategoryVariant',
+                    'filters': [
+                        {'reference1': references, 'operator': 'IN'},
+                        {'type': types, 'operator': 'IN'},
                     ],
                },
-                "queryType": "similarTo",
-                "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf", "GeneralizationOf"],
-                "treeEdges": ["Infers"],
-                "returnProperties": VARIANT_RETURN_PROPERTIES,
+                'queryType': 'similarTo',
+                'edges': ['AliasOf', 'DeprecatedBy', 'CrossReferenceOf', 'GeneralizationOf'],
+                'treeEdges': ['Infers'],
+                'returnProperties': VARIANT_RETURN_PROPERTIES,
             },
             ignore_cache=ignore_cache,
         ),
@@ -228,14 +228,14 @@ def match_copy_variant(
         List of variant records from GraphKB which match the input
     """
     if category not in INPUT_COPY_CATEGORIES.values():
-        raise ValueError(f"not a valid copy variant input category ({category})")
+        raise ValueError(f'not a valid copy variant input category ({category})')
 
     result = match_category_variant(
-        conn, gene_name, category, root_exclude_term="structural variant", **kwargs
+        conn, gene_name, category, root_exclude_term='structural variant', **kwargs
     )
 
     if drop_homozygous:
-        return [row for row in result if row["zygosity"] != "homozygous"]
+        return [row for row in result if row['zygosity'] != 'homozygous']
     return result
 
 
@@ -243,10 +243,10 @@ def match_expression_variant(
     conn: GraphKBConnection, gene_name: str, category: str, **kwargs
 ) -> List[Variant]:
     if category not in INPUT_EXPRESSION_CATEGORIES.values():
-        raise ValueError(f"not a valid expression variant input category ({category})")
+        raise ValueError(f'not a valid expression variant input category ({category})')
 
     return match_category_variant(
-        conn, gene_name, category, root_exclude_term="biological", **kwargs
+        conn, gene_name, category, root_exclude_term='biological', **kwargs
     )
 
 
@@ -270,19 +270,19 @@ def positions_overlap(
     Returns:
         bool: True if the positions overlap
     """
-    if pos_record.get("@class", "") == "CytobandPosition":
+    if pos_record.get('@class', '') == 'CytobandPosition':
         raise NotImplementedError(
-            "Position comparison for cytoband coordinates is not yet implemented"
+            'Position comparison for cytoband coordinates is not yet implemented'
         )
 
-    pos = pos_record.get("pos", None)
+    pos = pos_record.get('pos', None)
     if pos is None:
         return True
 
-    start = range_start.get("pos", None)
+    start = range_start.get('pos', None)
 
     if range_end:
-        end = range_end.get("pos", None)
+        end = range_end.get('pos', None)
 
         if start is not None and pos < start:
             return False
@@ -315,15 +315,15 @@ def equivalent_types(
 
     # Convert rid to displayName if needed
     if looks_like_rid(type1):
-        type1 = conn.get_records_by_id([type1])[0]["displayName"]
+        type1 = conn.get_records_by_id([type1])[0]['displayName']
     if looks_like_rid(type2):
-        type2 = conn.get_records_by_id([type2])[0]["displayName"]
+        type2 = conn.get_records_by_id([type2])[0]['displayName']
 
     # Get type terms from observed variant
     terms1 = []
     if strict:
         try:
-            terms1.append(get_term_by_name(conn, type1)["@rid"])
+            terms1.append(get_term_by_name(conn, type1)['@rid'])
        except Exception:
            pass
    else:
@@ -375,12 +375,12 @@ def compare_positional_variants(
     # For break1, check if positions are overlaping between the variant and the reference.
     # Continue only if True.
     if not positions_overlap(
-        cast(BasicPosition, variant["break1Start"]),
-        cast(BasicPosition, reference_variant["break1Start"]),
+        cast(BasicPosition, variant['break1Start']),
+        cast(BasicPosition, reference_variant['break1Start']),
         (
             None
-            if "break1End" not in reference_variant
-            else cast(BasicPosition, reference_variant["break1End"])
+            if 'break1End' not in reference_variant
+            else cast(BasicPosition, reference_variant['break1End'])
         ),
     ):
         return False
@@ -388,16 +388,16 @@ def compare_positional_variants(
     # For break2, check if positions are overlaping between the variant and the reference.
     # Continue only if True or no break2.
     # TODO: check for variant without break2 but reference_variant with one.
-    if variant.get("break2Start"):
-        if not reference_variant.get("break2Start"):
+    if variant.get('break2Start'):
+        if not reference_variant.get('break2Start'):
             return False
         if not positions_overlap(
-            cast(BasicPosition, variant["break2Start"]),
-            cast(BasicPosition, reference_variant["break2Start"]),
+            cast(BasicPosition, variant['break2Start']),
+            cast(BasicPosition, reference_variant['break2Start']),
             (
                 None
-                if "break2End" not in reference_variant
-                else cast(BasicPosition, reference_variant["break2End"])
+                if 'break2End' not in reference_variant
+                else cast(BasicPosition, reference_variant['break2End'])
             ),
         ):
             return False
@@ -405,47 +405,47 @@ def compare_positional_variants(
     # If both variants have untemplated sequence,
     # check for size and content.
     if (
-        variant.get("untemplatedSeq", None) is not None
-        and reference_variant.get("untemplatedSeq", None) is not None
+        variant.get('untemplatedSeq', None) is not None
+        and reference_variant.get('untemplatedSeq', None) is not None
     ):
         if (
-            variant.get("untemplatedSeqSize", None) is not None
-            and reference_variant.get("untemplatedSeqSize", None) is not None
+            variant.get('untemplatedSeqSize', None) is not None
+            and reference_variant.get('untemplatedSeqSize', None) is not None
         ):
-            if variant["untemplatedSeqSize"] != reference_variant["untemplatedSeqSize"]:
+            if variant['untemplatedSeqSize'] != reference_variant['untemplatedSeqSize']:
                 return False
 
         if (
-            reference_variant["untemplatedSeq"] is not None
-            and variant["untemplatedSeq"] is not None
+            reference_variant['untemplatedSeq'] is not None
+            and variant['untemplatedSeq'] is not None
         ):
             if (
-                reference_variant["untemplatedSeq"] not in AMBIGUOUS_AA
-                and variant["untemplatedSeq"] not in AMBIGUOUS_AA
+                reference_variant['untemplatedSeq'] not in AMBIGUOUS_AA
+                and variant['untemplatedSeq'] not in AMBIGUOUS_AA
             ):
-                if reference_variant["untemplatedSeq"].lower() != variant["untemplatedSeq"].lower():
+                if reference_variant['untemplatedSeq'].lower() != variant['untemplatedSeq'].lower():
                     return False
-            elif len(variant["untemplatedSeq"]) != len(reference_variant["untemplatedSeq"]):
+            elif len(variant['untemplatedSeq']) != len(reference_variant['untemplatedSeq']):
                 return False
 
     # If both variants have a reference sequence,
     # check if they are the same.
     if (
-        variant.get("refSeq", None) is not None
-        and reference_variant.get("refSeq", None) is not None
+        variant.get('refSeq', None) is not None
+        and reference_variant.get('refSeq', None) is not None
     ):
         if (
-            reference_variant["refSeq"] not in AMBIGUOUS_AA
-            and variant["refSeq"] not in AMBIGUOUS_AA
+            reference_variant['refSeq'] not in AMBIGUOUS_AA
+            and variant['refSeq'] not in AMBIGUOUS_AA
         ):
-            if reference_variant["refSeq"].lower() != variant["refSeq"].lower():  # type: ignore
+            if reference_variant['refSeq'].lower() != variant['refSeq'].lower():  # type: ignore
                 return False
-        elif len(variant["refSeq"]) != len(reference_variant["refSeq"]):  # type: ignore
+        elif len(variant['refSeq']) != len(reference_variant['refSeq']):  # type: ignore
             return False
 
     # Equivalent types
-    if variant.get("type") and reference_variant.get("type"):
-        if not equivalent_types(conn, variant["type"], reference_variant["type"]):
+    if variant.get('type') and reference_variant.get('type'):
+        if not equivalent_types(conn, variant['type'], reference_variant['type']):
             return False
     return True
@@ -500,38 +500,38 @@ def type_screening(
 
     # Will use either hardcoded type list or an updated list from the API
     if updateStructuralTypes:
-        rids = list(get_terms_set(conn, ["structural variant"]))
+        rids = list(get_terms_set(conn, ['structural variant']))
         records = conn.get_records_by_id(rids)
-        structuralVariantTypes = [el["name"] for el in records]
+        structuralVariantTypes = [el['name'] for el in records]
 
     # Unambiguous non-structural variation type
-    if parsed["type"] not in structuralVariantTypes:
-        return parsed["type"]
+    if parsed['type'] not in structuralVariantTypes:
+        return parsed['type']
 
     # Unambiguous structural variation type
-    if parsed["type"] in ["fusion", "translocation"]:
-        return parsed["type"]
-    if parsed.get("reference2", None):
-        return parsed["type"]
-    prefix = parsed.get("prefix", "g")
-    if prefix == "y":  # Assuming all variations using cytoband coordiantes meet the size threshold
-        return parsed["type"]
+    if parsed['type'] in ['fusion', 'translocation']:
+        return parsed['type']
+    if parsed.get('reference2', None):
+        return parsed['type']
+    prefix = parsed.get('prefix', 'g')
+    if prefix == 'y':  # Assuming all variations using cytoband coordinates meet the size threshold
+        return parsed['type']
 
     # When size cannot be determined: exonic and intronic coordinates
     # e.g. "MET:e.14del" meaning "Any deletion occuring at the 14th exon"
-    if prefix in ["e", "i"]:  # Assuming they don't meet the size threshold
+    if prefix in ['e', 'i']:  # Assuming they don't meet the size threshold
         return default_type
 
     # When size is given
-    if (parsed.get("untemplatedSeqSize") or 0) >= threshold:
-        return parsed["type"]
+    if (parsed.get('untemplatedSeqSize') or 0) >= threshold:
+        return parsed['type']
 
     # When size needs to be computed from positions
-    pos_start: int = parsed.get("break1Start", {}).get("pos", 1)  # type: ignore
-    pos_end: int = parsed.get("break2Start", {}).get("pos", pos_start)  # type: ignore
-    pos_size = 3 if prefix == "p" else 1
+    pos_start: int = parsed.get('break1Start', {}).get('pos', 1)  # type: ignore
+    pos_end: int = parsed.get('break2Start', {}).get('pos', pos_start)  # type: ignore
+    pos_size = 3 if prefix == 'p' else 1
     if ((pos_end - pos_start) + 1) * pos_size >= threshold:
-        return parsed["type"]
+        return parsed['type']
 
     # Default
     return default_type
@@ -543,7 +543,7 @@ def match_positional_variant(
     reference1: Optional[str] = None,
     reference2: Optional[str] = None,
     gene_is_source_id: bool = False,
-    gene_source: str = "",
+    gene_source: str = '',
     ignore_cache: bool = False,
     updateStructuralTypes: bool = False,
 ) -> List[Variant]:
@@ -590,21 +590,21 @@ def match_positional_variant(
     # parse the representation
     parsed = conn.parse(variant_string, not (reference1 or reference2))
 
-    if "break1End" in parsed or "break2End" in parsed:  # uncertain position
+    if 'break1End' in parsed or 'break2End' in parsed:  # uncertain position
         raise NotImplementedError(
-            f"Matching does not support uncertain positions ({variant_string}) as input"
+            f'Matching does not support uncertain positions ({variant_string}) as input'
         )
     if reference2 and not reference1:
-        raise ValueError("cannot specify reference2 without reference1")
+        raise ValueError('cannot specify reference2 without reference1')
 
     # disambiguate the gene name
     if reference1:
         gene1 = reference1
-        if "reference1" in parsed:
+        if 'reference1' in parsed:
             raise ValueError(
-                "Cannot specify reference1 explicitly as well as in the variant notation"
+                'Cannot specify reference1 explicitly as well as in the variant notation'
             )
     else:
-        gene1 = parsed["reference1"]
+        gene1 = parsed['reference1']
 
     gene1_features = get_equivalent_features(
         conn, gene1, source=gene_source, is_source_id=gene_is_source_id, ignore_cache=ignore_cache
@@ -613,7 +613,7 @@ def match_positional_variant(
 
     if not features:
         raise FeatureNotFoundError(
-            f"unable to find the gene ({gene1}) or any equivalent representations"
+            f'unable to find the gene ({gene1}) or any equivalent representations'
         )
 
     secondary_features = None
@@ -621,20 +621,20 @@ def match_positional_variant(
     gene2: Optional[str] = None
     if reference2:
         gene2 = reference2
-        if "reference2" in parsed:
+        if 'reference2' in parsed:
             raise ValueError(
-                "Cannot specify reference2 explicitly as well as in the variant notation"
+                'Cannot specify reference2 explicitly as well as in the variant notation'
             )
-        elif "reference1" in parsed:
+        elif 'reference1' in parsed:
             raise ValueError(
-                "variant notation cannot contain features when explicit features are given"
+                'variant notation cannot contain features when explicit features are given'
             )
     elif (
-        "reference2" in parsed
-        and parsed.get("reference2", "?") != "?"
-        and parsed["reference2"] is not None
+        'reference2' in parsed
+        and parsed.get('reference2', '?') != '?'
+        and parsed['reference2'] is not None
     ):
-        gene2 = parsed["reference2"]
+        gene2 = parsed['reference2']
 
     if gene2:
         gene2_features = get_equivalent_features(
@@ -647,14 +647,14 @@ def match_positional_variant(
         secondary_features = convert_to_rid_list(gene2_features)
         if not secondary_features:
             raise FeatureNotFoundError(
-                f"unable to find the gene ({gene2}) or any equivalent representations"
+                f'unable to find the gene ({gene2}) or any equivalent representations'
             )
 
     # match the existing mutations (positional)
     query_filters = [
-        {"reference1": features},
-        {"reference2": secondary_features},
-        {"break1Start.@class": parsed["break1Start"]["@class"]},
+        {'reference1': features},
+        {'reference2': secondary_features},
+        {'break1Start.@class': parsed['break1Start']['@class']},
     ]
 
     filtered_similarOnly: List[Record] = []  # For post filter match use
@@ -663,7 +663,7 @@ def match_positional_variant(
     for row in cast(
         List[Record],
         conn.query(
-            {"target": "PositionalVariant", "filters": query_filters}, ignore_cache=ignore_cache
+            {'target': 'PositionalVariant', 'filters': query_filters}, ignore_cache=ignore_cache
         ),
     ):
         # TODO: Check if variant and reference_variant should be interchanged
@@ -688,11 +688,11 @@ def match_positional_variant(
     matches.extend(
         conn.query(
             {
-                "target": convert_to_rid_list(filtered_similarOnly),
-                "queryType": "similarTo",
-                "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf", "GeneralizationOf"],
-                "treeEdges": ["Infers"],
-                "returnProperties": POS_VARIANT_RETURN_PROPERTIES,
+                'target': convert_to_rid_list(filtered_similarOnly),
+                'queryType': 'similarTo',
+                'edges': ['AliasOf', 'DeprecatedBy', 'CrossReferenceOf', 'GeneralizationOf'],
+                'treeEdges': ['Infers'],
+                'returnProperties': POS_VARIANT_RETURN_PROPERTIES,
            },
            ignore_cache=ignore_cache,
        )
@@ -705,7 +705,7 @@ def match_positional_variant(
     variant_types_details = get_equivalent_terms(
         conn,
         screened_type,
-        root_exclude_term="mutation" if secondary_features else "",
+        root_exclude_term='mutation' if secondary_features else '',
         ignore_cache=ignore_cache,
     )
 
@@ -714,18 +714,18 @@ def match_positional_variant(
     matches.extend(
         conn.query(
             {
-                "target": {
-                    "target": "CategoryVariant",
-                    "filters": [
-                        {"reference1": features},
-                        {"type": types},
-                        {"reference2": secondary_features},
+                'target': {
+                    'target': 'CategoryVariant',
+                    'filters': [
+                        {'reference1': features},
+                        {'type': types},
+                        {'reference2': secondary_features},
                     ],
                 },
-                "queryType": "similarTo",
-                "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf"],
-                "treeEdges": ["Infers"],
-                "returnProperties": POS_VARIANT_RETURN_PROPERTIES,
+                'queryType': 'similarTo',
+                'edges': ['AliasOf', 'DeprecatedBy', 'CrossReferenceOf'],
+                'treeEdges': ['Infers'],
+                'returnProperties': POS_VARIANT_RETURN_PROPERTIES,
            },
            ignore_cache=ignore_cache,
        )
@@ -739,18 +739,18 @@ def cat_variant_query(
         matches.extend(
             conn.query(
                 {
-                    "target": {
-                        "target": "CategoryVariant",
-                        "filters": [
-                            {"reference1": cat_features},
-                            {"type": cat_types},
-                            {"reference2": cat_secondary_features},
+                    'target': {
+                        'target': 'CategoryVariant',
+                        'filters': [
+                            {'reference1': cat_features},
+                            {'type': cat_types},
+                            {'reference2': cat_secondary_features},
                         ],
                     },
-                    "queryType": "similarTo",
-                    "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf"],
-                    "treeEdges": [],
-                    "returnProperties": VARIANT_RETURN_PROPERTIES,
+                    'queryType': 'similarTo',
+                    'edges': ['AliasOf', 'DeprecatedBy', 'CrossReferenceOf'],
+                    'treeEdges': [],
+                    'returnProperties': VARIANT_RETURN_PROPERTIES,
                 },
                 ignore_cache=ignore_cache,
             )
@@ -768,10 +768,10 @@ def cat_variant_query(
     matches.extend(
         conn.query(
             {
-                "target": convert_to_rid_list(filtered_similarAndGeneric),
-                "queryType": "descendants",
-                "edges": [],
-                "returnProperties": POS_VARIANT_RETURN_PROPERTIES,
+                'target': convert_to_rid_list(filtered_similarAndGeneric),
+                'queryType': 'descendants',
+                'edges': [],
+                'returnProperties': POS_VARIANT_RETURN_PROPERTIES,
            },
            ignore_cache=ignore_cache,
        )
@@ -779,6 +779,6 @@ def cat_variant_query(
 
     result: Dict[str, Variant] = {}
     for row in matches:
-        result[row["@rid"]] = cast(Variant, row)
+        result[row['@rid']] = cast(Variant, row)
 
     return list(result.values())
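None of the matching logic above changes behaviour; for reference, the typical call shape (placeholder connection details; KRAS:p.G12D is the usual GraphKB example notation):

```python
from pori_python.graphkb import GraphKBConnection
from pori_python.graphkb.match import match_positional_variant

conn = GraphKBConnection(url='https://your-graphkb-instance/api')  # placeholder URL
conn.login(username='user', password='pass')  # placeholder credentials

# Parses the HGVS-like string, resolves equivalent features and types, then
# matches against PositionalVariant and CategoryVariant records as above.
for variant in match_positional_variant(conn, 'KRAS:p.G12D'):
    print(variant['@rid'], variant['displayName'])
```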
diff --git a/pori_python/graphkb/statement.py b/pori_python/graphkb/statement.py
index 24246b91..3f077ee1 100644
--- a/pori_python/graphkb/statement.py
+++ b/pori_python/graphkb/statement.py
@@ -20,7 +20,7 @@ def categorize_relevance(
         term_set = get_terms_set(graphkb_conn, base_terms)
         if relevance_rid in term_set:
             return category
-    return ""
+    return ''
 
 
 def get_statements_from_variants(
@@ -38,11 +38,11 @@ def get_statements_from_variants(
     """
     statements = graphkb_conn.query(
         {
-            "target": "Statement",
-            "filters": {"conditions": convert_to_rid_list(variants), "operator": "CONTAINSANY"},
-            "returnProperties": STATEMENT_RETURN_PROPERTIES,
+            'target': 'Statement',
+            'filters': {'conditions': convert_to_rid_list(variants), 'operator': 'CONTAINSANY'},
+            'returnProperties': STATEMENT_RETURN_PROPERTIES,
        }
    )
 
    if not failed_review:
-        statements = [s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS]
+        statements = [s for s in statements if s.get('reviewStatus') != FAILED_REVIEW_STATUS]
 
    return [cast(Statement, s) for s in statements]
diff --git a/pori_python/graphkb/util.py b/pori_python/graphkb/util.py
index 2ff8620c..23c28963 100644
--- a/pori_python/graphkb/util.py
+++ b/pori_python/graphkb/util.py
@@ -14,14 +14,14 @@
 
 from pori_python.types import ParsedVariant, PositionalVariant, Record
 
-from .constants import DEFAULT_LIMIT, DEFAULT_URL, TYPES_TO_NOTATION, AA_3to1_MAPPING
+from .constants import DEFAULT_LIMIT, TYPES_TO_NOTATION, AA_3to1_MAPPING
 
 QUERY_CACHE: Dict[Any, Any] = {}
 
 # name the logger after the package to make it simple to disable for packages using this one as a dependency
 # https://stackoverflow.com/questions/11029717/how-do-i-disable-log-messages-from-the-requests-library
-logger = logging.getLogger("graphkb")
+logger = logging.getLogger('graphkb')
 
 
 def convert_to_rid_list(records: Iterable[Record]) -> List[str]:
@@ -31,7 +31,7 @@ def convert_to_rid_list(records: Iterable[Record]) -> List[str]:
         if isinstance(record, str):
             result.append(record)  # assume an @rid string
         else:
-            result.append(record["@rid"])
+            result.append(record['@rid'])
     return result
 
 
@@ -41,7 +41,7 @@ class FeatureNotFoundError(Exception):
 
 def looks_like_rid(rid: str) -> bool:
     """Check if an input string looks like a GraphKB ID."""
-    if re.match(r"^#-?\d+:-?\d+$", rid):
+    if re.match(r'^#-?\d+:-?\d+$', rid):
         return True
     return False
 
@@ -50,15 +50,15 @@ def convert_aa_3to1(three_letter_notation: str) -> str:
     """Convert an Input string from 3 letter AA notation to 1 letter AA notation."""
     result = []
 
-    if ":" in three_letter_notation:
+    if ':' in three_letter_notation:
         # do not include the feature/gene in replacements
-        pos = three_letter_notation.index(":")
+        pos = three_letter_notation.index(':')
         result.append(three_letter_notation[: pos + 1])
         three_letter_notation = three_letter_notation[pos + 1 :]
 
     last_match_end = 0  # exclusive interval [ )
 
-    for match in re.finditer(r"[A-Z][a-z][a-z]", three_letter_notation):
+    for match in re.finditer(r'[A-Z][a-z][a-z]', three_letter_notation):
         # add the in-between string
         result.append(three_letter_notation[last_match_end : match.start()])
         text = three_letter_notation[match.start() : match.end()]
@@ -66,7 +66,7 @@ def convert_aa_3to1(three_letter_notation: str) -> str:
         last_match_end = match.end()
 
     result.append(three_letter_notation[last_match_end:])
-    return "".join(result)
+    return ''.join(result)
 
 
 def join_url(base_url: str, *parts) -> str:
@@ -74,9 +74,9 @@ def join_url(base_url: str, *parts) -> str:
     if not parts:
         return base_url
 
-    url = [base_url.rstrip("/")] + [part.strip("/") for part in parts]
+    url = [base_url.rstrip('/')] + [part.strip('/') for part in parts]
 
-    return "/".join(url)
+    return '/'.join(url)
 
 
 def millis_interval(start: datetime, end: datetime) -> int:
@@ -91,16 +91,16 @@ def millis_interval(start: datetime, end: datetime) -> int:
 def cache_key(request_body) -> str:
     """Create a cache key for a query request to GraphKB."""
     body = json.dumps(request_body, sort_keys=True)
-    hash_code = hashlib.md5(f"/query{body}".encode("utf-8")).hexdigest()
+    hash_code = hashlib.md5(f'/query{body}'.encode('utf-8')).hexdigest()
     return hash_code
 
 
 class GraphKBConnection:
     def __init__(
         self,
-        url: str = os.environ.get("GRAPHKB_URL", DEFAULT_URL),
-        username: str = "",
-        password: str = "",
+        url: str = os.environ.get('GRAPHKB_URL'),
+        username: str = '',
+        password: str = '',
         use_global_cache: bool = True,
     ):
         self.http = requests.Session()
@@ -111,13 +111,13 @@ def __init__(
             backoff_factor=5,
             status_forcelist=[429, 500, 502, 503, 504],
         )
-        self.http.mount("https://", HTTPAdapter(max_retries=retries))
-        self.token = ""
-        self.token_kc = ""
+        self.http.mount('https://', HTTPAdapter(max_retries=retries))
+        self.token = ''
+        self.token_kc = ''
         self.url = url
         self.username = username
         self.password = password
-        self.headers = {"Accept": "application/json", "Content-Type": "application/json"}
+        self.headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
         self.cache: Dict[Any, Any] = {} if not use_global_cache else QUERY_CACHE
         self.request_count = 0
         self.first_request: Optional[datetime] = None
@@ -125,6 +125,10 @@ def __init__(
         if username and password:
             self.login(username=username, password=password)
 
+        # URL check
+        if not self.url:
+            raise ValueError('URL to a GraphKB API instance is required')
+
     @property
     def load(self) -> Optional[float]:
         if self.first_request and self.last_request:
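The removed DEFAULT_URL fallback makes this the breaking change of the PR: constructing a connection with no url argument and no GRAPHKB_URL in the environment now raises. A sketch of the two supported configurations (the host below is a placeholder):

```python
from pori_python.graphkb import GraphKBConnection

# Option 1: export GRAPHKB_URL before the interpreter imports this package;
# the default url is captured from the environment at import time.
conn = GraphKBConnection()

# Option 2: pass the URL explicitly. With neither provided, __init__ raises
# ValueError('URL to a GraphKB API instance is required').
conn = GraphKBConnection(url='https://your-graphkb-instance/api')  # placeholder host
```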
Args: @@ -151,7 +155,7 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: # don't want to use a read timeout if the request is not idempotent # otherwise you may wind up making unintended changes timeout = None - if endpoint in ["query", "parse"]: + if endpoint in ['query', 'parse']: timeout = (connect_timeout, read_timeout) start_time = datetime.now() @@ -170,7 +174,6 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: if attempt > 0: time.sleep(2) # wait between retries try: - if need_refresh_login: self.refresh_login() need_refresh_login = False @@ -180,7 +183,7 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: method, url, headers=self.headers, timeout=timeout, **kwargs ) if resp.status_code == 401 or resp.status_code == 403: - logger.debug(f"/{endpoint} - {resp.status_code} - retrying") + logger.debug(f'/{endpoint} - {resp.status_code} - retrying') # try to re-login if the token expired need_refresh_login = True continue @@ -188,14 +191,14 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: break except (requests.exceptions.ConnectionError, OSError) as err: if attempt < len(attempts) - 1: - logger.debug(f"/{endpoint} - {str(err)} - retrying") + logger.debug(f'/{endpoint} - {str(err)} - retrying') continue raise err except Exception as err2: raise err2 timing = millis_interval(start_time, datetime.now()) - logger.debug(f"/{endpoint} - {resp.status_code} - {timing} ms") # type: ignore + logger.debug(f'/{endpoint} - {resp.status_code} - {timing} ms') # type: ignore try: resp.raise_for_status() @@ -203,7 +206,7 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: # try to get more error details message = str(err) try: - message += " " + resp.json()["message"] + message += ' ' + resp.json()['message'] except Exception: pass @@ -213,7 +216,7 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: def post(self, uri: str, data: Dict = {}, **kwargs) -> Dict: """Convenience method for making post requests.""" - return self.request(uri, method="POST", data=json.dumps(data), **kwargs) + return self.request(uri, method='POST', data=json.dumps(data), **kwargs) def login_demo(self) -> None: """ @@ -223,26 +226,26 @@ def login_demo(self) -> None: 2. get a second token from the GraphKB API using keyCloakToken; self.login() """ url_parts = urlsplit(self.url) - base_url = f"{url_parts.scheme}://{url_parts.netloc}" + base_url = f'{url_parts.scheme}://{url_parts.netloc}' try: resp = requests.request( - url=f"{base_url}/auth/realms/PORI/protocol/openid-connect/token", - method="POST", + url=f'{base_url}/auth/realms/PORI/protocol/openid-connect/token', + method='POST', data={ - "client_id": "GraphKB", - "grant_type": "password", - "password": self.password, - "username": self.username, + 'client_id': 'GraphKB', + 'grant_type': 'password', + 'password': self.password, + 'username': self.username, }, ) except Exception as err: - logger.debug(f"unable to fetch a token from KeyCloak: {err}") + logger.debug(f'unable to fetch a token from KeyCloak: {err}') raise err resp.raise_for_status() content = resp.json() - self.token_kc = content["access_token"] + self.token_kc = content['access_token'] def login(self, username: str, password: str, pori_demo: bool = False) -> None: self.username = username @@ -251,7 +254,8 @@ def login(self, username: str, password: str, pori_demo: bool = False) -> None: read_timeout = 61 # KBDEV-1328. Alt. 
GraphKB login for GSC's PORI online demo
-        if pori_demo or "pori-demo" in self.url:
+        if pori_demo or 'pori-demo' in self.url:
+            logger.warning('login demo')
             self.login_demo()

         # use requests package directly to avoid recursion loop on login failure
@@ -262,29 +266,29 @@ def login(self, username: str, password: str, pori_demo: bool = False) -> None:
             try:
                 self.request_count += 1
                 resp = requests.request(
-                    url=f"{self.url}/token",
-                    method="POST",
+                    url=f'{self.url}/token',
+                    method='POST',
                     headers=self.headers,
                     timeout=(connect_timeout, read_timeout),
                     data=json.dumps(
                         # KBDEV-1328. Alt. GraphKB login for GSC's PORI online demo
-                        {"keyCloakToken": self.token_kc}
+                        {'keyCloakToken': self.token_kc}
                         if self.token_kc
-                        else {"username": username, "password": password}
+                        else {'username': username, 'password': password}
                     ),
                 )
                 break
             except (requests.exceptions.ConnectionError, OSError) as err:
                 if attempt < len(attempts) - 1:
-                    logger.debug(f"/login - {str(err)} - retrying")
+                    logger.debug(f'/login - {str(err)} - retrying')
                     continue
                 raise err
             except Exception as err2:
                 raise err2

         resp.raise_for_status()
         content = resp.json()
-        self.token = content["kbToken"]
-        self.headers["Authorization"] = self.token
+        self.token = content['kbToken']
+        self.headers['Authorization'] = self.token

     def refresh_login(self) -> None:
         self.login(self.username, self.password)
@@ -306,7 +310,7 @@ def query(
         Query GraphKB
         """
         result: List[Record] = []
-        hash_code = ""
+        hash_code = ''

         if not ignore_cache and paginate:
             hash_code = cache_key(request_body)
@@ -314,8 +318,8 @@ def query(
                 return self.cache[hash_code]

         while True:
-            content = self.post("query", data={**request_body, "limit": limit, "skip": len(result)})
-            records = content["result"]
+            content = self.post('query', data={**request_body, 'limit': limit, 'skip': len(result)})
+            records = content['result']
             result.extend(records)

             if len(records) < limit or not paginate:
                 break
@@ -326,17 +330,17 @@ def query(

     def parse(self, hgvs_string: str, requireFeatures: bool = False) -> ParsedVariant:
         content = self.post(
-            "parse", data={"content": hgvs_string, "requireFeatures": requireFeatures}
+            'parse', data={'content': hgvs_string, 'requireFeatures': requireFeatures}
         )
-        return cast(ParsedVariant, content["result"])
+        return cast(ParsedVariant, content['result'])

     def get_records_by_id(self, record_ids: List[str]) -> List[Record]:
         if not record_ids:
             return []
-        result = self.query({"target": record_ids})
+        result = self.query({'target': record_ids})
         if len(record_ids) != len(result):
             raise AssertionError(
-                f"The number of Ids given ({len(record_ids)}) does not match the number of records fetched ({len(result)})"
+                f'The number of Ids given ({len(record_ids)}) does not match the number of records fetched ({len(result)})'
             )
         return result
@@ -345,9 +349,9 @@ def get_record_by_id(self, record_id: str) -> Record:
         return result[0]

     def get_source(self, name: str) -> Record:
-        source = self.query({"target": "Source", "filters": {"name": name}})
+        source = self.query({'target': 'Source', 'filters': {'name': name}})
         if len(source) != 1:
-            raise AssertionError(f"Unable to unqiuely identify source with name {name}")
+            raise AssertionError(f'Unable to uniquely identify source with name {name}')
         return source[0]

@@ -367,27 +371,27 @@ def get_rid(conn: GraphKBConnection, target: str, name: str) -> str:
         AssertionError: if the term was not found or more than 1 match was found (expected to be unique)
     """
     result = conn.query(
-        {"target": target, "filters": {"name": name}, "returnProperties": 
["@rid"]}, + {'target': target, 'filters': {'name': name}, 'returnProperties': ['@rid']}, ignore_cache=False, ) assert len(result) == 1, f"unable to find unique '{target}' ID for '{name}'" - return result[0]["@rid"] + return result[0]['@rid'] def stripParentheses(breakRepr: str) -> str: - match = re.search(r"^([a-z])\.\((.+)\)$", breakRepr) + match = re.search(r'^([a-z])\.\((.+)\)$', breakRepr) if match: - return f"{match.group(1)}.{match.group(2)}" + return f'{match.group(1)}.{match.group(2)}' return breakRepr def stripRefSeq(breakRepr: str) -> str: # 1 leading RefSeq - match = re.search(r"^([a-z])\.([A-Z]*|\?)([0-9]*[A-Z]*)$", breakRepr) + match = re.search(r'^([a-z])\.([A-Z]*|\?)([0-9]*[A-Z]*)$', breakRepr) if match: - return f"{match.group(1)}.{match.group(3)}" + return f'{match.group(1)}.{match.group(3)}' # TODO: Deal with cases like "p.?889_?890", "chr4:g.55593604_55593605delGGinsTT", ... @@ -395,27 +399,27 @@ def stripRefSeq(breakRepr: str) -> str: def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = True) -> str: - match = re.search(r"^(.*)(\:)(.*)$", displayName) + match = re.search(r'^(.*)(\:)(.*)$', displayName) if match and not withRef: if withRefSeq: return match.group(3) displayName = match.group(2) + match.group(3) - match = re.search(r"^(.*\:)([a-z]\.)(.*)$", displayName) + match = re.search(r'^(.*\:)([a-z]\.)(.*)$', displayName) if match and not withRefSeq: - ref: str = match.group(1) if match.group(1) != ":" else "" + ref: str = match.group(1) if match.group(1) != ':' else '' prefix: str = match.group(2) rest: str = match.group(3) new_matches: Union[bool, object] = True # refSeq before position while new_matches: - new_matches = re.search(r"(.*)([A-Z]|\?)([0-9]+)(.*)", rest) + new_matches = re.search(r'(.*)([A-Z]|\?)([0-9]+)(.*)', rest) if new_matches: rest = new_matches.group(1) + new_matches.group(3) + new_matches.group(4) # refSeq before '>' - new_matches = re.search(r"^([0-9]*)([A-Z]*|\?)(\>)(.*)$", rest) + new_matches = re.search(r'^([0-9]*)([A-Z]*|\?)(\>)(.*)$', rest) if new_matches: rest = new_matches.group(1) + new_matches.group(3) + new_matches.group(4) @@ -442,18 +446,18 @@ def stringifyVariant( str: The string representation """ - displayName: str = variant.get("displayName") or "" # type: ignore + displayName: str = variant.get('displayName') or '' # type: ignore # If variant is a PositionalVariant (i.e. variant with a displayName) and # we already have the appropriate string representation, # then return it right away - if displayName != "" and (withRef and withRefSeq): + if displayName != '' and (withRef and withRefSeq): return displayName # If variant is a PositionalVariant (i.e. variant with a displayName) and # we DO NOT have the appropriate string representation, # then strip unwanted features, then return it right away - if displayName != "": + if displayName != '': return stripDisplayName(displayName, withRef, withRefSeq) # If variant is a ParsedVariant (i.e. 
variant without a displayName yet), @@ -464,106 +468,106 @@ def stringifyVariant( result: List[str] = [] # Extracting parsed values into individual variables - break1Repr: str = str(parsed.get("break1Repr", "")) - break2Repr: str = str(parsed.get("break2Repr", "")) - multiFeature: bool = bool(parsed.get("multiFeature")) - noFeatures: bool = bool(parsed.get("noFeatures")) - notationType: str = str(parsed.get("notationType", "")) - reference1: str = "" - if ref1 := parsed.get("reference1"): + break1Repr: str = str(parsed.get('break1Repr', '')) + break2Repr: str = str(parsed.get('break2Repr', '')) + multiFeature: bool = bool(parsed.get('multiFeature')) + noFeatures: bool = bool(parsed.get('noFeatures')) + notationType: str = str(parsed.get('notationType', '')) + reference1: str = '' + if ref1 := parsed.get('reference1'): if isinstance(ref1, str): reference1 = ref1 else: - reference1 = ref1.get("displayName", str(ref1)) - reference2: str = "" - if ref2 := parsed.get("reference2"): + reference1 = ref1.get('displayName', str(ref1)) + reference2: str = '' + if ref2 := parsed.get('reference2'): if isinstance(ref2, str): reference2 = ref2 else: - reference2 = ref2.get("displayName", str(ref2)) - refSeq: str = parsed.get("refSeq") or "" - truncation: int = parsed.get("truncation") or 0 # type: ignore - variantType: str = parsed.get("type", "") - untemplatedSeq: str = parsed.get("untemplatedSeq") or "" - untemplatedSeqSize: int = parsed.get("untemplatedSeqSize") or 0 + reference2 = ref2.get('displayName', str(ref2)) + refSeq: str = parsed.get('refSeq') or '' + truncation: int = parsed.get('truncation') or 0 # type: ignore + variantType: str = parsed.get('type', '') + untemplatedSeq: str = parsed.get('untemplatedSeq') or '' + untemplatedSeqSize: int = parsed.get('untemplatedSeqSize') or 0 # formating notationType if not notationType: - notationType = TYPES_TO_NOTATION.get(variantType, re.sub(r"\s", "-", variantType)) + notationType = TYPES_TO_NOTATION.get(variantType, re.sub(r'\s', '-', variantType)) # If multiFeature - if multiFeature or (reference2 != "" and reference1 != reference2): + if multiFeature or (reference2 != '' and reference1 != reference2): if withRef and not noFeatures: - result.append(f"({reference1}:{reference2})") + result.append(f'({reference1}:{reference2})') result.append(notationType) if withRefSeq: break1Repr_noParentheses = stripParentheses(break1Repr) break2Repr_noParentheses = stripParentheses(break2Repr) - result.append(f"({break1Repr_noParentheses},{break2Repr_noParentheses})") + result.append(f'({break1Repr_noParentheses},{break2Repr_noParentheses})') else: break1Repr_noParentheses_noRefSeq = stripRefSeq(stripParentheses(break1Repr)) break2Repr_noParentheses_noRefSeq = stripRefSeq(stripParentheses(break2Repr)) result.append( - f"({break1Repr_noParentheses_noRefSeq},{break2Repr_noParentheses_noRefSeq})" + f'({break1Repr_noParentheses_noRefSeq},{break2Repr_noParentheses_noRefSeq})' ) - if untemplatedSeq != "": + if untemplatedSeq != '': result.append(untemplatedSeq) elif untemplatedSeqSize: result.append(str(untemplatedSeqSize)) - return "".join(result) + return ''.join(result) # Continuous notation... 
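(Illustration, not part of the diff: the multi-feature branch above emits fusion-style strings, while the continuous branch below assembles a single-reference HGVS-like string. A rough sketch with invented field values; the notation-type mapping is assumed here, not read from TYPES_TO_NOTATION.)

```python
# Sketch only: invented inputs for stringifyVariant's continuous branch.
parsed = {
    'reference1': 'KRAS',         # hypothetical gene
    'break1Repr': 'p.G12',        # includes the leading reference residue 'G'
    'break2Repr': '',
    'untemplatedSeq': 'D',
    'type': 'missense mutation',  # assumed to map to the 'mis' notation type
}
# withRef=True, withRefSeq=True  -> 'KRAS:' + 'p.G12' + 'D' == 'KRAS:p.G12D'
# withRef=False                  -> the leading 'KRAS:' is omitted
# withRefSeq=False               -> stripRefSeq() drops the 'G' before the position
```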
# Reference if withRef and not noFeatures: - result.append(f"{reference1}:") + result.append(f'{reference1}:') # BreakRep if withRefSeq: result.append(break1Repr) - if break2Repr != "": - result.append(f"_{break2Repr[2:]}") + if break2Repr != '': + result.append(f'_{break2Repr[2:]}') else: result.append(stripRefSeq(break1Repr)) - if break2Repr != "": - result.append(f"_{stripRefSeq(break2Repr)[2:]}") + if break2Repr != '': + result.append(f'_{stripRefSeq(break2Repr)[2:]}') # refSeq, truncation, notationType, untemplatedSeq, untemplatedSeqSize - if any(i in notationType for i in ["ext", "fs"]) or ( - notationType == ">" and break1Repr.startswith("p.") + if any(i in notationType for i in ['ext', 'fs']) or ( + notationType == '>' and break1Repr.startswith('p.') ): result.append(untemplatedSeq) - if notationType == "mis" and break1Repr.startswith("p."): + if notationType == 'mis' and break1Repr.startswith('p.'): result.append(untemplatedSeq) - elif notationType != ">": - if notationType == "delins": + elif notationType != '>': + if notationType == 'delins': if withRefSeq: - result.append(f"del{refSeq}ins") + result.append(f'del{refSeq}ins') else: - result.append("delins") + result.append('delins') else: result.append(notationType) if truncation and truncation != 1: if truncation < 0: result.append(str(truncation)) else: - result.append(f"*{truncation}") - if any(i in notationType for i in ["dup", "del", "inv"]): + result.append(f'*{truncation}') + if any(i in notationType for i in ['dup', 'del', 'inv']): if withRefSeq: result.append(refSeq) - if any(i in notationType for i in ["ins", "delins"]): - if untemplatedSeq != "": + if any(i in notationType for i in ['ins', 'delins']): + if untemplatedSeq != '': result.append(untemplatedSeq) elif untemplatedSeqSize: result.append(str(untemplatedSeqSize)) - elif not break1Repr.startswith("p."): + elif not break1Repr.startswith('p.'): if withRefSeq: - refSeq = refSeq if refSeq != "" else "?" + refSeq = refSeq if refSeq != '' else '?' else: - refSeq = "" - untemplatedSeq = untemplatedSeq if untemplatedSeq != "" else "?" - result.append(f"{refSeq}{notationType}{untemplatedSeq}") + refSeq = '' + untemplatedSeq = untemplatedSeq if untemplatedSeq != '' else '?' 
+ result.append(f'{refSeq}{notationType}{untemplatedSeq}') # TODO: Deal with more complexes cases like 'MED12:p.(?34_?68)mut' - return "".join(result) + return ''.join(result) diff --git a/pori_python/graphkb/vocab.py b/pori_python/graphkb/vocab.py index 26033e75..e9242a7a 100644 --- a/pori_python/graphkb/vocab.py +++ b/pori_python/graphkb/vocab.py @@ -7,14 +7,14 @@ def query_by_name(ontology_class: str, base_term_name: str) -> Dict: - return {"target": ontology_class, "filters": {"name": base_term_name}} + return {'target': ontology_class, 'filters': {'name': base_term_name}} def get_equivalent_terms( conn: GraphKBConnection, base_term_name: str, - root_exclude_term: str = "", - ontology_class: str = "Vocabulary", + root_exclude_term: str = '', + ontology_class: str = 'Vocabulary', ignore_cache: bool = False, build_base_query: Callable = query_by_name, ) -> List[Ontology]: @@ -32,10 +32,10 @@ def get_equivalent_terms( List[Ontology], conn.query( { - "target": {"target": base_records, "queryType": "descendants"}, - "queryType": "similarTo", - "treeEdges": [], - "returnProperties": ["sourceId", "sourceIdVersion", "deprecated", "name", "@rid"], + 'target': {'target': base_records, 'queryType': 'descendants'}, + 'queryType': 'similarTo', + 'treeEdges': [], + 'returnProperties': ['sourceId', 'sourceIdVersion', 'deprecated', 'name', '@rid'], }, ignore_cache=ignore_cache, ), @@ -51,30 +51,30 @@ def get_equivalent_terms( convert_to_rid_list( conn.query( { - "target": {"target": root_records, "queryType": "descendants"}, - "queryType": "similarTo", - "treeEdges": [], - "returnProperties": [ - "sourceId", - "sourceIdVersion", - "deprecated", - "name", - "@rid", + 'target': {'target': root_records, 'queryType': 'descendants'}, + 'queryType': 'similarTo', + 'treeEdges': [], + 'returnProperties': [ + 'sourceId', + 'sourceIdVersion', + 'deprecated', + 'name', + '@rid', ], }, ignore_cache=ignore_cache, ) ) ) - return [term for term in base_term_parents if term["@rid"] not in exclude] + return [term for term in base_term_parents if term['@rid'] not in exclude] return base_term_parents def get_term_tree( conn: GraphKBConnection, base_term_name: str, - root_exclude_term: str = "", - ontology_class: str = "Vocabulary", + root_exclude_term: str = '', + ontology_class: str = 'Vocabulary', include_superclasses: bool = True, ignore_cache: bool = False, build_base_query: Callable = query_by_name, @@ -102,10 +102,10 @@ def get_term_tree( List[Ontology], conn.query( { - "target": {"target": base_records, "queryType": "ancestors"}, - "queryType": "similarTo", - "treeEdges": [], - "returnProperties": ["sourceId", "sourceIdVersion", "deprecated", "name", "@rid"], + 'target': {'target': base_records, 'queryType': 'ancestors'}, + 'queryType': 'similarTo', + 'treeEdges': [], + 'returnProperties': ['sourceId', 'sourceIdVersion', 'deprecated', 'name', '@rid'], }, ignore_cache=ignore_cache, ), @@ -126,7 +126,7 @@ def get_term_tree( terms = {} # merge the two lists for term in child_terms + parent_terms: - terms[term["@rid"]] = term + terms[term['@rid']] = term return list(terms.values()) @@ -134,7 +134,7 @@ def get_term_tree( def get_term_by_name( conn: GraphKBConnection, name: str, - ontology_class: str = "Vocabulary", + ontology_class: str = 'Vocabulary', ignore_cache: bool = False, **kwargs, ) -> Ontology: @@ -156,15 +156,15 @@ def get_term_by_name( """ result = conn.query( { - "target": ontology_class, - "filters": {"name": name}, - "returnProperties": [ - "sourceId", - "sourceIdVersion", - "deprecated", - "name", - 
"@rid", - "@class", + 'target': ontology_class, + 'filters': {'name': name}, + 'returnProperties': [ + 'sourceId', + 'sourceIdVersion', + 'deprecated', + 'name', + '@rid', + '@class', ], }, ignore_cache=ignore_cache, @@ -172,7 +172,7 @@ def get_term_by_name( ) if len(result) != 1: - raise AssertionError(f"unable to find term ({name}) by name") + raise AssertionError(f'unable to find term ({name}) by name') return cast(Ontology, result[0]) diff --git a/pori_python/ipr/annotate.py b/pori_python/ipr/annotate.py index 72ae7626..cd6478a3 100644 --- a/pori_python/ipr/annotate.py +++ b/pori_python/ipr/annotate.py @@ -43,16 +43,16 @@ def get_second_pass_variants( # second-pass matching all_inferred_matches: Dict[str, Variant] = {} inferred_variants = { - (s["subject"]["@rid"], s["relevance"]["name"]) + (s['subject']['@rid'], s['relevance']['name']) for s in statements - if s["subject"] and s["subject"]["@class"] in ("Feature", "Signature") + if s['subject'] and s['subject']['@class'] in ('Feature', 'Signature') } for reference1, variant_type in inferred_variants: variants = gkb_match.match_category_variant(graphkb_conn, reference1, variant_type) for variant in variants: - all_inferred_matches[variant["@rid"]] = variant + all_inferred_matches[variant['@rid']] = variant inferred_matches: List[Variant] = list(all_inferred_matches.values()) return inferred_matches @@ -70,7 +70,7 @@ def get_ipr_statements_from_variants( rows = [] statements = get_statements_from_variants(graphkb_conn, matches) - existing_statements = {s["@rid"] for s in statements} + existing_statements = {s['@rid'] for s in statements} for ipr_row in convert_statements_to_alterations( graphkb_conn, statements, disease_matches, convert_to_rid_set(matches) @@ -83,7 +83,7 @@ def get_ipr_statements_from_variants( inferred_statements = [ s for s in get_statements_from_variants(graphkb_conn, inferred_matches) - if s["@rid"] not in existing_statements # do not duplicate if non-inferred match + if s['@rid'] not in existing_statements # do not duplicate if non-inferred match ] for ipr_row in convert_statements_to_alterations( @@ -92,7 +92,7 @@ def get_ipr_statements_from_variants( disease_matches, convert_to_rid_set(inferred_matches), ): - ipr_row["kbData"]["inferred"] = True + ipr_row['kbData']['inferred'] = True rows.append(ipr_row) return rows @@ -118,35 +118,35 @@ def annotate_expression_variants( skipped = 0 alterations = [] problem_genes = set() - logger.info(f"Starting annotation of {len(variants)} expression category_variants") + logger.info(f'Starting annotation of {len(variants)} expression category_variants') iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - gene = row["gene"] - variant = row["variant"] + gene = row['gene'] + variant = row['variant'] if not variant: skipped += 1 - logger.debug(f"Skipping malformed Expression {gene}: {row}") + logger.debug(f'Skipping malformed Expression {gene}: {row}') continue try: matches = gkb_match.match_expression_variant(graphkb_conn, gene, variant) for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_matches): - ipr_row["variant"] = row["key"] - ipr_row["variantType"] = row.get("variantType", "exp") + ipr_row['variant'] = row['key'] + ipr_row['variantType'] = row.get('variantType', 'exp') alterations.append(ipr_row) except FeatureNotFoundError as err: problem_genes.add(gene) - logger.debug(f"Unrecognized gene ({gene} {variant}): {err}") + logger.debug(f'Unrecognized gene ({gene} {variant}): {err}') except ValueError as err: - 
logger.error(f"failed to match variants ({gene} {variant}): {err}") + logger.error(f'failed to match variants ({gene} {variant}): {err}') if skipped: - logger.info(f"skipped matching {skipped} expression information rows") + logger.info(f'skipped matching {skipped} expression information rows') if problem_genes: - logger.error(f"gene finding failures for expression {sorted(problem_genes)}") - logger.error(f"gene finding falure for {len(problem_genes)} expression genes") + logger.error(f'gene finding failures for expression {sorted(problem_genes)}') + logger.error(f'gene finding falure for {len(problem_genes)} expression genes') logger.info( - f"matched {len(variants)} expression variants to {len(alterations)} graphkb annotations" + f'matched {len(variants)} expression variants to {len(alterations)} graphkb annotations' ) return alterations @@ -172,11 +172,11 @@ def annotate_copy_variants( alterations = [] problem_genes = set() - logger.info(f"Starting annotation of {len(variants)} copy category_variants") + logger.info(f'Starting annotation of {len(variants)} copy category_variants') iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - gene = row["gene"] - variant = row["variant"] + gene = row['gene'] + variant = row['variant'] if variant not in REPORTED_COPY_VARIANTS: # https://www.bcgsc.ca/jira/browse/GERO-77 @@ -186,24 +186,24 @@ def annotate_copy_variants( try: matches = gkb_match.match_copy_variant(graphkb_conn, gene, variant) for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_matches): - ipr_row["variant"] = row["key"] - ipr_row["variantType"] = row.get("variantType", "cnv") + ipr_row['variant'] = row['key'] + ipr_row['variantType'] = row.get('variantType', 'cnv') alterations.append(ipr_row) except FeatureNotFoundError as err: problem_genes.add(gene) - logger.debug(f"Unrecognized gene ({gene} {variant}): {err}") + logger.debug(f'Unrecognized gene ({gene} {variant}): {err}') except ValueError as err: - logger.error(f"failed to match variants ({gene} {variant}): {err}") + logger.error(f'failed to match variants ({gene} {variant}): {err}') if skipped: logger.info( - f"skipped matching {skipped} copy number variants not in {REPORTED_COPY_VARIANTS}" + f'skipped matching {skipped} copy number variants not in {REPORTED_COPY_VARIANTS}' ) if problem_genes: - logger.error(f"gene finding failures for copy variants {sorted(problem_genes)}") - logger.error(f"gene finding failure for {len(problem_genes)} copy variant genes") + logger.error(f'gene finding failures for copy variants {sorted(problem_genes)}') + logger.error(f'gene finding failure for {len(problem_genes)} copy variant genes') logger.info( - f"matched {len(variants)} copy category variants to {len(alterations)} graphkb annotations" + f'matched {len(variants)} copy category variants to {len(alterations)} graphkb annotations' ) return alterations @@ -226,14 +226,14 @@ def annotate_positional_variants( Returns: Hashable list of kbMatches records for IPR """ - VARIANT_KEYS = ("variant", "hgvsProtein", "hgvsCds", "hgvsGenomic") + VARIANT_KEYS = ('variant', 'hgvsProtein', 'hgvsCds', 'hgvsGenomic') errors = 0 alterations: List[Hashabledict] = [] problem_genes = set() iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - if not row.get("gene") and (not row.get("gene1") or not row.get("gene2")): + if not row.get('gene') and (not row.get('gene1') or not row.get('gene2')): # 
https://www.bcgsc.ca/jira/browse/GERO-56?focusedCommentId=1234791&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-1234791 # should not match single gene SVs continue @@ -250,15 +250,15 @@ def annotate_positional_variants( # DEVSU-1885 - fix malformed single deletion described as substitution of blank # eg. deletion described as substitution with nothing: 'chr1:g.150951027T>' if ( - variant[-1] == ">" - and "g." in variant + variant[-1] == '>' + and 'g.' in variant and variant[-2].isalpha() and variant[-3].isnumeric() ): logger.warning( - f"Assuming malformed deletion variant {variant} is {variant[:-2] + 'del'}" + f'Assuming malformed deletion variant {variant} is {variant[:-2] + "del"}' ) - variant = variant[:-2] + "del" + variant = variant[:-2] + 'del' matches = gkb_match.match_positional_variant(graphkb_conn, variant) else: raise parse_err @@ -268,42 +268,42 @@ def annotate_positional_variants( matches, disease_matches, ): - ipr_row["variant"] = row["key"] - ipr_row["variantType"] = row.get( - "variantType", "mut" if row.get("gene") else "sv" + ipr_row['variant'] = row['key'] + ipr_row['variantType'] = row.get( + 'variantType', 'mut' if row.get('gene') else 'sv' ) alterations.append(Hashabledict(ipr_row)) except FeatureNotFoundError as err: - logger.debug(f"failed to match positional variants ({variant}): {err}") + logger.debug(f'failed to match positional variants ({variant}): {err}') errors += 1 - if "gene" in row: - problem_genes.add(row["gene"]) - elif "gene1" in row and f"({row['gene1']})" in str(err): - problem_genes.add(row["gene1"]) - elif "gene2" in row and f"({row['gene2']})" in str(err): - problem_genes.add(row["gene2"]) - elif "gene1" in row and "gene2" in row: - problem_genes.add(row["gene1"]) - problem_genes.add(row["gene2"]) + if 'gene' in row: + problem_genes.add(row['gene']) + elif 'gene1' in row and f'({row["gene1"]})' in str(err): + problem_genes.add(row['gene1']) + elif 'gene2' in row and f'({row["gene2"]})' in str(err): + problem_genes.add(row['gene2']) + elif 'gene1' in row and 'gene2' in row: + problem_genes.add(row['gene1']) + problem_genes.add(row['gene2']) else: raise err except HTTPError as err: errors += 1 - logger.error(f"failed to match positional variants ({variant}): {err}") + logger.error(f'failed to match positional variants ({variant}): {err}') if problem_genes: - logger.error(f"gene finding failures for {sorted(problem_genes)}") - logger.error(f"{len(problem_genes)} gene finding failures for positional variants") + logger.error(f'gene finding failures for {sorted(problem_genes)}') + logger.error(f'{len(problem_genes)} gene finding failures for positional variants') if errors: - logger.error(f"skipped {errors} positional variants due to errors") + logger.error(f'skipped {errors} positional variants due to errors') # drop duplicates alterations = list(set(alterations)) - variant_types = ", ".join(sorted(set([alt["variantType"] for alt in alterations]))) + variant_types = ', '.join(sorted(set([alt['variantType'] for alt in alterations]))) logger.info( - f"matched {len(variants)} {variant_types} positional variants to {len(alterations)} graphkb annotations" + f'matched {len(variants)} {variant_types} positional variants to {len(alterations)} graphkb annotations' ) return alterations @@ -336,30 +336,30 @@ def annotate_signature_variants( # Matching signature variant to GKB Variants matched_variants: List[Variant] = gkb_match.match_category_variant( graphkb_conn, - variant["signatureName"], - variant["variantTypeName"], - 
reference_class="Signature", + variant['signatureName'], + variant['variantTypeName'], + reference_class='Signature', ) # KBDEV-1246 # Keep support for 'high mutation burden' until statement datafix if ( - variant["signatureName"] == TMB_SIGNATURE - and TMB_SIGNATURE != "high mutation burden" + variant['signatureName'] == TMB_SIGNATURE + and TMB_SIGNATURE != 'high mutation burden' ): matched_variants.extend( gkb_match.match_category_variant( graphkb_conn, - "high mutation burden", - variant["variantTypeName"], - reference_class="Signature", + 'high mutation burden', + variant['variantTypeName'], + reference_class='Signature', ) ) # Matching GKB Variants to GKB Statements for ipr_row in get_ipr_statements_from_variants( graphkb_conn, matched_variants, disease_matches ): - ipr_row["variant"] = variant["key"] - ipr_row["variantType"] = "sigv" + ipr_row['variant'] = variant['key'] + ipr_row['variantType'] = 'sigv' alterations.append(Hashabledict(ipr_row)) except ValueError as err: @@ -369,7 +369,7 @@ def annotate_signature_variants( alterations = list(set(alterations)) logger.info( - f"matched {len(variants)} signature category variants to {len(alterations)} graphkb annotations" + f'matched {len(variants)} signature category variants to {len(alterations)} graphkb annotations' ) return alterations @@ -401,25 +401,25 @@ def annotate_variants( gkb_matches: List[Hashabledict] = [] # MATCHING SIGNATURE CATEGORY VARIANTS - logger.info(f"annotating {len(signature_variants)} signatures") + logger.info(f'annotating {len(signature_variants)} signatures') gkb_matches.extend( annotate_signature_variants( graphkb_conn, disease_matches, signature_variants, show_progress=interactive ) ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') # MATCHING SMALL MUTATIONS - logger.info(f"annotating {len(small_mutations)} small mutations") + logger.info(f'annotating {len(small_mutations)} small mutations') gkb_matches.extend( annotate_positional_variants( graphkb_conn, small_mutations, disease_matches, show_progress=interactive ) ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') # MATCHING STRUCTURAL VARIANTS - logger.info(f"annotating {len(structural_variants)} structural variants") + logger.info(f'annotating {len(structural_variants)} structural variants') gkb_matches.extend( annotate_positional_variants( graphkb_conn, @@ -428,10 +428,10 @@ def annotate_variants( show_progress=interactive, ) ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') # MATCHING COPY VARIANTS - logger.info(f"annotating {len(copy_variants)} copy variants") + logger.info(f'annotating {len(copy_variants)} copy variants') gkb_matches.extend( [ Hashabledict(copy_var) @@ -440,10 +440,10 @@ def annotate_variants( ) ] ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') # MATCHING EXPRESSION VARIANTS - logger.info(f"annotating {len(expression_variants)} expression variants") + logger.info(f'annotating {len(expression_variants)} expression variants') gkb_matches.extend( [ Hashabledict(exp_var) @@ -455,6 +455,6 @@ def annotate_variants( ) ] ) - logger.debug(f"\tgkb_matches: {len(gkb_matches)}") + logger.debug(f'\tgkb_matches: {len(gkb_matches)}') return gkb_matches diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py index f2652fdd..70eaf26c 100644 --- a/pori_python/ipr/connection.py +++ 
b/pori_python/ipr/connection.py @@ -6,7 +6,6 @@ import zlib from typing import Dict, List -from .constants import DEFAULT_URL from .util import logger IMAGE_MAX = 20 # cannot upload more than 20 images at a time @@ -17,21 +16,21 @@ def __init__( self, username: str, password: str, - url: str = os.environ.get("IPR_URL", DEFAULT_URL), + url: str = os.environ.get('IPR_URL'), ): self.token = None self.url = url self.username = username self.password = password self.headers = { - "Accept": "application/json", - "Content-Type": "application/json", - "Content-Encoding": "deflate", + 'Accept': 'application/json', + 'Content-Type': 'application/json', + 'Content-Encoding': 'deflate', } self.cache: Dict[str, List[Dict]] = {} self.request_count = 0 - def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: + def request(self, endpoint: str, method: str = 'GET', **kwargs) -> Dict: """Request wrapper to handle adding common headers and logging Args: @@ -41,9 +40,9 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: Returns: dict: the json response as a python dict """ - url = f"{self.url}/{endpoint}" + url = f'{self.url}/{endpoint}' self.request_count += 1 - kwargs_header = kwargs.pop("headers", None) + kwargs_header = kwargs.pop('headers', None) if kwargs_header: headers = json.loads(kwargs_header) else: @@ -57,21 +56,21 @@ def request(self, endpoint: str, method: str = "GET", **kwargs) -> Dict: # try to get more error details message = str(err) try: - message += " " + resp.json()["error"]["message"] + message += ' ' + resp.json()['error']['message'] except Exception: pass raise requests.exceptions.HTTPError(message) if resp.status_code == 204: # TODO: address this in api - return {"status_code": 204} + return {'status_code': 204} return resp.json() def post(self, uri: str, data: Dict = {}, **kwargs) -> Dict: """Convenience method for making post requests""" return self.request( uri, - method="POST", - data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), + method='POST', + data=zlib.compress(json.dumps(data, allow_nan=False).encode('utf-8')), **kwargs, ) @@ -79,8 +78,8 @@ def get(self, uri: str, data: Dict = {}, **kwargs) -> Dict: """Convenience method for making get requests""" return self.request( uri, - method="GET", - data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), + method='GET', + data=zlib.compress(json.dumps(data, allow_nan=False).encode('utf-8')), **kwargs, ) @@ -88,9 +87,9 @@ def delete(self, uri: str, data: Dict = {}, **kwargs) -> Dict: """Convenience method for making delete requests""" return self.request( uri, - method="DELETE", - data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), - headers=json.dumps({"Accept": "*/*"}), + method='DELETE', + data=zlib.compress(json.dumps(data, allow_nan=False).encode('utf-8')), + headers=json.dumps({'Accept': '*/*'}), **kwargs, ) @@ -106,83 +105,83 @@ def upload_report( # or 'report'. jobStatus is no longer available once the report is successfully # uploaded. 
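(Aside on the constructor change above, illustrative rather than part of the diff: with the DEFAULT_URL fallback removed, url resolves to None unless IPR_URL is exported or passed explicitly, so callers now need something like the sketch below. The localhost fallback is a hypothetical local-dev value, not a package default.)

```python
import os

from pori_python.ipr.connection import IprConnection

# Sketch: assumes IPR_USER and IPR_PASS are exported; the localhost URL is a
# hypothetical fallback for local development, not a default of the package.
ipr_conn = IprConnection(
    username=os.environ['IPR_USER'],
    password=os.environ['IPR_PASS'],
    url=os.environ.get('IPR_URL', 'http://localhost:8081/api'),
)
```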
- projects = self.get("project") - project_names = [item["name"] for item in projects] + projects = self.get('project') + project_names = [item['name'] for item in projects] # if project is not exist, create one - if content["project"] not in project_names: + if content['project'] not in project_names: logger.info( - f"Project not found - attempting to create project {content['project']}" + f'Project not found - attempting to create project {content["project"]}' ) try: - self.post("project", {"name": content["project"]}) + self.post('project', {'name': content['project']}) except Exception as err: - raise Exception(f"Project creation failed due to {err}") + raise Exception(f'Project creation failed due to {err}') if ignore_extra_fields: - initial_result = self.post("reports-async?ignore_extra_fields=true", content) + initial_result = self.post('reports-async?ignore_extra_fields=true', content) else: - initial_result = self.post("reports-async", content) + initial_result = self.post('reports-async', content) - report_id = initial_result["ident"] + report_id = initial_result['ident'] def check_status_result(result): - if result.get("report", False): - return "upload complete" - if result.get("jobStatus", False) and result["jobStatus"].get("state", False): - return result["jobStatus"]["state"] + if result.get('report', False): + return 'upload complete' + if result.get('jobStatus', False) and result['jobStatus'].get('state', False): + return result['jobStatus']['state'] raise Exception( - "async report get returned with no report or jobStatus, or unexpected jobStatus type" + 'async report get returned with no report or jobStatus, or unexpected jobStatus type' ) def check_status(interval: int = 5, num_attempts: int = 5): for i in range(num_attempts): - logger.info(f"checking report loading status in {interval} seconds") + logger.info(f'checking report loading status in {interval} seconds') time.sleep(interval) - current_status = self.get(f"reports-async/{report_id}") + current_status = self.get(f'reports-async/{report_id}') check_result = check_status_result(current_status) - if check_result == "upload complete": + if check_result == 'upload complete': return current_status - if check_result == "failed": + if check_result == 'failed': raise Exception( - f"async report upload failed with reason: {current_status.get('jobStatus', {}).get('failedReason', 'Unknown')}" + f'async report upload failed with reason: {current_status.get("jobStatus", {}).get("failedReason", "Unknown")}' ) if check_result not in [ - "active", - "ready", - "waiting", - "completed", + 'active', + 'ready', + 'waiting', + 'completed', ]: - raise Exception(f"async report upload in unexpected state: {check_result}") + raise Exception(f'async report upload in unexpected state: {check_result}') return current_status current_status = check_status() check_result = check_status_result(current_status) - if check_result in ["active", "waiting"]: + if check_result in ['active', 'waiting']: current_status = check_status(interval=30) check_result = check_status_result(current_status) - if check_result in ["active", "waiting"]: + if check_result in ['active', 'waiting']: current_status = check_status(interval=60, num_attempts=mins_to_wait) check_result = check_status_result(current_status) - if check_result in ["active", "waiting"]: + if check_result in ['active', 'waiting']: raise Exception( - f"async report upload taking longer than expected: {current_status}" + f'async report upload taking longer than expected: {current_status}' ) 
return current_status else: if ignore_extra_fields: - return self.post("reports?ignore_extra_fields=true", content) + return self.post('reports?ignore_extra_fields=true', content) else: - return self.post("reports", content) + return self.post('reports', content) def set_analyst_comments(self, report_id: str, data: Dict) -> Dict: """ @@ -193,9 +192,9 @@ def set_analyst_comments(self, report_id: str, data: Dict) -> Dict: Pending: https://www.bcgsc.ca/jira/browse/DEVSU-1177 """ return self.request( - f"/reports/{report_id}/summary/analyst-comments", - method="PUT", - data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), + f'/reports/{report_id}/summary/analyst-comments', + method='PUT', + data=zlib.compress(json.dumps(data, allow_nan=False).encode('utf-8')), ) def post_images(self, report_id: str, files: Dict[str, str], data: Dict[str, str] = {}) -> None: @@ -212,18 +211,18 @@ def post_images(self, report_id: str, files: Dict[str, str], data: Dict[str, str if not os.path.exists(path): raise FileNotFoundError(path) current_files[key] = path - open_files = {k: open(f, "rb") for (k, f) in current_files.items()} + open_files = {k: open(f, 'rb') for (k, f) in current_files.items()} try: resp = self.request( - f"reports/{report_id}/image", - method="POST", + f'reports/{report_id}/image', + method='POST', data=data, files=open_files, headers=json.dumps({}), ) for status in resp: - if status.get("upload") != "successful": - image_errors.add(status["key"]) + if status.get('upload') != 'successful': + image_errors.add(status['key']) finally: for handler in open_files.values(): handler.close() @@ -235,12 +234,12 @@ def get_spec(self) -> Dict: """ Get the current IPR spec, for the purposes of current report upload fields """ - return self.request("/spec.json", method="GET") + return self.request('/spec.json', method='GET') def validate_json(self, content: Dict) -> Dict: """ Validate the provided json schema """ - result = self.post("reports/schema", content) - logger.info(f"{result['message']}") + result = self.post('reports/schema', content) + logger.info(f'{result["message"]}') return result diff --git a/pori_python/ipr/constants.py b/pori_python/ipr/constants.py index 6f3958c3..35c2a547 100644 --- a/pori_python/ipr/constants.py +++ b/pori_python/ipr/constants.py @@ -1,28 +1,40 @@ -DEFAULT_URL = "https://iprstaging-api.bcgsc.ca/api" -GERMLINE_BASE_TERMS = ("pharmacogenomic", "cancer predisposition") # based on graphkb.constants -VARIANT_CLASSES = {"Variant", "CategoryVariant", "PositionalVariant", "CatalogueVariant"} +GERMLINE_BASE_TERMS = ('pharmacogenomic', 'cancer predisposition') # based on graphkb.constants +VARIANT_CLASSES = {'Variant', 'CategoryVariant', 'PositionalVariant', 'CatalogueVariant'} # all possible values for review status are: ['pending', 'not required', 'passed', 'failed', 'initial'] -FAILED_REVIEW_STATUS = "failed" +FAILED_REVIEW_STATUS = 'failed' # Signatures -COSMIC_SIGNATURE_VARIANT_TYPE = "high signature" -HLA_SIGNATURE_VARIANT_TYPE = "signature present" -TMB_SIGNATURE = "mutation burden" +COSMIC_SIGNATURE_VARIANT_TYPE = 'high signature' +HLA_SIGNATURE_VARIANT_TYPE = 'signature present' +TMB_SIGNATURE = 'mutation burden' TMB_SIGNATURE_HIGH_THRESHOLD = ( 10.0 # genomic mutations per mb - https://www.bcgsc.ca/jira/browse/GERO-296 ) -TMB_SIGNATURE_VARIANT_TYPE = "high signature" +TMB_SIGNATURE_VARIANT_TYPE = 'high signature' # Mapping micro-satellite from pipeline terms to GraphKB terms MSI_MAPPING = { - "microsatellite instability": { # MSI - "displayName": 
"microsatellite instability high signature", - "signatureName": "microsatellite instability", - "variantTypeName": "high signature", + 'microsatellite instability': { # MSI + 'displayName': 'microsatellite instability high signature', + 'signatureName': 'microsatellite instability', + 'variantTypeName': 'high signature', }, - "microsatellite stable": { # MSS - "displayName": "microsatellite stable signature present", - "signatureName": "microsatellite stable", - "variantTypeName": "signature present", + 'microsatellite stable': { # MSS + 'displayName': 'microsatellite stable signature present', + 'signatureName': 'microsatellite stable', + 'variantTypeName': 'signature present', + }, +} +# Mapping hrd from pipeline terms to GraphKB terms +HRD_MAPPING = { + 'homologous recombination deficiency strong signature': { + 'displayName': 'homologous recombination deficiency strong signature', + 'signatureName': 'homologous recombination deficiency', + 'variantTypeName': 'strong signature', + }, + 'homologous recombination deficiency moderate signature': { + 'displayName': 'homologous recombination deficiency moderate signature', + 'signatureName': 'homologous recombination deficiency', + 'variantTypeName': 'moderate signature', }, } diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json index c2df68e9..a9790129 100644 --- a/pori_python/ipr/content.spec.json +++ b/pori_python/ipr/content.spec.json @@ -541,6 +541,20 @@ }, "type": "array" }, + "hrd": { + "properties": { + "kbCategory": { + "type": "string" + }, + "score": { + "type": "number" + } + }, + "required": [ + "score" + ], + "type": "object" + }, "images": { "items": { "example": { diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index dcff908b..01976603 100644 --- a/pori_python/ipr/inputs.py +++ b/pori_python/ipr/inputs.py @@ -9,7 +9,6 @@ import os import pandas as pd import re -from Bio.Data.IUPACData import protein_letters_3to1 from numpy import nan from typing import Any, Callable, Dict, Iterable, List, Set, Tuple, cast @@ -25,149 +24,149 @@ from .constants import ( COSMIC_SIGNATURE_VARIANT_TYPE, - DEFAULT_URL, HLA_SIGNATURE_VARIANT_TYPE, MSI_MAPPING, + HRD_MAPPING, TMB_SIGNATURE, TMB_SIGNATURE_VARIANT_TYPE, ) -from .util import hash_key, logger, pandas_falsy +from .util import hash_key, logger, pandas_falsy, protein_letters_3to1 -protein_letters_3to1.setdefault("Ter", "*") +protein_letters_3to1.setdefault('Ter', '*') -SPECIFICATION = os.path.join(os.path.dirname(__file__), "content.spec.json") +SPECIFICATION = os.path.join(os.path.dirname(__file__), 'content.spec.json') # content in the local specification should match the values in IPR_API_SPEC_JSON_URL -IPR_API_SPEC_JSON_URL = f'{os.environ.get("IPR_URL", DEFAULT_URL)}/spec.json' +IPR_API_SPEC_JSON_URL = f'{os.environ.get("IPR_URL")}/spec.json' # TODO: GERO-307 - use SPECIFICATION json to derive the variant required and optional details defined below # 'cnvState' is for display -COPY_REQ = ["gene", "kbCategory"] -COPY_KEY = ["gene"] +COPY_REQ = ['gene', 'kbCategory'] +COPY_KEY = ['gene'] COPY_OPTIONAL = [ - "cnvState", - "copyChange", - "lohState", # Loss of Heterzygosity state - informative detail to analyst - "chromosomeBand", - "chromosome", - "chr", # expect only one of chromosome or chr - "start", - "end", - "size", - "log2Cna", - "cna", - "comments", - "library", - "germline", + 'cnvState', + 'copyChange', + 'lohState', # Loss of Heterzygosity state - informative detail to analyst + 'chromosomeBand', + 'chromosome', + 'chr', # expect 
only one of chromosome or chr + 'start', + 'end', + 'size', + 'log2Cna', + 'cna', + 'comments', + 'library', + 'germline', ] -SMALL_MUT_REQ = ["gene", "proteinChange"] +SMALL_MUT_REQ = ['gene', 'proteinChange'] # alternate details in the key, can distinguish / subtype events. SMALL_MUT_KEY = SMALL_MUT_REQ + [ - "altSeq", - "chromosome", - "endPosition", - "refSeq", - "startPosition", - "transcript", + 'altSeq', + 'chromosome', + 'endPosition', + 'refSeq', + 'startPosition', + 'transcript', ] SMALL_MUT_OPTIONAL = [ - "altSeq", - "comments", - "chromosome", - "endPosition", - "germline", - "hgvsCds", - "hgvsGenomic", - "hgvsProtein", - "library", - "ncbiBuild", - "normalAltCount", - "normalDepth", - "normalRefCount", - "refSeq", - "rnaAltCount", - "rnaDepth", - "rnaRefCount", - "startPosition", - "transcript", - "tumourAltCount", - "tumourAltCopies", - "tumourDepth", - "tumourRefCount", - "tumourRefCopies", - "zygosity", + 'altSeq', + 'comments', + 'chromosome', + 'endPosition', + 'germline', + 'hgvsCds', + 'hgvsGenomic', + 'hgvsProtein', + 'library', + 'ncbiBuild', + 'normalAltCount', + 'normalDepth', + 'normalRefCount', + 'refSeq', + 'rnaAltCount', + 'rnaDepth', + 'rnaRefCount', + 'startPosition', + 'transcript', + 'tumourAltCount', + 'tumourAltCopies', + 'tumourDepth', + 'tumourRefCount', + 'tumourRefCopies', + 'zygosity', ] -EXP_REQ = ["gene", "kbCategory"] -EXP_KEY = ["gene"] +EXP_REQ = ['gene', 'kbCategory'] +EXP_KEY = ['gene'] EXP_OPTIONAL = [ - "biopsySiteFoldChange", - "biopsySitePercentile", - "biopsySiteQC", - "biopsySiteZScore", - "biopsySitekIQR", - "comments", - "diseaseFoldChange", - "diseasekIQR", - "diseasePercentile", - "diseaseQC", - "diseaseZScore", - "expressionState", - "histogramImage", - "library", - "primarySiteFoldChange", - "primarySitekIQR", - "primarySitePercentile", - "primarySiteQC", - "primarySiteZScore", - "internalPancancerFoldChange", - "internalPancancerkIQR", - "internalPancancerPercentile", - "internalPancancerQC", - "internalPancancerZScore", - "rnaReads", - "rpkm", - "tpm", + 'biopsySiteFoldChange', + 'biopsySitePercentile', + 'biopsySiteQC', + 'biopsySiteZScore', + 'biopsySitekIQR', + 'comments', + 'diseaseFoldChange', + 'diseasekIQR', + 'diseasePercentile', + 'diseaseQC', + 'diseaseZScore', + 'expressionState', + 'histogramImage', + 'library', + 'primarySiteFoldChange', + 'primarySitekIQR', + 'primarySitePercentile', + 'primarySiteQC', + 'primarySiteZScore', + 'internalPancancerFoldChange', + 'internalPancancerkIQR', + 'internalPancancerPercentile', + 'internalPancancerQC', + 'internalPancancerZScore', + 'rnaReads', + 'rpkm', + 'tpm', ] SV_REQ = [ - "eventType", - "breakpoint", - "gene1", # prev: nterm_hugo - "gene2", # prev: cterm_hugo - "exon1", # n-terminal - "exon2", # c-terminal + 'eventType', + 'breakpoint', + 'gene1', # prev: nterm_hugo + 'gene2', # prev: cterm_hugo + 'exon1', # n-terminal + 'exon2', # c-terminal ] SV_KEY = SV_REQ[:] SV_OPTIONAL = [ - "ctermTranscript", - "ntermTranscript", - "ctermGene", # combined hugo ensembl form - "ntermGene", # combined hugo ensembl form - "detectedIn", - "conventionalName", - "svg", - "svgTitle", - "name", - "frame", - "omicSupport", - "highQuality", - "comments", - "library", - "rnaAltCount", - "rnaDepth", - "tumourAltCount", - "tumourDepth", - "germline", - "mavis_product_id", + 'ctermTranscript', + 'ntermTranscript', + 'ctermGene', # combined hugo ensembl form + 'ntermGene', # combined hugo ensembl form + 'detectedIn', + 'conventionalName', + 'svg', + 'svgTitle', + 'name', + 'frame', + 'omicSupport', 
+ 'highQuality', + 'comments', + 'library', + 'rnaAltCount', + 'rnaDepth', + 'tumourAltCount', + 'tumourDepth', + 'germline', + 'mavis_product_id', ] -SIGV_REQ = ["signatureName", "variantTypeName"] -SIGV_COSMIC = ["signature"] # 1st element used as signatureName key -SIGV_HLA = ["a1", "a2", "b1", "b2", "c1", "c2"] -SIGV_OPTIONAL = ["displayName"] +SIGV_REQ = ['signatureName', 'variantTypeName'] +SIGV_COSMIC = ['signature'] # 1st element used as signatureName key +SIGV_HLA = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2'] +SIGV_OPTIONAL = ['displayName'] SIGV_KEY = SIGV_REQ[:] @@ -192,7 +191,7 @@ def validate_variant_rows( Returns: the rows from the tab file as dictionaries """ - header = required + optional + ["key"] + header = required + optional + ['key'] result = [] keys = set() @@ -202,18 +201,18 @@ def validate_variant_rows( if not header_validated: for req_col in required: if req_col not in row: - raise ValueError(f"header missing required column ({req_col})") + raise ValueError(f'header missing required column ({req_col})') header_validated = True row_key = hash_key(row_to_key(row)) if row_key in keys: - raise ValueError(f"duplicate row key ({row_key}) from ({row_to_key(row)})") - row["key"] = row_key + raise ValueError(f'duplicate row key ({row_key}) from ({row_to_key(row)})') + row['key'] = row_key keys.add(row_key) for k, v in row.items(): if v is pd.NA: - row[k] = "" + row[k] = '' - result.append(cast(IprVariant, {col: row.get(col, "") for col in header})) + result.append(cast(IprVariant, {col: row.get(col, '') for col in header})) return result @@ -225,43 +224,42 @@ def preprocess_copy_variants(rows: Iterable[Dict]) -> List[IprCopyVariant]: """ # default map for display - concise names display_name_mapping = { - INPUT_COPY_CATEGORIES.DEEP: "deep deletion", - INPUT_COPY_CATEGORIES.AMP: "amplification", - INPUT_COPY_CATEGORIES.GAIN: "copy gain", - INPUT_COPY_CATEGORIES.LOSS: "copy loss", + INPUT_COPY_CATEGORIES.DEEP: 'deep deletion', + INPUT_COPY_CATEGORIES.AMP: 'amplification', + INPUT_COPY_CATEGORIES.GAIN: 'copy gain', + INPUT_COPY_CATEGORIES.LOSS: 'copy loss', } display_name_mapping.update(dict([(v, v) for v in display_name_mapping.values()])) def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(["cnv"] + [row[key] for key in COPY_KEY]) + return tuple(['cnv'] + [row[key] for key in COPY_KEY]) result = validate_variant_rows(rows, COPY_REQ, COPY_OPTIONAL, row_key) ret_list = [cast(IprCopyVariant, var) for var in result] for row in ret_list: - - kb_cat = row.get("kbCategory") - kb_cat = "" if pd.isnull(kb_cat) else str(kb_cat) + kb_cat = row.get('kbCategory') + kb_cat = '' if pd.isnull(kb_cat) else str(kb_cat) if kb_cat: if kb_cat not in INPUT_COPY_CATEGORIES.values(): - raise ValueError(f"invalid copy variant kbCategory value ({kb_cat})") - if not row.get("cnvState"): # apply default short display name - row["cnvState"] = display_name_mapping[kb_cat] - row["variant"] = kb_cat - row["variantType"] = "cnv" - chrband = row.get("chromosomeBand", False) - chrom = row.pop("chromosome", False) + raise ValueError(f'invalid copy variant kbCategory value ({kb_cat})') + if not row.get('cnvState'): # apply default short display name + row['cnvState'] = display_name_mapping[kb_cat] + row['variant'] = kb_cat + row['variantType'] = 'cnv' + chrband = row.get('chromosomeBand', False) + chrom = row.pop('chromosome', False) if not chrom: - chrom = row.pop("chr", False) + chrom = row.pop('chr', False) # remove chr if it was not used for chrom - row.pop("chr", False) + row.pop('chr', False) if chrom: # 
check that chr isn't already in the chrband; # this regex from https://vrs.ga4gh.org/en/1.2/terms_and_model.html#id25 - if chrband and (re.match(r"^cen|[pq](ter|([1-9][0-9]*(\.[1-9][0-9]*)?))$", chrband)): + if chrband and (re.match(r'^cen|[pq](ter|([1-9][0-9]*(\.[1-9][0-9]*)?))$', chrband)): if isinstance(chrom, int): chrom = str(chrom) - chrom = chrom.strip("chr") - row["chromosomeBand"] = chrom + row["chromosomeBand"] + chrom = chrom.strip('chr') + row['chromosomeBand'] = chrom + row['chromosomeBand'] return ret_list @@ -274,28 +272,28 @@ def preprocess_small_mutations(rows: Iterable[Dict]) -> List[IprSmallMutationVar def row_key(row: IprSmallMutationVariant) -> Tuple[str, ...]: key_vals = [] - for kval in [row.get(key, "") for key in SMALL_MUT_KEY]: - key_vals.append(str(kval) if pd.notnull(kval) else "") - return tuple(["small mutation"] + key_vals) + for kval in [row.get(key, '') for key in SMALL_MUT_KEY]: + key_vals.append(str(kval) if pd.notnull(kval) else '') + return tuple(['small mutation'] + key_vals) result = validate_variant_rows(rows, SMALL_MUT_REQ, SMALL_MUT_OPTIONAL, row_key) if not result: return [] def pick_variant(row: IprSmallMutationVariant) -> str: - protein_change = row.get("proteinChange") + protein_change = row.get('proteinChange') if not pandas_falsy(protein_change): for longAA, shortAA in protein_letters_3to1.items(): protein_change = str(protein_change).replace(longAA, shortAA) - hgvsp = "{}:{}".format(row["gene"], protein_change) + hgvsp = '{}:{}'.format(row['gene'], protein_change) return hgvsp - for field in ["hgvsProtein", "hgvsCds", "hgvsGenomic"]: + for field in ['hgvsProtein', 'hgvsCds', 'hgvsGenomic']: if not pandas_falsy(row.get(field)): return str(row.get(field)) raise ValueError( - "Variant field cannot be empty. Must include proteinChange or one of the hgvs fields (hgvsProtein, hgvsCds, hgvsGenomic) to build the variant string" + 'Variant field cannot be empty. 
Must include proteinChange or one of the hgvs fields (hgvsProtein, hgvsCds, hgvsGenomic) to build the variant string' ) # 'location' and 'refAlt' are not currently used for matching; still optional and allowed blank @@ -304,21 +302,21 @@ def pick_variant(row: IprSmallMutationVariant) -> str: # for row in result: def convert_sm(row: IprVariant) -> IprSmallMutationVariant: ret = cast(IprSmallMutationVariant, row) - ret["variant"] = pick_variant(ret) - ret["variantType"] = "mut" + ret['variant'] = pick_variant(ret) + ret['variantType'] = 'mut' - if ret.get("startPosition") and not ret.get("endPosition"): - ret["endPosition"] = ret["startPosition"] + if ret.get('startPosition') and not ret.get('endPosition'): + ret['endPosition'] = ret['startPosition'] # default depth to alt + ref if not given - for sample_type in ("normal", "rna", "tumour"): + for sample_type in ('normal', 'rna', 'tumour'): if ( - ret.get(f"{sample_type}RefCount") - and ret.get(f"{sample_type}AltCount") - and not ret.get(f"{sample_type}Depth") + ret.get(f'{sample_type}RefCount') + and ret.get(f'{sample_type}AltCount') + and not ret.get(f'{sample_type}Depth') ): - ret[f"{sample_type}Depth"] = ( # type: ignore - ret[f"{sample_type}RefCount"] + ret[f"{sample_type}AltCount"] # type: ignore + ret[f'{sample_type}Depth'] = ( # type: ignore + ret[f'{sample_type}RefCount'] + ret[f'{sample_type}AltCount'] # type: ignore ) return ret @@ -334,65 +332,65 @@ def preprocess_expression_variants(rows: Iterable[Dict]) -> List[IprExprVariant] """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(["expression"] + [row[key] for key in EXP_KEY]) + return tuple(['expression'] + [row[key] for key in EXP_KEY]) variants = validate_variant_rows(rows, EXP_REQ, EXP_OPTIONAL, row_key) result = [cast(IprExprVariant, var) for var in variants] float_columns = [ col for col in EXP_REQ + EXP_OPTIONAL - if col.endswith("kIQR") - or col.endswith("Percentile") - or col.endswith("FoldChange") - or col.endswith("QC") - or col.endswith("ZScore") - or col in ["tpm", "rpkm"] + if col.endswith('kIQR') + or col.endswith('Percentile') + or col.endswith('FoldChange') + or col.endswith('QC') + or col.endswith('ZScore') + or col in ['tpm', 'rpkm'] ] errors = [] for row in result: - row["variant"] = row["kbCategory"] - if not row["expressionState"] and row["kbCategory"]: - row["expressionState"] = row["kbCategory"] + row['variant'] = row['kbCategory'] + if not row['expressionState'] and row['kbCategory']: + row['expressionState'] = row['kbCategory'] - if row["variant"] and not pd.isnull(row["variant"]): - if row["variant"] not in INPUT_EXPRESSION_CATEGORIES.values(): + if row['variant'] and not pd.isnull(row['variant']): + if row['variant'] not in INPUT_EXPRESSION_CATEGORIES.values(): err_msg = f"{row['gene']} variant '{row['variant']}' not in {INPUT_EXPRESSION_CATEGORIES.values()}" errors.append(err_msg) logger.error(err_msg) - row["variantType"] = "exp" + row['variantType'] = 'exp' for col in float_columns: - if row.get(col) in ["inf", "+inf", "-inf"]: - row[col] = row[col].replace("inf", "Infinity") # type: ignore + if row.get(col) in ['inf', '+inf', '-inf']: + row[col] = row[col].replace('inf', 'Infinity') # type: ignore # check images exist - if row["histogramImage"] and not os.path.exists(row["histogramImage"]): + if row['histogramImage'] and not os.path.exists(row['histogramImage']): raise FileNotFoundError(f'missing image ({row["histogramImage"]})') if errors: - raise ValueError(f"{len(errors)} Invalid expression variants in file") + raise 
ValueError(f'{len(errors)} Invalid expression variants in file') return result def create_graphkb_sv_notation(row: IprFusionVariant) -> str: """Generate GKB/IPR fusion style notation from a structural variant.""" - gene1 = row["gene1"] or "?" - gene2 = row["gene2"] or "?" - exon1 = str(row["exon1"]) if row["exon1"] else "?" - exon2 = str(row["exon2"]) if row["exon2"] else "?" - if not row["gene1"]: + gene1 = row['gene1'] or '?' + gene2 = row['gene2'] or '?' + exon1 = str(row['exon1']) if row['exon1'] else '?' + exon2 = str(row['exon2']) if row['exon2'] else '?' + if not row['gene1']: gene1, gene2 = gene2, gene1 exon1, exon2 = exon2, exon1 - if gene1 == "?": + if gene1 == '?': raise ValueError( f'both genes cannot be blank for a structural variant {row["key"]}. At least 1 gene must be entered' ) # force exons to integer repr string - exon1 = exon1[:-2] if exon1.endswith(".0") else exon1 - exon2 = exon2[:-2] if exon2.endswith(".0") else exon2 - return f"({gene1},{gene2}):fusion(e.{exon1},e.{exon2})" + exon1 = exon1[:-2] if exon1.endswith('.0') else exon1 + exon2 = exon2[:-2] if exon2.endswith('.0') else exon2 + return f'({gene1},{gene2}):fusion(e.{exon1},e.{exon2})' def preprocess_structural_variants(rows: Iterable[Dict]) -> List[IprFusionVariant]: @@ -402,21 +400,21 @@ def preprocess_structural_variants(rows: Iterable[Dict]) -> List[IprFusionVarian """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(["sv"] + [row[key] for key in SV_KEY]) + return tuple(['sv'] + [row[key] for key in SV_KEY]) variants = validate_variant_rows(rows, SV_REQ, SV_OPTIONAL, row_key) result = [cast(IprFusionVariant, var) for var in variants] # genes are optional for structural variants for row in result: - row["variant"] = create_graphkb_sv_notation(row) - row["variantType"] = "sv" + row['variant'] = create_graphkb_sv_notation(row) + row['variantType'] = 'sv' # check and load the svg file where applicable - if row["svg"] and not pd.isnull(row["svg"]): - if not os.path.exists(row["svg"]): - raise FileNotFoundError(row["svg"]) - with open(row["svg"], "r") as fh: - row["svg"] = fh.read() + if row['svg'] and not pd.isnull(row['svg']): + if not os.path.exists(row['svg']): + raise FileNotFoundError(row['svg']) + with open(row['svg'], 'r') as fh: + row['svg'] = fh.read() return result @@ -428,15 +426,15 @@ def preprocess_signature_variants(rows: Iterable[Dict]) -> List[IprSignatureVari """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(["sigv"] + [row[key] for key in SIGV_KEY]) + return tuple(['sigv'] + [row[key] for key in SIGV_KEY]) variants = validate_variant_rows(rows, SIGV_REQ, SIGV_OPTIONAL, row_key) result = [cast(IprSignatureVariant, var) for var in variants] # Adding additional required properties for row in result: - row["variant"] = row["displayName"] - row["variantType"] = "sigv" + row['variant'] = row['displayName'] + row['variantType'] = 'sigv' return result @@ -448,9 +446,9 @@ def preprocess_cosmic(rows: Iterable[Dict]) -> Iterable[Dict]: """ return [ { - "displayName": f"{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}", - "signatureName": signature, - "variantTypeName": COSMIC_SIGNATURE_VARIANT_TYPE, + 'displayName': f'{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}', + 'signatureName': signature, + 'variantTypeName': COSMIC_SIGNATURE_VARIANT_TYPE, } for signature in rows ] @@ -465,21 +463,21 @@ def preprocess_hla(rows: Iterable[Dict]) -> Iterable[Dict]: for k, v in row.items(): if k not in SIGV_HLA: continue - hla.add(f"HLA-{v}") # 2nd level, e.g. 
'HLA-A*02:01' - hla.add(f"HLA-{v.split(':')[0]}") # 1st level, e.g. 'HLA-A*02' + hla.add(f'HLA-{v}') # 2nd level, e.g. 'HLA-A*02:01' + hla.add(f'HLA-{v.split(":")[0]}') # 1st level, e.g. 'HLA-A*02' return [ { - "displayName": f"{signature} {HLA_SIGNATURE_VARIANT_TYPE}", - "signatureName": signature, - "variantTypeName": HLA_SIGNATURE_VARIANT_TYPE, + 'displayName': f'{signature} {HLA_SIGNATURE_VARIANT_TYPE}', + 'signatureName': signature, + 'variantTypeName': HLA_SIGNATURE_VARIANT_TYPE, } for signature in hla ] def preprocess_tmb( - tmb_high: float, tmburMutationBurden: Dict = {}, genomeTmb: float | str = "" + tmb_high: float, tmburMutationBurden: Dict = {}, genomeTmb: float | str = '' ) -> Iterable[Dict]: """ Process tumour mutation burden (tmb) input(s) into preformatted signature input. @@ -493,15 +491,15 @@ def preprocess_tmb( if tmburMutationBurden: try: tmbur_tmb_val = float( - tmburMutationBurden["genomeIndelTmb"] + tmburMutationBurden["genomeSnvTmb"] + tmburMutationBurden['genomeIndelTmb'] + tmburMutationBurden['genomeSnvTmb'] ) if not genomeTmb and not isinstance(genomeTmb, float): logger.error( - "backwards compatibility: deriving genomeTmb from tmburMutationBurden genomeIndelTmb + genomeSnvTmb" + 'backwards compatibility: deriving genomeTmb from tmburMutationBurden genomeIndelTmb + genomeSnvTmb' ) tmb_val = tmbur_tmb_val except Exception as err: - logger.error(f"tmburMutationBurden parsing failure: {err}") + logger.error(f'tmburMutationBurden parsing failure: {err}') # genomeTmb # SDEV-4811 - mutation burden is now expected to be uploaded in genomeTmb as mutations/megabase @@ -512,19 +510,19 @@ def preprocess_tmb( tmb_val = float(genomeTmb) if tmburMutationBurden and tmbur_tmb_val != tmb_val: logger.warning( - f"genomeTmb given {tmb_val} does not match tmburMutationBurden TMB {tmbur_tmb_val}" + f'genomeTmb given {tmb_val} does not match tmburMutationBurden TMB {tmbur_tmb_val}' ) except TypeError as err: - logger.error(f"genomeTmb parsing failure {genomeTmb}: {err}") + logger.error(f'genomeTmb parsing failure {genomeTmb}: {err}') # comparing tmb_val to threshold # Signature CategoryVariant created only if threshold met if tmb_val >= tmb_high: return [ { - "displayName": f"{TMB_SIGNATURE} {TMB_SIGNATURE_VARIANT_TYPE}", - "signatureName": TMB_SIGNATURE, - "variantTypeName": TMB_SIGNATURE_VARIANT_TYPE, + 'displayName': f'{TMB_SIGNATURE} {TMB_SIGNATURE_VARIANT_TYPE}', + 'signatureName': TMB_SIGNATURE, + 'variantTypeName': TMB_SIGNATURE_VARIANT_TYPE, } ] return [] @@ -536,17 +534,16 @@ def preprocess_msi(msi: Any) -> Iterable[Dict]: Both msi & mss get mapped to corresponding GraphKB Signature CategoryVariants. """ if msi: - # MSI category is given from upstream (only one msi variant per library) if isinstance(msi, list): # msi is given as a list of one dict - msi_cat = msi[0].get("kbCategory", "") + msi_cat = msi[0].get('kbCategory', '') elif isinstance(msi, str): # msi is given as a string msi_cat = msi else: # msi is given as a dict; uncaught error if not. - msi_cat = msi.get("kbCategory", "") + msi_cat = msi.get('kbCategory', '') msi_variant = MSI_MAPPING.get(msi_cat, None) @@ -557,6 +554,23 @@ def preprocess_msi(msi: Any) -> Iterable[Dict]: return [] +def preprocess_hrd(hrd: Any) -> Iterable[Dict]: + """ + Process hrd input into preformatted signature input. + HRD gets mapped to corresponding GraphKB Signature CategoryVariants.
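+
+    A minimal sketch of the intended behaviour, mirroring preprocess_msi above.
+    The 'HRD high' category name here is illustrative only; real keys come from HRD_MAPPING:
+
+        preprocess_hrd({'kbCategory': 'HRD high'})
+        # -> [HRD_MAPPING['HRD high']] when the category is mapped, else []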
+ """ + if hrd: + hrd_cat = hrd.get('kbCategory', '') + + hrd_variant = HRD_MAPPING.get(hrd_cat, None) + + # Signature CategoryVariant created either for msi or mss + if hrd_variant: + return [hrd_variant] + + return [] + + def check_variant_links( small_mutations: List[IprSmallMutationVariant], expression_variants: List[IprExprVariant], @@ -580,67 +594,67 @@ def check_variant_links( missing_information_genes = set() missing_information_errors = set() - copy_variant_genes = {variant["gene"] for variant in copy_variants} - expression_variant_genes = {variant["gene"] for variant in expression_variants} + copy_variant_genes = {variant['gene'] for variant in copy_variants} + expression_variant_genes = {variant['gene'] for variant in expression_variants} genes_with_variants = set() # filter excess copy variants variant: IprCopyVariant | IprExprVariant | IprFusionVariant | IprSmallMutationVariant for variant in copy_variants: - gene = variant["gene"] + gene = variant['gene'] if not gene: - logger.error("copy_variant data cannot be applied to an empty genename") - elif variant["variant"]: + logger.error('copy_variant data cannot be applied to an empty genename') + elif variant['variant']: genes_with_variants.add(gene) if expression_variant_genes and gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a copy variant but is missing expression information" + f'gene ({gene}) has a copy variant but is missing expression information' ) for variant in expression_variants: - gene = variant["gene"] + gene = variant['gene'] if not gene: - logger.error("expression_variant data cannot be applied to an empty genename") - elif variant["variant"]: + logger.error('expression_variant data cannot be applied to an empty genename') + elif variant['variant']: genes_with_variants.add(gene) if copy_variant_genes and gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has an expression variant but is missing copy number information" + f'gene ({gene}) has an expression variant but is missing copy number information' ) for variant in small_mutations: - gene = variant["gene"] + gene = variant['gene'] if not gene: - logger.error("small_mutation data cannot be applied to an empty genename") + logger.error('small_mutation data cannot be applied to an empty genename') continue if copy_variant_genes and gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a small mutation but is missing copy number information" + f'gene ({gene}) has a small mutation but is missing copy number information' ) if expression_variant_genes and gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a small mutation but is missing expression information" + f'gene ({gene}) has a small mutation but is missing expression information' ) genes_with_variants.add(gene) for variant in structural_variants: - for gene in [variant["gene1"], variant["gene2"]]: + for gene in [variant['gene1'], variant['gene2']]: if gene: # genes are optional for structural variants if gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a structural variant but is missing copy number information" + f'gene ({gene}) has a structural variant but is missing copy number information' ) if gene not in expression_variant_genes: 
missing_information_genes.add(gene) missing_information_errors.add( - f"gene ({gene}) has a structural variant but is missing expression information" + f'gene ({gene}) has a structural variant but is missing expression information' ) genes_with_variants.add(gene) @@ -648,7 +662,7 @@ def check_variant_links( for err_msg in sorted(missing_information_errors): logger.debug(err_msg) link_err_msg = ( - f"Missing information variant links on {len(missing_information_genes)} genes" + f'Missing information variant links on {len(missing_information_genes)} genes' ) logger.warning(link_err_msg) return genes_with_variants @@ -659,91 +673,91 @@ def check_comparators(content: Dict, expresssionVariants: List[IprExprVariant] = Given the optional content dictionary, check that based on the analyses present the correct/sufficient comparators have also been specified """ - mutation_burden = "mutationBurden" - comparator_roles = {c["analysisRole"] for c in content.get("comparators", [])} + mutation_burden = 'mutationBurden' + comparator_roles = {c['analysisRole'] for c in content.get('comparators', [])} - for image in content.get("images", []): - key = image["key"] + for image in content.get('images', []): + key = image['key'] if key.startswith(mutation_burden): - comp_type = key.split(".")[-1] - role = f"mutation burden ({comp_type})" + comp_type = key.split('.')[-1] + role = f'mutation burden ({comp_type})' if role in comparator_roles: continue - if "_sv." in key: - sv_role = f"mutation burden SV ({comp_type})" + if '_sv.' in key: + sv_role = f'mutation burden SV ({comp_type})' if sv_role in comparator_roles: continue - raise ValueError(f"missing required comparator definition ({role})") + raise ValueError(f'missing required comparator definition ({role})') if expresssionVariants: - required_comparators = {"expression (disease)"} + required_comparators = {'expression (disease)'} def all_none(row: IprExprVariant, columns: List[str]) -> bool: - return all([row.get(col) is None or row.get(col) == "" for col in columns]) + return all([row.get(col) is None or row.get(col) == '' for col in columns]) for exp in expresssionVariants: if not all_none( exp, [ - "primarySitekIQR", - "primarySitePercentile", - "primarySiteZScore", - "primarySiteFoldChange", + 'primarySitekIQR', + 'primarySitePercentile', + 'primarySiteZScore', + 'primarySiteFoldChange', ], ): - required_comparators.add("expression (primary site)") + required_comparators.add('expression (primary site)') if not all_none( exp, [ - "biopsySitekIQR", - "biopsySitePercentile", - "biopsySiteZScore", - "biopsySiteFoldChange", + 'biopsySitekIQR', + 'biopsySitePercentile', + 'biopsySiteZScore', + 'biopsySiteFoldChange', ], ): - required_comparators.add("expression (biopsy site)") + required_comparators.add('expression (biopsy site)') if not all_none( exp, [ - "internalPancancerkIQR", - "internalPancancerPercentile", - "internalPancancerZScore", - "internalPancancerFoldChange", + 'internalPancancerkIQR', + 'internalPancancerPercentile', + 'internalPancancerZScore', + 'internalPancancerFoldChange', ], ): - required_comparators.add("expression (internal pancancer cohort)") + required_comparators.add('expression (internal pancancer cohort)') if required_comparators - comparator_roles: - missing = "; ".join(sorted(list(required_comparators - comparator_roles))) - raise ValueError(f"missing required comparator definitions ({missing})") + missing = '; '.join(sorted(list(required_comparators - comparator_roles))) + raise ValueError(f'missing required comparator 
definitions ({missing})') def extend_with_default(validator_class): # https://python-jsonschema.readthedocs.io/en/latest/faq/#why-doesn-t-my-schema-s-default-property-set-the-default-on-my-instance - validate_properties = validator_class.VALIDATORS["properties"] + validate_properties = validator_class.VALIDATORS['properties'] def set_defaults(validator, properties, instance, schema): for property, subschema in properties.items(): - if "default" in subschema: - instance.setdefault(property, subschema["default"]) + if 'default' in subschema: + instance.setdefault(property, subschema['default']) for error in validate_properties(validator, properties, instance, schema): yield error def check_null(checker, instance): return ( - validator_class.TYPE_CHECKER.is_type(instance, "null") + validator_class.TYPE_CHECKER.is_type(instance, 'null') or pd.isnull(instance) - or instance == "" + or instance == '' ) - type_checker = validator_class.TYPE_CHECKER.redefine("null", check_null) + type_checker = validator_class.TYPE_CHECKER.redefine('null', check_null) return jsonschema.validators.extend( validator_class, - validators={"properties": set_defaults}, + validators={'properties': set_defaults}, type_checker=type_checker, ) @@ -758,7 +772,7 @@ def validate_report_content(content: Dict, schema_file: str = SPECIFICATION) -> Adds defaults as recommended by: https://python-jsonschema.readthedocs.io/en/latest/faq/#why-doesn-t-my-schema-s-default-property-set-the-default-on-my-instance """ - with open(schema_file, "r") as fh: + with open(schema_file, 'r') as fh: schema = json.load(fh) return DefaultValidatingDraft7Validator(schema).validate(content) diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py index 8487dc1d..06d9efbd 100644 --- a/pori_python/ipr/ipr.py +++ b/pori_python/ipr/ipr.py @@ -35,12 +35,12 @@ def display_evidence_levels(statement: Statement) -> str: result = [] - for evidence_level in statement.get("evidenceLevel", []) or []: + for evidence_level in statement.get('evidenceLevel', []) or []: if isinstance(evidence_level, str): result.append(evidence_level) - elif "displayName" in evidence_level: - result.append(evidence_level["displayName"]) - return ";".join(sorted(result)) + elif 'displayName' in evidence_level: + result.append(evidence_level['displayName']) + return ';'.join(sorted(result)) def filter_structural_variants( @@ -52,9 +52,9 @@ Filter structural variants to remove non-high quality events unless they are matched/annotated or they involve a gene that is a known fusion partner """ - matched_svs = {match["variant"] for match in kb_matches if match["variantType"] == "sv"} + matched_svs = {match['variant'] for match in kb_matches if match['variantType'] == 'sv'} fusion_genes = { - gene["name"] for gene in gene_annotations if gene.get("knownFusionPartner", False) + gene['name'] for gene in gene_annotations if gene.get('knownFusionPartner', False) } result = [] for structural_variant in structural_variants: if any( [ - structural_variant["highQuality"], - structural_variant["key"] in matched_svs, - structural_variant["gene1"] in fusion_genes, - structural_variant["gene2"] in fusion_genes, + structural_variant['highQuality'], + structural_variant['key'] in matched_svs, + structural_variant['gene1'] in fusion_genes, + structural_variant['gene2'] in fusion_genes, ] ): result.append(structural_variant) @@ -83,22 +83,22 @@ def get_evidencelevel_mapping(graphkb_conn: GraphKBConnection) -> Dict[str,
str] """ # Get all EvidenceLevel from GraphKB # Note: not specifying any returnProperties allows for retrieving in/out_CrossReferenceOf - evidence_levels = graphkb_conn.query({"target": "EvidenceLevel"}) + evidence_levels = graphkb_conn.query({'target': 'EvidenceLevel'}) # Map EvidenceLevel RIDs to list of incoming CrossReferenceOf evidence_levels_mapping = dict( - map(lambda d: (d["@rid"], d.get("in_CrossReferenceOf", [])), evidence_levels) + map(lambda d: (d['@rid'], d.get('in_CrossReferenceOf', [])), evidence_levels) ) # Filter IPR EvidenceLevel and map each outgoing CrossReferenceOf to displayName - ipr_source_rid = graphkb_conn.get_source("ipr")["@rid"] - ipr_evidence_levels = filter(lambda d: d.get("source") == ipr_source_rid, evidence_levels) + ipr_source_rid = graphkb_conn.get_source('ipr')['@rid'] + ipr_evidence_levels = filter(lambda d: d.get('source') == ipr_source_rid, evidence_levels) cross_references_mapping: Dict[str, str] = dict() ipr_rids_to_displayname: Dict[str, str] = dict() for level in ipr_evidence_levels: - d = map(lambda i: (i, level["displayName"]), level.get("out_CrossReferenceOf", [])) # type: ignore + d = map(lambda i: (i, level['displayName']), level.get('out_CrossReferenceOf', [])) # type: ignore cross_references_mapping.update(d) - ipr_rids_to_displayname[level["@rid"]] = level["displayName"] # type: ignore + ipr_rids_to_displayname[level['@rid']] = level['displayName'] # type: ignore # Update EvidenceLevel mapping to corresponding IPR EvidenceLevel displayName def link_refs(refs) -> Tuple[str, str]: @@ -107,10 +107,10 @@ def link_refs(refs) -> Tuple[str, str]: return (refs[0], cross_references_mapping[rid]) if refs[0] in ipr_rids_to_displayname: # self-referencing IPR levels return (refs[0], ipr_rids_to_displayname[refs[0]]) - return (refs[0], "") + return (refs[0], '') evidence_levels_mapping = dict(map(link_refs, evidence_levels_mapping.items())) - evidence_levels_mapping[""] = "" + evidence_levels_mapping[''] = '' return evidence_levels_mapping # type: ignore @@ -142,11 +142,11 @@ def convert_statements_to_alterations( rows = [] ev_map = get_evidencelevel_mapping(graphkb_conn) # GERO-318 - add all IPR-A evidence equivalents to the approvedTherapy flag - approved = set([ev for (ev, ipr) in ev_map.items() if ipr == "IPR-A"]) + approved = set([ev for (ev, ipr) in ev_map.items() if ipr == 'IPR-A']) # get the recruitment status for any trial associated with a statement clinical_trials = [ - s["subject"]["@rid"] for s in statements if s["subject"]["@class"] == "ClinicalTrial" + s['subject']['@rid'] for s in statements if s['subject']['@class'] == 'ClinicalTrial' ] recruitment_statuses = {} if clinical_trials: @@ -154,76 +154,79 @@ for rid in clinical_trials: query_result = graphkb_conn.query( { - "target": {"target": "ClinicalTrial", "filters": {"@rid": rid}}, - "returnProperties": ["@rid", "recruitmentStatus"], + 'target': {'target': 'ClinicalTrial', 'filters': {'@rid': rid}}, + 'returnProperties': ['@rid', 'recruitmentStatus'], } ) if query_result: - recruitment_statuses[rid] = query_result[0]["recruitmentStatus"] # type: ignore + recruitment_statuses[rid] = query_result[0]['recruitmentStatus'] # type: ignore for statement in statements: variants = [ - cast(Variant, c) for c in statement["conditions"] if c["@class"] in VARIANT_CLASSES + cast(Variant, c) for c in statement['conditions'] if c['@class'] in VARIANT_CLASSES ] - diseases = [c for c in statement["conditions"] if c["@class"] == "Disease"] - disease_match =
len(diseases) == 1 and diseases[0]["@rid"] in disease_matches - pmid = ";".join([e["displayName"] for e in statement["evidence"]]) + diseases = [c for c in statement['conditions'] if c['@class'] == 'Disease'] + disease_match = len(diseases) == 1 and diseases[0]['@rid'] in disease_matches + reference = ';'.join([e['displayName'] for e in statement['evidence']]) + + if statement['relevance']['name'] == 'eligibility': + reference = ';'.join([e['sourceId'] for e in statement['evidence']]) ipr_section = gkb_statement.categorize_relevance( - graphkb_conn, statement["relevance"]["@rid"] + graphkb_conn, statement['relevance']['@rid'] ) approved_therapy = False - if ipr_section == "therapeutic": - for level in statement["evidenceLevel"] or []: - if level["@rid"] in approved: + if ipr_section == 'therapeutic': + for level in statement['evidenceLevel'] or []: + if level['@rid'] in approved: approved_therapy = True break - if ipr_section == "prognostic" and not disease_match: + if ipr_section == 'prognostic' and not disease_match: continue # GERO-72 / GERO-196 evidence_level_str = display_evidence_levels(statement) - evidence_levels = statement.get("evidenceLevel") or [] - ipr_evidence_levels = [ev_map[el.get("@rid", "")] for el in evidence_levels if el] - ipr_evidence_levels_str = ";".join(sorted(set([el for el in ipr_evidence_levels]))) + evidence_levels = statement.get('evidenceLevel') or [] + ipr_evidence_levels = [ev_map[el.get('@rid', '')] for el in evidence_levels if el] + ipr_evidence_levels_str = ';'.join(sorted(set([el for el in ipr_evidence_levels]))) for variant in variants: - if variant["@rid"] not in variant_matches: + if variant['@rid'] not in variant_matches: continue row = KbMatch( { - "approvedTherapy": approved_therapy or False, - "category": ipr_section or "unknown", - "context": ( - statement["subject"]["displayName"] if statement["subject"] else "" + 'approvedTherapy': approved_therapy or False, + 'category': ipr_section or 'unknown', + 'context': ( + statement['subject']['displayName'] if statement['subject'] else '' ), - "kbContextId": (statement["subject"]["@rid"] if statement["subject"] else ""), - "disease": ";".join(sorted(d.get("displayName", "") for d in diseases)), - "evidenceLevel": evidence_level_str or "", - "iprEvidenceLevel": ipr_evidence_levels_str or "", - "kbStatementId": statement["@rid"], - "kbVariant": str(variant.get("displayName", "")) or "", - "variant": str(variant.get("displayName", "")) or "", - "variantType": "", - "kbVariantId": variant["@rid"], - "matchedCancer": disease_match, - "reference": pmid, - "relevance": statement["relevance"]["displayName"], - "kbRelevanceId": statement["relevance"]["@rid"], - "externalSource": ( - str(statement["source"].get("displayName", "")) - if statement["source"] - else "" + 'kbContextId': (statement['subject']['@rid'] if statement['subject'] else ''), + 'disease': ';'.join(sorted(d.get('displayName', '') for d in diseases)), + 'evidenceLevel': evidence_level_str or '', + 'iprEvidenceLevel': ipr_evidence_levels_str or '', + 'kbStatementId': statement['@rid'], + 'kbVariant': str(variant.get('displayName', '')) or '', + 'variant': str(variant.get('displayName', '')) or '', + 'variantType': '', + 'kbVariantId': variant['@rid'], + 'matchedCancer': disease_match, + 'reference': reference, + 'relevance': statement['relevance']['displayName'], + 'kbRelevanceId': statement['relevance']['@rid'], + 'externalSource': ( + str(statement['source'].get('displayName', '')) + if statement['source'] + else '' ), - "requiredKbMatches": 
[item["@rid"] for item in variants], - "externalStatementId": statement.get("sourceId", "") or "", - "reviewStatus": statement.get("reviewStatus", "") or "", - "kbData": {}, + 'requiredKbMatches': [item['@rid'] for item in variants], + 'externalStatementId': statement.get('sourceId', '') or '', + 'reviewStatus': statement.get('reviewStatus', '') or '', + 'kbData': {}, } ) - if statement["relevance"]["name"] == "eligibility": - row["kbData"]["recruitment_status"] = recruitment_statuses.get( - row["kbContextId"], "not found" + if statement['relevance']['name'] == 'eligibility': + row['kbData']['recruitment_status'] = recruitment_statuses.get( + row['kbContextId'], 'not found' ) rows.append(row) return rows @@ -246,83 +249,99 @@ def select_expression_plots( """ selected_variants = { - (match["variantType"], match["variant"]) + (match['variantType'], match['variant']) for match in kb_matches - if match["category"] == "therapeutic" + if match['category'] == 'therapeutic' } images_by_gene: Dict[str, ImageDefinition] = {} selected_genes = set() for variant in all_variants: - if (variant["variantType"], variant["key"]) in selected_variants: - for key in ["gene", "gene1", "gene2"]: + if (variant['variantType'], variant['key']) in selected_variants: + for key in ['gene', 'gene1', 'gene2']: gene = variant.get(key) if gene: selected_genes.add(str(gene)) - gene = str(variant.get("gene", "")) - hist = str(variant.get("histogramImage", "")) + gene = str(variant.get('gene', '')) + hist = str(variant.get('histogramImage', '')) if hist: - images_by_gene[gene] = ImageDefinition({"key": f"expDensity.{gene}", "path": hist}) + images_by_gene[gene] = ImageDefinition({'key': f'expDensity.{gene}', 'path': hist}) return [images_by_gene[gene] for gene in selected_genes if gene in images_by_gene] def create_key_alterations( - kb_matches: List[Hashabledict], all_variants: Sequence[IprVariant] + kb_matches: List[Hashabledict], + all_variants: Sequence[IprVariant], + included_kb_matches: List[KbVariantMatch], ) -> Tuple[List[Dict], Dict]: """Create the list of significant variants matched by the KB. This list of matches is also used to create the variant counts. 
+ + kb_matches: the full list of matched kb objects found for the reported variants + all_variants: the full list of all reported variants, matched or unmatched + included_kb_matches: the kb matches whose kbVariantIds are allowed in the key alterations table; + this is all kb_variants if partially matched statements are allowed, or + the subset of kb_variants that are conditions for at least one + fully satisfied statement condition set, if partially matched statements + are not allowed (i.e., kb_variants that are not part of any fully satisfied + statement condition set are excluded) """ alterations = [] type_mapping = { - "mut": "smallMutations", - "cnv": "CNVs", - "sv": "SVs", - "exp": "expressionOutliers", + 'mut': 'smallMutations', + 'cnv': 'CNVs', + 'sv': 'SVs', + 'exp': 'expressionOutliers', } counts: Dict[str, Set] = {v: set() for v in type_mapping.values()} skipped_variant_types = [] + + included_kbvariant_ids = list(set([item['kbVariantId'] for item in included_kb_matches])) + for kb_match in kb_matches: - variant_type = kb_match["variantType"] - variant_key = kb_match["variant"] - if kb_match["category"] == "unknown": + if kb_match['kbVariantId'] not in included_kbvariant_ids: + continue + variant_type = kb_match['variantType'] + variant_key = kb_match['variant'] + if kb_match['category'] == 'unknown': continue if variant_type not in type_mapping.keys(): if variant_type not in skipped_variant_types: skipped_variant_types.append(variant_type) logger.warning( - f"No summary key alterations for {variant_type}. Skipping {variant_key}" + f'No summary key alterations for {variant_type}. Skipping {variant_key}' ) continue try: variant = find_variant(all_variants, variant_type, variant_key) except KeyError as err: logger.error(err) - logger.error(f"No variant match found for {variant_key}") + logger.error(f'No variant match found for {variant_key}') continue counts[type_mapping[variant_type]].add(variant_key) - if variant_type == "exp": - alterations.append(f'{variant.get("gene","")} ({variant.get("expressionState")})') - elif variant_type == "cnv": - alterations.append(f'{variant.get("gene","")} ({variant.get("cnvState")})') + if variant_type == 'exp': + alterations.append(f'{variant.get("gene", "")} ({variant.get("expressionState")})') + elif variant_type == 'cnv': + alterations.append(f'{variant.get("gene", "")} ({variant.get("cnvState")})') # only show germline if relevant - elif kb_match["category"] in GERMLINE_BASE_TERMS and variant.get("germline"): - alterations.append(f"germline {variant['variant']}") + elif kb_match['category'] in GERMLINE_BASE_TERMS and variant.get('germline'): + alterations.append(f'germline {variant["variant"]}') else: - alterations.append(variant["variant"]) + alterations.append(variant['variant']) counted_variants = set.union(*counts.values()) - counts["variantsUnknown"] = set() + counts['variantsUnknown'] = set() # count the un-matched variants for variant in all_variants: - if variant["variant"] and variant["key"] not in counted_variants: - counts["variantsUnknown"].add(variant["key"]) + if variant['variant'] and variant['key'] not in counted_variants: + counts['variantsUnknown'].add(variant['key']) return ( - [{"geneVariant": alt} for alt in set(alterations)], + [{'geneVariant': alt} for alt in set(alterations)], {k: len(v) for k, v in counts.items()}, ) @@ -347,44 +366,44 @@ def germline_kb_matches( filtered list of kb_matches """ ret_list = [] - germ_alts = [alt for alt in kb_matches if alt["category"] in GERMLINE_BASE_TERMS] + germ_alts = [alt for alt in
kb_matches if alt['category'] in GERMLINE_BASE_TERMS] somatic_alts = [alt for alt in kb_matches if alt not in germ_alts] if germ_alts: - logger.info(f"checking germline status of {GERMLINE_BASE_TERMS}") + logger.info(f'checking germline status of {GERMLINE_BASE_TERMS}') for alt in germ_alts: - var_list = [v for v in all_variants if v["key"] == alt["variant"]] - germline_var_list = [v for v in var_list if v.get("germline")] - unknown_var_list = [v for v in var_list if "germline" not in v] + var_list = [v for v in all_variants if v['key'] == alt['variant']] + germline_var_list = [v for v in var_list if v.get('germline')] + unknown_var_list = [v for v in var_list if 'germline' not in v] if germline_var_list: logger.debug( - f"germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'germline kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) ret_list.append(alt) elif unknown_var_list: logger.warning( - f"germline no data fail for: {alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'germline no data fail for: {alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) if not assume_somatic: logger.debug( - f"Keeping unverified match to germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'Keeping unverified match to germline kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) ret_list.append(alt) else: logger.debug( - f"Dropping unverified match to germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'Dropping unverified match to germline kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) else: logger.debug( - f"Dropping somatic match to germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'Dropping somatic match to germline kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) if somatic_alts: # Remove any matches to germline events for alt in somatic_alts: - var_list = [v for v in all_variants if v["key"] == alt["variant"]] - somatic_var_list = [v for v in var_list if not v.get("germline", not assume_somatic)] + var_list = [v for v in all_variants if v['key'] == alt['variant']] + somatic_var_list = [v for v in var_list if not v.get('germline', not assume_somatic)] if var_list and not somatic_var_list: logger.debug( - f"Dropping germline match to somatic statement kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" + f'Dropping germline match to somatic statement kbStatementId:{alt["kbStatementId"]}: {alt["kbVariant"]} {alt["category"]}' ) elif somatic_var_list: ret_list.append(alt) # match to somatic variant @@ -397,7 +416,7 @@ def germline_kb_matches( def multi_variant_filtering( graphkb_conn: GraphKBConnection, gkb_matches: List[KbMatch], - excludedTypes: List[str] = ["wildtype"], + excludedTypes: List[str] = ['wildtype'], ) -> List[KbMatch]: """Filters out GraphKB matches that don't match all required variants on multi-variant statements @@ -417,42 +436,42 @@ def multi_variant_filtering( filtered list of KbMatch statements """ # All matching statements & variants (GKB RIDs) - matching_statement_rids = {match["kbStatementId"] for match in gkb_matches} - matching_variant_rids = {match["kbVariantId"] for match in gkb_matches} + matching_statement_rids = {match['kbStatementId'] for match in gkb_matches} + matching_variant_rids = {match['kbVariantId'] for match in gkb_matches} # Get conditions detail on all
matching statements res = graphkb_conn.post( - uri="query", + uri='query', data={ - "target": "Statement", - "filters": { - "@rid": list(matching_statement_rids), - "operator": "IN", + 'target': 'Statement', + 'filters': { + '@rid': list(matching_statement_rids), + 'operator': 'IN', }, - "history": True, - "returnProperties": [ - "@rid", - "conditions.@rid", - "conditions.@class", - "conditions.type", + 'history': True, + 'returnProperties': [ + '@rid', + 'conditions.@rid', + 'conditions.@class', + 'conditions.type', ], }, ) - statements = res["result"] + statements = res['result'] # Get set of excluded Vocabulary RIDs for variant types excluded = {} - if len(excludedTypes) != 0 and excludedTypes[0] != "": + if len(excludedTypes) != 0 and excludedTypes[0] != '': excluded = gkb_vocab.get_terms_set(graphkb_conn, excludedTypes) # Mapping statements to their conditional variants # (discarding non-variant conditions & variant conditions from excluded types) statement_to_variants = {} for statement in statements: - statement_to_variants[statement["@rid"]] = { - el["@rid"] - for el in statement["conditions"] - if (el["@class"] in VARIANT_CLASSES and el.get("type", "") not in excluded) + statement_to_variants[statement['@rid']] = { + el['@rid'] + for el in statement['conditions'] + if (el['@class'] in VARIANT_CLASSES and el.get('type', '') not in excluded) } # Set of statements with complete matching @@ -464,7 +483,7 @@ def multi_variant_filtering( # Filtering out incomplete matches of gkb_matches return [ - match for match in gkb_matches if match["kbStatementId"] in complete_matching_statements + match for match in gkb_matches if match['kbStatementId'] in complete_matching_statements ] @@ -483,10 +502,10 @@ def get_kb_variants( for item in gkb_matches: kbv = KbVariantMatch( { - "kbVariant": item["kbVariant"], - "variant": item["variant"], - "variantType": item["variantType"], - "kbVariantId": item["kbVariantId"], + 'kbVariant': item['kbVariant'], + 'variant': item['variant'], + 'variantType': item['variantType'], + 'kbVariantId': item['kbVariantId'], } ) kbVariants[str(kbv)] = kbv @@ -509,7 +528,7 @@ def get_kb_matched_statements( kbs_keys = KbMatchedStatement.__annotations__.keys() for item in gkb_matches: stmt = copy(item) - stmt["requiredKbMatches"].sort() + stmt['requiredKbMatches'].sort() kbs = KbMatchedStatement({key: val for (key, val) in stmt.items() if key in kbs_keys}) dict_key = str(kbs) kbMatchedStatements[dict_key] = kbs @@ -568,20 +587,20 @@ def get_kb_statement_matched_conditions( kbMatchedStatementConditions = {} for kbStmt in kbMatchedStatements: - stmts = [item for item in gkb_matches if item["kbStatementId"] == kbStmt["kbStatementId"]] + stmts = [item for item in gkb_matches if item['kbStatementId'] == kbStmt['kbStatementId']] requirements = {} - for requirement in stmts[0]["requiredKbMatches"]: + for requirement in stmts[0]['requiredKbMatches']: if not requirements.get(requirement, False): # only use explicit variant/statement links reqlist = [ { - "kbVariantId": requirement, - "observedVariantKey": item["variant"], + 'kbVariantId': requirement, + 'observedVariantKey': item['variant'], } for item in gkb_matches if ( - item["kbVariantId"] == requirement - and item["kbStatementId"] == kbStmt["kbStatementId"] + item['kbVariantId'] == requirement + and item['kbStatementId'] == kbStmt['kbStatementId'] ) ] requirements[requirement] = reqlist @@ -592,18 +611,18 @@ def get_kb_statement_matched_conditions( variantConditionSets = list(product(*requirements.values())) conditionSets = [
- {"kbStatementId": kbStmt["kbStatementId"], "matchedConditions": item} + {'kbStatementId': kbStmt['kbStatementId'], 'matchedConditions': item} for item in variantConditionSets ] for conditionSet in conditionSets: matchedConditions = sorted( - conditionSet["matchedConditions"], - key=lambda x: (x["kbVariantId"], x["observedVariantKey"]), + conditionSet['matchedConditions'], + key=lambda x: (x['kbVariantId'], x['observedVariantKey']), ) kbmc = KbMatchedStatementConditionSet( { - "kbStatementId": conditionSet["kbStatementId"], - "matchedConditions": matchedConditions, + 'kbStatementId': conditionSet['kbStatementId'], + 'matchedConditions': matchedConditions, } ) key = str( @@ -622,10 +641,24 @@ def get_kb_matches_sections( kb_statement_matched_conditions = get_kb_statement_matched_conditions( gkb_matches, allow_partial_matches ) + + if not allow_partial_matches: + # remove kb_matches that are not part of any fully matched condition set + unique_kb_variant_ids = list( + set( + [ + item['kbVariantId'] + for conditionSet in kb_statement_matched_conditions + for item in conditionSet['matchedConditions'] + ] + ) + ) + kb_variants = [item for item in kb_variants if item['kbVariantId'] in unique_kb_variant_ids] + return { - "kbMatches": kb_variants, - "kbMatchedStatements": kb_matched_statements, - "kbStatementMatchedConditions": kb_statement_matched_conditions, + 'kbMatches': kb_variants, + 'kbMatchedStatements': kb_matched_statements, + 'kbStatementMatchedConditions': kb_statement_matched_conditions, } @@ -634,12 +667,12 @@ def get_kb_disease_matches( kb_disease_match: Optional[str] = None, verbose: bool = True, useSubgraphsRoute: bool = True, -) -> list[str]: +) -> list[Dict]: disease_matches = [] if not kb_disease_match: - kb_disease_match = "cancer" + kb_disease_match = 'cancer' if verbose: logger.warning(f"No disease provided; will use '{kb_disease_match}'") @@ -657,20 +690,20 @@ def get_kb_disease_matches( base_records = gkb_util.convert_to_rid_list( graphkb_conn.query( gkb_vocab.query_by_name( - "Disease", + 'Disease', kb_disease_match, ) ) ) if base_records: response = graphkb_conn.post( - "/subgraphs/Disease", + '/subgraphs/Disease', { - "subgraphType": "tree", - "base": base_records, + 'subgraphType': 'tree', + 'base': base_records, }, ) - disease_matches = list(response["result"]["g"]["nodes"].keys()) + disease_matches = list(response['result']['g']['nodes'].values()) except Exception: if verbose: @@ -681,20 +714,15 @@ def get_kb_disease_matches( # Traversal depth is limited if not useSubgraphsRoute: if verbose: - logger.info(f"Matching disease ({kb_disease_match}) to graphkb using get_term_tree()") - disease_matches = list( - { - r["@rid"] - for r in gkb_vocab.get_term_tree( - graphkb_conn, - kb_disease_match, - ontology_class="Disease", - ) - } + logger.info(f'Matching disease ({kb_disease_match}) to graphkb using get_term_tree()') + disease_matches = gkb_vocab.get_term_tree( + graphkb_conn, + kb_disease_match, + ontology_class='Disease', ) if not disease_matches: - msg = f"failed to match disease ({kb_disease_match}) to graphkb" + msg = f'failed to match disease ({kb_disease_match}) to graphkb' if verbose: logger.error(msg) raise ValueError(msg) diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py index 63a028fe..cbb7c128 100644 --- a/pori_python/ipr/main.py +++ b/pori_python/ipr/main.py @@ -23,7 +23,7 @@ from .annotate import annotate_variants from .connection import IprConnection -from .constants import DEFAULT_URL, TMB_SIGNATURE_HIGH_THRESHOLD +from .constants import 
TMB_SIGNATURE_HIGH_THRESHOLD from .inputs import ( check_comparators, check_variant_links, @@ -32,6 +32,7 @@ preprocess_expression_variants, preprocess_hla, preprocess_msi, + preprocess_hrd, preprocess_signature_variants, preprocess_small_mutations, preprocess_structural_variants, @@ -53,19 +54,19 @@ CACHE_GENE_MINIMUM = 5000 RENAMED_GENE_PROPERTIES = { # old_name: new_name - "cancerRelated": "kbStatementRelated", - "cancerGene": "cancerGeneListMatch", + 'cancerRelated': 'kbStatementRelated', + 'cancerGene': 'cancerGeneListMatch', } def file_path(path: str) -> str: if not os.path.exists(path): - raise argparse.ArgumentTypeError(f"{repr(path)} is not a valid filename. does not exist") + raise argparse.ArgumentTypeError(f'{repr(path)} is not a valid filename. does not exist') return path def timestamp() -> str: - return datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") + return datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S') def command_interface() -> None: @@ -73,92 +74,92 @@ def command_interface() -> None: Parsed arguments are used to call the ipr_report() function. """ parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) - req = parser.add_argument_group("required arguments") - (req if not os.environ.get("USER") else parser).add_argument( - "--username", - required=not os.environ.get("USER"), - default=os.environ.get("USER"), - help="username to use connecting to graphkb/ipr", + req = parser.add_argument_group('required arguments') + (req if not os.environ.get('USER') else parser).add_argument( + '--username', + required=not os.environ.get('USER'), + default=os.environ.get('USER'), + help='username to use connecting to graphkb/ipr', ) - req.add_argument("--password", required=True, help="password to use connecting to graphkb/ipr") + req.add_argument('--password', required=True, help='password to use connecting to graphkb/ipr') req.add_argument( - "-c", "--content", required=True, type=file_path, help="Report Content as JSON" + '-c', '--content', required=True, type=file_path, help='Report Content as JSON' ) - parser.add_argument("--ipr_url", default=os.environ.get("IPR_URL", DEFAULT_URL)) + parser.add_argument('--ipr_url', default=os.environ.get('IPR_URL')) parser.add_argument( - "--graphkb_username", - help="username to use connecting to graphkb if different from ipr", + '--graphkb_username', + help='username to use connecting to graphkb if different from ipr', ) parser.add_argument( - "--graphkb_password", - help="password to use connecting to graphkb if different from ipr", + '--graphkb_password', + help='password to use connecting to graphkb if different from ipr', ) - parser.add_argument("--graphkb_url", default=os.environ.get("GRAPHKB_URL", None)) - parser.add_argument("--log_level", default="info", choices=LOG_LEVELS.keys()) + parser.add_argument('--graphkb_url', default=os.environ.get('GRAPHKB_URL', None)) + parser.add_argument('--log_level', default='info', choices=LOG_LEVELS.keys()) parser.add_argument( - "--therapeutics", + '--therapeutics', default=False, - help="Generate therapeutic options", - action="store_true", + help='Generate therapeutic options', + action='store_true', ) parser.add_argument( - "--skip_comments", + '--skip_comments', default=False, - action="store_true", - help="Turn off generating the analyst comments section of the report", + action='store_true', + help='Turn off generating the analyst comments section of the report', ) parser.add_argument( - "-o", - "--output_json_path", - default=f"pori_python_report_{timestamp()}.json", 
- help="path to a JSON to output the report upload body", + '-o', + '--output_json_path', + default=f'pori_python_report_{timestamp()}.json', + help='path to a JSON to output the report upload body', ) parser.add_argument( - "-w", - "--always_write_output_json", - action="store_true", - help="Write to output_json_path on successful IPR uploads instead of just when the upload fails", + '-w', + '--always_write_output_json', + action='store_true', + help='Write to output_json_path on successful IPR uploads instead of just when the upload fails', ) parser.add_argument( - "--async_upload", + '--async_upload', default=False, - action="store_true", - help="True if reports-async ipr endpoint should be used instead of basic reports", + action='store_true', + help='True if reports-async ipr endpoint should be used instead of basic reports', ) parser.add_argument( - "--mins_to_wait", + '--mins_to_wait', default=5, - action="store", - help="is using reports-async, number of minutes to wait before throwing error", + action='store', + help='is using reports-async, number of minutes to wait before throwing error', ) parser.add_argument( - "--allow_partial_matches", + '--allow_partial_matches', default=False, - action="store_true", - help="True to include matches to multivariant statements where not all variants are present", + action='store_true', + help='True to include matches to multivariant statements where not all variants are present', ) parser.add_argument( - "--upload_json", + '--upload_json', default=False, - action="store_true", - help="True to skip all the preprocessing and just submit a json to ipr", + action='store_true', + help='True to skip all the preprocessing and just submit a json to ipr', ) parser.add_argument( - "--validate_json", + '--validate_json', default=False, - action="store_true", - help="True if only need to validate the json", + action='store_true', + help='True if only need to validate the json', ) parser.add_argument( - "--ignore_extra_fields", + '--ignore_extra_fields', default=False, - action="store_true", - help="True if ignore extra fields in json", + action='store_true', + help='True if ignore extra fields in json', ) args = parser.parse_args() - with open(args.content, "r") as fh: + with open(args.content, 'r') as fh: content = json.load(fh) ipr_report( @@ -191,38 +192,38 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict """ if ( ipr_spec - and "components" in ipr_spec.keys() - and "schemas" in ipr_spec["components"].keys() - and "genesCreate" in ipr_spec["components"]["schemas"].keys() - and "properties" in ipr_spec["components"]["schemas"]["genesCreate"].keys() + and 'components' in ipr_spec.keys() + and 'schemas' in ipr_spec['components'].keys() + and 'genesCreate' in ipr_spec['components']['schemas'].keys() + and 'properties' in ipr_spec['components']['schemas']['genesCreate'].keys() ): - genes_spec = ipr_spec["components"]["schemas"]["genesCreate"]["properties"].keys() + genes_spec = ipr_spec['components']['schemas']['genesCreate']['properties'].keys() # check what ipr report upload expects and adjust contents to match for old_name, new_name in RENAMED_GENE_PROPERTIES.items(): if old_name in genes_spec: logger.warning( - f"Legacy IPR - Renaming property {new_name} to {old_name} for compatibility to ipr_spec" + f'Legacy IPR - Renaming property {new_name} to {old_name} for compatibility to ipr_spec' ) - for gene in upload_content["genes"]: + for gene in upload_content['genes']: if new_name in gene: gene[old_name] = gene[new_name] 
gene.pop(new_name) else: outdate_properties = 0 - for gene in upload_content["genes"]: + for gene in upload_content['genes']: if old_name in gene: gene[new_name] = gene[old_name] gene.pop(old_name) outdate_properties += 1 if outdate_properties: logger.warning( - f"Renamed property {old_name} to {new_name} on {outdate_properties} genes for ipr_spec" + f'Renamed property {old_name} to {new_name} on {outdate_properties} genes for ipr_spec' ) # remove any unhandled incompatible keys removed_keys: Dict[str, int] = {} - for gene in upload_content["genes"]: + for gene in upload_content['genes']: unsupported_keys = [key for key in gene.keys() if key not in genes_spec] for key in unsupported_keys: if key in removed_keys: @@ -233,23 +234,23 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict for key, count in removed_keys.items(): logger.warning(f"IPR unsupported property '{key}' removed from {count} genes.") - drop_columns = ["variant", "variantType", "histogramImage"] + drop_columns = ['variant', 'variantType', 'histogramImage'] # DEVSU-2034 - use a 'displayName' VARIANT_LIST_KEYS = [ - "expressionVariants", - "smallMutations", - "copyVariants", - "structuralVariants", - "probeResults", - "signatureVariants", + 'expressionVariants', + 'smallMutations', + 'copyVariants', + 'structuralVariants', + 'probeResults', + 'signatureVariants', ] for variant_list_section in VARIANT_LIST_KEYS: for variant in upload_content.get(variant_list_section, []): - if not variant.get("displayName"): - variant["displayName"] = ( - variant.get("variant") or variant.get("kbCategory") or variant.get("key", "") + if not variant.get('displayName'): + variant['displayName'] = ( + variant.get('variant') or variant.get('kbCategory') or variant.get('key', '') ) - if variant_list_section == "probeResults": + if variant_list_section == 'probeResults': # currently probeResults will error if they do NOT have a 'variant' column. # smallMutations will error if they DO have a 'variant' column. continue @@ -257,29 +258,29 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict if col in variant: del variant[col] # tmburMutationBurden is a single value, not list - if upload_content.get("tmburMutationBurden"): - if not upload_content["tmburMutationBurden"].get("displayName"): - upload_content["tmburMutationBurden"]["displayName"] = upload_content[ - "tmburMutationBurden" - ].get("kbCategory", "") + if upload_content.get('tmburMutationBurden'): + if not upload_content['tmburMutationBurden'].get('displayName'): + upload_content['tmburMutationBurden']['displayName'] = upload_content[ + 'tmburMutationBurden' + ].get('kbCategory', '') # TODO: check this is still necessary - for row in upload_content["kbMatches"]: - if "kbContextId" in row: - del row["kbContextId"] - if "kbRelevanceId" in row: - del row["kbRelevanceId"] - if "requiredKbMatches" in row: - del row["requiredKbMatches"] - - for row in upload_content["kbMatchedStatements"]: - if "kbContextId" in row: - del row["kbContextId"] - if "kbRelevanceId" in row: - del row["kbRelevanceId"] + for row in upload_content['kbMatches']: + if 'kbContextId' in row: + del row['kbContextId'] + if 'kbRelevanceId' in row: + del row['kbRelevanceId'] + if 'requiredKbMatches' in row: + del row['requiredKbMatches'] + + for row in upload_content['kbMatchedStatements']: + if 'kbContextId' in row: + del row['kbContextId'] + if 'kbRelevanceId' in row: + del row['kbRelevanceId'] # Removing cosmicSignatures. 
Temporary - upload_content.pop("cosmicSignatures", None) + upload_content.pop('cosmicSignatures', None) return upload_content @@ -293,15 +294,15 @@ def ipr_report( username: str, password: str, content: Dict, - ipr_url: str = DEFAULT_URL, - log_level: str = "info", - output_json_path: str = "", + ipr_url: str = '', + log_level: str = 'info', + output_json_path: str = '', always_write_output_json: bool = False, ipr_upload: bool = True, interactive: bool = False, - graphkb_username: str = "", - graphkb_password: str = "", - graphkb_url: str = "", + graphkb_username: str = '', + graphkb_password: str = '', + graphkb_url: str = '', generate_therapeutics: bool = False, generate_comments: bool = True, match_germline: bool = False, @@ -323,7 +324,7 @@ def ipr_report( Args: username: the username for connecting to GraphKB and IPR password: the password for connecting to GraphKB and IPR - ipr_url: base URL to use in connecting to IPR + ipr_url: base URL to use in connecting to IPR (eg. https://ipr-api.bcgsc.ca/api) log_level: the logging level content: report content output_json_path: path to a JSON file to output the report upload body. @@ -352,18 +353,27 @@ def ipr_report( # set the default logging configuration logging.basicConfig( level=LOG_LEVELS[log_level], - format="%(asctime)s %(name)s %(levelname)s %(message)s", - datefmt="%m-%d-%y %H:%M:%S", + format='%(asctime)s %(name)s %(levelname)s %(message)s', + datefmt='%m-%d-%y %H:%M:%S', ) # IPR CONNECTION - ipr_conn = IprConnection(username, password, ipr_url) + ipr_url = ipr_url if ipr_url else os.environ.get('IPR_URL', '') + ipr_conn = None + if ipr_url: + ipr_conn = IprConnection(username, password, ipr_url) + else: + logger.warning('No ipr_url given') if validate_json: + if not ipr_conn: + raise ValueError('ipr_url required to validate json') ipr_result = ipr_conn.validate_json(content) return ipr_result if upload_json: + if not ipr_conn: + raise ValueError('ipr_url required to upload json') ipr_result = ipr_conn.upload_report( content, mins_to_wait, async_upload, ignore_extra_fields ) @@ -373,31 +383,32 @@ def ipr_report( try: validate_report_content(content) except jsonschema.exceptions.ValidationError as err: - logger.error("Failed schema check - report variants may be corrupted or unmatched.") - logger.error(f"Failed schema check: {err}") + logger.error('Failed schema check - report variants may be corrupted or unmatched.') + logger.error(f'Failed schema check: {err}') # INPUT VARIANTS VALIDATION & PREPROCESSING (OBSERVED BIOMARKERS) signature_variants: List[IprSignatureVariant] = preprocess_signature_variants( [ - *preprocess_cosmic(content.get("cosmicSignatures", [])), # includes dMMR - *preprocess_hla(content.get("hlaTypes", [])), + *preprocess_cosmic(content.get('cosmicSignatures', [])), # includes dMMR + *preprocess_hla(content.get('hlaTypes', [])), *preprocess_tmb( tmb_high, - content.get("tmburMutationBurden", {}), # old tmb pipeline - content.get("genomeTmb", ""), # newer tmb pipeline + content.get('tmburMutationBurden', {}), # old tmb pipeline + content.get('genomeTmb', ''), # newer tmb pipeline ), - *preprocess_msi(content.get("msi", None)), + *preprocess_msi(content.get('msi', None)), + *preprocess_hrd(content.get('hrd', None)), ] ) small_mutations: List[IprSmallMutationVariant] = preprocess_small_mutations( - content.get("smallMutations", []) + content.get('smallMutations', []) ) structural_variants: List[IprFusionVariant] = preprocess_structural_variants( - content.get("structuralVariants", []) + 
content.get('structuralVariants', []) ) - copy_variants: List[IprCopyVariant] = preprocess_copy_variants(content.get("copyVariants", [])) + copy_variants: List[IprCopyVariant] = preprocess_copy_variants(content.get('copyVariants', [])) expression_variants: List[IprExprVariant] = preprocess_expression_variants( - content.get("expressionVariants", []) + content.get('expressionVariants', []) ) # Additional checks if expression_variants: @@ -408,30 +419,29 @@ def ipr_report( ) # GKB CONNECTION - if graphkb_url: - logger.info(f"connecting to graphkb: {graphkb_url}") - graphkb_conn = GraphKBConnection(graphkb_url) - else: - graphkb_conn = GraphKBConnection() - - gkb_user = graphkb_username if graphkb_username else username - gkb_pass = graphkb_password if graphkb_password else password + graphkb_conn = GraphKBConnection(graphkb_url) if graphkb_url else GraphKBConnection() + logger.info(f'connecting to graphkb: {graphkb_conn.url}') - graphkb_conn.login(gkb_user, gkb_pass) + graphkb_conn.login( + graphkb_username if graphkb_username else username, + graphkb_password if graphkb_password else password, + ) # DISEASE # Disease term from bioapps; expected OncoTree term - kb_disease_match: str = content["kbDiseaseMatch"] + kb_disease_match: str = content['kbDiseaseMatch'] # Matching disease RIDs from GraphKB using term tree # (Will raise an uncaught error if no match) - disease_matches: list[str] = get_kb_disease_matches(graphkb_conn, kb_disease_match) + disease_match_records: list[Dict] = get_kb_disease_matches(graphkb_conn, kb_disease_match) + disease_match_rids: list[str] = [item['@rid'] for item in disease_match_records] + disease_match_names: list[str] = [item['name'] for item in disease_match_records] # GKB MATCHING (AKA ANNOTATION) gkb_matches: List[Hashabledict] = annotate_variants( graphkb_conn=graphkb_conn, interactive=interactive, - disease_matches=disease_matches, + disease_matches=disease_match_rids, # Variants, per type: signature_variants=signature_variants, small_mutations=small_mutations, @@ -458,53 +468,53 @@ def ipr_report( ] num_removed = org_len - len(gkb_matches) if num_removed: - logger.info(f"Removing {num_removed} germline events without medical matches.") + logger.info(f'Removing {num_removed} germline events without medical matches.') if custom_kb_match_filter: - logger.info(f"custom_kb_match_filter on {len(gkb_matches)} variants") + logger.info(f'custom_kb_match_filter on {len(gkb_matches)} variants') gkb_matches = [Hashabledict(match) for match in custom_kb_match_filter(gkb_matches)] - logger.info(f"\t custom_kb_match_filter left {len(gkb_matches)} variants") - - # KEY ALTERATIONS - key_alterations, variant_counts = create_key_alterations(gkb_matches, all_variants) + logger.info(f'\t custom_kb_match_filter left {len(gkb_matches)} variants') # GENE INFORMATION - logger.info("fetching gene annotations") + logger.info('fetching gene annotations') gene_information = get_gene_information(graphkb_conn, sorted(genes_with_variants)) # THERAPEUTIC OPTIONS if generate_therapeutics: - logger.info("generating therapeutic options") + logger.info('generating therapeutic options') targets = create_therapeutic_options(graphkb_conn, gkb_matches, all_variants) else: targets = [] # ANALYST COMMENTS - logger.info("generating analyst comments") + logger.info('generating analyst comments') comments_list = [] if generate_comments: graphkb_comments = auto_analyst_comments( graphkb_conn, gkb_matches, - disease_matches=set(disease_matches), + disease_matches=set(disease_match_rids),
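# comments are filtered by disease RIDs; the matched names are only needed later for the IPR variant-text lookup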
variants=all_variants, ) comments_list.append(graphkb_comments) if include_ipr_variant_text: + if not ipr_conn: + raise ValueError('ipr_url required to include ipr variant text') ipr_comments = get_ipr_analyst_comments( ipr_conn, gkb_matches, disease_name=kb_disease_match, - project_name=content["project"], - report_type=content["template"], + disease_match_names=disease_match_names, + project_name=content['project'], + report_type=content['template'], include_nonspecific_disease=include_nonspecific_disease, include_nonspecific_project=include_nonspecific_project, include_nonspecific_template=include_nonspecific_template, ) comments_list.append(ipr_comments) - comments = {"comments": "\n".join(comments_list)} + comments = {'comments': '\n'.join(comments_list)} # REFORMATTING KBMATCHES # kbMatches -> kbMatches, kbMatchedStatements & kbStatementMatchedConditions @@ -512,50 +522,64 @@ def ipr_report( gkb_matches, allow_partial_matches=allow_partial_matches ) + # KEY ALTERATIONS + key_alterations, variant_counts = create_key_alterations( + gkb_matches, all_variants, kb_matched_sections['kbMatches'] + ) + # OUTPUT CONTENT # thread safe deep-copy the original content output = json.loads(json.dumps(content)) - output.update(kb_matched_sections) output.update( { - "copyVariants": [ - trim_empty_values(c) for c in copy_variants if c["gene"] in genes_with_variants + 'copyVariants': [ + trim_empty_values(c) for c in copy_variants if c['gene'] in genes_with_variants ], - "smallMutations": [trim_empty_values(s) for s in small_mutations], - "expressionVariants": [ + 'smallMutations': [trim_empty_values(s) for s in small_mutations], + 'expressionVariants': [ trim_empty_values(e) for e in expression_variants - if e["gene"] in genes_with_variants + if e['gene'] in genes_with_variants ], - "kbDiseaseMatch": kb_disease_match, - "kbUrl": graphkb_conn.url, - "kbVersion": timestamp(), - "structuralVariants": [ + 'kbDiseaseMatch': kb_disease_match, + 'kbUrl': graphkb_conn.url, + 'kbVersion': timestamp(), + 'structuralVariants': [ trim_empty_values(s) for s in filter_structural_variants( structural_variants, gkb_matches, gene_information ) ], - "signatureVariants": [trim_empty_values(s) for s in signature_variants], - "genes": gene_information, - "genomicAlterationsIdentified": key_alterations, - "variantCounts": variant_counts, - "analystComments": comments, - "therapeuticTarget": targets, + 'signatureVariants': [trim_empty_values(s) for s in signature_variants], + 'genes': gene_information, + 'genomicAlterationsIdentified': key_alterations, + 'variantCounts': variant_counts, + 'analystComments': comments, + 'therapeuticTarget': targets, } ) - output.setdefault("images", []).extend(select_expression_plots(gkb_matches, all_variants)) + output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants)) + + # if input includes hrdScore field, that is ok to pass to db + # but prefer the 'hrd' field if it exists + if output.get('hrd'): + if output.get('hrd').get('score'): + output['hrdScore'] = output['hrd']['score'] + output.pop('hrd') # kbmatches have already been made - ipr_spec = ipr_conn.get_spec() - output = clean_unsupported_content(output, ipr_spec) ipr_result = {} upload_error = None # UPLOAD TO IPR + if ipr_upload: + if not ipr_conn: + raise ValueError('ipr_url required to upload report') + ipr_spec = ipr_conn.get_spec() + output = clean_unsupported_content(output, ipr_spec) try: - logger.info(f"Uploading to IPR {ipr_conn.url}") + logger.info(f'Uploading to IPR {ipr_conn.url}') 
ipr_result = ipr_conn.upload_report( output, mins_to_wait, async_upload, ignore_extra_fields ) @@ -563,16 +587,16 @@ def ipr_report( output.update(ipr_result) except Exception as err: upload_error = err - logger.error(f"ipr_conn.upload_report failed: {err}", exc_info=True) + logger.error(f'ipr_conn.upload_report failed: {err}', exc_info=True) # SAVE TO JSON FILE if always_write_output_json: - logger.info(f"Writing IPR upload json to: {output_json_path}") - with open(output_json_path, "w") as fh: + logger.info(f'Writing IPR upload json to: {output_json_path}') + with open(output_json_path, 'w') as fh: fh.write(json.dumps(output)) - logger.info(f"made {graphkb_conn.request_count} requests to graphkb") - logger.info(f"average load {int(graphkb_conn.load or 0)} req/s") + logger.info(f'made {graphkb_conn.request_count} requests to graphkb') + logger.info(f'average load {int(graphkb_conn.load or 0)} req/s') if upload_error: raise upload_error return output diff --git a/pori_python/ipr/summary.py b/pori_python/ipr/summary.py index c6d63b20..cfe60868 100644 --- a/pori_python/ipr/summary.py +++ b/pori_python/ipr/summary.py @@ -28,10 +28,10 @@ logger, ) -OTHER_DISEASES = "other disease types" -ENTREZ_GENE_URL = "https://www.ncbi.nlm.nih.gov/gene" +OTHER_DISEASES = 'other disease types' +ENTREZ_GENE_URL = 'https://www.ncbi.nlm.nih.gov/gene' # TODO: https://www.bcgsc.ca/jira/browse/DEVSU-1181 -GRAPHKB_GUI = "https://graphkb.bcgsc.ca" +GRAPHKB_GUI = 'https://graphkb.bcgsc.ca' def filter_by_record_class( @@ -45,17 +45,17 @@ def check(name: str) -> bool: else: return name in record_classes - return [rec for rec in record_list if check(rec["@class"])] + return [rec for rec in record_list if check(rec['@class'])] def natural_join(word_list: List[str]) -> str: if len(word_list) > 1: - return ", ".join(word_list[:-1]) + ", and " + word_list[-1] - return "".join(word_list) + return ', '.join(word_list[:-1]) + ', and ' + word_list[-1] + return ''.join(word_list) def get_displayname(rec: Record) -> str: - ret_val = rec.get("displayName", rec["@rid"]) + ret_val = rec.get('displayName', rec['@rid']) return str(ret_val) @@ -66,26 +66,26 @@ def natural_join_records( return natural_join(word_list) -def create_graphkb_link(record_ids: List[str], record_class: str = "Statement") -> str: +def create_graphkb_link(record_ids: List[str], record_class: str = 'Statement') -> str: """ Create a link for a set of statements to the GraphKB client """ record_ids = sorted(list(set(record_ids))) if len(record_ids) == 1: return f'{GRAPHKB_GUI}/view/{record_class}/{record_ids[0].replace("#", "")}' - complex_param = base64.b64encode(json.dumps({"target": record_ids}).encode("utf-8")) - search_params = {"complex": complex_param, "@class": record_class} - return f"{GRAPHKB_GUI}/data/table?{urlencode(search_params)}" + complex_param = base64.b64encode(json.dumps({'target': record_ids}).encode('utf-8')) + search_params = {'complex': complex_param, '@class': record_class} + return f'{GRAPHKB_GUI}/data/table?{urlencode(search_params)}' def merge_diseases( diseases: List[Ontology] | List[Record], disease_matches: Set[str] = set() ) -> str: if len(convert_to_rid_set(diseases) - disease_matches) >= 2 and all( - [d["@class"] == "Disease" for d in diseases] + [d['@class'] == 'Disease' for d in diseases] ): words = sorted( - list(set([get_displayname(s) for s in diseases if s["@rid"] in disease_matches])) + list(set([get_displayname(s) for s in diseases if s['@rid'] in disease_matches])) ) words.append(OTHER_DISEASES) return natural_join(words) 
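The collapsing behaviour of merge_diseases above is easier to see with concrete values. The following is a minimal, self-contained sketch (hypothetical records and RIDs; the helper is re-implemented here for illustration rather than imported from pori_python.ipr.summary, and the fallback branch only approximates the original):

```python
from typing import Dict, List, Set

OTHER_DISEASES = 'other disease types'


def natural_join(word_list: List[str]) -> str:
    # "a, b, and c" for two or more items; the lone item (or '') otherwise.
    if len(word_list) > 1:
        return ', '.join(word_list[:-1]) + ', and ' + word_list[-1]
    return ''.join(word_list)


def merge_diseases_sketch(diseases: List[Dict], disease_matches: Set[str]) -> str:
    # When at least two conditions fall outside the matched-disease set,
    # list only the matched names and collapse the rest into a generic
    # 'other disease types' trailing term.
    unmatched = {d['@rid'] for d in diseases} - disease_matches
    if len(unmatched) >= 2 and all(d['@class'] == 'Disease' for d in diseases):
        words = sorted({d['displayName'] for d in diseases if d['@rid'] in disease_matches})
        words.append(OTHER_DISEASES)
        return natural_join(words)
    return natural_join(sorted({d['displayName'] for d in diseases}))


diseases = [  # hypothetical GraphKB disease records
    {'@rid': '#135:1', '@class': 'Disease', 'displayName': 'colorectal cancer'},
    {'@rid': '#135:2', '@class': 'Disease', 'displayName': 'colon adenocarcinoma'},
    {'@rid': '#135:3', '@class': 'Disease', 'displayName': 'melanoma'},
    {'@rid': '#135:4', '@class': 'Disease', 'displayName': 'glioma'},
]
print(merge_diseases_sketch(diseases, {'#135:1', '#135:2'}))
# -> 'colon adenocarcinoma, colorectal cancer, and other disease types'
```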
@@ -105,54 +105,54 @@ def substitute_sentence_template( """Create the filled-in sentence template for a given template and list of substitutions which may be the result of the aggregation of 1 or more statements. """ - disease_conditions = filter_by_record_class(conditions, "Disease") + disease_conditions = filter_by_record_class(conditions, 'Disease') variant_conditions = filter_by_record_class( - conditions, "CategoryVariant", "CatalogueVariant", "PositionalVariant" + conditions, 'CategoryVariant', 'CatalogueVariant', 'PositionalVariant' ) other_conditions = filter_by_record_class( conditions, - "CategoryVariant", - "CatalogueVariant", - "PositionalVariant", - "Disease", + 'CategoryVariant', + 'CatalogueVariant', + 'PositionalVariant', + 'Disease', exclude=True, ) - result = template.replace(r"{relevance}", relevance["displayName"]) + result = template.replace(r'{relevance}', relevance['displayName']) - if r"{subject}" in template: + if r'{subject}' in template: # remove subject from the conditions replacements subjects_ids = convert_to_rid_set(subjects) disease_conditions = [ - cast(Ontology, d) for d in disease_conditions if d["@rid"] not in subjects_ids + cast(Ontology, d) for d in disease_conditions if d['@rid'] not in subjects_ids ] variant_conditions = [ - cast(Ontology, d) for d in variant_conditions if d["@rid"] not in subjects_ids + cast(Ontology, d) for d in variant_conditions if d['@rid'] not in subjects_ids ] - other_conditions = [d for d in other_conditions if d["@rid"] not in subjects_ids] + other_conditions = [d for d in other_conditions if d['@rid'] not in subjects_ids] - result = result.replace(r"{subject}", merge_diseases(subjects, disease_matches)) + result = result.replace(r'{subject}', merge_diseases(subjects, disease_matches)) - if r"{conditions:disease}" in template: + if r'{conditions:disease}' in template: result = result.replace( - r"{conditions:disease}", merge_diseases(disease_conditions, disease_matches) + r'{conditions:disease}', merge_diseases(disease_conditions, disease_matches) ) else: other_conditions.extend(disease_conditions) - if r"{conditions:variant}" in template: - result = result.replace(r"{conditions:variant}", natural_join_records(variant_conditions)) + if r'{conditions:variant}' in template: + result = result.replace(r'{conditions:variant}', natural_join_records(variant_conditions)) else: other_conditions.extend(variant_conditions) - result = result.replace(r"{conditions}", natural_join_records(other_conditions)) + result = result.replace(r'{conditions}', natural_join_records(other_conditions)) - link_url = create_graphkb_link(statement_rids) if statement_rids else "" + link_url = create_graphkb_link(statement_rids) if statement_rids else '' - if r"{evidence}" in template: - evidence_str = ", ".join(sorted(list({e["displayName"] for e in evidence}))) + if r'{evidence}' in template: + evidence_str = ', '.join(sorted(list({e['displayName'] for e in evidence}))) if link_url: evidence_str = f'<a href="{link_url}">{evidence_str}</a>' - result = result.replace(r"{evidence}", evidence_str) + result = result.replace(r'{evidence}', evidence_str) return result @@ -170,18 +170,18 @@ def generate_key(statement: Statement) -> Tuple: result = [ - cond.get("displayName", cond["@rid"]) - for cond in filter_by_record_class(statement["conditions"], "Disease", exclude=True) - if cond["@rid"] != statement["subject"]["@rid"] + cond.get('displayName', cond['@rid']) + for cond in filter_by_record_class(statement['conditions'], 'Disease', exclude=True) + if cond['@rid'] != statement['subject']['@rid'] ]
- if statement.get("subject", {}).get("@class", "Disease") != "Disease": - subject = statement["subject"] - if subject["@class"] == "Therapy": - alt = get_preferred_drug_representation(graphkb_conn, subject["@rid"]) - statement["subject"] = alt - result.append(statement["subject"]["displayName"]) - result.append(statement["relevance"]["displayName"]) - result.append(statement["displayNameTemplate"]) + if statement.get('subject', {}).get('@class', 'Disease') != 'Disease': + subject = statement['subject'] + if subject['@class'] == 'Therapy': + alt = get_preferred_drug_representation(graphkb_conn, subject['@rid']) + statement['subject'] = alt + result.append(statement['subject']['displayName']) + result.append(statement['relevance']['displayName']) + result.append(statement['displayNameTemplate']) return tuple(sorted(set(result))) for statement in statements: @@ -193,12 +193,12 @@ def generate_key(statement: Statement) -> Tuple: conditions = [] subjects = [] evidence = [] - relevance = group[0]["relevance"] - template = group[0]["displayNameTemplate"] + relevance = group[0]['relevance'] + template = group[0]['displayNameTemplate'] for statement in group: - conditions.extend(statement["conditions"]) - evidence.extend(statement["evidence"]) - subjects.append(statement["subject"]) + conditions.extend(statement['conditions']) + evidence.extend(statement['evidence']) + subjects.append(statement['subject']) sentence = substitute_sentence_template( template, @@ -211,35 +211,35 @@ def generate_key(statement: Statement) -> Tuple: for statement in group: - result[statement["@rid"]] = sentence + result[statement['@rid']] = sentence return result def display_variant(variant: IprVariant) -> str: """Short, human readable variant description string.""" - gene = variant.get("gene", "") - if not gene and "gene1" in variant and "gene2" in variant: + gene = variant.get('gene', '') + if not gene and 'gene1' in variant and 'gene2' in variant: gene = f'({variant.get("gene1", "")},{variant.get("gene2", "")})' - if variant.get("kbCategory"): + if variant.get('kbCategory'): return f'{variant.get("kbCategory")} of {gene}' # Special display of IprFusionVariant with exons - if variant.get("exon1") or variant.get("exon2"): + if variant.get('exon1') or variant.get('exon2'): return create_graphkb_sv_notation(variant) # type: ignore # Use chosen legacy 'proteinChange' or an hgvs description of lowest detail. hgvs = variant.get( - "proteinChange", - variant.get("hgvsProtein", variant.get("hgvsCds", variant.get("hgvsGenomic", ""))), + 'proteinChange', + variant.get('hgvsProtein', variant.get('hgvsCds', variant.get('hgvsGenomic', ''))), ) if gene and hgvs: - return f"{gene}:{hgvs}" - elif variant.get("variant"): - return str(variant.get("variant")) + return f'{gene}:{hgvs}' + elif variant.get('variant'): + return str(variant.get('variant')) - raise ValueError(f"Unable to form display_variant of {variant}") + raise ValueError(f'Unable to form display_variant of {variant}') def display_variants(gene_name: str, variants: List[IprVariant]) -> str: @@ -247,11 +247,11 @@ def display_variants(gene_name: str, variants: List[IprVariant]) -> str: variants_text = natural_join(result) if len(result) > 1: return ( - f"Multiple variants of the gene {gene_name} were observed in this case: {variants_text}" + f'Multiple variants of the gene {gene_name} were observed in this case: {variants_text}' ) elif result: - return f"{variants_text[0].upper()}{variants_text[1:]} was observed in this case."
- return "" + return f'{variants_text[0].upper()}{variants_text[1:]} was observed in this case.' + return '' def create_section_html( @@ -264,33 +264,33 @@ """ Generate HTML for a gene section of the comments """ - output.append(f"{content}") - return "\n".join(output) + output.append(f'{content}') + return '\n'.join(output) def section_statements_by_genes( @@ -349,16 +349,16 @@ genes: Dict[str, Set[str]] = {} for statement in statements: - for condition in statement["conditions"]: - if condition.get("biotype", "") == "gene": - gene = get_preferred_gene_name(graphkb_conn, condition["@rid"]) - genes.setdefault(gene, set()).add(statement["@rid"]) + for condition in statement['conditions']: + if condition.get('biotype', '') == 'gene': + gene = get_preferred_gene_name(graphkb_conn, condition['@rid']) + genes.setdefault(gene, set()).add(statement['@rid']) else: - for cond_ref_key in ("reference1", "reference2"): + for cond_ref_key in ('reference1', 'reference2'): cond_ref_gene = condition.get(cond_ref_key) if cond_ref_gene: gene = get_preferred_gene_name(graphkb_conn, str(cond_ref_gene)) - genes.setdefault(gene, set()).add(statement["@rid"]) + genes.setdefault(gene, set()).add(statement['@rid']) return genes @@ -372,12 +372,12 @@ def prep_single_ipr_variant_comment(variant_text): Returns: section: html-formatted string """ - cancer_type = ",".join(variant_text["cancerType"]) + cancer_type = ','.join(variant_text['cancerType']) if not cancer_type: - cancer_type = "no specific cancer types" - cancer_type = f" ({cancer_type})" - section = [f"{variant_text['text']}"] + cancer_type = 'no specific cancer types' + cancer_type = f' ({cancer_type})' + section = [f'{variant_text["text"]}'] return section @@ -385,6 +385,7 @@ def get_ipr_analyst_comments( ipr_conn: IprConnection, matches: Sequence[KbMatch] | Sequence[Hashabledict], disease_name: str, + disease_match_names: List[str], project_name: str, report_type: str, include_nonspecific_disease: bool = False, @@ -403,6 +404,7 @@ ipr_conn: connection to the ipr db matches: list of kbmatches which will be included in the report disease_name: str, eg 'colorectal cancer' + disease_match_names: list[str] of names considered to be equivalent to the disease name project_name: str, eg TEST or pog report_type: str, eg genomic or rapid include_nonspecific_disease: bool - true if variant texts that don't explicitly @@ -414,48 +416,58 @@ Returns: html-formatted string """
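The new disease_match_names argument is fed from the get_kb_disease_matches records in ipr_report above: the RIDs drive GraphKB matching while the names are forwarded to the IPR variant-text lookup. A sketch of the intended call shape, with a MagicMock standing in for a live IPR connection as in the tests below (record values are hypothetical):

```python
from unittest.mock import MagicMock

from pori_python.ipr.summary import get_ipr_analyst_comments

# Records shaped like get_kb_disease_matches output (hypothetical RIDs/names).
disease_match_records = [
    {'@rid': '#135:7', 'name': 'colorectal cancer'},
    {'@rid': '#135:8', 'name': 'colorectal adenocarcinoma'},
]

# Stand-in IPR connection; every .get() call returns an empty result set here.
ipr_conn = MagicMock(get=MagicMock(return_value=[]))

comments_html = get_ipr_analyst_comments(
    ipr_conn,
    matches=[{'kbVariant': 'ERBB2 amplification'}],
    disease_name='colorectal cancer',
    disease_match_names=[rec['name'] for rec in disease_match_records],
    project_name='TEST',
    report_type='genomic',
    include_nonspecific_disease=False,
    include_nonspecific_project=False,
    include_nonspecific_template=False,
)
```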
", - "variantName": "ERBB2 amplification", - "cancerType": [], - "template": {"name": "test3"}, - "project": {"name": "test2"}, + 'text': 'no cancerType
', + 'variantName': 'ERBB2 amplification', + 'cancerType': [], + 'template': {'name': 'test3'}, + 'project': {'name': 'test2'}, }, { - "text": "normal
", - "variantName": "ERBB2 amplification", - "cancerType": ["test1", "test"], - "template": {"name": "test3"}, - "project": {"name": "test2"}, + 'text': 'normal
', + 'variantName': 'ERBB2 amplification', + 'cancerType': ['test1', 'test'], + 'template': {'name': 'test3'}, + 'project': {'name': 'test2'}, }, { - "text": "no project
", - "variantName": "ERBB2 amplification", - "cancerType": ["test1", "test"], - "template": {"name": "test3"}, + 'text': 'no project
', + 'variantName': 'ERBB2 amplification', + 'cancerType': ['test1', 'test'], + 'template': {'name': 'test3'}, }, { - "text": "no template
", - "variantName": "ERBB2 amplification", - "cancerType": ["test1", "test"], - "project": {"name": "test2"}, + 'text': 'no template
', + 'variantName': 'ERBB2 amplification', + 'cancerType': ['test1', 'test'], + 'project': {'name': 'test2'}, }, ], [ { - "text": "normal, second variant
", - "variantName": "second variant", - "cancerType": ["test1", "test"], - "template": {"name": "test3"}, - "project": {"name": "test2"}, + 'text': 'normal, second variant
', + 'variantName': 'second variant', + 'cancerType': ['test1', 'test'], + 'template': {'name': 'test3'}, + 'project': {'name': 'test2'}, }, ], ] -no_comments_found_output = "No comments found in IPR for variants in this report" +no_comments_found_output = 'No comments found in IPR for variants in this report' class TestVariantTextFromIPR: def test_gets_fully_matched_output_when_possible(self): ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) - matches = [{"kbVariant": "ERBB2 amplification"}] + matches = [{'kbVariant': 'ERBB2 amplification'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="test1", - project_name="test2", - report_type="test3", + disease_name='test1', + disease_match_names=[], + project_name='test2', + report_type='test3', include_nonspecific_project=False, include_nonspecific_disease=True, include_nonspecific_template=True, ) - summary_lines = ipr_summary.split("\n") - assert summary_lines[1] == "normal
" + summary_lines = ipr_summary.split('\n') + assert summary_lines[1] == 'normal
' assert len(summary_lines) == 3 def test_omits_nonspecific_project_matches_when_specified(self): ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) - matches = [{"kbVariant": "ERBB2 amplification"}] + matches = [{'kbVariant': 'ERBB2 amplification'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="test1", - project_name="notfound", - report_type="test3", + disease_name='test1', + disease_match_names=[], + project_name='notfound', + report_type='test3', include_nonspecific_project=False, include_nonspecific_disease=True, include_nonspecific_template=True, @@ -238,13 +240,14 @@ def test_omits_nonspecific_project_matches_when_specified(self): def test_omits_nonspecific_template_matches_when_specified(self): ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) - matches = [{"kbVariant": "ERBB2 amplification"}] + matches = [{'kbVariant': 'ERBB2 amplification'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="test1", - project_name="test2", - report_type="notfound", + disease_name='test1', + disease_match_names=[], + project_name='test2', + report_type='notfound', include_nonspecific_project=True, include_nonspecific_disease=True, include_nonspecific_template=False, @@ -253,13 +256,14 @@ def test_omits_nonspecific_template_matches_when_specified(self): def test_omits_nonspecific_disease_matches_when_specified(self): ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) - matches = [{"kbVariant": "ERBB2 amplification"}] + matches = [{'kbVariant': 'ERBB2 amplification'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="notfound", - project_name="test2", - report_type="test3", + disease_name='notfound', + disease_match_names=[], + project_name='test2', + report_type='test3', include_nonspecific_project=True, include_nonspecific_disease=False, include_nonspecific_template=True, @@ -268,86 +272,110 @@ def test_omits_nonspecific_disease_matches_when_specified(self): def test_includes_nonspecific_project_matches_when_specified(self): ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) - matches = [{"kbVariant": "ERBB2 amplification"}] + matches = [{'kbVariant': 'ERBB2 amplification'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="test1", - project_name="notfound", - report_type="test3", + disease_name='test1', + disease_match_names=[], + project_name='notfound', + report_type='test3', include_nonspecific_project=True, include_nonspecific_disease=False, include_nonspecific_template=False, ) - summary_lines = ipr_summary.split("\n") - assert summary_lines[2] == "no project
" + summary_lines = ipr_summary.split('\n') + assert summary_lines[2] == 'no project
' assert len(summary_lines) == 3 def test_includes_nonspecific_template_matches_when_specified(self): ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) - matches = [{"kbVariant": "ERBB2 amplification"}] + matches = [{'kbVariant': 'ERBB2 amplification'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="test1", - project_name="test2", - report_type="notfound", + disease_name='test1', + disease_match_names=[], + project_name='test2', + report_type='notfound', include_nonspecific_project=False, include_nonspecific_disease=False, include_nonspecific_template=True, ) - summary_lines = ipr_summary.split("\n") - assert summary_lines[2] == "no template
" + summary_lines = ipr_summary.split('\n') + assert summary_lines[2] == 'no template
' assert len(summary_lines) == 3 def test_includes_nonspecific_disease_matches_when_specified(self): ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) - matches = [{"kbVariant": "ERBB2 amplification"}] + matches = [{'kbVariant': 'ERBB2 amplification'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="notfound", - project_name="test2", - report_type="test3", + disease_name='notfound', + disease_match_names=[], + project_name='test2', + report_type='test3', include_nonspecific_project=False, include_nonspecific_disease=True, include_nonspecific_template=False, ) - summary_lines = ipr_summary.split("\n") - assert summary_lines[1] == "no cancerType
" + summary_lines = ipr_summary.split('\n') + assert summary_lines[1] == 'no cancerType
' + assert len(summary_lines) == 3 + + def test_includes_all_graphkb_disease_matches(self): + ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) + matches = [{'kbVariant': 'ERBB2 amplification'}] + ipr_summary = get_ipr_analyst_comments( + ipr_conn, + matches=matches, + disease_name='notfound', + disease_match_names=['TEST1'], + project_name='test2', + report_type='test3', + include_nonspecific_project=False, + include_nonspecific_disease=False, + include_nonspecific_template=False, + ) + summary_lines = ipr_summary.split('\n') + assert summary_lines[1] == 'normal
' assert len(summary_lines) == 3 def test_prepare_section_for_multiple_variants(self): ipr_conn = MagicMock(get=MagicMock(side_effect=copy(mock_ipr_results))) # NB this test relies on matches being processed in this order - matches = [{"kbVariant": "ERBB2 amplification"}, {"kbVariant": "second variant"}] + matches = [{'kbVariant': 'ERBB2 amplification'}, {'kbVariant': 'second variant'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="test1", - project_name="test2", - report_type="test3", + disease_name='test1', + disease_match_names=[], + project_name='test2', + report_type='test3', include_nonspecific_project=False, include_nonspecific_disease=False, include_nonspecific_template=False, ) - summary_lines = ipr_summary.split("\n") + summary_lines = ipr_summary.split('\n') assert len(summary_lines) == 5 assert ( - "\n".join(summary_lines[1:]) - == "normal
\nnormal, second variant
" + '\n'.join(summary_lines[1:]) + == 'normal
\nnormal, second variant
' ) def test_empty_section_when_no_variant_match(self): ipr_conn = MagicMock(get=MagicMock(side_effect=[[], []])) - matches = [{"kbVariant": "notfound1"}, {"kbVariant": "notfound2"}] + matches = [{'kbVariant': 'notfound1'}, {'kbVariant': 'notfound2'}] ipr_summary = get_ipr_analyst_comments( ipr_conn, matches=matches, - disease_name="test1", - project_name="test2", - report_type="test3", + disease_name='test1', + disease_match_names=[], + project_name='test2', + report_type='test3', include_nonspecific_project=False, include_nonspecific_disease=False, include_nonspecific_template=False, diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py index 95f7fd7a..79568f75 100644 --- a/tests/test_ipr/test_upload.py +++ b/tests/test_ipr/test_upload.py @@ -13,87 +13,97 @@ from .constants import EXCLUDE_INTEGRATION_TESTS -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" -EXCLUDE_ONCOKB_TESTS = os.environ.get("EXCLUDE_ONCOKB_TESTS") == "1" -INCLUDE_UPLOAD_TESTS = os.environ.get("INCLUDE_UPLOAD_TESTS", "0") == "1" -DELETE_UPLOAD_TEST_REPORTS = os.environ.get("DELETE_UPLOAD_TEST_REPORTS", "1") == "1" +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' +EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1' +INCLUDE_UPLOAD_TESTS = os.environ.get('INCLUDE_UPLOAD_TESTS', '0') == '1' +DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1' def get_test_spec(): - ipr_spec = {"components": {"schemas": {"genesCreate": {"properties": {}}}}} + ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}} ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__ for key in ipr_gene_keys: - ipr_spec["components"]["schemas"]["genesCreate"]["properties"][key] = "" + ipr_spec['components']['schemas']['genesCreate']['properties'][key] = '' return ipr_spec def get_test_file(name: str) -> str: - return os.path.join(os.path.dirname(__file__), "test_data", name) + return os.path.join(os.path.dirname(__file__), 'test_data', name) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def loaded_reports(tmp_path_factory) -> Generator: - json_file = tmp_path_factory.mktemp("inputs") / "content.json" - async_json_file = tmp_path_factory.mktemp("inputs") / "async_content.json" - patient_id = f"TEST_{str(uuid.uuid4())}" - async_patient_id = f"TEST_ASYNC_{str(uuid.uuid4())}" + json_file = tmp_path_factory.mktemp('inputs') / 'content.json' + async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json' + patient_id = f'TEST_{str(uuid.uuid4())}' + async_patient_id = f'TEST_ASYNC_{str(uuid.uuid4())}' json_contents = { - "comparators": [ - {"analysisRole": "expression (disease)", "name": "1"}, - {"analysisRole": "expression (primary site)", "name": "2"}, - {"analysisRole": "expression (biopsy site)", "name": "3"}, + 'comparators': [ + {'analysisRole': 'expression (disease)', 'name': '1'}, + {'analysisRole': 'expression (primary site)', 'name': '2'}, + {'analysisRole': 'expression (biopsy site)', 'name': '3'}, { - "analysisRole": "expression (internal pancancer cohort)", - "name": "4", + 'analysisRole': 'expression (internal pancancer cohort)', + 'name': '4', }, ], - "patientId": patient_id, - "project": "TEST", - "sampleInfo": [ + 'patientId': patient_id, + 'project': 'TEST', + 'sampleInfo': [ { - "sample": "Constitutional", - "biopsySite": "Normal tissue", - "sampleName": "SAMPLE1-PB", - "primarySite": "Blood-Peripheral", - "collectionDate": "11-11-11", + 'sample': 'Constitutional', + 
'biopsySite': 'Normal tissue', + 'sampleName': 'SAMPLE1-PB', + 'primarySite': 'Blood-Peripheral', + 'collectionDate': '11-11-11', }, { - "sample": "Tumour", - "pathoTc": "90%", - "biopsySite": "hepatic", - "sampleName": "SAMPLE2-FF-1", - "primarySite": "Vena Cava-Hepatic", - "collectionDate": "12-12-12", + 'sample': 'Tumour', + 'pathoTc': '90%', + 'biopsySite': 'hepatic', + 'sampleName': 'SAMPLE2-FF-1', + 'primarySite': 'Vena Cava-Hepatic', + 'collectionDate': '12-12-12', }, ], - "expressionVariants": json.loads( - pd.read_csv(get_test_file("expression.short.tab"), sep="\t").to_json(orient="records") + 'msi': [ + { + 'score': 1000.0, + 'kbCategory': 'microsatellite instability', + } + ], + 'hrd': { + 'score': 9999.0, + 'kbCategory': 'homologous recombination deficiency strong signature', + }, + 'expressionVariants': json.loads( + pd.read_csv(get_test_file('expression.short.tab'), sep='\t').to_json(orient='records') ), - "smallMutations": json.loads( - pd.read_csv(get_test_file("small_mutations.short.tab"), sep="\t").to_json( - orient="records" + 'smallMutations': json.loads( + pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t').to_json( + orient='records' ) ), - "copyVariants": json.loads( - pd.read_csv(get_test_file("copy_variants.short.tab"), sep="\t").to_json( - orient="records" + 'copyVariants': json.loads( + pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t').to_json( + orient='records' ) ), - "structuralVariants": json.loads( - pd.read_csv(get_test_file("fusions.tab"), sep="\t").to_json(orient="records") + 'structuralVariants': json.loads( + pd.read_csv(get_test_file('fusions.tab'), sep='\t').to_json(orient='records') ), - "kbDiseaseMatch": "colorectal cancer", - "cosmicSignatures": pd.read_csv( - get_test_file("cosmic_variants.tab"), sep="\t" + 'kbDiseaseMatch': 'colorectal cancer', + 'cosmicSignatures': pd.read_csv( + get_test_file('cosmic_variants.tab'), sep='\t' ).signature.tolist(), - "hlaTypes": json.loads( - pd.read_csv(get_test_file("hla_variants.tab"), sep="\t").to_json(orient="records") + 'hlaTypes': json.loads( + pd.read_csv(get_test_file('hla_variants.tab'), sep='\t').to_json(orient='records') ), - "images": [ + 'images': [ { - "key": "cnvLoh.circos", - "path": "test/testData/images/cnvLoh.png", - "caption": "Test adding a caption to an image", + 'key': 'cnvLoh.circos', + 'path': 'test/testData/images/cnvLoh.png', + 'caption': 'Test adding a caption to an image', } ], } @@ -105,7 +115,7 @@ def loaded_reports(tmp_path_factory) -> Generator: ) ) - json_contents["patientId"] = async_patient_id + json_contents['patientId'] = async_patient_id async_json_file.write_text( json.dumps( json_contents, @@ -114,61 +124,61 @@ def loaded_reports(tmp_path_factory) -> Generator: ) argslist = [ - "ipr", - "--username", - os.environ.get("IPR_USER", os.environ["USER"]), - "--password", - os.environ["IPR_PASS"], - "--graphkb_username", - os.environ.get("GRAPHKB_USER", os.environ.get("IPR_USER", os.environ["USER"])), - "--graphkb_password", - os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]), - "--ipr_url", - os.environ["IPR_TEST_URL"], - "--graphkb_url", - os.environ.get("GRAPHKB_URL", False), - "--therapeutics", - "--allow_partial_matches", + 'ipr', + '--username', + os.environ.get('IPR_USER', os.environ['USER']), + '--password', + os.environ['IPR_PASS'], + '--graphkb_username', + os.environ.get('GRAPHKB_USER', os.environ.get('IPR_USER', os.environ['USER'])), + '--graphkb_password', + os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']), + '--ipr_url', + 
os.environ['IPR_TEST_URL'], + '--graphkb_url', + os.environ.get('GRAPHKB_URL', False), + '--therapeutics', + '--allow_partial_matches', ] sync_argslist = argslist.copy() - sync_argslist.extend(["--content", str(json_file)]) - with patch.object(sys, "argv", sync_argslist): - with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): + sync_argslist.extend(['--content', str(json_file)]) + with patch.object(sys, 'argv', sync_argslist): + with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): command_interface() async_argslist = argslist.copy() - async_argslist.extend(["--content", str(async_json_file), "--async_upload"]) - with patch.object(sys, "argv", async_argslist): - with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): + async_argslist.extend(['--content', str(async_json_file), '--async_upload']) + with patch.object(sys, 'argv', async_argslist): + with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): command_interface() ipr_conn = IprConnection( - username=os.environ.get("IPR_USER", os.environ["USER"]), - password=os.environ["IPR_PASS"], - url=os.environ["IPR_TEST_URL"], + username=os.environ.get('IPR_USER', os.environ['USER']), + password=os.environ['IPR_PASS'], + url=os.environ['IPR_TEST_URL'], ) - loaded_report = ipr_conn.get(uri=f"reports?searchText={patient_id}") - async_loaded_report = ipr_conn.get(uri=f"reports?searchText={async_patient_id}") + loaded_report = ipr_conn.get(uri=f'reports?searchText={patient_id}') + async_loaded_report = ipr_conn.get(uri=f'reports?searchText={async_patient_id}') loaded_reports_result = { - "sync": (patient_id, loaded_report), - "async": (async_patient_id, async_loaded_report), + 'sync': (patient_id, loaded_report), + 'async': (async_patient_id, async_loaded_report), } yield loaded_reports_result if DELETE_UPLOAD_TEST_REPORTS: - ipr_conn.delete(uri=f"reports/{loaded_report['reports'][0]['ident']}") - ipr_conn.delete(uri=f"reports/{async_loaded_report['reports'][0]['ident']}") + ipr_conn.delete(uri=f'reports/{loaded_report["reports"][0]["ident"]}') + ipr_conn.delete(uri=f'reports/{async_loaded_report["reports"][0]["ident"]}') def get_section(loaded_report, section_name): - ident = loaded_report[1]["reports"][0]["ident"] + ident = loaded_report[1]['reports'][0]['ident'] ipr_conn = IprConnection( - username=os.environ.get("IPR_USER", os.environ["USER"]), - password=os.environ["IPR_PASS"], - url=os.environ["IPR_TEST_URL"], + username=os.environ.get('IPR_USER', os.environ['USER']), + password=os.environ['IPR_PASS'], + url=os.environ['IPR_TEST_URL'], ) - return ipr_conn.get(uri=f"reports/{ident}/{section_name}") + return ipr_conn.get(uri=f'reports/{ident}/{section_name}') def stringify_sorted(obj): @@ -181,7 +191,7 @@ def stringify_sorted(obj): obj.sort() return str(obj) elif isinstance(obj, dict): - for key in ("ident", "updatedAt", "createdAt", "deletedAt"): + for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt'): obj.pop(key, None) keys = obj.keys() for key in keys: @@ -197,50 +207,50 @@ def stringify_sorted(obj): @pytest.mark.skipif( - not INCLUDE_UPLOAD_TESTS, reason="excluding tests of upload to live ipr instance" + not INCLUDE_UPLOAD_TESTS, reason='excluding tests of upload to live ipr instance' ) -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') class TestCreateReport: def test_patient_id_loaded_once(self, 
loaded_reports) -> None: - sync_patient_id = loaded_reports["sync"][0] - assert loaded_reports["sync"][1]["total"] == 1 - assert loaded_reports["sync"][1]["reports"][0]["patientId"] == sync_patient_id - async_patient_id = loaded_reports["async"][0] - assert loaded_reports["async"][1]["total"] == 1 - assert loaded_reports["async"][1]["reports"][0]["patientId"] == async_patient_id + sync_patient_id = loaded_reports['sync'][0] + assert loaded_reports['sync'][1]['total'] == 1 + assert loaded_reports['sync'][1]['reports'][0]['patientId'] == sync_patient_id + async_patient_id = loaded_reports['async'][0] + assert loaded_reports['async'][1]['total'] == 1 + assert loaded_reports['async'][1]['reports'][0]['patientId'] == async_patient_id def test_expression_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "expression-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert "PTP4A3" in [item["gene"]["name"] for item in kbmatched] - async_section = get_section(loaded_reports["async"], "expression-variants") + section = get_section(loaded_reports['sync'], 'expression-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert 'PTP4A3' in [item['gene']['name'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'expression-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_structural_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "structural-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert "(EWSR1,FLI1):fusion(e.7,e.4)" in [item["displayName"] for item in kbmatched] - async_section = get_section(loaded_reports["async"], "structural-variants") + section = get_section(loaded_reports['sync'], 'structural-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert '(EWSR1,FLI1):fusion(e.7,e.4)' in [item['displayName'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'structural-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_small_mutations_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "small-mutations") - kbmatched = [item for item in section if item["kbMatches"]] - assert "FGFR2:p.R421C" in [item["displayName"] for item in kbmatched] - assert "CDKN2A:p.T18M" in [item["displayName"] for item in kbmatched] - async_section = get_section(loaded_reports["async"], "small-mutations") + section = get_section(loaded_reports['sync'], 'small-mutations') + kbmatched = [item for item in section if item['kbMatches']] + assert 'FGFR2:p.R421C' in [item['displayName'] for item in kbmatched] + assert 'CDKN2A:p.T18M' in [item['displayName'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'small-mutations') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_copy_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "copy-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert ("ERBB2", "amplification") in [ - (item["gene"]["name"], item["displayName"]) for item in kbmatched + section = get_section(loaded_reports['sync'], 'copy-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert ('ERBB2', 
'amplification') in [ + (item['gene']['name'], item['displayName']) for item in kbmatched ] - async_section = get_section(loaded_reports["async"], "copy-variants") + async_section = get_section(loaded_reports['async'], 'copy-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync @@ -255,58 +265,58 @@ def test_copy_variants_loaded(self, loaded_reports) -> None: # assert compare_sections(section, async_section) def test_kb_matches_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "kb-matches") + section = get_section(loaded_reports['sync'], 'kb-matches') observed_and_matched = set( - [(item["kbVariant"], item["variant"]["displayName"]) for item in section] + [(item['kbVariant'], item['variant']['displayName']) for item in section] ) for pair in [ - ("ERBB2 amplification", "amplification"), - ("FGFR2 mutation", "FGFR2:p.R421C"), - ("PTP4A3 overexpression", "increased expression"), - ("EWSR1 and FLI1 fusion", "(EWSR1,FLI1):fusion(e.7,e.4)"), - ("CDKN2A mutation", "CDKN2A:p.T18M"), + ('ERBB2 amplification', 'amplification'), + ('FGFR2 mutation', 'FGFR2:p.R421C'), + ('PTP4A3 overexpression', 'increased expression'), + ('EWSR1 and FLI1 fusion', '(EWSR1,FLI1):fusion(e.7,e.4)'), + ('CDKN2A mutation', 'CDKN2A:p.T18M'), ]: assert pair in observed_and_matched - async_section = get_section(loaded_reports["async"], "kb-matches") + async_section = get_section(loaded_reports['async'], 'kb-matches') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_therapeutic_targets_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "therapeutic-targets") - therapeutic_target_genes = set([item["gene"] for item in section]) - for gene in ["CDKN2A", "ERBB2", "FGFR2", "PTP4A3"]: + section = get_section(loaded_reports['sync'], 'therapeutic-targets') + therapeutic_target_genes = set([item['gene'] for item in section]) + for gene in ['CDKN2A', 'ERBB2', 'FGFR2', 'PTP4A3']: assert gene in therapeutic_target_genes - async_section = get_section(loaded_reports["async"], "therapeutic-targets") + async_section = get_section(loaded_reports['async'], 'therapeutic-targets') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_genomic_alterations_identified_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "summary/genomic-alterations-identified") - variants = set([item["geneVariant"] for item in section]) + section = get_section(loaded_reports['sync'], 'summary/genomic-alterations-identified') + variants = set([item['geneVariant'] for item in section]) for variant in [ - "FGFR2:p.R421C", - "PTP4A3 (high_percentile)", - "ERBB2 (Amplification)", - "(EWSR1,FLI1):fusion(e.7,e.4)", - "CDKN2A:p.T18M", + 'FGFR2:p.R421C', + 'PTP4A3 (high_percentile)', + 'ERBB2 (Amplification)', + '(EWSR1,FLI1):fusion(e.7,e.4)', + 'CDKN2A:p.T18M', ]: assert variant in variants async_section = get_section( - loaded_reports["async"], "summary/genomic-alterations-identified" + loaded_reports['async'], 'summary/genomic-alterations-identified' ) async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_analyst_comments_loaded(self, loaded_reports) -> None: - sync_section = get_section(loaded_reports["sync"], "summary/analyst-comments") - assert sync_section["comments"] - async_section = 
get_section(loaded_reports["async"], "summary/analyst-comments") - assert async_section["comments"] - assert sync_section["comments"] == async_section["comments"] + sync_section = get_section(loaded_reports['sync'], 'summary/analyst-comments') + assert sync_section['comments'] + async_section = get_section(loaded_reports['async'], 'summary/analyst-comments') + assert async_section['comments'] + assert sync_section['comments'] == async_section['comments'] def test_sample_info_loaded(self, loaded_reports) -> None: - sync_section = get_section(loaded_reports["sync"], "sample-info") - async_section = get_section(loaded_reports["async"], "sample-info") + sync_section = get_section(loaded_reports['sync'], 'sample-info') + async_section = get_section(loaded_reports['async'], 'sample-info') async_equals_sync = stringify_sorted(sync_section) == stringify_sorted(async_section) assert async_equals_sync @@ -322,31 +332,31 @@ def test_multivariant_multiconditionset_statements_loaded(self, loaded_reports) are met. This is also a test of multiple condition sets since there are two variants in the test data that satisfy one of the conditions (the APC mutation).""" - section = get_section(loaded_reports["sync"], "kb-matches/kb-matched-statements") - multivariant_stmts = [item for item in section if item["reference"] == "pmid:27302369"] + section = get_section(loaded_reports['sync'], 'kb-matches/kb-matched-statements') + multivariant_stmts = [item for item in section if item['reference'] == 'pmid:27302369'] # if this statement is entered more than once there may be multiple sets of records to # check, so to make sure the count checks work, go stmt_id by stmt_id: - stmt_ids = list(set([item["kbStatementId"] for item in multivariant_stmts])) + stmt_ids = list(set([item['kbStatementId'] for item in multivariant_stmts])) for stmt_id in stmt_ids: - stmts = [item for item in multivariant_stmts if item["kbStatementId"] == stmt_id] + stmts = [item for item in multivariant_stmts if item['kbStatementId'] == stmt_id] - # we expect two stmts, one for each condition set - assert len(stmts) == 2 + # we expect three stmts, one for each condition set + assert len(stmts) == 3 # we expect each condition set to have two kb variants in it # we expect the two kb variants to be the same in each stmt - assert len(stmts[0]["kbMatches"]) == 2 - assert len(stmts[1]["kbMatches"]) == 2 - kbmatches1 = [item["kbVariant"] for item in stmts[0]["kbMatches"]] - kbmatches2 = [item["kbVariant"] for item in stmts[1]["kbMatches"]] + assert len(stmts[0]['kbMatches']) == 2 + assert len(stmts[1]['kbMatches']) == 2 + kbmatches1 = [item['kbVariant'] for item in stmts[0]['kbMatches']] + kbmatches2 = [item['kbVariant'] for item in stmts[1]['kbMatches']] kbmatches1.sort() kbmatches2.sort() - assert kbmatches1 == kbmatches2 == ["APC mutation", "KRAS mutation"] + assert kbmatches1 == kbmatches2 == ['APC mutation', 'KRAS mutation'] # we expect the two stmts to have different observed variant sets - observedVariants1 = [item["variant"]["ident"] for item in stmts[0]["kbMatches"]] - observedVariants2 = [item["variant"]["ident"] for item in stmts[1]["kbMatches"]] + observedVariants1 = [item['variant']['ident'] for item in stmts[0]['kbMatches']] + observedVariants2 = [item['variant']['ident'] for item in stmts[1]['kbMatches']] observedVariants1.sort() observedVariants2.sort() assert observedVariants1 != observedVariants2 diff --git a/tests/test_ipr/test_util.py b/tests/test_ipr/test_util.py index 70318818..bbae6d98 100644 --- a/tests/test_ipr/test_util.py +++ 
b/tests/test_ipr/test_util.py @@ -4,8 +4,8 @@ @pytest.mark.parametrize( - "input,output_keys", - [[{"key": 0}, ["key"]], [{"key": None}, []], [{"key": ""}, []], [{"gene1": None}, ["gene1"]]], + 'input,output_keys', + [[{'key': 0}, ['key']], [{'key': None}, []], [{'key': ''}, []], [{'gene1': None}, ['gene1']]], ) def test_trim_empty_values(input, output_keys): modified_object = trim_empty_values(input) @@ -13,17 +13,17 @@ def test_trim_empty_values(input, output_keys): @pytest.mark.parametrize( - "variant,result", + 'variant,result', [ [ - {"variantType": "exp", "gene": "GENE", "expressionState": "increased expression"}, - "increased expression", + {'variantType': 'exp', 'gene': 'GENE', 'expressionState': 'increased expression'}, + 'increased expression', ], - [{"variantType": "cnv", "gene": "GENE", "cnvState": "amplification"}, "amplification"], - [{"variantType": "other", "gene2": "GENE", "variant": "GENE:anything"}, "anything"], + [{'variantType': 'cnv', 'gene': 'GENE', 'cnvState': 'amplification'}, 'amplification'], + [{'variantType': 'other', 'gene2': 'GENE', 'variant': 'GENE:anything'}, 'anything'], ], ) def test_create_variant_name_tuple(variant, result): gene, name = create_variant_name_tuple(variant) assert name == result - assert gene == "GENE" + assert gene == 'GENE'
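For reference, the behaviour pinned down by the parametrized cases above can be exercised directly. A short sketch, assuming the helpers are imported from pori_python.ipr.util as the test module name suggests:

```python
from pori_python.ipr.util import create_variant_name_tuple, trim_empty_values

# Falsy-but-meaningful values survive, None is dropped, and gene keys
# (e.g. gene1) are kept even when None, per the parametrized cases above.
print(trim_empty_values({'key': 0}))       # {'key': 0}
print(trim_empty_values({'key': None}))    # {}
print(trim_empty_values({'gene1': None}))  # {'gene1': None}

# The (gene, name) tuple picks the state field that matches the variant type.
gene, name = create_variant_name_tuple(
    {'variantType': 'cnv', 'gene': 'GENE', 'cnvState': 'amplification'}
)
print(gene, name)  # GENE amplification
```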