populationgenomics · MattWellie · Apr 2, 2025 · Apr 2, 2025 · Apr 2, 2025
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,13 +1,16 @@
 [bumpversion]
-current_version = 7.0.3
+current_version = 7.0.4
 commit = True
 tag = False
 
 [bumpversion:file:README.md]
+
 [bumpversion:file:pyproject.toml]
-[bumpversion:file:talos/version.py]
+
+[bumpversion:file:src/talos/version.py]
 
 [bumpversion:file:nextflow/annotation.config]
+
 [bumpversion:file:nextflow/talos.config]
 
 [bumpversion:file:.github/workflows/docker.yaml]
diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
@@ -14,7 +14,7 @@ permissions:
   contents: read
 
 env:
-  VERSION: 7.0.3
+  VERSION: 7.0.4
 
 jobs:
   docker:

diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -13,10 +13,9 @@ jobs:
     - uses: actions/setup-python@v4
       with:
         python-version: '3.10'
-        cache: 'pip'
         cache-dependency-path: requirements-lint.txt
 
-    - name: Install packages
+    - name: Install types
       run: pip install -r requirements-lint.txt
 
     - name: pre-commit

diff --git a/README.md b/README.md
@@ -37,14 +37,14 @@ annotation) and [Echtvar](https://github.com/brentp/echtvar) (used to rapidly ap
 > setting. If you apply another tag you'll have to make the corresponding change in the nextflow config files.
 
 ```commandline
-docker buildx build -t talos:7.0.3 .
+docker buildx build -t talos:7.0.4 .
 ```
 
 > **_NOTE:_**  Note the tag of the dockerfile in this command is kept in sync with the package version and config
 > setting. If you apply another tag you'll have to make the corresponding change in the nextflow config files.
 
 ```commandline
-docker buildx build -t talos:7.0.3 .
+docker buildx build -t talos:7.0.4 .
 ```
 
 The [individual Nextflow Modules](nextflow/modules) describe each step of the pipeline, and could be reimplemented in

diff --git a/nextflow/annotation.config b/nextflow/annotation.config
@@ -26,10 +26,10 @@ params.cohort_output_dir = "nextflow/${params.cohort}_outputs"
 params.generic_output_dir = "nextflow/outputs"
 
 // Not storing a copy of the AM file, collecting it from Zenodo at runtime
-params.alphamissense_url = "https://zenodo.org/records/8208688/files/AlphaMissense_hg38.tsv.gz"
-params.alphamissense_output = "${params.generic_output_dir}/alphamissense_38.ht.tar.gz"
+params.alphamissense_url = "https://zenodo.org/records/8208688/files/AlphaMissense_isoforms_hg38.tsv.gz"
+params.alphamissense_output = "${params.generic_output_dir}/alphamissense_isoforms_38.ht.tar"
 
 // Docker image - built from Dockerfile in the root directory
-// "docker build -t talos:7.0.3 ."
-params.container = 'talos:7.0.3'
+// "docker build -t talos:7.0.4 ."
+params.container = 'talos:7.0.4'
 docker.enabled = true
diff --git a/nextflow/annotation.nf b/nextflow/annotation.nf
@@ -32,14 +32,20 @@ workflow {
     ch_tbis = channel.fromPath(params.input_vcfs).map{ it -> file("${it}.tbi") }
     ch_ref_genome = channel.fromPath(params.ref_genome)
 
+    // pull and parse the MANE data into a JSON file
+    ParseManeIntoJson()
+
     // generate the AlphaMissense HT - long running, stored in a separate folder
     // read in as a channel if this was already generated
     if (file(params.alphamissense_output).exists()) {
         ch_alphamissense_table = channel.fromPath(params.alphamissense_output)
     }
     else {
         LocaliseAlphamissenseWithWget()
-        ParseAlphaMissenseIntoHt(LocaliseAlphamissenseWithWget.out)
+        ParseAlphaMissenseIntoHt(
+        	LocaliseAlphamissenseWithWget.out,
+        	ParseManeIntoJson.out.json
+        )
         ch_alphamissense_table = ParseAlphaMissenseIntoHt.out
     }
 
@@ -75,9 +81,6 @@ workflow {
         ch_ref_genome
     )
 
-    // pull and parse the MANE data into a Hail Table
-    ParseManeIntoJson()
-
     // reformat the annotations in the VCF, retain as a Hail Table
     ReformatAnnotatedVcfIntoHailTable(
         AnnotateCsqWithBcftools.out,

diff --git a/nextflow/inputs/config.toml b/nextflow/inputs/config.toml
@@ -53,15 +53,46 @@ spliceai = 0.5
 csq_string = [ "consequence", "gene_id", "gene", "transcript", "mane_id", "mane", "biotype", "dna_change", "amino_acid_change", "codon", "ensp", "am_class", "am_pathogenicity",]
 
 [ValidateMOI]
-allow_common_clinvar = true
-gnomad_dominant = 0.001
-gnomad_max_homs_dominant = 0
-gnomad_max_homs_recessive = 1
-gnomad_max_ac_dominant = 20
-gnomad_max_hemi = 1
-max_callset_ac_dominant = 10
-callset_af_sv_dominant = 0.01
-phenotype_match = [ "6",]
+
+# callset AF filtering will only be applied to variants with this AC or higher
+min_callset_ac_to_filter = 10
+
+# Global frequency filter
+# Applied to all variants regardless of MOI. Variants reported in ClinVar as P/LP are exempt from these filters
+gnomad_max_af = 0.01
+gnomad_sv_max_af = 0.03
+
+# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
+callset_max_af = 0.01
+callset_sv_max_af = 0.03
+
+gnomad_max_homozygotes = 5
+gnomad_max_hemizygotes = 5
+
+# Dominant frequency filters - only applied to variants being considered for a dominant MOI.
+# Variants reported in ClinVar as P/LP are exempt from these filters
+dominant_gnomad_max_af = 0.00001
+dominant_gnomad_sv_max_af = 0.01
+dominant_gnomad_max_ac = 10
+dominant_gnomad_max_homozygotes = 0
+
+# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
+dominant_callset_max_af = 0.01
+dominant_callset_sv_max_af = 0.01
+dominant_callset_max_ac = 10
+
+# Clinvar frequency filters
+# Applied to variants reported as P/LP in clinvar
+clinvar_gnomad_max_af = 0.05
+clinvar_dominant_gnomad_max_af = 0.00005
+
+# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
+clinvar_callset_max_af = 0.05
+clinvar_dominant_callset_max_af = 0.05
+
+ignore_categories = [ "exomiser", "svdb"]
+phenotype_match = ["6"]
+support_categories = ["6"]
 
 [HPOFlagging]
 semantic_match = true

diff --git a/nextflow/modules/annotation/ParseAlphaMissenseIntoHt/main.nf b/nextflow/modules/annotation/ParseAlphaMissenseIntoHt/main.nf
@@ -7,16 +7,19 @@ process ParseAlphaMissenseIntoHt {
 
     input:
         path(am_tsv)
+        path(mane_json)
 
     output:
-        path("alphamissense_38.ht.tar")
+        path("alphamissense_isoforms_38.ht.tar")
 
     script:
         """
         ParseAlphaMissenseIntoHt \
             --am_tsv ${am_tsv} \
-            --ht_out alphamissense_38.ht
-        tar --no-xattrs -cf alphamissense_38.ht.tar alphamissense_38.ht
-        rm -r alphamissense_38.ht
+            --ht_out alphamissense_isoforms_38.ht \
+            --mane_json ${mane_json}
+
+        tar --no-xattrs -cf alphamissense_isoforms_38.ht.tar alphamissense_isoforms_38.ht
+        rm -r alphamissense_isoforms_38.ht
         """
 }
diff --git a/nextflow/talos.config b/nextflow/talos.config
@@ -32,8 +32,8 @@ params.phenio_db = "${params.large_files}/phenio.db.gz"
 // outputs path(s)
 params.output_dir = "nextflow/${params.cohort}_outputs"
 
-// container built using "docker build -t talos:7.0.3 ."
+// container built using "docker build -t talos:7.0.4 ."
 // this builds a relatively small image with no GCP install
 // this may or may not be the default depending on the docker installation
-params.container = 'talos:7.0.3'
+params.container = 'talos:7.0.4'
 docker.enabled = true
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
 name='talos'
 description='Centre for Population Genomics Variant Prioritisation'
 readme = "README.md"
-version='7.0.3'
+version='7.0.4'
 requires-python = ">=3.10,<3.11"
 license-files = ["LICENSE"]
 classifiers=[

diff --git a/requirements-lint.txt b/requirements-lint.txt
@@ -1,2 +1,4 @@
 pre-commit>=3.5.0
-types-protobuf>5
+types-protobuf
+types-python-dateutil
+types-toml
diff --git a/src/talos/annotation_scripts/ParseAlphaMissenseIntoHt.py b/src/talos/annotation_scripts/ParseAlphaMissenseIntoHt.py
@@ -29,6 +29,8 @@
 
 import hail as hl
 
+from talos.utils import read_json_from_path
+
 
 def process_header(final_header_line: str) -> dict[str, int]:
     """
@@ -53,13 +55,14 @@ def process_header(final_header_line: str) -> dict[str, int]:
     }
 
 
-def filter_for_pathogenic_am(input_file: str, intermediate_file: str):
+def filter_for_pathogenic_am(input_file: str, intermediate_file: str, mane_transcripts: set[str] | None = None):
     """
     read the tsv file, skim for pathogenic entries, then write out to a new file
 
     Args:
         input_file ():
         intermediate_file ():
+        mane_transcripts (set[str], optional): if provided and non-empty, only entries whose version-trimmed transcript ID is in this set are kept
     """
 
     headers = ['chrom', 'pos', 'ref', 'alt', 'transcript', 'am_pathogenicity', 'am_class']
@@ -90,6 +93,9 @@ def filter_for_pathogenic_am(input_file: str, intermediate_file: str):
             # trim transcripts
             content_dict['transcript'] = str(content_dict['transcript']).split('.')[0]
 
+            if mane_transcripts and content_dict['transcript'] not in mane_transcripts:
+                continue
+
             # convert the AM score to a float, and pos to an int
             content_dict['pos'] = int(content_dict['pos'])
             content_dict['am_pathogenicity'] = float(content_dict['am_pathogenicity'])
@@ -126,29 +132,33 @@ def json_to_hail_table(json_file: str, new_ht: str):
 def cli_main():
     parser = ArgumentParser()
     parser.add_argument('--am_tsv', help='path to the AM tsv.gz file')
+    parser.add_argument('--mane_json', help='path to a JSON containing MANE transcript details')
     parser.add_argument('--ht_out', help='path to write a new Hail Table')
-    args, unknown = parser.parse_known_args()
+    args = parser.parse_args()
+    main(alpha_m_file=args.am_tsv, ht_path=args.ht_out, mane_json=args.mane_json)
 
-    if unknown:
-        raise ValueError(unknown)
-    main(alpha_m_file=args.am_tsv, ht_path=args.ht_out)
 
-
-def main(alpha_m_file: str, ht_path: str):
+def main(alpha_m_file: str, ht_path: str, mane_json: str | None = None):
     """
     takes the path to an AlphaMissense TSV, reorganises it into a Hail Table
 
     Args:
         alpha_m_file ():
         ht_path ():
+        mane_json (str, optional): A JSON file containing MANE transcript details
     """
 
     # intermediate file name - NOTE: this is a fixed name, not random, so runs sharing a working directory use the same path
     random_intermediate_file: str = 'temp.json'
 
-    # generate a new tsv of just pathogenic entries
-    filter_for_pathogenic_am(alpha_m_file, random_intermediate_file)
-
+    # generate a new intermediate file of just pathogenic entries, optionally filtering down to MANE transcripts only
+    if mane_json:
+        # read the MANE JSON file
+        mane_data = read_json_from_path(mane_json)
+        mane_transcripts = set(mane_data.keys())
+        filter_for_pathogenic_am(alpha_m_file, random_intermediate_file, mane_transcripts=mane_transcripts)
+    else:
+        filter_for_pathogenic_am(alpha_m_file, random_intermediate_file)
     hl.default_reference('GRCh38')
 
     # now ingest as HT and re-jig some fields

diff --git a/src/talos/example_config.toml b/src/talos/example_config.toml
@@ -36,25 +36,48 @@ symbol = 'AGENE'
 chrom = '1'
 
 [ValidateMOI]
-# thresholds for different filters during the MOI checks
-gnomad_dominant = 0.001
 
-# by default common in callset/gnomAD filters aren't applied to ClinVar pathogenics
-# set this flag to false to apply apply these filters
-# in testing this created a drastic reduction in candidates, at the expense of a few causative variants
-allow_common_clinvar = true
+# callset AF filtering will only be applied to variants with this AC or higher
+min_callset_ac_to_filter = 10
+
+# Global frequency filter
+# Applied to all variants regardless of MOI. Variants reported in ClinVar as P/LP are exempt from these filters
+gnomad_max_af = 0.01
+gnomad_sv_max_af = 0.03
+
+# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
+callset_max_af = 0.01
+callset_sv_max_af = 0.03
+
+gnomad_max_homozygotes = 5
+gnomad_max_hemizygotes = 5
+
+# Dominant frequency filters - only applied to variants being considered for a dominant MOI.
+# Variants reported in ClinVar as P/LP are exempt from these filters
+dominant_gnomad_max_af = 0.00001
+dominant_gnomad_sv_max_af = 0.01
+dominant_gnomad_max_ac = 10
+dominant_gnomad_max_homozygotes = 0
+
+# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
+dominant_callset_max_af = 0.01
+dominant_callset_sv_max_af = 0.01
+dominant_callset_max_ac = 10
+
+# Clinvar frequency filters
+# Applied to variants reported as P/LP in clinvar
+clinvar_gnomad_max_af = 0.05
+clinvar_dominant_gnomad_max_af = 0.00005
+
+# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
+clinvar_callset_max_af = 0.05
+clinvar_dominant_callset_max_af = 0.05
+
+ignore_categories = [ "exomiser", "svdb"]
 
 # we dismiss variants where the proband doesn't have at least this much alt support
 min_alt_depth = 5
 
-# if the population frequency annotations contain allele count, and hemi-count, these are used
-# if absent, these additional tests are skipped
-gnomad_max_homs_dominant = 0
-gnomad_max_homs_recessive = 1
-gnomad_max_ac_dominant = 10
-gnomad_max_hemi = 1
-callset_af_sv_dominant = 0.01
-
 # by default, only consider the top two exomiser results
 exomiser_rank_threshold = 2
 
@@ -66,7 +89,6 @@ exomiser_rank_threshold = 2
 
 # for these categories, require a phenotype-gene match
 # this is the final part of the Category name, e.g. categorydetailspm5 is "pm5", and categorybooleansv1 is "sv1"
-# apologies for the inconsistency
 phenotype_match = ['6']
 
 # all categories in this list are treated as support
@@ -75,15 +97,6 @@ phenotype_match = ['6']
 # all categories not in this list are treated as fully important
 support_categories = ['5', '6']
 
-# These two attributes relate to the filters applied specifically to Dominant MOI tests
-# if the total number of occurences in the callset is lower than this, don't apply an AF filter
-min_callset_ac_to_filter = 5
-# if applied, this is the threshold for the callset AF filter
-callset_af_threshold = 0.01
-
-# set to False if you want population/callset frequency filters to apply to ClinVar pathogenic variants
-allow_common_clinvar = true
-
 [RunHailFiltering]
 # variables affecting how the VCF variants are parsed, and AnalysisVariant objects are populated
 csq_string = [ "consequence", "gene_id", "gene", "transcript", "mane_id", "mane", "biotype", "dna_change", "amino_acid_change", "codon", "ensp", "am_class", "am_pathogenicity",]
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,7 +14,7 @@ permissions: @@
       contents: read
     env:
-      VERSION: 7.0.3
+      VERSION: 7.0.4
     jobs:
       docker:
@@ Expand Down @@