Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
[bumpversion]
current_version = 7.0.3
current_version = 7.0.4
commit = True
tag = False

[bumpversion:file:README.md]

[bumpversion:file:pyproject.toml]
[bumpversion:file:talos/version.py]

[bumpversion:file:src/talos/version.py]

[bumpversion:file:nextflow/annotation.config]

[bumpversion:file:nextflow/talos.config]

[bumpversion:file:.github/workflows/docker.yaml]
2 changes: 1 addition & 1 deletion .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ permissions:
contents: read

env:
VERSION: 7.0.3
VERSION: 7.0.4

jobs:
docker:
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,9 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: '3.10'
cache: 'pip'
cache-dependency-path: requirements-lint.txt

- name: Install packages
- name: Install types
run: pip install -r requirements-lint.txt

- name: pre-commit
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ annotation) and [Echtvar](https://github.com/brentp/echtvar) (used to rapidly ap
> setting. If you apply another tag you'll have to make the corresponding change in the nextflow config files.

```commandline
docker buildx build -t talos:7.0.3 .
docker buildx build -t talos:7.0.4 .
```

> **_NOTE:_** The image tag used in this command is kept in sync with the package version and the Nextflow config
> settings. If you apply a different tag, you'll have to make the corresponding change in the nextflow config files.

```commandline
docker buildx build -t talos:7.0.3 .
docker buildx build -t talos:7.0.4 .
```

The [individual Nextflow Modules](nextflow/modules) describe each step of the pipeline, and could be reimplemented in
Expand Down
8 changes: 4 additions & 4 deletions nextflow/annotation.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ params.cohort_output_dir = "nextflow/${params.cohort}_outputs"
params.generic_output_dir = "nextflow/outputs"

// Not storing a copy of the AM file, collecting it from Zenodo at runtime
params.alphamissense_url = "https://zenodo.org/records/8208688/files/AlphaMissense_hg38.tsv.gz"
params.alphamissense_output = "${params.generic_output_dir}/alphamissense_38.ht.tar.gz"
params.alphamissense_url = "https://zenodo.org/records/8208688/files/AlphaMissense_isoforms_hg38.tsv.gz"
params.alphamissense_output = "${params.generic_output_dir}/alphamissense_isoforms_38.ht.tar"

// Docker image - built from Dockerfile in the root directory
// "docker build -t talos:7.0.3 ."
params.container = 'talos:7.0.3'
// "docker build -t talos:7.0.4 ."
params.container = 'talos:7.0.4'
docker.enabled = true
11 changes: 7 additions & 4 deletions nextflow/annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,20 @@ workflow {
ch_tbis = channel.fromPath(params.input_vcfs).map{ it -> file("${it}.tbi") }
ch_ref_genome = channel.fromPath(params.ref_genome)

// pull and parse the MANE data into a JSON file
ParseManeIntoJson()

// generate the AlphaMissense HT - long running, stored in a separate folder
// read in as a channel if this was already generated
if (file(params.alphamissense_output).exists()) {
ch_alphamissense_table = channel.fromPath(params.alphamissense_output)
}
else {
LocaliseAlphamissenseWithWget()
ParseAlphaMissenseIntoHt(LocaliseAlphamissenseWithWget.out)
ParseAlphaMissenseIntoHt(
LocaliseAlphamissenseWithWget.out,
ParseManeIntoJson.out.json
)
ch_alphamissense_table = ParseAlphaMissenseIntoHt.out
}

Expand Down Expand Up @@ -75,9 +81,6 @@ workflow {
ch_ref_genome
)

// pull and parse the MANE data into a Hail Table
ParseManeIntoJson()

// reformat the annotations in the VCF, retain as a Hail Table
ReformatAnnotatedVcfIntoHailTable(
AnnotateCsqWithBcftools.out,
Expand Down
49 changes: 40 additions & 9 deletions nextflow/inputs/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,46 @@ spliceai = 0.5
csq_string = [ "consequence", "gene_id", "gene", "transcript", "mane_id", "mane", "biotype", "dna_change", "amino_acid_change", "codon", "ensp", "am_class", "am_pathogenicity",]

[ValidateMOI]
allow_common_clinvar = true
gnomad_dominant = 0.001
gnomad_max_homs_dominant = 0
gnomad_max_homs_recessive = 1
gnomad_max_ac_dominant = 20
gnomad_max_hemi = 1
max_callset_ac_dominant = 10
callset_af_sv_dominant = 0.01
phenotype_match = [ "6",]

# callset AF filtering will only be applied to variants with this AC or higher
min_callset_ac_to_filter = 10

# Global frequency filter
# Applied to all variants regardless of MOI. Variants reported in ClinVar as P/LP are exempt from these filters
gnomad_max_af = 0.01
gnomad_sv_max_af = 0.03

# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
callset_max_af = 0.01
callset_sv_max_af = 0.03

gnomad_max_homozygotes = 5
gnomad_max_hemizygotes = 5

# Dominant frequency filters - only applied to variants being considered for a dominant MOI.
# Variants reported in ClinVar as P/LP are exempt from these filters
dominant_gnomad_max_af = 0.00001
dominant_gnomad_sv_max_af = 0.01
dominant_gnomad_max_ac = 10
dominant_gnomad_max_homozygotes = 0

# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
dominant_callset_max_af = 0.01
dominant_callset_sv_max_af = 0.01
dominant_callset_max_ac = 10

# Clinvar frequency filters
# Applied to variants reported as P/LP in clinvar
clinvar_gnomad_max_af = 0.05
clinvar_dominant_gnomad_max_af = 0.00005

# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
clinvar_callset_max_af = 0.05
clinvar_dominant_callset_max_af = 0.05

ignore_categories = [ "exomiser", "svdb"]
phenotype_match = ["6"]
support_categories = ["6"]

[HPOFlagging]
semantic_match = true
Expand Down
11 changes: 7 additions & 4 deletions nextflow/modules/annotation/ParseAlphaMissenseIntoHt/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,19 @@ process ParseAlphaMissenseIntoHt {

input:
path(am_tsv)
path(mane_json)

output:
path("alphamissense_38.ht.tar")
path("alphamissense_isoforms_38.ht.tar")

script:
"""
ParseAlphaMissenseIntoHt \
--am_tsv ${am_tsv} \
--ht_out alphamissense_38.ht
tar --no-xattrs -cf alphamissense_38.ht.tar alphamissense_38.ht
rm -r alphamissense_38.ht
--ht_out alphamissense_isoforms_38.ht \
--mane_json ${mane_json}

tar --no-xattrs -cf alphamissense_isoforms_38.ht.tar alphamissense_isoforms_38.ht
rm -r alphamissense_isoforms_38.ht
"""
}
4 changes: 2 additions & 2 deletions nextflow/talos.config
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ params.phenio_db = "${params.large_files}/phenio.db.gz"
// outputs path(s)
params.output_dir = "nextflow/${params.cohort}_outputs"

// container built using "docker build -t talos:7.0.3 ."
// container built using "docker build -t talos:7.0.4 ."
// this builds a relatively small image with no GCP install
// this may or may not be the default depending on the docker installation
params.container = 'talos:7.0.3'
params.container = 'talos:7.0.4'
docker.enabled = true
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "hatchling.build"
name='talos'
description='Centre for Population Genomics Variant Prioritisation'
readme = "README.md"
version='7.0.3'
version='7.0.4'
requires-python = ">=3.10,<3.11"
license-files = ["LICENSE"]
classifiers=[
Expand Down
4 changes: 3 additions & 1 deletion requirements-lint.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
pre-commit>=3.5.0
types-protobuf>5
types-protobuf
types-python-dateutil
types-toml
30 changes: 20 additions & 10 deletions src/talos/annotation_scripts/ParseAlphaMissenseIntoHt.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

import hail as hl

from talos.utils import read_json_from_path


def process_header(final_header_line: str) -> dict[str, int]:
"""
Expand All @@ -53,13 +55,14 @@ def process_header(final_header_line: str) -> dict[str, int]:
}


def filter_for_pathogenic_am(input_file: str, intermediate_file: str):
def filter_for_pathogenic_am(input_file: str, intermediate_file: str, mane_transcripts: set[str] | None = None):
"""
read the tsv file, skim for pathogenic entries, then write out to a new file

Args:
input_file ():
intermediate_file ():
mane_transcripts (set[str]): a set of transcripts to filter for
"""

headers = ['chrom', 'pos', 'ref', 'alt', 'transcript', 'am_pathogenicity', 'am_class']
Expand Down Expand Up @@ -90,6 +93,9 @@ def filter_for_pathogenic_am(input_file: str, intermediate_file: str):
# trim transcripts
content_dict['transcript'] = str(content_dict['transcript']).split('.')[0]

if mane_transcripts and content_dict['transcript'] not in mane_transcripts:
continue

# convert the AM score to a float, and pos to an int
content_dict['pos'] = int(content_dict['pos'])
content_dict['am_pathogenicity'] = float(content_dict['am_pathogenicity'])
Expand Down Expand Up @@ -126,29 +132,33 @@ def json_to_hail_table(json_file: str, new_ht: str):
def cli_main():
parser = ArgumentParser()
parser.add_argument('--am_tsv', help='path to the AM tsv.gz file')
parser.add_argument('--mane_json', help='path to a JSON containing MANE transcript details')
parser.add_argument('--ht_out', help='path to write a new Hail Table')
args, unknown = parser.parse_known_args()
args = parser.parse_args()
main(alpha_m_file=args.am_tsv, ht_path=args.ht_out, mane_json=args.mane_json)

if unknown:
raise ValueError(unknown)
main(alpha_m_file=args.am_tsv, ht_path=args.ht_out)


def main(alpha_m_file: str, ht_path: str):
def main(alpha_m_file: str, ht_path: str, mane_json: str | None = None):
"""
takes the path to an AlphaMissense TSV, reorganises it into a Hail Table

Args:
alpha_m_file ():
ht_path ():
mane_json (str, optional): A JSON file containing MANE transcript details
"""

# name of the intermediate file for the filtered entries (NOTE: currently a fixed name, not randomised)
random_intermediate_file: str = 'temp.json'

# generate a new tsv of just pathogenic entries
filter_for_pathogenic_am(alpha_m_file, random_intermediate_file)

# generate a new tsv of just pathogenic entries, optionally filtering down to MANE transcripts only
if mane_json:
# read the MANE JSON file
mane_data = read_json_from_path(mane_json)
mane_transcripts = set(mane_data.keys())
filter_for_pathogenic_am(alpha_m_file, random_intermediate_file, mane_transcripts=mane_transcripts)
else:
filter_for_pathogenic_am(alpha_m_file, random_intermediate_file)
hl.default_reference('GRCh38')

# now ingest as HT and re-jig some fields
Expand Down
61 changes: 37 additions & 24 deletions src/talos/example_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,25 +36,48 @@ symbol = 'AGENE'
chrom = '1'

[ValidateMOI]
# thresholds for different filters during the MOI checks
gnomad_dominant = 0.001

# by default common in callset/gnomAD filters aren't applied to ClinVar pathogenics
# set this flag to false to apply apply these filters
# in testing this created a drastic reduction in candidates, at the expense of a few causative variants
allow_common_clinvar = true
# callset AF filtering will only be applied to variants with this AC or higher
min_callset_ac_to_filter = 10

# Global frequency filter
# Applied to all variants regardless of MOI. Variants reported in ClinVar as P/LP are exempt from these filters
gnomad_max_af = 0.01
gnomad_sv_max_af = 0.03

# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
callset_max_af = 0.01
callset_sv_max_af = 0.03

gnomad_max_homozygotes = 5
gnomad_max_hemizygotes = 5

# Dominant frequency filters - only applied to variants being considered for a dominant MOI.
# Variants reported in ClinVar as P/LP are exempt from these filters
dominant_gnomad_max_af = 0.00001
dominant_gnomad_sv_max_af = 0.01
dominant_gnomad_max_ac = 10
dominant_gnomad_max_homozygotes = 0

# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
dominant_callset_max_af = 0.01
dominant_callset_sv_max_af = 0.01
dominant_callset_max_ac = 10

# Clinvar frequency filters
# Applied to variants reported as P/LP in clinvar
clinvar_gnomad_max_af = 0.05
clinvar_dominant_gnomad_max_af = 0.00005

# callset_* filters only apply to variants with a callset AC > min_callset_ac_to_filter.
clinvar_callset_max_af = 0.05
clinvar_dominant_callset_max_af = 0.05

ignore_categories = [ "exomiser", "svdb"]

# we dismiss variants where the proband doesn't have at least this much alt support
min_alt_depth = 5

# if the population frequency annotations contain allele count, and hemi-count, these are used
# if absent, these additional tests are skipped
gnomad_max_homs_dominant = 0
gnomad_max_homs_recessive = 1
gnomad_max_ac_dominant = 10
gnomad_max_hemi = 1
callset_af_sv_dominant = 0.01

# by default, only consider the top two exomiser results
exomiser_rank_threshold = 2

Expand All @@ -66,7 +89,6 @@ exomiser_rank_threshold = 2

# for these categories, require a phenotype-gene match
# this is the final part of the Category name, e.g. categorydetailspm5 is "pm5", and categorybooleansv1 is "sv1"
# apologies for the inconsistency
phenotype_match = ['6']

# all categories in this list are treated as support
Expand All @@ -75,15 +97,6 @@ phenotype_match = ['6']
# all categories not in this list are treated as fully important
support_categories = ['5', '6']

# These two attributes relate to the filters applied specifically to Dominant MOI tests
# if the total number of occurences in the callset is lower than this, don't apply an AF filter
min_callset_ac_to_filter = 5
# if applied, this is the threshold for the callset AF filter
callset_af_threshold = 0.01

# set to False if you want population/callset frequency filters to apply to ClinVar pathogenic variants
allow_common_clinvar = true

[RunHailFiltering]
# variables affecting how the VCF variants are parsed, and AnalysisVariant objects are populated
csq_string = [ "consequence", "gene_id", "gene", "transcript", "mane_id", "mane", "biotype", "dna_change", "amino_acid_change", "codon", "ensp", "am_class", "am_pathogenicity",]
Expand Down
Loading