From 17765ef115950f1d721786cc2e06423a1d1ae5b2 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 14 Apr 2020 08:53:52 -0700 Subject: [PATCH 01/69] issue #170 --- cat/hints_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cat/hints_db.py b/cat/hints_db.py index 8b73cfc0..3c049c94 100644 --- a/cat/hints_db.py +++ b/cat/hints_db.py @@ -307,7 +307,7 @@ def run_protein_aln(job, protein_subset, genome_fasta_file_id): # perform alignment tmp_exonerate = tools.fileOps.get_tmp_toil_file() cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no', '--showalignment', 'no', - '--showquerygff', 'yes', genome_fasta, protein_fasta] + '--showquerygff', 'yes', protein_fasta, genome_fasta] tools.procOps.run_proc(cmd, stdout=tmp_exonerate) return job.fileStore.writeGlobalFile(tmp_exonerate) From eee10a4730ea100e15cf5abcb6b461db648ca026 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 28 Apr 2020 09:20:04 -0700 Subject: [PATCH 02/69] Cupcake script; assorted bugs --- programs/convert_cupcake_gff | 40 ++++++++++++++++++++++++++++++++++++ tools/gff3.py | 5 +++-- 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100755 programs/convert_cupcake_gff diff --git a/programs/convert_cupcake_gff b/programs/convert_cupcake_gff new file mode 100755 index 00000000..746c8fe1 --- /dev/null +++ b/programs/convert_cupcake_gff @@ -0,0 +1,40 @@ +#!/usr/bin/env python +import argparse +from tools.fileOps import TemporaryFilePath, print_row +from tools.transcripts import gene_pred_iterator, group_transcripts_by_name2 +from tools.procOps import run_proc + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('cupcake_gff', help='Cupcake GFF file') + parser.add_argument('gff3', help='Output GFF3') + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + with TemporaryFilePath() as tmp_gp: + cmd = ['gtfToGenePred', '-genePredExt', args.cupcake_gff, tmp_gp] + run_proc(cmd) + d = group_transcripts_by_name2(gene_pred_iterator(tmp_gp)) + with open(args.gff3, "w") as fh: + fh.write('##gff-version 3\n') + for gene_id, tx_list in d.items(): + for tx in tx_list: + attrs = {'transcript_id': tx.name, 'gene_id': gene_id, 'gene_type': 'IsoSeq_Cupcake', + 'transcript_type': 'IsoSeq_Cupcake'} + attrs['gene_name'] = attrs['gene_id'] + attrs['transcript_name'] = attrs['transcript_id'] + attrs['ID'] = attrs['transcript_id'] + # first write transcript line + tx_l = [tx.chromosome, 'PacBio', 'transcript', tx.start + 1, tx.stop, '.', tx.strand, '.', + ';'.join(['='.join([x, y]) for x, y in attrs.items()])] + print_row(fh, tx_l) + # exon lines + for e in tx.exon_intervals: + attrs['Parent'] = attrs['transcript_id'] + attrs['ID'] = 'exon:{}'.format(attrs['transcript_id']) + ex_l = [e.chromosome, 'PacBio', 'exon', e.start + 1, e.stop, '.', e.strand, '.', + ';'.join(['='.join([x, y]) for x, y in attrs.items()])] + print_row(fh, ex_l) diff --git a/tools/gff3.py b/tools/gff3.py index a424af6f..ece3cfc9 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -35,11 +35,12 @@ def parse_attrs(attrs): gene_id = d['gene_id'] tx_id = d['transcript_id'] tx_name = d['transcript_name'] - extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B')) for x, y in d.items() if x not in reserved_keys]) + extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B').replace('=', '%3D')) + for x, y in d.items() if x not in reserved_keys]) try: misc.parse_gff_attr_line(extra_tags) except: - raise Exception('Error parsing extra tags in input GFF3') + raise Exception(f'Error parsing extra tags in input GFF3 {extra_tags}') if is_external_reference is True: # hack to fix names gene_id = f'exRef-{gene_id}' From 4958b754462a7d771723d132eebe77c4aa1f86c8 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 28 Apr 2020 10:32:54 -0700 Subject: [PATCH 03/69] Allow for users to specify singularity image via environmental variables; for issue #172 --- cat/__init__.py | 8 +++++--- tools/procOps.py | 7 +++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cat/__init__.py b/cat/__init__.py index 09332363..1e4067a4 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -185,7 +185,7 @@ def get_pipeline_args(self): args.set('global_near_best', self.global_near_best, True) args.set('filter_overlapping_genes', self.filter_overlapping_genes, True) args.set('overlapping_gene_distance', self.overlapping_gene_distance, True) - + # user specified flags for consensus finding args.set('intron_rnaseq_support', self.intron_rnaseq_support, False) args.set('exon_rnaseq_support', self.exon_rnaseq_support, False) @@ -398,6 +398,8 @@ def load_docker(self): 'Either install it or use a different option for --binary-mode.') os.environ['SINGULARITY_PULLFOLDER'] = os.path.abspath(self.work_dir) os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(self.work_dir) + if os.environ.get('SINGULARITY_IMAGE'): + return tools.fileOps.ensure_dir(self.work_dir) if not os.path.isfile(os.path.join(self.work_dir, 'cat.img')): subprocess.check_call(['singularity', 'pull', '--name', 'cat.img', @@ -565,7 +567,7 @@ def success(task): os.remove(os.path.abspath(task.job_store)) except OSError: pass - else: + else: cmd = ['toil', 'stats', '--raw', os.path.abspath(task.job_store)] raw = tools.procOps.call_proc(cmd) parsed = raw[raw.index('{'):raw.rfind('}') + 1] @@ -2298,7 +2300,7 @@ def output(self): def run(self): pipeline_args = self.get_pipeline_args() luigi_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'stats') - + try: toil_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'toil_stats') except ValueError: diff --git a/tools/procOps.py b/tools/procOps.py index 9d83b490..47350980 100644 --- a/tools/procOps.py +++ b/tools/procOps.py @@ -26,7 +26,10 @@ def cmdLists(cmd): else: return getDockerCommand('quay.io/ucsc_cgl/cat', cmd) elif os.environ.get('CAT_BINARY_MODE') == 'singularity': - img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img') + if os.environ.get('SINGULARITY_IMAGE'): + img = os.environ['SINGULARITY_IMAGE'] + else: + img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img') assert os.path.exists(img) if isinstance(cmd[0], list): return list([get_singularity_command(img, c) for c in cmd]) @@ -107,7 +110,7 @@ def popen_catch(command, stdin=None): def mrca_path(path1, path2): """ Gives the Most Recent Common Ancestor directory that contains both paths. - + >>> mrca_path('/usr/lib/python2.7', '/usr/bin/python') '/usr' >>> mrca_path('/usr/', '/usr/') From 0e64724c1afd9d0d40f8c21f844c9c10be55b670 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 28 Apr 2020 11:48:36 -0700 Subject: [PATCH 04/69] Update __init__.py --- cat/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cat/__init__.py b/cat/__init__.py index 09332363..48b53085 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -3184,7 +3184,7 @@ def construct_consensus_gp_as(has_rna, has_pb): ''' -snake_composite = '''track hubCentral +snake_composite = '''track cactus compositeTrack on shortLabel Cactus longLabel Cactus Alignment Tracks @@ -3199,11 +3199,11 @@ def construct_consensus_gp_as(has_rna, has_pb): visibility full type bigBed 3 - track hubCentralAlignments + track cactusAlignments shortLabel Alignments view Alignments visibility full - subTrack hubCentral + subTrack cactus ''' @@ -3212,7 +3212,7 @@ def construct_consensus_gp_as(has_rna, has_pb): shortLabel {genome} otherSpecies {genome} visibility {visibility} - parent hubCentralAlignments off + parent cactusAlignments off priority 3 bigDataUrl {hal_path} type halSnake From 27d6dde91acd00a98c2e90d1749047249ed67cc6 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 29 Apr 2020 08:53:10 -0700 Subject: [PATCH 05/69] allow type --- tools/gff3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/gff3.py b/tools/gff3.py index ece3cfc9..9383688a 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -29,8 +29,8 @@ def parse_attrs(attrs): results = [] for tx_id, gene_id in tx_name_map.items(): d = attrs_dict[tx_id] - gene_biotype = d['gene_biotype'] - tx_biotype = d['transcript_biotype'] + gene_biotype = d.get('gene_biotype', d['gene_type']) + tx_biotype = d.get('transcript_biotype', d['transcript_type']) gene_name = d['gene_name'] gene_id = d['gene_id'] tx_id = d['transcript_id'] From fc471445e8971863b6c7397718348dda9ce43972 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 29 Apr 2020 08:55:01 -0700 Subject: [PATCH 06/69] update reserved keys --- tools/gff3.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/gff3.py b/tools/gff3.py index 9383688a..ae204bab 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -7,6 +7,8 @@ reserved_keys = ['gene_biotype', 'transcript_biotype', + 'gene_type', + 'transcript_type', 'gene_name', 'gene_id', 'transcript_id', From 4991350332b6b631c4e89e44029400e947365d2d Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 29 Apr 2020 09:11:50 -0700 Subject: [PATCH 07/69] I see the problem --- tools/gff3.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/gff3.py b/tools/gff3.py index ae204bab..b48db726 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -39,10 +39,11 @@ def parse_attrs(attrs): tx_name = d['transcript_name'] extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B').replace('=', '%3D')) for x, y in d.items() if x not in reserved_keys]) - try: - misc.parse_gff_attr_line(extra_tags) - except: - raise Exception(f'Error parsing extra tags in input GFF3 {extra_tags}') + if len(extra_tags) > 0: + try: + misc.parse_gff_attr_line(extra_tags) + except: + raise Exception(f'Error parsing extra tags in input GFF3 {extra_tags}') if is_external_reference is True: # hack to fix names gene_id = f'exRef-{gene_id}' From 859a323b08b8703b653a85726e8bc22672445878 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 10 May 2020 08:24:22 -0700 Subject: [PATCH 08/69] gff parsing bugs --- cat/consensus.py | 4 ++-- tools/gff3.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cat/consensus.py b/cat/consensus.py index e5443880..375f75f1 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -911,7 +911,7 @@ def convert_attrs(attrs, id_field): # don't include the support vectors in the string, they will be placed in their respective places attrs_str = ['='.join([key, str(val).replace('=', '_')]) for key, val in sorted(attrs.items()) if 'support' not in key] # explicitly escape any semicolons that may exist in the input strings - attrs_str = [x.replace(';', '%3B') for x in attrs_str] + attrs_str = [x.replace(';', '%3B').replace('=', '%3D') for x in attrs_str] return score, ';'.join(attrs_str) def find_feature_support(attrs, feature, i): @@ -929,7 +929,7 @@ def find_all_tx_modes(attrs_list): for attrs in attrs_list: tx_modes.update(attrs['transcript_modes'].split(',')) return ','.join(tx_modes) - + intervals = set() for tx in tx_objs: intervals.update(tx.exon_intervals) diff --git a/tools/gff3.py b/tools/gff3.py index b48db726..3ca808fd 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -31,8 +31,12 @@ def parse_attrs(attrs): results = [] for tx_id, gene_id in tx_name_map.items(): d = attrs_dict[tx_id] - gene_biotype = d.get('gene_biotype', d['gene_type']) - tx_biotype = d.get('transcript_biotype', d['transcript_type']) + gene_biotype = d.get('gene_biotype', d.get('gene_type', None)) + if gene_biotype is None: + raise Exception("Did not find a gene biotype or gene type for {}".format(gene_id)) + tx_biotype = d.get('transcript_biotype', d.get('transcript_type', None)) + if tx_biotype is None: + raise Exception("Did not find a transcript biotype or type for {}".format(tx_id)) gene_name = d['gene_name'] gene_id = d['gene_id'] tx_id = d['transcript_id'] From 2914597ef13c09526b1ab6f93ab63eaf395e4cbd Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 10 May 2020 08:25:51 -0700 Subject: [PATCH 09/69] more clear error --- tools/gff3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/gff3.py b/tools/gff3.py index 3ca808fd..f0101182 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -33,10 +33,10 @@ def parse_attrs(attrs): d = attrs_dict[tx_id] gene_biotype = d.get('gene_biotype', d.get('gene_type', None)) if gene_biotype is None: - raise Exception("Did not find a gene biotype or gene type for {}".format(gene_id)) + raise Exception("Did not find a gene biotype or gene type for {} (attrs={})".format(gene_id, d)) tx_biotype = d.get('transcript_biotype', d.get('transcript_type', None)) if tx_biotype is None: - raise Exception("Did not find a transcript biotype or type for {}".format(tx_id)) + raise Exception("Did not find a transcript biotype or type for {} (attrs={})".format(tx_id, d)) gene_name = d['gene_name'] gene_id = d['gene_id'] tx_id = d['transcript_id'] From 1c4c060c59e1e57e19b7ad5e34399704df92403b Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 10 May 2020 08:43:10 -0700 Subject: [PATCH 10/69] update script --- programs/convert_ncbi_gff3.py | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 programs/convert_ncbi_gff3.py diff --git a/programs/convert_ncbi_gff3.py b/programs/convert_ncbi_gff3.py new file mode 100644 index 00000000..13de7528 --- /dev/null +++ b/programs/convert_ncbi_gff3.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +"""Developed against Rnor6 RefSeq""" +import argparse +from BCBio import GFF +from collections import Counter, OrderedDict + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_gff3', help='RefSeq GFF3 file') + parser.add_argument('output_gff3', help='Output GFF3') + return parser.parse_args() + + +def add_tags_to_feature(feature): + if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', + 'rRNA', 'mRNA', 'C_gene_segment', + 'J_gene_segment', 'V_gene_segment', + 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene']: + return + ids = dict([x.split(':') for x in feature.qualifiers.get('dbxref', feature.qualifiers['Dbxref'])]) + if feature.type == 'gene': + feature.qualifiers['gene_id'] = [ids['GeneID']] + return + feature.qualifiers['gene_id'] = [ids['GeneID']] + feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers['gene_id']) + tx_id = feature.qualifiers.get('transcript_id', feature.qualifiers['ID'])[0] + if '-' in tx_id: + tx_id = tx_id.split('-', 1)[1] + feature.qualifiers['transcript_id'] = [tx_id] + feature.qualifiers['transcript_name'] = feature.qualifiers.get('Name', feature.qualifiers['transcript_id']) + if feature.type in ['mRNA', 'CDS']: + gene_biotype = transcript_biotype = ['protein_coding'] + elif feature.type == 'primary_transcript': + gene_biotype = ['miRNA'] + transcript_biotype = feature.type + else: + gene_biotype = transcript_biotype = [feature.type] + feature.qualifiers['gene_biotype'] = gene_biotype + feature.qualifiers['transcript_biotype'] = transcript_biotype + + +def construct_new_qualifiers(feature): + new_qualifiers = OrderedDict() + for key, val in feature.qualifiers.items(): + # no upper case keys unless it is ID or Parent or Name + if key not in ['ID', 'Parent', 'Name']: + key = key.lower() + # collapse to a single item + # replace all semicolons + if len(val) > 1: + val = [' '.join([x.replace(';', '%3B').replace('=', '%3D') for x in val])] + new_qualifiers[key] = val + # clean up and make parseable + for key, val in new_qualifiers.items(): + if sum(len(x) for x in val) == 0: + new_qualifiers[key] = 'True' + return new_qualifiers + + +def feature_traversal(feature): + yield feature + for sub_feature in feature.sub_features: + yield from feature_traversal(sub_feature) + + +if __name__ == '__main__': + args = parse_args() + lt_counts = Counter() + records = list(GFF.parse(args.input_gff3)) + for seqrecord in records: + for parent_feature in seqrecord.features: + for feature in feature_traversal(parent_feature): + try: + add_tags_to_feature(feature, lt_counts) + except KeyError: + assert False, feature.qualifiers + new_qualifiers = construct_new_qualifiers(feature) + feature.qualifiers = new_qualifiers + + with open(args.output_gff3, 'w') as fh: + GFF.write(records, fh) From 25f521cd77aefb9db48b35d056189f5d1f5abec9 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 10 May 2020 08:51:20 -0700 Subject: [PATCH 11/69] oops --- programs/convert_ncbi_gff3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/programs/convert_ncbi_gff3.py b/programs/convert_ncbi_gff3.py index 13de7528..5d02827b 100644 --- a/programs/convert_ncbi_gff3.py +++ b/programs/convert_ncbi_gff3.py @@ -66,13 +66,12 @@ def feature_traversal(feature): if __name__ == '__main__': args = parse_args() - lt_counts = Counter() records = list(GFF.parse(args.input_gff3)) for seqrecord in records: for parent_feature in seqrecord.features: for feature in feature_traversal(parent_feature): try: - add_tags_to_feature(feature, lt_counts) + add_tags_to_feature(feature) except KeyError: assert False, feature.qualifiers new_qualifiers = construct_new_qualifiers(feature) From c0a8500d6cee1dad4b9c47cb48b8c90b62f826a2 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 10 May 2020 09:37:48 -0700 Subject: [PATCH 12/69] oops --- programs/convert_ncbi_gff3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/convert_ncbi_gff3.py b/programs/convert_ncbi_gff3.py index 5d02827b..e73f6a25 100644 --- a/programs/convert_ncbi_gff3.py +++ b/programs/convert_ncbi_gff3.py @@ -33,7 +33,7 @@ def add_tags_to_feature(feature): gene_biotype = transcript_biotype = ['protein_coding'] elif feature.type == 'primary_transcript': gene_biotype = ['miRNA'] - transcript_biotype = feature.type + transcript_biotype = [feature.type] else: gene_biotype = transcript_biotype = [feature.type] feature.qualifiers['gene_biotype'] = gene_biotype From 89071a15f884adbe66e43dc2b63904e16dbd8c9b Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 10 May 2020 10:10:14 -0700 Subject: [PATCH 13/69] fixing install and permissions --- ...convert_ncbi_gff3.py => convert_ncbi_gff3} | 0 programs/validate_gff3 | 40 +++++++++---------- setup.py | 3 +- 3 files changed, 21 insertions(+), 22 deletions(-) rename programs/{convert_ncbi_gff3.py => convert_ncbi_gff3} (100%) mode change 100644 => 100755 diff --git a/programs/convert_ncbi_gff3.py b/programs/convert_ncbi_gff3 old mode 100644 new mode 100755 similarity index 100% rename from programs/convert_ncbi_gff3.py rename to programs/convert_ncbi_gff3 diff --git a/programs/validate_gff3 b/programs/validate_gff3 index 1d16f1bf..b81d70bd 100755 --- a/programs/validate_gff3 +++ b/programs/validate_gff3 @@ -1,4 +1,3 @@ -#!/usr/bin/env python """ This script allows you to test your GFF3 for compatibility with CAT before running the pipeline. @@ -11,7 +10,6 @@ import tools.fileOps import tools.transcripts import collections - def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('gff3', help='GFF3 to validate') @@ -28,39 +26,39 @@ if __name__ == '__main__': for l in open(gp): l = l.split() c[l[0]] += 1 - duplicates = {x for x, y in c.items() if y > 1} - assert len(duplicates) == 0, 'Found {} duplicate genes: {}'.format(len(duplicates), '\n'.join(duplicates)) + duplicates = {x for x, y in c.iteritems() if y > 1} + assert len(duplicates) == 0, 'Found {} duplicate transcript IDs: {}'.format(len(duplicates), '\n'.join(duplicates)) df = tools.gff3.parse_gff3(attrs, gp) tx_dict = tools.transcripts.get_gene_pred_dict(gp) assert len(tx_dict) == len(df) - exons = {len(x.exon_intervals) for x in tx_dict.values()} + exons = {len(x.exon_intervals) for x in tx_dict.itervalues()} if len(exons) == 1: - print('WARNING: no multi-exonic transcripts found') - assert tx_dict.keys() == set(df.index) - genes = {x.name2 for x in tx_dict.values()} + print 'WARNING: no multi-exonic transcripts found' + assert tx_dict.viewkeys() == set(df.index) + genes = {x.name2 for x in tx_dict.itervalues()} assert genes == set(df.GeneId) - print('Found {} transcripts and {} genes'.format(len(tx_dict), len(genes))) + print 'Found {} transcripts and {} genes'.format(len(tx_dict), len(genes)) tmp = df.groupby('GeneId').first() - print('Found the following gene biotypes: {}'.format(collections.Counter(tmp.GeneBiotype))) - print('Found the following transcript biotypes: {}'.format(collections.Counter(df.TranscriptBiotype))) - print('Some example database rows:') - print(df.head(20)) - print('Some example gene rows:') - print(tmp.head(10)) + print 'Found the following gene biotypes: {}'.format(collections.Counter(tmp.GeneBiotype)) + print 'Found the following transcript biotypes: {}'.format(collections.Counter(df.TranscriptBiotype)) + print 'Some example database rows:' + print df.head(20) + print 'Some example gene rows:' + print tmp.head(10) # validate one-to-one mappings for reference mixed_genes = {} for gene_id, d in df.groupby('GeneId'): if len(set(d.GeneName)) != 1: mixed_genes[gene_id] = ','.join(set(d.GeneName)) if len(mixed_genes) > 0: - print('Found {} genes where one gene ID maps to multiple gene names:'.format(len(mixed_genes))) - for x, y in mixed_genes.items(): - print('{}: {}'.format(x, y)) + print 'Found {} genes where one gene ID maps to multiple gene names:'.format(len(mixed_genes)) + for x, y in mixed_genes.iteritems(): + print '{}: {}'.format(x, y) mixed_genes = {} for gene_id, d in df.groupby('GeneName'): if len(set(d.GeneId)) != 1: mixed_genes[gene_id] = ','.join(set(d.GeneId)) if len(mixed_genes) > 0: - print('Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes))) - for x, y in mixed_genes.items(): - print('{}: {}'.format(x, y)) + print 'Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes)) + for x, y in mixed_genes.iteritems(): + print '{}: {}'.format(x, y) diff --git a/setup.py b/setup.py index 94706655..7150170f 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ ], scripts=['programs/cat_to_ncbi_submit', 'programs/translate_gene_pred', 'programs/validate_gff3', 'programs/cat_parse_ncbi_genbank', - 'programs/cat_parse_ncbi_refseq', 'programs/cat_parse_prokka_gff3'], + 'programs/cat_parse_ncbi_refseq', 'programs/cat_parse_prokka_gff3', + 'programs/convert_ncbi_gff3', 'programs/convert_cupcake_gff'], author='Ian Fiddes', description='Comparative Annotation Toolkit', url='https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit', From e3c467766efa20dc495bee522d3ace80d440ee15 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:12:28 -0700 Subject: [PATCH 14/69] bad data; chromosome name script --- programs/convert_ncbi_gff3.py | 81 +++++++++++++++++++ ...convert_ncbi_refseq_genbank_chromosomes.py | 37 +++++++++ 2 files changed, 118 insertions(+) create mode 100644 programs/convert_ncbi_gff3.py create mode 100644 programs/convert_ncbi_refseq_genbank_chromosomes.py diff --git a/programs/convert_ncbi_gff3.py b/programs/convert_ncbi_gff3.py new file mode 100644 index 00000000..d279ed0c --- /dev/null +++ b/programs/convert_ncbi_gff3.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +"""Developed against Rnor6 RefSeq""" +import argparse +from BCBio import GFF +from collections import Counter, OrderedDict + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_gff3', help='RefSeq GFF3 file') + parser.add_argument('output_gff3', help='Output GFF3') + return parser.parse_args() + + +def add_tags_to_feature(feature): + if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', + 'rRNA', 'mRNA', 'C_gene_segment', + 'J_gene_segment', 'V_gene_segment', + 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene']: + return + ids = dict([x.split(':', 1) for x in feature.qualifiers.get('dbxref', feature.qualifiers['Dbxref'])]) + if feature.type == 'gene': + feature.qualifiers['gene_id'] = [ids['GeneID']] + return + feature.qualifiers['gene_id'] = [ids['GeneID']] + feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers['gene_id']) + tx_id = feature.qualifiers.get('transcript_id', feature.qualifiers['ID'])[0] + if '-' in tx_id: + tx_id = tx_id.split('-', 1)[1] + feature.qualifiers['transcript_id'] = [tx_id] + feature.qualifiers['transcript_name'] = feature.qualifiers.get('Name', feature.qualifiers['transcript_id']) + if feature.type in ['mRNA', 'CDS']: + gene_biotype = transcript_biotype = ['protein_coding'] + elif feature.type == 'primary_transcript': + gene_biotype = ['miRNA'] + transcript_biotype = [feature.type] + else: + gene_biotype = transcript_biotype = [feature.type] + feature.qualifiers['gene_biotype'] = gene_biotype + feature.qualifiers['transcript_biotype'] = transcript_biotype + + +def construct_new_qualifiers(feature): + new_qualifiers = OrderedDict() + for key, val in feature.qualifiers.items(): + # no upper case keys unless it is ID or Parent or Name + if key not in ['ID', 'Parent', 'Name']: + key = key.lower() + # collapse to a single item + # replace all semicolons + if len(val) > 1: + val = [' '.join([x.replace(';', '%3B').replace('=', '%3D') for x in val])] + new_qualifiers[key] = val + # clean up and make parseable + for key, val in new_qualifiers.items(): + if sum(len(x) for x in val) == 0: + new_qualifiers[key] = 'True' + return new_qualifiers + + +def feature_traversal(feature): + yield feature + for sub_feature in feature.sub_features: + yield from feature_traversal(sub_feature) + + +if __name__ == '__main__': + args = parse_args() + records = list(GFF.parse(args.input_gff3)) + for seqrecord in records: + for parent_feature in seqrecord.features: + for feature in feature_traversal(parent_feature): + try: + add_tags_to_feature(feature) + except KeyError: + assert False, feature.qualifiers + new_qualifiers = construct_new_qualifiers(feature) + feature.qualifiers = new_qualifiers + + with open(args.output_gff3, 'w') as fh: + GFF.write(records, fh) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes.py b/programs/convert_ncbi_refseq_genbank_chromosomes.py new file mode 100644 index 00000000..d4064af4 --- /dev/null +++ b/programs/convert_ncbi_refseq_genbank_chromosomes.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +"""Convert RefSeq to GenBank or vice versa""" +import argparse +import pandas as pd + +header = ['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', +'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Length', 'UCSC-style-name'] + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_gff3', help='RefSeq GFF3 file') + parser.add_argument('conversion_table', help='Conversion table') + parser.add_argument('output_gff3', help='Output GFF3') + parser.add_argument('--refseq-to-genbank', type=bool, default=True, help='Set this to flip direction to RefSeq -> GenBank') + return parser.parse_args() + + +if __name__ == "__main__": + df = pd.read_csv(args.conversion_table, sep='\t', comment='#') + df.columns = header + if args.refseq_to_genbank: + m = dict(zip(df['RefSeq-Accn'], df['GenBank-Accn'])) + else: + m = dict(zip(df['GenBank-Accn'], df['RefSeq-Accn'])) + with open(args.output_gff3, 'w') as fh: + for row in open(args.input_gff3): + if row.startswith('##gff'): + fh.write(row) + elif row.startswith('#'): + continue + row = row.split('\t') + if row[0] in m: + row[0] = m[row[0]] + else: + print('Row unparseable: {}'.format(row)) + fh.write('\t'.join(row)) \ No newline at end of file From b09781a23a4cc69e0b08ed882a6f14465dde08f5 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:17:53 -0700 Subject: [PATCH 15/69] oops --- programs/convert_ncbi_gff3 | 2 +- programs/convert_ncbi_gff3.py | 81 ------------------- ...> convert_ncbi_refseq_genbank_chromosomes} | 0 setup.py | 3 +- 4 files changed, 3 insertions(+), 83 deletions(-) delete mode 100644 programs/convert_ncbi_gff3.py rename programs/{convert_ncbi_refseq_genbank_chromosomes.py => convert_ncbi_refseq_genbank_chromosomes} (100%) mode change 100644 => 100755 diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index e73f6a25..d279ed0c 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -18,7 +18,7 @@ def add_tags_to_feature(feature): 'J_gene_segment', 'V_gene_segment', 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene']: return - ids = dict([x.split(':') for x in feature.qualifiers.get('dbxref', feature.qualifiers['Dbxref'])]) + ids = dict([x.split(':', 1) for x in feature.qualifiers.get('dbxref', feature.qualifiers['Dbxref'])]) if feature.type == 'gene': feature.qualifiers['gene_id'] = [ids['GeneID']] return diff --git a/programs/convert_ncbi_gff3.py b/programs/convert_ncbi_gff3.py deleted file mode 100644 index d279ed0c..00000000 --- a/programs/convert_ncbi_gff3.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -"""Developed against Rnor6 RefSeq""" -import argparse -from BCBio import GFF -from collections import Counter, OrderedDict - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument('input_gff3', help='RefSeq GFF3 file') - parser.add_argument('output_gff3', help='Output GFF3') - return parser.parse_args() - - -def add_tags_to_feature(feature): - if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', - 'rRNA', 'mRNA', 'C_gene_segment', - 'J_gene_segment', 'V_gene_segment', - 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene']: - return - ids = dict([x.split(':', 1) for x in feature.qualifiers.get('dbxref', feature.qualifiers['Dbxref'])]) - if feature.type == 'gene': - feature.qualifiers['gene_id'] = [ids['GeneID']] - return - feature.qualifiers['gene_id'] = [ids['GeneID']] - feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers['gene_id']) - tx_id = feature.qualifiers.get('transcript_id', feature.qualifiers['ID'])[0] - if '-' in tx_id: - tx_id = tx_id.split('-', 1)[1] - feature.qualifiers['transcript_id'] = [tx_id] - feature.qualifiers['transcript_name'] = feature.qualifiers.get('Name', feature.qualifiers['transcript_id']) - if feature.type in ['mRNA', 'CDS']: - gene_biotype = transcript_biotype = ['protein_coding'] - elif feature.type == 'primary_transcript': - gene_biotype = ['miRNA'] - transcript_biotype = [feature.type] - else: - gene_biotype = transcript_biotype = [feature.type] - feature.qualifiers['gene_biotype'] = gene_biotype - feature.qualifiers['transcript_biotype'] = transcript_biotype - - -def construct_new_qualifiers(feature): - new_qualifiers = OrderedDict() - for key, val in feature.qualifiers.items(): - # no upper case keys unless it is ID or Parent or Name - if key not in ['ID', 'Parent', 'Name']: - key = key.lower() - # collapse to a single item - # replace all semicolons - if len(val) > 1: - val = [' '.join([x.replace(';', '%3B').replace('=', '%3D') for x in val])] - new_qualifiers[key] = val - # clean up and make parseable - for key, val in new_qualifiers.items(): - if sum(len(x) for x in val) == 0: - new_qualifiers[key] = 'True' - return new_qualifiers - - -def feature_traversal(feature): - yield feature - for sub_feature in feature.sub_features: - yield from feature_traversal(sub_feature) - - -if __name__ == '__main__': - args = parse_args() - records = list(GFF.parse(args.input_gff3)) - for seqrecord in records: - for parent_feature in seqrecord.features: - for feature in feature_traversal(parent_feature): - try: - add_tags_to_feature(feature) - except KeyError: - assert False, feature.qualifiers - new_qualifiers = construct_new_qualifiers(feature) - feature.qualifiers = new_qualifiers - - with open(args.output_gff3, 'w') as fh: - GFF.write(records, fh) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes.py b/programs/convert_ncbi_refseq_genbank_chromosomes old mode 100644 new mode 100755 similarity index 100% rename from programs/convert_ncbi_refseq_genbank_chromosomes.py rename to programs/convert_ncbi_refseq_genbank_chromosomes diff --git a/setup.py b/setup.py index 7150170f..7fbd7318 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,8 @@ scripts=['programs/cat_to_ncbi_submit', 'programs/translate_gene_pred', 'programs/validate_gff3', 'programs/cat_parse_ncbi_genbank', 'programs/cat_parse_ncbi_refseq', 'programs/cat_parse_prokka_gff3', - 'programs/convert_ncbi_gff3', 'programs/convert_cupcake_gff'], + 'programs/convert_ncbi_gff3', 'programs/convert_cupcake_gff', + 'programs/convert_ncbi_refseq_genbank_chromosomes'], author='Ian Fiddes', description='Comparative Annotation Toolkit', url='https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit', From 6d9fa5a7e2b71417a5b54c62c00325e3b52e4116 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:19:30 -0700 Subject: [PATCH 16/69] oops --- programs/convert_ncbi_refseq_genbank_chromosomes | 1 + 1 file changed, 1 insertion(+) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index d4064af4..1d59de4c 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -17,6 +17,7 @@ def parse_args(): if __name__ == "__main__": + args = parse_args() df = pd.read_csv(args.conversion_table, sep='\t', comment='#') df.columns = header if args.refseq_to_genbank: From 5780085596b95227597ae2c2f43da6d9d94b7636 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:31:38 -0700 Subject: [PATCH 17/69] oops again --- programs/convert_ncbi_refseq_genbank_chromosomes | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index 1d59de4c..db0f5638 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -12,13 +12,13 @@ def parse_args(): parser.add_argument('input_gff3', help='RefSeq GFF3 file') parser.add_argument('conversion_table', help='Conversion table') parser.add_argument('output_gff3', help='Output GFF3') - parser.add_argument('--refseq-to-genbank', type=bool, default=True, help='Set this to flip direction to RefSeq -> GenBank') + parser.add_argument('--refseq-to-genbank', action='store_true', help='Set this to flip direction to RefSeq -> GenBank') return parser.parse_args() if __name__ == "__main__": args = parse_args() - df = pd.read_csv(args.conversion_table, sep='\t', comment='#') + df = pd.read_csv(args.conversion_table, sep='\t', comment='#', na_rep='na', header=None) df.columns = header if args.refseq_to_genbank: m = dict(zip(df['RefSeq-Accn'], df['GenBank-Accn'])) From f9c42ae4b8a5b5c43f866d11c6c60b0267e25965 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:32:24 -0700 Subject: [PATCH 18/69] oops again --- programs/convert_ncbi_refseq_genbank_chromosomes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index db0f5638..90e23fa4 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -18,7 +18,7 @@ def parse_args(): if __name__ == "__main__": args = parse_args() - df = pd.read_csv(args.conversion_table, sep='\t', comment='#', na_rep='na', header=None) + df = pd.read_csv(args.conversion_table, sep='\t', comment='#', na_values='na', header=None) df.columns = header if args.refseq_to_genbank: m = dict(zip(df['RefSeq-Accn'], df['GenBank-Accn'])) From 815d829a80dfd8f5f79c68f73b38843b6fad0ed8 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:34:44 -0700 Subject: [PATCH 19/69] oops again --- programs/convert_ncbi_refseq_genbank_chromosomes | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index 90e23fa4..46f7b176 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -20,15 +20,17 @@ if __name__ == "__main__": args = parse_args() df = pd.read_csv(args.conversion_table, sep='\t', comment='#', na_values='na', header=None) df.columns = header + filt_df = df[~(df['RefSeq-Accn'].isnull()) & ~(df['GenBank-Accn'].isnull())] + print("Threw out {} contigs due to no mapping".format(len(df) - len(filt_df))) if args.refseq_to_genbank: - m = dict(zip(df['RefSeq-Accn'], df['GenBank-Accn'])) + m = dict(zip(map(str, filt_df['RefSeq-Accn']), map(str, filt_df['GenBank-Accn']))) else: - m = dict(zip(df['GenBank-Accn'], df['RefSeq-Accn'])) + m = dict(zip(map(str, filt_df['GenBank-Accn']), map(str, filt_df['RefSeq-Accn']))) with open(args.output_gff3, 'w') as fh: for row in open(args.input_gff3): - if row.startswith('##gff'): - fh.write(row) - elif row.startswith('#'): + if row.startswith('#'): + if row.startswith('##gff'): + fh.write(row) continue row = row.split('\t') if row[0] in m: From 313e9af251bf7fe177ce00fb9bf15c7e7b96d69a Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:35:49 -0700 Subject: [PATCH 20/69] oops again --- programs/validate_gff3 | 1 + 1 file changed, 1 insertion(+) diff --git a/programs/validate_gff3 b/programs/validate_gff3 index b81d70bd..5d2b114d 100755 --- a/programs/validate_gff3 +++ b/programs/validate_gff3 @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ This script allows you to test your GFF3 for compatibility with CAT before running the pipeline. From b4bcdc8c5ae70e0b07bf121f6113a157fcbc28d8 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:37:42 -0700 Subject: [PATCH 21/69] update validate gff3 to python3 --- programs/validate_gff3 | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/programs/validate_gff3 b/programs/validate_gff3 index 5d2b114d..41910964 100755 --- a/programs/validate_gff3 +++ b/programs/validate_gff3 @@ -27,39 +27,39 @@ if __name__ == '__main__': for l in open(gp): l = l.split() c[l[0]] += 1 - duplicates = {x for x, y in c.iteritems() if y > 1} + duplicates = {x for x, y in c.items() if y > 1} assert len(duplicates) == 0, 'Found {} duplicate transcript IDs: {}'.format(len(duplicates), '\n'.join(duplicates)) df = tools.gff3.parse_gff3(attrs, gp) tx_dict = tools.transcripts.get_gene_pred_dict(gp) assert len(tx_dict) == len(df) - exons = {len(x.exon_intervals) for x in tx_dict.itervalues()} + exons = {len(x.exon_intervals) for x in tx_dict.values()} if len(exons) == 1: - print 'WARNING: no multi-exonic transcripts found' - assert tx_dict.viewkeys() == set(df.index) - genes = {x.name2 for x in tx_dict.itervalues()} + print('WARNING: no multi-exonic transcripts found') + assert tx_dict.keys() == set(df.index) + genes = {x.name2 for x in tx_dict.values()} assert genes == set(df.GeneId) - print 'Found {} transcripts and {} genes'.format(len(tx_dict), len(genes)) + print('Found {} transcripts and {} genes'.format(len(tx_dict), len(genes))) tmp = df.groupby('GeneId').first() - print 'Found the following gene biotypes: {}'.format(collections.Counter(tmp.GeneBiotype)) - print 'Found the following transcript biotypes: {}'.format(collections.Counter(df.TranscriptBiotype)) - print 'Some example database rows:' - print df.head(20) - print 'Some example gene rows:' - print tmp.head(10) + print('Found the following gene biotypes: {}'.format(collections.Counter(tmp.GeneBiotype))) + print('Found the following transcript biotypes: {}'.format(collections.Counter(df.TranscriptBiotype))) + print('Some example database rows:') + print(df.head(20)) + print('Some example gene rows:') + print(tmp.head(10)) # validate one-to-one mappings for reference mixed_genes = {} for gene_id, d in df.groupby('GeneId'): if len(set(d.GeneName)) != 1: mixed_genes[gene_id] = ','.join(set(d.GeneName)) if len(mixed_genes) > 0: - print 'Found {} genes where one gene ID maps to multiple gene names:'.format(len(mixed_genes)) - for x, y in mixed_genes.iteritems(): - print '{}: {}'.format(x, y) + print('Found {} genes where one gene ID maps to multiple gene names:'.format(len(mixed_genes))) + for x, y in mixed_genes.items(): + print('{}: {}'.format(x, y)) mixed_genes = {} for gene_id, d in df.groupby('GeneName'): if len(set(d.GeneId)) != 1: mixed_genes[gene_id] = ','.join(set(d.GeneId)) if len(mixed_genes) > 0: - print 'Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes)) - for x, y in mixed_genes.iteritems(): - print '{}: {}'.format(x, y) + print('Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes))) + for x, y in mixed_genes.items(): + print('{}: {}'.format(x, y)) From 264164fb21b51673eac04aa3dee03d5485f4b7d3 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:38:55 -0700 Subject: [PATCH 22/69] somehow this got out of date --- programs/validate_gff3 | 64 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/programs/validate_gff3 b/programs/validate_gff3 index 41910964..353a2a79 100755 --- a/programs/validate_gff3 +++ b/programs/validate_gff3 @@ -48,6 +48,56 @@ if __name__ == '__main__': print(tmp.head(10)) # validate one-to-one mappings for reference mixed_genes = {} + for gene_id, d in df.groupby('GeneId'):#!/usr/bin/env python +""" +This script allows you to test your GFF3 for compatibility with CAT before running the pipeline. +""" + +import argparse +import tools.gff3 +import tools.procOps +import tools.fileOps +import tools.transcripts +import collections + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('gff3', help='GFF3 to validate') + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + with tools.fileOps.TemporaryFilePath() as attrs, tools.fileOps.TemporaryFilePath() as gp: + cmd = tools.gff3.convert_gff3_cmd(attrs, args.gff3) + tools.procOps.run_proc(cmd, stdout=gp) + # check for duplicates + c = collections.Counter() + for l in open(gp): + l = l.split() + c[l[0]] += 1 + duplicates = {x for x, y in c.items() if y > 1} + assert len(duplicates) == 0, 'Found {} duplicate genes: {}'.format(len(duplicates), '\n'.join(duplicates)) + df = tools.gff3.parse_gff3(attrs, gp) + tx_dict = tools.transcripts.get_gene_pred_dict(gp) + assert len(tx_dict) == len(df) + exons = {len(x.exon_intervals) for x in tx_dict.values()} + if len(exons) == 1: + print('WARNING: no multi-exonic transcripts found') + assert tx_dict.keys() == set(df.index) + genes = {x.name2 for x in tx_dict.values()} + assert genes == set(df.GeneId) + print('Found {} transcripts and {} genes'.format(len(tx_dict), len(genes))) + tmp = df.groupby('GeneId').first() + print('Found the following gene biotypes: {}'.format(collections.Counter(tmp.GeneBiotype))) + print('Found the following transcript biotypes: {}'.format(collections.Counter(df.TranscriptBiotype))) + print('Some example database rows:') + print(df.head(20)) + print('Some example gene rows:') + print(tmp.head(10)) + # validate one-to-one mappings for reference + mixed_genes = {} for gene_id, d in df.groupby('GeneId'): if len(set(d.GeneName)) != 1: mixed_genes[gene_id] = ','.join(set(d.GeneName)) @@ -63,3 +113,17 @@ if __name__ == '__main__': print('Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes))) for x, y in mixed_genes.items(): print('{}: {}'.format(x, y)) + if len(set(d.GeneName)) != 1: + mixed_genes[gene_id] = ','.join(set(d.GeneName)) + if len(mixed_genes) > 0: + print('Found {} genes where one gene ID maps to multiple gene names:'.format(len(mixed_genes))) + for x, y in mixed_genes.items(): + print('{}: {}'.format(x, y)) + mixed_genes = {} + for gene_id, d in df.groupby('GeneName'): + if len(set(d.GeneId)) != 1: + mixed_genes[gene_id] = ','.join(set(d.GeneId)) + if len(mixed_genes) > 0: + print('Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes))) + for x, y in mixed_genes.items(): + print('{}: {}'.format(x, y)) From 6ab1a607f520e2af73135ec5176022069e20f931 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:39:36 -0700 Subject: [PATCH 23/69] somehow this got out of date --- programs/validate_gff3 | 66 +----------------------------------------- 1 file changed, 1 insertion(+), 65 deletions(-) diff --git a/programs/validate_gff3 b/programs/validate_gff3 index 353a2a79..83cb154b 100755 --- a/programs/validate_gff3 +++ b/programs/validate_gff3 @@ -1,56 +1,6 @@ #!/usr/bin/env python """ This script allows you to test your GFF3 for compatibility with CAT before running the pipeline. - -""" - -import argparse -import tools.gff3 -import tools.procOps -import tools.fileOps -import tools.transcripts -import collections - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument('gff3', help='GFF3 to validate') - return parser.parse_args() - - -if __name__ == '__main__': - args = parse_args() - with tools.fileOps.TemporaryFilePath() as attrs, tools.fileOps.TemporaryFilePath() as gp: - cmd = tools.gff3.convert_gff3_cmd(attrs, args.gff3) - tools.procOps.run_proc(cmd, stdout=gp) - # check for duplicates - c = collections.Counter() - for l in open(gp): - l = l.split() - c[l[0]] += 1 - duplicates = {x for x, y in c.items() if y > 1} - assert len(duplicates) == 0, 'Found {} duplicate transcript IDs: {}'.format(len(duplicates), '\n'.join(duplicates)) - df = tools.gff3.parse_gff3(attrs, gp) - tx_dict = tools.transcripts.get_gene_pred_dict(gp) - assert len(tx_dict) == len(df) - exons = {len(x.exon_intervals) for x in tx_dict.values()} - if len(exons) == 1: - print('WARNING: no multi-exonic transcripts found') - assert tx_dict.keys() == set(df.index) - genes = {x.name2 for x in tx_dict.values()} - assert genes == set(df.GeneId) - print('Found {} transcripts and {} genes'.format(len(tx_dict), len(genes))) - tmp = df.groupby('GeneId').first() - print('Found the following gene biotypes: {}'.format(collections.Counter(tmp.GeneBiotype))) - print('Found the following transcript biotypes: {}'.format(collections.Counter(df.TranscriptBiotype))) - print('Some example database rows:') - print(df.head(20)) - print('Some example gene rows:') - print(tmp.head(10)) - # validate one-to-one mappings for reference - mixed_genes = {} - for gene_id, d in df.groupby('GeneId'):#!/usr/bin/env python -""" -This script allows you to test your GFF3 for compatibility with CAT before running the pipeline. """ import argparse @@ -112,18 +62,4 @@ if __name__ == '__main__': if len(mixed_genes) > 0: print('Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes))) for x, y in mixed_genes.items(): - print('{}: {}'.format(x, y)) - if len(set(d.GeneName)) != 1: - mixed_genes[gene_id] = ','.join(set(d.GeneName)) - if len(mixed_genes) > 0: - print('Found {} genes where one gene ID maps to multiple gene names:'.format(len(mixed_genes))) - for x, y in mixed_genes.items(): - print('{}: {}'.format(x, y)) - mixed_genes = {} - for gene_id, d in df.groupby('GeneName'): - if len(set(d.GeneId)) != 1: - mixed_genes[gene_id] = ','.join(set(d.GeneId)) - if len(mixed_genes) > 0: - print('Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes))) - for x, y in mixed_genes.items(): - print('{}: {}'.format(x, y)) + print('{}: {}'.format(x, y)) \ No newline at end of file From 17ac972cd46029a19804a0b7e19449108389f6bf Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:49:28 -0700 Subject: [PATCH 24/69] ugh --- programs/convert_ncbi_gff3 | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index d279ed0c..7b537334 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -4,6 +4,9 @@ import argparse from BCBio import GFF from collections import Counter, OrderedDict +noid_count = 1 +global noid_count + def parse_args(): parser = argparse.ArgumentParser() @@ -18,7 +21,14 @@ def add_tags_to_feature(feature): 'J_gene_segment', 'V_gene_segment', 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene']: return - ids = dict([x.split(':', 1) for x in feature.qualifiers.get('dbxref', feature.qualifiers['Dbxref'])]) + if 'dbxref' in feature.qualifiers: + xref = feature.qualifiers['dbxref'] + elif 'Dbxref' in feature.qualifiers: + xref = feature.qualifiers['Dbxref'] + else: # hacky fix + ids = {'GeneID': 'NoID-{}'.format(noid_count)} + noid_count += 1 + ids = dict([x.split(':', 1) for x in xref]) if feature.type == 'gene': feature.qualifiers['gene_id'] = [ids['GeneID']] return From af2dd558ae68a06611cf18a9502ea2622b52bec9 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:50:31 -0700 Subject: [PATCH 25/69] ugh --- programs/convert_ncbi_gff3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 7b537334..69d399d2 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -4,8 +4,8 @@ import argparse from BCBio import GFF from collections import Counter, OrderedDict -noid_count = 1 global noid_count +noid_count = 1 def parse_args(): From 88be6085f6e6d96623230c7b5d6af6461059e9c7 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:53:49 -0700 Subject: [PATCH 26/69] ugh --- programs/convert_ncbi_gff3 | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 69d399d2..0dd72444 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -4,9 +4,6 @@ import argparse from BCBio import GFF from collections import Counter, OrderedDict -global noid_count -noid_count = 1 - def parse_args(): parser = argparse.ArgumentParser() @@ -15,7 +12,7 @@ def parse_args(): return parser.parse_args() -def add_tags_to_feature(feature): +def add_tags_to_feature(feature, noid_count): if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', 'rRNA', 'mRNA', 'C_gene_segment', 'J_gene_segment', 'V_gene_segment', @@ -76,12 +73,13 @@ def feature_traversal(feature): if __name__ == '__main__': args = parse_args() + noid_count = 1 records = list(GFF.parse(args.input_gff3)) for seqrecord in records: for parent_feature in seqrecord.features: for feature in feature_traversal(parent_feature): try: - add_tags_to_feature(feature) + add_tags_to_feature(feature, noid_count) except KeyError: assert False, feature.qualifiers new_qualifiers = construct_new_qualifiers(feature) From 9e3abd88e5c0d4c172be35f9695c80dc39a492ee Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:56:24 -0700 Subject: [PATCH 27/69] ugh --- programs/convert_ncbi_gff3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 0dd72444..ab3a9675 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -23,7 +23,7 @@ def add_tags_to_feature(feature, noid_count): elif 'Dbxref' in feature.qualifiers: xref = feature.qualifiers['Dbxref'] else: # hacky fix - ids = {'GeneID': 'NoID-{}'.format(noid_count)} + xref = {'GeneID': 'NoID-{}'.format(noid_count)} noid_count += 1 ids = dict([x.split(':', 1) for x in xref]) if feature.type == 'gene': From 3fa1eee262e4365613f994c0b40539e9f1268b94 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 08:57:18 -0700 Subject: [PATCH 28/69] ugh --- programs/convert_ncbi_gff3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index ab3a9675..a6314ce5 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -23,7 +23,7 @@ def add_tags_to_feature(feature, noid_count): elif 'Dbxref' in feature.qualifiers: xref = feature.qualifiers['Dbxref'] else: # hacky fix - xref = {'GeneID': 'NoID-{}'.format(noid_count)} + xref = ['GeneID:NoID-{}'.format(noid_count)] noid_count += 1 ids = dict([x.split(':', 1) for x in xref]) if feature.type == 'gene': From f65c076abf46048410feb9b2a87b648c5a084f18 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 18:03:10 -0700 Subject: [PATCH 29/69] subtle --- programs/convert_ncbi_gff3 | 26 ++++++++-- .../convert_ncbi_refseq_genbank_chromosomes | 52 +++++++++++++++++-- programs/validate_gff3 | 2 +- 3 files changed, 70 insertions(+), 10 deletions(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index a6314ce5..4500822f 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -12,7 +12,18 @@ def parse_args(): return parser.parse_args() -def add_tags_to_feature(feature, noid_count): +def get_unique_id(i, c, update_id=True): + if i not in c: + c[i] = 0 + return i + if update_id is True: + c[i] += 1 + if c[i] > 0: + return f"{i}.{c[i]}" + return i + + +def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts): if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', 'rRNA', 'mRNA', 'C_gene_segment', 'J_gene_segment', 'V_gene_segment', @@ -27,13 +38,18 @@ def add_tags_to_feature(feature, noid_count): noid_count += 1 ids = dict([x.split(':', 1) for x in xref]) if feature.type == 'gene': - feature.qualifiers['gene_id'] = [ids['GeneID']] + feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts)] return - feature.qualifiers['gene_id'] = [ids['GeneID']] + feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts, update_id=False)] feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers['gene_id']) tx_id = feature.qualifiers.get('transcript_id', feature.qualifiers['ID'])[0] if '-' in tx_id: tx_id = tx_id.split('-', 1)[1] + if feature.type not in ['CDS', 'exon']: + # is a transcript; check for uniqueness + tx_id = get_unique_id(tx_id, tx_counts) + else: + tx_id = get_unique_id(tx_id, tx_counts, update_id=False) feature.qualifiers['transcript_id'] = [tx_id] feature.qualifiers['transcript_name'] = feature.qualifiers.get('Name', feature.qualifiers['transcript_id']) if feature.type in ['mRNA', 'CDS']: @@ -74,12 +90,14 @@ def feature_traversal(feature): if __name__ == '__main__': args = parse_args() noid_count = 1 + tx_counts = Counter() + gene_counts = Counter() records = list(GFF.parse(args.input_gff3)) for seqrecord in records: for parent_feature in seqrecord.features: for feature in feature_traversal(parent_feature): try: - add_tags_to_feature(feature, noid_count) + add_tags_to_feature(feature, noid_count, tx_counts, gene_counts) except KeyError: assert False, feature.qualifiers new_qualifiers = construct_new_qualifiers(feature) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index 46f7b176..dbfb7804 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -12,20 +12,62 @@ def parse_args(): parser.add_argument('input_gff3', help='RefSeq GFF3 file') parser.add_argument('conversion_table', help='Conversion table') parser.add_argument('output_gff3', help='Output GFF3') - parser.add_argument('--refseq-to-genbank', action='store_true', help='Set this to flip direction to RefSeq -> GenBank') + e = parser.add_mutually_exclusive_group(required=True) + e.add_argument('--refseq-to-genbank', action='store_true', help='Set this to convert RefSeq -> GenBank') + e.add_argument('--genbank-to-refseq', action='store_true', help='Set this to convert GenBank -> RefSeq') + e.add_argument('--refseq-to-ucsc', action='store_true', help='Set this to convert RefSeq -> UCSC') + e.add_argument('--genbank-to-ucsc', action='store_true', help='Set this to convert GenBank -> UCSC') + e.add_argument('--ucsc-to-genbank', action='store_true', help='Set this to convert UCSC -> GenBank') + e.add_argument('--ucsc-to-refseq', action='store_true', help='Set this to convert UCSC -> RefSeq') + parser.add_argument('--retain-na', action='store_true', help='Default is to drop NA targets; retain them instead?') return parser.parse_args() +column_map = {'refseq': 'RefSeq-Accn', + 'genbank': 'GenBank-Accn', + 'ucsc': 'UCSC-style-name'} + + if __name__ == "__main__": args = parse_args() df = pd.read_csv(args.conversion_table, sep='\t', comment='#', na_values='na', header=None) df.columns = header - filt_df = df[~(df['RefSeq-Accn'].isnull()) & ~(df['GenBank-Accn'].isnull())] - print("Threw out {} contigs due to no mapping".format(len(df) - len(filt_df))) + if args.refseq_to_genbank: - m = dict(zip(map(str, filt_df['RefSeq-Accn']), map(str, filt_df['GenBank-Accn']))) + from_col = column_map['refseq'] + to_col = column_map['genbank'] + elif args.genbank_to_refseq: + from_col = column_map['genbank'] + to_col = column_map['refseq'] + elif args.refseq_to_ucsc: + from_col = column_map['refseq'] + to_col = column_map['ucsc'] + elif args.genbank_to_ucsc: + from_col = column_map['genbank'] + to_col = column_map['ucsc'] + elif ucsc_to_genbank: + from_col = column_map['ucsc'] + to_col = column_map['genbank'] else: - m = dict(zip(map(str, filt_df['GenBank-Accn']), map(str, filt_df['RefSeq-Accn']))) + from_col = column_map['ucsc'] + to_col = column_map['refseq'] + + source_is_na = 0 + tgt_is_na = 0 + m = {} + for f, t in zip(df[from_col], df[to_col]): + if f is None: + # if source is NA, we have no hope here + source_is_na += 1 + continue + elif t is None: + tgt_is_na += 1 + if args.retain_na: + t = f + else: + continue + m[f] = t + with open(args.output_gff3, 'w') as fh: for row in open(args.input_gff3): if row.startswith('#'): diff --git a/programs/validate_gff3 b/programs/validate_gff3 index 83cb154b..ad5354ea 100755 --- a/programs/validate_gff3 +++ b/programs/validate_gff3 @@ -28,7 +28,7 @@ if __name__ == '__main__': l = l.split() c[l[0]] += 1 duplicates = {x for x, y in c.items() if y > 1} - assert len(duplicates) == 0, 'Found {} duplicate genes: {}'.format(len(duplicates), '\n'.join(duplicates)) + assert len(duplicates) == 0, 'Found {} duplicate transcript IDs: {}'.format(len(duplicates), '\n'.join(duplicates)) df = tools.gff3.parse_gff3(attrs, gp) tx_dict = tools.transcripts.get_gene_pred_dict(gp) assert len(tx_dict) == len(df) From 3dec4a758a8dbdeb5f86ad187c617ed63a94bd9d Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 21:19:00 -0700 Subject: [PATCH 30/69] allow minimal --- tools/gff3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/gff3.py b/tools/gff3.py index f0101182..5d3f165d 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -60,6 +60,7 @@ def parse_attrs(attrs): def convert_gff3_cmd(annotation_attrs, annotation): - cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons', + cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', + '-honorStartStopCodons', '-allowMinimalGenes', '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout'] return cmd From 542c38b6902e9350237c593217b42bd4e5fb5cd5 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 21:21:38 -0700 Subject: [PATCH 31/69] nevermind --- tools/gff3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/gff3.py b/tools/gff3.py index 5d3f165d..f0101182 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -60,7 +60,6 @@ def parse_attrs(attrs): def convert_gff3_cmd(annotation_attrs, annotation): - cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', - '-honorStartStopCodons', '-allowMinimalGenes', + cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons', '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout'] return cmd From 7d2a3decded4ea2ea96a6f6293977d9a57d7ffda Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 22:06:40 -0700 Subject: [PATCH 32/69] hacks --- tools/gff3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/gff3.py b/tools/gff3.py index f0101182..b83d61c5 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -61,5 +61,6 @@ def parse_attrs(attrs): def convert_gff3_cmd(annotation_attrs, annotation): cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons', + '-refseqHacks', '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout'] return cmd From 18ff6818eb072f39e3106a453a64d1e5f595845f Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 22:13:55 -0700 Subject: [PATCH 33/69] convert --- programs/convert_ncbi_refseq_genbank_chromosomes | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index dbfb7804..827a5949 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -2,6 +2,7 @@ """Convert RefSeq to GenBank or vice versa""" import argparse import pandas as pd +from tools.fileOps import print_row header = ['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', 'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Length', 'UCSC-style-name'] @@ -79,4 +80,4 @@ if __name__ == "__main__": row[0] = m[row[0]] else: print('Row unparseable: {}'.format(row)) - fh.write('\t'.join(row)) \ No newline at end of file + print_row(fh, row) \ No newline at end of file From 7bd8d7ae3a3b3d84ae7c993e5e53a131e11ea8a2 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Mon, 11 May 2020 22:14:35 -0700 Subject: [PATCH 34/69] convert --- programs/convert_ncbi_refseq_genbank_chromosomes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index 827a5949..cdfd5b77 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -75,7 +75,7 @@ if __name__ == "__main__": if row.startswith('##gff'): fh.write(row) continue - row = row.split('\t') + row = row.rstrip().split('\t') if row[0] in m: row[0] = m[row[0]] else: From 8c1e4cd58cf671a018f1f28294b437062f1ec855 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 14:03:56 -0700 Subject: [PATCH 35/69] more features --- programs/convert_ncbi_gff3 | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 4500822f..38941352 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -23,7 +23,7 @@ def get_unique_id(i, c, update_id=True): return i -def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts): +def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feature=None): if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', 'rRNA', 'mRNA', 'C_gene_segment', 'J_gene_segment', 'V_gene_segment', @@ -38,7 +38,9 @@ def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts): noid_count += 1 ids = dict([x.split(':', 1) for x in xref]) if feature.type == 'gene': + assert feature == parent_feature feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts)] + assert 'gene_biotype' in feature.qualifiers return feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts, update_id=False)] feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers['gene_id']) @@ -52,14 +54,17 @@ def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts): tx_id = get_unique_id(tx_id, tx_counts, update_id=False) feature.qualifiers['transcript_id'] = [tx_id] feature.qualifiers['transcript_name'] = feature.qualifiers.get('Name', feature.qualifiers['transcript_id']) - if feature.type in ['mRNA', 'CDS']: - gene_biotype = transcript_biotype = ['protein_coding'] - elif feature.type == 'primary_transcript': - gene_biotype = ['miRNA'] + if feature.type == 'primary_transcript': + transcript_biotype = ['miRNA'] + elif 'segment' in feature.type: + transcript_biotype = [feature.type] + feature.type = 'ncRNA' + elif feature.type == 'lnc_RNA': + feature.type = 'lncRNA' transcript_biotype = [feature.type] else: - gene_biotype = transcript_biotype = [feature.type] - feature.qualifiers['gene_biotype'] = gene_biotype + transcript_biotype = parent_feature.qualifiers['gene_biotype'] + feature.qualifiers['gene_biotype'] = parent_feature.qualifiers['gene_biotype'] feature.qualifiers['transcript_biotype'] = transcript_biotype @@ -97,7 +102,7 @@ if __name__ == '__main__': for parent_feature in seqrecord.features: for feature in feature_traversal(parent_feature): try: - add_tags_to_feature(feature, noid_count, tx_counts, gene_counts) + add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feature) except KeyError: assert False, feature.qualifiers new_qualifiers = construct_new_qualifiers(feature) From 2be9410387306d75c8a7ceca3cd85d31f81f7dcb Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 14:15:46 -0700 Subject: [PATCH 36/69] fix convert --- programs/convert_ncbi_refseq_genbank_chromosomes | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index cdfd5b77..ef7a5b23 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -2,6 +2,7 @@ """Convert RefSeq to GenBank or vice versa""" import argparse import pandas as pd +import numpy as np from tools.fileOps import print_row header = ['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', @@ -57,11 +58,11 @@ if __name__ == "__main__": tgt_is_na = 0 m = {} for f, t in zip(df[from_col], df[to_col]): - if f is None: + if not isinstance(f, str) and np.isnan(f): # if source is NA, we have no hope here source_is_na += 1 continue - elif t is None: + elif not isinstance(t, str) and np.isnan(t): tgt_is_na += 1 if args.retain_na: t = f From 7c8b04e933e1432437982cd96596e2865bc6085d Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 14:25:44 -0700 Subject: [PATCH 37/69] fix convert --- programs/convert_ncbi_gff3 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 38941352..65f5b612 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -25,8 +25,8 @@ def get_unique_id(i, c, update_id=True): def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feature=None): if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', - 'rRNA', 'mRNA', 'C_gene_segment', - 'J_gene_segment', 'V_gene_segment', + 'rRNA', 'mRNA', 'snRNA', 'snoRNA', 'scRNA', 'antisense_RNA', 'guide_RNA', + 'C_gene_segment', 'J_gene_segment', 'V_gene_segment', 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene']: return if 'dbxref' in feature.qualifiers: From cb9537b17da74adf92240e216e701375c0393b9c Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 14:47:24 -0700 Subject: [PATCH 38/69] add script for HAL conversion --- programs/convert_ncbi_refseq_genbank_chromosomes | 9 ++++----- setup.py | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index ef7a5b23..e2840eb5 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -11,7 +11,6 @@ header = ['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molec def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('input_gff3', help='RefSeq GFF3 file') parser.add_argument('conversion_table', help='Conversion table') parser.add_argument('output_gff3', help='Output GFF3') e = parser.add_mutually_exclusive_group(required=True) @@ -21,7 +20,7 @@ def parse_args(): e.add_argument('--genbank-to-ucsc', action='store_true', help='Set this to convert GenBank -> UCSC') e.add_argument('--ucsc-to-genbank', action='store_true', help='Set this to convert UCSC -> GenBank') e.add_argument('--ucsc-to-refseq', action='store_true', help='Set this to convert UCSC -> RefSeq') - parser.add_argument('--retain-na', action='store_true', help='Default is to drop NA targets; retain them instead?') + parser.add_argument('--no-retain-na', action='store_true', help='Default is to retain NA targets; drop them instead?') return parser.parse_args() @@ -64,10 +63,10 @@ if __name__ == "__main__": continue elif not isinstance(t, str) and np.isnan(t): tgt_is_na += 1 - if args.retain_na: - t = f - else: + if args.no_retain_na: continue + else: + t = f m[f] = t with open(args.output_gff3, 'w') as fh: diff --git a/setup.py b/setup.py index 7fbd7318..0e9dbd1f 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,8 @@ 'programs/validate_gff3', 'programs/cat_parse_ncbi_genbank', 'programs/cat_parse_ncbi_refseq', 'programs/cat_parse_prokka_gff3', 'programs/convert_ncbi_gff3', 'programs/convert_cupcake_gff', - 'programs/convert_ncbi_refseq_genbank_chromosomes'], + 'programs/convert_ncbi_refseq_genbank_chromosomes', + 'programs/convert_ncbi_refseq_genbank_chromosomes_hal'], author='Ian Fiddes', description='Comparative Annotation Toolkit', url='https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit', From ea3bbca9d3b0dcf3763caf5d0b82692835c52da9 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 14:48:05 -0700 Subject: [PATCH 39/69] oops --- ...onvert_ncbi_refseq_genbank_chromosomes_hal | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 programs/convert_ncbi_refseq_genbank_chromosomes_hal diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes_hal b/programs/convert_ncbi_refseq_genbank_chromosomes_hal new file mode 100644 index 00000000..c8a504da --- /dev/null +++ b/programs/convert_ncbi_refseq_genbank_chromosomes_hal @@ -0,0 +1,73 @@ +#!/usr/bin/env python +"""Convert RefSeq to GenBank or vice versa""" +import argparse +import pandas as pd +import numpy as np +from tools.fileOps import print_rows + +header = ['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', +'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Length', 'UCSC-style-name'] + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('conversion_table', help='Conversion table') + parser.add_argument('output_table', help='Output table for halRenameSequences') + e = parser.add_mutually_exclusive_group(required=True) + e.add_argument('--refseq-to-genbank', action='store_true', help='Set this to convert RefSeq -> GenBank') + e.add_argument('--genbank-to-refseq', action='store_true', help='Set this to convert GenBank -> RefSeq') + e.add_argument('--refseq-to-ucsc', action='store_true', help='Set this to convert RefSeq -> UCSC') + e.add_argument('--genbank-to-ucsc', action='store_true', help='Set this to convert GenBank -> UCSC') + e.add_argument('--ucsc-to-genbank', action='store_true', help='Set this to convert UCSC -> GenBank') + e.add_argument('--ucsc-to-refseq', action='store_true', help='Set this to convert UCSC -> RefSeq') + parser.add_argument('--no-retain-na', action='store_true', help='Default is to retain NA targets; drop them instead?') + return parser.parse_args() + + +column_map = {'refseq': 'RefSeq-Accn', + 'genbank': 'GenBank-Accn', + 'ucsc': 'UCSC-style-name'} + + +if __name__ == "__main__": + args = parse_args() + df = pd.read_csv(args.conversion_table, sep='\t', comment='#', na_values='na', header=None) + df.columns = header + + if args.refseq_to_genbank: + from_col = column_map['refseq'] + to_col = column_map['genbank'] + elif args.genbank_to_refseq: + from_col = column_map['genbank'] + to_col = column_map['refseq'] + elif args.refseq_to_ucsc: + from_col = column_map['refseq'] + to_col = column_map['ucsc'] + elif args.genbank_to_ucsc: + from_col = column_map['genbank'] + to_col = column_map['ucsc'] + elif ucsc_to_genbank: + from_col = column_map['ucsc'] + to_col = column_map['genbank'] + else: + from_col = column_map['ucsc'] + to_col = column_map['refseq'] + + source_is_na = 0 + tgt_is_na = 0 + m = {} + for f, t in zip(df[from_col], df[to_col]): + if not isinstance(f, str) and np.isnan(f): + # if source is NA, we have no hope here + source_is_na += 1 + continue + elif not isinstance(t, str) and np.isnan(t): + tgt_is_na += 1 + if args.no_retain_na: + continue + else: + t = f + m[f] = t + + with open(args.output_table, 'w') as fh: + print_rows(fh, m.items()) From e11a1f3a0b6fa79b3b411a70a957e1183fc60574 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 15:14:39 -0700 Subject: [PATCH 40/69] hal gets mad --- programs/convert_ncbi_refseq_genbank_chromosomes_hal | 2 ++ 1 file changed, 2 insertions(+) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes_hal b/programs/convert_ncbi_refseq_genbank_chromosomes_hal index c8a504da..7d37396e 100644 --- a/programs/convert_ncbi_refseq_genbank_chromosomes_hal +++ b/programs/convert_ncbi_refseq_genbank_chromosomes_hal @@ -67,6 +67,8 @@ if __name__ == "__main__": continue else: t = f + if f == t: + continue m[f] = t with open(args.output_table, 'w') as fh: From 228219f13c07b3ab6f73474d99e5a8de3139e910 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 16:07:29 -0700 Subject: [PATCH 41/69] oops --- programs/convert_ncbi_refseq_genbank_chromosomes | 1 + 1 file changed, 1 insertion(+) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes index e2840eb5..f5a127d6 100755 --- a/programs/convert_ncbi_refseq_genbank_chromosomes +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -12,6 +12,7 @@ header = ['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molec def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('conversion_table', help='Conversion table') + parser.add_argument('input_gff3', help='Input GFF3') parser.add_argument('output_gff3', help='Output GFF3') e = parser.add_mutually_exclusive_group(required=True) e.add_argument('--refseq-to-genbank', action='store_true', help='Set this to convert RefSeq -> GenBank') From 26fa1c4ed1ec09994695fa9ec3ec0430243f454c Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 16:44:52 -0700 Subject: [PATCH 42/69] fix biotypes --- programs/convert_ncbi_gff3 | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 65f5b612..310b58ac 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -43,7 +43,7 @@ def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feat assert 'gene_biotype' in feature.qualifiers return feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts, update_id=False)] - feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers['gene_id']) + feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers.get('product', feature.qualifiers['gene_id'])) tx_id = feature.qualifiers.get('transcript_id', feature.qualifiers['ID'])[0] if '-' in tx_id: tx_id = tx_id.split('-', 1)[1] @@ -54,6 +54,13 @@ def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feat tx_id = get_unique_id(tx_id, tx_counts, update_id=False) feature.qualifiers['transcript_id'] = [tx_id] feature.qualifiers['transcript_name'] = feature.qualifiers.get('Name', feature.qualifiers['transcript_id']) + # figure out gene biotype + if 'gene_biotype' in parent_feature.qualifiers: + gene_biotype = parent_feature.qualifiers['gene_biotype'] + else: + gene_biotype = [feature.type] + + # figure out transcript biotype if feature.type == 'primary_transcript': transcript_biotype = ['miRNA'] elif 'segment' in feature.type: @@ -63,8 +70,8 @@ def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feat feature.type = 'lncRNA' transcript_biotype = [feature.type] else: - transcript_biotype = parent_feature.qualifiers['gene_biotype'] - feature.qualifiers['gene_biotype'] = parent_feature.qualifiers['gene_biotype'] + transcript_biotype = gene_biotype + feature.qualifiers['gene_biotype'] = gene_biotype feature.qualifiers['transcript_biotype'] = transcript_biotype From 56692be3a2c67ce856c713ae3c00f1ae8826ab44 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 17:09:49 -0700 Subject: [PATCH 43/69] fix gene names --- programs/convert_ncbi_gff3 | 1 + 1 file changed, 1 insertion(+) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 310b58ac..7daf3467 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -40,6 +40,7 @@ def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feat if feature.type == 'gene': assert feature == parent_feature feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts)] + feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers.get('product', feature.qualifiers['gene_id'])) assert 'gene_biotype' in feature.qualifiers return feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts, update_id=False)] From 162cdbeee6678153af1136b20dcce6ac91f860f0 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Tue, 12 May 2020 19:16:10 -0700 Subject: [PATCH 44/69] ugh --- programs/convert_ncbi_gff3 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 7daf3467..677e9180 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -27,7 +27,8 @@ def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feat if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', 'rRNA', 'mRNA', 'snRNA', 'snoRNA', 'scRNA', 'antisense_RNA', 'guide_RNA', 'C_gene_segment', 'J_gene_segment', 'V_gene_segment', - 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene']: + 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene', + 'SRP_RNA']: return if 'dbxref' in feature.qualifiers: xref = feature.qualifiers['dbxref'] From 77b675424d1a1f82be57711f1893879141d7fc16 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 13 May 2020 16:13:06 -0700 Subject: [PATCH 45/69] bug in alignments --- cat/classify.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cat/classify.py b/cat/classify.py index 3bc003c2..e0934c96 100644 --- a/cat/classify.py +++ b/cat/classify.py @@ -263,9 +263,6 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): """Converts either an insertion or a deletion into a output transcript""" left_chrom_pos, right_chrom_pos = convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, tx.strand) - if left_chrom_pos is None or right_chrom_pos is None: - assert aln_mode == 'CDS' - return None if left_chrom_pos > tx.thick_start and right_chrom_pos < tx.thick_stop: indel_type = 'CodingMult3' if offset % 3 == 0 else 'Coding' @@ -295,8 +292,12 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): t_offset = t_start - block_size - t_pos assert (q_offset >= 0 and t_offset >= 0) if q_offset != 0: # query insertion -> insertion in target sequence - left_pos = q_start - q_offset - right_pos = q_start + if tx.strand == '+': + left_pos = q_start - q_offset + right_pos = q_start + else: + left_pos = aln.q_size - q_start + right_pos = aln.q_size - q_start + q_offset row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, 'Insertion') if row is not None: r.append(row) From a13a80b5b3bdb3bbc6749ee68d85db5e2c4abb6c Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 13 May 2020 16:15:54 -0700 Subject: [PATCH 46/69] oops --- cat/classify.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cat/classify.py b/cat/classify.py index e0934c96..70b534e8 100644 --- a/cat/classify.py +++ b/cat/classify.py @@ -296,8 +296,8 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): left_pos = q_start - q_offset right_pos = q_start else: - left_pos = aln.q_size - q_start - right_pos = aln.q_size - q_start + q_offset + left_pos = psl.q_size - q_start + right_pos = psl.q_size - q_start + q_offset row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, 'Insertion') if row is not None: r.append(row) From 8852f753352f4744fe6fa148c6a4c90f29103cb7 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 13 May 2020 16:24:03 -0700 Subject: [PATCH 47/69] this aligner can walk off the end --- cat/classify.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cat/classify.py b/cat/classify.py index 70b534e8..8fe616ae 100644 --- a/cat/classify.py +++ b/cat/classify.py @@ -245,7 +245,6 @@ def find_indels(tx, psl, aln_mode): def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand): """convert alignment coordinates to target chromosome coordinates, inverting if negative strand""" left_chrom_pos = coordinate_fn(left_pos) - assert left_chrom_pos is not None right_chrom_pos = coordinate_fn(right_pos) if right_chrom_pos is None: right_chrom_pos = coordinate_fn(right_pos - 1) @@ -253,7 +252,6 @@ def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand left_chrom_pos += 1 else: left_chrom_pos -= 1 - assert right_chrom_pos is not None if strand == '-': left_chrom_pos, right_chrom_pos = right_chrom_pos, left_chrom_pos assert right_chrom_pos >= left_chrom_pos @@ -263,6 +261,8 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): """Converts either an insertion or a deletion into a output transcript""" left_chrom_pos, right_chrom_pos = convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, tx.strand) + if left_chrom_pos is None or right_chrom_pos is None: + return None if left_chrom_pos > tx.thick_start and right_chrom_pos < tx.thick_stop: indel_type = 'CodingMult3' if offset % 3 == 0 else 'Coding' From 46a28a74a094e0d25eaa2a48a7623e9db3d2790f Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 13 May 2020 16:39:28 -0700 Subject: [PATCH 48/69] more bugs --- cat/classify.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cat/classify.py b/cat/classify.py index 8fe616ae..f857ec27 100644 --- a/cat/classify.py +++ b/cat/classify.py @@ -244,17 +244,12 @@ def find_indels(tx, psl, aln_mode): """ def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand): """convert alignment coordinates to target chromosome coordinates, inverting if negative strand""" - left_chrom_pos = coordinate_fn(left_pos) - right_chrom_pos = coordinate_fn(right_pos) - if right_chrom_pos is None: - right_chrom_pos = coordinate_fn(right_pos - 1) - if strand == '-': - left_chrom_pos += 1 - else: - left_chrom_pos -= 1 - if strand == '-': - left_chrom_pos, right_chrom_pos = right_chrom_pos, left_chrom_pos - assert right_chrom_pos >= left_chrom_pos + if strand == '+': + right_chrom_pos = coordinate_fn(right_pos) + left_chrom_pos = coordinate_fn(left_pos) + else: + right_chrom_pos = coordinate_fn(left_pos) + 1 + left_chrom_pos = coordinate_fn(right_pos - 1) return left_chrom_pos, right_chrom_pos def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): @@ -264,6 +259,8 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): if left_chrom_pos is None or right_chrom_pos is None: return None + assert right_chrom_pos >= left_chrom_pos + if left_chrom_pos > tx.thick_start and right_chrom_pos < tx.thick_stop: indel_type = 'CodingMult3' if offset % 3 == 0 else 'Coding' else: From 03adbd27af9647b99711fdba0ebd106d9eb7e041 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 13 May 2020 16:59:10 -0700 Subject: [PATCH 49/69] more bugs --- cat/classify.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cat/classify.py b/cat/classify.py index f857ec27..13f97324 100644 --- a/cat/classify.py +++ b/cat/classify.py @@ -244,9 +244,14 @@ def find_indels(tx, psl, aln_mode): """ def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand): """convert alignment coordinates to target chromosome coordinates, inverting if negative strand""" - if strand == '+': + # this is a deletion, and we don't need to adjust + if left_pos == right_pos: + left_chrom_pos = right_chrom_pos = coordinate_fn(left_pos) + # for + strand insertion, just translate + elif strand == '+': right_chrom_pos = coordinate_fn(right_pos) left_chrom_pos = coordinate_fn(left_pos) + # for - strand insertion, translate but move outwards else: right_chrom_pos = coordinate_fn(left_pos) + 1 left_chrom_pos = coordinate_fn(right_pos - 1) From b0f6ad958a5b5730055a9b44d74aa19afe8da085 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 13 May 2020 17:20:47 -0700 Subject: [PATCH 50/69] try to improve logging --- cat/__init__.py | 7 +++---- logging.cfg | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/cat/__init__.py b/cat/__init__.py index 09332363..ee55569c 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -53,7 +53,6 @@ from .exceptions import * logger = logging.getLogger('cat') -logger.setLevel('INFO') ### @@ -185,7 +184,7 @@ def get_pipeline_args(self): args.set('global_near_best', self.global_near_best, True) args.set('filter_overlapping_genes', self.filter_overlapping_genes, True) args.set('overlapping_gene_distance', self.overlapping_gene_distance, True) - + # user specified flags for consensus finding args.set('intron_rnaseq_support', self.intron_rnaseq_support, False) args.set('exon_rnaseq_support', self.exon_rnaseq_support, False) @@ -565,7 +564,7 @@ def success(task): os.remove(os.path.abspath(task.job_store)) except OSError: pass - else: + else: cmd = ['toil', 'stats', '--raw', os.path.abspath(task.job_store)] raw = tools.procOps.call_proc(cmd) parsed = raw[raw.index('{'):raw.rfind('}') + 1] @@ -2298,7 +2297,7 @@ def output(self): def run(self): pipeline_args = self.get_pipeline_args() luigi_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'stats') - + try: toil_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'toil_stats') except ValueError: diff --git a/logging.cfg b/logging.cfg index 304ec597..c65fce3f 100644 --- a/logging.cfg +++ b/logging.cfg @@ -1,21 +1,33 @@ [loggers] -keys=root +keys=root,luigi,cat [handlers] keys=consoleHandler [formatters] -keys=simpleFormatter +keys=consoleFormatter [logger_root] +level=WARNING +handlers=consoleHandler + +[logger_luigi] +level=INFO +handlers=consoleHandler +qualname=luigi-interface +propagate=0 + +[logger_cat] level=INFO handlers=consoleHandler +qualname=cat +propagate=0 [handler_consoleHandler] class=StreamHandler level=INFO -formatter=simpleFormatter +formatter=consoleFormatter args=(sys.stdout,) [formatter_simpleFormatter] -format=%(levelname)s: %(message)s \ No newline at end of file +format=%(levelname)s: %(asctime)-15s - %(message)s \ No newline at end of file From cb3bb14c293e8bea502735c65be549455841a611 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 14 May 2020 10:28:01 -0700 Subject: [PATCH 51/69] oops --- logging.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logging.cfg b/logging.cfg index c65fce3f..9cb5d68f 100644 --- a/logging.cfg +++ b/logging.cfg @@ -29,5 +29,5 @@ level=INFO formatter=consoleFormatter args=(sys.stdout,) -[formatter_simpleFormatter] +[formatter_consoleFormatter] format=%(levelname)s: %(asctime)-15s - %(message)s \ No newline at end of file From 437584521662ce94e602a43935ff40986382b46a Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 14 May 2020 10:29:47 -0700 Subject: [PATCH 52/69] summarize script --- cat/chaining.py | 5 +- scripts/summarize_cat_output.py | 170 ++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 scripts/summarize_cat_output.py diff --git a/cat/chaining.py b/cat/chaining.py index e9a00ff3..eb4533d6 100644 --- a/cat/chaining.py +++ b/cat/chaining.py @@ -1,5 +1,8 @@ """" -Toil program to generate UCSC chains and nets between two genomes in a HAL file. +Toil program to generate UCSC chains and nets between one reference genome and N target genomes in a HAL file. + +The reference genome is split up by contig/chromosome, and those are chained against all the things they align to +in each genome specified. """ import argparse import collections diff --git a/scripts/summarize_cat_output.py b/scripts/summarize_cat_output.py new file mode 100644 index 00000000..b3087578 --- /dev/null +++ b/scripts/summarize_cat_output.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +​ +import collections +import pandas as pd +​ +import tools.intervals +import tools.misc +import tools.mathOps +import tools.fileOps +import tools.sqlInterface +import tools.transcripts +import tools.nameConversions +import tools.procOps +from cat.consensus import * +from argparse import ArgumentParser +​ +def get_df(db_path, ref_db_path, chrXOnly=False): + tm_eval_df = load_transmap_evals(db_path) + ref_df = tools.sqlInterface.load_annotation(ref_db_path) + tx_modes = ['transMap'] + mrna_metrics_df = pd.concat([load_metrics_from_db(db_path, tx_mode, 'mRNA') for tx_mode in tx_modes]) + cds_metrics_df = pd.concat([load_metrics_from_db(db_path, tx_mode, 'CDS') for tx_mode in tx_modes]) + eval_df = pd.concat([load_evaluations_from_db(db_path, tx_mode) for tx_mode in tx_modes]).reset_index() + hgm_df = pd.concat([load_hgm_vectors(db_path, tx_mode) for tx_mode in tx_modes]) +​ +​ + # if chrXOnly: + # cmd = [['grep', 'chrX', 'gencode.v30.annotation.gp'], ['cut', '-f', '1']] + # chrx_txs = set(tools.procOps.call_proc_lines(cmd)) + # ref_df = ref_df[ref_df.TranscriptId.isin(chrx_txs)] + # else: + # # remove chrY + # cmd = [['grep', 'chrY', 'gencode.v30.annotation.gp'], ['cut', '-f', '1']] + # chry_txs = set(tools.procOps.call_proc_lines(cmd)) + # ref_df = ref_df[~ref_df.TranscriptId.isin(chry_txs)] +​ +​ + num_txs = len(set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].TranscriptId)) + num_genes = len(set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].GeneId)) +​ +​ + ## code below is from the consensus module. I ripped out from the combine_and_filter_dfs method + ## because it needs the genePred, but the info is also present elsewhere. +​ + #add the reference information to gain biotype information + hgm_ref_df = pd.merge(hgm_df, ref_df, on=['GeneId', 'TranscriptId']) + # combine in homGeneMapping results + hgm_ref_tm_df = pd.merge(hgm_ref_df, tm_eval_df, on=['GeneId', 'TranscriptId']) + # split merged_df into coding and noncoding + coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype == 'protein_coding'] + non_coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype != 'protein_coding'] + # add metrics information to coding df + metrics_df = pd.merge(mrna_metrics_df, cds_metrics_df, on='AlignmentId', suffixes=['_mRNA', '_CDS']) + coding_df = pd.merge(coding_df, metrics_df, on='AlignmentId') + # add evaluation information to coding df, where possible. This adds information on frame shifts. + coding_df = pd.merge(coding_df, eval_df, on='AlignmentId', how='left') + # fill the original intron values to 100 so we don't filter them out -- means a no-intron gene + coding_df['OriginalIntronsPercent_mRNA'] = coding_df.OriginalIntronsPercent_mRNA.fillna(100) + coding_df['OriginalIntronsPercent_CDS'] = coding_df.OriginalIntronsPercent_CDS.fillna(100) + non_coding_df['TransMapOriginalIntronsPercent'] = non_coding_df.TransMapOriginalIntronsPercent.fillna(100) +​ +​ + # rawest counts. homGeneMapping was ran on the unfiltered dataset, so use that + # do this only on coding transcripts for now + unique_genes = 0 + unique_txs = 0 + tmp = hgm_ref_df[hgm_ref_df.TranscriptBiotype == 'protein_coding'] + num_coding_genes = len(set(tmp.GeneId)) + num_coding_txs = len(set(tmp.TranscriptId)) + for gene_id, d in tmp.groupby('GeneId'): + paralogy = collections.Counter(x.split('-')[0] for x in d.AlignmentId) + if sum(paralogy.values()) == len(paralogy): + unique_genes += 1 + for tx_id, dd in d.groupby('TranscriptId'): + if len(dd) == 1: + unique_txs += 1 +​ + data = {} + data['GenesFound'] = num_coding_genes + data['GenesFoundPercent'] = 100.0 * num_coding_genes / num_genes + data['GenesMultiplyMapping'] = num_genes - unique_genes + data['GenesMultiplyMappingPercent'] = 100.0 * (num_genes - unique_genes) / num_genes + data['TranscriptsFound'] = num_coding_txs + data['TranscriptsFoundPercent'] = 100.0 * num_coding_txs / num_txs + data['TranscriptsMultiplyMapping'] = num_txs - unique_txs + data['TranscriptsMultiplyMappingPercent'] = 100.0 * (num_txs - unique_txs) / num_txs +​ + # full coverage + full_cov_mrna = len(coding_df[coding_df.AlnCoverage_mRNA == 100]) + full_cov_cds = len(coding_df[coding_df.AlnCoverage_CDS == 100]) + data['FullmRNACoverage'] = full_cov_mrna + data['FullmRNACoveragePercent'] = 100.0 * full_cov_mrna / num_txs + data['FullCDSCoverage'] = full_cov_cds + data['FullCDSCoveragePercent'] = 100.0 * full_cov_cds / num_txs +​ + # construct a stringent filter that requires the following: + # 1) Has all original introns + # 2) Full CDS Coverage + # 3) No Frame-shift + frameshift = len(coding_df[coding_df.Frameshift == True]) + original_introns = len(coding_df[coding_df.OriginalIntronsPercent_mRNA == 100]) + cov = len(coding_df[coding_df.AlnCoverage_CDS == 100]) + cov_frameshift = len(coding_df[(coding_df.AlnCoverage_CDS == 100) & + (coding_df.Frameshift != True)]) + cov_frameshift_original_introns = len(coding_df[(coding_df.AlnCoverage_CDS == 100) & + (coding_df.Frameshift != True) & + (coding_df.OriginalIntronsPercent_mRNA == 100)]) + data['TranscriptsWithFrameshift'] = frameshift + data['TranscriptsWithFrameshiftPercent'] = 100.0 * frameshift / num_txs + data['TranscriptsWithOriginalIntrons'] = original_introns + data['TranscriptsWithOriginalIntronsPercent'] = 100.0 * original_introns / num_txs + data['TranscriptsWithFullCDSCoverage'] = cov + data['TranscriptsWithFullCDSCoveragePercent'] = 100.0 * cov / num_txs + data['TranscriptsWithFullCDSCoverageAndNoFrameshifts'] = cov_frameshift + data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsPercent'] = 100.0 * cov_frameshift / num_txs + data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons'] = cov_frameshift_original_introns + data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntronsPercent'] = 100.0 * cov_frameshift_original_introns / num_txs +​ + # naive gene level + frameshift = len(set(coding_df[coding_df.Frameshift == True].GeneId)) + original_introns = len(set(coding_df[coding_df.OriginalIntronsPercent_mRNA == 100].GeneId)) + cov = len(set(coding_df[(coding_df.ProperOrf == True) & (coding_df.AlnCoverage_CDS == 100)].GeneId)) + cov_frameshift = len(set(coding_df[(coding_df.AlnCoverage_CDS == 100) & + (coding_df.Frameshift != True)].GeneId)) + cov_frameshift_original_introns = len(set(coding_df[(coding_df.AlnCoverage_CDS == 100) & + (coding_df.Frameshift != True) & + (coding_df.OriginalIntronsPercent_mRNA == 100)].GeneId)) + data['GenesWithFrameshift'] = frameshift + data['GenesWithFrameshiftPercent'] = 100.0 * frameshift / num_genes + num_genes_all_shifted = 0 + for gene_id, d in coding_df.groupby('GeneId'): + if len(d[d.Frameshift == True]) == len(d): + num_genes_all_shifted += 1 + data['GenesWithFrameshiftAllIsoforms'] = num_genes_all_shifted + data['GenesWithFrameshiftAllIsoformsPercent'] = 100.0 * num_genes_all_shifted / num_genes + data['GenesWithOriginalIntrons'] = original_introns + data['GenesWithOriginalIntronsPercent'] = 100.0 * original_introns / num_genes + data['GenesWithFullCDSCoverage'] = cov + data['GenesWithFullCDSCoveragePercent'] = 100.0 * cov / num_genes + data['GenesWithFullCDSCoverageAndNoFrameshifts'] = cov_frameshift + data['GenesWithFullCDSCoverageAndNoFrameshiftsPercent'] = 100.0 * cov_frameshift / num_genes + data['GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons'] = cov_frameshift_original_introns + data['GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntronsPercent'] = 100.0 * cov_frameshift_original_introns / num_genes +​ + missing = set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].GeneId) - set(tmp.GeneId) +​ + data['MissingGenes'] = len(missing) + data['MissingGenesPercent'] = (100.0 * len(missing)) / num_genes +​ + data['Name'] = db_path.replace('.db', '') +​ + return data +​ +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('ref_db') + parser.add_argument('target_db', nargs='+') + parser.add_argument('--chrXOnly', action='store_true') + opts = parser.parse_args() + columns = ['Name', 'GenesFound', 'TranscriptsFound', 'GenesMultiplyMapping', 'TranscriptsMultiplyMapping', 'FullmRNACoverage', 'FullCDSCoverage', 'TranscriptsWithFrameshift', 'TranscriptsWithOriginalIntrons', 'TranscriptsWithFullCDSCoverage', 'TranscriptsWithFullCDSCoverageAndNoFrameshifts', 'TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons', 'GenesWithFrameshift', 'GenesWithOriginalIntrons', 'GenesWithFullCDSCoverage', 'GenesWithFullCDSCoverageAndNoFrameshifts', 'GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons', 'MissingGenes'] + columns_w_percent = [] + for column in columns: + if column != 'Name': + columns_w_percent.extend([column, column + 'Percent']) + else: + columns_w_percent.append(column) + df = pd.DataFrame(columns=columns_w_percent) + for target_db in opts.target_db: + df = df.append(get_df(target_db, opts.ref_db, opts.chrXOnly), ignore_index=True) + print df.to_csv() \ No newline at end of file From 632ab218c2397572f6f618d857593ff95f4d4f00 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 14 May 2020 17:32:13 +0000 Subject: [PATCH 53/69] oops --- scripts/summarize_cat_output.py | 42 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/scripts/summarize_cat_output.py b/scripts/summarize_cat_output.py index b3087578..7c3501a7 100644 --- a/scripts/summarize_cat_output.py +++ b/scripts/summarize_cat_output.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -​ + import collections import pandas as pd -​ + import tools.intervals import tools.misc import tools.mathOps @@ -13,7 +13,7 @@ import tools.procOps from cat.consensus import * from argparse import ArgumentParser -​ + def get_df(db_path, ref_db_path, chrXOnly=False): tm_eval_df = load_transmap_evals(db_path) ref_df = tools.sqlInterface.load_annotation(ref_db_path) @@ -22,8 +22,8 @@ def get_df(db_path, ref_db_path, chrXOnly=False): cds_metrics_df = pd.concat([load_metrics_from_db(db_path, tx_mode, 'CDS') for tx_mode in tx_modes]) eval_df = pd.concat([load_evaluations_from_db(db_path, tx_mode) for tx_mode in tx_modes]).reset_index() hgm_df = pd.concat([load_hgm_vectors(db_path, tx_mode) for tx_mode in tx_modes]) -​ -​ + + # if chrXOnly: # cmd = [['grep', 'chrX', 'gencode.v30.annotation.gp'], ['cut', '-f', '1']] # chrx_txs = set(tools.procOps.call_proc_lines(cmd)) @@ -33,15 +33,15 @@ def get_df(db_path, ref_db_path, chrXOnly=False): # cmd = [['grep', 'chrY', 'gencode.v30.annotation.gp'], ['cut', '-f', '1']] # chry_txs = set(tools.procOps.call_proc_lines(cmd)) # ref_df = ref_df[~ref_df.TranscriptId.isin(chry_txs)] -​ -​ + + num_txs = len(set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].TranscriptId)) num_genes = len(set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].GeneId)) -​ -​ + + ## code below is from the consensus module. I ripped out from the combine_and_filter_dfs method ## because it needs the genePred, but the info is also present elsewhere. -​ + #add the reference information to gain biotype information hgm_ref_df = pd.merge(hgm_df, ref_df, on=['GeneId', 'TranscriptId']) # combine in homGeneMapping results @@ -58,8 +58,8 @@ def get_df(db_path, ref_db_path, chrXOnly=False): coding_df['OriginalIntronsPercent_mRNA'] = coding_df.OriginalIntronsPercent_mRNA.fillna(100) coding_df['OriginalIntronsPercent_CDS'] = coding_df.OriginalIntronsPercent_CDS.fillna(100) non_coding_df['TransMapOriginalIntronsPercent'] = non_coding_df.TransMapOriginalIntronsPercent.fillna(100) -​ -​ + + # rawest counts. homGeneMapping was ran on the unfiltered dataset, so use that # do this only on coding transcripts for now unique_genes = 0 @@ -74,7 +74,7 @@ def get_df(db_path, ref_db_path, chrXOnly=False): for tx_id, dd in d.groupby('TranscriptId'): if len(dd) == 1: unique_txs += 1 -​ + data = {} data['GenesFound'] = num_coding_genes data['GenesFoundPercent'] = 100.0 * num_coding_genes / num_genes @@ -84,7 +84,7 @@ def get_df(db_path, ref_db_path, chrXOnly=False): data['TranscriptsFoundPercent'] = 100.0 * num_coding_txs / num_txs data['TranscriptsMultiplyMapping'] = num_txs - unique_txs data['TranscriptsMultiplyMappingPercent'] = 100.0 * (num_txs - unique_txs) / num_txs -​ + # full coverage full_cov_mrna = len(coding_df[coding_df.AlnCoverage_mRNA == 100]) full_cov_cds = len(coding_df[coding_df.AlnCoverage_CDS == 100]) @@ -92,7 +92,7 @@ def get_df(db_path, ref_db_path, chrXOnly=False): data['FullmRNACoveragePercent'] = 100.0 * full_cov_mrna / num_txs data['FullCDSCoverage'] = full_cov_cds data['FullCDSCoveragePercent'] = 100.0 * full_cov_cds / num_txs -​ + # construct a stringent filter that requires the following: # 1) Has all original introns # 2) Full CDS Coverage @@ -115,7 +115,7 @@ def get_df(db_path, ref_db_path, chrXOnly=False): data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsPercent'] = 100.0 * cov_frameshift / num_txs data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons'] = cov_frameshift_original_introns data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntronsPercent'] = 100.0 * cov_frameshift_original_introns / num_txs -​ + # naive gene level frameshift = len(set(coding_df[coding_df.Frameshift == True].GeneId)) original_introns = len(set(coding_df[coding_df.OriginalIntronsPercent_mRNA == 100].GeneId)) @@ -141,16 +141,16 @@ def get_df(db_path, ref_db_path, chrXOnly=False): data['GenesWithFullCDSCoverageAndNoFrameshiftsPercent'] = 100.0 * cov_frameshift / num_genes data['GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons'] = cov_frameshift_original_introns data['GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntronsPercent'] = 100.0 * cov_frameshift_original_introns / num_genes -​ + missing = set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].GeneId) - set(tmp.GeneId) -​ + data['MissingGenes'] = len(missing) data['MissingGenesPercent'] = (100.0 * len(missing)) / num_genes -​ + data['Name'] = db_path.replace('.db', '') -​ + return data -​ + if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('ref_db') From 1886ba6d37b3d717b0cacf0270ed065ea73fa27d Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Fri, 15 May 2020 09:40:12 -0700 Subject: [PATCH 54/69] docker should understand tmpdir --- tools/procOps.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/procOps.py b/tools/procOps.py index 9d83b490..8e38ba5e 100644 --- a/tools/procOps.py +++ b/tools/procOps.py @@ -107,7 +107,7 @@ def popen_catch(command, stdin=None): def mrca_path(path1, path2): """ Gives the Most Recent Common Ancestor directory that contains both paths. - + >>> mrca_path('/usr/lib/python2.7', '/usr/bin/python') '/usr' >>> mrca_path('/usr/', '/usr/') @@ -165,6 +165,10 @@ def getDockerCommand(image, cmd): cmd: list of arguments """ dockerPreamble = ['docker', 'run', '-i', '--rm', '-u', "%s:%s" % (os.getuid(), os.getgid())] + if "TMPDIR" in os.environ: + tmpdir = os.environ["TMPDIR"] + dockerPreamble.extend(["--env", "TMPDIR={}".format(tmpdir)]) + dockerPreamble.extend(['-v', tmpdir + ':' + tmpdir]) work_dirs = [] for i, arg in enumerate(cmd): if arg.startswith('-') and '=' in arg: From 279070ddd118b0a679ed5847664aaf2489320628 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 17 May 2020 16:52:59 -0700 Subject: [PATCH 55/69] docs --- README.md | 63 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index fe5b39eb..1bdafb5a 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Above is a flowchart schematic of the functionality of the `CAT` pipeline. # Installation -The pipeline can be installed by a simple `pip` install: +The pipeline can be installed by a simple `pip` install: `pip install git+https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit.git` @@ -25,7 +25,19 @@ If you want to do the direct pip installation, you can grab the config files fro Either form of `pip` installation will install all of the python dependencies. -## Dependencies (if not using Docker) +## Docker/Singularity + +If you are capable of using Docker or Singularity, we highly recommend you do so. CAT will automatically pull the Docker container and use it for all of the binaries. If you use docker or singularity, all you need to do is install CAT to your python environment. + +### Environmental Variables + +There are two environmental variables to consider when running docker/singularity. + +`$TMPDIR`: Set this to a specific path if you are worried about running out of space in your tmp directory. CAT will automatically mount this location in the container and point temporary files to it. + +`$SINGULARITY_IMAGE`: If you want to use your own singularity image, you can specify the image path here. + +## Compiling dependencies Below is a full breakdown of the required dependencies if you are not using Docker. Some of these can be challenging to get to all compile and work properly. In addition to the breakdown below, you may get guidance looking at the [Dockerfile](https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit/blob/master/Dockerfile) or looking at [this list of installation commands](https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit/issues/118#issuecomment-434910846) generated by a helpful user. @@ -36,9 +48,9 @@ By default, you don't need to worry about installing any of these. However, ther 3. [samtools](http://www.htslib.org/) (1.3 or greater). 4. [Augustus](http://bioinf.uni-greifswald.de/augustus/binaries/). Make sure you are installing `augustus >= 3.3.1`. If you want to use Augustus CGP, install the development version from the [Github repository](https://github.com/Gaius-Augustus/Augustus). You need to follow the instructions to compile `augustus` in comparative augustus mode. This requires that you modify a few lines in the `common.mk` file, and also need to have `sqlite3`, `lp-solve`, `bamtools`, and `libboost` installed. If you are using ubuntu, this should work: `apt-get install libboost-all-dev libboost sqlite3 libsqlite3-0 libsqlite3-dev libgsl0-dev lp-solve liblpsolve55-dev bamtools libbamtools-dev` - - After you have the primary `augustus` binaries compiled, add the directory to your path. Note that if you move the `augustus` binaries from their original location, you will need to set the `AUGUSTUS_CONFIG_PATH` global variable to point to the species directory. - + + After you have the primary `augustus` binaries compiled, add the directory to your path. Note that if you move the `augustus` binaries from their original location, you will need to set the `AUGUSTUS_CONFIG_PATH` global variable to point to the species directory. + You will also need to put the contents of the `scripts` directory on your path. Next, you need to compile the following auxiliary programs from the folder `auxprogs`: 1. `joingenes`. Compiling this program will place it in the `augustus` binary directory. 2. `bam2hints`. Compiling this program will place it in the `augustus` binary directory. Requires `bamtools` to be installed. If the `bamtools` headers are not at `/usr/include/bamtools`, you will need to modify the makefile. @@ -46,7 +58,7 @@ By default, you don't need to worry about installing any of these. However, ther 4. `bam2wig`. Compiling this program will NOT place it in the `augustus` binary directory, you must do so yourself. This program requires you modify the makefile to explicitly point to your installation of `htslib`, `bcftools`, `samtools`, and `tabix`. `Tabix` is now packaged with `htslib`, and both are included in your `kent` directory at `$kent/src/htslib/`. 5. `homGeneMapping`. This program must also have its makefile at `$augustus/trunks/auxprogs/homGeneMapping/src/Makefile` modified to turn on the `BOOST = true` and `SQLITE = true` flags. Then run `make clean && make` to recompile. 6. There are a series of perl scripts that you need to place on your path from the `$augustus/trunks/scripts` directory: `wig2hints.pl`, `exonerate2hints.pl`, `transMap2hints.pl`, and `join_mult_hints.pl`. -5. [HAL toolkit](https://github.com/glennhickey/hal). To install the HAL toolkit, you must also have the [sonLib](https://github.com/benedictpaten/sonLib) repository in the same parent directory. Compile sonLib first, then compile hal. Once hal is compiled, you need to have the binaries on your path. +5. [HAL toolkit](https://github.com/glennhickey/hal). To install the HAL toolkit, you must also have the [sonLib](https://github.com/benedictpaten/sonLib) repository in the same parent directory. Compile sonLib first, then compile hal. Once hal is compiled, you need to have the binaries on your path. 6. [wiggletools](https://github.com/Ensembl/WiggleTools). Used to combine RNA-seq expression in assembly hubs. 7. [sambamba](https://github.com/lomereiter/sambamba/releases). Used to name sort faster than samtools for hints building. 8. [exonerate](https://www.ebi.ac.uk/about/vertebrate-genomics/software/exonerate). Used for protein alignments in hints building. @@ -58,11 +70,12 @@ Many of the above dependencies are on `conda`. However, you will still need to i 2. `toil`: The version on conda is too old (not python3 compatible). 3. `cat`: You will need to install this repo with `pip install` after you enter the conda env (`conda activate cattest`). 4. `HAL`: Not available on conda. + 5. The version of `augustus` on conda does not have the required flags set in its compliation. ``` conda create -y -n cattest -c conda-forge -c bioconda -c defaults python=3.7 pyfasta luigi seaborn pandas \ ete3 pysam numpy scipy bx-python bcbio-gff biopython parasail-python configobj sqlalchemy \ -samtools bamtools augustus exonerate wiggletools bedtools \ +samtools bamtools exonerate wiggletools bedtools \ ucsc-fatotwobit ucsc-gff3togenepred ucsc-genepredtobed ucsc-genepredtofakepsl ucsc-bamtopsl ucsc-transmappsltogenepred \ ucsc-pslpostarget ucsc-axtchain ucsc-chainmergesort ucsc-pslmap ucsc-pslrecalcmatch ucsc-pslmappostchain \ ucsc-gtftogenepred ucsc-genepredtogtf ucsc-pslcdnafilter ucsc-psltobigpsl \ @@ -71,16 +84,16 @@ ucsc-bedsort ucsc-bedtobigbed In total, you must have all of the binaries and scripts listed below on your path. The pipeline will check for them before executing steps. -`hal2fasta halStats halLiftover exonerate faToTwoBit pyfasta gff3ToGenePred genePredToBed genePredToFakePsl bamToPsl exonerate2hints.pl blat2hints.pl transMapPslToGenePred join_mult_hints.pl pslPosTarget axtChain chainMergeSort pslMap pslRecalcMatch pslMapPostChain augustus transMap2hints.pl joingenes hal2maf gtfToGenePred genePredToGtf bedtools homGeneMapping pslCDnaFilter clusterGenes pslToBigPsl bedSort bedToBigBed sambamba wig2hints.pl pal2nal.pl` -# Running the pipeline - -This pipeline makes use of [Luigi](https://github.com/spotify/luigi) to link the various steps together. First, start the `luigid` daemon: - -`luigid --background --logdir luigi_logs` +``` +hal2fasta halStats halLiftover exonerate faToTwoBit pyfasta gff3ToGenePred genePredToBed \ +genePredToFakePsl bamToPsl exonerate2hints.pl blat2hints.pl transMapPslToGenePred \ +join_mult_hints.pl pslPosTarget axtChain chainMergeSort pslMap pslRecalcMatch pslMapPostChain \ +augustus transMap2hints.pl joingenes hal2maf gtfToGenePred genePredToGtf bedtools homGeneMapping \ +pslCDnaFilter clusterGenes pslToBigPsl bedSort bedToBigBed sambamba wig2hints.pl pal2nal.pl +``` -Which provides the central scheduler as well as the web UI, which can be accessed at `localhost:8082`. -If you don't want to use the daemon, *which is highly recommended* add the flag `--local-scheduler` to the invocation. +# Running the pipeline To run the test data, change directories to the CAT installation folder and do the following: @@ -109,7 +122,7 @@ reserved_keys = ['gene_biotype', The keys `ID`, `Name` and `Parent` are required for valid GFF3 and define the hierarchical relationship. The remaining keys, `gene_biotype`, `transcript_biotype`, `gene_name`, `gene_id`, `transcript_id` and `transcript_name` are also all required. In many cases you will not have common names, and so it is fine for `transcript_name` to equal `transcript_id` and for `gene_name` to equal `gene_id`. The biotypes can be whatever you want, but `protein_coding` is a special biotype that tells CAT this gene or transcript is coding. You *must* flag your protein coding genes *and transcripts* as `protein_coding`! -You may have any arbitrary set of keys and values in the GFF3 that are not the reserved keys. These keys and values will be propagated on to the resulting transcript in the CAT output GFF3. +You may have any arbitrary set of keys and values in the GFF3 that are not the reserved keys. These keys and values will be propagated on to the resulting transcript in the CAT output GFF3. If your GFF3 has duplicate transcript names, the pipeline will complain. One common cause of this is PAR locus genes. You will want to remove PAR genes -- If your GFF3 came from GENCODE, you should be able to do this: `grep -v PAR_Y $gff > $gff.fixed`. @@ -125,7 +138,7 @@ As described above, the primary method to executing the pipeline is to follow th `--config`: Path to the config file with annotations and extrinsic hints. See [the config section](#config-file) for more information. (REQUIRED). -`--binary-mode`: How should CAT run its binary dependencies? Valid choices are "docker" (the default, in which case you must have a Docker daemon running) or "local" (in which case ensure your PATH contains the necessary dependencies). +`--binary-mode`: How should CAT run its binary dependencies? Valid choices are "docker" (the default, in which case you must have a Docker daemon running), "singularity", or "local" (in which case ensure your PATH contains the necessary dependencies). `--out-dir`: Output directory. Defaults to `./cat_output`. @@ -136,11 +149,11 @@ As described above, the primary method to executing the pipeline is to follow th `--workers`: Number of local cores to use. If running `toil` in singleMachine mode, care must be taken with the balance of this value and the `--maxCores` parameter. ## transMap options -`--global-near-best`: Adjusts the `globalNearBest` parameter passed to `pslCDnaFilter`. Defaults to 0.15. The `globalNearBest` algorithm determines which set of alignments are within a certain distance of the highest scoring alignment for a given source transcript. Making this value smaller will increase the number of alignments filtered out, decreasing the apparent paralogous alignment rate. Alignments which survive this filter are putatively paralogous. +`--global-near-best`: Adjusts the `globalNearBest` parameter passed to `pslCDnaFilter`. Defaults to 0.15. The `globalNearBest` algorithm determines which set of alignments are within a certain distance of the highest scoring alignment for a given source transcript. Making this value smaller will increase the number of alignments filtered out, decreasing the apparent paralogous alignment rate. Alignments which survive this filter are putatively paralogous. ## AugustusTM(R) options -`--augustus`: Run AugustusTM(R)? +`--augustus`: Run AugustusTM(R)? `--augustus-species`: What Augustus species do we want to use? If your species is not a mammal, please choose [one of the species listed here](http://bioinf.uni-greifswald.de/augustus/). @@ -168,7 +181,7 @@ As described above, the primary method to executing the pipeline is to follow th ## Filtering and consensus finding options -`--filter-overlapping-genes`: Should genes that get flagged as overlapping be removed? After consensus finding is finished, instances of gene family collapse or paralog mis-assignment may lead to overlapping CDS intervals on different genes. This also in some instances may be a feature of the original annotation set. However, some annotation databases do not like this, so this flag will remove all such instances and resolve them down to one gene. +`--filter-overlapping-genes`: Should genes that get flagged as overlapping be removed? After consensus finding is finished, instances of gene family collapse or paralog mis-assignment may lead to overlapping CDS intervals on different genes. This also in some instances may be a feature of the original annotation set. However, some annotation databases do not like this, so this flag will remove all such instances and resolve them down to one gene. `--overlapping-gene-distance`: This controls the amount of *exonic* overlap two genes must have to be flagged as overlapping. Setting this value higher than 1 is recommended for smaller genomes that likely have real overlapping genes. Default is 1. @@ -230,7 +243,7 @@ Follow these [instructions](https://toilpipelines.wordpress.com/2018/01/18/runni `--provisioner`: The provisioner of the cloud system. For AWS it's just `aws`. -`--maxNodes`: Maximum number of nodes running at once. +`--maxNodes`: Maximum number of nodes running at once. # Config file @@ -335,7 +348,7 @@ This step performs the preliminary classification of `transMap` transcripts. Thi 10. ValidStart -- start with ATG? 11. ValidStop -- valid stop codon (in frame)? 12. ProperOrf -- is the orf a multiple of 3? - + This module will populate the folder `--work-dir/transMap`. ## FilterTransMap @@ -383,7 +396,7 @@ This module will populate the folder `--work-dir/augustus_cgp`. ## AugustusPb -Running `AugustusPB` requires that IsoSeq data be provided. This mode runs on single genomes, and attempts to discover new isoforms. Transcripts predicted in this process undergo the same parental gene assignment described above. +Running `AugustusPB` requires that IsoSeq data be provided. This mode runs on single genomes, and attempts to discover new isoforms. Transcripts predicted in this process undergo the same parental gene assignment described above. This module will populate the folder `--work-dir/augustus_pb`. @@ -409,7 +422,7 @@ Which can be interpreted as 'species 0 had 6273 extrinsic hints (RNA-seq coverag ## AlignTranscripts -Transcript alignment allows for `AugustusTM(R)` transcripts to be compared to their parental `transMap`. As a result, only protein coding transcripts are aligned. For each transcripts, alignment is performed by parasail two ways -- CDS alignment, and mRNA alignment. The results of these alignments are saved in the folder `--work-dir/transcript_alignment`. These alignments are used to create functional annotations of transcripts in the [EvaluateTranscripts](#evaluatetranscripts) module. +Transcript alignment allows for `AugustusTM(R)` transcripts to be compared to their parental `transMap`. As a result, only protein coding transcripts are aligned. For each transcripts, alignment is performed by parasail two ways -- CDS alignment, and mRNA alignment. The results of these alignments are saved in the folder `--work-dir/transcript_alignment`. These alignments are used to create functional annotations of transcripts in the [EvaluateTranscripts](#evaluatetranscripts) module. ## EvaluateTranscripts @@ -455,7 +468,7 @@ The evaluation tables will be loaded as tracks in the final [assembly hub](#asse The consensus finding process takes in transcripts from every mode and attempts to find the highest quality ortholog for a source transcript. The de-novo transcript modes are also evaluated for providing novel isoforms or novel loci. The final gene set is output with a series of features measuring how confident the prediction is. To evaluate `transMap`, `AugustusTM` and `AugustusTMR` transcripts a consensus score is assigned to each. This score is the sum of the alignment goodness, intron/exon annotation support, original intron support, and intron/exon RNA-seq/IsoSeq support if extrinsic data were provided. The transcript with the highest consensus score is chosen. - + If one of the de-novo `augustus` modes is run, then the those transcripts are evaluated for providing novel information. If a prediction did not overlap any transMap projections, then it is tagged as putative novel and incorporated into the gene set. If a prediction overlaps a `transMap` projection that was filtered out during paralog resolution, then it is tagged as a possible paralog as well as with the names of overlapping transcripts and incorporated into the gene set. If a prediction overlaps a transMap projection and contains a splice junction not seen in the reference annotation, then it is tagged as a novel isoform and incorporated into the gene set as a member of the gene it overlapped with. After consensus finding is complete, a final filtering process is performed. This filtering process deduplicates the transcript set. Duplicates most often occur when the `augustus` execution modes create an identical transcript model from different input isoforms. In this case, the duplicates are removed and the remaining transcript tagged with the names of alternative source transcripts. Strand resolution throws out transcripts that are on opposite strands. The correct strand is chosen by looking at which contains the most high quality transcripts. Finally, the transcripts are again clustered using `clusterGenes` on CDS intervals to resolve the case where incorporating novel predictions lead to different gene IDs sharing CDS bases. From 5c014b9f8dd4107973c267e1e407fe3abf93573f Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 20 May 2020 13:12:55 -0700 Subject: [PATCH 56/69] infer parent --- programs/convert_ncbi_gff3 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index 677e9180..ca6932a6 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -114,6 +114,11 @@ if __name__ == '__main__': add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feature) except KeyError: assert False, feature.qualifiers + if feature.type == 'CDS' and parent_feature.type == 'gene': + # convert gene into fake transcript; we lack a parent + parent_feature.qualifiers['transcript_biotype'] = ['protein_coding'] + parent_feature.qualifiers['transcript_id'] = feature.qualifiers['transcript_id'] + parent_feature.qualifiers['transcript_name'] = feature.qualifiers['transcript_name'] new_qualifiers = construct_new_qualifiers(feature) feature.qualifiers = new_qualifiers From c0e9f543897f0d523a42e9130d5978faa190274c Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 20 May 2020 13:15:11 -0700 Subject: [PATCH 57/69] new biotype --- programs/convert_ncbi_gff3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 index ca6932a6..5c297608 100755 --- a/programs/convert_ncbi_gff3 +++ b/programs/convert_ncbi_gff3 @@ -28,7 +28,7 @@ def add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feat 'rRNA', 'mRNA', 'snRNA', 'snoRNA', 'scRNA', 'antisense_RNA', 'guide_RNA', 'C_gene_segment', 'J_gene_segment', 'V_gene_segment', 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene', - 'SRP_RNA']: + 'SRP_RNA', 'telomerase_RNA']: return if 'dbxref' in feature.qualifiers: xref = feature.qualifiers['dbxref'] From f73c1711750b6639f08c95fbd555e7bc2cac3d3f Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Wed, 20 May 2020 13:42:01 -0700 Subject: [PATCH 58/69] bug in conversion --- tools/misc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/misc.py b/tools/misc.py index 668a57df..1f3d5f73 100644 --- a/tools/misc.py +++ b/tools/misc.py @@ -102,6 +102,8 @@ def sort_gff(input_file, output_file): def parse_gtf_attr_line(attr_line): """parse a GTF attributes line""" + if len(attr_line) == 0: + return {} attr_line = [x.split(' ') for x in re.split('; +', attr_line.replace('"', ''))] attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '') return dict(attr_line) @@ -109,6 +111,8 @@ def parse_gtf_attr_line(attr_line): def parse_gff_attr_line(attr_line): """parse a GFF attributes line""" + if len(attr_line) == 0: + return {} attr_line = [x.split('=') for x in re.split('; *', attr_line.replace('"', ''))] attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '') return dict(attr_line) From 4c2fe349335922c4e075bbd16d55bd35d1f828f8 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 24 May 2020 14:32:49 -0700 Subject: [PATCH 59/69] cgp wont work --- cat/consensus.py | 2 +- scripts/summarize_cat_output.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cat/consensus.py b/cat/consensus.py index 375f75f1..42fb7351 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -648,7 +648,7 @@ def add_exref_ids(s): 'source_gene_common_name': s.CommonName} # bring in extra tags for exRef - if 'exRef' in denovo_tx_modes: + if tools.nameConversions.aln_id_is_exref(aln_id): for key, val in tools.misc.parse_gff_attr_line(exref_annot.loc[aln_id].ExtraTags).items(): denovo_tx_dict[aln_id][key] = val diff --git a/scripts/summarize_cat_output.py b/scripts/summarize_cat_output.py index 7c3501a7..ffd5fb5c 100644 --- a/scripts/summarize_cat_output.py +++ b/scripts/summarize_cat_output.py @@ -167,4 +167,4 @@ def get_df(db_path, ref_db_path, chrXOnly=False): df = pd.DataFrame(columns=columns_w_percent) for target_db in opts.target_db: df = df.append(get_df(target_db, opts.ref_db, opts.chrXOnly), ignore_index=True) - print df.to_csv() \ No newline at end of file + print(df.to_csv()) \ No newline at end of file From 20ade5acd6ca3fda6677ebb8e7bae3ac24711115 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 24 May 2020 15:19:40 -0700 Subject: [PATCH 60/69] empty tags --- cat/consensus.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cat/consensus.py b/cat/consensus.py index 42fb7351..8692ef85 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -649,8 +649,10 @@ def add_exref_ids(s): # bring in extra tags for exRef if tools.nameConversions.aln_id_is_exref(aln_id): - for key, val in tools.misc.parse_gff_attr_line(exref_annot.loc[aln_id].ExtraTags).items(): - denovo_tx_dict[aln_id][key] = val + tags = exref_annot.loc[aln_id].ExtraTags + if len(tags) > 0: + for key, val in tools.misc.parse_gff_attr_line(tags).items(): + denovo_tx_dict[aln_id][key] = val # record some metrics metrics['denovo'][tx_mode][s.TranscriptClass.replace('_', ' ').capitalize()] += 1 From a771d009b52b6b3fab60d1c724116f67828f0ba6 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 4 Jun 2020 08:26:39 -0700 Subject: [PATCH 61/69] exonerate in docker --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index f9d93d47..d043be0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN git clone git://github.com/samtools/htslib.git RUN cd htslib && make install # bcftools -RUN git clone git://github.com/samtools/bcftools.git +RUN git clone git://github.com/samtools/bcftools.git RUN cd bcftools && make # samtools @@ -33,7 +33,7 @@ RUN cd augustus && make # HDF5 RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz RUN tar xzf hdf5-1.10.1.tar.gz -RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr +RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr RUN cd hdf5-1.10.1 && make && make install # sonLib @@ -62,7 +62,7 @@ RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2 FROM ubuntu:18.04 RUN apt-get update -RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev +RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate RUN add-apt-repository -y ppa:deadsnakes/ppa RUN apt-get install -y python3.7 python3-pip # Kent From 6396bef703bd62b4f169c3cf8c978744855e46b1 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 11 Jun 2020 08:09:21 -0700 Subject: [PATCH 62/69] Update consensus.py Bug. Issue #183. --- cat/consensus.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cat/consensus.py b/cat/consensus.py index 8692ef85..9fb5cc41 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -911,9 +911,7 @@ def convert_attrs(attrs, id_field): # convert empty strings into nan attrs = {x: y if y != '' else 'nan' for x, y in attrs.items()} # don't include the support vectors in the string, they will be placed in their respective places - attrs_str = ['='.join([key, str(val).replace('=', '_')]) for key, val in sorted(attrs.items()) if 'support' not in key] - # explicitly escape any semicolons that may exist in the input strings - attrs_str = [x.replace(';', '%3B').replace('=', '%3D') for x in attrs_str] + attrs_str = ['='.join([key.replace('=', '%3D').replace(';', '%3B']), str(val).replace('=', '%3D').replace(';', '%3B']) for key, val in sorted(attrs.items()) if 'support' not in key] return score, ';'.join(attrs_str) def find_feature_support(attrs, feature, i): From 81856ac371f7d4a2e198ce74e4723999d741f129 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 11 Jun 2020 08:59:00 -0700 Subject: [PATCH 63/69] oops --- cat/consensus.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cat/consensus.py b/cat/consensus.py index 9fb5cc41..852ce207 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -909,10 +909,14 @@ def convert_attrs(attrs, id_field): else: attrs['Name'] = attrs['gene_id'] # convert empty strings into nan - attrs = {x: y if y != '' else 'nan' for x, y in attrs.items()} - # don't include the support vectors in the string, they will be placed in their respective places - attrs_str = ['='.join([key.replace('=', '%3D').replace(';', '%3B']), str(val).replace('=', '%3D').replace(';', '%3B']) for key, val in sorted(attrs.items()) if 'support' not in key] - return score, ';'.join(attrs_str) + attrs_str = [] + for key, val in attrs.items(): + if len(val) == 0: + y = 'nan' + val = str(val).replace('=', '%3D').replace(';', '%3B') + key = key.replace('=', '%3D').replace(';', '%3B') + attrs_str.append(f"{key}={val}") + return score, ";".join(attrs_str) def find_feature_support(attrs, feature, i): """Extracts the boolean value from the comma delimited string""" From 8d2540f742ad717731bf697fe9b7b67b7a8c78d7 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 11 Jun 2020 10:46:04 -0700 Subject: [PATCH 64/69] python 3.6 is fine --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0e9dbd1f..4ef1e0ad 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,10 @@ name='cat', version='2.0', packages=['cat', 'tools'], - python_requires='>=3.7.0', + python_requires='>=3.6.0', install_requires=[ 'pyfasta>=0.5.2', - 'toil>=3.5', + 'toil>=4.1', 'luigi>=2.5', 'seaborn>=0.7', 'pandas>=1.0', From 6e11f826a540d2c70f94c694c0fe9a0bbb852128 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Fri, 12 Jun 2020 10:11:16 -0700 Subject: [PATCH 65/69] bug in indel finding --- cat/classify.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cat/classify.py b/cat/classify.py index 13f97324..88bf75f4 100644 --- a/cat/classify.py +++ b/cat/classify.py @@ -42,8 +42,6 @@ 2) mRNA """ -import itertools - import pandas as pd import tools.bio @@ -298,13 +296,16 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): left_pos = q_start - q_offset right_pos = q_start else: - left_pos = psl.q_size - q_start + left_pos = psl.q_size - q_start - 1 right_pos = psl.q_size - q_start + q_offset row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, 'Insertion') if row is not None: r.append(row) if t_offset != 0: # target insertion -> insertion in reference sequence - left_pos = right_pos = q_start + if tx.strand == '+': + left_pos = right_pos = q_start + else: + left_pos = right_pos = psl.q_size - q_start row = parse_indel(left_pos, right_pos, coordinate_fn, tx, t_offset, 'Deletion') if row is not None: r.append(row) From b20419467f68d1d665e0e0abc409260ffdee4139 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Fri, 19 Jun 2020 06:37:20 -0700 Subject: [PATCH 66/69] oops --- cat/consensus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cat/consensus.py b/cat/consensus.py index 852ce207..17f1253e 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -911,8 +911,9 @@ def convert_attrs(attrs, id_field): # convert empty strings into nan attrs_str = [] for key, val in attrs.items(): + val = str(val) if len(val) == 0: - y = 'nan' + val = 'nan' val = str(val).replace('=', '%3D').replace(';', '%3B') key = key.replace('=', '%3D').replace(';', '%3B') attrs_str.append(f"{key}={val}") From 79b9b81d2fa56a85bc3381d28bb6ea944005d4f3 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Sun, 21 Jun 2020 14:37:01 -0700 Subject: [PATCH 67/69] refactor parental gene assignment --- cat/__init__.py | 9 +-- cat/parent_gene_assignment.py | 106 +++++++++++++++------------------- tools/gff3.py | 3 +- tools/intervals.py | 16 +++++ tools/nameConversions.py | 2 +- 5 files changed, 64 insertions(+), 72 deletions(-) diff --git a/cat/__init__.py b/cat/__init__.py index a0292da5..e6dd21ba 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -1627,8 +1627,6 @@ def get_args(pipeline_args, mode): for genome in pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}} args.unfiltered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).tm_gp for genome in pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}} - args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in pipeline_args.isoseq_genomes} # add the reference annotation as a pseudo-transMap to assign parents in reference args.filtered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.unfiltered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp @@ -1644,8 +1642,6 @@ def get_args(pipeline_args, mode): unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.filtered_tm_gps = filtered_tm_gp_files args.unfiltered_tm_gps = unfiltered_tm_gp_files - args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]} elif mode == 'exRef': args.tablename = tools.sqlInterface.ExRefAlternativeGenes.__tablename__ args.gps = {genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp @@ -1659,8 +1655,6 @@ def get_args(pipeline_args, mode): unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.filtered_tm_gps = filtered_tm_gp_files args.unfiltered_tm_gps = unfiltered_tm_gp_files - args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]} else: raise Exception('Invalid mode passed to FindDenovoParents') return args @@ -1697,8 +1691,7 @@ def run(self): table_target = self.get_table_targets(genome, denovo_args.tablename, pipeline_args) filtered_tm_gp = denovo_args.filtered_tm_gps[genome] unfiltered_tm_gp = denovo_args.unfiltered_tm_gps[genome] - chrom_sizes = denovo_args.chrom_sizes[genome] - df = assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp) + df = assign_parents(filtered_tm_gp, unfiltered_tm_gp, denovo_gp) db = pipeline_args.dbs[genome] with tools.sqlite.ExclusiveSqlConnection(db) as engine: df.to_sql(denovo_args.tablename, engine, if_exists='replace') diff --git a/cat/parent_gene_assignment.py b/cat/parent_gene_assignment.py index e5ec1ec0..661258ca 100644 --- a/cat/parent_gene_assignment.py +++ b/cat/parent_gene_assignment.py @@ -4,47 +4,68 @@ import pandas as pd import itertools import collections -from tools.defaultOrderedDict import DefaultOrderedDict import tools.procOps import tools.fileOps import tools.mathOps import tools.transcripts import tools.intervals +import tools.nameConversions -def assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp, min_distance=0.9, stranded=True): +def assign_parents(filtered_tm_gp, unfiltered_tm_gp, denovo_gp, min_distance=0.75, + tm_jaccard_distance=0.2, stranded=True): """ Main function for assigning parental genes. Parental gene assignment methodology: - A) Each denovo transcript is evaluated for overlapping any transMap transcripts. - Overlap is defined as having at least 1 exonic base shared. + A) clusterGenes is used to cluster filtered transMap transcripts. B) If a denovo transcript is assigned to more than one gene, then this is attempted to be resolved. Resolution occurs by looking first at the transMap themselves. If any transMap projections overlap each other - with a Jaccard metric > 0.001, then we call this as a badAnnotOrTm. These will be discarded unless all splices - are supported. - Next, we look at the asymmetric distance between this prediction and the gene intervals. If this difference + with a Jaccard metric > tm_jaccard_distance, then we call this as a badAnnotOrTm. + These will be discarded unless all splices are supported. + C) Next, we look at the asymmetric distance between this prediction and the gene intervals. If this difference in these distances is over min_distance for all comparisons, we call this rescued and it can be incorporated. Otherwise, this transcript is tagged ambiguousOrFusion. + + Additionally, we look at all of the transMap projections that were filtered out and apply those + gene names to the AlternativeGeneIds column. This is a marker of possible paralogy. """ + def assign_type(s): + if tools.nameConversions.aln_id_is_denovo(s.gene): + return True + return False + filtered_transmap_dict = tools.transcripts.get_gene_pred_dict(filtered_tm_gp, stranded) unfiltered_transmap_dict = tools.transcripts.get_gene_pred_dict(unfiltered_tm_gp, stranded) - filtered_ids = unfiltered_transmap_dict.keys() - filtered_transmap_dict.keys() - - tm_chrom_dict = create_chrom_dict(unfiltered_transmap_dict, chrom_sizes) denovo_dict = tools.transcripts.get_gene_pred_dict(denovo_gp, stranded) - denovo_chrom_dict = create_chrom_dict(denovo_dict) - # begin parent gene assignment + with tools.fileOps.TemporaryFilePath() as tmp: + cmd = ['clusterGenes', '-minOverlappingBases=30', tmp, 'no', unfiltered_tm_gp, denovo_gp] + if not stranded: + cmd.append(['-ignoreStrand']) + tools.procOps.run_proc(cmd) + cluster_df = pd.read_csv(tmp, sep='\t') + + cluster_df['is_denovo'] = cluster_df.apply(assign_type, axis=1) + r = [] - for chrom, tm_tx_by_chromosome in tm_chrom_dict.items(): - for denovo_tx_id, denovo_tx in denovo_chrom_dict[chrom].items(): - # find the names of both filtered and unfiltered transMap transcript IDs that overlap - unfiltered_overlapping_tm_txs = find_tm_overlaps(denovo_tx, tm_tx_by_chromosome) - filtered_overlapping_tm_txs = {tx for tx in unfiltered_overlapping_tm_txs if tx.name not in filtered_ids} - # extract only gene names for the filtered set - filtered_gene_ids = {tx.name2 for tx in filtered_overlapping_tm_txs} + for _, d in cluster_df.groupby('#cluster'): + if not any(d.is_denovo): + continue + unfiltered_overlapping_tm_txs = set() + filtered_overlapping_tm_txs = set() + denovo_txs = set() + for tx_id, is_denovo in zip(d.gene, d.is_denovo): + if is_denovo: + denovo_txs.add(denovo_dict[tx_id]) + elif tx_id in filtered_transmap_dict: + filtered_overlapping_tm_txs.add(filtered_transmap_dict[tx_id]) + else: + unfiltered_overlapping_tm_txs.add(unfiltered_transmap_dict[tx_id]) + # extract only gene names for the filtered set + filtered_gene_ids = {tx.name2 for tx in filtered_overlapping_tm_txs} + for denovo_tx in denovo_txs: if len(filtered_gene_ids) > 1: # we have more than one match, so resolve it resolved_name, resolution_method = resolve_multiple_genes(denovo_tx, filtered_overlapping_tm_txs, - min_distance) + min_distance, tm_jaccard_distance) elif len(filtered_gene_ids) == 1: # yay, we have exactly one match resolved_name = list(filtered_gene_ids)[0] resolution_method = None @@ -61,45 +82,20 @@ def assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp, min return combined_alternatives -def create_chrom_dict(tx_dict, chrom_sizes=None): - """ - Split up a dictionary of Transcript objects by chromosome. Add in extra chromosomes based on a sizes file - """ - chrom_dict = collections.defaultdict(dict) - for tx_id, tx in tx_dict.items(): - chrom_dict[tx.chromosome][tx_id] = tx - if chrom_sizes is not None: - for chrom, size in tools.fileOps.iter_lines(chrom_sizes): - if chrom not in chrom_dict: - chrom_dict[chrom] = {} - return chrom_dict - - -def find_tm_overlaps(denovo_tx, tm_tx_dict, cutoff=100): - """Find overlap with transMap transcripts first on a genomic scale then an exonic scale""" - r = DefaultOrderedDict(int) - for tx in tm_tx_dict.values(): - for tx_exon in tx.exon_intervals: - for denovo_exon in denovo_tx.exon_intervals: - i = tx_exon.intersection(denovo_exon) - if i is not None: - r[tx] += len(i) - return [tx_id for tx_id, num_bases in r.items() if num_bases >= cutoff] - - -def resolve_multiple_genes(denovo_tx, overlapping_tm_txs, min_distance): +def resolve_multiple_genes(denovo_tx, overlapping_tm_txs, min_distance, tm_jaccard_distance): """ Resolve multiple assignments based on the following rules: """ # use Jaccard metric to determine if the problem lies with transMap or annotation tm_txs_by_gene = tools.transcripts.group_transcripts_by_name2(overlapping_tm_txs) tm_jaccards = [find_highest_gene_jaccard(x, y) for x, y in itertools.combinations(list(tm_txs_by_gene.values()), 2)] - if any(x > 0.001 for x in tm_jaccards): + if any(x > tm_jaccard_distance for x in tm_jaccards): return None, 'badAnnotOrTm' # calculate asymmetric difference for this prediction scores = collections.defaultdict(list) for tx in overlapping_tm_txs: - scores[tx.name2].append(calculate_asymmetric_closeness(denovo_tx, tx)) + scores[tx.name2].append(tools.intervals.calculate_bed12_asymmetric_jaccard(denovo_tx.exon_intervals, + tx.exon_intervals)) best_scores = {gene_id: max(scores[gene_id]) for gene_id in scores} high_score = max(best_scores.values()) if all(high_score - x >= min_distance for x in best_scores.values() if x != high_score): @@ -125,15 +121,3 @@ def find_interval(gene_list): b_interval = find_interval(gene_list_b) return tools.intervals.calculate_bed12_jaccard(a_interval, b_interval) - -def calculate_asymmetric_closeness(denovo_tx, tm_tx): - """ - Calculates the asymmetric closeness between two transcripts. This allows for denovo predictions that are subsets - of existing annotations to get more weight. - - closeness = length(intersection) / length(denovo) in chromosome space - """ - intersection = denovo_tx.interval.intersection(tm_tx.interval) - if intersection is None: - return 0 - return tools.mathOps.format_ratio(len(intersection), len(denovo_tx.interval)) diff --git a/tools/gff3.py b/tools/gff3.py index b83d61c5..4f049ef2 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -61,6 +61,5 @@ def parse_attrs(attrs): def convert_gff3_cmd(annotation_attrs, annotation): cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons', - '-refseqHacks', - '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout'] + '-refseqHacks', '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout'] return cmd diff --git a/tools/intervals.py b/tools/intervals.py index 14e8dce6..f59b4291 100644 --- a/tools/intervals.py +++ b/tools/intervals.py @@ -341,6 +341,22 @@ def calculate_bed12_jaccard(intervals_a, intervals_b): return mathOps.format_ratio(intersection, (tot_a + tot_b) - intersection) +def calculate_bed12_asymmetric_jaccard(intervals_a, intervals_b): + """ + calculates the Jaccard similarity metric for two iterables of intervals. Will fail if they are not on the same + chromosome. + length(intersection) / (intervals_a) + """ + intersection = 0 + tot_a = sum(len(x) for x in intervals_a) + for a_interval in intervals_a: + for b_interval in intervals_b: + ins = a_interval.intersection(b_interval) + if ins is not None: + intersection += len(ins) + return mathOps.format_ratio(intersection, tot_a) + + def calculate_jaccard(interval_a, interval_b): """ calculates the Jaccard similarity metric for two intervals diff --git a/tools/nameConversions.py b/tools/nameConversions.py index 05e9d147..203c13cf 100644 --- a/tools/nameConversions.py +++ b/tools/nameConversions.py @@ -75,7 +75,7 @@ def aln_id_is_exref(aln_id): def aln_id_is_denovo(aln_id): - return aln_id_is_pb(aln_id) or aln_id_is_cgp(aln_id) + return aln_id_is_pb(aln_id) or aln_id_is_cgp(aln_id) or aln_id_is_exref(aln_id) def alignment_type(aln_id): From 3df1f58714a6370b8b41b889f2656f129b261fb8 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 25 Jun 2020 07:17:11 -0700 Subject: [PATCH 68/69] fix target genomes --- cat/__init__.py | 8 ++++---- cat/consensus.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cat/__init__.py b/cat/__init__.py index e6dd21ba..59fa3ac3 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -1624,9 +1624,9 @@ def get_args(pipeline_args, mode): args.gps = {genome: AugustusPb.get_args(pipeline_args, genome).augustus_pb_gp for genome in pipeline_args.isoseq_genomes} args.filtered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}} + for genome in pipeline_args.target_genomes & (pipeline_args.isoseq_genomes - {pipeline_args.ref_genome})} args.unfiltered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).tm_gp - for genome in pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}} + for genome in pipeline_args.target_genomes & (pipeline_args.isoseq_genomes - {pipeline_args.ref_genome})} # add the reference annotation as a pseudo-transMap to assign parents in reference args.filtered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.unfiltered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp @@ -1647,9 +1647,9 @@ def get_args(pipeline_args, mode): args.gps = {genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp for genome in pipeline_args.external_ref_genomes} filtered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in pipeline_args.external_ref_genomes} + for genome in pipeline_args.target_genomes & pipeline_args.external_ref_genomes} unfiltered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).tm_gp - for genome in pipeline_args.external_ref_genomes} + for genome in pipeline_args.target_genomes & pipeline_args.external_ref_genomes} # add the reference annotation as a pseudo-transMap to assign parents in reference filtered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp diff --git a/cat/consensus.py b/cat/consensus.py index 17f1253e..f38808fb 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -914,7 +914,7 @@ def convert_attrs(attrs, id_field): val = str(val) if len(val) == 0: val = 'nan' - val = str(val).replace('=', '%3D').replace(';', '%3B') + val = val.replace('=', '%3D').replace(';', '%3B') key = key.replace('=', '%3D').replace(';', '%3B') attrs_str.append(f"{key}={val}") return score, ";".join(attrs_str) From e335d7578fe607a0ed163f684d8ebac12b28b160 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 25 Jun 2020 07:21:01 -0700 Subject: [PATCH 69/69] try again --- cat/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cat/__init__.py b/cat/__init__.py index 59fa3ac3..a1ac296c 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -1621,12 +1621,13 @@ def get_args(pipeline_args, mode): args = tools.misc.HashableNamespace() if mode == 'augPB': args.tablename = tools.sqlInterface.AugPbAlternativeGenes.__tablename__ + pb_genomes = set(pipeline_args.target_genomes) & (pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}) args.gps = {genome: AugustusPb.get_args(pipeline_args, genome).augustus_pb_gp - for genome in pipeline_args.isoseq_genomes} + for genome in pb_genomes} args.filtered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in pipeline_args.target_genomes & (pipeline_args.isoseq_genomes - {pipeline_args.ref_genome})} + for genome in pb_genomes} args.unfiltered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).tm_gp - for genome in pipeline_args.target_genomes & (pipeline_args.isoseq_genomes - {pipeline_args.ref_genome})} + for genome in pb_genomes} # add the reference annotation as a pseudo-transMap to assign parents in reference args.filtered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.unfiltered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp @@ -1643,13 +1644,14 @@ def get_args(pipeline_args, mode): args.filtered_tm_gps = filtered_tm_gp_files args.unfiltered_tm_gps = unfiltered_tm_gp_files elif mode == 'exRef': + exref_genomes = set(pipeline_args.target_genomes) & set(pipeline_args.external_ref_genomes) args.tablename = tools.sqlInterface.ExRefAlternativeGenes.__tablename__ args.gps = {genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp - for genome in pipeline_args.external_ref_genomes} + for genome in exref_genomes} filtered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in pipeline_args.target_genomes & pipeline_args.external_ref_genomes} + for genome in exref_genomes} unfiltered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).tm_gp - for genome in pipeline_args.target_genomes & pipeline_args.external_ref_genomes} + for genome in exref_genomes} # add the reference annotation as a pseudo-transMap to assign parents in reference filtered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp