# AnnotationPipeline/Annotation.smk at master · jtevns/AnnotationPipeline · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
##########################################################
# Snakemake workflow for the annotation of a set of bins
# Author: Jacob Evans
# Steps:
#    - call genes in every bin
#    - determine orthologous clusters
#    - select a representative from each cluster
#    - search database with representative
#    - map back to all contigs in bins
#    - generate gene count tables
##########################################################
import glob
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from os.path import basename


# Annotation sources requested in the config; rule all builds one mapped CSV per entry.
WANTED_ANNOTATIONS = config["ANNOTATIONS"]

# Only files carrying the configured extension are treated as bins, because
# that is the only file shape rule call_genes can consume. Sorted so the bin
# order handed to downstream rules does not depend on directory listing order.
_BIN_SUFFIX = "." + config["INPUT_EXT"]
BINS = sorted(glob.glob("Passing_bins/*" + glob.escape(_BIN_SUFFIX)))

# Strip just the trailing extension: cutting at the first "." would turn
# "bin.1.fa" into "bin", which neither matches the real file nor stays unique.
BIN_NAMES = [basename(x)[:-len(_BIN_SUFFIX)] for x in BINS]

# Default target: one cluster-mapped annotation table per requested annotation source
rule all:
    input:
        # expand() wraps a lone string into a one-element list, so ANNOTATIONS
        # may be given either as a list or as a single name in the config.
        expand("MappedToClusters_{annotation}_annotation.csv", annotation=WANTED_ANNOTATIONS)

# Predict genes in a single bin with prodigal.
# The bin's file extension comes from config["INPUT_EXT"]; {binName} stays a wildcard.
rule call_genes:
    input:
        "Passing_bins/{{binName}}.{ext}".format(ext=config["INPUT_EXT"])
    output:
        # faa is passed to prodigal's -a option, gbk to its -o option
        faa="Gene_Calls/{binName}.faa",
        gbk="Gene_Calls/{binName}.gbk",
    shell:
        "prodigal -a {output.faa} -i {input} -o {output.gbk} -q "

# Cluster the gene calls of all bins into orthologous groups with proteinortho.
# Takes every bin's .faa at once and produces a single project-wide table.
rule cluster_genes:
    input:
        expand("Gene_Calls/{binName}.faa", binName=BIN_NAMES)
    output:
        # File name follows from the -project value passed below
        "annotation.proteinortho.tsv"
    # Declared so the scheduler budgets cores for this job; without -cpus
    # proteinortho picks its own CPU count regardless of --cores.
    # Same ceiling as the hmmsearch rules.
    threads: 36
    shell:
        """
        proteinortho -project="annotation" -cpus={threads} -clean {input}
        """

# Reformat the proteinortho table into the cluster file used by later rules.
# The conversion itself lives in scripts/make_cluster_file.py.
rule gen_cluster_file:
    input:
        "annotation.proteinortho.tsv"
    output:
        "ProteinOrthoClusters_reformatted.tsv"
    script:
        "scripts/make_cluster_file.py"

# Select representatives and write them to a single FASTA file.
# Inputs are the reformatted cluster file plus every bin's gene calls;
# the selection logic lives in scripts/select_representatives.py.
rule select_representatives:
    input:
        clus="ProteinOrthoClusters_reformatted.tsv",
        faas=expand("Gene_Calls/{binName}.faa", binName=BIN_NAMES),
    output:
        "representatives.faa"
    script:
        "scripts/select_representatives.py"

# Search the cluster representatives against the Pfam HMM database with hmmsearch.
rule search_pfam:
    input:
        reps = "representatives.faa",
        db = config["DATABASE_DIRS"]["pfam"]
    output:
        # Written via hmmsearch --tblout
        "pfam_annotation.txt"
    log:
        # Receives hmmsearch's -o output; declared so Snakemake tracks it
        "pfam.log"
    threads: 36
    shell:
        # File names come from the directives above rather than being repeated
        # as literals, so output and log cannot drift from the command.
        "hmmsearch -o {log} --cpu {threads} --cut_nc --tblout {output} {input.db} {input.reps}"

# Search the cluster representatives against the TIGRFAM HMM database with hmmsearch.
rule search_tigrfam:
    input:
        reps = "representatives.faa",
        # Must point at the TIGRFAM models; looking up the "pfam" entry here
        # would just repeat the Pfam search under a different file name.
        # Resolved lazily so a config that omits "tigrfam" still parses when
        # this rule is not part of the requested annotations.
        db = lambda wildcards: config["DATABASE_DIRS"]["tigrfam"]
    output:
        # Written via hmmsearch --tblout
        "tigrfam_annotation.txt"
    log:
        # Receives hmmsearch's -o output; declared so Snakemake tracks it
        "tigrfam.log"
    threads: 36
    shell:
        "hmmsearch -o {log} --cpu {threads} --cut_nc --tblout {output} {input.db} {input.reps}"

# Search the cluster representatives against the eggNOG HMM database with hmmsearch.
rule search_eggnog:
    input:
        reps = "representatives.faa",
        # Must point at the eggNOG models; looking up the "pfam" entry here
        # would just repeat the Pfam search under a different file name.
        # Resolved lazily so a config that omits "eggnog" still parses when
        # this rule is not part of the requested annotations.
        db = lambda wildcards: config["DATABASE_DIRS"]["eggnog"]
    output:
        # Written via hmmsearch --tblout
        "eggnog_annotation.txt"
    log:
        # Receives hmmsearch's -o output; declared so Snakemake tracks it
        "eggnog.log"
    threads: 36
    shell:
        # NOTE(review): --cut_nc needs NC cutoffs stored in the HMMs — confirm
        # the eggNOG models in use provide them.
        "hmmsearch -o {log} --cpu {threads} --cut_nc --tblout {output} {input.db} {input.reps}"

# Map the annotations in one search result onto the members of each cluster.
# {annotation} ties the input search table to the CSV requested by rule all;
# the mapping logic lives in scripts/map_to_clusters.py.
rule map_to_clusters:
    input:
        cluster="ProteinOrthoClusters_reformatted.tsv",
        file="{annotation}_annotation.txt",
    output:
        "MappedToClusters_{annotation}_annotation.csv"
    script:
        "scripts/map_to_clusters.py"