-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathCRAB_Analysis_Runner.py
More file actions
executable file
·212 lines (165 loc) · 8.06 KB
/
CRAB_Analysis_Runner.py
File metadata and controls
executable file
·212 lines (165 loc) · 8.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#CRAB Pipeline
from Phoenix_Launcher.WF_1_Launcher import WF_1_Launch_Phoenix, WF_2_PushDB_Phoenix
from WF_0_Assembler.WF_0_Assembler_runner import run_assembly, run_pre_assembly
from WF_1_Annotate.WF_1_Annotate import run_annotate
from WF_2_FindAMR.WF_2_FindAMRs import find_AMR_genes
from WF_3_DB.WF_3_DB_push import run_DB_push
from WF_3_5_SNP_Phylo.WF_3_5_SNP_Phylo import run_WF_3_5
from WF_4_CreateReport.WF_4_helper import run_create_PDF
import os
import sys
import json
import reader
import shutil
class CRAB_pipeline_worker():
    """Orchestrates the CRAB analysis pipeline.

    Workflow stages (see the WF_* modules imported at file top):
      WF_0  assembly (SPAdes)            -> sample_HSN list + assembly stats
      WF_1  annotation (Prokka) + MLST   -> mlst dict
      WF_2  AMR gene detection (Abricate)-> found_genes dict
      WF_3  database push
      WF_3.5 SNP calling / phylogenetics
      WF_4  PDF report generation

    Configuration (output paths such as ``assembly_output``, ``prokka_output``,
    ``abricate_output``, ``busco_output``, ``phoenix_output`` etc.) is loaded
    from ``<cache_path>/data/pipeline_variables.json`` and attached as
    instance attributes in ``__init__``.
    """

    def __init__(self, cache_path):
        """Load pipeline configuration.

        cache_path -- repository root; ``data/pipeline_variables.json``
                      beneath it supplies every path attribute used by the
                      workflow methods (set dynamically via setattr).
        """
        self.cache_path = cache_path
        config = reader.read_json(cache_path + "/data/pipeline_variables.json")
        for key, value in config.items():
            setattr(self, key, value)

    def run_CDCphoenix(self, path_to_reads, run_date):
        """Run the CDC PHoeNIx variant of the pipeline and push its results.

        path_to_reads -- directory containing the raw fastq reads
        run_date      -- run identifier used to namespace all outputs
        """
        # Launch PHoeNIx, then convert/push its output to the DB and PDF report.
        WF_1_Launch_Phoenix(path_to_reads, self.phoenix_output + "/SampleSheet", run_date,
                            self.phoenix_output, self.phoenix_p, self.kraken_path)
        WF_2_PushDB_Phoenix(self.phoenix_output, run_date, self.cache_path,
                            self.CDC_csv_path, self.CDC_path_to_pdf_output,
                            self.cache_path, True)

    def run_pipeline(self, path_to_reads, run_date):
        """Run the full in-house pipeline (WF_0 through WF_4).

        Each completed stage checkpoints its result as JSON under
        ``<cache_path>/data/run_data/<run_date>`` so a crashed run can be
        resumed without redoing finished work.
        """
        run_dir = self.cache_path + '/data/run_data/' + run_date
        sample_HSN = False
        Assembly_stats = False
        mlst = False
        found_genes = False
        if os.path.exists(run_dir):
            # Resume: pull whatever checkpoints the previous attempt wrote.
            print("Trying to import jsons")
            sample_HSN, Assembly_stats, mlst, found_genes = self.import_json(run_dir, run_date)
        else:
            os.mkdir(run_dir)
        # Per-run output directories. These are deliberately locals: the
        # original code appended run_date onto self.assembly_output etc.,
        # which (a) broke clean_up_temp_files (it appended run_date a second
        # time) and (b) was skipped on resumed runs, handing later stages the
        # un-suffixed base directory.
        assembly_dir = self.assembly_output + "/" + run_date
        prokka_dir = self.prokka_output + "/" + run_date
        abricate_dir = self.abricate_output + "/" + run_date
        if not sample_HSN:
            # WF_0: fastq pre-processing + SPAdes assembly.
            # Returns the list of HSN sample ids and per-sample assembly stats.
            sample_HSN, Assembly_stats = run_assembly(self.cache_path, path_to_reads,
                                                      self.assembly_output,
                                                      self.busco_output, run_date)
            with open(run_dir + '/sample_HSN.json', 'w') as fp:
                json.dump(sample_HSN, fp)
            with open(run_dir + '/assembly_stats.json', 'w') as fp:
                json.dump(Assembly_stats, fp)
            print("Assembly Done")
        if not mlst:
            # WF_1: Prokka annotation + MLST typing.
            # mlst maps HSN -> [species, type, ...],
            # e.g. {'2296669_manualy': ['2296669_manualy', 'abaumannii_2', '2']}
            mlst = run_annotate(assembly_dir, prokka_dir, sample_HSN)
            print("Annotation Done")
            with open(run_dir + '/mlst.json', 'w') as fp:
                json.dump(mlst, fp)
        if not found_genes:
            # WF_2: Abricate AMR screening.
            # found_genes maps HSN ->
            #   [GENE, %COV, %IDENT, DB_Used, Accession_Seq, Gene_Product, Resistance]
            found_genes = find_AMR_genes(sample_HSN, assembly_dir, abricate_dir)
            print("found AMR genes")
            with open(run_dir + '/found_genes.json', 'w') as fp:
                json.dump(found_genes, fp)
        # WF_3: push demographics, gene/antimicrobial data and MLST type to DB.
        run_DB_push(self.cache_path, sample_HSN, mlst, found_genes, Assembly_stats,
                    run_date, self.csv_path)
        print("Push data to DB")
        # WF_3.5: SNP calling and phylogenetics over the run's samples.
        run_WF_3_5(path_to_reads, sample_HSN, self.path_to_shuffled_reads, run_date,
                   self.path_to_referance_genome, self.path_to_snp_output)
        print("Sequences Aligned")
        # WF_4: per-run PDF report (phylo tree + SNP heat map + gene tables).
        run_create_PDF(sample_HSN, run_date, self.path_to_pdf_output, self.cache_path,
                       found_genes, mlst, self.path_to_snp_output)
        print("Report Generated!")
        self.clean_up_temp_files(run_date)

    def run_phylo_build(self, path_to_reads, run_date):
        """Run only assembly + SNP/phylogenetics (tree-building shortcut).

        Kept because users frequently ask for a tree without the full
        annotation/AMR/report pipeline.
        """
        # WF_0 returns (sample_HSN, assembly_stats); the original assigned the
        # whole tuple to sample_HSN, which would have broken run_WF_3_5.
        sample_HSN, _assembly_stats = run_assembly(self.cache_path, path_to_reads,
                                                   self.assembly_output,
                                                   self.busco_output, run_date)
        # WF_3.5: SNP calling and phylogenetics.
        run_WF_3_5(path_to_reads, sample_HSN, self.path_to_shuffled_reads, run_date,
                   self.path_to_referance_genome, self.path_to_snp_output)
        print("\n\n\n")
        print("Tree Bulit")

    def clean_up_temp_files(self, run_date):
        """Delete this run's intermediate output directories.

        Keeps only the DB rows, the JSON checkpoints and the final report.
        """
        # assembly files
        shutil.rmtree(self.assembly_output + "/" + run_date)
        # BUSCO QC output
        shutil.rmtree(self.busco_output + "/" + run_date)
        # Abricate AMR output
        shutil.rmtree(self.abricate_output + "/" + run_date)
        # Prokka annotation output
        shutil.rmtree(self.prokka_output + "/" + run_date)
        # per-strain scratch files under the report directory
        shutil.rmtree(self.path_to_pdf_output + "/" + run_date + "/strain_temp")
        # shuffled reads staged for the SNP docker container
        shutil.rmtree(self.path_to_shuffled_reads)
        # SNP pipeline output (consider keeping only the MSA files)
        shutil.rmtree(self.path_to_snp_output + "/" + run_date)

    def import_json(self, path, run_date):
        """Load checkpointed stage results for a resumed run.

        path     -- the run's checkpoint directory
        run_date -- unused; retained for interface compatibility

        Returns (sample_HSN, Assembly_stats, mlst, found_genes); each element
        is False when its stage has not completed yet.  Checkpoints are
        written in the order sample_HSN -> mlst -> found_genes, so each file
        is only read once every earlier one is confirmed present (the
        original opened earlier files unconditionally and would crash on a
        partially-written checkpoint directory).
        """
        sample_HSN = False
        Assembly_stats = False
        mlst = False
        f_genes = False
        if os.path.exists(path + "/sample_HSN.json"):
            with open(path + "/sample_HSN.json") as json_file:
                sample_HSN = json.load(json_file)
            with open(path + '/assembly_stats.json') as json_file:
                Assembly_stats = json.load(json_file)
            if os.path.exists(path + "/mlst.json"):
                with open(path + "/mlst.json") as json_file:
                    mlst = json.load(json_file)
                if os.path.exists(path + "/found_genes.json"):
                    with open(path + "/found_genes.json") as json_file:
                        f_genes = json.load(json_file)
        return sample_HSN, Assembly_stats, mlst, f_genes
if __name__ == "__main__":
    # CLI: CRAB_Analysis_Runner.py <reads_dir> <run_date> [CDC|tree]
    # Repo root = directory containing this script, minus the trailing
    # "scripts" component.
    dir_path = "/".join(os.path.dirname(os.path.realpath(__file__)).split("/")[:-1])
    print(sys.argv)
    input_path = sys.argv[1]
    rundate = sys.argv[2]
    # Optional third argument selects the pipeline variant.  Catch only
    # IndexError: the original bare except also swallowed SystemExit and
    # KeyboardInterrupt.
    try:
        pipeline = sys.argv[3]
    except IndexError:
        print("NO CDC")
        pipeline = ""
    print(input_path)
    print("-----------")
    print(rundate)
    print("-----------")
    print(pipeline)
    CRAB_p = CRAB_pipeline_worker(dir_path)
    if pipeline == "CDC":
        CRAB_p.run_CDCphoenix(input_path, rundate)
    elif pipeline == "tree":
        CRAB_p.run_phylo_build(input_path, rundate)
    else:
        # Default: the full in-house pipeline.
        CRAB_p.run_pipeline(input_path, rundate)