1313# See the License for the specific language governing permissions and
1414# limitations under the License.
1515
16+ from ast import Num
1617import os .path
1718import glob
1819import shutil
1920
2021from math import inf
22+ from multiprocessing import Pool
2123from argparse import ArgumentParser
2224
2325from Bio import SeqIO , SeqRecord
@@ -83,21 +85,34 @@ def process_command(small_rna, adapter, front, anywhere, cutoff, quiet):
8385
8486 do_log (quiet , '==> Completed command Process' )
8587
86- def sort_command (genome , small_rna , cds , min_length , max_length , quiet ):
88+ def sort_command (genome , small_rna , cds , min_length , max_length , num_mismatches , disable_align , threads , quiet ):
8789 '''
8890 Code to run when the user chooses the sort command
8991 '''
9092
91- if not validate_file (genome , 'fasta' ):
93+ if not disable_align and not validate_file (genome , 'fasta' ):
9294 print (f'Error: expected a genome in FASTA format, got { genome } ' )
95+
96+ # Add note if genome positional argument has been missed
97+ if genome is None :
98+ print ('It looks like you missed the Genome FASTA argument, try adding one to the end of your command' )
99+
93100 return False
94101
95- if not validate_file (small_rna , 'fastq' ):
96- print (f'Error: expected a small RNA FASTQ with at least one sequence, got { small_rna } ' )
102+ if validate_file (small_rna , 'fastq' ):
103+ small_rna_filetype = 'fastq'
104+ elif validate_file (small_rna , 'fasta' ):
105+ small_rna_filetype = 'fasta'
106+ else :
107+ print (f'Error: expected a small RNA FASTQ or FASTA with at least one sequence, got { small_rna } ' )
97108 return False
98109
99110 do_log (quiet , '==> Starting command Sort' )
100- new_fastq = align_to_genome (genome , small_rna , cds , quiet = quiet )
111+ if not disable_align :
112+ new_fastq = align_to_genome (genome , small_rna , cds , threads = threads , small_rna_filetype = small_rna_filetype , mismatches = num_mismatches , quiet = quiet )
113+ else :
114+ new_fastq = small_rna
115+
101116 table_file = bin_rna_size (new_fastq , min_length , max_length , quiet = quiet )
102117
103118 graph_length (table_file )
@@ -106,7 +121,7 @@ def sort_command(genome, small_rna, cds, min_length, max_length, quiet):
106121
107122def extractnc_command (genome , gff , quiet ):
108123 '''
109- Code to run when the user chooses to extract the noncoding mRNA reigon
124+ Code to run when the user chooses to extract the noncoding mRNA region
110125 '''
111126
112127 if not validate_file (genome , 'fasta' ):
@@ -127,7 +142,23 @@ def extractnc_command(genome, gff, quiet):
127142
128143 do_log (quiet , '==> Completed command extractNC' )
129144
130- def unitas_command (small_rna_path , species_name , ref_seqs , cds , unspliced_transcriptome , quiet ):
145+ # Parallelism bits for unitas
146+ # initialize worker processes
def init_worker(a, b, c, d):
    '''
    Pool initializer: store the shared unitas settings as module globals

    Meant to be passed as the ``initializer`` of a ``multiprocessing.Pool`` so
    that every worker process has these values available before it starts
    receiving tasks; ``unitas_threads`` reads them back when it runs.

    a -- stored in the global ``species_name``
    b -- stored in the global ``ref_seqs``
    c -- stored in the global ``quiet``
    d -- stored in the global ``UNITAS_OUTPUT``
    '''
    # Without this declaration the assignments below would only create locals
    global species_name, ref_seqs, quiet, UNITAS_OUTPUT

    species_name = a
    ref_seqs = b
    quiet = c
    UNITAS_OUTPUT = d
155+
156+
157+ # easiest way to implement this quickly
def unitas_threads(small_rna):
    '''
    Run the unitas annotation for a single small RNA file

    Takes only the file path so it can be mapped over a list of files by a
    worker pool. The species name, reference sequences, quiet flag and output
    directory are read from the module globals that ``init_worker`` sets.
    Whatever ``run_unitas_annotation`` returns is discarded.
    '''
    run_unitas_annotation(
        small_rna,
        species_name,
        ref_seqs,
        quiet=quiet,
        unitas_output=UNITAS_OUTPUT,
    )
160+
161+ def unitas_command (small_rna_path , species_name , ref_seqs , cds , unspliced_transcriptome , threads , quiet ):
131162 '''
132163 Code to run when the user chooses the unitas command
133164 '''
@@ -159,14 +190,16 @@ def unitas_command(small_rna_path, species_name, ref_seqs, cds, unspliced_transc
159190
160191 mkdir_if_not_exists (UNITAS_OUTPUT )
161192
162- for small_rna in glob .glob (os .path .join (small_rna_path , '*.fastq' )):
163- run_unitas_annotation (small_rna , species_name , ref_seqs , quiet = quiet , unitas_output = UNITAS_OUTPUT )
193+ small_rna_list = glob .glob (os .path .join (small_rna_path , '*.fastq' ))
194+
195+ with Pool (threads , initializer = init_worker , initargs = (species_name , ref_seqs , quiet , UNITAS_OUTPUT ,)) as p :
196+ p .map (unitas_threads , small_rna_list )
164197
165198 table_path = merge_summary ()
166199 graph_unitas_classification_type (table_path )
167200 do_log (quiet , '==> Completed command Unitas' )
168201
169- def targetid_command (small_rna , targets , min_seq_length , mismatches_allowed , quiet ):
202+ def targetid_command (small_rna , targets , min_seq_length , mismatches_allowed , threads , quiet ):
170203 '''
171204 Code to run when the user chooses the targetid command
172205 '''
@@ -185,7 +218,7 @@ def targetid_command(small_rna, targets, min_seq_length, mismatches_allowed, qui
185218 print ('Error: You need to supply at least one target file with -t' )
186219
187220 revcomp_file = revcomp_input_file (small_rna , quiet = quiet )
188- sam_files = find_targets (revcomp_file , targets , min_seq_length = min_seq_length , mismatches_allowed = mismatches_allowed , quiet = quiet )
221+ sam_files = find_targets (revcomp_file , targets , threads = threads , min_seq_length = min_seq_length , mismatches_allowed = mismatches_allowed , quiet = quiet )
189222 build_summary_files (sam_files , quiet = quiet )
190223
191224 do_log (quiet , '==> Ending TargetID command' )
@@ -205,15 +238,17 @@ def main():
205238 parser_process .add_argument ('small_rna' , help = 'Path to FASTQ containing the small RNA' )
206239
207240 parser_sort = subparsers .add_parser ('sort' , help = 'Find RNAs that align to a genome and sort them by length' )
208- parser_sort .add_argument ('-d' , '--cds' , help = 'Optional CDS region, also align this to the CDS reigon as well as the genome' )
241+ parser_sort .add_argument ('-d' , '--cds' , help = 'Optional CDS region, also align this to the CDS region as well as the genome' )
209242 parser_sort .add_argument ('-l' , '--min-length' , help = 'Minimum length to bin' , type = int , default = - inf )
210243 parser_sort .add_argument ('-x' , '--max-length' , help = 'Maximum length to bin' , type = int , default = inf )
244+ parser_sort .add_argument ('-m' , '--ref-mismatches' , type = int , default = None , help = 'Number of mismatches to use in bowtie2, None for default behaviour' )
245+ parser_sort .add_argument ('--disable-alignment' , action = 'store_true' , help = 'Skip the alignment to the reference genome step' )
211246 parser_sort .add_argument ('small_rna' , help = 'Path to FASTQ containing the small RNA' )
212- parser_sort .add_argument ('genome' , help = 'Genome to align against' )
247+ parser_sort .add_argument ('genome' , nargs = '?' , default = None , help = 'Genome to align against' )
213248
214- parser_extractnc = subparsers .add_parser ('extractnc' , help = 'Extarct the noncoding reigon from a fasta with a GFF file' )
249+ parser_extractnc = subparsers .add_parser ('extractnc' , help = 'Extarct the noncoding region from a fasta with a GFF file' )
215250 parser_extractnc .add_argument ('genome' , help = 'FASTA containing the genome to extract from' )
216- parser_extractnc .add_argument ('gff_file' , help = 'GFF file containing annotations of CDS and mRNA reigons ' )
251+ parser_extractnc .add_argument ('gff_file' , help = 'GFF file containing annotations of CDS and mRNA regions ' )
217252
218253 parser_unitas = subparsers .add_parser ('unitas' , help = 'Run unitas on split files and merge results' )
219254 parser_unitas .add_argument ('-d' , '--cds' , help = 'Optional CDS region, passed to unitas' )
@@ -230,7 +265,7 @@ def main():
230265
231266 parser_all = subparsers .add_parser ('all' , help = 'Run process, sort and unitas one after the other' )
232267 parser_all .add_argument ('-a' , '--adapter' , help = 'Sequence of the adapter to remove from the 3\' end' )
233- parser_all .add_argument ('-d' , '--cds' , help = 'Optional CDS region, also align this to the CDS reigon as well as the genome' )
268+ parser_all .add_argument ('-d' , '--cds' , help = 'Optional CDS region, also align this to the CDS region as well as the genome' )
234269 parser_all .add_argument ('-g' , '--front' , help = 'Sequence of the adapter to remove from the 5\' end' )
235270 parser_all .add_argument ('-b' , '--anywhere' , help = 'Sequence of the adapters to remove from both ends' )
236271 parser_all .add_argument ('-c' , '--cutoff' , help = 'Quality cutoff to trin RNA sequences at' , default = 20 , type = int )
@@ -239,8 +274,10 @@ def main():
239274 parser_all .add_argument ('-r' , '--refseq' , help = 'References for use with unitas' , nargs = '*' , default = None )
240275 parser_all .add_argument ('-s' , '--species' , help = 'Species to set in unitas arguments' , default = 'x' )
241276 parser_all .add_argument ('-u' , '--unspliced-transcriptome' , help = 'Optional, unspliced transcriptome, passed to unitas' )
277+ parser_all .add_argument ('-m' , '--ref-mismatches' , type = int , default = None , help = 'Number of mismatches to use in bowtie2 when aligning to the genome, None for default behaviour' )
278+ parser_all .add_argument ('--disable-alignment' , action = 'store_true' , help = 'Skip the alignment to the reference genome step' )
242279 parser_all .add_argument ('small_rna' , help = 'Path to FASTQ containing the small RNA' )
243- parser_all .add_argument ('genome' , help = 'Genome to align against' )
280+ parser_all .add_argument ('genome' , nargs = '?' , default = None , help = 'Genome to align against' )
244281
245282 args = parser .parse_args ()
246283
@@ -262,6 +299,7 @@ def get_command_args(name):
262299 return None
263300
264301 mkdir_if_not_exists (get_config_key ('general' , 'output_directory' ))
302+ num_threads = get_config_key ('general' , 'threads' )
265303
266304 if args .command == 'process' :
267305 process_command (
@@ -280,6 +318,9 @@ def get_command_args(name):
280318 get_command_args ('cds' ),
281319 get_command_args ('min_length' ),
282320 get_command_args ('max_length' ),
321+ get_command_args ('ref_mismatches' ),
322+ get_command_args ('disable_alignment' ),
323+ num_threads ,
283324 get_command_args ('quiet' )
284325 )
285326
@@ -297,6 +338,7 @@ def get_command_args(name):
297338 get_command_args ('refseq' ),
298339 get_command_args ('cds' ),
299340 get_command_args ('unspliced_transcriptome' ),
341+ num_threads ,
300342 get_command_args ('quiet' )
301343 )
302344
@@ -306,6 +348,7 @@ def get_command_args(name):
306348 get_command_args ('target_files' ),
307349 get_command_args ('min_seq_length' ),
308350 get_command_args ('num_mismatches' ),
351+ num_threads ,
309352 get_command_args ('quiet' )
310353 )
311354
@@ -328,16 +371,22 @@ def get_command_args(name):
328371 get_command_args ('cds' ),
329372 get_command_args ('min_length' ),
330373 get_command_args ('max_length' ),
374+ get_command_args ('ref_mismatches' ),
375+ get_command_args ('disable_alignment' ),
376+ num_threads ,
331377 get_command_args ('quiet' )
332378 )
333379
334380 if out_code is not None :
335381 return
336-
382+
337383 unitas_command (
338384 os .path .join (get_config_key ('general' , 'output_directory' ), 'binned_rna' ),
339385 get_command_args ('species' ),
340386 get_command_args ('refseq' ),
387+ get_command_args ('cds' ),
388+ get_command_args ('unspliced_transcriptome' ),
389+ num_threads ,
341390 get_command_args ('quiet' )
342391 )
343392
0 commit comments