-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathprocess-illumina-file.py
More file actions
112 lines (72 loc) · 3.07 KB
/
process-illumina-file.py
File metadata and controls
112 lines (72 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# process-illumina.py
# Driver script to run the alignment pipeline for Illumina WGS data:
# goes from fastq.gz to CRAM + GVCF.  All real work is delegated to the
# project-local `dogmap` module; this script only wires up arguments,
# guards against accidental reruns, and records timing in the log.
import dogmap
import os
import sys
import argparse
import time

# SETUP: command-line interface
parser = argparse.ArgumentParser(description='process Illumina Reads')
parser.add_argument('--table', type=str,help='sample fastq table',required=True)
parser.add_argument('--ref', type=str,help='genome fasta with dictionary and .fai',required=True)
parser.add_argument('--refBWA', type=str,help='genome fasta BWA-MEM2 index',required=True)
parser.add_argument('-t', type=int,help='number of threads to use',required=True)
parser.add_argument('--tmpdir', type=str,help='tmp dir for running',required=True)
parser.add_argument('--finaldir', type=str,help='final dir for output',required=True)
parser.add_argument('--knownsites', type=str,help='vcf of known sites for BQSR',required=True)
args = parser.parse_args()
#####################################################################
# myData is the single dictionary threaded through every dogmap step,
# used for keeping and passing information between stages.
myData = {}
# table rows are: sampleName LibraryName ReadGroupID fastq1 fastq2
myData['sampleTable'] = args.table
myData['tmpDir'] = args.tmpdir
myData['finalDir'] = args.finaldir
myData['ref'] = args.ref
myData['refBWA'] = args.refBWA
myData['threads'] = args.t
myData['knownSitesVCF'] = args.knownsites

# Normalize directory paths to end in '/' so later string concatenation
# forms valid paths.  endswith() also tolerates an empty-string argument,
# which the old [-1] indexing would have crashed on.
if not myData['tmpDir'].endswith('/'):
    myData['tmpDir'] += '/'
if not myData['finalDir'].endswith('/'):
    myData['finalDir'] += '/'

# Sample name is the table file's base name up to the first '.'
myData['sampleName'] = myData['sampleTable'].split('/')[-1].split('.')[0]
myData['logFileName'] = myData['finalDir'] + myData['sampleName'] + '.map.log'
myData['completeToken'] = myData['finalDir'] + myData['sampleName'] + '.map.complete'

# If the log file already exists there was (at least partial) prior
# processing, so we do not want to silently redo and overwrite.
# Safest to quit and let the user deal with it.
if os.path.isfile(myData['logFileName']):
    print('ERROR!!!')
    print('%s exists. Do you really want to rerun this pipeline?' % myData['logFileName'] )
    print('ERROR!!!')
    sys.exit(1)  # nonzero status so schedulers/wrappers see the failure

myData['logFile'] = open(myData['logFileName'],'w')

# Add initial info to log.
# NOTE(review): myData['tStart'] is read at the end of this script but never
# set here — presumably dogmap.init_log() records it; confirm.
dogmap.init_log(myData)

# Pipeline stages, in order (all implemented in dogmap):
dogmap.setup_sample_table(myData)    # parse the sample fastq table
dogmap.check_prog_paths(myData)      # make sure required programs are available
dogmap.check_dir_space(myData)       # verify there is room to run
dogmap.run_bwa_mem2_table(myData)    # run alignment from the table...
dogmap.run_mdspark(myData)           # duplicate-marking step
dogmap.run_bqsr(myData)              # base quality score recalibration
dogmap.run_haplotypecaller(myData)   # produce the GVCF

# Clean up, write the completion token, and record elapsed time.
dogmap.remove_tmp_dir(myData)
cmd = 'touch %s ' % myData['completeToken']
print(cmd,flush=True)
myData['logFile'].write(cmd + '\n')
dogmap.runCMD(cmd)

myData['endTime'] = time.localtime()
myData['tEnd'] = time.time()
t = time.strftime("%a, %d %b %Y %H:%M:%S", myData['endTime'])
myData['logFile'].write('\nEnded!\n%s\n' % t)

# time.time() differences are in SECONDS (the old comment guessed
# nanoseconds); divide by 3600 to report hours.
elapsedTime = (myData['tEnd'] - myData['tStart']) / 3600.0
myData['logFile'].write('Elapsed time:\n%s hours\n' % elapsedTime)
myData['logFile'].close()