-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path01_gather-soft-clipped-persample.py
More file actions
145 lines (123 loc) · 4.32 KB
/
01_gather-soft-clipped-persample.py
File metadata and controls
145 lines (123 loc) · 4.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import genutils
import sys
import os.path
import numpy as np
from optparse import OptionParser
###############################################################################
def get_calls_from_vcf(vcfFileName):
calls = []
inFile = open(vcfFileName,'r')
for line in inFile:
if line[0] == '#':
continue
line = line.rstrip()
line = line.split()
c = line[0]
p = int(line[1])
#changed above for HGDP to accomodate the adding of begin/end coord's in get-filters-levels.py
calls.append([c,p])
inFile.close()
return calls
###############################################################################
USAGE = """
python gather-soft-clipped-persample.py --vcf <input vcf file> --bam <input bam file> --out <output read file>
Goes through VCF and identifies softclip reads near breakpoint
Imposes minimum filtering critiera
"""
parser = OptionParser(USAGE)
parser.add_option('--vcf',dest='vcfFile', help = 'input vcf file name')
parser.add_option('--bam',dest='bamFile', help = 'input bam file name')
parser.add_option('--out',dest='outFile', help = 'output file for soft clipped reads')
(options, args) = parser.parse_args()
if options.vcfFile is None:
parser.error('vcf file name not given')
if options.bamFile is None:
parser.error('BAM file name not given')
if options.outFile is None:
parser.error('reads output file name not given')
###############################################################################
inputVCFCalls = options.vcfFile
inputBAMFile = options.bamFile
outPutReadFile = options.outFile
# hardcoded values
min_mean_qual = 20
minSoftClip = 20
window_size= 200
max_check_pair_size_for_soft_clip = 3000
min_map_q = 20
offSet = 33
# will just add readlen to the starts to get intervals, this will be off by 1, but will match
# the output of RetroSeq
# get the calls
calls = get_calls_from_vcf(inputVCFCalls)
print 'Read in %i calls from VCF' % (len(calls))
outFile = open(outPutReadFile,'w')
rN = 0
for call in calls:
rN += 1
if rN % 100 == 0:
print 'Doing %i of %i ...' % (rN,len(calls))
c = call[0]
p = call[1]
b = p - window_size
e = p + window_size
reg = '%s:%i-%i' % (c,b,e)
# print reg
numReads = 0
numSC = 0
bamIn = genutils.open_bam_read(inputBAMFile,reg)
for line in bamIn:
numReads += 1
line = line.rstrip()
line = line.split()
samParse = genutils.parse_sam_line(line)
#get rid of things that are not good
if samParse['unMapped'] is True:
continue
if samParse['isDuplicate'] is True:
continue
if samParse['mapQ'] < min_map_q:
continue
if samParse['isPaired'] is False:
continue
if samParse['mateUnmapped'] is True:
continue
# check min soft clip
if samParse['cigarCounts']['S'] < minSoftClip:
continue
qual = samParse['qual']
cigar = samParse['cigar']
cExpand = genutils.expand_cigar(cigar)
if cExpand[0][1] == 'S':
n = cExpand[0][0]
qualPart = qual[0:n]
elif cExpand[-1][1] == 'S':
n = cExpand[-1][0]
qualPart = qual[(len(qual) - n) : ]
else:
print 'qhat??'
print line
print qual,cigar
print cExpand
sys.exit()
#check that soft clipped portion passes min qual
qualList = []
for c in qualPart:
i = ord(c) - offSet
qualList.append(i)
m = np.mean(qualList)
if m < min_mean_qual:
continue
# check if ins size is reasonable and has min softClip values
# not checking if softclip is near breakpoint, as will take all and hope that the
# assembler will figure it out for us
if (samParse['mateChrom'] == '=') and (abs(samParse['fragLen']) <= max_check_pair_size_for_soft_clip) and (samParse['cigarCounts']['S'] >= minSoftClip):
#print line
numSC += 1
c = samParse['chrom']
b = samParse['chromPos']
e = b + samParse['seqLen']
n = samParse['seqName']
outFile.write('%s\t%i\t%i\tSOFTCLIP\t%s\t%s\n' % (c,b,e,n,samParse['cigar']))
bamIn.close()
outFile.close()