-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathconvertbam.py
More file actions
executable file
·186 lines (146 loc) · 6.94 KB
/
convertbam.py
File metadata and controls
executable file
·186 lines (146 loc) · 6.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python3
import pysam
import sys
import argparse
import string
import itertools
from collections import defaultdict
import os.path
from trnasequtils import *
import subprocess
inputbam = sys.argv[1]
dbname = sys.argv[2]
bamfile = pysam.Samfile(inputbam, "rb" )
outfile = pysam.Samfile( "-", "w", template = bamfile )
#outfile = pysam.Samfile( outputbam, "wb", template = bamfile )
trnainfo = transcriptfile(dbname+"-trnatable.txt")
trnaloci = getnamedict(readbed(dbname+"-trnaloci.bed", includeintrons = True))
trnatranscripts = trnainfo.gettranscripts()
npad = 20
bam_cins = 1
bam_cdel = 2
'''
566645416..
566645491
chr13 23413247 23413320 tRNA-Val-AAC-2-2 1000 -
chr13 23285400 23285473 tRNA-Val-AAC-2-1 1000 +
Doesnt quite work with multiple bam indels
CIGAR strings are relative to the genome strand
chr11 59318766 59318852 tRNA-Arg-TCT-3-2 1000 + 59318766 59318852 0 2 37,36 0,50
chr9 131102354 131102445 tRNA-Arg-TCT-3-1 1000 - 131102354 131102445 0 2 38,35 0,56
Actual from the site:
chr9 131102354 131102445 tRNA-Arg-TCT-3-1 1000 - 131102354 131102445 0 2 36,37, 0,54,
'''
def cigarlength(cigar):
return sum(curr[1] for curr in cigar if curr[0] in set([0,bam_cins]))
def reverseintrons(introns, length):
for currint in reversed(introns):
yield tuple([length - currint[0] + 1, currint[1]])
cigarset = set()
uniquemode = False
for currmap in bamfile:
chromname = bamfile.getrname(currmap.tid)
readquals = currmap.query_qualities
readseq = currmap.query_sequence
readtags = currmap.get_tags()
origcigar = currmap.cigartuples
if chromname in trnatranscripts:
origstart = currmap.reference_start
for currlocusname in trnainfo.transcriptdict[chromname]:
currlocusmap = currmap
currlocus = trnaloci[currlocusname]
currlocusmap.reference_id = bamfile.get_tid(currlocus.chrom)
introns = list()
if currlocus.data["blockcount"] > 1:
lastblock = 0
for i in range(currlocus.data["blockcount"]):
currblocksize = int(currlocus.data["blocksizes"][i])
currblockstart = int(currlocus.data["blockstarts"][i])
if lastblock != 0:
introns.append([lastblock,currblockstart - lastblock])
lastblock += currblocksize
if len(origcigar) > 1:
#continue
pass
if currlocus.strand == '+':
currmap.reference_start = origstart - npad + currlocus.start
currmap.query_sequence = readseq
currmap.query_qualities = readquals
currmap.is_reverse = False
currpoint = origstart - npad
newcigar = list()
for i in range(len(introns)):
if currpoint >= introns[i][0]:
currmap.reference_start += introns[i][1]
currpoint += introns[i][1]
for currcigar in origcigar:
#print >>sys.stderr, currcigar
cigarset.add(currcigar[0])
if currcigar[0] in set([0,bam_cins]):
foundintron = False
for intronstart, intronlength in introns:
if currpoint < intronstart < currpoint + currcigar[1]:
firseglength = intronstart - currpoint
secseglength = currpoint + currcigar[1] - intronstart
newcigar.append(tuple([currcigar[0],firseglength]))
newcigar.append(tuple([bam_cdel,intronlength]))
newcigar.append(tuple([currcigar[0],secseglength]))
foundintron = True
if not foundintron:
newcigar.append(currcigar)
if currcigar[0] in set([0,bam_cdel]):
currpoint += currcigar[1]
else:
newcigar.append(currcigar)
if cigarlength(newcigar) != cigarlength(origcigar):
print(origcigar, file=sys.stderr)
print(newcigar, file=sys.stderr)
print("**||||", file=sys.stderr)
#sys.exit(1)
pass
currmap.cigartuples = newcigar
else:
#continue
currmap.reference_start = currlocus.end - (origstart - npad + cigarlength(origcigar) + sum(curr[1] for curr in introns))
currmap.query_sequence = revcom(readseq)
currmap.query_qualities = list(reversed(readquals))
currmap.is_reverse = True
currpoint = origstart - npad
newcigar = list()
for intronstart, intronlength in reverseintrons(introns, currlocus.length()):
if currpoint + cigarlength(origcigar) < intronstart:
currmap.reference_start += intronlength
currpoint -= intronlength
pass
for currcigar in reversed(origcigar):
cigarset.add(currcigar[0])
if currcigar[0] in set([0,bam_cins]):
foundintron = False
for intronstart, intronlength in introns:
if currpoint < intronstart < currpoint + currcigar[1]:
firseglength = intronstart - currpoint + 1
secseglength = currpoint + currcigar[1] - intronstart - 1
newcigar.append(tuple([currcigar[0],secseglength]))
newcigar.append(tuple([bam_cdel,intronlength]))
newcigar.append(tuple([currcigar[0],firseglength]))
foundintron = True
if not foundintron:
newcigar.append(currcigar)
if currcigar[0] in set([0,bam_cdel]):
currpoint += currcigar[1]
else:
newcigar.append(currcigar)
if cigarlength(newcigar) != cigarlength(origcigar):
#print >>sys.stderr, currmap
print("neg", file=sys.stderr)
print(origcigar, file=sys.stderr)
print(newcigar, file=sys.stderr)
print("**||", file=sys.stderr)
#sys.exit(1)
pass
currmap.cigartuples = newcigar
currmap.set_tags(readtags)
outfile.write(currmap)
else:
pass
outfile.write(currmap)