-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathfilter_remapped_reads.py
More file actions
executable file
·181 lines (140 loc) · 6.56 KB
/
filter_remapped_reads.py
File metadata and controls
executable file
·181 lines (140 loc) · 6.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import argparse
import sys
import pysam
def parse_options():
parser = argparse.ArgumentParser(description="This program checks "
"whether reads that overlap SNPs map "
"back to the same location as the "
"original reads after their alleles "
"are flipped by the "
"find_intersecting_snps.py script. "
"Reads where one or more allelic versions "
"map to a different location (or fail "
"to map) are discarded. Reads that are "
"kept are written to the specified "
"keep_bam output file. Reads in the "
"input remap_bam file are expected to "
"have read names encoding the original "
"map location and number of allelic "
"variants. Specifically, the read names "
"should be delimited with the '.' "
"character and "
"contain the following fields: "
"<orig_name>.<coordinate>."
"<read_number>.<total_read_number>. "
"These read names are "
"generated by the "
"find_intersecting_snps.py script.")
parser.add_argument("to_remap_bam", help="input BAM file containing "
"original set of reads that needed to "
"be remapped after having their alleles flipped."
" This file is output by the find_intersecting_snps.py "
"script.")
parser.add_argument("remap_bam", help="input BAM file containing "
"remapped reads (with flipped alleles)")
parser.add_argument("keep_bam", help="output BAM file to write "
"filtered set of reads to")
return parser.parse_args()
def filter_reads(remap_bam):
# dictionary to keep track of how many times a given read is observed
read_counts = {}
# names of reads that should be kept
keep_reads = set([])
bad_reads = set([])
for read in remap_bam:
if read.is_secondary:
# only keep primary alignments and discard 'secondary' alignments
continue
# parse name of read, which should contain:
# 1 - the original name of the read
# 2 - the coordinate that it should map to
# 3 - the number of the read
# 4 - the total number of reads being remapped
words = read.qname.split(".")
if len(words) < 4:
raise ValueError("expected read names to be formatted "
"like <orig_name>.<coordinate>."
"<read_number>.<total_read_number> but got "
"%s" % read.qname)
# token separator '.' can potentially occur in
# original read name, so if more than 4 tokens,
# assume first tokens make up original read name
orig_name = ".".join(words[0:len(words)-3])
coord_str, num_str, total_str = words[len(words)-3:]
num = int(num_str)
total = int(total_str)
correct_map = False
if '-' in coord_str:
# paired end read, coordinate gives expected positions for each end
c1, c2 = coord_str.split("-")
if not read.is_paired:
bad_reads.add(orig_name)
continue
if not read.is_proper_pair:
bad_reads.add(orig_name)
continue
pos1 = int(c1)
pos2 = int(c2)
if pos1 < pos2:
left_pos = pos1
right_pos = pos2
else:
left_pos = pos2
right_pos = pos1
# only use left end of reads, but check that right end is in
# correct location
if read.pos < read.next_reference_start:
if pos1 == read.pos+1 and pos2 == read.next_reference_start+1:
# both reads mapped to correct location
correct_map = True
else:
# this is right end of read
continue
else:
# single end read
pos = int(coord_str)
if pos == read.pos+1:
# read maps to correct location
correct_map = True
if correct_map:
if orig_name in read_counts:
read_counts[orig_name] += 1
else:
read_counts[orig_name] = 1
if read_counts[orig_name] == total:
# all alternative versions of this read
# mapped to correct location
if orig_name in keep_reads:
raise ValueError("saw read %s more times than "
"expected in input file" % orig_name)
keep_reads.add(orig_name)
# remove read from counts to save mem
del read_counts[orig_name]
else:
# read maps to different location
bad_reads.add(orig_name)
return keep_reads, bad_reads
def write_reads(to_remap_bam, keep_bam, keep_reads, bad_reads):
keep_count = 0
bad_count = 0
discard_count = 0
for read in to_remap_bam:
if read.qname in bad_reads:
bad_count += 1
elif read.qname in keep_reads:
keep_count += 1
keep_bam.write(read)
else:
discard_count += 1
sys.stderr.write("keep_reads: %d\n" % keep_count)
sys.stderr.write("bad_reads: %d\n" % bad_count)
sys.stderr.write("discard_reads: %d\n" % discard_count)
def main(to_remap_bam_path, remap_bam_path, keep_bam_path):
to_remap_bam = pysam.Samfile(to_remap_bam_path)
remap_bam = pysam.Samfile(remap_bam_path)
keep_bam = pysam.Samfile(keep_bam_path, "wb", template=to_remap_bam)
keep_reads, bad_reads = filter_reads(remap_bam)
write_reads(to_remap_bam, keep_bam, keep_reads, bad_reads)
if __name__ == "__main__":
options = parse_options()
main(options.to_remap_bam, options.remap_bam, options.keep_bam)