-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathmakeNonRedundantAS.py
More file actions
executable file
·341 lines (265 loc) · 12.6 KB
/
makeNonRedundantAS.py
File metadata and controls
executable file
·341 lines (265 loc) · 12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#!/lab/64/bin/python
# makeNonRedundantAS.py
# Author: Angela Brooks
# Program Completion Date:
# Modification Date(s):
"""From a table containing all AS events with p_vals, removes redundant events.
Assumes that all events were already considered significant.
"""
import sys
import optparse
import pdb
from helperFunctions import coordsOverlap
from getASEventReadCounts import convertCoordStr
import rpy2.robjects as robjects
#############
# CONSTANTS #
#############
INFINITY = 100000000000000000000
NA = "NA"
#################
# END CONSTANTS #
#################
###########
# CLASSES #
###########
class OptionParser(optparse.OptionParser):
"""
Adding a method for required arguments.
Taken from:
http://www.python.org/doc/2.3/lib/optparse-extending-examples.html
"""
def check_required(self, opt):
option = self.get_option(opt)
# Assumes the option's 'default' is set to None!
if getattr(self.values, option.dest) is None:
print "%s option not supplied" % option
self.print_help()
sys.exit(1)
###############
# END CLASSES #
###############
########
# MAIN #
########
def main():
opt_parser = OptionParser()
# Add Options. Required options should have default=None
opt_parser.add_option("--input",
dest="input",
type="string",
help="""Table containing PSI values for events. Last
column indicates the combined p-value.""",
default=None)
opt_parser.add_option("--output",
dest="output",
type="string",
help="""Resulting table with all redundancies
removed.""",
default=None)
opt_parser.add_option("--use_mad",
dest="use_mad",
action="store_true",
help="""Instead of a p-value, redundancies will be
resolved by taking the event with the largest
median absolute deviation (MAD). All value
columns should be PSI values.""",
default=False)
(options, args) = opt_parser.parse_args()
# validate the command line arguments
opt_parser.check_required("--input")
opt_parser.check_required("--output")
input_file = open(options.input)
output_file = open(options.output, "w")
use_mad = options.use_mad
as_type2event2pval = {}
as_type2chr2strand2redundantGroup2event = {}
for event in input_file:
event = formatLine(event)
if event.startswith("#"):
# Print header
output_file.write(event + "\n")
continue
buildDictionaries(event, as_type2event2pval,
as_type2chr2strand2redundantGroup2event, use_mad)
input_file.close()
# Merge redundant events
changeOccurred = True
while changeOccurred:
changeOccurred = mergeRedundantEvents(as_type2chr2strand2redundantGroup2event)
# Now remove redundant events from as_type2event2pval dictionary
removeRedundantEvents(as_type2chr2strand2redundantGroup2event, as_type2event2pval)
# Now print out the remaining events
for as_type in as_type2event2pval:
for event in as_type2event2pval[as_type]:
output_file.write(event + "\n")
output_file.close()
sys.exit(0)
############
# END_MAIN #
############
#############
# FUNCTIONS #
#############
def buildDictionaries(event, as_type2event2pval, as_type2chr2strand2redundantGroup2event,
use_mad):
line_list = event.split("\t")
if use_mad:
pval = get_mad(line_list)
else:
pval = float(line_list[-1])
as_type = line_list[1]
if as_type in as_type2event2pval:
if event in as_type2event2pval[as_type]:
print "WARNING: Duplicate event exists."
if pval > as_type2event2pval[as_type][event]:
# Lowest pval is selected
pval = as_type2event2pval[as_type][event]
as_type2event2pval[as_type][event] = pval
else:
as_type2event2pval[as_type] = {event:pval}
chr = line_list[3]
strand = line_list[4]
if as_type == "mutually_exclusive":
# Largest region will be in exclusion junctions
group_start, group_end = findLargestRegion(",".join([line_list[5],
line_list[6]]))
else:
group_start, group_end = findLargestRegion(line_list[5])
redundantRegion = (chr, strand, group_start, group_end)
updateRedundantDictionary(as_type2chr2strand2redundantGroup2event, as_type, redundantRegion,
event)
#ef convertCoordStr(coord_str):
# chr, start_str, end_str = coord_str.split("_")
# return chr, int(start_str), int(end_str)
def findLargestRegion(coords_string):
first_list = coords_string.split(";")
full_list = []
for elem1 in first_list:
for elem2 in elem1.split(","):
full_list.append(elem2)
leftmost = INFINITY
rightmost = -1
for coord in full_list:
chr, start, end = convertCoordStr(coord)
if start < leftmost:
leftmost = start
if end > rightmost:
rightmost = end
return leftmost, rightmost
def findMostSignEvent(event2pval, events):
most_sign_pval = 2
most_sign_event = None
for event in events:
if event2pval[event] < most_sign_pval:
most_sign_pval = event2pval[event]
most_sign_event = event
return most_sign_event
def fixMissingVals(vals_str):
return_vals = []
for val in vals_str:
if val != NA:
return_vals.append(val)
return return_vals
def formatLine(line):
line = line.replace("\r","")
line = line.replace("\n","")
return line
def get_mad(line_list):
vals_str = line_list[11:]
vals_str = fixMissingVals(vals_str)
vals = map(float, vals_str)
abs_mad_val = abs(robjects.r['mad'](robjects.FloatVector(vals))[0])
# Since the lowest p-value is supposed to be used for selecting the
# non-redundant set of events, the negative of the larget MAD value will
# give the event with the largets MAD value.
return -abs_mad_val
def mergeRedundantEvents(as_type2chr2strand2redundantGroup2event):
"""
Because of the construction of the dictionary, some of the redundant
groups are overlapping.
"""
for as_type in as_type2chr2strand2redundantGroup2event:
for chr in as_type2chr2strand2redundantGroup2event[as_type]:
for strand in as_type2chr2strand2redundantGroup2event[as_type][chr]:
for first_group in as_type2chr2strand2redundantGroup2event[as_type][chr][strand]:
for second_group in as_type2chr2strand2redundantGroup2event[as_type][chr][strand]:
if first_group == second_group:
continue
if coordsOverlap(first_group[2], first_group[3],
second_group[2], second_group[3]):
# Add sets together
new_set = as_type2chr2strand2redundantGroup2event[as_type][chr][strand][first_group].union(as_type2chr2strand2redundantGroup2event[as_type][chr][strand][second_group])
new_start = min(first_group[2], second_group[2])
new_end = max(first_group[3], second_group[3])
new_region = (first_group[0], first_group[1],
new_start, new_end)
if new_region in as_type2chr2strand2redundantGroup2event[as_type][chr][strand]:
as_type2chr2strand2redundantGroup2event[as_type][chr][strand][new_region].update(new_set)
else:
as_type2chr2strand2redundantGroup2event[as_type][chr][strand][new_region] = new_set
# Remove old sets and return with flag
if new_region != first_group:
del as_type2chr2strand2redundantGroup2event[as_type][chr][strand][first_group]
if new_region != second_group:
del as_type2chr2strand2redundantGroup2event[as_type][chr][strand][second_group]
return True
# If not returned from within the loop, then no change occurred
return False
def removeRedundantEvents(as_type2chr2strand2redundantGroup2event, as_type2event2pval):
"""
Returns the as_type2event2pval dictionary with redundant values removed
"""
for as_type in as_type2chr2strand2redundantGroup2event:
for chr in as_type2chr2strand2redundantGroup2event[as_type]:
for strand in as_type2chr2strand2redundantGroup2event[as_type][chr]:
for redundant_group in as_type2chr2strand2redundantGroup2event[as_type][chr][strand]:
if len(as_type2chr2strand2redundantGroup2event[as_type][chr][strand][redundant_group]) > 1:
most_sign_event = findMostSignEvent(as_type2event2pval[as_type],
as_type2chr2strand2redundantGroup2event[as_type][chr][strand][redundant_group])
for event in as_type2chr2strand2redundantGroup2event[as_type][chr][strand][redundant_group]:
if event == most_sign_event:
continue
# Delete all non-significant events
del as_type2event2pval[as_type][event]
def updateRedundantDictionary(as_type2chr2strand2redundantGroup2event, as_type, redundantRegion,
event):
this_chr = redundantRegion[0]
this_strand = redundantRegion[1]
if as_type in as_type2chr2strand2redundantGroup2event:
if not this_chr in as_type2chr2strand2redundantGroup2event[as_type]:
as_type2chr2strand2redundantGroup2event[as_type][this_chr] = {this_strand:{redundantRegion: set([event])}}
return
if not this_strand in as_type2chr2strand2redundantGroup2event[as_type][this_chr]:
as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand] = {redundantRegion: set([event])}
return
foundOverlap = False
for redundantGroup in as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand]:
if coordsOverlap(redundantGroup[2], redundantGroup[3],
redundantRegion[2], redundantRegion[3]):
foundOverlap = True
# Pick longest region as the key
region_start = min(redundantGroup[2], redundantRegion[2])
region_end = max(redundantGroup[3], redundantRegion[3])
# Copy the existing set of exon_names
cur_set = set(as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][redundantGroup])
# Add the current event
cur_set.add(event)
del as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][redundantGroup]
# Check for existing new group
new_group = (redundantGroup[0], redundantGroup[1],
region_start, region_end)
if new_group in as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand]:
as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][new_group].update(cur_set)
else:
as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][new_group] = cur_set
break
if not foundOverlap:
# Add this non overlapping region on its own
as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][redundantRegion] = set([event])
else:
as_type2chr2strand2redundantGroup2event[as_type] = {this_chr:{this_strand:{redundantRegion: set([event])}}}
#################
# END FUNCTIONS #
#################
if __name__ == "__main__": main()