juncBASE/makeNonRedundantAS.py at master · anbrooks/juncBASE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#!/lab/64/bin/python
# makeNonRedundantAS.py
# Author: Angela Brooks
# Program Completion Date:
# Modification Date(s):
"""From a table containing all AS events with p_vals, removes redundant events.
Assumes that all events were already considered significant.
"""

import sys
import optparse
import pdb

from helperFunctions import coordsOverlap
from getASEventReadCounts import convertCoordStr

import rpy2.robjects as robjects
#############
# CONSTANTS #
#############
INFINITY = 100000000000000000000

NA = "NA"
#################
# END CONSTANTS #
#################


###########
# CLASSES #
###########
class OptionParser(optparse.OptionParser):
    """
    Adding a method for required arguments.
    Taken from:
    http://www.python.org/doc/2.3/lib/optparse-extending-examples.html
    """
    def check_required(self, opt):
        option = self.get_option(opt)

        # Assumes the option's 'default' is set to None!
        if getattr(self.values, option.dest) is None:
            print "%s option not supplied" % option
            self.print_help()
            sys.exit(1)


###############
# END CLASSES #
###############

########
# MAIN #
########
def main():

    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("--input",
                          dest="input",
                          type="string",
                          help="""Table containing PSI values for events.  Last
                                  column indicates the combined p-value.""",
                          default=None)
    opt_parser.add_option("--output",
                          dest="output",
                          type="string",
                          help="""Resulting table with all redundancies
                                  removed.""",
                          default=None)
    opt_parser.add_option("--use_mad",
                          dest="use_mad",
                          action="store_true",
                          help="""Instead of a p-value, redundancies will be
                                  resolved by taking the event with the largest
                                  median absolute deviation (MAD). All value
                                  columns should be PSI values.""",
                          default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("--input")
    opt_parser.check_required("--output")

    input_file = open(options.input)
    output_file = open(options.output, "w")

    use_mad = options.use_mad

    as_type2event2pval = {}

    as_type2chr2strand2redundantGroup2event = {}

    for event in input_file:
        event = formatLine(event)

        if event.startswith("#"):
            # Print header
            output_file.write(event + "\n")
            continue

        buildDictionaries(event, as_type2event2pval,
                          as_type2chr2strand2redundantGroup2event, use_mad)

    input_file.close()

    # Merge redundant events
    changeOccurred = True
    while changeOccurred:
        changeOccurred = mergeRedundantEvents(as_type2chr2strand2redundantGroup2event)

    # Now remove redundant events from as_type2event2pval dictionary
    removeRedundantEvents(as_type2chr2strand2redundantGroup2event, as_type2event2pval)

    # Now print out the remaining events
    for as_type in as_type2event2pval:
        for event in as_type2event2pval[as_type]:
            output_file.write(event + "\n")

    output_file.close()

    sys.exit(0)

############
# END_MAIN #
############

#############
# FUNCTIONS #
#############
def buildDictionaries(event, as_type2event2pval, as_type2chr2strand2redundantGroup2event,
                      use_mad):

    line_list = event.split("\t")

    if use_mad:
        pval = get_mad(line_list)
    else:
        pval = float(line_list[-1])
    as_type = line_list[1]

    if as_type in as_type2event2pval:
        if event in as_type2event2pval[as_type]:
            print "WARNING: Duplicate event exists."
            if pval > as_type2event2pval[as_type][event]:
                # Lowest pval is selected
                pval = as_type2event2pval[as_type][event]
        as_type2event2pval[as_type][event] = pval
    else:
        as_type2event2pval[as_type] = {event:pval}

    chr = line_list[3]
    strand = line_list[4]

    if as_type == "mutually_exclusive":
        # Largest region will be in exclusion junctions
        group_start, group_end = findLargestRegion(",".join([line_list[5],
                                                             line_list[6]]))
    else:
        group_start, group_end = findLargestRegion(line_list[5])

    redundantRegion = (chr, strand, group_start, group_end)

    updateRedundantDictionary(as_type2chr2strand2redundantGroup2event, as_type, redundantRegion,
                              event)

#ef convertCoordStr(coord_str):
#   chr, start_str, end_str = coord_str.split("_")

#   return chr, int(start_str), int(end_str)

def findLargestRegion(coords_string):

    first_list = coords_string.split(";")
    full_list = []
    for elem1 in first_list:
        for elem2 in elem1.split(","):
            full_list.append(elem2)

    leftmost = INFINITY
    rightmost = -1

    for coord in full_list:
        chr, start, end = convertCoordStr(coord)

        if start < leftmost:
            leftmost = start

        if end > rightmost:
            rightmost = end

    return leftmost, rightmost

def findMostSignEvent(event2pval, events):
    most_sign_pval = 2
    most_sign_event = None

    for event in events:
        if event2pval[event] < most_sign_pval:
            most_sign_pval = event2pval[event]
            most_sign_event = event

    return most_sign_event


def fixMissingVals(vals_str):

    return_vals = []

    for val in vals_str:
        if val != NA:
            return_vals.append(val)

    return return_vals

def formatLine(line):
    line = line.replace("\r","")
    line = line.replace("\n","")
    return line

def get_mad(line_list):
    vals_str = line_list[11:]

    vals_str = fixMissingVals(vals_str)

    vals = map(float, vals_str)

    abs_mad_val = abs(robjects.r['mad'](robjects.FloatVector(vals))[0])

    # Since the lowest p-value is supposed to be used for selecting the
    # non-redundant set of events, the negative of the larget MAD value will
    # give the event with the largets MAD value.
    return -abs_mad_val

def mergeRedundantEvents(as_type2chr2strand2redundantGroup2event):
    """
    Because of the construction of the dictionary, some of the redundant
    groups are overlapping.
    """
    for as_type in as_type2chr2strand2redundantGroup2event:
        for chr in as_type2chr2strand2redundantGroup2event[as_type]:
            for strand in as_type2chr2strand2redundantGroup2event[as_type][chr]:
                for first_group in as_type2chr2strand2redundantGroup2event[as_type][chr][strand]:
                    for second_group in as_type2chr2strand2redundantGroup2event[as_type][chr][strand]:
                        if first_group == second_group:
                            continue
                        if coordsOverlap(first_group[2], first_group[3],
                                         second_group[2], second_group[3]):

                            # Add sets together
                            new_set = as_type2chr2strand2redundantGroup2event[as_type][chr][strand][first_group].union(as_type2chr2strand2redundantGroup2event[as_type][chr][strand][second_group])
                            new_start = min(first_group[2], second_group[2])
                            new_end = max(first_group[3], second_group[3])

                            new_region = (first_group[0], first_group[1],
                                          new_start, new_end)

                            if new_region in as_type2chr2strand2redundantGroup2event[as_type][chr][strand]:
                                as_type2chr2strand2redundantGroup2event[as_type][chr][strand][new_region].update(new_set)
                            else:
                                as_type2chr2strand2redundantGroup2event[as_type][chr][strand][new_region] = new_set

                            # Remove old sets and return with flag
                            if new_region != first_group:
                                del as_type2chr2strand2redundantGroup2event[as_type][chr][strand][first_group]
                            if new_region != second_group:
                                del as_type2chr2strand2redundantGroup2event[as_type][chr][strand][second_group]

                            return True

    # If not returned from within the loop, then no change occurred
    return False

def removeRedundantEvents(as_type2chr2strand2redundantGroup2event, as_type2event2pval):
    """
    Returns the as_type2event2pval dictionary with redundant values removed
    """
    for as_type in as_type2chr2strand2redundantGroup2event:
        for chr in as_type2chr2strand2redundantGroup2event[as_type]:
            for strand in as_type2chr2strand2redundantGroup2event[as_type][chr]:
                for redundant_group in as_type2chr2strand2redundantGroup2event[as_type][chr][strand]:
                    if len(as_type2chr2strand2redundantGroup2event[as_type][chr][strand][redundant_group]) > 1:
                        most_sign_event = findMostSignEvent(as_type2event2pval[as_type],
                                                            as_type2chr2strand2redundantGroup2event[as_type][chr][strand][redundant_group])

                        for event in as_type2chr2strand2redundantGroup2event[as_type][chr][strand][redundant_group]:
                            if event == most_sign_event:
                                continue
                            # Delete all non-significant events
                            del as_type2event2pval[as_type][event]

def updateRedundantDictionary(as_type2chr2strand2redundantGroup2event, as_type, redundantRegion,
                              event):

    this_chr = redundantRegion[0]
    this_strand = redundantRegion[1]

    if as_type in as_type2chr2strand2redundantGroup2event:
        if not this_chr in as_type2chr2strand2redundantGroup2event[as_type]:
            as_type2chr2strand2redundantGroup2event[as_type][this_chr] = {this_strand:{redundantRegion: set([event])}}
            return
        if not this_strand in as_type2chr2strand2redundantGroup2event[as_type][this_chr]:
            as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand] = {redundantRegion: set([event])}
            return

        foundOverlap = False
        for redundantGroup in as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand]:
            if coordsOverlap(redundantGroup[2], redundantGroup[3],
                             redundantRegion[2], redundantRegion[3]):
                foundOverlap = True
                # Pick longest region as the key
                region_start = min(redundantGroup[2], redundantRegion[2])
                region_end = max(redundantGroup[3], redundantRegion[3])

                # Copy the existing set of exon_names
                cur_set = set(as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][redundantGroup])
                # Add the current event
                cur_set.add(event)

                del as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][redundantGroup]

                # Check for existing new group
                new_group = (redundantGroup[0], redundantGroup[1],
                             region_start, region_end)
                if new_group in as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand]:
                    as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][new_group].update(cur_set)
                else:
                    as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][new_group] = cur_set
                break
        if not foundOverlap:
            # Add this non overlapping region on its own
            as_type2chr2strand2redundantGroup2event[as_type][this_chr][this_strand][redundantRegion] = set([event])
    else:
        as_type2chr2strand2redundantGroup2event[as_type] = {this_chr:{this_strand:{redundantRegion: set([event])}}}

#################
# END FUNCTIONS #
#################
if __name__ == "__main__": main()