-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsequence_modifier.py~
More file actions
124 lines (114 loc) · 5.4 KB
/
sequence_modifier.py~
File metadata and controls
124 lines (114 loc) · 5.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import random
__author__ = 'mwelland'
class Modifier:
""" This class creates single nucleotide changes in each exon of the input sequence
The exons are located using the exon coordinates from the input file
Random is used to choose a random location within the exon and a new base
The base substitution is made and the position is recorded in a dict
"""
def __init__(self, file_dict, type):
self.type = type
self.dict = file_dict
self.output_dict = {}
self.genomic = file_dict['full sequence']
self.padding = file_dict['offset']
self.transcript_dict = {}
self.transcript_pos = 0
def run_modifier(self):
"""
Main control method for the class
:return: the completed dictionary object
"""
for transcript in self.dict['transcripts']:
self.transcript_pos = 0
self.transcript_dict = self.dict['transcripts'][transcript]
self.output_dict[transcript] = {'variants': {}, 'exons': {}}
exon_list = self.transcript_dict['list_of_exons']
self.output_dict[transcript]['exon list'] = exon_list
for exon in exon_list:
try:
start = self.transcript_dict['exons'][exon]['genomic_start']
end = self.transcript_dict['exons'][exon]['genomic_end']
b_or_a = self.transcript_dict['exons'][exon]['cds']
old_cds = self.transcript_dict['exons'][exon]['old_cds_offset']
self.modify(exon, transcript, start, end, b_or_a, old_cds)
except IndexError:
print 'The index was out of range, line 43 seq_mod'
for exon in exon_list:
start = self.transcript_dict['exons'][exon]['genomic_start']
end = self.transcript_dict['exons'][exon]['genomic_end']
exon_seq = self.genomic[start - self.padding: end + self.padding]
self.output_dict[transcript]['exons'][exon] = {'start': start,
'end': end,
'padded seq': exon_seq,
'padded length': len(exon_seq),
'length': end - start}
return self.output_dict
def modify(self, exon_number, transcript, start, end, b_or_a, old_cds):
"""
Method to execute substitutions. This is called for each exon number.
This creates a single substitution per exon.
:param exon_number:
:param transcript:
"""
if self.type == 'lrg':
start -= 1
old_exon = self.genomic[start:end]
edit_coord = random.randint(start, end-1)
base = str(random.sample(['A', 'C', 'G', 'T'], 1)[0])
old_base = self.genomic[edit_coord]
while base == old_base:
base = str(random.sample(['A', 'C', 'G', 'T'], 1)[0])
self.genomic[edit_coord] = base
# Identify the position of the variant within the transcript (For HGVS)
cds_delay = self.transcript_dict['cds_offset']
p_length = self.transcript_dict['protein_length'] + 3 # For stop codon length
exon_length = end - start
"""
Put in something here to deal with if the exon is pre-coding
"""
# Edit coord + 1 to compensate for 0-base counting
variant_pos = (edit_coord + 1) - start
hgvs_pos = (self.transcript_pos + variant_pos) - cds_delay
"""
This is the correct hgvs nomenclature for annotation by annovar
"""
if hgvs_pos <= 0:
hgvs = 'c.%d%s>%s' % (hgvs_pos - 1, old_base, base)
elif hgvs_pos > p_length:
after_coding = hgvs_pos - p_length # compensate for no 0s
hgvs = 'c.*%d%s>%s' % (after_coding, old_base, base)
else:
hgvs = 'c.%s%d%s' % (old_base, hgvs_pos, base)
self.transcript_pos += exon_length
"""
# Un comment this section for positional debugging
print 'start: %d, end: %d' % (start, end)
print 'Length: %d' % (exon_length)
print 'for gene: %s, file: %s cds: %d' % (self.dict['genename'], self.dict['filename'], cds_delay)
print 'position %d in exon %d' % (variant_pos, exon_number)
print 'changed %s to %s' % (old_base, base)
print 'HGVS: %s' % hgvs
print 'Exon: \n%s' % self.genomic[start:end]
new_exon = self.genomic[start:end]
i = 0
while i <= len(new_exon):
base1 = old_exon[i]
base2 = new_exon[i]
if base1 == base2:
i += 1
else:
print base1
print base2
print 'position: %d' % (i + 1)
j = (i + 1) - cds_delay
if j <= 0:
j -= 1
print 'Assumed HGVS: c.%s%d%s' % (base1, j, base2)
break
this = raw_input()
"""
self.output_dict[transcript]['variants'][exon_number] = {'position': edit_coord,
'new base': base,
'hgvs': hgvs,
'transcript': self.transcript_dict['NM_number']}