-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontigsparser.py
More file actions
208 lines (184 loc) · 7.25 KB
/
contigsparser.py
File metadata and controls
208 lines (184 loc) · 7.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 07 10:11:09 2014
Deals with contigs generated by IBDA-UD -> exports them all into a single contig file
@author: Freeman
"""
import os
cwd = os.getcwd()
experiment = [] #a list of all the bargroups
class bargroup(object):
def __init__ (self, barcode = "", numreads = "", seqs = []):
self.barcode = barcode
self.numreads = int(numreads)
self.seqs = seqs
return
def outpickle(exp, outfile):
import cPickle as pickle
pickle.dump(exp, outfile)
print "experiment pickled"
return
def inpickle(infile):
import cPickle as pickle
exp = pickle.load(infile)
print "experiment imported"
return exp
def onefasta (infile): #this function inputs contigs that are all in one fasta
seqs = []
barcode = "ALL"
num_contigs = 0
for line in infile:
#store the reads
if line[0] != ">":
num_contigs += 1
seqs.append(line.strip())
print "%d contigs found"%num_contigs
experiment.append(bargroup(barcode, len(seqs), seqs = seqs))
return
def concatenate (path): #takes all the separate contigs and concatenates them into one fasta
contigs_per_bar=[] #keeps track of how many contigs were found in each barcode
for folder in os.listdir(path):
num_contigs = 0
if os.path.isdir("%s%s"%(path,folder)) and folder.find("fasta") != -1:
#go into folder, open up contig.fa, store it into memory
rev = folder.find(".fasta")
numreads = folder[15:rev]
seqs = []
try:
infile = open("%s%s/contig.fa"%(path, folder))
for line in infile:
#store the reads
if line[0] != ">":
num_contigs += 1
seqs.append(line.strip())
experiment.append(bargroup(folder[0:15], numreads, seqs))
contigs_per_bar.append(num_contigs)
except:
print "contig.fa not found"
else:
print "folder unexpected! %s" %folder
print "is it a folder?", os.path.isdir(folder)
#print folder.find("fasta")
dictionary = {}
for n in contigs_per_bar:
if n in dictionary:
dictionary[n] += 1
else:
dictionary[n] = 1
print dictionary
return
def trim_contigs(front, back): #trims the contigs
for group in experiment:
for seq in group.seqs:
seq = seq[front:len(seq)-back]
print "seqs trimmed!"
return
def export (outfile): #exports the experiment as a single fasta file
num = 0
for group in experiment:
for seq in group.seqs:
outfile.write(">%s-%s-Len:%s\n%s\n" %(group.barcode, group.numreads, len(seq), seq))
num += 1
print "%d sequences written!" %num
outfile.close()
return
def export_lengths (outfile): #exports the contig lengths into a file for histogram
for group in experiment:
for seq in group.seqs:
outfile.write("%d\n"%len(seq))
print "lengths written!"
return
def reads_numcontigs(outfile): #outputs the # of reads vs num contigs for each group
for group in experiment:
outfile.write("%d \t %d\n"%(group.numreads, len(group.seqs)))
print "cov_numcontigs written!"
return
def bar_numcontigs(outfile): #outputs the barcode and the # of contigs from that bar
for group in experiment:
outfile.write("%s\t%d\n"%(group.barcode, len(group.seqs)))
print "bar_numcontigs file written!"
return
def reads_assemble_success(bins, experiment, outfile):
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plot
numreads = [] #used to determine the bins
histogram = {} #used to store bin:value
for bargroup in experiment:
numreads.append(bargroup.numreads)
n, bins, patches = plot.hist(numreads, bins, align = "left") #creates the bins
for b in bins: #initiate the histogram
histogram[b] = []
#add in the histogram data points for assembled = 1 or no assemblies = 0
for bargroup in experiment:
bindiffs = list(abs(bins - bargroup.numreads))
best_bin = bins[bindiffs.index(min(bindiffs))]
if len(bargroup.seqs) == 0:
histogram[best_bin].append(0)
else:
histogram[best_bin].append(1)
#convert the data points into a fraction which is assembly success
for b in histogram:
if len(histogram[b]) > 0:
histogram[b] = 1.0*sum(histogram[b])/len(histogram[b]) #convert the data to fractions
outfile.write("%f\t%f\n"%(b,histogram[b]))
print histogram
return
def entropy_assemble_success(bins, experiment, outfile, entropy_dict):
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plot
#add entropies to all the bargroups
for bargroup in experiment:
bargroup.entropy = entropy_dict[bargroup.barcode]
entropies = [] #used to determine the bins
histogram = {} #used to store bin:value
for bargroup in experiment:
entropies.append(bargroup.entropy)
n, bins, patches = plot.hist(entropies, bins, align = "left") #creates the bins
for b in bins: #initiate the histogram
histogram[b] = []
#add in the histogram data points for assembled = 1 or no assemblies = 0
for bargroup in experiment:
bindiffs = list(abs(bins - bargroup.entropy))
best_bin = bins[bindiffs.index(min(bindiffs))]
if len(bargroup.seqs) == 0:
histogram[best_bin].append(0)
else:
histogram[best_bin].append(1)
#convert the data points into a fraction which is assembly success
for b in histogram:
if len(histogram[b]) > 0:
histogram[b] = 1.0*sum(histogram[b])/len(histogram[b]) #convert the data to fractions
outfile.write("%f\t%f\n"%(b,histogram[b]))
print histogram
return
if __name__ == "__main__":
import timeit
import numpy as np
tic = timeit.default_timer()
path = "./"
#with open("%sallcontigs.fa"%path) as infile:
# onefasta(infile)
#with open("%sreadlengths.txt"%path, "w") as outfile:
# export_lengths(outfile)
#concatenate("%ssamfastas/"%path)
#trim_contigs(200, 200)
#print len(experiment)
#outfile = open("%s/allcontigs.fa"%path, "w")
#export(outfile)
#with open("%scov_contigs.txt"%path, 'w') as outfile:
# reads_numcontigs(outfile)
#with open("./allcontig-length.txt", "w") as outfile:
# export_lengths(outfile)
#with open("%sbarnumcontigs.txt"%path,"w") as outfile:
# bar_numcontigs(outfile)
#with open("%sentropy_dict.pickle"%path) as infile:
# entropy_dict = inpickle(infile)
#with open("%sreads_success.txt"%path, "w") as outfile:
# bins = list(10 ** np.linspace(np.log10(100), np.log10(1e5), 50))
# reads_assemble_success(bins, experiment, outfile)
#with open("%sentropy_success.txt"%path, "w") as outfile:
# entropy_assemble_success(10, experiment, outfile, entropy_dict)
toc = timeit.default_timer()
print "%d seconds processing time" % (toc-tic)