-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathcollapsefragments.py
More file actions
executable file
·138 lines (119 loc) · 4.13 KB
/
collapsefragments.py
File metadata and controls
executable file
·138 lines (119 loc) · 4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python3
import re
import sys
import os.path
import itertools
import subprocess
from trnasequtils import *
from distutils.spawn import find_executable
import time
from collections import defaultdict
def readmultifastq(fqfile, fullname = False):
#print chrom+":"+ chromstart+"-"+ chromend
if fqfile == "stdin":
fqfile = sys.stdin
elif fqfile.endswith(".gz"):
fqfile = gzip.open(fqfile, "rt")
else:
fqfile = open(fqfile, "r")
currloc = 0
currseq = None
sequence = ""
quality = ""
if fullname:
reheader = re.compile(r"\@(.+)")
else:
reheader = re.compile(r"\@([^\s\,]+)")
qualheader = re.compile(r"\+([^\s\,]*)")
readqual = False
for line in fqfile:
#print line
line = line.rstrip("\n")
seqheader = reheader.match(line)
qheader = qualheader.match(line)
if readqual:
quality += line
if len(quality) == len(sequence):
yield currseq, sequence, quality
readqual = False
elif seqheader:
currseq = seqheader.groups(1)[0]
sequence = ""
quality = ""
elif qheader and readqual == False:
readqual = True
pass
else:
sequence += line
class prunedict:
def __init__(self, maxkeys = 1000000):
self.counts = defaultdict(int)
self.maxkeys = maxkeys
self.trimcutoff = max([10,self.maxkeys/100000])
self.totalkeys = 0
self.trimmed = 0
def trim(self, trimcutoff = None):
#print >>sys.stderr, "**"
newdict = defaultdict(int)
trimmed = 0
if trimcutoff is None:
trimcutoff = self.trimcutoff
for curr in self.counts.keys():
if self.counts[curr] > trimcutoff:
newdict[curr] = self.counts[curr]
else:
trimmed += 1
self.trimmed += trimmed
#print (str(trimmed)+"/"+str(self.totalkeys)+" at "+str(self.trimcutoff), file = sys.stderr)
self.totalkeys = len(list(self.counts.keys()))
self.counts = newdict
def __getitem__(self, key):
if key not in self.counts:
self.totalkeys += 1
return self.counts[key]
def __setitem__(self, key, count):
if key not in self.counts:
self.totalkeys += 1
self.counts[key] = count
if self.totalkeys > self.maxkeys:
self.trim()
#print >>sys.stderr, str(len(self.counts.keys())) +"/"+ str(len(self.newdict.keys())) +":"+str(1.*len(self.counts.keys())/len(self.newdict.keys()))
if self.totalkeys > self.maxkeys:
self.trimcutoff *= 1.1
def keys(self):
return list(self.counts.keys())
sampledata = samplefile(sys.argv[1])
samples = sampledata.getsamples()
seqcount = dict()
allmode = False
for currsample in samples:
maxseqs = 10000000
seqcount[currsample] = prunedict()
total = 0
for name, seq, qual in readmultifastq(sampledata.getfastq(currsample)):
seqcount[currsample][seq] += 1
total += 1
#if total % 100000 == 0:
#print >>sys.stderr, str(total)
if not allmode:
pass
#seqcount[currsample].trim(trimcutoff = 1000)
seqfile = open(sys.argv[2], "w")
allseqs = defaultdict(int)
totalreads = defaultdict(int)
for currsample in samples:
for currseq in seqcount[currsample].keys():
allseqs[currseq] += 1
totalreads[currseq] += seqcount[currsample][currseq]
maxmissing = 0
print("\t".join(samples))
for i, currseq in enumerate(allseqs.keys()):
if not allmode and (allseqs[currseq] < 2 or totalreads[currseq] < 40): #allseqs[currseq] < len(samples)
currmax = max(seqcount[currsample][currseq] for currsample in samples)
if currmax > maxmissing:
maxmissing = currmax
continue
seqname = "frag"+str(i+1)+"_"+str(len(currseq))
print(seqname+"\t"+ "\t".join(str(seqcount[currsample][currseq]) for currsample in samples))
print(">"+ seqname, file=seqfile)
print(currseq, file=seqfile)