-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcollapsefragments.py
More file actions
executable file
·144 lines (124 loc) · 4.29 KB
/
collapsefragments.py
File metadata and controls
executable file
·144 lines (124 loc) · 4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
import re
import sys
import os.path
import itertools
import subprocess
from tdrdbutils import *
from distutils.spawn import find_executable
import time
import argparse
from collections import defaultdict
def readmultifastq(fqfile, fullname = False):
#print chrom+":"+ chromstart+"-"+ chromend
if fqfile == "stdin":
fqfile = sys.stdin
elif fqfile.endswith(".gz"):
fqfile = gzip.open(fqfile, "rb")
else:
fqfile = open(fqfile, "r")
currloc = 0
currseq = None
sequence = ""
quality = ""
if fullname:
reheader = re.compile(r"\@(.+)")
else:
reheader = re.compile(r"\@([^\s\,]+)")
qualheader = re.compile(r"\+([^\s\,]*)")
readqual = False
for line in fqfile:
#print line
line = line.rstrip("\n")
seqheader = reheader.match(line)
qheader = qualheader.match(line)
if readqual:
quality += line
if len(quality) == len(sequence):
yield currseq, sequence, quality
readqual = False
elif seqheader:
currseq = seqheader.groups(1)[0]
sequence = ""
quality = ""
elif qheader and readqual == False:
readqual = True
pass
else:
sequence += line
class prunedict:
def __init__(self, trimcutoff = 10):
self.counts = defaultdict(int)
self.trimcutoff = trimcutoff
self.totalkeys = 0
self.trimmed = 0
def trimold(self):
#print >>sys.stderr, "**"
if self.trimcutoff is not None:
newdict = defaultdict(int)
trimmed = 0
for curr in self.counts.iterkeys():
if self.counts[curr] > self.trimcutoff:
newdict[curr] = self.counts[curr]
else:
trimmed += 1
self.trimmed += trimmed
print >>sys.stderr, str(trimmed)+"/"+str(self.totalkeys)+" at "+str(self.trimcutoff)
self.totalkeys = len(self.counts.keys())
self.counts = newdict
def trim(self):
#print >>sys.stderr, "**"
if self.trimcutoff is not None:
newdict = defaultdict(int)
trimmed = 0
for curr in self.counts.iterkeys():
if self.counts[curr] > self.trimcutoff:
newdict[curr] = self.counts[curr]
else:
trimmed += 1
self.trimmed += trimmed
self.totalkeys = len(self.counts.keys())
print >>sys.stderr, str(trimmed)+"/"+str(self.totalkeys)+" at "+str(self.trimcutoff)
self.counts = newdict
def __getitem__(self, key):
return self.counts[key]
def __setitem__(self, key, count):
if key not in self.counts:
self.totalkeys += 1
self.counts[key] = count
def iterkeys(self):
return self.counts.keys()
parser = argparse.ArgumentParser(description='Generate fasta file containing mature tRNA sequences.')
parser.add_argument('--skipprune', action="store_true", default=False,
help='Sum samples that are replicates')
parser.add_argument('--trimcutoff', type=int, default=10,
help='margin to add to feature coordinates')
parser.add_argument("fastqfile")
args = parser.parse_args()
argdict = vars(args)
trimcutoff = None
skipprune = argdict["skipprune"]
fastqfile = argdict["fastqfile"]
if not skipprune:
trimcutoff = int(argdict["trimcutoff"])
#skipprune = True
mincounts = 0
#maxseqs = 100000
seqcount = None
seqcount = prunedict(trimcutoff = trimcutoff)
total = 0
for name, seq, qual in readmultifastq(fastqfile):
seqcount[seq] += 1
total += 1
#if total % 100000 == 0:
#print >>sys.stderr, str(total)
if not skipprune:
seqcount.trim()
#print >>sys.stderr, "results: "+str(len(seqcount.counts.keys()))+"/"+str(total)+":"+str(1.*len(seqcount.counts.keys())/total)
#print >>sys.stderr, "totaltrimmed: "+str(seqcount.trimmed)
#print >>sys.stderr, "finalcutoff: "+str(seqcount.trimcutoff)
#print >>sys.stderr, "maxkeys: "+str(seqcount.maxkeys)
for i, curr in enumerate(seqcount.iterkeys()):
if seqcount[curr] > mincounts:
print ">frag"+str(i+1)+":"+str(seqcount[curr])
print curr