-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathadd-family-to-align-parse.py
More file actions
executable file
·136 lines (103 loc) · 3.59 KB
/
add-family-to-align-parse.py
File metadata and controls
executable file
·136 lines (103 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python2
import sys
import zipcodetools
from optparse import OptionParser
###############################################################################
# Usage text handed to optparse; it is shown for --help and on argument errors.
USAGE = """
python add-family-to-align-parse.py --alignparse <txt file from process-bam.py>
--zipfamilies <file of zip families to match>
--editdistance <edit distance to merge zipcodes, default=1>
Will output updated set to file in same dir.
"""

parser = OptionParser(USAGE)
parser.add_option('--alignparse', dest='alignParse',
                  help='align parse file')
parser.add_option('--zipfamilies', dest='zipFamilies',
                  help='zip families file')
parser.add_option('--editdistance', dest='editDistance', type='int', default=1,
                  help='edit distance to merge zipcodes in clustering')
(options, args) = parser.parse_args()

# Both input files are required.  parser.error() reports the message together
# with the usage text and terminates the script.
if options.alignParse is None:
    parser.error('alignParse not given')
if options.zipFamilies is None:
    parser.error('zipFamilies not given')
###############################################################################
# Load the known zipcode families from options.zipFamilies.
# The family zipcode is read from the second whitespace-separated column.
# Lines starting with '#' and rows whose zipcode is 'NotAssigned' are ignored.
# A non-blank row with fewer than two columns still raises IndexError.
print('reading in zip families from %s' % options.zipFamilies)
zipFamList = []  # family zipcodes in file order (scanned for near matches)
zipFamDict = {}  # the same zipcodes as keys, for fast exact-match tests
# 'with' guarantees the handle is closed even if a row fails to parse.
with open(options.zipFamilies, 'r') as inFile:
    for line in inFile:
        if line[0] == '#':
            continue
        fields = line.split()
        if not fields:
            # Blank line (e.g. trailing newline at end of file): nothing to read.
            continue
        zc = fields[1]
        if zc == 'NotAssigned':
            continue
        zipFamList.append(zc)
        zipFamDict[zc] = 1
print('Read in %i zip families' % len(zipFamList))
# The dict is shorter than the list when the file repeats a zipcode.
print('And the matching dict is %i long' % len(zipFamDict))
# Two-pass approach, pass one: collect every distinct zipcode (first
# whitespace-separated column) of the align-parse file, so that family matching
# is done once per unique zipcode instead of once per row.
observedZipSet = {}  # zipcode -> family; '' is a placeholder until assignment
# 'with' guarantees the handle is closed even if a row fails to parse.
with open(options.alignParse, 'r') as inFile:
    for line in inFile:
        if line[0] == '#':
            continue
        fields = line.split()
        if not fields:
            # Blank line: there is no zipcode column to record.
            continue
        observedZipSet[fields[0]] = ''
print('Read in %i unique zips to match' % len(observedZipSet))
print('Doing the preassignment...')
# Resolve every observed zipcode to a family zipcode, or to 'NotAssigned'.
# Exact matches are answered from zipFamDict; everything else is compared
# against the families in file order and takes the FIRST family whose mismatch
# score is within options.editDistance (not necessarily the closest one).
assigned = 0
notassigned = 0
for zc in observedZipSet:
    if zc in zipFamDict:
        # The zipcode is itself a family: no scan needed.
        observedZipSet[zc] = zc
        assigned += 1
        continue
    for famZip in zipFamList:
        # NOTE(review): score_num_missmatches presumably returns the number of
        # mismatching positions between the two zipcodes -- confirm in
        # zipcodetools.
        numMisMatches = zipcodetools.score_num_missmatches(famZip, zc)
        if numMisMatches <= options.editDistance:
            observedZipSet[zc] = famZip
            assigned += 1
            break
    else:
        # The scan finished without a break: no family was close enough.
        observedZipSet[zc] = 'NotAssigned'
        notassigned += 1
print('Assignment done:')
print('Has assignment: %i No assignment: %i' % (assigned, notassigned))
# Pass two: rewrite the align-parse file with the family zipcode prepended as a
# new first column.  Output goes to '<alignParse>.zipfamilies'.
numFound = 0
numNotFound = 0
ofn = options.alignParse + '.zipfamilies'
print('Matching to  %s' % ofn)
# 'with' closes both handles on every exit path, including errors.
with open(options.alignParse, 'r') as inFile, open(ofn, 'w') as outFile:
    # First output line records which families file was used.
    outFile.write('#%s\n' % options.zipFamilies)
    for line in inFile:
        line = line.rstrip()
        if line.startswith('#'):
            # Header line: add a name for the new leading column.
            outFile.write('#zipFamily\t' + line[1:] + '\n')
            continue
        fields = line.split()
        if not fields:
            # Blank line: no zipcode to look up, nothing to annotate.
            continue
        zipFam = observedZipSet[fields[0]]
        if zipFam == 'NotAssigned':
            numNotFound += 1
        else:
            numFound += 1
        outFile.write(zipFam + '\t' + line + '\n')
print('Num Matched in set %i' % numFound)
print('Num not matched in set %i' % numNotFound)