-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquantify-clusters.py
More file actions
executable file
·103 lines (77 loc) · 3.11 KB
/
quantify-clusters.py
File metadata and controls
executable file
·103 lines (77 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python2
import os
import sys
import zipcodetools
import gzip
from optparse import OptionParser
###############################################################################
USAGE = """
python quantify-clusters.py --families <table of families to consider>
--table <processed table of zip counds to quantify (.ziptable.gz)>
--editdistance <edit distance to merge zipcodes, default=1)
--out <output file name for quantification results>
Quantify observations for set of zipcodes
--families is output from select-clusters.py
Assumes --table file is gziped, and that column 3 is (proper) normalized fraction.
"""

# Required file-name suffix of the --table argument.
TABLE_SUFFIX = '.ziptable.gz'

# Number of table lines between progress messages during the scan.
PROGRESS_INTERVAL = 50000


def read_families(path):
    """Load the zipcode families to quantify.

    Each non-blank line must hold at least two whitespace-separated
    fields: the family ID and the family zipcode.  Extra fields are ignored.

    Args:
        path: name of the families file.

    Returns:
        A list of ``[family_id, zipcode, 0.0]`` lists in file order; the
        third slot is the depth accumulator filled by ``assign_depths``.

    Raises:
        ValueError: if a non-blank line has fewer than two fields.
    """
    families = []
    with open(path, 'r') as in_file:
        for line_num, line in enumerate(in_file, 1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                raise ValueError('%s line %i: expected <id> <zipcode>, got %r'
                                 % (path, line_num, line))
            families.append([fields[0], fields[1], 0.0])
    return families


def assign_depths(families, table_lines, edit_distance, score=None):
    """Add each table row's fractional depth to the first matching family.

    A row matches a family when ``score(family_zipcode, row_zipcode)`` is
    less than or equal to ``edit_distance``.  Families are tried in list
    order and only the first match receives the depth.

    Args:
        families: list of ``[family_id, zipcode, depth]`` lists; the depth
            slot of each entry is updated in place.
        table_lines: iterable of text lines whose first field is a zipcode
            and whose third field is the fractional depth.
        edit_distance: largest score that still counts as a match.
        score: callable taking ``(family_zipcode, row_zipcode)`` and
            returning a number; defaults to
            ``zipcodetools.score_num_missmatches``.

    Returns:
        A ``(total_depth, unassigned_depth)`` tuple: the summed depth of all
        rows and the summed depth of rows that matched no family.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            third field is not a number.
    """
    if score is None:
        # Resolved lazily so callers can supply their own scorer.
        score = zipcodetools.score_num_missmatches

    total_depth = 0.0
    unassigned = 0.0
    for line_num, line in enumerate(table_lines, 1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 3:
            raise ValueError('table line %i: expected at least 3 columns, '
                             'got %r' % (line_num, line))
        zipcode = fields[0]
        frac_depth = float(fields[2])
        total_depth += frac_depth

        for family in families:
            if score(family[1], zipcode) <= edit_distance:
                family[2] += frac_depth
                break
        else:
            # No family was within the allowed distance.
            unassigned += frac_depth

        if line_num % PROGRESS_INTERVAL == 0:
            print('... Line number %i' % line_num)
    return total_depth, unassigned


def write_results(out_file, families, not_assigned_id, unassigned):
    """Write one tab-separated row per family plus the NotAssigned row.

    Args:
        out_file: writable text file object.
        families: list of ``[family_id, zipcode, depth]`` lists.
        not_assigned_id: integer ID written on the final NotAssigned row.
        unassigned: depth that matched no family.
    """
    for family_id, zipcode, depth in families:
        out_file.write('%s\t%s\t%.8f\n' % (family_id, zipcode, depth))
    out_file.write('%i\tNotAssigned\t%.8f\n' % (not_assigned_id, unassigned))


def main(argv=None):
    """Parse the command line and quantify zipcode families.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Exits with a non-zero status (via ``parser.error`` or ``sys.exit``) on
    missing options, a --table name without the expected suffix, an empty
    families file, or a non-integer ID on the last family.
    """
    parser = OptionParser(USAGE)
    parser.add_option('--families', dest='familiesFile',
                      help='zipcode families file')
    parser.add_option('--table', dest='zipTable',
                      help='zipcode table table (.ziptable.gz)')
    parser.add_option('--out', dest='outFile', help='output file')
    parser.add_option('--editdistance', dest='editDistance', type='int',
                      default=1,
                      help='edit distance to merge zipcodes in clustering')
    options, _ = parser.parse_args(argv)

    if options.familiesFile is None:
        parser.error('familiesFile not given')
    if options.zipTable is None:
        parser.error('zipTable not given')
    if options.outFile is None:
        parser.error('outFile not given')
    ###########################################################################
    if not options.zipTable.endswith(TABLE_SUFFIX):
        # Reported like every other argument problem: stderr, status 2.
        parser.error('name error -- expect --table file to end in \'%s\''
                     % TABLE_SUFFIX)

    print('edit distance set to %s' % options.editDistance)

    # read in the zipcode families
    families = read_families(options.familiesFile)
    if not families:
        sys.exit('no zipcode families found in %s' % options.familiesFile)
    # The NotAssigned row is numbered one past the last family; check the ID
    # now so a bad families file fails before the long table scan.
    try:
        not_assigned_id = int(families[-1][0]) + 1
    except ValueError:
        sys.exit('last family ID %r in %s is not an integer'
                 % (families[-1][0], options.familiesFile))
    print('Setup %i families to match\n' % len(families))

    with gzip.open(options.zipTable, 'r') as table_file:
        total_depth, unassigned = assign_depths(
            families, table_file, options.editDistance)

    # Written only after the scan succeeds so a failure cannot leave a
    # truncated result file behind.
    with open(options.outFile, 'w') as out_file:
        out_file.write('#Edit distance %i\n' % options.editDistance)
        out_file.write('#Zipcode families %s\n' % options.familiesFile)
        out_file.write('#Zipcode counts %s\n' % options.zipTable)
        write_results(out_file, families, not_assigned_id, unassigned)

    print('Total depth encountered: %.8f' % total_depth)
    print('Not assigned: %.8f' % unassigned)


if __name__ == '__main__':
    main()