-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathover-sample.py
More file actions
56 lines (46 loc) · 1.48 KB
/
over-sample.py
File metadata and controls
56 lines (46 loc) · 1.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
"""
over-sample.py: use data duplication to convert unbalanced to balanced data
usage: over-sample.py < file
note: expected line format: label SPACE data
20181122 erikt(at)xs4all.nl
"""
import sys
# Remove the first entry (the script name) from sys.argv and keep it;
# it is used below as the prefix of the messages passed to sys.exit().
COMMAND = sys.argv.pop(0)
def getLabel(line):
    """Split a line into its label and the remaining data.

    The line is split on whitespace. The first token is the label; the
    remaining tokens are re-joined with single spaces and form the data.

    Returns a (label, data) tuple. If the tokens cannot be processed
    (for example the line contains no tokens at all), the program exits
    with an error message prefixed by COMMAND.
    """
    tokens = line.split()
    try:
        head = tokens.pop(0)
        rest = " ".join(tokens)
    except Exception as err:
        sys.exit(COMMAND + ": error: " + str(err))
    return (head, rest)
def printAll(data, label):
    """Print every item of data on its own line, preceded by label and a space."""
    for item in data:
        print(label, item)
def printPart(data, label, count):
    """Print the first `count` items of data, each preceded by label and a space.

    If `count` is larger than the number of items in data, the program
    exits with an error message prefixed by COMMAND.
    """
    if count > len(data):
        # count is an int: it must be converted explicitly, because
        # concatenating it directly to a str raises TypeError and would
        # hide the intended error message.
        sys.exit(COMMAND + ": data set is too small: " + label + " (" + str(count) + ")")
    for i in range(count):
        print(label, data[i])
def readFileFromStdin():
    """Read all lines from standard input and group their data by label.

    Each line is stripped of surrounding whitespace and split into a
    label and data with getLabel(). Returns a dict mapping every label
    to the list of data strings seen for it, in input order.
    """
    grouped = {}
    for rawLine in sys.stdin:
        label, lineData = getLabel(rawLine.strip())
        grouped.setdefault(label, []).append(lineData)
    return grouped
def getLargestSize(data):
    """Return the number of items of the largest label group in data.

    data maps each label to the list of its items. Returns 0 when data
    contains no labels (e.g. empty input), instead of failing on an
    empty sequence.
    """
    return max((len(items) for items in data.values()), default=0)
def printData(data, largestSize):
    """Print each label's items until largestSize lines are printed for it.

    For every label, the complete item list is printed repeatedly with
    printAll() as long as one more full copy stays below largestSize;
    the remaining lines are then filled with the first items of the list
    via printPart().
    """
    for label, rows in data.items():
        emitted = 0
        # Whole copies: stop as soon as another full copy would reach
        # or exceed the target size.
        while emitted + len(rows) < largestSize:
            printAll(rows, label)
            emitted += len(rows)
        # Top up with a leading slice of the items to hit the target.
        remaining = largestSize - emitted
        if remaining > 0:
            printPart(rows, label, remaining)
def main(argv):
    """Read labelled data from standard input and print it over-sampled.

    The argv parameter is accepted but not used. Returns None.
    """
    samples = readFileFromStdin()
    printData(samples, getLargestSize(samples))
# Run main() only when executed as a script; its return value becomes
# the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))