-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchDL.py
More file actions
161 lines (117 loc) · 4.54 KB
/
benchDL.py
File metadata and controls
161 lines (117 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import random
import json
# ms: list of group labels (the demographic groups a row can belong to)
# ps: list of proportions, one per group (fractions of the dataset)
def gen_same_dist_dataset(ms, s):
    """Draw s rows uniformly at random from the groups in ms.

    Each row is an (id, group) tuple with ids 0..s-1. Every dataset built
    this way is sampled from the same (uniform) group distribution.
    """
    random.seed(a=None, version=2)
    rows = []
    for rid in range(s):
        rows.append((rid, random.choice(ms)))
    return rows
def gen_rand_dist_dataset(ms, s):
    """Draw s id-tagged rows from a freshly randomized group distribution.

    A random integer weight is drawn per group, normalized into proportions,
    and rows are then sampled from that weighted distribution.
    """
    random.seed(a=None, version=2)
    # One random (unnormalized) weight per group.
    raw = [random.randint(1, len(ms)) for _ in ms]
    total = sum(raw)
    weights = [w / total for w in raw]
    # Attach sequential ids while sampling.
    return [(rid, random.choices(ms, weights, k=1)[0]) for rid in range(s)]
def gen_one_minority_dist_dataset(ms, s, mingroup=0):
    """Sample s id-tagged rows where the slot at index mingroup is the minority.

    Starts from a uniform distribution, repeatedly shifts random mass out of
    slot 0 into every other slot (making slot 0 the smallest), then swaps
    slot 0's mass into the requested minority slot.

    NOTE(review): mingroup is used as an index into the probability list, but
    one caller passes a group *value* (random.choice(ms)) — only valid when
    ms is a list of small ints; confirm against callers.
    """
    random.seed(a=None, version=21)
    k = len(ms)
    ps = [1.0 / k] * k
    for idx in range(1, k):
        shift = random.uniform(ps[0] / (k * (k - 1)), ps[0] / (k - 1))
        ps[idx] += shift
        ps[0] -= shift
    print('minority ps')
    # Move the (smallest) slot-0 mass onto the requested minority slot.
    ps[mingroup], ps[0] = ps[0], ps[mingroup]
    print(ps)
    return [(rid, random.choices(ms, ps, k=1)[0]) for rid in range(s)]
def gen_one_majority_dist_dataset(ms, s):
    """Sample s id-tagged rows where the first group dominates the rest.

    Starts from a uniform distribution and moves a random slice of mass from
    every other slot into slot 0, making group ms[0] the majority.
    """
    random.seed(a=None, version=21)
    k = len(ms)
    ps = [1.0 / k] * k
    for idx in range(1, k):
        shift = random.uniform(ps[0] / (k * (k - 1)), ps[0] / (k - 1))
        ps[idx] -= shift
        ps[0] += shift
    print('majority ps')
    print(ps)
    return [(rid, random.choices(ms, ps, k=1)[0]) for rid in range(s)]
def gen_dist_dataset(ms, ps, s):
    """Build one dataset of ~s rows with an explicit group distribution.

    For each group ms[im], append int(ps[im] * s) rows (id, group) with
    consecutive ids, so the resulting proportions match ps up to integer
    truncation.

    Bug fixed: the group value was read from the undefined name `m`
    (`m[im]`) instead of the parameter `ms`, raising NameError whenever a
    group contributed at least one row.
    """
    random.seed(a=None, version=2)  # kept for parity with the other generators
    vs = []
    for im in range(len(ms)):
        # Ids for this demographic continue where the previous group stopped.
        base = len(vs)
        vs.extend((base + i, ms[im]) for i in range(int(ps[im] * s)))
    return vs
def add_rand_cost(ds, min_cost, max_cost):
    """Assign each dataset dict an independent random integer 'cost' in
    [min_cost, max_cost].

    The dicts are mutated in place; a list of the same dict objects is
    returned for convenience.
    """
    priced = []
    for record in ds:
        record['cost'] = random.randint(min_cost, max_cost)
        priced.append(record)
    return priced
def add_uniform_cost(ds, min_cost, max_cost):
    """Assign one shared random integer 'cost' in [min_cost, max_cost] to
    every dataset dict (mutated in place; returned as a list).

    Bug fixed: `random.randin` was a typo for `random.randint`, so every
    call raised AttributeError.
    """
    # Draw the cost once so all datasets share the same value.
    cost = random.randint(min_cost, max_cost)
    cds = []
    for d in ds:
        d['cost'] = cost
        cds.append(d)
    return cds
def add_equi_cost(ds):
    """Set a flat unit 'cost' on every dataset dict.

    Mutation happens in place; a fresh list holding the same dict objects is
    returned for convenience.
    """
    priced = []
    for record in ds:
        record['cost'] = 1
        priced.append(record)
    return priced
def satisfy_cond(ms, cs, st):
    """Build extra rows guaranteeing at least cs[i] rows of group ms[i].

    Row ids start at st; returns [] when no constraint is given (cs is None).

    Idiom fix: `cs == None` replaced with `cs is None` (identity check for
    the None singleton).

    NOTE(review): ids are computed as st + i*j, so rows of different groups
    can share an id (every j == 0 row gets id st) — confirm whether ids must
    be unique here.
    """
    if cs is None:
        return []
    return [(st + i * j, ms[i]) for i in range(len(ms)) for j in range(cs[i])]
def gen_datalake(n, ms, min_row, max_row, min_cost, max_cost, cost=1, datadist='same-dist', costdist='random', mingroups=None, s=None, ps=None, mingroupid=0):
    """Generate a synthetic data lake: n datasets over groups ms, with costs.

    Parameters:
        n          -- number of datasets to generate.
        ms         -- list of group labels.
        min_row, max_row   -- bounds on the per-dataset row count when s is None.
        min_cost, max_cost -- bounds passed to the cost assigners.
        cost       -- unused here; kept for interface compatibility.
        datadist   -- 'random' | 'same' (alias 'same-dist') | 'minority' |
                      'minority-randgroup' | 'majority' | 'minmajorityratio'.
        costdist   -- 'random' | 'uniform' | 'equi'.
        mingroups  -- optional per-group minimum row counts, enforced by
                      appending satisfy_cond rows.
        s          -- explicit row count per dataset; drawn at random when None.
        ps         -- for 'minmajorityratio': (majority, minority) dataset
                      fractions of n.
        mingroupid -- unused here; kept for interface compatibility.

    Returns the list of dataset dicts ({'id', 'data', 'groups', 'cost'}).

    Raises ValueError on an unrecognized datadist/costdist. Bugs fixed: the
    default datadist 'same-dist' matched no branch and silently produced an
    empty lake (now treated as an alias of 'same'); an unknown costdist
    crashed with UnboundLocalError instead of a clear error.
    """
    if s is None:
        # Reserve room for the mandatory minimum-group rows, if any.
        if mingroups is None:
            s = random.randint(min_row, max_row)
        else:
            s = random.randint(min_row - sum(mingroups), max_row - sum(mingroups))
    if datadist == 'random':
        print('datadist = random')
        ds = [{'id': i, 'data': gen_rand_dist_dataset(ms, s) + satisfy_cond(ms, mingroups, s), 'groups': ms} for i in range(n)]
    elif datadist in ('same', 'same-dist'):
        print('datadist = same')
        # All datasets share one distribution.
        ds = [{'id': i, 'data': gen_same_dist_dataset(ms, s) + satisfy_cond(ms, mingroups, s), 'groups': ms} for i in range(n)]
    elif datadist == 'minority':
        print('datadist = minority')
        ds = [{'id': i, 'data': gen_one_minority_dist_dataset(ms, s), 'groups': ms} for i in range(n)]
    elif datadist == 'minority-randgroup':
        print('datadist = minority-randgroup')
        # NOTE(review): passes a group *value* as the minority index — only
        # valid when ms is a list of small ints; confirm against callers.
        ds = [{'id': i, 'data': gen_one_minority_dist_dataset(ms, s, random.choice(ms)), 'groups': ms} for i in range(n)]
    elif datadist == 'majority':
        print('datadist = majority')
        ds = [{'id': i, 'data': gen_one_majority_dist_dataset(ms, s), 'groups': ms} for i in range(n)]
    elif datadist == 'minmajorityratio':
        print('datadist = minmajorityratio')
        # ps gives the percentage of majority and minority datasets.
        ds = [{'id': i, 'data': gen_one_majority_dist_dataset(ms, s), 'groups': ms} for i in range(int(n * ps[0]))]
        j = len(ds)
        ds.extend([{'id': j + i, 'data': gen_one_minority_dist_dataset(ms, s), 'groups': ms} for i in range(int(n * ps[1]))])
    else:
        raise ValueError('unknown datadist: %s' % datadist)
    if costdist == 'random':
        return add_rand_cost(ds, min_cost, max_cost)
    if costdist == 'uniform':
        return add_uniform_cost(ds, min_cost, max_cost)
    if costdist == 'equi':
        return add_equi_cost(ds)
    raise ValueError('unknown costdist: %s' % costdist)
# n, ms, min_row, max_row, min_cost, max_cost, cost=1, dist='random', cost='random', s=None, ps=None
#ds = gen_datalake(3, 5, [0,1], 2, 5, 1, 3, None, 'random', 'random', ps=None)
#print(ds)
#json.dump(ds, open('data_synthetic/binary_repo.json', 'w'))