# sweep_and_cut.py
import timeit
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from bisect import bisect
def sweep_and_cut(path, column, sens_attr, num_of_buckets):
    """Sort the data by the query column, then cut bucket boundaries so that
    each sensitive group is spread evenly across num_of_buckets buckets."""
    df = pd.read_csv(path)
    start = timeit.default_timer()
    df = df.sort_values(column[0])
    t = df[column[0]].values
    G = df[sens_attr].values
    G_unique, G_count = np.unique(G, return_counts=True)
    n = df.shape[0]

    # Walk the rows in sorted order and give the k-th row of group g the
    # bucket id k // (|g| / num_of_buckets) + 1, i.e. split every group
    # into num_of_buckets equal-sized chunks.
    C = np.zeros(len(G_unique), dtype=int)  # rows of each group seen so far
    w = np.zeros(n, dtype=int)              # bucket id assigned to each row
    for i in range(n):
        g = np.where(G_unique == G[i])[0][0]
        w[i] = int(C[g] // (G_count[g] / num_of_buckets)) + 1
        C[g] += 1

    # Sweep once over the sorted rows: whenever the bucket id changes between
    # neighbours, cut a boundary at the midpoint of their column values and
    # record the bucket id on the left of the cut. The rightmost run's bucket
    # is appended without a boundary, so len(hash_buckets) == len(boundary) + 1.
    hash_buckets = []
    boundary = []
    for i in range(n - 1):
        if w[i] != w[i + 1]:
            hash_buckets.append(w[i])
            boundary.append((t[i] + t[i + 1]) / 2)
    hash_buckets.append(w[n - 1])

    stop = timeit.default_timer()
    return boundary, hash_buckets, stop - start
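
# Worked example (illustrative values, not taken from the original repository):
# suppose the sweep assigns bucket ids w = [1, 1, 2, 2] to sorted column
# values t = [1, 2, 3, 4]. The only change is between t=2 and t=3, so
# sweep_and_cut would produce boundary == [2.5] and hash_buckets == [1, 2].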
def query(q, boundary, hash_buckets):
    """Return the bucket id of query point q. boundary is sorted and one
    entry shorter than hash_buckets, so bisect never goes out of range."""
    return hash_buckets[bisect(boundary, q)]
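
# Continuing the example above: with boundary == [2.5] and hash_buckets ==
# [1, 2], query(2.0, ...) falls left of the cut and returns bucket 1, while
# query(3.7, ...) falls right of it and returns bucket 2.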
def fit_predict_eval_sweep(path_train, path_test, column, sens_attr_col, num_of_buckets):
    """Fit boundaries on the training data, hash the test rows, and report
    collision probability plus single and pairwise fairness gaps."""
    boundary, hash_buckets, _ = sweep_and_cut(path_train, column, sens_attr_col, num_of_buckets)
    test = pd.read_csv(path_test)
    bucket_contents = defaultdict(list)
    G = list(test[sens_attr_col].values)
    n = len(G)
    freq = Counter(G)
    sens_attr_values = np.unique(G)

    # Hash every test row into its bucket, keeping the sensitive attribute of
    # each row so per-group collision rates can be computed below.
    for _, row in test.iterrows():
        bucket = query(row[column[0]], boundary, hash_buckets)
        bucket_contents[bucket].append(row[sens_attr_col])

    # collision_prob: probability that two random test rows share a bucket.
    # collision_prob_single[v]: probability that a random row of group v
    # collides with a random row overall.
    # collision_prob_pairwise[v]: probability that two random rows of group v
    # share a bucket.
    collision_prob_single = defaultdict(int)
    collision_prob_pairwise = defaultdict(int)
    collision_prob = 0
    for bucket in bucket_contents.values():
        collision_prob += (len(bucket) / n) ** 2
        for val in sens_attr_values:
            collision_prob_single[val] += (bucket.count(val) / freq[val]) * (len(bucket) / n)
            collision_prob_pairwise[val] += (bucket.count(val) / freq[val]) ** 2

    # Fairness gaps: ratio of the most- to least-colliding group, minus one
    # (0 means every group collides at the same rate).
    max_collision_prob_single = np.max(
        [collision_prob_single[v] for v in sens_attr_values]
    )
    min_collision_prob_single = np.min(
        [collision_prob_single[v] for v in sens_attr_values]
    )
    single_fairness = (max_collision_prob_single / min_collision_prob_single) - 1

    max_collision_prob_pairwise = np.max(
        [collision_prob_pairwise[v] for v in sens_attr_values]
    )
    min_collision_prob_pairwise = np.min(
        [collision_prob_pairwise[v] for v in sens_attr_values]
    )
    pairwise_fairness = (max_collision_prob_pairwise / min_collision_prob_pairwise) - 1

    return collision_prob, single_fairness, pairwise_fairness
    # return collision_prob, collision_prob_single, collision_prob_pairwise
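
# --- Usage sketch ------------------------------------------------------------
# A minimal, hypothetical driver showing how the pieces fit together. The CSV
# paths ("train.csv", "test.csv"), the query column ("salary"), the sensitive
# attribute ("gender"), and the bucket count are placeholders, not values
# taken from the original repository.
if __name__ == "__main__":
    boundary, hash_buckets, elapsed = sweep_and_cut(
        "train.csv", ["salary"], "gender", num_of_buckets=8
    )
    print(f"built {len(hash_buckets)} buckets in {elapsed:.4f}s")
    print("bucket for q=50000:", query(50000, boundary, hash_buckets))

    collision_prob, single_fairness, pairwise_fairness = fit_predict_eval_sweep(
        "train.csv", "test.csv", ["salary"], "gender", num_of_buckets=8
    )
    print(f"collision prob   : {collision_prob:.4f}")
    print(f"single fairness  : {single_fairness:.4f}")
    print(f"pairwise fairness: {pairwise_fairness:.4f}")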