fairfs/dataset_loader.py at master · pnb/fairfs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# Load various datasets and provide them in a unified format for easy experimentation.
# Format of each dataset will be:
# {
#   dataset_name: {
#       'data': 2D Numpy array of feature values,
#       'labels': 1D Numpy array of labels (numeric; 0/1 integers when binarized),
#       'participant_ids': 1D Numpy array of ids for cross-validation,
#       'feature_names': 1D Numpy array of feature names (strings)
#   }
# }
import pandas as pd
import numpy as np
from scipy.io import arff
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing.reweighing import Reweighing


def get_uci_student_performance(median_split=True):
    """Load the UCI student performance data for the math and Portuguese courses.

    Both course files are recoded the same way: two-level attributes become 0/1
    indicators, multi-level nominal attributes are one-hot encoded (one column per
    observed level), yes/no attributes become 0/1, and the final grade G3 is min-max
    scaled.

    Args:
        median_split (bool): If True, replace the scaled G3 by 1 where it is strictly
            above its median and 0 elsewhere; if False, keep the scaled grade.

    Returns:
        dict: one entry per course, each with 'data', 'labels', 'participant_ids' and
            'feature_names'. The grade columns G1, G2 and G3 are excluded from the features.
    """
    grade_cols = ['G1', 'G2', 'G3']
    # (indicator column to create, source column, source value coded as 1)
    binary_recodes = [
        ('school_id', 'school', 'MS'),
        ('male', 'sex', 'M'),
        ('rural', 'address', 'R'),
        ('famsize_gt3', 'famsize', 'GT3'),
        ('parents_cohabitation', 'Pstatus', 'T'),
    ]
    nominal_cols = ['Mjob', 'Fjob', 'reason', 'guardian']
    yes_no_cols = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
                   'romantic']
    courses = {
        'math': pd.read_csv('data/uci_student_performance/student-mat.csv', sep=';'),
        'portuguese': pd.read_csv('data/uci_student_performance/student-por.csv', sep=';'),
    }

    datasets = {}
    for course, frame in courses.items():
        # Recode some attributes to make nicer features/labels.
        for new_col, src_col, positive in binary_recodes:
            frame[new_col] = (frame[src_col].values == positive).astype(int)
        for src_col in nominal_cols:
            for level in sorted(frame[src_col].unique()):
                frame[src_col + '_' + level] = (frame[src_col].values == level).astype(int)
            frame.drop(columns=src_col, inplace=True)
        for src_col in yes_no_cols:
            frame[src_col] = (frame[src_col].values == 'yes').astype(int)
        frame.drop(columns=[src_col for _, src_col, _ in binary_recodes], inplace=True)

        # Min-max scale the final grade, then optionally binarize it around the median.
        grades = frame.G3.values
        lowest, highest = grades.min(), grades.max()
        target = (grades - lowest) / (highest - lowest)
        if median_split:
            target = (target > np.median(target)).astype(int)
        frame['G3'] = target

        features = [name for name in frame if name not in grade_cols]
        datasets['uci_student_performance_' + course] = {
            'data': frame[features].values,
            'labels': frame.G3.values,  # Final grade.
            'participant_ids': np.arange(0, len(frame)),  # Every row is one student.
            'feature_names': np.array(features)
        }
    return datasets


def get_uci_student_academics(median_split=True):
    """Load the UCI student academics ARFF file as a numerically coded dataset.

    This is an ordinal regression dataset with several academic outcome variables:
      tnp and twp -- Grade in two exams: Best, Vg (very good), Good, Pass, Fail
      iap -- Grade on assessments leading up to the final exam
      esp -- End of semester grade
    Only esp is used here; it becomes the label ('final_grade').

    Args:
        median_split (bool): If True, replace the final grade by 1 where it is strictly
            above the median grade and 0 elsewhere; if False, keep the 0-4 ordinal code.

    Returns:
        dict: {'uci_student_academics': {'data', 'labels', 'participant_ids',
            'feature_names'}}.
    """
    records, _ = arff.loadarff('data/uci_student_academics/Sapfile1.arff')
    raw = pd.DataFrame(records)

    # Ordinal scales shared by several source attributes (ARFF nominal values load as bytes).
    grade_scale = {b'Best': 4, b'Vg': 3, b'Good': 2, b'Pass': 1, b'Fail': 0}
    income_scale = {b'Vh': 4, b'High': 3, b'Am': 2, b'Medium': 1, b'Low': 0}
    size_scale = {b'Large': 2, b'Average': 1, b'Small': 0}
    quality_scale = {b'Good': 2, b'Average': 1, b'Poor': 0}
    education_scale = {b'Il': 0, b'Um': 1, b'10': 2, b'12': 3, b'Degree': 4, b'Pg': 5}

    # (output column, source attribute, rule). A dict rule maps levels to ordinal codes;
    # a bytes rule yields a 0/1 indicator for that level. Output columns keep this order.
    # Not included: caste ('cst'), and marital status ('ms'), which has zero variance
    # (all unmarried).
    recipe = [
        ('final_grade', 'esp', grade_scale),
        ('male', 'ge', b'M'),
        ('overdue_papers', 'arr', b'Y'),
        ('rural', 'ls', b'V'),
        ('free_admission', 'as', b'Free'),
        ('family_income', 'fmi', income_scale),
        ('family_size', 'fs', size_scale),
        ('father_edu', 'fq', education_scale),
        ('mother_edu', 'mq', education_scale),
        ('father_occupation_service', 'fo', b'Service'),
        ('father_occupation_business', 'fo', b'Business'),
        ('father_occupation_retired', 'fo', b'Retired'),
        ('father_occupation_farmer', 'fo', b'Farmer'),
        ('father_occupation_others', 'fo', b'Others'),
        ('mother_occupation_service', 'mo', b'Service'),
        ('mother_occupation_business', 'mo', b'Business'),
        ('mother_occupation_retired', 'mo', b'Retired'),
        ('mother_occupation_housewife', 'mo', b'Housewife'),
        ('mother_occupation_others', 'mo', b'Others'),
        ('friends', 'nf', size_scale),
        ('study_habits', 'sh', quality_scale),
        ('previous_private_school', 'ss', b'Private'),
        ('instruction_medium_english', 'me', b'Eng'),
        ('instruction_medium_assamese', 'me', b'Asm'),
        ('instruction_medium_hindi', 'me', b'Hin'),
        ('instruction_medium_bengali', 'me', b'Ben'),
        ('travel_time', 'tt', size_scale),
        ('attendance', 'atd', quality_scale),
    ]
    coded = {}
    for out_col, src_col, rule in recipe:
        if isinstance(rule, dict):
            coded[out_col] = raw[src_col].map(rule)
        else:
            coded[out_col] = (raw[src_col] == rule).astype(int)
    processed = pd.DataFrame(coded)

    # Remove columns with little variance (column mean below 0.1).
    n_rows = len(processed)
    sparse_cols = [name for name in processed.columns
                   if processed[name].sum() / n_rows < .1]
    processed = processed.drop(columns=sparse_cols)

    if median_split:
        grades = processed.final_grade
        processed.final_grade = (grades > grades.median()).astype(int)

    features = [name for name in processed if name != 'final_grade']
    return {
        'uci_student_academics': {
            'data': processed[features].values,
            'labels': processed.final_grade.values,
            'participant_ids': np.arange(0, len(processed)),  # Every row is one student
            'feature_names': np.array(features)
        }
    }


def get_uci_adult():
    """Load the classic UCI "Adult" dataset (train and test files combined).

    Categorical attributes are one-hot encoded with the last level (in sorted order)
    left out as the reference level, and columns whose mean is below 0.1 (mostly
    zeros) are removed. The label is the indicator column 'outcome_<=50K'.

    Returns:
        dict: {'uci_adult': {'data', 'labels', 'participant_ids', 'feature_names'}}.
    """
    cols = ['age', 'workclass', 'final_weight', 'education', 'education_num', 'marital_status',
            'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
            'hours_per_week', 'native_country', 'outcome']
    categorical = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex',
                   'native_country', 'outcome']
    label_col = 'outcome_<=50K'

    # ignore_index gives every row a unique label; without it the two files' row labels
    # overlap, which makes any later index-aligned operation fragile.
    df = pd.concat([
        pd.read_csv('data/uci_adult/adult.data', sep=', ', names=cols, engine='python'),
        pd.read_csv('data/uci_adult/adult.test', skiprows=1, sep=', ', names=cols, engine='python')
    ], ignore_index=True).drop(columns='education')

    # The test file ends each outcome with a '.'; strip it so both files share the same
    # levels. regex=False makes the dot a literal on every pandas version.
    df['outcome'] = df['outcome'].str.replace('.', '', regex=False)

    # One-hot encode categorical variables. Indicators are collected first and attached in
    # one concat, rather than inserting many columns one by one into the frame.
    indicators = {}
    for col in categorical:
        for val in sorted(df[col].unique())[:-1]:  # Skip last value (reference level)
            indicators[col + '_' + val] = (df[col] == val).astype(int)
    df = pd.concat([df.drop(columns=categorical), pd.DataFrame(indicators)], axis=1)

    # Remove columns with mostly zeros. Names are collected before dropping so the frame is
    # not modified while its columns are being walked, and the label is never a candidate.
    n_rows = len(df)
    sparse_cols = [col for col in df.columns
                   if col != label_col and df[col].sum() / n_rows < .1]
    df = df.drop(columns=sparse_cols)

    features = [f for f in df if f != label_col]
    return {
        'uci_adult': {
            'data': df[features].values,
            'labels': df[label_col].values,
            'participant_ids': np.arange(0, len(df)),  # One row per person
            'feature_names': np.array(features)
        }
    }


def get_simulated_data():
    """Load the simple simulated classification dataset with unfair and fair features.

    Reads 'data/simulated_data.csv'; the 'outcome' column is the label and every other
    column is a feature.

    Returns:
        dict: {'simulated_data': {'data', 'labels', 'participant_ids', 'feature_names'}}.
    """
    frame = pd.read_csv('data/simulated_data.csv', header=0)
    predictors = frame.drop(columns='outcome')
    target = frame['outcome']
    feature_names = np.array([name for name in frame if name not in ['outcome']])
    entry = {
        'data': predictors.values,
        'labels': target.values,
        'participant_ids': np.arange(0, len(frame)),
        'feature_names': feature_names
    }
    return {'simulated_data': entry}


def get_transformed_data(dataset='data/simulated_data.csv',
                         protected_attribute='group'):
    """Load a CSV dataset and apply AIF360 reweighing with respect to a protected attribute.

    The CSV must contain an 'outcome' label column (1.0 is treated as the favorable label,
    0.0 as the unfavorable one) and the protected attribute column, in which 0 marks the
    unprivileged group and 1 the privileged group.

    Args:
        dataset (str): Path of the CSV file to load.
        protected_attribute (str): Name of the protected attribute column.

    Returns:
        dict: {'simulated_data': {'data', 'labels', 'participant_ids', 'feature_names',
            'instance_weights'}}. The entry is keyed 'simulated_data' whatever file was
            loaded. 'instance_weights' holds the per-row weights from the reweighed
            dataset; 'data' and 'labels' come from its dataframe export.
    """
    sample_data = pd.read_csv(dataset, header=0)

    pre_transform = BinaryLabelDataset(favorable_label=1.0, unfavorable_label=0.0,
                                       df=sample_data,
                                       label_names=['outcome'],
                                       protected_attribute_names=[protected_attribute])

    # Define the groups on the requested attribute. They used to be hard-coded to 'group',
    # so any other protected_attribute was registered on the dataset but never reweighed on.
    reweigher = Reweighing(unprivileged_groups=[{protected_attribute: 0}],
                           privileged_groups=[{protected_attribute: 1}])
    post_transform = reweigher.fit_transform(pre_transform)

    ds = post_transform.convert_to_dataframe()[0]
    features = [f for f in ds if f not in ['outcome']]
    return {
        'simulated_data': {
            'data': ds[features].values,
            'labels': ds['outcome'].values,
            'participant_ids': np.arange(0, len(ds)),
            'feature_names': np.array(features),
            # The exported dataframe does not carry the weights, so hand them back
            # explicitly; otherwise the result of the reweighing is lost.
            'instance_weights': post_transform.instance_weights
        }
    }


def get_all_datasets(median_split_regression=True):
    """Collect every bundled dataset into one dictionary.

    The two UCI student loaders receive `median_split_regression`; the UCI Adult and
    simulated loaders take no arguments.

    Args:
        median_split_regression (bool): Whether or not to convert regression datasets to
            classification datasets by performing a median split.

    Returns:
        dict: mapping of {dataset_name: {'data':, 'labels':, 'participant_ids':, 'feature_names':}}
    """
    datasets = {}
    datasets.update(get_uci_student_performance(median_split_regression))
    datasets.update(get_uci_student_academics(median_split_regression))
    datasets.update(get_uci_adult())
    datasets.update(get_simulated_data())
    return datasets


if __name__ == '__main__':
    # Manual smoke check: load the UCI Adult dataset and print the resulting dictionary.
    adult_dataset = get_uci_adult()
    print(adult_dataset)