# Simulating-Science/herding.py at main · AMISTAD-lab/Simulating-Science · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import sqlite3
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

def entropy(probabilities):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probabilities: array-like of probabilities (NumPy array, list or
            tuple). Entries that are not strictly positive are skipped, which
            implements the usual convention 0 * log2(0) = 0. NaN entries also
            fail the ``> 0`` test and are skipped. The values are not checked
            to sum to 1.

    Returns:
        -sum(p * log2(p)) over the strictly positive entries. When no entry
        is positive the result is zero (as -0.0).
    """
    # Boolean masking and element-wise `>` need an ndarray; asarray lets
    # plain lists/tuples through and is a no-op for arrays.
    probabilities = np.asarray(probabilities)
    # Masking flattens the input, so the sum below runs over every kept entry.
    probs = probabilities[probabilities > 0]
    return -np.sum(probs * np.log2(probs), axis=0)

def herding(cell_counts_vector):
    """ Measure how concentrated the scientists are on the board.

        Input: cell_counts_vector is an array-like (numpy vector, list, ...) where
            each index corresponds to the number of scientists currently in that cell.

        Output: a tuple (uniform_entropy, board_entropy, herding) where
            - uniform_entropy is the maximum entropy for this number of scientists
              on this board size (if we allow fractional scientists),
            - board_entropy is the entropy of the given distribution,
            - herding = 1 - board_entropy / uniform_entropy. It is 0 when the
              scientists are spread evenly and 1 at maximum herding, namely,
              all scientists in a single cell.

        Degenerate boards: with no scientists at all the result is
        (0.0, 0.0, nan); with a single cell the uniform entropy is zero and
        herding is nan. In both cases the ratio is 0/0, so herding is
        undefined; nan is returned explicitly instead of through a
        divide-by-zero warning.
    """
    # Accept lists/tuples as well as arrays (`.size` and `/` need an ndarray).
    cell_counts_vector = np.asarray(cell_counts_vector)
    total = np.sum(cell_counts_vector)

    if total == 0:
        # Nobody on the board (or a board with no cells): there is no
        # distribution to normalise, so herding is undefined.
        return (0.0, 0.0, np.nan)

    # Same number of scientists spread evenly over every cell.
    uniform_vector = np.ones(cell_counts_vector.size) * (total / cell_counts_vector.size)
    uniform_entropy = entropy(uniform_vector / total)
    entr = entropy(cell_counts_vector / total)

    if uniform_entropy == 0:
        # Single-cell board: maximum entropy is zero, so the ratio is 0/0.
        return (uniform_entropy, entr, np.nan)

    return (uniform_entropy, entr, 1 - entr/uniform_entropy)


if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Manual smoke test: print the herding() tuple for three boards that
    # all hold the same number of scientists.
    # ------------------------------------------------------------------
    num_of_subareas = 10

    # Board 1: random occupancy, each cell count drawn from [0, 20).
    vec = np.random.randint(0, 20, num_of_subareas)
    print(f"Vector: {vec}")
    unif, entr, herd = herding(vec)
    print(f"Uniform Entropy: {unif}, Board Entropy: {entr}, Herding: {herd}")

    # Board 2: everybody in the first cell. Sanity check that the herding
    # metric makes sense when all scientists share a single cell.
    total = vec.sum()
    vec = np.zeros(num_of_subareas)
    vec[0] = total
    print(f"Vector 2: {vec}")
    unif, entr, herd = herding(vec)
    print(f"Uniform Entropy: {unif}, Board Entropy: {entr}, Herding: {herd}")

    # Board 3: (fractional) scientists spread evenly over every cell, i.e.
    # the maximum-entropy board.
    vec = np.full(num_of_subareas, total / num_of_subareas)
    print(f"Vector 3: {vec}")
    unif, entr, herd = herding(vec)
    print(f"Uniform Entropy: {unif}, Board Entropy: {entr}, Herding: {herd}")

def remainingPayoffExtraction():
    '''
    Extracts raw payoff values (not proportions) for each time step per experiment.

    Reads totalPayoffExtracted from the cStats table of data.db. For every
    (inputStr, numExperiment, timeStep) the values of all rows of that time
    step are summed; the raw payoff of a time step is that sum minus the sum
    of the previous time step of the same experiment (the first time step is
    compared against 0).

    The result is written to raw_payoff_per_time_step.csv and returned as a
    DataFrame with the columns 'Experiment', 'Time Step', 'Raw Payoff' and
    'Input String'. 'Input String' is appended last so that experiments that
    share a number under different inputStr values stay distinguishable.
    '''
    # Query to get the data
    query = """
    SELECT numExperiment, timeStep, totalPayoffExtracted, inputStr
    FROM cStats
    ORDER BY inputStr, numExperiment, timeStep
    """

    # Load the data; the finally clause releases the connection even when
    # the query fails (e.g. missing table).
    conn = sqlite3.connect('data.db')
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    experiment_keys = ['inputStr', 'numExperiment']

    # One pass over the data instead of re-filtering the frame per inputStr,
    # experiment and time step. sort=False keeps the order of first
    # appearance, i.e. the ORDER BY of the query.
    step_totals = (
        df.groupby(experiment_keys + ['timeStep'], sort=False)['totalPayoffExtracted']
        .sum()
        .reset_index()
    )

    # Total of the preceding time step within the same experiment; 0 for the
    # first time step. fill_value keeps integer payoffs integer.
    previous_totals = (
        step_totals.groupby(experiment_keys, sort=False)['totalPayoffExtracted']
        .shift(1, fill_value=0)
    )

    # Create DataFrame from raw payoff results
    df_payoff_results = pd.DataFrame({
        'Experiment': step_totals['numExperiment'],
        'Time Step': step_totals['timeStep'],
        'Raw Payoff': step_totals['totalPayoffExtracted'] - previous_totals,
        'Input String': step_totals['inputStr'],
    })

    # Write the results to CSV for further analysis
    df_payoff_results.to_csv('raw_payoff_per_time_step.csv', index=False)

    return df_payoff_results

from scipy.stats import pearsonr
import numpy as np
import pandas as pd
import sqlite3

def queriesExtraction():
    '''
    Computes herding per time step from query counts and correlates it with payoff.

    For every inputStr in the cStats table of data.db, and for every
    (experiment, time step) within it, a vector with numQueries per location
    is built and passed to herding(). The herding rows of each inputStr are
    written to QUERIES_<inputStr>.csv, merged with the raw payoff values read
    from raw_payoff_per_time_step.csv, and the Pearson correlation between
    'Herding' and 'Raw Payoff' is computed together with its p-value and an
    approximate 95% confidence interval. All correlation rows are written to
    pearson_correlation_results_with_CI.csv.

    Rows whose herding or payoff is NaN (e.g. a time step without any query)
    are left out of the correlation. With fewer than two usable rows the
    correlation and p-value are NaN; with fewer than three the confidence
    interval is NaN.

    Returns a tuple (df_herding_results, df_correlation_results). Note that
    df_herding_results holds the herding rows of the LAST inputStr processed
    only (empty if the table has no rows); the per-inputStr CSV files hold
    the complete results.
    '''
    # Query to get the data
    query = """
    SELECT numExperiment, location, timeStep, numQueries, inputStr
    FROM cStats
    ORDER BY inputStr, numExperiment, timeStep, location
    """

    # Load the data; the finally clause releases the connection even when
    # the query fails.
    conn = sqlite3.connect('data.db')
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    # Load the raw payoff values per time step (from remainingPayoffExtraction).
    # Peek at the header first: when the file carries an 'Input String'
    # column it is read verbatim as text so that labels that look like
    # numbers still compare equal to the database values.
    payoff_path = 'raw_payoff_per_time_step.csv'
    payoff_has_input_str = 'Input String' in pd.read_csv(payoff_path, nrows=0).columns
    df_payoff_results = pd.read_csv(
        payoff_path,
        dtype={'Input String': str} if payoff_has_input_str else None,
    )

    herding_columns = ['Experiment', 'Time Step', 'Input String', 'Uniform Entropy', 'Board Entropy', 'Herding']

    # Defined up front so the return statement works for an empty table.
    df_herding_results = pd.DataFrame(columns=herding_columns)
    correlation_results = []

    for input_str in df['inputStr'].unique():
        # Filter data for the current inputStr
        input_str_data = df[df['inputStr'] == input_str]

        # Map locations to indices of the query-count vector
        locations = input_str_data['location'].unique()
        location_to_index = {loc: idx for idx, loc in enumerate(locations)}

        # Herding rows of this inputStr only
        herding_results = []

        # sort=False keeps the order of first appearance, i.e. the ORDER BY
        # of the query.
        for (experiment, time_step), ts_data in input_str_data.groupby(['numExperiment', 'timeStep'], sort=False):
            # Locations without a row at this time step keep a count of 0.
            num_queries_array = np.zeros(len(locations))
            cell_indices = ts_data['location'].map(location_to_index).to_numpy()
            num_queries_array[cell_indices] = ts_data['numQueries'].to_numpy()

            # Calculate herding for this time step
            uniform_entropy, board_entropy, herding_value = herding(num_queries_array)
            herding_results.append([experiment, time_step, input_str, uniform_entropy, board_entropy, herding_value])

        # Convert herding results to DataFrame
        df_herding_results = pd.DataFrame(herding_results, columns=herding_columns)

        # Save herding results to CSV for this inputStr only
        filename = f'QUERIES_{input_str}.csv'
        df_herding_results.to_csv(filename, index=False)

        # Only payoffs of the same inputStr belong to these herding rows.
        # Experiment numbers may repeat across inputStr values, so merging
        # on (Experiment, Time Step) alone would mix unrelated runs. Files
        # without the column are merged as before.
        payoff_for_input = df_payoff_results
        if payoff_has_input_str:
            same_input = df_payoff_results['Input String'] == str(input_str)
            payoff_for_input = df_payoff_results[same_input].drop(columns='Input String')

        # Merge herding results with raw payoff values by Experiment and Time Step
        df_combined = pd.merge(df_herding_results, payoff_for_input, on=['Experiment', 'Time Step'])

        # A single NaN would turn the whole correlation into NaN.
        df_valid = df_combined.dropna(subset=['Herding', 'Raw Payoff'])
        n = len(df_valid)  # sample size

        # pearsonr raises on fewer than two points, so check before calling.
        if n >= 2:
            correlation, p_value = pearsonr(df_valid['Herding'], df_valid['Raw Payoff'])
        else:
            correlation, p_value = np.nan, np.nan

        # Calculate 95% confidence interval for Pearson correlation.
        # NOTE(review): this is the plain normal approximation r +/- 1.96 * SE;
        # it is not clipped to [-1, 1]. A Fisher z-transform interval would be
        # the more usual choice; formula kept unchanged to preserve results.
        if n > 2:  # Ensure there are enough data points for meaningful correlation
            # Standard error of the correlation
            se_r = np.sqrt((1 - correlation ** 2) / (n - 2))

            # 95% confidence interval
            z = 1.96  # Z-score for 95% confidence
            ci_lower = correlation - z * se_r
            ci_upper = correlation + z * se_r
        else:
            ci_lower, ci_upper = np.nan, np.nan

        # Append Pearson correlation results for this input_str, along with CI
        correlation_results.append([input_str, correlation, p_value, ci_lower, ci_upper])

    # Convert correlation results to DataFrame and save to CSV
    df_correlation_results = pd.DataFrame(correlation_results, columns=['Input String', 'Pearson Correlation', 'p-value', 'CI Lower', 'CI Upper'])
    df_correlation_results.to_csv('pearson_correlation_results_with_CI.csv', index=False)

    return df_herding_results, df_correlation_results