-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstatistical_analysis.py
More file actions
118 lines (97 loc) · 5.75 KB
/
statistical_analysis.py
File metadata and controls
118 lines (97 loc) · 5.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
from data_manager import *
from scipy.stats import ranksums, ks_2samp
from utils import *
import copy
class statistical_analysis():
    """
    The statistical_analysis class provides some statistical analysis on raw data and on the similarity scores.

    Methods:
        compute_cohen_d: computes the effect size between two arrays as the Cohen's d
        compute_features_statistics: computes, feature by feature, the rank-sum p-values and the Cohen's d values
                                     related to the comparison of two raw datasets
        compute_scores_statistics: computes the two-sample Kolmogorov-Smirnov p-value and the Cohen's d value
                                   related to the comparison of two similarity score distributions
        statistics_settings: prepares the two datasets as arrays for the methods above
        compute_permutation_test: TO ADD (not implemented yet)
    """

    def __init__(self):
        """
        The __init__ method is the initializer: it creates the helper objects stored as private attributes.
        """
        self._data_manager = data_manager()
        self._utils = utils()

    def compute_cohen_d(self, first_data, second_data):
        """
        The compute_cohen_d method computes the effect size between two arrays as the absolute difference between
        the two means divided by the pooled standard deviation (Cohen's d).

        The sample variances use ddof=1, so the result is NaN when either input holds fewer than two values.
        The machine epsilon is added to the pooled denominator and to the pooled standard deviation to avoid
        divisions by zero.

        :param first_data: it is the first 1D-array (a scalar / 0-d array is treated as a one-element array)
        :param second_data: it is the second 1D-array (a scalar / 0-d array is treated as a one-element array)
        :return: the (non-negative) Cohen's d value
        """
        # np.squeeze in the callers collapses a one-element sample to a 0-d array, whose shape is ();
        # promoting to at least 1-D keeps shape[0] valid instead of raising IndexError.
        first = np.atleast_1d(first_data)
        second = np.atleast_1d(second_data)
        first_N = first.shape[0]
        second_N = second.shape[0]

        # The unbiased variance of fewer than two values is undefined: NaN is what np.var(..., ddof=1) yields
        # in that case, here without triggering its "degrees of freedom" warning.
        first_var = np.var(first, ddof=1) if first.size > 1 else np.nan
        second_var = np.var(second, ddof=1) if second.size > 1 else np.nan
        first_mean = np.mean(first)
        second_mean = np.mean(second)

        eps = np.finfo(float).eps
        pooled_variance = ((first_N - 1) * first_var + (second_N - 1) * second_var) / (first_N + second_N - 2 + eps)
        pooled_std = np.sqrt(pooled_variance) + eps
        return abs((first_mean - second_mean) / pooled_std)

    def compute_features_statistics(self, first_data, second_data, first_labels=None, second_labels=None):
        """
        The compute_features_statistics method computes the Wilcoxon rank-sum p-value and the Cohen's d effect size
        on each feature between two data matrices.

        :param first_data: it is the first (subjects*repetitions*features) matrix
        :param second_data: it is the second (subjects*repetitions*features) matrix
        :param first_labels: it is the list of labels identifying each subject in the first dataset (None by default)
        :param second_labels: it is the list of labels identifying each subject in the second dataset (None by default)
        :return: the pvalue and Cohen's d 1D-arrays (one entry per feature), in order
        """
        first, second, features = self.statistics_settings(first_data, second_data, first_labels, second_labels)
        pvalue = np.zeros(shape=(features,))
        d = np.zeros(shape=(features,))
        for f in range(features):
            # Extract each column once; atleast_1d undoes the 0-d collapse squeeze causes on one-sample data.
            first_column = np.atleast_1d(np.squeeze(first[:, f]))
            second_column = np.atleast_1d(np.squeeze(second[:, f]))
            _, pvalue[f] = ranksums(first_column, second_column)
            d[f] = self.compute_cohen_d(first_column, second_column)
        return pvalue, d

    def compute_scores_statistics(self, first_data, second_data):
        """
        The compute_scores_statistics method computes the p-value through the two-sample Kolmogorov-Smirnov test and
        the Cohen's d effect size between two arrays of scores.

        :param first_data: it is the first 1D-array of scores
        :param second_data: it is the second 1D-array of scores
        :return: the pvalue and Cohen's d value, in order
        """
        first, second, _ = self.statistics_settings(first_data, second_data)
        # atleast_1d keeps a single score as a one-element array rather than a 0-d array.
        first_scores = np.atleast_1d(np.squeeze(first))
        second_scores = np.atleast_1d(np.squeeze(second))
        _, pvalue = ks_2samp(first_scores, second_scores)
        d = self.compute_cohen_d(first_scores, second_scores)
        return pvalue, d

    def statistics_settings(self, first_data, second_data, first_labels=False, second_labels=False):
        """
        The statistics_settings method sets the data which has to be used for some statistical analysis.

        When at least one of the label arguments is exactly False, the two inputs are only copied into numpy arrays
        and the number of features is reported as 1. Otherwise (including when the labels are None) the data are
        passed through the utils helpers and flattened to 2D (samples*features) matrices.

        :param first_data: it is a data matrix
        :param second_data: it is another data matrix
        :param first_labels: it is the list of labels identifying each subject in the first dataset
        :param second_labels: it is the list of labels identifying each subject in the second dataset
        :return: the two data matrices, and the number of features
        """
        # Work on independent copies so the caller's data are never modified.
        first = np.array(copy.deepcopy(first_data))
        second = np.array(copy.deepcopy(second_data))

        # NOTE(review): only the literal False selects this pass-through branch; None labels (the default of
        # compute_features_statistics) fall through to the utils helpers below - confirm this is intended.
        if first_labels is False or second_labels is False:
            return first, second, 1

        # presumably _dimensions returns (labels, subjects, repetitions, features) - verify against utils
        _, first_subjects, first_repetitions, first_features = self._utils._dimensions(first, first_labels)
        _, second_subjects, second_repetitions, _ = self._utils._dimensions(second, second_labels)
        first, second = self._utils._same_format_3D(first, second, first_labels, second_labels)

        nSamples_first = first_repetitions * first_subjects
        nSamples_second = second_repetitions * second_subjects
        # Both outputs are sized with the feature count of the first dataset.
        aux_first = np.zeros(shape=(nSamples_first, first_features))
        aux_second = np.zeros(shape=(nSamples_second, first_features))
        for f in range(first_features):
            # Flatten the two leading axes of each feature slice into a single samples axis.
            aux_first[:, f] = np.reshape(first[:, :, f], (nSamples_first,))
            aux_second[:, f] = np.reshape(second[:, :, f], (nSamples_second,))
        return aux_first, aux_second, first_features