-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcatStats.pl
More file actions
executable file
·111 lines (87 loc) · 2.74 KB
/
catStats.pl
File metadata and controls
executable file
·111 lines (87 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/perl -w
#
# catStats.pl - per-category summary statistics for a tabular file.
#
# Usage: catStats.pl <in.csv> <'catFld1|...|catFldN'> <stat> [<stat> ...] <out.csv>
#
# Rows of the input are grouped by the values of the category field(s).
# For every group one output row is written (tab-separated):
#   * first column: number of values collected for field index 0 in the group
#     (labelled SAMPLE_SIZE);
#   * then, for each requested statistic, one column per input field:
#       - a category field       -> its first value in the group,
#       - a numeric field        -> the statistic over its values,
#       - any other field        -> its unique values joined with ",".
#
# The input is read through the project module Flat; the numeric helpers come
# from math::util. Util::dieIt / Util::getUniqueElements are called without a
# "use Util" in this file -- presumably Util is loaded by Flat or math
# (TODO confirm).

use strict;
use warnings;

use Flat;
use math;

if (scalar(@ARGV) < 4) {
    print "For each numeric field in the specified file excluding the categorical field, compute the specified statistics\n\n";
    print "Usage: ~ <in.csv> <'catFld1|...|catFldN'> <min|max|mean|median|sum|std|ci> <out.csv>\n\n";
    print "\tci: 95% confidence interval (half interval)\n";
    print "\tstatistics can be a combination of different attributes such as mean and std\n\n";
    exit(1);
}

# Statistic name -> implementation. The math::util functions are wrapped in
# small closures (rather than referenced with \&) so each one is still invoked
# by name, exactly as a direct call would be.
my %statFn = (
    min    => sub { math::util::getMin(@_) },
    max    => sub { math::util::getMax(@_) },
    mean   => sub { math::util::getMean(@_) },
    median => sub { math::util::getMedian(@_) },
    std    => sub { math::util::getStandardDeviation(@_) },
    ci     => sub { math::util::getConfidenceInterval(@_) },
    sum    => sub { math::util::getSum(@_) },
);

# Command line: first argument is the input, second the '|'-separated category
# fields, last the output file, everything in between is a statistic name.
my $inFile  = shift @ARGV;
my $catSpec = shift @ARGV;
my $out     = pop @ARGV;
my @stats   = @ARGV;

# Reject unknown statistics before any input is read or the output file is
# truncated, instead of failing halfway through writing the result.
for my $s (@stats) {
    Util::dieIt("Unknown statistics '$s'\n") unless exists $statFn{$s};
}

my $in      = Flat->new1($inFile);
my @catFlds = $in->getFieldIndice([split(/\|/, $catSpec)]);

my @fldIndice = $in->getFieldIndice();
my @fldNames  = $in->getVirtualFieldNames();

# NOTE(review): %isNumeric is filled here but never consulted below (numeric
# detection is redone per group with math::util::isArrayNumeric). The calls are
# kept because Flat's internals are not visible from this file.
my %isNumeric;    # numeric fields
for my $f (@fldIndice) {
    if ($in->fieldIsNumeric($f)) {
        $isNumeric{ $in->getFieldIndex($f) } = 1;
    }
}

# Collect the values of every field, grouped by category key.
my %catFlds2fldVals;    # category key => { field index => [ field values ] }
$in->reset();
while (my $row = $in->readNextRow()) {
    my $keyVal = join(",", map { $row->[$_] } @catFlds);
    for my $idx (@fldIndice) {
        push @{ $catFlds2fldVals{$keyVal}{$idx} }, $row->[$idx];
    }
}
undef $in;

# The output handle is only ever written to, so plain '>' is enough; the
# three-argument form keeps the file name from being parsed as part of the mode.
open my $outFh, '>', $out
    or die "Cannot open '$out' for writing: $!\n";

# Header: SAMPLE_SIZE, then "<field>.<stat>" for every stat/field pair, in the
# same nesting order (stat outer, field inner) as the data rows below.
my @newFldNames;
for my $s (@stats) {
    for my $fn (@fldNames) {
        push @newFldNames, "$fn.$s";
    }
}
print {$outFh} join("\t", "SAMPLE_SIZE", @newFldNames), "\n";

# Sort categories numerically only if every key is a number; a single
# non-numeric key switches the whole sort to string comparison.
my $allKeysNumeric = 1;
for my $key (keys %catFlds2fldVals) {
    if (math::util::isNaN($key)) {
        $allKeysNumeric = 0;
        last;
    }
}
my $byCategory = $allKeysNumeric
    ? sub { $a <=> $b }
    : sub { $a cmp $b };

# Which positions are category fields does not depend on the group or the
# statistic, so work it out once.
# The loops below address fields by position 0..N-1, which presumes
# getFieldIndice() returns exactly those indices -- TODO confirm.
my @isCatFld = map {
    my $i = $_;
    (grep { $i == $_ } @catFlds) ? 1 : 0;
} 0 .. $#fldIndice;

for my $cat (sort $byCategory keys %catFlds2fldVals) {
    my $valsOf = $catFlds2fldVals{$cat};

    # Sample size is taken from field 0; "|| []" keeps a missing field from
    # being fatal under strict refs.
    my @newFldVals = (scalar @{ $valsOf->{0} || [] });

    for my $s (@stats) {
        for my $i (0 .. $#fldIndice) {
            my @fldVals = @{ $valsOf->{$i} || [] };

            if ($isCatFld[$i]) {
                # Category field: every row in the group shares the value.
                push @newFldVals, $fldVals[0];
            }
            elsif (math::util::isArrayNumeric(@fldVals)) {    # if it's a numeric field
                @fldVals = math::util::rmNaN(@fldVals);
                my $statVal = $statFn{$s}->(@fldVals);
                push @newFldVals, $statVal;
            }
            else {    # a non-numeric field, use it directly
                push @newFldVals, join(",", Util::getUniqueElements(@fldVals));
            }
        }
    }
    print {$outFh} join("\t", @newFldVals), "\n";
}

# Buffered write errors (e.g. disk full) only surface at close.
close $outFh
    or die "Error closing '$out': $!\n";