MatlabCustomFunc/grubbs.m at master · Li-GL/MatlabCustomFunc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
function [Gmax_loop Gmin_loop G2_loop lambda1_loop lambda2_loop outlier outlier_num] = grubbs(x, x_date, sided, alpha, dist, nmin)
%
% The 'grubbs' function performs an iterative test to check for the max,
% min or both observations as outliers. The test checks first for the max
% or min data point to be outlier and if any of the two observations is
% labeled as outlier then removes it from the sample and checks for the
% next max or min. If both of the test for max or min are not passed, then
% it checks if both max and min data points can be labeled as outliers. The
% test stops when no outliers are identified or the data sample is less
% than 'nmin'. The first part of the algorithm is also called the 'Modified
% Thompson Tau' procedure.
%
% Data in 'x' are organized so that columns are the time series and rows
% are the time intervals. All series contain the same number of
% observations.
%
% 'x_date' is a column vector (cell array) containing the dates of the
% corresponding data in 'x'.
%
% 'sided' indicates if the critical values are from a two-sided
% distribution (default) or one-sided ('sided' = 0).
%
% 'alpha' specifies the significant level. By default 'alpha' = 0.05.
%
% The variable 'dist' indicates if the series are assumed to be normal
% (default) or log-normally distributed. It can take the values 0 (normal)
% or 1 (log-normal).
%
% 'nmin' specifies the minimum number of observations, either in the
% original series or after removing outliers, before the algorithm stops.
% The default value is 6.
%
% [Gmax_loop Gmin_loop G2_loop lambda1_loop lambda2_loop outlier outlier_num outlier] =
% = gesd(...) returns the following:
% Gmax_loop          - indicates the statistic for the max observation as
%                      outlier in each loop (rows).
% Gmin_loop          - indicates the statistic for the min observation as
%                      outlier in each loop (rows).
% G2_loop            - indicates the statistic for the max and min
%                      observations as outliers in each loop (rows).
% lambda1_loop       - critical values for the max or mmin observation as
%                      outlier in each loop (rows).
% lambda2_loop       - critical values for both the max and mmin
%                      observation as outliers in each loop (rows).
% outlier            - cell array specifying the date and the series number (column
%                      number in 'x') where the potential outliers are situated.
% outlier_num        - matrix providing row and column numbers of the values in
%                      'x' considered as potential outliers.
%
% Created by Francisco Augusto Alcaraz Garcia
%            alcaraz_garcia@yahoo.com
%
% References:
%
% 1) H.A David; H.O. Hartley; E.S. Pearson (1954). The Distribution of
% the Ratio, in a Single Normal Sample, of Range to Standard Deviation.
% Biometrika 41(3), pp. 482-493.
%
% 2) F.E. Grubbs (1969). Procedures for Detecting Outlying Observations
% in Samples. Technometrics 11(1), pp. 1-21.
%
% 3) NIST/SEMATECH e-Handbook of Statistical Methods (2010),
% http://www.itl.nist.gov/div898/handbook/.
%
% 4) S.L.R. Ellison; V.J. Barwick; T.J.D. Farrant (2009). Practical
% Statistics for the Analytical Scientist. A Bench Guide. RSC Publishing,
% 2nd ed., Cambridge.


% Check number of input arguements
if (nargin < 2) || (nargin > 6)
    error('Requires two to six input arguments.')
end

% Define default values
if nargin == 2,
    sided = 1;
    alpha = 0.05;
    dist = 0;
    nmin = 6;
elseif nargin == 3,
    alpha = 0.05;
    dist = 0;
    nmin = 6;
elseif nargin == 4,
    dist = 0;
    nmin = 6;
elseif nargin == 5,
    nmin = 6;
end

% Normal transformation
if dist == 1,
    x = log(x);
end

if sided == 1,
   alpha1 = alpha/2;
end

% Check for validity of inputs
if ~isnumeric(x) || ~isreal(x) || ~iscellstr(x_date),
    error('Input x must be a numeric array and x_date must be a string table.')
elseif alpha <= 0 || alpha >= 1,
    error('The confidence level must be between zero and one.')
end

[n, c] = size(x);
xr = x;
j4 = 1;
j7 = 1;
j9 = 1;
outlier = [];
outlier_num = [];
Gmax_loop = [];
Gmin_loop = [];
G2_loop = [];
lambda1_loop = [];
lambda2_loop = [];

if n <= nmin,
    error('Time series should contain more than %d data points.', nmin);
else
    while (sum(j4) + sum(j7) + sum(j9))~= 0,
        nr = n - sum(isnan(xr));
        [i1,j1] = find(nr > nmin);
        if isempty(j1),
            disp(['Time series after removal of outliers contain less than ', num2str(nmin), ' data points.']);
            break
        end
        p1 = 1 - alpha1./nr;
        lambda1 = tinv(p1,nr-2).*(nr-1)./sqrt((nr-2+tinv(p1,nr-2).^2).*nr);

        Gmax = (nanmax(xr) - nanmean(xr))./nanstd(xr);
        [i2,j2] = find(xr == repmat(nanmax(xr),n,1)); % location of max
        [i3,j3] = find(Gmax > lambda1);
        j4 = ismember(j1,j3)';

        Gmin = (nanmean(xr) - nanmin(xr))./nanstd(xr);
        [i5,j5] = find(xr == repmat(nanmin(xr),n,1)); % location of min
        [i6,j6] = find(Gmin > lambda1);
        j7 = ismember(j1,j6)';

        p2 = 1 - alpha./(nr.*(nr - 1));
        lambda2 = sqrt((2.*(nr-1).*tinv(p2,nr-2).^2)./(n-2+tinv(p2,n-2).^2));
        G2 = (nanmax(xr) - nanmin(xr))./nanstd(xr);

        if sum(j4)~=0,
            xr(i2(j4),j2(j4)) = NaN;
            outlier = [outlier; x_date(i2(j4)) cellstr(strcat('Series', num2str(j2(j4))))];
            outlier_num = [outlier_num; i2(j4) j2(j4)];
        end

        if sum(j7)~=0,
            xr(i5(j7),j5(j7)) = NaN;
            outlier = [outlier; x_date(i5(j7)) cellstr(strcat('Series', num2str(j5(j7))))];
            outlier_num = [outlier_num; i5(j7) j5(j7)];
        end

        if sum(j4)==0 && sum(j7)==0,
            [i8,j8] = find(G2 > lambda2);
            j9 = ismember(j1,j8)';
            if sum(j9)~=0,
                xr(i2(j9),j2(j9)) = NaN;
                xr(i5(j9),j5(j9)) = NaN;
                outlier = [outlier; x_date(i2(j9)) cellstr(strcat('Series', num2str(j2(j9)))); ...
                    x_date(i5(j9)) cellstr(strcat('Series', num2str(j5(j9))))];
                outlier_num = [outlier_num; i2(j9) j2(j9); i5(j9) j5(j9)];
            end
            j9 = 0;
        end
        Gmax_loop = [Gmax_loop; Gmax];
        Gmin_loop = [Gmin_loop; Gmin];
        G2_loop = [G2_loop; G2];
        lambda1_loop = [lambda1_loop; lambda1];
        lambda2_loop = [lambda2_loop; lambda2];
    end
end