R_resp/anova.nested.data.R at master · backenv/R_resp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# Title     : TODO
# Objective : TODO
# Created by: adria
# Created on: 17/10/17
anova.nested.data <- function(dataset,resp.var,factors=c('Treatment','Specie'),fact_rel=NULL) {
    ## Wrapper to optimize exploratory analysis on split-plot experimental design
    #     Perfoms analysis on all resp.var for balanced designs with two fixed factors
    #     Uses anova 2 way by Anderson & Legendre (1999) and allows export to csv file.
    #     By default checks for interaction of factors unless factors relation string is given.
    #     Returns data.frame object

    #@TODO: allow multiple experimental designs by adapting anova to parsed factors

    cat('[?] Loading anova.2way by Anderson & Legendre (1999).\n')
    anova.2way <- function(formula, data = NULL, model=1, nperm=0) {
        #
        ################################################################################
        #
        # Models I, II or III for 2-way crossed-factor anova (balanced designs)
        # with permutation tests.
        #
        # Arguments
        #
        #  formula: a formula specifying the model, as in 'lm' and 'aov'.
        #
        #  data   : A data frame in which the variables specified in the formula are to
        #           be found.
        #
        #  model  : anova model
        #
        #    model=1: Model I anova (two fixed factors).
        #             Returns the results computed by lm(),
        #             and computes permutation tests if requested (nperm > 0)
        #
        #    model=2: Model II anova (two random factors).
        #             Recomputes the F-statistics and P-values for the 2 random
        #             factors, and computes permutation tests if requested (nperm > 0)
        #
        #    model=3: Model III or mixed-model anova
        #             (the first factor is fixed, the second factor is random).
        #             Recomputes the F-statistic and P-value for the fixed factor,
        #             and computes permutation tests if requested (nperm > 0)
        #
        #  nperm  : number of permutations. A commonly-used value is 999.
        #
        # Notes   : 1. If 'data' is missing, the model must be specified as  Y~X1*X2
        #              where Y is the response, X1 and X2 are the two factors.
        #           2. Make sure X1 and X2 are defined 'as.factors' unless they are
        #              binary.
        #           3. Following Anderson and Legendre (1999), permutation of the raw
        #              data is adequate for anova since there are no outlier values
        #              in the factors.
        #
        # Reference -
        # Anderson, M. J. and P. Legendre. 1999. An empirical comparison of permutation
        # methods for tests of partial regression coefficients in a linear model.
        # Journal of Statistical Computation and Simulation 62: 271-303.
        #
        #                                                   Pierre Legendre, August 2007
        #
        ################################################################################
        #
        ## Example modified from Venables and Ripley (2002) p. 165.
        #
        # F1 = as.factor(c(0,1,0,1,1,1,0,0,0,1,1,0,1,1,0,0,1,0,1,0,1,1,0,0))
        # F2 = as.factor(c(1,1,0,0,0,1,0,1,1,1,0,0,0,1,0,1,1,0,0,1,0,1,1,0))
        # yield <- c(49.5,62.8,46.8,57.0,59.8,58.5,55.5,56.0,62.8,55.8,69.5,55.0,
        #            62.0,48.8,45.5,44.2,52.0,51.5,49.8,48.8,57.2,59.0,53.2,56.0)
        # mat = as.data.frame(cbind(yield,F1,F2))
        #
        ## Compute model III anova with permutation tests. F1 is fixed, F2 is random.
        #
        # out.2way = anova.2way(yield~F1*F2, data=mat, model=3, nperm=999)
        #
        ################################################################################
        if((model<1) | (model>3)) stop("Incorrect value for parameter 'model'.",'\n')

        # The reference 2-way anova
        toto = lm(formula, data)
        anova.res = anova(toto)
        if(anova.res[4,1] == 0) stop("There is no interaction in this problem.")

        # From the formula, find the variables as well as the number of observations 'n'
        mf <- match.call(expand.dots = FALSE)
        m <- match(c("formula", "data", "subset", "weights", "na.action",
        "offset"), names(mf), 0)
        mf <- mf[c(1, m)]
        mf$drop.unused.levels <- TRUE
        mf[[1]] <- as.name("model.frame")
        mf <- eval(mf, parent.frame())
        var.names = colnames(mf)
        Y = mf[,1]
        F1 = as.factor(mf[,2])
        F2 = as.factor(mf[,3])
        n = nrow(mf)
        k = nrow(anova.res) - 1

        # Check model balance
        balance = var(as.vector(table(mf[,2:3])))
        if(balance > 0)
        stop("Design unbalanced. This function can only handle balanced designs.")

        # Model I anova (two fixed factors)
        if(model==1) {
            note = "Model I anova (two fixed factors)"
            if(nperm > 0) {
                GE = rep(1,k)
                Pperm = c(rep(0,k), NA)
                for(j in 1:nperm)
                {
                    Yperm = sample(mf[,1],n)
                    toto = lm(Yperm ~ F1*F2)
                    anova.per = anova(toto)
                    for(i in 1:k) { if(anova.per[i,4] >= anova.res[i,4]) GE[i] = GE[i] + 1 }
                }
                for(i in 1:k) { Pperm[i]=GE[i]/(nperm+1) }
                anova.res  = data.frame(anova.res, Pperm)
                colnames(anova.res) = c("Df", "Sum Sq", "Mean Sq", "F value", "Prob(param)", "Prob(perm)")
                note = "Model I anova (two fixed factors) with permutation tests"
            }
        }

        # Model II anova (two random factors)
        if(model==2) {
            anova.res[1,4] = anova.res[1,3] / anova.res[3,3]
            anova.res[2,4] = anova.res[2,3] / anova.res[3,3]
            anova.res[1,5] = pf(anova.res[1,4], anova.res[1,1], anova.res[3,1], lower.tail=FALSE)
            anova.res[2,5] = pf(anova.res[2,4], anova.res[2,1], anova.res[3,1], lower.tail=FALSE)
            note = "Model II anova (two random factors)"
            if(nperm > 0) {
                GE = rep(1,k)
                Pperm = c(rep(0,k), NA)
                for(j in 1:nperm)
                {
                    Yperm = sample(mf[,1],n)
                    toto = lm(Yperm ~ F1*F2)
                    anova.per = anova(toto)
                    anova.per[1,4] = anova.per[1,3] / anova.per[3,3]
                    anova.per[2,4] = anova.per[2,3] / anova.per[3,3]
                    for(i in 1:k) { if(anova.per[i,4] >= anova.res[i,4]) GE[i] = GE[i] + 1 }
                }
                for(i in 1:k) { Pperm[i]=GE[i]/(nperm+1) }
                anova.res  = data.frame(anova.res, Pperm)
                colnames(anova.res) = c("Df", "Sum Sq", "Mean Sq", "F value", "Prob(param)", "Prob(perm)")
                note = "Model II anova (two random factors) with permutation tests"
            }
        }

        # Model III anova (mixed model)
        if(model==3) {
            anova.res[1,4] = anova.res[1,3] / anova.res[3,3]
            anova.res[1,5] = pf(anova.res[1,4], anova.res[1,1], anova.res[3,1], lower.tail=FALSE)
            note = "Model III anova (mixed model)"
            if(nperm > 0) {
                GE = rep(1,k)
                Pperm = c(rep(0,k), NA)
                for(j in 1:nperm)
                {
                    Yperm = sample(mf[,1],n)
                    toto = lm(Yperm ~ F1*F2)
                    anova.per = anova(toto)
                    anova.per[1,4] = anova.per[1,3] / anova.per[3,3]
                    for(i in 1:k) { if(anova.per[i,4] >= anova.res[i,4]) GE[i] = GE[i] + 1 }
                }
                for(i in 1:k) { Pperm[i]=GE[i]/(nperm+1) }
                anova.res  = data.frame(anova.res, Pperm)
                colnames(anova.res) = c("Df", "Sum Sq", "Mean Sq", "F value", "Prob(param)", "Prob(perm)")
                note = "Model III anova (mixed model) with permutation tests"
            }
        }

        if(nperm <= 0) {
            return(list(anova.type=note,anova.table=anova.res))
        } else {
            return(list(anova.type=note, nperm=nperm, response.var=var.names[1], anova.table=anova.res))
        }
    }

    cat('[?] Starting ANOVA calculation...\n')
    st <- Sys.time()

    out <- list()
    # Force factors as factor to avoid unused levels
    dataset[,factors[1]] <- factor(dataset[,factors[1]])
    dataset[,factors[2]] <- factor(dataset[,factors[2]])

    # Create factors relation if not given
    if (is.null(fact_rel)) {
        cat('[+] Checking effect and interaction of',factors[1],'and',factors[2],'on...\n')
        fact_rel <- paste(factors,collapse = "*")
    } else {
        cat('[?] Using user defined factor relation "',fact_rel,'" on...\n',sep = "")
    }

    for (i in resp.var) {
        form<-as.formula(paste(i,fact_rel,sep = " ~ "))
        cat('...',i,'\n')
        out[[i]] <- anova.2way(data=dataset,
        formula=form,
        nperm = 999)
    }
    #print(warnings())
    et <- Sys.time()
    cat('[?] ANOVA calculation time:\n')
    print(et-st)
    return(out)
}