# Model-Averaging/modelAveraging_LOO_normal&logit.R at master · QuantEcol-ConsLab/Model-Averaging · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
  ##  Converse-Gardner Lab Meeting April 20, 2020
  ##  Using data generated by Nathan & Staci

  ##  Leave-one-out cross-validation (LOO-CV)
  ##  With LOO-CV, each data point is omitted in turn and models are fit with the
  ##  reduced data set, which are then used to predict the omitted data point.
  ##  Each model is weighted based on its predictive capability and those weights
  ##  are used when averaging the model set.

  #  Load packages
  library(MuMIn)
  library(tidyverse)
#---------------------------------------------------------------------
  #  Model averaging with normally distributed data and LOO-CV

  #  Read in the simulated data and drop the leading X column
  dat.norm <- read.csv("normalSimulatedData1April2020.csv") %>%
    select(-X)

  #  Fit the global model: y regressed on every remaining column.
  #  na.action = "na.fail" is what dredge() expects of the global model.
  fm1.norm <- glm(y ~ ., data = dat.norm, na.action = "na.fail",
                  family = "gaussian")

  #  Try both LOO criteria on the global model (exploratory only; the results
  #  are not stored or used below)
  loo(fm1.norm, type = "rmse")
  loo(fm1.norm, type = "loglik") # "loglik" is the other criterion loo() offers

  #  Model averaging with the MuMIn package
  #----------------------------------------
  #  Fit all sub-models of the global model, ranked by their LOO-CV RMSE
  #  (original note: creates 256 possible models)
  dd.norm <- dredge(fm1.norm, rank = "loo", type = "rmse")
  #  Keep all models, could keep only a subset if desired
  ddloo.norm <- get.models(dd.norm, subset = TRUE)

  #  Generate model-averaged coefficients
  #  NOTE(review): assumes model.avg() weights the models by the LOO-CV rank
  #  carried over from dredge() rather than by its default criterion -- confirm
  avgm.norm <- model.avg(ddloo.norm)
  #  Peek under the hood of the model.avg result:
  #  per-model LOO-CV RMSE and the model weights derived from it
  head(avgm.norm$msTable$loo)
  head(avgm.norm$msTable$weight)
  min(avgm.norm$msTable$weight)
  max(avgm.norm$msTable$weight)
  #  Model-averaged coefficients (averaged across all models & across the subset
  #  of models containing each covariate)
  head(avgm.norm$coefficients)

  #  Use the model-averaged coefficients to predict the response from the
  #  covariates. The covariates are "every column except the response in
  #  column 1", so the selection follows the data instead of a fixed 2:9 range.
  meanPredY.norm <- predict(avgm.norm, dat.norm[, -1, drop = FALSE],
                            se.fit = TRUE, type = "response")
  #  Overall RMSE of the model-averaged predictions against the observed y
  #  (element 1 of the predict() result holds the fitted values)
  RMSE.norm_MuMIn <- sqrt(mean((meanPredY.norm[[1]] - dat.norm[, 1])^2))
  print(RMSE.norm_MuMIn)

  #  End up with a set of predicted values and their SE, plus model-averaged RSME


  #  Model averaging by hand
  #-------------------------
  #  Find all possible models. dredge()'s ranking only determines the order of
  #  the table here; the model list is re-sorted by row name below.
  dd.norm <- dredge(fm1.norm)
  #  Dormann et al. (2018) supplemental code uses rank = "AIC" w/ dredge but not sure why

  #  Package models together
  list.norm <- get.models(dd.norm, subset = NA)
  #  Sort the model list from 1 : M (to make all outputs follow the same sequence,
  #  from simplest to full model)
  list.norm <- list.norm[order(as.numeric(rownames(dd.norm)))]
  #  Number of models
  M <- length(list.norm)
  #  Number of observations in the data set
  N <- nrow(dat.norm)

  #  LOO-CV by hand
  #  For each observation i: refit every model without row i, then predict
  #  row i from each refitted model. Result is an N x M matrix of held-out
  #  predictions (rows = observations, columns = models).
  #  This takes a few minutes
  loo.norm <- matrix(NA_real_, nrow = N, ncol = M)
  for (i in seq_len(N)) {
    fm.norm <- lapply(list.norm,
                      function(x) update(x, . ~ ., data = dat.norm[-i, ]))
    #  vapply() guarantees exactly one numeric prediction per model
    loo.norm[i, ] <- suppressWarnings(
      vapply(fm.norm,
             function(x) predict(x, newdata = dat.norm[i, , drop = FALSE]),
             numeric(1))
    )
  }
  #  Quick look at some predicted values for each model
  dim(loo.norm)
  loo.norm[1:6, 1:10]

  #  Calculate criteria when LOO-CV is used for model selection
  #  dat.norm[, 1] holds the original observations
  #  RMSE of the held-out predictions for each model (lower is better; can be
  #  compared to the MuMIn results)
  rmse.norm <- apply(loo.norm, 2, function(x) sqrt(mean((x - dat.norm[, 1])^2)))
  #  R2 (squared correlation of held-out predictions with y) for each model
  #  (higher is better)
  #  NOTE(review): cor()^2 ignores the sign of the correlation, so a model whose
  #  LOO predictions are negatively correlated with y (e.g. an intercept-only
  #  model) still scores a high R2 -- consider 1 - SSE/SST instead
  R2.norm <- apply(loo.norm, 2, function(x) cor(x, dat.norm[, 1])^2)

  #  Calculate model weights based on the RMSE & R2 criteria above
  #  One weight per model; each weight vector sums to 1
  #  Equation 13 from Dormann et al. (2018) pg.495
  #  RMSE: distance from the best (smallest) value
  wgt.rmse.norm <- exp(-(rmse.norm - min(rmse.norm)))
  wgt.rmse.norm <- wgt.rmse.norm / sum(wgt.rmse.norm)
  #  R2: distance from the best (largest) value, so that the model with the
  #  highest R2 receives the largest weight
  wgt.R2.norm <- exp(-(max(R2.norm) - R2.norm))
  wgt.R2.norm <- wgt.R2.norm / sum(wgt.R2.norm)

  #  Full-data predictions from every model: N x M matrix
  preds.norm <- vapply(list.norm, predict, numeric(N), newdata = dat.norm)

  #  Weight the predictions by the support for the model that generated them
  #  Matrix multiplication (N x M) %*% (M) collapses the models into a single
  #  model-averaged prediction per observation
  wgtPreds.rmse.norm <- preds.norm %*% wgt.rmse.norm
  wgtPreds.R2.norm <- preds.norm %*% wgt.R2.norm

  #  Overall RMSE of the model-averaged predictions under each weighting
  RMSE.norm_byhand <- sqrt(mean((wgtPreds.rmse.norm - dat.norm[, 1])^2))
  RMSE.R2.norm_byhand <- sqrt(mean((wgtPreds.R2.norm - dat.norm[, 1])^2))
  print(RMSE.norm_byhand)
  print(RMSE.R2.norm_byhand)

  #  Side-by-side look at the hand-made and MuMIn LOO-CV results:
  #  first ten averaged predictions from each, then the two overall RMSEs
  print("predicted values model-averaged by hand")
  wgtPreds.rmse.norm[1:10]
  print("predicted values model-averaged with MuMIn package")
  meanPredY.norm$fit[1:10]
  print("RMSE for model-averaged results by hand")
  RMSE.norm_byhand
  print("RMSE for model-averaged results with MuMIn package")
  RMSE.norm_MuMIn
  #  Scatter the two sets of predictions against each other; points on the
  #  1:1 line mean both approaches agree
  plot(
    x = meanPredY.norm[[1]],
    y = wgtPreds.rmse.norm,
    xlab = "MuMIn predictions",
    ylab = "Hand-made Artisanal predicitons",
    main = "Compare MuMIn and hand-predicted results"
  )
  abline(a = 0, b = 1)

  #  How well do the predictions track the data?
  #  Filled points: MuMIn LOO-CV model-averaged predictions vs. observed y
  plot(
    x = meanPredY.norm[[1]],
    y = dat.norm[, 1],
    pch = 16,
    xlab = "LOO Predicted",
    ylab = "y"
  )
  abline(a = 0, b = 1)
  #  Blue points: the by-hand LOO-CV model-averaged predictions
  points(x = wgtPreds.rmse.norm, y = dat.norm[, 1], col = "blue")

  #  Bring in the AIC model-averaged predictions from Nathan & Staci's code
  #  and redraw the same figure with them added
  source("modelAveraging_AICBIC_normal.R")
  plot(
    x = meanPredY.norm[[1]],
    y = dat.norm[, 1],
    pch = 16,
    xlab = "LOO Predicted",
    ylab = "y"
  )
  abline(a = 0, b = 1)
  points(x = wgtPreds.rmse.norm, y = dat.norm[, 1], col = "blue")
  #  Red points: MuMIn AIC model-averaged predictions
  #  (meanPredY is not defined in this file; presumably created by the sourced script)
  points(x = meanPredY[[1]], y = dat.norm[, 1], col = "red")


#-------------------------------------------------------------------------
  #  Model averaging with binomially distributed data

  #  Read in the simulated data and drop the leading X column
  dat.logit <- read.csv("logitSimulatedData1April2020.csv") %>%
    select(-X)
  #  Convert the response (column 1) from integer to numeric for the loo
  #  function; as.numeric() is vectorized, so no element-wise sapply() is needed
  dat.logit[, 1] <- as.numeric(dat.logit[, 1])

  #  Fit the global model: y regressed on every remaining column.
  #  na.action = "na.fail" is what dredge() expects of the global model.
  fm1.logit <- glm(y ~ ., data = dat.logit, na.action = "na.fail",
                   family = "binomial")

  #  Try both LOO criteria on the global model (exploratory only; the results
  #  are not stored or used below)
  loo(fm1.logit, type = "rmse")    # doesn't work if y is an integer
  loo(fm1.logit, type = "loglik")  # "loglik" is the other criterion loo() offers


  #  Model averaging with the MuMIn package
  #----------------------------------------
  #  Fit all sub-models of the global model, ranked by their LOO-CV RMSE
  dd.logit <- dredge(fm1.logit, rank = loo, type = "rmse") # could also use "loglik"
  #  Keep all models
  ddloo.logit <- get.models(dd.logit, subset = TRUE)

  #  Generate model-averaged coefficients
  #  NOTE(review): assumes model.avg() weights the models by the LOO-CV rank
  #  carried over from dredge() rather than by its default criterion -- confirm
  avgm.logit <- model.avg(ddloo.logit)
  #  Peek under the hood of the model.avg result:
  #  per-model LOO-CV RMSE and the model weights derived from it
  head(avgm.logit$msTable$loo)
  head(avgm.logit$msTable$weight)
  min(avgm.logit$msTable$weight)
  max(avgm.logit$msTable$weight)
  #  Model-averaged coefficients (averaged across all models & across the subset
  #  of models containing each covariate)
  head(avgm.logit$coefficients)

  #  Use the model-averaged coefficients to predict on the probability scale.
  #  The covariates are "every column except the response in column 1", so the
  #  selection follows the data instead of a fixed 2:9 range.
  meanPredY.logit <- predict(avgm.logit, dat.logit[, -1, drop = FALSE],
                             se.fit = TRUE, type = "response")
  #  Overall RMSE of the model-averaged predictions against the observed y
  RMSE.logit_MuMIn <- sqrt(mean((meanPredY.logit[[1]] - dat.logit[, 1])^2))
  print(RMSE.logit_MuMIn)

  #  End up with a set of predicted values and their SE, plus model-averaged RMSE

  #  Alternatively, predict on the link (logit) scale ...
  lmeanPredY.logit <- predict(avgm.logit, dat.logit[, -1, drop = FALSE],
                              se.fit = TRUE, type = "link")
  #  ... and back-transform to the probability scale
  pmeanPredY.logit <- plogis(lmeanPredY.logit$fit)
  #  Calculate the model-averaged RMSE for the back-transformed predictions
  RMSE.logit.link_MuMIn <- sqrt(mean((pmeanPredY.logit - dat.logit[, 1])^2))
  print(RMSE.logit.link_MuMIn)

  #  RMSE's pretty similar when prediction on response or link scale


  #  Model averaging by hand
  #-------------------------
  #  Find all possible models. The AIC ranking only determines the order of the
  #  table here; the model list is re-sorted by row name below.
  dd.logit <- dredge(fm1.logit, rank = "AIC")

  #  Package models together
  list.logit <- get.models(dd.logit, subset = NA)
  #  Sort the model list from 1 : M (to make all outputs follow the same sequence,
  #  from simplest to full model)
  list.logit <- list.logit[order(as.numeric(rownames(dd.logit)))]
  #  Number of models
  M <- length(list.logit)
  #  Number of observations in the data set
  N <- nrow(dat.logit)

  #  LOO by hand
  #  For each observation i: refit every model without row i, then predict
  #  row i (on the response scale) from each refitted model. Result is an
  #  N x M matrix of held-out predictions (rows = observations, columns = models).
  #  This takes a few minutes
  loo.logit <- matrix(NA_real_, nrow = N, ncol = M)
  for (i in seq_len(N)) {
    fm.logit <- lapply(list.logit,
                       function(x) update(x, . ~ ., data = dat.logit[-i, ]))
    #  vapply() guarantees exactly one numeric prediction per model
    loo.logit[i, ] <- suppressWarnings(
      vapply(fm.logit,
             function(x) predict(x, newdata = dat.logit[i, , drop = FALSE],
                                 type = "response"),
             numeric(1))
    )
  }
  #  Quick look at some predicted values for each model
  dim(loo.logit)
  loo.logit[1:6, 1:10]

  #  Calculate criteria when LOO-CV is used for model selection
  #  dat.logit[, 1] holds the original observations
  #  RMSE of the held-out predictions for each model (lower is better; can be
  #  compared to the MuMIn results)
  rmse.logit <- apply(loo.logit, 2, function(x) sqrt(mean((x - dat.logit[, 1])^2)))
  #  R2 (squared correlation of held-out predictions with y) for each model
  #  (higher is better)
  #  NOTE(review): cor()^2 ignores the sign of the correlation, so a model whose
  #  LOO predictions are negatively correlated with y (e.g. an intercept-only
  #  model) still scores a high R2 -- consider 1 - SSE/SST instead
  R2.logit <- apply(loo.logit, 2, function(x) cor(x, dat.logit[, 1])^2)

  #  Calculate model weights based on the RMSE & R2 criteria above
  #  One weight per model; each weight vector sums to 1
  #  Equation 13 from Dormann et al. (2018) pg.495
  #  RMSE: distance from the best (smallest) value
  wgt.rmse.logit <- exp(-(rmse.logit - min(rmse.logit)))
  wgt.rmse.logit <- wgt.rmse.logit / sum(wgt.rmse.logit)
  #  R2: distance from the best (largest) value, so that the model with the
  #  highest R2 receives the largest weight
  wgt.R2.logit <- exp(-(max(R2.logit) - R2.logit))
  wgt.R2.logit <- wgt.R2.logit / sum(wgt.R2.logit)

  #  Full-data predictions from every model: N x M matrix
  #  Predict on the response scale so results are comparable to the MuMIn ones
  preds.logit <- vapply(list.logit, predict, numeric(N),
                        newdata = dat.logit, type = "response")

  #  Weight the predictions by the support for the model that generated them
  #  Matrix multiplication (N x M) %*% (M) collapses the models into a single
  #  model-averaged prediction per observation
  wgtPreds.rmse.logit <- preds.logit %*% wgt.rmse.logit
  wgtPreds.R2.logit <- preds.logit %*% wgt.R2.logit

  #  Overall RMSE of the model-averaged predictions under each weighting
  RMSE.logit_byhand <- sqrt(mean((wgtPreds.rmse.logit - dat.logit[, 1])^2))
  RMSE.R2.logit_byhand <- sqrt(mean((wgtPreds.R2.logit - dat.logit[, 1])^2))
  print(RMSE.logit_byhand)
  print(RMSE.R2.logit_byhand)

  #  Side-by-side look at the hand-made and MuMIn LOO-CV results:
  #  first ten averaged predictions from each, then the two overall RMSEs
  print("predicted values model-averaged by hand")
  wgtPreds.rmse.logit[1:10]
  print("predicted values model-averaged with MuMIn package")
  meanPredY.logit$fit[1:10]
  print("RMSE for model-averaged results by hand")
  RMSE.logit_byhand
  print("RMSE for model-averaged results with MuMIn package")
  RMSE.logit_MuMIn
  #  Scatter the two sets of predictions against each other; points on the
  #  1:1 line mean both approaches agree
  plot(
    x = meanPredY.logit[[1]],
    y = wgtPreds.rmse.logit,
    xlab = "MuMIn predictions",
    ylab = "Hand-made Artisanal predicitons",
    main = "Compare MuMIn and hand-predicted results"
  )
  abline(a = 0, b = 1)

  #  How do the averaged predictions relate to the global model?
  #  Black points: MuMIn LOO-CV model-averaged predictions vs. the fitted
  #  values of the global model
  plot(
    x = meanPredY.logit[[1]],
    y = fm1.logit$fitted.values,
    xlab = "Model-Averaged predictions",
    ylab = "Fitted global model"
  )
  abline(a = 0, b = 1)
  #  Blue points: the by-hand LOO-CV model-averaged predictions
  points(x = wgtPreds.rmse.logit, y = fm1.logit$fitted.values, col = "blue")

  #  Bring in the AIC model-averaged predictions from Nathan & Staci's code
  #  and redraw the same figure with them added
  source("modelAveraging_AICBIC_logit.R")
  plot(
    x = meanPredY.logit[[1]],
    y = fm1.logit$fitted.values,
    xlab = "Model-Averaged predictions",
    ylab = "Fitted global model"
  )
  abline(a = 0, b = 1)
  points(x = wgtPreds.rmse.logit, y = fm1.logit$fitted.values, col = "blue")
  #  Red points: MuMIn AIC model-averaged predictions
  #  (meanPredY is not defined in this file; presumably created by the sourced script)
  points(x = meanPredY$fit, y = fm1.logit$fitted.values, col = "red")

  #  End