Model-Averaging/mod_ave_funcs.R at master · QuantEcol-ConsLab/Model-Averaging · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# this script contians functions for model averaging and comparison of methods

#function. Takes data, data type, vector of model averaging methods, and covariate columns to use. Returns a list of RMSE, predictions,weights, and run time for each method.

all_methods_MA<-function(data_type,dat,methods,covs){
  n_methods<-length(methods)      # number of methods
  methods_vec<-numeric(n_methods) # vector that will go in list below
  names(methods_vec)=methods

  n_test_data<-nrow(dat)/2        #number of data points in the test data
  n_mods<-2^length(covs)         # number of models being averaged


  #list to hold outputs from model averaging methods: weights, predictions, rmse, and run time
  compiled_outputs<-list(
    rmse=methods_vec,
    weights=matrix(NA,nrow=n_mods,ncol=n_methods,dimnames = list(1:n_mods,methods)),
    ma_preds=matrix(NA,nrow=n_test_data,ncol=n_methods,dimnames = list(1:n_test_data,methods)),
    time=methods_vec
  )

  for(i in 1:n_methods){
    ma_out<-model_average_function(method= methods[i],data_in = dat,covs=covs,data_type = data_type)

    compiled_outputs$rmse[i] <- ma_out$rmse
    compiled_outputs$weights[,i] <- ma_out$weights
    compiled_outputs$ma_preds[,i] <- ma_out$wtpreds
    compiled_outputs$time[i] <- ma_out$run_time
  }

  return(compiled_outputs)

}


# The functions take as inputs training and testing data, and return 1) model weights, 2) model predictions to test data, 3) RMSE for test data

# methods include: CP,  minimum variance, bottstrapping, AIC, BIC, leave one out cross validation, jackknife, and stacking

model_average_function<-function(
  method,   #model averaing method
  data_in,  #data
  covs,     # vector of columns of covariates to include
  data_type # for flm function "family" argument. e.g., "binomial" or "gaussian"
){

 #training data
  train<-data_in[1:(nrow(data_in)/2),c(1,covs)]

  #testing response
  test_y<-data_in[((nrow(data_in)/2)+1):(nrow(data_in)),1]

  #testing covariate values
  test_covs<-data_in[((nrow(data_in)/2)+1):(nrow(data_in)),covs]

  start_time<-Sys.time()

  ## fit global model
  fm1 <- glm(y~., data=train, na.action = "na.fail", family=data_type,x=TRUE)

  #get all possible models
  dfm1<-dredge(fm1)

  #get all of the model output
  fm1.mods.all<-get.models(dfm1, subset=TRUE)

#calculate weights

  #information criterion
  if(method%in%c("AIC","BIC","Cp","loo")){
    #Get the information criterion value
    if(method=="loo"){
    mod_criterion_value<-sapply(fm1.mods.all, method,type="rmse")
    }else{
    mod_criterion_value<-sapply(fm1.mods.all, method)
    }
    # Calculate the weights for each model based on Cp
    weights<-exp(-0.5*(mod_criterion_value-min(mod_criterion_value)))/sum(exp(-0.5*(mod_criterion_value-min(mod_criterion_value))))
  }else
      #Bates-granger/minimal variance weights
      if(method=="minimum_variance"){
        weights<-(BGWeights(fm1.mods.all, data=train))#some are negative
      }else

        if(method=="bootstrap"){
          set.seed(2020)
          weights<-(bootWeights(fm1.mods.all, data=train,R=1000))
        }else

          if(method=="stacking"){
            weights<-(stackingWeights(fm1.mods.all, data=train, R=1000))[1,]
          }else
            if(method=="jackknife"){
              weights<-(jackknifeWeights(fm1.mods.all, data=train))
            }else
              if(method=="cos_squared"){
                weights<-(cos2Weights(fm1.mods.all, data=train))
              }


#predict for all models
preds_all_mods <-sapply(fm1.mods.all, predict, newdata = test_covs,family=data_type, type = "response")
#weigh the models by the BG/MV weights
wtpreds<-preds_all_mods%*%weights
#RMSE
rmse<-sqrt(mean((wtpreds-test_y )^2))

#time it took to run
total_time<-Sys.time()-start_time

#return

return(list(
  weights=weights,
  wtpreds=wtpreds,
  rmse=rmse,
  run_time=total_time,
  method=method,
  true_test=test_y
))
}