ML_project/Environmental_Data.Rmd at master · KNMI-DataLab/ML_project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
---
title: "Environmental_Data"
author: "Eva Kleingeld"
date: "August 12, 2016"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
rm(list=ls())

#all extra required packages here
#install.packages("dummies")

```

```{r}

## First, read in all the environmental data sets
#Soil_type <- read.csv("F:/KNMI/Grondsoortenkaart_2006.txt")
#Soil_type <- read.csv("/run/media/kleingel/Lexar/KNMI/Grondsoortenkaart_2006.txt")
Soil_type <- read.csv("/data/project/GMS/data/auxcillary_data/Ancillary_point_data/Grondsoortenkaart_2006.txt")


#Height <- read.csv("F:/KNMI/Height_divagis.txt")
#Height <- read.csv("/run/media/kleingel/Lexar/KNMI/Height_divagis.txt")
Height <- read.csv("/data/project/GMS/data/auxcillary_data/Ancillary_point_data/Height_divagis.txt")

#MetaData_GMScoding<- read.csv("F:/KNMI/metadataGMScoding.csv", header = TRUE)
#MetaData_GMScoding<- read.csv("/run/media/kleingel/Lexar/KNMI/metadataGMScoding.csv", header = TRUE)
MetaData_GMScoding<- read.csv("/data/project/GMS/data/auxcillary_data/MetadataGMS/metadataGMScoding.csv", header = TRUE)

## Soil_type: Drop columns you don't need
Keep_Soil <- c("MSID", "LON", "LAT", "OMSCHRIJVI")
Soil_Type <- Soil_type[Keep_Soil]

## Height: Drop columns you don't need
Keep_Height <- c("MSID", "LON", "LAT", "NLD_alt")
Height <- Height[Keep_Height]

## MetaData: Drop columns you don't need
Keep_Meta <- c("loc_nr", "loc_lat", "loc_lon", "tw_1", "tw_2", "tw_3", "tw_4", "tw_5", "tw_6", "tw_7", "tw_8", "tw_9", "tw_10", "tw_11", "tw_12")
Sensor_Codes <- MetaData_GMScoding[Keep_Meta]
colnames(Sensor_Codes) <- c("MSID", "LAT", "LON", "TW_1", "TW_2", "TW_3", "TW_4", "TW_5", "TW_6", "TW_7", "TW_8", "TW_9", "TW_10", "TW_11", "TW_12")

## Melt the Sensor_Codes
library(reshape2)
Sensor_Codes <- melt(Sensor_Codes, id.vars = c("MSID", "LAT", "LON"))
colnames(Sensor_Codes)[4] <- c("SENSOR")
colnames(Sensor_Codes)[5] <- c("CODE")


## Merge Height and Soil_Type and Sensor_Codes data frames
Env_Data <- merge(x = Height, y = Soil_Type, by = c("MSID", "LAT", "LON"))
Env_Data_2 <- merge(x = Env_Data, y = Sensor_Codes, by = c("MSID"))

## There are two different LAT/LON (.x and .y)
## The .x are the "Rijksdriehoek" coordinates
## The .y are normal coordinates
## Here we drop the normal coordinates (.y)
Env_Data_2 <- Env_Data_2[ , - c(6,7)]

# Rename the .x lat/lon
colnames(Env_Data_2)[2] <- "LAT"
colnames(Env_Data_2)[3] <- "LON"


## Some sensors need to be removed from the data. These stations are:
  # No measurements 2014 - now: 108, 422, 818, 1015, 1502
  # Test stations RWS: 1501, 1502, 1503
bad_stations <- c("108", "422", "818", "1015", "1501", "1502", "1503")

for(i in bad_stations){
  print(i)
  Env_Data_2 <- Env_Data_2[!(Env_Data_2$MSID == i), ]
}

## The category 'Moerig op zand' does not occur often:
ggplot() + geom_bar(data = Env_Data_2, aes(x = OMSCHRIJVI))

## We decided to add 'Moerig op zand' to the 'Zand' category
Env_Data_2$OMSCHRIJVI[Env_Data_2$OMSCHRIJVI == "Moerig op zand"] <- "Veen"
any(Env_Data_2$OMSCHRIJVI == "Moerig op zand")

## We also merge the "Lichte klei" (: Light clay) and "Zware klei" (:Heavy Clay) categories to Klei (: Clay)
library(plyr)

Env_Data_2$OMSCHRIJVI <- revalue(Env_Data_2$OMSCHRIJVI, c("Lichte klei" = "Klei"))
Env_Data_2$OMSCHRIJVI[Env_Data_2$OMSCHRIJVI == "Zware klei"] <- "Klei"

## Next, we merge the "Leem" (:Silt), "Lichte zavel" and "Zware zavel" (:sand + clay) to "Zavel"
Env_Data_2$OMSCHRIJVI <- revalue(Env_Data_2$OMSCHRIJVI, c("Leem" = "Zavel"))
Env_Data_2$OMSCHRIJVI[Env_Data_2$OMSCHRIJVI == "Lichte zavel"] <- "Zavel"
Env_Data_2$OMSCHRIJVI[Env_Data_2$OMSCHRIJVI == "Zware zavel"] <- "Zavel"

# Drop unused levels from the OMSCHRIJVI column
Env_Data_2$OMSCHRIJVI <- droplevels(Env_Data_2$OMSCHRIJVI)

## Splitting the CODE column into 6 columns
Env_Data_2$CODE <- as.character(Env_Data_2$CODE)
CODE_loose <- strsplit(Env_Data_2$CODE, "")

Rijbaan <- sapply(CODE_loose, "[", i = 1)
Rijstrook <- sapply(CODE_loose, "[", i = 2)
Spitsstrook <- sapply(CODE_loose, "[", i = 3)
Oprit <- sapply(CODE_loose, "[", i = 4)
Afrit <- sapply(CODE_loose, "[", i = 5)
Brug_of_Viaduct <- sapply(CODE_loose, "[", i = 6)

CODE_DF <- cbind.data.frame(Rijbaan, Rijstrook, Spitsstrook, Oprit, Afrit, Brug_of_Viaduct)

# Test if all went well: same data length as before?
length(CODE_DF$Rijbaan) == length(Env_Data_2$CODE)

# Add code DF to Env_Data
Env_Data_3 <- cbind(Env_Data_2, CODE_DF)

# Remove original code column
Env_Data_3 <- Env_Data_3[ ,-7]

# Remove all rows with only NA values: If a station does not have a sensor, throw away row for nonexistent sensor
Env_Data_3 <- Env_Data_3[complete.cases(Env_Data_3), ]

## Next, make dummy variables of (transformed) CODE and Soil_Type data
library(dummies)

Env_Data_4 <- dummy.data.frame(Env_Data_3, names = c("OMSCHRIJVI", "Rijbaan", "Rijstrook", "Brug_of_Viaduct"), sep = "_")

colnames(Env_Data_4)[1:10] <- c("MISD", "LAT", "LON", "ALT", "Bebouwing", "Zavel", "Klei", "Veen", "Water", "Zand")

## Remove columns with CODE subset _NA
##Env_Data_4 <- Env_Data_4[ ,-grep("NA$", colnames(Env_Data_4))]

# Make the columns Afrit/Oprit and Spitsstrook integers instead of factors
Env_Data_4[ , c(25:27)] <- sapply(Env_Data_4[ , c(25:27)], as.character)
Env_Data_4[ , c(25:27)] <- sapply(Env_Data_4[ , c(25:27)], as.integer)


## How frequently do certain rijstrook values appear?
table(Env_Data_3$Rijbaan)
table(Env_Data_3$Rijstrook)
table(Env_Data_3$Spitsstrook)
table(Env_Data_3$Oprit)
table(Env_Data_3$Afrit)
table(Env_Data_3$Brug_of_Viaduct)


## As a final step, save the environmental dataset to the computer for later use
# Store as .csv
#write.csv(x = Env_Data_4, file = "F:/KNMI/Env_Data.csv")
write.csv(x = Env_Data_4, file = "/usr/people/kleingel/Projects/MLProject/Env_Data.csv")

# Store as data frame
#save(x = Env_Data_4, file = "F:/KNMI/Env_Data.Rda")
save(x = Env_Data_4, file = "/usr/people/kleingel/Projects/MLProject/Env_Data.Rda")

```

## Optional code

In this script the dummy variables were made using the dummy.data.frame function from the dummies package.
However, it is also possible to use the dummyVars function from the caret package to compute dummy variables.
The advantage of this function is that you can set the parameter fullRank so that the dummy variable trap is evaded.
For more information on the dummy variable trap see: http://amunategui.github.io/dummyVar-Walkthrough/
The dummy variable trap is not too much of a problem because we already perform a PCA.

The code below makes a correlation plot for both the dummy variables made with the dummies package as well as a corrplot for dummy variables computed with the dummyVars package & fullrank parameter.


```{r}
# # Plot correlation
# library(corrplot)
#
# testCorEnv <- Env_Data_4[, -c(1, 11) ]
#
# corEnv <- cor(testCorEnv, use = "complete.obs")
# corrplot(corEnv)
#
# ## Next, make dummy variables of (transformed) CODE and Soil_Type data
# library(caret)
#
# Env_Data_transform <- dummyVars(~. , Env_Data_3, fullRank = TRUE)
# Env_Data_test <- data.frame(predict(Env_Data_transform, newdata = Env_Data_3))
#
#
# corrplot(cor(Env_Data_test, use = "pairwise.complete.obs"))


```