forked from MariekeDirk/ML_project
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSelect_Days.R
More file actions
149 lines (86 loc) · 5.99 KB
/
Select_Days.R
File metadata and controls
149 lines (86 loc) · 5.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#############################################################################################################
# This is the script in which the GMS data for the skected days for the 2nd and 3rd model
# are put into dataframes with the right input format for the ML scripts
#############################################################################################################
# Empty environment
rm(list=ls())
## Load GMS data files of the relevant days
Col_Classes_1 <- c("integer", "POSIXct", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "factor", "factor", "factor", "numeric", "logical", "logical", "character")
Data_2013_12_03 <- read.csv("/data/project/GMS/data/GMSraw/2013-12-03.csv", colClasses = Col_Classes_1)
Data_2013_12_03$TIMESTAMP <- as.POSIXct(Data_2013_12_03$TIMESTAMP, tz = "GMT")
Data_2016_01_09 <- read.csv("/data/project/GMS/data/GMSraw/2016-01-09.csv", colClasses = Col_Classes_1)
Data_2016_01_09$TIMESTAMP <- as.POSIXct(Data_2016_01_09$TIMESTAMP, tz = "GMT")
Data_2014_12_28 <- read.csv("/data/project/GMS/data/GMSraw/2014-12-28.csv", colClasses = Col_Classes_1)
Data_2014_12_28$TIMESTAMP <- as.POSIXct(Data_2014_12_28$TIMESTAMP, tz = "GMT")
Data_2015_03_24 <- read.csv("/data/project/GMS/data/GMSraw/2015-03-24.csv", colClasses = Col_Classes_1)
Data_2015_03_24$TIMESTAMP <- as.POSIXct(Data_2015_03_24$TIMESTAMP, tz = "GMT")
Data_2015_03_08 <- read.csv("/data/project/GMS/data/GMSraw/2015-03-08.csv", colClasses = Col_Classes_1)
Data_2015_03_08$TIMESTAMP <- as.POSIXct(Data_2015_03_08$TIMESTAMP, tz = "GMT")
Data_2009_11_24 <- read.csv("/data/project/GMS/data/GMSraw/2009-11-24.csv", colClasses = Col_Classes_1)
Data_2009_11_24$TIMESTAMP <- as.POSIXct(Data_2009_11_24$TIMESTAMP, tz = "GMT")
# Make a data frame containing all the selected days
Six_Days <- rbind.data.frame(Data_2009_11_24, Data_2013_12_03, Data_2014_12_28, Data_2015_03_08,
Data_2015_03_24, Data_2016_01_09)
## Drop variables that are not of intrest by specifying columns you wish to keep
Keep <- c( "LOCATION", "TIMESTAMP", "TW_1","TW_2", "TW_3", "TW_4", "TW_5",
"TW_6", "TW_7", "TW_8", "TW_9", "TW_10", "TW_11", "TW_12", "TL", "TD")
Six_Days <- Six_Days[Keep]
## Melt the dataframe
# ID vars are LOCATION and TIMESTAMP, TL & TD
library(reshape2)
Six_Days <- melt(Six_Days, id.vars = c("LOCATION", "TIMESTAMP", "TL", "TD"))
colnames(Six_Days)[5] <- "SENSOR"
colnames(Six_Days)[6] <- c("TEMP")
# Make the sensor data a character column
Six_Days$SENSOR <- as.character(Six_Days$SENSOR)
## Download days with filtered GMS data
Col_Classes_2 <- c("integer", "numeric", "character", "POSIXct", "character")
Data_filtered_09_11_24 <- read.csv("/data/project/GMS/data/GMSfiltered/2009-11-24.csv",
header = FALSE, colClasses = Col_Classes_2)
Data_filtered_16_01_09 <- read.csv("/data/project/GMS/data/GMSfiltered/2016-01-09.csv",
header = FALSE, colClasses = Col_Classes_2)
Data_filtered_14_12_28 <- read.csv("/data/project/GMS/data/GMSfiltered/2014-12-28.csv",
header = FALSE, colClasses = Col_Classes_2)
Data_filtered_15_03_24 <- read.csv("/data/project/GMS/data/GMSfiltered/2015-03-24.csv",
header = FALSE, colClasses = Col_Classes_2)
Data_filtered_15_03_08 <- read.csv("/data/project/GMS/data/GMSfiltered/2015-03-08.csv",
header = FALSE, colClasses = Col_Classes_2)
Data_filtered_13_12_03 <- read.csv("/data/project/GMS/data/GMSfiltered/2013-12-03.csv",
header = FALSE, colClasses = Col_Classes_2)
# Merge all these filtered days into one dataframe
Data_filtered <- rbind.data.frame(Data_filtered_09_11_24, Data_filtered_16_01_09, Data_filtered_14_12_28,
Data_filtered_15_03_24, Data_filtered_15_03_08, Data_filtered_13_12_03)
colnames(Data_filtered) <- c("LOCATION", "TEMP","SENSOR", "TIMESTAMP", "QUALITY")
## Merge GMS Data & Filtered data by common columns
# Because the filtered subset contains no NA's for the sensors the merged data frame also contains no NA's
library(dplyr)
Six_Days_3 <- right_join(x = Six_Days, y = Data_filtered, by = c("LOCATION", "TIMESTAMP", "SENSOR", "TEMP"))
#Add in a Day of Year (DOY) and Hour of Day (HOD) column
library(lubridate)
# Build a DOY column
DOY_Days <- as.numeric(strftime(Six_Days_3$TIMESTAMP, format = "%j"))
# The selected days run from, for example 24-11-09 01:00:00 until 01:00:00 25-11-09 which means that there are 12 unique DOY values for six days of data
length(unique(DOY_Days))
# Build a HOD column
HOD_Days <- hour(Six_Days_3$TIMESTAMP) + minute(Six_Days_3$TIMESTAMP)/60
# Add the columns to the start of the Six_Days_3 data frame
Six_Days_3 <- cbind(DOY_Days, HOD_Days, Six_Days_3)
# Check if there are any 'bad' stations
# (either test stations from RWS or stations which have been moved) in the dataframe.
bad_stations <- c("108", "422", "818", "1015", "1501", "1502", "1503")
for(i in bad_stations){
print(i)
print(any(Six_Days_3$LOCATION == i))
}
# Remove the 'bad' stations
Six_Days_3 <- Six_Days_3[!(Six_Days_3$LOCATION == 108|
Six_Days_3$LOCATION == 422|
Six_Days_3$LOCATION == 818|
Six_Days_3$LOCATION == 1015|
Six_Days_3$LOCATION == 1501|
Six_Days_3$LOCATION == 1502|
Six_Days_3$LOCATION == 1503),]
# Store the GMS six days data frame as .csv
write.csv(x = Six_Days_3, file = "/usr/people/kleingel/Projects/MLProject/Six_Days.csv")
# Store the GMS six days data frame as R data
save(x = Six_Days_3, file = "/usr/people/kleingel/Projects/MLProject/Six_Days.Rda")