forked from MariekeDirk/ML_project
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTest_merge_join.R
More file actions
80 lines (22 loc) · 1.22 KB
/
Test_merge_join.R
File metadata and controls
80 lines (22 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
##########################################################################################################
# Here we test which is faster: merge or inner_join
# This is especially relevant for merging the environmental and GMS data
##########################################################################################################
# Benchmark script: times base::merge() against dplyr::inner_join() when
# combining the 10-minute measurements with the environmental data.
#
# NOTE: the workspace is intentionally no longer wiped with rm(list = ls());
# run this script in a fresh R session so it cannot delete unrelated objects.
# install.packages("dplyr")
library(dplyr)

# Directory that holds the .Rda input files
data_dir <- "/usr/people/kleingel/Projects/MLProject"

# Load the 10-minute measurements (the code below expects this to
# provide `Data_10min`)
load(file.path(data_dir, "Data_10min.Rda"))

# Keep only valid measurements. which() discards rows whose QUALITY is NA;
# a bare logical index would turn each of those into an all-NA row, which
# would then take part in the joins below with NA keys.
Data_10min <- Data_10min[which(Data_10min$QUALITY == "valid"), ]

# Drop the quality column by name rather than by position, so a change in
# column order cannot remove the wrong column
Data_10min[["QUALITY"]] <- NULL

# Load the environmental data (the code below expects this to
# provide `Env_Data_4`)
load(file.path(data_dir, "Env_Data.Rda"))

# Time the base R merge of measurements and environmental data
system.time(
  data_GMS <- merge(
    Data_10min, Env_Data_4,
    by.x = c("LOCATION", "SENSOR"),
    by.y = c("MISD", "SENSOR")
  )
)

# Time the equivalent dplyr inner join on the same key pairs
system.time(
  data_GMS_2 <- inner_join(
    x = Data_10min, y = Env_Data_4,
    by = c("LOCATION" = "MISD", "SENSOR" = "SENSOR")
  )
)

# The original author noted that the dplyr join appeared to give very many
# rows; report both row counts so the two results can be compared directly.
message("rows from merge():      ", nrow(data_GMS))
message("rows from inner_join(): ", nrow(data_GMS_2))