-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPrincipalComponentAnalysis.R
More file actions
82 lines (70 loc) · 2.57 KB
/
PrincipalComponentAnalysis.R
File metadata and controls
82 lines (70 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
### Get the data into R session
# dplyr supplies `%>%` and select(); attach it before their first use below.
library(dplyr)

# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 too, so
# pairs() below can convert them to numeric instead of stopping on character
# columns.
# NOTE(review): assumes the CSVs contain text columns (city, state, ...) --
# confirm against the data.
train <- read.csv("train.csv", stringsAsFactors = TRUE)
str(train)
pairs(train)
names(train)
View(train)

test <- read.csv("test.csv", stringsAsFactors = TRUE)
# Add a placeholder total_sales column to test; it is overwritten with the
# model predictions further down.
test$total_sales <- 0
names(test)
View(test)

# Baseline model: regress total_sales on every other training column.
# The predictors go into their own data frame so that `train` keeps its
# total_sales column, which rbind() below and the later data-preparation
# step both rely on.
sales.total <- train$total_sales
train.predictors <- train %>% select(-total_sales)
model.glm <- glm(sales.total ~ ., data = train.predictors)

# predict() takes `newdata`; an argument named `data` is silently ignored and
# the fitted values for the training rows are returned instead.
test$total_sales <- predict(model.glm, newdata = test)

# Combine both data (train and test now share the same set of columns)
data.combined <- rbind(train, test)

# Inspect the predictor-only view of the training data
str(train.predictors)
pairs(train.predictors)
names(train.predictors)
View(train.predictors)
## Data preparation
library(dplyr)

# Change credit_score_range to integer
# NOTE(review): on a factor this yields the level codes; on non-numeric text
# it yields NA with a warning -- confirm which encoding is intended.
train$credit_score_range <- as.integer(train$credit_score_range)

# Generate IDs for factor variable: every value is replaced by the position of
# its first occurrence among the column's unique values.
train.modified <- mutate(
  train,
  cityId = match(city, unique(city)),
  stateID = match(state, unique(state)),
  zipID = match(zip, unique(zip)),
  store_location_id = match(store_location, unique(store_location)),
  time_zone_id = match(time_zone, unique(time_zone)),
  location_employee_code_id = match(
    location_employee_code, unique(location_employee_code)
  ),
  credit_score_id = match(credit_score, unique(credit_score))
)

# Capture the target variable. total_sales may already have been split off
# into `sales.total` earlier in the script; fall back to that copy when the
# column is no longer part of the data frame.
target_total_sales <- train.modified[["total_sales"]]
if (is.null(target_total_sales)) {
  target_total_sales <- sales.total
}

# Remove factor variables from training set
train.modified <- train.modified %>%
  select(-c(city, state, time_zone, location_employee_code,
            credit_score, store_location))
# Assigning NULL drops total_sales when present and is a no-op otherwise.
train.modified$total_sales <- NULL

str(train.modified)
names(train.modified)
View(train.modified)
# ## Normalizing function
# normalise <- function(x) {
# (x - mean(x))/sd(x)
# }
#
# ##Normalise the data set
# train.modified<- sapply(train.modified, normalise, simplify = TRUE, USE.NAMES = TRUE)
# View(train.modified)
## PCA
# Principal components of the prepared training data, computed on the
# correlation matrix (cor = TRUE). `tol` is an argument of prcomp(), not
# princomp(), so it is not passed here.
pca1 <- princomp(train.modified, cor = TRUE, scores = TRUE)
summary(pca1)
loadings(pca1)
plot(pca1, type = "l")
biplot(pca1)

# Show the scores of the first 15 observations on every component. Indexing
# the score matrix as pca1$scores[1:15] would return 15 values of the first
# column only.
head(pca1$scores, 15)

# Factor analysis with 7 varimax-rotated factors, for comparison with the PCA
fa1 <- factanal(
  train.modified,
  factors = 7,
  rotation = "varimax",
  scores = "regression"
)
fa1$loadings

# Keep the first seven component scores as predictors for a later model
pc1 <- pca1$scores[, 1]
pc2 <- pca1$scores[, 2]
pc3 <- pca1$scores[, 3]
pc4 <- pca1$scores[, 4]
pc5 <- pca1$scores[, 5]
pc6 <- pca1$scores[, 6]
pc7 <- pca1$scores[, 7]
pc_df <- data.frame(pc1, pc2, pc3, pc4, pc5, pc6, pc7)
## Make prediction
##model1<- lm(target_total_sales~., data = pc_df)