# predictive-analysis-course/class6-KNN.R at main · degr8sid-code/predictive-analysis-course · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
?iris
library(tidyverse)
# For train test split
library(caret)
# For KNN model
# install.packages('class')
library(class)
# to create chart in grid
# install.packages('gridExtra')
library(gridExtra)

# Load the dataset ----
data <- iris
view(data)

# Keep a copy of the original species factor (column 5) before that
# column is recoded to numbers below.
row_labels <- data$Species

# Encode the target as numeric: as.numeric() on a factor returns its
# level codes, so each species becomes 1, 2 or 3.
data[["Species"]] <- as.numeric(data[["Species"]])
view(data)

# KNN classifies by distance, so put the four measurements on a common
# scale; otherwise the feature with the largest spread would dominate
# the distance. scale() centres each column and divides it by its
# standard deviation.
# NOTE(review): the centring/scaling statistics are computed from all
# rows, i.e. before the train/test split further down -- confirm this
# is acceptable for the exercise.
data[, 1:4] <- scale(data[, 1:4])
view(data)

# # set seed
# set.seed(1)
# # pick 80% of the rows
# size <- floor(0.8 *  nrow(data))
# # assign it to train_ind as a sample
# train_ind <- sample(seq_len(nrow(data)), size = size)
# # pick labels of the particular training data rows
# train_labels <- data[train_ind, 5]
# # assign actual training data
# data_train <- data[train_ind,1:4]
# # assign testing data
# data_test <- data[-train_ind,1:4]
# data_test_labels <- row_labels[-train_ind]

# Train/test split ----
# The predictors (columns 1:4) and the labels (column 5) are split
# separately because knn() takes them as separate arguments.
# Seed the RNG so the partition below is reproducible.
set.seed(1)

# Draw the row indices of an 80% training sample.
# `Species` has been recoded to numbers by this point, and
# createDataPartition() bins a numeric outcome by quantiles instead of
# stratifying by class, which can merge several species into a single
# stratum. Passing a factor makes caret sample 80% within every class.
train_index <- createDataPartition(factor(data$Species), p = 0.8,
                                   list = FALSE)

# Split the data into training and testing sets.
data_train <- data[train_index, 1:4]
train_labels <- data[train_index, 5]
data_test <- data[-train_index, 1:4]
data_test_labels <- data[-train_index, 5]
# view(data_test_labels)

# Fit KNN model ----
# Classify every test row from its k = 11 nearest training rows.
classifications <- knn(train = data_train,
                       test = data_test,
                       cl = train_labels,
                       k = 11)
view(classifications)
# Show the storage type of the returned predictions.
typeof(classifications)
# Assemble the data frame that feeds the charts: the four test-set
# measurements, the class predicted by KNN, and the true class.
# Columns are named at construction time, so no separate renaming
# step is needed afterwards.
plot_classifications <- data.frame(
  Sepal.Length = data_test$Sepal.Length,
  Sepal.Width = data_test$Sepal.Width,
  Petal.Length = data_test$Petal.Length,
  Petal.Width = data_test$Petal.Width,
  classified = classifications,
  Actual.Values = as.factor(data_test_labels)
)

# Inspect the storage type of the true-label column.
typeof(plot_classifications$Actual.Values)
# Visualize the KNN algorithm results ----
# Every panel centres its title (hjust = 0.5) and hides the legend.
# Titles use an explicit "\n" for the line break: a string literal
# continued on an indented source line would carry that indentation
# into the rendered title.

# p1: predicted class, petal length vs. petal width.
p1 <- ggplot(plot_classifications,
             aes(Petal.Length, Petal.Width,
                 color = classified, fill = classified)) +
  geom_point(alpha = 0.5, size = 5) +
  ggtitle("Classification Result Between Petal Length\nand Width") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

# p2: predicted class, sepal length vs. sepal width.
# The title now names both axes ("Sepal Width" was cut off).
p2 <- ggplot(plot_classifications,
             aes(Sepal.Length, Sepal.Width,
                 color = classified, fill = classified)) +
  geom_point(alpha = 0.5, size = 5) +
  ggtitle("Classification Result between\nSepal Length and Sepal Width") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

# p3: true class, petal length vs. petal width -- the reference
# picture to compare p1 against.
p3 <- ggplot(plot_classifications,
             aes(Petal.Length, Petal.Width,
                 color = Actual.Values)) +
  geom_point(alpha = 0.5, size = 5) +
  ggtitle("Actual Value Between Petal Length\nand Width") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

# Draw all panels in a two-column grid. p3 is included so the
# actual-values chart built above is actually displayed.
grid.arrange(p1, p2, p3, ncol = 2)