Random-Forest/Random Forest.Rmd at master · devan-github/Random-Forest · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
---
title: "Random Forest"
author: "Devan Goto"
date: "October 9, 2017"
output: html_document
---

```{r}
# 0. Import the data into the notebook/environment.

# The file has no header row, so columns arrive as V1..V23 and are renamed
# later. stringsAsFactors = TRUE keeps every attribute a factor on R >= 4.0
# too (where the default became FALSE); randomForest() only performs
# classification when the response is a factor.
a1 <- read.csv(
  "agaricus-lepiota.data.txt",
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)

# View() opens a GUI data viewer, so only call it in an interactive session;
# a non-interactive knit has no viewer to open.
if (interactive()) {
  View(a1)
}

# Work on a copy so the raw import stays untouched.
a2 <- a1

```

```{r}
# 1. Read the file and check for any null values

# This check counts "?" entries in the stalk-root attribute (column 12).
# The descriptive column names are only assigned in a later chunk, so the
# column is addressed by position: at this point `a2$stalk.root` does not
# exist yet (it is NULL) and the count would silently come out as 0.
sum(a2[[12]] == "?", na.rm = TRUE)

```

```{r}
# 2. Name the 23 columns following the attribute list published at
#    https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/
#    (the class label comes first, then the 22 mushroom attributes).

colnames(a2)[1:23] <- c(
  "edible.poisen",
  "cap.shape",
  "cap.surface",
  "cap.color",
  "bruises",
  "odor",
  "gill.attachment",
  "gill.spacing",
  "gill.size",
  "gill.color",
  "stalk.shape",
  "stalk.root",
  "stalk.surface.above.ring",
  "stalk.surface.below.ring",
  "stalk.color.above.ring",
  "stalk.color.below.ring",
  "veil.type",
  "veil.color",
  "ring.number",
  "ring.type",
  "spore.print.color",
  "population",
  "habitat"
)

```

```{r}
# 3. Print the structure of the data

# str() writes its report to the console itself and returns NULL, so
# wrapping it in print() only appends a stray "NULL" to the output.
str(a2)

```

```{r}
# 4. Show the first rows of the data set (head() returns 6 rows by default;
#    the count is spelled out here for clarity).

print(head(a2, n = 6L))

```

```{r}
# 5. Using a core tidyverse package, try to fit the tibble in the screen

# Install only when the package is missing, so knitting the document does
# not re-download and reinstall the tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# A tibble prints only the rows and columns that fit the console width.
as_tibble(a2)

```

```{r}
# 6. Using a core tidyverse package, try to add row names as well

a3 <- a2

# Creates a new column that numbers each instance (mushroom) from 1 to n.
# seq_len() is used instead of 1:nrow() so an empty data frame yields an
# empty id column rather than c(1, 0).
a3$rowname <- seq_len(nrow(a3))

# Moves the rowname column to the front. Columns are selected by name, so
# the reordering keeps working if the number of attributes ever changes.
a3 <- a3[c("rowname", base::setdiff(names(a3), "rowname"))]

```

```{r}
# 7. Print the all columns for which class is edible

# Keep every column, but only the rows whose class label is "e" (edible).
# The class column is named `edible.poisen` (with a dot); the previous
# reference to `edible-poisen` matched no column, evaluated to NULL and
# produced an empty subset.
edible1 <- subset(a2, edible.poisen == "e")

```

```{r}
# 8. Split the data into training and test sets of 80 - 20 distribution

# Fix the RNG seed so the split (and everything fitted on it) is
# reproducible between runs of the notebook.
set.seed(2017)

# The split is done on row positions. A set difference on the data frames
# themselves compares whole rows by value and removes duplicates, so it is
# not a reliable complement when rows carry no unique id (as in `edible1`).

# training and testing for whole dataset
train_rows <- sample(seq_len(nrow(a3)), size = round(0.8 * nrow(a3)))
training <- a3[train_rows, ]
testing <- a3[!(seq_len(nrow(a3)) %in% train_rows), ]

# training and testing for only edible items
edible_train_rows <- sample(
  seq_len(nrow(edible1)),
  size = round(0.8 * nrow(edible1))
)
edible.training <- edible1[edible_train_rows, ]
edible.testing <- edible1[!(seq_len(nrow(edible1)) %in% edible_train_rows), ]

```

```{r}
# 9. Use random forest on the train set and predict the top 10 variables of
#    importance. Try the same on test set. Do the results vary?

# What:
## An ensemble learning method for classification and regression.
## Operates by constructing a multitude of decision trees.

# Why use Random Forest:
## Reasonably fast and very easy to use.
## Handles sparse data/missing data well.
## Reduces over-fitting compared with a single decision tree.

# How:
## Tree bagging - random sample with replacement.
## Random subset of the features.
## Voting: the algorithm creates multiple decision trees and chooses the
## outcome by a vote across those trees.

# Install only when missing, so the package is not reinstalled on each knit.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library("randomForest")

# Prepare a data frame for randomForest():
#  * drop the synthetic `rowname` id, which is a row counter and not a
#    mushroom attribute, so it must not be offered as a predictor;
#  * turn character/factor columns into factors with unused levels dropped,
#    so the response is a factor (classification rather than regression)
#    and has no empty classes.
prepare_rf_data <- function(df) {
  df <- df[, names(df) != "rowname", drop = FALSE]
  is_categorical <- vapply(
    df,
    function(column) is.character(column) || is.factor(column),
    logical(1)
  )
  df[is_categorical] <- lapply(df[is_categorical], factor)
  df
}

# for training
rf.edible <- randomForest(edible.poisen ~ ., data = prepare_rf_data(training))

# see overview info on randomForest
print(rf.edible)

# the higher the value the more important. useful for feature selection.
importance(rf.edible)

# see the first tree of the forest (k = 1), with split variables labelled
getTree(rf.edible, k = 1, labelVar = TRUE)

# for testing: fit a second forest on the test rows to compare importances
rf.edible2 <- randomForest(edible.poisen ~ ., data = prepare_rf_data(testing))
print(rf.edible2)
importance(rf.edible2)
getTree(rf.edible2, k = 1, labelVar = TRUE)

```

```{r}
# 10. Print the top 10 variables columns for which class is edible

# varImpPlot() is part of the randomForest package; there is no separate
# "varimplot" package to install, so only randomForest is attached here.
library("randomForest")

# Shows top 10 Variables in Training & Testing. n.var is capped at the
# number of available variables so the call also works for small models.
varImpPlot(
  rf.edible,
  sort = TRUE,
  main = "Top 10 Variables Training",
  n.var = min(10, nrow(importance(rf.edible)))
)
varImpPlot(
  rf.edible2,
  sort = TRUE,
  main = "Top 10 Variables Testing",
  n.var = min(10, nrow(importance(rf.edible2)))
)

# Stores each variable's importance as a data frame. Both calls request
# type = 2 (mean decrease in node impurity) so the two tables are
# directly comparable.
v.train.importance <- data.frame(importance(rf.edible, type = 2))
v.train.importance2 <- data.frame(importance(rf.edible2, type = 2))

```