-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRandomForest.py
More file actions
40 lines (29 loc) · 1.6 KB
/
RandomForest.py
File metadata and controls
40 lines (29 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="review_score", outputCol="indexedLabel").fit(s_df2)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
assembler = VectorAssembler(inputCols=['tfidf','sentimentIndex'],outputCol="indexedFeatures")
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = s_df2.randomSplit([0.7, 0.3])
# Train a GBT model.
rf = RandomForestClassifier(labelCol="review_score", featuresCol="indexedFeatures", numTrees=10)
# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[tfidf_pipeline, assembler, rf])
# Train model. This also runs the indexers.
model = pipeline.fit(trainingData)
# Make predictions.
predictions = model.transform(testData)
# Select example rows to display.
predictions.select("prediction", "review_score", "indexedFeatures").show(5)
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
labelCol="review_score", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
gbtModel = model.stages[2]
print(gbtModel) # summary only