-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreal_state.py
More file actions
59 lines (43 loc) · 2.1 KB
/
real_state.py
File metadata and controls
59 lines (43 loc) · 2.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from __future__ import print_function
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
if __name__ == '__main__':
    # Script entry point: train a depth-2 decision tree regressor on the
    # real estate CSV and print one (prediction, actual price) tuple for
    # every row of the held-out test split.

    # Create a SparkSession
    spark = SparkSession.builder.appName('DecisionTree').getOrCreate()

    # Everything that uses the session runs inside try/finally so the
    # session is stopped even when reading, fitting or predicting fails.
    try:
        # Read the file as a DataFrame: the first line is treated as the
        # header and column types are inferred from the data.
        csv_path = (
            'file:///Users/brian/code/from_courses/SparkCourse/realestate.csv'
        )
        data = (
            spark.read
            .option('header', 'true')
            .option('inferSchema', 'true')
            .csv(csv_path)
        )

        # The vector assembler packs the three input columns into a single
        # 'features' vector column, which is what the regressor consumes.
        assembler = VectorAssembler(
            inputCols=['HouseAge', 'DistanceToMRT', 'NumberConvenienceStores'],
            outputCol='features',
        )

        # Transform the data using the assembler and keep only the
        # required fields (field to predict, features).
        df = assembler.transform(data).select('PriceOfUnitArea', 'features')

        # Split our data into training data and testing data (50/50).
        # No seed is passed, so the split differs from run to run.
        training_df, test_df = df.randomSplit([0.5, 0.5])

        # Create our Decision Tree Regressor, pointing it at our feature
        # and label columns instead of the defaults.
        dt = DecisionTreeRegressor(
            featuresCol='features',
            labelCol='PriceOfUnitArea',
            maxDepth=2,
        )

        # Train the model using our training data
        model = dt.fit(training_df)

        # Generate predictions for every row of the test DataFrame and pull
        # the prediction together with the known correct label in a single
        # pass. Selecting both columns at once keeps each prediction paired
        # with its own label without zipping two separately derived RDDs.
        prediction_and_label = (
            model.transform(test_df)
            .select('prediction', 'PriceOfUnitArea')
            .collect()
        )

        # Print out the predicted and actual values for each Price of Unit
        # Area as a plain (prediction, label) tuple.
        for row in prediction_and_label:
            print(tuple(row))
    finally:
        # Stop the session
        spark.stop()