-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathModelCreation.py
More file actions
104 lines (66 loc) · 2.61 KB
/
ModelCreation.py
File metadata and controls
104 lines (66 loc) · 2.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
# coding: utf-8

# ModelCreation.py
# Trains a RandomForest classifier on an Azure ML registered dataset,
# logs the accuracy metric to an Azure ML experiment run, and saves the
# training-time dummy-column layout plus the fitted model for deployment.

# In[1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from azureml.core import Workspace, Dataset, Experiment

print('Accessing the workspace....')
ws = Workspace.from_config("./config")

print('Accessing the dataset....')
az_dataset = Dataset.get_by_name(ws, 'Deploydataset')

# -----------------------------------------------------
# Create/Access an experiment object
# ----------------------------------------------------
print('Accessing/Creating the experiment...')
experiment = Experiment(workspace=ws, name='Webservice-exp001')

# -----------------------------------------------------
# Run an experiment using start_logging method
# -----------------------------------------------------
print('Start Experiment using Start Logging method...')
new_run = experiment.start_logging()

# --------------------------------------------------------
# Do your stuff here
# --------------------------------------------------------
# Load the data from the registered dataset into pandas
print('Loading the dataset to pandas dataframe...')
df = az_dataset.to_pandas_dataframe()

# Create X and Y Variables: every column but the last is a feature,
# the last column is the label.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1:]

# One-hot encode the categorical feature columns
X = pd.get_dummies(X)

# Remember the exact encoded column set so the scoring script can
# re-align incoming data to the training-time layout.
train_enc_cols = X.columns

# Transform the categorical label column to a single indicator column
# (last dummy column is taken as the positive class).
Y = pd.get_dummies(Y)
Y = Y.iloc[:, -1]

# Split Data - stratified so train and test keep the class balance
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1234, stratify=Y)

# Build the Random Forest model (fixed seed for reproducibility)
rfc = RandomForestClassifier(random_state=1234)

# Fit the data to the Random Forest object - Train Model
trained_model = rfc.fit(X_train, Y_train)

# Predict the outcome using Test data - Score Model
Y_predict = rfc.predict(X_test)

# Get the probability score of the positive class - Scored Probabilities
Y_prob = rfc.predict_proba(X_test)[:, 1]

# Get Confusion matrix and the accuracy/score - Evaluate
cm = confusion_matrix(Y_test, Y_predict)
score = rfc.score(X_test, Y_test)

# Always log the primary metric
new_run.log("accuracy", score)

# In[2]:
# -------------------------------------------------------
# Save all the transformations and models
# -------------------------------------------------------
# Fix: joblib.dump raises FileNotFoundError if './outputs' does not
# already exist (e.g. on a fresh local checkout) — create it first.
os.makedirs('./outputs', exist_ok=True)

model_file = './outputs/models.pkl'
# Persist the encoder column index together with the trained model so
# deployment can reconstruct the feature layout.
joblib.dump(value=[train_enc_cols, trained_model],
            filename=model_file)

# Complete the run
new_run.complete()

# Get the Run IDs from the experiment
list(experiment.get_runs())

# In[ ]: