-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathModelling Coding Challenge.py
More file actions
102 lines (67 loc) · 2.79 KB
/
Modelling Coding Challenge.py
File metadata and controls
102 lines (67 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('expand_frame_repr', False)
import numpy as np
import matplotlib.pyplot as plt
# read train and test set
train = pd.read_csv("C:/Users/sivac/Documents/Python Projects/Introduction to Data Science Course/Data Files/Predictive Modeling and Machine Learning/train_titanic.csv")
test = pd.read_csv("C:/Users/sivac/Documents/Python Projects/Introduction to Data Science Course/Data Files/Predictive Modeling and Machine Learning/test_titanic.csv")
# combining train and test dataset
df = (train.append(test, ignore_index=True)).reset_index(drop=True)
# check missing values
df.isnull().sum()
# remove Cabin variable
df.drop('Cabin', axis=1, inplace=True)
# fill missing values in Age and Embarked variables
df['Age'].fillna(df['Age'].median(), inplace = True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace = True)
# check missing values again
df.isnull().sum()
# remove Ticket and Name variables
df.drop(['Ticket', 'Name', 'PassengerId'], axis=1, inplace=True)
from sklearn.preprocessing import LabelEncoder
# label enconding
label = LabelEncoder()
# label encode Embarked and Sex variables
df['Embarked_num'] = label.fit_transform(df['Embarked'])
df['Sex_num'] = label.fit_transform(df['Sex'])
# drop Embarked and Sex variables
df = df.drop(['Sex', 'Embarked'], axis = 1)
# see description of the dataframe df
df.describe()
df['Parch'].value_counts()
df['Pclass'].value_counts()
df['Sex_num'].value_counts()
df['Age'].plot.hist()
df['Age'].plot.box()
df['Fare'].plot.hist()
# scatter plot between Age and Fare
df.plot.scatter('Age', 'Fare')
# bar plot between Pclass and mean Age
df.groupby('Pclass')['Age'].mean().plot.bar()
# bar plot between Pclass and mean Fare
df.groupby('Pclass')['Fare'].mean().plot.bar()
train = df[:len(train)]
test = df[len(train):]
true_val = test['Survived']
# delete Survived variable from test
test.drop(['Survived'], axis=1, inplace=True)
# remove rows where Fare is 400 or above
train = train[train['Fare']<400]
# Replace the outliers in Fare with its mean. The outliers are approximately above the value 62.
train.loc[train['Fare']> 62, 'Fare'] = np.mean(train['Fare'])
# Replace the outliers in Age variable with its median. The outliers are approximately above the value 55.
train.loc[train['Age']> 55, 'Age'] = np.median(train['Age'])
# remove Survived from train
xtrain = train.drop('Survived', axis = 1)
ytrain = train['Survived']
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression(n_jobs=1, multi_class='ovr', solver='liblinear')
# fit the logistic regression model lreg
lreg.fit(xtrain, ytrain)
# make prediction on the test dataset
pred = lreg.predict(test)
# evaluate the model lreg
lreg.score(test, true_val)
# 0.81142857142857139