MachineLearning-Python/week1-assignment.py at master · SaraBenShabbat/MachineLearning-Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
# coding: utf-8
"""
Created on Mon Apr 27 16:15:25 2020

@author: Sara Ben Shabbat
week1-assignment.py - 'Applied Machine Learning' - University of Michigan online course.
The week 1 assignment.
"""

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection  import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Loaded once at import time; answer_one() reads its `data`, `feature_names`
# and `target` attributes to build the DataFrame used by every other answer.
cancer = load_breast_cancer()


# Question 1
# Convert the sklearn.dataset `cancer` to a DataFrame.
def answer_one():
    """Convert the module-level `cancer` dataset into a DataFrame.

    Returns:
        A DataFrame with one column per entry of `cancer.feature_names`
        (filled from `cancer.data`), followed by a final `target` column
        holding `cancer.target`.
    """
    features = pd.DataFrame(cancer.data, columns=cancer.feature_names)
    # assign() appends the new column after the existing feature columns.
    return features.assign(target=cancer.target)


# Question 2
# What is the class distribution? (i.e. how many instances of `malignant` (encoded 0) and how many `benign` (encoded 1)?)
def answer_two():
    """Count how many samples fall into each class.

    Returns:
        A Series indexed by 'malignant' and 'benign' (in that order) with
        the number of rows whose `target` is 0 and 1 respectively.
    """
    labels = answer_one()['target']

    # A boolean mask has one True per matching row, so counting its
    # non-zero entries gives the class size.
    malignant_count = np.count_nonzero(labels == 0.0)
    benign_count = np.count_nonzero(labels == 1.0)

    return pd.Series([malignant_count, benign_count],
                     index=['malignant', 'benign'])


# Question 3
# Split the DataFrame into `X` (the data) and `y` (the labels).
def answer_three():
    """Split the DataFrame from answer_one() into features and labels.

    Returns:
        A tuple `(X, y)` where `X` holds every column except the last one
        and `y` is the `target` column.
    """
    frame = answer_one()

    # Everything but the final column is treated as a feature.
    feature_columns = frame.columns[:-1]

    return frame[feature_columns], frame['target']

# Question 4
# Using `train_test_split`, split `X` and `y` into training and test sets `(X_train, X_test, y_train, and y_test)`.
def answer_four():
    """Split the features and labels into training and test sets.

    Returns:
        A tuple `(X_train, X_test, y_train, y_test)` produced by
        `train_test_split` with `random_state=0`.
    """
    features, labels = answer_three()

    # train_test_split yields the four pieces in the order documented above;
    # tuple() keeps the return type the same as an explicit 4-name return.
    return tuple(train_test_split(features, labels, random_state=0))


# Question 5
# Using KNeighborsClassifier, fit a k-nearest neighbors (knn) classifier with `X_train`, `y_train` and using one nearest neighbor (`n_neighbors = 1`).
def answer_five():
    """Fit a one-nearest-neighbor classifier on the training split.

    Returns:
        The fitted `KNeighborsClassifier` (`n_neighbors=1`).
    """
    # Only the training half of the split is needed here.
    train_features, _, train_labels, _ = answer_four()

    # fit() returns the estimator itself, so the fitted model can be
    # returned directly.
    return KNeighborsClassifier(n_neighbors=1).fit(train_features, train_labels)


# Question 6
# Using your knn classifier, predict the class label using the mean value for each feature.
def answer_six():
    """Predict the class of a synthetic sample made of per-feature means.

    The classifier from answer_five() is fitted on a DataFrame, so the query
    is passed as a one-row DataFrame with the same column names rather than a
    bare NumPy array. Newer scikit-learn releases warn about missing feature
    names when an estimator fitted with named columns is queried with an
    unnamed array; older releases accept either form.

    Returns:
        The array returned by `knn.predict` for the single mean sample.
    """
    cancerdf = answer_one()

    # Average only the feature columns (everything but the last column),
    # then turn the resulting Series into a single-row frame whose columns
    # are the feature names.
    feature_means = cancerdf.iloc[:, :-1].mean()
    mean_sample = feature_means.to_frame().T

    knn = answer_five()

    return knn.predict(mean_sample)


# Question 7
# Using your knn classifier, predict the class labels for the test set `X_test`.
def answer_seven():
    """Predict class labels for every sample in the test split.

    Returns:
        The result of calling `predict` on the classifier from
        answer_five() with `X_test` from answer_four().
    """
    # Only the test features are needed from the split.
    _, test_features, _, _ = answer_four()
    classifier = answer_five()

    return classifier.predict(test_features)


# Question 8
# Find the score (mean accuracy) of your knn classifier using `X_test` and `y_test`.
def answer_eight():
    """Score the classifier from answer_five() on the test split.

    Returns:
        The value of `knn.score(X_test, y_test)` (described in the
        assignment as the mean accuracy).
    """
    _, test_features, _, test_labels = answer_four()
    classifier = answer_five()

    return classifier.score(test_features, test_labels)