Python/machine_learning/k_nearest_neighbours.py at 2ddbfce589affcbdb33bd341b9a1b80e2b93f903 · TheAlgorithms/Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning
algorithm used for classification. Given some labelled training data, a given
point is classified using its k nearest neighbours according to some distance
metric. The most commonly occurring label among the neighbours becomes the label
of the given point. In effect, the label of the given point is decided by a
majority vote.

This implementation uses the Euclidean distance metric by default, and also
supports Manhattan (L1) and Minkowski (Lp) distances.

Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
"""

from collections import Counter
from heapq import nsmallest

import numpy as np


class KNN:
    def __init__(
        self,
        train_data: np.ndarray[float],
        train_target: np.ndarray[int],
        class_labels: list[str],
        *,
        distance_metric: str = "euclidean",
        p: float = 2.0,
    ) -> None:
        """
        Create a kNN classifier using the given training data and class labels

        Parameters
        ----------
        train_data : np.ndarray[float]
            Training features.
        train_target : np.ndarray[int]
            Training labels as integer indices.
        class_labels : list[str]
            Mapping from label index to label name.
        distance_metric : {"euclidean", "manhattan", "minkowski"}
            Distance to use for neighbour search. Defaults to "euclidean".
        p : float
            Power parameter for Minkowski distance (Lp norm). Must be >= 1 when
            distance_metric is "minkowski". Defaults to 2.0.
        """
        # Store a reusable copy; zip() returns an iterator that would be
        # exhausted after one classification otherwise.
        self.data = list(zip(train_data, train_target))
        self.labels = class_labels
        self.distance_metric = distance_metric.lower()
        self.p = float(p)

        if self.distance_metric not in {"euclidean", "manhattan", "minkowski"}:
            msg = (
                "distance_metric must be one of {'euclidean', 'manhattan', 'minkowski'}"
            )
            raise ValueError(msg)
        if self.distance_metric == "minkowski" and self.p < 1:
            msg = "For Minkowski distance, p must be >= 1"
            raise ValueError(msg)

    @staticmethod
    def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
        """
        Calculate the Euclidean distance between two points
        >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
        5.0
        >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
        10.0
        """
        return float(np.linalg.norm(a - b))

    @staticmethod
    def _manhattan_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
        """
        Calculate the Manhattan (L1) distance between two points
        >>> KNN._manhattan_distance(np.array([0, 0]), np.array([3, 4]))
        7.0
        >>> KNN._manhattan_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
        14.0
        """
        return float(np.linalg.norm(a - b, ord=1))

    @staticmethod
    def _minkowski_distance(
        a: np.ndarray[float], b: np.ndarray[float], p: float
    ) -> float:
        """
        Calculate the Minkowski (Lp) distance between two points
        >>> KNN._minkowski_distance(np.array([0, 0]), np.array([3, 4]), 2)
        5.0
        >>> KNN._minkowski_distance(np.array([0, 0]), np.array([3, 4]), 1)
        7.0
        """
        return float(np.linalg.norm(a - b, ord=p))

    def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
        """
        Classify a given point using the kNN algorithm
        >>> train_X = np.array(
        ...     [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
        ... )
        >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
        >>> classes = ['A', 'B']
        >>> knn = KNN(train_X, train_y, classes)
        >>> point = np.array([1.2, 1.2])
        >>> knn.classify(point)
        'A'
        >>> # Manhattan distance yields the same class here
        >>> knn_l1 = KNN(train_X, train_y, classes, distance_metric='manhattan')
        >>> knn_l1.classify(point)
        'A'
        >>> # Minkowski with p=2 equals Euclidean
        >>> knn_lp = KNN(train_X, train_y, classes, distance_metric='minkowski', p=2)
        >>> knn_lp.classify(point)
        'A'
        >>> # Invalid distance metric
        >>> try:
        ...     _ = KNN(train_X, train_y, classes, distance_metric='chebyshev')
        ... except ValueError as e:
        ...     'distance_metric' in str(e)
        True
        >>> # Invalid Minkowski power
        >>> try:
        ...     _ = KNN(train_X, train_y, classes, distance_metric='minkowski', p=0.5)
        ... except ValueError as e:
        ...     'p must be >=' in str(e)
        True
        """
        # Choose the distance function once
        if self.distance_metric == "euclidean":

            def dist_fn(a: np.ndarray[float]) -> float:
                return self._euclidean_distance(a, pred_point)
        elif self.distance_metric == "manhattan":

            def dist_fn(a: np.ndarray[float]) -> float:
                return self._manhattan_distance(a, pred_point)
        else:  # minkowski
            p = self.p

            def dist_fn(a: np.ndarray[float]) -> float:
                return self._minkowski_distance(a, pred_point, p)

        # Distances of all points from the point to be classified
        distances = ((dist_fn(dp), lbl) for dp, lbl in self.data)

        # Choosing k points with the shortest distances
        votes = (i[1] for i in nsmallest(k, distances))

        # Most commonly occurring class is the one into which the point is classified
        result = Counter(votes).most_common(1)[0][0]
        return self.labels[result]


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    # Optional demo using scikit-learn's iris dataset. Kept under __main__ to
    # avoid making scikit-learn a hard dependency for importing this module.
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris()

    X = np.array(iris["data"])
    y = np.array(iris["target"])
    iris_classes = iris["target_names"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    iris_point = np.array([4.4, 3.1, 1.3, 1.4])
    classifier = KNN(X_train, y_train, iris_classes)
    print(classifier.classify(iris_point, k=3))