import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt
from typing import Tuple, Dict, List
import numpy as np
from functools import reduce
from scipy.stats import mode
def print_tabulate(df: pd.DataFrame):
    """Print ``df`` to stdout as a table in ``tabulate``'s "orgtbl" format.

    The DataFrame's column names are used as the table headers.
    """
    rendered = tabulate(df, headers=df.columns, tablefmt="orgtbl")
    print(rendered)
def normalize_distribution(dist: np.ndarray, n: float) -> np.ndarray:
    """Shift and rescale ``dist`` so that its values total roughly ``n``.

    The values are shifted so the minimum becomes a tiny positive number,
    divided by their sum, multiplied by ``n`` and finally rounded to the
    nearest integer value.

    Args:
        dist: Sample values; any array-like of numbers is accepted.
        n: Target total of the rescaled values (before rounding).

    Returns:
        A float array with the same shape as ``dist`` holding the rounded,
        rescaled values. An empty input yields an empty array.
    """
    values = np.asarray(dist, dtype=float)
    if values.size == 0:
        # Nothing to rescale; avoid reducing over an empty array.
        return values.copy()
    # The small offset keeps the sum strictly positive, so the division
    # below is safe even when every input value is identical.
    shifted = values - values.min() + 0.000001
    scaled = (shifted / shifted.sum()) * n
    return np.round(scaled)
def create_distribution(mean: float, size: int) -> np.ndarray:
    """Generate ``size`` random values whose average is roughly ``mean``.

    Standard-normal samples are drawn from NumPy's global random state and
    passed to ``normalize_distribution`` with a target total of
    ``mean * size``.

    Args:
        mean: Desired average of the generated values.
        size: Number of values to generate.

    Returns:
        The array produced by ``normalize_distribution`` (a NumPy array,
        not a ``pd.Series``).
    """
    samples = np.random.standard_normal(size)
    return normalize_distribution(samples, mean * size)
def generate_df(means: List[Tuple[float, float, str]], n: int) -> pd.DataFrame:
    """Build a labelled point cloud with ``n`` random points per group.

    Args:
        means: One ``(mean_x, mean_y, label)`` tuple per group.
        n: Number of points generated for each group.

    Returns:
        A DataFrame with columns ``"x"``, ``"y"`` and ``"label"`` holding
        the groups stacked in the order given by ``means``. With no groups
        the DataFrame is empty but still has the three columns.
    """
    xs = []
    ys = []
    labels = []
    for mean_x, mean_y, label in means:
        # Draw x before y for each group, one group at a time.
        xs.append(create_distribution(mean_x, n))
        ys.append(create_distribution(mean_y, n))
        labels.append(np.repeat(label, n))

    if not xs:
        # np.concatenate rejects an empty sequence; return empty columns.
        return pd.DataFrame(
            {"x": np.array([]), "y": np.array([]), "label": np.array([])}
        )

    # Concatenate once at the end instead of re-copying on every iteration,
    # and without seeding the string label column with a float array.
    return pd.DataFrame(
        {
            "x": np.concatenate(xs),
            "y": np.concatenate(ys),
            "label": np.concatenate(labels),
        }
    )
def get_cmap(n, name="hsv"):
    """Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name."""
    # plt.cm.get_cmap was deprecated in Matplotlib 3.7; pyplot.get_cmap takes
    # the same (name, lut) arguments and resamples the colormap to n entries.
    return plt.get_cmap(name, n)
def scatter_group_by(
    file_path: str, df: pd.DataFrame, x_column: str, y_column: str, label_column: str
):
    """Save a scatter plot of ``df`` with one colour per distinct label.

    Args:
        file_path: Destination path passed to ``savefig``.
        df: Data to plot.
        x_column: Name of the column used for the horizontal axis.
        y_column: Name of the column used for the vertical axis.
        label_column: Name of the column whose distinct values define the
            groups (and the legend entries).
    """
    fig, ax = plt.subplots()
    try:
        labels = pd.unique(df[label_column])
        # NOTE(review): the extra colour is presumably there so the last
        # group does not reuse the first colour of a cyclic map such as
        # "hsv" -- confirm.
        cmap = get_cmap(len(labels) + 1)
        for i, label in enumerate(labels):
            # A boolean mask works for any label value or column name,
            # unlike a query string built by interpolation.
            group = df[df[label_column] == label]
            ax.scatter(group[x_column], group[y_column], label=label, color=cmap(i))
        ax.legend()
        fig.savefig(file_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
def euclidean_distance(p_1: np.array, p_2: np.array) -> float:
    """Return the straight-line (L2) distance between ``p_1`` and ``p_2``."""
    delta = p_2 - p_1
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)
def k_nearest_neightbors(
    points: List[np.ndarray], labels: List, input_data: List[np.ndarray], k: int
) -> List:
    """Classify each query point by majority vote of its ``k`` nearest points.

    Distances are Euclidean. When several labels tie for the most votes,
    the label that appears first in ``labels`` wins.

    Args:
        points: Known (training) points, one array per point.
        labels: Label of each entry in ``points``; labels must be hashable.
        input_data: Query points to classify.
        k: Number of neighbours that vote; if it exceeds ``len(points)``,
            every point votes.

    Returns:
        A list with one predicted label per entry of ``input_data``.

    Raises:
        ValueError: If ``k`` is less than 1, ``points`` is empty, or
            ``points`` and ``labels`` differ in length.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    label_list = list(labels)
    if len(points) == 0:
        raise ValueError("points must not be empty")
    if len(points) != len(label_list):
        raise ValueError("points and labels must have the same length")
    if len(input_data) == 0:
        return []

    # Number the labels in order of first appearance so that ties in the
    # vote resolve towards the label seen first.
    unique_labels = list(dict.fromkeys(label_list))
    label_indices = {label: index for index, label in enumerate(unique_labels)}
    codes = np.array([label_indices[label] for label in label_list])

    # One row per point; reshape so scalar points become 1-feature rows.
    train = np.asarray(points, dtype=float)
    train = train.reshape(len(train), -1)
    queries = np.asarray(input_data, dtype=float)
    queries = queries.reshape(len(queries), -1)

    # distances[i, j] is the distance from query i to training point j.
    diffs = train[np.newaxis, :, :] - queries[:, np.newaxis, :]
    distances = np.sqrt(np.sum(diffs ** 2, axis=-1))

    predictions = []
    for row in distances:
        nearest = np.argsort(row)[:k]
        votes = np.bincount(codes[nearest], minlength=len(unique_labels))
        # argmax returns the first maximum, i.e. the earliest-seen label.
        predictions.append(unique_labels[int(np.argmax(votes))])
    return predictions
# Demo: build three labelled point clouds, plot them, then classify a few
# new points with k-nearest neighbours.
groups = [(20, 20, "grupo1"), (100, 50, "grupo2"), (200, 430, "grupo3")]
df = generate_df(groups, 50)
scatter_group_by("img/groups.png", df, "x", "y", "label")

# Each row of df is (x, y, label). Keep BOTH coordinates as the feature
# vector; slicing [0:1] would keep only x and the distance computation would
# silently broadcast it against the 2-D query points.
list_t = [
    (np.array(tuples[0:2]), tuples[2])
    for tuples in df.itertuples(index=False, name=None)
]
# Despite its name, input_data holds the known points handed to the
# classifier as its ``points`` argument; new_points are the queries.
input_data = [point for point, _ in list_t]
labels = [label for _, label in list_t]
new_points = [
    np.array([100, 150]),
    np.array([1, 1]),
    np.array([1, 300]),
    np.array([80, 40]),
    np.array([400, 400]),
]  # query points as (x, y), matching the feature order above
kn = k_nearest_neightbors(
    input_data,
    labels,
    new_points,
    5,
)
print(f"{new_points}, {kn}")

# Console output captured from a run of the original script, i.e. while the
# features were still sliced with [0:1]. Kept for reference only; the data is
# random, so results vary between runs.
#
# /tmp/babel-glUvxR/python-uVLGKe:42: MatplotlibDeprecationWarning: The
#   get_cmap function was deprecated in Matplotlib 3.7 and will be removed two
#   minor releases later. Use `matplotlib.colormaps[name]` or
#   `matplotlib.colormaps.get_cmap(obj)` instead.
#     return plt.cm.get_cmap(name, n)
# /tmp/babel-glUvxR/python-uVLGKe:67: FutureWarning: unique with argument that
#   is not not a Series, Index, ExtensionArray, or np.ndarray is deprecated and
#   will raise in a future version.
#     for index,label in enumerate(pd.unique(labels))}
# [array([100, 150]), array([1, 1]), array([ 1, 300]), array([80, 40]),
#  array([400, 400])], ['grupo2', 'grupo1', 'grupo3', 'grupo2', 'grupo3']