Classification

code

import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt
from typing import Tuple, Dict, List
import numpy as np
from functools import reduce
from scipy.stats import mode


def print_tabulate(df: pd.DataFrame):
    """Print ``df`` to standard output as an org-mode style table.

    The frame's column names are used as the table headers.
    """
    rendered = tabulate(df, headers=df.columns, tablefmt="orgtbl")
    print(rendered)


def normalize_distribution(dist: np.array, n: int) -> np.array:
    """Rescale ``dist`` to non-negative, rounded values that add up to about ``n``.

    The values are shifted so the minimum sits just above zero, converted to
    fractions of their total, scaled by ``n`` and rounded element-wise.
    Because of the rounding the result may not sum to exactly ``n``.
    """
    # The small offset keeps every entry strictly positive, so the total
    # used as the divisor below cannot be zero.
    shifted = dist - min(dist) + 0.000001
    fractions = shifted / np.sum(shifted)
    return np.round(fractions * n)


def create_distribution(mean: float, size: int) -> pd.Series:
    """Draw ``size`` standard-normal samples and rescale them around ``mean``.

    The samples are passed to ``normalize_distribution`` with a target total
    of ``mean * size``.

    NOTE(review): the annotation says ``pd.Series`` but the value returned is
    whatever ``normalize_distribution`` returns (a NumPy array).
    """
    samples = np.random.standard_normal(size)
    target_total = mean * size
    return normalize_distribution(samples, target_total)


def generate_df(means: List[Tuple[float, float, str]], n: int) -> pd.DataFrame:
    """Build a labelled data frame with ``n`` random points per group.

    Each entry of ``means`` is ``(x_mean, y_mean, label)``. For every entry,
    ``n`` x values and ``n`` y values are generated with
    ``create_distribution`` and the label is repeated ``n`` times. The groups
    are stacked in the order given.

    Returns a frame with the columns ``"x"``, ``"y"`` and ``"label"``.
    """
    x_parts, y_parts, label_parts = [], [], []
    for x_mean, y_mean, tag in means:
        # x is drawn before y for each group, keeping the random sequence
        # in a fixed order.
        x_parts.append(create_distribution(x_mean, n))
        y_parts.append(create_distribution(y_mean, n))
        label_parts.append(np.repeat(tag, n))

    # Every column starts from an empty array so that an empty ``means``
    # still produces (empty) columns.
    empty = np.array([])
    x = np.concatenate([empty, *x_parts], axis=None)
    y = np.concatenate([empty, *y_parts])
    labels = np.concatenate([empty, *label_parts])
    return pd.DataFrame({"x": x, "y": y, "label": labels})


def get_cmap(n, name="hsv"):
    """Return a colormap that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name.

    ``plt.cm.get_cmap`` was deprecated in Matplotlib 3.7, so the colormap
    registry is used when it is available and the legacy call is kept only
    as a fallback for older Matplotlib versions.
    """
    try:
        from matplotlib import colormaps
    except ImportError:
        # Older Matplotlib without the colormap registry.
        return plt.cm.get_cmap(name, n)

    cmap = colormaps[name]
    if hasattr(cmap, "resampled"):
        # Same effect as the old ``lut`` argument: a copy with ``n`` entries.
        return cmap.resampled(n)
    # The registry exists but this version has no ``Colormap.resampled``.
    return plt.cm.get_cmap(name, n)


def scatter_group_by(
    file_path: str, df: pd.DataFrame, x_column: str, y_column: str, label_column: str
):
    """Save a scatter plot of ``df`` with one color per distinct label.

    Args:
        file_path: Destination passed to ``Figure.savefig``.
        df: Data to plot.
        x_column: Name of the column holding the x coordinates.
        y_column: Name of the column holding the y coordinates.
        label_column: Name of the column whose distinct values define the
            groups; each group gets its own color and legend entry.
    """
    fig, ax = plt.subplots()
    try:
        labels = pd.unique(df[label_column])
        cmap = get_cmap(len(labels) + 1)
        for i, label in enumerate(labels):
            # A boolean mask instead of a string-built ``query`` expression:
            # it also works for labels containing quotes, non-string labels
            # and column names that are not valid identifiers.
            filter_df = df[df[label_column] == label]
            ax.scatter(
                filter_df[x_column], filter_df[y_column], label=label, color=cmap(i)
            )
        ax.legend()
        fig.savefig(file_path)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)


def euclidean_distance(p_1: np.array, p_2: np.array) -> float:
    """Return the Euclidean (straight-line) distance between ``p_1`` and ``p_2``."""
    delta = p_2 - p_1
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)


def k_nearest_neightbors(
    points: List[np.array], labels: np.array, input_data: List[np.array], k: int
):
    """Classify each point of ``input_data`` by majority vote of its ``k``
    nearest labelled points.

    Args:
        points: Labelled reference points.
        labels: Label of each reference point, in the same order as ``points``.
        input_data: Points to classify.
        k: Number of nearest neighbours that vote.

    Returns:
        A list with one predicted label per element of ``input_data``. When
        several labels are tied, the one that appears first in ``labels``
        wins.

    Raises:
        ValueError: If ``k`` is smaller than 1, if there are no reference
            points, or if ``points`` and ``labels`` differ in length.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if len(points) == 0:
        raise ValueError("points must not be empty")
    if len(points) != len(labels):
        raise ValueError("points and labels must have the same length")

    # Distinct labels in order of first appearance. A plain dict avoids
    # calling pd.unique on a list, which pandas has deprecated.
    unique_labels = list(dict.fromkeys(labels))
    label_indices = {label: index for index, label in enumerate(unique_labels)}
    encoded_labels = np.array([label_indices[label] for label in labels])

    predictions = []
    for input_point in input_data:
        distances = [euclidean_distance(input_point, point) for point in points]
        nearest = np.argsort(distances)[:k]
        # Count the votes per label index; argmax returns the smallest index
        # on ties, and does not depend on the shape of scipy's mode result,
        # which differs between SciPy versions.
        votes = np.bincount(encoded_labels[nearest], minlength=len(unique_labels))
        predictions.append(unique_labels[int(np.argmax(votes))])
    return predictions


# Three synthetic groups: (target mean of x, target mean of y, label).
groups = [(20, 20, "grupo1"), (100, 50, "grupo2"), (200, 430, "grupo3")]
df = generate_df(groups, 50)
scatter_group_by("img/groups.png", df, "x", "y", "label")
# Every row is (x, y, label). Both coordinates are kept as the point; the
# slice 0:1 would keep only x and the 1-D point would then be silently
# broadcast against the 2-D query points when computing distances.
list_t = [
    (np.array(tuples[0:2]), tuples[2])
    for tuples in df.itertuples(index=False, name=None)
]
# Labelled reference points and their labels.
input_data = [point for point, _ in list_t]
labels = [label for _, label in list_t]
# Points to classify, given as (x, y) like the reference points.
new_points = [
    np.array([100, 150]),
    np.array([1, 1]),
    np.array([1, 300]),
    np.array([80, 40]),
    np.array([400, 400]),
]
kn = k_nearest_neightbors(
    input_data,
    labels,
    new_points,
    5,
)
print(f"{new_points}, {kn}")

/tmp/babel-glUvxR/python-uVLGKe:42: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  return plt.cm.get_cmap(name, n)
/tmp/babel-glUvxR/python-uVLGKe:67: FutureWarning: unique with argument that is not not a Series, Index, ExtensionArray, or np.ndarray is deprecated and will raise in a future version.
  for index,label in enumerate(pd.unique(labels))}
[array([100, 150]), array([1, 1]), array([ 1, 300]), array([80, 40]), array([400, 400])], ['grupo2', 'grupo1', 'grupo3', 'grupo2', 'grupo3']

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Classification

code

FilesExpand file tree

classification.org

Latest commit

History

classification.org

File metadata and controls

Classification

code