data_cleaner/summary_stats.py at main · nbilabsystems/data_cleaner · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# summary_stats.py

from typing import Any, Dict, List

import numpy as np
import pandas as pd

from cleaning_engine import CleaningSummary


def _global_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Summarise the dataset as a whole.

    Returns a dict with the row count, the column count, the number of
    missing cells across the entire frame, and the column labels in order.
    """
    row_count, column_count = df.shape

    # First reduction counts missing cells per column, the second totals them.
    missing_per_column = df.isna().sum()
    missing_total = missing_per_column.sum()

    overview: Dict[str, Any] = {}
    overview["num_rows"] = int(row_count)
    overview["num_columns"] = int(column_count)
    overview["total_missing_values"] = int(missing_total)
    overview["columns"] = [*df.columns]
    return overview


def _float_or_none(value: Any) -> Any:
    """
    Return ``value`` as a plain float, or None when it is a missing marker.
    """
    # pd.isna recognises NaN, None and pd.NA alike. np.isnan(pd.NA) evaluates
    # to pd.NA, which raises TypeError as soon as it is used in a boolean test.
    if pd.isna(value):
        return None
    return float(value)


def _numeric_column_stats(series: pd.Series) -> Dict[str, Any]:
    """
    Stats for numeric columns.

    Returns a dict with the dtype name, the non-null and missing counts, and
    the mean, standard deviation, minimum, maximum and median of the column.
    Any statistic that is undefined (empty or all-missing column, standard
    deviation of a single value) is reported as None instead of NaN/NA.
    """
    # Each reduction is called exactly once; the quartiles that describe()
    # would also compute are not part of the result.
    return {
        "dtype": str(series.dtype),
        "non_null_count": int(series.count()),
        "missing_count": int(series.isna().sum()),
        "mean": _float_or_none(series.mean()),
        "std": _float_or_none(series.std()),
        "min": _float_or_none(series.min()),
        "max": _float_or_none(series.max()),
        "median": _float_or_none(series.median()),
    }


def _non_numeric_column_stats(series: pd.Series) -> Dict[str, Any]:
    """
    Describe a column that is not handled as numeric (strings, booleans,
    datetimes, etc.).

    Returns the dtype name, the non-null and missing counts, the number of
    distinct non-missing values, and up to five of those distinct values
    rendered as strings.
    """
    missing_mask = series.isna()
    distinct_present = series.dropna().unique()

    # Only the first five distinct values are kept as a preview.
    preview = list(map(str, distinct_present[:5]))

    summary: Dict[str, Any] = {"dtype": str(series.dtype)}
    summary["non_null_count"] = int((~missing_mask).sum())
    summary["missing_count"] = int(missing_mask.sum())
    summary["num_unique_values"] = int(series.nunique(dropna=True))
    summary["sample_values"] = preview
    return summary


def _per_column_stats(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """
    Build a list of per-column statistics.

    Produces one dict per column, in column order. Numeric columns are
    summarised by ``_numeric_column_stats`` and every other column by
    ``_non_numeric_column_stats``; each dict additionally receives a
    "column_name" entry holding the column label.
    """
    results: List[Dict[str, Any]] = []

    # items() yields one (label, Series) pair per column, so duplicated column
    # labels still give a Series each (df[label] would return a DataFrame).
    for col, series in df.items():
        dtype = series.dtype

        # The pandas dtype helpers accept extension dtypes (category, string,
        # nullable Int64/boolean, tz-aware datetimes), which np.issubdtype
        # rejects with a TypeError. Booleans are excluded explicitly because
        # is_numeric_dtype counts them as numeric, while this module reports
        # them with the non-numeric stats.
        is_numeric = pd.api.types.is_numeric_dtype(
            dtype
        ) and not pd.api.types.is_bool_dtype(dtype)

        if is_numeric:
            stats = _numeric_column_stats(series)
        else:
            stats = _non_numeric_column_stats(series)

        stats["column_name"] = col
        results.append(stats)

    return results


def generate_summary(
    cleaned_df: pd.DataFrame,
    cleaning_summary: CleaningSummary,
) -> Dict[str, Any]:
    """
    Assemble the structured summary dict that can be used later by the PDF
    builder.

    The result has three entries:
    - "cleaning_summary": the output of ``cleaning_summary.as_dict()``
    - "global_stats": dataset-wide statistics for ``cleaned_df``
    - "columns": the list of per-column statistics for ``cleaned_df``
    """
    # Dataset statistics are computed before the cleaning summary is converted.
    overview = _global_stats(cleaned_df)
    per_column = _per_column_stats(cleaned_df)

    return dict(
        cleaning_summary=cleaning_summary.as_dict(),
        global_stats=overview,
        columns=per_column,
    )