-
Notifications
You must be signed in to change notification settings - Fork 43
Expand file tree
/
Copy pathmodels.py
More file actions
143 lines (112 loc) · 5.3 KB
/
models.py
File metadata and controls
143 lines (112 loc) · 5.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from enum import Enum
from typing import Any, List, Optional
from pydantic import BaseModel, Field
class ProfilingOutput(BaseModel):
    """
    Top-level summary of a profiled dataset.

    Holds the row count plus the column names and their dtypes as reported
    by the profiling step.
    """

    count: int  # total number of rows in the profiled dataset -- presumably matches ColumnProfile.count; confirm
    columns: list[str]  # column names of the profiled dataset
    dtypes: dict[str, str]  # mapping of column name -> dtype, both as strings
class ColumnProfile(BaseModel):
    """
    A Pydantic model for validating the response of the column_profile function.

    Captures per-column statistics (null/distinct counts, uniqueness and
    completeness ratios), value samples used for datatype inference, and
    optional downstream enrichments (inferred L1/L2 datatypes, business
    glossary entry, and business tags) which default to None until filled in.
    """

    column_name: str = Field(..., description="The name of the column being profiled.")
    business_name: str = Field(..., description="Cleaned column name")
    table_name: str = Field(
        ..., description="The name of the source table or a placeholder (e.g., 'pandas_dataframe')."
    )
    null_count: int = Field(..., description="The total number of null (NaN or None) values in the column.")
    count: int = Field(..., description="The total number of rows in the DataFrame (including nulls).")
    distinct_count: int = Field(..., description="The number of unique non-null values in the column.")
    uniqueness: float = Field(
        ..., description="The ratio of distinct values to total count, indicating the uniqueness of the column."
    )
    completeness: float = Field(
        ..., description="The ratio of non-null values to total count, indicating the completeness of the column."
    )
    sample_data: List[Any] = Field(
        ..., description="A sample of unique values from the column, up to the specified sample_limit."
    )
    dtype_sample: Optional[List[Any]] = Field(
        None, description="A combined sample of unique and non-unique values, intended for data type inference."
    )
    ts: float = Field(..., description="The timestamp indicating how long the profiling took, in seconds.")
    datatype_l1: Optional[str] = Field(
        default=None, description="The inferred data type for the column, based on the sample data."
    )
    datatype_l2: Optional[str] = Field(
        default=None,
        description="The inferred data type category (dimension/measure) for the column, based on the datatype l1 results",  # noqa: E501
    )
    business_glossary: Optional[str] = Field(
        default=None, description="The business glossary entry for the column, if available."
    )
    business_tags: Optional[List[str]] = Field(
        default=None, description="A list of business tags associated with the column, if any."
    )
class DataTypeIdentificationL1Output(BaseModel):
    """
    A Pydantic model for validating the response of the datatype_identification function.

    Note: ``datatype_l1`` is populated from the input key
    ``predicted_datatype_l1`` via ``validation_alias``, so upstream payloads
    using that key validate directly into this model.
    """

    column_name: str = Field(..., description="The name of the column being profiled.")
    table_name: str = Field(
        ..., description="The name of the source table or a placeholder (e.g., 'pandas_dataframe')."
    )
    datatype_l1: str = Field(
        ..., validation_alias="predicted_datatype_l1", description="The inferred data type for the column."
    )
class DataTypeIdentificationL2Input(BaseModel):
    """
    Input payload for the L2 (dimension/measure) datatype-identification step.

    Carries the column identity, a sample of its values, and the previously
    inferred L1 datatype (if any) used to decide the L2 category.
    """

    column_name: str = Field(..., description="The name of the column being profiled.")
    table_name: str = Field(
        ..., description="The name of the source table or a placeholder (e.g., 'pandas_dataframe')."
    )
    sample_data: List[Any] = Field(
        ..., description="A sample of unique values from the column, up to the specified sample_limit."
    )
    datatype_l1: Optional[str] = Field(
        default=None, description="The inferred data type for the column, based on the sample data."
    )
class L2OutputTypes(str, Enum):
    """
    Allowed L2 datatype categories for a column.

    Inherits from ``str`` so members compare equal to their string values
    and serialize as plain strings.
    """

    dimension = "dimension"
    measure = "measure"
    unknown = "unknown"  # fallback when the category cannot be determined
class DataTypeIdentificationL2Output(BaseModel):
    """
    A Pydantic model for validating the response of the datatype_identification function.

    Note: ``datatype_l2`` is populated from the input key
    ``predicted_datatype_l2`` via ``validation_alias`` and is constrained to
    the ``L2OutputTypes`` enum (dimension / measure / unknown).
    """

    column_name: str = Field(..., description="The name of the column being profiled.")
    table_name: str = Field(
        ..., description="The name of the source table or a placeholder (e.g., 'pandas_dataframe')."
    )
    datatype_l2: L2OutputTypes = Field(
        ...,
        validation_alias="predicted_datatype_l2",
        description="The inferred category (dimension or measure) for the column.",
    )
class KeyIdentificationOutput(BaseModel):
    """
    A Pydantic model for validating the response of the key_identification function.

    ``column_name`` is None when no primary-key candidate was identified.
    """

    column_name: Optional[str] = Field(default=None, description="The name of the column identified as a primary key.")
class ColumnGlossary(BaseModel):
    """
    Represents the business glossary and tags for a single column.

    ``business_glossary`` defaults to None until an entry is generated,
    matching the optional ``business_glossary`` field on ColumnProfile.
    """

    column_name: str = Field(..., description="The name of the column.")
    # Fix: the annotation was `str` with a bare positional `None` default,
    # contradicting the type. Declared Optional with an explicit keyword
    # default, consistent with ColumnProfile.business_glossary; the implicit
    # default (None) is unchanged, so existing callers are unaffected.
    business_glossary: Optional[str] = Field(default=None, description="A business-friendly term for the column.")
    business_tags: List[str] = Field(
        default_factory=list, description="A list of business tags associated with the column."
    )
class BusinessGlossaryOutput(BaseModel):
    """
    Represents the business glossary and tags for all columns in a dataset.

    Aggregates a dataset-level description with one ColumnGlossary entry per
    column; ``columns`` defaults to an empty list.
    """

    table_name: str = Field(
        ..., description="The name of the source table or a placeholder (e.g., 'pandas_dataframe')."
    )
    table_glossary: str = Field(..., description="A business-friendly description for the dataset.")
    columns: List[ColumnGlossary] = Field(
        default_factory=list, description="A list of ColumnGlossary objects for each column in the dataset."
    )