@@ -38,20 +38,22 @@ def analyze(self, dataset: DataSet) -> None:
3838 Performs column-level profiling for each column.
3939 This step depends on the 'table_profile' result.
4040 """
41-
41+
4242 # Dependency check
43- if ' table_profile' not in dataset .results :
43+ if " table_profile" not in dataset .results :
4444 raise RuntimeError ("TableProfiler must be run before ColumnProfiler." )
4545
46- table_profile : ProfilingOutput = dataset .results [' table_profile' ]
46+ table_profile : ProfilingOutput = dataset .results [" table_profile" ]
4747 all_column_profiles = {}
4848
4949 for col_name in table_profile .columns :
5050 # Ask the DataFrame wrapper for the profile stats of this single column
51- stats = dataset .dataframe_wrapper .column_profile (dataset .raw_df , dataset .name , col_name , settings .UPSTREAM_SAMPLE_LIMIT )
51+ stats = dataset .dataframe_wrapper .column_profile (
52+ dataset .raw_df , dataset .name , col_name , settings .UPSTREAM_SAMPLE_LIMIT
53+ )
5254 all_column_profiles [col_name ] = stats
53-
54- dataset .results [' column_profiles' ] = all_column_profiles
55+
56+ dataset .results [" column_profiles" ] = all_column_profiles
5557
5658
5759class DataTypeIdentifierL1 (AnalysisStep ):
@@ -60,19 +62,21 @@ def analyze(self, dataset: DataSet) -> None:
6062 Performs datatype identification level 1 for each column.
6163 This step depends on the 'column_profiles' result.
6264 """
63-
65+
6466 # Dependency check
65- if ' column_profiles' not in dataset .results :
67+ if " column_profiles" not in dataset .results :
6668 raise RuntimeError ("TableProfiler and ColumnProfiler must be run before DatatypeIdentifierL1." )
6769
68- column_profiles : dict [str , ColumnProfile ] = dataset .results [' column_profiles' ]
70+ column_profiles : dict [str , ColumnProfile ] = dataset .results [" column_profiles" ]
6971
70- column_datatypes_l1 = dataset .dataframe_wrapper .datatype_identification_l1 (dataset .raw_df , dataset .name , column_profiles )
72+ column_datatypes_l1 = dataset .dataframe_wrapper .datatype_identification_l1 (
73+ dataset .raw_df , dataset .name , column_profiles
74+ )
7175
7276 for column in column_datatypes_l1 :
7377 column_profiles [column .column_name ].datatype_l1 = column .datatype_l1
7478
75- dataset .results [' column_datatypes_l1' ] = column_datatypes_l1
79+ dataset .results [" column_datatypes_l1" ] = column_datatypes_l1
7680
7781
7882class DataTypeIdentifierL2 (AnalysisStep ):
@@ -81,19 +85,21 @@ def analyze(self, dataset: DataSet) -> None:
8185 Performs datatype identification level 2 for each column.
8286 This step depends on the 'column_datatypes_l1' result.
8387 """
84-
88+
8589 # Dependency check
86- if ' column_profiles' not in dataset .results :
90+ if " column_profiles" not in dataset .results :
8791 raise RuntimeError ("TableProfiler and ColumnProfiler must be run before DatatypeIdentifierL2." )
8892
89- column_profiles : dict [str , ColumnProfile ] = dataset .results [' column_profiles' ]
93+ column_profiles : dict [str , ColumnProfile ] = dataset .results [" column_profiles" ]
9094 columns_with_samples = [DataTypeIdentificationL2Input (** col .model_dump ()) for col in column_profiles .values ()]
91- column_datatypes_l2 = dataset .dataframe_wrapper .datatype_identification_l2 (dataset .raw_df , dataset .name , columns_with_samples )
95+ column_datatypes_l2 = dataset .dataframe_wrapper .datatype_identification_l2 (
96+ dataset .raw_df , dataset .name , columns_with_samples
97+ )
9298
9399 for column in column_datatypes_l2 :
94100 column_profiles [column .column_name ].datatype_l2 = column .datatype_l2
95101
96- dataset .results [' column_datatypes_l2' ] = column_datatypes_l2
102+ dataset .results [" column_datatypes_l2" ] = column_datatypes_l2
97103
98104
99105class KeyIdentifier (AnalysisStep ):
@@ -102,21 +108,22 @@ def analyze(self, dataset: DataSet) -> None:
102108 Performs key identification for the dataset.
103109 This step depends on the datatype identification results.
104110 """
105- if ' column_datatypes_l1' not in dataset .results or ' column_datatypes_l2' not in dataset .results :
111+ if " column_datatypes_l1" not in dataset .results or " column_datatypes_l2" not in dataset .results :
106112 raise RuntimeError ("DataTypeIdentifierL1 and L2 must be run before KeyIdentifier." )
107-
108- column_profiles : dict [str , ColumnProfile ] = dataset .results [' column_profiles' ]
113+
114+ column_profiles : dict [str , ColumnProfile ] = dataset .results [" column_profiles" ]
109115 column_profiles_df = pd .DataFrame ([col .model_dump () for col in column_profiles .values ()])
110116
111117 key = dataset .dataframe_wrapper .key_identification (dataset .name , column_profiles_df )
112- dataset .results ["key" ] = key
118+ if key is not None :
119+ dataset .results ["key" ] = key
113120
114121
115122class BusinessGlossaryGenerator (AnalysisStep ):
116123 def __init__ (self , domain : str ):
117124 """
118125 Initializes the BusinessGlossaryGenerator with the industry domain of the dataset.
119-
126+
120127 :param domain: The industry domain to which the dataset belongs.
121128 """
122129 self .domain = domain
@@ -125,10 +132,10 @@ def analyze(self, dataset: DataSet) -> None:
125132 """
126133 Generates business glossary terms and tags for each column in the dataset.
127134 """
128- if ' column_datatypes_l1' not in dataset .results :
135+ if " column_datatypes_l1" not in dataset .results :
129136 raise RuntimeError ("DataTypeIdentifierL1 must be run before Business Glossary Generation." )
130-
131- column_profiles : dict [str , ColumnProfile ] = dataset .results [' column_profiles' ]
137+
138+ column_profiles : dict [str , ColumnProfile ] = dataset .results [" column_profiles" ]
132139 column_profiles_df = pd .DataFrame ([col .model_dump () for col in column_profiles .values ()])
133140
134141 glossary_output = dataset .dataframe_wrapper .generate_business_glossary (
@@ -138,7 +145,6 @@ def analyze(self, dataset: DataSet) -> None:
138145 for column in glossary_output .columns :
139146 column_profiles [column .column_name ].business_glossary = column .business_glossary
140147 column_profiles [column .column_name ].business_tags = column .business_tags
141-
142- dataset .results ["business_glossary_and_tags" ] = glossary_output
143- dataset .results ['table_glossary' ] = glossary_output .table_glossary
144148
149+ dataset .results ["business_glossary_and_tags" ] = glossary_output
150+ dataset .results ["table_glossary" ] = glossary_output .table_glossary
0 commit comments