Skip to content

Commit 5a7d715

Browse files
Feature/dimensionality column mean to be between (open-metadata#23984)
* Initial implementation for Dimensionality on Data Quality Tests
* Fix ColumnValuesToBeUnique and create TestCaseResult API
* Refactor dimension result
* Initial E2E Implementation without Impact Score
* Dimensionality Thin Slice
* Update generated TypeScript types
* Update generated TypeScript types
* Removed useless method to use the one we already had
* Fix Pandas Dimensionality checks
* Remove useless comments
* Implement PR comments, fix Tests
* Improve the code a bit
* Fix imports
* Implement Dimensionality for ColumnMeanToBeBetween
* Removed useless comments and improved minor things
* Implement UnitTests
* Fixes
* Moved import pandas to type checking
* Fix Min/Max being optional
* Fix Unittests
* small fixes

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 4ec5059 commit 5a7d715

16 files changed

Lines changed: 1592 additions & 367 deletions

ingestion/src/metadata/data_quality/validations/base_test_handler.py

Lines changed: 94 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@
6262
DIMENSION_IMPACT_SCORE_KEY = "impact_score"
6363
DIMENSION_FAILED_COUNT_KEY = "failed_count"
6464
DIMENSION_TOTAL_COUNT_KEY = "total_count"
65+
DIMENSION_SUM_VALUE_KEY = (
66+
"sum_value" # For statistical validators weighted calculations
67+
)
6568

6669

6770
class TestEvaluation(TypedDict, total=False):
@@ -208,6 +211,76 @@ def _run_dimensional_validation(self) -> List[DimensionResult]:
208211
"""
209212
return []
210213

214+
def _get_test_parameters(self) -> Optional[dict]:
215+
"""Get test-specific parameters from test case
216+
217+
Default implementation returns None. Override in child classes
218+
that need to extract and process test parameters.
219+
220+
Returns:
221+
Optional[dict]: Test parameters, or None if validator has no parameters.
222+
"""
223+
return None
224+
225+
def _evaluate_test_condition(
226+
self, metric_values: dict, test_params: Optional[dict] = None
227+
) -> TestEvaluation:
228+
"""Evaluate the test condition based on computed metrics
229+
230+
This is the core logic that determines if the test passes or fails.
231+
Override in child classes to implement test-specific evaluation logic.
232+
233+
Default implementation raises NotImplementedError. Validators that have been
234+
migrated to the new pattern should override this method.
235+
236+
Args:
237+
metric_values: Dictionary with Metrics enum names as keys
238+
e.g., {"COUNT": 100, "MEAN": 42.5}
239+
test_params: Optional test parameters (bounds, allowed values, etc.)
240+
Some validators don't need parameters (e.g., uniqueness test)
241+
242+
Returns:
243+
TestEvaluation: TypedDict with keys:
244+
- matched: bool - whether test passed
245+
- passed_rows: Optional[int] - number of passing rows (None for statistical tests)
246+
- failed_rows: Optional[int] - number of failing rows (None for statistical tests)
247+
- total_rows: Optional[int] - total row count (None for statistical tests)
248+
249+
Raises:
250+
NotImplementedError: If child class doesn't override this method
251+
"""
252+
raise NotImplementedError(
253+
f"{self.__class__.__name__} must implement _evaluate_test_condition()"
254+
)
255+
256+
def _format_result_message(
257+
self,
258+
metric_values: dict,
259+
dimension_info: Optional[DimensionInfo] = None,
260+
test_params: Optional[dict] = None,
261+
) -> str:
262+
"""Format the result message for the test
263+
264+
Override in child classes to provide human-readable test results.
265+
266+
Default implementation raises NotImplementedError. Validators that have been
267+
migrated to the new pattern should override this method.
268+
269+
Args:
270+
metric_values: Dictionary with Metrics enum names as keys
271+
dimension_info: Optional dimension details for dimensional results
272+
test_params: Optional test parameters (for displaying bounds, thresholds, etc.)
273+
274+
Returns:
275+
str: Formatted result message
276+
277+
Raises:
278+
NotImplementedError: If child class doesn't override this method
279+
"""
280+
raise NotImplementedError(
281+
f"{self.__class__.__name__} must implement _format_result_message()"
282+
)
283+
211284
def _extract_dimension_value(self, row: dict) -> str:
212285
"""Extract and format dimension value from result row
213286
@@ -278,6 +351,7 @@ def _create_dimension_result(
278351
dimension_name=dimension_col_name,
279352
dimension_value=dimension_value,
280353
),
354+
test_params=test_params,
281355
)
282356

283357
test_result_values = self._get_test_result_values(metric_values)
@@ -443,8 +517,8 @@ def get_dimension_result_object(
443517
test_case_status: TestCaseStatus,
444518
result: str,
445519
test_result_value: List[TestResultValue],
446-
total_rows: int,
447-
passed_rows: int,
520+
total_rows: Optional[int] = None,
521+
passed_rows: Optional[int] = None,
448522
failed_rows: Optional[int] = None,
449523
impact_score: Optional[float] = None,
450524
) -> "DimensionResult":
@@ -455,24 +529,30 @@ def get_dimension_result_object(
455529
test_case_status: Status of the test for this dimension combination
456530
result: Details of test case results for this dimension combination
457531
test_result_value: List of test result values
458-
total_rows: Total number of rows in this dimension
459-
passed_rows: Number of rows that passed for this dimension
460-
failed_rows: Number of rows that failed for this dimension (auto-calculated if None)
532+
total_rows: Total number of rows in this dimension (None for statistical validators)
533+
passed_rows: Number of rows that passed for this dimension (None for statistical validators)
534+
failed_rows: Number of rows that failed for this dimension (auto-calculated if None, None for statistical validators)
461535
impact_score: Optional impact score for this dimension (0-1 range)
462536
463537
Returns:
464538
DimensionResult: Dimension result object with calculated percentages
465539
"""
466-
if failed_rows is None:
467-
failed_rows = total_rows - passed_rows
468-
469-
# Derive one percentage from the other to ensure they sum to 100%
470-
if total_rows > 0:
471-
passed_rows_percentage = round(passed_rows / total_rows * 100, 2)
472-
failed_rows_percentage = round(100 - passed_rows_percentage, 2)
540+
# Handle row counts and percentages for statistical validators
541+
if total_rows is None or passed_rows is None:
542+
passed_rows_percentage = None
543+
failed_rows_percentage = None
473544
else:
474-
passed_rows_percentage = 0
475-
failed_rows_percentage = 0
545+
# Row-by-row validators: calculate percentages
546+
if failed_rows is None:
547+
failed_rows = total_rows - passed_rows
548+
549+
# Derive one percentage from the other to ensure they sum to 100%
550+
if total_rows > 0:
551+
passed_rows_percentage = round(passed_rows / total_rows * 100, 2)
552+
failed_rows_percentage = round(100 - passed_rows_percentage, 2)
553+
else:
554+
passed_rows_percentage = 0
555+
failed_rows_percentage = 0
476556

477557
dimension_values_array = [
478558
DimensionValue(name=name, value=value)

0 commit comments

Comments (0)