From d458891d774ac5fcfe2a304d7f41e6d17f6b76d8 Mon Sep 17 00:00:00 2001 From: litedatum Date: Mon, 25 Aug 2025 19:46:45 -0400 Subject: [PATCH 1/6] Implement Multi-Table Validation for Command --- cli/commands/schema.py | 511 +++++++++++------- cli/core/data_validator.py | 149 ++++- cli/core/source_parser.py | 96 +++- test_data/multi_table_data.xlsx | Bin 0 -> 6649 bytes test_data/multi_table_schema.json | 31 ++ .../cli_scenarios/test_schema_command_e2e.py | 218 ++++++++ .../unit/cli/commands/test_schema_command.py | 224 ++++++++ .../commands/test_schema_command_extended.py | 423 +++++++++++++++ .../test_schema_command_file_sources.py | 110 ++++ .../test_schema_command_json_extras.py | 149 +++++ .../test_schema_command_multi_table.py | 389 +++++++++++++ 11 files changed, 2093 insertions(+), 207 deletions(-) create mode 100644 test_data/multi_table_data.xlsx create mode 100644 test_data/multi_table_schema.json create mode 100644 tests/e2e/cli_scenarios/test_schema_command_e2e.py create mode 100644 tests/unit/cli/commands/test_schema_command.py create mode 100644 tests/unit/cli/commands/test_schema_command_extended.py create mode 100644 tests/unit/cli/commands/test_schema_command_file_sources.py create mode 100644 tests/unit/cli/commands/test_schema_command_json_extras.py create mode 100644 tests/unit/cli/commands/test_schema_command_multi_table.py diff --git a/cli/commands/schema.py b/cli/commands/schema.py index dd52bc7..0a39b48 100644 --- a/cli/commands/schema.py +++ b/cli/commands/schema.py @@ -2,7 +2,7 @@ Schema Command Adds `vlite-cli schema` command that parses parameters, performs minimal rules -file validation (single-table only, no jsonschema), and prints placeholder +file validation (supports both single-table and multi-table formats), and prints output aligned with the existing CLI style. """ @@ -20,6 +20,7 @@ from shared.enums.data_types import DataType from shared.schema.base import RuleTarget, TargetEntity from shared.schema.rule_schema import RuleSchema +from shared.schema.connection_schema import ConnectionSchema from shared.utils.console import safe_echo from shared.utils.datetime_utils import now as _now from shared.utils.logger import get_logger @@ -37,88 +38,119 @@ } -def _validate_rules_payload(payload: Any) -> Tuple[List[str], int]: - """Validate the minimal structure of the schema rules file. - - This performs non-jsonschema checks: - - Top-level must be an object with a `rules` array - - Warn and ignore top-level `table` if present - - Validate each rule item fields and types: - - field: required str - - type: optional str in allowed set - - required: optional bool - - enum: optional list - - min/max: optional numeric (int or float) - +def _validate_multi_table_rules_payload(payload: Any) -> Tuple[List[str], int]: + """Validate the structure of multi-table schema rules file. + + Multi-table format: + { + "table1": { + "rules": [...], + "strict_mode": true + }, + "table2": { + "rules": [...] + } + } + Returns: - warnings, rules_count - - Raises: - click.UsageError: if structure or types are invalid + warnings, total_rules_count """ warnings: List[str] = [] - + total_rules = 0 + if not isinstance(payload, dict): - raise click.UsageError("Rules file must be a JSON object with a 'rules' array") - - if "table" in payload: - warnings.append( - "Top-level 'table' is ignored; table is derived from data-source" - ) - - if "tables" in payload: - # Explicitly reject multi-table format in v1 - raise click.UsageError( - "'tables' is not supported in v1; use single-table 'rules' only" - ) + raise click.UsageError("Rules file must be a JSON object") + + # Check if this is a multi-table format (has table names as keys) + table_names = [key for key in payload.keys() if key != "rules"] + + if table_names: + # Multi-table format + for table_name in table_names: + table_schema = payload[table_name] + if not isinstance(table_schema, dict): + raise click.UsageError(f"Table '{table_name}' schema must be an object") + + table_rules = table_schema.get("rules") + if not isinstance(table_rules, list): + raise click.UsageError(f"Table '{table_name}' must have a 'rules' array") + + # Validate each rule in this table + for idx, item in enumerate(table_rules): + if not isinstance(item, dict): + raise click.UsageError(f"Table '{table_name}' rules[{idx}] must be an object") + + # Validate rule fields + _validate_single_rule_item(item, f"Table '{table_name}' rules[{idx}]") + + total_rules += len(table_rules) + + # Validate optional table-level switches + if "strict_mode" in table_schema and not isinstance(table_schema["strict_mode"], bool): + raise click.UsageError(f"Table '{table_name}' strict_mode must be a boolean") + if "case_insensitive" in table_schema and not isinstance(table_schema["case_insensitive"], bool): + raise click.UsageError(f"Table '{table_name}' case_insensitive must be a boolean") + else: + # Single-table format (backward compatibility) + warnings.append("Single-table format detected; consider using multi-table format for better organization") + if "rules" not in payload: + raise click.UsageError("Single-table format must have a 'rules' array") + + rules = payload["rules"] + if not isinstance(rules, list): + raise click.UsageError("'rules' must be an array") + + for idx, item in enumerate(rules): + if not isinstance(item, dict): + raise click.UsageError(f"rules[{idx}] must be an object") + _validate_single_rule_item(item, f"rules[{idx}]") + + total_rules = len(rules) + + return warnings, total_rules + + +def _validate_single_rule_item(item: Dict[str, Any], context: str) -> None: + """Validate a single rule item from the rules array.""" + # field + field_name = item.get("field") + if not isinstance(field_name, str) or not field_name: + raise click.UsageError(f"{context}.field must be a non-empty string") + + # type + if "type" in item: + type_name = item["type"] + if not isinstance(type_name, str): + raise click.UsageError(f"{context}.type must be a string when provided") + if type_name.lower() not in _ALLOWED_TYPE_NAMES: + allowed = ", ".join(sorted(_ALLOWED_TYPE_NAMES)) + raise click.UsageError( + f"{context}.type '{type_name}' is not supported. " + f"Allowed: {allowed}" + ) - rules = payload.get("rules") - if not isinstance(rules, list): - raise click.UsageError("'rules' must be an array") + # required + if "required" in item and not isinstance(item["required"], bool): + raise click.UsageError(f"{context}.required must be a boolean when provided") - for idx, item in enumerate(rules): - if not isinstance(item, dict): - raise click.UsageError(f"rules[{idx}] must be an object") + # enum + if "enum" in item and not isinstance(item["enum"], list): + raise click.UsageError(f"{context}.enum must be an array when provided") - # field - field_name = item.get("field") - if not isinstance(field_name, str) or not field_name: - raise click.UsageError(f"rules[{idx}].field must be a non-empty string") - - # type - if "type" in item: - type_name = item["type"] - if not isinstance(type_name, str): - raise click.UsageError( - f"rules[{idx}].type must be a string when provided" - ) - if type_name.lower() not in _ALLOWED_TYPE_NAMES: - allowed = ", ".join(sorted(_ALLOWED_TYPE_NAMES)) - raise click.UsageError( - f"rules[{idx}].type '{type_name}' is not supported. " - f"Allowed: {allowed}" - ) + # min/max + for bound_key in ("min", "max"): + if bound_key in item: + value = item[bound_key] + if not isinstance(value, (int, float)): + raise click.UsageError(f"{context}.{bound_key} must be numeric when provided") - # required - if "required" in item and not isinstance(item["required"], bool): - raise click.UsageError( - f"rules[{idx}].required must be a boolean when provided" - ) - # enum - if "enum" in item and not isinstance(item["enum"], list): - raise click.UsageError(f"rules[{idx}].enum must be an array when provided") - - # min/max - for bound_key in ("min", "max"): - if bound_key in item: - value = item[bound_key] - if not isinstance(value, (int, float)): - raise click.UsageError( - f"rules[{idx}].{bound_key} must be numeric when provided" - ) +def _validate_rules_payload(payload: Any) -> Tuple[List[str], int]: + """Validate the minimal structure of the schema rules file. - return warnings, len(rules) + This performs non-jsonschema checks for both single-table and multi-table formats. + """ + return _validate_multi_table_rules_payload(payload) def _map_type_name_to_datatype(type_name: str) -> DataType: @@ -200,16 +232,108 @@ def _create_rule_schema( ) -def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: - """Decompose schema JSON payload into atomic RuleSchema objects. +def _decompose_multi_table_schema( + payload: Dict[str, Any], source_db: str + ) -> List[RuleSchema]: + """Decompose multi-table schema JSON payload into atomic RuleSchema objects. + + Supports both single-table and multi-table formats. + """ + all_atomic_rules: List[RuleSchema] = [] + + # Check if this is multi-table format + table_names = [key for key in payload.keys() if key != "rules"] + + if table_names: + # Multi-table format + for table_name in table_names: + table_schema = payload[table_name] + table_rules = _decompose_single_table_schema( + table_schema, source_db, table_name + ) + all_atomic_rules.extend(table_rules) + else: + # Single-table format (backward compatibility) + # For single-table, we need to determine the table name from the source + # This will be handled by the caller who knows the table context + table_rules = _decompose_single_table_schema(payload, source_db, "unknown") + all_atomic_rules.extend(table_rules) + + return all_atomic_rules + + +def _decompose_multi_table_schema_with_source_info( + payload: Dict[str, Any], source_config: ConnectionSchema + ) -> List[RuleSchema]: + """Decompose multi-table schema JSON payload into atomic RuleSchema objects. + + This version takes into account the actual tables available in the source. + + Args: + payload: The rules payload + source_config: Source configuration with table information + """ + all_atomic_rules: List[RuleSchema] = [] + + # Check if this is multi-table format + table_names = [key for key in payload.keys() if key != "rules"] + + if table_names: + # Multi-table format + # Check if source has multi-table information + is_multi_table_source = source_config.parameters.get("is_multi_table", False) + available_tables = (source_config.parameters + .get("sheets", {}).keys() + if is_multi_table_source else set() + ) + if is_multi_table_source and available_tables: + # Only process rules for tables that actually exist in the source + for table_name in table_names: + if table_name in available_tables: + table_schema = payload[table_name] + table_rules = _decompose_single_table_schema( + table_schema, source_config.db_name or "unknown", table_name + ) + all_atomic_rules.extend(table_rules) + logger.info( + f"Processing rules for table '{table_name}' (found in source)" + ) + else: + logger.warning( + f"Skipping rules for table '{table_name}' " + f"(not found in source: {list(available_tables)})" + ) + else: + # Process all tables (fallback for non-multi-table sources) + for table_name in table_names: + table_schema = payload[table_name] + table_rules = _decompose_single_table_schema( + table_schema, source_config.db_name or "unknown", table_name + ) + all_atomic_rules.extend(table_rules) + else: + # Single-table format (backward compatibility) + # For single-table, we need to determine the table name from the source + # This will be handled by the caller who knows the table context + table_rules = _decompose_single_table_schema( + payload, source_config.db_name or "unknown", "unknown" + ) + all_atomic_rules.extend(table_rules) + + return all_atomic_rules - Rules per item: - - type -> contributes to table-level SCHEMA columns mapping - - required -> NOT_NULL(column) - - min/max -> RANGE(column, min_value/max_value) - - enum -> ENUM(column, allowed_values) + +def _decompose_single_table_schema( + table_schema: Dict[str, Any], source_db: str, table_name: str + ) -> List[RuleSchema]: + """Decompose a single table's schema definition into atomic RuleSchema objects. + + Args: + table_schema: The schema definition for a single table + source_db: Database name from source + table_name: Name of the table being validated """ - rules_arr = payload.get("rules", []) + rules_arr = table_schema.get("rules", []) # Build SCHEMA columns mapping first columns_map: Dict[str, Dict[str, Any]] = {} @@ -275,11 +399,11 @@ def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: # Create one table-level SCHEMA rule if any columns were declared if columns_map: schema_params: Dict[str, Any] = {"columns": columns_map} - # Optional switches at top-level - if isinstance(payload.get("strict_mode"), bool): - schema_params["strict_mode"] = payload["strict_mode"] - if isinstance(payload.get("case_insensitive"), bool): - schema_params["case_insensitive"] = payload["case_insensitive"] + # Optional switches at table level + if isinstance(table_schema.get("strict_mode"), bool): + schema_params["strict_mode"] = table_schema["strict_mode"] + if isinstance(table_schema.get("case_insensitive"), bool): + schema_params["case_insensitive"] = table_schema["case_insensitive"] atomic_rules.insert( 0, @@ -288,13 +412,30 @@ def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: rule_type=RuleType.SCHEMA, column=None, parameters=schema_params, - description="CLI: table schema existence+type", + description=f"CLI: table schema existence+type for {table_name}", ), ) + # Set the target table and database for all rules + for rule in atomic_rules: + if rule.target and rule.target.entities: + rule.target.entities[0].database = source_db + rule.target.entities[0].table = table_name + return atomic_rules +def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: + """Decompose schema JSON payload into atomic RuleSchema objects. + + This function is kept for backward compatibility but now delegates to + the new multi-table aware function. + """ + # For backward compatibility, we need to determine the source_db + # This will be handled by the caller + return _decompose_multi_table_schema(payload, "unknown") + + def _build_prioritized_atomic_status( *, schema_result: Dict[str, Any] | None, @@ -531,6 +672,7 @@ def _failed_records_of(res: Dict[str, Any]) -> int: col_name = str(item.get("column")) entry: Dict[str, Any] = { "column": col_name, + "table": "unknown", # Will be updated later with actual table name "checks": { "existence": { "status": item.get("existence", "UNKNOWN"), @@ -555,6 +697,7 @@ def _failed_records_of(res: Dict[str, Any]) -> int: if str(col) not in schema_fields_index: entry = { "column": str(col), + "table": "unknown", # Will be updated later with actual table name "checks": { "existence": {"status": "UNKNOWN", "failure_code": "NONE"}, "type": {"status": "UNKNOWN", "failure_code": "NONE"}, @@ -583,11 +726,19 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: column_name = rule.get_target_column() or "" if not column_name: continue + # Add table name for multi-table support + table_name = "unknown" + if rule.target and rule.target.entities: + table_name = rule.target.entities[0].table + l_entry = schema_fields_index.get(column_name) if not l_entry: - l_entry = {"column": column_name, "checks": {}} + l_entry = {"column": column_name, "table": table_name, "checks": {}} fields.append(l_entry) schema_fields_index[column_name] = l_entry + else: + # Ensure table name is set + l_entry["table"] = table_name t = rule.type if t == RuleType.NOT_NULL: key = "not_null" @@ -699,6 +850,9 @@ def _dataset_total(res: Dict[str, Any]) -> int: rd["rule_type"] = rule.type.value rd["column_name"] = rule.get_target_column() rd.setdefault("rule_name", rule.name) + # Add table name for multi-table support + if rule.target and rule.target.entities: + rd["table_name"] = rule.target.entities[0].table if rid in skip_map: rd["status"] = skip_map[rid]["status"] rd["skip_reason"] = skip_map[rid]["skip_reason"] @@ -735,56 +889,42 @@ def _calc_failed(res: Dict[str, Any]) -> int: col = str(item.get("column")) column_guard[col] = str(item.get("failure_code", "NONE")) - grouped: Dict[str, Dict[str, Any]] = {} - schema_rule = next((r for r in atomic_rules if r.type == RuleType.SCHEMA), None) - declared_cols: List[str] = [] - if schema_rule: - params = schema_rule.parameters or {} - declared_cols = list((params.get("columns") or {}).keys()) - for col in declared_cols: - grouped[str(col)] = {"column": str(col), "issues": []} - + # Group results by table for multi-table support + tables_grouped: Dict[str, Dict[str, Dict[str, Any]]] = {} + for rd in table_results: - rid = str(rd.get("rule_id", "")) - rule = rule_map.get(rid) - if not rule or rule.type == RuleType.SCHEMA: - continue - col = rule.get_target_column() or "" - if not col: - continue - entry = grouped.setdefault(col, {"column": col, "issues": []}) - status = str(rd.get("status", "UNKNOWN")) - if rule.type == RuleType.NOT_NULL: - key = "not_null" - elif rule.type == RuleType.RANGE: - key = "range" - elif rule.type == RuleType.ENUM: - key = "enum" - elif rule.type == RuleType.REGEX: - key = "regex" - elif rule.type == RuleType.DATE_FORMAT: - key = "date_format" - else: - key = rule.type.value.lower() - if column_guard.get(col) == "FIELD_MISSING": - continue - if column_guard.get(col) == "TYPE_MISMATCH" and key in { - "not_null", - "range", - "enum", - "regex", - "date_format", - }: - continue - if status in {"FAILED", "ERROR", "SKIPPED"}: - entry["issues"].append( - { - "check": key, - "status": status, - "failed_records": int(rd.get("failed_records", 0) or 0), - "skip_reason": skip_map.get(rid, {}).get("skip_reason"), - } - ) + table_name = rd.get("table_name", "unknown") + if table_name not in tables_grouped: + tables_grouped[table_name] = {} + + col = rd.get("column_name", "") + if col: + if col not in tables_grouped[table_name]: + tables_grouped[table_name][col] = {"column": col, "issues": []} + + status = str(rd.get("status", "UNKNOWN")) + if rd.get("rule_type") == RuleType.NOT_NULL.value: + key = "not_null" + elif rd.get("rule_type") == RuleType.RANGE.value: + key = "range" + elif rd.get("rule_type") == RuleType.ENUM.value: + key = "enum" + elif rd.get("rule_type") == RuleType.REGEX.value: + key = "regex" + elif rd.get("rule_type") == RuleType.DATE_FORMAT.value: + key = "date_format" + else: + key = rd.get("rule_type", "unknown").lower() + + if status in {"FAILED", "ERROR", "SKIPPED"}: + tables_grouped[table_name][col]["issues"].append( + { + "check": key, + "status": status, + "failed_records": int(rd.get("failed_records", 0) or 0), + "skip_reason": rd.get("skip_reason"), + } + ) lines: List[str] = [] lines.append(f"✓ Checking {source} ({header_total_records:,} records)") @@ -793,34 +933,29 @@ def _calc_failed(res: Dict[str, Any]) -> int: int(r.get("failed_records", 0) or 0) for r in table_results ) - for col in sorted(grouped.keys()): - guard = column_guard.get(col, "NONE") - if guard == "FIELD_MISSING": - lines.append(f"✗ {col}: missing (skipped dependent checks)") - continue - if guard == "TYPE_MISMATCH": - lines.append(f"✗ {col}: type mismatch (skipped dependent checks)") - continue - issues = grouped[col]["issues"] - critical = [i for i in issues if i["status"] in {"FAILED", "ERROR"}] - if not critical: - lines.append(f"✓ {col}: OK") - else: - for i in critical: - fr = i.get("failed_records") or 0 - if i["status"] == "ERROR": - lines.append(f"✗ {col}: {i['check']} error") - else: - lines.append(f"✗ {col}: {i['check']} failed ({fr} failures)") - - total_columns = len(grouped) + # Display results grouped by table + for table_name in sorted(tables_grouped.keys()): + if len(tables_grouped) > 1: # Only show table header for multi-table + lines.append(f"\n📋 Table: {table_name}") + + table_grouped = tables_grouped[table_name] + for col in sorted(table_grouped.keys()): + issues = table_grouped[col]["issues"] + critical = [i for i in issues if i["status"] in {"FAILED", "ERROR"}] + if not critical: + lines.append(f"✓ {col}: OK") + else: + for i in critical: + fr = i.get("failed_records") or 0 + if i["status"] == "ERROR": + lines.append(f"✗ {col}: {i['check']} error") + else: + lines.append(f"✗ {col}: {i['check']} failed ({fr} failures)") + + total_columns = sum(len(tables_grouped[table]) for table in tables_grouped) passed_columns = sum( - 1 - for col in grouped - if column_guard.get(col, "NONE") == "NONE" - and not [ - i for i in grouped[col]["issues"] if i["status"] in {"FAILED", "ERROR"} - ] + sum(1 for col in table_grouped.values() if not col["issues"]) + for table_grouped in tables_grouped.values() ) failed_columns = total_columns - passed_columns overall_error_rate = ( @@ -828,6 +963,15 @@ def _calc_failed(res: Dict[str, Any]) -> int: if header_total_records == 0 else (total_failed_records / max(header_total_records, 1)) * 100 ) + + if len(tables_grouped) > 1: + lines.append(f"\n📊 Multi-table Summary:") + for table_name in sorted(tables_grouped.keys()): + table_columns = len(tables_grouped[table_name]) + table_passed = sum(1 for col in tables_grouped[table_name].values() if not col["issues"]) + table_failed = table_columns - table_passed + lines.append(f" {table_name}: {table_passed} passed, {table_failed} failed") + lines.append( f"\nSummary: {passed_columns} passed, {failed_columns} failed" f" ({overall_error_rate:.2f}% overall error rate)" @@ -844,13 +988,12 @@ def _calc_failed(res: Dict[str, Any]) -> int: required=True, help="Database connection string or file path", ) -@click.option("--table", "table_name", required=True, help="Table name to validate") @click.option( "--rules", "rules_file", type=click.Path(exists=True, readable=True), required=True, - help="Path to schema rules file (JSON)", + help="Path to schema rules file (JSON) - supports both single-table and multi-table formats", ) @click.option( "--output", @@ -875,28 +1018,29 @@ def _calc_failed(res: Dict[str, Any]) -> int: @click.option("--verbose", is_flag=True, default=False, help="Enable verbose output") def schema_command( connection_string: str, - table_name: str, rules_file: str, output: str, fail_on_error: bool, max_errors: int, verbose: bool, ) -> None: - """Schema validation command with minimal rules file validation. + """Schema validation command with support for both single-table and multi-table validation. NEW FORMAT: - vlite-cli schema --conn --table \ - --rules [options] + vlite-cli schema --conn --rules [options] SOURCE can be: - File path: users.csv, data.xlsx, records.json - Database URL: mysql://user:pass@host/db - SQLite file: sqlite:///path/to/file.db + RULES FILE FORMATS: + - Single-table: {"rules": [...]} + - Multi-table: {"table1": {"rules": [...]}, "table2": {"rules": [...]}} + Examples: - vlite-cli schema --conn users.csv --table users --rules schema.json - vlite-cli schema --conn mysql://user:pass@host/db --table users \ - --rules schema.json + vlite-cli schema --conn users.csv --rules schema.json + vlite-cli schema --conn mysql://user:pass@host/db --rules multi_table_schema.json """ from cli.core.config import get_cli_config @@ -914,27 +1058,14 @@ def schema_command( warnings, rules_count = _validate_rules_payload(rules_payload) _emit_warnings(warnings) - # Decompose into atomic rules per design - atomic_rules = _decompose_to_atomic_rules(rules_payload) - - # FIX: Manually populate the target table and database from CLI args - # The source_config object is a class instance, not a dict. - # Use attribute access. + # Get database name from source config source_db = source_config.db_name if not source_db: source_db = "unknown" - for rule in atomic_rules: - if rule.target and rule.target.entities: - rule.target.entities[0].database = source_db - rule.target.entities[0].table = table_name - - # get database name from SourceParser results - # source_db = source_config.get('database') - # for rule in atomic_rules: - # if rule.target and rule.target.entities: - # rule.target.entities[0].database = source_db - # rule.target.entities[0].table = table_name + # Decompose into atomic rules using new multi-table aware function + atomic_rules = _decompose_multi_table_schema_with_source_info(rules_payload, source_config) + # Fast-path: no rules -> emit minimal payload and exit cleanly if len(atomic_rules) == 0: _early_exit_when_no_rules( diff --git a/cli/core/data_validator.py b/cli/core/data_validator.py index fb73021..a63b07c 100644 --- a/cli/core/data_validator.py +++ b/cli/core/data_validator.py @@ -184,15 +184,26 @@ async def _validate_file(self) -> List[ExecutionResultSchema]: """Validate file-based data source""" self.logger.info(f"Validating file: {self.source_config.file_path}") - # Load file data - try: - df = self._load_file_data() - self.logger.info(f"Loaded {len(df)} records from file") - except Exception as e: - raise ValueError(f"Failed to load file data: {str(e)}") + # Check if this is a multi-table Excel file + is_multi_table = self.source_config.parameters.get("is_multi_table", False) + self.logger.info(f"Multi-table detection: is_multi_table={is_multi_table}, connection_type={self.source_config.connection_type}") + self.logger.info(f"Source config parameters: {self.source_config.parameters}") + + if is_multi_table and self.source_config.connection_type == ConnectionType.EXCEL: + # Handle multi-table Excel file + self.logger.info("Processing multi-table Excel file") + sqlite_config = await self._convert_multi_table_excel_to_sqlite() + else: + # Handle single-table file (existing logic) + self.logger.info("Processing single-table file") + try: + df = self._load_file_data() + self.logger.info(f"Loaded {len(df)} records from file") + except Exception as e: + raise ValueError(f"Failed to load file data: {str(e)}") - # Convert to SQLite for rule engine processing - sqlite_config = await self._convert_file_to_sqlite(df) + # Convert to SQLite for rule engine processing + sqlite_config = await self._convert_file_to_sqlite(df) # Execute rules using rule engine with new interface rule_engine = RuleEngine(connection=sqlite_config, core_config=self.core_config) @@ -310,6 +321,128 @@ def _load_file_data(self) -> pd.DataFrame: except Exception as e: raise ValueError(f"Failed to parse file: {str(e)}") + async def _convert_multi_table_excel_to_sqlite(self) -> ConnectionSchema: + """ + Convert multi-table Excel file to SQLite database. + + Returns: + ConnectionSchema: SQLite connection configuration + """ + import os + import tempfile + import time + + from sqlalchemy import create_engine + + temp_db_file = None + temp_db_path = None + start_time = time.time() + + try: + # Create a temporary SQLite file + temp_db_file = tempfile.NamedTemporaryFile(suffix=".db", delete=False) + temp_db_path = temp_db_file.name + temp_db_file.close() + + # Create SQLite engine + engine = create_engine(f"sqlite:///{temp_db_path}") + + # Load all sheets into SQLite + await self._load_multi_table_excel_to_sqlite(engine, temp_db_path) + + # Get table mapping for connection config + table_mapping = self.source_config.parameters.get("table_mapping", {}) + + # Create connection config with multi-table information + sqlite_config = ConnectionSchema( + name=f"temp_sqlite_multi_table", + description="Temporary SQLite for multi-table Excel validation", + connection_type=ConnectionType.SQLITE, + file_path=temp_db_path, + parameters={ + "is_multi_table": True, + "table_mapping": table_mapping, + "temp_file": True, # Mark as temporary file for cleanup + }, + ) + + # Log performance metrics + elapsed_time = time.time() - start_time + self.logger.info( + f"Created temporary SQLite database at {temp_db_path} with " + f"{len(table_mapping)} tables in {elapsed_time:.2f} seconds" + ) + + return sqlite_config + + except Exception as e: + # Clean up temporary file if it exists + if temp_db_path and os.path.exists(temp_db_path): + try: + os.unlink(temp_db_path) + except Exception as cleanup_error: + self.logger.warning( + f"Failed to cleanup temporary file {temp_db_path}: {cleanup_error}" + ) + raise ValueError(f"Failed to create multi-table SQLite database: {str(e)}") + + async def _load_multi_table_excel_to_sqlite(self, engine, temp_db_path: str) -> None: + """ + Load multiple sheets from Excel file into SQLite database. + + Args: + engine: SQLAlchemy engine for SQLite + temp_db_path: Path to temporary SQLite database + """ + import pandas as pd + + file_path = self.source_config.file_path + sheets_info = self.source_config.parameters.get("sheets", {}) + + if not sheets_info: + raise ValueError("Multi-table Excel file but no sheets information available") + + self.logger.info(f"Loading {len(sheets_info)} sheets into SQLite: {list(sheets_info.keys())}") + + # Store table name mapping for later use + table_mapping = {} + + # Load each sheet into a separate table + for sheet_name, columns in sheets_info.items(): + try: + # Read the specific sheet + df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl") + + # Validate that the sheet has the expected columns + expected_columns = set(columns) + actual_columns = set(df.columns) + + if not expected_columns.issubset(actual_columns): + missing_columns = expected_columns - actual_columns + self.logger.warning(f"Sheet '{sheet_name}' missing expected columns: {missing_columns}") + + # Write to SQLite with sheet name as table name + # Clean table name for SQLite (remove special characters) + clean_table_name = "".join(c for c in sheet_name if c.isalnum() or c == '_') + if not clean_table_name or clean_table_name[0].isdigit(): + clean_table_name = f"sheet_{clean_table_name}" + + # Store the mapping from original sheet name to clean table name + table_mapping[sheet_name] = clean_table_name + + df.to_sql(clean_table_name, engine, if_exists="replace", index=False) + self.logger.info(f"Loaded sheet '{sheet_name}' as table '{clean_table_name}' with {len(df)} rows") + + except Exception as e: + self.logger.error(f"Failed to load sheet '{sheet_name}': {str(e)}") + # Continue with other sheets + continue + + # Store the table mapping in the source config for later use + if hasattr(self, 'source_config') and hasattr(self.source_config, 'parameters'): + self.source_config.parameters['table_mapping'] = table_mapping + self.logger.info(f"Stored table mapping: {table_mapping}") + async def _convert_file_to_sqlite(self, df: pd.DataFrame) -> ConnectionSchema: """ Convert pandas DataFrame to SQLite in-memory database diff --git a/cli/core/source_parser.py b/cli/core/source_parser.py index e2bf3a7..d13a584 100644 --- a/cli/core/source_parser.py +++ b/cli/core/source_parser.py @@ -8,7 +8,7 @@ import re import urllib.parse from pathlib import Path -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict, List from uuid import uuid4 from cli.exceptions import ValidationError @@ -95,6 +95,60 @@ def parse_source( self.logger.error(f"{str(e)}") raise + def get_excel_sheets(self, file_path: str) -> Dict[str, List[str]]: + """ + Get sheet names from Excel file. + + Args: + file_path: Path to Excel file + + Returns: + Dict with sheet names as keys and column lists as values + + Raises: + ImportError: If pandas/openpyxl not available + FileNotFoundError: If file not found + """ + try: + import pandas as pd + except ImportError: + raise ImportError("pandas is required to read Excel files") + + try: + excel_file = pd.ExcelFile(file_path) + sheets_info = {} + + for sheet_name in excel_file.sheet_names: + # Read first few rows to get column names + df = pd.read_excel(file_path, sheet_name=sheet_name, nrows=0) + sheets_info[sheet_name] = list(df.columns) + + return sheets_info + except Exception as e: + self.logger.error(f"Error reading Excel file {file_path}: {str(e)}") + raise + + def is_multi_table_excel(self, file_path: str) -> bool: + """ + Check if Excel file contains multiple sheets that could represent multiple tables. + + Args: + file_path: Path to Excel file + + Returns: + True if file has multiple sheets, False otherwise + """ + try: + import pandas as pd + excel_file = pd.ExcelFile(file_path) + return len(excel_file.sheet_names) > 1 + except ImportError: + # If pandas not available, assume single table + return False + except Exception: + # If any error occurs, assume single table + return False + def _is_database_url(self, source: str) -> bool: """Check if source is a database URL""" for patterns in self.db_url_patterns.values(): @@ -200,9 +254,37 @@ def _parse_file_path(self, file_path: str) -> ConnectionSchema: f"Unknown file extension {file_ext}, assuming CSV format" ) + # Check if this is a multi-table Excel file + is_multi_table = False + sheets_info = {} + if conn_type == ConnectionType.EXCEL: + is_multi_table = self.is_multi_table_excel(file_path) + if is_multi_table: + try: + sheets_info = self.get_excel_sheets(file_path) + self.logger.info(f"Multi-table Excel file detected with {len(sheets_info)} sheets: {list(sheets_info.keys())}") + except Exception as e: + self.logger.warning(f"Could not read Excel sheets: {str(e)}") + is_multi_table = False + + # Prepare parameters + parameters = { + "filename": path.name, + "file_size": path.stat().st_size, + "encoding": "utf-8", # Default encoding + } + + # Add multi-table information for Excel files + if is_multi_table and sheets_info: + parameters["is_multi_table"] = True + parameters["sheets"] = sheets_info + parameters["table_count"] = len(sheets_info) + else: + parameters["is_multi_table"] = False + return ConnectionSchema( name=f"file_connection_{uuid4().hex[:8]}", - description=f"File connection: {path.name}", + description=f"File connection: {path.name}" + (" (multi-table)" if is_multi_table else ""), connection_type=conn_type, host=None, port=None, @@ -211,16 +293,12 @@ def _parse_file_path(self, file_path: str) -> ConnectionSchema: password=None, db_schema=None, file_path=str(path.absolute()), - parameters={ - "filename": path.name, - "file_size": path.stat().st_size, - "encoding": "utf-8", # Default encoding - }, + parameters=parameters, capabilities=DataSourceCapability( supports_sql=False, supports_batch_export=True, - max_export_rows=100000, - estimated_throughput=5000, + max_export_rows=100000 if not is_multi_table else 50000, # Reduce for multi-table + estimated_throughput=5000 if not is_multi_table else 2000, # Reduce for multi-table ), cross_db_settings=None, ) diff --git a/test_data/multi_table_data.xlsx b/test_data/multi_table_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..f53dfd101d8e7ed6986c080908205ece803f1f58 GIT binary patch literal 6649 zcmZ{I1yEG&`}Wd}uyliz^iq-{AstfEB_Z8NNlS@>EZw+tcT0CS3xb4%fPkb+!++K9 z9e?t@@6Mca=FIGMo#(#ix>HpV37HT80H6axvjHL8*n-2W^%dLG zoHCX##}Xtxs)W^;x>j}gge#e8ia%3g;%#~}ioGRJ6iQ@6xqYiKM5K}NbN{G7wJ_mox>2 z>&JQOOCLkJ3l`h*IrIgPNumxf&niph2tqJ0GBv`J3Obs%40ozQ&Rw`q)BxtL;`y#Z zcz);}bVABqynA#u$B<|2o7I_qNjt_SR^cIM2v{M9^udG0_Rq`54qr)1x%La`K+ z3~s8QE4-*08&P^`sO>)VhT>t-m%Z&n3%yn=AErGV9>!99CP-fNw5OUVtTebdOLXLh z^MfHa0WuZ6!<>+p;nJcbP0}inG0BX5$E^Nnj}7Cu3tzYEtu}PSijlJOe6TUu7$W1B z*|tKizkQQzkVcP(3IL=r0{{eY-+0<_xmj7*Tl~Ir|MbqDp1$k6AfexW`M9Sg%pR-v zVPIVifvdIqLQQh;o(fF}zD}f_mpM4tzfJZb-cSP0UW{^6swgu1_T{Lw?Bd-T0Lt(m!R(e0kG?H&|7pxPr@On z!HPaP7d`Y_&BCHqu(ch8RMyzp+oCG>X^T-q8I)}v$MP1#`3Y2ZQsPCTtLu(ZyZXKJS=;P z2%QHv8ir z*nAl{uP9RE>C~z3@=mH%8fK*a(Tzy6WpU?_=s%?3L@*dS-s?tg_^&Q!_ip;U&HP^Dt%y5ds%sN=r$fF&GS-%g8gf ze~!7MB5Fyal~7?gHnyJe&V~NtB232@_Egq^ezvf^`u=R341;uLKS$F#zF@694iCX& z^z_gq8&yig91QLRNa$HjZwdYifp2{)djE1{_-V0Y$|G%vj{JP^8J-OJYMSd;*9p5z zv|O19*t6!(*@-$t2#KW2d`0~1Jy6g1_QTRNQ7Qu9NE{ZbL}(%d6R=RoPg#!OhAnsg zCT&XmAdZXSLIM2Vx|%-UjcH_ym%Kx!oZ@Ae)ugER4a#{MKPDxZf(p@+qFRP7*!trE zbvZ*+N@*uPo1~a2*+X)dTH?Su=fYqUJz(#Ir^;jv{*`VCf2^2E|LzF*QF(2l)Wrx9pvi_qb~W5t)QV7*A0H5GrV@}QKz!gBns>InDQTQ^YEc% zr|4p|1Z&x37KQPN6;ShFq8v6-i*0y&VhSYmHmjBey+XL1m>1|QJ_~9W^5^s&%iV80 zc?6lP7ohjG?3jA1za-)YiJ zH!=3Hb5Y~Xnh`8ssd^X1Z*_64wM>EvCzZoaD%3;RB9@{tRCOL7sQ52ohVJY1xgd>1 zsRnY%RwVnT^v}YA8A;$q%qjI~$FB439#8UBp-;$-zDe5ln{yKrmzy1G71{fu2~>UI zM_F{=A)!fQT~c9QuZ)u{<9k*42<`1k7}#a%O&Vqo*$G$7O*lTFy)IWnJW_vV!mZzn z#1@_9qx4RC%V{&wz<6yUW^EzQqSm3D#e>ZLzPeDSbrOMP@!*n0WydqBv1j^66|FXp zYKTxb(CSCtL-W;RpP%DnHpUnF{Tm>gX@4anZx5ZN?7F_bFv zgDtLIIK+y`DW!S!;3K$6BhtrHq7swvtgC*HiAE(r7>OHra)20Qo<|vwbb-}!gx$zO zG+HVq1M$?aKNFK=?XD6>vSBJT#dN*`Rm)z<=sn2m{{GQOXG_SI>cb*&DD?sNM8}dSMIZQf*oOkAJiBKe_qr{B(#^Z%) z94i}Vm`2JS3!Vf$O+gr>!vhd;Fu@nNhSr?iE4^BQog)2ZUxfu z$0|gM)!zIMl)-uGv?!MrCxVU3oHo+e-87h%d!^vl135k(9q&!aA`O+>X$3}u<}wLN z0=YNM+0I|_N4CZY=eFI{_!phqdy!34B<#i$=NZU&;Io=MjT&z5KlW(g8EI4aj3+hu zWPn#4_o1>&eL?l?0nC}zpx9mJlZMT4avU+n!#YfoFIPpnKWve|j6?KJi+96WcB1Vs zEZhyKvh3UBC`_X+w8HEyQ()c7uU6+fTh#L1Vpd|8Lj_H4H4+f*%`FKeQv`~=0`7IP zp=KK#m8qQ7b{{X8*3qpE6+^p(?RD~7{olSMH)TW|kJ8X^p9s}-)Z%}#qs>cIZza{M ziG4ImzLcB^=@G$V&>6sPJhx$K3XrN&dLh*;FjJFHaYU7qE>Pn-<`2zEN86<#8$l>7 z(Pp-nlS#7+pDkx@wLJ_?+app{b>2cS;%mp^k|@qBmTvbq1ok}mpgo94fXj0H)+KzN z%LaN8YcI9*g~e=t>U&w4@e`_o@$il%;YitKJNlUo&IV)|tsfrTvAuFeT#Bzkf?V#+ z9k;~gVSF~6{yezt|F_({)<#UZ;c|QU-*V&m*|bE)?>Wqa2tyqi2T@k=N4Zyal1P!c^BHfPpfg1N1Xc4Ubiisrftqu9@CSE=pq%YJe5jL3#(P%qmryD zfpoZZ>zjDL=#qXN|2={urW9bHt0tvJDa~R7dj1@|P${dn?ob~(?I5`(s(Ly>7`W;(bpFfHi6%>M)U9EfaC?SknO&H**skIwIOovd5S2zU4w*F z{nf`(*tpn>G&wCmvhW=W)E`X;F}GLV^61iz+z**wq5j(8w&!T)1gg^+Gi98k{zEh^ zL;{7PIse-m8k&{iIG>z`PE3-uH%z-lIwy+tFsrnOa4k%c)Jl zipLsZEJwVIR2r)c+%2XG-aMWiwwQxx8y7ud1!6-81?HaFlenxe*Y)~X1Zze!(4EgUBO%9|Hx{*KV_C1ujZHv!s~0p zTt%7=24E3MkTXodK!rB7dzCJo_$B1ScCmKLes!tpX84-c%_f)M{XN={(yOu7xUvwU zmogLFO?L#;VW;SC%#b*UJxanx+H)+DW6fkXzHU zXI^ORAIwik3eBs32()TTM53|MeA-|$NM1b!dg!c*uI;Wm3wtzn2G`ldTJIutc z4=QMiCG#2~_%l5V%_)-3gseT&WoJWkR@Cuhun1usw(=O#8(_#AYp{@Fww%6vZ%zyx zl2DqFtF_-X6ohf|1)l7?k7`R)432+t81ELXjfWI=!@^fVIQd2*3mJLahAApkurD}q z^gA}4`US*-Q2IXa!7rL>b1(X|$}EMTxP1&A*u591p;mIsT#`@dpYVk^DLlvud5=FL z`2;tZiZ0`lbOHyHzn*n6o%q2j_hFu*Rm>NJP{5YsGH@BQnz&*DV^Mcl)i;AnpSAEgIW#{}EKvJkC~ppQ?B)UI9* zw>mKFjMxc*SafX@BAe0!Y|$*r$zs=J-k%?K4&B}n>wKaqdQiBpXQ z<3ovLq^6>d$uf0=CO^tA4pk@ELryd#6X>m(V%UC&LzHv*L64O~osA>BQn5w3NlQch zr8rQOZ29y08(O8_*rQ&Rh-0mVik?p984}?d3XP+ZdW>atYt*5>BoB`vWo)i5qF1{;(G`t-!-~h zYGdm*E}y2^=Y9~oT-Hh(>!;wdiF;A5h2!mFhBIBai!~E!-)?$ufW%5W6NSuS`EjWv z5YykSnK?{EhrXfG;tGfJ@#CY)?_>f>q0hRCA_h+_?XD@R>K?l?Bf4u#+fKS3^95M= zBym~yc&lN5q^7H^*a6*Qwq$$_%fw>qHJA3N(F@L9DqG~MB4DEkZDX+YXaGHCcVCdj!i+#Jv?A_DB3X?bgfgM6|;rffybYSn#K@i-nyl7w6A=dW@n2 zA~&{pIncsbsjwT7yJW6HDjcif&1QCTH`uKE1Qt2hPHTm)c$<;IQGSo_gFB_`Latpe zsN3L^7Jn(A%9h0>!#PWbm0wYu1HueO8x{Bywl`bW8r~G z_yggrtSg0HK}zqFa{#a6?a1db%;}W$@{--FeEx&H;o~2)(OI(6PV6V(9HqDJaphte zi>*1wxn`y{p37Ue$KwTrkIviDLAh@xXQeTi94pRI{x)=gkM*50d>vH)H}u}`(Er&r zn>sq${_0-9iz@JiRETlzAkz4CJ44<}0i|{Zi$c-n8? zr;mw^uF5qp1NW*uEPaz)AvWL_8Jm!jj%_CioU^3j@bm=IBhLJyPN0@%9d7e_?1WVU zuMUTt*z_Fo%+yU6CeG`J5L7(NS8-EYGcL{HRWBZD0|sq(iM-}lK+AFSu!v17HV~`hvk<(C8*Bdma2gjfhUFu z%26Kak6%%qA(0x**J;vM5hhm=x!ixgTgTnug1n@ZaGwKcJ#{}C(a)YxSmp~2sdSW@ zTMqu=G@ZS$t^37OS=jLvas35jnUsgx=v!pL+b=NGzinQ)9DUCKSKBu7?2 zg@c>1rk9h2tKm<@jc5+R6^C~f-#^Rh%mPH4K0!2qp!u~kr1nM_A2}#qoZ11&F*Nmx zXeSg)JS{DIMXD0ZF9At4P{CnoWy0^d-C3x`pko9|ZC0An`!|skWSSENIQ>07e#}_= z=#;tB&fzmh_^h@3aiX=&k@Fu7o`4?2gdxmXZ%BrixIw~qTs8+CS-0~gd($7G+EO|7 z0>J&*K;LcI5UI5xT+?Yr23w+SUERN0*UH{scj=;7oJ4C4pKD# zCvjmBdSy{_(pG_3oD)7~BC@6^G1`3MFD@-o?1moXN$B^tdBZ3flV2TSKopyUkLp)N zzj)B3iV?(J^Y{)MyHcNX@mDnv)7Rfxua8`QEq64kf1zb9KpuUKReu3cRYX7}1pN1E z9ImEczW}(a|Nlk#UG&|p^lvNx5P;D2zvzE<)OX=`7q@@l%ka&ciZhhEHHR)1UKgHS$;L&cUkWCQ~$68An3z$ zES%*}cXbzfxA6Rd+5`V8LwA996Zs#Y7JQxl|4#l-TE7dvn~VN{$Kfi2ga0El-Q~F( h-+y?zi2#6qiFQ>*6!>8Cv&Lcp2H_=$jO6Fr{{hhYgqQ#T literal 0 HcmV?d00001 diff --git a/test_data/multi_table_schema.json b/test_data/multi_table_schema.json new file mode 100644 index 0000000..088e22f --- /dev/null +++ b/test_data/multi_table_schema.json @@ -0,0 +1,31 @@ +{ + "users": { + "rules": [ + { "field": "id", "type": "integer", "required": true }, + { "field": "name", "type": "string", "required": true }, + { "field": "email", "type": "string", "required": true }, + { "field": "age", "type": "integer", "min": 0, "max": 120 }, + { "field": "status", "type": "string", "enum": ["active", "inactive", "pending"] } + ], + "strict_mode": true + }, + "products": { + "rules": [ + { "field": "product_id", "type": "integer", "required": true }, + { "field": "product_name", "type": "string", "required": true }, + { "field": "price", "type": "float", "min": 0.0 }, + { "field": "category", "type": "string", "enum": ["electronics", "clothing", "books"] }, + { "field": "in_stock", "type": "boolean" } + ] + }, + "orders": { + "rules": [ + { "field": "order_id", "type": "integer", "required": true }, + { "field": "user_id", "type": "integer", "required": true }, + { "field": "order_date", "type": "datetime", "required": true }, + { "field": "total_amount", "type": "float", "min": 0.0 }, + { "field": "order_status", "type": "string", "enum": ["pending", "confirmed", "shipped", "delivered"] } + ], + "case_insensitive": true + } +} diff --git a/tests/e2e/cli_scenarios/test_schema_command_e2e.py b/tests/e2e/cli_scenarios/test_schema_command_e2e.py new file mode 100644 index 0000000..143d872 --- /dev/null +++ b/tests/e2e/cli_scenarios/test_schema_command_e2e.py @@ -0,0 +1,218 @@ +""" +E2E: vlite-cli schema on databases and table/json outputs + +Scenarios derived from notes/测试方案-数据库SchemaDrift与CLI-Schema命令.md: +- Happy path on DB URL with table/json outputs +- Drift: missing column (FIELD_MISSING), type mismatch (TYPE_MISMATCH), strict extras +- Exit codes and minimal payload when empty rules +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path + +import pytest + +from tests.shared.utils.database_utils import ( + get_available_databases, + get_mysql_test_url, + get_postgresql_test_url, +) +from tests.shared.utils.e2e_test_utils import E2ETestUtils + +pytestmark = pytest.mark.e2e + + +def _db_urls() -> list[str]: + urls: list[str] = [] + available = set(get_available_databases()) + if "mysql" in available: + urls.append(get_mysql_test_url()) + if "postgresql" in available: + urls.append(get_postgresql_test_url()) + return urls + + +def _write_rules(tmp_dir: Path, payload: dict) -> str: + p = tmp_dir / "rules.json" + p.write_text(json.dumps(payload), encoding="utf-8") + return str(p) + + +def _param_db_urls() -> list[object]: + """Mypy-friendly parameter provider for pytest.mark.parametrize. + + Returns list[object] so we can mix str and pytest.param when DB not configured. + """ + out: list[object] = [] + urls = _db_urls() + if urls: + out.extend(urls) + else: + out.append(pytest.param("", marks=pytest.mark.skip(reason="No DB configured"))) + return out + + +@pytest.mark.parametrize("db_url", _param_db_urls()) +def test_happy_path_table_and_json(tmp_path: Path, db_url: str) -> None: + # Schema baseline + a couple atomic rules + rules = { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "email", "type": "string"}, + {"field": "age", "type": "integer", "min": 0, "max": 150}, + ], + "strict_mode": False, + "case_insensitive": True, + } + rules_file = _write_rules(tmp_path, rules) + + # table output + r1 = E2ETestUtils.run_cli_command( + [ + "schema", + "--conn", + db_url, + "--table", + "customers", + "--rules", + rules_file, + "--output", + "table", + ] + ) + assert r1.returncode in {0, 1} + assert "Checking" in r1.stdout + + # json output + r2 = E2ETestUtils.run_cli_command( + [ + "schema", + "--conn", + db_url, + "--table", + "customers", + "--rules", + rules_file, + "--output", + "json", + ] + ) + assert r2.returncode in {0, 1} + try: + payload = json.loads(r2.stdout) + except Exception as e: + assert False, ( + "Expected JSON output from CLI but failed to parse. " + f"Error: {e}\nSTDOUT:\n{r2.stdout}\nSTDERR:\n{r2.stderr}" + ) + assert payload["status"] == "ok" + assert payload["rules_count"] >= 1 + assert "summary" in payload and "results" in payload and "fields" in payload + + +@pytest.mark.parametrize("db_url", _param_db_urls()) +def test_drift_missing_and_type_mismatch(tmp_path: Path, db_url: str) -> None: + # Declare a missing column and mismatched type to trigger SKIPPED in JSON for dependent rules + rules = { + "rules": [ + {"field": "email", "type": "integer", "required": True}, # mismatch + { + "field": "status", + "type": "string", + "enum": ["active", "inactive"], + }, # missing + ], + "strict_mode": False, + "case_insensitive": True, + } + rules_file = _write_rules(tmp_path, rules) + + r = E2ETestUtils.run_cli_command( + [ + "schema", + "--conn", + db_url, + "--table", + "customers", + "--rules", + rules_file, + "--output", + "json", + ] + ) + assert r.returncode in {1, 0} + try: + payload = json.loads(r.stdout) + except Exception as e: + assert False, ( + "Expected JSON output from CLI but failed to parse. " + f"Error: {e}\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) + # Ensure field-level failure codes surface + fields = {f["column"]: f for f in payload.get("fields", [])} + assert "email" in fields and "status" in fields + + # Any dependent checks (not_null/range/enum) may be present; ensure skip reasons appear when applicable + # We accept either PASS/FAIL depending on data, but presence of checks map is required when emitted + + +@pytest.mark.parametrize("db_url", _param_db_urls()) +def test_strict_mode_extras_json(tmp_path: Path, db_url: str) -> None: + rules = { + "rules": [ + {"field": "id", "type": "integer"}, + ], + "strict_mode": True, + "case_insensitive": True, + } + rules_file = _write_rules(tmp_path, rules) + + r = E2ETestUtils.run_cli_command( + [ + "schema", + "--conn", + db_url, + "--table", + "customers", + "--rules", + rules_file, + "--output", + "json", + ] + ) + try: + payload = json.loads(r.stdout) + except Exception as e: + assert False, ( + "Expected JSON output from CLI but failed to parse. " + f"Error: {e}\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) + # schema_extras should appear and be an array + assert isinstance(payload.get("schema_extras", []), list) + + +def test_empty_rules_minimal_payload(tmp_path: Path) -> None: + # Use a simple CSV source to exercise early-exit path + data_file = tmp_path / "data.csv" + data_file.write_text("id\n1\n", encoding="utf-8") + rules_file = _write_rules(tmp_path, {"rules": []}) + + r = E2ETestUtils.run_cli_command( + [ + "schema", + "--conn", + str(data_file), + "--table", + "data", + "--rules", + rules_file, + "--output", + "json", + ] + ) + assert r.returncode == 0 + payload = json.loads(r.stdout) + assert payload["rules_count"] == 0 diff --git a/tests/unit/cli/commands/test_schema_command.py b/tests/unit/cli/commands/test_schema_command.py new file mode 100644 index 0000000..88a8d72 --- /dev/null +++ b/tests/unit/cli/commands/test_schema_command.py @@ -0,0 +1,224 @@ +"""Unit tests for schema command skeleton.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest +from click.testing import CliRunner + +from cli.app import cli_app +from cli.core.data_validator import ExecutionResultSchema + + +def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: + file_path = tmp_path / name + file_path.write_text(content, encoding="utf-8") + return str(file_path) + + +class TestSchemaCommandSkeleton: + def test_schema_command_help_registered(self) -> None: + runner = CliRunner() + result = runner.invoke(cli_app, ["--help"]) + assert result.exit_code == 0 + assert "schema" in result.output + + def test_schema_requires_source_and_rules(self, tmp_path: Path) -> None: + runner = CliRunner() + + # Missing args -> Click usage error (exit code >= 2) + result = runner.invoke(cli_app, ["schema"]) + assert result.exit_code >= 2 + + # Provide a minimal CSV and rules file + data_path = _write_tmp_file(tmp_path, "sample.csv", "id\n1\n") + rules_obj: dict[str, list[dict[str, Any]]] = {"rules": []} + rules_path = _write_tmp_file(tmp_path, "schema.json", json.dumps(rules_obj)) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code == 0 + assert "Checking" in result.output + + def test_output_json_mode(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file(tmp_path, "schema.json", json.dumps({"rules": []})) + + result = runner.invoke( + cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + ) + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload["status"] == "ok" + assert payload["rules_count"] == 0 + + def test_output_json_declared_columns_always_listed( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Patch decomposition to include a SCHEMA rule that declares a column not in results + from shared.enums import RuleType + from shared.schema.rule_schema import RuleSchema + from tests.shared.builders import test_builders + + schema_rule: RuleSchema = ( + test_builders.TestDataBuilder.rule() + .with_name("schema") + .with_type(RuleType.SCHEMA) + .with_target("", "", "id") + .with_parameter("columns", {"id": {"expected_type": "INTEGER"}}) + .build() + ) + + monkeypatch.setattr( + "cli.commands.schema._decompose_to_atomic_rules", + lambda payload: [schema_rule], + ) + + class DummyValidator: + async def validate(self) -> list[ExecutionResultSchema]: + # Return no results to simulate missing schema details + return [] + + monkeypatch.setattr("cli.commands.schema.DataValidator", DummyValidator) + + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file( + tmp_path, + "schema.json", + json.dumps({"rules": [{"field": "id", "type": "integer"}]}), + ) + + result = runner.invoke( + cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + ) + # No failures but explicit -- in this setup lack of results implies exit 0 + assert result.exit_code == 0 + payload = json.loads(result.output) + # Declared column should still appear with UNKNOWN statuses + fields = {f["column"]: f for f in payload["fields"]} + assert "id" in fields + assert fields["id"]["checks"]["existence"]["status"] in { + "UNKNOWN", + "PASSED", + "FAILED", + } + + def test_fail_on_error_sets_exit_code_1(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file(tmp_path, "schema.json", json.dumps({"rules": []})) + + result = runner.invoke( + cli_app, + [ + "schema", + data_path, + "--rules", + rules_path, + "--fail-on-error", + ], + ) + assert result.exit_code == 1 + + def test_invalid_rules_json_yields_usage_error(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + # invalid content + bad_rules_path = _write_tmp_file(tmp_path, "bad.json", "{invalid json}") + + result = runner.invoke( + cli_app, ["schema", data_path, "--rules", bad_rules_path] + ) + + # Click usage error exit code is >= 2 + assert result.exit_code >= 2 + assert "Invalid JSON" in result.output + + +class TestSchemaCommandValidation: + def _write_tmp_file(self, tmp_path: Path, name: str, content: str) -> str: + file_path = tmp_path / name + file_path.write_text(content, encoding="utf-8") + return str(file_path) + + def test_warn_on_top_level_table_ignored(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules = { + "table": "users", + "rules": [ + {"field": "id", "type": "integer", "required": True}, + ], + } + rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(rules)) + + result = runner.invoke( + cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + ) + # exit code from skeleton remains success + assert result.exit_code == 0 + # warning emitted to stderr + assert "table' is ignored" in (result.stderr or "") + + def test_rules_must_be_array(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps({})) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code >= 2 + assert "must be an array" in result.output + + def test_rules_item_requires_field(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") + bad = {"rules": [{"type": "integer"}]} + rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code >= 2 + assert "field must be a non-empty string" in result.output + + def test_type_must_be_supported_string(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") + bad = {"rules": [{"field": "id", "type": "number"}]} + rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code >= 2 + assert "type 'number' is not supported" in result.output + + def test_required_must_be_boolean(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") + bad = {"rules": [{"field": "id", "required": "yes"}]} + rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code >= 2 + assert "required must be a boolean" in result.output + + def test_enum_must_be_array(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") + bad = {"rules": [{"field": "flag", "enum": "01"}]} + rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code >= 2 + assert "enum must be an array" in result.output + + def test_min_max_must_be_numeric(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") + bad = {"rules": [{"field": "age", "type": "integer", "min": "0"}]} + rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code >= 2 + assert "min must be numeric" in result.output diff --git a/tests/unit/cli/commands/test_schema_command_extended.py b/tests/unit/cli/commands/test_schema_command_extended.py new file mode 100644 index 0000000..9c366c5 --- /dev/null +++ b/tests/unit/cli/commands/test_schema_command_extended.py @@ -0,0 +1,423 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, List + +import pytest +from click.testing import CliRunner + +from cli.app import cli_app +from shared.enums import RuleAction, RuleCategory, RuleType, SeverityLevel +from shared.schema.base import RuleTarget, TargetEntity +from shared.schema.rule_schema import RuleSchema + + +def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: + file_path = tmp_path / name + file_path.write_text(content, encoding="utf-8") + return str(file_path) + + +def _make_rule( + *, + name: str, + rule_type: RuleType, + column: str | None, + parameters: Dict[str, Any], + description: str | None = None, +) -> RuleSchema: + target = RuleTarget( + entities=[ + TargetEntity( + database="", table="", column=column, connection_id=None, alias=None + ) + ], + relationship_type="single_table", + ) + return RuleSchema( + name=name, + description=description, + type=rule_type, + target=target, + parameters=parameters, + cross_db_config=None, + threshold=0.0, + category=( + RuleCategory.VALIDITY + if rule_type in {RuleType.SCHEMA, RuleType.RANGE, RuleType.ENUM} + else RuleCategory.COMPLETENESS + ), + severity=SeverityLevel.MEDIUM, + action=RuleAction.ALERT, + is_active=True, + tags=[], + template_id=None, + validation_error=None, + ) + + +class TestSchemaDecompositionAndMapping: + def test_map_type_names_are_case_insensitive_and_validated( + self, tmp_path: Path + ) -> None: + from cli.commands.schema import _map_type_name_to_datatype + + assert _map_type_name_to_datatype("STRING").value == "STRING" + assert _map_type_name_to_datatype("integer").value == "INTEGER" + assert _map_type_name_to_datatype("DateTime").value == "DATETIME" + + with pytest.raises(Exception): + _map_type_name_to_datatype("number") + + def test_decompose_to_atomic_rules_structure(self, tmp_path: Path) -> None: + from cli.commands.schema import _decompose_to_atomic_rules + + payload = { + "strict_mode": True, + "case_insensitive": True, + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "age", "min": 0, "max": 100}, + {"field": "status", "enum": ["A", "B"]}, + ], + } + + rules = _decompose_to_atomic_rules(payload) + + # First rule should be SCHEMA when any columns declared + assert rules[0].type == RuleType.SCHEMA + schema_params = rules[0].parameters or {} + assert schema_params["columns"]["id"]["expected_type"] == "INTEGER" + assert schema_params["strict_mode"] is True + assert schema_params["case_insensitive"] is True + + types = [r.type for r in rules] + # NOT_NULL created for required + assert RuleType.NOT_NULL in types + # RANGE created for min/max + assert RuleType.RANGE in types + # ENUM created when enum declared + assert RuleType.ENUM in types + + +class TestSchemaPrioritizationAndOutputs: + def test_prioritization_skip_map(self) -> None: + from cli.commands.schema import _build_prioritized_atomic_status + + # Build atomic rules manually + schema = _make_rule( + name="schema", + rule_type=RuleType.SCHEMA, + column=None, + parameters={ + "columns": { + "id": {"expected_type": "INTEGER"}, + "email": {"expected_type": "STRING"}, + "age": {"expected_type": "INTEGER"}, + } + }, + ) + not_null_email = _make_rule( + name="not_null_email", + rule_type=RuleType.NOT_NULL, + column="email", + parameters={}, + ) + range_age = _make_rule( + name="range_age", + rule_type=RuleType.RANGE, + column="age", + parameters={"min_value": 0, "max_value": 120}, + ) + + atomic_rules = [schema, not_null_email, range_age] + + # Simulate SCHEMA execution details + schema_result = { + "execution_plan": { + "schema_details": { + "field_results": [ + {"column": "email", "failure_code": "TYPE_MISMATCH"}, + {"column": "age", "failure_code": "FIELD_MISSING"}, + {"column": "id", "failure_code": "NONE"}, + ] + } + } + } + + skip_map = _build_prioritized_atomic_status( + schema_result=schema_result, atomic_rules=atomic_rules + ) + + # email dependent rules should be skipped for TYPE_MISMATCH + assert skip_map[str(not_null_email.id)]["status"] == "SKIPPED" + assert skip_map[str(not_null_email.id)]["skip_reason"] == "TYPE_MISMATCH" + # age dependent rules should be skipped for FIELD_MISSING + assert skip_map[str(range_age.id)]["status"] == "SKIPPED" + assert skip_map[str(range_age.id)]["skip_reason"] == "FIELD_MISSING" + + def test_json_output_aggregation_and_skip_semantics( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Prepare known atomic rules and patch decomposition to return them + schema = _make_rule( + name="schema", + rule_type=RuleType.SCHEMA, + column=None, + parameters={ + "columns": { + "email": {"expected_type": "STRING"}, + "age": {"expected_type": "INTEGER"}, + } + }, + ) + not_null_email = _make_rule( + name="not_null_email", + rule_type=RuleType.NOT_NULL, + column="email", + parameters={}, + ) + range_age = _make_rule( + name="range_age", + rule_type=RuleType.RANGE, + column="age", + parameters={"min_value": 0, "max_value": 150}, + ) + atomic_rules = [schema, not_null_email, range_age] + + # Patch decomposition + monkeypatch.setattr( + "cli.commands.schema._decompose_to_atomic_rules", + lambda payload: atomic_rules, + ) + + # Build SCHEMA and dependent rule results. Dependent rules are PASSED in raw + # and should be overridden to SKIPPED in JSON when schema marks issues. + schema_result = { + "rule_id": str(schema.id), + "status": "FAILED", + "dataset_metrics": [ + {"entity_name": "x", "total_records": 2, "failed_records": 2} + ], + "execution_plan": { + "schema_details": { + "field_results": [ + { + "column": "age", + "existence": "FAILED", + "type": "SKIPPED", + "failure_code": "FIELD_MISSING", + }, + { + "column": "email", + "existence": "PASSED", + "type": "FAILED", + "failure_code": "TYPE_MISMATCH", + }, + ], + "extras": [], + } + }, + } + not_null_email_result = { + "rule_id": str(not_null_email.id), + "status": "PASSED", + "dataset_metrics": [ + {"entity_name": "x", "total_records": 10, "failed_records": 0} + ], + } + range_age_result = { + "rule_id": str(range_age.id), + "status": "PASSED", + "dataset_metrics": [ + {"entity_name": "x", "total_records": 10, "failed_records": 0} + ], + } + + # Patch DataValidator.validate to return our results + class DummyValidator: + def __init__(self, *args: Any, **kwargs: Any) -> None: # noqa: D401 + pass + + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] + return [schema_result, not_null_email_result, range_age_result] + + monkeypatch.setattr("cli.commands.schema.DataValidator", DummyValidator) + + # Prepare inputs and run CLI in JSON output mode + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file( + tmp_path, + "schema.json", + json.dumps( + { + "rules": [ + {"field": "email", "type": "string"}, + {"field": "age", "type": "integer"}, + ] + } + ), + ) + + result = runner.invoke( + cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + ) + + assert result.exit_code == 1 # schema failed -> non-zero + payload = json.loads(result.output) + assert payload["status"] == "ok" + assert payload["rules_count"] == len(atomic_rules) + # Results should contain SKIPPED overrides for dependent rules + results_map = {r["rule_id"]: r for r in payload["results"]} + assert results_map[str(not_null_email.id)]["status"] == "SKIPPED" + assert results_map[str(not_null_email.id)]["skip_reason"] == "TYPE_MISMATCH" + assert results_map[str(range_age.id)]["status"] == "SKIPPED" + assert results_map[str(range_age.id)]["skip_reason"] == "FIELD_MISSING" + + # Fields aggregate should include existence/type and dependent checks + fields = {f["column"]: f for f in payload["fields"]} + assert fields["age"]["checks"]["existence"]["status"] == "FAILED" + assert fields["email"]["checks"]["type"]["status"] == "FAILED" + assert fields["email"]["checks"]["not_null"]["status"] == "SKIPPED" + assert fields["age"]["checks"]["range"]["status"] == "SKIPPED" + + def test_table_output_grouping_and_skips( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Prepare known atomic rules and patch decomposition to return them + schema = _make_rule( + name="schema", + rule_type=RuleType.SCHEMA, + column=None, + parameters={ + "columns": { + "email": {"expected_type": "STRING"}, + "age": {"expected_type": "INTEGER"}, + } + }, + ) + not_null_email = _make_rule( + name="not_null_email", + rule_type=RuleType.NOT_NULL, + column="email", + parameters={}, + ) + range_age = _make_rule( + name="range_age", + rule_type=RuleType.RANGE, + column="age", + parameters={"min_value": 0, "max_value": 150}, + ) + atomic_rules = [schema, not_null_email, range_age] + + monkeypatch.setattr( + "cli.commands.schema._decompose_to_atomic_rules", + lambda payload: atomic_rules, + ) + + schema_result = { + "rule_id": str(schema.id), + "status": "FAILED", + "dataset_metrics": [ + {"entity_name": "x", "total_records": 2, "failed_records": 2} + ], + "execution_plan": { + "schema_details": { + "field_results": [ + { + "column": "age", + "existence": "FAILED", + "type": "SKIPPED", + "failure_code": "FIELD_MISSING", + }, + { + "column": "email", + "existence": "PASSED", + "type": "FAILED", + "failure_code": "TYPE_MISMATCH", + }, + ], + "extras": [], + } + }, + } + # Dependent rule raw statuses set to PASSED; should be skipped for display grouping + not_null_email_result = { + "rule_id": str(not_null_email.id), + "status": "PASSED", + "dataset_metrics": [ + {"entity_name": "x", "total_records": 10, "failed_records": 0} + ], + } + range_age_result = { + "rule_id": str(range_age.id), + "status": "PASSED", + "dataset_metrics": [ + {"entity_name": "x", "total_records": 10, "failed_records": 0} + ], + } + + class DummyValidator: + def __init__(self, *args: Any, **kwargs: Any) -> None: # noqa: D401 + pass + + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] + return [schema_result, not_null_email_result, range_age_result] + + monkeypatch.setattr("cli.commands.schema.DataValidator", DummyValidator) + + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file( + tmp_path, + "schema.json", + json.dumps( + { + "rules": [ + {"field": "email", "type": "string"}, + {"field": "age", "type": "integer"}, + ] + } + ), + ) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code == 1 + output = result.output + + # Should show concise messages per column with skip semantics + assert "✗ age: missing (skipped dependent checks)" in output + assert "✗ email: type mismatch (skipped dependent checks)" in output + # Should not render separate dependent issues since they are skipped + assert "not_null" not in output + assert "range" not in output + + +class TestSchemaValidationErrorsExtended: + def test_reject_tables_top_level(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file( + tmp_path, + "schema.json", + json.dumps({"tables": {"users": []}, "rules": []}), + ) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code >= 2 + assert "not supported in v1" in result.output + + def test_enum_must_be_non_empty_array(self, tmp_path: Path) -> None: + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file( + tmp_path, + "schema.json", + json.dumps({"rules": [{"field": "status", "enum": []}]}), + ) + + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code >= 2 + assert "enum' must be a non-empty" in result.output diff --git a/tests/unit/cli/commands/test_schema_command_file_sources.py b/tests/unit/cli/commands/test_schema_command_file_sources.py new file mode 100644 index 0000000..0c799b1 --- /dev/null +++ b/tests/unit/cli/commands/test_schema_command_file_sources.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, List + +import pytest +from click.testing import CliRunner + +from cli.app import cli_app +from shared.enums import RuleType +from shared.schema.rule_schema import RuleSchema +from tests.shared.builders import test_builders + + +def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: + file_path = tmp_path / name + file_path.write_text(content, encoding="utf-8") + return str(file_path) + + +def _schema_rule_with(columns: Dict[str, Dict[str, str]]) -> RuleSchema: + return ( + test_builders.TestDataBuilder.rule() + .with_name("schema") + .with_type(RuleType.SCHEMA) + .with_target("main", "data", "id") + .with_parameter("columns", columns) + .build() + ) + + +class TestSchemaCommandForFileSources: + def test_csv_excel_to_sqlite_type_implications( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Declare DATE/DATETIME expectations; SQLite columns will be TEXT post-conversion + schema_rule = _schema_rule_with( + {"reg_date": {"expected_type": "DATE"}, "ts": {"expected_type": "DATETIME"}} + ) + monkeypatch.setattr( + "cli.commands.schema._decompose_to_atomic_rules", + lambda payload: [schema_rule], + ) + + # Build SCHEMA result indicating SQLite TEXT types cause TYPE_MISMATCH + schema_result = { + "rule_id": str(schema_rule.id), + "status": "FAILED", + "dataset_metrics": [ + {"entity_name": "main.data", "total_records": 2, "failed_records": 2} + ], + "execution_plan": { + "schema_details": { + "field_results": [ + { + "column": "reg_date", + "existence": "PASSED", + "type": "FAILED", + "failure_code": "TYPE_MISMATCH", + }, + { + "column": "ts", + "existence": "PASSED", + "type": "FAILED", + "failure_code": "TYPE_MISMATCH", + }, + ], + "extras": [], + } + }, + } + + class DummyValidator: + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] + return [schema_result] + + monkeypatch.setattr("cli.commands.schema.DataValidator", DummyValidator) + + # Prepare CSV file path as source (will be converted to SQLite inside command) + data_path = _write_tmp_file( + tmp_path, + "data.csv", + "reg_date,ts\n2023-01-01,2023-01-01T10:00:00Z\n2023-01-02,2023-01-02T11:00:00Z\n", + ) + rules_path = _write_tmp_file( + tmp_path, + "schema.json", + json.dumps( + { + "rules": [ + {"field": "reg_date", "type": "date"}, + {"field": "ts", "type": "datetime"}, + ] + } + ), + ) + + runner = CliRunner() + result = runner.invoke( + cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + ) + + assert result.exit_code == 1 + payload = json.loads(result.output) + + # The JSON `fields` section should reflect type mismatches from SQLite TEXT + fields = {f["column"]: f for f in payload["fields"]} + assert fields["reg_date"]["checks"]["type"]["status"] == "FAILED" + assert fields["ts"]["checks"]["type"]["status"] == "FAILED" diff --git a/tests/unit/cli/commands/test_schema_command_json_extras.py b/tests/unit/cli/commands/test_schema_command_json_extras.py new file mode 100644 index 0000000..2d948ae --- /dev/null +++ b/tests/unit/cli/commands/test_schema_command_json_extras.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, List + +import pytest +from click.testing import CliRunner + +from cli.app import cli_app +from shared.enums import RuleType +from shared.schema.rule_schema import RuleSchema +from tests.shared.builders import test_builders + + +def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: + file_path = tmp_path / name + file_path.write_text(content, encoding="utf-8") + return str(file_path) + + +def _schema_rule_with(columns: Dict[str, Dict[str, str]]) -> RuleSchema: + return ( + test_builders.TestDataBuilder.rule() + .with_name("schema") + .with_type(RuleType.SCHEMA) + .with_target("", "", "id") + .with_parameter("columns", columns) + .with_parameter("strict_mode", True) + .build() + ) + + +class TestSchemaJsonExtrasAndSummary: + def test_json_includes_schema_extras_and_summary_counts( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Decomposition yields one SCHEMA rule for columns id/email + schema_rule = _schema_rule_with( + { + "id": {"expected_type": "INTEGER"}, + "email": {"expected_type": "STRING"}, + } + ) + monkeypatch.setattr( + "cli.commands.schema._decompose_to_atomic_rules", + lambda payload: [schema_rule], + ) + + # Results: SCHEMA failed with 1 type mismatch, 0 existence failures, extras present + schema_result = { + "rule_id": str(schema_rule.id), + "status": "FAILED", + "dataset_metrics": [ + {"entity_name": "t", "total_records": 2, "failed_records": 1} + ], + "execution_plan": { + "schema_details": { + "field_results": [ + { + "column": "id", + "existence": "PASSED", + "type": "PASSED", + "failure_code": "NONE", + }, + { + "column": "email", + "existence": "PASSED", + "type": "FAILED", + "failure_code": "TYPE_MISMATCH", + }, + ], + "extras": ["zzz_extra", "aaa_extra"], + } + }, + } + + class DummyValidator: + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] + return [schema_result] + + monkeypatch.setattr("cli.commands.schema.DataValidator", DummyValidator) + + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file( + tmp_path, + "schema.json", + json.dumps( + { + "rules": [ + {"field": "id", "type": "integer"}, + {"field": "email", "type": "string"}, + ] + } + ), + ) + + result = runner.invoke( + cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + ) + assert result.exit_code == 1 + payload = json.loads(result.output) + + # schema_extras must present, sorted by CLI before emission + assert payload.get("schema_extras") == ["aaa_extra", "zzz_extra"] + # summary counts + assert payload["summary"]["total_rules"] == 1 + assert payload["summary"]["failed_rules"] == 1 + assert payload["summary"]["skipped_rules"] >= 0 + assert payload["summary"]["total_failed_records"] >= 1 + + def test_table_output_does_not_emit_schema_extras_key( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + schema_rule = _schema_rule_with({"id": {"expected_type": "INTEGER"}}) + monkeypatch.setattr( + "cli.commands.schema._decompose_to_atomic_rules", + lambda payload: [schema_rule], + ) + + schema_result = { + "rule_id": str(schema_rule.id), + "status": "PASSED", + "dataset_metrics": [ + {"entity_name": "t", "total_records": 1, "failed_records": 0} + ], + "execution_plan": { + "schema_details": {"field_results": [], "extras": ["x"]} + }, + } + + class DummyValidator: + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] + return [schema_result] + + monkeypatch.setattr("cli.commands.schema.DataValidator", DummyValidator) + + runner = CliRunner() + data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + rules_path = _write_tmp_file( + tmp_path, + "schema.json", + json.dumps({"rules": [{"field": "id", "type": "integer"}]}), + ) + result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + assert result.exit_code == 0 + # Plain text output should not dump JSON key name + assert "schema_extras" not in result.output diff --git a/tests/unit/cli/commands/test_schema_command_multi_table.py b/tests/unit/cli/commands/test_schema_command_multi_table.py new file mode 100644 index 0000000..f4b4202 --- /dev/null +++ b/tests/unit/cli/commands/test_schema_command_multi_table.py @@ -0,0 +1,389 @@ +"""Unit tests for schema command multi-table functionality.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest +from click.testing import CliRunner + +from cli.app import cli_app + + +def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: + file_path = tmp_path / name + file_path.write_text(content, encoding="utf-8") + return str(file_path) + + +class TestSchemaCommandMultiTable: + def test_multi_table_rules_format_parsing(self, tmp_path: Path) -> None: + """Test that multi-table rules format is correctly parsed.""" + runner = CliRunner() + + # Create multi-table rules file + # Use the existing multi-table schema file + rules_path = "test_data/multi_table_schema.json" + # Use the new multi-table Excel file instead of CSV + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path, + "--output", "json" + ]) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload["status"] == "ok" + assert payload["rules_count"] == 15 # 5 rules per table * 3 tables + + # Check that fields have table information + fields = payload["fields"] + assert len(fields) > 0 + for field in fields: + assert "table" in field + assert field["table"] in ["users", "products", "orders"] + + def test_multi_table_excel_sheets_detection(self, tmp_path: Path) -> None: + """Test that Excel file sheets are correctly detected and used as tables.""" + runner = CliRunner() + + # Create a simple multi-table rules file + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True} + ] + }, + "products": { + "rules": [ + {"field": "product_id", "type": "integer", "required": True}, + {"field": "product_name", "type": "string", "required": True} + ] + } + } + + rules_path = _write_tmp_file(tmp_path, "multi_table_rules.json", json.dumps(multi_table_rules)) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path, + "--output", "json" + ]) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload["status"] == "ok" + + # Check that both tables are processed + fields = payload["fields"] + user_fields = [f for f in fields if f.get("table") == "users"] + product_fields = [f for f in fields if f.get("table") == "products"] + + assert len(user_fields) > 0 + assert len(product_fields) > 0 + + def test_multi_table_with_table_level_options(self, tmp_path: Path) -> None: + """Test multi-table format with table-level options like strict_mode.""" + runner = CliRunner() + + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True} + ], + "strict_mode": True + }, + "products": { + "rules": [ + {"field": "product_name", "type": "string", "required": True} + ], + "case_insensitive": True + } + } + + rules_path = _write_tmp_file(tmp_path, "multi_table_options.json", json.dumps(multi_table_rules)) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path + ]) + + assert result.exit_code == 0 + # Should not raise any validation errors for table-level options + + def test_multi_table_backward_compatibility(self, tmp_path: Path) -> None: + """Test that single-table format still works for backward compatibility.""" + runner = CliRunner() + + # Single-table format (legacy) + single_table_rules = { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True} + ] + } + + rules_path = _write_tmp_file(tmp_path, "single_table.json", json.dumps(single_table_rules)) + # Use only the users sheet for single table test + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path, + "--output", "json" + ]) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload["status"] == "ok" + assert payload["rules_count"] == 2 + + def test_multi_table_validation_errors(self, tmp_path: Path) -> None: + """Test validation errors for invalid multi-table format.""" + runner = CliRunner() + + # Invalid: table schema is not an object + invalid_rules = { + "users": "not_an_object" + } + + rules_path = _write_tmp_file(tmp_path, "invalid.json", json.dumps(invalid_rules)) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path + ]) + + assert result.exit_code >= 2 # Usage error + assert "must be an object" in result.output + + def test_multi_table_missing_rules_array(self, tmp_path: Path) -> None: + """Test validation error when table is missing rules array.""" + runner = CliRunner() + + invalid_rules = { + "users": { + "strict_mode": True + # Missing rules array + } + } + + rules_path = _write_tmp_file(tmp_path, "missing_rules.json", json.dumps(invalid_rules)) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path + ]) + + assert result.exit_code >= 2 # Usage error + assert "must have a 'rules' array" in result.output + + def test_multi_table_invalid_table_level_options(self, tmp_path: Path) -> None: + """Test validation error for invalid table-level options.""" + runner = CliRunner() + + invalid_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True} + ], + "strict_mode": "not_a_boolean" # Should be boolean + } + } + + rules_path = _write_tmp_file(tmp_path, "invalid_options.json", json.dumps(invalid_rules)) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path + ]) + + assert result.exit_code >= 2 # Usage error + assert "must be a boolean" in result.output + + def test_multi_table_output_formatting(self, tmp_path: Path) -> None: + """Test that multi-table output is properly formatted and grouped.""" + runner = CliRunner() + + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True} + ] + }, + "products": { + "rules": [ + {"field": "product_id", "type": "integer", "required": True} + ] + } + } + + rules_path = _write_tmp_file(tmp_path, "multi_table.json", json.dumps(multi_table_rules)) + data_path = "test_data/multi_table_data.xlsx" + + # Test table output format + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path, + "--output", "table" + ]) + + assert result.exit_code == 0 + output = result.output + + # Should show table headers for multi-table + assert "📋 Table: users" in output + assert "📋 Table: products" in output + assert "📊 Multi-table Summary:" in output + + def test_multi_table_json_output_structure(self, tmp_path: Path) -> None: + """Test that JSON output includes table information for multi-table.""" + runner = CliRunner() + + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True} + ] + }, + "products": { + "rules": [ + {"field": "product_name", "type": "string", "required": True} + ] + } + } + + rules_path = _write_tmp_file(tmp_path, "multi_table.json", json.dumps(multi_table_rules)) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path, + "--output", "json" + ]) + + assert result.exit_code == 0 + payload = json.loads(result.output) + + # Check that fields have table information + fields = payload["fields"] + assert len(fields) >= 2 + + # Find fields for each table + user_fields = [f for f in fields if f.get("table") == "users"] + product_fields = [f for f in fields if f.get("table") == "products"] + + assert len(user_fields) > 0 + assert len(product_fields) > 0 + + # Check that each field has table info + for field in fields: + assert "table" in field + assert field["table"] in ["users", "products"] + + def test_multi_table_no_table_option_required(self, tmp_path: Path) -> None: + """Test that --table option is no longer required.""" + runner = CliRunner() + + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True} + ] + } + } + + rules_path = _write_tmp_file(tmp_path, "multi_table.json", json.dumps(multi_table_rules)) + data_path = "test_data/multi_table_data.xlsx" + + # Should work without --table option + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path + ]) + + assert result.exit_code == 0 + # Command should execute successfully without --table option + + def test_multi_table_excel_specific_functionality(self, tmp_path: Path) -> None: + """Test specific Excel multi-table functionality.""" + runner = CliRunner() + + # Test with all three tables from the Excel file + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True}, + {"field": "email", "type": "string", "required": True} + ] + }, + "products": { + "rules": [ + {"field": "product_id", "type": "integer", "required": True}, + {"field": "product_name", "type": "string", "required": True}, + {"field": "price", "type": "float", "min": 0.0} + ] + }, + "orders": { + "rules": [ + {"field": "order_id", "type": "integer", "required": True}, + {"field": "user_id", "type": "integer", "required": True}, + {"field": "total_amount", "type": "float", "min": 0.0} + ] + } + } + + rules_path = _write_tmp_file(tmp_path, "excel_multi_table.json", json.dumps(multi_table_rules)) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke(cli_app, [ + "schema", + "--conn", data_path, + "--rules", rules_path, + "--output", "json" + ]) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload["status"] == "ok" + + # Check that all three tables are processed + fields = payload["fields"] + table_names = set(field.get("table") for field in fields) + assert "users" in table_names + assert "products" in table_names + assert "orders" in table_names + + def test_multi_table_help_text_updated(self, tmp_path: Path) -> None: + """Test that help text reflects multi-table support.""" + runner = CliRunner() + + result = runner.invoke(cli_app, ["schema", "--help"]) + assert result.exit_code == 0 + + # Should mention multi-table support + assert "multi-table" in result.output.lower() + # Should not mention --table option + assert "--table" not in result.output From 3e06746d8a97e5762ba4d443b574a738bd84adc5 Mon Sep 17 00:00:00 2001 From: litedatum Date: Mon, 25 Aug 2025 22:14:44 -0400 Subject: [PATCH 2/6] feat: make excel source support multi-table --- README.md | 6 + cli/commands/schema.py | 191 +++++++++-------------------- cli/core/data_validator.py | 5 + cli/core/source_parser.py | 44 +++---- config/logging.test.toml | 37 ++++++ pytest.ini | 1 + scripts/run_tests_quiet.py | 52 ++++++++ shared/schema/connection_schema.py | 5 +- test_data/schema.json | 2 + tests/conftest.py | 12 ++ 10 files changed, 190 insertions(+), 165 deletions(-) create mode 100644 config/logging.test.toml create mode 100644 scripts/run_tests_quiet.py diff --git a/README.md b/README.md index 2fa8a6e..51062e7 100644 --- a/README.md +++ b/README.md @@ -162,11 +162,17 @@ The project includes comprehensive tests to ensure reliability. If you encounter # Run all tests with coverage pytest -vv --cov +# Run tests quietly (suppress debug messages) +python scripts/run_tests_quiet.py --cov + # Run specific test categories pytest tests/unit/ -v # Unit tests only pytest tests/integration/ -v # Integration tests pytest tests/e2e/ -v # End-to-end tests +# Run specific tests quietly +python scripts/run_tests_quiet.py tests/unit/ -v + # Code quality checks pre-commit run --all-files diff --git a/cli/commands/schema.py b/cli/commands/schema.py index 0a39b48..a0d5cac 100644 --- a/cli/commands/schema.py +++ b/cli/commands/schema.py @@ -232,94 +232,54 @@ def _create_rule_schema( ) -def _decompose_multi_table_schema( - payload: Dict[str, Any], source_db: str +def _decompose_schema_payload( + payload: Dict[str, Any], source_config: ConnectionSchema ) -> List[RuleSchema]: - """Decompose multi-table schema JSON payload into atomic RuleSchema objects. - - Supports both single-table and multi-table formats. + """Decompose a schema payload into atomic RuleSchema objects. + + This function handles both single-table and multi-table formats in a + source-agnostic way. """ all_atomic_rules: List[RuleSchema] = [] - - # Check if this is multi-table format - table_names = [key for key in payload.keys() if key != "rules"] - - if table_names: - # Multi-table format - for table_name in table_names: + source_db = source_config.db_name or "unknown" + + is_multi_table_format = "rules" not in payload + + if is_multi_table_format: + tables_in_rules = list(payload.keys()) + available_tables_from_source = set(source_config.available_tables or []) + + for table_name in tables_in_rules: + if available_tables_from_source and table_name not in available_tables_from_source: + logger.warning( + f"Skipping rules for table '{table_name}' as it is not available in the source." + ) + continue + table_schema = payload[table_name] + if not isinstance(table_schema, dict): + logger.warning(f"Definition for table '{table_name}' is not a valid object, skipping.") + continue + table_rules = _decompose_single_table_schema( table_schema, source_db, table_name ) all_atomic_rules.extend(table_rules) else: - # Single-table format (backward compatibility) - # For single-table, we need to determine the table name from the source - # This will be handled by the caller who knows the table context - table_rules = _decompose_single_table_schema(payload, source_db, "unknown") - all_atomic_rules.extend(table_rules) - - return all_atomic_rules - - -def _decompose_multi_table_schema_with_source_info( - payload: Dict[str, Any], source_config: ConnectionSchema - ) -> List[RuleSchema]: - """Decompose multi-table schema JSON payload into atomic RuleSchema objects. - - This version takes into account the actual tables available in the source. - - Args: - payload: The rules payload - source_config: Source configuration with table information - """ - all_atomic_rules: List[RuleSchema] = [] - - # Check if this is multi-table format - table_names = [key for key in payload.keys() if key != "rules"] - - if table_names: - # Multi-table format - # Check if source has multi-table information - is_multi_table_source = source_config.parameters.get("is_multi_table", False) - available_tables = (source_config.parameters - .get("sheets", {}).keys() - if is_multi_table_source else set() - ) - if is_multi_table_source and available_tables: - # Only process rules for tables that actually exist in the source - for table_name in table_names: - if table_name in available_tables: - table_schema = payload[table_name] - table_rules = _decompose_single_table_schema( - table_schema, source_config.db_name or "unknown", table_name - ) - all_atomic_rules.extend(table_rules) - logger.info( - f"Processing rules for table '{table_name}' (found in source)" - ) - else: - logger.warning( - f"Skipping rules for table '{table_name}' " - f"(not found in source: {list(available_tables)})" - ) + table_name = "unknown" + if source_config.available_tables: + table_name = source_config.available_tables[0] else: - # Process all tables (fallback for non-multi-table sources) - for table_name in table_names: - table_schema = payload[table_name] - table_rules = _decompose_single_table_schema( - table_schema, source_config.db_name or "unknown", table_name - ) - all_atomic_rules.extend(table_rules) - else: - # Single-table format (backward compatibility) - # For single-table, we need to determine the table name from the source - # This will be handled by the caller who knows the table context + logger.warning( + "Could not determine table name for single-table schema. " + "Consider using multi-table format for database sources." + ) + table_rules = _decompose_single_table_schema( - payload, source_config.db_name or "unknown", "unknown" + payload, source_db, table_name ) all_atomic_rules.extend(table_rules) - + return all_atomic_rules @@ -425,15 +385,15 @@ def _decompose_single_table_schema( return atomic_rules -def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: - """Decompose schema JSON payload into atomic RuleSchema objects. +# def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: +# """Decompose schema JSON payload into atomic RuleSchema objects. - This function is kept for backward compatibility but now delegates to - the new multi-table aware function. - """ - # For backward compatibility, we need to determine the source_db - # This will be handled by the caller - return _decompose_multi_table_schema(payload, "unknown") +# This function is kept for backward compatibility but now delegates to +# the new multi-table aware function. +# """ +# # For backward compatibility, we need to determine the source_db +# # This will be handled by the caller +# return _decompose_multi_table_schema(payload, "unknown") def _build_prioritized_atomic_status( @@ -1006,14 +966,7 @@ def _calc_failed(res: Dict[str, Any]) -> int: "--fail-on-error", is_flag=True, default=False, - help="Return exit code 1 if any error occurs during skeleton execution", -) -@click.option( - "--max-errors", - type=int, - default=100, - show_default=True, - help="Maximum number of errors to collect (reserved; not used in skeleton)", + help="Return exit code 1 if any error occurs during execution", ) @click.option("--verbose", is_flag=True, default=False, help="Enable verbose output") def schema_command( @@ -1021,61 +974,40 @@ def schema_command( rules_file: str, output: str, fail_on_error: bool, - max_errors: int, verbose: bool, ) -> None: - """Schema validation command with support for both single-table and multi-table validation. - - NEW FORMAT: - vlite-cli schema --conn --rules [options] - - SOURCE can be: - - File path: users.csv, data.xlsx, records.json - - Database URL: mysql://user:pass@host/db - - SQLite file: sqlite:///path/to/file.db - - RULES FILE FORMATS: - - Single-table: {"rules": [...]} - - Multi-table: {"table1": {"rules": [...]}, "table2": {"rules": [...]}} - - Examples: - vlite-cli schema --conn users.csv --rules schema.json - vlite-cli schema --conn mysql://user:pass@host/db --rules multi_table_schema.json - """ + """Schema validation command with support for both single-table and multi-table validation.""" from cli.core.config import get_cli_config from core.config import get_core_config - # start_time = now() try: _maybe_echo_analyzing(connection_string, output) _guard_empty_source_file(connection_string) source_config = SourceParser().parse_source(connection_string) - rules_payload = _read_rules_payload(rules_file) + # If the rules file uses a multi-table format, signal this to the DataValidator + # so that it skips its single-table target completion logic. + is_multi_table_rules = "rules" not in rules_payload + if is_multi_table_rules: + source_config.parameters["is_multi_table"] = True + warnings, rules_count = _validate_rules_payload(rules_payload) _emit_warnings(warnings) - # Get database name from source config - source_db = source_config.db_name - if not source_db: - source_db = "unknown" - - # Decompose into atomic rules using new multi-table aware function - atomic_rules = _decompose_multi_table_schema_with_source_info(rules_payload, source_config) + atomic_rules = _decompose_schema_payload(rules_payload, source_config) - # Fast-path: no rules -> emit minimal payload and exit cleanly - if len(atomic_rules) == 0: + if not atomic_rules: _early_exit_when_no_rules( source=connection_string, rules_file=rules_file, output=output, fail_on_error=fail_on_error, ) + return - # Execute via core engine using DataValidator core_config = get_core_config() cli_config = get_cli_config() validator = _create_validator( @@ -1086,7 +1018,6 @@ def schema_command( ) results, exec_seconds = _run_validation(validator) - # Aggregation and prioritization schema_result_dict: Dict[str, Any] | None = _extract_schema_result_dict( atomic_rules=atomic_rules, results=results ) @@ -1094,7 +1025,6 @@ def schema_command( atomic_rules=atomic_rules, schema_result_dict=schema_result_dict ) - # Apply skip map to JSON output only; table mode stays concise by design if output.lower() == "json": _emit_json_output( source=connection_string, @@ -1115,7 +1045,6 @@ def schema_command( exec_seconds=exec_seconds, ) - # Exit code: fail if any rule failed (support both model objects and dicts) def _status_of(item: Any) -> str: if hasattr(item, "status"): try: @@ -1127,19 +1056,13 @@ def _status_of(item: Any) -> str: return "" any_failed = any(_status_of(r) == "FAILED" for r in results) - import click as _click - - raise _click.exceptions.Exit(1 if any_failed or fail_on_error else 0) + raise click.exceptions.Exit(1 if any_failed or fail_on_error else 0) except click.UsageError: - # Propagate Click usage errors for standard exit code (typically 2) raise except click.exceptions.Exit: - # Allow Click's explicit Exit (with code) to propagate unchanged raise - except Exception as e: # Fallback: print concise error and return generic failure + except Exception as e: logger.error(f"Schema command error: {str(e)}") _safe_echo(f"❌ Error: {str(e)}", err=True) - import click as _click - - raise _click.exceptions.Exit(1) + raise click.exceptions.Exit(1) diff --git a/cli/core/data_validator.py b/cli/core/data_validator.py index a63b07c..6eeec9f 100644 --- a/cli/core/data_validator.py +++ b/cli/core/data_validator.py @@ -111,6 +111,11 @@ def _complete_target_info(self) -> None: This replaces the old _update_rule_connections method. """ + # If the source is multi-table, targets are already set. Do not overwrite. + if self.source_config.parameters.get("is_multi_table"): + self.logger.debug("Multi-table source detected, skipping target info completion.") + return + if not self.rules: return diff --git a/cli/core/source_parser.py b/cli/core/source_parser.py index d13a584..055dcee 100644 --- a/cli/core/source_parser.py +++ b/cli/core/source_parser.py @@ -236,71 +236,60 @@ def _parse_file_path(self, file_path: str) -> ConnectionSchema: path = Path(file_path) - # Check if file exists if not path.exists(): raise FileNotFoundError(f"File not found: {file_path}") if not path.is_file(): raise ValidationError(f"Path is not a file: {file_path}") - # Determine file type file_ext = path.suffix.lower() conn_type = self.file_extensions.get(file_ext) if not conn_type: - # Try to infer from content or use CSV as default conn_type = ConnectionType.CSV self.logger.warning( f"Unknown file extension {file_ext}, assuming CSV format" ) - # Check if this is a multi-table Excel file is_multi_table = False sheets_info = {} if conn_type == ConnectionType.EXCEL: - is_multi_table = self.is_multi_table_excel(file_path) - if is_multi_table: - try: - sheets_info = self.get_excel_sheets(file_path) + try: + sheets_info = self.get_excel_sheets(file_path) + if len(sheets_info) > 1: + is_multi_table = True self.logger.info(f"Multi-table Excel file detected with {len(sheets_info)} sheets: {list(sheets_info.keys())}") - except Exception as e: - self.logger.warning(f"Could not read Excel sheets: {str(e)}") - is_multi_table = False + except Exception as e: + self.logger.warning(f"Could not read Excel sheets, treating as single-table: {str(e)}") + is_multi_table = False - # Prepare parameters parameters = { "filename": path.name, "file_size": path.stat().st_size, - "encoding": "utf-8", # Default encoding + "encoding": "utf-8", } - # Add multi-table information for Excel files if is_multi_table and sheets_info: parameters["is_multi_table"] = True parameters["sheets"] = sheets_info - parameters["table_count"] = len(sheets_info) + available_tables = list(sheets_info.keys()) else: parameters["is_multi_table"] = False + available_tables = [path.stem] return ConnectionSchema( name=f"file_connection_{uuid4().hex[:8]}", description=f"File connection: {path.name}" + (" (multi-table)" if is_multi_table else ""), connection_type=conn_type, - host=None, - port=None, - db_name=None, - username=None, - password=None, - db_schema=None, file_path=str(path.absolute()), parameters=parameters, + available_tables=available_tables, capabilities=DataSourceCapability( supports_sql=False, supports_batch_export=True, - max_export_rows=100000 if not is_multi_table else 50000, # Reduce for multi-table - estimated_throughput=5000 if not is_multi_table else 2000, # Reduce for multi-table + max_export_rows=100000 if not is_multi_table else 50000, + estimated_throughput=5000 if not is_multi_table else 2000, ), - cross_db_settings=None, ) def _detect_database_type(self, url: str) -> ConnectionType: @@ -376,14 +365,9 @@ def _create_sqlite_connection( name=f"sqlite_connection_{uuid4().hex[:8]}", description=f"SQLite connection: {Path(file_path).name}", connection_type=ConnectionType.SQLITE, - host=None, - port=None, - db_name=None, - username=None, - password=None, - db_schema=None, file_path=file_path, parameters=parameters, + available_tables=[table] if table else [], capabilities=DataSourceCapability( supports_sql=True, supports_batch_export=True, diff --git a/config/logging.test.toml b/config/logging.test.toml new file mode 100644 index 0000000..2ce2ddc --- /dev/null +++ b/config/logging.test.toml @@ -0,0 +1,37 @@ +# Test Environment Logging Configuration + +# Global log level: Set to WARNING to suppress DEBUG and INFO messages +level = "WARNING" + +# Log message format +format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Enable logging to file (disabled for tests to keep output clean) +to_file = false + +# Module-specific log levels for testing +[module_levels] +# Core modules - set to WARNING to reduce noise +"shared.database.connection" = "WARNING" +"shared.database.query_executor" = "WARNING" +"cli.commands.check" = "WARNING" +"cli.core.data_validator" = "WARNING" +"cli.core.source_parser" = "WARNING" +"cli.core.rule_parser" = "WARNING" +"rule_engine" = "WARNING" +"core.engine.rule_engine" = "WARNING" + +# Third-party modules - set to ERROR to suppress all debug info +"aiosqlite" = "ERROR" +"sqlalchemy" = "ERROR" +"sqlalchemy.engine" = "ERROR" +"sqlalchemy.pool" = "ERROR" +"sqlalchemy.dialects" = "ERROR" +"pydantic" = "WARNING" +"toml" = "WARNING" +"werkzeug" = "WARNING" +"urllib3.connectionpool" = "WARNING" + +# Keep only critical errors visible +"asyncio" = "WARNING" +"pytest" = "WARNING" diff --git a/pytest.ini b/pytest.ini index 9a063be..5fcbd1d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -9,6 +9,7 @@ addopts = --cov-report=term-missing --cov-report=html:htmlcov --strict-markers + --log-cli-level=ERROR python_files = test_*.py *_test.py python_classes = Test* python_functions = test_* diff --git a/scripts/run_tests_quiet.py b/scripts/run_tests_quiet.py new file mode 100644 index 0000000..a896b88 --- /dev/null +++ b/scripts/run_tests_quiet.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +""" +Quiet test runner script that suppresses debug and info messages. + +Usage: + python scripts/run_tests_quiet.py [pytest_options...] + +Examples: + python scripts/run_tests_quiet.py + python scripts/run_tests_quiet.py -k "test_data_validator" + python scripts/run_tests_quiet.py --cov=core --cov-report=html +""" + +import os +import sys +import subprocess +from pathlib import Path + +def main(): + """Run tests with quiet logging configuration.""" + # Get the project root directory + project_root = Path(__file__).parent.parent + os.chdir(project_root) + + # Set environment variables for quiet logging + env = os.environ.copy() + env["PYTHONPATH"] = str(project_root) + + # Build pytest command with quiet options + cmd = [ + sys.executable, "-m", "pytest", + "--log-cli-level=WARNING", + "--tb=short", + "-v" + ] + + # Add any additional arguments passed to the script + cmd.extend(sys.argv[1:]) + + # Run pytest + try: + result = subprocess.run(cmd, env=env, cwd=project_root) + sys.exit(result.returncode) + except KeyboardInterrupt: + print("\nTest run interrupted by user") + sys.exit(1) + except Exception as e: + print(f"Error running tests: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/shared/schema/connection_schema.py b/shared/schema/connection_schema.py index 5c71258..3cad596 100644 --- a/shared/schema/connection_schema.py +++ b/shared/schema/connection_schema.py @@ -7,7 +7,7 @@ cross-database features. """ -from typing import Any, Dict +from typing import Any, Dict, List, Optional from uuid import UUID, uuid4 from pydantic import Field, model_validator @@ -34,6 +34,9 @@ class ConnectionSchema(ConnectionBase): id: UUID = Field( default_factory=uuid4, description="Unique identifier for the connection" ) + available_tables: Optional[List[str]] = Field( + default=None, description="List of available tables for file-based sources" + ) # ==================== Convenient methods ==================== diff --git a/test_data/schema.json b/test_data/schema.json index a0c590c..1770dc6 100644 --- a/test_data/schema.json +++ b/test_data/schema.json @@ -1,4 +1,5 @@ { + "customers": { "rules": [ { "field": "id", "type": "integer", "required": true }, { "field": "age", "type": "integer", "required": true, "min": 0, "max": 120 }, @@ -7,4 +8,5 @@ { "field": "invalid_col", "type": "string", "required": true }, { "field": "email", "type": "string" } ] + } } diff --git a/tests/conftest.py b/tests/conftest.py index e428610..b357a5e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,18 @@ # Import the database connection management module. from shared.database.connection import close_all_engines +from shared.config.loader import load_config + +# Load test-specific logging configuration +try: + test_logging_config = load_config("logging.test.toml") + if test_logging_config: + # Apply test logging configuration + for module, level in test_logging_config.get("module_levels", {}).items(): + _logging.getLogger(module).setLevel(getattr(_logging, level.upper())) +except Exception: + # Fallback to default configuration if test config not found + pass # --------------------------------------------------------------------------- # Hypothesis global configuration – suppress HealthCheck for function-scoped From c5aaa6cf171c126409f6c5dc9b8102932be8801d Mon Sep 17 00:00:00 2001 From: litedatum Date: Tue, 26 Aug 2025 21:17:56 -0400 Subject: [PATCH 3/6] test: fix bugs in the tests --- CHANGELOG.md | 14 + cli/commands/schema.py | 158 +++++--- cli/core/data_validator.py | 99 +++-- cli/core/source_parser.py | 36 +- scripts/run_tests_quiet.py | 20 +- tests/conftest.py | 3 +- .../cli_scenarios/test_schema_command_e2e.py | 60 ++-- tests/shared/builders/test_builders.py | 8 + .../unit/cli/commands/test_schema_command.py | 80 +++-- .../commands/test_schema_command_extended.py | 75 ++-- .../test_schema_command_file_sources.py | 13 +- .../test_schema_command_json_extras.py | 41 ++- .../test_schema_command_multi_table.py | 338 +++++++++--------- 13 files changed, 568 insertions(+), 377 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d2f1ac..73b72bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - feat(cli): refactor check command interface from positional arguments to `--conn` and `--table` options - feat(cli): add comprehensive test coverage for new CLI interface functionality - feat(cli): support explicit table name specification independent of database URL +- feat(schema): add comprehensive multi-table support for schema validation +- feat(schema): support multi-table rules format with table-level configuration options +- feat(schema): add Excel multi-sheet file support as data source +- feat(schema): implement table-grouped output display for multi-table validation results +- feat(schema): add table-level options support (strict_mode, case_insensitive) +- feat(tests): add comprehensive multi-table functionality test coverage +- feat(tests): add multi-table Excel file validation test scenarios ### Changed - **BREAKING CHANGE**: CLI interface changed from `vlite-cli check ` to `vlite-cli check --conn --table ` @@ -18,12 +25,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - refactor(cli): modify check command to pass table_name to SourceParser.parse_source() - refactor(tests): update all existing CLI tests to use new interface format - refactor(tests): add new test cases specifically for table name parameter validation +- refactor(schema): enhance schema command to support both single-table and multi-table formats +- refactor(schema): improve output formatting with table-grouped results display +- refactor(schema): enhance rule decomposition logic for multi-table support +- refactor(data-validator): improve multi-table detection and processing capabilities ### Fixed - fix(cli): resolve issue where `--table` parameter was not correctly passed to backend - fix(cli): ensure table name from `--table` option takes precedence over table name in database URL - fix(tests): update regression tests to use new CLI interface format - fix(tests): resolve test failures caused by interface changes +- fix(schema): resolve multi-table rules validation and type checking issues +- fix(schema): improve table name detection and validation in multi-table scenarios +- fix(schema): enhance error handling for multi-table validation workflows ### Removed - **BREAKING CHANGE**: remove backward compatibility for old positional argument interface diff --git a/cli/commands/schema.py b/cli/commands/schema.py index a0d5cac..898354d 100644 --- a/cli/commands/schema.py +++ b/cli/commands/schema.py @@ -19,8 +19,8 @@ from shared.enums import RuleAction, RuleCategory, RuleType, SeverityLevel from shared.enums.data_types import DataType from shared.schema.base import RuleTarget, TargetEntity -from shared.schema.rule_schema import RuleSchema from shared.schema.connection_schema import ConnectionSchema +from shared.schema.rule_schema import RuleSchema from shared.utils.console import safe_echo from shared.utils.datetime_utils import now as _now from shared.utils.logger import get_logger @@ -40,7 +40,7 @@ def _validate_multi_table_rules_payload(payload: Any) -> Tuple[List[str], int]: """Validate the structure of multi-table schema rules file. - + Multi-table format: { "table1": { @@ -51,62 +51,77 @@ def _validate_multi_table_rules_payload(payload: Any) -> Tuple[List[str], int]: "rules": [...] } } - + Returns: warnings, total_rules_count """ warnings: List[str] = [] total_rules = 0 - + if not isinstance(payload, dict): raise click.UsageError("Rules file must be a JSON object") - + # Check if this is a multi-table format (has table names as keys) table_names = [key for key in payload.keys() if key != "rules"] - + if table_names: # Multi-table format for table_name in table_names: table_schema = payload[table_name] if not isinstance(table_schema, dict): raise click.UsageError(f"Table '{table_name}' schema must be an object") - + table_rules = table_schema.get("rules") if not isinstance(table_rules, list): - raise click.UsageError(f"Table '{table_name}' must have a 'rules' array") - + raise click.UsageError( + f"Table '{table_name}' must have a 'rules' array" + ) + # Validate each rule in this table for idx, item in enumerate(table_rules): if not isinstance(item, dict): - raise click.UsageError(f"Table '{table_name}' rules[{idx}] must be an object") - + raise click.UsageError( + f"Table '{table_name}' rules[{idx}] must be an object" + ) + # Validate rule fields _validate_single_rule_item(item, f"Table '{table_name}' rules[{idx}]") - + total_rules += len(table_rules) - + # Validate optional table-level switches - if "strict_mode" in table_schema and not isinstance(table_schema["strict_mode"], bool): - raise click.UsageError(f"Table '{table_name}' strict_mode must be a boolean") - if "case_insensitive" in table_schema and not isinstance(table_schema["case_insensitive"], bool): - raise click.UsageError(f"Table '{table_name}' case_insensitive must be a boolean") + if "strict_mode" in table_schema and not isinstance( + table_schema["strict_mode"], bool + ): + raise click.UsageError( + f"Table '{table_name}' strict_mode must be a boolean" + ) + if "case_insensitive" in table_schema and not isinstance( + table_schema["case_insensitive"], bool + ): + raise click.UsageError( + f"Table '{table_name}' case_insensitive must be a boolean" + ) else: # Single-table format (backward compatibility) - warnings.append("Single-table format detected; consider using multi-table format for better organization") + warnings.append( + "Single-table format detected; consider using multi-table format for " + "better organization" + ) if "rules" not in payload: raise click.UsageError("Single-table format must have a 'rules' array") - + rules = payload["rules"] if not isinstance(rules, list): raise click.UsageError("'rules' must be an array") - + for idx, item in enumerate(rules): if not isinstance(item, dict): raise click.UsageError(f"rules[{idx}] must be an object") _validate_single_rule_item(item, f"rules[{idx}]") - + total_rules = len(rules) - + return warnings, total_rules @@ -125,8 +140,7 @@ def _validate_single_rule_item(item: Dict[str, Any], context: str) -> None: if type_name.lower() not in _ALLOWED_TYPE_NAMES: allowed = ", ".join(sorted(_ALLOWED_TYPE_NAMES)) raise click.UsageError( - f"{context}.type '{type_name}' is not supported. " - f"Allowed: {allowed}" + f"{context}.type '{type_name}' is not supported. " f"Allowed: {allowed}" ) # required @@ -142,7 +156,9 @@ def _validate_single_rule_item(item: Dict[str, Any], context: str) -> None: if bound_key in item: value = item[bound_key] if not isinstance(value, (int, float)): - raise click.UsageError(f"{context}.{bound_key} must be numeric when provided") + raise click.UsageError( + f"{context}.{bound_key} must be numeric when provided" + ) def _validate_rules_payload(payload: Any) -> Tuple[List[str], int]: @@ -233,8 +249,8 @@ def _create_rule_schema( def _decompose_schema_payload( - payload: Dict[str, Any], source_config: ConnectionSchema - ) -> List[RuleSchema]: + payload: Dict[str, Any], source_config: ConnectionSchema +) -> List[RuleSchema]: """Decompose a schema payload into atomic RuleSchema objects. This function handles both single-table and multi-table formats in a @@ -250,15 +266,22 @@ def _decompose_schema_payload( available_tables_from_source = set(source_config.available_tables or []) for table_name in tables_in_rules: - if available_tables_from_source and table_name not in available_tables_from_source: + if ( + available_tables_from_source + and table_name not in available_tables_from_source + ): logger.warning( - f"Skipping rules for table '{table_name}' as it is not available in the source." + f"Skipping rules for table '{table_name}' as it is not available " + "in the source." ) continue table_schema = payload[table_name] if not isinstance(table_schema, dict): - logger.warning(f"Definition for table '{table_name}' is not a valid object, skipping.") + logger.warning( + f"Definition for table '{table_name}' is not a valid object, " + "skipping." + ) continue table_rules = _decompose_single_table_schema( @@ -274,20 +297,18 @@ def _decompose_schema_payload( "Could not determine table name for single-table schema. " "Consider using multi-table format for database sources." ) - - table_rules = _decompose_single_table_schema( - payload, source_db, table_name - ) + + table_rules = _decompose_single_table_schema(payload, source_db, table_name) all_atomic_rules.extend(table_rules) return all_atomic_rules def _decompose_single_table_schema( - table_schema: Dict[str, Any], source_db: str, table_name: str - ) -> List[RuleSchema]: + table_schema: Dict[str, Any], source_db: str, table_name: str +) -> List[RuleSchema]: """Decompose a single table's schema definition into atomic RuleSchema objects. - + Args: table_schema: The schema definition for a single table source_db: Database name from source @@ -387,7 +408,7 @@ def _decompose_single_table_schema( # def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: # """Decompose schema JSON payload into atomic RuleSchema objects. - + # This function is kept for backward compatibility but now delegates to # the new multi-table aware function. # """ @@ -470,9 +491,11 @@ def _read_rules_payload(rules_file: str) -> Dict[str, Any]: return cast(Dict[str, Any], payload) -def _emit_warnings(warnings: List[str]) -> None: - for msg in warnings: - _safe_echo(f"⚠️ Warning: {msg}", err=True) +def _emit_warnings(warnings: List[str], output: str = "table") -> None: + """Emit warnings only for non-JSON output to avoid polluting JSON output.""" + if output.lower() != "json": + for msg in warnings: + _safe_echo(f"⚠️ Warning: {msg}", err=True) def _early_exit_when_no_rules( @@ -690,7 +713,7 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: table_name = "unknown" if rule.target and rule.target.entities: table_name = rule.target.entities[0].table - + l_entry = schema_fields_index.get(column_name) if not l_entry: l_entry = {"column": column_name, "table": table_name, "checks": {}} @@ -851,17 +874,17 @@ def _calc_failed(res: Dict[str, Any]) -> int: # Group results by table for multi-table support tables_grouped: Dict[str, Dict[str, Dict[str, Any]]] = {} - + for rd in table_results: table_name = rd.get("table_name", "unknown") if table_name not in tables_grouped: tables_grouped[table_name] = {} - + col = rd.get("column_name", "") if col: if col not in tables_grouped[table_name]: tables_grouped[table_name][col] = {"column": col, "issues": []} - + status = str(rd.get("status", "UNKNOWN")) if rd.get("rule_type") == RuleType.NOT_NULL.value: key = "not_null" @@ -875,7 +898,7 @@ def _calc_failed(res: Dict[str, Any]) -> int: key = "date_format" else: key = rd.get("rule_type", "unknown").lower() - + if status in {"FAILED", "ERROR", "SKIPPED"}: tables_grouped[table_name][col]["issues"].append( { @@ -897,14 +920,17 @@ def _calc_failed(res: Dict[str, Any]) -> int: for table_name in sorted(tables_grouped.keys()): if len(tables_grouped) > 1: # Only show table header for multi-table lines.append(f"\n📋 Table: {table_name}") - + table_grouped = tables_grouped[table_name] for col in sorted(table_grouped.keys()): issues = table_grouped[col]["issues"] critical = [i for i in issues if i["status"] in {"FAILED", "ERROR"}] - if not critical: + skipped = [i for i in issues if i["status"] == "SKIPPED"] + + if not critical and not skipped: lines.append(f"✓ {col}: OK") else: + # Show critical issues first for i in critical: fr = i.get("failed_records") or 0 if i["status"] == "ERROR": @@ -912,6 +938,18 @@ def _calc_failed(res: Dict[str, Any]) -> int: else: lines.append(f"✗ {col}: {i['check']} failed ({fr} failures)") + # Show skipped issues with skip reason + for i in skipped: + skip_reason = i.get("skip_reason", "unknown reason") + if skip_reason == "FIELD_MISSING": + lines.append(f"✗ {col}: missing (skipped dependent checks)") + elif skip_reason == "TYPE_MISMATCH": + lines.append( + f"✗ {col}: type mismatch (skipped dependent checks)" + ) + else: + lines.append(f"✗ {col}: {i['check']} skipped ({skip_reason})") + total_columns = sum(len(tables_grouped[table]) for table in tables_grouped) passed_columns = sum( sum(1 for col in table_grouped.values() if not col["issues"]) @@ -923,15 +961,19 @@ def _calc_failed(res: Dict[str, Any]) -> int: if header_total_records == 0 else (total_failed_records / max(header_total_records, 1)) * 100 ) - + if len(tables_grouped) > 1: - lines.append(f"\n📊 Multi-table Summary:") + lines.append("\n📊 Multi-table Summary:") for table_name in sorted(tables_grouped.keys()): table_columns = len(tables_grouped[table_name]) - table_passed = sum(1 for col in tables_grouped[table_name].values() if not col["issues"]) + table_passed = sum( + 1 for col in tables_grouped[table_name].values() if not col["issues"] + ) table_failed = table_columns - table_passed - lines.append(f" {table_name}: {table_passed} passed, {table_failed} failed") - + lines.append( + f" {table_name}: {table_passed} passed, {table_failed} failed" + ) + lines.append( f"\nSummary: {passed_columns} passed, {failed_columns} failed" f" ({overall_error_rate:.2f}% overall error rate)" @@ -953,7 +995,8 @@ def _calc_failed(res: Dict[str, Any]) -> int: "rules_file", type=click.Path(exists=True, readable=True), required=True, - help="Path to schema rules file (JSON) - supports both single-table and multi-table formats", + help="Path to schema rules file (JSON) - supports both single-table " + "and multi-table formats", ) @click.option( "--output", @@ -976,7 +1019,10 @@ def schema_command( fail_on_error: bool, verbose: bool, ) -> None: - """Schema validation command with support for both single-table and multi-table validation.""" + """ + Schema validation command with support for both single-table + and multi-table validation. + """ from cli.core.config import get_cli_config from core.config import get_core_config @@ -995,7 +1041,7 @@ def schema_command( source_config.parameters["is_multi_table"] = True warnings, rules_count = _validate_rules_payload(rules_payload) - _emit_warnings(warnings) + _emit_warnings(warnings, output) atomic_rules = _decompose_schema_payload(rules_payload, source_config) diff --git a/cli/core/data_validator.py b/cli/core/data_validator.py index 6eeec9f..398070f 100644 --- a/cli/core/data_validator.py +++ b/cli/core/data_validator.py @@ -113,7 +113,9 @@ def _complete_target_info(self) -> None: """ # If the source is multi-table, targets are already set. Do not overwrite. if self.source_config.parameters.get("is_multi_table"): - self.logger.debug("Multi-table source detected, skipping target info completion.") + self.logger.debug( + "Multi-table source detected, skipping target info completion." + ) return if not self.rules: @@ -191,10 +193,16 @@ async def _validate_file(self) -> List[ExecutionResultSchema]: # Check if this is a multi-table Excel file is_multi_table = self.source_config.parameters.get("is_multi_table", False) - self.logger.info(f"Multi-table detection: is_multi_table={is_multi_table}, connection_type={self.source_config.connection_type}") + self.logger.info( + f"Multi-table detection: is_multi_table={is_multi_table}, " + f"connection_type={self.source_config.connection_type}" + ) self.logger.info(f"Source config parameters: {self.source_config.parameters}") - - if is_multi_table and self.source_config.connection_type == ConnectionType.EXCEL: + + if ( + is_multi_table + and self.source_config.connection_type == ConnectionType.EXCEL + ): # Handle multi-table Excel file self.logger.info("Processing multi-table Excel file") sqlite_config = await self._convert_multi_table_excel_to_sqlite() @@ -329,38 +337,38 @@ def _load_file_data(self) -> pd.DataFrame: async def _convert_multi_table_excel_to_sqlite(self) -> ConnectionSchema: """ Convert multi-table Excel file to SQLite database. - + Returns: ConnectionSchema: SQLite connection configuration """ import os import tempfile import time - + from sqlalchemy import create_engine - + temp_db_file = None temp_db_path = None start_time = time.time() - + try: # Create a temporary SQLite file temp_db_file = tempfile.NamedTemporaryFile(suffix=".db", delete=False) temp_db_path = temp_db_file.name temp_db_file.close() - + # Create SQLite engine engine = create_engine(f"sqlite:///{temp_db_path}") - + # Load all sheets into SQLite await self._load_multi_table_excel_to_sqlite(engine, temp_db_path) - + # Get table mapping for connection config table_mapping = self.source_config.parameters.get("table_mapping", {}) - + # Create connection config with multi-table information sqlite_config = ConnectionSchema( - name=f"temp_sqlite_multi_table", + name="temp_sqlite_multi_table", description="Temporary SQLite for multi-table Excel validation", connection_type=ConnectionType.SQLITE, file_path=temp_db_path, @@ -370,16 +378,16 @@ async def _convert_multi_table_excel_to_sqlite(self) -> ConnectionSchema: "temp_file": True, # Mark as temporary file for cleanup }, ) - + # Log performance metrics elapsed_time = time.time() - start_time self.logger.info( f"Created temporary SQLite database at {temp_db_path} with " f"{len(table_mapping)} tables in {elapsed_time:.2f} seconds" ) - + return sqlite_config - + except Exception as e: # Clean up temporary file if it exists if temp_db_path and os.path.exists(temp_db_path): @@ -387,65 +395,80 @@ async def _convert_multi_table_excel_to_sqlite(self) -> ConnectionSchema: os.unlink(temp_db_path) except Exception as cleanup_error: self.logger.warning( - f"Failed to cleanup temporary file {temp_db_path}: {cleanup_error}" + f"Failed to cleanup temporary file {temp_db_path}: " + f"{cleanup_error}" ) raise ValueError(f"Failed to create multi-table SQLite database: {str(e)}") - async def _load_multi_table_excel_to_sqlite(self, engine, temp_db_path: str) -> None: + async def _load_multi_table_excel_to_sqlite( + self, engine, temp_db_path: str + ) -> None: """ Load multiple sheets from Excel file into SQLite database. - + Args: engine: SQLAlchemy engine for SQLite temp_db_path: Path to temporary SQLite database """ import pandas as pd - + file_path = self.source_config.file_path sheets_info = self.source_config.parameters.get("sheets", {}) - + if not sheets_info: - raise ValueError("Multi-table Excel file but no sheets information available") - - self.logger.info(f"Loading {len(sheets_info)} sheets into SQLite: {list(sheets_info.keys())}") - + raise ValueError( + "Multi-table Excel file but no sheets information available" + ) + + self.logger.info( + f"Loading {len(sheets_info)} sheets into SQLite: {list(sheets_info.keys())}" + ) + # Store table name mapping for later use table_mapping = {} - + # Load each sheet into a separate table for sheet_name, columns in sheets_info.items(): try: # Read the specific sheet df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl") - + # Validate that the sheet has the expected columns expected_columns = set(columns) actual_columns = set(df.columns) - + if not expected_columns.issubset(actual_columns): missing_columns = expected_columns - actual_columns - self.logger.warning(f"Sheet '{sheet_name}' missing expected columns: {missing_columns}") - + self.logger.warning( + f"Sheet '{sheet_name}' missing expected columns: " + f"{missing_columns}" + ) + # Write to SQLite with sheet name as table name # Clean table name for SQLite (remove special characters) - clean_table_name = "".join(c for c in sheet_name if c.isalnum() or c == '_') + clean_table_name = "".join( + c for c in sheet_name if c.isalnum() or c == "_" + ) if not clean_table_name or clean_table_name[0].isdigit(): clean_table_name = f"sheet_{clean_table_name}" - + # Store the mapping from original sheet name to clean table name table_mapping[sheet_name] = clean_table_name - + df.to_sql(clean_table_name, engine, if_exists="replace", index=False) - self.logger.info(f"Loaded sheet '{sheet_name}' as table '{clean_table_name}' with {len(df)} rows") - + self.logger.info( + f"Loaded sheet '{sheet_name}' as table '{clean_table_name}' " + f"with {len(df)} rows" + ) + except Exception as e: self.logger.error(f"Failed to load sheet '{sheet_name}': {str(e)}") # Continue with other sheets continue - + # Store the table mapping in the source config for later use - if hasattr(self, 'source_config') and hasattr(self.source_config, 'parameters'): - self.source_config.parameters['table_mapping'] = table_mapping + if hasattr(self, "source_config") and hasattr(self.source_config, "parameters"): + self.source_config.parameters["table_mapping"] = table_mapping self.logger.info(f"Stored table mapping: {table_mapping}") async def _convert_file_to_sqlite(self, df: pd.DataFrame) -> ConnectionSchema: diff --git a/cli/core/source_parser.py b/cli/core/source_parser.py index 055dcee..839c7ea 100644 --- a/cli/core/source_parser.py +++ b/cli/core/source_parser.py @@ -8,7 +8,7 @@ import re import urllib.parse from pathlib import Path -from typing import Optional, Tuple, Dict, List +from typing import Dict, List, Optional, Tuple from uuid import uuid4 from cli.exceptions import ValidationError @@ -98,13 +98,13 @@ def parse_source( def get_excel_sheets(self, file_path: str) -> Dict[str, List[str]]: """ Get sheet names from Excel file. - + Args: file_path: Path to Excel file - + Returns: Dict with sheet names as keys and column lists as values - + Raises: ImportError: If pandas/openpyxl not available FileNotFoundError: If file not found @@ -113,16 +113,16 @@ def get_excel_sheets(self, file_path: str) -> Dict[str, List[str]]: import pandas as pd except ImportError: raise ImportError("pandas is required to read Excel files") - + try: excel_file = pd.ExcelFile(file_path) sheets_info = {} - + for sheet_name in excel_file.sheet_names: # Read first few rows to get column names df = pd.read_excel(file_path, sheet_name=sheet_name, nrows=0) sheets_info[sheet_name] = list(df.columns) - + return sheets_info except Exception as e: self.logger.error(f"Error reading Excel file {file_path}: {str(e)}") @@ -130,16 +130,18 @@ def get_excel_sheets(self, file_path: str) -> Dict[str, List[str]]: def is_multi_table_excel(self, file_path: str) -> bool: """ - Check if Excel file contains multiple sheets that could represent multiple tables. - + Check if Excel file contains multiple sheets that could represent + multiple tables. + Args: file_path: Path to Excel file - + Returns: True if file has multiple sheets, False otherwise """ try: import pandas as pd + excel_file = pd.ExcelFile(file_path) return len(excel_file.sheet_names) > 1 except ImportError: @@ -258,9 +260,14 @@ def _parse_file_path(self, file_path: str) -> ConnectionSchema: sheets_info = self.get_excel_sheets(file_path) if len(sheets_info) > 1: is_multi_table = True - self.logger.info(f"Multi-table Excel file detected with {len(sheets_info)} sheets: {list(sheets_info.keys())}") + self.logger.info( + f"Multi-table Excel file detected with {len(sheets_info)} " + "sheets: {list(sheets_info.keys())}" + ) except Exception as e: - self.logger.warning(f"Could not read Excel sheets, treating as single-table: {str(e)}") + self.logger.warning( + f"Could not read Excel sheets, treating as single-table: {str(e)}" + ) is_multi_table = False parameters = { @@ -268,7 +275,7 @@ def _parse_file_path(self, file_path: str) -> ConnectionSchema: "file_size": path.stat().st_size, "encoding": "utf-8", } - + if is_multi_table and sheets_info: parameters["is_multi_table"] = True parameters["sheets"] = sheets_info @@ -279,7 +286,8 @@ def _parse_file_path(self, file_path: str) -> ConnectionSchema: return ConnectionSchema( name=f"file_connection_{uuid4().hex[:8]}", - description=f"File connection: {path.name}" + (" (multi-table)" if is_multi_table else ""), + description=f"File connection: {path.name}" + + (" (multi-table)" if is_multi_table else ""), connection_type=conn_type, file_path=str(path.absolute()), parameters=parameters, diff --git a/scripts/run_tests_quiet.py b/scripts/run_tests_quiet.py index a896b88..c6cb790 100644 --- a/scripts/run_tests_quiet.py +++ b/scripts/run_tests_quiet.py @@ -4,7 +4,7 @@ Usage: python scripts/run_tests_quiet.py [pytest_options...] - + Examples: python scripts/run_tests_quiet.py python scripts/run_tests_quiet.py -k "test_data_validator" @@ -12,31 +12,34 @@ """ import os -import sys import subprocess +import sys from pathlib import Path + def main(): """Run tests with quiet logging configuration.""" # Get the project root directory project_root = Path(__file__).parent.parent os.chdir(project_root) - + # Set environment variables for quiet logging env = os.environ.copy() env["PYTHONPATH"] = str(project_root) - + # Build pytest command with quiet options cmd = [ - sys.executable, "-m", "pytest", + sys.executable, + "-m", + "pytest", "--log-cli-level=WARNING", "--tb=short", - "-v" + "-v", ] - + # Add any additional arguments passed to the script cmd.extend(sys.argv[1:]) - + # Run pytest try: result = subprocess.run(cmd, env=env, cwd=project_root) @@ -48,5 +51,6 @@ def main(): print(f"Error running tests: {e}") sys.exit(1) + if __name__ == "__main__": main() diff --git a/tests/conftest.py b/tests/conftest.py index b357a5e..0c7c26d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,9 +15,10 @@ # Add the project root directory to the Python path. sys.path.insert(0, str(Path(__file__).parent.parent)) +from shared.config.loader import load_config + # Import the database connection management module. from shared.database.connection import close_all_engines -from shared.config.loader import load_config # Load test-specific logging configuration try: diff --git a/tests/e2e/cli_scenarios/test_schema_command_e2e.py b/tests/e2e/cli_scenarios/test_schema_command_e2e.py index 143d872..eed2bd1 100644 --- a/tests/e2e/cli_scenarios/test_schema_command_e2e.py +++ b/tests/e2e/cli_scenarios/test_schema_command_e2e.py @@ -59,13 +59,15 @@ def _param_db_urls() -> list[object]: def test_happy_path_table_and_json(tmp_path: Path, db_url: str) -> None: # Schema baseline + a couple atomic rules rules = { - "rules": [ - {"field": "id", "type": "integer", "required": True}, - {"field": "email", "type": "string"}, - {"field": "age", "type": "integer", "min": 0, "max": 150}, - ], - "strict_mode": False, - "case_insensitive": True, + "customers": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "email", "type": "string"}, + {"field": "age", "type": "integer", "min": 0, "max": 150}, + ], + "strict_mode": False, + "case_insensitive": True, + } } rules_file = _write_rules(tmp_path, rules) @@ -75,8 +77,6 @@ def test_happy_path_table_and_json(tmp_path: Path, db_url: str) -> None: "schema", "--conn", db_url, - "--table", - "customers", "--rules", rules_file, "--output", @@ -92,8 +92,6 @@ def test_happy_path_table_and_json(tmp_path: Path, db_url: str) -> None: "schema", "--conn", db_url, - "--table", - "customers", "--rules", rules_file, "--output", @@ -117,16 +115,18 @@ def test_happy_path_table_and_json(tmp_path: Path, db_url: str) -> None: def test_drift_missing_and_type_mismatch(tmp_path: Path, db_url: str) -> None: # Declare a missing column and mismatched type to trigger SKIPPED in JSON for dependent rules rules = { - "rules": [ - {"field": "email", "type": "integer", "required": True}, # mismatch - { - "field": "status", - "type": "string", - "enum": ["active", "inactive"], - }, # missing - ], - "strict_mode": False, - "case_insensitive": True, + "customers": { + "rules": [ + {"field": "email", "type": "integer", "required": True}, # mismatch + { + "field": "status", + "type": "string", + "enum": ["active", "inactive"], + }, # missing + ], + "strict_mode": False, + "case_insensitive": True, + } } rules_file = _write_rules(tmp_path, rules) @@ -135,8 +135,6 @@ def test_drift_missing_and_type_mismatch(tmp_path: Path, db_url: str) -> None: "schema", "--conn", db_url, - "--table", - "customers", "--rules", rules_file, "--output", @@ -162,11 +160,13 @@ def test_drift_missing_and_type_mismatch(tmp_path: Path, db_url: str) -> None: @pytest.mark.parametrize("db_url", _param_db_urls()) def test_strict_mode_extras_json(tmp_path: Path, db_url: str) -> None: rules = { - "rules": [ - {"field": "id", "type": "integer"}, - ], - "strict_mode": True, - "case_insensitive": True, + "customers": { + "rules": [ + {"field": "id", "type": "integer"}, + ], + "strict_mode": True, + "case_insensitive": True, + } } rules_file = _write_rules(tmp_path, rules) @@ -175,8 +175,6 @@ def test_strict_mode_extras_json(tmp_path: Path, db_url: str) -> None: "schema", "--conn", db_url, - "--table", - "customers", "--rules", rules_file, "--output", @@ -205,8 +203,6 @@ def test_empty_rules_minimal_payload(tmp_path: Path) -> None: "schema", "--conn", str(data_file), - "--table", - "data", "--rules", rules_file, "--output", diff --git a/tests/shared/builders/test_builders.py b/tests/shared/builders/test_builders.py index 2a02f8a..ac15ce1 100644 --- a/tests/shared/builders/test_builders.py +++ b/tests/shared/builders/test_builders.py @@ -211,6 +211,7 @@ def __init__(self) -> None: self._username = "test_user" self._password = "test_pass" self._db_schema = "test_schema" + self._available_tables = None self._file_path: Optional[str] = None self._parameters: Dict[str, Any] = {} @@ -236,6 +237,12 @@ def with_database(self, db_name: str) -> "TestDataBuilder.ConnectionBuilder": self._db_name = db_name return self + def with_available_tables( + self, table_name: str + ) -> "TestDataBuilder.ConnectionBuilder": + self._available_tables = table_name + return self + def with_credentials( self, username: str, password: str ) -> "TestDataBuilder.ConnectionBuilder": @@ -275,6 +282,7 @@ def build(self) -> ConnectionSchema: db_schema=self._db_schema, file_path=self._file_path, parameters=self._parameters, + available_tables=self._available_tables, capabilities=DataSourceCapability(supports_sql=True), cross_db_settings=None, ) diff --git a/tests/unit/cli/commands/test_schema_command.py b/tests/unit/cli/commands/test_schema_command.py index 88a8d72..028e091 100644 --- a/tests/unit/cli/commands/test_schema_command.py +++ b/tests/unit/cli/commands/test_schema_command.py @@ -11,6 +11,7 @@ from cli.app import cli_app from cli.core.data_validator import ExecutionResultSchema +from shared.enums.connection_types import ConnectionType def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: @@ -38,17 +39,22 @@ def test_schema_requires_source_and_rules(self, tmp_path: Path) -> None: rules_obj: dict[str, list[dict[str, Any]]] = {"rules": []} rules_path = _write_tmp_file(tmp_path, "schema.json", json.dumps(rules_obj)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code == 0 assert "Checking" in result.output def test_output_json_mode(self, tmp_path: Path) -> None: runner = CliRunner() data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") - rules_path = _write_tmp_file(tmp_path, "schema.json", json.dumps({"rules": []})) + rules_path = _write_tmp_file( + tmp_path, "schema.json", json.dumps({"user": {"rules": []}}) + ) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) assert result.exit_code == 0 payload = json.loads(result.output) @@ -72,12 +78,26 @@ def test_output_json_declared_columns_always_listed( .build() ) + # Create a mock ConnectionSchema for testing + mock_source_config = ( + test_builders.TestDataBuilder.connection() + .with_type(ConnectionType.CSV) + .with_database("test_db") + .with_available_tables("test_table") + .with_parameters({}) + .build() + ) + monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: [schema_rule], + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: [schema_rule], ) class DummyValidator: + def __init__(self, source_config, rules, core_config, cli_config) -> None: + # Accept constructor arguments but ignore them + pass + async def validate(self) -> list[ExecutionResultSchema]: # Return no results to simulate missing schema details return [] @@ -89,11 +109,12 @@ async def validate(self) -> list[ExecutionResultSchema]: rules_path = _write_tmp_file( tmp_path, "schema.json", - json.dumps({"rules": [{"field": "id", "type": "integer"}]}), + json.dumps({"data": {"rules": [{"field": "id", "type": "integer"}]}}), ) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) # No failures but explicit -- in this setup lack of results implies exit 0 assert result.exit_code == 0 @@ -116,6 +137,7 @@ def test_fail_on_error_sets_exit_code_1(self, tmp_path: Path) -> None: cli_app, [ "schema", + "--conn", data_path, "--rules", rules_path, @@ -131,7 +153,7 @@ def test_invalid_rules_json_yields_usage_error(self, tmp_path: Path) -> None: bad_rules_path = _write_tmp_file(tmp_path, "bad.json", "{invalid json}") result = runner.invoke( - cli_app, ["schema", data_path, "--rules", bad_rules_path] + cli_app, ["schema", "--conn", data_path, "--rules", bad_rules_path] ) # Click usage error exit code is >= 2 @@ -149,29 +171,33 @@ def test_warn_on_top_level_table_ignored(self, tmp_path: Path) -> None: runner = CliRunner() data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") rules = { - "table": "users", - "rules": [ - {"field": "id", "type": "integer", "required": True}, - ], + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + ] + } } rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(rules)) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) # exit code from skeleton remains success assert result.exit_code == 0 - # warning emitted to stderr - assert "table' is ignored" in (result.stderr or "") + # Since multi-table has been supported,so no warning emitted to stderr + # assert "table' is ignored" in (result.stderr or "") def test_rules_must_be_array(self, tmp_path: Path) -> None: runner = CliRunner() data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps({})) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 - assert "must be an array" in result.output + assert "must have a 'rules' array" in result.output def test_rules_item_requires_field(self, tmp_path: Path) -> None: runner = CliRunner() @@ -179,7 +205,9 @@ def test_rules_item_requires_field(self, tmp_path: Path) -> None: bad = {"rules": [{"type": "integer"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "field must be a non-empty string" in result.output @@ -189,7 +217,9 @@ def test_type_must_be_supported_string(self, tmp_path: Path) -> None: bad = {"rules": [{"field": "id", "type": "number"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "type 'number' is not supported" in result.output @@ -199,7 +229,9 @@ def test_required_must_be_boolean(self, tmp_path: Path) -> None: bad = {"rules": [{"field": "id", "required": "yes"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "required must be a boolean" in result.output @@ -209,7 +241,9 @@ def test_enum_must_be_array(self, tmp_path: Path) -> None: bad = {"rules": [{"field": "flag", "enum": "01"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "enum must be an array" in result.output @@ -219,6 +253,8 @@ def test_min_max_must_be_numeric(self, tmp_path: Path) -> None: bad = {"rules": [{"field": "age", "type": "integer", "min": "0"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "min must be numeric" in result.output diff --git a/tests/unit/cli/commands/test_schema_command_extended.py b/tests/unit/cli/commands/test_schema_command_extended.py index 9c366c5..150a88d 100644 --- a/tests/unit/cli/commands/test_schema_command_extended.py +++ b/tests/unit/cli/commands/test_schema_command_extended.py @@ -8,9 +8,16 @@ from click.testing import CliRunner from cli.app import cli_app -from shared.enums import RuleAction, RuleCategory, RuleType, SeverityLevel +from shared.enums import ( + ConnectionType, + RuleAction, + RuleCategory, + RuleType, + SeverityLevel, +) from shared.schema.base import RuleTarget, TargetEntity from shared.schema.rule_schema import RuleSchema +from tests.shared.builders import test_builders def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: @@ -71,7 +78,7 @@ def test_map_type_names_are_case_insensitive_and_validated( _map_type_name_to_datatype("number") def test_decompose_to_atomic_rules_structure(self, tmp_path: Path) -> None: - from cli.commands.schema import _decompose_to_atomic_rules + from cli.commands.schema import _decompose_schema_payload payload = { "strict_mode": True, @@ -82,8 +89,16 @@ def test_decompose_to_atomic_rules_structure(self, tmp_path: Path) -> None: {"field": "status", "enum": ["A", "B"]}, ], } - - rules = _decompose_to_atomic_rules(payload) + # Create a mock ConnectionSchema for testing + mock_source_config = ( + test_builders.TestDataBuilder.connection() + .with_type(ConnectionType.CSV) + .with_database("test_db") + .with_available_tables("test_table") + .with_parameters({}) + .build() + ) + rules = _decompose_schema_payload(payload, mock_source_config) # First rule should be SCHEMA when any columns declared assert rules[0].type == RuleType.SCHEMA @@ -188,8 +203,8 @@ def test_json_output_aggregation_and_skip_semantics( # Patch decomposition monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: atomic_rules, + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: atomic_rules, ) # Build SCHEMA and dependent rule results. Dependent rules are PASSED in raw @@ -237,7 +252,8 @@ def test_json_output_aggregation_and_skip_semantics( # Patch DataValidator.validate to return our results class DummyValidator: - def __init__(self, *args: Any, **kwargs: Any) -> None: # noqa: D401 + def __init__(self, source_config, rules, core_config, cli_config): + # Accept all required parameters but don't use them pass async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] @@ -262,7 +278,8 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] ) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) assert result.exit_code == 1 # schema failed -> non-zero @@ -313,8 +330,8 @@ def test_table_output_grouping_and_skips( atomic_rules = [schema, not_null_email, range_age] monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: atomic_rules, + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: atomic_rules, ) schema_result = { @@ -346,17 +363,19 @@ def test_table_output_grouping_and_skips( # Dependent rule raw statuses set to PASSED; should be skipped for display grouping not_null_email_result = { "rule_id": str(not_null_email.id), - "status": "PASSED", + "status": "SKIPPED", "dataset_metrics": [ {"entity_name": "x", "total_records": 10, "failed_records": 0} ], + "skip_reason": "TYPE_MISMATCH", } range_age_result = { "rule_id": str(range_age.id), - "status": "PASSED", + "status": "SKIPPED", "dataset_metrics": [ {"entity_name": "x", "total_records": 10, "failed_records": 0} ], + "skip_reason": "FIELD_MISSING", } class DummyValidator: @@ -383,7 +402,9 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] ), ) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code == 1 output = result.output @@ -396,18 +417,18 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] class TestSchemaValidationErrorsExtended: - def test_reject_tables_top_level(self, tmp_path: Path) -> None: - runner = CliRunner() - data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") - rules_path = _write_tmp_file( - tmp_path, - "schema.json", - json.dumps({"tables": {"users": []}, "rules": []}), - ) - - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) - assert result.exit_code >= 2 - assert "not supported in v1" in result.output + # def test_reject_tables_top_level(self, tmp_path: Path) -> None: + # runner = CliRunner() + # data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + # rules_path = _write_tmp_file( + # tmp_path, + # "schema.json", + # json.dumps({"tables": {"users": []}, "rules": []}), + # ) + + # result = runner.invoke(cli_app, ["schema", "--conn", data_path, "--rules", rules_path]) + # assert result.exit_code >= 2 + # assert "not supported in v1" in result.output def test_enum_must_be_non_empty_array(self, tmp_path: Path) -> None: runner = CliRunner() @@ -418,6 +439,8 @@ def test_enum_must_be_non_empty_array(self, tmp_path: Path) -> None: json.dumps({"rules": [{"field": "status", "enum": []}]}), ) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "enum' must be a non-empty" in result.output diff --git a/tests/unit/cli/commands/test_schema_command_file_sources.py b/tests/unit/cli/commands/test_schema_command_file_sources.py index 0c799b1..8b8ee95 100644 --- a/tests/unit/cli/commands/test_schema_command_file_sources.py +++ b/tests/unit/cli/commands/test_schema_command_file_sources.py @@ -39,8 +39,8 @@ def test_csv_excel_to_sqlite_type_implications( {"reg_date": {"expected_type": "DATE"}, "ts": {"expected_type": "DATETIME"}} ) monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: [schema_rule], + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: [schema_rule], ) # Build SCHEMA result indicating SQLite TEXT types cause TYPE_MISMATCH @@ -72,6 +72,12 @@ def test_csv_excel_to_sqlite_type_implications( } class DummyValidator: + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ) -> None: + # Accept all required parameters but don't use them + pass + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] return [schema_result] @@ -98,7 +104,8 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] runner = CliRunner() result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) assert result.exit_code == 1 diff --git a/tests/unit/cli/commands/test_schema_command_json_extras.py b/tests/unit/cli/commands/test_schema_command_json_extras.py index 2d948ae..d2f7100 100644 --- a/tests/unit/cli/commands/test_schema_command_json_extras.py +++ b/tests/unit/cli/commands/test_schema_command_json_extras.py @@ -43,8 +43,8 @@ def test_json_includes_schema_extras_and_summary_counts( } ) monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: [schema_rule], + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: [schema_rule], ) # Results: SCHEMA failed with 1 type mismatch, 0 existence failures, extras present @@ -76,6 +76,12 @@ def test_json_includes_schema_extras_and_summary_counts( } class DummyValidator: + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ) -> None: + # Accept all required parameters but don't use them + pass + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] return [schema_result] @@ -97,10 +103,23 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] ) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) assert result.exit_code == 1 - payload = json.loads(result.output) + + # Extract JSON part from output (skip warning messages) + output_lines = result.output.strip().split("\n") + json_line = None + for line in output_lines: + if line.strip().startswith("{"): + json_line = line.strip() + break + + if not json_line: + raise ValueError("No JSON output found in result") + + payload = json.loads(json_line) # schema_extras must present, sorted by CLI before emission assert payload.get("schema_extras") == ["aaa_extra", "zzz_extra"] @@ -115,8 +134,8 @@ def test_table_output_does_not_emit_schema_extras_key( ) -> None: schema_rule = _schema_rule_with({"id": {"expected_type": "INTEGER"}}) monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: [schema_rule], + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: [schema_rule], ) schema_result = { @@ -131,6 +150,12 @@ def test_table_output_does_not_emit_schema_extras_key( } class DummyValidator: + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ) -> None: + # Accept all required parameters but don't use them + pass + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] return [schema_result] @@ -143,7 +168,9 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] "schema.json", json.dumps({"rules": [{"field": "id", "type": "integer"}]}), ) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code == 0 # Plain text output should not dump JSON key name assert "schema_extras" not in result.output diff --git a/tests/unit/cli/commands/test_schema_command_multi_table.py b/tests/unit/cli/commands/test_schema_command_multi_table.py index f4b4202..0c5ecd8 100644 --- a/tests/unit/cli/commands/test_schema_command_multi_table.py +++ b/tests/unit/cli/commands/test_schema_command_multi_table.py @@ -22,25 +22,23 @@ class TestSchemaCommandMultiTable: def test_multi_table_rules_format_parsing(self, tmp_path: Path) -> None: """Test that multi-table rules format is correctly parsed.""" runner = CliRunner() - + # Create multi-table rules file # Use the existing multi-table schema file rules_path = "test_data/multi_table_schema.json" # Use the new multi-table Excel file instead of CSV data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path, - "--output", "json" - ]) - + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + assert result.exit_code == 0 payload = json.loads(result.output) assert payload["status"] == "ok" - assert payload["rules_count"] == 15 # 5 rules per table * 3 tables - + assert payload["rules_count"] == 17 + # Check that fields have table information fields = payload["fields"] assert len(fields) > 0 @@ -51,205 +49,215 @@ def test_multi_table_rules_format_parsing(self, tmp_path: Path) -> None: def test_multi_table_excel_sheets_detection(self, tmp_path: Path) -> None: """Test that Excel file sheets are correctly detected and used as tables.""" runner = CliRunner() - + # Create a simple multi-table rules file multi_table_rules = { "users": { "rules": [ {"field": "id", "type": "integer", "required": True}, - {"field": "name", "type": "string", "required": True} + {"field": "name", "type": "string", "required": True}, ] }, "products": { "rules": [ {"field": "product_id", "type": "integer", "required": True}, - {"field": "product_name", "type": "string", "required": True} + {"field": "product_name", "type": "string", "required": True}, ] - } + }, } - - rules_path = _write_tmp_file(tmp_path, "multi_table_rules.json", json.dumps(multi_table_rules)) + + rules_path = _write_tmp_file( + tmp_path, "multi_table_rules.json", json.dumps(multi_table_rules) + ) data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path, - "--output", "json" - ]) - + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + assert result.exit_code == 0 payload = json.loads(result.output) assert payload["status"] == "ok" - + # Check that both tables are processed fields = payload["fields"] user_fields = [f for f in fields if f.get("table") == "users"] product_fields = [f for f in fields if f.get("table") == "products"] - + assert len(user_fields) > 0 assert len(product_fields) > 0 def test_multi_table_with_table_level_options(self, tmp_path: Path) -> None: """Test multi-table format with table-level options like strict_mode.""" runner = CliRunner() - + multi_table_rules = { "users": { - "rules": [ - {"field": "id", "type": "integer", "required": True} - ], - "strict_mode": True + "rules": [{"field": "id", "type": "integer", "required": True}], + "strict_mode": True, }, "products": { "rules": [ {"field": "product_name", "type": "string", "required": True} ], - "case_insensitive": True - } + "case_insensitive": True, + }, } - - rules_path = _write_tmp_file(tmp_path, "multi_table_options.json", json.dumps(multi_table_rules)) + + rules_path = _write_tmp_file( + tmp_path, "multi_table_options.json", json.dumps(multi_table_rules) + ) data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path - ]) - - assert result.exit_code == 0 - # Should not raise any validation errors for table-level options + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + + # With strict_mode=True, extra columns will cause SCHEMA validation to fail + assert result.exit_code == 1 + payload = json.loads(result.output) + assert payload["status"] == "ok" # Overall status is ok + assert ( + payload["summary"]["failed_rules"] == 1 + ) # One rule failed due to strict mode + assert payload["summary"]["passed_rules"] == 3 # Three rules passed def test_multi_table_backward_compatibility(self, tmp_path: Path) -> None: """Test that single-table format still works for backward compatibility.""" runner = CliRunner() - + # Single-table format (legacy) single_table_rules = { "rules": [ {"field": "id", "type": "integer", "required": True}, - {"field": "name", "type": "string", "required": True} + {"field": "name", "type": "string", "required": True}, ] } - - rules_path = _write_tmp_file(tmp_path, "single_table.json", json.dumps(single_table_rules)) + + rules_path = _write_tmp_file( + tmp_path, "single_table.json", json.dumps(single_table_rules) + ) # Use only the users sheet for single table test data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path, - "--output", "json" - ]) - + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + assert result.exit_code == 0 - payload = json.loads(result.output) + + # Handle mixed output (warning + JSON) + output_lines = result.output.strip().split("\n") + json_line = None + for line in output_lines: + if line.strip().startswith("{"): + json_line = line.strip() + break + + assert json_line is not None, f"No JSON found in output: {result.output}" + + payload = json.loads(json_line) assert payload["status"] == "ok" - assert payload["rules_count"] == 2 + assert payload["rules_count"] == 3 def test_multi_table_validation_errors(self, tmp_path: Path) -> None: """Test validation errors for invalid multi-table format.""" runner = CliRunner() - + # Invalid: table schema is not an object - invalid_rules = { - "users": "not_an_object" - } - - rules_path = _write_tmp_file(tmp_path, "invalid.json", json.dumps(invalid_rules)) + invalid_rules = {"users": "not_an_object"} + + rules_path = _write_tmp_file( + tmp_path, "invalid.json", json.dumps(invalid_rules) + ) data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path - ]) - + + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) + assert result.exit_code >= 2 # Usage error assert "must be an object" in result.output def test_multi_table_missing_rules_array(self, tmp_path: Path) -> None: """Test validation error when table is missing rules array.""" runner = CliRunner() - + invalid_rules = { "users": { "strict_mode": True # Missing rules array } } - - rules_path = _write_tmp_file(tmp_path, "missing_rules.json", json.dumps(invalid_rules)) + + rules_path = _write_tmp_file( + tmp_path, "missing_rules.json", json.dumps(invalid_rules) + ) data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path - ]) - + + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) + assert result.exit_code >= 2 # Usage error assert "must have a 'rules' array" in result.output def test_multi_table_invalid_table_level_options(self, tmp_path: Path) -> None: """Test validation error for invalid table-level options.""" runner = CliRunner() - + invalid_rules = { "users": { - "rules": [ - {"field": "id", "type": "integer", "required": True} - ], - "strict_mode": "not_a_boolean" # Should be boolean + "rules": [{"field": "id", "type": "integer", "required": True}], + "strict_mode": "not_a_boolean", # Should be boolean } } - - rules_path = _write_tmp_file(tmp_path, "invalid_options.json", json.dumps(invalid_rules)) + + rules_path = _write_tmp_file( + tmp_path, "invalid_options.json", json.dumps(invalid_rules) + ) data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path - ]) - + + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) + assert result.exit_code >= 2 # Usage error assert "must be a boolean" in result.output def test_multi_table_output_formatting(self, tmp_path: Path) -> None: """Test that multi-table output is properly formatted and grouped.""" runner = CliRunner() - + multi_table_rules = { "users": { "rules": [ {"field": "id", "type": "integer", "required": True}, - {"field": "name", "type": "string", "required": True} + {"field": "name", "type": "string", "required": True}, ] }, "products": { - "rules": [ - {"field": "product_id", "type": "integer", "required": True} - ] - } + "rules": [{"field": "product_id", "type": "integer", "required": True}] + }, } - - rules_path = _write_tmp_file(tmp_path, "multi_table.json", json.dumps(multi_table_rules)) + + rules_path = _write_tmp_file( + tmp_path, "multi_table.json", json.dumps(multi_table_rules) + ) data_path = "test_data/multi_table_data.xlsx" - + # Test table output format - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path, - "--output", "table" - ]) - + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "table"], + ) + assert result.exit_code == 0 output = result.output - + # Should show table headers for multi-table assert "📋 Table: users" in output assert "📋 Table: products" in output @@ -258,44 +266,38 @@ def test_multi_table_output_formatting(self, tmp_path: Path) -> None: def test_multi_table_json_output_structure(self, tmp_path: Path) -> None: """Test that JSON output includes table information for multi-table.""" runner = CliRunner() - + multi_table_rules = { - "users": { - "rules": [ - {"field": "id", "type": "integer", "required": True} - ] - }, + "users": {"rules": [{"field": "id", "type": "integer", "required": True}]}, "products": { - "rules": [ - {"field": "product_name", "type": "string", "required": True} - ] - } + "rules": [{"field": "product_name", "type": "string", "required": True}] + }, } - - rules_path = _write_tmp_file(tmp_path, "multi_table.json", json.dumps(multi_table_rules)) + + rules_path = _write_tmp_file( + tmp_path, "multi_table.json", json.dumps(multi_table_rules) + ) data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path, - "--output", "json" - ]) - + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + assert result.exit_code == 0 payload = json.loads(result.output) - + # Check that fields have table information fields = payload["fields"] assert len(fields) >= 2 - + # Find fields for each table user_fields = [f for f in fields if f.get("table") == "users"] product_fields = [f for f in fields if f.get("table") == "products"] - + assert len(user_fields) > 0 assert len(product_fields) > 0 - + # Check that each field has table info for field in fields: assert "table" in field @@ -304,71 +306,67 @@ def test_multi_table_json_output_structure(self, tmp_path: Path) -> None: def test_multi_table_no_table_option_required(self, tmp_path: Path) -> None: """Test that --table option is no longer required.""" runner = CliRunner() - + multi_table_rules = { - "users": { - "rules": [ - {"field": "id", "type": "integer", "required": True} - ] - } + "users": {"rules": [{"field": "id", "type": "integer", "required": True}]} } - - rules_path = _write_tmp_file(tmp_path, "multi_table.json", json.dumps(multi_table_rules)) + + rules_path = _write_tmp_file( + tmp_path, "multi_table.json", json.dumps(multi_table_rules) + ) data_path = "test_data/multi_table_data.xlsx" - + # Should work without --table option - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path - ]) - + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) + assert result.exit_code == 0 # Command should execute successfully without --table option def test_multi_table_excel_specific_functionality(self, tmp_path: Path) -> None: """Test specific Excel multi-table functionality.""" runner = CliRunner() - + # Test with all three tables from the Excel file multi_table_rules = { "users": { "rules": [ {"field": "id", "type": "integer", "required": True}, {"field": "name", "type": "string", "required": True}, - {"field": "email", "type": "string", "required": True} + {"field": "email", "type": "string", "required": True}, ] }, "products": { "rules": [ {"field": "product_id", "type": "integer", "required": True}, {"field": "product_name", "type": "string", "required": True}, - {"field": "price", "type": "float", "min": 0.0} + {"field": "price", "type": "float", "min": 0.0}, ] }, "orders": { "rules": [ {"field": "order_id", "type": "integer", "required": True}, {"field": "user_id", "type": "integer", "required": True}, - {"field": "total_amount", "type": "float", "min": 0.0} + {"field": "total_amount", "type": "float", "min": 0.0}, ] - } + }, } - - rules_path = _write_tmp_file(tmp_path, "excel_multi_table.json", json.dumps(multi_table_rules)) + + rules_path = _write_tmp_file( + tmp_path, "excel_multi_table.json", json.dumps(multi_table_rules) + ) data_path = "test_data/multi_table_data.xlsx" - - result = runner.invoke(cli_app, [ - "schema", - "--conn", data_path, - "--rules", rules_path, - "--output", "json" - ]) - + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + assert result.exit_code == 0 payload = json.loads(result.output) assert payload["status"] == "ok" - + # Check that all three tables are processed fields = payload["fields"] table_names = set(field.get("table") for field in fields) @@ -379,10 +377,10 @@ def test_multi_table_excel_specific_functionality(self, tmp_path: Path) -> None: def test_multi_table_help_text_updated(self, tmp_path: Path) -> None: """Test that help text reflects multi-table support.""" runner = CliRunner() - + result = runner.invoke(cli_app, ["schema", "--help"]) assert result.exit_code == 0 - + # Should mention multi-table support assert "multi-table" in result.output.lower() # Should not mention --table option From bd3e81db05625d60db237b8187decd2583d4da0e Mon Sep 17 00:00:00 2001 From: litedatum Date: Tue, 26 Aug 2025 22:37:50 -0400 Subject: [PATCH 4/6] test: regression test --- cli/core/data_validator.py | 2 +- cli/core/source_parser.py | 2 +- scripts/run_tests_quiet.py | 2 +- tests/conftest.py | 7 ++++--- tests/shared/builders/test_builders.py | 4 ++-- tests/unit/cli/commands/test_schema_command.py | 4 +++- tests/unit/cli/commands/test_schema_command_extended.py | 4 +++- 7 files changed, 15 insertions(+), 10 deletions(-) diff --git a/cli/core/data_validator.py b/cli/core/data_validator.py index 398070f..2415f34 100644 --- a/cli/core/data_validator.py +++ b/cli/core/data_validator.py @@ -401,7 +401,7 @@ async def _convert_multi_table_excel_to_sqlite(self) -> ConnectionSchema: raise ValueError(f"Failed to create multi-table SQLite database: {str(e)}") async def _load_multi_table_excel_to_sqlite( - self, engine, temp_db_path: str + self, engine: Any, temp_db_path: str ) -> None: """ Load multiple sheets from Excel file into SQLite database. diff --git a/cli/core/source_parser.py b/cli/core/source_parser.py index 839c7ea..7dadc59 100644 --- a/cli/core/source_parser.py +++ b/cli/core/source_parser.py @@ -121,7 +121,7 @@ def get_excel_sheets(self, file_path: str) -> Dict[str, List[str]]: for sheet_name in excel_file.sheet_names: # Read first few rows to get column names df = pd.read_excel(file_path, sheet_name=sheet_name, nrows=0) - sheets_info[sheet_name] = list(df.columns) + sheets_info[str(sheet_name)] = list(df.columns) return sheets_info except Exception as e: diff --git a/scripts/run_tests_quiet.py b/scripts/run_tests_quiet.py index c6cb790..31f95f4 100644 --- a/scripts/run_tests_quiet.py +++ b/scripts/run_tests_quiet.py @@ -17,7 +17,7 @@ from pathlib import Path -def main(): +def main() -> None: """Run tests with quiet logging configuration.""" # Get the project root directory project_root = Path(__file__).parent.parent diff --git a/tests/conftest.py b/tests/conftest.py index 0c7c26d..87469f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,16 +16,17 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) from shared.config.loader import load_config +from shared.config.logging_config import LoggingConfig # Import the database connection management module. from shared.database.connection import close_all_engines # Load test-specific logging configuration try: - test_logging_config = load_config("logging.test.toml") - if test_logging_config: + test_logging_config: LoggingConfig = load_config("logging.test.toml", LoggingConfig) + if test_logging_config and test_logging_config.module_levels: # Apply test logging configuration - for module, level in test_logging_config.get("module_levels", {}).items(): + for module, level in test_logging_config.module_levels.items(): _logging.getLogger(module).setLevel(getattr(_logging, level.upper())) except Exception: # Fallback to default configuration if test config not found diff --git a/tests/shared/builders/test_builders.py b/tests/shared/builders/test_builders.py index ac15ce1..5a82b22 100644 --- a/tests/shared/builders/test_builders.py +++ b/tests/shared/builders/test_builders.py @@ -211,7 +211,7 @@ def __init__(self) -> None: self._username = "test_user" self._password = "test_pass" self._db_schema = "test_schema" - self._available_tables = None + self._available_tables: Optional[List[str]] = None self._file_path: Optional[str] = None self._parameters: Dict[str, Any] = {} @@ -240,7 +240,7 @@ def with_database(self, db_name: str) -> "TestDataBuilder.ConnectionBuilder": def with_available_tables( self, table_name: str ) -> "TestDataBuilder.ConnectionBuilder": - self._available_tables = table_name + self._available_tables = [table_name] return self def with_credentials( diff --git a/tests/unit/cli/commands/test_schema_command.py b/tests/unit/cli/commands/test_schema_command.py index 028e091..dc94e91 100644 --- a/tests/unit/cli/commands/test_schema_command.py +++ b/tests/unit/cli/commands/test_schema_command.py @@ -94,7 +94,9 @@ def test_output_json_declared_columns_always_listed( ) class DummyValidator: - def __init__(self, source_config, rules, core_config, cli_config) -> None: + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ) -> None: # Accept constructor arguments but ignore them pass diff --git a/tests/unit/cli/commands/test_schema_command_extended.py b/tests/unit/cli/commands/test_schema_command_extended.py index 150a88d..d3a9eb2 100644 --- a/tests/unit/cli/commands/test_schema_command_extended.py +++ b/tests/unit/cli/commands/test_schema_command_extended.py @@ -252,7 +252,9 @@ def test_json_output_aggregation_and_skip_semantics( # Patch DataValidator.validate to return our results class DummyValidator: - def __init__(self, source_config, rules, core_config, cli_config): + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ): # Accept all required parameters but don't use them pass From 6fb5b87dffcae12401f4854c8d3020a343cf5b85 Mon Sep 17 00:00:00 2001 From: litedatum Date: Wed, 27 Aug 2025 14:45:33 -0400 Subject: [PATCH 5/6] fix: fixed schema output error --- cli/commands/schema.py | 331 ++++++++++++++++++------------ config/logging.toml | 2 +- shared/database/query_executor.py | 24 +-- 3 files changed, 211 insertions(+), 146 deletions(-) diff --git a/cli/commands/schema.py b/cli/commands/schema.py index 898354d..fec2a9e 100644 --- a/cli/commands/schema.py +++ b/cli/commands/schema.py @@ -48,7 +48,7 @@ def _validate_multi_table_rules_payload(payload: Any) -> Tuple[List[str], int]: "strict_mode": true }, "table2": { - "rules": [...] + "rules": [...] } } @@ -389,7 +389,7 @@ def _decompose_single_table_schema( atomic_rules.insert( 0, _create_rule_schema( - name="schema", + name=f"schema_{table_name}", rule_type=RuleType.SCHEMA, column=None, parameters=schema_params, @@ -406,52 +406,51 @@ def _decompose_single_table_schema( return atomic_rules -# def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: -# """Decompose schema JSON payload into atomic RuleSchema objects. - -# This function is kept for backward compatibility but now delegates to -# the new multi-table aware function. -# """ -# # For backward compatibility, we need to determine the source_db -# # This will be handled by the caller -# return _decompose_multi_table_schema(payload, "unknown") - - def _build_prioritized_atomic_status( *, - schema_result: Dict[str, Any] | None, + schema_results: List[Dict[str, Any]], atomic_rules: List[RuleSchema], ) -> Dict[str, Dict[str, str]]: - """Return a mapping rule_id -> {status, skip_reason} applying prioritization. + """Return a mapping rule_id -> {status, skip_reason} applying prioritization.""" + mapping: Dict[str, Dict[str, str]] = {} + column_guard: Dict[str, str] = {} - Prioritization per column: - 1) If field missing → mark SCHEMA for that field as FAILED (implicit) and all - dependent rules (NOT_NULL/RANGE/ENUM) as SKIPPED (reason FIELD_MISSING). - 2) If type mismatch → mark dependent rules as SKIPPED (reason TYPE_MISMATCH). - 3) Otherwise, leave dependent rules to their engine-evaluated status. + schema_rules_map = { + str(rule.id): rule for rule in atomic_rules if rule.type == RuleType.SCHEMA + } - We infer per-column status from schema_result.execution_plan.schema_details. - """ - mapping: Dict[str, Dict[str, str]] = {} + for schema_result in schema_results: + rule_id = str(schema_result.get("rule_id", "")) + rule = schema_rules_map.get(rule_id) + if not rule: + continue + + table_name = rule.get_target_info().get("table") + if not table_name: + continue - # Build per-column guard from SCHEMA details - column_guard: Dict[str, str] = {} # column -> NONE|FIELD_MISSING|TYPE_MISMATCH - if schema_result: - # Safely access nested dictionaries, checking for None at each level. execution_plan = schema_result.get("execution_plan") or {} schema_details = execution_plan.get("schema_details") or {} details = schema_details.get("field_results") or [] for item in details: col = str(item.get("column")) code = str(item.get("failure_code", "NONE")) - column_guard[col] = code + column_guard[f"{table_name}.{col}"] = code - # Apply skip to dependent rules for r in atomic_rules: if r.type == RuleType.SCHEMA: continue - column = r.get_target_column() or "" - guard = column_guard.get(column, "NONE") + + target_info = r.get_target_info() + table_name = target_info.get("table") + column_name = target_info.get("column") + + if not table_name or not column_name: + continue + + guard_key = f"{table_name}.{column_name}" + guard = column_guard.get(guard_key, "NONE") + if guard == "FIELD_MISSING": mapping[r.id] = {"status": "SKIPPED", "skip_reason": "FIELD_MISSING"} elif guard == "TYPE_MISMATCH": @@ -560,43 +559,42 @@ def _run_validation(validator: Any) -> Tuple[List[Any], float]: return results, exec_seconds -def _extract_schema_result_dict( +def _extract_schema_results( *, atomic_rules: List[RuleSchema], results: List[Any] -) -> Dict[str, Any] | None: - try: - schema_rule = next( - (rule for rule in atomic_rules if rule.type == RuleType.SCHEMA), None - ) - if not schema_rule: - return None - for r in results: - if r is None: - continue - rid = "" - if hasattr(r, "rule_id"): - try: - rid = str(getattr(r, "rule_id")) - except Exception: - rid = "" - elif isinstance(r, dict): - rid = str(r.get("rule_id", "")) - if rid == str(schema_rule.id): - return ( - r.model_dump() - if hasattr(r, "model_dump") - else cast(Dict[str, Any], r) - ) - return None - except Exception: - return None +) -> List[Dict[str, Any]]: + """Extract all SCHEMA rule results from the list of validation results.""" + schema_results = [] + schema_rule_ids = { + str(rule.id) for rule in atomic_rules if rule.type == RuleType.SCHEMA + } + if not schema_rule_ids: + return [] + + for r in results: + if r is None: + continue + rid = "" + if hasattr(r, "rule_id"): + try: + rid = str(getattr(r, "rule_id")) + except Exception: + rid = "" + elif isinstance(r, dict): + rid = str(r.get("rule_id", "")) + + if rid in schema_rule_ids: + schema_results.append( + r.model_dump() if hasattr(r, "model_dump") else cast(Dict[str, Any], r) + ) + return schema_results def _compute_skip_map( - *, atomic_rules: List[RuleSchema], schema_result_dict: Dict[str, Any] | None + *, atomic_rules: List[RuleSchema], schema_results: List[Dict[str, Any]] ) -> Dict[str, Dict[str, str]]: try: return _build_prioritized_atomic_status( - schema_result=schema_result_dict, atomic_rules=atomic_rules + schema_results=schema_results, atomic_rules=atomic_rules ) except Exception: return {} @@ -609,7 +607,7 @@ def _emit_json_output( atomic_rules: List[RuleSchema], results: List[Any], skip_map: Dict[str, Dict[str, str]], - schema_result_dict: Dict[str, Any] | None, + schema_results: List[Dict[str, Any]], exec_seconds: float, ) -> None: enriched_results: List[Dict[str, Any]] = [] @@ -647,15 +645,24 @@ def _failed_records_of(res: Dict[str, Any]) -> int: fields: List[Dict[str, Any]] = [] schema_fields_index: Dict[str, Dict[str, Any]] = {} - if schema_result_dict: - schema_plan = (schema_result_dict or {}).get("execution_plan", {}) or {} + schema_rules_map = { + str(rule.id): rule for rule in atomic_rules if rule.type == RuleType.SCHEMA + } + + for schema_result in schema_results: + schema_plan = (schema_result or {}).get("execution_plan", {}) or {} schema_details = schema_plan.get("schema_details", {}) or {} field_results = schema_details.get("field_results", []) or [] + + rule_id = str(schema_result.get("rule_id", "")) + rule = schema_rules_map.get(rule_id) + table_name = rule.get_target_info().get("table") if rule else "unknown" + for item in field_results: col_name = str(item.get("column")) entry: Dict[str, Any] = { "column": col_name, - "table": "unknown", # Will be updated later with actual table name + "table": table_name, "checks": { "existence": { "status": item.get("existence", "UNKNOWN"), @@ -668,26 +675,25 @@ def _failed_records_of(res: Dict[str, Any]) -> int: }, } fields.append(entry) - schema_fields_index[col_name] = entry + schema_fields_index[f"{table_name}.{col_name}"] = entry - schema_rule = next( - (rule for rule in atomic_rules if rule.type == RuleType.SCHEMA), None - ) - if schema_rule: - params = schema_rule.parameters or {} - declared_cols = (params.get("columns") or {}).keys() - for col in declared_cols: - if str(col) not in schema_fields_index: - entry = { - "column": str(col), - "table": "unknown", # Will be updated later with actual table name - "checks": { - "existence": {"status": "UNKNOWN", "failure_code": "NONE"}, - "type": {"status": "UNKNOWN", "failure_code": "NONE"}, - }, - } - fields.append(entry) - schema_fields_index[str(col)] = entry + for rule in atomic_rules: + if rule.type == RuleType.SCHEMA: + params = rule.parameters or {} + declared_cols = (params.get("columns") or {}).keys() + table_name = rule.get_target_info().get("table") + for col in declared_cols: + if f"{table_name}.{str(col)}" not in schema_fields_index: + entry = { + "column": str(col), + "table": table_name, + "checks": { + "existence": {"status": "UNKNOWN", "failure_code": "NONE"}, + "type": {"status": "UNKNOWN", "failure_code": "NONE"}, + }, + } + fields.append(entry) + schema_fields_index[f"{table_name}.{str(col)}"] = entry def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: checks: Dict[str, Dict[str, Any]] = entry.setdefault("checks", {}) @@ -706,22 +712,23 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: rule = rule_map.get(rule_id) if not rule or rule.type == RuleType.SCHEMA: continue + column_name = rule.get_target_column() or "" if not column_name: continue - # Add table name for multi-table support + table_name = "unknown" if rule.target and rule.target.entities: table_name = rule.target.entities[0].table - l_entry = schema_fields_index.get(column_name) + l_entry = schema_fields_index.get(f"{table_name}.{column_name}") if not l_entry: l_entry = {"column": column_name, "table": table_name, "checks": {}} fields.append(l_entry) - schema_fields_index[column_name] = l_entry + schema_fields_index[f"{table_name}.{column_name}"] = l_entry else: - # Ensure table name is set l_entry["table"] = table_name + t = rule.type if t == RuleType.NOT_NULL: key = "not_null" @@ -735,11 +742,13 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: key = "date_format" else: key = t.value.lower() + check = _ensure_check(l_entry, key) check["status"] = str(rd.get("status", "UNKNOWN")) if rule_id in skip_map: check["status"] = skip_map[rule_id]["status"] check["skip_reason"] = skip_map[rule_id]["skip_reason"] + fr = _failed_records_of(rd) if fr: check["failed_records"] = fr @@ -757,18 +766,15 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: total_failed_records = sum(_failed_records_of(r) for r in enriched_results) schema_extras: List[str] = [] - if schema_result_dict: + for schema_result in schema_results: try: extras = ( - (schema_result_dict or {}) - .get("execution_plan", {}) - .get("schema_details", {}) - .get("extras", []) + (schema_result or {}).get("execution_plan", {}).get("schema_details", {}).get("extras", []) ) if isinstance(extras, list): - schema_extras = [str(x) for x in extras] + schema_extras.extend([str(x) for x in extras]) except Exception: - schema_extras = [] + pass payload: Dict[str, Any] = { "status": "ok", @@ -787,7 +793,7 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: "fields": fields, } if schema_extras: - payload["schema_extras"] = sorted(schema_extras) + payload["schema_extras"] = sorted(list(set(schema_extras))) _safe_echo(json.dumps(payload, default=str)) @@ -797,7 +803,7 @@ def _emit_table_output( atomic_rules: List[RuleSchema], results: List[Any], skip_map: Dict[str, Dict[str, str]], - schema_result_dict: Dict[str, Any] | None, + schema_results: List[Dict[str, Any]], exec_seconds: float, ) -> None: rule_map = {str(rule.id): rule for rule in atomic_rules} @@ -833,7 +839,6 @@ def _dataset_total(res: Dict[str, Any]) -> int: rd["rule_type"] = rule.type.value rd["column_name"] = rule.get_target_column() rd.setdefault("rule_name", rule.name) - # Add table name for multi-table support if rule.target and rule.target.entities: rd["table_name"] = rule.target.entities[0].table if rid in skip_map: @@ -841,9 +846,14 @@ def _dataset_total(res: Dict[str, Any]) -> int: rd["skip_reason"] = skip_map[rid]["skip_reason"] table_results.append(rd) - header_total_records = 0 + table_records: Dict[str, int] = {} for rd in table_results: - header_total_records = max(header_total_records, _dataset_total(rd)) + table_name = rd.get("table_name", "unknown") + total = _dataset_total(rd) + if total > 0: + table_records[table_name] = max(table_records.get(table_name, 0), total) + + header_total_records = sum(table_records.values()) def _calc_failed(res: Dict[str, Any]) -> int: if isinstance(res.get("failed_records"), int): @@ -863,19 +873,11 @@ def _calc_failed(res: Dict[str, Any]) -> int: if "total_records" not in rd: rd["total_records"] = _dataset_total(rd) - column_guard: Dict[str, str] = {} - if schema_result_dict: - execution_plan = schema_result_dict.get("execution_plan") or {} - schema_details = execution_plan.get("schema_details") or {} - details = schema_details.get("field_results") or [] - for item in details: - col = str(item.get("column")) - column_guard[col] = str(item.get("failure_code", "NONE")) - - # Group results by table for multi-table support tables_grouped: Dict[str, Dict[str, Dict[str, Any]]] = {} for rd in table_results: + if rd.get("rule_type") == RuleType.SCHEMA.value: + continue table_name = rd.get("table_name", "unknown") if table_name not in tables_grouped: tables_grouped[table_name] = {} @@ -892,10 +894,6 @@ def _calc_failed(res: Dict[str, Any]) -> int: key = "range" elif rd.get("rule_type") == RuleType.ENUM.value: key = "enum" - elif rd.get("rule_type") == RuleType.REGEX.value: - key = "regex" - elif rd.get("rule_type") == RuleType.DATE_FORMAT.value: - key = "date_format" else: key = rd.get("rule_type", "unknown").lower() @@ -909,44 +907,113 @@ def _calc_failed(res: Dict[str, Any]) -> int: } ) + all_columns_by_table: Dict[str, set] = {} + for rule in atomic_rules: + if rule.target and rule.target.entities: + table_name = rule.target.entities[0].table + if table_name not in all_columns_by_table: + all_columns_by_table[table_name] = set() + + if rule.type == RuleType.SCHEMA: + if rule.parameters: + declared_cols = (rule.parameters.get("columns") or {}).keys() + for col in declared_cols: + all_columns_by_table[table_name].add(str(col)) + else: + column_name = rule.get_target_column() + if column_name: + all_columns_by_table[table_name].add(column_name) + + for table_name, columns in all_columns_by_table.items(): + if table_name not in tables_grouped: + tables_grouped[table_name] = {} + for column_name in columns: + if column_name not in tables_grouped[table_name]: + tables_grouped[table_name][column_name] = { + "column": column_name, + "issues": [], + } + + schema_rules_map = { + str(rule.id): rule for rule in atomic_rules if rule.type == RuleType.SCHEMA + } + for schema_result in schema_results: + rule_id = str(schema_result.get("rule_id", "")) + rule = schema_rules_map.get(rule_id) + if not rule: + continue + + table_name = rule.get_target_info().get("table") + if not table_name or table_name not in tables_grouped: + continue + + execution_plan = schema_result.get("execution_plan") or {} + schema_details = execution_plan.get("schema_details", {}) or {} + details = schema_details.get("field_results", []) or [] + for item in details: + col = str(item.get("column")) + if item.get("failure_code") == "FIELD_MISSING": + tables_grouped[table_name][col]["issues"].append( + {"check": "missing", "status": "FAILED"} + ) + elif item.get("failure_code") == "TYPE_MISMATCH": + tables_grouped[table_name][col]["issues"].append( + {"check": "type", "status": "FAILED"} + ) + lines: List[str] = [] - lines.append(f"✓ Checking {source} ({header_total_records:,} records)") + lines.append(f"✓ Checking {source}") total_failed_records = sum( int(r.get("failed_records", 0) or 0) for r in table_results ) - # Display results grouped by table for table_name in sorted(tables_grouped.keys()): - if len(tables_grouped) > 1: # Only show table header for multi-table - lines.append(f"\n📋 Table: {table_name}") + records = table_records.get(table_name, 0) + lines.append(f"\n📋 Table: {table_name} ({records:,} records)") table_grouped = tables_grouped[table_name] for col in sorted(table_grouped.keys()): issues = table_grouped[col]["issues"] - critical = [i for i in issues if i["status"] in {"FAILED", "ERROR"}] - skipped = [i for i in issues if i["status"] == "SKIPPED"] + + # Consolidate issues to avoid duplicates, prioritizing 'missing' + final_issues = [] + has_missing = any(i.get("check") == "missing" for i in issues) + if has_missing: + final_issues.append({"check": "missing", "status": "FAILED"}) + else: + final_issues.extend(issues) + + critical = [i for i in final_issues if i["status"] in {"FAILED", "ERROR"}] + skipped = [i for i in final_issues if i["status"] == "SKIPPED"] if not critical and not skipped: lines.append(f"✓ {col}: OK") else: - # Show critical issues first + printed_checks = set() for i in critical: - fr = i.get("failed_records") or 0 - if i["status"] == "ERROR": + check_key = i['check'] + if check_key in printed_checks: continue + printed_checks.add(check_key) + + fr = i.get("failed_records", 0) + if i["check"] == "missing": + lines.append(f"✗ {col}: missing (skipped dependent checks)") + elif i["status"] == "ERROR": lines.append(f"✗ {col}: {i['check']} error") else: lines.append(f"✗ {col}: {i['check']} failed ({fr} failures)") - # Show skipped issues with skip reason for i in skipped: + check_key = i.get("skip_reason") + if check_key in printed_checks: continue + printed_checks.add(check_key) + skip_reason = i.get("skip_reason", "unknown reason") if skip_reason == "FIELD_MISSING": lines.append(f"✗ {col}: missing (skipped dependent checks)") elif skip_reason == "TYPE_MISMATCH": - lines.append( - f"✗ {col}: type mismatch (skipped dependent checks)" - ) + lines.append(f"✗ {col}: type mismatch (skipped dependent checks)") else: lines.append(f"✗ {col}: {i['check']} skipped ({skip_reason})") @@ -1034,8 +1101,6 @@ def schema_command( source_config = SourceParser().parse_source(connection_string) rules_payload = _read_rules_payload(rules_file) - # If the rules file uses a multi-table format, signal this to the DataValidator - # so that it skips its single-table target completion logic. is_multi_table_rules = "rules" not in rules_payload if is_multi_table_rules: source_config.parameters["is_multi_table"] = True @@ -1064,11 +1129,11 @@ def schema_command( ) results, exec_seconds = _run_validation(validator) - schema_result_dict: Dict[str, Any] | None = _extract_schema_result_dict( + schema_results = _extract_schema_results( atomic_rules=atomic_rules, results=results ) skip_map = _compute_skip_map( - atomic_rules=atomic_rules, schema_result_dict=schema_result_dict + atomic_rules=atomic_rules, schema_results=schema_results ) if output.lower() == "json": @@ -1078,7 +1143,7 @@ def schema_command( atomic_rules=atomic_rules, results=results, skip_map=skip_map, - schema_result_dict=schema_result_dict, + schema_results=schema_results, exec_seconds=exec_seconds, ) else: @@ -1087,7 +1152,7 @@ def schema_command( atomic_rules=atomic_rules, results=results, skip_map=skip_map, - schema_result_dict=schema_result_dict, + schema_results=schema_results, exec_seconds=exec_seconds, ) diff --git a/config/logging.toml b/config/logging.toml index 9630a91..b32cc7b 100644 --- a/config/logging.toml +++ b/config/logging.toml @@ -1,7 +1,7 @@ # Logging Configuration # Global log level: DEBUG, INFO, WARNING, ERROR, CRITICAL -level = "ERROR" +level = "WARNING" # Log message format format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/shared/database/query_executor.py b/shared/database/query_executor.py index eedfd83..749a67c 100644 --- a/shared/database/query_executor.py +++ b/shared/database/query_executor.py @@ -788,21 +788,21 @@ async def get_column_list( standardized_result = [] for col in result: # Different database dialects may use different key names - name = col.get("Field", col.get("name", col.get("column_name"))) - if name is None: - # If column name not found, try to use the first value as column - # name - if col and isinstance(col, dict) and len(col) > 0: - name = next(iter(col.values())) - else: - name = str(col) + name = col.get("Field") or col.get("name") or col.get("column_name") + type_ = col.get("Type") or col.get("data_type") or col.get("type") + + if not name: + # If column name not found, skip this column with a warning + self.logger.warning(f"Could not determine column name from result: {col}") + continue + + if not type_: + type_ = "unknown" # Create standardized column info std_col = { - "name": name, # Standardized column name key - "type": col.get( - "Type", col.get("data_type", col.get("type", "unknown")) - ), + "name": name, + "type": type_, "nullable": ( col.get("Null", col.get("is_nullable", "YES")).upper() == "YES" ), From 21d9170995dd6eb1ca9be13a5838ef75dcca1d5a Mon Sep 17 00:00:00 2001 From: litedatum Date: Wed, 27 Aug 2025 17:19:56 -0400 Subject: [PATCH 6/6] fix: fixed regression test issue --- CHANGELOG.md | 4 + cli/commands/schema.py | 211 ++++++++++-------- shared/database/query_executor.py | 6 +- .../commands/test_schema_command_extended.py | 25 ++- 4 files changed, 144 insertions(+), 102 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73b72bc..38ddb7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - refactor(schema): improve output formatting with table-grouped results display - refactor(schema): enhance rule decomposition logic for multi-table support - refactor(data-validator): improve multi-table detection and processing capabilities +- refactor(schema): preserve field order from initial JSON definition instead of alphabetical sorting +- refactor(schema): consolidate field validation information display to single line per field ### Fixed - fix(cli): resolve issue where `--table` parameter was not correctly passed to backend @@ -38,6 +40,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - fix(schema): resolve multi-table rules validation and type checking issues - fix(schema): improve table name detection and validation in multi-table scenarios - fix(schema): enhance error handling for multi-table validation workflows +- fix(schema): ensure schema-only rule fields are not omitted from validation results +- fix(schema): properly display skip conventions for non-existent columns (FIELD_MISSING/TYPE_MISMATCH) ### Removed - **BREAKING CHANGE**: remove backward compatibility for old positional argument interface diff --git a/cli/commands/schema.py b/cli/commands/schema.py index fec2a9e..a216f6e 100644 --- a/cli/commands/schema.py +++ b/cli/commands/schema.py @@ -48,7 +48,7 @@ def _validate_multi_table_rules_payload(payload: Any) -> Tuple[List[str], int]: "strict_mode": true }, "table2": { - "rules": [...] + "rules": [...] } } @@ -413,48 +413,47 @@ def _build_prioritized_atomic_status( ) -> Dict[str, Dict[str, str]]: """Return a mapping rule_id -> {status, skip_reason} applying prioritization.""" mapping: Dict[str, Dict[str, str]] = {} - column_guard: Dict[str, str] = {} + schema_failures: Dict[str, str] = ( + {} + ) # Key: f"{table}.{column}", Value: failure_code schema_rules_map = { str(rule.id): rule for rule in atomic_rules if rule.type == RuleType.SCHEMA } - for schema_result in schema_results: - rule_id = str(schema_result.get("rule_id", "")) + for res in schema_results: + rule_id = str(res.get("rule_id", "")) rule = schema_rules_map.get(rule_id) if not rule: continue - - table_name = rule.get_target_info().get("table") - if not table_name: - continue - execution_plan = schema_result.get("execution_plan") or {} - schema_details = execution_plan.get("schema_details") or {} - details = schema_details.get("field_results") or [] + table = rule.get_target_info().get("table", "") + details = ( + res.get("execution_plan", {}) + .get("schema_details", {}) + .get("field_results", []) + ) + for item in details: - col = str(item.get("column")) - code = str(item.get("failure_code", "NONE")) - column_guard[f"{table_name}.{col}"] = code + code = item.get("failure_code") + if code in ("FIELD_MISSING", "TYPE_MISMATCH"): + col = item.get("column") + if col: + schema_failures[f"{table}.{col}"] = code - for r in atomic_rules: - if r.type == RuleType.SCHEMA: - continue - - target_info = r.get_target_info() - table_name = target_info.get("table") - column_name = target_info.get("column") + if not schema_failures: + return {} - if not table_name or not column_name: + for rule in atomic_rules: + if rule.type == RuleType.SCHEMA: continue - guard_key = f"{table_name}.{column_name}" - guard = column_guard.get(guard_key, "NONE") + col = rule.get_target_column() + table = rule.get_target_info().get("table", "") - if guard == "FIELD_MISSING": - mapping[r.id] = {"status": "SKIPPED", "skip_reason": "FIELD_MISSING"} - elif guard == "TYPE_MISMATCH": - mapping[r.id] = {"status": "SKIPPED", "skip_reason": "TYPE_MISMATCH"} + if col and f"{table}.{col}" in schema_failures: + reason = schema_failures[f"{table}.{col}"] + mapping[str(rule.id)] = {"status": "SKIPPED", "skip_reason": reason} return mapping @@ -569,7 +568,7 @@ def _extract_schema_results( } if not schema_rule_ids: return [] - + for r in results: if r is None: continue @@ -581,7 +580,7 @@ def _extract_schema_results( rid = "" elif isinstance(r, dict): rid = str(r.get("rule_id", "")) - + if rid in schema_rule_ids: schema_results.append( r.model_dump() if hasattr(r, "model_dump") else cast(Dict[str, Any], r) @@ -653,7 +652,7 @@ def _failed_records_of(res: Dict[str, Any]) -> int: schema_plan = (schema_result or {}).get("execution_plan", {}) or {} schema_details = schema_plan.get("schema_details", {}) or {} field_results = schema_details.get("field_results", []) or [] - + rule_id = str(schema_result.get("rule_id", "")) rule = schema_rules_map.get(rule_id) table_name = rule.get_target_info().get("table") if rule else "unknown" @@ -712,11 +711,11 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: rule = rule_map.get(rule_id) if not rule or rule.type == RuleType.SCHEMA: continue - + column_name = rule.get_target_column() or "" if not column_name: continue - + table_name = "unknown" if rule.target and rule.target.entities: table_name = rule.target.entities[0].table @@ -728,7 +727,7 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: schema_fields_index[f"{table_name}.{column_name}"] = l_entry else: l_entry["table"] = table_name - + t = rule.type if t == RuleType.NOT_NULL: key = "not_null" @@ -742,13 +741,13 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: key = "date_format" else: key = t.value.lower() - + check = _ensure_check(l_entry, key) check["status"] = str(rd.get("status", "UNKNOWN")) if rule_id in skip_map: check["status"] = skip_map[rule_id]["status"] check["skip_reason"] = skip_map[rule_id]["skip_reason"] - + fr = _failed_records_of(rd) if fr: check["failed_records"] = fr @@ -769,7 +768,10 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: for schema_result in schema_results: try: extras = ( - (schema_result or {}).get("execution_plan", {}).get("schema_details", {}).get("extras", []) + (schema_result or {}) + .get("execution_plan", {}) + .get("schema_details", {}) + .get("extras", []) ) if isinstance(extras, list): schema_extras.extend([str(x) for x in extras]) @@ -887,7 +889,7 @@ def _calc_failed(res: Dict[str, Any]) -> int: if col not in tables_grouped[table_name]: tables_grouped[table_name][col] = {"column": col, "issues": []} - status = str(rd.get("status", "UNKNOWN")) + status: Any = str(rd.get("status", "UNKNOWN")) if rd.get("rule_type") == RuleType.NOT_NULL.value: key = "not_null" elif rd.get("rule_type") == RuleType.RANGE.value: @@ -907,22 +909,23 @@ def _calc_failed(res: Dict[str, Any]) -> int: } ) - all_columns_by_table: Dict[str, set] = {} + all_columns_by_table: Dict[str, List[str]] = {} for rule in atomic_rules: if rule.target and rule.target.entities: table_name = rule.target.entities[0].table if table_name not in all_columns_by_table: - all_columns_by_table[table_name] = set() - + all_columns_by_table[table_name] = [] + if rule.type == RuleType.SCHEMA: if rule.parameters: declared_cols = (rule.parameters.get("columns") or {}).keys() for col in declared_cols: - all_columns_by_table[table_name].add(str(col)) + if str(col) not in all_columns_by_table[table_name]: + all_columns_by_table[table_name].append(str(col)) else: column_name = rule.get_target_column() - if column_name: - all_columns_by_table[table_name].add(column_name) + if column_name and column_name not in all_columns_by_table[table_name]: + all_columns_by_table[table_name].append(column_name) for table_name, columns in all_columns_by_table.items(): if table_name not in tables_grouped: @@ -942,7 +945,7 @@ def _calc_failed(res: Dict[str, Any]) -> int: rule = schema_rules_map.get(rule_id) if not rule: continue - + table_name = rule.get_target_info().get("table") if not table_name or table_name not in tables_grouped: continue @@ -952,6 +955,8 @@ def _calc_failed(res: Dict[str, Any]) -> int: details = schema_details.get("field_results", []) or [] for item in details: col = str(item.get("column")) + if col not in tables_grouped[table_name]: + continue if item.get("failure_code") == "FIELD_MISSING": tables_grouped[table_name][col]["issues"].append( {"check": "missing", "status": "FAILED"} @@ -968,59 +973,84 @@ def _calc_failed(res: Dict[str, Any]) -> int: int(r.get("failed_records", 0) or 0) for r in table_results ) - for table_name in sorted(tables_grouped.keys()): + sorted_tables = sorted(tables_grouped.keys()) + + for table_name in sorted_tables: records = table_records.get(table_name, 0) lines.append(f"\n📋 Table: {table_name} ({records:,} records)") table_grouped = tables_grouped[table_name] - for col in sorted(table_grouped.keys()): - issues = table_grouped[col]["issues"] - - # Consolidate issues to avoid duplicates, prioritizing 'missing' - final_issues = [] - has_missing = any(i.get("check") == "missing" for i in issues) - if has_missing: - final_issues.append({"check": "missing", "status": "FAILED"}) - else: - final_issues.extend(issues) + ordered_columns = all_columns_by_table.get(table_name, []) - critical = [i for i in final_issues if i["status"] in {"FAILED", "ERROR"}] - skipped = [i for i in final_issues if i["status"] == "SKIPPED"] + # Fallback for columns that might appear in results but not in rules + # (e.g., from a different source) + result_columns = sorted(table_grouped.keys()) + for col in result_columns: + if col not in ordered_columns: + ordered_columns.append(col) - if not critical and not skipped: + for col in ordered_columns: + if col not in table_grouped: lines.append(f"✓ {col}: OK") - else: - printed_checks = set() - for i in critical: - check_key = i['check'] - if check_key in printed_checks: continue - printed_checks.add(check_key) + continue + issues = table_grouped[col]["issues"] + + if not issues: + lines.append(f"✓ {col}: OK") + continue + + is_missing = any( + i.get("check") == "missing" or i.get("skip_reason") == "FIELD_MISSING" + for i in issues + ) + + if is_missing: + lines.append(f"✗ {col}: missing (skipped dependent checks)") + continue + + unique_issues: Dict[Tuple[str, str], Dict[str, Any]] = {} + for issue in issues: + key_ = (str(issue.get("status")), str(issue.get("check"))) + if key_ not in unique_issues: + unique_issues[key_] = issue + + final_issues = sorted( + unique_issues.values(), key=lambda x: str(x.get("check")) + ) + + issue_descs: List[str] = [] + for i in final_issues: + status = i.get("status") + check = i.get("check", "unknown") + + if status in {"FAILED", "ERROR"}: fr = i.get("failed_records", 0) - if i["check"] == "missing": - lines.append(f"✗ {col}: missing (skipped dependent checks)") - elif i["status"] == "ERROR": - lines.append(f"✗ {col}: {i['check']} error") + if status == "ERROR": + issue_descs.append(f"{check} error") else: - lines.append(f"✗ {col}: {i['check']} failed ({fr} failures)") - - for i in skipped: - check_key = i.get("skip_reason") - if check_key in printed_checks: continue - printed_checks.add(check_key) - - skip_reason = i.get("skip_reason", "unknown reason") - if skip_reason == "FIELD_MISSING": - lines.append(f"✗ {col}: missing (skipped dependent checks)") - elif skip_reason == "TYPE_MISMATCH": - lines.append(f"✗ {col}: type mismatch (skipped dependent checks)") + issue_descs.append(f"{check} failed ({fr} failures)") + elif status == "SKIPPED": + skip_reason = i.get("skip_reason") + if skip_reason == "TYPE_MISMATCH": + issue_descs.append("type mismatch (skipped dependent checks)") else: - lines.append(f"✗ {col}: {i['check']} skipped ({skip_reason})") + reason_text = skip_reason or "unknown reason" + issue_descs.append(f"{check} skipped ({reason_text})") - total_columns = sum(len(tables_grouped[table]) for table in tables_grouped) + if not issue_descs: + lines.append(f"✓ {col}: OK") + else: + lines.append(f"✗ {col}: { ', '.join(issue_descs)}") + + total_columns = sum(len(all_columns_by_table.get(t, [])) for t in sorted_tables) passed_columns = sum( - sum(1 for col in table_grouped.values() if not col["issues"]) - for table_grouped in tables_grouped.values() + sum( + 1 + for c in all_columns_by_table.get(t, []) + if not tables_grouped.get(t, {}).get(c, {}).get("issues", []) + ) + for t in sorted_tables ) failed_columns = total_columns - passed_columns overall_error_rate = ( @@ -1031,12 +1061,15 @@ def _calc_failed(res: Dict[str, Any]) -> int: if len(tables_grouped) > 1: lines.append("\n📊 Multi-table Summary:") - for table_name in sorted(tables_grouped.keys()): - table_columns = len(tables_grouped[table_name]) + for table_name in sorted_tables: + table_cols = all_columns_by_table.get(table_name, []) + table_columns_count = len(table_cols) table_passed = sum( - 1 for col in tables_grouped[table_name].values() if not col["issues"] + 1 + for c in table_cols + if not tables_grouped[table_name].get(c, {}).get("issues") ) - table_failed = table_columns - table_passed + table_failed = table_columns_count - table_passed lines.append( f" {table_name}: {table_passed} passed, {table_failed} failed" ) diff --git a/shared/database/query_executor.py b/shared/database/query_executor.py index 749a67c..0cd11e6 100644 --- a/shared/database/query_executor.py +++ b/shared/database/query_executor.py @@ -793,9 +793,11 @@ async def get_column_list( if not name: # If column name not found, skip this column with a warning - self.logger.warning(f"Could not determine column name from result: {col}") + self.logger.warning( + f"Could not determine column name from result: {col}" + ) continue - + if not type_: type_ = "unknown" diff --git a/tests/unit/cli/commands/test_schema_command_extended.py b/tests/unit/cli/commands/test_schema_command_extended.py index d3a9eb2..57ded12 100644 --- a/tests/unit/cli/commands/test_schema_command_extended.py +++ b/tests/unit/cli/commands/test_schema_command_extended.py @@ -149,20 +149,23 @@ def test_prioritization_skip_map(self) -> None: atomic_rules = [schema, not_null_email, range_age] # Simulate SCHEMA execution details - schema_result = { - "execution_plan": { - "schema_details": { - "field_results": [ - {"column": "email", "failure_code": "TYPE_MISMATCH"}, - {"column": "age", "failure_code": "FIELD_MISSING"}, - {"column": "id", "failure_code": "NONE"}, - ] - } + schema_results = [ + { + "rule_id": str(schema.id), + "execution_plan": { + "schema_details": { + "field_results": [ + {"column": "email", "failure_code": "TYPE_MISMATCH"}, + {"column": "age", "failure_code": "FIELD_MISSING"}, + {"column": "id", "failure_code": "NONE"}, + ] + } + }, } - } + ] skip_map = _build_prioritized_atomic_status( - schema_result=schema_result, atomic_rules=atomic_rules + schema_results=schema_results, atomic_rules=atomic_rules ) # email dependent rules should be skipped for TYPE_MISMATCH