diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e6c027..820ac1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,49 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed - None -## [0.4.0] - 2025-01-27 +## [0.4.2] - 2025-08-27 + +### Added +- feat(cli): refactor check command interface from positional arguments to `--conn` and `--table` options +- feat(cli): add comprehensive test coverage for new CLI interface functionality +- feat(cli): support explicit table name specification independent of database URL +- feat(schema): add comprehensive multi-table support for schema validation +- feat(schema): support multi-table rules format with table-level configuration options +- feat(schema): add Excel multi-sheet file support as data source +- feat(schema): implement table-grouped output display for multi-table validation results +- feat(schema): add table-level options support (strict_mode, case_insensitive) +- feat(tests): add comprehensive multi-table functionality test coverage +- feat(tests): add multi-table Excel file validation test scenarios + +### Changed +- **BREAKING CHANGE**: CLI interface changed from `vlite check <source>` to `vlite check --conn <connection> --table <table>` +- refactor(cli): update SourceParser to accept optional table_name parameter +- refactor(cli): modify check command to pass table_name to SourceParser.parse_source() +- refactor(tests): update all existing CLI tests to use new interface format +- refactor(tests): add new test cases specifically for table name parameter validation +- refactor(schema): enhance schema command to support both single-table and multi-table formats +- refactor(schema): improve output formatting with table-grouped results display +- refactor(schema): enhance rule decomposition logic for multi-table support +- refactor(data-validator): improve multi-table detection and processing capabilities +- refactor(schema): preserve field order from initial JSON definition instead of alphabetical 
sorting +- refactor(schema): consolidate field validation information display to single line per field + +### Fixed +- fix(cli): resolve issue where `--table` parameter was not correctly passed to backend +- fix(cli): ensure table name from `--table` option takes precedence over table name in database URL +- fix(tests): update regression tests to use new CLI interface format +- fix(tests): resolve test failures caused by interface changes +- fix(schema): resolve multi-table rules validation and type checking issues +- fix(schema): improve table name detection and validation in multi-table scenarios +- fix(schema): enhance error handling for multi-table validation workflows +- fix(schema): ensure schema-only rule fields are not omitted from validation results +- fix(schema): properly display skip conventions for non-existent columns (FIELD_MISSING/TYPE_MISMATCH) + +### Removed +- **BREAKING CHANGE**: remove backward compatibility for old positional argument interface +- remove(cli): eliminate support for `<source>` positional argument in check command + +## [0.4.0] - 2025-08-14 ### Added - feat(cli): add `schema` command skeleton @@ -33,7 +75,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - tests(cli): comprehensive unit tests for `schema` command covering argument parsing, rules file validation, decomposition/mapping, aggregation priority, output formats (table/json), and exit codes (AC satisfied) - tests(core): unit tests for `SCHEMA` rule covering normal/edge/error cases, strict type checks, and mypy compliance - tests(integration): database schema drift tests for MySQL and PostgreSQL (existence, type consistency, strict mode extras, case-insensitive) -- tests(e2e): end-to-end `vlite-cli schema` scenarios on database URLs covering happy path, drift (FIELD_MISSING/TYPE_MISMATCH), strict extras, empty rules minimal payload; JSON and table outputs +- tests(e2e): end-to-end `vlite schema` scenarios on database URLs covering happy path, 
drift (FIELD_MISSING/TYPE_MISMATCH), strict extras, empty rules minimal payload; JSON and table outputs ### Changed - docs: update README and USAGE with schema command overview and detailed usage diff --git a/README.md b/README.md index 2fa8a6e..f336ae6 100644 --- a/README.md +++ b/README.md @@ -1,228 +1,116 @@ # ValidateLite -ValidateLite is a lightweight, zero-config Python CLI tool for validating data quality across files and SQL databases - built for modern data pipelines and CI/CD automation. This python data validation tool is a flexible, extensible command-line tool for automated data quality validation, profiling, and rule-based checks across diverse data sources. Designed for data engineers, analysts, and developers to ensure data reliability and compliance in modern data pipelines. - [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Code Coverage](https://img.shields.io/badge/coverage-80%25-green.svg)](https://github.com/litedatum/validatelite) ---- - -## 📝 Development Blog - -Follow the journey of building ValidateLite through our development blog posts: - -- **[DevLog #1: Building a Zero-Config Data Validation Tool](https://blog.litedatum.com/posts/Devlog01-data-validation-tool/)** - The initial vision and architecture of ValidateLite -- **[DevLog #2: Why I Scrapped My Half-Built Data Validation Platform](https://blog.litedatum.com/posts/Devlog02-Rethinking-My-Data-Validation-Tool/)** - Lessons learned from scope creep and the pivot to a focused CLI tool -- **[Rule-Driven Schema Validation: A Lightweight Solution](https://blog.litedatum.com/posts/Rule-Driven-Schema-Validation/)** - Deep dive into schema drift challenges and how ValidateLite's schema validation provides a lightweight alternative to complex frameworks - ---- - -## 🚀 Quick Start - -### For Regular Users - -**Option 1: Install 
from [PyPI](https://pypi.org/project/validatelite/) (Recommended)** -```bash -pip install validatelite -vlite --help -``` - -**Option 2: Install from pre-built package** -```bash -# Download the latest release from GitHub -pip install validatelite-0.1.0-py3-none-any.whl -vlite --help -``` - -**Option 3: Run from source** -```bash -git clone https://github.com/litedatum/validatelite.git -cd validatelite -pip install -r requirements.txt -python cli_main.py --help -``` - -**Option 4: Install with pip-tools (for development)** -```bash -git clone https://github.com/litedatum/validatelite.git -cd validatelite -pip install pip-tools -pip-compile requirements.in -pip install -r requirements.txt -python cli_main.py --help -``` +**ValidateLite: A lightweight data validation tool for engineers who need answers, fast.** -### For Developers & Contributors +Unlike other complex **data validation tools**, ValidateLite provides two powerful, focused commands for different scenarios: -If you want to contribute to the project or need the latest development version: +* **`vlite check`**: For quick, ad-hoc data checks. Need to verify if a column is unique or not null *right now*? The `check` command gets you an answer in 30 seconds, zero config required. -```bash -git clone https://github.com/litedatum/validatelite.git -cd validatelite - -# Install dependencies (choose one approach) -# Option 1: Install from pinned requirements -pip install -r requirements.txt -pip install -r requirements-dev.txt - -# Option 2: Use pip-tools for development -pip install pip-tools -python scripts/update_requirements.py -pip install -r requirements.txt -pip install -r requirements-dev.txt - -# Install pre-commit hooks -pre-commit install -``` - -See [DEVELOPMENT_SETUP.md](docs/DEVELOPMENT_SETUP.md) for detailed development setup instructions. +* **`vlite schema`**: For robust, repeatable **database schema validation**. It's your best defense against **schema drift**. 
Embed it in your CI/CD and ETL pipelines to enforce data contracts, ensuring data integrity before it becomes a problem. --- -## ✨ Features +## Core Use Case: Automated Schema Validation -- **🔧 Rule-based Data Quality Engine**: Supports completeness, uniqueness, validity, and custom rules -- **🖥️ Extensible CLI**: Easily integrate with CI/CD and automation workflows -- **🗄️ Multi-Source Support**: Validate data from files (CSV, Excel) and databases (MySQL, PostgreSQL, SQLite) -- **⚙️ Configurable & Modular**: Flexible configuration via TOML and environment variables -- **🛡️ Comprehensive Error Handling**: Robust exception and error classification system -- **🧪 Tested & Reliable**: High code coverage, modular tests, and pre-commit hooks -- **📐 Schema Drift Prevention**: Lightweight schema validation that prevents data pipeline failures from unexpected schema changes - a simple alternative to complex validation frameworks +The `vlite schema` command is key to ensuring the stability of your data pipelines. It allows you to quickly verify that a database table or data file conforms to a defined structure. ---- +### Scenario 1: Gate Deployments in CI/CD -## 📖 Documentation +Automatically check for breaking schema changes before they get deployed, preventing production issues caused by unexpected modifications. 
-- **[USAGE.md](docs/USAGE.md)** - Complete user guide with examples and best practices -- Schema command JSON output contract: `docs/schemas/schema_results.schema.json` -- **[DEVELOPMENT_SETUP.md](docs/DEVELOPMENT_SETUP.md)** - Development environment setup and contribution guidelines -- **[CONFIG_REFERENCE.md](docs/CONFIG_REFERENCE.md)** - Configuration file reference -- **[ROADMAP.md](docs/ROADMAP.md)** - Development roadmap and future plans -- **[CHANGELOG.md](CHANGELOG.md)** - Release history and changes +**Example Workflow (`.github/workflows/ci.yml`)** +```yaml +jobs: + validate-db-schema: + name: Validate Database Schema + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 ---- + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' -## 🎯 Basic Usage + - name: Install ValidateLite + run: pip install validatelite -### Validate a CSV file -```bash -vlite check data.csv --rule "not_null(id)" --rule "unique(email)" + - name: Run Schema Validation + run: | + vlite schema --conn "mysql://${{ secrets.DB_USER }}:${{ secrets.DB_PASS }}@${{ secrets.DB_HOST }}/sales" \ + --rules ./schemas/customers_schema.json ``` -### Validate a database table -```bash -vlite check "mysql://user:pass@host:3306/db.table" --rules validation_rules.json +### Scenario 2: Monitor ETL/ELT Pipelines + +Set up validation checkpoints at various stages of your data pipelines to guarantee data quality and avoid "garbage in, garbage out." 
+ +**Example Rule File (`customers_schema.json`)** +```json +{ + "customers": { + "rules": [ + { "field": "id", "type": "integer", "required": true }, + { "field": "name", "type": "string", "required": true }, + { "field": "email", "type": "string", "required": true }, + { "field": "age", "type": "integer", "min": 18, "max": 100 }, + { "field": "gender", "enum": ["Male", "Female", "Other"] }, + { "field": "invalid_col" } + ] + } +} ``` -### Check with verbose output +**Run Command:** ```bash -vlite check data.csv --rules rules.json --verbose -``` - -### Validate against a schema file (single table) -```bash -# Table is derived from the data-source URL, the schema file is single-table in v1 -vlite schema "mysql://user:pass@host:3306/sales.users" --rules schema.json - -# Get aggregated JSON with column-level details (see docs/schemas/schema_results.schema.json) -vlite schema "mysql://.../sales.users" --rules schema.json --output json -``` - -For detailed usage examples and advanced features, see [USAGE.md](docs/USAGE.md). - ---- - -## 🏗️ Project Structure - -``` -validatelite/ -├── cli/ # CLI logic and commands -├── core/ # Rule engine and core validation logic -├── shared/ # Common utilities, enums, exceptions, and schemas -├── config/ # Example and template configuration files -├── tests/ # Unit, integration, and E2E tests -├── scripts/ # Utility scripts -├── docs/ # Documentation -└── examples/ # Usage examples and sample data +vlite schema --conn "mysql://user:pass@host:3306/sales" --rules customers_schema.json ``` --- -## 🧪 Testing +## Quick Start: Ad-Hoc Checks with `check` -### For Regular Users -The project includes comprehensive tests to ensure reliability. If you encounter issues, please check the [troubleshooting section](docs/USAGE.md#error-handling) in the usage guide. +For temporary, one-off validation needs, the `check` command is your best friend. -### For Developers +**1. 
Install (if you haven't already):** ```bash -# Set up test databases (requires Docker) -./scripts/setup_test_databases.sh start - -# Run all tests with coverage -pytest -vv --cov - -# Run specific test categories -pytest tests/unit/ -v # Unit tests only -pytest tests/integration/ -v # Integration tests -pytest tests/e2e/ -v # End-to-end tests +pip install validatelite +``` -# Code quality checks -pre-commit run --all-files +**2. Run a check:** +```bash +# Check for nulls in a CSV file's 'id' column +vlite check --conn "customers.csv" --table customers --rule "not_null(id)" -# Stop test databases when done -./scripts/setup_test_databases.sh stop +# Check for uniqueness in a database table's 'email' column +vlite check --conn "mysql://user:pass@host/db" --table customers --rule "unique(email)" ``` --- -## 🤝 Contributing - -We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) and [Code of Conduct](CODE_OF_CONDUCT.md). +## Learn More -### Development Setup -For detailed development setup instructions, see [DEVELOPMENT_SETUP.md](docs/DEVELOPMENT_SETUP.md). +- **[Usage Guide (USAGE.md)](docs/USAGE.md)**: Learn about all commands, arguments, and advanced features. +- **[Configuration Reference (CONFIG_REFERENCE.md)](docs/CONFIG_REFERENCE.md)**: See how to configure the tool via `toml` files. +- **[Contributing Guide (CONTRIBUTING.md)](CONTRIBUTING.md)**: We welcome contributions! --- -## ❓ FAQ: Why ValidateLite? - -### Q: What is ValidateLite, in one sentence? -A: ValidateLite is a lightweight, zero-config Python CLI tool for data quality validation, profiling, and rule-based checks across CSV files and SQL databases. - -### Q: How is it different from other tools like Great Expectations or Pandera? -A: Unlike heavyweight frameworks, ValidateLite is built for simplicity and speed — no code generation, no DSLs, just one command to validate your data in pipelines or ad hoc scripts. - -### Q: What kind of data sources are supported? 
-A: Currently supports CSV, Excel, and SQL databases (MySQL, PostgreSQL, SQLite) with planned support for more cloud and file-based sources. - -### Q: Who should use this? -A: Data engineers, analysts, and Python developers who want to integrate fast, automated data quality checks into ETL jobs, CI/CD pipelines, or local workflows. - -### Q: Does it require writing Python code? -A: Not at all. You can specify rules inline in the command line or via a simple JSON config file — no coding needed. - -### Q: Is ValidateLite open-source? -A: Yes! It’s licensed under MIT and available on GitHub — stars and contributions are welcome! - -### Q: How can I use it in CI/CD? -A: Just install via pip and add a vlite check ... step in your data pipeline or GitHub Action. It returns exit codes you can use for gating deployments. - ---- +## 📝 Development Blog -## 🔒 Security +Follow the journey of building ValidateLite through our development blog posts: -For security issues, please review [SECURITY.md](SECURITY.md) and follow the recommended process. +- **[DevLog #1: Building a Zero-Config Data Validation Tool](https://blog.litedatum.com/posts/Devlog01-data-validation-tool/)** +- **[DevLog #2: Why I Scrapped My Half-Built Data Validation Platform](https://blog.litedatum.com/posts/Devlog02-Rethinking-My-Data-Validation-Tool/)** +- **[Rule-Driven Schema Validation: A Lightweight Solution](https://blog.litedatum.com/posts/Rule-Driven-Schema-Validation/)** --- ## 📄 License -This project is licensed under the terms of the [MIT License](LICENSE). - ---- - -## 🙏 Acknowledgements - -- Inspired by best practices in data engineering and open-source data quality tools -- Thanks to all contributors and users for their feedback and support +This project is licensed under the [MIT License](LICENSE). 
diff --git a/cli/__init__.py b/cli/__init__.py index 640c839..8bbfd0e 100644 --- a/cli/__init__.py +++ b/cli/__init__.py @@ -2,10 +2,10 @@ ValidateLite CLI Package Command-line interface for the data quality validation tool. -Provides a unified `vlite-cli check` command for data quality checking. +Provides a unified `vlite check` command for data quality checking. """ -__version__ = "0.4.0" +__version__ = "0.4.2" from .app import cli_app diff --git a/cli/app.py b/cli/app.py index eca4c6a..a7c5d90 100644 --- a/cli/app.py +++ b/cli/app.py @@ -2,7 +2,7 @@ CLI Application Entry Point Main CLI application using Click framework. -Provides the unified `vlite-cli check` command for data quality validation. +Provides the unified `vlite check` command for data quality validation. """ import sys @@ -67,8 +67,8 @@ def _setup_logging() -> None: logging.getLogger().setLevel(logging.WARNING) -@click.group(name="vlite-cli", invoke_without_command=True) -@click.version_option(version="0.4.0", prog_name="vlite-cli") +@click.group(name="vlite", invoke_without_command=True) +@click.version_option(version="0.4.2", prog_name="vlite") @click.pass_context def cli_app(ctx: click.Context) -> None: """ @@ -142,16 +142,16 @@ def rules_help() -> None: Usage Examples: # Single rule - vlite-cli check users.csv --rule "not_null(id)" + vlite check --conn users.csv --rule "not_null(id)" # Multiple rules - vlite-cli check users.csv --rule "not_null(id)" --rule "unique(email)" + vlite check --conn users.csv --rule "not_null(id)" --rule "unique(email)" # Rules file - vlite-cli check users.csv --rules validation.json + vlite check --conn users.csv --rules validation.json # Database check - vlite-cli check mysql://user:pass@host/db.users --rule "not_null(id)" + vlite check --conn mysql://user:pass@host/db --table users --rule "not_null(id)" """ safe_echo(help_text) diff --git a/cli/commands/check.py b/cli/commands/check.py index 026272d..cf8c531 100644 --- a/cli/commands/check.py +++ 
b/cli/commands/check.py @@ -1,7 +1,7 @@ """ Check Command Implementation -The core `vlite-cli check` command for data quality validation. +The core `vlite check` command for data quality validation. Supports smart source identification, rule parsing, and formatted output. """ @@ -38,7 +38,13 @@ @click.command("check") -@click.argument("source", required=True) +@click.option( + "--conn", + "connection_string", + required=True, + help="Database connection string or file path", +) +@click.option("--table", "table_name", required=True, help="Table name to validate") @click.option( "--rule", "rules", @@ -59,7 +65,8 @@ help="Show detailed information and failure samples", ) def check_command( - source: str, + connection_string: str, + table_name: str, rules: Tuple[str, ...], rules_file: Optional[str], quiet: bool, @@ -68,18 +75,22 @@ def check_command( """ Check data quality for the given source. + NEW FORMAT: + vlite check --conn --table [options] + SOURCE can be: - File path: users.csv, data.xlsx, records.json - - Database URL: mysql://user:pass@host/db.table + - Database URL: mysql://user:pass@host/db - SQLite file: sqlite:///path/to/file.db Examples: - vlite-cli check users.csv --rule "not_null(id)" - vlite-cli check mysql://user:pass@host/db.users --rules validation.json + vlite check --conn users.csv --table users --rule "not_null(id)" + vlite check --conn mysql://user:pass@host/db \ + --table users --rules validation.json """ # Record start time start_time = now() - logger.info(f"Starting data quality check for: {source}") + logger.info(f"Starting data quality check for: {connection_string}") # Create exception handler exception_handler = CliExceptionHandler(verbose=verbose) @@ -111,23 +122,23 @@ def check_command( ) # Parse source - safe_echo(f"🔍 Analyzing source: {source}") + safe_echo(f"🔍 Analyzing source: {connection_string}") # Proactively verify that a provided file is not empty – this avoids # kicking off heavy validation logic only to discover the file is 
# useless. The modern test-suite expects a graceful early-exit with a # clear error message in such a scenario. - potential_path = Path(source) + potential_path = Path(connection_string) if potential_path.exists() and potential_path.is_file(): if potential_path.stat().st_size == 0: raise click.ClickException( - f"Error: Source file '{source}' is empty " + f"Error: Source file '{connection_string}' is empty " "– nothing to validate." ) # Parse source config - this may raise Schema creation error # (OperationError) - source_config = source_parser.parse_source(source) + source_config = source_parser.parse_source(connection_string, table_name) # Parse rules - this may raise Schema creation error # (RuleExecutionError) @@ -205,7 +216,7 @@ def check_command( output_formatter.display_results( results=results_dicts, rules=rule_configs, # Pass as objects, not dicts - source=source, + source=connection_string, execution_time=execution_time, total_rules=len(rule_configs), ) @@ -248,7 +259,7 @@ def check_command( output_formatter.display_results( results=results_dicts, rules=rule_configs, # Pass as objects, not dicts - source=source, + source=connection_string, execution_time=execution_time, total_rules=len(rule_configs), ) @@ -289,17 +300,17 @@ def rules_help_command() -> None: enum(column,value1,value2...) 
- Check allowed enum values EXAMPLES: - vlite-cli check users.csv --rule "not_null(id)" - vlite-cli check users.csv --rule "length(name,2,50)" - vlite-cli check users.csv --rule "unique(email)" - vlite-cli check users.csv --rule "range(age,18,65)" - vlite-cli check users.csv --rule "regex(email,^[\\w.-]+@[\\w.-]+\\.[a-zA-Z]{2,}$)" + vlite check users.csv --rule "not_null(id)" + vlite check users.csv --rule "length(name,2,50)" + vlite check users.csv --rule "unique(email)" + vlite check users.csv --rule "range(age,18,65)" + vlite check users.csv --rule "regex(email,^[\\w.-]+@[\\w.-]+\\.[a-zA-Z]{2,}$)" MULTIPLE RULES: - vlite-cli check users.csv --rule "not_null(id)" --rule "unique(email)" + vlite check users.csv --rule "not_null(id)" --rule "unique(email)" RULES FILE: - vlite-cli check users.csv --rules validation.json + vlite check users.csv --rules validation.json Example validation.json: { diff --git a/cli/commands/schema.py b/cli/commands/schema.py index 16b6afb..122205c 100644 --- a/cli/commands/schema.py +++ b/cli/commands/schema.py @@ -1,8 +1,8 @@ """ Schema Command -Adds `vlite-cli schema` command that parses parameters, performs minimal rules -file validation (single-table only, no jsonschema), and prints placeholder +Adds `vlite schema` command that parses parameters, performs minimal rules +file validation (supports both single-table and multi-table formats), and prints output aligned with the existing CLI style. 
""" @@ -19,6 +19,7 @@ from shared.enums import RuleAction, RuleCategory, RuleType, SeverityLevel from shared.enums.data_types import DataType from shared.schema.base import RuleTarget, TargetEntity +from shared.schema.connection_schema import ConnectionSchema from shared.schema.rule_schema import RuleSchema from shared.utils.console import safe_echo from shared.utils.datetime_utils import now as _now @@ -37,88 +38,135 @@ } -def _validate_rules_payload(payload: Any) -> Tuple[List[str], int]: - """Validate the minimal structure of the schema rules file. +def _validate_multi_table_rules_payload(payload: Any) -> Tuple[List[str], int]: + """Validate the structure of multi-table schema rules file. - This performs non-jsonschema checks: - - Top-level must be an object with a `rules` array - - Warn and ignore top-level `table` if present - - Validate each rule item fields and types: - - field: required str - - type: optional str in allowed set - - required: optional bool - - enum: optional list - - min/max: optional numeric (int or float) + Multi-table format: + { + "table1": { + "rules": [...], + "strict_mode": true + }, + "table2": { + "rules": [...] 
+ } + } Returns: - warnings, rules_count - - Raises: - click.UsageError: if structure or types are invalid + warnings, total_rules_count """ warnings: List[str] = [] + total_rules = 0 if not isinstance(payload, dict): - raise click.UsageError("Rules file must be a JSON object with a 'rules' array") + raise click.UsageError("Rules file must be a JSON object") - if "table" in payload: - warnings.append( - "Top-level 'table' is ignored; table is derived from data-source" - ) + # Check if this is a multi-table format (has table names as keys) + table_names = [key for key in payload.keys() if key != "rules"] - if "tables" in payload: - # Explicitly reject multi-table format in v1 - raise click.UsageError( - "'tables' is not supported in v1; use single-table 'rules' only" - ) + if table_names: + # Multi-table format + for table_name in table_names: + table_schema = payload[table_name] + if not isinstance(table_schema, dict): + raise click.UsageError(f"Table '{table_name}' schema must be an object") + + table_rules = table_schema.get("rules") + if not isinstance(table_rules, list): + raise click.UsageError( + f"Table '{table_name}' must have a 'rules' array" + ) - rules = payload.get("rules") - if not isinstance(rules, list): - raise click.UsageError("'rules' must be an array") + # Validate each rule in this table + for idx, item in enumerate(table_rules): + if not isinstance(item, dict): + raise click.UsageError( + f"Table '{table_name}' rules[{idx}] must be an object" + ) - for idx, item in enumerate(rules): - if not isinstance(item, dict): - raise click.UsageError(f"rules[{idx}] must be an object") + # Validate rule fields + _validate_single_rule_item(item, f"Table '{table_name}' rules[{idx}]") - # field - field_name = item.get("field") - if not isinstance(field_name, str) or not field_name: - raise click.UsageError(f"rules[{idx}].field must be a non-empty string") + total_rules += len(table_rules) - # type - if "type" in item: - type_name = item["type"] - if not 
isinstance(type_name, str): + # Validate optional table-level switches + if "strict_mode" in table_schema and not isinstance( + table_schema["strict_mode"], bool + ): raise click.UsageError( - f"rules[{idx}].type must be a string when provided" + f"Table '{table_name}' strict_mode must be a boolean" ) - if type_name.lower() not in _ALLOWED_TYPE_NAMES: - allowed = ", ".join(sorted(_ALLOWED_TYPE_NAMES)) + if "case_insensitive" in table_schema and not isinstance( + table_schema["case_insensitive"], bool + ): raise click.UsageError( - f"rules[{idx}].type '{type_name}' is not supported. " - f"Allowed: {allowed}" + f"Table '{table_name}' case_insensitive must be a boolean" ) + else: + # Single-table format (backward compatibility) + warnings.append( + "Single-table format detected; consider using multi-table format for " + "better organization" + ) + if "rules" not in payload: + raise click.UsageError("Single-table format must have a 'rules' array") + + rules = payload["rules"] + if not isinstance(rules, list): + raise click.UsageError("'rules' must be an array") + + for idx, item in enumerate(rules): + if not isinstance(item, dict): + raise click.UsageError(f"rules[{idx}] must be an object") + _validate_single_rule_item(item, f"rules[{idx}]") - # required - if "required" in item and not isinstance(item["required"], bool): + total_rules = len(rules) + + return warnings, total_rules + + +def _validate_single_rule_item(item: Dict[str, Any], context: str) -> None: + """Validate a single rule item from the rules array.""" + # field + field_name = item.get("field") + if not isinstance(field_name, str) or not field_name: + raise click.UsageError(f"{context}.field must be a non-empty string") + + # type + if "type" in item: + type_name = item["type"] + if not isinstance(type_name, str): + raise click.UsageError(f"{context}.type must be a string when provided") + if type_name.lower() not in _ALLOWED_TYPE_NAMES: + allowed = ", ".join(sorted(_ALLOWED_TYPE_NAMES)) raise 
click.UsageError( - f"rules[{idx}].required must be a boolean when provided" + f"{context}.type '{type_name}' is not supported. " f"Allowed: {allowed}" ) - # enum - if "enum" in item and not isinstance(item["enum"], list): - raise click.UsageError(f"rules[{idx}].enum must be an array when provided") + # required + if "required" in item and not isinstance(item["required"], bool): + raise click.UsageError(f"{context}.required must be a boolean when provided") + + # enum + if "enum" in item and not isinstance(item["enum"], list): + raise click.UsageError(f"{context}.enum must be an array when provided") + + # min/max + for bound_key in ("min", "max"): + if bound_key in item: + value = item[bound_key] + if not isinstance(value, (int, float)): + raise click.UsageError( + f"{context}.{bound_key} must be numeric when provided" + ) - # min/max - for bound_key in ("min", "max"): - if bound_key in item: - value = item[bound_key] - if not isinstance(value, (int, float)): - raise click.UsageError( - f"rules[{idx}].{bound_key} must be numeric when provided" - ) - return warnings, len(rules) +def _validate_rules_payload(payload: Any) -> Tuple[List[str], int]: + """Validate the minimal structure of the schema rules file. + + This performs non-jsonschema checks for both single-table and multi-table formats. + """ + return _validate_multi_table_rules_payload(payload) def _map_type_name_to_datatype(type_name: str) -> DataType: @@ -200,16 +248,73 @@ def _create_rule_schema( ) -def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: - """Decompose schema JSON payload into atomic RuleSchema objects. +def _decompose_schema_payload( + payload: Dict[str, Any], source_config: ConnectionSchema +) -> List[RuleSchema]: + """Decompose a schema payload into atomic RuleSchema objects. + + This function handles both single-table and multi-table formats in a + source-agnostic way. 
+ """ + all_atomic_rules: List[RuleSchema] = [] + source_db = source_config.db_name or "unknown" + + is_multi_table_format = "rules" not in payload + + if is_multi_table_format: + tables_in_rules = list(payload.keys()) + available_tables_from_source = set(source_config.available_tables or []) + + for table_name in tables_in_rules: + if ( + available_tables_from_source + and table_name not in available_tables_from_source + ): + logger.warning( + f"Skipping rules for table '{table_name}' as it is not available " + "in the source." + ) + continue + + table_schema = payload[table_name] + if not isinstance(table_schema, dict): + logger.warning( + f"Definition for table '{table_name}' is not a valid object, " + "skipping." + ) + continue + + table_rules = _decompose_single_table_schema( + table_schema, source_db, table_name + ) + all_atomic_rules.extend(table_rules) + else: + table_name = "unknown" + if source_config.available_tables: + table_name = source_config.available_tables[0] + else: + logger.warning( + "Could not determine table name for single-table schema. " + "Consider using multi-table format for database sources." + ) + + table_rules = _decompose_single_table_schema(payload, source_db, table_name) + all_atomic_rules.extend(table_rules) + + return all_atomic_rules + + +def _decompose_single_table_schema( + table_schema: Dict[str, Any], source_db: str, table_name: str +) -> List[RuleSchema]: + """Decompose a single table's schema definition into atomic RuleSchema objects. 
- Rules per item: - - type -> contributes to table-level SCHEMA columns mapping - - required -> NOT_NULL(column) - - min/max -> RANGE(column, min_value/max_value) - - enum -> ENUM(column, allowed_values) + Args: + table_schema: The schema definition for a single table + source_db: Database name from source + table_name: Name of the table being validated """ - rules_arr = payload.get("rules", []) + rules_arr = table_schema.get("rules", []) # Build SCHEMA columns mapping first columns_map: Dict[str, Dict[str, Any]] = {} @@ -275,66 +380,80 @@ def _decompose_to_atomic_rules(payload: Dict[str, Any]) -> List[RuleSchema]: # Create one table-level SCHEMA rule if any columns were declared if columns_map: schema_params: Dict[str, Any] = {"columns": columns_map} - # Optional switches at top-level - if isinstance(payload.get("strict_mode"), bool): - schema_params["strict_mode"] = payload["strict_mode"] - if isinstance(payload.get("case_insensitive"), bool): - schema_params["case_insensitive"] = payload["case_insensitive"] + # Optional switches at table level + if isinstance(table_schema.get("strict_mode"), bool): + schema_params["strict_mode"] = table_schema["strict_mode"] + if isinstance(table_schema.get("case_insensitive"), bool): + schema_params["case_insensitive"] = table_schema["case_insensitive"] atomic_rules.insert( 0, _create_rule_schema( - name="schema", + name=f"schema_{table_name}", rule_type=RuleType.SCHEMA, column=None, parameters=schema_params, - description="CLI: table schema existence+type", + description=f"CLI: table schema existence+type for {table_name}", ), ) + # Set the target table and database for all rules + for rule in atomic_rules: + if rule.target and rule.target.entities: + rule.target.entities[0].database = source_db + rule.target.entities[0].table = table_name + return atomic_rules def _build_prioritized_atomic_status( *, - schema_result: Dict[str, Any] | None, + schema_results: List[Dict[str, Any]], atomic_rules: List[RuleSchema], ) -> Dict[str, 
Dict[str, str]]: - """Return a mapping rule_id -> {status, skip_reason} applying prioritization. + """Return a mapping rule_id -> {status, skip_reason} applying prioritization.""" + mapping: Dict[str, Dict[str, str]] = {} + schema_failures: Dict[str, str] = ( + {} + ) # Key: f"{table}.{column}", Value: failure_code - Prioritization per column: - 1) If field missing → mark SCHEMA for that field as FAILED (implicit) and all - dependent rules (NOT_NULL/RANGE/ENUM) as SKIPPED (reason FIELD_MISSING). - 2) If type mismatch → mark dependent rules as SKIPPED (reason TYPE_MISMATCH). - 3) Otherwise, leave dependent rules to their engine-evaluated status. + schema_rules_map = { + str(rule.id): rule for rule in atomic_rules if rule.type == RuleType.SCHEMA + } - We infer per-column status from schema_result.execution_plan.schema_details. - """ - mapping: Dict[str, Dict[str, str]] = {} + for res in schema_results: + rule_id = str(res.get("rule_id", "")) + rule = schema_rules_map.get(rule_id) + if not rule: + continue - # Build per-column guard from SCHEMA details - column_guard: Dict[str, str] = {} # column -> NONE|FIELD_MISSING|TYPE_MISMATCH - if schema_result: + table = rule.get_target_info().get("table", "") details = ( - schema_result.get("execution_plan", {}) + res.get("execution_plan", {}) .get("schema_details", {}) .get("field_results", []) ) + for item in details: - col = str(item.get("column")) - code = str(item.get("failure_code", "NONE")) - column_guard[col] = code + code = item.get("failure_code") + if code in ("FIELD_MISSING", "TYPE_MISMATCH"): + col = item.get("column") + if col: + schema_failures[f"{table}.{col}"] = code - # Apply skip to dependent rules - for r in atomic_rules: - if r.type == RuleType.SCHEMA: + if not schema_failures: + return {} + + for rule in atomic_rules: + if rule.type == RuleType.SCHEMA: continue - column = r.get_target_column() or "" - guard = column_guard.get(column, "NONE") - if guard == "FIELD_MISSING": - mapping[r.id] = {"status": 
"SKIPPED", "skip_reason": "FIELD_MISSING"} - elif guard == "TYPE_MISMATCH": - mapping[r.id] = {"status": "SKIPPED", "skip_reason": "TYPE_MISMATCH"} + + col = rule.get_target_column() + table = rule.get_target_info().get("table", "") + + if col and f"{table}.{col}" in schema_failures: + reason = schema_failures[f"{table}.{col}"] + mapping[str(rule.id)] = {"status": "SKIPPED", "skip_reason": reason} return mapping @@ -370,9 +489,11 @@ def _read_rules_payload(rules_file: str) -> Dict[str, Any]: return cast(Dict[str, Any], payload) -def _emit_warnings(warnings: List[str]) -> None: - for msg in warnings: - _safe_echo(f"⚠️ Warning: {msg}", err=True) +def _emit_warnings(warnings: List[str], output: str = "table") -> None: + """Emit warnings only for non-JSON output to avoid polluting JSON output.""" + if output.lower() != "json": + for msg in warnings: + _safe_echo(f"⚠️ Warning: {msg}", err=True) def _early_exit_when_no_rules( @@ -417,54 +538,62 @@ def _create_validator( core_config=core_config, cli_config=cli_config, ) - except TypeError: - return DataValidator() # type: ignore[call-arg] + except Exception as e: + logger.error(f"Failed to create DataValidator: {str(e)}") + raise click.UsageError(f"Failed to create validator: {str(e)}") def _run_validation(validator: Any) -> Tuple[List[Any], float]: import asyncio start = _now() - results = asyncio.run(validator.validate()) + logger.debug("Starting validation") + try: + results = asyncio.run(validator.validate()) + logger.debug(f"Validation returned {len(results)} results") + except Exception as e: + logger.error(f"Validation failed: {str(e)}") + raise exec_seconds = (_now() - start).total_seconds() return results, exec_seconds -def _extract_schema_result_dict( +def _extract_schema_results( *, atomic_rules: List[RuleSchema], results: List[Any] -) -> Dict[str, Any] | None: - try: - schema_rule = next( - (rule for rule in atomic_rules if rule.type == RuleType.SCHEMA), None - ) - if not schema_rule: - return None - for r in 
results: - rid = "" - if hasattr(r, "rule_id"): - try: - rid = str(getattr(r, "rule_id")) - except Exception: - rid = "" - elif isinstance(r, dict): - rid = str(r.get("rule_id", "")) - if rid == str(schema_rule.id): - return ( - r.model_dump() - if hasattr(r, "model_dump") - else cast(Dict[str, Any], r) - ) - return None - except Exception: - return None +) -> List[Dict[str, Any]]: + """Extract all SCHEMA rule results from the list of validation results.""" + schema_results = [] + schema_rule_ids = { + str(rule.id) for rule in atomic_rules if rule.type == RuleType.SCHEMA + } + if not schema_rule_ids: + return [] + + for r in results: + if r is None: + continue + rid = "" + if hasattr(r, "rule_id"): + try: + rid = str(getattr(r, "rule_id")) + except Exception: + rid = "" + elif isinstance(r, dict): + rid = str(r.get("rule_id", "")) + + if rid in schema_rule_ids: + schema_results.append( + r.model_dump() if hasattr(r, "model_dump") else cast(Dict[str, Any], r) + ) + return schema_results def _compute_skip_map( - *, atomic_rules: List[RuleSchema], schema_result_dict: Dict[str, Any] | None + *, atomic_rules: List[RuleSchema], schema_results: List[Dict[str, Any]] ) -> Dict[str, Dict[str, str]]: try: return _build_prioritized_atomic_status( - schema_result=schema_result_dict, atomic_rules=atomic_rules + schema_results=schema_results, atomic_rules=atomic_rules ) except Exception: return {} @@ -477,7 +606,7 @@ def _emit_json_output( atomic_rules: List[RuleSchema], results: List[Any], skip_map: Dict[str, Dict[str, str]], - schema_result_dict: Dict[str, Any] | None, + schema_results: List[Dict[str, Any]], exec_seconds: float, ) -> None: enriched_results: List[Dict[str, Any]] = [] @@ -515,14 +644,24 @@ def _failed_records_of(res: Dict[str, Any]) -> int: fields: List[Dict[str, Any]] = [] schema_fields_index: Dict[str, Dict[str, Any]] = {} - if schema_result_dict: - schema_plan = (schema_result_dict or {}).get("execution_plan", {}) or {} + schema_rules_map = { + str(rule.id): 
rule for rule in atomic_rules if rule.type == RuleType.SCHEMA + } + + for schema_result in schema_results: + schema_plan = (schema_result or {}).get("execution_plan", {}) or {} schema_details = schema_plan.get("schema_details", {}) or {} field_results = schema_details.get("field_results", []) or [] + + rule_id = str(schema_result.get("rule_id", "")) + rule = schema_rules_map.get(rule_id) + table_name = rule.get_target_info().get("table") if rule else "unknown" + for item in field_results: col_name = str(item.get("column")) entry: Dict[str, Any] = { "column": col_name, + "table": table_name, "checks": { "existence": { "status": item.get("existence", "UNKNOWN"), @@ -535,25 +674,25 @@ def _failed_records_of(res: Dict[str, Any]) -> int: }, } fields.append(entry) - schema_fields_index[col_name] = entry - - schema_rule = next( - (rule for rule in atomic_rules if rule.type == RuleType.SCHEMA), None - ) - if schema_rule: - params = schema_rule.parameters or {} - declared_cols = (params.get("columns") or {}).keys() - for col in declared_cols: - if str(col) not in schema_fields_index: - entry = { - "column": str(col), - "checks": { - "existence": {"status": "UNKNOWN", "failure_code": "NONE"}, - "type": {"status": "UNKNOWN", "failure_code": "NONE"}, - }, - } - fields.append(entry) - schema_fields_index[str(col)] = entry + schema_fields_index[f"{table_name}.{col_name}"] = entry + + for rule in atomic_rules: + if rule.type == RuleType.SCHEMA: + params = rule.parameters or {} + declared_cols = (params.get("columns") or {}).keys() + table_name = rule.get_target_info().get("table") + for col in declared_cols: + if f"{table_name}.{str(col)}" not in schema_fields_index: + entry = { + "column": str(col), + "table": table_name, + "checks": { + "existence": {"status": "UNKNOWN", "failure_code": "NONE"}, + "type": {"status": "UNKNOWN", "failure_code": "NONE"}, + }, + } + fields.append(entry) + schema_fields_index[f"{table_name}.{str(col)}"] = entry def _ensure_check(entry: Dict[str, 
Any], name: str) -> Dict[str, Any]: checks: Dict[str, Dict[str, Any]] = entry.setdefault("checks", {}) @@ -572,14 +711,23 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: rule = rule_map.get(rule_id) if not rule or rule.type == RuleType.SCHEMA: continue + column_name = rule.get_target_column() or "" if not column_name: continue - l_entry = schema_fields_index.get(column_name) + + table_name = "unknown" + if rule.target and rule.target.entities: + table_name = rule.target.entities[0].table + + l_entry = schema_fields_index.get(f"{table_name}.{column_name}") if not l_entry: - l_entry = {"column": column_name, "checks": {}} + l_entry = {"column": column_name, "table": table_name, "checks": {}} fields.append(l_entry) - schema_fields_index[column_name] = l_entry + schema_fields_index[f"{table_name}.{column_name}"] = l_entry + else: + l_entry["table"] = table_name + t = rule.type if t == RuleType.NOT_NULL: key = "not_null" @@ -593,11 +741,13 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: key = "date_format" else: key = t.value.lower() + check = _ensure_check(l_entry, key) check["status"] = str(rd.get("status", "UNKNOWN")) if rule_id in skip_map: check["status"] = skip_map[rule_id]["status"] check["skip_reason"] = skip_map[rule_id]["skip_reason"] + fr = _failed_records_of(rd) if fr: check["failed_records"] = fr @@ -615,18 +765,18 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: total_failed_records = sum(_failed_records_of(r) for r in enriched_results) schema_extras: List[str] = [] - if schema_result_dict: + for schema_result in schema_results: try: extras = ( - (schema_result_dict.get("execution_plan") or {}).get( - "schema_details", {} - ) - or {} - ).get("extras", []) + (schema_result or {}) + .get("execution_plan", {}) + .get("schema_details", {}) + .get("extras", []) + ) if isinstance(extras, list): - schema_extras = [str(x) for x in extras] + schema_extras.extend([str(x) for x in extras]) 
except Exception: - schema_extras = [] + pass payload: Dict[str, Any] = { "status": "ok", @@ -645,7 +795,7 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]: "fields": fields, } if schema_extras: - payload["schema_extras"] = sorted(schema_extras) + payload["schema_extras"] = sorted(list(set(schema_extras))) _safe_echo(json.dumps(payload, default=str)) @@ -655,7 +805,7 @@ def _emit_table_output( atomic_rules: List[RuleSchema], results: List[Any], skip_map: Dict[str, Dict[str, str]], - schema_result_dict: Dict[str, Any] | None, + schema_results: List[Dict[str, Any]], exec_seconds: float, ) -> None: rule_map = {str(rule.id): rule for rule in atomic_rules} @@ -691,14 +841,21 @@ def _dataset_total(res: Dict[str, Any]) -> int: rd["rule_type"] = rule.type.value rd["column_name"] = rule.get_target_column() rd.setdefault("rule_name", rule.name) + if rule.target and rule.target.entities: + rd["table_name"] = rule.target.entities[0].table if rid in skip_map: rd["status"] = skip_map[rid]["status"] rd["skip_reason"] = skip_map[rid]["skip_reason"] table_results.append(rd) - header_total_records = 0 + table_records: Dict[str, int] = {} for rd in table_results: - header_total_records = max(header_total_records, _dataset_total(rd)) + table_name = rd.get("table_name", "unknown") + total = _dataset_total(rd) + if total > 0: + table_records[table_name] = max(table_records.get(table_name, 0), total) + + header_total_records = sum(table_records.values()) def _calc_failed(res: Dict[str, Any]) -> int: if isinstance(res.get("failed_records"), int): @@ -718,103 +875,182 @@ def _calc_failed(res: Dict[str, Any]) -> int: if "total_records" not in rd: rd["total_records"] = _dataset_total(rd) - column_guard: Dict[str, str] = {} - if schema_result_dict: - details = ( - schema_result_dict.get("execution_plan", {}) - .get("schema_details", {}) - .get("field_results", []) - ) - for item in details: - col = str(item.get("column")) - column_guard[col] = 
str(item.get("failure_code", "NONE")) - - grouped: Dict[str, Dict[str, Any]] = {} - schema_rule = next((r for r in atomic_rules if r.type == RuleType.SCHEMA), None) - declared_cols: List[str] = [] - if schema_rule: - params = schema_rule.parameters or {} - declared_cols = list((params.get("columns") or {}).keys()) - for col in declared_cols: - grouped[str(col)] = {"column": str(col), "issues": []} + tables_grouped: Dict[str, Dict[str, Dict[str, Any]]] = {} for rd in table_results: - rid = str(rd.get("rule_id", "")) - rule = rule_map.get(rid) - if not rule or rule.type == RuleType.SCHEMA: - continue - col = rule.get_target_column() or "" - if not col: + if rd.get("rule_type") == RuleType.SCHEMA.value: continue - entry = grouped.setdefault(col, {"column": col, "issues": []}) - status = str(rd.get("status", "UNKNOWN")) - if rule.type == RuleType.NOT_NULL: - key = "not_null" - elif rule.type == RuleType.RANGE: - key = "range" - elif rule.type == RuleType.ENUM: - key = "enum" - elif rule.type == RuleType.REGEX: - key = "regex" - elif rule.type == RuleType.DATE_FORMAT: - key = "date_format" - else: - key = rule.type.value.lower() - if column_guard.get(col) == "FIELD_MISSING": + table_name = rd.get("table_name", "unknown") + if table_name not in tables_grouped: + tables_grouped[table_name] = {} + + col = rd.get("column_name", "") + if col: + if col not in tables_grouped[table_name]: + tables_grouped[table_name][col] = {"column": col, "issues": []} + + status: Any = str(rd.get("status", "UNKNOWN")) + if rd.get("rule_type") == RuleType.NOT_NULL.value: + key = "not_null" + elif rd.get("rule_type") == RuleType.RANGE.value: + key = "range" + elif rd.get("rule_type") == RuleType.ENUM.value: + key = "enum" + else: + key = rd.get("rule_type", "unknown").lower() + + if status in {"FAILED", "ERROR", "SKIPPED"}: + tables_grouped[table_name][col]["issues"].append( + { + "check": key, + "status": status, + "failed_records": int(rd.get("failed_records", 0) or 0), + "skip_reason": 
rd.get("skip_reason"), + } + ) + + all_columns_by_table: Dict[str, List[str]] = {} + for rule in atomic_rules: + if rule.target and rule.target.entities: + table_name = rule.target.entities[0].table + if table_name not in all_columns_by_table: + all_columns_by_table[table_name] = [] + + if rule.type == RuleType.SCHEMA: + if rule.parameters: + declared_cols = (rule.parameters.get("columns") or {}).keys() + for col in declared_cols: + if str(col) not in all_columns_by_table[table_name]: + all_columns_by_table[table_name].append(str(col)) + else: + column_name = rule.get_target_column() + if column_name and column_name not in all_columns_by_table[table_name]: + all_columns_by_table[table_name].append(column_name) + + for table_name, columns in all_columns_by_table.items(): + if table_name not in tables_grouped: + tables_grouped[table_name] = {} + for column_name in columns: + if column_name not in tables_grouped[table_name]: + tables_grouped[table_name][column_name] = { + "column": column_name, + "issues": [], + } + + schema_rules_map = { + str(rule.id): rule for rule in atomic_rules if rule.type == RuleType.SCHEMA + } + for schema_result in schema_results: + rule_id = str(schema_result.get("rule_id", "")) + rule = schema_rules_map.get(rule_id) + if not rule: continue - if column_guard.get(col) == "TYPE_MISMATCH" and key in { - "not_null", - "range", - "enum", - "regex", - "date_format", - }: + + table_name = rule.get_target_info().get("table") + if not table_name or table_name not in tables_grouped: continue - if status in {"FAILED", "ERROR", "SKIPPED"}: - entry["issues"].append( - { - "check": key, - "status": status, - "failed_records": int(rd.get("failed_records", 0) or 0), - "skip_reason": skip_map.get(rid, {}).get("skip_reason"), - } - ) + + execution_plan = schema_result.get("execution_plan") or {} + schema_details = execution_plan.get("schema_details", {}) or {} + details = schema_details.get("field_results", []) or [] + for item in details: + col = 
str(item.get("column")) + if col not in tables_grouped[table_name]: + continue + if item.get("failure_code") == "FIELD_MISSING": + tables_grouped[table_name][col]["issues"].append( + {"check": "missing", "status": "FAILED"} + ) + elif item.get("failure_code") == "TYPE_MISMATCH": + tables_grouped[table_name][col]["issues"].append( + {"check": "type", "status": "FAILED"} + ) lines: List[str] = [] - lines.append(f"✓ Checking {source} ({header_total_records:,} records)") + lines.append(f"✓ Checking {source}") total_failed_records = sum( int(r.get("failed_records", 0) or 0) for r in table_results ) - for col in sorted(grouped.keys()): - guard = column_guard.get(col, "NONE") - if guard == "FIELD_MISSING": - lines.append(f"✗ {col}: missing (skipped dependent checks)") - continue - if guard == "TYPE_MISMATCH": - lines.append(f"✗ {col}: type mismatch (skipped dependent checks)") - continue - issues = grouped[col]["issues"] - critical = [i for i in issues if i["status"] in {"FAILED", "ERROR"}] - if not critical: - lines.append(f"✓ {col}: OK") - else: - for i in critical: - fr = i.get("failed_records") or 0 - if i["status"] == "ERROR": - lines.append(f"✗ {col}: {i['check']} error") - else: - lines.append(f"✗ {col}: {i['check']} failed ({fr} failures)") - - total_columns = len(grouped) + sorted_tables = sorted(tables_grouped.keys()) + + for table_name in sorted_tables: + records = table_records.get(table_name, 0) + lines.append(f"\n📋 Table: {table_name} ({records:,} records)") + + table_grouped = tables_grouped[table_name] + ordered_columns = all_columns_by_table.get(table_name, []) + + # Fallback for columns that might appear in results but not in rules + # (e.g., from a different source) + result_columns = sorted(table_grouped.keys()) + for col in result_columns: + if col not in ordered_columns: + ordered_columns.append(col) + + for col in ordered_columns: + if col not in table_grouped: + lines.append(f"✓ {col}: OK") + continue + + issues = table_grouped[col]["issues"] + + 
if not issues: + lines.append(f"✓ {col}: OK") + continue + + is_missing = any( + i.get("check") == "missing" or i.get("skip_reason") == "FIELD_MISSING" + for i in issues + ) + + if is_missing: + lines.append(f"✗ {col}: missing (skipped dependent checks)") + continue + + unique_issues: Dict[Tuple[str, str], Dict[str, Any]] = {} + for issue in issues: + key_ = (str(issue.get("status")), str(issue.get("check"))) + if key_ not in unique_issues: + unique_issues[key_] = issue + + final_issues = sorted( + unique_issues.values(), key=lambda x: str(x.get("check")) + ) + + issue_descs: List[str] = [] + for i in final_issues: + status = i.get("status") + check = i.get("check", "unknown") + + if status in {"FAILED", "ERROR"}: + fr = i.get("failed_records", 0) + if status == "ERROR": + issue_descs.append(f"{check} error") + else: + issue_descs.append(f"{check} failed ({fr} failures)") + elif status == "SKIPPED": + skip_reason = i.get("skip_reason") + if skip_reason == "TYPE_MISMATCH": + issue_descs.append("type mismatch (skipped dependent checks)") + else: + reason_text = skip_reason or "unknown reason" + issue_descs.append(f"{check} skipped ({reason_text})") + + if not issue_descs: + lines.append(f"✓ {col}: OK") + else: + lines.append(f"✗ {col}: { ', '.join(issue_descs)}") + + total_columns = sum(len(all_columns_by_table.get(t, [])) for t in sorted_tables) passed_columns = sum( - 1 - for col in grouped - if column_guard.get(col, "NONE") == "NONE" - and not [ - i for i in grouped[col]["issues"] if i["status"] in {"FAILED", "ERROR"} - ] + sum( + 1 + for c in all_columns_by_table.get(t, []) + if not tables_grouped.get(t, {}).get(c, {}).get("issues", []) + ) + for t in sorted_tables ) failed_columns = total_columns - passed_columns overall_error_rate = ( @@ -822,6 +1058,22 @@ def _calc_failed(res: Dict[str, Any]) -> int: if header_total_records == 0 else (total_failed_records / max(header_total_records, 1)) * 100 ) + + if len(tables_grouped) > 1: + lines.append("\n📊 Multi-table 
Summary:") + for table_name in sorted_tables: + table_cols = all_columns_by_table.get(table_name, []) + table_columns_count = len(table_cols) + table_passed = sum( + 1 + for c in table_cols + if not tables_grouped[table_name].get(c, {}).get("issues") + ) + table_failed = table_columns_count - table_passed + lines.append( + f" {table_name}: {table_passed} passed, {table_failed} failed" + ) + lines.append( f"\nSummary: {passed_columns} passed, {failed_columns} failed" f" ({overall_error_rate:.2f}% overall error rate)" @@ -832,13 +1084,19 @@ def _calc_failed(res: Dict[str, Any]) -> int: @click.command("schema") -@click.argument("source", required=True) +@click.option( + "--conn", + "connection_string", + required=True, + help="Database connection string or file path", +) @click.option( "--rules", "rules_file", type=click.Path(exists=True, readable=True), required=True, - help="Path to schema rules file (JSON)", + help="Path to schema rules file (JSON) - supports both single-table " + "and multi-table formats", ) @click.option( "--output", @@ -851,57 +1109,49 @@ def _calc_failed(res: Dict[str, Any]) -> int: "--fail-on-error", is_flag=True, default=False, - help="Return exit code 1 if any error occurs during skeleton execution", -) -@click.option( - "--max-errors", - type=int, - default=100, - show_default=True, - help="Maximum number of errors to collect (reserved; not used in skeleton)", + help="Return exit code 1 if any error occurs during execution", ) @click.option("--verbose", is_flag=True, default=False, help="Enable verbose output") def schema_command( - source: str, + connection_string: str, rules_file: str, output: str, fail_on_error: bool, - max_errors: int, verbose: bool, ) -> None: - """Schema validation command with minimal rules file validation. - - Decomposition and execution are added in subsequent tasks. + """ + Schema validation command with support for both single-table + and multi-table validation. 
""" from cli.core.config import get_cli_config from core.config import get_core_config - # start_time = now() try: - _maybe_echo_analyzing(source, output) - _guard_empty_source_file(source) - - source_config = SourceParser().parse_source(source) + _maybe_echo_analyzing(connection_string, output) + _guard_empty_source_file(connection_string) + source_config = SourceParser().parse_source(connection_string) rules_payload = _read_rules_payload(rules_file) + is_multi_table_rules = "rules" not in rules_payload + if is_multi_table_rules: + source_config.parameters["is_multi_table"] = True + warnings, rules_count = _validate_rules_payload(rules_payload) - _emit_warnings(warnings) + _emit_warnings(warnings, output) - # Decompose into atomic rules per design - atomic_rules = _decompose_to_atomic_rules(rules_payload) + atomic_rules = _decompose_schema_payload(rules_payload, source_config) - # Fast-path: no rules → emit minimal payload and exit cleanly - if len(atomic_rules) == 0: + if not atomic_rules: _early_exit_when_no_rules( - source=source, + source=connection_string, rules_file=rules_file, output=output, fail_on_error=fail_on_error, ) + return - # Execute via core engine using DataValidator core_config = get_core_config() cli_config = get_cli_config() validator = _create_validator( @@ -912,36 +1162,33 @@ def schema_command( ) results, exec_seconds = _run_validation(validator) - # Aggregation and prioritization - schema_result_dict: Dict[str, Any] | None = _extract_schema_result_dict( + schema_results = _extract_schema_results( atomic_rules=atomic_rules, results=results ) skip_map = _compute_skip_map( - atomic_rules=atomic_rules, schema_result_dict=schema_result_dict + atomic_rules=atomic_rules, schema_results=schema_results ) - # Apply skip map to JSON output only; table mode stays concise by design if output.lower() == "json": _emit_json_output( - source=source, + source=connection_string, rules_file=rules_file, atomic_rules=atomic_rules, results=results, 
skip_map=skip_map, - schema_result_dict=schema_result_dict, + schema_results=schema_results, exec_seconds=exec_seconds, ) else: _emit_table_output( - source=source, + source=connection_string, atomic_rules=atomic_rules, results=results, skip_map=skip_map, - schema_result_dict=schema_result_dict, + schema_results=schema_results, exec_seconds=exec_seconds, ) - # Exit code: fail if any rule failed (support both model objects and dicts) def _status_of(item: Any) -> str: if hasattr(item, "status"): try: @@ -953,19 +1200,13 @@ def _status_of(item: Any) -> str: return "" any_failed = any(_status_of(r) == "FAILED" for r in results) - import click as _click - - raise _click.exceptions.Exit(1 if any_failed or fail_on_error else 0) + raise click.exceptions.Exit(1 if any_failed or fail_on_error else 0) except click.UsageError: - # Propagate Click usage errors for standard exit code (typically 2) raise except click.exceptions.Exit: - # Allow Click's explicit Exit (with code) to propagate unchanged raise - except Exception as e: # Fallback: print concise error and return generic failure + except Exception as e: logger.error(f"Schema command error: {str(e)}") _safe_echo(f"❌ Error: {str(e)}", err=True) - import click as _click - - raise _click.exceptions.Exit(1) + raise click.exceptions.Exit(1) diff --git a/cli/core/data_validator.py b/cli/core/data_validator.py index fb73021..2415f34 100644 --- a/cli/core/data_validator.py +++ b/cli/core/data_validator.py @@ -111,6 +111,13 @@ def _complete_target_info(self) -> None: This replaces the old _update_rule_connections method. """ + # If the source is multi-table, targets are already set. Do not overwrite. + if self.source_config.parameters.get("is_multi_table"): + self.logger.debug( + "Multi-table source detected, skipping target info completion." 
+ ) + return + if not self.rules: return @@ -184,15 +191,32 @@ async def _validate_file(self) -> List[ExecutionResultSchema]: """Validate file-based data source""" self.logger.info(f"Validating file: {self.source_config.file_path}") - # Load file data - try: - df = self._load_file_data() - self.logger.info(f"Loaded {len(df)} records from file") - except Exception as e: - raise ValueError(f"Failed to load file data: {str(e)}") + # Check if this is a multi-table Excel file + is_multi_table = self.source_config.parameters.get("is_multi_table", False) + self.logger.info( + f"Multi-table detection: is_multi_table={is_multi_table}, " + f"connection_type={self.source_config.connection_type}" + ) + self.logger.info(f"Source config parameters: {self.source_config.parameters}") + + if ( + is_multi_table + and self.source_config.connection_type == ConnectionType.EXCEL + ): + # Handle multi-table Excel file + self.logger.info("Processing multi-table Excel file") + sqlite_config = await self._convert_multi_table_excel_to_sqlite() + else: + # Handle single-table file (existing logic) + self.logger.info("Processing single-table file") + try: + df = self._load_file_data() + self.logger.info(f"Loaded {len(df)} records from file") + except Exception as e: + raise ValueError(f"Failed to load file data: {str(e)}") - # Convert to SQLite for rule engine processing - sqlite_config = await self._convert_file_to_sqlite(df) + # Convert to SQLite for rule engine processing + sqlite_config = await self._convert_file_to_sqlite(df) # Execute rules using rule engine with new interface rule_engine = RuleEngine(connection=sqlite_config, core_config=self.core_config) @@ -310,6 +334,143 @@ def _load_file_data(self) -> pd.DataFrame: except Exception as e: raise ValueError(f"Failed to parse file: {str(e)}") + async def _convert_multi_table_excel_to_sqlite(self) -> ConnectionSchema: + """ + Convert multi-table Excel file to SQLite database. 
+ + Returns: + ConnectionSchema: SQLite connection configuration + """ + import os + import tempfile + import time + + from sqlalchemy import create_engine + + temp_db_file = None + temp_db_path = None + start_time = time.time() + + try: + # Create a temporary SQLite file + temp_db_file = tempfile.NamedTemporaryFile(suffix=".db", delete=False) + temp_db_path = temp_db_file.name + temp_db_file.close() + + # Create SQLite engine + engine = create_engine(f"sqlite:///{temp_db_path}") + + # Load all sheets into SQLite + await self._load_multi_table_excel_to_sqlite(engine, temp_db_path) + + # Get table mapping for connection config + table_mapping = self.source_config.parameters.get("table_mapping", {}) + + # Create connection config with multi-table information + sqlite_config = ConnectionSchema( + name="temp_sqlite_multi_table", + description="Temporary SQLite for multi-table Excel validation", + connection_type=ConnectionType.SQLITE, + file_path=temp_db_path, + parameters={ + "is_multi_table": True, + "table_mapping": table_mapping, + "temp_file": True, # Mark as temporary file for cleanup + }, + ) + + # Log performance metrics + elapsed_time = time.time() - start_time + self.logger.info( + f"Created temporary SQLite database at {temp_db_path} with " + f"{len(table_mapping)} tables in {elapsed_time:.2f} seconds" + ) + + return sqlite_config + + except Exception as e: + # Clean up temporary file if it exists + if temp_db_path and os.path.exists(temp_db_path): + try: + os.unlink(temp_db_path) + except Exception as cleanup_error: + self.logger.warning( + f"Failed to cleanup temporary file {temp_db_path}: " + f"{cleanup_error}" + ) + raise ValueError(f"Failed to create multi-table SQLite database: {str(e)}") + + async def _load_multi_table_excel_to_sqlite( + self, engine: Any, temp_db_path: str + ) -> None: + """ + Load multiple sheets from Excel file into SQLite database. 
+ + Args: + engine: SQLAlchemy engine for SQLite + temp_db_path: Path to temporary SQLite database + """ + import pandas as pd + + file_path = self.source_config.file_path + sheets_info = self.source_config.parameters.get("sheets", {}) + + if not sheets_info: + raise ValueError( + "Multi-table Excel file but no sheets information available" + ) + + self.logger.info( + f"Loading {len(sheets_info)} sheets into SQLite: {list(sheets_info.keys())}" + ) + + # Store table name mapping for later use + table_mapping = {} + + # Load each sheet into a separate table + for sheet_name, columns in sheets_info.items(): + try: + # Read the specific sheet + df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl") + + # Validate that the sheet has the expected columns + expected_columns = set(columns) + actual_columns = set(df.columns) + + if not expected_columns.issubset(actual_columns): + missing_columns = expected_columns - actual_columns + self.logger.warning( + f"Sheet '{sheet_name}' missing expected columns: " + f"{missing_columns}" + ) + + # Write to SQLite with sheet name as table name + # Clean table name for SQLite (remove special characters) + clean_table_name = "".join( + c for c in sheet_name if c.isalnum() or c == "_" + ) + if not clean_table_name or clean_table_name[0].isdigit(): + clean_table_name = f"sheet_{clean_table_name}" + + # Store the mapping from original sheet name to clean table name + table_mapping[sheet_name] = clean_table_name + + df.to_sql(clean_table_name, engine, if_exists="replace", index=False) + self.logger.info( + f"Loaded sheet '{sheet_name}' as table '{clean_table_name}' " + f"with {len(df)} rows" + ) + + except Exception as e: + self.logger.error(f"Failed to load sheet '{sheet_name}': {str(e)}") + # Continue with other sheets + continue + + # Store the table mapping in the source config for later use + if hasattr(self, "source_config") and hasattr(self.source_config, "parameters"): + 
self.source_config.parameters["table_mapping"] = table_mapping + self.logger.info(f"Stored table mapping: {table_mapping}") + async def _convert_file_to_sqlite(self, df: pd.DataFrame) -> ConnectionSchema: """ Convert pandas DataFrame to SQLite in-memory database diff --git a/cli/core/source_parser.py b/cli/core/source_parser.py index 976592e..7dadc59 100644 --- a/cli/core/source_parser.py +++ b/cli/core/source_parser.py @@ -8,7 +8,7 @@ import re import urllib.parse from pathlib import Path -from typing import Optional, Tuple +from typing import Dict, List, Optional, Tuple from uuid import uuid4 from cli.exceptions import ValidationError @@ -52,12 +52,15 @@ def __init__(self) -> None: ".jsonl": ConnectionType.JSON, } - def parse_source(self, source: str) -> ConnectionSchema: + def parse_source( + self, source: str, table_name: Optional[str] = None + ) -> ConnectionSchema: """ Parse source string into ConnectionSchema. Args: source: Source string (file path or database URL) + table_name: Optional table name (overrides table from URL if provided) Returns: ConnectionSchema: Parsed connection configuration @@ -75,7 +78,7 @@ def parse_source(self, source: str) -> ConnectionSchema: raise ValidationError("Unrecognized source format: Empty source") if self._is_database_url(source): - return self._parse_database_url(source) + return self._parse_database_url(source, table_name) elif source.startswith("file://"): # Handle file:// protocol file_path = source[7:] # Remove file:// prefix @@ -92,6 +95,62 @@ def parse_source(self, source: str) -> ConnectionSchema: self.logger.error(f"{str(e)}") raise + def get_excel_sheets(self, file_path: str) -> Dict[str, List[str]]: + """ + Get sheet names from Excel file. 
+ + Args: + file_path: Path to Excel file + + Returns: + Dict with sheet names as keys and column lists as values + + Raises: + ImportError: If pandas/openpyxl not available + FileNotFoundError: If file not found + """ + try: + import pandas as pd + except ImportError: + raise ImportError("pandas is required to read Excel files") + + try: + excel_file = pd.ExcelFile(file_path) + sheets_info = {} + + for sheet_name in excel_file.sheet_names: + # Read first few rows to get column names + df = pd.read_excel(file_path, sheet_name=sheet_name, nrows=0) + sheets_info[str(sheet_name)] = list(df.columns) + + return sheets_info + except Exception as e: + self.logger.error(f"Error reading Excel file {file_path}: {str(e)}") + raise + + def is_multi_table_excel(self, file_path: str) -> bool: + """ + Check if Excel file contains multiple sheets that could represent + multiple tables. + + Args: + file_path: Path to Excel file + + Returns: + True if file has multiple sheets, False otherwise + """ + try: + import pandas as pd + + excel_file = pd.ExcelFile(file_path) + return len(excel_file.sheet_names) > 1 + except ImportError: + # If pandas not available, assume single table + return False + except Exception: + # If any error occurs, assume single table + return False + def _is_database_url(self, source: str) -> bool: """Check if source is a database URL""" for patterns in self.db_url_patterns.values(): @@ -118,7 +177,9 @@ def _is_file_path(self, source: str) -> bool: return False - def _parse_database_url(self, url: str) -> ConnectionSchema: + def _parse_database_url( + self, url: str, table_name: Optional[str] = None + ) -> ConnectionSchema: """ Parse database URL into connection configuration. 
@@ -126,6 +187,10 @@ def _parse_database_url(self, url: str) -> ConnectionSchema: - mysql://user:pass@host:port/database.table - postgres://user:pass@host:port/database.table - sqlite:///path/to/database.db.table + + Args: + url: Database connection URL + table_name: Optional table name (overrides table from URL if provided) """ self.logger.debug(f"Parsing database URL: {url}") @@ -136,7 +201,10 @@ def _parse_database_url(self, url: str) -> ConnectionSchema: parsed = urllib.parse.urlparse(url) # Extract database and table from path - database, table = self._extract_db_table_from_path(parsed.path) + database, table_from_url = self._extract_db_table_from_path(parsed.path) + + # Use provided table_name if available, otherwise use table from URL + table = table_name if table_name is not None else table_from_url # Handle SQLite special case if conn_type == ConnectionType.SQLITE: @@ -170,47 +238,66 @@ def _parse_file_path(self, file_path: str) -> ConnectionSchema: path = Path(file_path) - # Check if file exists if not path.exists(): raise FileNotFoundError(f"File not found: {file_path}") if not path.is_file(): raise ValidationError(f"Path is not a file: {file_path}") - # Determine file type file_ext = path.suffix.lower() conn_type = self.file_extensions.get(file_ext) if not conn_type: - # Try to infer from content or use CSV as default conn_type = ConnectionType.CSV self.logger.warning( f"Unknown file extension {file_ext}, assuming CSV format" ) + is_multi_table = False + sheets_info = {} + if conn_type == ConnectionType.EXCEL: + try: + sheets_info = self.get_excel_sheets(file_path) + if len(sheets_info) > 1: + is_multi_table = True + self.logger.info( + f"Multi-table Excel file detected with {len(sheets_info)} " + f"sheets: {list(sheets_info.keys())}" + ) + except Exception as e: + self.logger.warning( + f"Could not read Excel sheets, treating as single-table: {str(e)}" + ) + is_multi_table = False + + parameters = { + "filename": path.name, + "file_size": 
path.stat().st_size, + "encoding": "utf-8", + } + + if is_multi_table and sheets_info: + parameters["is_multi_table"] = True + parameters["sheets"] = sheets_info + available_tables = list(sheets_info.keys()) + else: + parameters["is_multi_table"] = False + available_tables = [path.stem] + return ConnectionSchema( name=f"file_connection_{uuid4().hex[:8]}", - description=f"File connection: {path.name}", + description=f"File connection: {path.name}" + + (" (multi-table)" if is_multi_table else ""), connection_type=conn_type, - host=None, - port=None, - db_name=None, - username=None, - password=None, - db_schema=None, file_path=str(path.absolute()), - parameters={ - "filename": path.name, - "file_size": path.stat().st_size, - "encoding": "utf-8", # Default encoding - }, + parameters=parameters, + available_tables=available_tables, capabilities=DataSourceCapability( supports_sql=False, supports_batch_export=True, - max_export_rows=100000, - estimated_throughput=5000, + max_export_rows=100000 if not is_multi_table else 50000, + estimated_throughput=5000 if not is_multi_table else 2000, ), - cross_db_settings=None, ) def _detect_database_type(self, url: str) -> ConnectionType: @@ -286,14 +373,9 @@ def _create_sqlite_connection( name=f"sqlite_connection_{uuid4().hex[:8]}", description=f"SQLite connection: {Path(file_path).name}", connection_type=ConnectionType.SQLITE, - host=None, - port=None, - db_name=None, - username=None, - password=None, - db_schema=None, file_path=file_path, parameters=parameters, + available_tables=[table] if table else [], capabilities=DataSourceCapability( supports_sql=True, supports_batch_export=True, diff --git a/cli_main.py b/cli_main.py index 7ac983d..7efead3 100644 --- a/cli_main.py +++ b/cli_main.py @@ -2,7 +2,7 @@ """ ValidateLite CLI Main Entry Point -Main entry point for the vlite-cli command-line tool. +Main entry point for the vlite command-line tool. 
""" import os diff --git a/config/logging.test.toml b/config/logging.test.toml new file mode 100644 index 0000000..2ce2ddc --- /dev/null +++ b/config/logging.test.toml @@ -0,0 +1,37 @@ +# Test Environment Logging Configuration + +# Global log level: Set to WARNING to suppress DEBUG and INFO messages +level = "WARNING" + +# Log message format +format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Enable logging to file (disabled for tests to keep output clean) +to_file = false + +# Module-specific log levels for testing +[module_levels] +# Core modules - set to WARNING to reduce noise +"shared.database.connection" = "WARNING" +"shared.database.query_executor" = "WARNING" +"cli.commands.check" = "WARNING" +"cli.core.data_validator" = "WARNING" +"cli.core.source_parser" = "WARNING" +"cli.core.rule_parser" = "WARNING" +"rule_engine" = "WARNING" +"core.engine.rule_engine" = "WARNING" + +# Third-party modules - set to ERROR to suppress all debug info +"aiosqlite" = "ERROR" +"sqlalchemy" = "ERROR" +"sqlalchemy.engine" = "ERROR" +"sqlalchemy.pool" = "ERROR" +"sqlalchemy.dialects" = "ERROR" +"pydantic" = "WARNING" +"toml" = "WARNING" +"werkzeug" = "WARNING" +"urllib3.connectionpool" = "WARNING" + +# Keep only critical errors visible +"asyncio" = "WARNING" +"pytest" = "WARNING" diff --git a/config/logging.toml b/config/logging.toml index 9630a91..b32cc7b 100644 --- a/config/logging.toml +++ b/config/logging.toml @@ -1,7 +1,7 @@ # Logging Configuration # Global log level: DEBUG, INFO, WARNING, ERROR, CRITICAL -level = "ERROR" +level = "WARNING" # Log message format format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/docs/CONFIG_REFERENCE.md b/docs/CONFIG_REFERENCE.md index 5bb029e..78caf02 100644 --- a/docs/CONFIG_REFERENCE.md +++ b/docs/CONFIG_REFERENCE.md @@ -129,7 +129,7 @@ export CLI_CONFIG_PATH=/path/to/custom/cli.toml export LOGGING_CONFIG_PATH=/path/to/custom/logging.toml # Run the application -vlite-cli check data.csv --rule 
"not_null(id)" +vlite check --conn data.csv --table data --rule "not_null(id)" ``` ## Configuration Loading Order diff --git a/docs/USAGE.md b/docs/USAGE.md index 6f2b687..b91a7c5 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -37,7 +37,7 @@ pip install validatelite **Option 2: Install from pre-built package** ```bash -pip install validatelite-0.4.0-py3-none-any.whl +pip install validatelite-0.4.2-py3-none-any.whl ``` **Option 3: Run from source** @@ -57,13 +57,13 @@ Let's start with a simple validation to check that all records in a CSV file hav ```bash # Validate a CSV file -vlite check examples/sample_data.csv --rule "not_null(customer_id)" +vlite check --conn examples/sample_data.csv --table data --rule "not_null(customer_id)" # Validate a database table -vlite check "mysql://user:pass@localhost:3306/mydb.customers" --rule "unique(email)" +vlite check --conn "mysql://user:pass@localhost:3306/mydb" --table customers --rule "unique(email)" # Validate against a schema file -vlite schema "mysql://user:pass@localhost:3306/mydb.customers" --rules schema.json +vlite schema --conn "mysql://user:pass@localhost:3306/mydb" --rules schema.json ``` --- @@ -79,7 +79,7 @@ ValidateLite provides two main commands: Both commands follow this general pattern: ```bash -vlite [options] +vlite --conn --table [options] ``` ### Data Source Types @@ -89,9 +89,9 @@ ValidateLite supports multiple data source types: | Type | Format | Example | |------|--------|---------| | **Local Files** | CSV, Excel, JSON, JSONL | `data/customers.csv` | -| **MySQL** | Connection string | `mysql://user:pass@host:3306/db.table` | -| **PostgreSQL** | Connection string | `postgresql://user:pass@host:5432/db.table` | -| **SQLite** | File path with table | `sqlite:///path/to/db.sqlite.table` | +| **MySQL** | Connection string | `mysql://user:pass@host:3306/db` | +| **PostgreSQL** | Connection string | `postgresql://user:pass@host:5432/db` | +| **SQLite** | File path with table | 
`sqlite:///path/to/db.sqlite` | ### Rule Types Overview @@ -114,11 +114,12 @@ The `check` command allows you to specify validation rules either inline or thro #### Basic Syntax & Parameters ```bash -vlite check [options] +vlite check --conn --table [options] ``` **Required Parameters:** -- `` - Path to file or database connection string +- `--conn ` - Path to file or database connection string +- `--table ` - Table name or identifier for the data source **Options:** | Option | Description | @@ -137,10 +138,10 @@ Use `--rule` for simple, quick validations: ```bash # Single rule -vlite check data.csv --rule "not_null(id)" +vlite check --conn data.csv --table data --rule "not_null(id)" # Multiple rules -vlite check data.csv \ +vlite check --conn data.csv --table data \ --rule "not_null(name)" \ --rule "unique(id)" \ --rule "range(age, 18, 99)" @@ -221,12 +222,12 @@ Sample Failed Data: **1. Basic file validation:** ```bash -vlite check test_data/customers.xlsx --rule "not_null(name)" +vlite check --conn test_data/customers.xlsx --table customers --rule "not_null(name)" ``` **2. Multiple rules with verbose output:** ```bash -vlite check test_data/customers.xlsx \ +vlite check --conn test_data/customers.xlsx --table customers \ --rule "unique(email)" \ --rule "regex(email, '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$')" \ --verbose @@ -234,14 +235,14 @@ vlite check test_data/customers.xlsx \ **3. Comprehensive validation using rules file:** ```bash -vlite check "mysql://root:password@localhost:3306/data_quality.customers" \ +vlite check --conn "mysql://root:password@localhost:3306/data_quality" --table customers \ --rules "validation_rules.json" \ --verbose ``` **4. 
CSV file with multiple constraints:** ```bash -vlite check examples/sample_data.csv \ +vlite check --conn examples/sample_data.csv --table data \ --rule "not_null(customer_id)" \ --rule "unique(customer_id)" \ --rule "length(email, 5, 100)" \ @@ -259,17 +260,17 @@ vlite check examples/sample_data.csv \ ### The `schema` Command - Schema Validation -The `schema` command validates tables against JSON schema files, automatically decomposing schemas into atomic rules with intelligent prioritization and aggregation. +The `schema` command validates tables against JSON schema files, automatically decomposing schemas into atomic rules with intelligent prioritization and aggregation. **NEW in v0.4.2**: Enhanced multi-table support, Excel multi-sheet file support, and improved output formatting. #### Basic Syntax & Parameters ```bash -vlite schema --rules [options] +vlite schema --conn --rules [options] ``` **Required Parameters:** -- `` - Database/table identifier (table derived from URL) -- `--rules ` - Path to JSON schema file +- `--conn ` - Database connection string or file path (now supports Excel multi-sheet files) +- `--rules ` - Path to JSON schema file (supports both single-table and multi-table formats) **Options:** | Option | Description | @@ -278,9 +279,10 @@ vlite schema --rules [options] | `--verbose` | Show detailed information in table mode | | `--help` | Display command help | -#### Schema File Structure (v1) +#### Schema File Structure -**Minimal Structure:** +**Single-Table Format (v1):** +_Only applicable to CSV file data sources_ ```json { "rules": [ @@ -295,6 +297,29 @@ vlite schema --rules [options] } ``` +**NEW: Multi-Table Format (v0.4.2):** +```json +{ + "customers": { + "rules": [ + { "field": "id", "type": "integer", "required": true }, + { "field": "name", "type": "string", "required": true }, + { "field": "email", "type": "string", "required": true } + ], + "strict_mode": true, + "case_insensitive": false + }, + "orders": { + "rules": [ + { 
"field": "order_id", "type": "integer", "required": true }, + { "field": "customer_id", "type": "integer", "required": true }, + { "field": "total", "type": "float", "min": 0.01 } + ], + "strict_mode": false + } +} +``` + **Supported Field Types:** - `string`, `integer`, `float`, `boolean`, `date`, `datetime` @@ -304,8 +329,24 @@ vlite schema --rules [options] - `required` - Generate NOT_NULL rule if true - `min`/`max` - Generate RANGE rule for numeric types - `enum` - Generate ENUM rule with allowed values -- `strict_mode` - Report extra columns as violations -- `case_insensitive` - Case-insensitive column matching +- `strict_mode` - Report extra columns as violations (table-level option) +- `case_insensitive` - Case-insensitive column matching (table-level option) + +#### NEW: Multi-Table and Excel Support + +**Excel Multi-Sheet Files:** +The schema command now supports Excel files with multiple worksheets as data sources. Each worksheet can be validated against its corresponding schema definition. 
+ +```bash +# Validate Excel file with multiple sheets +vlite schema --conn "data.xlsx" --rules multi_table_schema.json +``` + +**Multi-Table Validation:** +- Support for validating multiple tables in a single command +- Table-level configuration options (strict_mode, case_insensitive) +- Automatic detection of multi-table data sources +- Grouped output display by table #### Rule Decomposition Logic @@ -328,7 +369,7 @@ Schema Field → Generated Rules #### Output Formats -**Table Mode (default)** - Column-grouped summary: +**Table Mode (default)** - Column-grouped summary with improved formatting: ``` Column Validation Results ═════════════════════════ @@ -345,42 +386,91 @@ Column: status ⚠ Dependent checks skipped ``` -**JSON Mode** (`--output json`) - Machine-readable format: +**NEW: Multi-Table Table Mode:** +``` +Table: customers +═══════════════ +Column: id + ✓ Field exists (integer) + ✓ Not null constraint + +Table: orders +═══════════════ +Column: order_id + ✓ Field exists (integer) + ✓ Not null constraint +``` + +**JSON Mode** (`--output json`) - Machine-readable format with enhanced structure: ```json { "summary": { - "total_checks": 8, - "passed": 5, - "failed": 2, - "skipped": 1 + "total_checks": 12, + "passed": 8, + "failed": 3, + "skipped": 1, + "execution_time_ms": 1250 }, "results": [...], "fields": { - "id": { "status": "passed", "checks": [...] }, - "age": { "status": "failed", "checks": [...] } + "age": { + "status": "passed", + "checks": ["existence", "type", "not_null", "range"] + }, + "unknown_field": { + "status": "extra", + "checks": [] + } }, - "schema_extras": ["unknown_column"] + "schema_extras": ["unknown_field"], + "tables": { + "customers": { + "status": "passed", + "total_checks": 6, + "passed": 6 + }, + "orders": { + "status": "failed", + "total_checks": 6, + "passed": 2, + "failed": 4 + } + } } ``` +**Full JSON schema definition:** `docs/schemas/schema_results.schema.json` + #### Practical Examples **1. 
Basic schema validation:** ```bash -vlite schema "mysql://root:password@localhost:3306/data_quality.customers" \ +vlite schema --conn "mysql://root:password@localhost:3306/data_quality" \ --rules test_data/schema.json ``` -**2. JSON output for automation:** +**2. NEW: Multi-table schema validation:** +```bash +vlite schema --conn "mysql://user:pass@host:3306/sales" \ + --rules multi_table_schema.json +``` + +**3. NEW: Excel multi-sheet validation:** +```bash +vlite schema --conn "data.xlsx" \ + --rules excel_schema.json +``` + +**4. JSON output for automation:** ```bash -vlite schema "mysql://user:pass@host:3306/sales.users" \ +vlite schema --conn "mysql://user:pass@host:3306/sales" \ --rules schema.json \ --output json ``` -**3. Verbose table output:** +**5. Verbose table output:** ```bash -vlite schema "postgresql://user:pass@localhost:5432/app.customers" \ +vlite schema --conn "postgresql://user:pass@localhost:5432/app" \ --rules customer_schema.json \ --verbose ``` @@ -407,13 +497,13 @@ vlite schema "postgresql://user:pass@localhost:5432/app.customers" \ **Examples:** ```bash # CSV with custom delimiter (auto-detected) -vlite check data/customers.csv --rule "not_null(id)" +vlite check --conn data/customers.csv --table customers --rule "not_null(id)" # Excel file (auto-detects first sheet) -vlite check reports/monthly_data.xlsx --rule "unique(transaction_id)" +vlite check --conn reports/monthly_data.xlsx --table data --rule "unique(transaction_id)" # JSON Lines file -vlite check logs/events.jsonl --rule "not_null(timestamp)" +vlite check --conn logs/events.jsonl --table events --rule "not_null(timestamp)" ``` #### Database Sources @@ -422,30 +512,30 @@ vlite check logs/events.jsonl --rule "not_null(timestamp)" **MySQL:** ``` -mysql://[username[:password]@]host[:port]/database.table +mysql://[username[:password]@]host[:port]/database ``` **PostgreSQL:** ``` -postgresql://[username[:password]@]host[:port]/database.table 
+postgresql://[username[:password]@]host[:port]/database ``` **SQLite:** ``` -sqlite:///[absolute_path_to_file].table -sqlite://[relative_path_to_file].table +sqlite:///[absolute_path_to_file] +sqlite://[relative_path_to_file] ``` **Connection Examples:** ```bash # MySQL with authentication -vlite check "mysql://admin:secret123@db.company.com:3306/sales.customers" --rule "unique(id)" +vlite check --conn "mysql://admin:secret123@db.company.com:3306/sales" --table customers --rule "unique(id)" # PostgreSQL with default port -vlite check "postgresql://analyst@analytics-db/warehouse.orders" --rules validation.json +vlite check --conn "postgresql://analyst@analytics-db/warehouse" --table orders --rules validation.json # SQLite local file -vlite check "sqlite:///data/local.db.users" --rule "not_null(email)" +vlite check --conn "sqlite:///data/local.db" --table users --rule "not_null(email)" ``` ### Validation Rules Deep Dive diff --git a/examples/README.md b/examples/README.md index a276956..6629940 100644 --- a/examples/README.md +++ b/examples/README.md @@ -18,14 +18,14 @@ This directory contains examples and sample files to help you get started with V 2. **Validate the sample data:** ```bash - python cli_main.py check examples/sample_data.csv --rules examples/sample_rules.json + python cli_main.py check --conn examples/sample_data.csv --table data --rules examples/sample_rules.json ``` 3. 
**Test with your own data:** ```bash # Create your own rules file based on sample_rules.json # Then run validation - python cli_main.py check your_data.csv --rules your_rules.json + python cli_main.py check --conn your_data.csv --table data --rules your_rules.json ``` ## Example Rules diff --git a/examples/basic_usage.py b/examples/basic_usage.py index 9800698..c872876 100644 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -68,7 +68,9 @@ def example_csv_validation() -> None: print(f"CSV file: {csv_file}") print(f"Rules file: {rules_file}") print("Run command:") - print(f"python cli_main.py check {csv_file} --rules {rules_file}") + print( + f"python cli_main.py check --conn {csv_file} --table data --rules {rules_file}" + ) print() @@ -114,7 +116,10 @@ def example_database_validation() -> None: print(f"Database: {db_connection}") print(f"Rules file: {rules_file}") print("Run command:") - print(f'python cli_main.py check "{db_connection}" --rules {rules_file}') + print( + f'python cli_main.py check --conn "{db_connection}" --table customers ' + f"--rules {rules_file}" + ) print() @@ -153,7 +158,10 @@ def example_excel_validation() -> None: for rule in rules: print(f" - {rule['name']}: {rule['description']}") print("Run command:") - print("python cli_main.py check products.xlsx --rules rules.json") + print( + "python cli_main.py check --conn products.xlsx --table products " + "--rules rules.json" + ) print() @@ -195,7 +203,7 @@ def example_custom_sql_validation() -> None: print(f" - {rule['name']}: {rule['description']}") print("Run command:") print( - "python cli_main.py check " - '"mysql://:@localhost:3306/testdb.sales" ' + "python cli_main.py check --conn " + '"mysql://:@localhost:3306/testdb" --table sales ' "--rules custom_rules.json" ) diff --git a/pyproject.toml b/pyproject.toml index 2beff36..d07390c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "validatelite" -version = "0.4.0" +version = "0.4.2" description = "A flexible, 
extensible command-line tool for automated data quality validation" readme = "README.md" license = {text = "MIT"} diff --git a/pytest.ini b/pytest.ini index 9a063be..5fcbd1d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -9,6 +9,7 @@ addopts = --cov-report=term-missing --cov-report=html:htmlcov --strict-markers + --log-cli-level=ERROR python_files = test_*.py *_test.py python_classes = Test* python_functions = test_* diff --git a/scripts/generate_config_docs.py b/scripts/generate_config_docs.py index 8f8a893..a2ac108 100644 --- a/scripts/generate_config_docs.py +++ b/scripts/generate_config_docs.py @@ -158,7 +158,7 @@ def generate_environment_variables_docs() -> str: docs += "export LOGGING_CONFIG_PATH=/path/to/custom/logging.toml\n" docs += "\n" docs += "# Run the application\n" - docs += 'vlite-cli check data.csv --rule "not_null(id)"\n' + docs += 'vlite check --conn data.csv --table data --rule "not_null(id)"\n' docs += "```\n\n" return docs diff --git a/scripts/run_tests_quiet.py b/scripts/run_tests_quiet.py new file mode 100644 index 0000000..31f95f4 --- /dev/null +++ b/scripts/run_tests_quiet.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +""" +Quiet test runner script that suppresses debug and info messages. + +Usage: + python scripts/run_tests_quiet.py [pytest_options...] 
+ +Examples: + python scripts/run_tests_quiet.py + python scripts/run_tests_quiet.py -k "test_data_validator" + python scripts/run_tests_quiet.py --cov=core --cov-report=html +""" + +import os +import subprocess +import sys +from pathlib import Path + + +def main() -> None: + """Run tests with quiet logging configuration.""" + # Get the project root directory + project_root = Path(__file__).parent.parent + os.chdir(project_root) + + # Set environment variables for quiet logging + env = os.environ.copy() + env["PYTHONPATH"] = str(project_root) + + # Build pytest command with quiet options + cmd = [ + sys.executable, + "-m", + "pytest", + "--log-cli-level=WARNING", + "--tb=short", + "-v", + ] + + # Add any additional arguments passed to the script + cmd.extend(sys.argv[1:]) + + # Run pytest + try: + result = subprocess.run(cmd, env=env, cwd=project_root) + sys.exit(result.returncode) + except KeyboardInterrupt: + print("\nTest run interrupted by user") + sys.exit(1) + except Exception as e: + print(f"Error running tests: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/sql/generate_test_data.py b/scripts/sql/generate_test_data.py index 73b73c3..adc7a93 100644 --- a/scripts/sql/generate_test_data.py +++ b/scripts/sql/generate_test_data.py @@ -25,7 +25,10 @@ def generate_customer_data(count: int = 1000) -> List[Tuple]: - """Generate test customer data with specific patterns to ensure test cases pass.""" + """ + Generate test customer data with specific patterns to + ensure test cases pass/fail consistently. 
+ """ names = [ "Alice", "Bob", @@ -101,7 +104,7 @@ def generate_customer_data(count: int = 1000) -> List[Tuple]: random.randint(18, 65), random.choice([0, 1]), ), - # Pattern 3: Duplicate emails (for unique test) + # Pattern 3: Duplicate emails (for unique email test) ( f"{random.choice(names)}3001", "duplicate@example.com", @@ -120,6 +123,25 @@ def generate_customer_data(count: int = 1000) -> List[Tuple]: random.randint(18, 65), random.choice([0, 1]), ), + # Pattern 6: Duplicate names (for unique name test) + ( + "DuplicateName", + f"unique1@{random.choice(domains)}", + random.randint(18, 65), + random.choice([0, 1]), + ), + ( + "DuplicateName", + f"unique2@{random.choice(domains)}", + random.randint(18, 65), + random.choice([0, 1]), + ), + ( + "DuplicateName", + f"unique3@{random.choice(domains)}", + random.randint(18, 65), + random.choice([0, 1]), + ), # Pattern 4: Invalid ages (for range test) ( f"{random.choice(names)}4001", diff --git a/shared/database/query_executor.py b/shared/database/query_executor.py index eedfd83..0cd11e6 100644 --- a/shared/database/query_executor.py +++ b/shared/database/query_executor.py @@ -788,21 +788,23 @@ async def get_column_list( standardized_result = [] for col in result: # Different database dialects may use different key names - name = col.get("Field", col.get("name", col.get("column_name"))) - if name is None: - # If column name not found, try to use the first value as column - # name - if col and isinstance(col, dict) and len(col) > 0: - name = next(iter(col.values())) - else: - name = str(col) + name = col.get("Field") or col.get("name") or col.get("column_name") + type_ = col.get("Type") or col.get("data_type") or col.get("type") + + if not name: + # If column name not found, skip this column with a warning + self.logger.warning( + f"Could not determine column name from result: {col}" + ) + continue + + if not type_: + type_ = "unknown" # Create standardized column info std_col = { - "name": name, # Standardized column 
name key - "type": col.get( - "Type", col.get("data_type", col.get("type", "unknown")) - ), + "name": name, + "type": type_, "nullable": ( col.get("Null", col.get("is_nullable", "YES")).upper() == "YES" ), diff --git a/shared/schema/connection_schema.py b/shared/schema/connection_schema.py index 5c71258..3cad596 100644 --- a/shared/schema/connection_schema.py +++ b/shared/schema/connection_schema.py @@ -7,7 +7,7 @@ cross-database features. """ -from typing import Any, Dict +from typing import Any, Dict, List, Optional from uuid import UUID, uuid4 from pydantic import Field, model_validator @@ -34,6 +34,9 @@ class ConnectionSchema(ConnectionBase): id: UUID = Field( default_factory=uuid4, description="Unique identifier for the connection" ) + available_tables: Optional[List[str]] = Field( + default=None, description="List of available tables for file-based sources" + ) # ==================== Convenient methods ==================== diff --git a/test_data/multi_table_data.xlsx b/test_data/multi_table_data.xlsx new file mode 100644 index 0000000..f53dfd1 Binary files /dev/null and b/test_data/multi_table_data.xlsx differ diff --git a/test_data/multi_table_schema.json b/test_data/multi_table_schema.json new file mode 100644 index 0000000..088e22f --- /dev/null +++ b/test_data/multi_table_schema.json @@ -0,0 +1,31 @@ +{ + "users": { + "rules": [ + { "field": "id", "type": "integer", "required": true }, + { "field": "name", "type": "string", "required": true }, + { "field": "email", "type": "string", "required": true }, + { "field": "age", "type": "integer", "min": 0, "max": 120 }, + { "field": "status", "type": "string", "enum": ["active", "inactive", "pending"] } + ], + "strict_mode": true + }, + "products": { + "rules": [ + { "field": "product_id", "type": "integer", "required": true }, + { "field": "product_name", "type": "string", "required": true }, + { "field": "price", "type": "float", "min": 0.0 }, + { "field": "category", "type": "string", "enum": ["electronics", 
"clothing", "books"] }, + { "field": "in_stock", "type": "boolean" } + ] + }, + "orders": { + "rules": [ + { "field": "order_id", "type": "integer", "required": true }, + { "field": "user_id", "type": "integer", "required": true }, + { "field": "order_date", "type": "datetime", "required": true }, + { "field": "total_amount", "type": "float", "min": 0.0 }, + { "field": "order_status", "type": "string", "enum": ["pending", "confirmed", "shipped", "delivered"] } + ], + "case_insensitive": true + } +} diff --git a/test_data/schema.json b/test_data/schema.json index a0c590c..1770dc6 100644 --- a/test_data/schema.json +++ b/test_data/schema.json @@ -1,4 +1,5 @@ { + "customers": { "rules": [ { "field": "id", "type": "integer", "required": true }, { "field": "age", "type": "integer", "required": true, "min": 0, "max": 120 }, @@ -7,4 +8,5 @@ { "field": "invalid_col", "type": "string", "required": true }, { "field": "email", "type": "string" } ] + } } diff --git a/tests/conftest.py b/tests/conftest.py index e428610..87469f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,9 +15,23 @@ # Add the project root directory to the Python path. sys.path.insert(0, str(Path(__file__).parent.parent)) +from shared.config.loader import load_config +from shared.config.logging_config import LoggingConfig + # Import the database connection management module. 
from shared.database.connection import close_all_engines +# Load test-specific logging configuration +try: + test_logging_config: LoggingConfig = load_config("logging.test.toml", LoggingConfig) + if test_logging_config and test_logging_config.module_levels: + # Apply test logging configuration + for module, level in test_logging_config.module_levels.items(): + _logging.getLogger(module).setLevel(getattr(_logging, level.upper())) +except Exception: + # Fallback to default configuration if test config not found + pass + # --------------------------------------------------------------------------- # Hypothesis global configuration – suppress HealthCheck for function-scoped # fixtures used in property-based tests (see OutputFormatter tests). diff --git a/tests/e2e/cli_scenarios/test_cli_error_handling.py b/tests/e2e/cli_scenarios/test_cli_error_handling.py index 758ec8e..192e4fe 100644 --- a/tests/e2e/cli_scenarios/test_cli_error_handling.py +++ b/tests/e2e/cli_scenarios/test_cli_error_handling.py @@ -40,7 +40,7 @@ def test_cli_check_command_no_rules(self, tmp_path: Path) -> None: # Arrange sample_data_file = tmp_path / "sample-data.csv" sample_data_file.write_text("id,name\n1,Alice") - command = ["check", str(sample_data_file)] + command = ["check", "--conn", str(sample_data_file), "--table", "sample-data"] # Act result = run_cli_command(command) @@ -56,7 +56,15 @@ def test_cli_check_command_invalid_rule(self, tmp_path: Path) -> None: # Arrange sample_data_file = tmp_path / "sample-data.csv" sample_data_file.write_text("id,name\n1,Alice") - command = ["check", str(sample_data_file), "--rule", "invalid_rule(name)"] + command = [ + "check", + "--conn", + str(sample_data_file), + "--table", + "sample-data", + "--rule", + "invalid_rule(name)", + ] # Act result = run_cli_command(command) @@ -70,7 +78,15 @@ def test_cli_check_command_nonexistent_file(self) -> None: Tests that the `check` command fails with a nonexistent source file. 
""" # Arrange - command = ["check", "nonexistent.csv", "--rule", "not_null(name)"] + command = [ + "check", + "--conn", + "nonexistent.csv", + "--table", + "nonexistent", + "--rule", + "not_null(name)", + ] # Act result = run_cli_command(command) @@ -86,7 +102,15 @@ def test_cli_check_command_empty_file(self, tmp_path: Path) -> None: # Arrange empty_file = tmp_path / "empty.csv" empty_file.write_text("") - command = ["check", str(empty_file), "--rule", "not_null(name)"] + command = [ + "check", + "--conn", + str(empty_file), + "--table", + "empty", + "--rule", + "not_null(name)", + ] # Act result = run_cli_command(command) diff --git a/tests/e2e/cli_scenarios/test_cli_happy_path.py b/tests/e2e/cli_scenarios/test_cli_happy_path.py index 6041cd0..cb40778 100644 --- a/tests/e2e/cli_scenarios/test_cli_happy_path.py +++ b/tests/e2e/cli_scenarios/test_cli_happy_path.py @@ -58,7 +58,10 @@ def test_cli_check_command_success_inline_rules( # Arrange command = [ "check", + "--conn", sample_data_file, + "--table", + "sample-data", "--rule", "not_null(name)", "--rule", @@ -96,7 +99,10 @@ def test_cli_check_command_success_rules_file( command = [ "check", + "--conn", sample_data_file, + "--table", + "sample-data", "--rules", str(rules_file), ] diff --git a/tests/e2e/cli_scenarios/test_e2e_comprehensive_scenarios.py b/tests/e2e/cli_scenarios/test_e2e_comprehensive_scenarios.py index 574883e..84d6a74 100644 --- a/tests/e2e/cli_scenarios/test_e2e_comprehensive_scenarios.py +++ b/tests/e2e/cli_scenarios/test_e2e_comprehensive_scenarios.py @@ -24,18 +24,26 @@ class TestE2EComprehensiveScenarios: # Test data sources SQLITE_DATA_SOURCE = "test_data/customers.xlsx" - MYSQL_DATA_SOURCE = get_mysql_test_url() + ".customers" - POSTGRES_DATA_SOURCE = get_postgresql_test_url() + ".customers" + MYSQL_DATA_SOURCE = get_mysql_test_url() + POSTGRES_DATA_SOURCE = get_postgresql_test_url() @pytest.mark.parametrize( "data_source", [SQLITE_DATA_SOURCE, MYSQL_DATA_SOURCE, POSTGRES_DATA_SOURCE] ) def 
test_not_null_name_rule(self, data_source: str) -> None: """ - Test: check *data_source* --rule="not_null(name)" + Test: check --conn *data_source* --table customers --rule="not_null(name)" Expected: PASSED """ - command = ["check", data_source, "--rule", "not_null(name)"] + command = [ + "check", + "--conn", + data_source, + "--table", + "customers", + "--rule", + "not_null(name)", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_rule_result(result, "not_null(name)", "PASSED") @@ -46,10 +54,18 @@ def test_not_null_name_rule(self, data_source: str) -> None: ) def test_not_null_email_rule(self, data_source: str) -> None: """ - Test: check *data_source* --rule="not_null(email)" + Test: check --conn *data_source* --table customers --rule="not_null(email)" Expected: FAILED """ - command = ["check", data_source, "--rule", "not_null(email)"] + command = [ + "check", + "--conn", + data_source, + "--table", + "customers", + "--rule", + "not_null(email)", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_rule_result(result, "not_null(email)", "FAILED") @@ -60,10 +76,18 @@ def test_not_null_email_rule(self, data_source: str) -> None: ) def test_unique_id_rule(self, data_source: str) -> None: """ - Test: check *data_source* --rule="unique(id)" + Test: check --conn *data_source* --table customers --rule="unique(id)" Expected: PASSED """ - command = ["check", data_source, "--rule", "unique(id)"] + command = [ + "check", + "--conn", + data_source, + "--table", + "customers", + "--rule", + "unique(id)", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_rule_result(result, "unique(id)", "PASSED") @@ -74,10 +98,19 @@ def test_unique_id_rule(self, data_source: str) -> None: ) def test_unique_name_rule_verbose(self, data_source: str) -> None: """ - Test: check *data_source* --rule="unique(name)" --verbose + Test: check --conn *data_source* --table customers --rule="unique(name)" --verbose Expected: FAILED with sample 
data """ - command = ["check", data_source, "--rule", "unique(name)", "--verbose"] + command = [ + "check", + "--conn", + data_source, + "--table", + "customers", + "--rule", + "unique(name)", + "--verbose", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_rule_result(result, "unique(name)", "FAILED") @@ -89,10 +122,19 @@ def test_unique_name_rule_verbose(self, data_source: str) -> None: ) def test_range_age_rule_verbose(self, data_source: str) -> None: """ - Test: check *data_source* --rule="range(age,0,120)" --verbose + Test: check --conn *data_source* --table customers --rule="range(age,0,120)" --verbose Expected: FAILED with sample data """ - command = ["check", data_source, "--rule", "range(age,0,120)", "--verbose"] + command = [ + "check", + "--conn", + data_source, + "--table", + "customers", + "--rule", + "range(age,0,120)", + "--verbose", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_rule_result(result, "range(age)", "FAILED") @@ -104,12 +146,15 @@ def test_range_age_rule_verbose(self, data_source: str) -> None: ) def test_multiple_rules_verbose(self, data_source: str) -> None: """ - Test: check *data_source* --rule="length(name,1,30)" --rule="enum(gender,0,1)" --verbose + Test: check --conn *data_source* --table customers --rule="length(name,1,30)" --rule="enum(gender,0,1)" --verbose Expected: PASSED + FAILED, failed rules return sample data """ command = [ "check", + "--conn", data_source, + "--table", + "customers", "--rule", "length(name,1,30)", "--rule", @@ -130,12 +175,15 @@ def test_multiple_rules_verbose(self, data_source: str) -> None: ) def test_regex_email_rule_verbose(self, data_source: str) -> None: """ - Test: check *data_source* --rule="regex(email,'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$')" --verbose + Test: check --conn *data_source* --table customers --rule="regex(email,'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$')" --verbose Expected: FAILED with sample data """ command = [ 
"check", + "--conn", data_source, + "--table", + "customers", "--rule", "regex(email,'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$')", "--verbose", @@ -151,12 +199,15 @@ def test_regex_email_rule_verbose(self, data_source: str) -> None: ) def test_validate_merge_rules_file(self, data_source: str) -> None: """ - Test: check *data_source* --rules="test_data/validate_merge.json" --verbose + Test: check --conn *data_source* --table customers --rules="test_data/validate_merge.json" --verbose Expected: 2 rules PASSED, 5 rules FAILED with sample data """ command = [ "check", + "--conn", data_source, + "--table", + "customers", "--rules", "test_data/validate_merge.json", "--verbose", @@ -187,12 +238,15 @@ def test_validate_merge_rules_file(self, data_source: str) -> None: ) def test_validate_invi_rules_file(self, data_source: str) -> None: """ - Test: check *data_source* --rules="test_data/validate_invi.json" --verbose + Test: check --conn *data_source* --table customers --rules="test_data/validate_invi.json" --verbose Expected: Both rules FAILED with sample data """ command = [ "check", + "--conn", data_source, + "--table", + "customers", "--rules", "test_data/validate_invi.json", "--verbose", @@ -218,9 +272,17 @@ def test_connection_timeout_handling(self) -> None: # Test with invalid connection parameters # Create a completely invalid MySQL connection string that doesn't depend on environment variables invalid_source = ( - "mysql://invalid-user:invalid-pass@invalid-host:3306/invalid-db.customers" + "mysql://invalid-user:invalid-pass@invalid-host:3306/invalid-db" ) - command = ["check", invalid_source, "--rule", "not_null(name)"] + command = [ + "check", + "--conn", + invalid_source, + "--table", + "customers", + "--rule", + "not_null(name)", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_error_handling(result, "connection") @@ -232,7 +294,15 @@ def test_invalid_rule_syntax(self, data_source: str) -> None: """ Test handling of invalid rule 
syntax. """ - command = ["check", data_source, "--rule", "invalid_rule_type(column)"] + command = [ + "check", + "--conn", + data_source, + "--table", + "customers", + "--rule", + "invalid_rule_type(column)", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_error_handling(result, "invalid") @@ -244,7 +314,15 @@ def test_missing_data_source(self, data_source: str) -> None: """ Test handling of missing data source. """ - command = ["check", "nonexistent_file.csv", "--rule", "not_null(name)"] + command = [ + "check", + "--conn", + "nonexistent_file.csv", + "--table", + "nonexistent", + "--rule", + "not_null(name)", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_error_handling(result, "file") @@ -256,7 +334,7 @@ def test_empty_rules_list(self, data_source: str) -> None: """ Test handling of empty rules list. """ - command = ["check", data_source] + command = ["check", "--conn", data_source, "--table", "customers"] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_error_handling(result, "rule") @@ -268,7 +346,16 @@ def test_large_dataset_performance(self, data_source: str) -> None: """ Test performance with large dataset (basic timing check). 
""" - command = ["check", data_source, "--rule", "not_null(name)", "--verbose"] + command = [ + "check", + "--conn", + data_source, + "--table", + "customers", + "--rule", + "not_null(name)", + "--verbose", + ] result = E2ETestUtils.run_cli_command(command) E2ETestUtils.assert_performance_acceptable(result, max_time=30.0) @@ -283,7 +370,10 @@ def test_concurrent_rule_execution(self, data_source: str) -> None: """ command = [ "check", + "--conn", data_source, + "--table", + "customers", "--rule", "not_null(name)", "--rule", diff --git a/tests/e2e/cli_scenarios/test_schema_command_e2e.py b/tests/e2e/cli_scenarios/test_schema_command_e2e.py index 5767f3a..1a17013 100644 --- a/tests/e2e/cli_scenarios/test_schema_command_e2e.py +++ b/tests/e2e/cli_scenarios/test_schema_command_e2e.py @@ -1,5 +1,5 @@ """ -E2E: vlite-cli schema on databases and table/json outputs +E2E: vlite schema on databases and table/json outputs Scenarios derived from notes/测试方案-数据库SchemaDrift与CLI-Schema命令.md: - Happy path on DB URL with table/json outputs @@ -29,9 +29,9 @@ def _db_urls() -> list[str]: urls: list[str] = [] available = set(get_available_databases()) if "mysql" in available: - urls.append(get_mysql_test_url() + ".customers") + urls.append(get_mysql_test_url()) if "postgresql" in available: - urls.append(get_postgresql_test_url() + ".customers") + urls.append(get_postgresql_test_url()) return urls @@ -59,26 +59,44 @@ def _param_db_urls() -> list[object]: def test_happy_path_table_and_json(tmp_path: Path, db_url: str) -> None: # Schema baseline + a couple atomic rules rules = { - "rules": [ - {"field": "id", "type": "integer", "required": True}, - {"field": "email", "type": "string"}, - {"field": "age", "type": "integer", "min": 0, "max": 150}, - ], - "strict_mode": False, - "case_insensitive": True, + "customers": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "email", "type": "string"}, + {"field": "age", "type": "integer", "min": 0, "max": 150}, + 
], + "strict_mode": False, + "case_insensitive": True, + } } rules_file = _write_rules(tmp_path, rules) # table output r1 = E2ETestUtils.run_cli_command( - ["schema", db_url, "--rules", rules_file, "--output", "table"] + [ + "schema", + "--conn", + db_url, + "--rules", + rules_file, + "--output", + "table", + ] ) assert r1.returncode in {0, 1} assert "Checking" in r1.stdout # json output r2 = E2ETestUtils.run_cli_command( - ["schema", db_url, "--rules", rules_file, "--output", "json"] + [ + "schema", + "--conn", + db_url, + "--rules", + rules_file, + "--output", + "json", + ] ) assert r2.returncode in {0, 1} try: @@ -97,21 +115,31 @@ def test_happy_path_table_and_json(tmp_path: Path, db_url: str) -> None: def test_drift_missing_and_type_mismatch(tmp_path: Path, db_url: str) -> None: # Declare a missing column and mismatched type to trigger SKIPPED in JSON for dependent rules rules = { - "rules": [ - {"field": "email", "type": "integer", "required": True}, # mismatch - { - "field": "status", - "type": "string", - "enum": ["active", "inactive"], - }, # missing - ], - "strict_mode": False, - "case_insensitive": True, + "customers": { + "rules": [ + {"field": "email", "type": "integer", "required": True}, # mismatch + { + "field": "status", + "type": "string", + "enum": ["active", "inactive"], + }, # missing + ], + "strict_mode": False, + "case_insensitive": True, + } } rules_file = _write_rules(tmp_path, rules) r = E2ETestUtils.run_cli_command( - ["schema", db_url, "--rules", rules_file, "--output", "json"] + [ + "schema", + "--conn", + db_url, + "--rules", + rules_file, + "--output", + "json", + ] ) assert r.returncode in {1, 0} try: @@ -132,16 +160,26 @@ def test_drift_missing_and_type_mismatch(tmp_path: Path, db_url: str) -> None: @pytest.mark.parametrize("db_url", _param_db_urls()) def test_strict_mode_extras_json(tmp_path: Path, db_url: str) -> None: rules = { - "rules": [ - {"field": "id", "type": "integer"}, - ], - "strict_mode": True, - "case_insensitive": 
True, + "customers": { + "rules": [ + {"field": "id", "type": "integer"}, + ], + "strict_mode": True, + "case_insensitive": True, + } } rules_file = _write_rules(tmp_path, rules) r = E2ETestUtils.run_cli_command( - ["schema", db_url, "--rules", rules_file, "--output", "json"] + [ + "schema", + "--conn", + db_url, + "--rules", + rules_file, + "--output", + "json", + ] ) try: payload = json.loads(r.stdout) @@ -161,7 +199,15 @@ def test_empty_rules_minimal_payload(tmp_path: Path) -> None: rules_file = _write_rules(tmp_path, {"rules": []}) r = E2ETestUtils.run_cli_command( - ["schema", str(data_file), "--rules", rules_file, "--output", "json"] + [ + "schema", + "--conn", + str(data_file), + "--rules", + rules_file, + "--output", + "json", + ] ) assert r.returncode == 0 payload = json.loads(r.stdout) diff --git a/tests/integration/engine/test_engine_cli_integration.py b/tests/integration/engine/test_engine_cli_integration.py index 0793891..6bb9da7 100644 --- a/tests/integration/engine/test_engine_cli_integration.py +++ b/tests/integration/engine/test_engine_cli_integration.py @@ -215,7 +215,15 @@ def test_complete_cli_to_engine_workflow_success( # Execute CLI command result = cli_runner.invoke( check_command, - [test_csv_data, "--rules", validation_rules_file, "--verbose"], + [ + "--conn", + test_csv_data, + "--table", + "test_data", + "--rules", + validation_rules_file, + "--verbose", + ], ) # Verify CLI executed successfully @@ -289,7 +297,10 @@ def test_cli_to_engine_validation_failures( result = cli_runner.invoke( check_command, [ + "--conn", test_csv_data, + "--table", + "test_data", "--rule", "not_null(name)", "--rule", @@ -338,7 +349,8 @@ def test_cli_to_engine_error_propagation( # Execute CLI command result = cli_runner.invoke( - check_command, [test_csv_data, "--rule", "not_null(id)"] + check_command, + ["--conn", test_csv_data, "--table", "test_data", "--rule", "not_null(id)"], ) # CLI should handle the error gracefully @@ -428,7 +440,10 @@ def 
run_scalability_test(rule_count: int, record_count: int) -> None: cli_result = cli_runner.invoke( check_command, [ + "--conn", test_data, + "--table", + "test_data", *[item for rule in inline_rules for item in ["--rule", rule]], ], ) @@ -527,7 +542,10 @@ def test_cli_engine_performance_monitoring( func=lambda: cli_runner.invoke( check_command, [ + "--conn", large_dataset, + "--table", + "large_dataset", "--rule", "not_null(id)", "--rule", @@ -549,7 +567,10 @@ def test_cli_engine_performance_monitoring( result = cli_runner.invoke( check_command, [ + "--conn", large_dataset, + "--table", + "large_dataset", "--rule", "not_null(id)", "--rule", @@ -649,7 +670,8 @@ def test_cli_engine_empty_dataset_handling( try: # Execute CLI command on empty data result = cli_runner.invoke( - check_command, [empty_csv, "--rule", "not_null(id)"] + check_command, + ["--conn", empty_csv, "--table", "test_data", "--rule", "not_null(id)"], ) # Should handle empty data gracefully @@ -684,7 +706,10 @@ def run_cli_subprocess(idx: int) -> None: sys.executable, "cli_main.py", "check", + "--conn", test_csv_data, + "--table", + "test_data", "--rule", "not_null(id)", "--quiet", diff --git a/tests/shared/builders/test_builders.py b/tests/shared/builders/test_builders.py index 2a02f8a..5a82b22 100644 --- a/tests/shared/builders/test_builders.py +++ b/tests/shared/builders/test_builders.py @@ -211,6 +211,7 @@ def __init__(self) -> None: self._username = "test_user" self._password = "test_pass" self._db_schema = "test_schema" + self._available_tables: Optional[List[str]] = None self._file_path: Optional[str] = None self._parameters: Dict[str, Any] = {} @@ -236,6 +237,12 @@ def with_database(self, db_name: str) -> "TestDataBuilder.ConnectionBuilder": self._db_name = db_name return self + def with_available_tables( + self, table_name: str + ) -> "TestDataBuilder.ConnectionBuilder": + self._available_tables = [table_name] + return self + def with_credentials( self, username: str, password: str ) -> 
"TestDataBuilder.ConnectionBuilder": @@ -275,6 +282,7 @@ def build(self) -> ConnectionSchema: db_schema=self._db_schema, file_path=self._file_path, parameters=self._parameters, + available_tables=self._available_tables, capabilities=DataSourceCapability(supports_sql=True), cross_db_settings=None, ) diff --git a/tests/unit/cli/commands/test_check_command.py b/tests/unit/cli/commands/test_check_command.py index 2747201..acb041b 100644 --- a/tests/unit/cli/commands/test_check_command.py +++ b/tests/unit/cli/commands/test_check_command.py @@ -90,6 +90,7 @@ def validation_rules(self) -> List[RuleSchema]: # === MODERN SUCCESS FLOW TESTS === @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") @patch("cli.commands.check.SourceParser") @patch("cli.commands.check.RuleParser") @patch("cli.commands.check.DataValidator") @@ -100,6 +101,7 @@ def test_csv_file_check_modern_success( mock_validator: Mock, mock_rule_parser: Mock, mock_source_parser: Mock, + mock_core_config: Mock, mock_cli_config: Mock, runner: CliRunner, sample_csv_data: str, @@ -109,6 +111,7 @@ def test_csv_file_check_modern_success( # Setup using Contract Testing mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() # Source parsing with Builder Pattern source_connection = ( @@ -143,9 +146,10 @@ def test_csv_file_check_modern_success( # Contract-compliant formatter mock mock_formatter.return_value = Mock() - # Execute command + # Execute command with new interface result = runner.invoke( - check_command, [sample_csv_data, "--rule", "not_null(id)"] + check_command, + ["--conn", sample_csv_data, "--table", "users", "--rule", "not_null(id)"], ) # Verify execution @@ -153,12 +157,13 @@ def test_csv_file_check_modern_success( # Verify call patterns mock_source_parser.return_value.parse_source.assert_called_once_with( - sample_csv_data + sample_csv_data, "users" ) mock_rule_parser.return_value.parse_rules.assert_called_once() 
mock_validator_instance.validate.assert_called_once() @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") @patch("cli.commands.check.SourceParser") @patch("cli.commands.check.RuleParser") @patch("cli.commands.check.DataValidator") @@ -169,15 +174,17 @@ def test_database_url_check_modern_success( mock_validator: Mock, mock_rule_parser: Mock, mock_source_parser: Mock, + mock_core_config: Mock, mock_cli_config: Mock, runner: CliRunner, ) -> None: """Modern database URL check with enhanced Builder Pattern""" - db_url = "mysql://testuser:testpass@localhost/testdb.users" + db_url = "mysql://testuser:testpass@localhost/testdb" # Modern component setup mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() # Database connection with Builder Pattern db_connection = ( @@ -213,8 +220,11 @@ def test_database_url_check_modern_success( mock_validator.return_value = mock_validator_instance mock_formatter.return_value = Mock() - # Execute command - result = runner.invoke(check_command, [db_url, "--rule", "not_null(id)"]) + # Execute command with new interface + result = runner.invoke( + check_command, + ["--conn", db_url, "--table", "users", "--rule", "not_null(id)"], + ) # Verify success assert result.exit_code == 0 @@ -222,6 +232,7 @@ def test_database_url_check_modern_success( # === MODERN FAILURE FLOW TESTS === @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") @patch("cli.commands.check.SourceParser") @patch("cli.commands.check.RuleParser") @patch("cli.commands.check.DataValidator") @@ -232,6 +243,7 @@ def test_validation_failures_with_samples( mock_validator: Mock, mock_rule_parser: Mock, mock_source_parser: Mock, + mock_core_config: Mock, mock_cli_config: Mock, runner: CliRunner, sample_csv_data: str, @@ -240,6 +252,7 @@ def test_validation_failures_with_samples( # Setup components mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() 
source_connection = ( TestDataBuilder.connection() .with_type(ConnectionType.CSV) @@ -277,9 +290,18 @@ def test_validation_failures_with_samples( mock_validator.return_value = mock_validator_instance mock_formatter.return_value = Mock() - # Execute with verbose flag + # Execute with verbose flag using new interface result = runner.invoke( - check_command, [sample_csv_data, "--rule", "length(name,2,50)", "--verbose"] + check_command, + [ + "--conn", + sample_csv_data, + "--table", + "users", + "--rule", + "length(name,2,50)", + "--verbose", + ], ) # Modify the assertion to check for successful command execution instead of relying solely on the exit code. @@ -296,7 +318,8 @@ def test_file_not_found_modern_error(self, runner: CliRunner) -> None: nonexistent_file = "nonexistent_file.csv" result = runner.invoke( - check_command, [nonexistent_file, "--rule", "not_null(id)"] + check_command, + ["--conn", nonexistent_file, "--table", "users", "--rule", "not_null(id)"], ) assert result.exit_code == 20 @@ -309,7 +332,10 @@ def test_invalid_rule_syntax_modern_error( """Modern rule syntax error with helpful corrections""" invalid_rule = "not_nul(id)" # Typo - result = runner.invoke(check_command, [sample_csv_data, "--rule", invalid_rule]) + result = runner.invoke( + check_command, + ["--conn", sample_csv_data, "--table", "users", "--rule", invalid_rule], + ) assert result.exit_code == 26 # Check for erroneous output. 
@@ -324,7 +350,15 @@ def test_permission_denied_modern_error(self, runner: CliRunner) -> None: ) result = runner.invoke( - check_command, ["/restricted/data.csv", "--rule", "not_null(id)"] + check_command, + [ + "--conn", + "/restricted/data.csv", + "--table", + "users", + "--rule", + "not_null(id)", + ], ) assert result.exit_code == 21 @@ -343,7 +377,8 @@ def test_empty_file_modern_handling(self, runner: CliRunner) -> None: try: result = runner.invoke( - check_command, [empty_file, "--rule", "not_null(id)"] + check_command, + ["--conn", empty_file, "--table", "users", "--rule", "not_null(id)"], ) # Verify command execution and return the error code. @@ -374,7 +409,15 @@ def test_unicode_file_names_modern_support(self, runner: CliRunner) -> None: Path(temp_path).rename(unicode_path) result = runner.invoke( - check_command, [str(unicode_path), "--rule", "not_null(id)"] + check_command, + [ + "--conn", + str(unicode_path), + "--table", + "users", + "--rule", + "not_null(id)", + ], ) # Should handle Unicode filenames @@ -477,7 +520,15 @@ def run_with_filename(filename: str) -> None: # Executes the command. runner = CliRunner() result = runner.invoke( - check_command, [f"test_{filename}.csv", "--rule", "not_null(id)"] + check_command, + [ + "--conn", + f"test_{filename}.csv", + "--table", + "users", + "--rule", + "not_null(id)", + ], ) # Verify successful command execution. 
@@ -516,7 +567,16 @@ def test_large_dataset_modern_performance(self, runner: CliRunner) -> None: result = runner.invoke( check_command, - [large_data, "--rule", "not_null(id)", "--rule", "unique(email)"], + [ + "--conn", + large_data, + "--table", + "users", + "--rule", + "not_null(id)", + "--rule", + "unique(email)", + ], ) end_time = time.time() @@ -545,7 +605,10 @@ def test_memory_usage_modern_monitoring( result = runner.invoke( check_command, [ + "--conn", sample_csv_data, + "--table", + "users", "--rule", "not_null(id)", "--rule", @@ -594,7 +657,16 @@ def test_end_to_end_workflow_modern(self, runner: CliRunner) -> None: try: # Execute complete workflow result = runner.invoke( - check_command, [test_data, "--rules", rules_file, "--verbose"] + check_command, + [ + "--conn", + test_data, + "--table", + "users", + "--rules", + rules_file, + "--verbose", + ], ) # Verify command execution. diff --git a/tests/unit/cli/commands/test_check_command_new_interface.py b/tests/unit/cli/commands/test_check_command_new_interface.py new file mode 100644 index 0000000..7a9585d --- /dev/null +++ b/tests/unit/cli/commands/test_check_command_new_interface.py @@ -0,0 +1,648 @@ +""" +🧙‍♂️ Check Command New Interface Tests + +Tests for the new --conn and --table options in the check command. +This file focuses on testing the new interface functionality. 
+""" + +import json +import tempfile +from pathlib import Path +from typing import Any, Dict, List +from unittest.mock import AsyncMock, Mock, patch + +import pytest +from click.testing import CliRunner + +from cli.commands.check import check_command +from tests.shared.builders.test_builders import TestDataBuilder +from tests.shared.contracts.test_contracts import MockContract + + +class TestCheckCommandNewInterface: + """Test suite for the new --conn and --table interface""" + + @pytest.fixture + def runner(self) -> CliRunner: + """CLI test runner""" + return CliRunner() + + @pytest.fixture + def mock_components(self) -> Dict[str, Any]: + """Mock components using Contract Testing""" + return { + "config_manager": MockContract.create_config_manager_mock(), + "source_parser": MockContract.create_source_parser_mock(), + "rule_parser": MockContract.create_rule_parser_mock(), + "data_validator": MockContract.create_data_validator_mock(), + "output_formatter": MockContract.create_output_formatter_mock(), + } + + @pytest.fixture + def sample_csv_data(self) -> str: + """CSV test data""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + f.write("id,name,email,age\n") + f.write("1,John,john@test.com,25\n") + f.write("2,Jane,jane@test.com,30\n") + temp_file = f.name + return temp_file + + @pytest.fixture + def sample_rules_file(self) -> str: + """Sample rules file""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump( + { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True}, + ] + }, + f, + ) + temp_file = f.name + return temp_file + + # === NEW INTERFACE TESTS === + + @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") + @patch("cli.commands.check.SourceParser") + @patch("cli.commands.check.RuleParser") + @patch("cli.commands.check.DataValidator") + 
@patch("cli.commands.check.OutputFormatter") + def test_new_interface_with_conn_and_table( + self, + mock_formatter: Mock, + mock_validator: Mock, + mock_rule_parser: Mock, + mock_source_parser: Mock, + mock_core_config: Mock, + mock_cli_config: Mock, + runner: CliRunner, + sample_csv_data: str, + sample_rules_file: str, + mock_components: Dict[str, Any], + ) -> None: + """Test the new --conn and --table interface""" + # Setup mocks using the same pattern as successful tests + mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() + + # Source parsing mock + source_connection = Mock() + mock_source_parser.return_value.parse_source.return_value = source_connection + + # Rule parsing mock + rules = [Mock()] # Create a mock rule + mock_rule_parser.return_value.parse_rules.return_value = rules + + # Validation results mock + validation_results = [Mock()] + mock_validator_instance = AsyncMock() + mock_validator_instance.validate.return_value = validation_results + mock_validator.return_value = mock_validator_instance + + # Formatter mock + mock_formatter.return_value = Mock() + + # Execute command with new interface + result = runner.invoke( + check_command, + [ + "--conn", + sample_csv_data, + "--table", + "users", + "--rules", + sample_rules_file, + ], + ) + + # Verify success + assert result.exit_code == 0 + assert "Starting validation" in result.output + + @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") + @patch("cli.commands.check.SourceParser") + @patch("cli.commands.check.RuleParser") + @patch("cli.commands.check.DataValidator") + @patch("cli.commands.check.OutputFormatter") + def test_new_interface_missing_table( + self, + mock_formatter: Mock, + mock_validator: Mock, + mock_rule_parser: Mock, + mock_source_parser: Mock, + mock_core_config: Mock, + mock_cli_config: Mock, + runner: CliRunner, + sample_csv_data: str, + ) -> None: + """Test that --table is required when using --conn""" + # Execute 
command with --conn but no --table + result = runner.invoke(check_command, ["--conn", sample_csv_data]) + + # Verify error + assert result.exit_code == 2 # Click error exit code + assert "Missing option '--table'" in result.output + + @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") + @patch("cli.commands.check.SourceParser") + @patch("cli.commands.check.RuleParser") + @patch("cli.commands.check.DataValidator") + @patch("cli.commands.check.OutputFormatter") + def test_new_interface_missing_conn( + self, + mock_formatter: Mock, + mock_validator: Mock, + mock_rule_parser: Mock, + mock_source_parser: Mock, + mock_core_config: Mock, + mock_cli_config: Mock, + runner: CliRunner, + ) -> None: + """Test that --conn is required when using --table""" + # Execute command with --table but no --conn + result = runner.invoke(check_command, ["--table", "users"]) + + # Verify error + assert result.exit_code == 2 # Click error exit code + assert "Missing option '--conn'" in result.output + + @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") + @patch("cli.commands.check.SourceParser") + @patch("cli.commands.check.RuleParser") + @patch("cli.commands.check.DataValidator") + @patch("cli.commands.check.OutputFormatter") + def test_new_interface_with_inline_rules( + self, + mock_formatter: Mock, + mock_validator: Mock, + mock_rule_parser: Mock, + mock_source_parser: Mock, + mock_core_config: Mock, + mock_cli_config: Mock, + runner: CliRunner, + sample_csv_data: str, + mock_components: Dict[str, Any], + ) -> None: + """Test new interface with inline rules""" + # Setup mocks using the same pattern as successful tests + mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() + + # Source parsing mock + source_connection = Mock() + mock_source_parser.return_value.parse_source.return_value = source_connection + + # Rule parsing mock + rules = [Mock()] # Create a mock rule + 
mock_rule_parser.return_value.parse_rules.return_value = rules + + # Validation results mock + validation_results = [Mock()] + mock_validator_instance = AsyncMock() + mock_validator_instance.validate.return_value = validation_results + mock_validator.return_value = mock_validator_instance + + # Formatter mock + mock_formatter.return_value = Mock() + + # Execute command with new interface and inline rules + result = runner.invoke( + check_command, + [ + "--conn", + sample_csv_data, + "--table", + "users", + "--rule", + "not_null(id)", + "--rule", + "length(name, 2, 50)", + ], + ) + + # Verify success + assert result.exit_code == 0 + assert "Starting validation" in result.output + + @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") + @patch("cli.commands.check.SourceParser") + @patch("cli.commands.check.RuleParser") + @patch("cli.commands.check.DataValidator") + @patch("cli.commands.check.OutputFormatter") + def test_new_interface_with_database_connection( + self, + mock_formatter: Mock, + mock_validator: Mock, + mock_rule_parser: Mock, + mock_source_parser: Mock, + mock_core_config: Mock, + mock_cli_config: Mock, + runner: CliRunner, + sample_rules_file: str, + mock_components: Dict[str, Any], + ) -> None: + """Test new interface with database connection""" + # Setup mocks using the same pattern as successful tests + mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() + + # Source parsing mock + source_connection = Mock() + mock_source_parser.return_value.parse_source.return_value = source_connection + + # Rule parsing mock + rules = [Mock()] # Create a mock rule + mock_rule_parser.return_value.parse_rules.return_value = rules + + # Validation results mock + validation_results = [Mock()] + mock_validator_instance = AsyncMock() + mock_validator_instance.validate.return_value = validation_results + mock_validator.return_value = mock_validator_instance + + # Formatter mock + mock_formatter.return_value 
= Mock() + + # Execute command with database connection + result = runner.invoke( + check_command, + [ + "--conn", + "mysql://user:pass@host/db", + "--table", + "customers", + "--rules", + sample_rules_file, + ], + ) + + # Verify success + assert result.exit_code == 0 + assert "Starting validation" in result.output + + @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") + @patch("cli.commands.check.SourceParser") + @patch("cli.commands.check.RuleParser") + @patch("cli.commands.check.DataValidator") + @patch("cli.commands.check.OutputFormatter") + def test_new_interface_with_sqlite_file( + self, + mock_formatter: Mock, + mock_validator: Mock, + mock_rule_parser: Mock, + mock_source_parser: Mock, + mock_core_config: Mock, + mock_cli_config: Mock, + runner: CliRunner, + sample_rules_file: str, + mock_components: Dict[str, Any], + ) -> None: + """Test new interface with SQLite file""" + # Setup mocks using the same pattern as successful tests + mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() + + # Source parsing mock + source_connection = Mock() + mock_source_parser.return_value.parse_source.return_value = source_connection + + # Rule parsing mock + rules = [Mock()] # Create a mock rule + mock_rule_parser.return_value.parse_rules.return_value = rules + + # Validation results mock + validation_results = [Mock()] + mock_validator_instance = AsyncMock() + mock_validator_instance.validate.return_value = validation_results + mock_validator.return_value = mock_validator_instance + + # Formatter mock + mock_formatter.return_value = Mock() + + # Execute command with SQLite file + result = runner.invoke( + check_command, + [ + "--conn", + "sqlite:///path/to/database.db", + "--table", + "orders", + "--rules", + sample_rules_file, + ], + ) + + # Verify success + assert result.exit_code == 0 + assert "Starting validation" in result.output + + # === ERROR HANDLING TESTS === + + 
@patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") + @patch("cli.commands.check.SourceParser") + @patch("cli.commands.check.RuleParser") + @patch("cli.commands.check.DataValidator") + @patch("cli.commands.check.OutputFormatter") + def test_new_interface_no_rules_specified( + self, + mock_formatter: Mock, + mock_validator: Mock, + mock_rule_parser: Mock, + mock_source_parser: Mock, + mock_core_config: Mock, + mock_cli_config: Mock, + runner: CliRunner, + sample_csv_data: str, + mock_components: Dict[str, Any], + ) -> None: + """Test error when no rules are specified""" + # Execute command without rules + result = runner.invoke( + check_command, ["--conn", sample_csv_data, "--table", "users"] + ) + + # Verify error + assert result.exit_code == 2 # Click error exit code + assert "No rules specified" in result.output + + @patch("cli.commands.check.get_cli_config") + @patch("cli.commands.check.get_core_config") + @patch("cli.commands.check.SourceParser") + @patch("cli.commands.check.RuleParser") + @patch("cli.commands.check.DataValidator") + @patch("cli.commands.check.OutputFormatter") + def test_new_interface_empty_file( + self, + mock_formatter: Mock, + mock_validator: Mock, + mock_rule_parser: Mock, + mock_source_parser: Mock, + mock_core_config: Mock, + mock_cli_config: Mock, + runner: CliRunner, + sample_rules_file: str, + mock_components: Dict[str, Any], + ) -> None: + """Test error when source file is empty""" + # Create empty file + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + temp_file = f.name + + # Execute command with empty file + result = runner.invoke( + check_command, + ["--conn", temp_file, "--table", "users", "--rules", sample_rules_file], + ) + + # Verify error + assert result.exit_code > 0 # Any non-zero exit code indicates error + assert "is empty" in result.output + + # Cleanup + Path(temp_file).unlink(missing_ok=True) + + def 
test_table_name_parameter_passed_to_source_parser( + self, + runner: CliRunner, + ) -> None: + """Test that table_name parameter is correctly passed to SourceParser.parse_source""" + with patch("cli.commands.check.SourceParser") as mock_source_parser_class: + # Setup mock + mock_source_parser = Mock() + mock_source_parser_class.return_value = mock_source_parser + + # Create mock source config + mock_source_config = Mock() + mock_source_parser.parse_source.return_value = mock_source_config + + # Mock other components + with patch("cli.commands.check.RuleParser") as mock_rule_parser_class: + with patch("cli.commands.check.DataValidator") as mock_validator_class: + with patch( + "cli.commands.check.OutputFormatter" + ) as mock_formatter_class: + with patch( + "cli.commands.check.get_cli_config" + ) as mock_cli_config: + with patch( + "cli.commands.check.get_core_config" + ) as mock_core_config: + with patch("asyncio.run") as mock_asyncio_run: + # Setup mocks + mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() + + # Create mock rule + mock_rule = Mock() + mock_rule_parser_class.return_value.parse_rules.return_value = [ + mock_rule + ] + + # Create mock validation result + mock_result = Mock() + mock_validator_instance = Mock() + mock_validator_instance.validate.return_value = [ + mock_result + ] + mock_validator_class.return_value = ( + mock_validator_instance + ) + + # Create mock formatter + mock_formatter = Mock() + mock_formatter_class.return_value = mock_formatter + + # Mock asyncio.run + mock_asyncio_run.return_value = [mock_result] + + # Run the command + result = runner.invoke( + check_command, + [ + "--conn", + "test.csv", + "--table", + "customers", + "--rule", + "not_null(id)", + ], + ) + + # Verify that parse_source was called with both connection_string and table_name + mock_source_parser.parse_source.assert_called_once_with( + "test.csv", "customers" + ) + + # Verify success + assert result.exit_code == 0 + + def 
test_table_name_parameter_with_database_connection( + self, + runner: CliRunner, + ) -> None: + """Test that table_name parameter is correctly passed when using database connection""" + with patch("cli.commands.check.SourceParser") as mock_source_parser_class: + # Setup mock + mock_source_parser = Mock() + mock_source_parser_class.return_value = mock_source_parser + + # Create mock source config + mock_source_config = Mock() + mock_source_parser.parse_source.return_value = mock_source_config + + # Mock other components + with patch("cli.commands.check.RuleParser") as mock_rule_parser_class: + with patch("cli.commands.check.DataValidator") as mock_validator_class: + with patch( + "cli.commands.check.OutputFormatter" + ) as mock_formatter_class: + with patch( + "cli.commands.check.get_cli_config" + ) as mock_cli_config: + with patch( + "cli.commands.check.get_core_config" + ) as mock_core_config: + with patch("asyncio.run") as mock_asyncio_run: + # Setup mocks + mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() + + # Create mock rule + mock_rule = Mock() + mock_rule_parser_class.return_value.parse_rules.return_value = [ + mock_rule + ] + + # Create mock validation result + mock_result = Mock() + mock_validator_instance = Mock() + mock_validator_instance.validate.return_value = [ + mock_result + ] + mock_validator_class.return_value = ( + mock_validator_instance + ) + + # Create mock formatter + mock_formatter = Mock() + mock_formatter_class.return_value = mock_formatter + + # Mock asyncio.run + mock_asyncio_run.return_value = [mock_result] + + # Run the command with database connection + db_url = "postgresql://user:pass@host/db" + table_name = "customers" + + result = runner.invoke( + check_command, + [ + "--conn", + db_url, + "--table", + table_name, + "--rule", + "not_null(id)", + ], + ) + + # Verify that parse_source was called with both db_url and table_name + mock_source_parser.parse_source.assert_called_once_with( + db_url, 
table_name + ) + + # Verify success + assert result.exit_code == 0 + + def test_table_name_parameter_overrides_url_table( + self, + runner: CliRunner, + ) -> None: + """Test that --table parameter overrides table name from URL when both are present""" + with patch("cli.commands.check.SourceParser") as mock_source_parser_class: + # Setup mock + mock_source_parser = Mock() + mock_source_parser_class.return_value = mock_source_parser + + # Create mock source config + mock_source_config = Mock() + mock_source_parser.parse_source.return_value = mock_source_config + + # Mock other components + with patch("cli.commands.check.RuleParser") as mock_rule_parser_class: + with patch("cli.commands.check.DataValidator") as mock_validator_class: + with patch( + "cli.commands.check.OutputFormatter" + ) as mock_formatter_class: + with patch( + "cli.commands.check.get_cli_config" + ) as mock_cli_config: + with patch( + "cli.commands.check.get_core_config" + ) as mock_core_config: + with patch("asyncio.run") as mock_asyncio_run: + # Setup mocks + mock_cli_config.return_value = Mock() + mock_core_config.return_value = Mock() + + # Create mock rule + mock_rule = Mock() + mock_rule_parser_class.return_value.parse_rules.return_value = [ + mock_rule + ] + + # Create mock validation result + mock_result = Mock() + mock_validator_instance = Mock() + mock_validator_instance.validate.return_value = [ + mock_result + ] + mock_validator_class.return_value = ( + mock_validator_instance + ) + + # Create mock formatter + mock_formatter = Mock() + mock_formatter_class.return_value = mock_formatter + + # Mock asyncio.run + mock_asyncio_run.return_value = [mock_result] + + # Run the command with URL that already contains table name + # URL has "users" table, but we specify "customers" table + db_url_with_table = ( + "postgresql://user:pass@host/db.users" + ) + override_table_name = "customers" + + result = runner.invoke( + check_command, + [ + "--conn", + db_url_with_table, + "--table", + 
override_table_name, + "--rule", + "not_null(id)", + ], + ) + + # Verify that parse_source was called with URL and override table name + # The --table parameter should take precedence over URL table + mock_source_parser.parse_source.assert_called_once_with( + db_url_with_table, override_table_name + ) + + # Verify success + assert result.exit_code == 0 diff --git a/tests/unit/cli/commands/test_schema_command.py b/tests/unit/cli/commands/test_schema_command.py index 88a8d72..dc94e91 100644 --- a/tests/unit/cli/commands/test_schema_command.py +++ b/tests/unit/cli/commands/test_schema_command.py @@ -11,6 +11,7 @@ from cli.app import cli_app from cli.core.data_validator import ExecutionResultSchema +from shared.enums.connection_types import ConnectionType def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: @@ -38,17 +39,22 @@ def test_schema_requires_source_and_rules(self, tmp_path: Path) -> None: rules_obj: dict[str, list[dict[str, Any]]] = {"rules": []} rules_path = _write_tmp_file(tmp_path, "schema.json", json.dumps(rules_obj)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code == 0 assert "Checking" in result.output def test_output_json_mode(self, tmp_path: Path) -> None: runner = CliRunner() data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") - rules_path = _write_tmp_file(tmp_path, "schema.json", json.dumps({"rules": []})) + rules_path = _write_tmp_file( + tmp_path, "schema.json", json.dumps({"user": {"rules": []}}) + ) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) assert result.exit_code == 0 payload = json.loads(result.output) @@ -72,12 +78,28 @@ def test_output_json_declared_columns_always_listed( .build() ) + # Create a mock ConnectionSchema for 
testing + mock_source_config = ( + test_builders.TestDataBuilder.connection() + .with_type(ConnectionType.CSV) + .with_database("test_db") + .with_available_tables("test_table") + .with_parameters({}) + .build() + ) + monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: [schema_rule], + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: [schema_rule], ) class DummyValidator: + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ) -> None: + # Accept constructor arguments but ignore them + pass + async def validate(self) -> list[ExecutionResultSchema]: # Return no results to simulate missing schema details return [] @@ -89,11 +111,12 @@ async def validate(self) -> list[ExecutionResultSchema]: rules_path = _write_tmp_file( tmp_path, "schema.json", - json.dumps({"rules": [{"field": "id", "type": "integer"}]}), + json.dumps({"data": {"rules": [{"field": "id", "type": "integer"}]}}), ) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) # No failures but explicit -- in this setup lack of results implies exit 0 assert result.exit_code == 0 @@ -116,6 +139,7 @@ def test_fail_on_error_sets_exit_code_1(self, tmp_path: Path) -> None: cli_app, [ "schema", + "--conn", data_path, "--rules", rules_path, @@ -131,7 +155,7 @@ def test_invalid_rules_json_yields_usage_error(self, tmp_path: Path) -> None: bad_rules_path = _write_tmp_file(tmp_path, "bad.json", "{invalid json}") result = runner.invoke( - cli_app, ["schema", data_path, "--rules", bad_rules_path] + cli_app, ["schema", "--conn", data_path, "--rules", bad_rules_path] ) # Click usage error exit code is >= 2 @@ -149,29 +173,33 @@ def test_warn_on_top_level_table_ignored(self, tmp_path: Path) -> None: runner = CliRunner() data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") 
rules = { - "table": "users", - "rules": [ - {"field": "id", "type": "integer", "required": True}, - ], + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + ] + } } rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(rules)) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) # exit code from skeleton remains success assert result.exit_code == 0 - # warning emitted to stderr - assert "table' is ignored" in (result.stderr or "") + # Since multi-table has been supported,so no warning emitted to stderr + # assert "table' is ignored" in (result.stderr or "") def test_rules_must_be_array(self, tmp_path: Path) -> None: runner = CliRunner() data_path = self._write_tmp_file(tmp_path, "data.csv", "id\n1\n") rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps({})) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 - assert "must be an array" in result.output + assert "must have a 'rules' array" in result.output def test_rules_item_requires_field(self, tmp_path: Path) -> None: runner = CliRunner() @@ -179,7 +207,9 @@ def test_rules_item_requires_field(self, tmp_path: Path) -> None: bad = {"rules": [{"type": "integer"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "field must be a non-empty string" in result.output @@ -189,7 +219,9 @@ def test_type_must_be_supported_string(self, tmp_path: Path) -> None: bad = {"rules": [{"field": "id", "type": "number"}]} rules_path = 
self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "type 'number' is not supported" in result.output @@ -199,7 +231,9 @@ def test_required_must_be_boolean(self, tmp_path: Path) -> None: bad = {"rules": [{"field": "id", "required": "yes"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "required must be a boolean" in result.output @@ -209,7 +243,9 @@ def test_enum_must_be_array(self, tmp_path: Path) -> None: bad = {"rules": [{"field": "flag", "enum": "01"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "enum must be an array" in result.output @@ -219,6 +255,8 @@ def test_min_max_must_be_numeric(self, tmp_path: Path) -> None: bad = {"rules": [{"field": "age", "type": "integer", "min": "0"}]} rules_path = self._write_tmp_file(tmp_path, "schema.json", json.dumps(bad)) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "min must be numeric" in result.output diff --git a/tests/unit/cli/commands/test_schema_command_extended.py b/tests/unit/cli/commands/test_schema_command_extended.py index 9c366c5..57ded12 100644 --- a/tests/unit/cli/commands/test_schema_command_extended.py +++ 
b/tests/unit/cli/commands/test_schema_command_extended.py @@ -8,9 +8,16 @@ from click.testing import CliRunner from cli.app import cli_app -from shared.enums import RuleAction, RuleCategory, RuleType, SeverityLevel +from shared.enums import ( + ConnectionType, + RuleAction, + RuleCategory, + RuleType, + SeverityLevel, +) from shared.schema.base import RuleTarget, TargetEntity from shared.schema.rule_schema import RuleSchema +from tests.shared.builders import test_builders def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: @@ -71,7 +78,7 @@ def test_map_type_names_are_case_insensitive_and_validated( _map_type_name_to_datatype("number") def test_decompose_to_atomic_rules_structure(self, tmp_path: Path) -> None: - from cli.commands.schema import _decompose_to_atomic_rules + from cli.commands.schema import _decompose_schema_payload payload = { "strict_mode": True, @@ -82,8 +89,16 @@ def test_decompose_to_atomic_rules_structure(self, tmp_path: Path) -> None: {"field": "status", "enum": ["A", "B"]}, ], } - - rules = _decompose_to_atomic_rules(payload) + # Create a mock ConnectionSchema for testing + mock_source_config = ( + test_builders.TestDataBuilder.connection() + .with_type(ConnectionType.CSV) + .with_database("test_db") + .with_available_tables("test_table") + .with_parameters({}) + .build() + ) + rules = _decompose_schema_payload(payload, mock_source_config) # First rule should be SCHEMA when any columns declared assert rules[0].type == RuleType.SCHEMA @@ -134,20 +149,23 @@ def test_prioritization_skip_map(self) -> None: atomic_rules = [schema, not_null_email, range_age] # Simulate SCHEMA execution details - schema_result = { - "execution_plan": { - "schema_details": { - "field_results": [ - {"column": "email", "failure_code": "TYPE_MISMATCH"}, - {"column": "age", "failure_code": "FIELD_MISSING"}, - {"column": "id", "failure_code": "NONE"}, - ] - } + schema_results = [ + { + "rule_id": str(schema.id), + "execution_plan": { + "schema_details": { + 
"field_results": [ + {"column": "email", "failure_code": "TYPE_MISMATCH"}, + {"column": "age", "failure_code": "FIELD_MISSING"}, + {"column": "id", "failure_code": "NONE"}, + ] + } + }, } - } + ] skip_map = _build_prioritized_atomic_status( - schema_result=schema_result, atomic_rules=atomic_rules + schema_results=schema_results, atomic_rules=atomic_rules ) # email dependent rules should be skipped for TYPE_MISMATCH @@ -188,8 +206,8 @@ def test_json_output_aggregation_and_skip_semantics( # Patch decomposition monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: atomic_rules, + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: atomic_rules, ) # Build SCHEMA and dependent rule results. Dependent rules are PASSED in raw @@ -237,7 +255,10 @@ def test_json_output_aggregation_and_skip_semantics( # Patch DataValidator.validate to return our results class DummyValidator: - def __init__(self, *args: Any, **kwargs: Any) -> None: # noqa: D401 + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ): + # Accept all required parameters but don't use them pass async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] @@ -262,7 +283,8 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] ) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) assert result.exit_code == 1 # schema failed -> non-zero @@ -313,8 +335,8 @@ def test_table_output_grouping_and_skips( atomic_rules = [schema, not_null_email, range_age] monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: atomic_rules, + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: atomic_rules, ) schema_result = { @@ -346,17 +368,19 @@ def test_table_output_grouping_and_skips( # 
Dependent rule raw statuses set to PASSED; should be skipped for display grouping not_null_email_result = { "rule_id": str(not_null_email.id), - "status": "PASSED", + "status": "SKIPPED", "dataset_metrics": [ {"entity_name": "x", "total_records": 10, "failed_records": 0} ], + "skip_reason": "TYPE_MISMATCH", } range_age_result = { "rule_id": str(range_age.id), - "status": "PASSED", + "status": "SKIPPED", "dataset_metrics": [ {"entity_name": "x", "total_records": 10, "failed_records": 0} ], + "skip_reason": "FIELD_MISSING", } class DummyValidator: @@ -383,7 +407,9 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] ), ) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code == 1 output = result.output @@ -396,18 +422,18 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] class TestSchemaValidationErrorsExtended: - def test_reject_tables_top_level(self, tmp_path: Path) -> None: - runner = CliRunner() - data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") - rules_path = _write_tmp_file( - tmp_path, - "schema.json", - json.dumps({"tables": {"users": []}, "rules": []}), - ) - - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) - assert result.exit_code >= 2 - assert "not supported in v1" in result.output + # def test_reject_tables_top_level(self, tmp_path: Path) -> None: + # runner = CliRunner() + # data_path = _write_tmp_file(tmp_path, "data.csv", "id\n1\n") + # rules_path = _write_tmp_file( + # tmp_path, + # "schema.json", + # json.dumps({"tables": {"users": []}, "rules": []}), + # ) + + # result = runner.invoke(cli_app, ["schema", "--conn", data_path, "--rules", rules_path]) + # assert result.exit_code >= 2 + # assert "not supported in v1" in result.output def test_enum_must_be_non_empty_array(self, tmp_path: Path) -> None: runner = 
CliRunner() @@ -418,6 +444,8 @@ def test_enum_must_be_non_empty_array(self, tmp_path: Path) -> None: json.dumps({"rules": [{"field": "status", "enum": []}]}), ) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code >= 2 assert "enum' must be a non-empty" in result.output diff --git a/tests/unit/cli/commands/test_schema_command_file_sources.py b/tests/unit/cli/commands/test_schema_command_file_sources.py index 0c799b1..8b8ee95 100644 --- a/tests/unit/cli/commands/test_schema_command_file_sources.py +++ b/tests/unit/cli/commands/test_schema_command_file_sources.py @@ -39,8 +39,8 @@ def test_csv_excel_to_sqlite_type_implications( {"reg_date": {"expected_type": "DATE"}, "ts": {"expected_type": "DATETIME"}} ) monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: [schema_rule], + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: [schema_rule], ) # Build SCHEMA result indicating SQLite TEXT types cause TYPE_MISMATCH @@ -72,6 +72,12 @@ def test_csv_excel_to_sqlite_type_implications( } class DummyValidator: + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ) -> None: + # Accept all required parameters but don't use them + pass + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] return [schema_result] @@ -98,7 +104,8 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] runner = CliRunner() result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) assert result.exit_code == 1 diff --git a/tests/unit/cli/commands/test_schema_command_json_extras.py b/tests/unit/cli/commands/test_schema_command_json_extras.py index 2d948ae..d2f7100 100644 --- 
a/tests/unit/cli/commands/test_schema_command_json_extras.py +++ b/tests/unit/cli/commands/test_schema_command_json_extras.py @@ -43,8 +43,8 @@ def test_json_includes_schema_extras_and_summary_counts( } ) monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: [schema_rule], + "cli.commands.schema._decompose_schema_payload", + lambda payload, source_config: [schema_rule], ) # Results: SCHEMA failed with 1 type mismatch, 0 existence failures, extras present @@ -76,6 +76,12 @@ def test_json_includes_schema_extras_and_summary_counts( } class DummyValidator: + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ) -> None: + # Accept all required parameters but don't use them + pass + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] return [schema_result] @@ -97,10 +103,23 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] ) result = runner.invoke( - cli_app, ["schema", data_path, "--rules", rules_path, "--output", "json"] + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], ) assert result.exit_code == 1 - payload = json.loads(result.output) + + # Extract JSON part from output (skip warning messages) + output_lines = result.output.strip().split("\n") + json_line = None + for line in output_lines: + if line.strip().startswith("{"): + json_line = line.strip() + break + + if not json_line: + raise ValueError("No JSON output found in result") + + payload = json.loads(json_line) # schema_extras must present, sorted by CLI before emission assert payload.get("schema_extras") == ["aaa_extra", "zzz_extra"] @@ -115,8 +134,8 @@ def test_table_output_does_not_emit_schema_extras_key( ) -> None: schema_rule = _schema_rule_with({"id": {"expected_type": "INTEGER"}}) monkeypatch.setattr( - "cli.commands.schema._decompose_to_atomic_rules", - lambda payload: [schema_rule], + "cli.commands.schema._decompose_schema_payload", + 
lambda payload, source_config: [schema_rule], ) schema_result = { @@ -131,6 +150,12 @@ def test_table_output_does_not_emit_schema_extras_key( } class DummyValidator: + def __init__( + self, source_config: Any, rules: Any, core_config: Any, cli_config: Any + ) -> None: + # Accept all required parameters but don't use them + pass + async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] return [schema_result] @@ -143,7 +168,9 @@ async def validate(self) -> List[Dict[str, Any]]: # type: ignore[override] "schema.json", json.dumps({"rules": [{"field": "id", "type": "integer"}]}), ) - result = runner.invoke(cli_app, ["schema", data_path, "--rules", rules_path]) + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) assert result.exit_code == 0 # Plain text output should not dump JSON key name assert "schema_extras" not in result.output diff --git a/tests/unit/cli/commands/test_schema_command_multi_table.py b/tests/unit/cli/commands/test_schema_command_multi_table.py new file mode 100644 index 0000000..0c5ecd8 --- /dev/null +++ b/tests/unit/cli/commands/test_schema_command_multi_table.py @@ -0,0 +1,387 @@ +"""Unit tests for schema command multi-table functionality.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest +from click.testing import CliRunner + +from cli.app import cli_app + + +def _write_tmp_file(tmp_path: Path, name: str, content: str) -> str: + file_path = tmp_path / name + file_path.write_text(content, encoding="utf-8") + return str(file_path) + + +class TestSchemaCommandMultiTable: + def test_multi_table_rules_format_parsing(self, tmp_path: Path) -> None: + """Test that multi-table rules format is correctly parsed.""" + runner = CliRunner() + + # No rules file is generated for this test; it relies on the + # existing multi-table schema file under test_data/ + rules_path = "test_data/multi_table_schema.json" + # Use the new multi-table Excel file instead of CSV + 
data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload["status"] == "ok" + assert payload["rules_count"] == 17 + + # Check that fields have table information + fields = payload["fields"] + assert len(fields) > 0 + for field in fields: + assert "table" in field + assert field["table"] in ["users", "products", "orders"] + + def test_multi_table_excel_sheets_detection(self, tmp_path: Path) -> None: + """Test that Excel file sheets are correctly detected and used as tables.""" + runner = CliRunner() + + # Create a simple multi-table rules file + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True}, + ] + }, + "products": { + "rules": [ + {"field": "product_id", "type": "integer", "required": True}, + {"field": "product_name", "type": "string", "required": True}, + ] + }, + } + + rules_path = _write_tmp_file( + tmp_path, "multi_table_rules.json", json.dumps(multi_table_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload["status"] == "ok" + + # Check that both tables are processed + fields = payload["fields"] + user_fields = [f for f in fields if f.get("table") == "users"] + product_fields = [f for f in fields if f.get("table") == "products"] + + assert len(user_fields) > 0 + assert len(product_fields) > 0 + + def test_multi_table_with_table_level_options(self, tmp_path: Path) -> None: + """Test multi-table format with table-level options like strict_mode.""" + runner = CliRunner() + + multi_table_rules = { + "users": { + "rules": [{"field": "id", "type": 
"integer", "required": True}], + "strict_mode": True, + }, + "products": { + "rules": [ + {"field": "product_name", "type": "string", "required": True} + ], + "case_insensitive": True, + }, + } + + rules_path = _write_tmp_file( + tmp_path, "multi_table_options.json", json.dumps(multi_table_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + + # With strict_mode=True, extra columns will cause SCHEMA validation to fail + assert result.exit_code == 1 + payload = json.loads(result.output) + assert payload["status"] == "ok" # Overall status is ok + assert ( + payload["summary"]["failed_rules"] == 1 + ) # One rule failed due to strict mode + assert payload["summary"]["passed_rules"] == 3 # Three rules passed + + def test_multi_table_backward_compatibility(self, tmp_path: Path) -> None: + """Test that single-table format still works for backward compatibility.""" + runner = CliRunner() + + # Single-table format (legacy) + single_table_rules = { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True}, + ] + } + + rules_path = _write_tmp_file( + tmp_path, "single_table.json", json.dumps(single_table_rules) + ) + # Use only the users sheet for single table test + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + + assert result.exit_code == 0 + + # Handle mixed output (warning + JSON) + output_lines = result.output.strip().split("\n") + json_line = None + for line in output_lines: + if line.strip().startswith("{"): + json_line = line.strip() + break + + assert json_line is not None, f"No JSON found in output: {result.output}" + + payload = json.loads(json_line) + assert payload["status"] == "ok" + assert payload["rules_count"] == 3 + + def 
test_multi_table_validation_errors(self, tmp_path: Path) -> None: + """Test validation errors for invalid multi-table format.""" + runner = CliRunner() + + # Invalid: table schema is not an object + invalid_rules = {"users": "not_an_object"} + + rules_path = _write_tmp_file( + tmp_path, "invalid.json", json.dumps(invalid_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) + + assert result.exit_code >= 2 # Usage error + assert "must be an object" in result.output + + def test_multi_table_missing_rules_array(self, tmp_path: Path) -> None: + """Test validation error when table is missing rules array.""" + runner = CliRunner() + + invalid_rules = { + "users": { + "strict_mode": True + # Missing rules array + } + } + + rules_path = _write_tmp_file( + tmp_path, "missing_rules.json", json.dumps(invalid_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) + + assert result.exit_code >= 2 # Usage error + assert "must have a 'rules' array" in result.output + + def test_multi_table_invalid_table_level_options(self, tmp_path: Path) -> None: + """Test validation error for invalid table-level options.""" + runner = CliRunner() + + invalid_rules = { + "users": { + "rules": [{"field": "id", "type": "integer", "required": True}], + "strict_mode": "not_a_boolean", # Should be boolean + } + } + + rules_path = _write_tmp_file( + tmp_path, "invalid_options.json", json.dumps(invalid_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) + + assert result.exit_code >= 2 # Usage error + assert "must be a boolean" in result.output + + def test_multi_table_output_formatting(self, tmp_path: Path) -> None: + """Test that multi-table output is properly formatted and grouped.""" + runner 
= CliRunner() + + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True}, + ] + }, + "products": { + "rules": [{"field": "product_id", "type": "integer", "required": True}] + }, + } + + rules_path = _write_tmp_file( + tmp_path, "multi_table.json", json.dumps(multi_table_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + # Test table output format + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "table"], + ) + + assert result.exit_code == 0 + output = result.output + + # Should show table headers for multi-table + assert "📋 Table: users" in output + assert "📋 Table: products" in output + assert "📊 Multi-table Summary:" in output + + def test_multi_table_json_output_structure(self, tmp_path: Path) -> None: + """Test that JSON output includes table information for multi-table.""" + runner = CliRunner() + + multi_table_rules = { + "users": {"rules": [{"field": "id", "type": "integer", "required": True}]}, + "products": { + "rules": [{"field": "product_name", "type": "string", "required": True}] + }, + } + + rules_path = _write_tmp_file( + tmp_path, "multi_table.json", json.dumps(multi_table_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + + assert result.exit_code == 0 + payload = json.loads(result.output) + + # Check that fields have table information + fields = payload["fields"] + assert len(fields) >= 2 + + # Find fields for each table + user_fields = [f for f in fields if f.get("table") == "users"] + product_fields = [f for f in fields if f.get("table") == "products"] + + assert len(user_fields) > 0 + assert len(product_fields) > 0 + + # Check that each field has table info + for field in fields: + assert "table" in field + assert field["table"] in ["users", 
"products"] + + def test_multi_table_no_table_option_required(self, tmp_path: Path) -> None: + """Test that --table option is no longer required.""" + runner = CliRunner() + + multi_table_rules = { + "users": {"rules": [{"field": "id", "type": "integer", "required": True}]} + } + + rules_path = _write_tmp_file( + tmp_path, "multi_table.json", json.dumps(multi_table_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + # Should work without --table option + result = runner.invoke( + cli_app, ["schema", "--conn", data_path, "--rules", rules_path] + ) + + assert result.exit_code == 0 + # Command should execute successfully without --table option + + def test_multi_table_excel_specific_functionality(self, tmp_path: Path) -> None: + """Test specific Excel multi-table functionality.""" + runner = CliRunner() + + # Test with all three tables from the Excel file + multi_table_rules = { + "users": { + "rules": [ + {"field": "id", "type": "integer", "required": True}, + {"field": "name", "type": "string", "required": True}, + {"field": "email", "type": "string", "required": True}, + ] + }, + "products": { + "rules": [ + {"field": "product_id", "type": "integer", "required": True}, + {"field": "product_name", "type": "string", "required": True}, + {"field": "price", "type": "float", "min": 0.0}, + ] + }, + "orders": { + "rules": [ + {"field": "order_id", "type": "integer", "required": True}, + {"field": "user_id", "type": "integer", "required": True}, + {"field": "total_amount", "type": "float", "min": 0.0}, + ] + }, + } + + rules_path = _write_tmp_file( + tmp_path, "excel_multi_table.json", json.dumps(multi_table_rules) + ) + data_path = "test_data/multi_table_data.xlsx" + + result = runner.invoke( + cli_app, + ["schema", "--conn", data_path, "--rules", rules_path, "--output", "json"], + ) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload["status"] == "ok" + + # Check that all three tables are processed + fields = 
payload["fields"] + table_names = set(field.get("table") for field in fields) + assert "users" in table_names + assert "products" in table_names + assert "orders" in table_names + + def test_multi_table_help_text_updated(self, tmp_path: Path) -> None: + """Test that help text reflects multi-table support.""" + runner = CliRunner() + + result = runner.invoke(cli_app, ["schema", "--help"]) + assert result.exit_code == 0 + + # Should mention multi-table support + assert "multi-table" in result.output.lower() + # Should not mention --table option + assert "--table" not in result.output diff --git a/tests/unit/cli/core/test_cli_app.py b/tests/unit/cli/core/test_cli_app.py index 909ede2..1a63664 100644 --- a/tests/unit/cli/core/test_cli_app.py +++ b/tests/unit/cli/core/test_cli_app.py @@ -55,7 +55,7 @@ def test_cli_app_version_option(self: Any, runner: CliRunner) -> None: result = runner.invoke(cli_app, ["--version"]) assert result.exit_code == 0 - assert "vlite-cli" in result.output + assert "vlite" in result.output # assert "1.0.0" in result.output def test_cli_app_help_option(self: Any, runner: CliRunner) -> None: @@ -118,7 +118,7 @@ def test_rules_help_command_content(self: Any, runner: CliRunner) -> None: assert "not_null(id)" in result.output assert "unique(email)" in result.output assert "length(name,2,50)" in result.output - assert "mysql://user:pass@host/db.users" in result.output + assert "mysql://user:pass@host/db" in result.output def test_rules_help_json_schema_example(self: Any, runner: CliRunner) -> None: """Test rules-help includes valid JSON schema example""" @@ -146,9 +146,9 @@ def test_rules_help_usage_examples(self: Any, runner: CliRunner) -> None: # Check usage examples usage_examples = [ - "vlite-cli check users.csv --rule", - "vlite-cli check users.csv --rules validation.json", - "vlite-cli check mysql://user:pass@host/db.users", + "vlite check --conn users.csv --rule", + "vlite check --conn users.csv --rules validation.json", + "vlite check --conn 
mysql://user:pass@host/db", ] for example in usage_examples: @@ -269,7 +269,16 @@ def test_end_to_end_check_command_integration( ] result = runner.invoke( - cli_app, ["check", temp_file, "--rule", "not_null(id)"] + cli_app, + [ + "check", + "--conn", + temp_file, + "--table", + "users", + "--rule", + "not_null(id)", + ], ) # Should execute without critical errors @@ -322,10 +331,14 @@ def test_extremely_long_command_line(self: Any, runner: CliRunner) -> None: """Test handling of extremely long command lines""" long_rule = "not_null(" + "a" * 1000 + ")" - result = runner.invoke(cli_app, ["check", "test.csv", "--rule", long_rule]) + result = runner.invoke( + cli_app, + ["check", "--conn", "test.csv", "--table", "users", "--rule", long_rule], + ) # Should handle gracefully (either succeed or fail with proper error) - assert result.exit_code in [20, 21, 22] + # Exit code 2 is Click's error exit code for missing required options + assert result.exit_code in [2, 20, 21, 22] assert ( "Error:" in result.output or "Usage:" in result.output @@ -398,7 +411,7 @@ def test_cli_app_contract_compliance(self: Any, runner: CliRunner) -> None: # Should have proper Click structure assert "Usage:" in result.output - assert "vlite-cli" in result.output + assert "vlite" in result.output assert "Commands:" in result.output def test_error_exit_codes_consistency(self: Any, runner: CliRunner) -> None: diff --git a/tests/unit/cli/core/test_cli_config_integration.py b/tests/unit/cli/core/test_cli_config_integration.py index df76a27..839434b 100644 --- a/tests/unit/cli/core/test_cli_config_integration.py +++ b/tests/unit/cli/core/test_cli_config_integration.py @@ -130,7 +130,14 @@ def test_check_command_uses_config(self) -> None: # Invoke the command result = runner.invoke( check_command, - ["test.csv", "--rule", "not_null(column1)"], + [ + "--conn", + "test.csv", + "--table", + "users", + "--rule", + "not_null(column1)", + ], ) # Verify configs were loaded