Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- None
- feat(cli): refactor check command interface from positional arguments to `--conn` and `--table` options
- feat(cli): add comprehensive test coverage for new CLI interface functionality
- feat(cli): support explicit table name specification independent of database URL

### Changed
- None
- **BREAKING CHANGE**: CLI interface changed from `vlite-cli check <source>` to `vlite-cli check --conn <connection> --table <table_name>`
- refactor(cli): update SourceParser to accept optional table_name parameter
- refactor(cli): modify check command to pass table_name to SourceParser.parse_source()
- refactor(tests): update all existing CLI tests to use new interface format
- refactor(tests): add new test cases specifically for table name parameter validation

### Fixed
- None
- fix(cli): resolve issue where `--table` parameter was not correctly passed to backend
- fix(cli): ensure table name from `--table` option takes precedence over table name in database URL
- fix(tests): update regression tests to use new CLI interface format
- fix(tests): resolve test failures caused by interface changes

### Removed
- None
- **BREAKING CHANGE**: remove backward compatibility for old positional argument interface
- remove(cli): eliminate support for `<source>` positional argument in check command

## [0.4.0] - 2025-01-27

Expand Down
35 changes: 23 additions & 12 deletions cli/commands/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,13 @@


@click.command("check")
@click.argument("source", required=True)
@click.option(
"--conn",
"connection_string",
required=True,
help="Database connection string or file path",
)
@click.option("--table", "table_name", required=True, help="Table name to validate")
@click.option(
"--rule",
"rules",
Expand All @@ -59,7 +65,8 @@
help="Show detailed information and failure samples",
)
def check_command(
source: str,
connection_string: str,
table_name: str,
rules: Tuple[str, ...],
rules_file: Optional[str],
quiet: bool,
Expand All @@ -68,18 +75,22 @@ def check_command(
"""
Check data quality for the given source.

NEW FORMAT:
vlite-cli check --conn <connection> --table <table_name> [options]

The --conn value can be:
- File path: users.csv, data.xlsx, records.json
- Database URL: mysql://user:pass@host/db.table
- Database URL: mysql://user:pass@host/db
- SQLite file: sqlite:///path/to/file.db

Examples:
vlite-cli check users.csv --rule "not_null(id)"
vlite-cli check mysql://user:pass@host/db.users --rules validation.json
vlite-cli check --conn users.csv --table users --rule "not_null(id)"
vlite-cli check --conn mysql://user:pass@host/db \
--table users --rules validation.json
"""
# Record start time
start_time = now()
logger.info(f"Starting data quality check for: {source}")
logger.info(f"Starting data quality check for: {connection_string}")

# Create exception handler
exception_handler = CliExceptionHandler(verbose=verbose)
Expand Down Expand Up @@ -111,23 +122,23 @@ def check_command(
)

# Parse source
safe_echo(f"🔍 Analyzing source: {source}")
safe_echo(f"🔍 Analyzing source: {connection_string}")

# Proactively verify that a provided file is not empty – this avoids
# kicking off heavy validation logic only to discover the file is
# useless. The modern test-suite expects a graceful early-exit with a
# clear error message in such a scenario.
potential_path = Path(source)
potential_path = Path(connection_string)
if potential_path.exists() and potential_path.is_file():
if potential_path.stat().st_size == 0:
raise click.ClickException(
f"Error: Source file '{source}' is empty "
f"Error: Source file '{connection_string}' is empty "
"– nothing to validate."
)

# Parse source config - this may raise Schema creation error
# (OperationError)
source_config = source_parser.parse_source(source)
source_config = source_parser.parse_source(connection_string, table_name)

# Parse rules - this may raise Schema creation error
# (RuleExecutionError)
Expand Down Expand Up @@ -205,7 +216,7 @@ def check_command(
output_formatter.display_results(
results=results_dicts,
rules=rule_configs, # Pass as objects, not dicts
source=source,
source=connection_string,
execution_time=execution_time,
total_rules=len(rule_configs),
)
Expand Down Expand Up @@ -248,7 +259,7 @@ def check_command(
output_formatter.display_results(
results=results_dicts,
rules=rule_configs, # Pass as objects, not dicts
source=source,
source=connection_string,
execution_time=execution_time,
total_rules=len(rule_configs),
)
Expand Down
99 changes: 71 additions & 28 deletions cli/commands/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,11 +315,10 @@ def _build_prioritized_atomic_status(
# Build per-column guard from SCHEMA details
column_guard: Dict[str, str] = {} # column -> NONE|FIELD_MISSING|TYPE_MISMATCH
if schema_result:
details = (
schema_result.get("execution_plan", {})
.get("schema_details", {})
.get("field_results", [])
)
# Safely access nested dictionaries, checking for None at each level.
execution_plan = schema_result.get("execution_plan") or {}
schema_details = execution_plan.get("schema_details") or {}
details = schema_details.get("field_results") or []
for item in details:
col = str(item.get("column"))
code = str(item.get("failure_code", "NONE"))
Expand Down Expand Up @@ -417,15 +416,22 @@ def _create_validator(
core_config=core_config,
cli_config=cli_config,
)
except TypeError:
return DataValidator() # type: ignore[call-arg]
except Exception as e:
logger.error(f"Failed to create DataValidator: {str(e)}")
raise click.UsageError(f"Failed to create validator: {str(e)}")


def _run_validation(validator: Any) -> Tuple[List[Any], float]:
import asyncio

start = _now()
results = asyncio.run(validator.validate())
logger.debug("Starting validation")
try:
results = asyncio.run(validator.validate())
logger.debug(f"Validation returned {len(results)} results")
except Exception as e:
logger.error(f"Validation failed: {str(e)}")
raise
exec_seconds = (_now() - start).total_seconds()
return results, exec_seconds

Expand All @@ -440,6 +446,8 @@ def _extract_schema_result_dict(
if not schema_rule:
return None
for r in results:
if r is None:
continue
rid = ""
if hasattr(r, "rule_id"):
try:
Expand Down Expand Up @@ -618,11 +626,11 @@ def _ensure_check(entry: Dict[str, Any], name: str) -> Dict[str, Any]:
if schema_result_dict:
try:
extras = (
(schema_result_dict.get("execution_plan") or {}).get(
"schema_details", {}
)
or {}
).get("extras", [])
(schema_result_dict or {})
.get("execution_plan", {})
.get("schema_details", {})
.get("extras", [])
)
if isinstance(extras, list):
schema_extras = [str(x) for x in extras]
except Exception:
Expand Down Expand Up @@ -720,11 +728,9 @@ def _calc_failed(res: Dict[str, Any]) -> int:

column_guard: Dict[str, str] = {}
if schema_result_dict:
details = (
schema_result_dict.get("execution_plan", {})
.get("schema_details", {})
.get("field_results", [])
)
execution_plan = schema_result_dict.get("execution_plan") or {}
schema_details = execution_plan.get("schema_details") or {}
details = schema_details.get("field_results") or []
for item in details:
col = str(item.get("column"))
column_guard[col] = str(item.get("failure_code", "NONE"))
Expand Down Expand Up @@ -832,7 +838,13 @@ def _calc_failed(res: Dict[str, Any]) -> int:


@click.command("schema")
@click.argument("source", required=True)
@click.option(
"--conn",
"connection_string",
required=True,
help="Database connection string or file path",
)
@click.option("--table", "table_name", required=True, help="Table name to validate")
@click.option(
"--rules",
"rules_file",
Expand Down Expand Up @@ -862,7 +874,8 @@ def _calc_failed(res: Dict[str, Any]) -> int:
)
@click.option("--verbose", is_flag=True, default=False, help="Enable verbose output")
def schema_command(
source: str,
connection_string: str,
table_name: str,
rules_file: str,
output: str,
fail_on_error: bool,
Expand All @@ -871,18 +884,30 @@ def schema_command(
) -> None:
"""Schema validation command with minimal rules file validation.

Decomposition and execution are added in subsequent tasks.
NEW FORMAT:
vlite-cli schema --conn <connection> --table <table_name> \
--rules <rules_file> [options]

The --conn value can be:
- File path: users.csv, data.xlsx, records.json
- Database URL: mysql://user:pass@host/db
- SQLite file: sqlite:///path/to/file.db

Examples:
vlite-cli schema --conn users.csv --table users --rules schema.json
vlite-cli schema --conn mysql://user:pass@host/db --table users \
--rules schema.json
"""

from cli.core.config import get_cli_config
from core.config import get_core_config

# start_time = now()
try:
_maybe_echo_analyzing(source, output)
_guard_empty_source_file(source)
_maybe_echo_analyzing(connection_string, output)
_guard_empty_source_file(connection_string)

source_config = SourceParser().parse_source(source)
source_config = SourceParser().parse_source(connection_string)

rules_payload = _read_rules_payload(rules_file)

Expand All @@ -892,10 +917,28 @@ def schema_command(
# Decompose into atomic rules per design
atomic_rules = _decompose_to_atomic_rules(rules_payload)

# Fast-path: no rules → emit minimal payload and exit cleanly
# FIX: Manually populate the target table and database from CLI args
# The source_config object is a class instance, not a dict.
# Use attribute access.
source_db = source_config.db_name
if not source_db:
source_db = "unknown"

for rule in atomic_rules:
if rule.target and rule.target.entities:
rule.target.entities[0].database = source_db
rule.target.entities[0].table = table_name

# Earlier dict-style variant of the loop above, kept for reference only
# (source_config is a class instance, so .get() does not apply):
# source_db = source_config.get('database')
# for rule in atomic_rules:
# if rule.target and rule.target.entities:
# rule.target.entities[0].database = source_db
# rule.target.entities[0].table = table_name
# Fast-path: no rules -> emit minimal payload and exit cleanly
if len(atomic_rules) == 0:
_early_exit_when_no_rules(
source=source,
source=connection_string,
rules_file=rules_file,
output=output,
fail_on_error=fail_on_error,
Expand Down Expand Up @@ -923,7 +966,7 @@ def schema_command(
# Apply skip map to JSON output only; table mode stays concise by design
if output.lower() == "json":
_emit_json_output(
source=source,
source=connection_string,
rules_file=rules_file,
atomic_rules=atomic_rules,
results=results,
Expand All @@ -933,7 +976,7 @@ def schema_command(
)
else:
_emit_table_output(
source=source,
source=connection_string,
atomic_rules=atomic_rules,
results=results,
skip_map=skip_map,
Expand Down
20 changes: 16 additions & 4 deletions cli/core/source_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,15 @@ def __init__(self) -> None:
".jsonl": ConnectionType.JSON,
}

def parse_source(self, source: str) -> ConnectionSchema:
def parse_source(
self, source: str, table_name: Optional[str] = None
) -> ConnectionSchema:
"""
Parse source string into ConnectionSchema.

Args:
source: Source string (file path or database URL)
table_name: Optional table name (overrides table from URL if provided)

Returns:
ConnectionSchema: Parsed connection configuration
Expand All @@ -75,7 +78,7 @@ def parse_source(self, source: str) -> ConnectionSchema:
raise ValidationError("Unrecognized source format: Empty source")

if self._is_database_url(source):
return self._parse_database_url(source)
return self._parse_database_url(source, table_name)
elif source.startswith("file://"):
# Handle file:// protocol
file_path = source[7:] # Remove file:// prefix
Expand Down Expand Up @@ -118,14 +121,20 @@ def _is_file_path(self, source: str) -> bool:

return False

def _parse_database_url(self, url: str) -> ConnectionSchema:
def _parse_database_url(
self, url: str, table_name: Optional[str] = None
) -> ConnectionSchema:
"""
Parse database URL into connection configuration.

Supports formats:
- mysql://user:pass@host:port/database.table
- postgres://user:pass@host:port/database.table
- sqlite:///path/to/database.db.table

Args:
url: Database connection URL
table_name: Optional table name (overrides table from URL if provided)
"""
self.logger.debug(f"Parsing database URL: {url}")

Expand All @@ -136,7 +145,10 @@ def _parse_database_url(self, url: str) -> ConnectionSchema:
parsed = urllib.parse.urlparse(url)

# Extract database and table from path
database, table = self._extract_db_table_from_path(parsed.path)
database, table_from_url = self._extract_db_table_from_path(parsed.path)

# Use provided table_name if available, otherwise use table from URL
table = table_name if table_name is not None else table_from_url

# Handle SQLite special case
if conn_type == ConnectionType.SQLITE:
Expand Down
Loading