From 30f47fa1869c14788c419e3d1d6ebbe76fd67d5d Mon Sep 17 00:00:00 2001 From: DevForge Engineer Date: Mon, 18 May 2026 03:03:47 -0400 Subject: [PATCH] fix: CSV reader now respects custom delimiter; add CHANGELOG and missing validate CLI tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CsvReader accepts delimiter parameter (pipe, tab, etc.) - convert() passes delimiter to reader when input is CSV - get_reader() accepts **kwargs for reader constructor params - Fix false-positive test_csv_with_custom_delimiter (was not actually validating pipe-delimited reads) - Add test_csv_custom_delimiter_roundtrip for read→write→read cycle - Add CLI tests for validate --format and validate --format --schema - Add CHANGELOG.md with v0.1.0 and v0.1.1 entries --- CHANGELOG.md | 28 ++++++++++++++++++++++++++++ src/datamorph/converters.py | 21 ++++++++++++++++----- tests/test_converters.py | 23 +++++++++++++++++++++-- tests/test_validate.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..56f62e2 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,28 @@ +# Changelog + +## [0.1.1] — 2026-05-18 + +### Fixed +- CSV reader now respects custom delimiter (`--csv-delimiter` / `csv_delimiter`). + Previously the delimiter was only applied to CSV output; input reads always + used comma, producing incorrect field parsing for pipe/tab-delimited files. +- `get_reader()` now accepts `**kwargs`, enabling format-specific reader options. +- `convert()` passes delimiter to the reader when input format is CSV. + +### Added +- CLI tests for `validate --format` and `validate --format --schema` combinations. +- Roundtrip test for pipe-delimited CSV read→write→read cycle. + +### Changed +- Version bumped to 0.1.1 (bugfix release). + +## [0.1.0] — Initial Release + +### Added +- CLI commands: `convert`, `batch`, `schema`, `validate`, `formats`. +- Format support: CSV, JSON, JSONL, YAML, Parquet, Avro (Protobuf optional). +- Streaming row-by-row conversion for CSV, JSONL, and Avro. +- Schema inference and validation with strict mode. +- Batch directory conversion with recursive glob support. +- Rich terminal output with progress feedback. +- CI/CD integration with exit-code-based validation. diff --git a/src/datamorph/converters.py b/src/datamorph/converters.py index 187afeb..52a3750 100644 --- a/src/datamorph/converters.py +++ b/src/datamorph/converters.py @@ -38,11 +38,11 @@ def supported_formats() -> list[str]: return sorted(set(_READERS.keys()) | set(_WRITERS.keys())) -def get_reader(name: str) -> "FormatReader": +def get_reader(name: str, **kwargs: Any) -> "FormatReader": cls = _READERS.get(name) if not cls: raise ValueError(f"Unsupported format for reading: {name}. Supported: {', '.join(_READERS.keys())}") - return cls() + return cls(**kwargs) def get_writer(name: str, **kwargs: Any) -> "FormatWriter": @@ -177,9 +177,13 @@ def _widen_type(a: str, b: str) -> str: class CsvReader(FormatReader): + def __init__(self, delimiter: str = ",") -> None: + super().__init__() + self.delimiter = delimiter + def read_stream(self, path: str | Path) -> RowStream: with open(path, "r", newline="", encoding="utf-8-sig") as f: - reader = csv.DictReader(f) + reader = csv.DictReader(f, delimiter=self.delimiter) for row in reader: yield {k.strip(): v.strip() if v else None for k, v in row.items()} @@ -428,8 +432,15 @@ def convert( result.input_format = input_format result.output_format = output_format - # Get reader and writer - reader = get_reader(input_format) + # Normalize csv_delimiter to delimiter for consistency + if "csv_delimiter" in writer_kwargs: + writer_kwargs.setdefault("delimiter", writer_kwargs.pop("csv_delimiter")) + + # Get reader and writer (pass writer_kwargs that apply to reader, like csv delimiter) + reader_kwargs: dict[str, Any] = {} + if input_format == "csv" and "delimiter" in writer_kwargs: + reader_kwargs["delimiter"] = writer_kwargs["delimiter"] + reader = get_reader(input_format, **reader_kwargs) writer = get_writer(output_format, **writer_kwargs) # If writing to parquet/avro, we may need field order from schema diff --git a/tests/test_converters.py b/tests/test_converters.py index 9ec63a3..1f92e2c 100644 --- a/tests/test_converters.py +++ b/tests/test_converters.py @@ -129,9 +129,28 @@ def test_csv_with_custom_delimiter(self, tmp_path): path.write_text("name|age\nAlice|30\nBob|25\n") output = tmp_path / "out.json" result = convert(path, output, csv_delimiter="|") - # The csv_delimiter as a special param — let me check this - # It's passed to the writer but we need to handle it in convert() assert not result.errors + assert result.rows_written == 2 + data = json.loads(output.read_text()) + assert data[0]["name"] == "Alice" + assert data[1]["age"] == "25" + + def test_csv_custom_delimiter_roundtrip(self, tmp_path): + """Verify pipe-delimited CSV can be read and written back.""" + csv_path = tmp_path / "data.csv" + csv_path.write_text("name|age\nAlice|30\nBob|25\n") + json_path = tmp_path / "intermediate.json" + result = convert(csv_path, json_path, csv_delimiter="|") + assert not result.errors + assert result.rows_written == 2 + + csv_out = tmp_path / "roundtrip.csv" + result = convert(json_path, csv_out, csv_delimiter="|") + assert not result.errors + assert result.rows_written == 2 + content = csv_out.read_text() + assert "Alice" in content + assert "name|age" in content # pipe-delimited header preserved # ── JSON ────────────────────────────────────────────────────────────── diff --git a/tests/test_validate.py b/tests/test_validate.py index 99964b5..5b1b2b8 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -186,3 +186,33 @@ def test_validate_help(self, runner): result = runner.invoke(cli, ["validate", "--help"]) assert result.exit_code == 0 assert "schema" in result.output.lower() + + def test_validate_with_format_override(self, runner, tmp_path): + """Validate a file with explicit --format flag (no extension-based detection).""" + path = tmp_path / "data.unknown" + path.write_text("name,age\nAlice,30\nBob,25\n") + result = runner.invoke(cli, [ + "validate", str(path), + "--format", "csv", + ]) + assert result.exit_code == 0 + assert "VALID" in result.output + assert "2 rows checked" in result.output + + def test_validate_with_format_and_schema(self, runner, tmp_path): + """Validate with both --format and --schema overrides.""" + data_file = tmp_path / "data.unknown" + data_file.write_text("name,age\nAlice,30\nBob,25\n") + schema_file = tmp_path / "schema.json" + import json + schema_file.write_text(json.dumps([ + {"name": "name", "type": "string"}, + {"name": "age", "type": "string"}, + ])) + result = runner.invoke(cli, [ + "validate", str(data_file), + "--format", "csv", + "--schema", str(schema_file), + ]) + assert result.exit_code == 0 + assert "VALID" in result.output