Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Changelog

## [0.1.1] — 2026-05-18

### Fixed
- CSV reader now respects a custom delimiter (`--csv-delimiter` / `csv_delimiter`).
Previously the delimiter was applied only when writing CSV; reads always used
a comma, so pipe- or tab-delimited input files were parsed incorrectly.
- `get_reader()` now accepts `**kwargs`, enabling format-specific reader options.
- `convert()` passes delimiter to the reader when input format is CSV.

### Added
- CLI tests for `validate --format` and `validate --format --schema` combinations.
- Roundtrip test for pipe-delimited CSV (CSV → JSON → CSV), with the delimiter applied on both read and write.

### Changed
- Version bumped to 0.1.1 (bugfix release).

## [0.1.0] — Initial Release

### Added
- CLI commands: `convert`, `batch`, `schema`, `validate`, `formats`.
- Format support: CSV, JSON, JSONL, YAML, Parquet, Avro (Protobuf optional).
- Streaming row-by-row conversion for CSV, JSONL, and Avro.
- Schema inference and validation with strict mode.
- Batch directory conversion with recursive glob support.
- Rich terminal output with progress feedback.
- CI/CD integration with exit-code-based validation.
21 changes: 16 additions & 5 deletions src/datamorph/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ def supported_formats() -> list[str]:
return sorted(set(_READERS.keys()) | set(_WRITERS.keys()))


def get_reader(name: str, **kwargs: Any) -> "FormatReader":
    """Instantiate the reader registered for the format *name*.

    Args:
        name: Format key looked up in the ``_READERS`` registry.
        **kwargs: Format-specific options forwarded unchanged to the reader
            class constructor (``convert()`` passes ``delimiter`` this way
            when the input format is ``"csv"``).

    Returns:
        A new instance of the registered reader class.

    Raises:
        ValueError: If no reader is registered under *name*; the message
            lists the registered reader formats.
    """
    cls = _READERS.get(name)
    if not cls:
        raise ValueError(f"Unsupported format for reading: {name}. Supported: {', '.join(_READERS.keys())}")
    # Forward the options so readers with constructor parameters can be
    # configured by the caller; readers without options receive no kwargs.
    return cls(**kwargs)


def get_writer(name: str, **kwargs: Any) -> "FormatWriter":
Expand Down Expand Up @@ -177,9 +177,13 @@ def _widen_type(a: str, b: str) -> str:


class CsvReader(FormatReader):
def __init__(self, delimiter: str = ",") -> None:
super().__init__()
self.delimiter = delimiter

def read_stream(self, path: str | Path) -> RowStream:
with open(path, "r", newline="", encoding="utf-8-sig") as f:
reader = csv.DictReader(f)
reader = csv.DictReader(f, delimiter=self.delimiter)
for row in reader:
yield {k.strip(): v.strip() if v else None for k, v in row.items()}

Expand Down Expand Up @@ -428,8 +432,15 @@ def convert(
result.input_format = input_format
result.output_format = output_format

# Get reader and writer
reader = get_reader(input_format)
# Normalize csv_delimiter to delimiter for consistency
if "csv_delimiter" in writer_kwargs:
writer_kwargs.setdefault("delimiter", writer_kwargs.pop("csv_delimiter"))

# Get reader and writer (pass writer_kwargs that apply to reader, like csv delimiter)
reader_kwargs: dict[str, Any] = {}
if input_format == "csv" and "delimiter" in writer_kwargs:
reader_kwargs["delimiter"] = writer_kwargs["delimiter"]
reader = get_reader(input_format, **reader_kwargs)
writer = get_writer(output_format, **writer_kwargs)

# If writing to parquet/avro, we may need field order from schema
Expand Down
23 changes: 21 additions & 2 deletions tests/test_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,28 @@ def test_csv_with_custom_delimiter(self, tmp_path):
path.write_text("name|age\nAlice|30\nBob|25\n")
output = tmp_path / "out.json"
result = convert(path, output, csv_delimiter="|")
# csv_delimiter is a special parameter: it is passed to the writer, but
# convert() must also apply it when reading CSV input.
assert not result.errors
assert result.rows_written == 2
data = json.loads(output.read_text())
assert data[0]["name"] == "Alice"
assert data[1]["age"] == "25"

def test_csv_custom_delimiter_roundtrip(self, tmp_path):
    """Round-trip pipe-delimited CSV through JSON and back to CSV."""
    source = tmp_path / "data.csv"
    source.write_text("name|age\nAlice|30\nBob|25\n")

    # Leg 1: pipe-delimited CSV -> JSON.
    intermediate = tmp_path / "intermediate.json"
    to_json = convert(source, intermediate, csv_delimiter="|")
    assert not to_json.errors
    assert to_json.rows_written == 2

    # Leg 2: JSON -> CSV, written with the same pipe delimiter.
    final = tmp_path / "roundtrip.csv"
    to_csv = convert(intermediate, final, csv_delimiter="|")
    assert not to_csv.errors
    assert to_csv.rows_written == 2

    written = final.read_text()
    assert "Alice" in written
    # The header line must come back pipe-delimited, not comma-delimited.
    assert "name|age" in written


# ── JSON ──────────────────────────────────────────────────────────────
Expand Down
30 changes: 30 additions & 0 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,33 @@ def test_validate_help(self, runner):
result = runner.invoke(cli, ["validate", "--help"])
assert result.exit_code == 0
assert "schema" in result.output.lower()

def test_validate_with_format_override(self, runner, tmp_path):
    """Run `validate` with an explicit --format on a file whose extension is unknown."""
    data_path = tmp_path / "data.unknown"
    data_path.write_text("name,age\nAlice,30\nBob,25\n")

    # The ".unknown" suffix gives no format hint, so --format supplies it.
    outcome = runner.invoke(cli, ["validate", str(data_path), "--format", "csv"])

    assert outcome.exit_code == 0
    for expected in ("VALID", "2 rows checked"):
        assert expected in outcome.output

def test_validate_with_format_and_schema(self, runner, tmp_path):
    """Run `validate` with both --format and --schema on an unknown-extension file."""
    import json

    # Schema declaring both columns as strings.
    schema_fields = [
        {"name": "name", "type": "string"},
        {"name": "age", "type": "string"},
    ]
    schema_path = tmp_path / "schema.json"
    schema_path.write_text(json.dumps(schema_fields))

    data_path = tmp_path / "data.unknown"
    data_path.write_text("name,age\nAlice,30\nBob,25\n")

    argv = [
        "validate", str(data_path),
        "--format", "csv",
        "--schema", str(schema_path),
    ]
    outcome = runner.invoke(cli, argv)

    assert outcome.exit_code == 0
    assert "VALID" in outcome.output