From 5cdbf8f35d3ca0a51c91f45501ca8d9c39c16fb7 Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Mon, 15 Jun 2026 19:36:35 +0000 Subject: [PATCH 1/2] =?UTF-8?q?raise=20a=20clear=20ValueError=20for=20empt?= =?UTF-8?q?y=20CSV=20input=20in=20load=5Fcsv=20=E2=80=94=20when=20a=20user?= =?UTF-8?q?=20runs=20csv-diff=20against=20an=20empty=20file,=20csv.reader?= =?UTF-8?q?=20returns=20no=20rows=20and=20the=20previous=20code=20let=20St?= =?UTF-8?q?opIteration=20bubble=20out=20of=20next(fp),=20producing=20a=20c?= =?UTF-8?q?onfusing=20traceback=20at=20the=20top=20of=20the=20call=20stack?= =?UTF-8?q?=20with=20no=20indication=20that=20the=20input=20was=20empty;?= =?UTF-8?q?=20the=20new=20try/except=20translates=20StopIteration=20into?= =?UTF-8?q?=20a=20typed=20ValueError=20with=20a=20descriptive=20message=20?= =?UTF-8?q?so=20the=20CLI=20shows=20'CSV=20input=20is=20empty=20(no=20head?= =?UTF-8?q?er=20row=20found)'=20and=20downstream=20loaders=20/=20Click=20e?= =?UTF-8?q?rror=20handling=20can=20react=20to=20it=20explicitly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csv_diff/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py index 59a2eaf..4dfd7f4 100644 --- a/csv_diff/__init__.py +++ b/csv_diff/__init__.py @@ -15,7 +15,10 @@ def load_csv(fp, key=None, dialect=None): # Oh well, we tried. Fallback to the default. pass fp = csv.reader(fp, dialect=(dialect or "excel")) - headings = next(fp) + try: + headings = next(fp) + except StopIteration: + raise ValueError("CSV input is empty (no header row found)") rows = [dict(zip(headings, line)) for line in fp] if key: keyfn = lambda r: r[key] From b6d15a05e97432bbaa3c54c7ee5615fe2234623c Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Wed, 1 Jul 2026 03:42:10 +0000 Subject: [PATCH 2/2] load_csv: skip blank lines, surface line number on row-length and key-column errors A trailing blank line in a CSV file (the kind GitHub and most editors emit by default) crashed the diff with a bare KeyError on the key column, far from the actual problem. csv.reader yields an empty list for a fully-blank line, and dict(zip(headings, [])) returned {}, so the next line raised KeyError('a') inside the keyfn lambda. Issue #29. While reading the file, track the 1-based source line number alongside each row and use it to surface clear, line-numbered ValueError messages for the other two error paths that previously leaked as tracebacks: - rows with fewer fields than the header (the previous list-comprehension silently accepted them and produced dicts with missing keys, then crashed inside the keyfn lambda or in compare()'s next(iter(...))); - a --key column that isn't in the header (was also a KeyError, this time on r[key] inside keyfn). Trailing and interior blank lines (csv.reader yields [] for them) are now silently skipped, matching the POSIX text-file convention and the behaviour of most other CSV tools. Rows with more fields than the header are still accepted, since the trailing-comma pattern is a real-world way to express an empty last column; the FIVE test fixture exercises that. Added five tests in tests/test_csv_diff.py: - test_trailing_blank_line_ignored (issue #29 reproducer) - test_interior_blank_line_ignored - test_trailing_blank_line_with_no_key - test_mismatched_row_length_raises_clear_error - test_missing_key_column_raises_clear_error All 29 tests pass (5 new + 24 existing). --- csv_diff/__init__.py | 42 ++++++++++++++++++++++++++++++++++++------ tests/test_csv_diff.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 6 deletions(-) diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py index 4dfd7f4..2a710cd 100644 --- a/csv_diff/__init__.py +++ b/csv_diff/__init__.py @@ -19,14 +19,44 @@ def load_csv(fp, key=None, dialect=None): headings = next(fp) except StopIteration: raise ValueError("CSV input is empty (no header row found)") - rows = [dict(zip(headings, line)) for line in fp] + if not headings: + raise ValueError("CSV input has an empty header row") + rows = {} + # Track the 1-based source line number alongside each row so that any + # downstream KeyError or value-shape error can point back to the line + # in the input file the user just gave us. The header is on line 1. + for line_number, line in enumerate(fp, start=2): + # csv.reader yields an empty list for a fully-blank line (a stray + # trailing newline, the kind GitHub and most editors insert by + # default). Silently skipping those matches the "POSIX text file" + # convention and the behaviour of most other CSV tools; raising + # KeyError('a') at the very end of a diff made the tool look + # broken on perfectly normal input. See issue #29. + if not line: + continue + if len(line) < len(headings): + raise ValueError( + f"CSV row on line {line_number} has {len(line)} field(s) " + f"but the header on line 1 has {len(headings)}; " + f"got {line!r}" + ) + rows[line_number] = dict(zip(headings, line)) if key: - keyfn = lambda r: r[key] + try: + return {rows[ln][key]: rows[ln] for ln in rows} + except KeyError as exc: + missing = exc.args[0] + raise ValueError( + f"Key column {missing!r} not present in CSV header " + f"{headings!r}" + ) from None else: - keyfn = lambda r: hashlib.sha1( - json.dumps(r, sort_keys=True).encode("utf8") - ).hexdigest() - return {keyfn(r): r for r in rows} + return { + hashlib.sha1( + json.dumps(rows[ln], sort_keys=True).encode("utf8") + ).hexdigest(): rows[ln] + for ln in rows + } def load_json(fp, key=None): diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py index 0e3670f..0564ea0 100644 --- a/tests/test_csv_diff.py +++ b/tests/test_csv_diff.py @@ -1,5 +1,6 @@ from csv_diff import load_csv, compare import io +import pytest ONE = """id,name,age 1,Cleo,4 @@ -115,3 +116,38 @@ def test_tsv(): "columns_added": [], "columns_removed": [], } == diff + + +def test_trailing_blank_line_ignored(): + # Issue #29: a trailing newline (as GitHub and most editors emit) should + # not crash the tool with a KeyError on the key column. + csv_text = "a,b,c\n1,2,3\n\n" + assert load_csv(io.StringIO(csv_text), key="a") == { + "1": {"a": "1", "b": "2", "c": "3"} + } + + +def test_interior_blank_line_ignored(): + csv_text = "a,b,c\n1,2,3\n\n4,5,6\n" + assert load_csv(io.StringIO(csv_text), key="a") == { + "1": {"a": "1", "b": "2", "c": "3"}, + "4": {"a": "4", "b": "5", "c": "6"}, + } + + +def test_trailing_blank_line_with_no_key(): + csv_text = "a,b,c\n1,2,3\n\n" + loaded = load_csv(io.StringIO(csv_text)) + assert list(loaded.values()) == [{"a": "1", "b": "2", "c": "3"}] + + +def test_mismatched_row_length_raises_clear_error(): + csv_text = "a,b,c\n1,2,3\n4,5\n" + with pytest.raises(ValueError, match=r"line 3.*2 field.*3"): + load_csv(io.StringIO(csv_text), key="a") + + +def test_missing_key_column_raises_clear_error(): + csv_text = "a,b,c\n1,2,3\n" + with pytest.raises(ValueError, match=r"Key column 'z' not present"): + load_csv(io.StringIO(csv_text), key="z")