diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py index 59a2eaf..2a710cd 100644 --- a/csv_diff/__init__.py +++ b/csv_diff/__init__.py @@ -15,15 +15,48 @@ def load_csv(fp, key=None, dialect=None): # Oh well, we tried. Fallback to the default. pass fp = csv.reader(fp, dialect=(dialect or "excel")) - headings = next(fp) - rows = [dict(zip(headings, line)) for line in fp] + try: + headings = next(fp) + except StopIteration: + raise ValueError("CSV input is empty (no header row found)") + if not headings: + raise ValueError("CSV input has an empty header row") + rows = {} + # Track the 1-based source line number alongside each row so that any + # downstream KeyError or value-shape error can point back to the line + # in the input file the user just gave us. The header is on line 1. + for line_number, line in enumerate(fp, start=2): + # csv.reader yields an empty list for a fully-blank line (typically a + # stray extra newline after the last record; the single final newline + # most editors add does not produce one). Silently skipping those matches + # the "POSIX text file" convention and the behaviour of most other CSV + # tools; raising KeyError('a') at the very end of a diff made the tool + # look broken on perfectly normal input. See issue #29. 
+ if not line: + continue + if len(line) < len(headings): + raise ValueError( + f"CSV row on line {line_number} has {len(line)} field(s) " + f"but the header on line 1 has {len(headings)}; " + f"got {line!r}" + ) + rows[line_number] = dict(zip(headings, line)) if key: - keyfn = lambda r: r[key] + try: + return {rows[ln][key]: rows[ln] for ln in rows} + except KeyError as exc: + missing = exc.args[0] + raise ValueError( + f"Key column {missing!r} not present in CSV header " + f"{headings!r}" + ) from None else: - keyfn = lambda r: hashlib.sha1( - json.dumps(r, sort_keys=True).encode("utf8") - ).hexdigest() - return {keyfn(r): r for r in rows} + return { + hashlib.sha1( + json.dumps(rows[ln], sort_keys=True).encode("utf8") + ).hexdigest(): rows[ln] + for ln in rows + } def load_json(fp, key=None): diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py index 0e3670f..0564ea0 100644 --- a/tests/test_csv_diff.py +++ b/tests/test_csv_diff.py @@ -1,5 +1,6 @@ from csv_diff import load_csv, compare import io +import pytest ONE = """id,name,age 1,Cleo,4 @@ -115,3 +116,38 @@ def test_tsv(): "columns_added": [], "columns_removed": [], } == diff + + +def test_trailing_blank_line_ignored(): + # Issue #29: a trailing blank line (an extra newline after the last row) + # should not crash the tool with a KeyError on the key column. 
+ csv_text = "a,b,c\n1,2,3\n\n" + assert load_csv(io.StringIO(csv_text), key="a") == { + "1": {"a": "1", "b": "2", "c": "3"} + } + + +def test_interior_blank_line_ignored(): + csv_text = "a,b,c\n1,2,3\n\n4,5,6\n" + assert load_csv(io.StringIO(csv_text), key="a") == { + "1": {"a": "1", "b": "2", "c": "3"}, + "4": {"a": "4", "b": "5", "c": "6"}, + } + + +def test_trailing_blank_line_with_no_key(): + csv_text = "a,b,c\n1,2,3\n\n" + loaded = load_csv(io.StringIO(csv_text)) + assert list(loaded.values()) == [{"a": "1", "b": "2", "c": "3"}] + + +def test_mismatched_row_length_raises_clear_error(): + csv_text = "a,b,c\n1,2,3\n4,5\n" + with pytest.raises(ValueError, match=r"line 3.*2 field.*3"): + load_csv(io.StringIO(csv_text), key="a") + + +def test_missing_key_column_raises_clear_error(): + csv_text = "a,b,c\n1,2,3\n" + with pytest.raises(ValueError, match=r"Key column 'z' not present"): + load_csv(io.StringIO(csv_text), key="z")