Skip to content

Commit 2b00ad2

Browse files
committed
TST: Add tests for on_bad_lines with dtype conversion failures #63168
1 parent 75db71c commit 2b00ad2

File tree

1 file changed

+108
-0
lines changed

1 file changed

+108
-0
lines changed

pandas/tests/io/parser/common/test_read_errors.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,3 +313,111 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers):
313313
):
314314
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
315315
tm.assert_frame_equal(result, expected)
316+
317+
318+
@pytest.mark.parametrize(
    "on_bad_lines,should_warn",
    [
        ("skip", False),
        ("warn", True),
    ],
)
def test_on_bad_lines_dtype_conversion_skip(c_parser_only, on_bad_lines, should_warn):
    # GH#63168 - on_bad_lines should handle dtype conversion failures
    # For "warn" a ParserWarning is additionally required; for "skip" the read
    # is performed without any warning check.
    parser = c_parser_only
    data = "col1,col2,col3\n1,2,3\na,4,5\n4,5,6"

    # Built once and shared by both branches so the two reads cannot drift apart
    read_kwargs = {
        "dtype": {"col1": int, "col2": int, "col3": int},
        "on_bad_lines": on_bad_lines,
    }

    if should_warn:
        with tm.assert_produces_warning(
            ParserWarning,
            match="Could not convert column|Skipped .* line",
            check_stacklevel=False,
        ):
            result = parser.read_csv(StringIO(data), **read_kwargs)
    else:
        result = parser.read_csv(StringIO(data), **read_kwargs)

    # Row with 'a' cannot convert to int, should be skipped
    expected = DataFrame({"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]})
    tm.assert_frame_equal(result, expected)
351+
352+
353+
def test_on_bad_lines_dtype_conversion_error(c_parser_only):
    # GH#63168 - with on_bad_lines="error" a value that cannot be cast to the
    # requested dtype is expected to raise rather than be dropped
    csv_text = "col1,col2\n1,2\na,3"
    int_dtypes = {"col1": int, "col2": int}
    msg = "invalid literal for int"

    with pytest.raises(ValueError, match=msg):
        c_parser_only.read_csv(
            StringIO(csv_text),
            dtype=int_dtypes,
            on_bad_lines="error",
        )
364+
365+
366+
def test_on_bad_lines_dtype_float_conversion(c_parser_only):
    # GH#63168 - a non-numeric entry ("foo") in a float column; with
    # on_bad_lines="skip" only the two fully numeric rows are expected back
    csv_text = "a,b\n1.5,2.5\nfoo,3.5\n4.5,5.5"
    float_dtypes = {"a": float, "b": float}
    expected = DataFrame({"a": [1.5, 4.5], "b": [2.5, 5.5]})

    buffer = StringIO(csv_text)
    result = c_parser_only.read_csv(buffer, dtype=float_dtypes, on_bad_lines="skip")

    tm.assert_frame_equal(result, expected)
379+
380+
381+
def test_on_bad_lines_dtype_partial_columns(c_parser_only):
    # GH#63168 - dtype is given for "a" and "c" only; "b" is left unspecified.
    # The middle row has "foo" in the int column "a" and is expected to be
    # dropped under on_bad_lines="skip".
    csv_text = "a,b,c\n1,hello,3\nfoo,world,6\n4,test,9"
    partial_dtypes = {"a": int, "c": int}
    expected = DataFrame({"a": [1, 4], "b": ["hello", "test"], "c": [3, 9]})

    buffer = StringIO(csv_text)
    result = c_parser_only.read_csv(
        buffer, dtype=partial_dtypes, on_bad_lines="skip"
    )

    tm.assert_frame_equal(result, expected)
394+
395+
396+
def test_on_bad_lines_dtype_mixed_errors(c_parser_only):
    # GH#63168 - Mix of structural errors (wrong field count) and dtype errors
    parser = c_parser_only
    # Row "10,11,12,13" has one field too many: read_csv documents a bad line
    # as "a line with too many fields", so this is the structural error. Its
    # values are all valid ints, so it can only be dropped for its field count.
    # Row "foo,4,5" has the right field count but "foo" cannot convert to int.
    data = "a,b,c\n1,2,3\n10,11,12,13\nfoo,4,5\n6,7,8"

    result = parser.read_csv(
        StringIO(data),
        dtype={"a": int, "b": int, "c": int},
        on_bad_lines="skip",
    )

    # Only the two well-formed, fully numeric rows should remain
    expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]})
    tm.assert_frame_equal(result, expected)
409+
410+
411+
def test_on_bad_lines_dtype_all_bad_rows(c_parser_only):
    # GH#63168 - no data row holds values convertible to int, so with
    # on_bad_lines="skip" the expected result is an empty int-typed frame
    csv_text = "a,b\nfoo,bar\nbaz,qux"
    int_dtypes = {"a": int, "b": int}

    empty = DataFrame({"a": [], "b": []})
    expected = empty.astype(int)

    buffer = StringIO(csv_text)
    result = c_parser_only.read_csv(buffer, dtype=int_dtypes, on_bad_lines="skip")

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)