Skip to content

Commit 3fd0acc

Browse files
fahadabd1elsayedhazemm
authored and committed
FIX: added on_bad_lines support for dtype conversion failures #63168
1 parent 499c5d4 commit 3fd0acc

File tree

1 file changed

+96
-0
lines changed

1 file changed

+96
-0
lines changed

pandas/_libs/parsers.pyx

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,11 @@ cdef class TextReader:
937937
int64_t num_cols
938938
dict results
939939
bint is_default_dict_dtype
940+
set bad_rows
941+
dict failed_columns_dtypes
942+
943+
bad_rows = set()
944+
failed_columns_dtypes = {}
940945

941946
start = self.parser_start
942947

@@ -1009,6 +1014,26 @@ cdef class TextReader:
10091014
col_res, na_count = self._convert_tokens(
10101015
i, start, end, name, na_filter, na_hashset,
10111016
na_fset, col_dtype)
1017+
except (ValueError, TypeError, OverflowError) as e:
1018+
# GH#63168: Handle dtype conversion failures based on on_bad_lines
1019+
if self.parser.on_bad_lines == SKIP or self.parser.on_bad_lines == WARN:
1020+
# Fall back to string conversion
1021+
col_res, na_count = self._string_convert(
1022+
i, start, end, na_filter, na_hashset)
1023+
1024+
# Track this column's intended dtype for later bad row detection
1025+
if col_dtype is not None:
1026+
failed_columns_dtypes[i] = col_dtype
1027+
1028+
if self.parser.on_bad_lines == WARN:
1029+
warnings.warn(
1030+
f"Could not convert column {name} to dtype {col_dtype}: "
1031+
f"{e}. Rows with unconvertible values will be skipped.",
1032+
ParserWarning,
1033+
stacklevel=find_stack_level()
1034+
)
1035+
else:
1036+
raise
10121037
finally:
10131038
# gh-21353
10141039
#
@@ -1034,6 +1059,32 @@ cdef class TextReader:
10341059

10351060
results[i] = col_res
10361061

1062+
# GH#63168: Filter out bad rows if on_bad_lines is SKIP or WARN
1063+
if failed_columns_dtypes:
1064+
# Identify bad rows from columns that failed dtype conversion
1065+
for col_idx, target_dtype in failed_columns_dtypes.items():
1066+
col_values = results[col_idx]
1067+
bad_row_indices = _identify_bad_rows(col_values, target_dtype)
1068+
bad_rows.update(bad_row_indices)
1069+
1070+
if bad_rows:
1071+
num_rows = end - start
1072+
good_mask = np.ones(num_rows, dtype=np.bool_)
1073+
for bad_idx in bad_rows:
1074+
good_mask[bad_idx] = False
1075+
1076+
# Filter all columns to keep only good rows
1077+
for col_idx in results:
1078+
results[col_idx] = results[col_idx][good_mask]
1079+
1080+
if self.parser.on_bad_lines == WARN:
1081+
warnings.warn(
1082+
f"Skipped {len(bad_rows)} line(s) due to dtype "
1083+
f"conversion errors.",
1084+
ParserWarning,
1085+
stacklevel=find_stack_level()
1086+
)
1087+
10371088
self.parser_start += end - start
10381089

10391090
return results
@@ -1404,6 +1455,51 @@ STR_NA_VALUES = {
14041455
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
14051456

14061457

1458+
def _identify_bad_rows(values, dtype):
1459+
"""
1460+
Identify row indices where values cannot be converted to the target dtype.
1461+
1462+
GH#63168: Used to find rows that should be skipped when on_bad_lines='skip'.
1463+
1464+
Parameters
1465+
----------
1466+
values : ndarray
1467+
Array of values (typically strings/objects) to check.
1468+
dtype : numpy dtype
1469+
Target dtype to check conversion against.
1470+
1471+
Returns
1472+
-------
1473+
set
1474+
Set of row indices (0-based) that cannot be converted.
1475+
"""
1476+
bad_indices = set()
1477+
1478+
for idx in range(len(values)):
1479+
val = values[idx]
1480+
1481+
# Skip None/NaN values - they're handled separately
1482+
if val is None:
1483+
continue
1484+
if isinstance(val, float) and np.isnan(val):
1485+
continue
1486+
if isinstance(val, str) and val.strip() == "":
1487+
continue
1488+
1489+
try:
1490+
if dtype.kind in "iu": # integer types
1491+
int(val)
1492+
elif dtype.kind == "f": # float types
1493+
float(val)
1494+
elif dtype.kind == "b": # boolean
1495+
# Boolean conversion is more complex, skip for now
1496+
pass
1497+
except (ValueError, TypeError):
1498+
bad_indices.add(idx)
1499+
1500+
return bad_indices
1501+
1502+
14071503
def _maybe_upcast(
14081504
arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy"
14091505
):

0 commit comments

Comments
 (0)