From c9d85ae60bc33153b518177e422f270c544992d8 Mon Sep 17 00:00:00 2001 From: David Cottrell Date: Sat, 18 Oct 2025 08:51:26 +0100 Subject: [PATCH] GH-47897: [C++][Python] Allow default column type for CSV columns --- cpp/src/arrow/csv/options.h | 2 ++ cpp/src/arrow/csv/reader.cc | 15 ++++++++-- docs/source/python/csv.rst | 5 ++++ python/pyarrow/_csv.pyx | 27 +++++++++++++++-- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_csv.py | 43 ++++++++++++++++++++++++++-- 6 files changed, 85 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 7723dcedc61..8f0531ad6e5 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -76,6 +76,8 @@ struct ARROW_EXPORT ConvertOptions { bool check_utf8 = true; /// Optional per-column types (disabling type inference on those columns) std::unordered_map<std::string, std::shared_ptr<DataType>> column_types; + /// Optional type that will be applied to any column without an explicit entry + std::shared_ptr<DataType> column_type; /// Recognized spellings for null values std::vector<std::string> null_values; /// Recognized spellings for boolean true values diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index fdc7fcb1380..ec6ac52fdb5 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -652,8 +652,13 @@ class ReaderMixin { // Does the named column have a fixed type? 
auto it = convert_options_.column_types.find(col_name); if (it == convert_options_.column_types.end()) { - conversion_schema_.columns.push_back( - ConversionSchema::InferredColumn(std::move(col_name), col_index)); + if (convert_options_.column_type) { + conversion_schema_.columns.push_back(ConversionSchema::TypedColumn( + std::move(col_name), col_index, convert_options_.column_type)); + } else { + conversion_schema_.columns.push_back( + ConversionSchema::InferredColumn(std::move(col_name), col_index)); + } } else { conversion_schema_.columns.push_back( ConversionSchema::TypedColumn(std::move(col_name), col_index, it->second)); @@ -666,7 +671,11 @@ class ReaderMixin { std::shared_ptr<DataType> type; auto it = convert_options_.column_types.find(col_name); if (it == convert_options_.column_types.end()) { - type = null(); + if (convert_options_.column_type) { + type = convert_options_.column_type; + } else { + type = null(); + } } else { type = it->second; } diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst index f2c344a6fb8..559b80aee2a 100644 --- a/docs/source/python/csv.rst +++ b/docs/source/python/csv.rst @@ -125,12 +125,17 @@ a :class:`ConvertOptions` instance and pass it to :func:`read_csv`:: } )) + table = csv.read_csv('tips.csv.gz', convert_options=csv.ConvertOptions( + column_type=pa.string() + )) + Available convert options are: .. autosummary:: ~ConvertOptions.check_utf8 ~ConvertOptions.column_types + ~ConvertOptions.column_type ~ConvertOptions.null_values ~ConvertOptions.true_values ~ConvertOptions.false_values diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 0ac32f1bbf2..124d6ea9618 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -591,6 +591,10 @@ cdef class ConvertOptions(_Weakrefable): column_types : pyarrow.Schema or dict, optional Explicitly map column names to column types. Passing this argument disables type inference on the defined columns. 
+ column_type : DataType or compatible input, optional + Apply the provided type to any column that does not have an entry in + ``column_types``. When set, type inference is disabled for unspecified + columns. null_values : list, optional A sequence of strings that denote nulls in the data (defaults are appropriate in most cases). Note that by default, @@ -787,7 +791,8 @@ cdef class ConvertOptions(_Weakrefable): self.options.reset( new CCSVConvertOptions(CCSVConvertOptions.Defaults())) - def __init__(self, *, check_utf8=None, column_types=None, null_values=None, + def __init__(self, *, check_utf8=None, column_types=None, column_type=None, + null_values=None, true_values=None, false_values=None, decimal_point=None, strings_can_be_null=None, quoted_strings_can_be_null=None, include_columns=None, include_missing_columns=None, @@ -797,6 +802,8 @@ cdef class ConvertOptions(_Weakrefable): self.check_utf8 = check_utf8 if column_types is not None: self.column_types = column_types + if column_type is not None: + self.column_type = column_type if null_values is not None: self.null_values = null_values if true_values is not None: @@ -878,9 +885,25 @@ cdef class ConvertOptions(_Weakrefable): else: k, v = item typ = pyarrow_unwrap_data_type(ensure_type(v)) - assert typ != NULL deref(self.options).column_types[tobytes(k)] = typ + @property + def column_type(self): + """ + Default type applied to columns without explicit mappings. 
+ """ + cdef shared_ptr[CDataType] typ = deref(self.options).column_type + if typ.get() == NULL: + return None + return pyarrow_wrap_data_type(typ) + + @column_type.setter + def column_type(self, value): + if value is None: + deref(self.options).column_type.reset() + return + deref(self.options).column_type = pyarrow_unwrap_data_type(ensure_type(value)) + @property def null_values(self): """ diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 80a087740fa..7e20ae1ad16 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1751,6 +1751,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions": c_bool check_utf8 unordered_map[c_string, shared_ptr[CDataType]] column_types + shared_ptr[CDataType] column_type vector[c_string] null_values vector[c_string] true_values vector[c_string] false_values diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index e92afce0352..d8fc3a33785 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -253,7 +253,8 @@ def test_convert_options(): include_columns=[[], ['def', 'abc']], include_missing_columns=[False, True], auto_dict_encode=[False, True], - timestamp_parsers=[[], [ISO8601, '%y-%m']]) + timestamp_parsers=[[], [ISO8601, '%y-%m']], + column_type=[None, pa.string()]) check_options_class_pickling( cls, check_utf8=False, @@ -263,7 +264,8 @@ def test_convert_options(): include_columns=['def', 'abc'], include_missing_columns=False, auto_dict_encode=True, - timestamp_parsers=[ISO8601, '%y-%m']) + timestamp_parsers=[ISO8601, '%y-%m'], + column_type=pa.string()) with pytest.raises(ValueError): opts.decimal_point = '..' 
@@ -312,12 +314,14 @@ def test_convert_options(): opts = cls(column_types={'a': pa.null()}, null_values=['N', 'nn'], true_values=['T', 'tt'], false_values=['F', 'ff'], auto_dict_max_cardinality=999, - timestamp_parsers=[ISO8601, '%Y-%m-%d']) + timestamp_parsers=[ISO8601, '%Y-%m-%d'], + column_type=pa.string()) assert opts.column_types == {'a': pa.null()} assert opts.null_values == ['N', 'nn'] assert opts.false_values == ['F', 'ff'] assert opts.true_values == ['T', 'tt'] assert opts.auto_dict_max_cardinality == 999 + assert opts.column_type == pa.string() assert opts.timestamp_parsers == [ISO8601, '%Y-%m-%d'] @@ -1223,6 +1227,39 @@ def test_column_types(self): assert "In CSV column #1: " in err assert "CSV conversion error to float: invalid value 'XXX'" in err + def test_column_type_default(self): + # Apply a single type to all columns without enumerating names + rows = b"a,b\n1,2\n3,4\n" + opts = ConvertOptions(column_type=pa.string()) + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.string()), ('b', pa.string())]) + assert table.schema == schema + assert table.to_pydict() == {'a': ["1", "3"], 'b': ["2", "4"]} + + # Numeric defaults should coerce all inferred columns + opts = ConvertOptions(column_type=pa.int64()) + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.int64()), ('b', pa.int64())]) + assert table.schema == schema + assert table.to_pydict() == {'a': [1, 3], 'b': [2, 4]} + + # Explicit column_types entries still win over the default + opts = ConvertOptions(column_type=pa.float64(), + column_types={'b': pa.string()}) + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.float64()), ('b', pa.string())]) + assert table.schema == schema + assert table.to_pydict() == {'a': [1.0, 3.0], 'b': ["2", "4"]} + + # Missing columns should also use the default type when synthesized + opts = ConvertOptions(include_columns=['a', 'missing'], + include_missing_columns=True, 
+ column_type=pa.string()) + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.string()), ('missing', pa.string())]) + assert table.schema == schema + assert table.to_pydict() == {'a': ["1", "3"], 'missing': [None, None]} + def test_column_types_dict(self): # Ask for dict-encoded column types in ConvertOptions column_types = [