diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 10e55bf838c..46cad8f2612 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -76,6 +76,8 @@ struct ARROW_EXPORT ConvertOptions {
   bool check_utf8 = true;
   /// Optional per-column types (disabling type inference on those columns)
   std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
+  /// Optional type applied to any column without an explicit column_types entry
+  std::shared_ptr<DataType> column_type;
   /// Recognized spellings for null values
   std::vector<std::string> null_values;
   /// Recognized spellings for boolean true values
diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
index 3c4e7e3da0c..fca8f1f7e3c 100644
--- a/cpp/src/arrow/csv/reader.cc
+++ b/cpp/src/arrow/csv/reader.cc
@@ -674,8 +674,13 @@ class ReaderMixin {
       // Does the named column have a fixed type?
       auto it = convert_options_.column_types.find(col_name);
       if (it == convert_options_.column_types.end()) {
-        conversion_schema_.columns.push_back(
-            ConversionSchema::InferredColumn(std::move(col_name), col_index));
+        if (convert_options_.column_type) {
+          conversion_schema_.columns.push_back(ConversionSchema::TypedColumn(
+              std::move(col_name), col_index, convert_options_.column_type));
+        } else {
+          conversion_schema_.columns.push_back(
+              ConversionSchema::InferredColumn(std::move(col_name), col_index));
+        }
       } else {
         conversion_schema_.columns.push_back(
             ConversionSchema::TypedColumn(std::move(col_name), col_index, it->second));
@@ -688,7 +693,11 @@ class ReaderMixin {
     std::shared_ptr<DataType> type;
     auto it = convert_options_.column_types.find(col_name);
     if (it == convert_options_.column_types.end()) {
-      type = null();
+      if (convert_options_.column_type) {
+        type = convert_options_.column_type;
+      } else {
+        type = null();
+      }
     } else {
       type = it->second;
     }
diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst
index 5eb68e9ccdc..80828b22bb8 100644
--- a/docs/source/python/csv.rst
+++ b/docs/source/python/csv.rst
@@ -125,6 +125,10 @@ a :class:`ConvertOptions` instance and pass it to :func:`read_csv`::
        }
    ))
 
+   table = csv.read_csv('tips.csv.gz', convert_options=csv.ConvertOptions(
+       column_type=pa.string()
+   ))
+
 .. note::
    To assign a column as ``duration``, the CSV values must be numeric
    strings that match the expected unit (e.g. ``60000`` for 60 seconds when
@@ -136,6 +140,7 @@ Available convert options are:
 
    ~ConvertOptions.check_utf8
    ~ConvertOptions.column_types
+   ~ConvertOptions.column_type
   ~ConvertOptions.null_values
   ~ConvertOptions.true_values
   ~ConvertOptions.false_values
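For context, a minimal sketch of how the new option composes with the existing ``column_types`` mapping, assuming a pyarrow build that includes this patch (the in-memory buffer and column names below are illustrative only): explicit ``column_types`` entries keep precedence, and every remaining column gets the blanket ``column_type`` instead of inference.

    import io

    import pyarrow as pa
    from pyarrow import csv

    data = io.BytesIO(b"a,b\n1,2\n3,4\n")
    # 'b' has an explicit mapping and keeps int32; 'a' has no entry, so it
    # falls back to the blanket column_type instead of being inferred as int64.
    opts = csv.ConvertOptions(column_type=pa.string(),
                              column_types={'b': pa.int32()})
    table = csv.read_csv(data, convert_options=opts)
    assert table.schema == pa.schema([('a', pa.string()), ('b', pa.int32())])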
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index ed9d20beb6b..bad2ac883e1 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -613,6 +613,10 @@ cdef class ConvertOptions(_Weakrefable):
     column_types : pyarrow.Schema or dict, optional
         Explicitly map column names to column types. Passing this argument
         disables type inference on the defined columns.
+    column_type : DataType or compatible input, optional
+        Apply the provided type to any column that does not have an entry in
+        ``column_types``. When set, type inference is disabled for unspecified
+        columns.
     null_values : list, optional
         A sequence of strings that denote nulls in the data (defaults are
         appropriate in most cases). Note that by default,
@@ -816,7 +820,8 @@ cdef class ConvertOptions(_Weakrefable):
         self.options.reset(
             new CCSVConvertOptions(CCSVConvertOptions.Defaults()))
 
-    def __init__(self, *, check_utf8=None, column_types=None, null_values=None,
+    def __init__(self, *, check_utf8=None, column_types=None, column_type=None,
+                 null_values=None,
                  true_values=None, false_values=None, decimal_point=None,
                  strings_can_be_null=None, quoted_strings_can_be_null=None,
                  include_columns=None, include_missing_columns=None,
@@ -826,6 +831,8 @@ cdef class ConvertOptions(_Weakrefable):
             self.check_utf8 = check_utf8
         if column_types is not None:
             self.column_types = column_types
+        if column_type is not None:
+            self.column_type = column_type
         if null_values is not None:
             self.null_values = null_values
         if true_values is not None:
@@ -907,9 +914,25 @@ cdef class ConvertOptions(_Weakrefable):
             else:
                 k, v = item
             typ = pyarrow_unwrap_data_type(ensure_type(v))
-            assert typ != NULL
             deref(self.options).column_types[tobytes(k)] = typ
 
+    @property
+    def column_type(self):
+        """
+        Default type applied to columns without explicit mappings.
+        """
+        cdef shared_ptr[CDataType] typ = deref(self.options).column_type
+        if typ.get() == NULL:
+            return None
+        return pyarrow_wrap_data_type(typ)
+
+    @column_type.setter
+    def column_type(self, value):
+        if value is None:
+            deref(self.options).column_type.reset()
+            return
+        deref(self.options).column_type = pyarrow_unwrap_data_type(ensure_type(value))
+
     @property
     def null_values(self):
         """
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index f294ee4d50b..6bec9921caf 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2104,6 +2104,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
     cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions":
         c_bool check_utf8
         unordered_map[c_string, shared_ptr[CDataType]] column_types
+        shared_ptr[CDataType] column_type
         vector[c_string] null_values
         vector[c_string] true_values
         vector[c_string] false_values
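And a quick sketch of the getter/setter round-trip added to ``ConvertOptions`` above, under the same patched-build assumption: the property defaults to None, accepts whatever ``ensure_type`` resolves (including string aliases, matching the existing ``column_types`` behavior), and assigning None clears the underlying shared_ptr.

    import pyarrow as pa
    from pyarrow import csv

    opts = csv.ConvertOptions()
    assert opts.column_type is None        # unset by default

    opts.column_type = 'string'            # ensure_type() resolves the alias
    assert opts.column_type == pa.string()

    opts.column_type = None                # takes the reset() path in the setter
    assert opts.column_type is None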
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index f510c6dbe23..f149b605562 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -286,7 +286,8 @@ def test_convert_options(pickle_module):
         include_columns=[[], ['def', 'abc']],
         include_missing_columns=[False, True],
         auto_dict_encode=[False, True],
-        timestamp_parsers=[[], [ISO8601, '%y-%m']])
+        timestamp_parsers=[[], [ISO8601, '%y-%m']],
+        column_type=[None, pa.string()])
 
     check_options_class_pickling(
         cls, pickler=pickle_module,
@@ -297,7 +298,8 @@ def test_convert_options(pickle_module):
         include_columns=['def', 'abc'],
         include_missing_columns=False,
         auto_dict_encode=True,
-        timestamp_parsers=[ISO8601, '%y-%m'])
+        timestamp_parsers=[ISO8601, '%y-%m'],
+        column_type=pa.string())
 
     with pytest.raises(ValueError):
         opts.decimal_point = '..'
@@ -346,12 +348,14 @@ def test_convert_options(pickle_module):
     opts = cls(column_types={'a': pa.null()},
                null_values=['N', 'nn'], true_values=['T', 'tt'],
                false_values=['F', 'ff'], auto_dict_max_cardinality=999,
-               timestamp_parsers=[ISO8601, '%Y-%m-%d'])
+               timestamp_parsers=[ISO8601, '%Y-%m-%d'],
+               column_type=pa.string())
     assert opts.column_types == {'a': pa.null()}
     assert opts.null_values == ['N', 'nn']
     assert opts.false_values == ['F', 'ff']
     assert opts.true_values == ['T', 'tt']
     assert opts.auto_dict_max_cardinality == 999
+    assert opts.column_type == pa.string()
     assert opts.timestamp_parsers == [ISO8601, '%Y-%m-%d']
 
@@ -1283,6 +1287,39 @@ def test_column_types(self):
         assert "In CSV column #1: " in err
         assert "CSV conversion error to float: invalid value 'XXX'" in err
 
+    def test_column_type_default(self):
+        # Apply a single type to all columns without enumerating names
+        rows = b"a,b\n1,2\n3,4\n"
+        opts = ConvertOptions(column_type=pa.string())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()), ('b', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': ["1", "3"], 'b': ["2", "4"]}
+
+        # Numeric defaults should coerce all inferred columns
+        opts = ConvertOptions(column_type=pa.int64())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': [1, 3], 'b': [2, 4]}
+
+        # Explicit column_types entries still win over the default
+        opts = ConvertOptions(column_type=pa.float64(),
+                              column_types={'b': pa.string()})
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.float64()), ('b', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': [1.0, 3.0], 'b': ["2", "4"]}
+
+        # Missing columns should also use the default type when synthesized
+        opts = ConvertOptions(include_columns=['a', 'missing'],
+                              include_missing_columns=True,
+                              column_type=pa.string())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()), ('missing', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': ["1", "3"], 'missing': [None, None]}
+
     def test_column_types_dict(self):
         # Ask for dict-encoded column types in ConvertOptions
         column_types = [