apache · cottrell · Oct 18, 2025 · Oct 21, 2025
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
@@ -76,6 +76,8 @@ struct ARROW_EXPORT ConvertOptions {
   bool check_utf8 = true;
   /// Optional per-column types (disabling type inference on those columns)
   std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
+  /// Optional default type for columns absent from column_types (disables inference)
+  std::shared_ptr<DataType> column_type;
   /// Recognized spellings for null values
   std::vector<std::string> null_values;
   /// Recognized spellings for boolean true values

diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
@@ -674,8 +674,13 @@ class ReaderMixin {
       // Does the named column have a fixed type?
       auto it = convert_options_.column_types.find(col_name);
       if (it == convert_options_.column_types.end()) {
-        conversion_schema_.columns.push_back(
-            ConversionSchema::InferredColumn(std::move(col_name), col_index));
+        if (convert_options_.column_type) {
+          conversion_schema_.columns.push_back(ConversionSchema::TypedColumn(
+              std::move(col_name), col_index, convert_options_.column_type));
+        } else {
+          conversion_schema_.columns.push_back(
+              ConversionSchema::InferredColumn(std::move(col_name), col_index));
+        }
       } else {
         conversion_schema_.columns.push_back(
             ConversionSchema::TypedColumn(std::move(col_name), col_index, it->second));
@@ -688,7 +693,11 @@ class ReaderMixin {
       std::shared_ptr<DataType> type;
       auto it = convert_options_.column_types.find(col_name);
       if (it == convert_options_.column_types.end()) {
-        type = null();
+        if (convert_options_.column_type) {
+          type = convert_options_.column_type;
+        } else {
+          type = null();
+        }
       } else {
         type = it->second;
       }

diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst
@@ -125,6 +125,10 @@ a :class:`ConvertOptions` instance and pass it to :func:`read_csv`::
        }
    ))
 
+   table = csv.read_csv('tips.csv.gz', convert_options=csv.ConvertOptions(
+       column_type=pa.string()
+   ))
+
 .. note::
    To assign a column as ``duration``, the CSV values must be numeric strings
    that match the expected unit (e.g. ``60000`` for 60 seconds when
@@ -136,6 +140,7 @@ Available convert options are:
 
   ~ConvertOptions.check_utf8
   ~ConvertOptions.column_types
+  ~ConvertOptions.column_type
   ~ConvertOptions.null_values
   ~ConvertOptions.true_values
   ~ConvertOptions.false_values

@@ -613,6 +613,10 @@ cdef class ConvertOptions(_Weakrefable):
     column_types : pyarrow.Schema or dict, optional
         Explicitly map column names to column types. Passing this argument
         disables type inference on the defined columns.
+    column_type : DataType or compatible input, optional
+        Apply the provided type to any column that does not have an entry in
+        ``column_types``. When set, type inference is disabled for unspecified
+        columns.
     null_values : list, optional
         A sequence of strings that denote nulls in the data
         (defaults are appropriate in most cases). Note that by default,
@@ -816,7 +820,8 @@ cdef class ConvertOptions(_Weakrefable):
         self.options.reset(
             new CCSVConvertOptions(CCSVConvertOptions.Defaults()))
 
-    def __init__(self, *, check_utf8=None, column_types=None, null_values=None,
+    def __init__(self, *, check_utf8=None, column_types=None, column_type=None,
+                 null_values=None,
                  true_values=None, false_values=None, decimal_point=None,
                  strings_can_be_null=None, quoted_strings_can_be_null=None,
                  include_columns=None, include_missing_columns=None,
@@ -826,6 +831,8 @@ cdef class ConvertOptions(_Weakrefable):
             self.check_utf8 = check_utf8
         if column_types is not None:
             self.column_types = column_types
+        if column_type is not None:
+            self.column_type = column_type
         if null_values is not None:
             self.null_values = null_values
         if true_values is not None:
@@ -907,9 +914,25 @@ cdef class ConvertOptions(_Weakrefable):
             else:
                 k, v = item
             typ = pyarrow_unwrap_data_type(ensure_type(v))
-            assert typ != NULL
             deref(self.options).column_types[tobytes(k)] = typ
 
+    @property
+    def column_type(self):
+        """
+        Type given to columns with no entry in ``column_types``; None if unset.
+        """
+        cdef shared_ptr[CDataType] fallback = deref(self.options).column_type
+        if fallback.get() != NULL:
+            return pyarrow_wrap_data_type(fallback)
+        return None
+
+    @column_type.setter
+    def column_type(self, value):
+        cdef shared_ptr[CDataType] c_type  # stays null when value is None
+        if value is not None:
+            c_type = pyarrow_unwrap_data_type(ensure_type(value))
+        deref(self.options).column_type = c_type
+
     @property
     def null_values(self):
         """

@@ -2104,6 +2104,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
     cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions":
         c_bool check_utf8
         unordered_map[c_string, shared_ptr[CDataType]] column_types
+        shared_ptr[CDataType] column_type
         vector[c_string] null_values
         vector[c_string] true_values
         vector[c_string] false_values

@@ -286,7 +286,8 @@ def test_convert_options(pickle_module):
         include_columns=[[], ['def', 'abc']],
         include_missing_columns=[False, True],
         auto_dict_encode=[False, True],
-        timestamp_parsers=[[], [ISO8601, '%y-%m']])
+        timestamp_parsers=[[], [ISO8601, '%y-%m']],
+        column_type=[None, pa.string()])
 
     check_options_class_pickling(
         cls, pickler=pickle_module,
@@ -297,7 +298,8 @@ def test_convert_options(pickle_module):
         include_columns=['def', 'abc'],
         include_missing_columns=False,
         auto_dict_encode=True,
-        timestamp_parsers=[ISO8601, '%y-%m'])
+        timestamp_parsers=[ISO8601, '%y-%m'],
+        column_type=pa.string())
 
     with pytest.raises(ValueError):
         opts.decimal_point = '..'
@@ -346,12 +348,14 @@ def test_convert_options(pickle_module):
     opts = cls(column_types={'a': pa.null()},
                null_values=['N', 'nn'], true_values=['T', 'tt'],
                false_values=['F', 'ff'], auto_dict_max_cardinality=999,
-               timestamp_parsers=[ISO8601, '%Y-%m-%d'])
+               timestamp_parsers=[ISO8601, '%Y-%m-%d'],
+               column_type=pa.string())
     assert opts.column_types == {'a': pa.null()}
     assert opts.null_values == ['N', 'nn']
     assert opts.false_values == ['F', 'ff']
     assert opts.true_values == ['T', 'tt']
     assert opts.auto_dict_max_cardinality == 999
+    assert opts.column_type == pa.string()
     assert opts.timestamp_parsers == [ISO8601, '%Y-%m-%d']
 
 
@@ -1283,6 +1287,39 @@ def test_column_types(self):
         assert "In CSV column #1: " in err
         assert "CSV conversion error to float: invalid value 'XXX'" in err
 
+    def test_column_type_default(self):
+        # Apply a single type to all columns without enumerating names
+        rows = b"a,b\n1,2\n3,4\n"
+        opts = ConvertOptions(column_type=pa.string())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()), ('b', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': ["1", "3"], 'b': ["2", "4"]}
+
+        # Numeric defaults should coerce all inferred columns
+        opts = ConvertOptions(column_type=pa.int64())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': [1, 3], 'b': [2, 4]}
+
+        # Explicit column_types entries still win over the default
+        opts = ConvertOptions(column_type=pa.float64(),
+                              column_types={'b': pa.string()})
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.float64()), ('b', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': [1.0, 3.0], 'b': ["2", "4"]}
+
+        # Missing columns should also use the default type when synthesized
+        opts = ConvertOptions(include_columns=['a', 'missing'],
+                              include_missing_columns=True,
+                              column_type=pa.string())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()), ('missing', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': ["1", "3"], 'missing': [None, None]}
+
     def test_column_types_dict(self):
         # Ask for dict-encoded column types in ConvertOptions
         column_types = [