From c9d85ae60bc33153b518177e422f270c544992d8 Mon Sep 17 00:00:00 2001
From: David Cottrell <cottrell@users.noreply.github.com>
Date: Sat, 18 Oct 2025 08:51:26 +0100
Subject: [PATCH] GH-47897: [C++][Python] Allow default column type for CSV
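 columns

Add a `column_type` member to `csv::ConvertOptions` and expose it in
pyarrow. When set, it is applied to every column that has no entry in
`column_types` (including synthesized missing columns) instead of
inferring the type; explicit `column_types` entries take precedence.
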
---
 cpp/src/arrow/csv/options.h          |  2 ++
 cpp/src/arrow/csv/reader.cc          | 15 ++++++++--
 docs/source/python/csv.rst           |  5 ++++
 python/pyarrow/_csv.pyx              | 27 +++++++++++++++--
 python/pyarrow/includes/libarrow.pxd |  1 +
 python/pyarrow/tests/test_csv.py     | 43 ++++++++++++++++++++++++++--
 6 files changed, 85 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 7723dcedc61..8f0531ad6e5 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -76,6 +76,8 @@ struct ARROW_EXPORT ConvertOptions {
   bool check_utf8 = true;
   /// Optional per-column types (disabling type inference on those columns)
   std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
+  /// Optional type for columns not listed in column_types (disabling type inference)
+  std::shared_ptr<DataType> column_type;
   /// Recognized spellings for null values
   std::vector<std::string> null_values;
   /// Recognized spellings for boolean true values
diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
index fdc7fcb1380..ec6ac52fdb5 100644
--- a/cpp/src/arrow/csv/reader.cc
+++ b/cpp/src/arrow/csv/reader.cc
@@ -652,8 +652,13 @@ class ReaderMixin {
       // Does the named column have a fixed type?
       auto it = convert_options_.column_types.find(col_name);
       if (it == convert_options_.column_types.end()) {
-        conversion_schema_.columns.push_back(
-            ConversionSchema::InferredColumn(std::move(col_name), col_index));
+        if (convert_options_.column_type) {
+          conversion_schema_.columns.push_back(ConversionSchema::TypedColumn(
+              std::move(col_name), col_index, convert_options_.column_type));
+        } else {
+          conversion_schema_.columns.push_back(
+              ConversionSchema::InferredColumn(std::move(col_name), col_index));
+        }
       } else {
         conversion_schema_.columns.push_back(
             ConversionSchema::TypedColumn(std::move(col_name), col_index, it->second));
@@ -666,7 +671,11 @@ class ReaderMixin {
       std::shared_ptr<DataType> type;
       auto it = convert_options_.column_types.find(col_name);
       if (it == convert_options_.column_types.end()) {
-        type = null();
+        if (convert_options_.column_type) {
+          type = convert_options_.column_type;
+        } else {
+          type = null();
+        }
       } else {
         type = it->second;
       }
diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst
index f2c344a6fb8..559b80aee2a 100644
--- a/docs/source/python/csv.rst
+++ b/docs/source/python/csv.rst
@@ -125,12 +125,17 @@ a :class:`ConvertOptions` instance and pass it to :func:`read_csv`::
        }
    ))
 
+   table = csv.read_csv('tips.csv.gz', convert_options=csv.ConvertOptions(
+       column_type=pa.string()
+   ))
+
 Available convert options are:
 
 .. autosummary::
 
   ~ConvertOptions.check_utf8
   ~ConvertOptions.column_types
+  ~ConvertOptions.column_type
   ~ConvertOptions.null_values
   ~ConvertOptions.true_values
   ~ConvertOptions.false_values
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 0ac32f1bbf2..124d6ea9618 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -591,6 +591,10 @@ cdef class ConvertOptions(_Weakrefable):
     column_types : pyarrow.Schema or dict, optional
         Explicitly map column names to column types. Passing this argument
         disables type inference on the defined columns.
+    column_type : DataType or compatible input, optional
+        Apply the provided type to any column that does not have an entry in
+        ``column_types``. When set, type inference is disabled for unspecified
+        columns.
     null_values : list, optional
         A sequence of strings that denote nulls in the data
         (defaults are appropriate in most cases). Note that by default,
@@ -787,7 +791,8 @@ cdef class ConvertOptions(_Weakrefable):
         self.options.reset(
             new CCSVConvertOptions(CCSVConvertOptions.Defaults()))
 
-    def __init__(self, *, check_utf8=None, column_types=None, null_values=None,
+    def __init__(self, *, check_utf8=None, column_types=None, column_type=None,
+                 null_values=None,
                  true_values=None, false_values=None, decimal_point=None,
                  strings_can_be_null=None, quoted_strings_can_be_null=None,
                  include_columns=None, include_missing_columns=None,
@@ -797,6 +802,8 @@ cdef class ConvertOptions(_Weakrefable):
             self.check_utf8 = check_utf8
         if column_types is not None:
             self.column_types = column_types
+        if column_type is not None:
+            self.column_type = column_type
         if null_values is not None:
             self.null_values = null_values
         if true_values is not None:
@@ -878,9 +885,25 @@ cdef class ConvertOptions(_Weakrefable):
             else:
                 k, v = item
             typ = pyarrow_unwrap_data_type(ensure_type(v))
-            assert typ != NULL
             deref(self.options).column_types[tobytes(k)] = typ
 
+    @property
+    def column_type(self):
+        """
+        Default type for columns absent from ``column_types``, or None.
+        """
+        cdef shared_ptr[CDataType] typ = deref(self.options).column_type
+        if typ.get() == NULL:
+            return None
+        return pyarrow_wrap_data_type(typ)
+
+    @column_type.setter
+    def column_type(self, value):
+        if value is None:  # unset the default: back to type inference
+            deref(self.options).column_type.reset()
+            return
+        deref(self.options).column_type = pyarrow_unwrap_data_type(ensure_type(value))
+
     @property
     def null_values(self):
         """
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 80a087740fa..7e20ae1ad16 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1751,6 +1751,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
     cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions":
         c_bool check_utf8
         unordered_map[c_string, shared_ptr[CDataType]] column_types
+        shared_ptr[CDataType] column_type
         vector[c_string] null_values
         vector[c_string] true_values
         vector[c_string] false_values
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index e92afce0352..d8fc3a33785 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -253,7 +253,8 @@ def test_convert_options():
         include_columns=[[], ['def', 'abc']],
         include_missing_columns=[False, True],
         auto_dict_encode=[False, True],
-        timestamp_parsers=[[], [ISO8601, '%y-%m']])
+        timestamp_parsers=[[], [ISO8601, '%y-%m']],
+        column_type=[None, pa.string()])
 
     check_options_class_pickling(
         cls, check_utf8=False,
@@ -263,7 +264,8 @@ def test_convert_options():
         include_columns=['def', 'abc'],
         include_missing_columns=False,
         auto_dict_encode=True,
-        timestamp_parsers=[ISO8601, '%y-%m'])
+        timestamp_parsers=[ISO8601, '%y-%m'],
+        column_type=pa.string())
 
     with pytest.raises(ValueError):
         opts.decimal_point = '..'
@@ -312,12 +314,14 @@ def test_convert_options():
     opts = cls(column_types={'a': pa.null()},
                null_values=['N', 'nn'], true_values=['T', 'tt'],
                false_values=['F', 'ff'], auto_dict_max_cardinality=999,
-               timestamp_parsers=[ISO8601, '%Y-%m-%d'])
+               timestamp_parsers=[ISO8601, '%Y-%m-%d'],
+               column_type=pa.string())
     assert opts.column_types == {'a': pa.null()}
     assert opts.null_values == ['N', 'nn']
     assert opts.false_values == ['F', 'ff']
     assert opts.true_values == ['T', 'tt']
     assert opts.auto_dict_max_cardinality == 999
+    assert opts.column_type == pa.string()
     assert opts.timestamp_parsers == [ISO8601, '%Y-%m-%d']
 
 
@@ -1223,6 +1227,39 @@ def test_column_types(self):
         assert "In CSV column #1: " in err
         assert "CSV conversion error to float: invalid value 'XXX'" in err
 
+    def test_column_type_default(self):
+        # Apply a single type to all columns without enumerating names
+        rows = b"a,b\n1,2\n3,4\n"
+        opts = ConvertOptions(column_type=pa.string())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()), ('b', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': ["1", "3"], 'b': ["2", "4"]}
+
+        # A numeric default type replaces inference for every column
+        opts = ConvertOptions(column_type=pa.int64())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': [1, 3], 'b': [2, 4]}
+
+        # Explicit column_types entries still win over the default
+        opts = ConvertOptions(column_type=pa.float64(),
+                              column_types={'b': pa.string()})
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.float64()), ('b', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': [1.0, 3.0], 'b': ["2", "4"]}
+
+        # Synthesized missing columns (all-null) also get the default type
+        opts = ConvertOptions(include_columns=['a', 'missing'],
+                              include_missing_columns=True,
+                              column_type=pa.string())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()), ('missing', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {'a': ["1", "3"], 'missing': [None, None]}
+
     def test_column_types_dict(self):
         # Ask for dict-encoded column types in ConvertOptions
         column_types = [