Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
3b49f62
Fix StringDType helper declaration and initialize UTF8
alippai Dec 7, 2025
6e4c3c6
Fix NumPy string dtype allocator guard
alippai Dec 8, 2025
a90ea23
Remove StringDType header comment
alippai Dec 8, 2025
8729eb3
Format numpy_to_arrow include
alippai Dec 8, 2025
f49ba67
Run clang-format on numpy_to_arrow
alippai Dec 8, 2025
050ca86
Handle missing NumPy dtypes module in StringDType tests
alippai Dec 8, 2025
da255c9
Make StringDType support unconditional
alippai Dec 12, 2025
80a3aca
Remove StringDType endif comments
alippai Dec 12, 2025
bef2c71
Add StringDType mask coverage and sentinel test
alippai Dec 12, 2025
166dd05
Merge branch 'apache:main' into main
alippai Dec 25, 2025
7b48c99
Adjust NumPy StringDType availability check
alippai Dec 25, 2025
38b2ee1
Clarify StringDType copy path and view packing
alippai Dec 25, 2025
6633e1e
Enable NumPy StringDType API and fix writer build
alippai Dec 25, 2025
70a389d
Use PyArray_API table for NumPy StringDType
alippai Dec 25, 2025
c150bfb
Remove unnecessary experimental dtype define
alippai Dec 25, 2025
5a21d1c
Fix StringDType writer run handling
alippai Dec 25, 2025
bdd2706
Fix StringConversionMode scoping and helper duplication
alippai Dec 25, 2025
ff3eaa9
Fix PandasOptions StringConversionMode declaration
alippai Dec 26, 2025
aff479b
Fix StringConversionMode enum mapping
alippai Dec 26, 2025
25343c2
Apply hook formatting fixes
alippai Dec 26, 2025
78e592c
Handle null validity when packing NumPy StringDType
alippai Dec 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New
from pyarrow.includes.libarrow_python cimport HasNumPyStringDType, StringConversionMode

from collections.abc import Sequence
import os
Expand Down Expand Up @@ -65,6 +66,30 @@ def _ndarray_to_arrow_type(object values, DataType type):
return pyarrow_wrap_data_type(_ndarray_to_type(values, type))


cdef inline StringConversionMode _resolve_string_conversion_mode(
        object string_dtype) except *:
    """
    Translate the user-facing ``string_dtype`` option into a
    StringConversionMode value.

    Parameters
    ----------
    string_dtype : bool, str or None
        ``True`` or one of ``"numpy"``, ``"string"``, ``"stringdtype"``
        selects ``StringConversionMode_STRING_DTYPE``.
        ``False``, ``None`` or one of ``"auto"``, ``"python"``, ``"object"``
        selects ``StringConversionMode_PYTHON_OBJECT``.
        String options are matched case-insensitively.

    Returns
    -------
    StringConversionMode

    Raises
    ------
    ValueError
        If ``string_dtype`` is not one of the accepted values.
    """
    # Identity checks on purpose: only the real bool singletons and None are
    # accepted, not arbitrary truthy/falsy objects.
    if string_dtype is True:
        return StringConversionMode_STRING_DTYPE
    if string_dtype is False or string_dtype is None:
        return StringConversionMode_PYTHON_OBJECT

    if isinstance(string_dtype, str):
        option = string_dtype.lower()
        if option in ("numpy", "string", "stringdtype"):
            return StringConversionMode_STRING_DTYPE
        # "auto" resolves to the object-dtype path, same as "python"/"object".
        if option in ("auto", "python", "object"):
            return StringConversionMode_PYTHON_OBJECT

    # The explicit `except *` in the signature makes sure this exception
    # propagates to the caller even though the return type is a C enum.
    raise ValueError(
        "string_dtype must be one of 'auto', 'numpy', 'string', "
        "'stringdtype', 'python', 'object', True, False or None; "
        f"got {string_dtype!r}"
    )


cdef shared_ptr[CDataType] _ndarray_to_type(object values,
DataType type) except *:
cdef shared_ptr[CDataType] c_type
Expand Down Expand Up @@ -1734,7 +1759,7 @@ cdef class Array(_PandasConvertible):
return values
return np.asarray(values, dtype=dtype)

def to_numpy(self, zero_copy_only=True, writable=False):
def to_numpy(self, zero_copy_only=True, writable=False, *, string_dtype="auto"):
"""
Return a NumPy view or copy of this array.

Expand All @@ -1757,6 +1782,14 @@ cdef class Array(_PandasConvertible):
By setting this to True, a copy of the array is made to ensure
it is writable.

string_dtype : {"auto", "numpy", "python", "object", True, False}, default "auto"
Controls how string-like arrays are converted when NumPy 2.0's
:class:`~numpy.dtypes.StringDType` is available. ``"numpy"`` or
``True`` will request StringDType (copying), ``"python"``/``"object"``
or ``False`` will force Python object dtype. ``"auto"`` preserves the
default object dtype unless StringDType is explicitly requested.
Converting to NumPy's StringDType always copies string data.

Returns
-------
array : numpy.ndarray
Expand All @@ -1775,6 +1808,11 @@ cdef class Array(_PandasConvertible):
raise ValueError(
"Cannot return a writable array if asking for zero-copy")

c_options.string_conversion_mode = _resolve_string_conversion_mode(string_dtype)
if c_options.string_conversion_mode == StringConversionMode_STRING_DTYPE:
if not HasNumPyStringDType():
raise NotImplementedError("NumPy StringDType not available")

# If there are nulls and the array is a DictionaryArray
# decoding the dictionary will make sure nulls are correctly handled.
# Decoding a dictionary does imply a copy by the way,
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[CTable] table,
PyObject** out)

c_bool HasNumPyStringDType()

void c_set_default_memory_pool \
" arrow::py::set_default_memory_pool"(CMemoryPool* pool)\

Expand All @@ -182,6 +184,11 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
cdef cppclass PyOutputStream(COutputStream):
PyOutputStream(object fo)

# Cython-side declaration of the C++ enum
# arrow::py::PandasOptions::StringConversionMode. Each member is bound by
# its fully qualified C++ name and is stored in
# PandasOptions.string_conversion_mode.
cdef enum StringConversionMode "arrow::py::PandasOptions::StringConversionMode":
# NOTE(review): AUTO is declared here but the option resolver in array.pxi
# never returns it; presumably it lets the C++ side pick -- confirm.
StringConversionMode_AUTO "arrow::py::PandasOptions::StringConversionMode::AUTO"
# Requests conversion to NumPy's StringDType; Array.to_numpy raises
# NotImplementedError for this mode when HasNumPyStringDType() is false.
StringConversionMode_STRING_DTYPE "arrow::py::PandasOptions::StringConversionMode::STRING_DTYPE"
# Requests conversion to Python objects (object dtype).
StringConversionMode_PYTHON_OBJECT "arrow::py::PandasOptions::StringConversionMode::PYTHON_OBJECT"

cdef cppclass PandasOptions:
CMemoryPool* pool
c_bool strings_to_categorical
Expand All @@ -201,6 +208,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[const unordered_set[c_string]] categorical_columns
shared_ptr[const unordered_set[c_string]] extension_columns
c_bool to_numpy
StringConversionMode string_conversion_mode


cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil:
Expand Down
Loading
Loading