From 3b49f62b04e1515f67f3f3ec7839a354a96975c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 18:28:39 -0500 Subject: [PATCH 01/20] Fix StringDType helper declaration and initialize UTF8 --- .../pyarrow/src/arrow/python/numpy_convert.cc | 13 ++ .../pyarrow/src/arrow/python/numpy_convert.h | 2 + .../src/arrow/python/numpy_to_arrow.cc | 126 ++++++++++++++++++ python/pyarrow/tests/test_array.py | 36 +++++ 4 files changed, 177 insertions(+) diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc index 4113cc67d2f..d5faef66193 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.cc +++ b/python/pyarrow/src/arrow/python/numpy_convert.cc @@ -122,6 +122,15 @@ Result> NumPyScalarToArrowDataType(PyObject* scalar) { return NumPyDtypeToArrow(descr); } +#if NPY_ABI_VERSION >= 0x02000000 +bool IsStringDType(PyArray_Descr* descr) { + // NumPy's variable-width StringDType exposes a dedicated dtype number. + return descr != nullptr && descr->type_num == NPY_VSTRING; +} +#else +bool IsStringDType(PyArray_Descr* /*descr*/) { return false; } +#endif + Result> NumPyDtypeToArrow(PyObject* dtype) { if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { return Status::TypeError("Did not pass numpy.dtype object"); @@ -133,6 +142,10 @@ Result> NumPyDtypeToArrow(PyObject* dtype) { Result> NumPyDtypeToArrow(PyArray_Descr* descr) { int type_num = fix_numpy_type_num(descr->type_num); + if (IsStringDType(descr)) { + return utf8(); + } + switch (type_num) { TO_ARROW_TYPE_CASE(BOOL, boolean); TO_ARROW_TYPE_CASE(INT8, int8); diff --git a/python/pyarrow/src/arrow/python/numpy_convert.h b/python/pyarrow/src/arrow/python/numpy_convert.h index 2d1086e1355..cac389d17a1 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.h +++ b/python/pyarrow/src/arrow/python/numpy_convert.h @@ -55,6 +55,8 @@ Result> NumPyDtypeToArrow(PyArray_Descr* descr); ARROW_PYTHON_EXPORT Result> NumPyScalarToArrowDataType(PyObject* scalar); +ARROW_PYTHON_EXPORT bool IsStringDType(PyArray_Descr* descr); + ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, const std::vector& dim_names, std::shared_ptr* out); diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 5647e895d0f..b4598d4f3b6 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,7 @@ #include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/scope_guard.h" #include "arrow/util/string.h" #include "arrow/util/utf8.h" #include "arrow/visit_type_inline.h" @@ -59,6 +61,10 @@ #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" +#if NPY_ABI_VERSION >= 0x02000000 +#include +#endif + namespace arrow { using internal::checked_cast; @@ -233,6 +239,13 @@ class NumPyConverter { Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); +#if NPY_ABI_VERSION >= 0x02000000 + template + Status AppendStringDTypeValues(Builder* builder); + + Status ConvertStringDType(); +#endif + Status Visit(const StructType& type); Status Visit(const FixedSizeBinaryType& type); @@ -338,6 +351,25 @@ Status NumPyConverter::Convert() { return Status::OK(); } + if (IsStringDType(dtype_)) { +#if NPY_ABI_VERSION >= 0x02000000 + RETURN_NOT_OK(ConvertStringDType()); + 
return Status::OK(); +#else + // Fall back to the generic Python sequence conversion path when the StringDType + // C API is unavailable. + PyConversionOptions py_options; + py_options.type = type_; + py_options.from_pandas = from_pandas_; + ARROW_ASSIGN_OR_RAISE( + auto chunked_array, + ConvertPySequence(reinterpret_cast(arr_), + reinterpret_cast(mask_), py_options, pool_)); + out_arrays_ = chunked_array->chunks(); + return Status::OK(); +#endif + } + if (type_ == nullptr) { return Status::Invalid("Must pass data type for non-object arrays"); } @@ -815,6 +847,100 @@ Status NumPyConverter::Visit(const StringViewType& type) { return Status::OK(); } +#if NPY_ABI_VERSION >= 0x02000000 + +template +Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { + auto* descr = reinterpret_cast(dtype_); + + PyAcquireGIL gil_lock; + + npy_string_allocator* allocator = NpyString_acquire_allocator(descr); + if (allocator == nullptr) { + return Status::Invalid("Failed to acquire NumPy StringDType allocator"); + } + + auto release_allocator = ::arrow::internal::MakeScopeGuard( + [&]() { NpyString_release_allocator(allocator); }); + + npy_static_string value = {0, nullptr}; + + auto append_value = [&](const npy_packed_static_string* packed) -> Status { + int rc = NpyString_load(allocator, packed, &value); + if (rc == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to unpack NumPy StringDType value"); + } + if (rc == 1) { + return builder->AppendNull(); + } + return builder->Append(std::string_view{value.buf, value.size}); + }; + + char* data = PyArray_BYTES(arr_); + + if (mask_ != nullptr) { + Ndarray1DIndexer mask_values(mask_); + for (int64_t i = 0; i < length_; ++i) { + if (mask_values[i]) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + const auto* packed = + reinterpret_cast(data + i * stride_); + RETURN_NOT_OK(append_value(packed)); + } + } + } else { + for (int64_t i = 0; i < length_; ++i) { + const auto* packed = reinterpret_cast(data); + RETURN_NOT_OK(append_value(packed)); + data += stride_; + } + } + + return Status::OK(); +} + +Status NumPyConverter::ConvertStringDType() { + util::InitializeUTF8(); + + if (type_ == nullptr) { + type_ = utf8(); + } + + switch (type_->id()) { + case Type::STRING: { + internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); + RETURN_NOT_OK(builder.Reserve(length_)); + RETURN_NOT_OK(AppendStringDTypeValues(&builder)); + + ArrayVector chunks; + RETURN_NOT_OK(builder.Finish(&chunks)); + for (const auto& chunk : chunks) { + RETURN_NOT_OK(PushArray(chunk->data())); + } + return Status::OK(); + } + case Type::LARGE_STRING: { + LargeStringBuilder builder(pool_); + RETURN_NOT_OK(builder.Reserve(length_)); + RETURN_NOT_OK(AppendStringDTypeValues(&builder)); + return PushBuilderResult(&builder); + } + case Type::STRING_VIEW: { + StringViewBuilder builder(pool_); + RETURN_NOT_OK(builder.Reserve(length_)); + RETURN_NOT_OK(AppendStringDTypeValues(&builder)); + return PushBuilderResult(&builder); + } + default: + return Status::TypeError( + "NumPy StringDType can only be converted to Arrow string types"); + } +} + +#endif + Status NumPyConverter::Visit(const StructType& type) { std::vector sub_converters; std::vector sub_arrays; diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ec361159c5f..a83e65bdf1c 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2758,6 +2758,42 @@ def test_array_from_numpy_unicode(string_type): assert arrow_arr.equals(expected) 
+@pytest.mark.numpy +def test_array_from_numpy_string_dtype(): + StringDType = getattr(np.dtypes, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + arr = np.array(["some", "strings"], dtype=StringDType()) + + arrow_arr = pa.array(arr) + + assert arrow_arr.type == pa.utf8() + assert arrow_arr.to_pylist() == ["some", "strings"] + + arrow_arr = pa.array(arr, type=pa.large_string()) + assert arrow_arr.type == pa.large_string() + assert arrow_arr.to_pylist() == ["some", "strings"] + + +@pytest.mark.numpy +def test_array_from_numpy_string_dtype_nulls_and_mask(): + StringDType = getattr(np.dtypes, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + dtype = StringDType(na_object=None) + arr = np.array(["this array has", None, "as an entry"], dtype=dtype) + + arrow_arr = pa.array(arr) + assert arrow_arr.type == pa.utf8() + assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"] + + mask = np.array([False, True, False]) + arrow_arr = pa.array(arr, mask=mask) + assert arrow_arr.to_pylist() == ["this array has", None, None] + + @pytest.mark.numpy def test_array_string_from_non_string(): # ARROW-5682 - when converting to string raise on non string-like dtype From 6e4c3c64c4278fa3138320370deec07d05476720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 19:28:45 -0500 Subject: [PATCH 02/20] Fix NumPy string dtype allocator guard --- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index b4598d4f3b6..5a6be35f5f0 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -44,7 +44,6 @@ #include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/scope_guard.h" #include "arrow/util/string.h" #include "arrow/util/utf8.h" #include "arrow/visit_type_inline.h" @@ -860,8 +859,8 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { return Status::Invalid("Failed to acquire NumPy StringDType allocator"); } - auto release_allocator = ::arrow::internal::MakeScopeGuard( - [&]() { NpyString_release_allocator(allocator); }); + std::unique_ptr + allocator_guard(allocator, &NpyString_release_allocator); npy_static_string value = {0, nullptr}; From a90ea23f5f006e3b07c9bf7d54de5c186a0b88a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 21:45:34 -0500 Subject: [PATCH 03/20] Remove StringDType header comment --- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 5a6be35f5f0..7e624c62751 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -60,10 +60,6 @@ #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" -#if NPY_ABI_VERSION >= 0x02000000 -#include -#endif - namespace arrow { using internal::checked_cast; From 8729eb3ca37413b60c3aa3c86bfda8481e1d4319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 22:36:49 -0500 Subject: [PATCH 04/20] Format numpy_to_arrow include --- .../src/arrow/python/numpy_to_arrow.cc | 22 
++++++++++++++----- python/pyarrow/tests/test_array.py | 14 ++++++++++++ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 7e624c62751..c6e9e549f14 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -60,6 +60,12 @@ #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" +#if NPY_ABI_VERSION >= 0x02000000 +// Needed for NpyString_acquire_allocator / NpyString_load / +// NpyString_release_allocator +# include +#endif + namespace arrow { using internal::checked_cast; @@ -848,22 +854,26 @@ template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); - PyAcquireGIL gil_lock; - npy_string_allocator* allocator = NpyString_acquire_allocator(descr); if (allocator == nullptr) { return Status::Invalid("Failed to acquire NumPy StringDType allocator"); } - std::unique_ptr - allocator_guard(allocator, &NpyString_release_allocator); + struct AllocatorGuard { + npy_string_allocator* ptr; + explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {} + ~AllocatorGuard() { + if (ptr != nullptr) { + NpyString_release_allocator(ptr); + } + } + } guard(allocator); npy_static_string value = {0, nullptr}; auto append_value = [&](const npy_packed_static_string* packed) -> Status { int rc = NpyString_load(allocator, packed, &value); if (rc == -1) { - RETURN_IF_PYERROR(); return Status::Invalid("Failed to unpack NumPy StringDType value"); } if (rc == 1) { @@ -905,7 +915,7 @@ Status NumPyConverter::ConvertStringDType() { switch (type_->id()) { case Type::STRING: { - internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); + arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); RETURN_NOT_OK(builder.Reserve(length_)); RETURN_NOT_OK(AppendStringDTypeValues(&builder)); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a83e65bdf1c..987c9f6621b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2771,10 +2771,24 @@ def test_array_from_numpy_string_dtype(): assert arrow_arr.type == pa.utf8() assert arrow_arr.to_pylist() == ["some", "strings"] + arrow_arr = pa.array(arr, type=pa.string()) + assert arrow_arr.type == pa.string() + assert arrow_arr.to_pylist() == ["some", "strings"] + arrow_arr = pa.array(arr, type=pa.large_string()) assert arrow_arr.type == pa.large_string() assert arrow_arr.to_pylist() == ["some", "strings"] + arrow_arr = pa.array(arr, type=pa.string_view()) + assert arrow_arr.type == pa.string_view() + assert arrow_arr.to_pylist() == ["some", "strings"] + + arr_full = np.array(["a", "b", "c", "d", "e"], dtype=StringDType()) + arr = arr_full[::2] + arrow_arr = pa.array(arr) + assert arrow_arr.type == pa.utf8() + assert arrow_arr.to_pylist() == ["a", "c", "e"] + @pytest.mark.numpy def test_array_from_numpy_string_dtype_nulls_and_mask(): From f49ba675b6c55b6b7da283b3a33fb387359a2ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 23:38:07 -0500 Subject: [PATCH 05/20] Run clang-format on numpy_to_arrow --- .../src/arrow/python/numpy_to_arrow.cc | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index c6e9e549f14..90d4a805d12 100644 --- 
a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -81,6 +81,37 @@ using internal::NumPyTypeSize; namespace { +#if NPY_ABI_VERSION >= 0x02000000 + +// NumPy exposes StringDType helpers in the C-API table from version 2.0 onward, +// but the corresponding macros are only available when compiling against a +// 2.0+ feature level. Arrow still targets an older feature level, so provide +// local wrappers that call the C-API entries directly. + +inline npy_string_allocator* ArrowNpyString_acquire_allocator( + const PyArray_StringDTypeObject* descr) { + using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); + auto func = reinterpret_cast(PyArray_API[316]); + return func(descr); +} + +inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) { + using Func = void (*)(npy_string_allocator*); + auto func = reinterpret_cast(PyArray_API[318]); + func(allocator); +} + +inline int ArrowNpyString_load(npy_string_allocator* allocator, + const npy_packed_static_string* packed, + npy_static_string* out) { + using Func = + int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*); + auto func = reinterpret_cast(PyArray_API[313]); + return func(allocator, packed, out); +} + +#endif // NPY_ABI_VERSION >= 0x02000000 + Status AllocateNullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { int64_t null_bytes = bit_util::BytesForBits(length); @@ -854,7 +885,7 @@ template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); - npy_string_allocator* allocator = NpyString_acquire_allocator(descr); + npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr); if (allocator == nullptr) { return Status::Invalid("Failed to acquire NumPy StringDType allocator"); } @@ -864,7 +895,7 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {} ~AllocatorGuard() { if (ptr != nullptr) { - NpyString_release_allocator(ptr); + ArrowNpyString_release_allocator(ptr); } } } guard(allocator); @@ -872,8 +903,9 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { npy_static_string value = {0, nullptr}; auto append_value = [&](const npy_packed_static_string* packed) -> Status { - int rc = NpyString_load(allocator, packed, &value); + int rc = ArrowNpyString_load(allocator, packed, &value); if (rc == -1) { + RETURN_IF_PYERROR(); return Status::Invalid("Failed to unpack NumPy StringDType value"); } if (rc == 1) { From 050ca867ad1a74d9b98f2aa1c321fc359562f875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Mon, 8 Dec 2025 00:24:03 -0500 Subject: [PATCH 06/20] Handle missing NumPy dtypes module in StringDType tests --- python/pyarrow/tests/test_array.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 987c9f6621b..f4d85904b3a 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2760,11 +2760,17 @@ def test_array_from_numpy_unicode(string_type): @pytest.mark.numpy def test_array_from_numpy_string_dtype(): - StringDType = getattr(np.dtypes, "StringDType", None) + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) if StringDType is None: 
pytest.skip("NumPy StringDType not available") - arr = np.array(["some", "strings"], dtype=StringDType()) + dtype = StringDType() + + arr = np.array(["some", "strings"], dtype=dtype) arrow_arr = pa.array(arr) @@ -2783,7 +2789,7 @@ def test_array_from_numpy_string_dtype(): assert arrow_arr.type == pa.string_view() assert arrow_arr.to_pylist() == ["some", "strings"] - arr_full = np.array(["a", "b", "c", "d", "e"], dtype=StringDType()) + arr_full = np.array(["a", "b", "c", "d", "e"], dtype=dtype) arr = arr_full[::2] arrow_arr = pa.array(arr) assert arrow_arr.type == pa.utf8() @@ -2792,10 +2798,15 @@ def test_array_from_numpy_string_dtype(): @pytest.mark.numpy def test_array_from_numpy_string_dtype_nulls_and_mask(): - StringDType = getattr(np.dtypes, "StringDType", None) + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) if StringDType is None: pytest.skip("NumPy StringDType not available") + # Real StringDType, use its NA sentinel dtype = StringDType(na_object=None) arr = np.array(["this array has", None, "as an entry"], dtype=dtype) @@ -2803,7 +2814,10 @@ def test_array_from_numpy_string_dtype_nulls_and_mask(): assert arrow_arr.type == pa.utf8() assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"] - mask = np.array([False, True, False]) + # Test interplay of NA sentinel and an explicit mask: + # - index 1 is null because of na_object / Python None + # - index 2 is forced null by the mask + mask = np.array([False, False, True]) arrow_arr = pa.array(arr, mask=mask) assert arrow_arr.to_pylist() == ["this array has", None, None] From da255c9ec0f8ec0f09cede930064c508866e3faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Fri, 12 Dec 2025 00:08:23 -0500 Subject: [PATCH 07/20] Make StringDType support unconditional --- .../pyarrow/src/arrow/python/numpy_convert.cc | 9 +- .../src/arrow/python/numpy_to_arrow.cc | 93 +++++++------------ python/pyarrow/tests/test_array.py | 28 ++++++ 3 files changed, 64 insertions(+), 66 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc index d5faef66193..facad8adfc8 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.cc +++ b/python/pyarrow/src/arrow/python/numpy_convert.cc @@ -37,6 +37,10 @@ namespace arrow { namespace py { +#ifndef NPY_VSTRING +# define NPY_VSTRING 2056 +#endif + NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) { PyAcquireGIL lock; arr_ = ao; @@ -122,14 +126,9 @@ Result> NumPyScalarToArrowDataType(PyObject* scalar) { return NumPyDtypeToArrow(descr); } -#if NPY_ABI_VERSION >= 0x02000000 bool IsStringDType(PyArray_Descr* descr) { - // NumPy's variable-width StringDType exposes a dedicated dtype number. 
return descr != nullptr && descr->type_num == NPY_VSTRING; } -#else -bool IsStringDType(PyArray_Descr* /*descr*/) { return false; } -#endif Result> NumPyDtypeToArrow(PyObject* dtype) { if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 90d4a805d12..e39fdadea2f 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -60,11 +60,7 @@ #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" -#if NPY_ABI_VERSION >= 0x02000000 -// Needed for NpyString_acquire_allocator / NpyString_load / -// NpyString_release_allocator -# include -#endif +#include namespace arrow { @@ -81,24 +77,15 @@ using internal::NumPyTypeSize; namespace { -#if NPY_ABI_VERSION >= 0x02000000 - -// NumPy exposes StringDType helpers in the C-API table from version 2.0 onward, -// but the corresponding macros are only available when compiling against a -// 2.0+ feature level. Arrow still targets an older feature level, so provide -// local wrappers that call the C-API entries directly. - inline npy_string_allocator* ArrowNpyString_acquire_allocator( const PyArray_StringDTypeObject* descr) { using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); - auto func = reinterpret_cast(PyArray_API[316]); - return func(descr); + return reinterpret_cast(PyArray_API[316])(descr); } inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) { using Func = void (*)(npy_string_allocator*); - auto func = reinterpret_cast(PyArray_API[318]); - func(allocator); + reinterpret_cast(PyArray_API[318])(allocator); } inline int ArrowNpyString_load(npy_string_allocator* allocator, @@ -106,12 +93,9 @@ inline int ArrowNpyString_load(npy_string_allocator* allocator, npy_static_string* out) { using Func = int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*); - auto func = reinterpret_cast(PyArray_API[313]); - return func(allocator, packed, out); + return reinterpret_cast(PyArray_API[313])(allocator, packed, out); } -#endif // NPY_ABI_VERSION >= 0x02000000 - Status AllocateNullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { int64_t null_bytes = bit_util::BytesForBits(length); @@ -271,12 +255,10 @@ class NumPyConverter { Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); -#if NPY_ABI_VERSION >= 0x02000000 template Status AppendStringDTypeValues(Builder* builder); Status ConvertStringDType(); -#endif Status Visit(const StructType& type); @@ -384,22 +366,8 @@ Status NumPyConverter::Convert() { } if (IsStringDType(dtype_)) { -#if NPY_ABI_VERSION >= 0x02000000 RETURN_NOT_OK(ConvertStringDType()); return Status::OK(); -#else - // Fall back to the generic Python sequence conversion path when the StringDType - // C API is unavailable. 
- PyConversionOptions py_options; - py_options.type = type_; - py_options.from_pandas = from_pandas_; - ARROW_ASSIGN_OR_RAISE( - auto chunked_array, - ConvertPySequence(reinterpret_cast(arr_), - reinterpret_cast(mask_), py_options, pool_)); - out_arrays_ = chunked_array->chunks(); - return Status::OK(); -#endif } if (type_ == nullptr) { @@ -879,8 +847,6 @@ Status NumPyConverter::Visit(const StringViewType& type) { return Status::OK(); } -#if NPY_ABI_VERSION >= 0x02000000 - template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); @@ -901,19 +867,6 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { } guard(allocator); npy_static_string value = {0, nullptr}; - - auto append_value = [&](const npy_packed_static_string* packed) -> Status { - int rc = ArrowNpyString_load(allocator, packed, &value); - if (rc == -1) { - RETURN_IF_PYERROR(); - return Status::Invalid("Failed to unpack NumPy StringDType value"); - } - if (rc == 1) { - return builder->AppendNull(); - } - return builder->Append(std::string_view{value.buf, value.size}); - }; - char* data = PyArray_BYTES(arr_); if (mask_ != nullptr) { @@ -921,18 +874,38 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder->AppendNull()); + continue; + } + + const auto* packed = + reinterpret_cast(data + i * stride_); + const int is_null = ArrowNpyString_load(allocator, packed, &value); + if (is_null == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to unpack NumPy StringDType value"); + } + if (is_null) { + RETURN_NOT_OK(builder->AppendNull()); } else { - const auto* packed = - reinterpret_cast(data + i * stride_); - RETURN_NOT_OK(append_value(packed)); + RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size})); } } - } else { - for (int64_t i = 0; i < length_; ++i) { - const auto* packed = reinterpret_cast(data); - RETURN_NOT_OK(append_value(packed)); - data += stride_; + return Status::OK(); + } + + for (int64_t i = 0; i < length_; ++i) { + const auto* packed = reinterpret_cast(data); + const int is_null = ArrowNpyString_load(allocator, packed, &value); + if (is_null == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to unpack NumPy StringDType value"); + } + if (is_null) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size})); } + data += stride_; } return Status::OK(); @@ -976,8 +949,6 @@ Status NumPyConverter::ConvertStringDType() { } } -#endif - Status NumPyConverter::Visit(const StructType& type) { std::vector sub_converters; std::vector sub_arrays; diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f4d85904b3a..a7377477dbe 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2796,6 +2796,28 @@ def test_array_from_numpy_string_dtype(): assert arrow_arr.to_pylist() == ["a", "c", "e"] +@pytest.mark.numpy +def test_numpy_stringdtype_thresholds_and_unicode(): + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + dtype = StringDType() + + short = "hello" + medium = "a" * 100 + long_ = "b" * 300 + unicode_ = "árvíztűrő tükörfúrógép 🥐 你好" + long_unicode = "🥐" * 200 + + arr = 
np.array([short, medium, long_, unicode_, long_unicode], dtype=dtype) + assert pa.array(arr).to_pylist() == [short, medium, long_, unicode_, long_unicode] + + @pytest.mark.numpy def test_array_from_numpy_string_dtype_nulls_and_mask(): dtypes_mod = getattr(np, "dtypes", None) @@ -2822,6 +2844,12 @@ def test_array_from_numpy_string_dtype_nulls_and_mask(): assert arrow_arr.to_pylist() == ["this array has", None, None] +@pytest.mark.numpy +def test_numpy_object_str_still_works(): + arr_obj = np.array(["x", "y", None], dtype=object) + assert pa.array(arr_obj).to_pylist() == ["x", "y", None] + + @pytest.mark.numpy def test_array_string_from_non_string(): # ARROW-5682 - when converting to string raise on non string-like dtype From 80a3aca59adb658533c2406920f3de8299c702ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Fri, 12 Dec 2025 00:38:55 -0500 Subject: [PATCH 08/20] Remove StringDType endif comments --- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index e39fdadea2f..b3e0dc0c17d 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -77,6 +77,7 @@ using internal::NumPyTypeSize; namespace { +#ifdef npy_string_allocator inline npy_string_allocator* ArrowNpyString_acquire_allocator( const PyArray_StringDTypeObject* descr) { using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); @@ -95,6 +96,7 @@ inline int ArrowNpyString_load(npy_string_allocator* allocator, int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*); return reinterpret_cast(PyArray_API[313])(allocator, packed, out); } +#endif Status AllocateNullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { @@ -255,10 +257,12 @@ class NumPyConverter { Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); +#ifdef npy_string_allocator template Status AppendStringDTypeValues(Builder* builder); Status ConvertStringDType(); +#endif Status Visit(const StructType& type); @@ -366,8 +370,13 @@ Status NumPyConverter::Convert() { } if (IsStringDType(dtype_)) { +#ifdef npy_string_allocator RETURN_NOT_OK(ConvertStringDType()); return Status::OK(); +#else + return Status::NotImplemented( + "NumPy StringDType requires building PyArrow with NumPy >= 2.0"); +#endif } if (type_ == nullptr) { @@ -847,6 +856,7 @@ Status NumPyConverter::Visit(const StringViewType& type) { return Status::OK(); } +#ifdef npy_string_allocator template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); @@ -948,6 +958,7 @@ Status NumPyConverter::ConvertStringDType() { "NumPy StringDType can only be converted to Arrow string types"); } } +#endif Status NumPyConverter::Visit(const StructType& type) { std::vector sub_converters; From bef2c71b3d45baae280a4496cb78382a9ffd2e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Fri, 12 Dec 2025 01:23:33 -0500 Subject: [PATCH 09/20] Add StringDType mask coverage and sentinel test --- .../src/arrow/python/numpy_to_arrow.cc | 8 ++--- python/pyarrow/tests/test_array.py | 29 ++++++++++++++++--- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index b3e0dc0c17d..dfbdd25a026 100644 --- 
a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -77,7 +77,7 @@ using internal::NumPyTypeSize; namespace { -#ifdef npy_string_allocator +#if NPY_ABI_VERSION >= 0x02000000 inline npy_string_allocator* ArrowNpyString_acquire_allocator( const PyArray_StringDTypeObject* descr) { using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); @@ -257,7 +257,7 @@ class NumPyConverter { Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); -#ifdef npy_string_allocator +#if NPY_ABI_VERSION >= 0x02000000 template Status AppendStringDTypeValues(Builder* builder); @@ -370,7 +370,7 @@ Status NumPyConverter::Convert() { } if (IsStringDType(dtype_)) { -#ifdef npy_string_allocator +#if NPY_ABI_VERSION >= 0x02000000 RETURN_NOT_OK(ConvertStringDType()); return Status::OK(); #else @@ -856,7 +856,7 @@ Status NumPyConverter::Visit(const StringViewType& type) { return Status::OK(); } -#ifdef npy_string_allocator +#if NPY_ABI_VERSION >= 0x02000000 template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a7377477dbe..74ef81646ed 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2839,15 +2839,36 @@ def test_array_from_numpy_string_dtype_nulls_and_mask(): # Test interplay of NA sentinel and an explicit mask: # - index 1 is null because of na_object / Python None # - index 2 is forced null by the mask - mask = np.array([False, False, True]) + mask = np.array([False, False, True], dtype=bool) arrow_arr = pa.array(arr, mask=mask) + assert arrow_arr.type == pa.utf8() + assert arrow_arr.null_count == 2 assert arrow_arr.to_pylist() == ["this array has", None, None] + mask = np.array([True, False, True], dtype=bool) + assert pa.array(arr, mask=mask).to_pylist() == [None, None, None] + @pytest.mark.numpy -def test_numpy_object_str_still_works(): - arr_obj = np.array(["x", "y", None], dtype=object) - assert pa.array(arr_obj).to_pylist() == ["x", "y", None] +def test_array_from_numpy_string_dtype_string_sentinel_and_mask(): + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + sentinel = "__placeholder__" + dtype = StringDType(na_object=sentinel) + arr = np.array(["this array has", sentinel, "as an entry"], dtype=dtype) + + arrow_arr = pa.array(arr) + assert arrow_arr.type == pa.utf8() + assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"] + + mask = np.array([False, False, True], dtype=bool) + assert pa.array(arr, mask=mask).to_pylist() == ["this array has", None, None] @pytest.mark.numpy From 7b48c9928ddb0136fe971ac9e155ad797ff1486a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 00:59:22 -0500 Subject: [PATCH 10/20] Adjust NumPy StringDType availability check --- python/pyarrow/array.pxi | 40 +++- python/pyarrow/includes/libarrow_python.pxd | 8 + .../src/arrow/python/arrow_to_pandas.cc | 219 +++++++++++++++++- .../src/arrow/python/arrow_to_pandas.h | 8 + python/pyarrow/table.pxi | 38 ++- python/pyarrow/tests/test_array.py | 32 +++ 6 files changed, 339 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 
ec58ac727e5..592d0863c23 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -16,6 +16,7 @@ # under the License. from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New +from pyarrow.includes.libarrow_python cimport HasNumPyStringDType, StringConversionMode from collections.abc import Sequence import os @@ -65,6 +66,30 @@ def _ndarray_to_arrow_type(object values, DataType type): return pyarrow_wrap_data_type(_ndarray_to_type(values, type)) +cdef inline StringConversionMode _resolve_string_conversion_mode(object string_dtype): + if string_dtype is True: + return StringConversionMode.STRING_DTYPE + if string_dtype is False: + return StringConversionMode.PYTHON_OBJECT + + if string_dtype is None: + return StringConversionMode.PYTHON_OBJECT + + if isinstance(string_dtype, str): + option = string_dtype.lower() + if option == "auto": + return StringConversionMode.PYTHON_OBJECT + if option in ("numpy", "string", "stringdtype"): + return StringConversionMode.STRING_DTYPE + if option in ("python", "object"): + return StringConversionMode.PYTHON_OBJECT + + raise ValueError( + "string_dtype must be one of 'auto', 'numpy', 'python', 'object', " + "True or False" + ) + + cdef shared_ptr[CDataType] _ndarray_to_type(object values, DataType type) except *: cdef shared_ptr[CDataType] c_type @@ -1734,7 +1759,7 @@ cdef class Array(_PandasConvertible): return values return np.asarray(values, dtype=dtype) - def to_numpy(self, zero_copy_only=True, writable=False): + def to_numpy(self, zero_copy_only=True, writable=False, *, string_dtype="auto"): """ Return a NumPy view or copy of this array. @@ -1757,6 +1782,14 @@ cdef class Array(_PandasConvertible): By setting this to True, a copy of the array is made to ensure it is writable. + string_dtype : {"auto", "numpy", "python", "object", True, False}, default "auto" + Controls how string-like arrays are converted when NumPy 2.0's + :class:`~numpy.typing.StringDType` is available. ``"numpy"`` or + ``True`` will request StringDType (copying), ``"python"``/``"object"`` + or ``False`` will force Python object dtype. ``"auto"`` preserves the + default object dtype unless StringDType is explicitly requested. + Converting to NumPy's StringDType always copies string data. + Returns ------- array : numpy.ndarray @@ -1775,6 +1808,11 @@ cdef class Array(_PandasConvertible): raise ValueError( "Cannot return a writable array if asking for zero-copy") + c_options.string_conversion_mode = _resolve_string_conversion_mode(string_dtype) + if c_options.string_conversion_mode == StringConversionMode.STRING_DTYPE: + if not HasNumPyStringDType(): + raise NotImplementedError("NumPy StringDType not available") + # If there are nulls and the array is a DictionaryArray # decoding the dictionary will make sure nulls are correctly handled. 
# Decoding a dictionary does imply a copy by the way, diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 4724c52ccb5..c5661357217 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -161,6 +161,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[CTable] table, PyObject** out) + c_bool HasNumPyStringDType() + void c_set_default_memory_pool \ " arrow::py::set_default_memory_pool"(CMemoryPool* pool)\ @@ -182,6 +184,11 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef cppclass PyOutputStream(COutputStream): PyOutputStream(object fo) + cdef enum StringConversionMode "arrow::py::PandasOptions::StringConversionMode": + AUTO + STRING_DTYPE + PYTHON_OBJECT + cdef cppclass PandasOptions: CMemoryPool* pool c_bool strings_to_categorical @@ -201,6 +208,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[const unordered_set[c_string]] categorical_columns shared_ptr[const unordered_set[c_string]] extension_columns c_bool to_numpy + StringConversionMode string_conversion_mode cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index f163266f3b8..d939432c4fd 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -32,12 +32,14 @@ #include #include "arrow/array.h" +#include "arrow/array/array_binary.h" #include "arrow/buffer.h" #include "arrow/datum.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/bit_run_reader.h" #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/int_util.h" @@ -68,6 +70,15 @@ using internal::CheckIndexBounds; using internal::OptionalParallelFor; namespace py { + +ARROW_PYTHON_EXPORT bool HasNumPyStringDType() { +#if NPY_ABI_VERSION >= 0x02000000 + return PyArray_StringDType != nullptr; +#else + return false; +#endif +} + namespace { // Fix options for conversion of an inner (child) array. 
@@ -344,6 +355,7 @@ class PandasWriter { public: enum type { OBJECT, + STRING_DTYPE, UINT8, INT8, UINT16, @@ -1405,6 +1417,189 @@ class ObjectWriter : public TypedPandasWriter { } }; +#if NPY_ABI_VERSION >= 0x02000000 +Status PackStringValue(npy_string_allocator* allocator, npy_packed_static_string* packed, + const std::string_view& view) { + const int result = NpyString_pack(allocator, packed, view.data(), view.size()); + if (result == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to pack NumPy StringDType value"); + } + return Status::OK(); +} + +Status PackNullString(npy_string_allocator* allocator, npy_packed_static_string* packed) { + const int result = NpyString_pack_null(allocator, packed); + if (result == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to pack NumPy StringDType value"); + } + return Status::OK(); +} + +template +Status WriteOffsetStringValues(const ArrayType& arr, npy_string_allocator* allocator, + char* data, npy_intp stride) { + using offset_type = typename ArrayType::offset_type; + + const offset_type* offsets = arr.raw_value_offsets(); + const auto base_offset = offsets[0]; + const uint8_t* value_data = arr.value_data()->data(); + const uint8_t* validity = arr.null_bitmap_data(); + + auto pack_values = [&](int64_t position, int64_t length) -> Status { + for (int64_t i = 0; i < length; ++i) { + const auto start = static_cast(offsets[position + i] - base_offset); + const auto end = static_cast(offsets[position + i + 1] - base_offset); + auto* packed = + reinterpret_cast(data + (position + i) * stride); + RETURN_NOT_OK(PackStringValue( + allocator, packed, + std::string_view(reinterpret_cast(value_data + start), + end - start))); + } + return Status::OK(); + }; + + auto pack_nulls = [&](int64_t position, int64_t length) -> Status { + for (int64_t i = 0; i < length; ++i) { + auto* packed = + reinterpret_cast(data + (position + i) * stride); + RETURN_NOT_OK(PackNullString(allocator, packed)); + } + return Status::OK(); + }; + + if (arr.null_count() == 0) { + return pack_values(/*position=*/0, arr.length()); + } + + internal::BitRunReader reader(validity, arr.offset(), arr.length()); + auto run = reader.NextRun(); + while (run.length > 0) { + if (run.set) { + RETURN_NOT_OK(pack_values(run.position - arr.offset(), run.length)); + } else { + RETURN_NOT_OK(pack_nulls(run.position - arr.offset(), run.length)); + } + run = reader.NextRun(); + } + + return Status::OK(); +} + +template +Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocator, + char* data, npy_intp stride) { + const auto* values = arr.raw_values(); + const uint8_t* validity = arr.null_bitmap_data(); + + auto pack_values = [&](int64_t position, int64_t length) -> Status { + for (int64_t i = 0; i < length; ++i) { + auto* packed = + reinterpret_cast(data + (position + i) * stride); + RETURN_NOT_OK(PackStringValue(allocator, packed, values[position + i])); + } + return Status::OK(); + }; + + auto pack_nulls = [&](int64_t position, int64_t length) -> Status { + for (int64_t i = 0; i < length; ++i) { + auto* packed = + reinterpret_cast(data + (position + i) * stride); + RETURN_NOT_OK(PackNullString(allocator, packed)); + } + return Status::OK(); + }; + + if (arr.null_count() == 0) { + return pack_values(/*position=*/0, arr.length()); + } + + internal::BitRunReader reader(validity, arr.offset(), arr.length()); + auto run = reader.NextRun(); + while (run.length > 0) { + if (run.set) { + RETURN_NOT_OK(pack_values(run.position - arr.offset(), run.length)); + } 
else { + RETURN_NOT_OK(pack_nulls(run.position - arr.offset(), run.length)); + } + run = reader.NextRun(); + } + + return Status::OK(); +} + +class StringDTypeWriter : public PandasWriter { + public: + using PandasWriter::PandasWriter; + + Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { + ARROW_UNUSED(py_ref); + RETURN_NOT_OK(CheckNotZeroCopyOnly(*data)); + RETURN_NOT_OK(EnsureAllocated()); + return CopyInto(std::move(data), /*rel_placement=*/0); + } + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + RETURN_NOT_OK(CheckNotZeroCopyOnly(*data)); + + PyAcquireGIL lock; + auto* np_arr = reinterpret_cast(block_arr_.obj()); + auto* descr = reinterpret_cast(PyArray_DESCR(np_arr)); + + npy_string_allocator* allocator = NpyString_acquire_allocator(descr); + if (allocator == nullptr) { + return Status::Invalid("Failed to acquire NumPy StringDType allocator"); + } + struct AllocatorGuard { + npy_string_allocator* allocator; + explicit AllocatorGuard(npy_string_allocator* alloc) : allocator(alloc) {} + ~AllocatorGuard() { NpyString_release_allocator(allocator); } + } guard(allocator); + + const npy_intp row_stride = PyArray_STRIDES(np_arr)[1]; + char* data_start = PyArray_BYTES(np_arr) + rel_placement * PyArray_STRIDES(np_arr)[0]; + int64_t offset = 0; + + for (const auto& chunk : data->chunks()) { + char* chunk_data = data_start + offset * row_stride; + switch (data->type()->id()) { + case Type::STRING: { + const auto& arr = checked_cast(*chunk); + RETURN_NOT_OK(WriteOffsetStringValues(arr, allocator, chunk_data, row_stride)); + break; + } + case Type::LARGE_STRING: { + const auto& arr = checked_cast(*chunk); + RETURN_NOT_OK(WriteOffsetStringValues(arr, allocator, chunk_data, row_stride)); + break; + } + case Type::STRING_VIEW: { + const auto& arr = checked_cast(*chunk); + RETURN_NOT_OK(WriteViewStringValues(arr, allocator, chunk_data, row_stride)); + break; + } + case Type::LARGE_STRING_VIEW: { + const auto& arr = checked_cast(*chunk); + RETURN_NOT_OK(WriteViewStringValues(arr, allocator, chunk_data, row_stride)); + break; + } + default: + return Status::TypeError("Expected an Arrow string array, got ", + data->type()->ToString()); + } + offset += chunk->length(); + } + + return Status::OK(); + } + + protected: + Status Allocate() override { return AllocateNDArray(NPY_VSTRING); } +}; +#endif + static inline bool IsNonNullContiguous(const ChunkedArray& data) { return data.num_chunks() == 1 && data.null_count() == 0; } @@ -2056,6 +2251,11 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, case PandasWriter::EXTENSION: *writer = std::make_shared(options, num_rows, num_columns); break; +#if NPY_ABI_VERSION >= 0x02000000 + case PandasWriter::STRING_DTYPE: + *writer = std::make_shared(options, num_rows, num_columns); + break; +#endif BLOCK_CASE(OBJECT, ObjectWriter); BLOCK_CASE(UINT8, UInt8Writer); BLOCK_CASE(INT8, Int8Writer); @@ -2130,10 +2330,21 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& case Type::DOUBLE: *output_type = PandasWriter::DOUBLE; break; - case Type::STRING: // fall through - case Type::LARGE_STRING: // fall through - case Type::STRING_VIEW: // fall through - case Type::BINARY: // fall through + case Type::STRING: // fall through + case Type::LARGE_STRING: // fall through + case Type::STRING_VIEW: // fall through + case Type::LARGE_STRING_VIEW: { // fall through +#if NPY_ABI_VERSION >= 0x02000000 + if (options.to_numpy && options.string_conversion_mode == + 
PandasOptions::StringConversionMode::STRING_DTYPE) { + *output_type = PandasWriter::STRING_DTYPE; + break; + } +#endif + *output_type = PandasWriter::OBJECT; + break; + } + case Type::BINARY: // fall through case Type::LARGE_BINARY: case Type::BINARY_VIEW: case Type::NA: // fall through diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h index b4e91e6cf5a..ce45f4f3456 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.h +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h @@ -140,6 +140,12 @@ struct PandasOptions { // Used internally to decipher between to_numpy() and to_pandas() when // the expected output differs bool to_numpy = false; + + enum class StringConversionMode { AUTO, STRING_DTYPE, PYTHON_OBJECT }; + + // Controls how string-like Arrow arrays are converted when calling + // Array.to_numpy/ChunkedArray.to_numpy + StringConversionMode string_conversion_mode = StringConversionMode::PYTHON_OBJECT; }; ARROW_PYTHON_EXPORT @@ -161,5 +167,7 @@ ARROW_PYTHON_EXPORT Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr table, PyObject** out); +ARROW_PYTHON_EXPORT bool HasNumPyStringDType(); + } // namespace py } // namespace arrow diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9136f252980..10aa5916680 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -17,10 +17,35 @@ # under the License. from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New +from pyarrow.includes.libarrow_python cimport HasNumPyStringDType, StringConversionMode import warnings from cython import sizeof + +cdef inline StringConversionMode _resolve_string_conversion_mode(object string_dtype): + if string_dtype is True: + return StringConversionMode.STRING_DTYPE + if string_dtype is False: + return StringConversionMode.PYTHON_OBJECT + + if string_dtype is None: + return StringConversionMode.PYTHON_OBJECT + + if isinstance(string_dtype, str): + option = string_dtype.lower() + if option == "auto": + return StringConversionMode.PYTHON_OBJECT + if option in ("numpy", "string", "stringdtype"): + return StringConversionMode.STRING_DTYPE + if option in ("python", "object"): + return StringConversionMode.PYTHON_OBJECT + + raise ValueError( + "string_dtype must be one of 'auto', 'numpy', 'python', 'object', " + "True or False" + ) + cdef class ChunkedArray(_PandasConvertible): """ An array-like composed from a (possibly empty) collection of pyarrow.Arrays @@ -491,7 +516,7 @@ cdef class ChunkedArray(_PandasConvertible): self._assert_cpu() return _array_like_to_pandas(self, options, types_mapper=types_mapper) - def to_numpy(self, zero_copy_only=False): + def to_numpy(self, zero_copy_only=False, *, string_dtype="auto"): """ Return a NumPy copy of this array (experimental). @@ -500,6 +525,13 @@ cdef class ChunkedArray(_PandasConvertible): zero_copy_only : bool, default False Introduced for signature consistence with pyarrow.Array.to_numpy. This must be False here since NumPy arrays' buffer must be contiguous. + string_dtype : {"auto", "numpy", "python", "object", True, False}, default "auto" + Controls how string-like arrays are converted when NumPy 2.0's + :class:`~numpy.typing.StringDType` is available. ``"numpy"`` or + ``True`` will request StringDType (copying), ``"python"``/``"object"`` + or ``False`` will force Python object dtype. ``"auto"`` preserves the + default object dtype unless StringDType is explicitly requested. 
+ Converting to NumPy's StringDType always copies string data. Returns ------- @@ -526,6 +558,10 @@ cdef class ChunkedArray(_PandasConvertible): object values c_options.to_numpy = True + c_options.string_conversion_mode = _resolve_string_conversion_mode(string_dtype) + if c_options.string_conversion_mode == StringConversionMode.STRING_DTYPE: + if not HasNumPyStringDType(): + raise NotImplementedError("NumPy StringDType not available") with nogil: check_status( diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 74ef81646ed..58a62a40284 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2331,6 +2331,38 @@ def test_to_numpy_roundtrip(): np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy()) +@pytest.mark.numpy +@pytest.mark.parametrize( + "arrow_type", + [pa.string(), pa.large_string(), pa.string_view(), pa.large_string_view()], +) +@pytest.mark.parametrize("scenario", ["no_nulls", "with_nulls", "sliced", "empty"]) +def test_to_numpy_stringdtype(arrow_type, scenario): + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + values = { + "no_nulls": ["a", "b", "c"], + "with_nulls": ["a", None, "c"], + "sliced": ["z", "a", None, "c", "q"], + "empty": [], + } + + arr = pa.array(values[scenario], type=arrow_type) + if scenario == "sliced": + arr = arr.slice(1, 3) + + result = arr.to_numpy(zero_copy_only=False, string_dtype="numpy") + + assert result.dtype == np.dtype(StringDType()) + assert result.tolist() == arr.to_pylist() + + @pytest.mark.numpy def test_array_uint64_from_py_over_range(): arr = pa.array([2 ** 63], type=pa.uint64()) From 38b2ee1dea915af29ecfc0d8ea89c59b5361d29f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 01:27:05 -0500 Subject: [PATCH 11/20] Clarify StringDType copy path and view packing --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index d939432c4fd..473a4e7e8ea 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1498,7 +1498,10 @@ Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocat for (int64_t i = 0; i < length; ++i) { auto* packed = reinterpret_cast(data + (position + i) * stride); - RETURN_NOT_OK(PackStringValue(allocator, packed, values[position + i])); + const auto view = values[position + i]; + RETURN_NOT_OK(PackStringValue( + allocator, packed, + std::string_view(reinterpret_cast(view.data()), view.size()))); } return Status::OK(); }; @@ -2337,6 +2340,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& #if NPY_ABI_VERSION >= 0x02000000 if (options.to_numpy && options.string_conversion_mode == PandasOptions::StringConversionMode::STRING_DTYPE) { + // NumPy's StringDType allocator always copies string data, so zero-copy + // requests must continue to route through the object-dtype path. 
*output_type = PandasWriter::STRING_DTYPE; break; } From 6633e1e6b97c8127a2c0770a1134c59fe4c05024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 01:27:09 -0500 Subject: [PATCH 12/20] Enable NumPy StringDType API and fix writer build --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 4 ++-- python/pyarrow/src/arrow/python/numpy_interop.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 473a4e7e8ea..820d0b61d68 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1474,7 +1474,7 @@ Status WriteOffsetStringValues(const ArrayType& arr, npy_string_allocator* alloc return pack_values(/*position=*/0, arr.length()); } - internal::BitRunReader reader(validity, arr.offset(), arr.length()); + arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); auto run = reader.NextRun(); while (run.length > 0) { if (run.set) { @@ -1519,7 +1519,7 @@ Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocat return pack_values(/*position=*/0, arr.length()); } - internal::BitRunReader reader(validity, arr.offset(), arr.length()); + arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); auto run = reader.NextRun(); while (run.length > 0) { if (run.set) { diff --git a/python/pyarrow/src/arrow/python/numpy_interop.h b/python/pyarrow/src/arrow/python/numpy_interop.h index a83ae4a62b9..b897912427d 100644 --- a/python/pyarrow/src/arrow/python/numpy_interop.h +++ b/python/pyarrow/src/arrow/python/numpy_interop.h @@ -21,6 +21,10 @@ #include // IWYU pragma: export +#if NPY_ABI_VERSION >= 0x02000000 +# define NPY_EXPERIMENTAL_DTYPE_API 1 +#endif + // Don't use the deprecated Numpy functions #ifdef NPY_1_7_API_VERSION # define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION From 70a389d6040a0d3a610d488423b5f010a1ea2590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 01:27:17 -0500 Subject: [PATCH 13/20] Use PyArray_API table for NumPy StringDType --- .../src/arrow/python/arrow_to_pandas.cc | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 820d0b61d68..80f66267c48 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -73,7 +73,8 @@ namespace py { ARROW_PYTHON_EXPORT bool HasNumPyStringDType() { #if NPY_ABI_VERSION >= 0x02000000 - return PyArray_StringDType != nullptr; + auto* dtype_table = reinterpret_cast(PyArray_API + 320); + return dtype_table[39] != nullptr; #else return false; #endif @@ -1418,9 +1419,34 @@ class ObjectWriter : public TypedPandasWriter { }; #if NPY_ABI_VERSION >= 0x02000000 +inline npy_string_allocator* ArrowNpyString_acquire_allocator( + const PyArray_StringDTypeObject* descr) { + using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); + return reinterpret_cast(PyArray_API[316])(descr); +} + +inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) { + using Func = void (*)(npy_string_allocator*); + reinterpret_cast(PyArray_API[318])(allocator); +} + +inline int ArrowNpyString_pack(npy_string_allocator* allocator, + npy_packed_static_string* packed, const char* data, + size_t length) { + using Func = + int 
(*)(npy_string_allocator*, npy_packed_static_string*, const char*, size_t); + return reinterpret_cast(PyArray_API[314])(allocator, packed, data, length); +} + +inline int ArrowNpyString_pack_null(npy_string_allocator* allocator, + npy_packed_static_string* packed) { + using Func = int (*)(npy_string_allocator*, npy_packed_static_string*); + return reinterpret_cast(PyArray_API[315])(allocator, packed); +} + Status PackStringValue(npy_string_allocator* allocator, npy_packed_static_string* packed, const std::string_view& view) { - const int result = NpyString_pack(allocator, packed, view.data(), view.size()); + const int result = ArrowNpyString_pack(allocator, packed, view.data(), view.size()); if (result == -1) { RETURN_IF_PYERROR(); return Status::Invalid("Failed to pack NumPy StringDType value"); @@ -1429,7 +1455,7 @@ Status PackStringValue(npy_string_allocator* allocator, npy_packed_static_string } Status PackNullString(npy_string_allocator* allocator, npy_packed_static_string* packed) { - const int result = NpyString_pack_null(allocator, packed); + const int result = ArrowNpyString_pack_null(allocator, packed); if (result == -1) { RETURN_IF_PYERROR(); return Status::Invalid("Failed to pack NumPy StringDType value"); @@ -1551,14 +1577,14 @@ class StringDTypeWriter : public PandasWriter { auto* np_arr = reinterpret_cast(block_arr_.obj()); auto* descr = reinterpret_cast(PyArray_DESCR(np_arr)); - npy_string_allocator* allocator = NpyString_acquire_allocator(descr); + npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr); if (allocator == nullptr) { return Status::Invalid("Failed to acquire NumPy StringDType allocator"); } struct AllocatorGuard { npy_string_allocator* allocator; explicit AllocatorGuard(npy_string_allocator* alloc) : allocator(alloc) {} - ~AllocatorGuard() { NpyString_release_allocator(allocator); } + ~AllocatorGuard() { ArrowNpyString_release_allocator(allocator); } } guard(allocator); const npy_intp row_stride = PyArray_STRIDES(np_arr)[1]; From c150bfbe3a9c68b9c1e25e4b2051b9e42ac8c0f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 16:47:56 -0500 Subject: [PATCH 14/20] Remove unnecessary experimental dtype define --- python/pyarrow/src/arrow/python/numpy_interop.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_interop.h b/python/pyarrow/src/arrow/python/numpy_interop.h index b897912427d..a83ae4a62b9 100644 --- a/python/pyarrow/src/arrow/python/numpy_interop.h +++ b/python/pyarrow/src/arrow/python/numpy_interop.h @@ -21,10 +21,6 @@ #include // IWYU pragma: export -#if NPY_ABI_VERSION >= 0x02000000 -# define NPY_EXPERIMENTAL_DTYPE_API 1 -#endif - // Don't use the deprecated Numpy functions #ifdef NPY_1_7_API_VERSION # define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION From 5a21d1c006be15e25eb22d065c9aa471dd4bea62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 16:52:45 -0500 Subject: [PATCH 15/20] Fix StringDType writer run handling --- .../src/arrow/python/arrow_to_pandas.cc | 31 ++++++++----------- python/pyarrow/tests/test_array.py | 2 +- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 80f66267c48..514a49b3c0c 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1501,13 +1501,15 @@ Status WriteOffsetStringValues(const ArrayType& arr, 
npy_string_allocator* alloc } arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); + int64_t position = 0; auto run = reader.NextRun(); while (run.length > 0) { if (run.set) { - RETURN_NOT_OK(pack_values(run.position - arr.offset(), run.length)); + RETURN_NOT_OK(pack_values(position, run.length)); } else { - RETURN_NOT_OK(pack_nulls(run.position - arr.offset(), run.length)); + RETURN_NOT_OK(pack_nulls(position, run.length)); } + position += run.length; run = reader.NextRun(); } @@ -1517,17 +1519,14 @@ Status WriteOffsetStringValues(const ArrayType& arr, npy_string_allocator* alloc template Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocator, char* data, npy_intp stride) { - const auto* values = arr.raw_values(); const uint8_t* validity = arr.null_bitmap_data(); auto pack_values = [&](int64_t position, int64_t length) -> Status { for (int64_t i = 0; i < length; ++i) { auto* packed = reinterpret_cast(data + (position + i) * stride); - const auto view = values[position + i]; - RETURN_NOT_OK(PackStringValue( - allocator, packed, - std::string_view(reinterpret_cast(view.data()), view.size()))); + const auto view = arr.GetView(position + i); + RETURN_NOT_OK(PackStringValue(allocator, packed, view)); } return Status::OK(); }; @@ -1546,13 +1545,15 @@ Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocat } arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); + int64_t position = 0; auto run = reader.NextRun(); while (run.length > 0) { if (run.set) { - RETURN_NOT_OK(pack_values(run.position - arr.offset(), run.length)); + RETURN_NOT_OK(pack_values(position, run.length)); } else { - RETURN_NOT_OK(pack_nulls(run.position - arr.offset(), run.length)); + RETURN_NOT_OK(pack_nulls(position, run.length)); } + position += run.length; run = reader.NextRun(); } @@ -1609,11 +1610,6 @@ class StringDTypeWriter : public PandasWriter { RETURN_NOT_OK(WriteViewStringValues(arr, allocator, chunk_data, row_stride)); break; } - case Type::LARGE_STRING_VIEW: { - const auto& arr = checked_cast(*chunk); - RETURN_NOT_OK(WriteViewStringValues(arr, allocator, chunk_data, row_stride)); - break; - } default: return Status::TypeError("Expected an Arrow string array, got ", data->type()->ToString()); @@ -2359,10 +2355,9 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& case Type::DOUBLE: *output_type = PandasWriter::DOUBLE; break; - case Type::STRING: // fall through - case Type::LARGE_STRING: // fall through - case Type::STRING_VIEW: // fall through - case Type::LARGE_STRING_VIEW: { // fall through + case Type::STRING: // fall through + case Type::LARGE_STRING: // fall through + case Type::STRING_VIEW: { // fall through #if NPY_ABI_VERSION >= 0x02000000 if (options.to_numpy && options.string_conversion_mode == PandasOptions::StringConversionMode::STRING_DTYPE) { diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 58a62a40284..7344b9b5b5d 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2334,7 +2334,7 @@ def test_to_numpy_roundtrip(): @pytest.mark.numpy @pytest.mark.parametrize( "arrow_type", - [pa.string(), pa.large_string(), pa.string_view(), pa.large_string_view()], + [pa.string(), pa.large_string(), pa.string_view()], ) @pytest.mark.parametrize("scenario", ["no_nulls", "with_nulls", "sliced", "empty"]) def test_to_numpy_stringdtype(arrow_type, scenario): From bdd2706fe4cc99907372f01fb846785b10cbe019 Mon Sep 
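The corrected loops work because arrow::internal::BitRunReader reports each run only as a length plus a set/unset flag, so the caller has to carry the destination index itself rather than deriving it from the run. Condensed, the idiom both writers now follow looks like this, with ProcessValues and ProcessNulls standing in for the pack_values and pack_nulls lambdas:

    // Sketch of the run-consumption idiom: `position` is relative to the start
    // of the (possibly sliced) array, matching how the packing lambdas index
    // the destination buffer.
    arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length());
    int64_t position = 0;
    for (auto run = reader.NextRun(); run.length > 0; run = reader.NextRun()) {
      if (run.set) {
        RETURN_NOT_OK(ProcessValues(position, run.length));
      } else {
        RETURN_NOT_OK(ProcessNulls(position, run.length));
      }
      position += run.length;
    }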
17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 17:14:27 -0500 Subject: [PATCH 16/20] Fix StringConversionMode scoping and helper duplication --- python/pyarrow/includes/libarrow_python.pxd | 11 ++++++----- python/pyarrow/table.pxi | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index c5661357217..5139e4a4952 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -184,12 +184,11 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef cppclass PyOutputStream(COutputStream): PyOutputStream(object fo) - cdef enum StringConversionMode "arrow::py::PandasOptions::StringConversionMode": - AUTO - STRING_DTYPE - PYTHON_OBJECT - cdef cppclass PandasOptions: + cdef enum StringConversionMode: + AUTO + STRING_DTYPE + PYTHON_OBJECT CMemoryPool* pool c_bool strings_to_categorical c_bool zero_copy_only @@ -210,6 +209,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: c_bool to_numpy StringConversionMode string_conversion_mode +ctypedef PandasOptions.StringConversionMode StringConversionMode + cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: cdef cppclass CTimePoint "arrow::py::internal::TimePoint": diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 10aa5916680..f9856ebc3c8 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -23,7 +23,7 @@ import warnings from cython import sizeof -cdef inline StringConversionMode _resolve_string_conversion_mode(object string_dtype): +cdef inline StringConversionMode _resolve_table_string_conversion_mode(object string_dtype): if string_dtype is True: return StringConversionMode.STRING_DTYPE if string_dtype is False: @@ -558,7 +558,7 @@ cdef class ChunkedArray(_PandasConvertible): object values c_options.to_numpy = True - c_options.string_conversion_mode = _resolve_string_conversion_mode(string_dtype) + c_options.string_conversion_mode = _resolve_table_string_conversion_mode(string_dtype) if c_options.string_conversion_mode == StringConversionMode.STRING_DTYPE: if not HasNumPyStringDType(): raise NotImplementedError("NumPy StringDType not available") From ff3eaa923a7209f2b1a0a6017fdc48f513c2595a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 21:21:03 -0500 Subject: [PATCH 17/20] Fix PandasOptions StringConversionMode declaration --- python/pyarrow/array.pxi | 14 +++++++------- python/pyarrow/includes/libarrow_python.pxd | 11 +++++------ python/pyarrow/table.pxi | 14 +++++++------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 592d0863c23..6d04278eb25 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -68,21 +68,21 @@ def _ndarray_to_arrow_type(object values, DataType type): cdef inline StringConversionMode _resolve_string_conversion_mode(object string_dtype): if string_dtype is True: - return StringConversionMode.STRING_DTYPE + return StringConversionMode_STRING_DTYPE if string_dtype is False: - return StringConversionMode.PYTHON_OBJECT + return StringConversionMode_PYTHON_OBJECT if string_dtype is None: - return StringConversionMode.PYTHON_OBJECT + return StringConversionMode_PYTHON_OBJECT if isinstance(string_dtype, str): option = string_dtype.lower() if option == "auto": - return StringConversionMode.PYTHON_OBJECT + return 
StringConversionMode_PYTHON_OBJECT if option in ("numpy", "string", "stringdtype"): - return StringConversionMode.STRING_DTYPE + return StringConversionMode_STRING_DTYPE if option in ("python", "object"): - return StringConversionMode.PYTHON_OBJECT + return StringConversionMode_PYTHON_OBJECT raise ValueError( "string_dtype must be one of 'auto', 'numpy', 'python', 'object', " @@ -1809,7 +1809,7 @@ cdef class Array(_PandasConvertible): "Cannot return a writable array if asking for zero-copy") c_options.string_conversion_mode = _resolve_string_conversion_mode(string_dtype) - if c_options.string_conversion_mode == StringConversionMode.STRING_DTYPE: + if c_options.string_conversion_mode == StringConversionMode_STRING_DTYPE: if not HasNumPyStringDType(): raise NotImplementedError("NumPy StringDType not available") diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 5139e4a4952..127a456d7ab 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -184,11 +184,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef cppclass PyOutputStream(COutputStream): PyOutputStream(object fo) + ctypedef enum StringConversionMode "arrow::py::PandasOptions::StringConversionMode": + StringConversionMode_AUTO "arrow::py::PandasOptions::StringConversionMode::AUTO" + StringConversionMode_STRING_DTYPE "arrow::py::PandasOptions::StringConversionMode::STRING_DTYPE" + StringConversionMode_PYTHON_OBJECT "arrow::py::PandasOptions::StringConversionMode::PYTHON_OBJECT" + cdef cppclass PandasOptions: - cdef enum StringConversionMode: - AUTO - STRING_DTYPE - PYTHON_OBJECT CMemoryPool* pool c_bool strings_to_categorical c_bool zero_copy_only @@ -209,8 +210,6 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: c_bool to_numpy StringConversionMode string_conversion_mode -ctypedef PandasOptions.StringConversionMode StringConversionMode - cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: cdef cppclass CTimePoint "arrow::py::internal::TimePoint": diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index f9856ebc3c8..502bfa25563 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -25,21 +25,21 @@ from cython import sizeof cdef inline StringConversionMode _resolve_table_string_conversion_mode(object string_dtype): if string_dtype is True: - return StringConversionMode.STRING_DTYPE + return StringConversionMode_STRING_DTYPE if string_dtype is False: - return StringConversionMode.PYTHON_OBJECT + return StringConversionMode_PYTHON_OBJECT if string_dtype is None: - return StringConversionMode.PYTHON_OBJECT + return StringConversionMode_PYTHON_OBJECT if isinstance(string_dtype, str): option = string_dtype.lower() if option == "auto": - return StringConversionMode.PYTHON_OBJECT + return StringConversionMode_PYTHON_OBJECT if option in ("numpy", "string", "stringdtype"): - return StringConversionMode.STRING_DTYPE + return StringConversionMode_STRING_DTYPE if option in ("python", "object"): - return StringConversionMode.PYTHON_OBJECT + return StringConversionMode_PYTHON_OBJECT raise ValueError( "string_dtype must be one of 'auto', 'numpy', 'python', 'object', " @@ -559,7 +559,7 @@ cdef class ChunkedArray(_PandasConvertible): c_options.to_numpy = True c_options.string_conversion_mode = _resolve_table_string_conversion_mode(string_dtype) - if c_options.string_conversion_mode == StringConversionMode.STRING_DTYPE: + if 
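These verbatim-qualified names only resolve if the C++ side really scopes the enum inside PandasOptions. That header is not part of this series, so the following is an assumed sketch of the shape the names imply, limited to the members this series touches:

    // Assumed shape of the C++ declaration the .pxd names refer to; the real
    // header may differ in member order, defaults, and whether the enum is
    // scoped (enum class) or not.
    struct PandasOptions {
      enum class StringConversionMode { AUTO, STRING_DTYPE, PYTHON_OBJECT };

      bool to_numpy = false;
      StringConversionMode string_conversion_mode = StringConversionMode::AUTO;
      // ... pool, zero_copy_only, and the other options elided ...
    };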
c_options.string_conversion_mode == StringConversionMode_STRING_DTYPE: if not HasNumPyStringDType(): raise NotImplementedError("NumPy StringDType not available") From aff479b9a5285d5d94ffb3d9a2bdc0f5cac0942e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 21:53:24 -0500 Subject: [PATCH 18/20] Fix StringConversionMode enum mapping --- python/pyarrow/includes/libarrow_python.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 127a456d7ab..a1cb237ad7c 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -184,7 +184,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef cppclass PyOutputStream(COutputStream): PyOutputStream(object fo) - ctypedef enum StringConversionMode "arrow::py::PandasOptions::StringConversionMode": + cdef enum StringConversionMode "arrow::py::PandasOptions::StringConversionMode": StringConversionMode_AUTO "arrow::py::PandasOptions::StringConversionMode::AUTO" StringConversionMode_STRING_DTYPE "arrow::py::PandasOptions::StringConversionMode::STRING_DTYPE" StringConversionMode_PYTHON_OBJECT "arrow::py::PandasOptions::StringConversionMode::PYTHON_OBJECT" From 25343c259de113de258cb9bae47cb2bf0db40038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 21:53:49 -0500 Subject: [PATCH 19/20] Apply hook formatting fixes --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 4 ++-- python/pyarrow/table.pxi | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 514a49b3c0c..2f026d211ee 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -2355,8 +2355,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& case Type::DOUBLE: *output_type = PandasWriter::DOUBLE; break; - case Type::STRING: // fall through - case Type::LARGE_STRING: // fall through + case Type::STRING: // fall through + case Type::LARGE_STRING: // fall through case Type::STRING_VIEW: { // fall through #if NPY_ABI_VERSION >= 0x02000000 if (options.to_numpy && options.string_conversion_mode == diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 502bfa25563..a2bd1edd114 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -558,7 +558,8 @@ cdef class ChunkedArray(_PandasConvertible): object values c_options.to_numpy = True - c_options.string_conversion_mode = _resolve_table_string_conversion_mode(string_dtype) + c_options.string_conversion_mode = _resolve_table_string_conversion_mode( + string_dtype) if c_options.string_conversion_mode == StringConversionMode_STRING_DTYPE: if not HasNumPyStringDType(): raise NotImplementedError("NumPy StringDType not available") From 78e592c4b9fadedc335aac24d4a1f5e56aff4449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Thu, 25 Dec 2025 21:54:56 -0500 Subject: [PATCH 20/20] Handle null validity when packing NumPy StringDType --- .../src/arrow/python/arrow_to_pandas.cc | 68 +++++++++++++------ 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 2f026d211ee..4e699381b65 100644 --- 
a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1500,17 +1500,33 @@ Status WriteOffsetStringValues(const ArrayType& arr, npy_string_allocator* alloc return pack_values(/*position=*/0, arr.length()); } - arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); - int64_t position = 0; - auto run = reader.NextRun(); - while (run.length > 0) { - if (run.set) { - RETURN_NOT_OK(pack_values(position, run.length)); - } else { - RETURN_NOT_OK(pack_nulls(position, run.length)); + if (validity == nullptr) { + for (int64_t i = 0; i < arr.length(); ++i) { + auto* packed = reinterpret_cast(data + i * stride); + if (arr.IsNull(i)) { + RETURN_NOT_OK(PackNullString(allocator, packed)); + } else { + const auto start = static_cast(offsets[i] - base_offset); + const auto end = static_cast(offsets[i + 1] - base_offset); + RETURN_NOT_OK(PackStringValue( + allocator, packed, + std::string_view(reinterpret_cast(value_data + start), + end - start))); + } + } + } else { + arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); + int64_t position = 0; + auto run = reader.NextRun(); + while (run.length > 0) { + if (run.set) { + RETURN_NOT_OK(pack_values(position, run.length)); + } else { + RETURN_NOT_OK(pack_nulls(position, run.length)); + } + position += run.length; + run = reader.NextRun(); } - position += run.length; - run = reader.NextRun(); } return Status::OK(); @@ -1544,17 +1560,29 @@ Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocat return pack_values(/*position=*/0, arr.length()); } - arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); - int64_t position = 0; - auto run = reader.NextRun(); - while (run.length > 0) { - if (run.set) { - RETURN_NOT_OK(pack_values(position, run.length)); - } else { - RETURN_NOT_OK(pack_nulls(position, run.length)); + if (validity == nullptr) { + for (int64_t i = 0; i < arr.length(); ++i) { + auto* packed = reinterpret_cast(data + i * stride); + if (arr.IsNull(i)) { + RETURN_NOT_OK(PackNullString(allocator, packed)); + } else { + const auto view = arr.GetView(i); + RETURN_NOT_OK(PackStringValue(allocator, packed, view)); + } + } + } else { + arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); + int64_t position = 0; + auto run = reader.NextRun(); + while (run.length > 0) { + if (run.set) { + RETURN_NOT_OK(pack_values(position, run.length)); + } else { + RETURN_NOT_OK(pack_nulls(position, run.length)); + } + position += run.length; + run = reader.NextRun(); } - position += run.length; - run = reader.NextRun(); } return Status::OK();
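With this change both writers carry the same element-wise fallback for chunks that expose no validity bitmap. If that duplication grows it could be hoisted into a small shared helper along these lines; PackElementWise is an illustrative name, and the only requirements on ArrayType are IsNull(i) and a GetView(i) returning std::string_view, which the offset-based and view-based string arrays both provide:

    // Sketch of a shared element-wise fallback; not part of the patch.
    template <typename ArrayType>
    Status PackElementWise(const ArrayType& arr, npy_string_allocator* allocator,
                           char* data, npy_intp stride) {
      for (int64_t i = 0; i < arr.length(); ++i) {
        auto* packed =
            reinterpret_cast<npy_packed_static_string*>(data + i * stride);
        if (arr.IsNull(i)) {
          RETURN_NOT_OK(PackNullString(allocator, packed));
        } else {
          RETURN_NOT_OK(PackStringValue(allocator, packed, arr.GetView(i)));
        }
      }
      return Status::OK();
    }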