From 0d84e65a4ecfb44dd96178fb625a9d9d3be4098f Mon Sep 17 00:00:00 2001 From: kirilklein Date: Mon, 8 Jun 2026 09:57:42 +0200 Subject: [PATCH] GH-26685: [Python][C++] Trim buffers when pickling a sliced array Pickling a sliced array previously serialized the array's entire parent buffers instead of just the referenced slice. Add arrow::internal::TrimArrayDataBuffers, which compacts a sliced array (offset != 0) to its referenced range via Concatenate, and call it from Array.__reduce__ so pickling only serializes referenced bytes. Unsliced arrays are returned untouched, preserving zero-copy / protocol-5 out-of-band pickling. ChunkedArray/RecordBatch/Table inherit the fix. Co-Authored-By: Claude Opus 4.8 --- cpp/src/arrow/array/util.cc | 16 +++++++++ cpp/src/arrow/array/util.h | 16 +++++++++ python/pyarrow/array.pxi | 7 +++- python/pyarrow/includes/libarrow.pxd | 4 +++ python/pyarrow/tests/test_array.py | 50 ++++++++++++++++++++++++++++ 5 files changed, 92 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 1c19bd5a5468..546fa7492341 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -1001,5 +1001,21 @@ std::vector RechunkArraysConsistently( return rechunked_groups; } +Result> TrimArrayDataBuffers( + const std::shared_ptr& data, MemoryPool* pool) { + if (data->offset == 0) { + // An array with no offset owns its buffers from the start (any excess is + // just allocator padding), so there is nothing worth trimming. Returning it + // unchanged keeps pickling zero-copy / protocol-5 out-of-band friendly. + return data; + } + // A sliced array (offset != 0) shares its parent's buffers; compact it so only + // the referenced range is serialized. Concatenating the single array yields an + // equivalent array whose buffers start at offset 0, correctly handling every + // nested, variable-length and dictionary type. + ARROW_ASSIGN_OR_RAISE(auto trimmed, Concatenate({MakeArray(data)}, pool)); + return trimmed->data(); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index fd8e75ddb864..e90e20c5d880 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -92,5 +92,21 @@ Result> SwapEndianArrayData( ARROW_EXPORT std::vector RechunkArraysConsistently(const std::vector&); +/// \brief Return an equivalent ArrayData whose buffers are trimmed to the range +/// the array actually references (its offset/length window), recursively for +/// child and dictionary data. +/// +/// A sliced array shares its parent's (potentially large) buffers and only +/// references a window of them. Serializing the raw buffers -- e.g. pickling in +/// PyArrow (GH-26685) -- would otherwise copy the whole parent buffers. If the +/// buffers are already minimal, \p data is returned unchanged without copying. +/// +/// \param[in] data the array contents +/// \param[in] pool the memory pool to allocate trimmed buffers from +/// \return the trimmed ArrayData (or \p data itself if already minimal) +ARROW_EXPORT +Result> TrimArrayDataBuffers( + const std::shared_ptr& data, MemoryPool* pool = default_memory_pool()); + } // namespace internal } // namespace arrow diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ecdbb342d3e2..adf5e8be816b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1313,8 +1313,13 @@ cdef class Array(_PandasConvertible): def __reduce__(self): self._assert_cpu() + # Trim the buffers to the range this (possibly sliced) array references + # so we don't serialize the whole parent buffers (GH-26685). return _restore_array, \ - (_reduce_array_data(self.sp_array.get().data().get()),) + (_reduce_array_data( + GetResultValue(TrimArrayDataBuffers( + self.sp_array.get().data(), + c_default_memory_pool())).get()),) @staticmethod def from_buffers(DataType type, length, buffers, null_count=-1, offset=0, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 79522c12474b..cd20daa8a705 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -3289,3 +3289,7 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: cdef extern from "arrow/compute/cast.h" namespace "arrow::compute": CResult[CDatum] Cast(const CDatum& value, const CCastOptions& options) + +cdef extern from "arrow/array/util.h" namespace "arrow::internal" nogil: + CResult[shared_ptr[CArrayData]] TrimArrayDataBuffers( + const shared_ptr[CArrayData]& data, CMemoryPool* pool) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a103519dc5ac..c1387386e6fe 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -21,6 +21,7 @@ import hypothesis as h import hypothesis.strategies as st import itertools +import pickle import pytest import struct import subprocess @@ -4468,3 +4469,52 @@ def test_dunders_checked_overflow(): arr ** pa.scalar(2, type=pa.int8()) with pytest.raises(pa.ArrowInvalid, match=error_match): arr / (-arr) + + +@pytest.mark.parametrize("typ", [ + pa.int64(), + pa.float64(), + pa.bool_(), + pa.string(), + pa.large_string(), + pa.list_(pa.int64()), +]) +def test_pickle_sliced_array_does_not_copy_parent_buffers(typ): + # GH-26685: pickling a small slice of a large array must not serialize the + # whole parent buffers. + n = 100_000 + if pa.types.is_boolean(typ): + arr = pa.array([i % 2 == 0 for i in range(n)]) + elif pa.types.is_string(typ) or pa.types.is_large_string(typ): + arr = pa.array(["x%d" % i for i in range(n)], type=typ) + elif pa.types.is_list(typ): + arr = pa.array([[i] for i in range(n)], type=typ) + else: + arr = pa.array(range(n), type=typ) + + sliced = arr.slice(10, 5) + for protocol in (4, 5): + data = pickle.dumps(sliced, protocol=protocol) + # A 5-element slice serializes to a few hundred bytes; copying the + # parent buffers would be orders of magnitude larger. + assert len(data) < 2000 + assert pickle.loads(data).equals(sliced) + + +def test_pickle_sliced_array_preserves_out_of_band_buffers(): + # GH-26685: protocol-5 out-of-band buffers still work and carry only the + # referenced bytes, not the whole parent buffer. + arr = pa.array(range(100_000), type=pa.int64()) + sliced = arr.slice(10, 5) + collected = [] + data = pickle.dumps(sliced, protocol=5, buffer_callback=collected.append) + oob = sum(memoryview(b.raw()).nbytes for b in collected) + assert oob < arr.nbytes / 100 + restored = pickle.loads(data, buffers=[b.raw() for b in collected]) + assert restored.equals(sliced) + + +def test_pickle_unsliced_array_roundtrips(): + # Regression: a non-sliced array is unaffected. + arr = pa.array(range(1000), type=pa.int64()) + assert pickle.loads(pickle.dumps(arr)).equals(arr)