Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cpp/src/arrow/array/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1001,5 +1001,21 @@ std::vector<ArrayVector> RechunkArraysConsistently(
return rechunked_groups;
}

Result<std::shared_ptr<ArrayData>> TrimArrayDataBuffers(
const std::shared_ptr<ArrayData>& data, MemoryPool* pool) {
if (data->offset == 0) {
// An array with no offset owns its buffers from the start (any excess is
// just allocator padding), so there is nothing worth trimming. Returning it
// unchanged keeps pickling zero-copy / protocol-5 out-of-band friendly.
return data;
}
// A sliced array (offset != 0) shares its parent's buffers; compact it so only
// the referenced range is serialized. Concatenating the single array yields an
// equivalent array whose buffers start at offset 0, correctly handling every
// nested, variable-length and dictionary type.
ARROW_ASSIGN_OR_RAISE(auto trimmed, Concatenate({MakeArray(data)}, pool));
return trimmed->data();
}

} // namespace internal
} // namespace arrow
16 changes: 16 additions & 0 deletions cpp/src/arrow/array/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,5 +92,21 @@ Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
ARROW_EXPORT
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);

/// \brief Return an equivalent ArrayData whose buffers are trimmed to the range
/// the array actually references (its offset/length window), recursively for
/// child and dictionary data.
///
/// A sliced array shares its parent's (potentially large) buffers and only
/// references a window of them. Serializing the raw buffers -- e.g. pickling in
/// PyArrow (GH-26685) -- would otherwise copy the whole parent buffers. If the
/// buffers are already minimal, \p data is returned unchanged without copying.
///
/// \param[in] data the array contents
/// \param[in] pool the memory pool to allocate trimmed buffers from
/// \return the trimmed ArrayData (or \p data itself if already minimal)
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> TrimArrayDataBuffers(
const std::shared_ptr<ArrayData>& data, MemoryPool* pool = default_memory_pool());

} // namespace internal
} // namespace arrow
7 changes: 6 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1313,8 +1313,13 @@ cdef class Array(_PandasConvertible):

def __reduce__(self):
self._assert_cpu()
# Trim the buffers to the range this (possibly sliced) array references
# so we don't serialize the whole parent buffers (GH-26685).
return _restore_array, \
(_reduce_array_data(self.sp_array.get().data().get()),)
(_reduce_array_data(
GetResultValue(TrimArrayDataBuffers(
self.sp_array.get().data(),
c_default_memory_pool())).get()),)

@staticmethod
def from_buffers(DataType type, length, buffers, null_count=-1, offset=0,
Expand Down
4 changes: 4 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -3289,3 +3289,7 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil:

cdef extern from "arrow/compute/cast.h" namespace "arrow::compute":
CResult[CDatum] Cast(const CDatum& value, const CCastOptions& options)

cdef extern from "arrow/array/util.h" namespace "arrow::internal" nogil:
CResult[shared_ptr[CArrayData]] TrimArrayDataBuffers(
const shared_ptr[CArrayData]& data, CMemoryPool* pool)
50 changes: 50 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import hypothesis as h
import hypothesis.strategies as st
import itertools
import pickle
import pytest
import struct
import subprocess
Expand Down Expand Up @@ -4468,3 +4469,52 @@ def test_dunders_checked_overflow():
arr ** pa.scalar(2, type=pa.int8())
with pytest.raises(pa.ArrowInvalid, match=error_match):
arr / (-arr)


@pytest.mark.parametrize("typ", [
pa.int64(),
pa.float64(),
pa.bool_(),
pa.string(),
pa.large_string(),
pa.list_(pa.int64()),
])
def test_pickle_sliced_array_does_not_copy_parent_buffers(typ):
# GH-26685: pickling a small slice of a large array must not serialize the
# whole parent buffers.
n = 100_000
if pa.types.is_boolean(typ):
arr = pa.array([i % 2 == 0 for i in range(n)])
elif pa.types.is_string(typ) or pa.types.is_large_string(typ):
arr = pa.array(["x%d" % i for i in range(n)], type=typ)
elif pa.types.is_list(typ):
arr = pa.array([[i] for i in range(n)], type=typ)
else:
arr = pa.array(range(n), type=typ)

sliced = arr.slice(10, 5)
for protocol in (4, 5):
data = pickle.dumps(sliced, protocol=protocol)
# A 5-element slice serializes to a few hundred bytes; copying the
# parent buffers would be orders of magnitude larger.
assert len(data) < 2000
assert pickle.loads(data).equals(sliced)


def test_pickle_sliced_array_preserves_out_of_band_buffers():
# GH-26685: protocol-5 out-of-band buffers still work and carry only the
# referenced bytes, not the whole parent buffer.
arr = pa.array(range(100_000), type=pa.int64())
sliced = arr.slice(10, 5)
collected = []
data = pickle.dumps(sliced, protocol=5, buffer_callback=collected.append)
oob = sum(memoryview(b.raw()).nbytes for b in collected)
assert oob < arr.nbytes / 100
restored = pickle.loads(data, buffers=[b.raw() for b in collected])
assert restored.equals(sliced)


def test_pickle_unsliced_array_roundtrips():
# Regression: a non-sliced array is unaffected.
arr = pa.array(range(1000), type=pa.int64())
assert pickle.loads(pickle.dumps(arr)).equals(arr)