From 5766fd98f21b582f78beec3c63eef136cd81a0db Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:38:49 +0300 Subject: [PATCH 1/9] wip --- arrow-buffer/src/buffer/offset.rs | 78 +++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index d3a4b3cffbda..2016c367b47b 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -326,6 +326,79 @@ impl OffsetBuffer { end_offset_of_last_valid_value != last_offset } + + /// Subtract `rhs` from all offsets + /// This will try to reuse the existing allocation as much as possible + /// + /// Panics: this will panic if `rhs` > first offset or if `rhs` will lead to overflow (when `rhs` is negative) + /// + /// TODO - add examples + pub fn subtract(self, rhs: O) -> Self where O: std::ops::Sub + std::cmp::Ord { + if rhs == O::usize_as(0) { + return self; + } + + let len = self.len(); + + self[0].checked_sub(rhs).is_some) + + // Offset buffer is guaranteed to be non-empty + assert!(self[0] >= rhs, "shifted offsets will become negative which is not allowed "); + + // If negative + if rhs < O::usize_as(0) { + let last_value = self[len - 1]; + + let last_value = i32::MAX - 5; + let rhs = -6; + // last_value - rhs == last_value - (-6) == last_value + 6 = i32::MAX - 5 + 6 = i32::MAX + 1 >= i32::MAX + + // i32::MAX - last_value == i32::MAX - (i32::MAX - 5) == i32::MAX - i32::MAX + 5 == 5 + // + let will_overflow = 0 - (O::MAX - last_value) > rhs + assert!(last_value - rhs >= O::MAX); + } + + let output_buffer = match self.into_inner().into_inner().into_mutable() { + Ok(mut mutable) => { + // TODO - add test when the offsets are sliced and the first offset outside the slice is 0 and we shift by > 0 + mutable.typed_data_mut::() + .iter_mut() + .for_each(|offset| *offset = *offset - rhs); + + mutable + } + Err(original_buffer) => { + let mut output_buffer = MutableBuffer::new(len * size_of::()); + + let underlying_buffer = original_buffer.typed_data::(); + + for i in 0..len { + unsafe { + // SAFETY: Already allocated sufficient capacity + output_buffer.push_unchecked( + // SAFETY: + // 1. `i` is within bounds + // 2. we will not cause underflow as we checked that the first offset is greater than or equal to the shift + *underlying_buffer.get_unchecked(i) - rhs, + ) + } + } + + output_buffer + } + }; + + let output_buffer = ScalarBuffer::::from(output_buffer); + + // This is safe as we keep the following properties: + // 1. buffer is non-empty - the output buffer is derived from a valid offset buffer + // 2. values are greater than or equal to zero - we validated before that the first offset is greater than or equal to the shift + // and the first buffer value is coming from a valid offset buffer, + // and the input values are monotonically increasing since they are coming from a valid offset buffer so checking the first offset is enough + // 3. monotonically increasing values - we subtract from all offset the same value, thus keeping the same property as the input buffer which is a valid offset buffer + unsafe { OffsetBuffer::new_unchecked(output_buffer) } + } } impl Deref for OffsetBuffer { @@ -810,4 +883,9 @@ mod tests { let nulls = NullBuffer::new_valid(5); // expects 3 offsets.has_non_empty_nulls(Some(&nulls)); } + + #[test] + fn subtract_by_value_that_will_cause_overflow() { + + } } From 6585d4b8b53774586683b2cdb763c77ebcb1d33e Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:12:49 +0300 Subject: [PATCH 2/9] add overflow check --- arrow-array/src/array/list_array.rs | 2 +- arrow-buffer/src/buffer/offset.rs | 70 +++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 24f7774f2b7d..f74ff7d3f364 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -38,7 +38,7 @@ use std::sync::Arc; /// [`StringArray`]: crate::array::StringArray /// [`LargeStringArray`]: crate::array::LargeStringArray pub trait OffsetSizeTrait: - ArrowNativeType + std::ops::AddAssign + Integer + num_traits::CheckedAdd + ArrowNativeType + std::ops::AddAssign + Integer + num_traits::CheckedAdd + num_traits::CheckedSub { /// True for 64 bit offset size and false for 32 bit offset size const IS_LARGE: bool; diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 2016c367b47b..320a9696aebf 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -330,33 +330,20 @@ impl OffsetBuffer { /// Subtract `rhs` from all offsets /// This will try to reuse the existing allocation as much as possible /// - /// Panics: this will panic if `rhs` > first offset or if `rhs` will lead to overflow (when `rhs` is negative) - /// - /// TODO - add examples - pub fn subtract(self, rhs: O) -> Self where O: std::ops::Sub + std::cmp::Ord { + /// Panics: this will panic if `rhs` > the first offset or if `rhs` will lead to overflow (when `rhs` is negative) + pub fn subtract(self, rhs: O) -> Self where O: std::ops::Sub + std::cmp::PartialOrd + num_traits::CheckedSub { if rhs == O::usize_as(0) { return self; } let len = self.len(); - self[0].checked_sub(rhs).is_some) - // Offset buffer is guaranteed to be non-empty - assert!(self[0] >= rhs, "shifted offsets will become negative which is not allowed "); + assert!(self[0] >= rhs, "shifted offsets will become negative which is not allowed"); - // If negative + // If negative, make sure that this will not create an overflow if rhs < O::usize_as(0) { - let last_value = self[len - 1]; - - let last_value = i32::MAX - 5; - let rhs = -6; - // last_value - rhs == last_value - (-6) == last_value + 6 = i32::MAX - 5 + 6 = i32::MAX + 1 >= i32::MAX - - // i32::MAX - last_value == i32::MAX - (i32::MAX - 5) == i32::MAX - i32::MAX + 5 == 5 - // - let will_overflow = 0 - (O::MAX - last_value) > rhs - assert!(last_value - rhs >= O::MAX); + self[len - 1].checked_sub(rhs).expect("must not overflow"); } let output_buffer = match self.into_inner().into_inner().into_mutable() { @@ -885,7 +872,52 @@ mod tests { } #[test] - fn subtract_by_value_that_will_cause_overflow() { + fn should_panic_for_subtract_by_value_that_will_cause_offsets_to_be_less_than_zero() { + todo!() + } + + #[test] + fn subtract_by_value_that_will_cause_offsets_to_be_less_than_zero_for_outside_the_slice() { + todo!() + } + + #[test] + fn should_panic_subtract_by_value_that_will_cause_offsets_to_overflow() { + todo!() + } + + #[test] + fn subtract_by_value_that_will_cause_offsets_to_overflow_outside_the_slice() { + todo!() + } + + #[test] + fn when_shift_is_0_subtract_should_reuse_the_buffer_even_when_it_is_shared() { + todo!() + } + + #[test] + fn should_reuse_the_underline_data_when_the_buffer_is_not_shared() { + todo!() + } + + #[test] + fn should_create_a_new_buffer_when_the_buffer_is_shared() { + todo!() + } + #[test] + fn when_shift_is_negative_it_should_shift_offsets_in_the_right_direction() { + todo!() + } + + #[test] + fn for_sliced_unshared_buffer_shift_should_reuse_buffer_but_only_modify_the_data_in_slice() { + todo!() + } + + #[test] + fn for_sliced_shared_buffer_shifted_buffer_should_only_include_the_sliced_data() { + todo!() } } From 43daae511a7a6aa3be290f216a24ba5affb44e4f Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:46:01 +0300 Subject: [PATCH 3/9] add tests --- arrow-buffer/src/buffer/offset.rs | 119 +++++++++++++++++++++++++++--- 1 file changed, 108 insertions(+), 11 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 320a9696aebf..9c338388e456 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -343,7 +343,7 @@ impl OffsetBuffer { // If negative, make sure that this will not create an overflow if rhs < O::usize_as(0) { - self[len - 1].checked_sub(rhs).expect("must not overflow"); + self[len - 1].checked_sub(&rhs).expect("must not overflow"); } let output_buffer = match self.into_inner().into_inner().into_mutable() { @@ -872,52 +872,149 @@ mod tests { } #[test] + #[should_panic(expected = "shifted offsets will become negative which is not allowed")] fn should_panic_for_subtract_by_value_that_will_cause_offsets_to_be_less_than_zero() { - todo!() + // self[0] = 0, rhs = 1 -> 0 >= 1 is false -> assert fires + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![0, 3, 6])); + offsets.subtract(1); } #[test] fn subtract_by_value_that_will_cause_offsets_to_be_less_than_zero_for_outside_the_slice() { - todo!() + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![0, 1, 4, 7])); + let sliced = offsets.slice(1, 2); // [1, 4, 7] + drop(offsets); + assert_eq!(sliced.as_ref(), &[1, 4, 7]); + + let result = sliced.subtract(1); + assert_eq!(result.as_ref(), &[0, 3, 6]); + assert_eq!(result.len(), 3); } #[test] + #[should_panic(expected = "must not overflow")] fn should_panic_subtract_by_value_that_will_cause_offsets_to_overflow() { - todo!() + // rhs = -1 (negative). self[0] = 0 >= -1 passes. + // last offset i32::MAX - (-1) = i32::MAX + 1 -> checked_sub returns None -> expect fires + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![0, 5, i32::MAX])); + offsets.subtract(-1); } #[test] fn subtract_by_value_that_will_cause_offsets_to_overflow_outside_the_slice() { - todo!() + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![0, 3, 6, i32::MAX])); + let sliced = offsets.slice(0, 2); // [0, 3, 6] + assert_eq!(sliced.as_ref(), &[0, 3, 6]); + + let result = sliced.subtract(-1); + assert_eq!(result.as_ref(), &[1, 4, 7]); + assert_eq!(result.len(), 3); } #[test] fn when_shift_is_0_subtract_should_reuse_the_buffer_even_when_it_is_shared() { - todo!() + // subtract(0) hits the early `return self` before any into_mutable, + // so the returned buffer is the exact same allocation even while shared. + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![0, 3, 6])); + let shared = offsets.clone(); // refcount now 2 -> shared + let result = offsets.subtract(0); + assert!( + result.ptr_eq(&shared), + "subtract(0) must return the same underlying buffer, even when shared" + ); } #[test] fn should_reuse_the_underline_data_when_the_buffer_is_not_shared() { - todo!() + // Unique ownership, offset 0 -> into_mutable succeeds -> mutate in place, + // and MutableBuffer -> Buffer keeps the same allocation. + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![2, 5, 8])); + let ptr_before = offsets.as_ptr(); + let result = offsets.subtract(2); + assert_eq!( + ptr_before, + result.as_ptr(), + "a non-shared buffer should be mutated in place, reusing the allocation" + ); + assert_eq!(result.as_ref(), &[0, 3, 6]); } #[test] fn should_create_a_new_buffer_when_the_buffer_is_shared() { - todo!() + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![2, 5, 8])); + let shared = offsets.clone(); + let ptr_before = offsets.as_ptr(); + let result = offsets.subtract(2); + assert_ne!( + ptr_before, + result.as_ptr(), + "a shared buffer must not be mutated in place; a new allocation is created" + ); + assert_eq!(result.as_ref(), &[0, 3, 6]); + // The shared view is untouched. + assert_eq!(shared.as_ref(), &[2, 5, 8]); } #[test] fn when_shift_is_negative_it_should_shift_offsets_in_the_right_direction() { - todo!() + // rhs = -2 -> offset - (-2) = offset + 2, so all offsets move up. + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![0, 3, 6])); + let result = offsets.subtract(-2); + assert_eq!(result.as_ref(), &[2, 5, 8]); } #[test] fn for_sliced_unshared_buffer_shift_should_reuse_buffer_but_only_modify_the_data_in_slice() { - todo!() + // Underlying [0, 3, 6, 9, 12]; slice -> view [3, 6, 9]. + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![1, 3, 6, 9, 12])); + let sliced = offsets.slice(1, 2); // [3, 6, 9] + drop(offsets); // uniquely owned -> eligible for in-place reuse + assert_eq!(sliced.as_ref(), &[3, 6, 9]); + + let ptr_before = sliced.as_ptr(); + let result = sliced.subtract(1); + + // Reuses the allocation... + assert_eq!( + ptr_before, + result.as_ptr(), + "uniquely-owned sliced buffer should be mutated in place" + ); + // ...but only the slice is shifted, and the length stays the slice length. + assert_eq!(result.as_ref(), &[2, 5, 8]); + assert_eq!(result.len(), 3); + let underlying_buffer = result.inner().inner(); + // data should be shifted by 1 + assert_eq!(underlying_buffer.ptr_offset(), std::mem::size_of::()); + let original_value_ptr = unsafe {underlying_buffer.data_ptr().as_ptr() as *mut i32}; + let value_in_ptr = unsafe {*original_value_ptr}; + assert_eq!(value_in_ptr, 1); + let last_value_ptr = unsafe {(underlying_buffer.data_ptr().as_ptr() as *mut i32).add(5)}; + assert_eq!(unsafe {*last_value_ptr}, 12); } #[test] fn for_sliced_shared_buffer_shifted_buffer_should_only_include_the_sliced_data() { - todo!() + // Underlying: [0, 3, 6, 9, 12]; slice(1, 2) -> view [3, 6, 9]. + // `offsets` stays alive, so the sliced buffer is shared -> Err branch. + // The Err branch copies `len` (= 3) elements from the *sliced* typed_data, + // so the result contains only the sliced data, shifted. + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![0, 3, 6, 9, 12])); + let sliced = offsets.slice(1, 2); + assert_eq!(sliced.as_ref(), &[3, 6, 9]); + + let result = sliced.subtract(3); + + assert_eq!( + result.as_ref(), + &[0, 3, 6], + "shifted result should contain only the sliced data" + ); + assert_eq!(result.len(), 3); + + // Assert that the underlying buffer of the result is not sliced to make sure it does not include the data outside the slice range from the original buffer + let underlying_buffer = result.inner().inner(); + assert_eq!(underlying_buffer.ptr_offset(), 0); + assert_eq!(underlying_buffer.len(), 3 * std::mem::size_of::()); } } From f60ef98a571a5ce335ecf3918f961251ba5f5982 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 11 Jun 2026 14:45:03 +0300 Subject: [PATCH 4/9] wip --- arrow-buffer/src/buffer/offset.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 9c338388e456..70fa5e54f0a4 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -346,10 +346,15 @@ impl OffsetBuffer { self[len - 1].checked_sub(&rhs).expect("must not overflow"); } - let output_buffer = match self.into_inner().into_inner().into_mutable() { + let original_buffer = self.into_inner().into_inner(); + let original_buffer_offset = original_buffer.ptr_offset(); + let original_buffer_len = original_buffer.len(); + + let output_buffer = match original_buffer.into_mutable() { Ok(mut mutable) => { // TODO - add test when the offsets are sliced and the first offset outside the slice is 0 and we shift by > 0 - mutable.typed_data_mut::() + mutable + .typed_data_mut::() .iter_mut() .for_each(|offset| *offset = *offset - rhs); From 263edbd8908158e8e178f6c2256edba9056aebe1 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 11 Jun 2026 18:05:36 +0300 Subject: [PATCH 5/9] update code around the bug --- arrow-buffer/src/buffer/offset.rs | 75 ++++++++++++++----------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 70fa5e54f0a4..4204d082e91e 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -16,7 +16,7 @@ // under the License. use crate::buffer::ScalarBuffer; -use crate::{ArrowNativeType, MutableBuffer, NullBuffer, OffsetBufferBuilder}; +use crate::{ArrowNativeType, Buffer, MutableBuffer, NullBuffer, OffsetBufferBuilder}; use std::ops::Deref; /// A non-empty buffer of monotonically increasing, positive integers. @@ -347,37 +347,39 @@ impl OffsetBuffer { } let original_buffer = self.into_inner().into_inner(); - let original_buffer_offset = original_buffer.ptr_offset(); - let original_buffer_len = original_buffer.len(); - - let output_buffer = match original_buffer.into_mutable() { + let original_length = original_buffer.len(); + + // Remove this once https://github.com/apache/arrow-rs/issues/10117 is resolved + let into_mutable_buffer_result = if original_buffer.ptr_offset() != 0 { + Err(original_buffer) + } else { + original_buffer.into_mutable() + }; + + let output_buffer = match into_mutable_buffer_result { Ok(mut mutable) => { - // TODO - add test when the offsets are sliced and the first offset outside the slice is 0 and we shift by > 0 - mutable - .typed_data_mut::() + let mut_sliced = mutable + .typed_data_mut::(); + + // Remove this once https://github.com/apache/arrow-rs/pull/10118 is merged + let mut_sliced = &mut mut_sliced[0..original_length / O::get_byte_width()]; + + mut_sliced .iter_mut() .for_each(|offset| *offset = *offset - rhs); - mutable + Buffer::from(mutable) } Err(original_buffer) => { - let mut output_buffer = MutableBuffer::new(len * size_of::()); - - let underlying_buffer = original_buffer.typed_data::(); - - for i in 0..len { - unsafe { - // SAFETY: Already allocated sufficient capacity - output_buffer.push_unchecked( - // SAFETY: - // 1. `i` is within bounds - // 2. we will not cause underflow as we checked that the first offset is greater than or equal to the shift - *underlying_buffer.get_unchecked(i) - rhs, - ) - } - } - - output_buffer + let shifted = original_buffer + .typed_data::() + .iter() + .map(|item| { + *item - rhs + }) + .collect::>(); + + Buffer::from_vec(shifted) } }; @@ -968,34 +970,25 @@ mod tests { assert_eq!(result.as_ref(), &[2, 5, 8]); } + // Replace this test with test that assert a reuse after PR #10118 is merged #[test] - fn for_sliced_unshared_buffer_shift_should_reuse_buffer_but_only_modify_the_data_in_slice() { + fn for_sliced_unshared_buffer_shift_should_not_reuse_buffer() { // Underlying [0, 3, 6, 9, 12]; slice -> view [3, 6, 9]. let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![1, 3, 6, 9, 12])); let sliced = offsets.slice(1, 2); // [3, 6, 9] - drop(offsets); // uniquely owned -> eligible for in-place reuse + drop(offsets); // uniquely owned assert_eq!(sliced.as_ref(), &[3, 6, 9]); let ptr_before = sliced.as_ptr(); let result = sliced.subtract(1); - // Reuses the allocation... - assert_eq!( + assert_ne!( ptr_before, result.as_ptr(), - "uniquely-owned sliced buffer should be mutated in place" + "should not be reused until #10118 is merged" ); - // ...but only the slice is shifted, and the length stays the slice length. + assert_eq!(result.as_ref(), &[2, 5, 8]); - assert_eq!(result.len(), 3); - let underlying_buffer = result.inner().inner(); - // data should be shifted by 1 - assert_eq!(underlying_buffer.ptr_offset(), std::mem::size_of::()); - let original_value_ptr = unsafe {underlying_buffer.data_ptr().as_ptr() as *mut i32}; - let value_in_ptr = unsafe {*original_value_ptr}; - assert_eq!(value_in_ptr, 1); - let last_value_ptr = unsafe {(underlying_buffer.data_ptr().as_ptr() as *mut i32).add(5)}; - assert_eq!(unsafe {*last_value_ptr}, 12); } #[test] From dfddba390d952a90e313cdccd64dd4e4261fac37 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 11 Jun 2026 18:10:47 +0300 Subject: [PATCH 6/9] format and lint --- arrow-buffer/src/buffer/offset.rs | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 4204d082e91e..1202cecb86e9 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -331,7 +331,10 @@ impl OffsetBuffer { /// This will try to reuse the existing allocation as much as possible /// /// Panics: this will panic if `rhs` > the first offset or if `rhs` will lead to overflow (when `rhs` is negative) - pub fn subtract(self, rhs: O) -> Self where O: std::ops::Sub + std::cmp::PartialOrd + num_traits::CheckedSub { + pub fn subtract(self, rhs: O) -> Self + where + O: std::ops::Sub + std::cmp::PartialOrd + num_traits::CheckedSub, + { if rhs == O::usize_as(0) { return self; } @@ -339,7 +342,10 @@ impl OffsetBuffer { let len = self.len(); // Offset buffer is guaranteed to be non-empty - assert!(self[0] >= rhs, "shifted offsets will become negative which is not allowed"); + assert!( + self[0] >= rhs, + "shifted offsets will become negative which is not allowed" + ); // If negative, make sure that this will not create an overflow if rhs < O::usize_as(0) { @@ -358,26 +364,23 @@ impl OffsetBuffer { let output_buffer = match into_mutable_buffer_result { Ok(mut mutable) => { - let mut_sliced = mutable - .typed_data_mut::(); + let mut_sliced = mutable.typed_data_mut::(); // Remove this once https://github.com/apache/arrow-rs/pull/10118 is merged let mut_sliced = &mut mut_sliced[0..original_length / O::get_byte_width()]; mut_sliced - .iter_mut() - .for_each(|offset| *offset = *offset - rhs); + .iter_mut() + .for_each(|offset| *offset = *offset - rhs); Buffer::from(mutable) } Err(original_buffer) => { let shifted = original_buffer - .typed_data::() - .iter() - .map(|item| { - *item - rhs - }) - .collect::>(); + .typed_data::() + .iter() + .map(|item| *item - rhs) + .collect::>(); Buffer::from_vec(shifted) } From 94564ce3fc0952af4bca0f01b7115ee75eef4d4b Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Fri, 12 Jun 2026 01:16:29 +0300 Subject: [PATCH 7/9] fix pitfall that would have being fixed in #10118 --- arrow-buffer/src/buffer/offset.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 1202cecb86e9..0c53cbb214ab 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -354,9 +354,10 @@ impl OffsetBuffer { let original_buffer = self.into_inner().into_inner(); let original_length = original_buffer.len(); + let buffer_offset = original_buffer.ptr_offset(); // Remove this once https://github.com/apache/arrow-rs/issues/10117 is resolved - let into_mutable_buffer_result = if original_buffer.ptr_offset() != 0 { + let into_mutable_buffer_result = if buffer_offset != 0 { Err(original_buffer) } else { original_buffer.into_mutable() @@ -373,7 +374,7 @@ impl OffsetBuffer { .iter_mut() .for_each(|offset| *offset = *offset - rhs); - Buffer::from(mutable) + Buffer::from(mutable).slice_with_length(buffer_offset, original_length) } Err(original_buffer) => { let shifted = original_buffer @@ -994,6 +995,21 @@ mod tests { assert_eq!(result.as_ref(), &[2, 5, 8]); } + #[test] + fn for_sliced_but_start_at_0_unshared_buffer_shift_should_reuse_buffer() { + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![1, 3, 6, 9, 12])); + let sliced = offsets.slice(0, 2); + drop(offsets); // uniquely owned + assert_eq!(sliced.as_ref(), &[1, 3, 6]); + + let ptr_before = sliced.as_ptr(); + let result = sliced.subtract(1); + + assert_eq!(ptr_before, result.as_ptr(), "should be reused"); + + assert_eq!(result.as_ref(), &[0, 2, 5]); + } + #[test] fn for_sliced_shared_buffer_shifted_buffer_should_only_include_the_sliced_data() { // Underlying: [0, 3, 6, 9, 12]; slice(1, 2) -> view [3, 6, 9]. From 9fef849ee0f7edd023e8171045bac9bafdea4e77 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Fri, 12 Jun 2026 01:17:55 +0300 Subject: [PATCH 8/9] add comment --- arrow-buffer/src/buffer/offset.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 0c53cbb214ab..e57b8d1c20e0 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -374,6 +374,7 @@ impl OffsetBuffer { .iter_mut() .for_each(|offset| *offset = *offset - rhs); + // Needed until https://github.com/apache/arrow-rs/pull/10118 is merged Buffer::from(mutable).slice_with_length(buffer_offset, original_length) } Err(original_buffer) => { From d7923727d1da748d0f2eca4925d03ab7594e158b Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Fri, 12 Jun 2026 01:19:35 +0300 Subject: [PATCH 9/9] Update offset.rs --- arrow-buffer/src/buffer/offset.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index e57b8d1c20e0..3e3033ca5633 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -374,7 +374,7 @@ impl OffsetBuffer { .iter_mut() .for_each(|offset| *offset = *offset - rhs); - // Needed until https://github.com/apache/arrow-rs/pull/10118 is merged + // Remove this slice once https://github.com/apache/arrow-rs/pull/10118 is merged Buffer::from(mutable).slice_with_length(buffer_offset, original_length) } Err(original_buffer) => {