Skip to content

Commit 1a00586

Browse files
API / COW: ensure every new Series/DataFrame also has new (shallow copy) index
1 parent 0bc16da commit 1a00586

File tree

8 files changed: 83 additions and 8 deletions.

pandas/_libs/internals.pyi

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,9 @@ class BlockManager:
9595
def __init__(
9696
self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=...
9797
) -> None: ...
98-
def get_slice(self, slobj: slice, axis: int = ...) -> Self: ...
98+
def get_slice(
99+
self, slobj: slice, axis: int = ..., using_cow: bool = False
100+
) -> Self: ...
99101
def _rebuild_blknos_and_blklocs(self) -> None: ...
100102

101103
class BlockValuesRefs:

pandas/_libs/internals.pyx

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -899,7 +899,7 @@ cdef class BlockManager:
899899
# -------------------------------------------------------------------
900900
# Indexing
901901

902-
cdef BlockManager _slice_mgr_rows(self, slice slobj):
902+
cdef BlockManager _slice_mgr_rows(self, slice slobj, bint using_cow):
903903
cdef:
904904
SharedBlock blk, nb
905905
BlockManager mgr
@@ -910,7 +910,10 @@ cdef class BlockManager:
910910
nb = blk.slice_block_rows(slobj)
911911
nbs.append(nb)
912912

913-
new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
913+
if using_cow:
914+
new_axes = [self.axes[0]._view(), self.axes[1]._getitem_slice(slobj)]
915+
else:
916+
new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
914917
mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)
915918

916919
# We can avoid having to rebuild blklocs/blknos
@@ -921,17 +924,21 @@ cdef class BlockManager:
921924
mgr._blklocs = blklocs.copy()
922925
return mgr
923926

924-
def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
927+
def get_slice(
928+
self, slobj: slice, axis: int = 0, using_cow: bool = False
929+
) -> BlockManager:
925930

926931
if axis == 0:
927932
new_blocks = self._slice_take_blocks_ax0(slobj)
928933
elif axis == 1:
929-
return self._slice_mgr_rows(slobj)
934+
return self._slice_mgr_rows(slobj, using_cow)
930935
else:
931936
raise IndexError("Requested axis not found in manager")
932937

933938
new_axes = list(self.axes)
934939
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
940+
if using_cow:
941+
new_axes[1 - axis] = self.axes[1 - axis]._view()
935942

936943
return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
937944

pandas/core/generic.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4218,7 +4218,9 @@ def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
42184218
"""
42194219
assert isinstance(slobj, slice), type(slobj)
42204220
axis = self._get_block_manager_axis(axis)
4221-
result = self._constructor(self._mgr.get_slice(slobj, axis=axis))
4221+
result = self._constructor(
4222+
self._mgr.get_slice(slobj, axis=axis, using_cow=using_copy_on_write())
4223+
)
42224224
result = result.__finalize__(self)
42234225

42244226
# this could be a view

pandas/core/internals/array_manager.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -760,7 +760,9 @@ def fast_xs(self, loc: int) -> SingleArrayManager:
760760
result = np.array(values, dtype=dtype)
761761
return SingleArrayManager([result], [self._axes[1]])
762762

763-
def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager:
763+
def get_slice(
764+
self, slobj: slice, axis: AxisInt = 0, using_cow: bool = False
765+
) -> ArrayManager:
764766
axis = self._normalize_axis(axis)
765767

766768
if axis == 0:

pandas/core/internals/managers.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ def blklocs(self) -> npt.NDArray[np.intp]:
197197
def make_empty(self, axes=None) -> Self:
198198
"""return an empty BlockManager with the items axis of len 0"""
199199
if axes is None:
200+
# TODO shallow copy remaining axis?
200201
axes = [Index([])] + self.axes[1:]
201202

202203
# preserve dtype if possible
@@ -355,6 +356,7 @@ def apply(
355356
applied = getattr(b, f)(**kwargs)
356357
result_blocks = extend_blocks(applied, result_blocks)
357358

359+
# TODO shallow copy axes (in from_blocks or here?)
358360
out = type(self).from_blocks(result_blocks, self.axes)
359361
return out
360362

@@ -575,6 +577,7 @@ def get_numeric_data(self, copy: bool = False) -> Self:
575577
# Avoid somewhat expensive _combine
576578
if copy:
577579
return self.copy(deep=True)
580+
# TODO(CoW) need to return a shallow copy here?
578581
return self
579582
return self._combine(numeric_blocks, copy)
580583

@@ -606,6 +609,7 @@ def _combine(
606609
new_blocks.append(nb)
607610

608611
axes = list(self.axes)
612+
# TODO shallow copy of axes?
609613
if index is not None:
610614
axes[-1] = index
611615
axes[0] = self.items.take(indexer)
@@ -647,7 +651,10 @@ def copy_func(ax):
647651

648652
new_axes = [copy_func(ax) for ax in self.axes]
649653
else:
650-
new_axes = list(self.axes)
654+
if using_copy_on_write():
655+
new_axes = [ax.view() for ax in self.axes]
656+
else:
657+
new_axes = list(self.axes)
651658

652659
res = self.apply("copy", deep=deep)
653660
res.axes = new_axes
@@ -674,6 +681,7 @@ def consolidate(self) -> Self:
674681
if self.is_consolidated():
675682
return self
676683

684+
# TODO shallow copy is not needed here?
677685
bm = type(self)(self.blocks, self.axes, verify_integrity=False)
678686
bm._is_consolidated = False
679687
bm._consolidate_inplace()
@@ -718,6 +726,7 @@ def reindex_indexer(
718726

719727
if indexer is None:
720728
if new_axis is self.axes[axis] and not copy:
729+
# TODO(CoW) need to handle CoW?
721730
return self
722731

723732
result = self.copy(deep=copy)
@@ -756,6 +765,8 @@ def reindex_indexer(
756765

757766
new_axes = list(self.axes)
758767
new_axes[axis] = new_axis
768+
if self.ndim == 2 and using_copy_on_write():
769+
new_axes[1 - axis] = self.axes[1 - axis]._view()
759770

760771
new_mgr = type(self).from_blocks(new_blocks, new_axes)
761772
if axis == 1:
@@ -1034,6 +1045,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
10341045
ndim=1,
10351046
refs=self.blocks[0].refs,
10361047
)
1048+
# TODO shallow copy columns
10371049
return SingleBlockManager(block, self.axes[0])
10381050

10391051
dtype = interleaved_dtype([blk.dtype for blk in self.blocks])
@@ -1067,6 +1079,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
10671079

10681080
bp = BlockPlacement(slice(0, len(result)))
10691081
block = new_block(result, placement=bp, ndim=1)
1082+
# TODO shallow copy columns
10701083
return SingleBlockManager(block, self.axes[0])
10711084

10721085
def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
@@ -1081,6 +1094,7 @@ def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
10811094
nb = type(block)(
10821095
values, placement=bp, ndim=1, refs=block.refs if track_ref else None
10831096
)
1097+
# TODO shallow copy index? (might already be done where this gets called)
10841098
return SingleBlockManager(nb, self.axes[1])
10851099

10861100
def iget_values(self, i: int) -> ArrayLike:
@@ -1479,6 +1493,7 @@ def idelete(self, indexer) -> BlockManager:
14791493

14801494
nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
14811495
new_columns = self.items[~is_deleted]
1496+
# TODO shallow copy index?
14821497
axes = [new_columns, self.axes[1]]
14831498
return type(self)(tuple(nbs), axes, verify_integrity=False)
14841499

@@ -1516,6 +1531,7 @@ def grouped_reduce(self, func: Callable) -> Self:
15161531
nrows = result_blocks[0].values.shape[-1]
15171532
index = Index(range(nrows))
15181533

1534+
# TODO shallow copy columns?
15191535
return type(self).from_blocks(result_blocks, [self.axes[0], index])
15201536

15211537
def reduce(self, func: Callable) -> Self:
@@ -1539,6 +1555,7 @@ def reduce(self, func: Callable) -> Self:
15391555
res_blocks.extend(nbs)
15401556

15411557
index = Index([None]) # placeholder
1558+
# TODO shallow copy self.items
15421559
new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
15431560
return new_mgr
15441561

@@ -1585,6 +1602,7 @@ def quantile(
15851602
assert is_list_like(qs) # caller is responsible for this
15861603
assert axis == 1 # only ever called this way
15871604

1605+
# TODO shallow copy axes
15881606
new_axes = list(self.axes)
15891607
new_axes[1] = Index(qs, dtype=np.float64)
15901608

@@ -1873,6 +1891,7 @@ def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
18731891

18741892
offset += len(mgr.items)
18751893

1894+
# TODO relevant axis already shallow-copied at caller?
18761895
new_mgr = cls(tuple(blocks), axes)
18771896
return new_mgr
18781897

@@ -1942,6 +1961,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager:
19421961
arr = ensure_block_shape(blk.values, ndim=2)
19431962
bp = BlockPlacement(0)
19441963
new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs)
1964+
# TODO shallow copy index
19451965
axes = [columns, self.axes[0]]
19461966
return BlockManager([new_blk], axes=axes, verify_integrity=False)
19471967

pandas/core/internals/ops.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def operate_blockwise(
8888
# assert len(slocs) == nlocs, (len(slocs), nlocs)
8989
# assert slocs == set(range(nlocs)), slocs
9090

91+
# TODO shallow copy axes?
9192
new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False)
9293
return new_mgr
9394

pandas/tests/copy_view/test_indexing.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ def test_subset_column_selection(backend, using_copy_on_write):
5959

6060
subset = df[["a", "c"]]
6161

62+
if using_copy_on_write:
63+
assert subset.index is not df.index
64+
6265
if using_copy_on_write:
6366
# the subset shares memory ...
6467
assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
@@ -111,6 +114,9 @@ def test_subset_row_slice(backend, using_copy_on_write):
111114
subset = df[1:3]
112115
subset._mgr._verify_integrity()
113116

117+
if using_copy_on_write:
118+
assert subset.columns is not df.columns
119+
114120
assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
115121

116122
if using_copy_on_write:
@@ -154,6 +160,9 @@ def test_subset_column_slice(backend, using_copy_on_write, using_array_manager,
154160
subset = df.iloc[:, 1:]
155161
subset._mgr._verify_integrity()
156162

163+
if using_copy_on_write:
164+
assert subset.index is not df.index
165+
157166
if using_copy_on_write:
158167
assert np.shares_memory(get_array(subset, "b"), get_array(df, "b"))
159168

@@ -213,6 +222,10 @@ def test_subset_loc_rows_columns(
213222

214223
subset = df.loc[row_indexer, column_indexer]
215224

225+
if using_copy_on_write:
226+
assert subset.index is not df.index
227+
assert subset.columns is not df.columns
228+
216229
# modifying the subset never modifies the parent
217230
subset.iloc[0, 0] = 0
218231

@@ -273,6 +286,10 @@ def test_subset_iloc_rows_columns(
273286

274287
subset = df.iloc[row_indexer, column_indexer]
275288

289+
if using_copy_on_write:
290+
assert subset.index is not df.index
291+
assert subset.columns is not df.columns
292+
276293
# modifying the subset never modifies the parent
277294
subset.iloc[0, 0] = 0
278295

@@ -718,6 +735,10 @@ def test_null_slice(backend, method, using_copy_on_write):
718735

719736
df2 = method(df)
720737

738+
if using_copy_on_write:
739+
assert df2.index is not df.index
740+
assert df2.columns is not df.columns
741+
721742
# we always return new objects (shallow copy), regardless of CoW or not
722743
assert df2 is not df
723744

@@ -745,6 +766,9 @@ def test_null_slice_series(backend, method, using_copy_on_write):
745766

746767
s2 = method(s)
747768

769+
if using_copy_on_write:
770+
assert s2.index is not s.index
771+
748772
# we always return new objects, regardless of CoW or not
749773
assert s2 is not s
750774

@@ -886,6 +910,9 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager):
886910

887911
s = df["a"]
888912

913+
if using_copy_on_write:
914+
assert s.index is not df.index
915+
889916
assert np.shares_memory(get_array(s, "a"), get_array(df, "a"))
890917

891918
if using_copy_on_write or using_array_manager:
@@ -963,6 +990,10 @@ def test_column_as_series_no_item_cache(
963990
s1 = method(df)
964991
s2 = method(df)
965992

993+
if using_copy_on_write:
994+
assert s1.index is not df.index
995+
assert s1.index is not s2.index
996+
966997
is_iloc = "iloc" in request.node.name
967998
if using_copy_on_write or is_iloc:
968999
assert s1 is not s2

pandas/tests/copy_view/test_methods.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ def test_copy(using_copy_on_write):
2828
assert not df_copy._mgr.blocks[0].refs.has_reference()
2929
assert not df_copy._mgr.blocks[1].refs.has_reference()
3030

31+
assert df_copy.index is not df.index
32+
assert df_copy.columns is not df.columns
33+
3134
# mutating copy doesn't mutate original
3235
df_copy.iloc[0, 0] = 0
3336
assert df.iloc[0, 0] == 1
@@ -37,6 +40,13 @@ def test_copy_shallow(using_copy_on_write):
3740
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
3841
df_copy = df.copy(deep=False)
3942

43+
if using_copy_on_write:
44+
assert df_copy.index is not df.index
45+
assert df_copy.columns is not df.columns
46+
else:
47+
assert df_copy.index is df.index
48+
assert df_copy.columns is df.columns
49+
4050
# the shallow copy still shares memory
4151
assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
4252
if using_copy_on_write:

0 commit comments

Comments
 (0)