Skip to content

Commit e9248b4

Browse files
fangchenli and claude committed
PERF: avoid unnecessary to_numpy conversion in value_counts for Arrow arrays
Previously, value_counts_internal would convert Arrow array counts to NumPy just to call .sum() for normalization. This is unnecessary since Series.sum() works correctly for all backends.

Changes:
- Remove unnecessary np.asarray(counts) conversion for Arrow arrays
- Remove unused counts variable assignments from bins and MultiIndex branches
- Use result.sum() instead of counts.sum() for normalization

This eliminates a performance bottleneck where Arrow-backed Series would fall back to NumPy during value_counts operations.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 780c27c commit e9248b4

File tree

1 file changed

+32
-20
lines changed

1 file changed

+32
-20
lines changed

pandas/tests/extension/test_arrow.py

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,35 @@ def data_for_twos(data):
270270
# TODO: skip otherwise?
271271

272272

273+
@pytest.fixture
def arrow_to_numpy_tracker(monkeypatch):
    """
    Fixture reporting whether ArrowExtensionArray.to_numpy() has been invoked.

    ArrowExtensionArray.to_numpy is replaced, via monkeypatch, with a wrapper
    that records each invocation and then delegates to the original method.

    Returns a zero-argument callable. Each call returns True if to_numpy ran
    since the previous check, and clears the record so that the next check
    starts fresh.
    """
    from pandas.core.arrays.arrow import ArrowExtensionArray

    unpatched_to_numpy = ArrowExtensionArray.to_numpy
    # Single mutable cell shared by both closures below.
    state = {"called": False}

    def tracked_to_numpy(self, *args, **kwargs):
        state["called"] = True
        return unpatched_to_numpy(self, *args, **kwargs)

    def was_called():
        # Read and reset together so consecutive checks are independent.
        seen, state["called"] = state["called"], False
        return seen

    monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
    return was_called
300+
301+
273302
class TestArrowArray(base.ExtensionTests):
274303
def _construct_for_combine_add(self, left, right):
275304
dtype = left.dtype
@@ -787,33 +816,16 @@ def test_value_counts_returns_pyarrow_int64(self, data):
787816
result = data.value_counts()
788817
assert result.dtype == ArrowDtype(pa.int64())
789818

790-
def test_value_counts_no_numpy_fallback(self, data, monkeypatch):
819+
def test_value_counts_no_numpy_fallback(self, data, arrow_to_numpy_tracker):
791820
# Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy
792-
from pandas.core.arrays.arrow import ArrowExtensionArray
793-
794821
data = data[:10]
795822
ser = pd.Series(data)
796823

797-
# Track if to_numpy was called
798-
to_numpy_called = False
799-
original_to_numpy = ArrowExtensionArray.to_numpy
800-
801-
def tracked_to_numpy(self, *args, **kwargs):
802-
nonlocal to_numpy_called
803-
to_numpy_called = True
804-
return original_to_numpy(self, *args, **kwargs)
805-
806-
monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
807-
808824
ser.value_counts()
809-
assert not to_numpy_called, "value_counts() should not call to_numpy()"
825+
assert not arrow_to_numpy_tracker()
810826

811-
# Also test with normalize=True
812-
to_numpy_called = False
813827
ser.value_counts(normalize=True)
814-
assert not to_numpy_called, (
815-
"value_counts(normalize=True) should not call to_numpy()"
816-
)
828+
assert not arrow_to_numpy_tracker()
817829

818830
_combine_le_expected_dtype = "bool[pyarrow]"
819831

0 commit comments

Comments
 (0)