From 780c27c4ee1c4a39fda51ebe971cb24d1eac3349 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 16 Dec 2025 21:10:52 -0800 Subject: [PATCH 1/5] PERF: avoid unnecessary to_numpy conversion in value_counts for Arrow arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, value_counts_internal would convert Arrow array counts to NumPy just to call .sum() for normalization. This is unnecessary since Series.sum() works correctly for all backends. Changes: - Remove unnecessary np.asarray(counts) conversion for Arrow arrays - Remove unused counts variable assignments from bins and MultiIndex branches - Use result.sum() instead of counts.sum() for normalization This eliminates a performance bottleneck where Arrow-backed Series would fall back to NumPy during value_counts operations. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pandas/core/algorithms.py | 13 +------------ pandas/tests/extension/test_arrow.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c31a2cbb41dd3..eb957851bb24e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -897,19 +897,12 @@ def value_counts_internal( if dropna and (result._values == 0).all(): result = result.iloc[0:0] - # normalizing is by len of all (regardless of dropna) - counts = np.array([len(ii)]) - else: if is_extension_array_dtype(values): # handle Categorical and sparse, result = Series(values, copy=False)._values.value_counts(dropna=dropna) result.name = name result.index.name = index_name - counts = result._values - if not isinstance(counts, np.ndarray): - # e.g. 
ArrowExtensionArray - counts = np.asarray(counts) elif isinstance(values, ABCMultiIndex): # GH49558 @@ -920,10 +913,6 @@ def value_counts_internal( .size() ) result.index.names = values.names - # error: Incompatible types in assignment (expression has type - # "ndarray[Any, Any] | DatetimeArray | TimedeltaArray | PeriodArray | Any", - # variable has type "ndarray[tuple[int, ...], dtype[Any]]") - counts = result._values # type: ignore[assignment] else: values = _ensure_arraylike(values, func_name="value_counts") @@ -951,7 +940,7 @@ def value_counts_internal( result = result.sort_values(ascending=ascending, kind="stable") if normalize: - result = result / counts.sum() + result = result / result.sum() return result diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ba5d257bd59e4..6e7f7fb472c4e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -787,6 +787,34 @@ def test_value_counts_returns_pyarrow_int64(self, data): result = data.value_counts() assert result.dtype == ArrowDtype(pa.int64()) + def test_value_counts_no_numpy_fallback(self, data, monkeypatch): + # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy + from pandas.core.arrays.arrow import ArrowExtensionArray + + data = data[:10] + ser = pd.Series(data) + + # Track if to_numpy was called + to_numpy_called = False + original_to_numpy = ArrowExtensionArray.to_numpy + + def tracked_to_numpy(self, *args, **kwargs): + nonlocal to_numpy_called + to_numpy_called = True + return original_to_numpy(self, *args, **kwargs) + + monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy) + + ser.value_counts() + assert not to_numpy_called, "value_counts() should not call to_numpy()" + + # Also test with normalize=True + to_numpy_called = False + ser.value_counts(normalize=True) + assert not to_numpy_called, ( + "value_counts(normalize=True) should not call to_numpy()" + ) + _combine_le_expected_dtype = 
"bool[pyarrow]" def get_op_from_name(self, op_name): From e9248b4a28d1a1437057758d0f00c21850bec5f5 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 16 Dec 2025 21:10:52 -0800 Subject: [PATCH 2/5] TST: move to_numpy call tracking into an arrow_to_numpy_tracker fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_value_counts_no_numpy_fallback previously monkeypatched ArrowExtensionArray.to_numpy inline to detect NumPy fallbacks. Changes: - Add an arrow_to_numpy_tracker fixture that wraps ArrowExtensionArray.to_numpy and returns a callable reporting whether to_numpy was called since the last check, resetting the flag each time - Use the fixture in test_value_counts_no_numpy_fallback instead of the inline tracking code This is a test-only refactor; pandas/core/algorithms.py is not touched by this patch. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pandas/tests/extension/test_arrow.py | 52 +++++++++++++++++----------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6e7f7fb472c4e..4db5d6ecbad49 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -270,6 +270,35 @@ def data_for_twos(data): # TODO: skip otherwise? +@pytest.fixture +def arrow_to_numpy_tracker(monkeypatch): + """ + Fixture to track if ArrowExtensionArray.to_numpy() was called. + + Returns a callable that returns True if to_numpy was called since the last check, + and resets the tracker. 
+ """ + from pandas.core.arrays.arrow import ArrowExtensionArray + + called = False + original_to_numpy = ArrowExtensionArray.to_numpy + + def tracked_to_numpy(self, *args, **kwargs): + nonlocal called + called = True + return original_to_numpy(self, *args, **kwargs) + + monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy) + + def was_called(): + nonlocal called + result = called + called = False + return result + + return was_called + + class TestArrowArray(base.ExtensionTests): def _construct_for_combine_add(self, left, right): dtype = left.dtype @@ -787,33 +816,16 @@ def test_value_counts_returns_pyarrow_int64(self, data): result = data.value_counts() assert result.dtype == ArrowDtype(pa.int64()) - def test_value_counts_no_numpy_fallback(self, data, monkeypatch): + def test_value_counts_no_numpy_fallback(self, data, arrow_to_numpy_tracker): # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy - from pandas.core.arrays.arrow import ArrowExtensionArray - data = data[:10] ser = pd.Series(data) - # Track if to_numpy was called - to_numpy_called = False - original_to_numpy = ArrowExtensionArray.to_numpy - - def tracked_to_numpy(self, *args, **kwargs): - nonlocal to_numpy_called - to_numpy_called = True - return original_to_numpy(self, *args, **kwargs) - - monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy) - ser.value_counts() - assert not to_numpy_called, "value_counts() should not call to_numpy()" + assert not arrow_to_numpy_tracker() - # Also test with normalize=True - to_numpy_called = False ser.value_counts(normalize=True) - assert not to_numpy_called, ( - "value_counts(normalize=True) should not call to_numpy()" - ) + assert not arrow_to_numpy_tracker() _combine_le_expected_dtype = "bool[pyarrow]" From 4facd996e315f52950f892ca995a394776c67504 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 16 Dec 2025 21:24:40 -0800 Subject: [PATCH 3/5] reformat --- pandas/tests/extension/test_arrow.py | 12 
+++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4db5d6ecbad49..137efb5caa25a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -816,16 +816,18 @@ def test_value_counts_returns_pyarrow_int64(self, data): result = data.value_counts() assert result.dtype == ArrowDtype(pa.int64()) - def test_value_counts_no_numpy_fallback(self, data, arrow_to_numpy_tracker): + @pytest.mark.parametrize("normalize", [False, True]) + def test_value_counts_no_numpy_fallback( + self, data, normalize, arrow_to_numpy_tracker + ): # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy data = data[:10] ser = pd.Series(data) - ser.value_counts() - assert not arrow_to_numpy_tracker() + ser.value_counts(normalize=normalize) - ser.value_counts(normalize=True) - assert not arrow_to_numpy_tracker() + numpy_called = arrow_to_numpy_tracker() + assert not numpy_called _combine_le_expected_dtype = "bool[pyarrow]" From 228919e9c5a931f88f622f942f67d50c350f31d8 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 16 Dec 2025 22:11:49 -0800 Subject: [PATCH 4/5] BUG: fix bins normalization in value_counts after refactoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous change to avoid unnecessary to_numpy conversion broke normalization when bins is used. Bins normalization should divide by the total input length, not the sum of counts in bins. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pandas/core/algorithms.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index eb957851bb24e..55a3022454e02 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -897,7 +897,11 @@ def value_counts_internal( if dropna and (result._values == 0).all(): result = result.iloc[0:0] + # normalizing is by len of all (regardless of dropna) + normalize_denominator = len(ii) + else: + normalize_denominator = None if is_extension_array_dtype(values): # handle Categorical and sparse, result = Series(values, copy=False)._values.value_counts(dropna=dropna) @@ -925,8 +929,7 @@ def value_counts_internal( idx = Index(keys, dtype=keys.dtype, name=index_name) if ( - bins is None - and not sort + not sort and isinstance(values, (DatetimeIndex, TimedeltaIndex)) and idx.equals(values) and values.inferred_freq is not None @@ -940,7 +943,10 @@ def value_counts_internal( result = result.sort_values(ascending=ascending, kind="stable") if normalize: - result = result / result.sum() + if normalize_denominator is not None: + result = result / normalize_denominator + else: + result = result / result.sum() return result From 3a5a69b64db235eda287709ea35deabaae805676 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 17 Dec 2025 09:22:53 -0800 Subject: [PATCH 5/5] remove test --- pandas/tests/extension/test_arrow.py | 42 ---------------------------- 1 file changed, 42 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 137efb5caa25a..ba5d257bd59e4 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -270,35 +270,6 @@ def data_for_twos(data): # TODO: skip otherwise? -@pytest.fixture -def arrow_to_numpy_tracker(monkeypatch): - """ - Fixture to track if ArrowExtensionArray.to_numpy() was called. 
- - Returns a callable that returns True if to_numpy was called since the last check, - and resets the tracker. - """ - from pandas.core.arrays.arrow import ArrowExtensionArray - - called = False - original_to_numpy = ArrowExtensionArray.to_numpy - - def tracked_to_numpy(self, *args, **kwargs): - nonlocal called - called = True - return original_to_numpy(self, *args, **kwargs) - - monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy) - - def was_called(): - nonlocal called - result = called - called = False - return result - - return was_called - - class TestArrowArray(base.ExtensionTests): def _construct_for_combine_add(self, left, right): dtype = left.dtype @@ -816,19 +787,6 @@ def test_value_counts_returns_pyarrow_int64(self, data): result = data.value_counts() assert result.dtype == ArrowDtype(pa.int64()) - @pytest.mark.parametrize("normalize", [False, True]) - def test_value_counts_no_numpy_fallback( - self, data, normalize, arrow_to_numpy_tracker - ): - # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy - data = data[:10] - ser = pd.Series(data) - - ser.value_counts(normalize=normalize) - - numpy_called = arrow_to_numpy_tracker() - assert not numpy_called - _combine_le_expected_dtype = "bool[pyarrow]" def get_op_from_name(self, op_name):