From 780c27c4ee1c4a39fda51ebe971cb24d1eac3349 Mon Sep 17 00:00:00 2001
From: Fangchen Li <fangchen.li@outlook.com>
Date: Tue, 16 Dec 2025 21:10:52 -0800
Subject: [PATCH 1/5] PERF: avoid unnecessary to_numpy conversion in
 value_counts for Arrow arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, value_counts_internal would convert Arrow array counts to
NumPy just to call .sum() for normalization. This is unnecessary since
Series.sum() works correctly for all backends.

Changes:
- Remove unnecessary np.asarray(counts) conversion for Arrow arrays
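- Remove the now-redundant counts assignments from the bins and MultiIndex
  branches (the bins one held the normalization denominator, len(ii); that
  behaviour is restored in PATCH 4/5)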
- Use result.sum() instead of counts.sum() for normalization

This eliminates a performance bottleneck where Arrow-backed Series
would fall back to NumPy during value_counts operations.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 pandas/core/algorithms.py            | 13 +------------
 pandas/tests/extension/test_arrow.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index c31a2cbb41dd3..eb957851bb24e 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -897,19 +897,12 @@ def value_counts_internal(
         if dropna and (result._values == 0).all():
             result = result.iloc[0:0]
 
-        # normalizing is by len of all (regardless of dropna)
-        counts = np.array([len(ii)])
-
     else:
         if is_extension_array_dtype(values):
             # handle Categorical and sparse,
             result = Series(values, copy=False)._values.value_counts(dropna=dropna)
             result.name = name
             result.index.name = index_name
-            counts = result._values
-            if not isinstance(counts, np.ndarray):
-                # e.g. ArrowExtensionArray
-                counts = np.asarray(counts)
 
         elif isinstance(values, ABCMultiIndex):
             # GH49558
@@ -920,10 +913,6 @@ def value_counts_internal(
                 .size()
             )
             result.index.names = values.names
-            # error: Incompatible types in assignment (expression has type
-            # "ndarray[Any, Any] | DatetimeArray | TimedeltaArray | PeriodArray | Any",
-            # variable has type "ndarray[tuple[int, ...], dtype[Any]]")
-            counts = result._values  # type: ignore[assignment]
 
         else:
             values = _ensure_arraylike(values, func_name="value_counts")
@@ -951,7 +940,7 @@ def value_counts_internal(
         result = result.sort_values(ascending=ascending, kind="stable")
 
     if normalize:
-        result = result / counts.sum()
+        result = result / result.sum()
 
     return result
 
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index ba5d257bd59e4..6e7f7fb472c4e 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -787,6 +787,34 @@ def test_value_counts_returns_pyarrow_int64(self, data):
         result = data.value_counts()
         assert result.dtype == ArrowDtype(pa.int64())
 
+    def test_value_counts_no_numpy_fallback(self, data, monkeypatch):
+        # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy
+        from pandas.core.arrays.arrow import ArrowExtensionArray
+
+        data = data[:10]
+        ser = pd.Series(data)
+
+        # Track if to_numpy was called
+        to_numpy_called = False
+        original_to_numpy = ArrowExtensionArray.to_numpy
+
+        def tracked_to_numpy(self, *args, **kwargs):
+            nonlocal to_numpy_called
+            to_numpy_called = True
+            return original_to_numpy(self, *args, **kwargs)
+
+        monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
+
+        ser.value_counts()
+        assert not to_numpy_called, "value_counts() should not call to_numpy()"
+
+        # Also test with normalize=True
+        to_numpy_called = False
+        ser.value_counts(normalize=True)
+        assert not to_numpy_called, (
+            "value_counts(normalize=True) should not call to_numpy()"
+        )
+
     _combine_le_expected_dtype = "bool[pyarrow]"
 
     def get_op_from_name(self, op_name):

From e9248b4a28d1a1437057758d0f00c21850bec5f5 Mon Sep 17 00:00:00 2001
From: Fangchen Li <fangchen.li@outlook.com>
Date: Tue, 16 Dec 2025 21:10:52 -0800
Subject: [PATCH 2/5] TST: move Arrow to_numpy tracking into a fixture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to the previous commit; only pandas/tests/extension/test_arrow.py
is touched and library code is unchanged.

Changes:
- Move the ArrowExtensionArray.to_numpy tracking logic out of
  test_value_counts_no_numpy_fallback into a reusable
  arrow_to_numpy_tracker fixture
- The fixture returns a callable that reports whether to_numpy was called
  since the last check and then resets the flag
- Replace the custom assertion messages with plain asserts

The test still checks that value_counts() and value_counts(normalize=True)
on an Arrow-backed Series do not call to_numpy().

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 pandas/tests/extension/test_arrow.py | 52 +++++++++++++++++-----------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 6e7f7fb472c4e..4db5d6ecbad49 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -270,6 +270,35 @@ def data_for_twos(data):
     # TODO: skip otherwise?
 
 
+@pytest.fixture
+def arrow_to_numpy_tracker(monkeypatch):
+    """
+    Fixture to track if ArrowExtensionArray.to_numpy() was called.
+
+    Returns a callable that returns True if to_numpy was called since the last check,
+    and resets the tracker.
+    """
+    from pandas.core.arrays.arrow import ArrowExtensionArray
+
+    called = False
+    original_to_numpy = ArrowExtensionArray.to_numpy
+
+    def tracked_to_numpy(self, *args, **kwargs):
+        nonlocal called
+        called = True
+        return original_to_numpy(self, *args, **kwargs)
+
+    monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
+
+    def was_called():
+        nonlocal called
+        result = called
+        called = False
+        return result
+
+    return was_called
+
+
 class TestArrowArray(base.ExtensionTests):
     def _construct_for_combine_add(self, left, right):
         dtype = left.dtype
@@ -787,33 +816,16 @@ def test_value_counts_returns_pyarrow_int64(self, data):
         result = data.value_counts()
         assert result.dtype == ArrowDtype(pa.int64())
 
-    def test_value_counts_no_numpy_fallback(self, data, monkeypatch):
+    def test_value_counts_no_numpy_fallback(self, data, arrow_to_numpy_tracker):
         # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy
-        from pandas.core.arrays.arrow import ArrowExtensionArray
-
         data = data[:10]
         ser = pd.Series(data)
 
-        # Track if to_numpy was called
-        to_numpy_called = False
-        original_to_numpy = ArrowExtensionArray.to_numpy
-
-        def tracked_to_numpy(self, *args, **kwargs):
-            nonlocal to_numpy_called
-            to_numpy_called = True
-            return original_to_numpy(self, *args, **kwargs)
-
-        monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
-
         ser.value_counts()
-        assert not to_numpy_called, "value_counts() should not call to_numpy()"
+        assert not arrow_to_numpy_tracker()
 
-        # Also test with normalize=True
-        to_numpy_called = False
         ser.value_counts(normalize=True)
-        assert not to_numpy_called, (
-            "value_counts(normalize=True) should not call to_numpy()"
-        )
+        assert not arrow_to_numpy_tracker()
 
     _combine_le_expected_dtype = "bool[pyarrow]"
 

From 4facd996e315f52950f892ca995a394776c67504 Mon Sep 17 00:00:00 2001
From: Fangchen Li <fangchen.li@outlook.com>
Date: Tue, 16 Dec 2025 21:24:40 -0800
Subject: [PATCH 3/5] TST: parametrize value_counts no-numpy-fallback test

---
 pandas/tests/extension/test_arrow.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 4db5d6ecbad49..137efb5caa25a 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -816,16 +816,18 @@ def test_value_counts_returns_pyarrow_int64(self, data):
         result = data.value_counts()
         assert result.dtype == ArrowDtype(pa.int64())
 
-    def test_value_counts_no_numpy_fallback(self, data, arrow_to_numpy_tracker):
+    @pytest.mark.parametrize("normalize", [False, True])
+    def test_value_counts_no_numpy_fallback(
+        self, data, normalize, arrow_to_numpy_tracker
+    ):
         # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy
         data = data[:10]
         ser = pd.Series(data)
 
-        ser.value_counts()
-        assert not arrow_to_numpy_tracker()
+        ser.value_counts(normalize=normalize)
 
-        ser.value_counts(normalize=True)
-        assert not arrow_to_numpy_tracker()
+        numpy_called = arrow_to_numpy_tracker()
+        assert not numpy_called
 
     _combine_le_expected_dtype = "bool[pyarrow]"
 

From 228919e9c5a931f88f622f942f67d50c350f31d8 Mon Sep 17 00:00:00 2001
From: Fangchen Li <fangchen.li@outlook.com>
Date: Tue, 16 Dec 2025 22:11:49 -0800
Subject: [PATCH 4/5] BUG: fix bins normalization in value_counts after
 refactoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

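The previous change to avoid unnecessary to_numpy conversion broke
normalization when bins is passed: it divided by result.sum(), the sum of
the binned counts. Bins normalization must divide by the total input
length, len(ii), regardless of dropna. Store that length in
normalize_denominator and fall back to result.sum() only when it is None.

Also drop the `bins is None` term from the DatetimeIndex/TimedeltaIndex
condition.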

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 pandas/core/algorithms.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index eb957851bb24e..55a3022454e02 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -897,7 +897,11 @@ def value_counts_internal(
         if dropna and (result._values == 0).all():
             result = result.iloc[0:0]
 
+        # normalizing is by len of all (regardless of dropna)
+        normalize_denominator = len(ii)
+
     else:
+        normalize_denominator = None
         if is_extension_array_dtype(values):
             # handle Categorical and sparse,
             result = Series(values, copy=False)._values.value_counts(dropna=dropna)
@@ -925,8 +929,7 @@ def value_counts_internal(
             idx = Index(keys, dtype=keys.dtype, name=index_name)
 
             if (
-                bins is None
-                and not sort
+                not sort
                 and isinstance(values, (DatetimeIndex, TimedeltaIndex))
                 and idx.equals(values)
                 and values.inferred_freq is not None
@@ -940,7 +943,10 @@ def value_counts_internal(
         result = result.sort_values(ascending=ascending, kind="stable")
 
     if normalize:
-        result = result / result.sum()
+        if normalize_denominator is not None:
+            result = result / normalize_denominator
+        else:
+            result = result / result.sum()
 
     return result
 

From 3a5a69b64db235eda287709ea35deabaae805676 Mon Sep 17 00:00:00 2001
From: Fangchen Li <fangchen.li@outlook.com>
Date: Wed, 17 Dec 2025 09:22:53 -0800
Subject: [PATCH 5/5] TST: remove value_counts no-numpy-fallback test and fixture

---
 pandas/tests/extension/test_arrow.py | 42 ----------------------------
 1 file changed, 42 deletions(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 137efb5caa25a..ba5d257bd59e4 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -270,35 +270,6 @@ def data_for_twos(data):
     # TODO: skip otherwise?
 
 
-@pytest.fixture
-def arrow_to_numpy_tracker(monkeypatch):
-    """
-    Fixture to track if ArrowExtensionArray.to_numpy() was called.
-
-    Returns a callable that returns True if to_numpy was called since the last check,
-    and resets the tracker.
-    """
-    from pandas.core.arrays.arrow import ArrowExtensionArray
-
-    called = False
-    original_to_numpy = ArrowExtensionArray.to_numpy
-
-    def tracked_to_numpy(self, *args, **kwargs):
-        nonlocal called
-        called = True
-        return original_to_numpy(self, *args, **kwargs)
-
-    monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
-
-    def was_called():
-        nonlocal called
-        result = called
-        called = False
-        return result
-
-    return was_called
-
-
 class TestArrowArray(base.ExtensionTests):
     def _construct_for_combine_add(self, left, right):
         dtype = left.dtype
@@ -816,19 +787,6 @@ def test_value_counts_returns_pyarrow_int64(self, data):
         result = data.value_counts()
         assert result.dtype == ArrowDtype(pa.int64())
 
-    @pytest.mark.parametrize("normalize", [False, True])
-    def test_value_counts_no_numpy_fallback(
-        self, data, normalize, arrow_to_numpy_tracker
-    ):
-        # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy
-        data = data[:10]
-        ser = pd.Series(data)
-
-        ser.value_counts(normalize=normalize)
-
-        numpy_called = arrow_to_numpy_tracker()
-        assert not numpy_called
-
     _combine_le_expected_dtype = "bool[pyarrow]"
 
     def get_op_from_name(self, op_name):