PERF: avoid unnecessary to_numpy conversion in value_counts for Arrow arrays

fangchenli · claude · fangchenli · commit 780c27c4ee1c · 2025-12-16T21:10:52.000-08:00
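Previously, value_counts_internal would convert Arrow array counts to NumPy just to call .sum() for normalization. This is unnecessary since Series.sum() works correctly for all backends.

Changes:
- Remove the unnecessary np.asarray(counts) conversion for Arrow arrays
- Remove the counts variable assignments from the bins and MultiIndex branches (they are no longer read once normalization uses result.sum())
- Use result.sum() instead of counts.sum() for normalization

Note: in the bins branch the denominator was previously len(ii), i.e. the number of all values regardless of dropna. With result.sum() it becomes the sum of the counts retained in result, which may differ when some values are not represented in result (e.g. missing values); confirm this change is intended.

This eliminates a performance bottleneck where Arrow-backed Series would fall back to NumPy during value_counts operations.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>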
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -897,19 +897,12 @@ def value_counts_internal(
         if dropna and (result._values == 0).all():
             result = result.iloc[0:0]
 
-        # normalizing is by len of all (regardless of dropna)
-        counts = np.array([len(ii)])
-
     else:
         if is_extension_array_dtype(values):
             # handle Categorical and sparse,
             result = Series(values, copy=False)._values.value_counts(dropna=dropna)
             result.name = name
             result.index.name = index_name
-            counts = result._values
-            if not isinstance(counts, np.ndarray):
-                # e.g. ArrowExtensionArray
-                counts = np.asarray(counts)
 
         elif isinstance(values, ABCMultiIndex):
             # GH49558
@@ -920,10 +913,6 @@ def value_counts_internal(
                 .size()
             )
             result.index.names = values.names
-            # error: Incompatible types in assignment (expression has type
-            # "ndarray[Any, Any] | DatetimeArray | TimedeltaArray | PeriodArray | Any",
-            # variable has type "ndarray[tuple[int, ...], dtype[Any]]")
-            counts = result._values  # type: ignore[assignment]
 
         else:
             values = _ensure_arraylike(values, func_name="value_counts")
@@ -951,7 +940,7 @@ def value_counts_internal(
         result = result.sort_values(ascending=ascending, kind="stable")
 
     if normalize:
-        result = result / counts.sum()
+        result = result / result.sum()
 
     return result
 
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -787,6 +787,34 @@ def test_value_counts_returns_pyarrow_int64(self, data):
         result = data.value_counts()
         assert result.dtype == ArrowDtype(pa.int64())
 
+    def test_value_counts_no_numpy_fallback(self, data, monkeypatch):
+        # Guard against value_counts converting Arrow-backed data to NumPy
+        from pandas.core.arrays.arrow import ArrowExtensionArray
+
+        data = data[:10]
+        ser = pd.Series(data)
+
+        # Wrap to_numpy so any call made during value_counts() sets the flag
+        to_numpy_called = False
+        original_to_numpy = ArrowExtensionArray.to_numpy
+
+        def tracked_to_numpy(self, *args, **kwargs):
+            nonlocal to_numpy_called
+            to_numpy_called = True
+            return original_to_numpy(self, *args, **kwargs)
+
+        monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
+
+        ser.value_counts()
+        assert not to_numpy_called, "value_counts() should not call to_numpy()"
+
+        # Reset the flag and repeat with normalize=True
+        to_numpy_called = False
+        ser.value_counts(normalize=True)
+        assert not to_numpy_called, (
+            "value_counts(normalize=True) should not call to_numpy()"
+        )
+
     _combine_le_expected_dtype = "bool[pyarrow]"
 
     def get_op_from_name(self, op_name):