Skip to content

Commit 780c27c

Browse files
fangchenli and claude committed
PERF: avoid unnecessary to_numpy conversion in value_counts for Arrow arrays
Previously, value_counts_internal would convert Arrow array counts to NumPy just to call .sum() for normalization. This is unnecessary since Series.sum() works correctly for all backends.

Changes:
- Remove unnecessary np.asarray(counts) conversion for Arrow arrays
- Remove unused counts variable assignments from bins and MultiIndex branches
- Use result.sum() instead of counts.sum() for normalization

This eliminates a performance bottleneck where Arrow-backed Series would fall back to NumPy during value_counts operations.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent b43b95d commit 780c27c

File tree

2 files changed

+29
-12
lines changed

2 files changed

+29
-12
lines changed

pandas/core/algorithms.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -897,19 +897,12 @@ def value_counts_internal(
897897
if dropna and (result._values == 0).all():
898898
result = result.iloc[0:0]
899899

900-
# normalizing is by len of all (regardless of dropna)
901-
counts = np.array([len(ii)])
902-
903900
else:
904901
if is_extension_array_dtype(values):
905902
# handle Categorical and sparse,
906903
result = Series(values, copy=False)._values.value_counts(dropna=dropna)
907904
result.name = name
908905
result.index.name = index_name
909-
counts = result._values
910-
if not isinstance(counts, np.ndarray):
911-
# e.g. ArrowExtensionArray
912-
counts = np.asarray(counts)
913906

914907
elif isinstance(values, ABCMultiIndex):
915908
# GH49558
@@ -920,10 +913,6 @@ def value_counts_internal(
920913
.size()
921914
)
922915
result.index.names = values.names
923-
# error: Incompatible types in assignment (expression has type
924-
# "ndarray[Any, Any] | DatetimeArray | TimedeltaArray | PeriodArray | Any",
925-
# variable has type "ndarray[tuple[int, ...], dtype[Any]]")
926-
counts = result._values # type: ignore[assignment]
927916

928917
else:
929918
values = _ensure_arraylike(values, func_name="value_counts")
@@ -951,7 +940,7 @@ def value_counts_internal(
951940
result = result.sort_values(ascending=ascending, kind="stable")
952941

953942
if normalize:
954-
result = result / counts.sum()
943+
result = result / result.sum()
955944

956945
return result
957946

pandas/tests/extension/test_arrow.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,34 @@ def test_value_counts_returns_pyarrow_int64(self, data):
787787
result = data.value_counts()
788788
assert result.dtype == ArrowDtype(pa.int64())
789789

790+
def test_value_counts_no_numpy_fallback(self, data, monkeypatch):
791+
# Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy
792+
from pandas.core.arrays.arrow import ArrowExtensionArray
793+
794+
data = data[:10]
795+
ser = pd.Series(data)
796+
797+
# Track if to_numpy was called
798+
to_numpy_called = False
799+
original_to_numpy = ArrowExtensionArray.to_numpy
800+
801+
def tracked_to_numpy(self, *args, **kwargs):
802+
nonlocal to_numpy_called
803+
to_numpy_called = True
804+
return original_to_numpy(self, *args, **kwargs)
805+
806+
monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
807+
808+
ser.value_counts()
809+
assert not to_numpy_called, "value_counts() should not call to_numpy()"
810+
811+
# Also test with normalize=True
812+
to_numpy_called = False
813+
ser.value_counts(normalize=True)
814+
assert not to_numpy_called, (
815+
"value_counts(normalize=True) should not call to_numpy()"
816+
)
817+
790818
_combine_le_expected_dtype = "bool[pyarrow]"
791819

792820
def get_op_from_name(self, op_name):

0 commit comments

Comments
 (0)