19 changes: 7 additions & 12 deletions pandas/core/algorithms.py
@@ -898,18 +898,15 @@ def value_counts_internal(
             result = result.iloc[0:0]

         # normalizing is by len of all (regardless of dropna)
-        counts = np.array([len(ii)])
+        normalize_denominator = len(ii)

     else:
+        normalize_denominator = None
         if is_extension_array_dtype(values):
             # handle Categorical and sparse,
             result = Series(values, copy=False)._values.value_counts(dropna=dropna)
             result.name = name
             result.index.name = index_name
-            counts = result._values
-            if not isinstance(counts, np.ndarray):
-                # e.g. ArrowExtensionArray
-                counts = np.asarray(counts)

         elif isinstance(values, ABCMultiIndex):
             # GH49558
@@ -920,10 +917,6 @@ def value_counts_internal(
                 .size()
             )
             result.index.names = values.names
-            # error: Incompatible types in assignment (expression has type
-            # "ndarray[Any, Any] | DatetimeArray | TimedeltaArray | PeriodArray | Any",
-            # variable has type "ndarray[tuple[int, ...], dtype[Any]]")
-            counts = result._values  # type: ignore[assignment]

         else:
             values = _ensure_arraylike(values, func_name="value_counts")
@@ -936,8 +929,7 @@ def value_counts_internal(
             idx = Index(keys, dtype=keys.dtype, name=index_name)

             if (
-                bins is None
-                and not sort
+                not sort
                 and isinstance(values, (DatetimeIndex, TimedeltaIndex))
                 and idx.equals(values)
                 and values.inferred_freq is not None
@@ -951,7 +943,10 @@ def value_counts_internal(
         result = result.sort_values(ascending=ascending, kind="stable")

     if normalize:
-        result = result / counts.sum()
+        if normalize_denominator is not None:
+            result = result / normalize_denominator
+        else:
+            result = result / result.sum()

     return result

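Net effect of the algorithms.py change: the intermediate NumPy `counts` array is gone. The `bins` path keeps its denominator in `normalize_denominator`, and every other path normalizes with `result.sum()`, so extension-backed results (e.g. ArrowExtensionArray) no longer pass through `np.asarray`. Below is a minimal usage sketch of the behaviour this enables, assuming pyarrow is installed so the `int64[pyarrow]` dtype is available; the sample values are illustrative and not part of the PR.

import pandas as pd

# Arrow-backed Series; value_counts should stay in Arrow memory throughout.
ser = pd.Series([1, 1, 2, 2, None], dtype="int64[pyarrow]")

counts = ser.value_counts()               # counts per unique value
freqs = ser.value_counts(normalize=True)  # counts divided by their total

print(counts)
print(freqs)
assert float(freqs.sum()) == 1.0          # [0.5, 0.5] sums exactly to one
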
42 changes: 42 additions & 0 deletions pandas/tests/extension/test_arrow.py
@@ -270,6 +270,35 @@ def data_for_twos(data):
     # TODO: skip otherwise?


+@pytest.fixture
+def arrow_to_numpy_tracker(monkeypatch):
+    """
+    Fixture to track if ArrowExtensionArray.to_numpy() was called.
+
+    Returns a callable that returns True if to_numpy was called since the last check,
+    and resets the tracker.
+    """
+    from pandas.core.arrays.arrow import ArrowExtensionArray
+
+    called = False
+    original_to_numpy = ArrowExtensionArray.to_numpy
+
+    def tracked_to_numpy(self, *args, **kwargs):
+        nonlocal called
+        called = True
+        return original_to_numpy(self, *args, **kwargs)
+
+    monkeypatch.setattr(ArrowExtensionArray, "to_numpy", tracked_to_numpy)
+
+    def was_called():
+        nonlocal called
+        result = called
+        called = False
+        return result
+
+    return was_called
+
+
 class TestArrowArray(base.ExtensionTests):
     def _construct_for_combine_add(self, left, right):
         dtype = left.dtype
@@ -787,6 +816,19 @@ def test_value_counts_returns_pyarrow_int64(self, data):
         result = data.value_counts()
         assert result.dtype == ArrowDtype(pa.int64())

+    @pytest.mark.parametrize("normalize", [False, True])
+    def test_value_counts_no_numpy_fallback(
+        self, data, normalize, arrow_to_numpy_tracker
+    ):
+        # Ensure value_counts doesn't unnecessarily convert Arrow arrays to NumPy
+        data = data[:10]
+        ser = pd.Series(data)
+
+        ser.value_counts(normalize=normalize)
+
+        numpy_called = arrow_to_numpy_tracker()
+        assert not numpy_called
+
     _combine_le_expected_dtype = "bool[pyarrow]"

     def get_op_from_name(self, op_name):
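
To see what the `arrow_to_numpy_tracker` fixture verifies without the pytest machinery, here is a standalone sketch of the same call-tracking idea: wrap `ArrowExtensionArray.to_numpy` by hand, run `value_counts(normalize=True)`, and confirm the wrapper never fired. The manual patching and the sample Series are illustrative only and are not part of the PR.

import pandas as pd
from pandas.core.arrays.arrow import ArrowExtensionArray

calls = []
original_to_numpy = ArrowExtensionArray.to_numpy

def tracked_to_numpy(self, *args, **kwargs):
    # Record that a NumPy conversion happened, then defer to the real method.
    calls.append(True)
    return original_to_numpy(self, *args, **kwargs)

ArrowExtensionArray.to_numpy = tracked_to_numpy
try:
    pd.Series([1, 1, 2], dtype="int64[pyarrow]").value_counts(normalize=True)
finally:
    # Always restore the original method, even if value_counts raises.
    ArrowExtensionArray.to_numpy = original_to_numpy

assert not calls, "value_counts fell back to a NumPy conversion"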