Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit 952e446

Browse files
feat: Add value_counts to GroupBy classes
1 parent d17b711 commit 952e446

8 files changed

Lines changed: 271 additions & 11 deletions

File tree

bigframes/core/block_transforms.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -355,24 +355,28 @@ def value_counts(
355355
normalize: bool = False,
356356
sort: bool = True,
357357
ascending: bool = False,
358-
dropna: bool = True,
358+
drop_na: bool = True,
359+
grouping_keys: typing.Sequence[str] = (),
359360
):
360-
block, dummy = block.create_constant(1)
361+
if grouping_keys and drop_na:
362+
# only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us
363+
block = dropna(block, columns, how="any")
361364
block, agg_ids = block.aggregate(
362-
by_column_ids=columns,
363-
aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))],
364-
dropna=dropna,
365+
by_column_ids=(*grouping_keys, *columns),
366+
aggregations=[ex.NullaryAggregation(agg_ops.size_op)],
367+
dropna=drop_na and not grouping_keys,
365368
)
366369
count_id = agg_ids[0]
367370
if normalize:
368-
unbound_window = windows.unbound()
371+
unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys))
369372
block, total_count_id = block.apply_window_op(
370373
count_id, agg_ops.sum_op, unbound_window
371374
)
372375
block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op)
373376

374377
if sort:
375-
block = block.order_by(
378+
order_parts = [ordering.ascending_over(id) for id in grouping_keys]
379+
order_parts.extend(
376380
[
377381
ordering.OrderingExpression(
378382
ex.deref(count_id),
@@ -382,6 +386,7 @@ def value_counts(
382386
)
383387
]
384388
)
389+
block = block.order_by(order_parts)
385390
return block.select_column(count_id).with_column_labels(
386391
["proportion" if normalize else "count"]
387392
)

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import datetime
1818
import typing
19-
from typing import Literal, Sequence, Tuple, Union
19+
from typing import Literal, Optional, Sequence, Tuple, Union
2020

2121
import bigframes_vendored.constants as constants
2222
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -330,6 +330,39 @@ def diff(self, periods=1) -> series.Series:
330330
)
331331
return self._apply_window_op(agg_ops.DiffOp(periods), window=window)
332332

333+
def value_counts(
334+
self,
335+
subset: Optional[Sequence[blocks.Label]] = None,
336+
normalize: bool = False,
337+
sort: bool = True,
338+
ascending: bool = False,
339+
dropna: bool = True,
340+
) -> Union[df.DataFrame, series.Series]:
341+
if subset is None:
342+
columns = self._selected_cols
343+
else:
344+
columns = [
345+
column
346+
for column in self._block.value_columns
347+
if self._block.col_id_to_label[column] in subset
348+
]
349+
block = self._block
350+
if self._dropna: # this drops null grouping columns
351+
block = block_ops.dropna(block, self._by_col_ids)
352+
block = block_ops.value_counts(
353+
block,
354+
columns,
355+
normalize=normalize,
356+
sort=sort,
357+
ascending=ascending,
358+
drop_na=dropna, # this drops null value columns
359+
grouping_keys=self._by_col_ids,
360+
)
361+
if self._as_index:
362+
return series.Series(block)
363+
else:
364+
return series.Series(block).to_frame().reset_index(drop=False)
365+
333366
@validations.requires_ordering()
334367
def rolling(
335368
self,

bigframes/core/groupby/series_group_by.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:
195195

196196
aggregate = agg
197197

198+
def value_counts(
199+
self,
200+
normalize: bool = False,
201+
sort: bool = True,
202+
ascending: bool = False,
203+
dropna: bool = True,
204+
) -> Union[df.DataFrame, series.Series]:
205+
columns = [self._value_column]
206+
block = self._block
207+
if self._dropna: # this drops null grouping columns
208+
block = block_ops.dropna(block, self._by_col_ids)
209+
block = block_ops.value_counts(
210+
block,
211+
columns,
212+
normalize=normalize,
213+
sort=sort,
214+
ascending=ascending,
215+
drop_na=dropna, # this drops null value columns
216+
grouping_keys=self._by_col_ids,
217+
)
218+
# TODO: once as_index=False supported, return DataFrame instead by resetting index
219+
# with .to_frame().reset_index(drop=False)
220+
return series.Series(block)
221+
198222
@validations.requires_ordering()
199223
def cumsum(self, *args, **kwargs) -> series.Series:
200224
return self._apply_window_op(

bigframes/core/indexes/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ def value_counts(
489489
self._block.index_columns,
490490
normalize=normalize,
491491
ascending=ascending,
492-
dropna=dropna,
492+
drop_na=dropna,
493493
)
494494
import bigframes.series as series
495495

bigframes/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2475,7 +2475,7 @@ def value_counts(
24752475
normalize=normalize,
24762476
sort=sort,
24772477
ascending=ascending,
2478-
dropna=dropna,
2478+
drop_na=dropna,
24792479
)
24802480
return bigframes.series.Series(block)
24812481

bigframes/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1631,7 +1631,7 @@ def value_counts(
16311631
[self._value_column],
16321632
normalize=normalize,
16331633
ascending=ascending,
1634-
dropna=dropna,
1634+
drop_na=dropna,
16351635
)
16361636
return Series(block)
16371637

tests/system/small/test_groupby.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,46 @@ def test_dataframe_groupby_nonnumeric_with_mean():
582582
)
583583

584584

585+
@pytest.mark.parametrize(
586+
("subset", "normalize", "ascending", "dropna", "as_index"),
587+
[
588+
(None, True, True, True, True),
589+
(["int64_too", "int64_col"], False, False, False, False),
590+
],
591+
)
592+
def test_dataframe_groupby_value_counts(
593+
scalars_df_index,
594+
scalars_pandas_df_index,
595+
subset,
596+
normalize,
597+
ascending,
598+
dropna,
599+
as_index,
600+
):
601+
col_names = ["float64_col", "int64_col", "bool_col", "int64_too"]
602+
bf_result = (
603+
scalars_df_index[col_names]
604+
.groupby("bool_col", as_index=as_index)
605+
.value_counts(
606+
subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
607+
)
608+
.to_pandas()
609+
)
610+
pd_result = (
611+
scalars_pandas_df_index[col_names]
612+
.groupby("bool_col", as_index=as_index)
613+
.value_counts(
614+
subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
615+
)
616+
)
617+
618+
if as_index:
619+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
620+
else:
621+
pd_result.index = pd_result.index.astype("Int64")
622+
pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
623+
624+
585625
# ==============
586626
# Series.groupby
587627
# ==============
@@ -768,3 +808,36 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q):
768808
pd.testing.assert_series_equal(
769809
pd_result, bf_result, check_dtype=False, check_index_type=False
770810
)
811+
812+
813+
@pytest.mark.parametrize(
814+
("normalize", "ascending", "dropna"),
815+
[
816+
(
817+
True,
818+
True,
819+
True,
820+
),
821+
(
822+
False,
823+
False,
824+
False,
825+
),
826+
],
827+
)
828+
def test_series_groupby_value_counts(
829+
scalars_df_index,
830+
scalars_pandas_df_index,
831+
normalize,
832+
ascending,
833+
dropna,
834+
):
835+
bf_result = (
836+
scalars_df_index.groupby("bool_col")["string_col"]
837+
.value_counts(normalize=normalize, ascending=ascending, dropna=dropna)
838+
.to_pandas()
839+
)
840+
pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts(
841+
normalize=normalize, ascending=ascending, dropna=dropna
842+
)
843+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)

third_party/bigframes_vendored/pandas/core/groupby/__init__.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1256,6 +1256,32 @@ def nunique(self):
12561256
"""
12571257
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
12581258

1259+
def value_counts(
1260+
self,
1261+
normalize: bool = False,
1262+
sort: bool = True,
1263+
ascending: bool = False,
1264+
dropna: bool = True,
1265+
):
1266+
"""
1267+
Return a Series or DataFrame containing counts of unique rows.
1268+
1269+
Args:
1270+
normalize (bool, default False):
1271+
Return proportions rather than frequencies.
1272+
sort (bool, default True):
1273+
Sort by frequencies.
1274+
ascending (bool, default False):
1275+
Sort in ascending order.
1276+
dropna (bool, default True):
1277+
Don't include counts of rows that contain NA values.
1278+
1279+
Returns:
1280+
Series or DataFrame:
1281+
Series if the groupby as_index is True, otherwise DataFrame.
1282+
"""
1283+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1284+
12591285

12601286
class DataFrameGroupBy(GroupBy):
12611287
def agg(self, func, **kwargs):
@@ -1406,3 +1432,102 @@ def nunique(self):
14061432
Number of unique values within a BigQuery DataFrame.
14071433
"""
14081434
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1435+
1436+
def value_counts(
1437+
self,
1438+
subset=None,
1439+
normalize: bool = False,
1440+
sort: bool = True,
1441+
ascending: bool = False,
1442+
dropna: bool = True,
1443+
):
1444+
"""
1445+
Return a Series or DataFrame containing counts of unique rows.
1446+
1447+
**Examples:**
1448+
1449+
>>> import bigframes.pandas as bpd
1450+
>>> import numpy as np
1451+
>>> bpd.options.display.progress_bar = None
1452+
1453+
>>> df = bpd.DataFrame({
1454+
... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
1455+
... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
1456+
... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
1457+
... })
1458+
1459+
>>> df
1460+
gender education country
1461+
0 male low US
1462+
1 male medium FR
1463+
2 female high US
1464+
3 male low FR
1465+
4 female high FR
1466+
5 male low FR
1467+
<BLANKLINE>
1468+
[6 rows x 3 columns]
1469+
1470+
>>> df.groupby('gender').value_counts()
1471+
gender education country
1472+
female high FR 1
1473+
US 1
1474+
male low FR 2
1475+
US 1
1476+
medium FR 1
1477+
Name: count, dtype: Int64
1478+
1479+
>>> df.groupby('gender').value_counts(ascending=True)
1480+
gender education country
1481+
female high FR 1
1482+
US 1
1483+
male low US 1
1484+
medium FR 1
1485+
low FR 2
1486+
Name: count, dtype: Int64
1487+
1488+
>>> df.groupby('gender').value_counts(normalize=True)
1489+
gender education country
1490+
female high FR 0.5
1491+
US 0.5
1492+
male low FR 0.5
1493+
US 0.25
1494+
medium FR 0.25
1495+
Name: proportion, dtype: Float64
1496+
1497+
>>> df.groupby('gender', as_index=False).value_counts()
1498+
gender education country count
1499+
0 female high FR 1
1500+
1 female high US 1
1501+
2 male low FR 2
1502+
3 male low US 1
1503+
4 male medium FR 1
1504+
<BLANKLINE>
1505+
[5 rows x 4 columns]
1506+
1507+
>>> df.groupby('gender', as_index=False).value_counts(normalize=True)
1508+
gender education country proportion
1509+
0 female high FR 0.5
1510+
1 female high US 0.5
1511+
2 male low FR 0.5
1512+
3 male low US 0.25
1513+
4 male medium FR 0.25
1514+
<BLANKLINE>
1515+
[5 rows x 4 columns]
1516+
1517+
Args:
1518+
subset (list-like, optional):
1519+
Columns to use when counting unique combinations.
1520+
normalize (bool, default False):
1521+
Return proportions rather than frequencies.
1522+
sort (bool, default True):
1523+
Sort by frequencies.
1524+
ascending (bool, default False):
1525+
Sort in ascending order.
1526+
dropna (bool, default True):
1527+
Don't include counts of rows that contain NA values.
1528+
1529+
Returns:
1530+
Series or DataFrame:
1531+
Series if the groupby as_index is True, otherwise DataFrame.
1532+
"""
1533+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments
 (0)