Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit 952e446

Browse files
feat: Add value_counts to GroupBy classes
1 parent d17b711 commit 952e446

8 files changed

Lines changed: 271 additions & 11 deletions

File tree

bigframes/core/block_transforms.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -355,24 +355,28 @@ def value_counts(
355355
normalize: bool = False,
356356
sort: bool = True,
357357
ascending: bool = False,
358-
dropna: bool = True,
358+
drop_na: bool = True,
359+
grouping_keys: typing.Sequence[str] = (),
359360
):
360-
block, dummy = block.create_constant(1)
361+
if grouping_keys and drop_na:
362+
# only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us
363+
block = dropna(block, columns, how="any")
361364
block, agg_ids = block.aggregate(
362-
by_column_ids=columns,
363-
aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))],
364-
dropna=dropna,
365+
by_column_ids=(*grouping_keys, *columns),
366+
aggregations=[ex.NullaryAggregation(agg_ops.size_op)],
367+
dropna=drop_na and not grouping_keys,
365368
)
366369
count_id = agg_ids[0]
367370
if normalize:
368-
unbound_window = windows.unbound()
371+
unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys))
369372
block, total_count_id = block.apply_window_op(
370373
count_id, agg_ops.sum_op, unbound_window
371374
)
372375
block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op)
373376

374377
if sort:
375-
block = block.order_by(
378+
order_parts = [ordering.ascending_over(id) for id in grouping_keys]
379+
order_parts.extend(
376380
[
377381
ordering.OrderingExpression(
378382
ex.deref(count_id),
@@ -382,6 +386,7 @@ def value_counts(
382386
)
383387
]
384388
)
389+
block = block.order_by(order_parts)
385390
return block.select_column(count_id).with_column_labels(
386391
["proportion" if normalize else "count"]
387392
)

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import datetime
1818
import typing
19-
from typing import Literal, Sequence, Tuple, Union
19+
from typing import Literal, Optional, Sequence, Tuple, Union
2020

2121
import bigframes_vendored.constants as constants
2222
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -330,6 +330,39 @@ def diff(self, periods=1) -> series.Series:
330330
)
331331
return self._apply_window_op(agg_ops.DiffOp(periods), window=window)
332332

333+
def value_counts(
334+
self,
335+
subset: Optional[Sequence[blocks.Label]] = None,
336+
normalize: bool = False,
337+
sort: bool = True,
338+
ascending: bool = False,
339+
dropna: bool = True,
340+
) -> Union[df.DataFrame, series.Series]:
341+
if subset is None:
342+
columns = self._selected_cols
343+
else:
344+
columns = [
345+
column
346+
for column in self._block.value_columns
347+
if self._block.col_id_to_label[column] in subset
348+
]
349+
block = self._block
350+
if self._dropna: # this drops null grouping columns
351+
block = block_ops.dropna(block, self._by_col_ids)
352+
block = block_ops.value_counts(
353+
block,
354+
columns,
355+
normalize=normalize,
356+
sort=sort,
357+
ascending=ascending,
358+
drop_na=dropna, # this drops null value columns
359+
grouping_keys=self._by_col_ids,
360+
)
361+
if self._as_index:
362+
return series.Series(block)
363+
else:
364+
return series.Series(block).to_frame().reset_index(drop=False)
365+
333366
@validations.requires_ordering()
334367
def rolling(
335368
self,

bigframes/core/groupby/series_group_by.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:
195195

196196
aggregate = agg
197197

198+
def value_counts(
199+
self,
200+
normalize: bool = False,
201+
sort: bool = True,
202+
ascending: bool = False,
203+
dropna: bool = True,
204+
) -> Union[df.DataFrame, series.Series]:
205+
columns = [self._value_column]
206+
block = self._block
207+
if self._dropna: # this drops null grouping columns
208+
block = block_ops.dropna(block, self._by_col_ids)
209+
block = block_ops.value_counts(
210+
block,
211+
columns,
212+
normalize=normalize,
213+
sort=sort,
214+
ascending=ascending,
215+
drop_na=dropna, # this drops null value columns
216+
grouping_keys=self._by_col_ids,
217+
)
218+
# TODO: once as_index=False supported, return DataFrame instead by resetting index
219+
# with .to_frame().reset_index(drop=False)
220+
return series.Series(block)
221+
198222
@validations.requires_ordering()
199223
def cumsum(self, *args, **kwargs) -> series.Series:
200224
return self._apply_window_op(

bigframes/core/indexes/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ def value_counts(
489489
self._block.index_columns,
490490
normalize=normalize,
491491
ascending=ascending,
492-
dropna=dropna,
492+
drop_na=dropna,
493493
)
494494
import bigframes.series as series
495495

bigframes/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2475,7 +2475,7 @@ def value_counts(
24752475
normalize=normalize,
24762476
sort=sort,
24772477
ascending=ascending,
2478-
dropna=dropna,
2478+
drop_na=dropna,
24792479
)
24802480
return bigframes.series.Series(block)
24812481

bigframes/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1631,7 +1631,7 @@ def value_counts(
16311631
[self._value_column],
16321632
normalize=normalize,
16331633
ascending=ascending,
1634-
dropna=dropna,
1634+
drop_na=dropna,
16351635
)
16361636
return Series(block)
16371637

tests/system/small/test_groupby.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,46 @@ def test_dataframe_groupby_nonnumeric_with_mean():
582582
)
583583

584584

585+
@pytest.mark.parametrize(
586+
("subset", "normalize", "ascending", "dropna", "as_index"),
587+
[
588+
(None, True, True, True, True),
589+
(["int64_too", "int64_col"], False, False, False, False),
590+
],
591+
)
592+
def test_dataframe_groupby_value_counts(
593+
scalars_df_index,
594+
scalars_pandas_df_index,
595+
subset,
596+
normalize,
597+
ascending,
598+
dropna,
599+
as_index,
600+
):
601+
col_names = ["float64_col", "int64_col", "bool_col", "int64_too"]
602+
bf_result = (
603+
scalars_df_index[col_names]
604+
.groupby("bool_col", as_index=as_index)
605+
.value_counts(
606+
subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
607+
)
608+
.to_pandas()
609+
)
610+
pd_result = (
611+
scalars_pandas_df_index[col_names]
612+
.groupby("bool_col", as_index=as_index)
613+
.value_counts(
614+
subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
615+
)
616+
)
617+
618+
if as_index:
619+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
620+
else:
621+
pd_result.index = pd_result.index.astype("Int64")
622+
pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
623+
624+
585625
# ==============
586626
# Series.groupby
587627
# ==============
@@ -768,3 +808,36 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q):
768808
pd.testing.assert_series_equal(
769809
pd_result, bf_result, check_dtype=False, check_index_type=False
770810
)
811+
812+
813+
@pytest.mark.parametrize(
814+
("normalize", "ascending", "dropna"),
815+
[
816+
(
817+
True,
818+
True,
819+
True,
820+
),
821+
(
822+
False,
823+
False,
824+
False,
825+
),
826+
],
827+
)
828+
def test_series_groupby_value_counts(
829+
scalars_df_index,
830+
scalars_pandas_df_index,
831+
normalize,
832+
ascending,
833+
dropna,
834+
):
835+
bf_result = (
836+
scalars_df_index.groupby("bool_col")["string_col"]
837+
.value_counts(normalize=normalize, ascending=ascending, dropna=dropna)
838+
.to_pandas()
839+
)
840+
pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts(
841+
normalize=normalize, ascending=ascending, dropna=dropna
842+
)
843+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)

third_party/bigframes_vendored/pandas/core/groupby/__init__.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1256,6 +1256,32 @@ def nunique(self):
12561256
"""
12571257
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
12581258

1259+
def value_counts(
1260+
self,
1261+
normalize: bool = False,
1262+
sort: bool = True,
1263+
ascending: bool = False,
1264+
dropna: bool = True,
1265+
):
1266+
"""
1267+
Return a Series or DataFrame containing counts of unique rows.
1268+
1269+
Args:
1270+
normalize (bool, default False):
1271+
Return proportions rather than frequencies.
1272+
sort (bool, default True):
1273+
Sort by frequencies.
1274+
ascending (bool, default False):
1275+
Sort in ascending order.
1276+
dropna (bool, default True):
1277+
Don't include counts of rows that contain NA values.
1278+
1279+
Returns:
1280+
Series or DataFrame:
1281+
Series if the groupby as_index is True, otherwise DataFrame.
1282+
"""
1283+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1284+
12591285

12601286
class DataFrameGroupBy(GroupBy):
12611287
def agg(self, func, **kwargs):
@@ -1406,3 +1432,102 @@ def nunique(self):
14061432
Number of unique values within a BigQuery DataFrame.
14071433
"""
14081434
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1435+
1436+
def value_counts(
1437+
self,
1438+
subset=None,
1439+
normalize: bool = False,
1440+
sort: bool = True,
1441+
ascending: bool = False,
1442+
dropna: bool = True,
1443+
):
1444+
"""
1445+
Return a Series or DataFrame containing counts of unique rows.
1446+
1447+
**Examples:**
1448+
1449+
>>> import bigframes.pandas as bpd
1450+
>>> import numpy as np
1451+
>>> bpd.options.display.progress_bar = None
1452+
1453+
>>> df = bpd.DataFrame({
1454+
... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
1455+
... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
1456+
... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
1457+
... })
1458+
1459+
>>> df
1460+
gender education country
1461+
0 male low US
1462+
1 male medium FR
1463+
2 female high US
1464+
3 male low FR
1465+
4 female high FR
1466+
5 male low FR
1467+
<BLANKLINE>
1468+
[6 rows x 3 columns]
1469+
1470+
>>> df.groupby('gender').value_counts()
1471+
gender education country
1472+
female high FR 1
1473+
US 1
1474+
male low FR 2
1475+
US 1
1476+
medium FR 1
1477+
Name: count, dtype: Int64
1478+
1479+
>>> df.groupby('gender').value_counts(ascending=True)
1480+
gender education country
1481+
female high FR 1
1482+
US 1
1483+
male low US 1
1484+
medium FR 1
1485+
low FR 2
1486+
Name: count, dtype: Int64
1487+
1488+
>>> df.groupby('gender').value_counts(normalize=True)
1489+
gender education country
1490+
female high FR 0.5
1491+
US 0.5
1492+
male low FR 0.5
1493+
US 0.25
1494+
medium FR 0.25
1495+
Name: proportion, dtype: Float64
1496+
1497+
>>> df.groupby('gender', as_index=False).value_counts()
1498+
gender education country count
1499+
0 female high FR 1
1500+
1 female high US 1
1501+
2 male low FR 2
1502+
3 male low US 1
1503+
4 male medium FR 1
1504+
<BLANKLINE>
1505+
[5 rows x 4 columns]
1506+
1507+
>>> df.groupby('gender', as_index=False).value_counts(normalize=True)
1508+
gender education country proportion
1509+
0 female high FR 0.5
1510+
1 female high US 0.5
1511+
2 male low FR 0.5
1512+
3 male low US 0.25
1513+
4 male medium FR 0.25
1514+
<BLANKLINE>
1515+
[5 rows x 4 columns]
1516+
1517+
Args:
1518+
subset (list-like, optional):
1519+
Columns to use when counting unique combinations.
1520+
normalize (bool, default False):
1521+
Return proportions rather than frequencies.
1522+
sort (bool, default True):
1523+
Sort by frequencies.
1524+
ascending (bool, default False):
1525+
Sort in ascending order.
1526+
dropna (bool, default True):
1527+
Don't include counts of rows that contain NA values.
1528+
1529+
Returns:
1530+
Series or DataFrame:
1531+
Series if the groupby as_index is True, otherwise DataFrame.
1532+
"""
1533+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments
 (0)