From 921fb7a1861443dbca811ff48b399256cdf849f3 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 11 Jul 2025 17:35:44 +0000 Subject: [PATCH 1/4] feat: _read_gbq_colab creates hybrid session --- bigframes/pandas/io/api.py | 2 ++ .../small/session/test_read_gbq_colab.py | 28 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 65435bd902..5ec3626c7a 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -345,6 +345,8 @@ def _read_gbq_colab( dry_run=True, ) _set_default_session_location_if_possible_deferred_query(create_query) + if not config.options.bigquery._session_started: + config.options.bigquery.enable_polars_execution = True return global_session.with_default_session( bigframes.session.Session._read_gbq_colab, diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index af78117262..a8e476525e 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -19,6 +19,9 @@ import pandas.testing import pytest +import bigframes +import bigframes.pandas + def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): # This query should return enough results to be too big to fit in a single @@ -59,6 +62,31 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi assert executions_after == executions_before_python == executions_before_sql + 1 +def test_read_gbq_colab_fresh_session_is_hybrid(): + bigframes.close_session() + df = bigframes.pandas._read_gbq_colab( + """ + SELECT + name, + SUM(number) AS total + FROM + `bigquery-public-data.usa_names.usa_1910_2013` + WHERE state LIKE 'W%' + GROUP BY name + ORDER BY total DESC + LIMIT 300 + """ + ) + session = df._session + executions_before_python = session._metrics.execution_count + result = df.sort_values("name").peek(100) + executions_after = session._metrics.execution_count + + assert len(result) == 100 + assert session._executor._enable_polars_execution is True # type: ignore + assert executions_after == executions_before_python == 1 + + def test_read_gbq_colab_peek_avoids_requery(maybe_ordered_session): executions_before_sql = maybe_ordered_session._metrics.execution_count df = maybe_ordered_session._read_gbq_colab( From dd29572cdc2e27208dcf2f9fb51db1d7d3e9c8df Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 11 Jul 2025 17:59:06 +0000 Subject: [PATCH 2/4] skip test if no polars --- tests/system/small/session/test_read_gbq_colab.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index a8e476525e..9ace2dbed7 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -22,6 +22,8 @@ import bigframes import bigframes.pandas +pytest.importorskip("polars") + def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): # This query should return enough results to be too big to fit in a single From 4c968b75f7427dfb3f7f95ff983669bae24136fa Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sun, 20 Jul 2025 23:30:44 +0000 Subject: [PATCH 3/4] disable hybrid engine unless flag set --- bigframes/pandas/io/api.py | 9 +++++++-- tests/system/small/session/test_read_gbq_colab.py | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 5ec3626c7a..308283def7 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -279,6 +279,7 @@ def _read_gbq_colab( # type: ignore[overload-overlap] *, pyformat_args: Optional[Dict[str, Any]] = ..., dry_run: Literal[False] = ..., + use_hybrid_engine: bool = ..., ) -> bigframes.dataframe.DataFrame: ... @@ -289,6 +290,7 @@ def _read_gbq_colab( *, pyformat_args: Optional[Dict[str, Any]] = ..., dry_run: Literal[True] = ..., + use_hybrid_engine: bool = ..., ) -> pandas.Series: ... @@ -298,6 +300,7 @@ def _read_gbq_colab( *, pyformat_args: Optional[Dict[str, Any]] = None, dry_run: bool = False, + use_hybrid_engine: bool = False, ) -> bigframes.dataframe.DataFrame | pandas.Series: """A Colab-specific version of read_gbq. @@ -312,7 +315,9 @@ def _read_gbq_colab( dry_run (bool): If True, estimates the query results size without returning data. The return will be a pandas Series with query metadata. - + use_hybrid_engine (bool): + If True, and session not started, new session started will use + hybrid execution which pushes some execution to local cpu. Returns: Union[bigframes.dataframe.DataFrame, pandas.Series]: A BigQuery DataFrame if `dry_run` is False, otherwise a pandas Series. @@ -345,7 +350,7 @@ def _read_gbq_colab( dry_run=True, ) _set_default_session_location_if_possible_deferred_query(create_query) - if not config.options.bigquery._session_started: + if use_hybrid_engine and not config.options.bigquery._session_started: config.options.bigquery.enable_polars_execution = True return global_session.with_default_session( diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 9ace2dbed7..6c425cba2c 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -77,7 +77,8 @@ def test_read_gbq_colab_fresh_session_is_hybrid(): GROUP BY name ORDER BY total DESC LIMIT 300 - """ + """, + use_hybrid_engine=True, ) session = df._session executions_before_python = session._metrics.execution_count From 02c80353cb588e228405db17c38593c09303634d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 23 Jul 2025 00:08:04 +0000 Subject: [PATCH 4/4] Revert "disable hybrid engine unless flag set" This reverts commit 4c968b75f7427dfb3f7f95ff983669bae24136fa. --- bigframes/pandas/io/api.py | 9 ++------- tests/system/small/session/test_read_gbq_colab.py | 3 +-- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 308283def7..5ec3626c7a 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -279,7 +279,6 @@ def _read_gbq_colab( # type: ignore[overload-overlap] *, pyformat_args: Optional[Dict[str, Any]] = ..., dry_run: Literal[False] = ..., - use_hybrid_engine: bool = ..., ) -> bigframes.dataframe.DataFrame: ... @@ -290,7 +289,6 @@ def _read_gbq_colab( *, pyformat_args: Optional[Dict[str, Any]] = ..., dry_run: Literal[True] = ..., - use_hybrid_engine: bool = ..., ) -> pandas.Series: ... @@ -300,7 +298,6 @@ def _read_gbq_colab( *, pyformat_args: Optional[Dict[str, Any]] = None, dry_run: bool = False, - use_hybrid_engine: bool = False, ) -> bigframes.dataframe.DataFrame | pandas.Series: """A Colab-specific version of read_gbq. @@ -315,9 +312,7 @@ def _read_gbq_colab( dry_run (bool): If True, estimates the query results size without returning data. The return will be a pandas Series with query metadata. - use_hybrid_engine (bool): - If True, and session not started, new session started will use - hybrid execution which pushes some execution to local cpu. + Returns: Union[bigframes.dataframe.DataFrame, pandas.Series]: A BigQuery DataFrame if `dry_run` is False, otherwise a pandas Series. @@ -350,7 +345,7 @@ def _read_gbq_colab( dry_run=True, ) _set_default_session_location_if_possible_deferred_query(create_query) - if use_hybrid_engine and not config.options.bigquery._session_started: + if not config.options.bigquery._session_started: config.options.bigquery.enable_polars_execution = True return global_session.with_default_session( diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 6c425cba2c..9ace2dbed7 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -77,8 +77,7 @@ def test_read_gbq_colab_fresh_session_is_hybrid(): GROUP BY name ORDER BY total DESC LIMIT 300 - """, - use_hybrid_engine=True, + """ ) session = df._session executions_before_python = session._metrics.execution_count