
Commit 03b4426

fix: return a DataFrame containing query stats for all non-SELECT statements
1 parent b0ff718 commit 03b4426

2 files changed: +79 −20 lines changed


bigframes/session/loader.py

Lines changed: 22 additions & 8 deletions
@@ -42,6 +42,7 @@
 from google.cloud import bigquery_storage_v1
 import google.cloud.bigquery
 import google.cloud.bigquery as bigquery
+import google.cloud.bigquery.table
 from google.cloud.bigquery_storage_v1 import types as bq_storage_types
 import pandas
 import pyarrow as pa
@@ -1004,7 +1005,7 @@ def read_gbq_query(
                 configuration=configuration,
             )
             query_job_for_metrics = query_job
-            rows = None
+            rows: Optional[google.cloud.bigquery.table.RowIterator] = None
         else:
             job_config = typing.cast(
                 bigquery.QueryJobConfig,
@@ -1037,8 +1038,8 @@ def read_gbq_query(
             query_job=query_job_for_metrics, row_iterator=rows
         )

-        # It's possible that there's no job and corresponding destination table.
-        # In this case, we must create a local node.
+        # It's possible that there's no job and therefore no corresponding
+        # destination table. In this case, we must create a local node.
         #
         # TODO(b/420984164): Tune the threshold for which we download to
         # local node. Likely there are a wide range of sizes in which it
@@ -1059,22 +1060,35 @@ def read_gbq_query(
             columns=columns,
         )

-        # If there was no destination table and we've made it this far, that
-        # means the query must have been DDL or DML. Return some job metadata,
-        # instead.
-        if not destination:
+        # If the query was DDL or DML, return some job metadata. See
+        # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type
+        # for possible statement types. Note that destination table does exist
+        # for some DDL operations such as CREATE VIEW, but we don't want to
+        # read from that. See internal issue b/444282709.
+        if destination is None or (
+            query_job_for_metrics is not None
+            and query_job_for_metrics.statement_type != "SELECT"
+        ):
             return bf_read_gbq_query.create_dataframe_from_query_job_stats(
                 query_job_for_metrics,
                 session=self._session,
             )

+        # Speed up counts by getting counts from result metadata.
+        if rows is not None:
+            n_rows = rows.total_rows
+        elif query_job_for_metrics is not None:
+            n_rows = query_job_for_metrics.result().total_rows
+        else:
+            n_rows = None
+
         return self.read_gbq_table(
             f"{destination.project}.{destination.dataset_id}.{destination.table_id}",
             index_col=index_col,
             columns=columns,
             use_cache=configuration["query"]["useQueryCache"],
             force_total_order=force_total_order,
-            n_rows=query_job.result().total_rows,
+            n_rows=n_rows,
             # max_results and filters are omitted because they are already
             # handled by to_query(), above.
         )
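
For context, here is a minimal sketch of the decision this hunk implements, written against the google-cloud-bigquery client directly rather than BigFrames internals. The project, dataset, and view names are placeholders and are not part of this commit:

from google.cloud import bigquery

client = bigquery.Client()

# CREATE VIEW is DDL: BigQuery reports a destination table for it, but the
# statement type is not "SELECT", so job statistics should be surfaced
# instead of reading the destination table.
job = client.query(
    "CREATE OR REPLACE VIEW `my_dataset.my_view` AS SELECT 1 AS x"
)
job.result()  # wait for the job to finish

if job.destination is None or job.statement_type != "SELECT":
    print("return job stats, statement_type =", job.statement_type)
else:
    print("read rows from destination table", job.destination)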

tests/system/small/test_session.py

Lines changed: 57 additions & 12 deletions
@@ -430,18 +430,63 @@ def test_read_gbq_w_max_results(
     assert bf_result.shape[0] == max_results


-def test_read_gbq_w_script_no_select(session, dataset_id: str):
-    ddl = f"""
-    CREATE TABLE `{dataset_id}.test_read_gbq_w_ddl` (
-        `col_a` INT64,
-        `col_b` STRING
-    );
-
-    INSERT INTO `{dataset_id}.test_read_gbq_w_ddl`
-    VALUES (123, 'hello world');
-    """
-    df = session.read_gbq(ddl).to_pandas()
-    assert df["statement_type"][0] == "SCRIPT"
+@pytest.mark.parametrize(
+    ("sql_template", "expected_statement_type"),
+    (
+        pytest.param(
+            """
+            CREATE OR REPLACE TABLE `{dataset_id}.test_read_gbq_w_ddl` (
+                `col_a` INT64,
+                `col_b` STRING
+            );
+            """,
+            "CREATE_TABLE",
+            id="ddl-create-table",
+        ),
+        pytest.param(
+            # From https://cloud.google.com/bigquery/docs/boosted-tree-classifier-tutorial
+            """
+            CREATE OR REPLACE VIEW `{dataset_id}.test_read_gbq_w_create_view`
+            AS
+            SELECT
+                age,
+                workclass,
+                marital_status,
+                education_num,
+                occupation,
+                hours_per_week,
+                income_bracket,
+                CASE
+                    WHEN MOD(functional_weight, 10) < 8 THEN 'training'
+                    WHEN MOD(functional_weight, 10) = 8 THEN 'evaluation'
+                    WHEN MOD(functional_weight, 10) = 9 THEN 'prediction'
+                END AS dataframe
+            FROM
+                `bigquery-public-data.ml_datasets.census_adult_income`;
+            """,
+            "CREATE_VIEW",
+            id="ddl-create-view",
+        ),
+        pytest.param(
+            """
+            CREATE OR REPLACE TABLE `{dataset_id}.test_read_gbq_w_dml` (
+                `col_a` INT64,
+                `col_b` STRING
+            );
+
+            INSERT INTO `{dataset_id}.test_read_gbq_w_dml`
+            VALUES (123, 'hello world');
+            """,
+            "SCRIPT",
+            id="dml",
+        ),
+    ),
+)
+def test_read_gbq_w_script_no_select(
+    session, dataset_id: str, sql_template: str, expected_statement_type: str
+):
+    df = session.read_gbq(sql_template.format(dataset_id=dataset_id)).to_pandas()
+    assert df["statement_type"][0] == expected_statement_type


 def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id):
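
From the user's perspective, the parametrized test above exercises behavior along the lines of this hedged sketch. The dataset and view names are placeholders, and the stats DataFrame columns beyond statement_type are not enumerated in this commit:

import bigframes.pandas as bpd

# A non-SELECT statement now yields a DataFrame of query job statistics
# rather than an attempt to read rows from the destination table.
stats = bpd.read_gbq(
    "CREATE OR REPLACE VIEW `my_dataset.my_view` AS SELECT 1 AS x"
).to_pandas()
print(stats["statement_type"][0])  # e.g. "CREATE_VIEW"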
