
Commit 03b4426

fix: return a DataFrame containing query stats for all non-SELECT statements
1 parent b0ff718 commit 03b4426

2 files changed: +79 −20 lines changed


bigframes/session/loader.py

Lines changed: 22 additions & 8 deletions
@@ -42,6 +42,7 @@
 from google.cloud import bigquery_storage_v1
 import google.cloud.bigquery
 import google.cloud.bigquery as bigquery
+import google.cloud.bigquery.table
 from google.cloud.bigquery_storage_v1 import types as bq_storage_types
 import pandas
 import pyarrow as pa
@@ -1004,7 +1005,7 @@ def read_gbq_query(
                 configuration=configuration,
             )
             query_job_for_metrics = query_job
-            rows = None
+            rows: Optional[google.cloud.bigquery.table.RowIterator] = None
         else:
             job_config = typing.cast(
                 bigquery.QueryJobConfig,
@@ -1037,8 +1038,8 @@ def read_gbq_query(
             query_job=query_job_for_metrics, row_iterator=rows
         )

-        # It's possible that there's no job and corresponding destination table.
-        # In this case, we must create a local node.
+        # It's possible that there's no job and therefore no corresponding
+        # destination table. In this case, we must create a local node.
         #
         # TODO(b/420984164): Tune the threshold for which we download to
         # local node. Likely there are a wide range of sizes in which it
@@ -1059,22 +1060,35 @@ def read_gbq_query(
             columns=columns,
         )

-        # If there was no destination table and we've made it this far, that
-        # means the query must have been DDL or DML. Return some job metadata,
-        # instead.
-        if not destination:
+        # If the query was DDL or DML, return some job metadata. See
+        # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type
+        # for possible statement types. Note that destination table does exist
+        # for some DDL operations such as CREATE VIEW, but we don't want to
+        # read from that. See internal issue b/444282709.
+        if destination is None or (
+            query_job_for_metrics is not None
+            and query_job_for_metrics.statement_type != "SELECT"
+        ):
             return bf_read_gbq_query.create_dataframe_from_query_job_stats(
                 query_job_for_metrics,
                 session=self._session,
             )

+        # Speed up counts by getting counts from result metadata.
+        if rows is not None:
+            n_rows = rows.total_rows
+        elif query_job_for_metrics is not None:
+            n_rows = query_job_for_metrics.result().total_rows
+        else:
+            n_rows = None
+
         return self.read_gbq_table(
             f"{destination.project}.{destination.dataset_id}.{destination.table_id}",
             index_col=index_col,
             columns=columns,
             use_cache=configuration["query"]["useQueryCache"],
             force_total_order=force_total_order,
-            n_rows=query_job.result().total_rows,
+            n_rows=n_rows,
             # max_results and filters are omitted because they are already
             # handled by to_query(), above.
         )
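
For context, here is a minimal sketch of the decision this hunk implements, written against the google-cloud-bigquery client directly rather than BigFrames internals. The project, dataset, and view names are placeholders and are not part of this commit:

from google.cloud import bigquery

client = bigquery.Client()

# CREATE VIEW is DDL: BigQuery reports a destination table for it, but the
# statement type is not "SELECT", so job statistics should be surfaced
# instead of reading the destination table.
job = client.query(
    "CREATE OR REPLACE VIEW `my_dataset.my_view` AS SELECT 1 AS x"
)
job.result()  # wait for the job to finish

if job.destination is None or job.statement_type != "SELECT":
    print("return job stats, statement_type =", job.statement_type)
else:
    print("read rows from destination table", job.destination)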

tests/system/small/test_session.py

Lines changed: 57 additions & 12 deletions
@@ -430,18 +430,63 @@ def test_read_gbq_w_max_results(
     assert bf_result.shape[0] == max_results


-def test_read_gbq_w_script_no_select(session, dataset_id: str):
-    ddl = f"""
-    CREATE TABLE `{dataset_id}.test_read_gbq_w_ddl` (
-        `col_a` INT64,
-        `col_b` STRING
-    );
-
-    INSERT INTO `{dataset_id}.test_read_gbq_w_ddl`
-    VALUES (123, 'hello world');
-    """
-    df = session.read_gbq(ddl).to_pandas()
-    assert df["statement_type"][0] == "SCRIPT"
+@pytest.mark.parametrize(
+    ("sql_template", "expected_statement_type"),
+    (
+        pytest.param(
+            """
+            CREATE OR REPLACE TABLE `{dataset_id}.test_read_gbq_w_ddl` (
+                `col_a` INT64,
+                `col_b` STRING
+            );
+            """,
+            "CREATE_TABLE",
+            id="ddl-create-table",
+        ),
+        pytest.param(
+            # From https://cloud.google.com/bigquery/docs/boosted-tree-classifier-tutorial
+            """
+            CREATE OR REPLACE VIEW `{dataset_id}.test_read_gbq_w_create_view`
+            AS
+            SELECT
+                age,
+                workclass,
+                marital_status,
+                education_num,
+                occupation,
+                hours_per_week,
+                income_bracket,
+                CASE
+                    WHEN MOD(functional_weight, 10) < 8 THEN 'training'
+                    WHEN MOD(functional_weight, 10) = 8 THEN 'evaluation'
+                    WHEN MOD(functional_weight, 10) = 9 THEN 'prediction'
+                END AS dataframe
+            FROM
+                `bigquery-public-data.ml_datasets.census_adult_income`;
+            """,
+            "CREATE_VIEW",
+            id="ddl-create-view",
+        ),
+        pytest.param(
+            """
+            CREATE OR REPLACE TABLE `{dataset_id}.test_read_gbq_w_dml` (
+                `col_a` INT64,
+                `col_b` STRING
+            );
+
+            INSERT INTO `{dataset_id}.test_read_gbq_w_dml`
+            VALUES (123, 'hello world');
+            """,
+            "SCRIPT",
+            id="dml",
+        ),
+    ),
+)
+def test_read_gbq_w_script_no_select(
+    session, dataset_id: str, sql_template: str, expected_statement_type: str
+):
+    df = session.read_gbq(sql_template.format(dataset_id=dataset_id)).to_pandas()
+    assert df["statement_type"][0] == expected_statement_type


 def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id):
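
From the user's perspective, the parametrized test above exercises behavior along the lines of this hedged sketch. The dataset and view names are placeholders, and the stats DataFrame columns beyond statement_type are not enumerated in this commit:

import bigframes.pandas as bpd

# A non-SELECT statement now yields a DataFrame of query job statistics
# rather than an attempt to read rows from the destination table.
stats = bpd.read_gbq(
    "CREATE OR REPLACE VIEW `my_dataset.my_view` AS SELECT 1 AS x"
).to_pandas()
print(stats["statement_type"][0])  # e.g. "CREATE_VIEW"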
