from __future__ import annotations

from typing import cast, Iterable, Optional, Tuple

from google.cloud import bigquery
import google.cloud.bigquery.table

import bigframes.core.blocks as blocks
import bigframes.core.guid
import bigframes.core.schema as schemata
import bigframes.enums
import bigframes.session
@@ -53,19 +54,35 @@ def create_dataframe_from_query_job_stats(
5354
5455
def create_dataframe_from_row_iterator(
    rows: google.cloud.bigquery.table.RowIterator,
    *,
    session: bigframes.session.Session,
    index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind,
    columns: Iterable[str],
) -> dataframe.DataFrame:
    """Convert a RowIterator into a DataFrame wrapping a LocalNode.

    This allows us to create a DataFrame from query results, even in the
    'jobless' case where there's no destination table.

    Args:
        rows: Query results to convert; ``rows.to_arrow()`` is treated as the
            source of truth for the data.
        session: Session the resulting DataFrame is associated with.
        index_col: Column name(s) to use as the index. Falsy values (e.g. an
            empty iterable) or a ``DefaultIndexKind`` select the free
            sequential offsets index instead.
        columns: If non-empty, restrict the result to these columns
            (selection happens after index assignment).

    Returns:
        A DataFrame backed by a local (in-memory) Arrow table.
    """
    pa_table = rows.to_arrow()
    bq_schema = list(rows.schema)

    if not index_col or isinstance(index_col, bigframes.enums.DefaultIndexKind):
        # We get a sequential index for free, so use that if no index is specified.
        # TODO(tswast): Use array_value.promote_offsets() instead once that node is
        # supported by the local engine.
        offsets_col = bigframes.core.guid.generate_guid()
        pa_table = pyarrow_utils.append_offsets(pa_table, offsets_col=offsets_col)
        # Keep the BQ schema in sync with the Arrow table we just extended.
        bq_schema += [bigquery.SchemaField(offsets_col, "INTEGER")]
        index_columns: Tuple[str, ...] = (offsets_col,)
        # The synthetic offsets column has no user-visible label.
        index_labels: Tuple[Optional[str], ...] = (None,)
    elif isinstance(index_col, str):
        index_columns = (index_col,)
        index_labels = (index_col,)
    else:
        index_columns = tuple(index_col)
        index_labels = cast(Tuple[Optional[str], ...], tuple(index_col))

    # We use the ManagedArrowTable constructor directly, because the
    # results of to_arrow() should be the source of truth with regards
    # to the data, NOTE(review): one comment line here fell in a diff gap —
    # like the output of the BQ Storage Read API.
    mat = local_data.ManagedArrowTable(
        pa_table,
        schemata.ArraySchema.from_bq_schema(bq_schema),
    )
    mat.validate()

    # Everything from the original result set that isn't serving as the
    # index becomes a data column (the synthetic offsets column is never in
    # rows.schema, so it is naturally excluded here).
    column_labels = [
        field.name for field in rows.schema if field.name not in index_columns
    ]

    array_value = core.ArrayValue.from_managed(mat, session)
    block = blocks.Block(
        array_value,
        index_columns=index_columns,
        column_labels=column_labels,
        index_labels=index_labels,
    )
    df = dataframe.DataFrame(block)

    # Apply the user's column selection last so index columns are unaffected.
    if columns:
        df = df[list(columns)]

    return df
0 commit comments