Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit 05e9b69

Browse files
committed
Improve JSON type handling for to_gbq and to_pandas_batches
1 parent 8c34512 commit 05e9b69

3 files changed

Lines changed: 101 additions & 1 deletion

File tree

bigframes/core/blocks.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,12 @@ def to_pandas_batches(
731731
# To reduce the number of edge cases to consider when working with the
732732
# results of this, always return at least one DataFrame. See:
733733
# b/428918844.
734+
empty_val = pd.DataFrame(
735+
{
736+
col: pd.Series([], dtype=self.expr.get_column_type(col))
737+
for col in itertools.chain(self.value_columns, self.index_columns)
738+
}
739+
)
734740
series_map = {}
735741
for col in itertools.chain(self.value_columns, self.index_columns):
736742
dtype = self.expr.get_column_type(col)
@@ -746,7 +752,9 @@ def to_pandas_batches(
746752
# MyPy doesn't automatically narrow the type of 'dtype' here,
747753
# so we add an explicit check.
748754
if isinstance(dtype, pd.ArrowDtype):
749-
safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype)
755+
safe_pa_type = bigframes.dtypes._replace_json_arrow_with_string(
756+
dtype.pyarrow_dtype
757+
)
750758
safe_dtype = pd.ArrowDtype(safe_pa_type)
751759
series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
752760
else:

bigframes/dtypes.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,21 @@ def contains_db_dtypes_json_dtype(dtype):
954954
return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype)
955955

956956

957+
def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
    """Recursively replace JSONArrowType with string type.

    Args:
        pa_type: The Arrow type to rewrite. List and struct types are
            traversed so that JSON types nested inside them are replaced too.

    Returns:
        An Arrow type with the same structure as ``pa_type`` in which every
        ``db_dtypes.JSONArrowType`` has been swapped for ``pa.string()``.
        Types that are neither JSON, list, nor struct are returned unchanged.
    """
    if isinstance(pa_type, db_dtypes.JSONArrowType):
        return pa.string()
    if isinstance(pa_type, pa.ListType):
        # Rebuild from the element *field*, not the bare value type, so the
        # element's name, nullability and metadata survive the rewrite. This
        # mirrors how the struct branch below keeps its fields intact.
        element_field = pa_type.value_field
        return pa.list_(
            element_field.with_type(
                _replace_json_arrow_with_string(element_field.type)
            )
        )
    if isinstance(pa_type, pa.StructType):
        # Field.with_type keeps each field's name/nullability/metadata and
        # only swaps the (possibly nested) type.
        return pa.struct(
            [
                field.with_type(_replace_json_arrow_with_string(field.type))
                for field in pa_type
            ]
        )
    return pa_type
970+
971+
957972
def warn_on_db_dtypes_json_dtype(dtypes):
958973
"""Warn that the JSON dtype is changing.
959974

tests/system/small/test_dataframe_io.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,83 @@ def test_to_pandas_batches_w_empty_dataframe(session):
376376
pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)
377377

378378

379+
def test_to_pandas_batches_w_empty_dataframe_json_in_list(session):
    """Check to_pandas_batches() on an empty DataFrame with a list-of-JSON column.

    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
    """
    import db_dtypes

    expected_dtype = pd.ArrowDtype(pa.list_(db_dtypes.JSONArrowType()))
    column_data = {
        "idx": pd.Series([], dtype="Int64"),
        "json_list_col": pd.Series([], dtype=expected_dtype),
    }
    frame = bpd.DataFrame(column_data, session=session).set_index("idx", drop=True)

    batches = list(frame.to_pandas_batches())

    # Exactly one batch is expected even though there are no rows.
    assert len(batches) == 1
    only_batch = batches[0]
    assert list(only_batch.columns) == ["json_list_col"]
    assert only_batch.dtypes["json_list_col"] == expected_dtype
    assert len(only_batch) == 0
401+
402+
403+
# --- Behavior 2: JSON in Struct ---
404+
405+
406+
def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session):
    """Check to_pandas_batches() on an empty DataFrame with a struct-of-JSON column.

    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
    """
    import db_dtypes

    struct_type = pa.struct([("json_field", db_dtypes.JSONArrowType())])
    expected_dtype = pd.ArrowDtype(struct_type)
    column_data = {
        "idx": pd.Series([], dtype="Int64"),
        "json_struct_col": pd.Series([], dtype=expected_dtype),
    }
    frame = bpd.DataFrame(column_data, session=session).set_index("idx", drop=True)

    batches = list(frame.to_pandas_batches())

    # Exactly one batch is expected even though there are no rows.
    assert len(batches) == 1
    only_batch = batches[0]
    assert list(only_batch.columns) == ["json_struct_col"]
    assert only_batch.dtypes["json_struct_col"] == expected_dtype
    assert len(only_batch) == 0
430+
431+
432+
# --- Behavior 3: Simple JSON ---
433+
434+
435+
def test_to_pandas_batches_w_empty_dataframe_simple_json(session):
    """Check to_pandas_batches() on an empty DataFrame with a plain JSON column.

    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
    """
    column_data = {
        "idx": pd.Series([], dtype="Int64"),
        "json_col": pd.Series([], dtype=dtypes.JSON_DTYPE),
    }
    frame = bpd.DataFrame(column_data, session=session).set_index("idx", drop=True)

    batches = list(frame.to_pandas_batches())

    # Exactly one batch is expected even though there are no rows.
    assert len(batches) == 1
    only_batch = batches[0]
    assert list(only_batch.columns) == ["json_col"]
    assert only_batch.dtypes["json_col"] == dtypes.JSON_DTYPE
    assert len(only_batch) == 0
454+
455+
379456
@pytest.mark.parametrize("allow_large_results", (True, False))
380457
def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
381458
"""Verify to_pandas_batches() APIs returns the expected page size.

0 commit comments

Comments
 (0)