From 7de4cdd622995167086581ea0adaad3b501f33e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 3 Jun 2026 21:27:55 +0000 Subject: [PATCH] fix(bigframes): avoid exceptions for unnamed JSON columns in SQL Cell outputs --- packages/bigframes/bigframes/core/indexers.py | 123 +++++++++- packages/bigframes/bigframes/dataframe.py | 22 +- packages/bigframes/bigframes/series.py | 1 - .../generative_ai/ai_functions.ipynb | 44 ++-- .../bigframes/tests/unit/test_iloc_setitem.py | 212 ++++++++++++++++++ 5 files changed, 374 insertions(+), 28 deletions(-) create mode 100644 packages/bigframes/tests/unit/test_iloc_setitem.py diff --git a/packages/bigframes/bigframes/core/indexers.py b/packages/bigframes/bigframes/core/indexers.py index c7cfc4f52ade..b3f8e4ce5639 100644 --- a/packages/bigframes/bigframes/core/indexers.py +++ b/packages/bigframes/bigframes/core/indexers.py @@ -28,6 +28,7 @@ import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.scalar +import bigframes.core.validations as validations import bigframes.core.window_spec as windows import bigframes.dataframe import bigframes.dtypes @@ -102,6 +103,18 @@ def __getitem__( Other key types are not yet supported. """ + requires_ordering = True + if ( + isinstance(key, slice) + and (key.start is None or key.start == 0) + and (key.step is None or key.step == 1) + and key.stop is None + ): + requires_ordering = False + + if requires_ordering: + validations.enforce_ordered(self._series, "iloc") + return _iloc_getitem_series_or_dataframe(self._series, key) @@ -244,8 +257,113 @@ def __getitem__(self, key) -> Union[bigframes.dataframe.DataFrame, pd.Series]: Other key types are not yet supported. """ + requires_ordering = True + if isinstance(key, tuple): + if len(key) > 0: + row_indexer = key[0] + if ( + isinstance(row_indexer, slice) + and (row_indexer.start is None or row_indexer.start == 0) + and (row_indexer.step is None or row_indexer.step == 1) + and row_indexer.stop is None + ): + requires_ordering = False + else: + if ( + isinstance(key, slice) + and (key.start is None or key.start == 0) + and (key.step is None or key.step == 1) + and key.stop is None + ): + requires_ordering = False + + if requires_ordering: + validations.enforce_ordered(self._dataframe, "iloc") + return _iloc_getitem_series_or_dataframe(self._dataframe, key) + def __setitem__( + self, + key: Tuple[ + slice, Union[int, typing.Sequence[int], slice, typing.Sequence[bool]] + ], + value: Union[ + bigframes.dataframe.SingleItemValue, bigframes.dataframe.DataFrame + ], + ): + if not ( + isinstance(key, tuple) + and len(key) == 2 + and isinstance(key[0], slice) + and (key[0].start is None or key[0].start == 0) + and (key[0].step is None or key[0].step == 1) + and key[0].stop is None + ): + raise NotImplementedError( + "Only DataFrame.iloc[:, col_indexer] = value is supported." + ) + + col_indexer = key[1] + n_cols = len(self._dataframe.columns) + + if isinstance(col_indexer, bool): + raise TypeError( + "pos must be integer or slice or list-like of integers/booleans" + ) + + if isinstance(col_indexer, int): + col_offset = col_indexer + if col_offset < 0: + col_offset += n_cols + if col_offset < 0 or col_offset >= n_cols: + raise IndexError("single positional indexer is out-of-bounds") + + col_label = self._dataframe.columns[col_offset] + df = self._dataframe.assign(**{col_label: value}) + self._dataframe._set_block(df._get_block()) + + elif isinstance(col_indexer, slice): + col_offsets = list(range(*col_indexer.indices(n_cols))) + col_labels = [self._dataframe.columns[idx] for idx in col_offsets] + if not col_labels: + return + df = self._dataframe._assign_multi_items(col_labels, value) + self._dataframe._set_block(df._get_block()) + + elif pd.api.types.is_list_like(col_indexer): + col_indexer_list = list(col_indexer) + + if len(col_indexer_list) > 0 and all( + isinstance(x, bool) for x in col_indexer_list + ): + if len(col_indexer_list) != n_cols: + raise ValueError( + f"Boolean index has wrong length: {len(col_indexer_list)} instead of {n_cols}" + ) + col_offsets = [i for i, val in enumerate(col_indexer_list) if val] + else: + col_offsets = [] + for idx in col_indexer_list: + if isinstance(idx, bool): + raise TypeError("pos list must contain only integers") + if not isinstance(idx, int): + raise TypeError("pos list must contain only integers") + if idx < 0: + idx += n_cols + if idx < 0 or idx >= n_cols: + raise IndexError("positional indexer is out-of-bounds") + col_offsets.append(idx) + + col_labels = [self._dataframe.columns[idx] for idx in col_offsets] + if not col_labels: + return + df = self._dataframe._assign_multi_items(col_labels, value) + self._dataframe._set_block(df._get_block()) + else: + raise TypeError( + "pos must be integer or slice or list-like of integers/booleans" + ) + class IatDataFrameIndexer: def __init__(self, dataframe: bigframes.dataframe.DataFrame): @@ -470,8 +588,11 @@ def _iloc_getitem_series_or_dataframe( # len(key) == 2 df = typing.cast(bigframes.dataframe.DataFrame, series_or_dataframe) - if isinstance(key[1], int): + if isinstance(key[0], int) and isinstance(key[1], int): return df.iat[key] + elif isinstance(key[1], int): + col_label = df.columns[key[1]] + return df[col_label].iloc[key[0]] elif isinstance(key[1], list): columns = df.columns[key[1]] return _iloc_getitem_series_or_dataframe(df[columns], key[0]) diff --git a/packages/bigframes/bigframes/dataframe.py b/packages/bigframes/bigframes/dataframe.py index 6b7922fe9753..14e064b4c942 100644 --- a/packages/bigframes/bigframes/dataframe.py +++ b/packages/bigframes/bigframes/dataframe.py @@ -317,7 +317,6 @@ def loc(self) -> indexers.LocDataFrameIndexer: return indexers.LocDataFrameIndexer(self) @property - @validations.requires_ordering() def iloc(self) -> indexers.ILocDataFrameIndexer: return indexers.ILocDataFrameIndexer(self) @@ -821,22 +820,25 @@ def __repr__(self) -> str: def _get_display_df(self) -> DataFrame: """Process ObjectRef and JSON/nested JSON columns for display.""" + import bigframes.bigquery as bbq + df = self # Arrow/Pandas to_pandas_batches does not support raw JSON/nested JSON # columns. Pre-serialize them to string format to bypass this limit. # Using TO_JSON_STRING via SqlScalarOp handles complex nested STRUCT - # types correctly. - json_cols = [ - col - for col in df.columns + # types correctly. Use the offset so that we can handle duplicate and + # non-string column names. + json_col_indexes = [ + col_index + for col_index, col in enumerate(df.columns) if bigframes.dtypes.contains_db_dtypes_json_dtype(df[col].dtype) ] - if json_cols: - op = ops.SqlScalarOp( - _output_type=bigframes.dtypes.STRING_DTYPE, - sql_template="TO_JSON_STRING({0})", + if json_col_indexes: + df._block.apply_analytic + df.iloc[:, json_col_indexes] = cast( + DataFrame, + df.iloc[:, json_col_indexes].apply(bbq.to_json_string), # type: ignore ) - df = df.assign(**{col: df[col]._apply_unary_op(op) for col in json_cols}) return df def _repr_mimebundle_(self, include=None, exclude=None): diff --git a/packages/bigframes/bigframes/series.py b/packages/bigframes/bigframes/series.py index 60acad0c301f..f4985010b925 100644 --- a/packages/bigframes/bigframes/series.py +++ b/packages/bigframes/bigframes/series.py @@ -243,7 +243,6 @@ def loc(self) -> bigframes.core.indexers.LocSeriesIndexer: return bigframes.core.indexers.LocSeriesIndexer(self) @property - @validations.requires_ordering() def iloc(self) -> bigframes.core.indexers.IlocSeriesIndexer: return bigframes.core.indexers.IlocSeriesIndexer(self) diff --git a/packages/bigframes/notebooks/generative_ai/ai_functions.ipynb b/packages/bigframes/notebooks/generative_ai/ai_functions.ipynb index 13234414df5e..dbb044d3943f 100644 --- a/packages/bigframes/notebooks/generative_ai/ai_functions.ipynb +++ b/packages/bigframes/notebooks/generative_ai/ai_functions.ipynb @@ -71,14 +71,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c9f924aa", "metadata": {}, "outputs": [], "source": [ - "import bigframes.pandas as bpd \n", + "import bigframes.pandas as bpd\n", "\n", - "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n", "\n", "bpd.options.bigquery.project = PROJECT_ID\n", "bpd.options.bigquery.ordering_mode = \"partial\"\n", @@ -105,16 +105,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" + "/usr/local/google/home/swast/src/github.com/googleapis/google-cloud-python/packages/bigframes/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n", + "/usr/local/google/home/swast/src/github.com/googleapis/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" ] }, { "data": { + "text/html": [ + "
0    {\"result\":\"Salad\",\"full_response\":{\"candidates...\n",
+       "1    {\"result\":\"Hotdog\",\"full_response\":{\"candidate...
" + ], "text/plain": [ - "0 {'result': 'Salad\\n', 'full_response': '{\"cand...\n", - "1 {'result': 'Sausageroll\\n', 'full_response': '...\n", - "dtype: struct>, status: string>[pyarrow]" + "0 {\"result\":\"Salad\",\"full_response\":{\"candidates...\n", + "1 {\"result\":\"Hotdog\",\"full_response\":{\"candidate...\n", + "Name: 0, dtype: string" ] }, "execution_count": 3, @@ -156,9 +164,13 @@ "outputs": [ { "data": { + "text/html": [ + "
0    \n",
+       "1    
" + ], "text/plain": [ - "0 Lettuce\n", - "1 The food\n", + "0 \n", + "1 \n", "Name: result, dtype: string" ] }, @@ -327,7 +339,7 @@ " \n", " 0\n", " tiger\n", - " 8.0\n", + " 7.0\n", " \n", " \n", " 2\n", @@ -342,7 +354,7 @@ "text/plain": [ " animals relative_weight\n", "1 spider 1.0\n", - "0 tiger 8.0\n", + "0 tiger 7.0\n", "2 blue whale 10.0\n", "\n", "[3 rows x 2 columns]" @@ -465,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "2e66110a", "metadata": {}, "outputs": [ @@ -518,7 +530,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -533,7 +545,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv (3.10.17)", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -547,7 +559,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.17" + "version": "3.14.3" } }, "nbformat": 4, diff --git a/packages/bigframes/tests/unit/test_iloc_setitem.py b/packages/bigframes/tests/unit/test_iloc_setitem.py new file mode 100644 index 000000000000..66b9bf9e3086 --- /dev/null +++ b/packages/bigframes/tests/unit/test_iloc_setitem.py @@ -0,0 +1,212 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Generator + +import numpy as np +import pandas as pd +import pytest + +import bigframes +import bigframes.pandas as bpd +from bigframes.testing.utils import assert_frame_equal, assert_series_equal + +pytest.importorskip("polars") + + +@pytest.fixture(scope="module", autouse=True) +def session() -> Generator[bigframes.Session, None, None]: + import bigframes.core.global_session + from bigframes.testing import polars_session + + session = polars_session.TestSession() + with bigframes.core.global_session._GlobalSessionContext(session): + yield session + + +@pytest.fixture +def sample_df() -> bpd.DataFrame: + pd_df = pd.DataFrame( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + "C": [7, 8, 9], + } + ) + return bpd.read_pandas(pd_df) + + +def test_iloc_setitem_single_integer(sample_df): + bf_df = sample_df.copy() + pd_df = sample_df.to_pandas() + + bf_df.iloc[:, 1] = 99 + pd_df.iloc[:, 1] = 99 + + assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_iloc_setitem_single_integer_negative(sample_df): + bf_df = sample_df.copy() + pd_df = sample_df.to_pandas() + + bf_df.iloc[:, -1] = 99 + pd_df.iloc[:, -1] = 99 + + assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_iloc_setitem_list_integer(sample_df): + bf_df = sample_df.copy() + pd_df = sample_df.to_pandas() + + bf_df.iloc[:, [0, 2]] = [99, 88] + pd_df.iloc[:, [0, 2]] = [99, 88] + + assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_iloc_setitem_slice(sample_df): + bf_df = sample_df.copy() + pd_df = sample_df.to_pandas() + + bf_df.iloc[:, 0:2] = 99 + pd_df.iloc[:, 0:2] = 99 + + assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_iloc_setitem_boolean_mask(sample_df): + bf_df = sample_df.copy() + pd_df = sample_df.to_pandas() + + mask = [True, False, True] + bf_df.iloc[:, mask] = 99 + pd_df.iloc[:, np.array(mask)] = 99 + + assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_iloc_setitem_dataframe(sample_df): + bf_df = sample_df.copy() + pd_df = sample_df.to_pandas() + + value_df = bpd.DataFrame({"B": [99, 88, 77], "C": [66, 55, 44]}) + bf_df.iloc[:, 1:3] = value_df + pd_df.iloc[:, 1:3] = value_df.to_pandas() + + assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_iloc_getitem_single_integer(sample_df): + bf_df = sample_df + pd_df = sample_df.to_pandas() + + bf_result = bf_df.iloc[:, 1].to_pandas() + pd_result = pd_df.iloc[:, 1] + + assert_series_equal(bf_result, pd_result) + + +def test_iloc_getitem_unordered(sample_df): + session = sample_df._session + original_strictly_ordered = session._strictly_ordered + original_allow_ambiguity = session._allow_ambiguity + + try: + session._strictly_ordered = False + session._allow_ambiguity = True + + import unittest.mock as mock + + with ( + mock.patch.object( + type(sample_df._block.expr), + "order_ambiguous", + new_callable=mock.PropertyMock, + ) as mock_ambiguous, + mock.patch.object( + type(sample_df._block), + "explicitly_ordered", + new_callable=mock.PropertyMock, + ) as mock_explicit, + ): + mock_ambiguous.return_value = True + mock_explicit.return_value = False + + # 1. Column indexing only - should NOT raise + try: + sample_df.iloc[:, 1] + except bigframes.exceptions.OrderRequiredError: + pytest.fail("iloc[:, col] raised OrderRequiredError unexpectedly!") + + # 1b. Column indexing with slice(0, None) (NOT exactly `:` but fine) - should NOT raise + try: + sample_df.iloc[slice(0, None), 1] + except bigframes.exceptions.OrderRequiredError: + pytest.fail("iloc[0:, col] raised OrderRequiredError unexpectedly!") + + # 1c. Column indexing with slice(None, None, 1) (NOT exactly `:` but fine) - should NOT raise + try: + sample_df.iloc[slice(None, None, 1), 1] + except bigframes.exceptions.OrderRequiredError: + pytest.fail("iloc[::1, col] raised OrderRequiredError unexpectedly!") + + # 1d. Column indexing with slice(1, None) (row subset) - should RAISE + with pytest.raises(bigframes.exceptions.OrderRequiredError): + sample_df.iloc[slice(1, None), 1] + + # 1e. Column indexing with slice(None, 2) (row subset) - should RAISE + with pytest.raises(bigframes.exceptions.OrderRequiredError): + sample_df.iloc[slice(None, 2), 1] + + # 2. Column setitem only - should NOT raise + try: + bf_df = sample_df.copy() + bf_df.iloc[:, 1] = 99 + except bigframes.exceptions.OrderRequiredError: + pytest.fail( + "iloc[:, col] = val raised OrderRequiredError unexpectedly!" + ) + + # 3. Row indexing - should RAISE + with pytest.raises(bigframes.exceptions.OrderRequiredError): + sample_df.iloc[1, :] + + # 4. Single indexer (row indexing) - should RAISE + with pytest.raises(bigframes.exceptions.OrderRequiredError): + sample_df.iloc[1] + + finally: + session._strictly_ordered = original_strictly_ordered + session._allow_ambiguity = original_allow_ambiguity + + +def test_iloc_setitem_errors(sample_df): + bf_df = sample_df.copy() + + # Out of bounds + with pytest.raises(IndexError): + bf_df.iloc[:, 3] = 99 + + with pytest.raises(IndexError): + bf_df.iloc[:, -4] = 99 + + # Invalid key type (not slice(None) for rows) + with pytest.raises(NotImplementedError): + bf_df.iloc[0, 1] = 99 + + # Invalid col indexer type + with pytest.raises(TypeError): + bf_df.iloc[:, "B"] = 99