Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 122 additions & 1 deletion packages/bigframes/bigframes/core/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import bigframes.core.guid as guid
import bigframes.core.indexes as indexes
import bigframes.core.scalar
import bigframes.core.validations as validations
import bigframes.core.window_spec as windows
import bigframes.dataframe
import bigframes.dtypes
Expand Down Expand Up @@ -102,6 +103,18 @@ def __getitem__(

Other key types are not yet supported.
"""
requires_ordering = True
if (
isinstance(key, slice)
and (key.start is None or key.start == 0)
and (key.step is None or key.step == 1)
and key.stop is None
):
requires_ordering = False

if requires_ordering:
validations.enforce_ordered(self._series, "iloc")

return _iloc_getitem_series_or_dataframe(self._series, key)


Expand Down Expand Up @@ -244,8 +257,113 @@ def __getitem__(self, key) -> Union[bigframes.dataframe.DataFrame, pd.Series]:

Other key types are not yet supported.
"""
requires_ordering = True
if isinstance(key, tuple):
if len(key) > 0:
row_indexer = key[0]
if (
isinstance(row_indexer, slice)
and (row_indexer.start is None or row_indexer.start == 0)
and (row_indexer.step is None or row_indexer.step == 1)
and row_indexer.stop is None
):
requires_ordering = False
else:
if (
isinstance(key, slice)
and (key.start is None or key.start == 0)
and (key.step is None or key.step == 1)
and key.stop is None
):
requires_ordering = False

if requires_ordering:
validations.enforce_ordered(self._dataframe, "iloc")

return _iloc_getitem_series_or_dataframe(self._dataframe, key)

def __setitem__(
    self,
    key: Tuple[
        slice, Union[int, typing.Sequence[int], slice, typing.Sequence[bool]]
    ],
    value: Union[
        bigframes.dataframe.SingleItemValue, bigframes.dataframe.DataFrame
    ],
):
    """Assign ``value`` to columns selected by position: ``iloc[:, cols] = value``.

    Only whole-frame row selection is handled: ``key`` must be a 2-tuple
    whose first element is a full slice (start ``None`` or ``0``, step
    ``None`` or ``1``, stop ``None``). The second element picks columns by
    position and may be an integer, a slice, a list-like of integers, or a
    list-like boolean mask with one entry per column. NumPy integer and
    boolean scalars are accepted alongside the Python built-ins.

    The wrapped DataFrame is updated in place through ``_set_block``.
    Selecting zero columns (an empty slice, an empty list, or an all-False
    mask) is a no-op.

    Args:
        key: ``(row_slice, col_indexer)`` as described above.
        value: The value handed to the DataFrame's assignment helpers.

    Raises:
        NotImplementedError: ``key`` is not ``(full_slice, col_indexer)``.
        TypeError: ``col_indexer`` is a bare boolean, an unsupported type,
            or a non-mask list containing a non-integer.
        IndexError: An integer position is outside ``[-n_cols, n_cols)``.
        ValueError: A boolean mask's length differs from the column count.
    """
    # pandas' scalar predicates recognise NumPy scalars (np.int64, np.bool_)
    # as well as the built-ins, and is_integer() already excludes booleans.
    is_bool = pd.api.types.is_bool
    is_integer = pd.api.types.is_integer

    if not (
        isinstance(key, tuple)
        and len(key) == 2
        and isinstance(key[0], slice)
        and (key[0].start is None or key[0].start == 0)
        and (key[0].step is None or key[0].step == 1)
        and key[0].stop is None
    ):
        raise NotImplementedError(
            "Only DataFrame.iloc[:, col_indexer] = value is supported."
        )

    col_indexer = key[1]
    n_cols = len(self._dataframe.columns)

    # A bare boolean is not a valid position, even though bool subclasses int.
    if is_bool(col_indexer):
        raise TypeError(
            "pos must be integer or slice or list-like of integers/booleans"
        )

    if is_integer(col_indexer):
        col_offset = int(col_indexer)
        if col_offset < 0:
            col_offset += n_cols
        if col_offset < 0 or col_offset >= n_cols:
            raise IndexError("single positional indexer is out-of-bounds")

        col_label = self._dataframe.columns[col_offset]
        # NOTE(review): keyword expansion only accepts string keys, so a
        # non-string column label (e.g. an int or None) raises TypeError
        # here. Consider routing through a label-based assignment helper
        # once its handling of a single-column value is confirmed.
        df = self._dataframe.assign(**{col_label: value})
        self._dataframe._set_block(df._get_block())
        return

    if isinstance(col_indexer, slice):
        col_offsets = list(range(*col_indexer.indices(n_cols)))
    elif pd.api.types.is_list_like(col_indexer):
        col_indexer_list = list(col_indexer)

        if col_indexer_list and all(is_bool(x) for x in col_indexer_list):
            # Boolean mask: must line up one-to-one with the columns.
            if len(col_indexer_list) != n_cols:
                raise ValueError(
                    f"Boolean index has wrong length: {len(col_indexer_list)} instead of {n_cols}"
                )
            col_offsets = [i for i, val in enumerate(col_indexer_list) if val]
        else:
            col_offsets = []
            for idx in col_indexer_list:
                # Rejects booleans mixed into an integer list as well as
                # any other non-integer entry.
                if not is_integer(idx):
                    raise TypeError("pos list must contain only integers")
                offset = int(idx)
                if offset < 0:
                    offset += n_cols
                if offset < 0 or offset >= n_cols:
                    raise IndexError("positional indexer is out-of-bounds")
                col_offsets.append(offset)
    else:
        raise TypeError(
            "pos must be integer or slice or list-like of integers/booleans"
        )

    col_labels = [self._dataframe.columns[idx] for idx in col_offsets]
    if not col_labels:
        return
    df = self._dataframe._assign_multi_items(col_labels, value)
    self._dataframe._set_block(df._get_block())


class IatDataFrameIndexer:
def __init__(self, dataframe: bigframes.dataframe.DataFrame):
Expand Down Expand Up @@ -470,8 +588,11 @@ def _iloc_getitem_series_or_dataframe(

# len(key) == 2
df = typing.cast(bigframes.dataframe.DataFrame, series_or_dataframe)
if isinstance(key[1], int):
if isinstance(key[0], int) and isinstance(key[1], int):
return df.iat[key]
elif isinstance(key[1], int):
col_label = df.columns[key[1]]
return df[col_label].iloc[key[0]]
elif isinstance(key[1], list):
columns = df.columns[key[1]]
return _iloc_getitem_series_or_dataframe(df[columns], key[0])
Expand Down
22 changes: 12 additions & 10 deletions packages/bigframes/bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,6 @@ def loc(self) -> indexers.LocDataFrameIndexer:
return indexers.LocDataFrameIndexer(self)

@property
@validations.requires_ordering()
def iloc(self) -> indexers.ILocDataFrameIndexer:
return indexers.ILocDataFrameIndexer(self)

Expand Down Expand Up @@ -821,22 +820,25 @@ def __repr__(self) -> str:

def _get_display_df(self) -> DataFrame:
"""Process ObjectRef and JSON/nested JSON columns for display."""
import bigframes.bigquery as bbq

df = self
# Arrow/Pandas to_pandas_batches does not support raw JSON/nested JSON
# columns. Pre-serialize them to string format to bypass this limit.
# Using TO_JSON_STRING via SqlScalarOp handles complex nested STRUCT
# types correctly.
json_cols = [
col
for col in df.columns
# types correctly. Use the offset so that we can handle duplicate and
# non-string column names.
json_col_indexes = [
col_index
for col_index, col in enumerate(df.columns)
if bigframes.dtypes.contains_db_dtypes_json_dtype(df[col].dtype)
]
if json_cols:
op = ops.SqlScalarOp(
_output_type=bigframes.dtypes.STRING_DTYPE,
sql_template="TO_JSON_STRING({0})",
if json_col_indexes:
df._block.apply_analytic
df.iloc[:, json_col_indexes] = cast(
Comment on lines +837 to +838
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The statement df._block.apply_analytic is a useless expression with no side effects and should be removed.

            df.iloc[:, json_col_indexes] = cast(

DataFrame,
df.iloc[:, json_col_indexes].apply(bbq.to_json_string), # type: ignore
)
df = df.assign(**{col: df[col]._apply_unary_op(op) for col in json_cols})
return df

def _repr_mimebundle_(self, include=None, exclude=None):
Expand Down
1 change: 0 additions & 1 deletion packages/bigframes/bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,6 @@ def loc(self) -> bigframes.core.indexers.LocSeriesIndexer:
return bigframes.core.indexers.LocSeriesIndexer(self)

@property
@validations.requires_ordering()
def iloc(self) -> bigframes.core.indexers.IlocSeriesIndexer:
return bigframes.core.indexers.IlocSeriesIndexer(self)

Expand Down
44 changes: 28 additions & 16 deletions packages/bigframes/notebooks/generative_ai/ai_functions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "c9f924aa",
"metadata": {},
"outputs": [],
"source": [
"import bigframes.pandas as bpd \n",
"import bigframes.pandas as bpd\n",
"\n",
"PROJECT_ID = \"\" # @param {type:\"string\"}\n",
"PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

It is generally better to keep PROJECT_ID as an empty string "" or a placeholder like "your-project-id" in public notebooks so that users are prompted to enter their own Google Cloud project ID, and to avoid hardcoding internal/development project names.

Suggested change
"PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n",
"PROJECT_ID = \"\" # @param {type:\"string\"}\n",

"\n",
"bpd.options.bigquery.project = PROJECT_ID\n",
"bpd.options.bigquery.ordering_mode = \"partial\"\n",
Expand All @@ -105,16 +105,24 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
" _global_session = bigframes.session.connect(\n"
"/usr/local/google/home/swast/src/github.com/googleapis/google-cloud-python/packages/bigframes/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
" _global_session = bigframes.session.connect(\n",
"/usr/local/google/home/swast/src/github.com/googleapis/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
"instead of using `db_dtypes` in the future when available in pandas\n",
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
]
},
{
"data": {
"text/html": [
"<pre>0 {\"result\":\"Salad\",\"full_response\":{\"candidates...\n",
"1 {\"result\":\"Hotdog\",\"full_response\":{\"candidate...</pre>"
],
"text/plain": [
"0 {'result': 'Salad\\n', 'full_response': '{\"cand...\n",
"1 {'result': 'Sausageroll\\n', 'full_response': '...\n",
"dtype: struct<result: string, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow]"
"0 {\"result\":\"Salad\",\"full_response\":{\"candidates...\n",
"1 {\"result\":\"Hotdog\",\"full_response\":{\"candidate...\n",
"Name: 0, dtype: string"
]
},
"execution_count": 3,
Expand Down Expand Up @@ -156,9 +164,13 @@
"outputs": [
{
"data": {
"text/html": [
"<pre>0 <NA>\n",
"1 <NA></pre>"
],
"text/plain": [
"0 Lettuce\n",
"1 The food\n",
"0 <NA>\n",
"1 <NA>\n",
"Name: result, dtype: string"
]
},
Expand Down Expand Up @@ -327,7 +339,7 @@
" <tr>\n",
" <th>0</th>\n",
" <td>tiger</td>\n",
" <td>8.0</td>\n",
" <td>7.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
Expand All @@ -342,7 +354,7 @@
"text/plain": [
" animals relative_weight\n",
"1 spider 1.0\n",
"0 tiger 8.0\n",
"0 tiger 7.0\n",
"2 blue whale 10.0\n",
"\n",
"[3 rows x 2 columns]"
Expand Down Expand Up @@ -465,7 +477,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"id": "2e66110a",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -518,7 +530,7 @@
"[2 rows x 2 columns]"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -533,7 +545,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.10.17)",
"display_name": "venv",
"language": "python",
"name": "python3"
},
Expand All @@ -547,7 +559,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.17"
"version": "3.14.3"
}
},
"nbformat": 4,
Expand Down
Loading
Loading