-
Notifications
You must be signed in to change notification settings - Fork 1.7k
fix(bigframes): avoid exceptions for unnamed JSON columns in SQL Cell outputs #17367
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -317,7 +317,6 @@ def loc(self) -> indexers.LocDataFrameIndexer: | |
| return indexers.LocDataFrameIndexer(self) | ||
|
|
||
| @property | ||
| @validations.requires_ordering() | ||
| def iloc(self) -> indexers.ILocDataFrameIndexer: | ||
| return indexers.ILocDataFrameIndexer(self) | ||
|
|
||
|
|
@@ -821,22 +820,25 @@ def __repr__(self) -> str: | |
|
|
||
| def _get_display_df(self) -> DataFrame: | ||
| """Process ObjectRef and JSON/nested JSON columns for display.""" | ||
| import bigframes.bigquery as bbq | ||
|
|
||
| df = self | ||
| # Arrow/Pandas to_pandas_batches does not support raw JSON/nested JSON | ||
| # columns. Pre-serialize them to string format to bypass this limit. | ||
| # Using TO_JSON_STRING via SqlScalarOp handles complex nested STRUCT | ||
| # types correctly. | ||
| json_cols = [ | ||
| col | ||
| for col in df.columns | ||
| # types correctly. Use the offset so that we can handle duplicate and | ||
| # non-string column names. | ||
| json_col_indexes = [ | ||
| col_index | ||
| for col_index, col in enumerate(df.columns) | ||
| if bigframes.dtypes.contains_db_dtypes_json_dtype(df[col].dtype) | ||
| ] | ||
| if json_cols: | ||
| op = ops.SqlScalarOp( | ||
| _output_type=bigframes.dtypes.STRING_DTYPE, | ||
| sql_template="TO_JSON_STRING({0})", | ||
| if json_col_indexes: | ||
| df._block.apply_analytic | ||
| df.iloc[:, json_col_indexes] = cast( | ||
|
Comment on lines
+837
to
+838
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| DataFrame, | ||
| df.iloc[:, json_col_indexes].apply(bbq.to_json_string), # type: ignore | ||
| ) | ||
| df = df.assign(**{col: df[col]._apply_unary_op(op) for col in json_cols}) | ||
| return df | ||
|
|
||
| def _repr_mimebundle_(self, include=None, exclude=None): | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -71,14 +71,14 @@ | |||||
| }, | ||||||
| { | ||||||
| "cell_type": "code", | ||||||
| "execution_count": null, | ||||||
| "execution_count": 2, | ||||||
| "id": "c9f924aa", | ||||||
| "metadata": {}, | ||||||
| "outputs": [], | ||||||
| "source": [ | ||||||
| "import bigframes.pandas as bpd \n", | ||||||
| "import bigframes.pandas as bpd\n", | ||||||
| "\n", | ||||||
| "PROJECT_ID = \"\" # @param {type:\"string\"}\n", | ||||||
| "PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n", | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is generally better to keep
Suggested change
|
||||||
| "\n", | ||||||
| "bpd.options.bigquery.project = PROJECT_ID\n", | ||||||
| "bpd.options.bigquery.ordering_mode = \"partial\"\n", | ||||||
|
|
@@ -105,16 +105,24 @@ | |||||
| "name": "stderr", | ||||||
| "output_type": "stream", | ||||||
| "text": [ | ||||||
| "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", | ||||||
| " _global_session = bigframes.session.connect(\n" | ||||||
| "/usr/local/google/home/swast/src/github.com/googleapis/google-cloud-python/packages/bigframes/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", | ||||||
| " _global_session = bigframes.session.connect(\n", | ||||||
| "/usr/local/google/home/swast/src/github.com/googleapis/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", | ||||||
| "instead of using `db_dtypes` in the future when available in pandas\n", | ||||||
| "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", | ||||||
| " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" | ||||||
| ] | ||||||
| }, | ||||||
| { | ||||||
| "data": { | ||||||
| "text/html": [ | ||||||
| "<pre>0 {\"result\":\"Salad\",\"full_response\":{\"candidates...\n", | ||||||
| "1 {\"result\":\"Hotdog\",\"full_response\":{\"candidate...</pre>" | ||||||
| ], | ||||||
| "text/plain": [ | ||||||
| "0 {'result': 'Salad\\n', 'full_response': '{\"cand...\n", | ||||||
| "1 {'result': 'Sausageroll\\n', 'full_response': '...\n", | ||||||
| "dtype: struct<result: string, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow]" | ||||||
| "0 {\"result\":\"Salad\",\"full_response\":{\"candidates...\n", | ||||||
| "1 {\"result\":\"Hotdog\",\"full_response\":{\"candidate...\n", | ||||||
| "Name: 0, dtype: string" | ||||||
| ] | ||||||
| }, | ||||||
| "execution_count": 3, | ||||||
|
|
@@ -156,9 +164,13 @@ | |||||
| "outputs": [ | ||||||
| { | ||||||
| "data": { | ||||||
| "text/html": [ | ||||||
| "<pre>0 <NA>\n", | ||||||
| "1 <NA></pre>" | ||||||
| ], | ||||||
| "text/plain": [ | ||||||
| "0 Lettuce\n", | ||||||
| "1 The food\n", | ||||||
| "0 <NA>\n", | ||||||
| "1 <NA>\n", | ||||||
| "Name: result, dtype: string" | ||||||
| ] | ||||||
| }, | ||||||
|
|
@@ -327,7 +339,7 @@ | |||||
| " <tr>\n", | ||||||
| " <th>0</th>\n", | ||||||
| " <td>tiger</td>\n", | ||||||
| " <td>8.0</td>\n", | ||||||
| " <td>7.0</td>\n", | ||||||
| " </tr>\n", | ||||||
| " <tr>\n", | ||||||
| " <th>2</th>\n", | ||||||
|
|
@@ -342,7 +354,7 @@ | |||||
| "text/plain": [ | ||||||
| " animals relative_weight\n", | ||||||
| "1 spider 1.0\n", | ||||||
| "0 tiger 8.0\n", | ||||||
| "0 tiger 7.0\n", | ||||||
| "2 blue whale 10.0\n", | ||||||
| "\n", | ||||||
| "[3 rows x 2 columns]" | ||||||
|
|
@@ -465,7 +477,7 @@ | |||||
| }, | ||||||
| { | ||||||
| "cell_type": "code", | ||||||
| "execution_count": 9, | ||||||
| "execution_count": 8, | ||||||
| "id": "2e66110a", | ||||||
| "metadata": {}, | ||||||
| "outputs": [ | ||||||
|
|
@@ -518,7 +530,7 @@ | |||||
| "[2 rows x 2 columns]" | ||||||
| ] | ||||||
| }, | ||||||
| "execution_count": 9, | ||||||
| "execution_count": 8, | ||||||
| "metadata": {}, | ||||||
| "output_type": "execute_result" | ||||||
| } | ||||||
|
|
@@ -533,7 +545,7 @@ | |||||
| ], | ||||||
| "metadata": { | ||||||
| "kernelspec": { | ||||||
| "display_name": "venv (3.10.17)", | ||||||
| "display_name": "venv", | ||||||
| "language": "python", | ||||||
| "name": "python3" | ||||||
| }, | ||||||
|
|
@@ -547,7 +559,7 @@ | |||||
| "name": "python", | ||||||
| "nbconvert_exporter": "python", | ||||||
| "pygments_lexer": "ipython3", | ||||||
| "version": "3.10.17" | ||||||
| "version": "3.14.3" | ||||||
| } | ||||||
| }, | ||||||
| "nbformat": 4, | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using `df.assign(**{col_label: value})` will raise a `TypeError: assign() keywords must be strings` if `col_label` is not a string (e.g., an integer or `None`). Since `_assign_multi_items` is already used in the other branches of this method and handles non-string/duplicate column names safely, we should use it here as well.