Skip to content

Commit 540b9e1

Browse files
authored
Merge branch 'main' into interchange
2 parents b903833 + a63fc02 commit 540b9e1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+1022
-301
lines changed

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,21 @@
44

55
[1]: https://pypi.org/project/bigframes/#history
66

7+
## [2.19.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.18.0...v2.19.0) (2025-09-09)
8+
9+
10+
### Features
11+
12+
* Add str.join method ([#2054](https://github.com/googleapis/python-bigquery-dataframes/issues/2054)) ([8804ada](https://github.com/googleapis/python-bigquery-dataframes/commit/8804adaf8ba23fdcad6e42a7bf034bd0a11c890f))
13+
* Support display.max_colwidth option ([#2053](https://github.com/googleapis/python-bigquery-dataframes/issues/2053)) ([5229e07](https://github.com/googleapis/python-bigquery-dataframes/commit/5229e07b4535c01b0cdbd731455ff225a373b5c8))
14+
* Support VPC egress setting in remote function ([#2059](https://github.com/googleapis/python-bigquery-dataframes/issues/2059)) ([5df779d](https://github.com/googleapis/python-bigquery-dataframes/commit/5df779d4f421d3ba777cfd928d99ca2e8a3f79ad))
15+
16+
17+
### Bug Fixes
18+
19+
* Fix issue mishandling chunked array while loading data ([#2051](https://github.com/googleapis/python-bigquery-dataframes/issues/2051)) ([873d0ee](https://github.com/googleapis/python-bigquery-dataframes/commit/873d0eee474ed34f1d5164c37383f2737dbec4db))
20+
* Remove warning for slot_millis_sum ([#2047](https://github.com/googleapis/python-bigquery-dataframes/issues/2047)) ([425a691](https://github.com/googleapis/python-bigquery-dataframes/commit/425a6917d5442eeb4df486c6eed1fd136bbcedfb))
21+
722
## [2.18.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.17.0...v2.18.0) (2025-09-03)
823

924

bigframes/_config/auth.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,57 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import threading
18+
from typing import Optional
19+
20+
import google.auth.credentials
21+
import google.auth.transport.requests
22+
import pydata_google_auth
23+
24+
_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
25+
26+
# Put the lock here rather than in BigQueryOptions so that BigQueryOptions
27+
# remains deepcopy-able.
28+
_AUTH_LOCK = threading.Lock()
29+
_cached_credentials: Optional[google.auth.credentials.Credentials] = None
30+
_cached_project_default: Optional[str] = None
31+
32+
33+
def get_default_credentials_with_project() -> tuple[
34+
google.auth.credentials.Credentials, Optional[str]
35+
]:
36+
global _AUTH_LOCK, _cached_credentials, _cached_project_default
37+
38+
with _AUTH_LOCK:
39+
if _cached_credentials is not None:
40+
return _cached_credentials, _cached_project_default
41+
42+
_cached_credentials, _cached_project_default = pydata_google_auth.default(
43+
scopes=_SCOPES, use_local_webserver=False
44+
)
45+
46+
# Ensure an access token is available.
47+
_cached_credentials.refresh(google.auth.transport.requests.Request())
48+
49+
return _cached_credentials, _cached_project_default
50+
51+
52+
def reset_default_credentials_and_project():
53+
global _AUTH_LOCK, _cached_credentials, _cached_project_default
54+
55+
with _AUTH_LOCK:
56+
_cached_credentials = None
57+
_cached_project_default = None

bigframes/_config/bigquery_options.py

Lines changed: 41 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
import google.auth.credentials
2323
import requests.adapters
2424

25+
import bigframes._config.auth
2526
import bigframes._importing
2627
import bigframes.enums
2728
import bigframes.exceptions as bfe
@@ -37,6 +38,7 @@
3738

3839
def _get_validated_location(value: Optional[str]) -> Optional[str]:
3940
import bigframes._tools.strings
41+
import bigframes.constants
4042

4143
if value is None or value in bigframes.constants.ALL_BIGQUERY_LOCATIONS:
4244
return value
@@ -141,20 +143,52 @@ def application_name(self, value: Optional[str]):
141143
)
142144
self._application_name = value
143145

146+
def _try_set_default_credentials_and_project(
147+
self,
148+
) -> tuple[google.auth.credentials.Credentials, Optional[str]]:
149+
# Don't fetch credentials or project if credentials is already set.
150+
# If it's set, we've already authenticated, so if the user wants to
151+
# re-auth, they should explicitly reset the credentials.
152+
if self._credentials is not None:
153+
return self._credentials, self._project
154+
155+
(
156+
credentials,
157+
credentials_project,
158+
) = bigframes._config.auth.get_default_credentials_with_project()
159+
self._credentials = credentials
160+
161+
# Avoid overriding an explicitly set project with a default value.
162+
if self._project is None:
163+
self._project = credentials_project
164+
165+
return credentials, self._project
166+
144167
@property
145-
def credentials(self) -> Optional[google.auth.credentials.Credentials]:
168+
def credentials(self) -> google.auth.credentials.Credentials:
146169
"""The OAuth2 credentials to use for this client.
147170
171+
Set to None to force re-authentication.
172+
148173
Returns:
149174
None or google.auth.credentials.Credentials:
150175
google.auth.credentials.Credentials if exists; otherwise None.
151176
"""
152-
return self._credentials
177+
if self._credentials:
178+
return self._credentials
179+
180+
credentials, _ = self._try_set_default_credentials_and_project()
181+
return credentials
153182

154183
@credentials.setter
155184
def credentials(self, value: Optional[google.auth.credentials.Credentials]):
156185
if self._session_started and self._credentials is not value:
157186
raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="credentials"))
187+
188+
if value is None:
189+
# The user has _explicitly_ asked that we re-authenticate.
190+
bigframes._config.auth.reset_default_credentials_and_project()
191+
158192
self._credentials = value
159193

160194
@property
@@ -183,7 +217,11 @@ def project(self) -> Optional[str]:
183217
None or str:
184218
Google Cloud project ID as a string; otherwise None.
185219
"""
186-
return self._project
220+
if self._project:
221+
return self._project
222+
223+
_, project = self._try_set_default_credentials_and_project()
224+
return project
187225

188226
@project.setter
189227
def project(self, value: Optional[str]):

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@
5050
json_value,
5151
json_value_array,
5252
parse_json,
53+
to_json_string,
5354
)
5455
from bigframes.bigquery._operations.search import create_vector_index, vector_search
5556
from bigframes.bigquery._operations.sql import sql_scalar
@@ -87,6 +88,7 @@
8788
json_value,
8889
json_value_array,
8990
parse_json,
91+
to_json_string,
9092
# search ops
9193
create_vector_index,
9294
vector_search,

bigframes/bigquery/_operations/json.py

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -430,6 +430,40 @@ def json_value_array(
430430
return input._apply_unary_op(ops.JSONValueArray(json_path=json_path))
431431

432432

433+
def to_json_string(
434+
input: series.Series,
435+
) -> series.Series:
436+
"""Converts a series to a JSON-formatted STRING value.
437+
438+
**Examples:**
439+
440+
>>> import bigframes.pandas as bpd
441+
>>> import bigframes.bigquery as bbq
442+
>>> bpd.options.display.progress_bar = None
443+
444+
>>> s = bpd.Series([1, 2, 3])
445+
>>> bbq.to_json_string(s)
446+
0 1
447+
1 2
448+
2 3
449+
dtype: string
450+
451+
>>> s = bpd.Series([{"int": 1, "str": "pandas"}, {"int": 2, "str": "numpy"}])
452+
>>> bbq.to_json_string(s)
453+
0 {"int":1,"str":"pandas"}
454+
1 {"int":2,"str":"numpy"}
455+
dtype: string
456+
457+
Args:
458+
input (bigframes.series.Series):
459+
The Series to be converted.
460+
461+
Returns:
462+
bigframes.series.Series: A new Series with the JSON-formatted STRING value.
463+
"""
464+
return input._apply_unary_op(ops.ToJSONString())
465+
466+
433467
@utils.preview(name="The JSON-related API `parse_json`")
434468
def parse_json(
435469
input: series.Series,

bigframes/core/blocks.py

Lines changed: 25 additions & 86 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,6 @@
2727
import functools
2828
import itertools
2929
import random
30-
import textwrap
3130
import typing
3231
from typing import (
3332
Iterable,
@@ -55,16 +54,13 @@
5554
from bigframes.core import agg_expressions, local_data
5655
import bigframes.core as core
5756
import bigframes.core.agg_expressions as ex_types
58-
import bigframes.core.compile.googlesql as googlesql
5957
import bigframes.core.expression as ex
6058
import bigframes.core.expression as scalars
6159
import bigframes.core.guid as guid
6260
import bigframes.core.identifiers
6361
import bigframes.core.join_def as join_defs
6462
import bigframes.core.ordering as ordering
6563
import bigframes.core.pyarrow_utils as pyarrow_utils
66-
import bigframes.core.schema as bf_schema
67-
import bigframes.core.sql as sql
6864
import bigframes.core.utils as utils
6965
import bigframes.core.window_spec as windows
7066
import bigframes.dtypes
@@ -2779,14 +2775,6 @@ def _throw_if_null_index(self, opname: str):
27792775
)
27802776

27812777
def _get_rows_as_json_values(self) -> Block:
2782-
# We want to preserve any ordering currently present before turning to
2783-
# direct SQL manipulation. We will restore the ordering when we rebuild
2784-
# expression.
2785-
# TODO(shobs): Replace direct SQL manipulation by structured expression
2786-
# manipulation
2787-
expr, ordering_column_name = self.expr.promote_offsets()
2788-
expr_sql = self.session._executor.to_sql(expr)
2789-
27902778
# Names of the columns to serialize for the row.
27912779
# We will use the repr-eval pattern to serialize a value here and
27922780
# deserialize in the cloud function. Let's make sure that would work.
@@ -2802,93 +2790,44 @@ def _get_rows_as_json_values(self) -> Block:
28022790
)
28032791

28042792
column_names.append(serialized_column_name)
2805-
column_names_csv = sql.csv(map(sql.simple_literal, column_names))
2806-
2807-
# index columns count
2808-
index_columns_count = len(self.index_columns)
28092793

28102794
# column references to form the array of values for the row
28112795
column_types = list(self.index.dtypes) + list(self.dtypes)
28122796
column_references = []
28132797
for type_, col in zip(column_types, self.expr.column_ids):
2814-
if isinstance(type_, pd.ArrowDtype) and pa.types.is_binary(
2815-
type_.pyarrow_dtype
2816-
):
2817-
column_references.append(sql.to_json_string(col))
2798+
if type_ == bigframes.dtypes.BYTES_DTYPE:
2799+
column_references.append(ops.ToJSONString().as_expr(col))
2800+
elif type_ == bigframes.dtypes.BOOL_DTYPE:
2801+
# cast operator produces True/False, but function template expects lower case
2802+
column_references.append(
2803+
ops.lower_op.as_expr(
2804+
ops.AsTypeOp(bigframes.dtypes.STRING_DTYPE).as_expr(col)
2805+
)
2806+
)
28182807
else:
2819-
column_references.append(sql.cast_as_string(col))
2820-
2821-
column_references_csv = sql.csv(column_references)
2822-
2823-
# types of the columns to serialize for the row
2824-
column_types_csv = sql.csv(
2825-
[sql.simple_literal(str(typ)) for typ in column_types]
2826-
)
2808+
column_references.append(
2809+
ops.AsTypeOp(bigframes.dtypes.STRING_DTYPE).as_expr(col)
2810+
)
28272811

28282812
# row dtype to use for deserializing the row as pandas series
28292813
pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types)
28302814
if pandas_row_dtype is None:
28312815
pandas_row_dtype = "object"
2832-
pandas_row_dtype = sql.simple_literal(str(pandas_row_dtype))
2833-
2834-
# create a json column representing row through SQL manipulation
2835-
row_json_column_name = guid.generate_guid()
2836-
select_columns = (
2837-
[ordering_column_name] + list(self.index_columns) + [row_json_column_name]
2838-
)
2839-
select_columns_csv = sql.csv(
2840-
[googlesql.identifier(col) for col in select_columns]
2841-
)
2842-
json_sql = f"""\
2843-
With T0 AS (
2844-
{textwrap.indent(expr_sql, " ")}
2845-
),
2846-
T1 AS (
2847-
SELECT *,
2848-
TO_JSON_STRING(JSON_OBJECT(
2849-
"names", [{column_names_csv}],
2850-
"types", [{column_types_csv}],
2851-
"values", [{column_references_csv}],
2852-
"indexlength", {index_columns_count},
2853-
"dtype", {pandas_row_dtype}
2854-
)) AS {googlesql.identifier(row_json_column_name)} FROM T0
2855-
)
2856-
SELECT {select_columns_csv} FROM T1
2857-
"""
2858-
# The only ways this code is used is through df.apply(axis=1) cope path
2859-
destination, query_job = self.session._loader._query_to_destination(
2860-
json_sql, cluster_candidates=[ordering_column_name]
2861-
)
2862-
if not destination:
2863-
raise ValueError(f"Query job {query_job} did not produce result table")
2864-
2865-
new_schema = (
2866-
self.expr.schema.select([*self.index_columns])
2867-
.append(
2868-
bf_schema.SchemaItem(
2869-
row_json_column_name, bigframes.dtypes.STRING_DTYPE
2870-
)
2871-
)
2872-
.append(
2873-
bf_schema.SchemaItem(ordering_column_name, bigframes.dtypes.INT_DTYPE)
2874-
)
2875-
)
2816+
pandas_row_dtype = str(pandas_row_dtype)
28762817

2877-
dest_table = self.session.bqclient.get_table(destination)
2878-
expr = core.ArrayValue.from_table(
2879-
dest_table,
2880-
schema=new_schema,
2881-
session=self.session,
2882-
offsets_col=ordering_column_name,
2883-
n_rows=dest_table.num_rows,
2884-
).drop_columns([ordering_column_name])
2885-
block = Block(
2886-
expr,
2887-
index_columns=self.index_columns,
2888-
column_labels=[row_json_column_name],
2889-
index_labels=self._index_labels,
2818+
struct_op = ops.StructOp(
2819+
column_names=("names", "types", "values", "indexlength", "dtype")
28902820
)
2891-
return block
2821+
names_val = ex.const(tuple(column_names))
2822+
types_val = ex.const(tuple(map(str, column_types)))
2823+
values_val = ops.ToArrayOp().as_expr(*column_references)
2824+
indexlength_val = ex.const(len(self.index_columns))
2825+
dtype_val = ex.const(str(pandas_row_dtype))
2826+
struct_expr = struct_op.as_expr(
2827+
names_val, types_val, values_val, indexlength_val, dtype_val
2828+
)
2829+
block, col_id = self.project_expr(ops.ToJSONString().as_expr(struct_expr))
2830+
return block.select_column(col_id)
28922831

28932832

28942833
class BlockIndexProperties:

0 commit comments

Comments (0)