Skip to content

Commit a3c6d54

Browse files
Merge branch 'main' into better_uniform_sample
2 parents 1377016 + 7abfef0 commit a3c6d54

File tree

45 files changed

+1704
-257
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+1704
-257
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,4 @@ tests/js/node_modules/
6464
pylintrc
6565
pylintrc.test
6666
dummy.pkl
67+
.mypy_cache/

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
import sys
2020

21-
from bigframes.bigquery import ai, ml
21+
from bigframes.bigquery import ai, ml, obj
2222
from bigframes.bigquery._operations.approx_agg import approx_top_count
2323
from bigframes.bigquery._operations.array import (
2424
array_agg,
@@ -158,4 +158,5 @@
158158
# Modules / SQL namespaces
159159
"ai",
160160
"ml",
161+
"obj",
161162
]
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
"""This module exposes BigQuery ObjectRef functions.
17+
18+
See bigframes.bigquery.obj for public docs.
19+
"""
20+
21+
22+
from __future__ import annotations
23+
24+
import datetime
25+
from typing import Optional, Sequence, Union
26+
27+
import numpy as np
28+
import pandas as pd
29+
30+
from bigframes.core import convert
31+
from bigframes.core.logging import log_adapter
32+
import bigframes.core.utils as utils
33+
import bigframes.operations as ops
34+
import bigframes.series as series
35+
36+
37+
@log_adapter.method_logger(custom_base_name="bigquery_obj")
38+
def fetch_metadata(
39+
objectref: series.Series,
40+
) -> series.Series:
41+
"""[Preview] The OBJ.FETCH_METADATA function returns Cloud Storage metadata for a partially populated ObjectRef value.
42+
43+
Args:
44+
objectref (bigframes.pandas.Series):
45+
A partially populated ObjectRef value, in which the uri and authorizer fields are populated and the details field isn't.
46+
47+
Returns:
48+
bigframes.pandas.Series: A fully populated ObjectRef value. The metadata is provided in the details field of the returned ObjectRef value.
49+
"""
50+
objectref = convert.to_bf_series(objectref, default_index=None)
51+
return objectref._apply_unary_op(ops.obj_fetch_metadata_op)
52+
53+
54+
@log_adapter.method_logger(custom_base_name="bigquery_obj")
55+
def get_access_url(
56+
objectref: series.Series,
57+
mode: str,
58+
duration: Optional[Union[datetime.timedelta, pd.Timedelta, np.timedelta64]] = None,
59+
) -> series.Series:
60+
"""[Preview] The OBJ.GET_ACCESS_URL function returns JSON that contains reference information for the input ObjectRef value, and also access URLs that you can use to read or modify the Cloud Storage object.
61+
62+
Args:
63+
objectref (bigframes.pandas.Series):
64+
An ObjectRef value that represents a Cloud Storage object.
65+
mode (str):
66+
A STRING value that identifies the type of URL that you want to be returned. The following values are supported:
67+
'r': Returns a URL that lets you read the object.
68+
'rw': Returns two URLs, one that lets you read the object, and one that lets you modify the object.
69+
duration (Union[datetime.timedelta, pandas.Timedelta, numpy.timedelta64], optional):
70+
An optional INTERVAL value that specifies how long the generated access URLs remain valid. You can specify a value between 30 minutes and 6 hours. For example, you could specify INTERVAL 2 HOUR to generate URLs that expire after 2 hours. The default value is 6 hours.
71+
72+
Returns:
73+
bigframes.pandas.Series: A JSON value that contains the Cloud Storage object reference information from the input ObjectRef value, and also one or more URLs that you can use to access the Cloud Storage object.
74+
"""
75+
objectref = convert.to_bf_series(objectref, default_index=None)
76+
77+
duration_micros = None
78+
if duration is not None:
79+
duration_micros = utils.timedelta_to_micros(duration)
80+
81+
return objectref._apply_unary_op(
82+
ops.ObjGetAccessUrl(mode=mode, duration=duration_micros)
83+
)
84+
85+
86+
@log_adapter.method_logger(custom_base_name="bigquery_obj")
87+
def make_ref(
88+
uri_or_json: Union[series.Series, Sequence[str]],
89+
authorizer: Union[series.Series, str, None] = None,
90+
) -> series.Series:
91+
"""[Preview] Use the OBJ.MAKE_REF function to create an ObjectRef value that contains reference information for a Cloud Storage object.
92+
93+
Args:
94+
uri_or_json (bigframes.pandas.Series or str):
95+
A series of STRING values that contains the URI for the Cloud Storage object, for example, gs://mybucket/flowers/12345.jpg.
96+
OR
97+
A series of JSON value that represents a Cloud Storage object.
98+
authorizer (bigframes.pandas.Series or str, optional):
99+
A STRING value that contains the Cloud Resource connection used to access the Cloud Storage object.
100+
Required if ``uri_or_json`` is a URI string.
101+
102+
Returns:
103+
bigframes.pandas.Series: An ObjectRef value.
104+
"""
105+
uri_or_json = convert.to_bf_series(uri_or_json, default_index=None)
106+
107+
if authorizer is not None:
108+
# Avoid join problems encountered if we try to convert a literal into Series.
109+
if not isinstance(authorizer, str):
110+
authorizer = convert.to_bf_series(authorizer, default_index=None)
111+
112+
return uri_or_json._apply_binary_op(authorizer, ops.obj_make_ref_op)
113+
114+
# If authorizer is not provided, we assume uri_or_json is a JSON objectref
115+
return uri_or_json._apply_unary_op(ops.obj_make_ref_json_op)

bigframes/bigquery/obj.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""This module integrates BigQuery built-in 'ObjectRef' functions for use with Series/DataFrame objects,
16+
such as OBJ.FETCH_METADATA:
17+
https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/objectref_functions
18+
19+
20+
.. warning::
21+
22+
This product or feature is subject to the "Pre-GA Offerings Terms" in the
23+
General Service Terms section of the `Service Specific Terms
24+
<https://cloud.google.com/terms/service-terms>`_. Pre-GA products and
25+
features are available "as is" and might have limited support. For more
26+
information, see the `launch stage descriptions
27+
<https://cloud.google.com/products?hl=en#product-launch-stages>`_.
28+
29+
.. note::
30+
31+
To provide feedback or request support for this feature, send an email to
32+
bq-objectref-feedback@google.com.
33+
"""
34+
35+
from bigframes.bigquery._operations.obj import fetch_metadata, get_access_url, make_ref
36+
37+
__all__ = [
38+
"fetch_metadata",
39+
"get_access_url",
40+
"make_ref",
41+
]

bigframes/core/compile/ibis_compiler/scalar_op_registry.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import functools
1818
import typing
19+
from typing import cast
1920

2021
from bigframes_vendored import ibis
2122
import bigframes_vendored.ibis.expr.api as ibis_api
@@ -1247,6 +1248,13 @@ def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value):
12471248

12481249
@scalar_op_compiler.register_unary_op(ops.ObjGetAccessUrl, pass_op=True)
12491250
def obj_get_access_url_op_impl(obj_ref: ibis_types.Value, op: ops.ObjGetAccessUrl):
1251+
if op.duration is not None:
1252+
duration_value = cast(
1253+
ibis_types.IntegerValue, ibis_types.literal(op.duration)
1254+
).to_interval("us")
1255+
return obj_get_access_url_with_duration(
1256+
obj_ref=obj_ref, mode=op.mode, duration=duration_value
1257+
)
12501258
return obj_get_access_url(obj_ref=obj_ref, mode=op.mode)
12511259

12521260

@@ -1807,6 +1815,11 @@ def obj_make_ref_op(x: ibis_types.Value, y: ibis_types.Value):
18071815
return obj_make_ref(uri=x, authorizer=y)
18081816

18091817

1818+
@scalar_op_compiler.register_unary_op(ops.obj_make_ref_json_op)
1819+
def obj_make_ref_json_op(x: ibis_types.Value):
1820+
return obj_make_ref_json(objectref_json=x)
1821+
1822+
18101823
# Ternary Operations
18111824
@scalar_op_compiler.register_ternary_op(ops.where_op)
18121825
def where_op(
@@ -2141,11 +2154,21 @@ def obj_make_ref(uri: str, authorizer: str) -> _OBJ_REF_IBIS_DTYPE: # type: ign
21412154
"""Make ObjectRef Struct from uri and connection."""
21422155

21432156

2157+
@ibis_udf.scalar.builtin(name="OBJ.MAKE_REF")
2158+
def obj_make_ref_json(objectref_json: ibis_dtypes.JSON) -> _OBJ_REF_IBIS_DTYPE: # type: ignore
2159+
"""Make ObjectRef Struct from json."""
2160+
2161+
21442162
@ibis_udf.scalar.builtin(name="OBJ.GET_ACCESS_URL")
21452163
def obj_get_access_url(obj_ref: _OBJ_REF_IBIS_DTYPE, mode: ibis_dtypes.String) -> ibis_dtypes.JSON: # type: ignore
21462164
"""Get access url (as ObjectRefRumtime JSON) from ObjectRef."""
21472165

21482166

2167+
@ibis_udf.scalar.builtin(name="OBJ.GET_ACCESS_URL")
2168+
def obj_get_access_url_with_duration(obj_ref, mode, duration) -> ibis_dtypes.JSON: # type: ignore
2169+
"""Get access url (as ObjectRefRumtime JSON) from ObjectRef."""
2170+
2171+
21492172
@ibis_udf.scalar.builtin(name="ltrim")
21502173
def str_lstrip_op( # type: ignore[empty-body]
21512174
x: ibis_dtypes.String, to_strip: ibis_dtypes.String

bigframes/core/compile/sqlglot/aggregations/unary_compiler.py

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from bigframes.core import window_spec
2424
import bigframes.core.compile.sqlglot.aggregations.op_registration as reg
2525
from bigframes.core.compile.sqlglot.aggregations.windows import apply_window_if_present
26+
from bigframes.core.compile.sqlglot.expressions import constants
2627
import bigframes.core.compile.sqlglot.expressions.typed_expr as typed_expr
2728
import bigframes.core.compile.sqlglot.sqlglot_ir as ir
2829
from bigframes.operations import aggregations as agg_ops
@@ -44,9 +45,13 @@ def _(
4445
column: typed_expr.TypedExpr,
4546
window: typing.Optional[window_spec.WindowSpec] = None,
4647
) -> sge.Expression:
47-
# BQ will return null for empty column, result would be false in pandas.
48-
result = apply_window_if_present(sge.func("LOGICAL_AND", column.expr), window)
49-
return sge.func("IFNULL", result, sge.true())
48+
expr = column.expr
49+
if column.dtype != dtypes.BOOL_DTYPE:
50+
expr = sge.NEQ(this=expr, expression=sge.convert(0))
51+
expr = apply_window_if_present(sge.func("LOGICAL_AND", expr), window)
52+
53+
# BQ will return null for empty column, result would be true in pandas.
54+
return sge.func("COALESCE", expr, sge.convert(True))
5055

5156

5257
@UNARY_OP_REGISTRATION.register(agg_ops.AnyOp)
@@ -56,6 +61,8 @@ def _(
5661
window: typing.Optional[window_spec.WindowSpec] = None,
5762
) -> sge.Expression:
5863
expr = column.expr
64+
if column.dtype != dtypes.BOOL_DTYPE:
65+
expr = sge.NEQ(this=expr, expression=sge.convert(0))
5966
expr = apply_window_if_present(sge.func("LOGICAL_OR", expr), window)
6067

6168
# BQ will return null for empty column, result would be false in pandas.
@@ -326,6 +333,15 @@ def _(
326333
unit=sge.Identifier(this="MICROSECOND"),
327334
)
328335

336+
if column.dtype == dtypes.DATE_DTYPE:
337+
date_diff = sge.DateDiff(
338+
this=column.expr, expression=shifted, unit=sge.Identifier(this="DAY")
339+
)
340+
return sge.Cast(
341+
this=sge.Floor(this=date_diff * constants._DAY_TO_MICROSECONDS),
342+
to="INT64",
343+
)
344+
329345
raise TypeError(f"Cannot perform diff on type {column.dtype}")
330346

331347

@@ -410,24 +426,28 @@ def _(
410426
column: typed_expr.TypedExpr,
411427
window: typing.Optional[window_spec.WindowSpec] = None,
412428
) -> sge.Expression:
429+
expr = column.expr
430+
if column.dtype == dtypes.BOOL_DTYPE:
431+
expr = sge.Cast(this=expr, to="INT64")
432+
413433
# Need to short-circuit as log with zeroes is illegal sql
414-
is_zero = sge.EQ(this=column.expr, expression=sge.convert(0))
434+
is_zero = sge.EQ(this=expr, expression=sge.convert(0))
415435

416436
# There is no product sql aggregate function, so must implement as a sum of logs, and then
417437
# apply power after. Note, log and power base must be equal! This impl uses natural log.
418-
logs = (
419-
sge.Case()
420-
.when(is_zero, sge.convert(0))
421-
.else_(sge.func("LN", sge.func("ABS", column.expr)))
438+
logs = sge.If(
439+
this=is_zero,
440+
true=sge.convert(0),
441+
false=sge.func("LOG", sge.convert(2), sge.func("ABS", expr)),
422442
)
423443
logs_sum = apply_window_if_present(sge.func("SUM", logs), window)
424-
magnitude = sge.func("EXP", logs_sum)
444+
magnitude = sge.func("POWER", sge.convert(2), logs_sum)
425445

426446
# Can't determine sign from logs, so have to determine parity of count of negative inputs
427447
is_negative = (
428448
sge.Case()
429449
.when(
430-
sge.LT(this=sge.func("SIGN", column.expr), expression=sge.convert(0)),
450+
sge.EQ(this=sge.func("SIGN", expr), expression=sge.convert(-1)),
431451
sge.convert(1),
432452
)
433453
.else_(sge.convert(0))
@@ -445,11 +465,7 @@ def _(
445465
.else_(
446466
sge.Mul(
447467
this=magnitude,
448-
expression=sge.If(
449-
this=sge.EQ(this=negative_count_parity, expression=sge.convert(1)),
450-
true=sge.convert(-1),
451-
false=sge.convert(1),
452-
),
468+
expression=sge.func("POWER", sge.convert(-1), negative_count_parity),
453469
)
454470
)
455471
)
@@ -499,14 +515,18 @@ def _(
499515
column: typed_expr.TypedExpr,
500516
window: typing.Optional[window_spec.WindowSpec] = None,
501517
) -> sge.Expression:
502-
# TODO: Support interpolation argument
503-
# TODO: Support percentile_disc
504-
result: sge.Expression = sge.func("PERCENTILE_CONT", column.expr, sge.convert(op.q))
518+
expr = column.expr
519+
if column.dtype == dtypes.BOOL_DTYPE:
520+
expr = sge.Cast(this=expr, to="INT64")
521+
522+
result: sge.Expression = sge.func("PERCENTILE_CONT", expr, sge.convert(op.q))
505523
if window is None:
506-
# PERCENTILE_CONT is a navigation function, not an aggregate function, so it always needs an OVER clause.
524+
# PERCENTILE_CONT is a navigation function, not an aggregate function,
525+
# so it always needs an OVER clause.
507526
result = sge.Window(this=result)
508527
else:
509528
result = apply_window_if_present(result, window)
529+
510530
if op.should_floor_result:
511531
result = sge.Cast(this=sge.func("FLOOR", result), to="INT64")
512532
return result

bigframes/core/compile/sqlglot/expressions/blob_ops.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,22 @@ def _(expr: TypedExpr) -> sge.Expression:
3131

3232
@register_unary_op(ops.ObjGetAccessUrl, pass_op=True)
3333
def _(expr: TypedExpr, op: ops.ObjGetAccessUrl) -> sge.Expression:
34-
return sge.func("OBJ.GET_ACCESS_URL", expr.expr, sge.convert(op.mode))
34+
args = [expr.expr, sge.Literal.string(op.mode)]
35+
if op.duration is not None:
36+
args.append(
37+
sge.Interval(
38+
this=sge.Literal.number(op.duration),
39+
unit=sge.Var(this="MICROSECOND"),
40+
)
41+
)
42+
return sge.func("OBJ.GET_ACCESS_URL", *args)
3543

3644

3745
@register_binary_op(ops.obj_make_ref_op)
3846
def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
3947
return sge.func("OBJ.MAKE_REF", left.expr, right.expr)
48+
49+
50+
@register_unary_op(ops.obj_make_ref_json_op)
51+
def _(expr: TypedExpr) -> sge.Expression:
52+
return sge.func("OBJ.MAKE_REF", expr.expr)

0 commit comments

Comments
 (0)