Skip to content
27 changes: 15 additions & 12 deletions packages/bigframes/bigframes/bigquery/_operations/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def generate(
>>> import bigframes.pandas as bpd
>>> import bigframes.bigquery as bbq
>>> country = bpd.Series(["Japan", "Canada"])
>>> bbq.ai.generate(("What's the capital city of ", country, " one word only"))
>>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) # doctest: +ELLIPSIS
0 {'result': 'Tokyo', 'full_response': '{"cand...
1 {'result': 'Ottawa', 'full_response': '{"can...
dtype: struct<result: string, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow]
Expand Down Expand Up @@ -231,8 +231,8 @@ def generate_int(

>>> import bigframes.pandas as bpd
>>> import bigframes.bigquery as bbq
>>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"])
>>> bbq.ai.generate_int(("How many legs does a ", animal, " have?"))
>>> animal = bpd.Series(["Ostrich", "Rabbit", "Spider"])
>>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) # doctest: +ELLIPSIS
0 {'result': 2, 'full_response': '{"candidates":...
1 {'result': 4, 'full_response': '{"candidates":...
2 {'result': 8, 'full_response': '{"candidates":...
Expand Down Expand Up @@ -305,8 +305,8 @@ def generate_double(

>>> import bigframes.pandas as bpd
>>> import bigframes.bigquery as bbq
>>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"])
>>> bbq.ai.generate_double(("How many legs does a ", animal, " have?"))
>>> animal = bpd.Series(["Ostrich", "Rabbit", "Spider"])
>>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) # doctest: +ELLIPSIS
0 {'result': 2.0, 'full_response': '{"candidates...
1 {'result': 4.0, 'full_response': '{"candidates...
2 {'result': 8.0, 'full_response': '{"candidates...
Expand Down Expand Up @@ -383,7 +383,7 @@ def generate_embedding(
>>> import bigframes.pandas as bpd
>>> import bigframes.bigquery as bbq
>>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
>>> bbq.ai.generate_embedding(
>>> bbq.ai.generate_embedding( # doctest: +SKIP
... "project.dataset.model_name",
... df
... )
Expand Down Expand Up @@ -486,7 +486,7 @@ def generate_text(
>>> import bigframes.pandas as bpd
>>> import bigframes.bigquery as bbq
>>> df = bpd.DataFrame({"prompt": ["write a poem about apples"]})
>>> bbq.ai.generate_text(
>>> bbq.ai.generate_text( # doctest: +SKIP
... "project.dataset.model_name",
... df
... )
Expand Down Expand Up @@ -601,7 +601,7 @@ def generate_table(
>>> # the necessary columns for the model's prompt. For example, a
>>> # DataFrame with a 'prompt' column for text classification.
>>> df = bpd.DataFrame({'prompt': ["some text to classify"]})
>>> result = bbq.ai.generate_table(
>>> result = bbq.ai.generate_table( # doctest: +SKIP
... "project.dataset.model_name",
... data=df,
... output_schema="category STRING"
Expand Down Expand Up @@ -708,12 +708,14 @@ def embed(

>>> import bigframes.pandas as bpd
>>> import bigframes.bigquery as bbq
>>> bbq.ai.embed("dog", endpoint="text-embedding-005")
>>> bbq.ai.embed("dog", endpoint="text-embedding-005") # doctest: +ELLIPSIS
0 {'result': array([ 1.78243860e-03, -1.10658340...
dtype: struct<result: list<item: double>, status: string>[pyarrow]

>>> s = bpd.Series(['dog'])
>>> bbq.ai.embed(s, endpoint='text-embedding-005')
>>> bbq.ai.embed(s, endpoint='text-embedding-005') # doctest: +ELLIPSIS
0 {'result': array([ 1.78243860e-03, -1.10658340...
dtype: struct<result: list<item: double>, status: string>[pyarrow]

Args:
content (str | Series):
Expand Down Expand Up @@ -1004,6 +1006,7 @@ def similarity(
>>> bbq.ai.similarity(df['word'], 'glad', endpoint='text-embedding-005')
0 0.916601
1 0.660579
Name: word, dtype: Float64

Args:
content1 (str | Series):
Expand Down Expand Up @@ -1082,8 +1085,8 @@ def forecast(
>>> df = pd.DataFrame({"value": [1, 2, 3], "time": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"])})
>>> bpd.options.display.progress_bar = None
>>> forecasted_pandas_df = df.bigquery.ai.forecast(data_col="value", timestamp_col="time", horizon=2)
>>> type(forecasted_pandas_df)
<class 'pandas.core.frame.DataFrame'>
>>> type(forecasted_pandas_df) # doctest: +ELLIPSIS
<class 'pandas...DataFrame'>

Forecast using a BigFrames DataFrame:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,5 @@ def struct(value: dataframe.DataFrame) -> series.Series:
block, result_id = block.apply_nary_op(
block.value_columns, ops.StructOp(column_names=tuple(block.column_labels))
)
block = block.select_column(result_id)
block = block.select_column(result_id).with_column_labels([None])
return series.Series(block)
6 changes: 6 additions & 0 deletions packages/bigframes/bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1989,6 +1989,10 @@ def _generate_resample_label(
)
level = level or 0
col_id = self.index.resolve_level(level)[0]
if isinstance(level, int):
resample_label = self.index.names[level]
else:
resample_label = level
# Reset index to make the resampling level a column, then drop all other index columns.
# This simplifies processing by focusing solely on the column required for resampling.
block = self.reset_index(drop=False)
Expand All @@ -2007,6 +2011,7 @@ def _generate_resample_label(
raise KeyError(f"The grouper name {on} is not found")

col_id = matches[0]
resample_label = on
block = self
if level is None:
dtype = self._column_type(col_id)
Expand Down Expand Up @@ -2099,6 +2104,7 @@ def _generate_resample_label(
block.value_columns[0],
block.value_columns[1],
op=ops.IntegerLabelToDatetimeOp(freq=freq, label=label, origin=origin),
result_label=resample_label,
)

# After multiple merges, the columns:
Expand Down
53 changes: 52 additions & 1 deletion packages/bigframes/bigframes/core/compile/polars/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,59 @@ def _(
self,
expression: ex.OpExpression,
) -> pl.Expr:
# TODO: Complete the implementation
import datetime

import pyarrow as pa

op = expression.op

# Polars panics on nulls from pandas objects in timezone-aware
# datetimes for certain ops. Convert to timezone-naive temporarily
# to avoid this issue.
# TODO(tswast): Remove workaround when
# https://github.com/pola-rs/polars/issues/27862 has been fixed.
is_problematic_op = type(op) in (
date_ops.YearOp,
date_ops.QuarterOp,
date_ops.MonthOp,
date_ops.DayOp,
date_ops.IsoWeekOp,
)

if is_problematic_op and len(expression.inputs) == 1:
input_expr = expression.inputs[0]
if (
input_expr.is_resolved
and isinstance(input_expr.output_type, pd.ArrowDtype)
and isinstance(
input_expr.output_type.pyarrow_dtype, pa.TimestampType
)
and input_expr.output_type.pyarrow_dtype.tz is not None
):
tz_str = input_expr.output_type.pyarrow_dtype.tz
if tz_str == "UTC":
dummy_tz = datetime.timezone.utc
else:
try:
from zoneinfo import ZoneInfo

dummy_tz = ZoneInfo(tz_str) # type: ignore
except Exception:
dummy_tz = datetime.timezone.utc

dummy_val = datetime.datetime(1970, 1, 1, tzinfo=dummy_tz)

compiled_input = self.compile_expression(input_expr)
filled_input = compiled_input.fill_null(dummy_val)
compiled_op_with_fill = self.compile_op(op, filled_input)

return (
pl.when(compiled_input.is_null())
.then(None)
.otherwise(compiled_op_with_fill)
)

# TODO: Complete the implementation
args = tuple(map(self.compile_expression, expression.inputs))
return self.compile_op(op, *args)

Expand Down
1 change: 1 addition & 0 deletions packages/bigframes/bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
# Return boolean mask for non-monotonic duplicates
mask_block = block_with_offsets.select_columns([match_col_id])
mask_block = mask_block.reset_index(drop=True)
mask_block = mask_block.with_column_labels([None])
result_series = bigframes.series.Series(mask_block)
return result_series.astype("boolean")

Expand Down
10 changes: 4 additions & 6 deletions packages/bigframes/bigframes/operations/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,10 @@ def map(
>>> model = llm.GeminiTextGenerator(model_name="gemini-2.5-pro")

>>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
>>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={"food": "string"})
ingredient_1 ingredient_2 food
0 Burger Bun Beef Patty Burger
<BLANKLINE>
1 Soy Bean Bittern Tofu
<BLANKLINE>
>>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={"food": "string"}) # doctest: +ELLIPSIS
ingredient_1 ingredient_2...
0 Burger Bun Beef Patty...
1 Soy Bean Bittern...Tofu
<BLANKLINE>
[2 rows x 3 columns]

Expand Down
6 changes: 4 additions & 2 deletions packages/bigframes/bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2470,7 +2470,9 @@ def map(

self_df = self.to_frame(name="series")
result_df = self_df.join(map_df, on="series")
return result_df[self.name]
result = cast(Series, result_df[self.name])
result.name = self.name
return result

@validations.requires_ordering()
def sample(
Expand Down Expand Up @@ -2696,7 +2698,7 @@ def _apply_nary_op(
others, ignore_self=ignore_self, cast_scalars=False
)
block, result_id = block.project_expr(op.as_expr(*values))
return Series(block.select_column(result_id))
return Series(block.select_column(result_id).with_column_labels([None]))

def _apply_binary_aggregation(
self, other: Series, stat: agg_ops.BinaryAggregateOp
Expand Down
8 changes: 8 additions & 0 deletions packages/bigframes/bigframes/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,14 @@ def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwar
def _normalize_all_nulls(col: pd.Series) -> pd.Series:
if pd_types.is_float_dtype(col.dtype):
col = col.astype("float64").astype("Float64")
elif col.dtype == "object":
if any(isinstance(x, decimal.Decimal) for x in col):
pass
else:
try:
col = col.astype("Float64")
except (TypeError, ValueError, SystemError):
pass
return col


Expand Down
3 changes: 2 additions & 1 deletion packages/bigframes/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"fsspec >=2023.3.0",
"gcsfs >=2023.3.0, !=2025.5.0, !=2026.2.0, !=2026.3.0",
"geopandas >=0.12.2",
"google-auth >=2.15.0,<3.0",
"google-auth[pyopenssl] >=2.15.0,<3.0",
"google-cloud-bigquery[bqstorage,pandas] >=3.36.0",
# 2.30 needed for arrow support.
"google-cloud-bigquery-storage >= 2.30.0, < 3.0.0",
Expand Down Expand Up @@ -75,6 +75,7 @@
"pytest-snapshot",
"google-cloud-bigtable >=2.24.0",
"google-cloud-pubsub >=2.21.4",
"tzdata",
],
# used for local engine
"polars": ["polars >= 1.21.0"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,12 @@ def add_one(x):

pd_int64_df = scalars_pandas_df[int64_cols]
pd_int64_df_filtered = pd_int64_df.dropna()
pd_result = pd_int64_df_filtered.applymap(add_one)

# TODO(swast): Remove when pandas 2.1.x+ is the minimum supported.
if hasattr(pd_int64_df_filtered, "map"):
pd_result = pd_int64_df_filtered.map(add_one)
Comment thread
tswast marked this conversation as resolved.
else:
pd_result = pd_int64_df_filtered.applymap(add_one)
# TODO(shobs): Figure why pandas .applymap() changes the dtype, i.e.
# pd_int64_df_filtered.dtype is Int64Dtype()
# pd_int64_df_filtered.applymap(lambda x: x).dtype is int64.
Expand Down Expand Up @@ -503,7 +508,13 @@ def add_one(x):

pd_int64_df = scalars_pandas_df[int64_cols]
pd_int64_df_filtered = pd_int64_df[pd_int64_df["int64_col"].notnull()]
pd_result = pd_int64_df_filtered.applymap(add_one)

# TODO(swast): Remove when pandas 2.1.x+ is the minimum supported.
if hasattr(pd_int64_df_filtered, "map"):
pd_result = pd_int64_df_filtered.map(add_one)
Comment thread
tswast marked this conversation as resolved.
else:
pd_result = pd_int64_df_filtered.applymap(add_one)

# TODO(shobs): Figure why pandas .applymap() changes the dtype, i.e.
# pd_int64_df_filtered.dtype is Int64Dtype()
# pd_int64_df_filtered.applymap(lambda x: x).dtype is int64.
Expand Down Expand Up @@ -536,7 +547,13 @@ def add_one(x):
bf_result = bf_int64_df.applymap(remote_add_one, na_action="ignore").to_pandas()

pd_int64_df = scalars_pandas_df[int64_cols]
pd_result = pd_int64_df.applymap(add_one, na_action="ignore")

# TODO(swast): Remove when pandas 2.1.x+ is the minimum supported.
if hasattr(pd_int64_df, "map"):
pd_result = pd_int64_df.map(add_one, na_action="ignore")
Comment thread
tswast marked this conversation as resolved.
else:
pd_result = pd_int64_df.applymap(add_one, na_action="ignore")

# TODO(shobs): Figure why pandas .applymap() changes the dtype, i.e.
# pd_int64_df_filtered.dtype is Int64Dtype()
# pd_int64_df_filtered.applymap(lambda x: x).dtype is int64.
Expand Down
4 changes: 2 additions & 2 deletions packages/bigframes/tests/system/small/test_magics.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_magic_select_lit_to_var(ip):
assert "dst_var" in ip.user_ns
result_df = ip.user_ns["dst_var"]
assert result_df.shape == (1, 1)
assert result_df.loc[0, 0] == 3
assert result_df.to_pandas().iloc[0, 0] == 3


def test_magic_select_lit_dry_run(ip):
Expand Down Expand Up @@ -97,4 +97,4 @@ def test_magic_select_interpolate(ip):
assert "dst_var" in ip.user_ns
result_df = ip.user_ns["dst_var"]
assert result_df.shape == (1, 1)
assert result_df.loc[0, 0] == 9
assert result_df.loc[0, "total"] == 9
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ WITH `bfcte_0` AS (
SELECT
CAST(TIMESTAMP_MICROS(
CAST(CAST(`bfcol_17` AS BIGNUMERIC) * 7000000 + CAST(UNIX_MICROS(CAST(CAST(`bfcol_8` AS DATE) AS TIMESTAMP)) AS BIGNUMERIC) AS INT64)
) AS DATETIME) AS `bigframes_unnamed_index`,
) AS DATETIME) AS `timestamp_col`,
`bfcol_11` AS `int64_col`,
`bfcol_12` AS `int64_too`
FROM (
Expand All @@ -72,4 +72,4 @@ FROM (
LEFT JOIN `bfcte_5`
ON `bfcol_17` = `bfcol_13`
ORDER BY
`bfcol_17` ASC NULLS LAST
`bfcol_17` ASC NULLS LAST
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
SELECT
`rowindex`,
ROUND(`int64_col` + `int64_too`) AS `0`
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0`
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0`
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
SELECT
`rowindex`,
ROUND(`int64_col` + `int64_too`) AS `0`
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0`
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0`
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ def test_compile_fromrange(compiler_session, snapshot):
sql, _, _ = df.resample(rule="7s")._block.to_sql_query(
include_index=True, enable_cache=False
)
snapshot.assert_match(sql, "out.sql")
snapshot.assert_match(sql.strip() + "\n", "out.sql")
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@

pytest.importorskip("pytest_snapshot")

# Only test on the latest pandas since column naming behavior is slightly
# different across versions, e.g. unnamed vs 0 for unnamed Series.
pytest.importorskip("pandas", minversion="3.0.0")


def test_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot, monkeypatch):
session = mock.create_autospec(bigframes.session.Session)
Expand All @@ -42,7 +46,7 @@ def to_pandas(series, *, ordered):
)

session.read_pandas.assert_called_once()
snapshot.assert_match(result, "out.sql")
snapshot.assert_match(result.strip() + "\n", "out.sql")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would this issue impact every snapshot? I assume this is from mismatch between sqlglot version locally vs on the automated test runs?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's actually not a version mismatch. It's because I was fighting the pre-commit end-of-file newline "fixer". Basically, I'm applying that change here so that when we re-run it, the "fixer" won't see it as a problem because it's looking for exactly 1 trailing newline character.



def test_bigframes_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot):
Expand All @@ -57,4 +61,4 @@ def test_bigframes_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot):
session.read_pandas.assert_not_called()
# Bigframes implementation returns a bigframes.series.Series
sql, _, _ = result.to_frame()._to_sql_query(include_index=True)
snapshot.assert_match(sql, "out.sql")
snapshot.assert_match(sql.strip() + "\n", "out.sql")
Loading
Loading