Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit 69afd94

Browse files
fixup str extract method
1 parent: 7fb3cdc · commit: 69afd94

5 files changed

Lines changed: 53 additions & 14 deletions

File tree

bigframes/operations/strings.py

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
from __future__ import annotations
1616

1717
import re
18-
from typing import Generic, Literal, Optional, TypeVar, Union
18+
from typing import Generic, Hashable, Literal, Optional, TypeVar, Union
1919

2020
import bigframes_vendored.constants as constants
2121
import bigframes_vendored.pandas.core.strings.accessor as vendorstr
@@ -209,23 +209,19 @@ def extract(self, pat: str, flags: int = 0) -> df.DataFrame:
209209
if compiled.groups == 0:
210210
raise ValueError("No capture groups in 'pat'")
211211

212-
results: list[str] = []
213-
block = self._data._block
212+
results: dict[Hashable, T] = {}
214213
for i in range(compiled.groups):
215214
labels = [
216215
label
217216
for label, groupn in compiled.groupindex.items()
218217
if i + 1 == groupn
219218
]
220-
label = labels[0] if labels else str(i)
221-
block, id = block.apply_unary_op(
222-
self._data._value_column,
219+
label = labels[0] if labels else i
220+
result = self._data._apply_unary_op(
223221
ops.StrExtractOp(pat=pat, n=i + 1),
224-
result_label=label,
225222
)
226-
results.append(id)
227-
block = block.select_columns(results)
228-
return df.DataFrame(block)
223+
results[label] = series.Series(result)
224+
return df.DataFrame(results)
229225

230226
def replace(
231227
self,
@@ -287,10 +283,10 @@ def split(
287283
)
288284
return self._data._apply_unary_op(ops.StringSplitOp(pat=pat))
289285

290-
def zfill(self, width: int) -> series.Series:
286+
def zfill(self, width: int) -> T:
291287
return self._data._apply_unary_op(ops.ZfillOp(width=width))
292288

293-
def center(self, width: int, fillchar: str = " ") -> series.Series:
289+
def center(self, width: int, fillchar: str = " ") -> T:
294290
return self._data._apply_unary_op(
295291
ops.StrPadOp(length=width, fillchar=fillchar, side="both")
296292
)

scripts/publish_api_coverage.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@
3030
import bigframes.core.groupby
3131
import bigframes.core.window
3232
import bigframes.operations.datetimes
33+
import bigframes.operations.strings
3334
import bigframes.pandas as bpd
3435

3536
REPO_ROOT = pathlib.Path(__file__).parent.parent

tests/system/small/operations/test_strings.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -78,8 +78,6 @@ def test_str_extract(scalars_dfs, pat):
7878
bf_result = bf_series.str.extract(pat).to_pandas()
7979
pd_result = scalars_pandas_df[col_name].str.extract(pat)
8080

81-
# Pandas produces int col labels, while bq df only supports str labels at present
82-
pd_result = pd_result.set_axis(pd_result.columns.astype(str), axis=1)
8381
pd.testing.assert_frame_equal(
8482
pd_result,
8583
bf_result,

tests/system/small/test_index.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -707,3 +707,17 @@ def test_index_str_accessor_binary(scalars_df_index, scalars_pandas_df_index):
707707
pd_result = pd_index.str.cat(pd_index.str[:4])
708708

709709
pd.testing.assert_index_equal(bf_result, pd_result)
710+
711+
712+
@pytest.mark.parametrize(
713+
("pat"),
714+
[(r"(ell)(lo)"), (r"(?P<somename>h..)"), (r"(?P<somename>e.*o)([g-l]+)")],
715+
)
716+
def test_index_str_extract(scalars_df_index, scalars_pandas_df_index, pat):
717+
bf_index = scalars_df_index.set_index("string_col").index
718+
pd_index = scalars_pandas_df_index.set_index("string_col").index
719+
720+
bf_result = bf_index.str.extract(pat).to_pandas()
721+
pd_result = pd_index.str.extract(pat)
722+
723+
pd.testing.assert_frame_equal(pd_result, bf_result, check_index_type=False)

third_party/bigframes_vendored/pandas/core/indexes/base.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -366,6 +366,36 @@ def T(self) -> Index:
366366
"""
367367
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
368368

369+
@property
370+
def str(self):
371+
"""
372+
Vectorized string functions for Series and Index.
373+
374+
NAs stay NA unless handled otherwise by a particular method. Patterned
375+
after Python’s string methods, with some inspiration from R’s stringr package.
376+
377+
**Examples:**
378+
379+
>>> import bigframes.pandas as bpd
380+
>>> s = bpd.Series(["A_Str_Series"])
381+
>>> s
382+
0 A_Str_Series
383+
dtype: string
384+
385+
>>> s.str.lower()
386+
0 a_str_series
387+
dtype: string
388+
389+
>>> s.str.replace("_", "")
390+
0 AStrSeries
391+
dtype: string
392+
393+
Returns:
394+
bigframes.operations.strings.StringMethods:
395+
An accessor containing string methods.
396+
"""
397+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
398+
369399
def copy(
370400
self,
371401
name=None,

0 commit comments

Comments
 (0)