Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions buckaroo/customizations/pandas_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,16 +473,21 @@ class Search(Command):
command_pattern = [[3, 'term', 'type', 'string']]
quick_args_pattern = [[3, 'term', 'type', 'string']]

@staticmethod
@staticmethod
def transform(df, col, val):
#print("search_df", val)
if val is None or val == "":
#print("no search term set")
return df
return search_df_str(df, val)
filtered = search_df_str(df, val)
# search_df_str uses literal `Series.str.find`, so expose the term
# as highlight_phrase (list) — not highlight_regex — for matching
# filter semantics on the JS-side string displayer.
str_cols = list(df.select_dtypes("string").columns)
str_cols.extend(list(df.select_dtypes("object").columns))
sd_updates = {c: {'highlight_phrase': [val]} for c in str_cols}
return SDResult(filtered, sd_updates)


@staticmethod
@staticmethod
def transform_to_py(df, col, val):
return f""" from buckaroo.customizations.pandas_commands import search_df_str
return search_df_str(df, '{val}')"""
Expand Down
7 changes: 5 additions & 2 deletions tests/unit/commands/pandas_commands_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,12 +378,15 @@ def test_search_none_needle():


def test_search_with_match():
"""Test Search filters rows containing the needle."""
"""Test Search filters rows containing the needle. Search returns
SDResult(filtered_df, sd_updates) so the JS side can highlight the
search term — unwrap the df for the length assertion."""
base_df = pd.DataFrame({
'name': pd.Series(['Alice', 'Bob', 'Charlie'], dtype='object'),
'b': [1, 2, 3]})
result = Search.transform(base_df.copy(), 'name', 'Bob')
assert len(result) == 1
assert len(result.df) == 1
assert result.sd_updates['name'] == {'highlight_phrase': ['Bob']}


def test_search_to_py():
Expand Down
31 changes: 27 additions & 4 deletions tests/unit/dataflow/autocleaning_pd_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@
from buckaroo.dataflow.autocleaning import AutocleaningConfig
from buckaroo.dataflow.autocleaning import PandasAutocleaning, generate_quick_ops
from buckaroo.jlisp.lisp_utils import (s, sA, sQ)
from buckaroo.customizations.pandas_commands import (
Command,
SafeInt, DropCol, FillNA, GroupBy, NoOp, Search, OnlyOutliers
)
from buckaroo.customizations.pandas_commands import (Command, SafeInt, DropCol, FillNA, GroupBy, NoOp, Search, OnlyOutliers)
from buckaroo.customizations.pd_autoclean_conf import (NoCleaningConf)
from buckaroo.dataflow.dataflow import CustomizableDataflow

Expand Down Expand Up @@ -624,3 +621,29 @@ def test_two_arg_quick_command():





class SearchConf(AutocleaningConfig):
    # Minimal autocleaning config: no analysis classes, and Search as the
    # only registered command, so a test built on it exercises Search alone.
    autocleaning_analysis_klasses = []
    command_klasses = [Search]
    name = ""


def test_search_threads_highlight_phrase_into_cleaning_sd_under_rename():
"""Search plumbs its search term into cleaning_sd as highlight_phrase on
every string/object column. The rest of the sd is keyed by buckaroo's
internal a/b/c names, so autocleaning rewrites the op-supplied keys to
match — otherwise the entries would sit alongside as orphans without
a `_type` and trip the styling fallback. pandas Search uses literal
str.find, so highlight_phrase (not _regex) matches the filter semantics."""
ac = PandasAutocleaning([SearchConf])
# 'businessname' (object) becomes 'a', 'rating' (int) becomes 'b'.
df = pd.DataFrame({'businessname': ['pizza', 'sushi'], 'rating': [5, 4]})
search_op = [{'symbol': 'search'}, s('df'), 'col', 'pizza']

_cleaned, cleaning_sd, _gen, _ops = ac.handle_ops_and_clean(
df, cleaning_method='', quick_command_args={}, existing_operations=[search_op])

assert cleaning_sd.get('a', {}).get('highlight_phrase') == ['pizza']
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Include the pandas Search SDResult implementation

This assertion cannot pass with the code in this commit because only tests changed: `Search.transform` in `buckaroo/customizations/pandas_commands.py` still returns the bare `search_df_str(df, val)` result, so no `SDResult` metadata is merged and `cleaning_sd` remains empty. I verified the new tests with `.venv/bin/python -m pytest tests/unit/dataflow/autocleaning_pd_test.py::test_search_threads_highlight_phrase_into_cleaning_sd_under_rename tests/unit/dataflow/customizable_dataflow_test.py::test_search_op_delivers_highlight_phrase_into_displayer_args -q`; both fail, blocking CI until the pandas `Search` implementation is added.

Useful? React with 👍 / 👎.

assert 'businessname' not in cleaning_sd
assert 'highlight_phrase' not in cleaning_sd.get('b', {})
35 changes: 35 additions & 0 deletions tests/unit/dataflow/customizable_dataflow_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,3 +581,38 @@ class VCBuckarooWidget(BuckarooWidget):
[sQ('search'), s('df'), "col", "needle"]]
assert len(vcb.dataflow.processed_df) == 2
assert vcb.df_meta['filtered_rows'] == 2


def _find_cc(column_config, col_name):
for entry in column_config:
if entry.get('col_name') == col_name:
return entry
raise AssertionError(f"col_name {col_name!r} not in column_config")


def test_search_op_delivers_highlight_phrase_into_displayer_args():
    """End-to-end: a `search` operation on the widget should plumb its
    search term into `displayer_args.highlight_phrase` for every string
    column in the final df_viewer_config that gets sent to the JS side.
    pandas Search uses literal str.find, so the search term flows as
    highlight_phrase (not highlight_regex)."""
    df = pd.DataFrame({
        'businessname': ['pizza', 'sushi', 'taco'],
        'comments': ['area code', 'no match', 'area zone'],
        'rating': [5, 4, 3]})

    class VCBuckarooWidget(BuckarooInfiniteWidget):
        autoclean_conf = tuple([NoCleaningConf])

    w = VCBuckarooWidget(df, debug=False)
    w.dataflow.operations = [[{'symbol': 'search'}, s('df'), 'col', 'area']]

    cc = w.df_display_args['main']['df_viewer_config']['column_config']
    # NOTE(review): 'a'/'b'/'c' are presumably the internal names for
    # businessname/comments/rating in column order — confirm against the
    # dataflow's renaming step.
    a_args = _find_cc(cc, 'a')['displayer_args']
    assert a_args['displayer'] == 'string'
    assert a_args['highlight_phrase'] == ['area']
    b_args = _find_cc(cc, 'b')['displayer_args']
    assert b_args['displayer'] == 'string'
    assert b_args['highlight_phrase'] == ['area']
    # The third column must not pick up a highlight phrase.
    c_args = _find_cc(cc, 'c')['displayer_args']
    assert 'highlight_phrase' not in c_args
Loading