From a35323cd6f4769318cc1ffd87c73d6c8648135d2 Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 14:44:10 -0400 Subject: [PATCH 1/3] test(pandas-search): highlight_phrase wiring contract (failing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pins the pandas equivalent of #758 polars Search → SDResult, but using `highlight_phrase` (list) rather than `highlight_regex` (string) — pandas search_df_str uses literal `Series.str.find`, so a phrase match on the JS side matches the actual filter semantics. - tests/unit/dataflow/autocleaning_pd_test.py: unit-level — Search contributes `highlight_phrase` keyed by the renamed (a/b) column under PandasAutocleaning, with the rekey running over `cleaning_sd` so the orig-named entry merges into the internal letter key. - tests/unit/dataflow/customizable_dataflow_test.py: end-to-end through BuckarooInfiniteWidget with NoCleaningConf — a `search` op should land `highlight_phrase` in `displayer_args` for each string column and skip the numeric column. Both fail today: pandas `Search.transform` still returns a bare df. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/dataflow/autocleaning_pd_test.py | 28 ++++++++++++++- .../dataflow/customizable_dataflow_test.py | 35 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/tests/unit/dataflow/autocleaning_pd_test.py b/tests/unit/dataflow/autocleaning_pd_test.py index fe2547f5e..7de15a300 100644 --- a/tests/unit/dataflow/autocleaning_pd_test.py +++ b/tests/unit/dataflow/autocleaning_pd_test.py @@ -10,7 +10,7 @@ from buckaroo.jlisp.lisp_utils import (s, sA, sQ) from buckaroo.customizations.pandas_commands import ( Command, - SafeInt, DropCol, FillNA, GroupBy, NoOp, Search, OnlyOutliers + SafeInt, DropCol, FillNA, GroupBy, NoOp, Search, OnlyOutliers, SDResult, ) from buckaroo.customizations.pd_autoclean_conf import (NoCleaningConf) from buckaroo.dataflow.dataflow import CustomizableDataflow @@ -624,3 +624,29 @@ def test_two_arg_quick_command(): + + +class SearchConf(AutocleaningConfig): + autocleaning_analysis_klasses = [] + command_klasses = [Search] + name = "" + + +def test_search_threads_highlight_phrase_into_cleaning_sd_under_rename(): + """Search plumbs its search term into cleaning_sd as highlight_phrase on + every string/object column. The rest of the sd is keyed by buckaroo's + internal a/b/c names, so autocleaning rewrites the op-supplied keys to + match — otherwise the entries would sit alongside as orphans without + a `_type` and trip the styling fallback. pandas Search uses literal + str.find, so highlight_phrase (not _regex) matches the filter semantics.""" + ac = PandasAutocleaning([SearchConf]) + # 'businessname' (object) becomes 'a', 'rating' (int) becomes 'b'. 
+ df = pd.DataFrame({'businessname': ['pizza', 'sushi'], 'rating': [5, 4]}) + search_op = [{'symbol': 'search'}, s('df'), 'col', 'pizza'] + + _cleaned, cleaning_sd, _gen, _ops = ac.handle_ops_and_clean( + df, cleaning_method='', quick_command_args={}, existing_operations=[search_op]) + + assert cleaning_sd.get('a', {}).get('highlight_phrase') == ['pizza'] + assert 'businessname' not in cleaning_sd + assert 'highlight_phrase' not in cleaning_sd.get('b', {}) diff --git a/tests/unit/dataflow/customizable_dataflow_test.py b/tests/unit/dataflow/customizable_dataflow_test.py index 5f207cfc2..394cb69ac 100644 --- a/tests/unit/dataflow/customizable_dataflow_test.py +++ b/tests/unit/dataflow/customizable_dataflow_test.py @@ -581,3 +581,38 @@ class VCBuckarooWidget(BuckarooWidget): [sQ('search'), s('df'), "col", "needle"]] assert len(vcb.dataflow.processed_df) == 2 assert vcb.df_meta['filtered_rows'] == 2 + + +def _find_cc(column_config, col_name): + for entry in column_config: + if entry.get('col_name') == col_name: + return entry + raise AssertionError(f"col_name {col_name!r} not in column_config") + + +def test_search_op_delivers_highlight_phrase_into_displayer_args(): + """End-to-end: a `search` operation on the widget should plumb its + search term into `displayer_args.highlight_phrase` for every string + column in the final df_viewer_config that gets sent to the JS side. 
+ pandas Search uses literal str.find, so the search term flows as + highlight_phrase (not highlight_regex).""" + df = pd.DataFrame({ + 'businessname': ['pizza', 'sushi', 'taco'], + 'comments': ['area code', 'no match', 'area zone'], + 'rating': [5, 4, 3]}) + + class VCBuckarooWidget(BuckarooInfiniteWidget): + autoclean_conf = tuple([NoCleaningConf]) + + w = VCBuckarooWidget(df, debug=False) + w.dataflow.operations = [[{'symbol': 'search'}, s('df'), 'col', 'area']] + + cc = w.df_display_args['main']['df_viewer_config']['column_config'] + a_args = _find_cc(cc, 'a')['displayer_args'] + assert a_args['displayer'] == 'string' + assert a_args['highlight_phrase'] == ['area'] + b_args = _find_cc(cc, 'b')['displayer_args'] + assert b_args['displayer'] == 'string' + assert b_args['highlight_phrase'] == ['area'] + c_args = _find_cc(cc, 'c')['displayer_args'] + assert 'highlight_phrase' not in c_args From b895de2f8b5bc694bdf843458baa769deb471a44 Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 14:48:43 -0400 Subject: [PATCH 2/3] feat(pandas-search): plumb search term to JS as highlight_phrase via SDResult MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors #758 for the pandas backend. Pandas `Search.transform` now returns `SDResult(filtered_df, sd_updates)` so the search term flows into `cleaning_sd` as `highlight_phrase` on every string/object column. Together with the existing `style_column` reader (added in #758) the phrase lands in the string `displayer_args`, where the JS-side displayer already renders matches as `<mark>`. Uses `highlight_phrase` (list of literal needles) rather than the `highlight_regex` (single regex string) variant polars emits because `search_df_str` uses `Series.str.find` — a literal substring match. Matching the filter semantics on the highlight side avoids the case where a search term containing regex metacharacters would filter on literal text but try to highlight as a regex. 
The string-column detection mirrors `search_df_str`: union of `select_dtypes("string")` and `select_dtypes("object")` columns. Co-Authored-By: Claude Opus 4.7 (1M context) --- buckaroo/customizations/pandas_commands.py | 15 ++++++++++----- tests/unit/commands/pandas_commands_test.py | 7 +++++-- tests/unit/dataflow/autocleaning_pd_test.py | 5 +---- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/buckaroo/customizations/pandas_commands.py b/buckaroo/customizations/pandas_commands.py index 242c184ad..3a06c2427 100644 --- a/buckaroo/customizations/pandas_commands.py +++ b/buckaroo/customizations/pandas_commands.py @@ -473,16 +473,21 @@ class Search(Command): command_pattern = [[3, 'term', 'type', 'string']] quick_args_pattern = [[3, 'term', 'type', 'string']] - @staticmethod + @staticmethod def transform(df, col, val): - #print("search_df", val) if val is None or val == "": - #print("no search term set") return df - return search_df_str(df, val) + filtered = search_df_str(df, val) + # search_df_str uses literal `Series.str.find`, so expose the term + # as highlight_phrase (list) — not highlight_regex — for matching + # filter semantics on the JS-side string displayer. 
+ str_cols = list(df.select_dtypes("string").columns) + str_cols.extend(list(df.select_dtypes("object").columns)) + sd_updates = {c: {'highlight_phrase': [val]} for c in str_cols} + return SDResult(filtered, sd_updates) - @staticmethod + @staticmethod def transform_to_py(df, col, val): return f""" from buckaroo.customizations.pandas_commands import search_df_str return search_df_str(df, '{val}')""" diff --git a/tests/unit/commands/pandas_commands_test.py b/tests/unit/commands/pandas_commands_test.py index 0d0fc09b9..e6d2142fd 100644 --- a/tests/unit/commands/pandas_commands_test.py +++ b/tests/unit/commands/pandas_commands_test.py @@ -378,12 +378,15 @@ def test_search_none_needle(): def test_search_with_match(): - """Test Search filters rows containing the needle.""" + """Test Search filters rows containing the needle. Search returns + SDResult(filtered_df, sd_updates) so the JS side can highlight the + search term — unwrap the df for the length assertion.""" base_df = pd.DataFrame({ 'name': pd.Series(['Alice', 'Bob', 'Charlie'], dtype='object'), 'b': [1, 2, 3]}) result = Search.transform(base_df.copy(), 'name', 'Bob') - assert len(result) == 1 + assert len(result.df) == 1 + assert result.sd_updates['name'] == {'highlight_phrase': ['Bob']} def test_search_to_py(): diff --git a/tests/unit/dataflow/autocleaning_pd_test.py b/tests/unit/dataflow/autocleaning_pd_test.py index 7de15a300..8f380e854 100644 --- a/tests/unit/dataflow/autocleaning_pd_test.py +++ b/tests/unit/dataflow/autocleaning_pd_test.py @@ -8,10 +8,7 @@ from buckaroo.dataflow.autocleaning import AutocleaningConfig from buckaroo.dataflow.autocleaning import PandasAutocleaning, generate_quick_ops from buckaroo.jlisp.lisp_utils import (s, sA, sQ) -from buckaroo.customizations.pandas_commands import ( - Command, - SafeInt, DropCol, FillNA, GroupBy, NoOp, Search, OnlyOutliers, SDResult, -) +from buckaroo.customizations.pandas_commands import (Command, SafeInt, DropCol, FillNA, GroupBy, NoOp, Search, 
OnlyOutliers, SDResult) from buckaroo.customizations.pd_autoclean_conf import (NoCleaningConf) from buckaroo.dataflow.dataflow import CustomizableDataflow From 32e626667affc64dced68ac1fc1d7a48406c20e2 Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 14:56:21 -0400 Subject: [PATCH 3/3] fix(test): drop unused SDResult import from autocleaning_pd_test ruff F401 on CI. SDResult was imported speculatively for the failing test but never used (only Search is referenced directly). Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/dataflow/autocleaning_pd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/dataflow/autocleaning_pd_test.py b/tests/unit/dataflow/autocleaning_pd_test.py index 8f380e854..b0467dad2 100644 --- a/tests/unit/dataflow/autocleaning_pd_test.py +++ b/tests/unit/dataflow/autocleaning_pd_test.py @@ -8,7 +8,7 @@ from buckaroo.dataflow.autocleaning import AutocleaningConfig from buckaroo.dataflow.autocleaning import PandasAutocleaning, generate_quick_ops from buckaroo.jlisp.lisp_utils import (s, sA, sQ) -from buckaroo.customizations.pandas_commands import (Command, SafeInt, DropCol, FillNA, GroupBy, NoOp, Search, OnlyOutliers, SDResult) +from buckaroo.customizations.pandas_commands import (Command, SafeInt, DropCol, FillNA, GroupBy, NoOp, Search, OnlyOutliers) from buckaroo.customizations.pd_autoclean_conf import (NoCleaningConf) from buckaroo.dataflow.dataflow import CustomizableDataflow