From acbdd8eb66c0adddae8cff31e53a501d3a793a5f Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 14:46:04 -0400 Subject: [PATCH 1/3] test(xorq-search): highlight_phrase wiring contract (failing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pins the xorq equivalent of #758 polars Search → SDResult, but using `highlight_phrase` (list) rather than `highlight_regex` — ibis `StringValue.contains` is a literal substring match, so a phrase match on the JS side matches the actual filter semantics. Both tests fail today: the xorq path (XorqAutocleaning._apply_xorq_ops bypassing the lisp interpreter) has no sd channel and the search handler returns a bare expr. - test_search_op_delivers_highlight_phrase_into_displayer_args: end-to-end through XorqBuckarooWidget with quick_command_args, asserts highlight_phrase lands in displayer_args for each ibis-String column (a=name, b=role) and the integer column (c=score) stays clean. - test_empty_search_drops_highlight_from_displayer_args: clearing the search box pulls the highlight back out — symmetric to the empty-val short-circuit in _xorq_search. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/test_xorq_buckaroo_widget.py | 52 +++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/unit/test_xorq_buckaroo_widget.py b/tests/unit/test_xorq_buckaroo_widget.py index 13336da7e..2bdfde510 100644 --- a/tests/unit/test_xorq_buckaroo_widget.py +++ b/tests/unit/test_xorq_buckaroo_widget.py @@ -182,6 +182,58 @@ def test_empty_search_clears_filter(self): assert w.df_meta["filtered_rows"] == 5 +def _find_cc(column_config, col_name): + for entry in column_config: + if entry.get("col_name") == col_name: + return entry + raise AssertionError(f"col_name {col_name!r} not in column_config") + + +class TestSearchHighlight: + """Equivalent of the polars-search highlight wiring from #758, ported + to the xorq backend. ibis ``StringValue.contains`` is a literal + substring match, so the search term flows to the JS displayer as + ``highlight_phrase`` (list), not ``highlight_regex``.""" + + def test_search_op_delivers_highlight_phrase_into_displayer_args(self): + """End-to-end through XorqBuckarooWidget — a `search` op should + plumb its term into ``displayer_args.highlight_phrase`` for every + ibis-String column and skip non-string columns.""" + w = XorqBuckarooWidget(_searchable_expr()) + state = w.buckaroo_state.copy() + state["quick_command_args"] = {"search": ["admin"]} + w.buckaroo_state = state + + cc = w.df_display_args["main"]["df_viewer_config"]["column_config"] + # 'a' is name (string) + a_args = _find_cc(cc, "a")["displayer_args"] + assert a_args["displayer"] == "string" + assert a_args["highlight_phrase"] == ["admin"] + # 'b' is role (string) + b_args = _find_cc(cc, "b")["displayer_args"] + assert b_args["displayer"] == "string" + assert b_args["highlight_phrase"] == ["admin"] + # 'c' is score (integer) — no highlight + c_args = _find_cc(cc, "c")["displayer_args"] + assert "highlight_phrase" not in c_args + + def test_empty_search_drops_highlight_from_displayer_args(self): + """Clearing the search box (``""``) should remove the highlight + from displayer_args, matching the filter going back to no-op.""" + w = XorqBuckarooWidget(_searchable_expr()) + state = w.buckaroo_state.copy() + state["quick_command_args"] = {"search": ["admin"]} + w.buckaroo_state = state + + state = w.buckaroo_state.copy() + state["quick_command_args"] = {"search": [""]} + w.buckaroo_state = state + + cc = w.df_display_args["main"]["df_viewer_config"]["column_config"] + a_args = _find_cc(cc, "a")["displayer_args"] + assert "highlight_phrase" not in a_args + + def _paginated_expr(): return xo.memtable( {"a": [3, 1, 4, 1, 5, 9, 2, 6, 5, 3], "b": ["p", "q", "r", "s", "t", "u", "v", "w", "x", "y"]}) From e58d5f975427f2a26ea2426040aa104c1a633a47 Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 14:55:07 -0400 Subject: [PATCH 2/3] feat(xorq-search): plumb search term to JS as highlight_phrase via sd channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors #758 for the xorq backend. The xorq path doesn't go through configure_buckaroo's lisp interpreter (ibis exprs can't .copy()), so it can't reuse the SDResult machinery from #755 directly. Instead this adds an analogous sd channel inside XorqAutocleaning: - Handlers in _XORQ_OP_HANDLERS may now return either a bare expr (legacy) or (expr, sd_updates). _apply_xorq_ops accumulates the per-column sd entries across ops, merging col-by-col. - handle_ops_and_clean runs the accumulated updates through _rekey_op_sd_to_internal (the same helper PandasAutocleaning uses since #758) so orig-named entries land on buckaroo's internal a/b/c letter keys and compose cleanly with the summary_sd that XorqDataflow._get_summary_sd produces (also keyed by letter). - _xorq_search returns the filtered expr plus {col: {'highlight_phrase': [val]}} for every ibis-String column. Uses highlight_phrase (list of literal needles) rather than highlight_regex because ibis StringValue.contains is a literal substring match — matching the filter semantics on the highlight side avoids regex-metacharacter divergence. Scope: only the search command is wired today. The sd channel itself is generic — other ops can opt in by returning (expr, sd_updates). Co-Authored-By: Claude Opus 4.7 (1M context) --- buckaroo/xorq_buckaroo.py | 51 +++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/buckaroo/xorq_buckaroo.py b/buckaroo/xorq_buckaroo.py index cb03b742b..b71ca23e5 100644 --- a/buckaroo/xorq_buckaroo.py +++ b/buckaroo/xorq_buckaroo.py @@ -24,7 +24,8 @@ from .customizations.styling import DefaultMainStyling, DefaultSummaryStatsStyling from .customizations.xorq_stats_v2 import XORQ_STATS_V2 from .dataflow.autocleaning import ( - AutocleaningConfig, PandasAutocleaning, generate_quick_ops, merge_ops, ops_eq) + AutocleaningConfig, PandasAutocleaning, _rekey_op_sd_to_internal, + generate_quick_ops, merge_ops, ops_eq) from .dataflow.dataflow import CustomizableDataflow from .dataflow.dataflow_extras import Sampling from .df_util import old_col_new_col @@ -90,11 +91,17 @@ def serialize_sample(cls, df_or_expr): def _xorq_search(expr, _col, val): - """Filter rows where any string column contains ``val``. + """Filter rows where any string column contains ``val``, and emit + ``sd_updates`` so the JS-side string displayer highlights matches. - Mirrors the contract of the pandas / polars Search commands: an - empty value short-circuits to a no-op so the frontend can clear - the search by sending ``""``. + Return shape mirrors the SDResult contract from #758: + - bare expr : empty value short-circuits to a no-op + - (expr, sd_updates) : filtered expr + per-string-column highlight + + ibis ``StringValue.contains`` is a literal substring match (not + regex), so the term flows as ``highlight_phrase`` (list) rather + than ``highlight_regex`` — matching the filter semantics on the + JS-side string displayer. """ if val is None or val == "": return expr @@ -106,7 +113,8 @@ def _xorq_search(expr, _col, val): for c in string_cols: c_cond = expr[c].contains(val) cond = c_cond if cond is None else cond | c_cond - return expr.filter(cond) + sd_updates = {c: {'highlight_phrase': [val]} for c in string_cols} + return expr.filter(cond), sd_updates class XorqSearch: @@ -163,19 +171,42 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing final_ops = merge_ops(existing_for_merge, quick_ops) if not final_ops: return [df, {}, "", []] - result = self._apply_xorq_ops(df, final_ops) - return [result, {}, "", final_ops] + result_expr, sd_updates = self._apply_xorq_ops(df, final_ops) + # Rekey op-supplied sd entries from orig col names onto buckaroo's + # internal a/b/c letter keys, so they merge cleanly with the + # summary_sd that XorqDataflow._get_summary_sd produces (also + # keyed by letter). Mapping uses the input expr's columns — + # filter ops preserve column identity, and orig→letter is the + # same on both sides of the filter for any current handler. + cleaning_sd = _rekey_op_sd_to_internal(sd_updates, df) + return [result_expr, cleaning_sd, "", final_ops] @staticmethod def _apply_xorq_ops(expr, ops): + """Apply ops to ``expr``; accumulate op-contributed sd entries. + + Each handler may return either a bare expr (legacy) or an + ``(expr, sd_updates)`` tuple. Tuples merge col-by-col into the + running sd_updates dict so multiple ops touching the same + column compose. + """ + sd_updates: dict = {} for op in ops: sym_name = op[0]['symbol'] if isinstance(op[0], dict) else op[0] handler = _XORQ_OP_HANDLERS.get(sym_name) if handler is None: continue handler_args = op[2:] - expr = handler(expr, *handler_args) - return expr + result = handler(expr, *handler_args) + if isinstance(result, tuple): + expr, op_sd = result + for col, updates in op_sd.items(): + merged = sd_updates.get(col, {}) + merged.update(updates) + sd_updates[col] = merged + else: + expr = result + return expr, sd_updates class XorqDataflow(CustomizableDataflow): From aaba902b895c613734bc45a1e59dc7cc69ea4fcb Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 16:04:42 -0400 Subject: [PATCH 3/3] refactor(xorq-search): apply review feedback for sd-channel plumbing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add an assert in handle_ops_and_clean that result_expr.columns == df.columns. The rekey runs against the input expr — correct today because the only sd-producing handler is _xorq_search (filter preserves identity), but a future op that renames/drops columns would silently mis-map the sd entries onto the wrong letter keys. - _apply_xorq_ops: type the signature (-> Tuple[Any, Dict[...]]) and collapse the dict accumulation to setdefault().update() — same semantics, less noise. Co-Authored-By: Claude Opus 4.7 (1M context) --- buckaroo/xorq_buckaroo.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/buckaroo/xorq_buckaroo.py b/buckaroo/xorq_buckaroo.py index b71ca23e5..0dbda1c45 100644 --- a/buckaroo/xorq_buckaroo.py +++ b/buckaroo/xorq_buckaroo.py @@ -13,7 +13,7 @@ import logging import traceback from io import BytesIO -from typing import Any +from typing import Any, Dict, Tuple import pandas as pd import pyarrow as pa @@ -175,14 +175,19 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing # Rekey op-supplied sd entries from orig col names onto buckaroo's # internal a/b/c letter keys, so they merge cleanly with the # summary_sd that XorqDataflow._get_summary_sd produces (also - # keyed by letter). Mapping uses the input expr's columns — - # filter ops preserve column identity, and orig→letter is the - # same on both sides of the filter for any current handler. + # keyed by letter). Every current handler preserves column + # identity (filter ops), so orig→letter is the same on the input + # and the result. The assert guards against a future handler that + # renames/drops/reorders columns silently corrupting the rekey. + assert list(result_expr.columns) == list(df.columns), ( + "xorq op changed column identity — _rekey_op_sd_to_internal " + "would mis-map sd entries; rekey against result_expr or " + "thread the orig→letter mapping through _apply_xorq_ops") cleaning_sd = _rekey_op_sd_to_internal(sd_updates, df) return [result_expr, cleaning_sd, "", final_ops] @staticmethod - def _apply_xorq_ops(expr, ops): + def _apply_xorq_ops(expr, ops) -> Tuple[Any, Dict[str, Dict[str, Any]]]: """Apply ops to ``expr``; accumulate op-contributed sd entries. Each handler may return either a bare expr (legacy) or an @@ -190,7 +195,7 @@ def _apply_xorq_ops(expr, ops): running sd_updates dict so multiple ops touching the same column compose. """ - sd_updates: dict = {} + sd_updates: Dict[str, Dict[str, Any]] = {} for op in ops: sym_name = op[0]['symbol'] if isinstance(op[0], dict) else op[0] handler = _XORQ_OP_HANDLERS.get(sym_name) @@ -201,9 +206,7 @@ def _apply_xorq_ops(expr, ops): if isinstance(result, tuple): expr, op_sd = result for col, updates in op_sd.items(): - merged = sd_updates.get(col, {}) - merged.update(updates) - sd_updates[col] = merged + sd_updates.setdefault(col, {}).update(updates) else: expr = result return expr, sd_updates