From 92b652db6e685fe1f4fc937e5dbbe30318ca0b2f Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 15:10:53 -0400 Subject: [PATCH 1/3] feat(xorq): autocleaning + lisp-interpreter path with 5 ported commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XorqAutocleaning previously sidestepped the lisp interpreter and ran a single dict-dispatched _xorq_search handler. Route it through the same configure_buckaroo interpreter that pandas/polars use, and port four more commands (NoOp, DropCol, FillNA, DropDuplicates) into customizations/xorq_commands.py. The interpreter's df_copy fork in jlisp/configure_utils.py grew a third branch: pandas → .copy(), polars → .clone(), ibis exprs → passthrough. Ibis expressions are immutable, so transforms must return a new expr anyway and a defensive copy is both unavailable and unnecessary. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../customizations/xorq_autoclean_conf.py | 22 +++ buckaroo/customizations/xorq_commands.py | 114 ++++++++++++++ buckaroo/jlisp/configure_utils.py | 8 +- buckaroo/xorq_buckaroo.py | 110 +++---------- tests/unit/test_xorq_commands.py | 146 ++++++++++++++++++ 5 files changed, 313 insertions(+), 87 deletions(-) create mode 100644 buckaroo/customizations/xorq_autoclean_conf.py create mode 100644 buckaroo/customizations/xorq_commands.py create mode 100644 tests/unit/test_xorq_commands.py diff --git a/buckaroo/customizations/xorq_autoclean_conf.py b/buckaroo/customizations/xorq_autoclean_conf.py new file mode 100644 index 000000000..5c691ebbb --- /dev/null +++ b/buckaroo/customizations/xorq_autoclean_conf.py @@ -0,0 +1,22 @@ +from buckaroo.dataflow.autocleaning import AutocleaningConfig +from buckaroo.customizations.xorq_commands import ( + DropCol, DropDuplicates, FillNA, NoOp, Search) + + +XORQ_BASE_COMMANDS = [DropCol, DropDuplicates, FillNA, NoOp, Search] + + +class NoCleaningConfXorq(AutocleaningConfig): + """No automatic cleaning — just expose the interpreter and quick-search. + + The autocleaning analysis classes are pandas-flavoured (HeuristicFracs, + PdCleaningStats, ...) and would not work against ibis exprs, so we + leave the analysis list empty. The lisp interpreter still runs against + the expression via the ported xorq_commands, and the frontend's + quick-search box drives the Search command. + """ + + autocleaning_analysis_klasses = [] + command_klasses = XORQ_BASE_COMMANDS + quick_command_klasses = [Search] + name = "" diff --git a/buckaroo/customizations/xorq_commands.py b/buckaroo/customizations/xorq_commands.py new file mode 100644 index 000000000..954ebf744 --- /dev/null +++ b/buckaroo/customizations/xorq_commands.py @@ -0,0 +1,114 @@ +"""buckaroo Commands targeting xorq/ibis expressions. + +Each command mirrors the shape of pandas_commands.py / polars_commands.py: +a ``command_default`` / ``command_pattern`` pair that the frontend reads, +plus ``transform`` (expr -> expr) and ``transform_to_py`` (str). Unlike +the pandas commands, transforms never mutate — ibis expressions are +immutable, so each transform builds and returns a new expression that +the dataflow continues to push down. + +xorq is an optional dependency; this module is import-safe without it, +because nothing here imports xorq at module load. Transforms call methods +on the passed-in expression (duck typing), so the module only matters +when an ibis/xorq expression actually flows through. +""" + +from ..jlisp.lisp_utils import s + + +class Command: + @staticmethod + def transform(expr, col, val): + return expr + + @staticmethod + def transform_to_py(expr, col, val): + return " # no op" + + +class NoOp(Command): + command_default = [s('noop'), s('df'), "col"] + command_pattern = [None] + + @staticmethod + def transform(expr, col): + return expr + + @staticmethod + def transform_to_py(expr, col): + return " #noop" + + +class DropCol(Command): + command_default = [s('dropcol'), s('df'), "col"] + command_pattern = [None] + + @staticmethod + def transform(expr, col): + return expr.drop(col) + + @staticmethod + def transform_to_py(expr, col): + return f" expr = expr.drop('{col}')" + + +class FillNA(Command): + command_default = [s('fillna'), s('df'), "col", 0] + command_pattern = [[3, 'fillVal', 'type', 'integer']] + + @staticmethod + def transform(expr, col, val): + return expr.mutate(**{col: expr[col].fill_null(val)}) + + @staticmethod + def transform_to_py(expr, col, val): + return f" expr = expr.mutate({col}=expr['{col}'].fill_null({val!r}))" + + +class DropDuplicates(Command): + command_default = [s('drop_duplicates'), s('df'), "col"] + command_pattern = [None] + + @staticmethod + def transform(expr, col): + return expr.distinct(on=[col]) + + @staticmethod + def transform_to_py(expr, col): + return f" expr = expr.distinct(on=['{col}'])" + + +def _search_expr(expr, val): + """Filter rows where any string column contains ``val``. + + Empty / None val short-circuits — the frontend sends "" to clear the + quick-search box, and pl.col-style ``contains(None)`` would drop every + row on the polars side; mirror that contract here. + """ + if val is None or val == "": + return expr + schema = expr.schema() + string_cols = [name for name in expr.columns if schema[name].is_string()] + if not string_cols: + return expr + cond = None + for c in string_cols: + c_cond = expr[c].contains(val) + cond = c_cond if cond is None else cond | c_cond + return expr.filter(cond) + + +class Search(Command): + command_default = [s('search'), s('df'), "col", ""] + command_pattern = [[3, 'term', 'type', 'string']] + quick_args_pattern = [[3, 'term', 'type', 'string']] + + @staticmethod + def transform(expr, col, val): + return _search_expr(expr, val) + + @staticmethod + def transform_to_py(expr, col, val): + return ( + " from buckaroo.customizations.xorq_commands import _search_expr\n" + f" expr = _search_expr(expr, '{val}')") diff --git a/buckaroo/jlisp/configure_utils.py b/buckaroo/jlisp/configure_utils.py index 3cdd1277d..24dd2567c 100644 --- a/buckaroo/jlisp/configure_utils.py +++ b/buckaroo/jlisp/configure_utils.py @@ -70,8 +70,14 @@ def buckaroo_transform(instructions, df, initial_sd): """ if isinstance(df, pd.DataFrame): df_copy = df.copy() - else: # hack we know it's polars here... just getting something working for now + elif hasattr(df, "clone") and callable(df.clone): + # polars DataFrame / LazyFrame df_copy = df.clone() + else: + # ibis/xorq expressions are immutable — transforms must return + # a new expr, so a defensive copy is both unavailable and + # unnecessary. + df_copy = df sd_dict = copy.deepcopy(initial_sd) sd_view = MappingProxyType(sd_dict) diff --git a/buckaroo/xorq_buckaroo.py b/buckaroo/xorq_buckaroo.py index cb03b742b..123f3584a 100644 --- a/buckaroo/xorq_buckaroo.py +++ b/buckaroo/xorq_buckaroo.py @@ -22,13 +22,12 @@ from .buckaroo_widget import BuckarooInfiniteWidget, BuckarooWidget from .customizations.styling import DefaultMainStyling, DefaultSummaryStatsStyling +from .customizations.xorq_autoclean_conf import NoCleaningConfXorq from .customizations.xorq_stats_v2 import XORQ_STATS_V2 -from .dataflow.autocleaning import ( - AutocleaningConfig, PandasAutocleaning, generate_quick_ops, merge_ops, ops_eq) +from .dataflow.autocleaning import PandasAutocleaning from .dataflow.dataflow import CustomizableDataflow from .dataflow.dataflow_extras import Sampling from .df_util import old_col_new_col -from .jlisp.lisp_utils import s from .pluggable_analysis_framework.col_analysis import ColAnalysis from .pluggable_analysis_framework.xorq_stat_pipeline import XorqDfStatsV2 from .serialization_utils import pd_to_obj, to_parquet @@ -89,93 +88,32 @@ def serialize_sample(cls, df_or_expr): return df_or_expr.limit(cls.serialize_limit).execute() -def _xorq_search(expr, _col, val): - """Filter rows where any string column contains ``val``. - - Mirrors the contract of the pandas / polars Search commands: an - empty value short-circuits to a no-op so the frontend can clear - the search by sending ``""``. - """ - if val is None or val == "": - return expr - schema = expr.schema() - string_cols = [name for name in expr.columns if schema[name].is_string()] - if not string_cols: - return expr - cond = None - for c in string_cols: - c_cond = expr[c].contains(val) - cond = c_cond if cond is None else cond | c_cond - return expr.filter(cond) - - -class XorqSearch: - """Search command for xorq exprs — symbol/pattern only. - - Defines the lisp symbol (``search``) and the quick-args pattern - that the frontend uses for the search box. The actual filter is - applied directly by ``XorqAutocleaning`` (see ``_XORQ_OP_HANDLERS``) - rather than going through ``configure_buckaroo``'s pandas/polars - interpreter, since ibis exprs are immutable and can't ``.copy()``. - """ - - command_default = [s('search'), s('df'), "col", ""] - command_pattern = [[3, 'term', 'type', 'string']] - quick_args_pattern = [[3, 'term', 'type', 'string']] - - @staticmethod - def transform(expr, col, val): - return _xorq_search(expr, col, val) - - @staticmethod - def transform_to_py(expr, col, val): - return f" expr = expr.filter(... contains('{val}'))" - - -_XORQ_OP_HANDLERS = {'search': _xorq_search} - - -class NoCleaningConfXorq(AutocleaningConfig): - autocleaning_analysis_klasses = [] - command_klasses = [XorqSearch] - quick_command_klasses = [XorqSearch] - name = "" - - class XorqAutocleaning(PandasAutocleaning): - """Cleaning is skipped for ibis exprs (the lisp interpreter targets - pandas), but quick commands like Search are applied directly. - - Each quick op is dispatched through ``_XORQ_OP_HANDLERS`` — - expression-to-expression transforms — so the result is still a - pushed-down xorq expr that downstream stats and pagination consume - unchanged. + """Autocleaning + interpreter for xorq/ibis expressions. + + Inherits the full ``PandasAutocleaning.handle_ops_and_clean`` pipeline + (quick-ops → merge → run lisp interpreter → make_origs → code + generation). Two pieces stay xorq-flavoured: + + * ``autocleaning_analysis_klasses`` on the conf is empty — the + pandas-flavoured cleaning analyses (HeuristicFracs, etc.) don't + work against ibis exprs. + * ``make_origs`` is overridden because the pandas version builds a + new ``pd.DataFrame`` from columns, which would force-materialise + an expression. For xorq we never add an _orig column today. + + The interpreter itself runs unmodified: ``buckaroo_transform`` in + ``jlisp.configure_utils`` detects that an ibis expr has neither + ``.copy()`` (pandas) nor ``.clone()`` (polars) and passes it through + unchanged. Each xorq Command's ``transform`` returns a new expr. """ - def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing_operations): - if df is None: - return None - quick_ops = generate_quick_ops(self.quick_command_klasses, quick_command_args) - if ops_eq(existing_operations, [{'meta': 'no-op'}]): - existing_for_merge = [] - else: - existing_for_merge = existing_operations - final_ops = merge_ops(existing_for_merge, quick_ops) - if not final_ops: - return [df, {}, "", []] - result = self._apply_xorq_ops(df, final_ops) - return [result, {}, "", final_ops] - @staticmethod - def _apply_xorq_ops(expr, ops): - for op in ops: - sym_name = op[0]['symbol'] if isinstance(op[0], dict) else op[0] - handler = _XORQ_OP_HANDLERS.get(sym_name) - if handler is None: - continue - handler_args = op[2:] - expr = handler(expr, *handler_args) - return expr + def make_origs(raw_df, cleaned_df, cleaning_sd): + # No _orig column support today — returning the cleaned expr + # unchanged avoids pulling rows through pd.DataFrame(...) just to + # paper over the pandas-shaped helper on the base class. + return cleaned_df class XorqDataflow(CustomizableDataflow): diff --git a/tests/unit/test_xorq_commands.py b/tests/unit/test_xorq_commands.py new file mode 100644 index 000000000..9edbc8769 --- /dev/null +++ b/tests/unit/test_xorq_commands.py @@ -0,0 +1,146 @@ +"""xorq Command coverage — exercise each ported command through the +``XorqAutocleaning`` lisp-interpreter path that the widget uses for +quick commands today. + +These are not pure-function tests of the ``transform`` static methods; +the point of porting them was that they now flow through the same +``configure_buckaroo`` interpreter the pandas/polars commands do, so +the tests drive them via ``handle_ops_and_clean`` with a hand-built op +list. That's the same shape the widget sends on a ``quick_command_args`` +change, plus an ``existing_operations`` list for the cleaning_method +path. +""" +from __future__ import annotations + +import pytest + +xo = pytest.importorskip("xorq.api") + +from buckaroo.customizations.xorq_autoclean_conf import NoCleaningConfXorq # noqa: E402 +from buckaroo.customizations.xorq_commands import ( # noqa: E402 + DropCol, DropDuplicates, FillNA, NoOp, Search) +from buckaroo.jlisp.lisp_utils import s # noqa: E402 +from buckaroo.xorq_buckaroo import XorqAutocleaning # noqa: E402 + + +def _expr(): + # 'a' has a null, 'b' has duplicates, 'c' is a non-string for Search coverage. + return xo.memtable( + {"a": [1, 2, None, 4, 4], + "b": ["x", "y", "x", "z", "z"], + "c": [10, 20, 30, 40, 50]}) + + +def _ac(): + return XorqAutocleaning(ac_configs=(NoCleaningConfXorq,), conf_name="") + + +def _run(ac, expr, ops): + """Drive the interpreter path: pretend the frontend pre-merged + ``ops`` into existing_operations, with no cleaning_method and no + quick_command_args. ``handle_ops_and_clean`` then collapses through + ``produce_final_ops`` -> ``_run_df_interpreter`` and returns the + cleaned expression. + + Ops carry plain ``s(...)`` symbols (no ``auto_clean: True`` meta) so + ``merge_ops`` treats them as user-entered and preserves them rather + than discarding them as stale autocleaning output. + """ + return ac.handle_ops_and_clean(expr, "", {}, ops) + + +class TestCommandsViaInterpreter: + def test_noop_returns_same_expr(self): + ac = _ac() + expr = _expr() + op = [s("noop"), {"symbol": "df"}, "a"] + cleaned, _sd, _code, final_ops = _run(ac, expr, [op]) + assert cleaned.count().execute() == 5 + assert final_ops == [op] + + def test_dropcol_removes_column(self): + ac = _ac() + expr = _expr() + op = [s("dropcol"), {"symbol": "df"}, "b"] + cleaned, *_ = _run(ac, expr, [op]) + assert "b" not in cleaned.columns + assert set(cleaned.columns) == {"a", "c"} + + def test_fillna_substitutes_null(self): + ac = _ac() + expr = _expr() + op = [s("fillna"), {"symbol": "df"}, "a", 99] + cleaned, *_ = _run(ac, expr, [op]) + result = cleaned.execute().sort_values("c").reset_index(drop=True) + # The original null at row 2 became 99 — pandas may upcast to + # float so compare numerically. + assert list(result["a"]) == [1, 2, 99, 4, 4] + + def test_drop_duplicates_dedupes_by_column(self): + ac = _ac() + expr = _expr() + op = [s("drop_duplicates"), {"symbol": "df"}, "b"] + cleaned, *_ = _run(ac, expr, [op]) + result = cleaned.execute() + # Three distinct values in 'b': x, y, z. + assert sorted(result["b"]) == ["x", "y", "z"] + assert len(result) == 3 + + def test_search_via_quick_command_args(self): + """Search is the only quick-command — drive it through the + ``quick_command_args`` channel the widget actually uses.""" + ac = _ac() + expr = _expr() + cleaned, *_ = ac.handle_ops_and_clean( + expr, "", {"search": ["x"]}, [{"meta": "no-op"}]) + result = cleaned.execute() + # 'b' = ['x','y','x','z','z'] → rows where any string col contains 'x'. + assert sorted(result["b"]) == ["x", "x"] + + def test_search_empty_value_short_circuits(self): + ac = _ac() + expr = _expr() + cleaned, *_ = ac.handle_ops_and_clean( + expr, "", {"search": [""]}, [{"meta": "no-op"}]) + # Empty / cleared search must keep all rows — the polars regression + # in PR #743 hinged on this contract. Quick-arg machinery drops the + # empty value before it reaches the interpreter. + assert cleaned.count().execute() == 5 + # And on the no-op short-circuit the expression comes back by + # reference — the autocleaning ``df_interpreter`` short-circuit + # exists precisely so traitlets observers don't churn. + assert cleaned is expr + + +class TestPipelining: + """Ops compose through the interpreter: each transform returns a new + expression that the next op consumes — the same expr-to-expr push-down + pipeline that the postprocessing path already exercised.""" + + def test_dropcol_then_fillna(self): + ac = _ac() + expr = _expr() + ops = [ + [s("dropcol"), {"symbol": "df"}, "b"], + [s("fillna"), {"symbol": "df"}, "a", 0], + ] + cleaned, *_ = _run(ac, expr, ops) + result = cleaned.execute().sort_values("c").reset_index(drop=True) + assert "b" not in cleaned.columns + assert list(result["a"]) == [1, 2, 0, 4, 4] + + +class TestConfigRegistration: + def test_conf_lists_all_five_commands(self): + """Sanity-check the autocleaning conf so the widget surfaces every + ported command via ``command_config['argspecs']``.""" + ac = _ac() + argspecs = ac.command_config["argspecs"] + for k in ("noop", "dropcol", "fillna", "drop_duplicates", "search"): + assert k in argspecs + + def test_search_is_the_only_quick_command(self): + assert NoCleaningConfXorq.quick_command_klasses == [Search] + # Five commands total in the conf. + assert set(NoCleaningConfXorq.command_klasses) == { + DropCol, DropDuplicates, FillNA, NoOp, Search} From db457cd431141232ac34005c784f4c4c448d052a Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Tue, 19 May 2026 05:29:32 -0400 Subject: [PATCH 2/3] fix(xorq): generated code assigns to df, not undefined expr The inherited code generator wraps transform_to_py snippets as def clean(df): ... return df. The xorq snippets assigned to a never-defined expr, raising UnboundLocalError. Mirror the pandas/polars commands and operate on df. Co-Authored-By: Claude Opus 4.7 (1M context) --- buckaroo/customizations/xorq_commands.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/buckaroo/customizations/xorq_commands.py b/buckaroo/customizations/xorq_commands.py index 954ebf744..661a4e706 100644 --- a/buckaroo/customizations/xorq_commands.py +++ b/buckaroo/customizations/xorq_commands.py @@ -49,7 +49,7 @@ def transform(expr, col): @staticmethod def transform_to_py(expr, col): - return f" expr = expr.drop('{col}')" + return f" df = df.drop('{col}')" class FillNA(Command): @@ -62,7 +62,7 @@ def transform(expr, col, val): @staticmethod def transform_to_py(expr, col, val): - return f" expr = expr.mutate({col}=expr['{col}'].fill_null({val!r}))" + return f" df = df.mutate({col}=df['{col}'].fill_null({val!r}))" class DropDuplicates(Command): @@ -75,7 +75,7 @@ def transform(expr, col): @staticmethod def transform_to_py(expr, col): - return f" expr = expr.distinct(on=['{col}'])" + return f" df = df.distinct(on=['{col}'])" def _search_expr(expr, val): @@ -111,4 +111,4 @@ def transform(expr, col, val): def transform_to_py(expr, col, val): return ( " from buckaroo.customizations.xorq_commands import _search_expr\n" - f" expr = _search_expr(expr, '{val}')") + f" df = _search_expr(df, '{val}')") From b75b39914245bd76ac2d1be443b4f48fd5cc20b6 Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Tue, 19 May 2026 06:03:15 -0400 Subject: [PATCH 3/3] style(xorq): paddy-format test_xorq_commands.py Collapse the trailing-comma ops list to satisfy the paddy-format check in CI lint. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/test_xorq_commands.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/unit/test_xorq_commands.py b/tests/unit/test_xorq_commands.py index 9edbc8769..50025be1a 100644 --- a/tests/unit/test_xorq_commands.py +++ b/tests/unit/test_xorq_commands.py @@ -120,10 +120,7 @@ class TestPipelining: def test_dropcol_then_fillna(self): ac = _ac() expr = _expr() - ops = [ - [s("dropcol"), {"symbol": "df"}, "b"], - [s("fillna"), {"symbol": "df"}, "a", 0], - ] + ops = [[s("dropcol"), {"symbol": "df"}, "b"], [s("fillna"), {"symbol": "df"}, "a", 0]] cleaned, *_ = _run(ac, expr, ops) result = cleaned.execute().sort_values("c").reset_index(drop=True) assert "b" not in cleaned.columns