Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions buckaroo/customizations/xorq_autoclean_conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from buckaroo.dataflow.autocleaning import AutocleaningConfig
from buckaroo.customizations.xorq_commands import (
DropCol, DropDuplicates, FillNA, NoOp, Search)


# The full set of xorq command classes imported above; NoCleaningConfXorq
# uses this list as its command_klasses.
XORQ_BASE_COMMANDS = [DropCol, DropDuplicates, FillNA, NoOp, Search]


class NoCleaningConfXorq(AutocleaningConfig):
    """Autocleaning config for xorq/ibis expressions that cleans nothing.

    Only the command interpreter and the quick-search box are exposed.
    The analysis list stays empty because the existing autocleaning
    analysis classes (HeuristicFracs, PdCleaningStats, ...) are written
    for pandas and would not work on ibis expressions. Commands still run
    against the expression through the xorq command classes, and the
    frontend's quick-search box drives ``Search``.
    """

    name = ""

    # No analyses: nothing is suggested or applied automatically.
    autocleaning_analysis_klasses = []

    # Commands available to the interpreter / commands reachable from the
    # quick-search box.
    command_klasses = XORQ_BASE_COMMANDS
    quick_command_klasses = [Search]
114 changes: 114 additions & 0 deletions buckaroo/customizations/xorq_commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""buckaroo Commands targeting xorq/ibis expressions.

Each command mirrors the shape of pandas_commands.py / polars_commands.py:
a ``command_default`` / ``command_pattern`` pair that the frontend reads,
plus ``transform`` (expr -> expr) and ``transform_to_py`` (str). Unlike
the pandas commands, transforms never mutate — ibis expressions are
immutable, so each transform builds and returns a new expression that
the dataflow continues to push down.

xorq is an optional dependency; this module is import-safe without it,
because nothing here imports xorq at module load. Transforms call methods
on the passed-in expression (duck typing), so the module only matters
when an ibis/xorq expression actually flows through.
"""

from ..jlisp.lisp_utils import s


class Command:
    """Base for the xorq commands: both hooks default to doing nothing."""

    @staticmethod
    def transform_to_py(expr, col, val):
        """Return the generated-code line for this command (a comment)."""
        snippet = " # no op"
        return snippet

    @staticmethod
    def transform(expr, col, val):
        """Return ``expr`` itself; ``col`` and ``val`` are ignored."""
        return expr


class NoOp(Command):
    """Command that hands the expression back unchanged.

    ``command_default`` and ``command_pattern`` are the pair the frontend
    reads for this command.
    """

    command_pattern = [None]
    command_default = [s('noop'), s('df'), "col"]

    @staticmethod
    def transform_to_py(expr, col):
        """Return the generated-code line for a no-op (a comment)."""
        snippet = " #noop"
        return snippet

    @staticmethod
    def transform(expr, col):
        """Return ``expr`` itself; ``col`` is ignored."""
        return expr


class DropCol(Command):
    """Command that removes one column from the expression."""

    command_default = [s('dropcol'), s('df'), "col"]
    command_pattern = [None]

    @staticmethod
    def transform(expr, col):
        """Return a new expression without column ``col``."""
        return expr.drop(col)

    @staticmethod
    def transform_to_py(expr, col):
        """Return the generated-code line that drops ``col``.

        The line rebinds ``df`` rather than ``expr``: the snippet ends up
        inside a generated ``def clean(df): ... return df`` function, where
        ``expr`` is never defined.
        NOTE(review): that wrapper is built outside this module
        (``configure_utils.buckaroo_to_py``) - confirm the variable name.

        ``col`` is written with ``repr`` so that column names containing
        quotes or backslashes still produce a valid string literal.
        """
        return f" df = df.drop({col!r})"
Comment on lines +51 to +52
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Generate runnable code for xorq transforms

When a user applies this command, the inherited code generator wraps these snippets as def clean(df): ... return df in configure_utils.buckaroo_to_py, but the new xorq snippets assign to expr without ever defining it. For example, a drop-column op generates a function that raises UnboundLocalError at expr = expr.drop(...) instead of providing usable generated code in operation_results['generated_py_code']; the same pattern appears in the other non-noop xorq commands.

Useful? React with 👍 / 👎.



class FillNA(Command):
    """Command that replaces nulls in one column with a fixed value."""

    command_default = [s('fillna'), s('df'), "col", 0]
    command_pattern = [[3, 'fillVal', 'type', 'integer']]

    @staticmethod
    def transform(expr, col, val):
        """Return a new expression where nulls in ``col`` become ``val``."""
        return expr.mutate(**{col: expr[col].fill_null(val)})

    @staticmethod
    def transform_to_py(expr, col, val):
        """Return the generated-code line equivalent to ``transform``.

        The column is passed to ``mutate`` through a ``**{...}`` dict, the
        same way ``transform`` does it, instead of as a bare keyword
        argument: a bare keyword is a syntax error for any column name that
        is not a valid Python identifier (spaces, leading digits, keywords).
        ``col`` and ``val`` are both written with ``repr``.

        The line rebinds ``df`` rather than ``expr`` because the snippet is
        placed inside a generated ``def clean(df): ... return df`` function.
        NOTE(review): that wrapper is built outside this module
        (``configure_utils.buckaroo_to_py``) - confirm the variable name.
        """
        return f" df = df.mutate(**{{{col!r}: df[{col!r}].fill_null({val!r})}})"


class DropDuplicates(Command):
    """Command that de-duplicates rows on a single column."""

    command_default = [s('drop_duplicates'), s('df'), "col"]
    command_pattern = [None]

    @staticmethod
    def transform(expr, col):
        """Return a new expression that is distinct on ``col``."""
        return expr.distinct(on=[col])

    @staticmethod
    def transform_to_py(expr, col):
        """Return the generated-code line equivalent to ``transform``.

        The line rebinds ``df`` rather than ``expr`` because the snippet is
        placed inside a generated ``def clean(df): ... return df`` function.
        NOTE(review): that wrapper is built outside this module
        (``configure_utils.buckaroo_to_py``) - confirm the variable name.

        ``col`` is written with ``repr`` so that column names containing
        quotes or backslashes still produce a valid string literal.
        """
        return f" df = df.distinct(on=[{col!r}])"


def _search_expr(expr, val):
"""Filter rows where any string column contains ``val``.

Empty / None val short-circuits — the frontend sends "" to clear the
quick-search box, and pl.col-style ``contains(None)`` would drop every
row on the polars side; mirror that contract here.
"""
if val is None or val == "":
return expr
schema = expr.schema()
string_cols = [name for name in expr.columns if schema[name].is_string()]
if not string_cols:
return expr
cond = None
for c in string_cols:
c_cond = expr[c].contains(val)
cond = c_cond if cond is None else cond | c_cond
return expr.filter(cond)


class Search(Command):
    """Command behind the quick-search box: filter rows by a search term."""

    command_default = [s('search'), s('df'), "col", ""]
    command_pattern = [[3, 'term', 'type', 'string']]
    quick_args_pattern = [[3, 'term', 'type', 'string']]

    @staticmethod
    def transform(expr, col, val):
        """Return ``expr`` filtered by ``val``; ``col`` is not used."""
        return _search_expr(expr, val)

    @staticmethod
    def transform_to_py(expr, col, val):
        """Return the generated code (two lines) equivalent to ``transform``.

        ``val`` is written with ``repr`` instead of being wrapped in
        hand-written quotes, so a search term containing a quote or a
        backslash still yields a valid string literal.

        The second line rebinds ``df`` rather than ``expr`` because the
        snippet is placed inside a generated ``def clean(df): ... return df``
        function.
        NOTE(review): that wrapper is built outside this module
        (``configure_utils.buckaroo_to_py``) - confirm the variable name.
        """
        return (
            " from buckaroo.customizations.xorq_commands import _search_expr\n"
            f" df = _search_expr(df, {val!r})")
8 changes: 7 additions & 1 deletion buckaroo/jlisp/configure_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,14 @@ def buckaroo_transform(instructions, df, initial_sd):
"""
if isinstance(df, pd.DataFrame):
df_copy = df.copy()
else: # hack we know it's polars here... just getting something working for now
elif hasattr(df, "clone") and callable(df.clone):
# polars DataFrame / LazyFrame
df_copy = df.clone()
else:
# ibis/xorq expressions are immutable — transforms must return
# a new expr, so a defensive copy is both unavailable and
# unnecessary.
df_copy = df

sd_dict = copy.deepcopy(initial_sd)
sd_view = MappingProxyType(sd_dict)
Expand Down
110 changes: 24 additions & 86 deletions buckaroo/xorq_buckaroo.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,12 @@

from .buckaroo_widget import BuckarooInfiniteWidget, BuckarooWidget
from .customizations.styling import DefaultMainStyling, DefaultSummaryStatsStyling
from .customizations.xorq_autoclean_conf import NoCleaningConfXorq
from .customizations.xorq_stats_v2 import XORQ_STATS_V2
from .dataflow.autocleaning import (
AutocleaningConfig, PandasAutocleaning, generate_quick_ops, merge_ops, ops_eq)
from .dataflow.autocleaning import PandasAutocleaning
from .dataflow.dataflow import CustomizableDataflow
from .dataflow.dataflow_extras import Sampling
from .df_util import old_col_new_col
from .jlisp.lisp_utils import s
from .pluggable_analysis_framework.col_analysis import ColAnalysis
from .pluggable_analysis_framework.xorq_stat_pipeline import XorqDfStatsV2
from .serialization_utils import pd_to_obj, to_parquet
Expand Down Expand Up @@ -89,93 +88,32 @@ def serialize_sample(cls, df_or_expr):
return df_or_expr.limit(cls.serialize_limit).execute()


def _xorq_search(expr, _col, val):
"""Filter rows where any string column contains ``val``.

Mirrors the contract of the pandas / polars Search commands: an
empty value short-circuits to a no-op so the frontend can clear
the search by sending ``""``.
"""
if val is None or val == "":
return expr
schema = expr.schema()
string_cols = [name for name in expr.columns if schema[name].is_string()]
if not string_cols:
return expr
cond = None
for c in string_cols:
c_cond = expr[c].contains(val)
cond = c_cond if cond is None else cond | c_cond
return expr.filter(cond)


class XorqSearch:
    """Search command for xorq expressions.

    Carries the lisp symbol (``search``) and the quick-args pattern the
    frontend uses for the search box. ``transform`` delegates the actual
    filtering to ``_xorq_search``.
    """

    quick_args_pattern = [[3, 'term', 'type', 'string']]
    command_pattern = [[3, 'term', 'type', 'string']]
    command_default = [s('search'), s('df'), "col", ""]

    @staticmethod
    def transform_to_py(expr, col, val):
        """Return a one-line, schematic description of the filter."""
        snippet = f" expr = expr.filter(... contains('{val}'))"
        return snippet

    @staticmethod
    def transform(expr, col, val):
        """Return ``expr`` filtered by the search term ``val``."""
        filtered = _xorq_search(expr, col, val)
        return filtered


# Dispatch table: op symbol name -> function that applies that op to an
# expression and returns the resulting expression.
_XORQ_OP_HANDLERS = {'search': _xorq_search}


class NoCleaningConfXorq(AutocleaningConfig):
    """Autocleaning config with no analyses; ``XorqSearch`` is its only command."""

    name = ""
    autocleaning_analysis_klasses = []
    quick_command_klasses = [XorqSearch]
    command_klasses = [XorqSearch]


class XorqAutocleaning(PandasAutocleaning):
    """Autocleaning + interpreter for xorq/ibis expressions.

    Inherits the full ``PandasAutocleaning.handle_ops_and_clean`` pipeline
    (quick-ops → merge → run lisp interpreter → make_origs → code
    generation). Two pieces stay xorq-flavoured:

    * ``autocleaning_analysis_klasses`` on the conf is empty — the
      pandas-flavoured cleaning analyses (HeuristicFracs, etc.) don't
      work against ibis exprs.
    * ``make_origs`` is overridden because the pandas version builds a
      new ``pd.DataFrame`` from columns, which would force-materialise
      an expression. For xorq we never add an _orig column today.

    The interpreter itself runs unmodified: ``buckaroo_transform`` in
    ``jlisp.configure_utils`` makes a defensive copy only for a pandas
    DataFrame (``.copy()``) or an object with a callable ``.clone()``
    (polars); anything else, such as an ibis expr, is passed through
    unchanged. Each xorq Command's ``transform`` returns a new expr.
    """

    @staticmethod
    def make_origs(raw_df, cleaned_df, cleaning_sd):
        """Return ``cleaned_df`` unchanged.

        ``raw_df`` and ``cleaning_sd`` are accepted for signature
        compatibility and ignored: there is no _orig column support today,
        and returning the cleaned expr as-is avoids pulling rows through
        ``pd.DataFrame(...)`` just to satisfy the pandas-shaped helper on
        the base class.
        """
        return cleaned_df


class XorqDataflow(CustomizableDataflow):
Expand Down
Loading
Loading