From 40942a5b20b16da59504af3e87709da73526d7c8 Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 16:04:31 -0400 Subject: [PATCH 1/2] test(autocleaning): expect LazyFrame mid-pipeline for polars Adds a failing assertion that confirms a polars cleaning pipeline threads a LazyFrame through commands' transforms (so the optimizer fuses adjacent with_columns/filter calls and materialises once at exit) instead of re-materialising on each op. Failing today: probe command sees DataFrame both passes; will pass once polars conf grows lazy_enter/lazy_exit hooks. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/dataflow/autocleaning_pl_test.py | 49 +++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tests/unit/dataflow/autocleaning_pl_test.py b/tests/unit/dataflow/autocleaning_pl_test.py index 7bd7fa120..d043e8843 100644 --- a/tests/unit/dataflow/autocleaning_pl_test.py +++ b/tests/unit/dataflow/autocleaning_pl_test.py @@ -5,6 +5,7 @@ from buckaroo.pluggable_analysis_framework.col_analysis import (ColAnalysis) from buckaroo.dataflow.autocleaning import (merge_ops, format_ops, AutocleaningConfig, _rekey_op_sd_to_internal) from buckaroo.polars_buckaroo import PolarsAutocleaning +from buckaroo.customizations.pl_autocleaning_conf import NoCleaningConfPl from buckaroo.customizations.polars_commands import ( Command, PlSafeInt, DropCol, FillNA, GroupBy, NoOp, Search, SDResult ) @@ -317,6 +318,54 @@ def test_init_sd_displayer_args_and_search_highlight_coexist_on_same_column(): assert cc['ag_grid_specs']['wrapText'] is True +class _RecordTypeCommand(Command): + """Probe command for the lazy-threading tests below: records the + runtime type name of `df` it sees mid-pipeline, returns df unchanged. + + Module-level recording — reset at the start of each test that uses it. + """ + seen_types: list = [] + + command_default = [s('record_type'), s('df'), 'col'] + command_pattern = [None] + + @staticmethod + def transform(df, col): + _RecordTypeCommand.seen_types.append(type(df).__name__) + return df + + @staticmethod + def transform_to_py(df, col): + return " # record_type" + + +class LazyProbeConf(NoCleaningConfPl): + autocleaning_analysis_klasses = [] + command_klasses = [FillNA, _RecordTypeCommand] + quick_command_klasses = [] + name = "" + + +def test_polars_pipeline_threads_lazyframe_between_ops(): + """The polars conf flips df to LazyFrame at interpreter entry and + collects once at exit. Each command's transform therefore sees a + LazyFrame mid-pipeline; the final cleaned df is a DataFrame so + make_origs / PlDfStatsV2 keep working unchanged.""" + _RecordTypeCommand.seen_types = [] + ac = PolarsAutocleaning([LazyProbeConf]) + df = pl.DataFrame({'a': [1, None, 3]}) + ops = [ + [{'symbol': 'fillna'}, s('df'), 'a', 0], + [{'symbol': 'record_type'}, s('df'), 'a'], + [{'symbol': 'fillna'}, s('df'), 'a', 0], + [{'symbol': 'record_type'}, s('df'), 'a']] + cleaned, _sd, _gen, _ops = ac.handle_ops_and_clean( + df, cleaning_method='', quick_command_args={}, existing_operations=ops) + + assert _RecordTypeCommand.seen_types == ['LazyFrame', 'LazyFrame'] + assert isinstance(cleaned, pl.DataFrame) + + def test_style_column_delete_keys_drops_tooltip(): """init_sd's delete_keys lets a user drop top-level keys that style_column adds by default. The motivating case: a string column where the user From 01320b93bc0e2ac3487a1105411a957103dd7a69 Mon Sep 17 00:00:00 2001 From: Paddy Mullen Date: Sun, 17 May 2026 16:09:43 -0400 Subject: [PATCH 2/2] feat(polars): lazy-thread interpreter pipeline (one collect per run) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Polars commands previously ran eager: each lisp op materialised a new DataFrame, so an N-op cleaning pipeline paid N times for what polars's query optimiser can fuse into one plan. Switch the polars autoclean conf to thread a LazyFrame through the interpreter and collect once at exit. - `AutocleaningConfig` grows two staticmethod hooks, `lazy_enter` and `lazy_exit`, defaulting to identity. Pandas inherits unchanged; xorq (when #767 lands) inherits the no-op default — ibis exprs are already lazy, so the unified pattern fits both dialects. - `NoCleaningConfPl` overrides with `df.lazy()` on entry and `df.collect() if isinstance(df, pl.LazyFrame) else df` on exit. The isinstance guard handles `GroupBy.transform`, which materialises mid-pipeline; anything downstream of a groupby runs eager and the exit becomes a no-op. - `_run_df_interpreter` wraps the interpreter call with the hooks. The no-op short-circuit fires *before* lazy_enter, preserving the by-reference identity contract the traitlets/anywidget init path depends on. - `Search.transform` switches to `df.collect_schema()` to avoid polars's PerformanceWarning when handed a LazyFrame. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../customizations/pl_autocleaning_conf.py | 15 +++++++++++++- buckaroo/customizations/polars_commands.py | 6 +++++- buckaroo/dataflow/autocleaning.py | 20 ++++++++++++++++++- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/buckaroo/customizations/pl_autocleaning_conf.py b/buckaroo/customizations/pl_autocleaning_conf.py index ebd9e8181..da247a1e5 100644 --- a/buckaroo/customizations/pl_autocleaning_conf.py +++ b/buckaroo/customizations/pl_autocleaning_conf.py @@ -1,8 +1,19 @@ +import polars as pl + from buckaroo.dataflow.autocleaning import AutocleaningConfig from buckaroo.customizations.polars_commands import ( Search) +def _pl_lazy_enter(df): + return df.lazy() + + +def _pl_lazy_exit(df): + # GroupBy.transform calls .collect() mid-pipeline, so anything + # downstream of a groupby is already an eager DataFrame by the time + # we see it here. Only collect if we're still lazy. + return df.collect() if isinstance(df, pl.LazyFrame) else df class NoCleaningConfPl(AutocleaningConfig): @@ -11,5 +22,7 @@ class NoCleaningConfPl(AutocleaningConfig): command_klasses = [Search] quick_command_klasses = [Search] name="" - + lazy_enter = staticmethod(_pl_lazy_enter) + lazy_exit = staticmethod(_pl_lazy_exit) + diff --git a/buckaroo/customizations/polars_commands.py b/buckaroo/customizations/polars_commands.py index 75d7b0bd3..4393a917b 100644 --- a/buckaroo/customizations/polars_commands.py +++ b/buckaroo/customizations/polars_commands.py @@ -142,7 +142,11 @@ def transform(df, col, val): filtered = df.filter(pl.any_horizontal(pl.col(pl.String).str.contains(val))) # `.str.contains(val)` treats val as a regex, so expose it as # highlight_regex (not _phrase) for consistent semantics on the JS side. - string_cols = [c for c, dt in zip(df.columns, df.dtypes) if dt == pl.String] + # collect_schema avoids polars's PerformanceWarning when df is a + # LazyFrame (the interpreter threads LazyFrame between ops on the + # polars path); also works on eager DataFrames. + schema = df.collect_schema() + string_cols = [name for name, dt in schema.items() if dt == pl.String] sd_updates = {c: {'highlight_regex': val} for c in string_cols} return SDResult(filtered, sd_updates) diff --git a/buckaroo/dataflow/autocleaning.py b/buckaroo/dataflow/autocleaning.py index 87e39aecb..f9c31a503 100644 --- a/buckaroo/dataflow/autocleaning.py +++ b/buckaroo/dataflow/autocleaning.py @@ -70,11 +70,21 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing cleaned_df = df return [cleaned_df, {}, generated_code, merged_operations] +def _identity(x): + return x + + class AutocleaningConfig: command_klasses = [DefaultCommandKlsList] autocleaning_analysis_klasses = [] quick_command_klasses = [] name = 'default' + # Hooks bookending the lisp-interpreter call in _run_df_interpreter. + # Default identity = current behaviour. Polars overrides with df.lazy() + # / collect-if-LazyFrame so a multi-op pipeline materialises once + # instead of after every command. See NoCleaningConfPl. + lazy_enter = staticmethod(_identity) + lazy_exit = staticmethod(_identity) class WrongFrontendQuickArgs(Exception): @@ -136,6 +146,8 @@ def _setup_from_command_kls_list(self, name): self.df_interpreter, self.gencode_interpreter = df_interpreter, gencode_interpreter self.command_config = dict(argspecs=c_patterns, defaultArgs=c_defaults) self.quick_command_klasses = conf.quick_command_klasses + self.lazy_enter = conf.lazy_enter + self.lazy_exit = conf.lazy_exit def _run_df_interpreter(self, df, operations, initial_sd): @@ -168,7 +180,13 @@ def wrap_set_df(form): # contract is precisely "no ops → caller's objects come back as-is". return df, initial_sd - return self.df_interpreter(full_ops, df, initial_sd) + # lazy_enter/lazy_exit are conf-provided hooks. Polars flips to + # LazyFrame on entry and collects on exit (one materialisation per + # pipeline instead of N). Pandas/xorq leave the defaults — identity. + # Both hooks run *after* the no-op short-circuit above, so the + # by-reference identity contract there is preserved. + ret_df, ret_sd = self.df_interpreter(full_ops, self.lazy_enter(df), initial_sd) + return self.lazy_exit(ret_df), ret_sd def _run_code_generator(self, operations): if len(operations) == 0: