buckaroo-data · paddymul · Mar 21, 2026 · chatgpt-codex-connector · Mar 21, 2026
diff --git a/docs/plans/646-wide-column-summary-stats.md b/docs/plans/646-wide-column-summary-stats.md
@@ -0,0 +1,61 @@
+# Plan: Replace JSON-in-Parquet summary stats with wide-column layout (#646)
+
+## Context
+
+Summary stats serialization (`sd_to_parquet_b64`) JSON-encodes every cell value to a string before writing to parquet; the JS side then calls `JSON.parse` on every cell to decode it. This defeats parquet's type preservation — numbers, bools, and strings all become JSON strings stuffed in string columns. The fix: flatten to one parquet column per cell (`a__mean`, `a__histogram`, etc.) so scalars go through parquet natively, and only lists/dicts still need JSON encoding.
+
+## Approach
+
+Replace `sd_to_parquet_b64` in-place — no legacy code, no backwards compat format tag. Python flattens the stats dict to `{col__stat: [value]}` (single-row, many columns). JS decodes parquet, pivots the wide row back to the row-based `DFData` that all consumers already expect. The format tag stays `parquet_b64` (same as before).
+
+## Files to modify
+
+### Python: `buckaroo/serialization_utils.py`
+1. Add `_to_python_native(val)` — convert numpy scalars to Python builtins for pyarrow
+2. Add `_sd_to_parquet_b64_wide(sd)`:
+   - Rename columns to a,b,c via `to_chars()` (reuse existing `old_col_new_col` logic)
+   - For each `(col, stats_dict)`, for each `(stat, value)`:
+     - Column name = `f"{short_col}__{stat}"`
+     - If value is list/dict/tuple → JSON-encode to string
+     - If value is numpy scalar → convert to Python native
+     - If value is None/NaN → store as `None` (pyarrow handles nulls natively)
+     - Otherwise → store as-is
+   - Build `pa.table()` directly (single-row, one column per cell)
+   - Write parquet, base64 encode
+   - Return `{'format': 'parquet_b64', 'data': '...'}` (the format tag is unchanged, as stated in the Approach section)
+3. Replace `sd_to_parquet_b64` body with the wide-column implementation (no legacy fallback)
+
+### TypeScript: `packages/buckaroo-js-core/src/components/DFViewerParts/resolveDFData.ts`
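+1. Replace `parseParquetRow()` with `pivotWideSummaryStats(wideRow)` — splits each column name on the first `__` (the short column names a, b, c, … contain no `__`, so the first occurrence always separates column from stat, even if a stat name itself contains `__`), groups by stat name, and produces `DFData` rows like `{index: "mean", level_0: "mean", a: 42, b: 33}`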
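+2. For complex values (strings that are JSON arrays/objects), JSON.parse them during pivot. Note: a genuine scalar string stat whose text happens to look like JSON (e.g. a most-frequent value of `"[1, 2]"`) is indistinguishable from an encoded list under this rule; the pivot must either accept that or restrict parsing to stats known to be complex (e.g. histograms)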
+3. Update `resolveDFData()` and `resolveDFDataAsync()` to call `pivotWideSummaryStats` on the single decoded row
+
+### No changes needed in downstream consumers
+- `extractSDFT()`, `extractPinnedRows()`, AG-Grid pinned rows, `Styler.tsx` — all receive `DFData` after the pivot, same shape as before
+- All Python callers of `sd_to_parquet_b64` — same function signature, same tagged return format
+
+## Tests
+
+### Pre-check: DOM integration test
+Before making any changes, verify an existing DOM/integration test checks that summary stat rows (e.g. mean, dtype) appear in the rendered grid. If no such test exists, add one. Run it green before proceeding.
+
+### Python: `tests/unit/test_sd_to_parquet_b64.py`
+- Rewrite tests for the new wide format (single-row, `col__stat` columns)
+- Scalar values are now native types in parquet (not JSON strings)
+- Histogram columns are still JSON strings (complex types)
+- None/NaN values are explicit nulls in parquet
+- Remove old round-trip tests that assert JSON-encoded cells
+
+### TypeScript: `resolveDFData.test.ts`
+- Regenerate `test-fixtures/summary_stats_parquet_b64.json` with new wide format
+- Add test for `pivotWideSummaryStats` directly
+- Remove old `parseParquetRow` tests
+- Verify async decode produces correct DFData shape
+
+## Verification
+
+1. Run DOM integration test green BEFORE changes
+2. `pytest tests/unit/test_sd_to_parquet_b64.py -vv`
+3. `cd packages/buckaroo-js-core && pnpm test`
+4. Run DOM integration test green AFTER changes
+5. Full test suite: `pytest -vv tests/unit/ && cd packages && pnpm test`
diff --git a/packages/buckaroo-js-core/src/components/DFViewerParts/DFViewerInfinite.test.tsx b/packages/buckaroo-js-core/src/components/DFViewerParts/DFViewerInfinite.test.tsx
@@ -130,4 +130,101 @@ describe("DFViewerInfinite", () => {
     expect(latestAgGridProps.gridOptions.rowModelType).toBe("infinite");
     expect(latestAgGridProps.datasource.rowCount).toBe(50);
   });
+
+  it("pins multiple stat rows when config requests them", () => {
+    const multiPinConfig: DFViewerConfig = {
+      pinned_rows: [
+        { primary_key_val: "mean", displayer_args: { displayer: "obj" } },
+        { primary_key_val: "dtype", displayer_args: { displayer: "obj" } },
+      ],
+      left_col_configs: [],
+      column_config: [
+        { col_name: "index", header_name: "index", displayer_args: { displayer: "obj" } },
+        { col_name: "a", header_name: "a", displayer_args: { displayer: "obj" } },
+        { col_name: "b", header_name: "b", displayer_args: { displayer: "obj" } },
+      ],
+      component_config: {},
+    };
+    const statsData = [
+      { index: "mean", a: 42.5, b: 10.1 },
+      { index: "dtype", a: "float64", b: "int64" },
+      { index: "histogram_bins", a: [0, 25, 50, 75, 100], b: [0, 5, 10] }, // not listed in pinned_rows, so it must not be pinned
+    ];
+
+    render(
+      <DFViewerInfinite
+        data_wrapper={{ data_type: "Raw", data: [{ index: 0, a: 1, b: 2 }], length: 1 }}
+        df_viewer_config={multiPinConfig}
+        summary_stats_data={statsData}
+        setActiveCol={jest.fn()}
+      />,
+    );
+
+    expect(setGridOptionMock).toHaveBeenCalledWith("pinnedTopRowData", [ // only the two stat rows named in pinned_rows
+      { index: "mean", a: 42.5, b: 10.1 },
+      { index: "dtype", a: "float64", b: "int64" },
+    ]);
+  });
+
+  it("passes histogram_stats in context for color mapping", () => {
+    const statsData = [
+      { index: "histogram_bins", a: [0, 25, 50, 75, 100] },
+      { index: "histogram_log_bins", a: [1, 10, 100] },
+      { index: "mean", a: 50 }, // non-histogram row; the exact-match assertion below requires it to be left out
+    ];
+
+    render(
+      <DFViewerInfinite
+        data_wrapper={{ data_type: "Raw", data: [{ index: 0, a: 1 }], length: 1 }}
+        df_viewer_config={baseConfig}
+        summary_stats_data={statsData}
+        setActiveCol={jest.fn()}
+      />,
+    );
+
+    // The context passed to AG-Grid should include histogram_stats, keyed by column name and then by histogram stat name
+    const context = latestAgGridProps.context;
+    expect(context.histogram_stats).toBeDefined();
+    expect(context.histogram_stats.a).toEqual({
+      histogram_bins: [0, 25, 50, 75, 100],
+      histogram_log_bins: [1, 10, 100],
+    });
+  });
+
+  it("handles empty summary stats gracefully", () => {
+    render(
+      <DFViewerInfinite
+        data_wrapper={{ data_type: "Raw", data: [{ index: 0, a: 1 }], length: 1 }}
+        df_viewer_config={baseConfig}
+        summary_stats_data={[]}
+        setActiveCol={jest.fn()}
+      />,
+    );
+
+    // Should not crash. Note the pinned data is [undefined], not []: presumably baseConfig (defined outside this hunk) requests one pinned row and no stat row matches it — TODO confirm
+    expect(setGridOptionMock).toHaveBeenCalledWith("pinnedTopRowData", [undefined]);
+    // histogram_stats should be empty object
+    const context = latestAgGridProps.context;
+    expect(context.histogram_stats).toEqual({});
+  });
+
+  it("handles summary stats with null values in columns", () => {
+    const statsData = [
+      { index: "mean", a: 42.5, b: null },
+      { index: "dtype", a: "float64", b: "object" }, // absent from the expectation below; presumably baseConfig (outside this hunk) pins only "mean" — TODO confirm
+    ];
+
+    render(
+      <DFViewerInfinite
+        data_wrapper={{ data_type: "Raw", data: [{ index: 0, a: 1, b: "x" }], length: 1 }}
+        df_viewer_config={baseConfig}
+        summary_stats_data={statsData}
+        setActiveCol={jest.fn()}
+      />,
+    );
+
+    expect(setGridOptionMock).toHaveBeenCalledWith("pinnedTopRowData", [
+      { index: "mean", a: 42.5, b: null }, // the null cell is passed through unchanged, not dropped or replaced
+    ]);
+  });
 });