Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions buckaroo/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def _df_to_parquet_b64_tagged(df: pd.DataFrame) -> dict:
JSON-encoded per cell (same convention as sd_to_parquet_b64) so the
JS side can decode them uniformly via parseParquetRow().

Returns {'format': 'parquet_b64', 'data': '<base64 string>'}
Returns {'format': 'parquet_b64', 'layout': 'row', 'data': '<base64 string>'}
"""
df2 = prepare_df_for_serialization(df)
if not isinstance(df.index, pd.MultiIndex):
Expand All @@ -55,7 +55,7 @@ def _df_to_parquet_b64_tagged(df: pd.DataFrame) -> dict:
df2.to_parquet(buf, engine='pyarrow')
buf.seek(0)
b64 = base64.b64encode(buf.read()).decode('ascii')
return {'format': 'parquet_b64', 'data': b64}
return {'format': 'parquet_b64', 'layout': 'row', 'data': b64}


def prepare_buckaroo_artifact(df, column_config_overrides=None,
Expand Down
40 changes: 21 additions & 19 deletions buckaroo/serialization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,35 +271,37 @@ def _json_encode_cell(val):
def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]:
"""Convert a summary stats dict to a tagged parquet-b64 payload.

Summary stats DataFrames have mixed-type columns (strings, numbers, lists)
which fastparquet can't handle directly. We JSON-encode every cell value
first so each column becomes a pure string column, then use pyarrow for
parquet serialization. The JS side decodes parquet then JSON.parse's each cell.
Uses a wide-column layout: one parquet column per (col, stat) pair.
Column names are ``{short_col}__{stat_name}`` (e.g. ``a__mean``).
The parquet file has a single row. All cell values are JSON-encoded
via ``_json_encode_cell()`` so the JS side can ``JSON.parse`` each one.

Returns {'format': 'parquet_b64', 'data': '<base64 string>'}
Returns ``{'format': 'parquet_b64', 'layout': 'wide', 'data': '<base64>'}``
Falls back to JSON if parquet serialization fails.
"""
# JSON-encode every value so parquet sees only string columns
json_sd: Dict[str, Any] = {}
for col, stats in sd.items():
if isinstance(stats, dict):
json_sd[col] = {k: _json_encode_cell(v) for k, v in stats.items()}
else:
json_sd[col] = stats
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame(json_sd)
df2 = prepare_df_for_serialization(df)
# Add level_0 for backwards compatibility with JSON path (pd_to_obj adds it)
if not isinstance(df.index, pd.MultiIndex):
df2['level_0'] = df2['index']
col_mapping = [(orig, to_chars(i)) for i, orig in enumerate(sd.keys())]
names: List[str] = []
arrays: List = []

for orig_col, short_col in col_mapping:
stats = sd[orig_col]
if not isinstance(stats, dict):
continue
for stat_name, val in stats.items():
names.append(f"{short_col}__{stat_name}")
arrays.append(pa.array([_json_encode_cell(val)]))

try:
table = pa.table(dict(zip(names, arrays)))
data = BytesIO()
df2.to_parquet(data, engine='pyarrow')
pq.write_table(table, data)
data.seek(0)
raw_bytes = data.read()
b64 = base64.b64encode(raw_bytes).decode('ascii')
return {'format': 'parquet_b64', 'data': b64}
return {'format': 'parquet_b64', 'layout': 'wide', 'data': b64}
except Exception as e:
logger.warning("Failed to serialize summary stats as parquet, falling back to JSON: %r", e)
return pd_to_obj(pd.DataFrame(sd))
Expand Down
13 changes: 13 additions & 0 deletions packages/buckaroo-js-core/pw-tests/static-embed.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,21 @@ import { waitForCells, getRowCount } from './ag-pw-utils';
test.describe('Static embed renders', () => {

test('AG-Grid table appears with data rows', async ({ page }) => {
// Capture all console messages for debugging
const logs: string[] = [];
page.on('console', msg => logs.push(`[${msg.type()}] ${msg.text()}`));
page.on('pageerror', err => logs.push(`[PAGE_ERROR] ${err.message}`));

await page.goto('/static-test.html');

// Give 5s for initial load, then dump console
await page.waitForTimeout(5000);
console.log('--- Browser console output ---');
for (const log of logs) console.log(log);
console.log('--- End browser console ---');
console.log('Page title:', await page.title());
console.log('Body text (first 500):', (await page.locator('body').innerText()).slice(0, 500));

// Wait for the AG-Grid cells to render (parquet decode + React mount)
await waitForCells(page);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ export const formatter = (value: any, name: any, props: any) => {
}
};

export function FloatingTooltip({ items, x, y }: any) {
export function FloatingTooltip({ items, x, y }: any): React.ReactPortal {
const offset = 30;
const renderedItems = items.map((name: [string, number], _value: number | string) => {
const [realName, realValue] = name;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ export type DFData = DFDataRow[];
export interface ParquetB64Payload {
format: 'parquet_b64';
data: string; // base64-encoded parquet bytes
layout?: 'wide' | 'row'; // 'wide' = summary stats (col__stat columns), 'row' = normal rows
}

// A value in df_data_dict can be plain JSON (DFData) or a tagged parquet payload
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export const formatter = (value: any, name: any, props: any) => {
}
};

export function FloatingTooltip({ items, x, y }: any) {
export function FloatingTooltip({ items, x, y }: any): React.ReactPortal {
const offset = 30;
const renderedItems = items.map((name: [string, number], _value: number | string) => {
const [realName, realValue] = name;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import { parquetRead, parquetMetadata } from 'hyparquet';
import { resolveDFData, resolveDFDataAsync } from './resolveDFData';
import { resolveDFData, resolveDFDataAsync, pivotWideSummaryStats } from './resolveDFData';
import { DFData, DFDataRow, ParquetB64Payload } from './DFWhole';

// Fixture generated by Python's sd_to_parquet_b64() with a summary stats dict
// containing numeric histogram data for one column.
// Fixture generated by Python's sd_to_parquet_b64() with wide-column layout.
// eslint-disable-next-line @typescript-eslint/no-var-requires
const fixture = require('./test-fixtures/summary_stats_parquet_b64.json');
const parquetPayload: ParquetB64Payload = fixture as ParquetB64Payload;
Expand All @@ -29,9 +28,7 @@ describe('resolveDFData', () => {
expect(resolveDFData(data)).toBe(data);
});

it('hyparquet can read the parquet_b64 fixture', async () => {
// Verify the fixture is valid and hyparquet can decode it.
// This is independent of resolveDFData — it tests the raw decode path.
it('hyparquet can read the wide-format parquet_b64 fixture', async () => {
const buf = b64ToArrayBuffer(parquetPayload.data);
const metadata = parquetMetadata(buf);
expect(metadata.row_groups.length).toBeGreaterThan(0);
Expand All @@ -44,60 +41,124 @@ describe('resolveDFData', () => {
onComplete: (data: any[]) => { rows.push(...data); },
});

expect(rows.length).toBeGreaterThan(0);
// Wide format: single row with col__stat columns
expect(rows.length).toBe(1);
const keys = Object.keys(rows[0]);
expect(keys.some(k => k.includes('__'))).toBe(true);
expect(keys).toContain('a__mean');
expect(keys).toContain('b__dtype');
});

// Should have an 'index' column with stat names
const indices = rows.map(r => r.index).filter(Boolean);
expect(indices).toContain('histogram');
expect(indices).toContain('dtype');
it('sync resolveDFData returns [] for parquet_b64 (known async limitation)', () => {
const result = resolveDFData(parquetPayload);
expect(result.length).toBe(0);
});

it('parquet_b64 histogram data round-trips with correct types', async () => {
// Decode the fixture and verify histogram arrays have the right structure.
const buf = b64ToArrayBuffer(parquetPayload.data);
const metadata = parquetMetadata(buf);
it('async resolveDFDataAsync returns pivoted DFData for wide-format parquet', async () => {
const result = await resolveDFDataAsync(parquetPayload);
expect(result.length).toBeGreaterThan(0);

const rows: DFDataRow[] = [];
await parquetRead({
file: buf,
metadata,
rowFormat: 'object',
onComplete: (data: any[]) => { rows.push(...data); },
});
// Should have row-based format with index column
const meanRow = result.find(r => r.index === 'mean');
expect(meanRow).toBeDefined();
expect(meanRow!.a).toBe(50.0);
expect(meanRow!.b).toBe(22.0);

const histRow = rows.find(r => r.index === 'histogram');
expect(histRow).toBeDefined();
const dtypeRow = result.find(r => r.index === 'dtype');
expect(dtypeRow).toBeDefined();
expect(dtypeRow!.a).toBe('float64');
expect(dtypeRow!.b).toBe('int64');
});

// Column 'a' contains the JSON-encoded histogram array
const rawCell = histRow!['a'];
expect(typeof rawCell).toBe('string');
it('async decode produces histogram arrays from JSON strings', async () => {
const result = await resolveDFDataAsync(parquetPayload);

const parsed = JSON.parse(rawCell as string);
expect(Array.isArray(parsed)).toBe(true);
expect(parsed.length).toBeGreaterThan(0);
const histRow = result.find(r => r.index === 'histogram');
expect(histRow).toBeDefined();
expect(Array.isArray(histRow!.a)).toBe(true);
const hist = histRow!.a as any[];
expect(hist.length).toBe(5);
expect(typeof hist[0].population).toBe('number');
expect(hist[0].name).toBe('0-20');
});

// Verify types: population should be a number, not a string
const popBar = parsed.find((b: any) => b.population !== undefined);
expect(popBar).toBeDefined();
expect(typeof popBar.population).toBe('number');
expect(typeof parsed[0].name).toBe('string');
it('async decode produces histogram_bins arrays', async () => {
const result = await resolveDFDataAsync(parquetPayload);

const binsRow = result.find(r => r.index === 'histogram_bins');
expect(binsRow).toBeDefined();
expect(Array.isArray(binsRow!.a)).toBe(true);
expect((binsRow!.a as number[]).length).toBe(6);
});
});

it('sync resolveDFData returns [] for parquet_b64 (known async limitation)', () => {
// Documents #630: parquetRead is async so the sync wrapper returns [].
// Widget components use useResolvedDFDataDict which falls back to async.
// The static embed path uses resolveDFDataAsync which works correctly.
const result = resolveDFData(parquetPayload);
expect(result.length).toBe(0);
describe('pivotWideSummaryStats', () => {
it('pivots a wide row into row-based DFData', () => {
const wideRow = {
a__mean: 42.5,
a__dtype: 'float64',
b__mean: 10.0,
b__dtype: 'int64',
};
const result = pivotWideSummaryStats(wideRow);

const meanRow = result.find(r => r.index === 'mean');
expect(meanRow).toBeDefined();
expect(meanRow!.a).toBe(42.5);
expect(meanRow!.b).toBe(10.0);
expect(meanRow!.level_0).toBe('mean');

const dtypeRow = result.find(r => r.index === 'dtype');
expect(dtypeRow).toBeDefined();
expect(dtypeRow!.a).toBe('float64');
expect(dtypeRow!.b).toBe('int64');
});

it('async resolveDFDataAsync returns non-empty result for parquet_b64', async () => {
const result = await resolveDFDataAsync(parquetPayload);
expect(result.length).toBeGreaterThan(0);
it('JSON-parses list/object values in string cells', () => {
const wideRow = {
a__histogram: '[{"name": "foo", "population": 10}]',
a__dtype: 'float64',
};
const result = pivotWideSummaryStats(wideRow);

// Verify the histogram row was JSON-parsed correctly
const histRow = result.find(r => r.index === 'histogram');
expect(histRow).toBeDefined();
expect(Array.isArray(histRow!['a'])).toBe(true);
expect(Array.isArray(histRow!.a)).toBe(true);
expect((histRow!.a as any[])[0].population).toBe(10);
});

it('keeps plain strings as strings (not JSON-parsed)', () => {
const wideRow = {
a__dtype: 'float64',
};
const result = pivotWideSummaryStats(wideRow);
const row = result.find(r => r.index === 'dtype');
expect(row!.a).toBe('float64');
});

it('handles null values', () => {
const wideRow = {
a__mean: null,
a__dtype: 'float64',
};
const result = pivotWideSummaryStats(wideRow);
const meanRow = result.find(r => r.index === 'mean');
expect(meanRow!.a).toBeNull();
});

it('fills missing columns with null', () => {
const wideRow = {
a__mean: 42,
b__dtype: 'int64',
};
const result = pivotWideSummaryStats(wideRow);

const meanRow = result.find(r => r.index === 'mean');
expect(meanRow!.a).toBe(42);
expect(meanRow!.b).toBeNull();

const dtypeRow = result.find(r => r.index === 'dtype');
expect(dtypeRow!.a).toBeNull();
expect(dtypeRow!.b).toBe('int64');
});
});
Loading
Loading