Skip to content

Commit e5369df

Browse files
authored
Feat/0.11.1 bug fixes (#406)
* Release 0.11.1 * Fixes sort for pandas with new column naming * Fixes lowcode UI with new column naming * More reliable default analysis for pandas * Start of the ddd gallery for showing tricky dataframes * Start of Patrick big file blog post
1 parent 057e3e2 commit e5369df

22 files changed

Lines changed: 960 additions & 57 deletions

buckaroo/buckaroo_widget.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,9 @@ def _handle_payload_args(self, new_payload_args):
370370
if sort:
371371
sort_dir = new_payload_args.get('sort_direction')
372372
ascending = sort_dir == 'asc'
373-
sorted_df = processed_df.sort_values(by=[sort], ascending=ascending)
373+
processed_sd = self.dataflow.widget_args_tuple[2]
374+
converted_sort_column = processed_sd[sort]['orig_col_name']
375+
sorted_df = processed_df.sort_values(by=[converted_sort_column], ascending=ascending)
374376
slice_df = sorted_df[start:end]
375377
self.send({ "type": "infinite_resp", 'key':new_payload_args, 'data':[], 'length':len(processed_df)}, [to_parquet(slice_df)])
376378
else:

buckaroo/customizations/analysis.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def series_summary(sampled_ser, ser):
125125
mode=get_mode(ser),
126126
min=np.nan,
127127
max=np.nan)
128-
if is_numeric and not is_bool:
128+
if is_numeric and not is_bool and base_d['null_count'] < l:
129129
base_d.update({
130130
'std': ser.std(),
131131
'mean': ser.mean(),

buckaroo/customizations/pandas_commands.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,10 @@ def transform(df, col, val):
485485

486486
@staticmethod
487487
def transform_to_py(df, col, val):
488-
return " df.fillna({'%s':%r}, inplace=True)" % (col, val)
488+
return f""" from buckaroo.customizations.pandas_commands import search_df_str
489+
return search_df_str(df, '{val}')"""
490+
491+
489492

490493

491494
def search_col_str(df, col, needle:str):
@@ -513,8 +516,8 @@ def transform(df, col, val):
513516

514517
@staticmethod
515518
def transform_to_py(df, col, needle):
516-
return f" df = df[~(df['{col}'].str.find('{needle}').fillna(-1) == -1).fillna(False)]"
517-
519+
return f""" from buckaroo.customizations.pandas_commands import search_col_str
520+
return search_col_str(df, '{col}', '{needle}')"""
518521

519522

520523
class DropDuplicates(Command):

buckaroo/dataflow/autocleaning.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,6 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing
192192
cleaning_sd = {}
193193
else:
194194
final_ops = self.produce_final_ops(cleaning_ops, quick_command_args, existing_operations)
195-
196195
if ops_eq(final_ops,[]) and cleaning_method == "":
197196
#nothing to be done here, no point in running the interpreter
198197
#this also has the nice effect of not copying the DF, which the interpreter does

buckaroo/ddd_library.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ def get_basic_df_with_named_index():
2020
basic_index_with_df.index.name = "named_index"
2121
return basic_index_with_df
2222

23-
def get_multindex_cols_df(rows=15) -> pd.DataFrame:
23+
def get_multiindex_cols_df(rows=15) -> pd.DataFrame:
2424
cols = pd.MultiIndex.from_tuples(
2525
[('foo', 'a'), ('foo', 'b'), ('bar', 'a'), ('bar', 'b'), ('bar', 'c')])
2626
return pd.DataFrame(
2727
[["asdf","foo_b", "bar_a", "bar_b", "bar_c"]] * rows,
2828
columns=cols)
2929

30-
def get_multindex_with_names_cols_df(rows=15) -> pd.DataFrame:
30+
def get_multiindex_with_names_cols_df(rows=15) -> pd.DataFrame:
3131
cols = pd.MultiIndex.from_tuples(
3232
[('foo', 'a'), ('foo', 'b'), ('bar', 'a'), ('bar', 'b'), ('bar', 'c')],
3333
names=['level_a', 'level_b'])
@@ -36,7 +36,7 @@ def get_multindex_with_names_cols_df(rows=15) -> pd.DataFrame:
3636
columns=cols)
3737

3838
def get_tuple_cols_df(rows=15) -> pd.DataFrame:
39-
multi_col_df = get_multindex_cols_df(rows)
39+
multi_col_df = get_multiindex_cols_df(rows)
4040
multi_col_df.columns = multi_col_df.columns.to_flat_index()
4141
return multi_col_df
4242

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "4bacd1ad-6658-4e69-b596-d9db5a0a2201",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import pandas as pd\n",
11+
"import polars as pl\n",
12+
"import buckaroo\n",
13+
"JULY_FILE = \"~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv\"\n"
14+
]
15+
},
16+
{
17+
"cell_type": "markdown",
18+
"id": "2c21c2b4-8d86-4d2e-a7df-5022a0ed296c",
19+
"metadata": {},
20+
"source": [
21+
"# Let's investigate this file\n",
22+
"We are going to use some Unix command-line utilities. These are generally very fast and memory-efficient."
23+
]
24+
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": null,
28+
"id": "8eabb58f-ce10-4ac3-b5eb-99e1a2ada843",
29+
"metadata": {},
30+
"outputs": [],
31+
"source": [
32+
"!du -h /Users/paddy/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": null,
38+
"id": "1338b817-097d-4155-a131-cf5b011a8ccc",
39+
"metadata": {},
40+
"outputs": [],
41+
"source": [
42+
"!time cat /Users/paddy/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv > /dev/null"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": null,
48+
"id": "15037796-b479-493d-b9ee-00ddbe69189b",
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"!time wc -l ~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv"
53+
]
54+
},
55+
{
56+
"cell_type": "markdown",
57+
"id": "538bfa98-9e1c-4732-b260-c076fb3aba55",
58+
"metadata": {},
59+
"source": [
60+
"# Let's talk about polars\n",
61+
"\n",
62+
"Pandas was a huge leap forward for data science when it came out 15 years ago. Polars is a clean slate design oriented around performance.\n",
63+
"\n",
64+
"Polars is faster for two reasons: first, it operates in parallel; second, it works lazily, so in many cases it doesn't need to load an entire dataframe into memory"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": null,
70+
"id": "4cb4e1a5-b8da-4d39-8ef5-625f095e235b",
71+
"metadata": {},
72+
"outputs": [],
73+
"source": [
74+
"df = pl.read_csv(JULY_FILE, n_rows=5_000, low_memory=True)\n",
75+
"df"
76+
]
77+
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": null,
81+
"id": "1cd590d9-c82e-4dfc-b1d9-a1b8d3d52d73",
82+
"metadata": {},
83+
"outputs": [],
84+
"source": [
85+
"#df"
86+
]
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": null,
91+
"id": "35a7f426-be62-43c2-881e-de892b3f3a1c",
92+
"metadata": {},
93+
"outputs": [],
94+
"source": [
95+
"#df.filter(pl.any_horizontal(pl.col(pl.String).str.contains('GRES')))"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": null,
101+
"id": "4d85918d-97da-467e-bd1d-3ac078a115bd",
102+
"metadata": {},
103+
"outputs": [],
104+
"source": [
105+
"NROWS=500_000\n",
106+
"%timeit pl.read_csv(JULY_FILE, n_rows=NROWS).filter(pl.any_horizontal(pl.col(pl.String).str.contains('GRES')))"
107+
]
108+
},
109+
{
110+
"cell_type": "code",
111+
"execution_count": null,
112+
"id": "e54a5ed2-b000-4563-9c15-e71c346d5621",
113+
"metadata": {},
114+
"outputs": [],
115+
"source": [
116+
"%timeit pl.scan_csv(JULY_FILE, n_rows=NROWS).filter(pl.any_horizontal(pl.col(pl.String).str.contains('GRES'))).collect()"
117+
]
118+
},
119+
{
120+
"cell_type": "code",
121+
"execution_count": null,
122+
"id": "74450ee4-c29a-4230-8ed3-e2e3fa987485",
123+
"metadata": {},
124+
"outputs": [],
125+
"source": [
126+
"!time cat ~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv | grep 367H | wc -l"
127+
]
128+
},
129+
{
130+
"cell_type": "code",
131+
"execution_count": null,
132+
"id": "cb877171-0af5-48a3-b939-647384691139",
133+
"metadata": {},
134+
"outputs": [],
135+
"source": [
136+
"from datetime import datetime\n",
137+
"start = datetime.now()\n",
138+
"filtered_df = pl.scan_csv(JULY_FILE, low_memory=True).filter(pl.any_horizontal(pl.col(pl.String).str.contains('367H'))).collect()\n",
139+
"end = datetime.now()\n"
140+
]
141+
},
142+
{
143+
"cell_type": "code",
144+
"execution_count": null,
145+
"id": "a0b7d2a9-fc86-4177-8b56-a6506042048c",
146+
"metadata": {},
147+
"outputs": [],
148+
"source": [
149+
"end - start"
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": null,
155+
"id": "478c1bcf-b03f-440e-a341-82145ec9e9e2",
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"filtered_df.write_csv(\"367H.csv\")"
160+
]
161+
},
162+
{
163+
"cell_type": "code",
164+
"execution_count": null,
165+
"id": "8581fa24-c05a-4305-8466-e70e71726e2f",
166+
"metadata": {},
167+
"outputs": [],
168+
"source": [
169+
"filtered_df"
170+
]
171+
},
172+
{
173+
"cell_type": "code",
174+
"execution_count": null,
175+
"id": "fed6d1ff-a0f2-4032-8d8a-a5fa9adb5cce",
176+
"metadata": {},
177+
"outputs": [],
178+
"source": []
179+
}
180+
],
181+
"metadata": {
182+
"kernelspec": {
183+
"display_name": "Python 3 (ipykernel)",
184+
"language": "python",
185+
"name": "python3"
186+
},
187+
"language_info": {
188+
"codemirror_mode": {
189+
"name": "ipython",
190+
"version": 3
191+
},
192+
"file_extension": ".py",
193+
"mimetype": "text/x-python",
194+
"name": "python",
195+
"nbconvert_exporter": "python",
196+
"pygments_lexer": "ipython3",
197+
"version": "3.12.8"
198+
},
199+
"widgets": {
200+
"application/vnd.jupyter.widget-state+json": {
201+
"state": {},
202+
"version_major": 2,
203+
"version_minor": 0
204+
}
205+
}
206+
},
207+
"nbformat": 4,
208+
"nbformat_minor": 5
209+
}

0 commit comments

Comments
 (0)