buckaroo-data
diff --git a/‎buckaroo/buckaroo_widget.py‎
Lines changed: 3 additions & 1 deletion b/‎buckaroo/buckaroo_widget.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎buckaroo/customizations/analysis.py‎
Lines changed: 1 addition & 1 deletion b/‎buckaroo/customizations/analysis.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎buckaroo/customizations/pandas_commands.py‎
Lines changed: 6 additions & 3 deletions b/‎buckaroo/customizations/pandas_commands.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎buckaroo/dataflow/autocleaning.py‎
Lines changed: 0 additions & 1 deletion b/‎buckaroo/dataflow/autocleaning.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎buckaroo/ddd_library.py‎
Lines changed: 3 additions & 3 deletions b/‎buckaroo/ddd_library.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docs/example-notebooks/BigFile.ipynb‎
Lines changed: 209 additions & 0 deletions b/‎docs/example-notebooks/BigFile.ipynb‎
Lines changed: 209 additions & 0 deletions
@@ -370,7 +370,9 @@ def _handle_payload_args(self, new_payload_args):
             if sort:
                 sort_dir = new_payload_args.get('sort_direction')
                 ascending = sort_dir == 'asc'
-                sorted_df = processed_df.sort_values(by=[sort], ascending=ascending)
+                processed_sd = self.dataflow.widget_args_tuple[2]
+                converted_sort_column = processed_sd[sort]['orig_col_name']
+                sorted_df = processed_df.sort_values(by=[converted_sort_column], ascending=ascending)
                 slice_df = sorted_df[start:end]
                 self.send({ "type": "infinite_resp", 'key':new_payload_args, 'data':[], 'length':len(processed_df)}, [to_parquet(slice_df)])
             else:
 
@@ -125,7 +125,7 @@ def series_summary(sampled_ser, ser):
             mode=get_mode(ser),
             min=np.nan,
             max=np.nan)
-        if is_numeric and not is_bool:
+        if is_numeric and not is_bool and base_d['null_count'] < l:
             base_d.update({
                 'std': ser.std(),
                 'mean': ser.mean(),
 
@@ -485,7 +485,10 @@ def transform(df, col, val):
 
     @staticmethod 
     def transform_to_py(df, col, val):
-        return "    df.fillna({'%s':%r}, inplace=True)" % (col, val)
+        return f"""    from buckaroo.customizations.pandas_commands import search_df_str
+    return search_df_str(df, '{val}')"""
+
+
 
 
 def search_col_str(df, col, needle:str):
@@ -513,8 +516,8 @@ def transform(df, col, val):
 
     @staticmethod 
     def transform_to_py(df, col, needle):
-        return f"    df = df[~(df['{col}'].str.find('{needle}').fillna(-1) == -1).fillna(False)]"
-
+        return f"""    from buckaroo.customizations.pandas_commands import search_col_str
+    return search_col_str(df, '{col}', '{needle}')"""
 
 
 class DropDuplicates(Command):
 
@@ -192,7 +192,6 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing
             cleaning_sd = {}
         else:
             final_ops = self.produce_final_ops(cleaning_ops, quick_command_args, existing_operations)
-
         if ops_eq(final_ops,[]) and cleaning_method == "":
             #nothing to be done here, no point in running the interpreter
             #this also has the nice effect of not copying the DF, which the interpreter does
 
@@ -20,14 +20,14 @@ def get_basic_df_with_named_index():
     basic_index_with_df.index.name = "named_index"
     return basic_index_with_df
 
-def get_multindex_cols_df(rows=15) -> pd.DataFrame:
+def get_multiindex_cols_df(rows=15) -> pd.DataFrame:
     cols = pd.MultiIndex.from_tuples(
     [('foo', 'a'), ('foo', 'b'),  ('bar', 'a'), ('bar', 'b'), ('bar', 'c')])
     return pd.DataFrame(
         [["asdf","foo_b", "bar_a", "bar_b", "bar_c"]] * rows,
         columns=cols)
 
-def get_multindex_with_names_cols_df(rows=15) -> pd.DataFrame:
+def get_multiindex_with_names_cols_df(rows=15) -> pd.DataFrame:
     cols = pd.MultiIndex.from_tuples(
         [('foo', 'a'), ('foo', 'b'),  ('bar', 'a'), ('bar', 'b'), ('bar', 'c')],
         names=['level_a', 'level_b'])
@@ -36,7 +36,7 @@ def get_multindex_with_names_cols_df(rows=15) -> pd.DataFrame:
         columns=cols)
 
 def get_tuple_cols_df(rows=15) -> pd.DataFrame:
-    multi_col_df = get_multindex_cols_df(rows)
+    multi_col_df = get_multiindex_cols_df(rows)
     multi_col_df.columns = multi_col_df.columns.to_flat_index()
     return multi_col_df
 
 
@@ -0,0 +1,209 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bacd1ad-6658-4e69-b596-d9db5a0a2201",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import polars as pl\n",
+    "import buckaroo\n",
+    "JULY_FILE = \"~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c21c2b4-8d86-4d2e-a7df-5022a0ed296c",
+   "metadata": {},
+   "source": [
+    "# Lets investigate this file\n",
+    "We are going to use some unix command line utils.  These are generally going to be very fast and memory efficient"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8eabb58f-ce10-4ac3-b5eb-99e1a2ada843",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!du -h /Users/paddy/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1338b817-097d-4155-a131-cf5b011a8ccc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!time cat /Users/paddy/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv > /dev/null"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15037796-b479-493d-b9ee-00ddbe69189b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!time wc -l ~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "538bfa98-9e1c-4732-b260-c076fb3aba55",
+   "metadata": {},
+   "source": [
+    "# Let's talk about polars\n",
+    "\n",
+    "Pandas was a huge leap forward for data science when it came out 15 years ago. Polars is a clean slate design oriented around performance.\n",
+    "\n",
+    "Polars is faster for two reasons, one in operates in parallel, two it works lazily, in many cases it doesn't need to load an entire dataframe into memory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cb4e1a5-b8da-4d39-8ef5-625f095e235b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pl.read_csv(JULY_FILE, n_rows=5_000, low_memory=True)\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cd590d9-c82e-4dfc-b1d9-a1b8d3d52d73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35a7f426-be62-43c2-881e-de892b3f3a1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#df.filter(pl.any_horizontal(pl.col(pl.String).str.contains('GRES')))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d85918d-97da-467e-bd1d-3ac078a115bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NROWS=500_000\n",
+    "%timeit pl.read_csv(JULY_FILE, n_rows=NROWS).filter(pl.any_horizontal(pl.col(pl.String).str.contains('GRES')))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e54a5ed2-b000-4563-9c15-e71c346d5621",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%timeit pl.scan_csv(JULY_FILE, n_rows=NROWS).filter(pl.any_horizontal(pl.col(pl.String).str.contains('GRES'))).collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74450ee4-c29a-4230-8ed3-e2e3fa987485",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!time cat  ~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv | grep 367H | wc -l"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb877171-0af5-48a3-b939-647384691139",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "start = datetime.now()\n",
+    "filtered_df = pl.scan_csv(JULY_FILE, low_memory=True).filter(pl.any_horizontal(pl.col(pl.String).str.contains('367H'))).collect()\n",
+    "end = datetime.now()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0b7d2a9-fc86-4177-8b56-a6506042048c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "end - start"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "478c1bcf-b03f-440e-a341-82145ec9e9e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filtered_df.write_csv(\"367H.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8581fa24-c05a-4305-8466-e70e71726e2f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filtered_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fed6d1ff-a0f2-4032-8d8a-a5fa9adb5cce",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}