From 20a849d140c4f703b91daa4684c2fa58c9662d5e Mon Sep 17 00:00:00 2001
From: Michael Chow <machow@princeton.edu>
Date: Thu, 27 Feb 2020 19:44:20 -0500
Subject: [PATCH 1/2] add siuba python code (will need to re-run nb)

---
 siuba.ipynb | 464 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 464 insertions(+)
 create mode 100644 siuba.ipynb

diff --git a/siuba.ipynb b/siuba.ipynb
new file mode 100644
index 0000000..3c6562c
--- /dev/null
+++ b/siuba.ipynb
@@ -0,0 +1,464 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Hardware Details\n",
+    "[GCP](https://cloud.google.com/) VM: [n1-highmem-16](https://cloud.google.com/compute/docs/machine-types#n1_machine_types) (16 vCPUs, 104 GB memory)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "lscpu"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "cat /proc/meminfo | head -n1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Basic functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import random\n",
+    "import string\n",
+    "import gc\n",
+    "\n",
+    "from siuba import _, summarize, group_by\n",
+    "from siuba.experimental.pd_groups import fast_summarize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def createTable(rowCount):\n",
+    "    gc.collect()\n",
+    "    return pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],\n",
+    "                  'weight': np.random.uniform(0, 2, rowCount),\n",
+    "                  'qty': np.random.randint(100, size=rowCount),\n",
+    "                  'risk': np.random.randint(10, size=rowCount)})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def executeQueryFastSummarize(t):\n",
+    "    return (\n",
+    "        t >>\n",
+    "        group_by(_.bucket) >>\n",
+    "        fast_summarize(\n",
+    "            NR = _.bucket.count(),\n",
+    "            TOTAL_QTY = _.qty.sum(),\n",
+    "            AVG_QTY = _.qty.mean(),\n",
+    "            TOTAL_RISK = _.risk.sum(),\n",
+    "            AVG_RISK = _.risk.mean(),\n",
+    "            W_AVG_QTY = (_.qty * _.weight).sum() / _.weight.sum(),\n",
+    "            W_AVG_RISK = (_.risk * _.weight).sum() / _.weight.sum()\n",
+    "        )\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# keep executeQueryApply just for reference\n",
+    "def my_agg(x):\n",
+    "    data = {'NR': x.bucket.count(),\n",
+    "            'TOTAL_QTY': x.qty.sum(),\n",
+    "            'AVG_QTY': x.qty.mean(),\n",
+    "            'TOTAL_RISK': x.risk.sum(),\n",
+    "            'AVG_RISK': x.risk.mean(),\n",
+    "            'W_AVG_QTY':  np.average(x.qty, weights=x.weight),\n",
+    "            'W_AVG_RISK':  np.average(x.risk, weights=x.weight)\n",
+    "           }\n",
+    "    return pd.Series(data, index=['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK', \n",
+    "                                  'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK'])\n",
+    "\n",
+    "def executeQueryApply(t):\n",
+    "    return t.groupby('bucket').apply(my_agg).astype(\n",
+    "        {'NR': 'int64', 'TOTAL_QTY': 'int64', 'TOTAL_RISK': 'int64'})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Demonstrate equality of methods"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pandas.testing import assert_frame_equal\n",
+    "\n",
+    "t = createTable(10*10000)\n",
+    "assert_frame_equal(\n",
+    "    executeQueryFastSummarize(t).drop(columns = \"bucket\"),\n",
+    "    executeQueryApply(t).reset_index(drop = True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Row Number 10k"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t = createTable(10 * 1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "26.4 ms ± 3.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit executeQueryFastSummarize(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.34 s ± 122 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit executeQueryApply(t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Row Number 100k"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "del t\n",
+    "t = createTable(100 * 1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "36.9 ms ± 628 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit executeQueryFastSummarize(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.19 s ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit executeQueryApply(t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Row Number 1M"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "del t\n",
+    "t = createTable(1000 * 1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "175 ms ± 2.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit executeQueryFastSummarize(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.65 s ± 9.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit executeQueryApply(t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Row Number 10M"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "del t\n",
+    "t = createTable(10 * 1000 * 1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.51 s ± 18.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit executeQueryFastSummarize(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "7.2 s ± 460 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit executeQueryApply(t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Row Number 100M"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-26-ee5840c7a24a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m#del t\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreateTable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1000\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m<ipython-input-4-ba0268379613>\u001b[0m in \u001b[0;36mcreateTable\u001b[0;34m(rowCount)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcreateTable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m     \u001b[0mgc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     return pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],\n\u001b[0m\u001b[1;32m      4\u001b[0m                   \u001b[0;34m'weight'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muniform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m                   \u001b[0;34m'qty'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m<ipython-input-4-ba0268379613>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcreateTable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m     \u001b[0mgc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     return pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],\n\u001b[0m\u001b[1;32m      4\u001b[0m                   \u001b[0;34m'weight'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muniform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m                   \u001b[0;34m'qty'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "del t\n",
+    "t = createTable(100 * 1000 * 1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%timeit executeQueryFastAgg(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%timeit -n 1 -r 10 executeQueryFastSummarize(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%timeit -n 1 -r 10  executeQueryApply(t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Row Number 1B"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "del t\n",
+    "t = createTable(1000 * 1000 * 1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%timeit -n 1 -r 10 executeQueryFastSummarize(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%timeit -n 1 -r 10 executeQueryApply(t)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From 73f1540a5d8d4842690c679a4e205068e39b8281 Mon Sep 17 00:00:00 2001
From: Michael Chow <machow@princeton.edu>
Date: Thu, 27 Feb 2020 20:18:21 -0500
Subject: [PATCH 2/2] remove unused cell from siuba

---
 siuba.ipynb | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/siuba.ipynb b/siuba.ipynb
index 3c6562c..1831c0b 100644
--- a/siuba.ipynb
+++ b/siuba.ipynb
@@ -373,15 +373,6 @@
     "t = createTable(100 * 1000 * 1000)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%timeit executeQueryFastAgg(t)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,