From 20a849d140c4f703b91daa4684c2fa58c9662d5e Mon Sep 17 00:00:00 2001 From: Michael Chow Date: Thu, 27 Feb 2020 19:44:20 -0500 Subject: [PATCH 1/2] add siuba python code (will need to re-run nb) --- siuba.ipynb | 464 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 464 insertions(+) create mode 100644 siuba.ipynb diff --git a/siuba.ipynb b/siuba.ipynb new file mode 100644 index 0000000..3c6562c --- /dev/null +++ b/siuba.ipynb @@ -0,0 +1,464 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hardware Details\n", + "[GCP](https://cloud.google.com/) VM: [n1-highmem-16](https://cloud.google.com/compute/docs/machine-types#n1_machine_types) (16 vCPUs, 104 GB memory)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%bash\n", + "lscpu" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%bash\n", + "cat /proc/meminfo | head -n1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic functions" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import string\n", + "import gc\n", + "\n", + "from siuba import _, summarize, group_by\n", + "from siuba.experimental.pd_groups import fast_summarize" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def createTable(rowCount):\n", + " gc.collect()\n", + " return pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],\n", + " 'weight': np.random.uniform(0, 2, rowCount),\n", + " 'qty': np.random.randint(100, size=rowCount),\n", + " 'risk': np.random.randint(10, size=rowCount)})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def executeQueryFastSummarize(t):\n", + " return (\n", + " t >>\n", + " group_by(_.bucket) >>\n", + " fast_summarize(\n", + " NR = _.bucket.count(),\n", + " TOTAL_QTY = _.qty.sum(),\n", + " AVG_QTY = _.qty.mean(),\n", + " TOTAL_RISK = _.risk.sum(),\n", + " AVG_RISK = _.risk.mean(),\n", + " W_AVG_QTY = (_.qty * _.weight).sum() / _.weight.sum(),\n", + " W_AVG_RISK = (_.risk * _.weight).sum() / _.weight.sum()\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# keep executeQueryApply just for reference\n", + "def my_agg(x):\n", + " data = {'NR': x.bucket.count(),\n", + " 'TOTAL_QTY': x.qty.sum(),\n", + " 'AVG_QTY': x.qty.mean(),\n", + " 'TOTAL_RISK': x.risk.sum(),\n", + " 'AVG_RISK': x.risk.mean(),\n", + " 'W_AVG_QTY': np.average(x.qty, weights=x.weight),\n", + " 'W_AVG_RISK': np.average(x.risk, weights=x.weight)\n", + " }\n", + " return pd.Series(data, index=['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK', \n", + " 'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK'])\n", + "\n", + "def executeQueryApply(t):\n", + " return t.groupby('bucket').apply(my_agg).astype(\n", + " {'NR': 'int64', 'TOTAL_QTY': 'int64', 'TOTAL_RISK': 'int64'})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demonstrate equality of methods" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from pandas.testing import assert_frame_equal\n", + "\n", + "t = createTable(10*10000)\n", + "assert_frame_equal(\n", + " executeQueryFastSummarize(t).drop(columns = \"bucket\"),\n", + " executeQueryApply(t).reset_index(drop = True)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Row Number 10k" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "t = createTable(10 * 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "26.4 ms ± 3.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit executeQueryFastSummarize(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.34 s ± 122 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit executeQueryApply(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Row Number 100k" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "del t\n", + "t = createTable(100 * 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "36.9 ms ± 628 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit executeQueryFastSummarize(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.19 s ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit executeQueryApply(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Row Number 1M" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "del t\n", + "t = createTable(1000 * 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "175 ms ± 2.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit executeQueryFastSummarize(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.65 s ± 9.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit executeQueryApply(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Row Number 10M" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "del t\n", + "t = createTable(10 * 1000 * 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.51 s ± 18.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit executeQueryFastSummarize(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.2 s ± 460 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit executeQueryApply(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Row Number 100M" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#del t\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreateTable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1000\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mcreateTable\u001b[0;34m(rowCount)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcreateTable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mgc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m return pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m'weight'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muniform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m'qty'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcreateTable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mgc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m return pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m'weight'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muniform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m'qty'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrowCount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "del t\n", + "t = createTable(100 * 1000 * 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit executeQueryFastAgg(t)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit -n 1 -r 10 executeQueryFastSummarize(t)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%timeit -n 1 -r 10 executeQueryApply(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Row Number 1B" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del t\n", + "t = createTable(1000 * 1000 * 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit -n 1 -r 10 executeQueryFastSummarize(t)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%timeit -n 1 -r 10 executeQueryApply(t)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 73f1540a5d8d4842690c679a4e205068e39b8281 Mon Sep 17 00:00:00 2001 From: Michael Chow Date: Thu, 27 Feb 2020 20:18:21 -0500 Subject: [PATCH 2/2] remove unused cell from siuba --- siuba.ipynb | 9 --------- 1 file changed, 9 deletions(-) diff --git a/siuba.ipynb b/siuba.ipynb index 3c6562c..1831c0b 100644 --- a/siuba.ipynb +++ b/siuba.ipynb @@ -373,15 +373,6 @@ "t = createTable(100 * 1000 * 1000)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%timeit executeQueryFastAgg(t)" - ] - }, { "cell_type": "code", "execution_count": null,