diff --git a/.binder/Dockerfile b/.binder/Dockerfile
new file mode 100644
index 0000000..dad2424
--- /dev/null
+++ b/.binder/Dockerfile
@@ -0,0 +1,75 @@
+FROM ubuntu:22.04
+
+# Build-time only: suppress interactive apt prompts without baking the
+# setting into the runtime environment of the final image.
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Europe/Berlin
+
+# Define Binder-required user variables
+ARG NB_USER=jovyan
+ARG NB_UID=1000
+ENV USER=${NB_USER}
+ENV NB_UID=${NB_UID}
+ENV HOME=/home/${NB_USER}
+
+# Install necessary system dependencies (sorted alphabetically for diffability).
+# --no-install-recommends keeps the image small; python3-dev is listed explicitly
+# because it was presumably only pulled in as a recommended package before (TODO confirm).
+# The apt lists are removed in the same layer so they do not persist in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    git \
+    libcurl4-openssl-dev \
+    libssl-dev \
+    libxml2-dev \
+    libzmq3-dev \
+    openjdk-11-jre-headless \
+    python3 \
+    python3-dev \
+    python3-pip \
+    r-base \
+    r-base-dev \
+    software-properties-common \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip and install base Python dependencies
+RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
+
+# Install JupyterLab and notebook
+RUN python3 -m pip install --no-cache-dir jupyterlab notebook
+
+# Install IRkernel for R and register the R kernel system-wide
+RUN R -e "install.packages('IRkernel', repos='https://cloud.r-project.org'); IRkernel::installspec(user=FALSE)"
+
+# Install h2o R package. install.packages() only warns when an install fails,
+# so verify the package can be loaded to make a failed install break the build here.
+RUN R -e "install.packages('h2o', repos='https://cloud.r-project.org'); stopifnot(requireNamespace('h2o', quietly=TRUE))"
+
+# Create jovyan user with UID 1000 (Binder requirement)
+RUN adduser --disabled-password \
+    --gecos "Default user" \
+    --uid ${NB_UID} \
+    ${NB_USER}
+
+# Copy repository contents into the user's home directory, owned by that user.
+# COPY --chown avoids a separate `chown -R` layer that would duplicate the files.
+WORKDIR ${HOME}
+COPY --chown=${NB_UID} . ${HOME}
+USER ${NB_USER}
+
+# Ensure the PATH includes the correct Python location
+ENV PATH="${HOME}/.local/bin:${PATH}"
+
+# Install jumper_wrapper_kernel and jumper_ipython_extension from GitHub (user-level install)
+RUN python3 -m pip install --user --no-cache-dir \
+    "git+https://github.com/ScaDS/jumper_wrapper_kernel.git@main" \
+    "git+https://github.com/ScaDS/jumper_ipython_extension.git@main"
+
+# Register the wrapper kernel spec for jovyan (user-level, avoids /usr permissions)
+RUN python3 -m jumper_wrapper_kernel.install install
+
+# Expose Jupyter Notebook port
+EXPOSE 8888
+
+# Set the default command for running Jupyter Notebook
+CMD ["jupyter", "notebook", "--NotebookApp.default_url=/lab", "--ip=0.0.0.0", "--port=8888", "--no-browser"]
diff --git a/README.md b/README.md index 31caeab..6c001b5 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,12 @@ y = np.dot(x, x.T) ### Launch the demo interactively -[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ScaDS/jumper_wrapper_kernel/feature/binder?urlpath=%2Fdoc%2Ftree%2Fdemos%2Fnew_R_wrapping.ipynb) +- **How to Wrap a Kernel: Basic R Kernel Example**\ +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ScaDS/jumper_wrapper_kernel/main?urlpath=%2Fdoc%2Ftree%2Fdemos%2Fnew_R_wrapping.ipynb) + + +- **H2O-Wrapped Tutorial**\ +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ScaDS/jumper_wrapper_kernel/main?urlpath=%2Fdoc%2Ftree%2Fdemos%2Fh2o-wrapper-tutorial.ipynb) ## How It Works diff --git a/demos/.ipynb_checkpoints/h2o-wrapper-tutorial-checkpoint.ipynb b/demos/.ipynb_checkpoints/h2o-wrapper-tutorial-checkpoint.ipynb new file mode 100644 index 0000000..e34f1a9 --- /dev/null +++ b/demos/.ipynb_checkpoints/h2o-wrapper-tutorial-checkpoint.ipynb @@ -0,0 +1,2654 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Jumper Performance 
Monitoring on Wrapped R Kernel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List & Wrap a new R kernel" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available Jupyter Kernels:\n", + "--------------------------------------------------\n", + " ir: R (R)\n", + " scorep_jupyter: Score-P_Python (python)\n", + " h2o_r: Jumper Wrapper (h2o_r) (R)\n", + " jumper_wrapper: Jumper Wrapper Kernel (python)\n", + " python3: Python 3 (ipykernel) (python)\n" + ] + } + ], + "source": [ + "%list_kernels" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (typeof Jupyter !== 'undefined' && Jupyter.notebook) {\n", + " // JupyterLab classic notebook\n", + " Jupyter.notebook.kernel.kernel_info(function(reply) {\n", + " if (reply.content && reply.content.language_info) {\n", + " Jupyter.notebook.metadata.language_info = reply.content.language_info;\n", + " // Trigger CodeMirror mode change for all cells\n", + " var mode = reply.content.language_info.codemirror_mode || reply.content.language_info.name;\n", + " Jupyter.notebook.get_cells().forEach(function(cell) {\n", + " if (cell.cell_type === 'code') {\n", + " cell.code_mirror.setOption('mode', mode);\n", + " }\n", + " });\n", + " }\n", + " });\n", + " }\n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully wrapped kernel: ir\n", + "Hint: Refresh the page (without restarting the kernel) to enable syntax highlighting for the wrapped language.\n", + "Created permanent kernel 'h2o_r' that auto-wraps 'ir'.\n" + ] + } + ], + "source": [ + "%wrap_kernel ir --save h2o_r" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance Monitoring Setup" + ] + }, + { + 
"cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[JUmPER]: Performance monitoring started (PID: 8283, Interval: 1.0s)\n" + ] + } + ], + "source": [ + "%perfmonitor_start 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# H2O Tutorial: EEG Eye State Classification\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Author: Erin LeDell\n", + "\n", + "Contact: erin@h2o.ai\n", + "\n", + "This tutorial steps through a quick introduction to H2O's R API. The goal of this tutorial is to introduce through a complete example H2O's capabilities from R. \n", + "\n", + "Most of the functionality for R's `data.frame` is exactly the same syntax for an `H2OFrame`, so if you are comfortable with R, data frame manipulation will come naturally to you in H2O. The modeling syntax in the H2O R API may also remind you of other machine learning packages in R.\n", + "\n", + "References: [H2O R API documentation](http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Rdoc.html), the [H2O Documentation landing page](http://www.h2o.ai/docs/) and [H2O general documentation](http://h2o-release.s3.amazonaws.com/h2o/latest_stable_doc.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install H2O in R\n", + "\n", + "### Prerequisites\n", + "\n", + "This tutorial assumes you have R installed. The `h2o` R package has a few dependencies which can be installed using CRAN. The packages that are required (which also have their own dependencies) can be installed in R as follows:\n", + "```r\n", + "pkgs <- c(\"methods\",\"statmod\",\"stats\",\"graphics\",\"RCurl\",\"jsonlite\",\"tools\",\"utils\")\n", + "for (pkg in pkgs) {\n", + " if (! 
(pkg %in% rownames(installed.packages()))) { install.packages(pkg) }\n", + "}\n", + "```\n", + "\n", + "### Install h2o\n", + "\n", + "Once the dependencies are installed, you can install H2O. We will use the latest stable version of the `h2o` R package, which at the time of writing is H2O v3.8.0.4 (aka \"Tukey-4\"). The latest stable version can be installed using the commands on the [H2O R Installation](http://www.h2o.ai/download/h2o/r) page." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start up an H2O cluster\n", + "\n", + "After the R package is installed, we can start up an H2O cluster. In a R terminal, we load the `h2o` package and start up an H2O cluster as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----------------------------------------------------------------------\n", + "\n", + "Your next step is to start H2O:\n", + " > h2o.init()\n", + "\n", + "For H2O package documentation, ask for help:\n", + " > ??h2o\n", + "\n", + "After starting H2O, you can use the Web UI at http://localhost:54321\n", + "For more information visit https://docs.h2o.ai\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "\n", + "\n", + "Attaching package: ‘h2o’\n", + "\n", + "\n", + "The following objects are masked from ‘package:stats’:\n", + "\n", + " cor, sd, var\n", + "\n", + "\n", + "The following objects are masked from ‘package:base’:\n", + "\n", + " &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,\n", + " colnames<-, ifelse, is.character, is.factor, is.numeric, log,\n", + " log10, log1p, log2, round, signif, trunc\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading in config file: ./../../.h2oconfig\n", + " Connection successful!\n", 
+ "\n", + "R is connected to the H2O cluster: \n", + " H2O cluster uptime: 4 hours 37 minutes \n", + " H2O cluster timezone: Europe/Berlin \n", + " H2O data parsing timezone: UTC \n", + " H2O cluster version: 3.44.0.3 \n", + " H2O cluster version age: 2 years, 1 month and 29 days \n", + " H2O cluster name: ub \n", + " H2O cluster total nodes: 1 \n", + " H2O cluster total memory: 2.14 GB \n", + " H2O cluster total cores: 12 \n", + " H2O cluster allowed cores: 12 \n", + " H2O cluster healthy: TRUE \n", + " H2O Connection ip: 172.26.185.80 \n", + " H2O Connection port: 54321 \n", + " H2O Connection proxy: NA \n", + " H2O Internal Security: FALSE \n", + " R Version: R version 4.1.2 (2021-11-01) \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning message in h2o.clusterInfo():\n", + "“\n", + "Your H2O cluster version is (2 years, 1 month and 29 days) old. There may be a newer version available.\n", + "Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html”\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "library(h2o)\n", + "\n", + "# Start an H2O Cluster on your local machine\n", + "h2o.init(ip=\"172.26.185.80\", port=54321, startH2O = FALSE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you already have an H2O cluster running that you'd like to connect to (for example, in a multi-node Hadoop environment), then you can specify the IP and port of that cluster as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# This will not actually do anything since it's a fake IP address\n", + "# h2o.init(ip=\"123.45.67.89\", port=54321)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download EEG Data" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "The following code downloads a copy of the [EEG Eye State](http://archive.ics.uci.edu/ml/datasets/EEG+Eye+State#) dataset. All data is from one continuous EEG measurement with the [Emotiv EEG Neuroheadset](https://emotiv.com/epoc.php). The duration of the measurement was 117 seconds. The eye state was detected via a camera during the EEG measurement and added later manually to the file after analysing the video frames. '1' indicates the eye-closed and '0' the eye-open state. All values are in chronological order with the first measured value at the top of the data.\n", + "\n", + "![Emotiv Headset](http://dissociatedpress.com/wp-content/uploads/2013/03/emotiv-490.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can import the data directly into H2O using the `h2o.importFile` function in the R API. The import path can be a URL, a local path, a path to an HDFS file, or a file on Amazon S3." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " |======================================================================| 100%\n" + ] + } + ], + "source": [ + "#csv_url <- \"http://www.stat.berkeley.edu/~ledell/data/eeg_eyestate_splits.csv\"\n", + "csv_url <- \"https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate_splits.csv\"\n", + "data <- h2o.importFile(csv_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore Data\n", + "Once we have loaded the data, let's take a quick look. First the dimension of the frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 14980
  2. 16
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 14980\n", + "\\item 16\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 14980\n", + "2. 16\n", + "\n", + "\n" + ], + "text/plain": [ + "[1] 14980 16" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dim(data)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's take a look at the top of the frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "
A data.frame: 6 × 16
AF3F7F3FC5T7P7O1O2P8T8FC6F4F8AF4eyeDetectionsplit
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><fct>
14329.234009.234289.234148.214350.264586.154096.924641.034222.054238.464211.284280.514635.904393.850valid
24324.624004.624293.854148.724342.054586.674097.444638.974210.774226.674207.694279.494632.824384.100test
34327.694006.674295.384156.414336.924583.594096.924630.264207.694222.054206.674282.054628.724389.230train
44328.724011.794296.414155.904343.594582.564097.444630.774217.444235.384210.774287.694632.314396.410train
54326.154011.794292.314151.284347.694586.674095.904627.694210.774244.104212.824288.214632.824398.460train
64321.034004.624284.104153.334345.644587.184093.334616.924202.564232.824209.744281.034628.214389.740train
\n" + ], + "text/latex": [ + "A data.frame: 6 × 16\n", + "\\begin{tabular}{r|llllllllllllllll}\n", + " & AF3 & F7 & F3 & FC5 & T7 & P7 & O1 & O2 & P8 & T8 & FC6 & F4 & F8 & AF4 & eyeDetection & split\\\\\n", + " & & & & & & & & & & & & & & & & \\\\\n", + "\\hline\n", + "\t1 & 4329.23 & 4009.23 & 4289.23 & 4148.21 & 4350.26 & 4586.15 & 4096.92 & 4641.03 & 4222.05 & 4238.46 & 4211.28 & 4280.51 & 4635.90 & 4393.85 & 0 & valid\\\\\n", + "\t2 & 4324.62 & 4004.62 & 4293.85 & 4148.72 & 4342.05 & 4586.67 & 4097.44 & 4638.97 & 4210.77 & 4226.67 & 4207.69 & 4279.49 & 4632.82 & 4384.10 & 0 & test \\\\\n", + "\t3 & 4327.69 & 4006.67 & 4295.38 & 4156.41 & 4336.92 & 4583.59 & 4096.92 & 4630.26 & 4207.69 & 4222.05 & 4206.67 & 4282.05 & 4628.72 & 4389.23 & 0 & train\\\\\n", + "\t4 & 4328.72 & 4011.79 & 4296.41 & 4155.90 & 4343.59 & 4582.56 & 4097.44 & 4630.77 & 4217.44 & 4235.38 & 4210.77 & 4287.69 & 4632.31 & 4396.41 & 0 & train\\\\\n", + "\t5 & 4326.15 & 4011.79 & 4292.31 & 4151.28 & 4347.69 & 4586.67 & 4095.90 & 4627.69 & 4210.77 & 4244.10 & 4212.82 & 4288.21 & 4632.82 & 4398.46 & 0 & train\\\\\n", + "\t6 & 4321.03 & 4004.62 & 4284.10 & 4153.33 & 4345.64 & 4587.18 & 4093.33 & 4616.92 & 4202.56 & 4232.82 & 4209.74 & 4281.03 & 4628.21 & 4389.74 & 0 & train\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 6 × 16\n", + "\n", + "| | AF3 <dbl> | F7 <dbl> | F3 <dbl> | FC5 <dbl> | T7 <dbl> | P7 <dbl> | O1 <dbl> | O2 <dbl> | P8 <dbl> | T8 <dbl> | FC6 <dbl> | F4 <dbl> | F8 <dbl> | AF4 <dbl> | eyeDetection <dbl> | split <fct> |\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "| 1 | 4329.23 | 4009.23 | 4289.23 | 4148.21 | 4350.26 | 4586.15 | 4096.92 | 4641.03 | 4222.05 | 4238.46 | 4211.28 | 4280.51 | 4635.90 | 4393.85 | 0 | valid |\n", + "| 2 | 4324.62 | 4004.62 | 4293.85 | 4148.72 | 4342.05 | 4586.67 | 4097.44 | 4638.97 | 4210.77 | 4226.67 | 4207.69 | 4279.49 | 4632.82 | 4384.10 | 0 | test |\n", + "| 3 | 4327.69 | 4006.67 | 
4295.38 | 4156.41 | 4336.92 | 4583.59 | 4096.92 | 4630.26 | 4207.69 | 4222.05 | 4206.67 | 4282.05 | 4628.72 | 4389.23 | 0 | train |\n", + "| 4 | 4328.72 | 4011.79 | 4296.41 | 4155.90 | 4343.59 | 4582.56 | 4097.44 | 4630.77 | 4217.44 | 4235.38 | 4210.77 | 4287.69 | 4632.31 | 4396.41 | 0 | train |\n", + "| 5 | 4326.15 | 4011.79 | 4292.31 | 4151.28 | 4347.69 | 4586.67 | 4095.90 | 4627.69 | 4210.77 | 4244.10 | 4212.82 | 4288.21 | 4632.82 | 4398.46 | 0 | train |\n", + "| 6 | 4321.03 | 4004.62 | 4284.10 | 4153.33 | 4345.64 | 4587.18 | 4093.33 | 4616.92 | 4202.56 | 4232.82 | 4209.74 | 4281.03 | 4628.21 | 4389.74 | 0 | train |\n", + "\n" + ], + "text/plain": [ + " AF3 F7 F3 FC5 T7 P7 O1 O2 P8 \n", + "1 4329.23 4009.23 4289.23 4148.21 4350.26 4586.15 4096.92 4641.03 4222.05\n", + "2 4324.62 4004.62 4293.85 4148.72 4342.05 4586.67 4097.44 4638.97 4210.77\n", + "3 4327.69 4006.67 4295.38 4156.41 4336.92 4583.59 4096.92 4630.26 4207.69\n", + "4 4328.72 4011.79 4296.41 4155.90 4343.59 4582.56 4097.44 4630.77 4217.44\n", + "5 4326.15 4011.79 4292.31 4151.28 4347.69 4586.67 4095.90 4627.69 4210.77\n", + "6 4321.03 4004.62 4284.10 4153.33 4345.64 4587.18 4093.33 4616.92 4202.56\n", + " T8 FC6 F4 F8 AF4 eyeDetection split\n", + "1 4238.46 4211.28 4280.51 4635.90 4393.85 0 valid\n", + "2 4226.67 4207.69 4279.49 4632.82 4384.10 0 test \n", + "3 4222.05 4206.67 4282.05 4628.72 4389.23 0 train\n", + "4 4235.38 4210.77 4287.69 4632.31 4396.41 0 train\n", + "5 4244.10 4212.82 4288.21 4632.82 4398.46 0 train\n", + "6 4232.82 4209.74 4281.03 4628.21 4389.74 0 train" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "head(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first 14 columns are numeric values that represent EEG measurements from the headset. The \"eyeDetection\" column is the response. 
There is an additional column called \"split\" that was added (by me) in order to specify partitions of the data (so we can easily benchmark against other tools outside of H2O using the same splits). I randomly divided the dataset into three partitions: train (60%), valid (20%) and test (20%) and marked which split each row belongs to in the \"split\" column.\n", + "\n", + "Let's take a look at the column names. The data contains the 14 EEG channel measurements plus the \"eyeDetection\" and \"split\" columns." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 'AF3'
  2. 'F7'
  3. 'F3'
  4. 'FC5'
  5. 'T7'
  6. 'P7'
  7. 'O1'
  8. 'O2'
  9. 'P8'
  10. 'T8'
  11. 'FC6'
  12. 'F4'
  13. 'F8'
  14. 'AF4'
  15. 'eyeDetection'
  16. 'split'
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 'AF3'\n", + "\\item 'F7'\n", + "\\item 'F3'\n", + "\\item 'FC5'\n", + "\\item 'T7'\n", + "\\item 'P7'\n", + "\\item 'O1'\n", + "\\item 'O2'\n", + "\\item 'P8'\n", + "\\item 'T8'\n", + "\\item 'FC6'\n", + "\\item 'F4'\n", + "\\item 'F8'\n", + "\\item 'AF4'\n", + "\\item 'eyeDetection'\n", + "\\item 'split'\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 'AF3'\n", + "2. 'F7'\n", + "3. 'F3'\n", + "4. 'FC5'\n", + "5. 'T7'\n", + "6. 'P7'\n", + "7. 'O1'\n", + "8. 'O2'\n", + "9. 'P8'\n", + "10. 'T8'\n", + "11. 'FC6'\n", + "12. 'F4'\n", + "13. 'F8'\n", + "14. 'AF4'\n", + "15. 'eyeDetection'\n", + "16. 'split'\n", + "\n", + "\n" + ], + "text/plain": [ + " [1] \"AF3\" \"F7\" \"F3\" \"FC5\" \"T7\" \n", + " [6] \"P7\" \"O1\" \"O2\" \"P8\" \"T8\" \n", + "[11] \"FC6\" \"F4\" \"F8\" \"AF4\" \"eyeDetection\"\n", + "[16] \"split\" " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "names(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To select a subset of the columns to look at, typical R data.frame indexing applies:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "
A data.frame: 6 × 3
AF3eyeDetectionsplit
<dbl><dbl><fct>
14329.230valid
24324.620test
34327.690train
44328.720train
54326.150train
64321.030train
\n" + ], + "text/latex": [ + "A data.frame: 6 × 3\n", + "\\begin{tabular}{r|lll}\n", + " & AF3 & eyeDetection & split\\\\\n", + " & & & \\\\\n", + "\\hline\n", + "\t1 & 4329.23 & 0 & valid\\\\\n", + "\t2 & 4324.62 & 0 & test \\\\\n", + "\t3 & 4327.69 & 0 & train\\\\\n", + "\t4 & 4328.72 & 0 & train\\\\\n", + "\t5 & 4326.15 & 0 & train\\\\\n", + "\t6 & 4321.03 & 0 & train\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 6 × 3\n", + "\n", + "| | AF3 <dbl> | eyeDetection <dbl> | split <fct> |\n", + "|---|---|---|---|\n", + "| 1 | 4329.23 | 0 | valid |\n", + "| 2 | 4324.62 | 0 | test |\n", + "| 3 | 4327.69 | 0 | train |\n", + "| 4 | 4328.72 | 0 | train |\n", + "| 5 | 4326.15 | 0 | train |\n", + "| 6 | 4321.03 | 0 | train |\n", + "\n" + ], + "text/plain": [ + " AF3 eyeDetection split\n", + "1 4329.23 0 valid\n", + "2 4324.62 0 test \n", + "3 4327.69 0 train\n", + "4 4328.72 0 train\n", + "5 4326.15 0 train\n", + "6 4321.03 0 train" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "columns <- c('AF3', 'eyeDetection', 'split')\n", + "head(data[columns])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's select a single column, for example -- the response column, and look at the data more closely:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " eyeDetection\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + "5 0\n", + "6 0\n", + "\n", + "[14980 rows x 1 column] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "y <- 'eyeDetection'\n", + "data[y]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It looks like a binary response, but let's validate that assumption:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + 
"jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " C1\n", + "1 0\n", + "2 1\n", + "\n", + "[2 rows x 1 column] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.unique(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you don't specify the column types when you import the file, H2O makes a guess at what your column types are. If there are 0's and 1's in a column, H2O will automatically parse that as numeric by default. \n", + "\n", + "Therefore, we should convert the response column to a more efficient \"factor\" representation (called \"enum\" in Java) -- in this case it is a categorical variable with two levels, 0 and 1. If the only column in my data that is categorical is the response, I typically don't bother specifying the column type during the parse, and instead use this one-liner to convert it afterwards:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "data[y] <- as.factor(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can check that there are two levels in our response column:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "2" + ], + "text/latex": [ + "2" + ], + "text/markdown": [ + "2" + ], + "text/plain": [ + "[1] 2" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.nlevels(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can query the categorical \"levels\" as well ('0' and '1' stand for \"eye open\" and \"eye closed\") to see what they are:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + 
"collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. '0'
  2. '1'
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item '0'\n", + "\\item '1'\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. '0'\n", + "2. '1'\n", + "\n", + "\n" + ], + "text/plain": [ + "[1] \"0\" \"1\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.levels(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We may want to check if there are any missing values, so let's look for NAs in our dataset. For all the supervised H2O algorithms, H2O will handle missing values automatically, so it's not a problem if we are missing certain feature values. However, it is always a good idea to check to make sure that you are not missing any of the training labels. \n", + "\n", + "To figure out which, if any, values are missing, we can use the `h2o.nacnt` (NA count) method on any H2OFrame (or column). The columns in an H2O Frame are also H2O Frames themselves, so all the methods that apply to an H2OFrame also apply to a single column." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "0" + ], + "text/latex": [ + "0" + ], + "text/markdown": [ + "0" + ], + "text/plain": [ + "[1] 0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.nacnt(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great, no missing labels. :-)\n", + "\n", + "Out of curiosity, let's see if there is any missing data in any of the columns of this frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 0
  2. 0
  3. 0
  4. 0
  5. 0
  6. 0
  7. 0
  8. 0
  9. 0
  10. 0
  11. 0
  12. 0
  13. 0
  14. 0
  15. 0
  16. 0
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 0\n", + "2. 0\n", + "3. 0\n", + "4. 0\n", + "5. 0\n", + "6. 0\n", + "7. 0\n", + "8. 0\n", + "9. 0\n", + "10. 0\n", + "11. 0\n", + "12. 0\n", + "13. 0\n", + "14. 0\n", + "15. 0\n", + "16. 0\n", + "\n", + "\n" + ], + "text/plain": [ + " [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.nacnt(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each column returns a zero, so there are no missing values in any of the columns." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next thing I may wonder about in a binary classification problem is the distribution of the response in the training data. Is one of the two outcomes under-represented in the training set? Many real datasets have what's called an \"imbalance\" problem, where one of the classes has far fewer training examples than the other class. Let's take a look at the distribution:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " eyeDetection Count\n", + "1 0 8257\n", + "2 1 6723\n", + "\n", + "[2 rows x 2 columns] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.table(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ok, the data is not exactly evenly distributed between the two classes -- there are more 0's than 1's in the dataset. 
However, this level of imbalance shouldn't be much of an issue for the machine learning algorithms. (We will revisit this later in the modeling section below).\n", + "\n", + "Let's calculate the percentage that each class represents:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " Count\n", + "1 0.5512016\n", + "2 0.4487984\n", + "\n", + "[2 rows x 1 column] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "n <- nrow(data) # Total number of training samples\n", + "h2o.table(data[y])['Count']/n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split H2O Frame into a train and test set\n", + "\n", + "So far we have explored the original dataset (all rows). For the machine learning portion of this tutorial, we will break the dataset into three parts: a training set, validation set and a test set.\n", + "\n", + "If you want H2O to do the splitting for you, you can use the `h2o.splitFrame` function. However, we have explicit splits that we want (for reproducibility reasons), so we can just subset the Frame to get the partitions we want. 
\n", + "\n", + "Subset the `data` H2O Frame on the \"split\" column:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "8988" + ], + "text/latex": [ + "8988" + ], + "text/markdown": [ + "8988" + ], + "text/plain": [ + "[1] 8988" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "train <- data[data['split']==\"train\",]\n", + "nrow(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "2996" + ], + "text/latex": [ + "2996" + ], + "text/markdown": [ + "2996" + ], + "text/plain": [ + "[1] 2996" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "valid <- data[data['split']==\"valid\",]\n", + "nrow(valid)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "2996" + ], + "text/latex": [ + "2996" + ], + "text/markdown": [ + "2996" + ], + "text/plain": [ + "[1] 2996" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "test <- data[data['split']==\"test\",]\n", + "nrow(test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Machine Learning in H2O\n", + "\n", + "We will do a quick demo of the H2O software using a Gradient Boosting Machine (GBM). The goal of this problem is to train a model to predict eye state (open vs closed) from EEG data. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train and Test a GBM model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the steps above, we have already created the training set and validation set, so the next step is to specify the predictor set and response variable.\n", + "\n", + "#### Specify the predictor set and response\n", + "\n", + "As with any machine learning algorithm, we need to specify the response and predictor columns in the training set. \n", + "\n", + "The `x` argument should be a vector of predictor names in the training frame, and `y` specifies the response column. We have already set `y <- \"eyeDetection\"` above, but we still need to specify `x`." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 'AF3'
  2. 'F7'
  3. 'F3'
  4. 'FC5'
  5. 'T7'
  6. 'P7'
  7. 'O1'
  8. 'O2'
  9. 'P8'
  10. 'T8'
  11. 'FC6'
  12. 'F4'
  13. 'F8'
  14. 'AF4'
  15. 'eyeDetection'
  16. 'split'
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 'AF3'\n", + "\\item 'F7'\n", + "\\item 'F3'\n", + "\\item 'FC5'\n", + "\\item 'T7'\n", + "\\item 'P7'\n", + "\\item 'O1'\n", + "\\item 'O2'\n", + "\\item 'P8'\n", + "\\item 'T8'\n", + "\\item 'FC6'\n", + "\\item 'F4'\n", + "\\item 'F8'\n", + "\\item 'AF4'\n", + "\\item 'eyeDetection'\n", + "\\item 'split'\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 'AF3'\n", + "2. 'F7'\n", + "3. 'F3'\n", + "4. 'FC5'\n", + "5. 'T7'\n", + "6. 'P7'\n", + "7. 'O1'\n", + "8. 'O2'\n", + "9. 'P8'\n", + "10. 'T8'\n", + "11. 'FC6'\n", + "12. 'F4'\n", + "13. 'F8'\n", + "14. 'AF4'\n", + "15. 'eyeDetection'\n", + "16. 'split'\n", + "\n", + "\n" + ], + "text/plain": [ + " [1] \"AF3\" \"F7\" \"F3\" \"FC5\" \"T7\" \n", + " [6] \"P7\" \"O1\" \"O2\" \"P8\" \"T8\" \n", + "[11] \"FC6\" \"F4\" \"F8\" \"AF4\" \"eyeDetection\"\n", + "[16] \"split\" " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "names(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 'AF3'
  2. 'F7'
  3. 'F3'
  4. 'FC5'
  5. 'T7'
  6. 'P7'
  7. 'O1'
  8. 'O2'
  9. 'P8'
  10. 'T8'
  11. 'FC6'
  12. 'F4'
  13. 'F8'
  14. 'AF4'
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 'AF3'\n", + "\\item 'F7'\n", + "\\item 'F3'\n", + "\\item 'FC5'\n", + "\\item 'T7'\n", + "\\item 'P7'\n", + "\\item 'O1'\n", + "\\item 'O2'\n", + "\\item 'P8'\n", + "\\item 'T8'\n", + "\\item 'FC6'\n", + "\\item 'F4'\n", + "\\item 'F8'\n", + "\\item 'AF4'\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 'AF3'\n", + "2. 'F7'\n", + "3. 'F3'\n", + "4. 'FC5'\n", + "5. 'T7'\n", + "6. 'P7'\n", + "7. 'O1'\n", + "8. 'O2'\n", + "9. 'P8'\n", + "10. 'T8'\n", + "11. 'FC6'\n", + "12. 'F4'\n", + "13. 'F8'\n", + "14. 'AF4'\n", + "\n", + "\n" + ], + "text/plain": [ + " [1] \"AF3\" \"F7\" \"F3\" \"FC5\" \"T7\" \"P7\" \"O1\" \"O2\" \"P8\" \"T8\" \"FC6\" \"F4\" \n", + "[13] \"F8\" \"AF4\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "x <- setdiff(names(train), c(\"eyeDetection\", \"split\")) #Remove the 13th and 14th columns\n", + "x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## GPU-Bound Code Example" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " |======================================================================| 100%\n" + ] + } + ], + "source": [ + "# hyper_params <- list(\n", + "# max_depth = c(6, 10, 12),\n", + "# learn_rate = c(0.03, 0.01),\n", + "# sample_rate = c(0.8)\n", + "# )\n", + "\n", + "hyper_params <- list(\n", + " max_depth = c(6),\n", + " learn_rate = c(0.03),\n", + " sample_rate = c(0.8)\n", + ")\n", + "\n", + "grid_id <- paste0(\"xgb_gpu_grid_\", as.integer(Sys.time()))\n", + "gs <- h2o.grid(\n", + " algorithm = \"xgboost\",\n", + " grid_id = grid_id,\n", + " x = x, y = y,\n", + " training_frame = train,\n", + " validation_frame = valid,\n", + " hyper_params = hyper_params,\n", + " backend = \"gpu\",\n", + " tree_method = \"hist\",\n", + " ntrees = 2000,\n", + " stopping_rounds = 0\n", + ")" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " JUmPER Performance Report\n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
JUmPER Performance Report
\n", + "
\n", + "
\n", + " \n", + " Duration\n", + " 22.73s\n", + " \n", + " \n", + " \n", + " Cells\n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetricAVGMINMAXTotal/Limit
CPU Util (Across 12 CPUs)12.923.1626.38-
Memory (GB)3.383.373.4115.58
GPU Util (Across 1 GPUs)32.487.0060.00-
GPU Memory (GB)0.000.000.0012.00
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%perfmonitor_perfreport --level user" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "38665ca1771945959671251c9ba2b6ce", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(HTML(value='Plot Configuration:'), Checkbox(value=False, description='Sho…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%perfmonitor_plot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have specified `x` and `y`, we can train the GBM model using a few non-default model parameters. Since we are predicting a binary response, we set `distribution = \"bernoulli\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " |======================================================================| 100%\n" + ] + } + ], + "source": [ + "model <- h2o.gbm(x = x, y = y,\n", + " training_frame = train,\n", + " validation_frame = valid,\n", + " distribution = \"bernoulli\",\n", + " ntrees = 100,\n", + " max_depth = 4,\n", + " learn_rate = 0.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print Performance Report" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " JUmPER Performance Report\n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
JUmPER Performance Report
\n", + "
\n", + "
\n", + " \n", + " Duration\n", + " 2.50s\n", + " \n", + " \n", + " \n", + " Cells\n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetricAVGMINMAXTotal/Limit
CPU Util (Across 12 CPUs)30.1726.2734.07-
Memory (GB)3.363.353.3715.58
GPU Util (Across 1 GPUs)0.500.001.00-
GPU Memory (GB)0.000.000.0012.00
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%perfmonitor_perfreport --level user" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect Model\n", + "\n", + "The type of results shown when you print a model are determined by the following:\n", + "- Model class of the estimator (e.g. GBM, RF, GLM, DL)\n", + "- The type of machine learning problem (e.g. binary classification, multiclass classification, regression)\n", + "- The data you specify (e.g. `training_frame` only, `training_frame` and `validation_frame`, or `training_frame` and `nfolds`)\n", + "\n", + "Below, we see a GBM Model Summary, as well as training and validation metrics since we supplied a `validation_frame`. Since this is a binary classification task, we are shown the relevant performance metrics, which include: MSE, R^2, LogLoss, AUC and Gini. Also, we are shown a Confusion Matrix, where the threshold for classification is chosen automatically (by H2O) as the threshold which maximizes the F1 score.\n", + "\n", + "The scoring history is also printed, which shows the performance metrics over some increment such as \"number of trees\" in the case of GBM and RF.\n", + "\n", + "Lastly, for tree-based methods (GBM and RF), we also print variable importance."
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Details:\n", + "==============\n", + "\n", + "H2OBinomialModel: gbm\n", + "Model ID: GBM_model_R_1771497737180_221 \n", + "Model Summary: \n", + " number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n", + "1 100 100 24848 4\n", + " max_depth mean_depth min_leaves max_leaves mean_leaves\n", + "1 4 4.00000 12 16 15.17000\n", + "\n", + "\n", + "H2OBinomialMetrics: gbm\n", + "** Reported on training data. **\n", + "\n", + "MSE: 0.1076065\n", + "RMSE: 0.3280343\n", + "LogLoss: 0.3600893\n", + "Mean Per-Class Error: 0.1300826\n", + "AUC: 0.9464722\n", + "AUCPR: 0.9406318\n", + "Gini: 0.8929444\n", + "R^2: 0.5657448\n", + "\n", + "Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n", + " 0 1 Error Rate\n", + "0 4330 586 0.119203 =586/4916\n", + "1 574 3498 0.140963 =574/4072\n", + "Totals 4904 4084 0.129061 =1160/8988\n", + "\n", + "Maximum Metrics: Maximum metrics at their respective thresholds\n", + " metric threshold value idx\n", + "1 max f1 0.463229 0.857773 199\n", + "2 max f2 0.306172 0.899687 260\n", + "3 max f0point5 0.582230 0.882353 154\n", + "4 max accuracy 0.463229 0.870939 199\n", + "5 max precision 0.990029 1.000000 0\n", + "6 max recall 0.062069 1.000000 380\n", + "7 max specificity 0.990029 1.000000 0\n", + "8 max absolute_mcc 0.463229 0.739650 199\n", + "9 max min_per_class_accuracy 0.448748 0.868999 204\n", + "10 max mean_per_class_accuracy 0.463229 0.869917 199\n", + "11 max tns 0.990029 4916.000000 0\n", + "12 max fns 0.990029 4071.000000 0\n", + "13 max fps 0.014820 4916.000000 399\n", + "14 max tps 0.062069 4072.000000 380\n", + "15 max tnr 0.990029 1.000000 0\n", + "16 max fnr 0.990029 0.999754 0\n", + "17 max fpr 0.014820 1.000000 399\n", + "18 max tpr 0.062069 
1.000000 380\n", + "\n", + "Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n", + "H2OBinomialMetrics: gbm\n", + "** Reported on validation data. **\n", + "** Validation metrics **\n", + "\n", + "MSE: 0.1200593\n", + "RMSE: 0.3464957\n", + "LogLoss: 0.3894168\n", + "Mean Per-Class Error: 0.1542851\n", + "AUC: 0.9239379\n", + "AUCPR: 0.9173234\n", + "Gini: 0.8478758\n", + "R^2: 0.5157124\n", + "\n", + "Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n", + " 0 1 Error Rate\n", + "0 1414 221 0.135168 =221/1635\n", + "1 236 1125 0.173402 =236/1361\n", + "Totals 1650 1346 0.152537 =457/2996\n", + "\n", + "Maximum Metrics: Maximum metrics at their respective thresholds\n", + " metric threshold value idx\n", + "1 max f1 0.482571 0.831178 200\n", + "2 max f2 0.329543 0.887175 262\n", + "3 max f0point5 0.606576 0.850985 152\n", + "4 max accuracy 0.482571 0.847463 200\n", + "5 max precision 0.978424 1.000000 0\n", + "6 max recall 0.084627 1.000000 373\n", + "7 max specificity 0.978424 1.000000 0\n", + "8 max absolute_mcc 0.482571 0.692104 200\n", + "9 max min_per_class_accuracy 0.458052 0.839089 210\n", + "10 max mean_per_class_accuracy 0.482571 0.845715 200\n", + "11 max tns 0.978424 1635.000000 0\n", + "12 max fns 0.978424 1358.000000 0\n", + "13 max fps 0.012874 1635.000000 399\n", + "14 max tps 0.084627 1361.000000 373\n", + "15 max tnr 0.978424 1.000000 0\n", + "16 max fnr 0.978424 0.997796 0\n", + "17 max fpr 0.012874 1.000000 399\n", + "18 max tpr 0.084627 1.000000 373\n", + "\n", + "Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n", + "\n" + ] + } + ], + "source": [ + "print(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Performance on a Test Set\n", + "\n", + "Once a model has been trained, you can also use it to make predictions on a test set. 
In the case above, we just ran the model once, so our validation set (passed as `validation_frame`), could have also served as a \"test set.\" We technically have already created test set predictions and evaluated test set performance. \n", + "\n", + "However, when performing model selection over a variety of model parameters, it is common for users to train a variety of models (using different parameters) using the training set, `train`, and a validation set, `valid`. Once the user selects the best model (based on validation set performance), the true test of model performance is performed by making a final set of predictions on the held-out (never been used before) test set, `test`.\n", + "\n", + "You can use the `h2o.performance` function to compute performance metrics on a new dataset. The results are stored in an object of class, `\"H2OBinomialMetrics\"`. " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "'H2OBinomialMetrics'" + ], + "text/latex": [ + "'H2OBinomialMetrics'" + ], + "text/markdown": [ + "'H2OBinomialMetrics'" + ], + "text/plain": [ + "[1] \"H2OBinomialMetrics\"\n", + "attr(,\"package\")\n", + "[1] \"h2o\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "perf <- h2o.performance(model = model, newdata = test)\n", + "class(perf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Individual model performance metrics can be extracted using functions like `h2o.r2`, `h2o.auc` and `h2o.mse`. In the case of binary classification, we may be most interested in evaluating test set Area Under the ROC Curve (AUC). 
" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "0.522888206929954" + ], + "text/latex": [ + "0.522888206929954" + ], + "text/markdown": [ + "0.522888206929954" + ], + "text/plain": [ + "[1] 0.5228882" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.r2(perf)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "0.928588111271663" + ], + "text/latex": [ + "0.928588111271663" + ], + "text/markdown": [ + "0.928588111271663" + ], + "text/plain": [ + "[1] 0.9285881" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.auc(perf)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "0.116978290533459" + ], + "text/latex": [ + "0.116978290533459" + ], + "text/markdown": [ + "0.116978290533459" + ], + "text/plain": [ + "[1] 0.1169783" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.mse(perf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validated Performance\n", + "\n", + "To perform k-fold cross-validation, you use the same code as above, but you specify `nfolds` as an integer greater than 1, or add a \"fold_column\" to your H2O Frame which indicates a fold ID for each row.\n", + "\n", + "Unless you have a specific reason to manually assign the observations to folds, you will find it easiest to simply use the `nfolds` argument.\n", + "\n", + "When performing cross-validation, you can still pass a `validation_frame`, but you can also choose to use the original dataset that 
contains all the rows. We will cross-validate a model below using the original H2O Frame which is called `data`." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " |======================================================================| 100%\n" + ] + } + ], + "source": [ + "cvmodel <- h2o.gbm(x = x, y = y,\n", + " training_frame = train,\n", + " validation_frame = valid,\n", + " distribution = \"bernoulli\",\n", + " ntrees = 100,\n", + " max_depth = 4,\n", + " learn_rate = 0.1,\n", + " nfolds = 5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build a performance report for the last executed cell" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "ename": "ERROR", + "evalue": "Error in parse(text = input): :1:1: unexpected input\n1: %perfmonitor_perfreport\n ^\n", + "output_type": "error", + "traceback": [ + "Error in parse(text = input): :1:1: unexpected input\n1: %perfmonitor_perfreport\n ^\nTraceback:\n" + ] + } + ], + "source": [ + "%perfmonitor_perfreport" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time around, we will simply pull the training and cross-validation metrics out of the model. To do so, you use the `auc` method again, and you can specify `train` or `xval` as `TRUE` to get the correct metric." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "print(h2o.auc(cvmodel, train = TRUE))\n", + "print(h2o.auc(cvmodel, xval = TRUE))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grid Search\n", + "\n", + "One way of evaluating models with different parameters is to perform a grid search over a set of parameter values. For example, in GBM, here are three model parameters that may be useful to search over:\n", + "- `ntrees`: Number of trees\n", + "- `max_depth`: Maximum depth of a tree\n", + "- `learn_rate`: Learning rate in the GBM\n", + "\n", + "We will define a grid as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ntrees_opt <- c(5,50,100)\n", + "max_depth_opt <- c(2,3,5)\n", + "learn_rate_opt <- c(0.1,0.2)\n", + "\n", + "hyper_params = list('ntrees' = ntrees_opt,\n", + " 'max_depth' = max_depth_opt,\n", + " 'learn_rate' = learn_rate_opt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `h2o.grid` function can be used to train a `\"H2OGrid\"` object for any of the H2O algorithms (specified by the `\"algorithm\"` argument)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "gs <- h2o.grid(algorithm = \"gbm\", \n", + " grid_id = \"eeg_demo_gbm_grid\",\n", + " hyper_params = hyper_params,\n", + " x = x, y = y, \n", + " training_frame = train, \n", + " validation_frame = valid)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analyze the algorithm performance with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%perfmonitor_perfreport" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### And we can also plot performance metrics right away!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "357caad54d474883ad88e58eedfacd15", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(HTML(value='Plot Configuration:'), Checkbox(value=False, description='Sho…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%perfmonitor_plot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare Models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "print(gs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, grids of models will return the grid results sorted by (increasing) logloss on the validation set. 
However, if we are interested in sorting on another model performance metric, we can do that using the `h2o.getGrid` function as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# print out the auc for all of the models\n", + "auc_table <- h2o.getGrid(grid_id = \"eeg_demo_gbm_grid\", sort_by = \"auc\", decreasing = TRUE)\n", + "print(auc_table)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The \"best\" model in terms of validation set AUC is listed first in auc_table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "best_model <- h2o.getModel(auc_table@model_ids[[1]])\n", + "h2o.auc(best_model, valid = TRUE) #Validation AUC for best model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last thing we may want to do is generate predictions on the test set using the \"best\" model, and evaluate the test set AUC." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "best_perf <- h2o.performance(model = best_model, newdata = test)\n", + "h2o.auc(best_perf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The test set AUC is approximately 0.97. Not bad!!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Jumper Wrapper Kernel", + "language": "python", + "name": "jumper_wrapper" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/demos/h2o-wrapper-tutorial.ipynb b/demos/h2o-wrapper-tutorial.ipynb new file mode 100644 index 0000000..eaf186e --- /dev/null +++ b/demos/h2o-wrapper-tutorial.ipynb @@ -0,0 +1,2646 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Jumper Performance Monitoring on Wrapped R Kernel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List & Wrap a new R kernel" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available Jupyter Kernels:\n", + "--------------------------------------------------\n", + " ir: R (R)\n", + " scorep_jupyter: Score-P_Python (python)\n", + " h2o_r: Jumper Wrapper (h2o_r) (R)\n", + " jumper_wrapper: Jumper Wrapper Kernel (python)\n", + " python3: Python 3 (ipykernel) (python)\n" + ] + } + ], + "source": [ + "%list_kernels" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (typeof Jupyter !== 'undefined' && Jupyter.notebook) {\n", + " // JupyterLab classic notebook\n", + " Jupyter.notebook.kernel.kernel_info(function(reply) {\n", + " if (reply.content && reply.content.language_info) {\n", + " Jupyter.notebook.metadata.language_info = reply.content.language_info;\n", + " // Trigger CodeMirror mode change for all cells\n", + " var mode = reply.content.language_info.codemirror_mode || reply.content.language_info.name;\n", + " Jupyter.notebook.get_cells().forEach(function(cell) 
{\n", + " if (cell.cell_type === 'code') {\n", + " cell.code_mirror.setOption('mode', mode);\n", + " }\n", + " });\n", + " }\n", + " });\n", + " }\n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully wrapped kernel: ir\n", + "Hint: Refresh the page (without restarting the kernel) to enable syntax highlighting for the wrapped language.\n", + "Created permanent kernel 'h2o_r' that auto-wraps 'ir'.\n" + ] + } + ], + "source": [ + "%wrap_kernel ir --save h2o_r" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance Monitoring Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[JUmPER]: Performance monitoring started (PID: 8283, Interval: 1.0s)\n" + ] + } + ], + "source": [ + "%perfmonitor_start 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# H2O Tutorial: EEG Eye State Classification\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Author: Erin LeDell\n", + "\n", + "Contact: erin@h2o.ai\n", + "\n", + "This tutorial steps through a quick introduction to H2O's R API. The goal of this tutorial is to introduce through a complete example H2O's capabilities from R. \n", + "\n", + "Most of the functionality for R's `data.frame` is exactly the same syntax for an `H2OFrame`, so if you are comfortable with R, data frame manipulation will come naturally to you in H2O. The modeling syntax in the H2O R API may also remind you of other machine learning packages in R.\n", + "\n", + "References: [H2O R API documentation](http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Rdoc.html), the [H2O Documentation landing page](http://www.h2o.ai/docs/) and [H2O general documentation](http://h2o-release.s3.amazonaws.com/h2o/latest_stable_doc.html)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install H2O in R\n", + "\n", + "### Prerequisites\n", + "\n", + "This tutorial assumes you have R installed. The `h2o` R package has a few dependencies which can be installed using CRAN. The packages that are required (which also have their own dependencies) can be installed in R as follows:\n", + "```r\n", + "pkgs <- c(\"methods\",\"statmod\",\"stats\",\"graphics\",\"RCurl\",\"jsonlite\",\"tools\",\"utils\")\n", + "for (pkg in pkgs) {\n", + " if (! (pkg %in% rownames(installed.packages()))) { install.packages(pkg) }\n", + "}\n", + "```\n", + "\n", + "### Install h2o\n", + "\n", + "Once the dependencies are installed, you can install H2O. We will use the latest stable version of the `h2o` R package, which at the time of writing is H2O v3.8.0.4 (aka \"Tukey-4\"). The latest stable version can be installed using the commands on the [H2O R Installation](http://www.h2o.ai/download/h2o/r) page." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start up an H2O cluster\n", + "\n", + "After the R package is installed, we can start up an H2O cluster. 
In a R terminal, we load the `h2o` package and start up an H2O cluster as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----------------------------------------------------------------------\n", + "\n", + "Your next step is to start H2O:\n", + " > h2o.init()\n", + "\n", + "For H2O package documentation, ask for help:\n", + " > ??h2o\n", + "\n", + "After starting H2O, you can use the Web UI at http://localhost:54321\n", + "For more information visit https://docs.h2o.ai\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "\n", + "\n", + "Attaching package: ‘h2o’\n", + "\n", + "\n", + "The following objects are masked from ‘package:stats’:\n", + "\n", + " cor, sd, var\n", + "\n", + "\n", + "The following objects are masked from ‘package:base’:\n", + "\n", + " &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,\n", + " colnames<-, ifelse, is.character, is.factor, is.numeric, log,\n", + " log10, log1p, log2, round, signif, trunc\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading in config file: ./../../.h2oconfig\n", + " Connection successful!\n", + "\n", + "R is connected to the H2O cluster: \n", + " H2O cluster uptime: 4 hours 37 minutes \n", + " H2O cluster timezone: Europe/Berlin \n", + " H2O data parsing timezone: UTC \n", + " H2O cluster version: 3.44.0.3 \n", + " H2O cluster version age: 2 years, 1 month and 29 days \n", + " H2O cluster name: ub \n", + " H2O cluster total nodes: 1 \n", + " H2O cluster total memory: 2.14 GB \n", + " H2O cluster total cores: 12 \n", + " H2O cluster allowed cores: 12 \n", + " H2O cluster healthy: TRUE \n", + " H2O Connection ip: 172.26.185.80 \n", + " H2O Connection port: 54321 \n", + " H2O Connection proxy: NA \n", + " H2O 
Internal Security: FALSE \n", + " R Version: R version 4.1.2 (2021-11-01) \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning message in h2o.clusterInfo():\n", + "“\n", + "Your H2O cluster version is (2 years, 1 month and 29 days) old. There may be a newer version available.\n", + "Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html”\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "library(h2o)\n", + "\n", + "# Start an H2O Cluster\n", + "h2o.init(ip = \"127.0.0.1\", port = 54321, nthreads = -1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you already have an H2O cluster running that you'd like to connect to (for example, in a multi-node Hadoop environment), then you can specify the IP and port of that cluster as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# This will not actually do anything since it's a fake IP address\n", + "# h2o.init(ip=\"123.45.67.89\", port=54321)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download EEG Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following code downloads a copy of the [EEG Eye State](http://archive.ics.uci.edu/ml/datasets/EEG+Eye+State#) dataset. All data is from one continuous EEG measurement with the [Emotiv EEG Neuroheadset](https://emotiv.com/epoc.php). The duration of the measurement was 117 seconds. The eye state was detected via a camera during the EEG measurement and added later manually to the file after analysing the video frames. '1' indicates the eye-closed and '0' the eye-open state. 
All values are in chronological order with the first measured value at the top of the data.\n", + "\n", + "![Emotiv Headset](http://dissociatedpress.com/wp-content/uploads/2013/03/emotiv-490.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can import the data directly into H2O using the `h2o.importFile` function in the R API. The import path can be a URL, a local path, a path to an HDFS file, or a file on Amazon S3." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " |======================================================================| 100%\n" + ] + } + ], + "source": [ + "#csv_url <- \"http://www.stat.berkeley.edu/~ledell/data/eeg_eyestate_splits.csv\"\n", + "csv_url <- \"https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate_splits.csv\"\n", + "data <- h2o.importFile(csv_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore Data\n", + "Once we have loaded the data, let's take a quick look. First the dimension of the frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 14980
  2. 16
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 14980\n", + "\\item 16\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 14980\n", + "2. 16\n", + "\n", + "\n" + ], + "text/plain": [ + "[1] 14980 16" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dim(data)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's take a look at the top of the frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "
A data.frame: 6 × 16
AF3F7F3FC5T7P7O1O2P8T8FC6F4F8AF4eyeDetectionsplit
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><fct>
14329.234009.234289.234148.214350.264586.154096.924641.034222.054238.464211.284280.514635.904393.850valid
24324.624004.624293.854148.724342.054586.674097.444638.974210.774226.674207.694279.494632.824384.100test
34327.694006.674295.384156.414336.924583.594096.924630.264207.694222.054206.674282.054628.724389.230train
44328.724011.794296.414155.904343.594582.564097.444630.774217.444235.384210.774287.694632.314396.410train
54326.154011.794292.314151.284347.694586.674095.904627.694210.774244.104212.824288.214632.824398.460train
64321.034004.624284.104153.334345.644587.184093.334616.924202.564232.824209.744281.034628.214389.740train
\n" + ], + "text/latex": [ + "A data.frame: 6 × 16\n", + "\\begin{tabular}{r|llllllllllllllll}\n", + " & AF3 & F7 & F3 & FC5 & T7 & P7 & O1 & O2 & P8 & T8 & FC6 & F4 & F8 & AF4 & eyeDetection & split\\\\\n", + " & & & & & & & & & & & & & & & & \\\\\n", + "\\hline\n", + "\t1 & 4329.23 & 4009.23 & 4289.23 & 4148.21 & 4350.26 & 4586.15 & 4096.92 & 4641.03 & 4222.05 & 4238.46 & 4211.28 & 4280.51 & 4635.90 & 4393.85 & 0 & valid\\\\\n", + "\t2 & 4324.62 & 4004.62 & 4293.85 & 4148.72 & 4342.05 & 4586.67 & 4097.44 & 4638.97 & 4210.77 & 4226.67 & 4207.69 & 4279.49 & 4632.82 & 4384.10 & 0 & test \\\\\n", + "\t3 & 4327.69 & 4006.67 & 4295.38 & 4156.41 & 4336.92 & 4583.59 & 4096.92 & 4630.26 & 4207.69 & 4222.05 & 4206.67 & 4282.05 & 4628.72 & 4389.23 & 0 & train\\\\\n", + "\t4 & 4328.72 & 4011.79 & 4296.41 & 4155.90 & 4343.59 & 4582.56 & 4097.44 & 4630.77 & 4217.44 & 4235.38 & 4210.77 & 4287.69 & 4632.31 & 4396.41 & 0 & train\\\\\n", + "\t5 & 4326.15 & 4011.79 & 4292.31 & 4151.28 & 4347.69 & 4586.67 & 4095.90 & 4627.69 & 4210.77 & 4244.10 & 4212.82 & 4288.21 & 4632.82 & 4398.46 & 0 & train\\\\\n", + "\t6 & 4321.03 & 4004.62 & 4284.10 & 4153.33 & 4345.64 & 4587.18 & 4093.33 & 4616.92 & 4202.56 & 4232.82 & 4209.74 & 4281.03 & 4628.21 & 4389.74 & 0 & train\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 6 × 16\n", + "\n", + "| | AF3 <dbl> | F7 <dbl> | F3 <dbl> | FC5 <dbl> | T7 <dbl> | P7 <dbl> | O1 <dbl> | O2 <dbl> | P8 <dbl> | T8 <dbl> | FC6 <dbl> | F4 <dbl> | F8 <dbl> | AF4 <dbl> | eyeDetection <dbl> | split <fct> |\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "| 1 | 4329.23 | 4009.23 | 4289.23 | 4148.21 | 4350.26 | 4586.15 | 4096.92 | 4641.03 | 4222.05 | 4238.46 | 4211.28 | 4280.51 | 4635.90 | 4393.85 | 0 | valid |\n", + "| 2 | 4324.62 | 4004.62 | 4293.85 | 4148.72 | 4342.05 | 4586.67 | 4097.44 | 4638.97 | 4210.77 | 4226.67 | 4207.69 | 4279.49 | 4632.82 | 4384.10 | 0 | test |\n", + "| 3 | 4327.69 | 4006.67 | 
4295.38 | 4156.41 | 4336.92 | 4583.59 | 4096.92 | 4630.26 | 4207.69 | 4222.05 | 4206.67 | 4282.05 | 4628.72 | 4389.23 | 0 | train |\n", + "| 4 | 4328.72 | 4011.79 | 4296.41 | 4155.90 | 4343.59 | 4582.56 | 4097.44 | 4630.77 | 4217.44 | 4235.38 | 4210.77 | 4287.69 | 4632.31 | 4396.41 | 0 | train |\n", + "| 5 | 4326.15 | 4011.79 | 4292.31 | 4151.28 | 4347.69 | 4586.67 | 4095.90 | 4627.69 | 4210.77 | 4244.10 | 4212.82 | 4288.21 | 4632.82 | 4398.46 | 0 | train |\n", + "| 6 | 4321.03 | 4004.62 | 4284.10 | 4153.33 | 4345.64 | 4587.18 | 4093.33 | 4616.92 | 4202.56 | 4232.82 | 4209.74 | 4281.03 | 4628.21 | 4389.74 | 0 | train |\n", + "\n" + ], + "text/plain": [ + " AF3 F7 F3 FC5 T7 P7 O1 O2 P8 \n", + "1 4329.23 4009.23 4289.23 4148.21 4350.26 4586.15 4096.92 4641.03 4222.05\n", + "2 4324.62 4004.62 4293.85 4148.72 4342.05 4586.67 4097.44 4638.97 4210.77\n", + "3 4327.69 4006.67 4295.38 4156.41 4336.92 4583.59 4096.92 4630.26 4207.69\n", + "4 4328.72 4011.79 4296.41 4155.90 4343.59 4582.56 4097.44 4630.77 4217.44\n", + "5 4326.15 4011.79 4292.31 4151.28 4347.69 4586.67 4095.90 4627.69 4210.77\n", + "6 4321.03 4004.62 4284.10 4153.33 4345.64 4587.18 4093.33 4616.92 4202.56\n", + " T8 FC6 F4 F8 AF4 eyeDetection split\n", + "1 4238.46 4211.28 4280.51 4635.90 4393.85 0 valid\n", + "2 4226.67 4207.69 4279.49 4632.82 4384.10 0 test \n", + "3 4222.05 4206.67 4282.05 4628.72 4389.23 0 train\n", + "4 4235.38 4210.77 4287.69 4632.31 4396.41 0 train\n", + "5 4244.10 4212.82 4288.21 4632.82 4398.46 0 train\n", + "6 4232.82 4209.74 4281.03 4628.21 4389.74 0 train" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "head(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first 14 columns are numeric values that represent EEG measurements from the headset. The \"eyeDetection\" column is the response. 
There is an additional column called \"split\" that was added (by me) in order to specify partitions of the data (so we can easily benchmark against other tools outside of H2O using the same splits). I randomly divided the dataset into three partitions: train (60%), valid (20%) and test (20%) and marked which split each row belongs to in the \"split\" column.\n", + "\n", + "Let's take a look at the column names. The data contains the 14 EEG channel measurements, the \"eyeDetection\" response and the \"split\" column." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 'AF3'
  2. 'F7'
  3. 'F3'
  4. 'FC5'
  5. 'T7'
  6. 'P7'
  7. 'O1'
  8. 'O2'
  9. 'P8'
  10. 'T8'
  11. 'FC6'
  12. 'F4'
  13. 'F8'
  14. 'AF4'
  15. 'eyeDetection'
  16. 'split'
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 'AF3'\n", + "\\item 'F7'\n", + "\\item 'F3'\n", + "\\item 'FC5'\n", + "\\item 'T7'\n", + "\\item 'P7'\n", + "\\item 'O1'\n", + "\\item 'O2'\n", + "\\item 'P8'\n", + "\\item 'T8'\n", + "\\item 'FC6'\n", + "\\item 'F4'\n", + "\\item 'F8'\n", + "\\item 'AF4'\n", + "\\item 'eyeDetection'\n", + "\\item 'split'\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 'AF3'\n", + "2. 'F7'\n", + "3. 'F3'\n", + "4. 'FC5'\n", + "5. 'T7'\n", + "6. 'P7'\n", + "7. 'O1'\n", + "8. 'O2'\n", + "9. 'P8'\n", + "10. 'T8'\n", + "11. 'FC6'\n", + "12. 'F4'\n", + "13. 'F8'\n", + "14. 'AF4'\n", + "15. 'eyeDetection'\n", + "16. 'split'\n", + "\n", + "\n" + ], + "text/plain": [ + " [1] \"AF3\" \"F7\" \"F3\" \"FC5\" \"T7\" \n", + " [6] \"P7\" \"O1\" \"O2\" \"P8\" \"T8\" \n", + "[11] \"FC6\" \"F4\" \"F8\" \"AF4\" \"eyeDetection\"\n", + "[16] \"split\" " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "names(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To select a subset of the columns to look at, typical R data.frame indexing applies:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "
A data.frame: 6 × 3
AF3eyeDetectionsplit
<dbl><dbl><fct>
14329.230valid
24324.620test
34327.690train
44328.720train
54326.150train
64321.030train
\n" + ], + "text/latex": [ + "A data.frame: 6 × 3\n", + "\\begin{tabular}{r|lll}\n", + " & AF3 & eyeDetection & split\\\\\n", + " & & & \\\\\n", + "\\hline\n", + "\t1 & 4329.23 & 0 & valid\\\\\n", + "\t2 & 4324.62 & 0 & test \\\\\n", + "\t3 & 4327.69 & 0 & train\\\\\n", + "\t4 & 4328.72 & 0 & train\\\\\n", + "\t5 & 4326.15 & 0 & train\\\\\n", + "\t6 & 4321.03 & 0 & train\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 6 × 3\n", + "\n", + "| | AF3 <dbl> | eyeDetection <dbl> | split <fct> |\n", + "|---|---|---|---|\n", + "| 1 | 4329.23 | 0 | valid |\n", + "| 2 | 4324.62 | 0 | test |\n", + "| 3 | 4327.69 | 0 | train |\n", + "| 4 | 4328.72 | 0 | train |\n", + "| 5 | 4326.15 | 0 | train |\n", + "| 6 | 4321.03 | 0 | train |\n", + "\n" + ], + "text/plain": [ + " AF3 eyeDetection split\n", + "1 4329.23 0 valid\n", + "2 4324.62 0 test \n", + "3 4327.69 0 train\n", + "4 4328.72 0 train\n", + "5 4326.15 0 train\n", + "6 4321.03 0 train" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "columns <- c('AF3', 'eyeDetection', 'split')\n", + "head(data[columns])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's select a single column, for example -- the response column, and look at the data more closely:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " eyeDetection\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + "5 0\n", + "6 0\n", + "\n", + "[14980 rows x 1 column] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "y <- 'eyeDetection'\n", + "data[y]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It looks like a binary response, but let's validate that assumption:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + 
"jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " C1\n", + "1 0\n", + "2 1\n", + "\n", + "[2 rows x 1 column] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.unique(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you don't specify the column types when you import the file, H2O makes a guess at what your column types are. If there are 0's and 1's in a column, H2O will automatically parse that as numeric by default. \n", + "\n", + "Therefore, we should convert the response column to a more efficient \"factor\" representation (called \"enum\" in Java) -- in this case it is a categorical variable with two levels, 0 and 1. If the only column in my data that is categorical is the response, I typically don't bother specifying the column type during the parse, and instead use this one-liner to convert it afterwards:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "data[y] <- as.factor(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can check that there are two levels in our response column:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "2" + ], + "text/latex": [ + "2" + ], + "text/markdown": [ + "2" + ], + "text/plain": [ + "[1] 2" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.nlevels(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can query the categorical \"levels\" as well ('0' and '1' stand for \"eye open\" and \"eye closed\") to see what they are:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + 
"collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. '0'
  2. '1'
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item '0'\n", + "\\item '1'\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. '0'\n", + "2. '1'\n", + "\n", + "\n" + ], + "text/plain": [ + "[1] \"0\" \"1\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.levels(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We may want to check if there are any missing values, so let's look for NAs in our dataset. For all the supervised H2O algorithms, H2O will handle missing values automatically, so it's not a problem if we are missing certain feature values. However, it is always a good idea to check to make sure that you are not missing any of the training labels. \n", + "\n", + "To figure out which, if any, values are missing, we can use the `h2o.nacnt` (NA count) method on any H2OFrame (or column). The columns in an H2O Frame are also H2O Frames themselves, so all the methods that apply to an H2OFrame also apply to a single column." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "0" + ], + "text/latex": [ + "0" + ], + "text/markdown": [ + "0" + ], + "text/plain": [ + "[1] 0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.nacnt(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great, no missing labels. :-)\n", + "\n", + "Out of curiosity, let's see if there is any missing data in any of the columns of this frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 0
  2. 0
  3. 0
  4. 0
  5. 0
  6. 0
  7. 0
  8. 0
  9. 0
  10. 0
  11. 0
  12. 0
  13. 0
  14. 0
  15. 0
  16. 0
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\item 0\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 0\n", + "2. 0\n", + "3. 0\n", + "4. 0\n", + "5. 0\n", + "6. 0\n", + "7. 0\n", + "8. 0\n", + "9. 0\n", + "10. 0\n", + "11. 0\n", + "12. 0\n", + "13. 0\n", + "14. 0\n", + "15. 0\n", + "16. 0\n", + "\n", + "\n" + ], + "text/plain": [ + " [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.nacnt(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each column returns a zero, so there are no missing values in any of the columns." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next thing I may wonder about in a binary classification problem is the distribution of the response in the training data. Is one of the two outcomes under-represented in the training set? Many real datasets have what's called an \"imbalance\" problem, where one of the classes has far fewer training examples than the other class. Let's take a look at the distribution:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " eyeDetection Count\n", + "1 0 8257\n", + "2 1 6723\n", + "\n", + "[2 rows x 2 columns] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.table(data[y])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ok, the data is not exactly evenly distributed between the two classes -- there are more 0's than 1's in the dataset. 
However, this level of imbalance shouldn't be much of an issue for the machine learning algos. (We will revisit this later in the modeling section below).\n", + "\n", + "Let's calculate the percentage that each class represents:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " Count\n", + "1 0.5512016\n", + "2 0.4487984\n", + "\n", + "[2 rows x 1 column] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "n <- nrow(data) # Total number of training samples\n", + "h2o.table(data[y])['Count']/n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split H2O Frame into a train and test set\n", + "\n", + "So far we have explored the original dataset (all rows). For the machine learning portion of this tutorial, we will break the dataset into three parts: a training set, validation set and a test set.\n", + "\n", + "If you want H2O to do the splitting for you, you can use the `h2o.splitFrame` function. However, we have explicit splits that we want (for reproducibility reasons), so we can just subset the Frame to get the partitions we want. 
\n", + "\n", + "Subset the `data` H2O Frame on the \"split\" column:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "8988" + ], + "text/latex": [ + "8988" + ], + "text/markdown": [ + "8988" + ], + "text/plain": [ + "[1] 8988" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "train <- data[data['split']==\"train\",]\n", + "nrow(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "2996" + ], + "text/latex": [ + "2996" + ], + "text/markdown": [ + "2996" + ], + "text/plain": [ + "[1] 2996" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "valid <- data[data['split']==\"valid\",]\n", + "nrow(valid)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "2996" + ], + "text/latex": [ + "2996" + ], + "text/markdown": [ + "2996" + ], + "text/plain": [ + "[1] 2996" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "test <- data[data['split']==\"test\",]\n", + "nrow(test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Machine Learning in H2O\n", + "\n", + "We will do a quick demo of the H2O software using a Gradient Boosting Machine (GBM). The goal of this problem is to train a model to predict eye state (open vs closed) from EEG data. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train and Test a GBM model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the steps above, we have already created the training set and validation set, so the next step is to specify the predictor set and response variable.\n", + "\n", + "#### Specify the predictor set and response\n", + "\n", + "As with any machine learning algorithm, we need to specify the response and predictor columns in the training set. \n", + "\n", + "The `x` argument should be a vector of predictor names in the training frame, and `y` specifies the response column. We have already set `y <- 'eyeDetection'` above, but we still need to specify `x`." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 'AF3'
  2. 'F7'
  3. 'F3'
  4. 'FC5'
  5. 'T7'
  6. 'P7'
  7. 'O1'
  8. 'O2'
  9. 'P8'
  10. 'T8'
  11. 'FC6'
  12. 'F4'
  13. 'F8'
  14. 'AF4'
  15. 'eyeDetection'
  16. 'split'
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 'AF3'\n", + "\\item 'F7'\n", + "\\item 'F3'\n", + "\\item 'FC5'\n", + "\\item 'T7'\n", + "\\item 'P7'\n", + "\\item 'O1'\n", + "\\item 'O2'\n", + "\\item 'P8'\n", + "\\item 'T8'\n", + "\\item 'FC6'\n", + "\\item 'F4'\n", + "\\item 'F8'\n", + "\\item 'AF4'\n", + "\\item 'eyeDetection'\n", + "\\item 'split'\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 'AF3'\n", + "2. 'F7'\n", + "3. 'F3'\n", + "4. 'FC5'\n", + "5. 'T7'\n", + "6. 'P7'\n", + "7. 'O1'\n", + "8. 'O2'\n", + "9. 'P8'\n", + "10. 'T8'\n", + "11. 'FC6'\n", + "12. 'F4'\n", + "13. 'F8'\n", + "14. 'AF4'\n", + "15. 'eyeDetection'\n", + "16. 'split'\n", + "\n", + "\n" + ], + "text/plain": [ + " [1] \"AF3\" \"F7\" \"F3\" \"FC5\" \"T7\" \n", + " [6] \"P7\" \"O1\" \"O2\" \"P8\" \"T8\" \n", + "[11] \"FC6\" \"F4\" \"F8\" \"AF4\" \"eyeDetection\"\n", + "[16] \"split\" " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "names(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
  1. 'AF3'
  2. 'F7'
  3. 'F3'
  4. 'FC5'
  5. 'T7'
  6. 'P7'
  7. 'O1'
  8. 'O2'
  9. 'P8'
  10. 'T8'
  11. 'FC6'
  12. 'F4'
  13. 'F8'
  14. 'AF4'
\n" + ], + "text/latex": [ + "\\begin{enumerate*}\n", + "\\item 'AF3'\n", + "\\item 'F7'\n", + "\\item 'F3'\n", + "\\item 'FC5'\n", + "\\item 'T7'\n", + "\\item 'P7'\n", + "\\item 'O1'\n", + "\\item 'O2'\n", + "\\item 'P8'\n", + "\\item 'T8'\n", + "\\item 'FC6'\n", + "\\item 'F4'\n", + "\\item 'F8'\n", + "\\item 'AF4'\n", + "\\end{enumerate*}\n" + ], + "text/markdown": [ + "1. 'AF3'\n", + "2. 'F7'\n", + "3. 'F3'\n", + "4. 'FC5'\n", + "5. 'T7'\n", + "6. 'P7'\n", + "7. 'O1'\n", + "8. 'O2'\n", + "9. 'P8'\n", + "10. 'T8'\n", + "11. 'FC6'\n", + "12. 'F4'\n", + "13. 'F8'\n", + "14. 'AF4'\n", + "\n", + "\n" + ], + "text/plain": [ + " [1] \"AF3\" \"F7\" \"F3\" \"FC5\" \"T7\" \"P7\" \"O1\" \"O2\" \"P8\" \"T8\" \"FC6\" \"F4\" \n", + "[13] \"F8\" \"AF4\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "x <- setdiff(names(train), c(\"eyeDetection\", \"split\")) # Remove the response and split columns (15th and 16th)\n", + "x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## GPU-Bound Code Example (Fails on Binder due to GPU not being available)" + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " |======================================================================| 100%\n" + ] + } + ], + "source": [ + "hyper_params <- list(\n", + " max_depth = c(6),\n", + " learn_rate = c(0.03),\n", + " sample_rate = c(0.8)\n", + ")\n", + "\n", + "grid_id <- paste0(\"xgb_gpu_grid_\", as.integer(Sys.time()))\n", + "gs <- h2o.grid(\n", + " algorithm = \"xgboost\",\n", + " grid_id = grid_id,\n", + " x = x, y = y,\n", + " training_frame = train,\n", + " validation_frame = valid,\n", + " hyper_params = hyper_params,\n", + " backend = \"gpu\",\n", + " tree_method = \"hist\",\n", + " ntrees = 2000,\n", + " stopping_rounds = 0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + 
"data": { + 
"text/html": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " JUmPER Performance Report\n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
JUmPER Performance Report
\n", + "
\n", + "
\n", + " \n", + " Duration\n", + " 22.73s\n", + " \n", + " \n", + " \n", + " Cells\n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetricAVGMINMAXTotal/Limit
CPU Util (Across 12 CPUs)12.923.1626.38-
Memory (GB)3.383.373.4115.58
GPU Util (Across 1 GPUs)32.487.0060.00-
GPU Memory (GB)0.000.000.0012.00
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%perfmonitor_perfreport --level user" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "38665ca1771945959671251c9ba2b6ce", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(HTML(value='Plot Configuration:'), Checkbox(value=False, description='Sho…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%perfmonitor_plot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have specified `x` and `y`, we can train the GBM model using a few non-default model parameters. Since we are predicting a binary response, we set `distribution = \"bernoulli\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " |======================================================================| 100%\n" + ] + } + ], + "source": [ + "model <- h2o.gbm(x = x, y = y,\n", + " training_frame = train,\n", + " validation_frame = valid,\n", + " distribution = \"bernoulli\",\n", + " ntrees = 100,\n", + " max_depth = 4,\n", + " learn_rate = 0.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print Performance Report" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " JUmPER Performance Report\n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
JUmPER Performance Report
\n", + "
\n", + "
\n", + " \n", + " Duration\n", + " 2.50s\n", + " \n", + " \n", + " \n", + " Cells\n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetricAVGMINMAXTotal/Limit
CPU Util (Across 12 CPUs)30.1726.2734.07-
Memory (GB)3.363.353.3715.58
GPU Util (Across 1 GPUs)0.500.001.00-
GPU Memory (GB)0.000.000.0012.00
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%perfmonitor_perfreport --level user" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect Model\n", + "\n", + "The type of results shown when you print a model are determined by the following:\n", + "- Model class of the estimator (e.g. GBM, RF, GLM, DL)\n", + "- The type of machine learning problem (e.g. binary classification, multiclass classification, regression)\n", + "- The data you specify (e.g. `training_frame` only, `training_frame` and `validation_frame`, or `training_frame` and `nfolds`)\n", + "\n", + "Below, we see a GBM Model Summary, as well as training and validation metrics since we supplied a `validation_frame`. Since this is a binary classification task, we are shown the relevant performance metrics, which include: MSE, R^2, LogLoss, AUC and Gini. Also, we are shown a Confusion Matrix, where the threshold for classification is chosen automatically (by H2O) as the threshold which maximizes the F1 score.\n", + "\n", + "The scoring history is also printed, which shows the performance metrics over some increment such as \"number of trees\" in the case of GBM and RF.\n", + "\n", + "Lastly, for tree-based methods (GBM and RF), we also print variable importance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Details:\n", + "==============\n", + "\n", + "H2OBinomialModel: gbm\n", + "Model ID: GBM_model_R_1771497737180_221 \n", + "Model Summary: \n", + " number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n", + "1 100 100 24848 4\n", + " max_depth mean_depth min_leaves max_leaves mean_leaves\n", + "1 4 4.00000 12 16 15.17000\n", + "\n", + "\n", + "H2OBinomialMetrics: gbm\n", + "** Reported on training data. **\n", + "\n", + "MSE: 0.1076065\n", + "RMSE: 0.3280343\n", + "LogLoss: 0.3600893\n", + "Mean Per-Class Error: 0.1300826\n", + "AUC: 0.9464722\n", + "AUCPR: 0.9406318\n", + "Gini: 0.8929444\n", + "R^2: 0.5657448\n", + "\n", + "Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n", + " 0 1 Error Rate\n", + "0 4330 586 0.119203 =586/4916\n", + "1 574 3498 0.140963 =574/4072\n", + "Totals 4904 4084 0.129061 =1160/8988\n", + "\n", + "Maximum Metrics: Maximum metrics at their respective thresholds\n", + " metric threshold value idx\n", + "1 max f1 0.463229 0.857773 199\n", + "2 max f2 0.306172 0.899687 260\n", + "3 max f0point5 0.582230 0.882353 154\n", + "4 max accuracy 0.463229 0.870939 199\n", + "5 max precision 0.990029 1.000000 0\n", + "6 max recall 0.062069 1.000000 380\n", + "7 max specificity 0.990029 1.000000 0\n", + "8 max absolute_mcc 0.463229 0.739650 199\n", + "9 max min_per_class_accuracy 0.448748 0.868999 204\n", + "10 max mean_per_class_accuracy 0.463229 0.869917 199\n", + "11 max tns 0.990029 4916.000000 0\n", + "12 max fns 0.990029 4071.000000 0\n", + "13 max fps 0.014820 4916.000000 399\n", + "14 max tps 0.062069 4072.000000 380\n", + "15 max tnr 0.990029 1.000000 0\n", + "16 max fnr 0.990029 0.999754 0\n", + "17 max fpr 0.014820 1.000000 399\n", + "18 max tpr 0.062069 
1.000000 380\n", + "\n", + "Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n", + "H2OBinomialMetrics: gbm\n", + "** Reported on validation data. **\n", + "** Validation metrics **\n", + "\n", + "MSE: 0.1200593\n", + "RMSE: 0.3464957\n", + "LogLoss: 0.3894168\n", + "Mean Per-Class Error: 0.1542851\n", + "AUC: 0.9239379\n", + "AUCPR: 0.9173234\n", + "Gini: 0.8478758\n", + "R^2: 0.5157124\n", + "\n", + "Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n", + " 0 1 Error Rate\n", + "0 1414 221 0.135168 =221/1635\n", + "1 236 1125 0.173402 =236/1361\n", + "Totals 1650 1346 0.152537 =457/2996\n", + "\n", + "Maximum Metrics: Maximum metrics at their respective thresholds\n", + " metric threshold value idx\n", + "1 max f1 0.482571 0.831178 200\n", + "2 max f2 0.329543 0.887175 262\n", + "3 max f0point5 0.606576 0.850985 152\n", + "4 max accuracy 0.482571 0.847463 200\n", + "5 max precision 0.978424 1.000000 0\n", + "6 max recall 0.084627 1.000000 373\n", + "7 max specificity 0.978424 1.000000 0\n", + "8 max absolute_mcc 0.482571 0.692104 200\n", + "9 max min_per_class_accuracy 0.458052 0.839089 210\n", + "10 max mean_per_class_accuracy 0.482571 0.845715 200\n", + "11 max tns 0.978424 1635.000000 0\n", + "12 max fns 0.978424 1358.000000 0\n", + "13 max fps 0.012874 1635.000000 399\n", + "14 max tps 0.084627 1361.000000 373\n", + "15 max tnr 0.978424 1.000000 0\n", + "16 max fnr 0.978424 0.997796 0\n", + "17 max fpr 0.012874 1.000000 399\n", + "18 max tpr 0.084627 1.000000 373\n", + "\n", + "Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n", + "\n" + ] + } + ], + "source": [ + "print(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Performance on a Test Set\n", + "\n", + "Once a model has been trained, you can also use it to make predictions on a test set. 
In the case above, we just ran the model once, so our validation set (passed as `validation_frame`), could have also served as a \"test set.\" We technically have already created test set predictions and evaluated test set performance. \n", + "\n", + "However, when performing model selection over a variety of model parameters, it is common for users to train a variety of models (using different parameters) using the training set, `train`, and a validation set, `valid`. Once the user selects the best model (based on validation set performance), the true test of model performance is performed by making a final set of predictions on the held-out (never been used before) test set, `test`.\n", + "\n", + "You can use the `h2o.performance` function to evaluate model performance on a new dataset. The results are stored in an object of class `\"H2OBinomialMetrics\"`. " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "'H2OBinomialMetrics'" + ], + "text/latex": [ + "'H2OBinomialMetrics'" + ], + "text/markdown": [ + "'H2OBinomialMetrics'" + ], + "text/plain": [ + "[1] \"H2OBinomialMetrics\"\n", + "attr(,\"package\")\n", + "[1] \"h2o\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "perf <- h2o.performance(model = model, newdata = test)\n", + "class(perf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Individual model performance metrics can be extracted using functions like `h2o.r2`, `h2o.auc` and `h2o.mse`. In the case of binary classification, we may be most interested in evaluating test set Area Under the ROC Curve (AUC). 
" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "0.522888206929954" + ], + "text/latex": [ + "0.522888206929954" + ], + "text/markdown": [ + "0.522888206929954" + ], + "text/plain": [ + "[1] 0.5228882" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.r2(perf)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "0.928588111271663" + ], + "text/latex": [ + "0.928588111271663" + ], + "text/markdown": [ + "0.928588111271663" + ], + "text/plain": [ + "[1] 0.9285881" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.auc(perf)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "0.116978290533459" + ], + "text/latex": [ + "0.116978290533459" + ], + "text/markdown": [ + "0.116978290533459" + ], + "text/plain": [ + "[1] 0.1169783" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h2o.mse(perf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validated Performance\n", + "\n", + "To perform k-fold cross-validation, you use the same code as above, but you specify `nfolds` as an integer greater than 1, or add a \"fold_column\" to your H2O Frame which indicates a fold ID for each row.\n", + "\n", + "Unless you have a specific reason to manually assign the observations to folds, you will find it easiest to simply use the `nfolds` argument.\n", + "\n", + "When performing cross-validation, you can still pass a `validation_frame`, but you can also choose to use the original dataset that 
contains all the rows. In the example below, however, we cross-validate a model on the training frame, `train`, and still pass `valid` as the `validation_frame`." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " |======================================================================| 100%\n" + ] + } + ], + "source": [ + "cvmodel <- h2o.gbm(x = x, y = y,\n", + " training_frame = train,\n", + " validation_frame = valid,\n", + " distribution = \"bernoulli\",\n", + " ntrees = 100,\n", + " max_depth = 4,\n", + " learn_rate = 0.1,\n", + " nfolds = 5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build a performance report for the last executed cell" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "ename": "ERROR", + "evalue": "Error in parse(text = input): :1:1: unexpected input\n1: %perfmonitor_perfreport\n ^\n", + "output_type": "error", + "traceback": [ + "Error in parse(text = input): :1:1: unexpected input\n1: %perfmonitor_perfreport\n ^\nTraceback:\n" + ] + } + ], + "source": [ + "%perfmonitor_perfreport" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time around, we will simply pull the training and cross-validation metrics out of the model. To do so, you use the `h2o.auc` function again, and you can specify `train` or `xval` as `TRUE` to get the correct metric." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "print(h2o.auc(cvmodel, train = TRUE))\n", + "print(h2o.auc(cvmodel, xval = TRUE))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grid Search\n", + "\n", + "One way of evaluating models with different parameters is to perform a grid search over a set of parameter values. For example, in GBM, here are three model parameters that may be useful to search over:\n", + "- `ntrees`: Number of trees\n", + "- `max_depth`: Maximum depth of a tree\n", + "- `learn_rate`: Learning rate in the GBM\n", + "\n", + "We will define a grid as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ntrees_opt <- c(5,50,100)\n", + "max_depth_opt <- c(2,3,5)\n", + "learn_rate_opt <- c(0.1,0.2)\n", + "\n", + "hyper_params = list('ntrees' = ntrees_opt,\n", + " 'max_depth' = max_depth_opt,\n", + " 'learn_rate' = learn_rate_opt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `h2o.grid` function can be used to train a `\"H2OGrid\"` object for any of the H2O algorithms (specified by the `\"algorithm\"` argument)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "gs <- h2o.grid(algorithm = \"gbm\", \n", + " grid_id = \"eeg_demo_gbm_grid\",\n", + " hyper_params = hyper_params,\n", + " x = x, y = y, \n", + " training_frame = train, \n", + " validation_frame = valid)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analyze the algorithm performance with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%perfmonitor_perfreport" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### And we can also plot performance metrics right away!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "357caad54d474883ad88e58eedfacd15", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(HTML(value='Plot Configuration:'), Checkbox(value=False, description='Sho…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%perfmonitor_plot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare Models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "print(gs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, grids of models will return the grid results sorted by (increasing) logloss on the validation set. 
However, if we are interested in sorting on another model performance metric, we can do that using the `h2o.getGrid` function as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# print out the auc for all of the models\n", + "auc_table <- h2o.getGrid(grid_id = \"eeg_demo_gbm_grid\", sort_by = \"auc\", decreasing = TRUE)\n", + "print(auc_table)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The \"best\" model in terms of validation set AUC is listed first in auc_table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "best_model <- h2o.getModel(auc_table@model_ids[[1]])\n", + "h2o.auc(best_model, valid = TRUE) #Validation AUC for best model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last thing we may want to do is generate predictions on the test set using the \"best\" model, and evaluate the test set AUC." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "best_perf <- h2o.performance(model = best_model, newdata = test)\n", + "h2o.auc(best_perf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The test set AUC is approximately 0.97. Not bad!!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Jumper Wrapper Kernel", + "language": "python", + "name": "jumper_wrapper" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/demos/new_R_wrapping.ipynb b/demos/new_R_wrapping.ipynb new file mode 100644 index 0000000..2add9ce --- /dev/null +++ b/demos/new_R_wrapping.ipynb @@ -0,0 +1,613 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to wrap a new kernel" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available Jupyter Kernels:\n", + "--------------------------------------------------\n", + " python3: Python 3 (ipykernel) (python)\n", + " pypy: Jumper Wrapper (Python 3 (ipykernel)) (python)\n", + " r_new: Jumper Wrapper (r_new) (R)\n", + " ir: R (R)\n", + " scorep_jupyter: Score-P_Python (python)\n", + " h2o_r: Jumper Wrapper (h2o_r) (R)\n", + " jumper_wrapper: Jumper Wrapper Kernel (python)\n" + ] + } + ], + "source": [ + "%list_kernels" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (typeof Jupyter !== 'undefined' && Jupyter.notebook) {\n", + " // JupyterLab classic notebook\n", + " Jupyter.notebook.kernel.kernel_info(function(reply) {\n", + " if (reply.content && reply.content.language_info) {\n", + " Jupyter.notebook.metadata.language_info = reply.content.language_info;\n", + " // Trigger CodeMirror mode change for all cells\n", + " var mode = reply.content.language_info.codemirror_mode || reply.content.language_info.name;\n", + " Jupyter.notebook.get_cells().forEach(function(cell) {\n", + " if (cell.cell_type === 'code') {\n", + " 
cell.code_mirror.setOption('mode', mode);\n", + " }\n", + " });\n", + " }\n", + " });\n", + " }\n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully wrapped kernel: ir\n", + "Hint: Refresh the page (without restarting the kernel) to enable syntax highlighting for the wrapped language.\n", + "Created permanent kernel 'r_new' that auto-wraps 'ir'.\n" + ] + } + ], + "source": [ + "%wrap_kernel ir --save r_new" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available Jupyter Kernels:\n", + "--------------------------------------------------\n", + " python3: Python 3 (ipykernel) (python)\n", + " pypy: Jumper Wrapper (Python 3 (ipykernel)) (python)\n", + " r_new: r_new (R)\n", + " ir: R (R)\n", + " scorep_jupyter: Score-P_Python (python)\n", + " jumper_wrapper: Jumper Wrapper Kernel (python)\n", + "\n", + "--------------------------------------------------\n", + "Currently wrapped kernel: ir\n" + ] + } + ], + "source": [ + "%list_kernels" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "0.013117395995469" + ], + "text/latex": [ + "0.013117395995469" + ], + "text/markdown": [ + "0.013117395995469" + ], + "text/plain": [ + "[1] 0.0131174" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAYAAAD958/bAAAACXBIWXMAABJ0AAASdAHeZh94\nAAAgAElEQVR4nOzde7xmd13Y+88kEwjRxCBBw72IRArJqfZCBEIstuAFVC62GoKIiAohFqUt\np1Qocjn1chD1UOGgYIkaqApqDkrB1iIkkWCqIARIDAohgQQwlEswJpPMnD/Wb8ozm5nZe8/e\ns9eeed7v1+t57b2ftZ79fGfvgeSTtdZvFQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHHm+v9qz\n8DjzIPtevbDfxQvP/8jC858/PGOyjeys/kP1V9Wt1U3Vk2adCOAIsnPuAQA44hxTXbTw9c+2\nb5Axr/OqFy58fVx1x5lmATjiCCQA1mtH9ZiFr39jrkHYr+9Y+PzT1eurK2eaBeCIc8zcAwBw\n2P1y038Q21mdPPMsHH73WPj8N6rzq0tnmgXgiCOQAI5+e6rbFx4bcf/qG/fz3JnVl2/wex9p\ntus/Q3csfP652aYAAIDDYCsWadhRPa56S9OF/TePj/+t+r72PR37t1fMc7DZjque3XS90l9X\nn63+pOmI1hkH+XNUnVD9dPXeMfN/q/7+ij/LO1e85gUL264ezz2uend11Yq5fqB6W/WR6u+q\njzYdZflXfWno/Xz7/lz/j7Hv7uqW6n3j+9V0rc9PVH88/rwfqn61utsqf979We/P72C/mxes\n8l5nNsXz/v7+VP3Ciu/3I+v+0wAAwCY43IG0o/q9Dvwv1nuqP+uL0bDWQDqj+ouD7Htr9ZPt\ne7Rjr3s0XTOz8jWfqP7TwterBdK5TRGzp/rw2OeO43UH+/O+t30jaTGQPlr9rwO87l9X/+MA\n266rvmI/f9YDOZSf30YCqeqlK16z91qm+zSF4N7n39b+f28AAHDYrQyktT7WGkj/asXrrqre\n2BQRuxee/88Lrzl2xWu+Z8X3vFPTEajFfa6p/rTpaM3i8z/Ql7poxT6faFqqeuWf8WCBdEP1\nNwtf7w2kn1rxPT7YdLTnhhXPL64C9/N96XvfVH1hP8/vfXxqP889fz9/1v051J/fA6qHjX33\nbr9gPHevNb7vYmS/r+k0wtcuPPeF6n5r/HMAAMCmO9yB9OaFbb+2Ytvi627si0cNVgukFy9s\nu7166sK2u1WXL2z/VHXnhe3fveJ7P2u873HV/7Ni28ECaU/TUY9XVE+rzhn7fGBh+4sO8rP4\nrwvPrwykFzWFwwnVy1dsu6b6uvG6M5p+3nu3/U5rs5GfX03Rt3f7i9f4nnud3b5h/LPte+rd\nj6/z+wEAwKY63IH0Pxe2XVc9vS8ebbhT9U0Lj73XIq0WSIv/gn7hfv5Mp7fvv4Q/bmHbf1l4\n/pIVrztmxfdeLZAet2L7jjHr3scpC9tOrN6+8NrLF7YtBtIn23dxhvuueM9/teI9L2z/v5OD\n2cjPb+Xr1xtIte9pjIuPy9q+C1MAbBr/RwdwZPnGpn/R39/jUO518+aFz+9RvbLpOpurqp9r\niojLm+LhtjV8vztUX7vw9f6OmlzRFxdRqGnxhb0WFx/4gxWv2129YQ0z1BSCv7viuT3Vbzad\nQnhd9Yzqt6r3NB2JOXsN3/ejY469Vq4S96EVX693FbmN/vw2w79rOhK26NamI1m7v3R3gKOL\nQAJYbi+p/mPTDUUXndYUEG+orm86TW0t7tG+q9597AD7Xbfw+deMjzualgzf6+P7ed1H1zjH\nJw/w/Nc3LX5wSdOpct9dfVnTQhX/9QCvWbRnle0bXUZ9Iz+/zXJT06mJi/6g6fREgKOeQAJY\nb
rc2LUv91U2n0f10U0AsOqm1Lc1d07/QLx5puvsB9lu8mem14+Pea532WjwFbq+vXsMMe7/X\nSndsOnr0wPH1f2m6puf+1fc2HUma20Z+fpvlLk0r8i36zurBm/w+ANuSQAJYXl9ePWg8vq56\nR/XcpqMsd2/fVdd2VA9fw/e8tWkFtr1WXh9TU6AsHil6/8Lni/cretR+XvvoNcxwIP+gfY+2\n/IemFfL2WksAHm4b/flthpdXX7XiuWObVjK84ya/F8C2I5AAltfdm65n2ft45MK265uOGt2y\n8NyHD/B9vmzF14vXzTyxevLC16c2LRu9958/n2m6vmmvtyx8/qimRSoa+/9k0zVYh+qkFV8/\ndOHzx7Sx+NpMG/n5bdR39cUV/2o6knTr+PyBTVEJAACzWbmK3cFuFLu4etlaV7FbvO/NbU3/\nsn1h9fvVZxe2faJppbe9Fm+W+pGmow73HdtOaIqpxbk/VF1a/e2K51de23RCU5wt7nNtUwis\nXFVttRvFrvTV7btk9Z6m0+re276rwu1pugfQXour2C2ublfT6WiLr/uWFdtfubBtravYbeTn\nV4e+it2d2/dnf+l4fvHPv6v6h+v4ngAAsKkOdyA9oH1vqLq/x8196Qpvv7mf/RZn+/qmo1IH\n+p63Nv3L+46+1D9tuhZp5WtuabqG6FADqeplB5jnr6rXLXy9q2k57dr6QKqN/fwONZB+bcV7\n7D3Cdpf2DdT3NN2XCuCo5BQ7gOV2ZXW/6searkH666Yg+pvqz6tfGNvfseJ1z6wuaDricPP4\nPovx9Z6mIw3PaToa9ZGx/V3Va6p/3HSN0/4WU/jj6p80XfNyVVMs/UH1zdV/O+Q/6eTfNN2r\n6L3VF5r+jC+rvqFp0Ya9dlY/uMH32oiN/PwOxaOr71v4+neqPxmf31j9zMK2f1D9+016XwAA\nYANe1BePZPzWzLMAcBRyBAmA7eQ/NZ0ed3V1WXWnhW07qycsfL0dluUGAAA4bH60fa+DeWPT\nanaPbDq9bu/zn6n+3jwjAgAAbI1jmk6dO9iiETdVj51rQAAAgK32z6qLqg80LaZwfdOiAS/t\nS29iCgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABs\nUzvmHmAmJ1UnVrurT4yPAAAAS+P06oLq+mrPwuO26rrqddXDZpsOAABgi7y86SjRnurj1WXV\nH4zHO6tr+2Iw/cpMMwIAABx25zWFz1uqbzjIfg+q/svY99lbMBcAAMCWu7S6stq5hn13VO+o\nLjmsEwEAANvSMXMPsAVObzql7rY17Lununi8BgAAWDLLEEhXVGdWx65x/4eM1wAAABx19l6D\n9ObqjIPsd1p14dj3327BXAAAALN4ZV9cpe6jTafRXVT9XvX26q8Xtv/nlvf+UAAAsNSWKQS+\nvnpO9cjqlIXnb68+Wf1x9aqmYAI42tytabXOI8HupgVz1nLtKABsqmUKpEUnVydWu5riaPe8\n4wAcdq8+7rjjfvBOd7rT3HOs6vOf/3x79uz51uqtc88CwPJZy9LXR6Pd47GsgQgsn2O//du/\nvZe85CVzz7GqBz/4wd18883L+s8nAGa2DKvY7XV6dUF1ffXZ6rrq49Wt4/PXVQ+bbToAAGB2\ny/Jf6F5ePbPpiNH11buqG8e2r6zuWZ0zHq+ufmiGGQEAgJktQyCdV53fdC77c6t3H2C/B1XP\nr55WfbB62ZZMBwAAbBvLEEjnVldVj+ngKyK9v+kI0t2rx7exQLpz9eLquDXuf8fqftXDN/Ce\nAAD
ABi1DIJ1e/W5rWy52T9M9kp65wffc0bRK3lqXi7pLdVZ1h6ZrogAAgBksQyBdUZ1ZHdt0\nz6PVPGS8ZiM+XX3/OvZ/aPXNG3xPAABgg5ZhFbsLqwdUb6rOOMh+p419H1FdtAVzAQAA28wy\nHEF6RVMYPb36tura6pqmozx7mq4Xuld137H/a6uXbvmUAADA7JYhkKqeUb2qek71yKbrffa6\nvfpk9fqxz9u3fDoAAGBbWJZAqnpP9cTx+clNiyjsaoqj3XMNBQAAbB/LFEhVX9V0St1fVZ85\nwD6nNC27/bGtGgoAANgelmGRhqqvr/6i+kR1ZdN1SAdaZe7Xq+u2aC4AAGAbWYYjSPer3tl0\nj6H/3nSfoW9uWozhy5oWcQAAAFiKI0gvaTpl7jFNCzQ8urp39aHq56qvm280AABgO1mGQDqz\n+sPqvy4896mmUNpT/d9zDAUAAGw/yxBIpzRdc7TSXzbd7+g7qodv6UQAAMC2tAyB9BfVQw+w\n7Wea4umVTdcoAQAAS2wZAuni6oHVy5uuRVr0herp1YOqC6rjt3Y0AABgO1mGQHpRUySd33Tt\n0ZtWbH9z9eLqe5sWbvhHWzodAACwbSxDIP1d9Z3VTzfd/PVr9rPPf6ieUt1U3XXLJgMAALaV\nZQikqs9Uz63+ftPpdPtzwdh+3+qfb9FcAADANrIMN4pdjz3VR8YDAABYMstyBAkAAGBVAgkA\nAGAQSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAAgEEgAQAADAIJAABgEEgAAACDQAIA\nABgEEgAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAA\nAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAA\ngGHn3AMAwKLbbrut6inVw+edZE2uqV459xAAbB6BBMC2smvXrh7wgAd898knnzz3KAf1mc98\npiuvvPJvE0gARxWBBMC286M/+qOdffbZc49xUBdffHHnnXfe3GMAsMlcgwQAADAIJAAAgEEg\nAQAADAIJAABgEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAAYBBI\nAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQS\nAADAIJAAAAAGgQQAADAIJAAAgEEgAQAADAIJAABg2Dn3AABwJLr99turjq3+xcyjrNWfVB+b\newiA7U4gAcAhuOqqq9qxY8cdTzzxxN+ae5bV3Hzzze3ates11dPmngVguxNIAHAIdu/e3fHH\nH9+ll1469yiret7zntdFF1107NxzABwJXIMEAAAwCCQAAIBBIAEAAAwCCQAAYBBIAAAAg0AC\nAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQSAADAIJAA\nAAAGgQQAADAIJAAAgEEgAQAADAIJAABgEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAwCCQA\nAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkA\nAGAQSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADDsnHuAmZxUnVjtrj4xPgIAAEtumY4gnV5d\nUF1ffba6rvp4dev4/HXVw2abDgAAmN2yHEF6efXMakdTIL2runFs+8rqntU54/Hq6odmmBEA\nAJjZMgTSedX51Vur51bvPsB+D6qeXz2t+mD1si2ZDgAA2DaW4RS7c6urqsd04Diqen/TEaSL\nq8dvwVwAAMA2swyBdHp1WXXbGvbd0xRIpx/WiQAAgG1pGQLpiurM6tg17v+Q8RoAAGDJLEMg\nXVg9o
HpTdcZB9jtt7PuI6qItmAsAANhmlmGRhlc0hdHTq2+rrq2uqT7ddErdnat7Vfcd+7+2\neumWTwkAAMxuGQKp6hnVq6rnVI+szlrYdnv1yer1Y5+3b/l0AADAtrAsgVT1nuqJ4/OTqxOr\nXU1xtHuuoQAAgO1jmQJp0WfG45jqftVx1V+2tpXuAACAo9QyLNLw4uqpK57b2XS63Webwuj9\n1U1Np9h9xZZOBwAAbBvLEEjPq75vxXMvq36m6RS7NzSF0burH67+pLrjVg4IAABsD8t4it2D\nqvOrP62+vbpxYdtTq9dU/756wdaPBgAAzGkZjiCt9NBqR/Vj7RtHVb9aXdq0HDgAALBklvEI\n0j3Hx/cdYPv7qnM3+B5fVf1idewa97/LBt8PAADYBMsYSFePj/dpWpxhpVOrj2zwPW6u/qq1\n/3xv2eD7AQAAm2BZAum0psUarqyuqj41vj5nxX7/pHp09esbfL/Pj++/Vg+tnrTB9wQAADZo\nGQLp2qbT6l684vnvrX65etv4+qeqZ1Wfq35yq4YDAAC2j2UIpHtXd6q+tulI0v3H47T2vTHs\nY6u/aVoS/NotnhEAANgGliGQarom6H0deGGGqic0nYK3e0smAgAAtp1lCaS1+MDcAwAAAPNa\nxvsgAQAA7JcjSACHbkd1RnXc3IOsgfutAcAaCCSAQ/fN1X+fewgAYPMc7YF0fl+6vPda3Xkz\nBwGOSscff/zx/dEf/dHcc6zqe77ne+YeAQCOCEd7IL2laUnvH6nu2HQD12tmnQg4quzYsaOT\nTjpp7jFWdcwxLjkFgLU42gPpQ003f31zUyy9vfqOWScCAAC2rWX5T4pvrf5y7iEAAIDtbVkC\nqeryatfcQwAAANvX0X6K3aInzT0AAACwvS3TESQAAICDEkgAAACDQAIAABgEEgAAwCCQAAAA\nBoEEAAAwCCQAAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACA\nQSABAAAMAgkAAGAQSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAAgEEgAQAADAIJAABg\nEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAY\nBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQSAADAIJAAAAAG\ngQQAADAIJAAAgEEgAQAADAIJAABgEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAwCCQAAIBB\nIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQ\nSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAAgEEgAQAADAIJAABgEEgAAACDQAIAABgE\nEgAAwCCQAAAABoEEAAAwrDeQvr866XAMAgAAMLf1BtJrq09Ub6z+RXWnzR4IAABgLusNpGdW\n76oeW/1W9cnq16tHV8dt7mgAAABba72B9Irqn1b3qM6v/qx6YvX71Q3VL1ePOITvCwAAMLtD\nDZkbql9q31i6ovrB6n9U11W/UJ258REBAAC2xmYc6bmhuqQpjK4Zz92telZ1WXVV9YRNeB8A\nAIDDaucGXvfw6ruarke6z3j++ur/rX6nurE6t/rh6rerB1f/cyPDAgAAHE7rDaQnNEXRY6o7\nj+f+qnppUxRdVu1Z2P/Pq98YH5+QQAIAALax9QbSG8bHv6h+sfrd6r2rvOavqr9pOqIEAACw\nba03kP5NUxT99Tpe87nqrut8HwAAgC233kUafq4pju5f/fMV236k+rrNGAoAAGAOh7KK3S80\nrUz3ghXP/6fqg00RtWODcwEAAGy59QbSDzQt3/3O6qdWbPuO6o+rZ4/9AAAAjijrDaTvq65u\nukHsm1d
se0v1qOoD1TM2PBkAAMAWW28gfUPTDWF3HWD7bU1HkVyLBAAAHHHWG0g3VPdeZZ/7\nNN0wFgAA4Iiy3kB6R9NpdN91gO3fOh5/spGhAAAA5rDe+yA9tymQfq/6702n091QnVI9rHpM\n9amxHwAAwBFlvYH06eqbqpdU5/al90L6g6abyd6w8dEAAAC21noDqeqj1ZOrf12d1nTN0Sea\n7o103eaNBgAAsLUOJZD2+tR4XLpJswAAAMzqUALpu6snNF13dDCPPITvDQAAMJv1BtIPVq8e\nn99U/d3mjgMAADCf9QbSj1efq749p9YBAABHmfXcB2lHdf/qgsQRAABwFFpPIN2hOq667TDN\nAgAAMKv1BNIt1Tuqx1VfcXjGAQAAmM96Aqmm+x99vrq4+pfV/aq7HOABAABwRFnvIg1/0XSa\n3ZdVv7nKvjsOaSIAAICZrDeQ3nBYpgAAANgG1htIP3RYpgAAANgG1nsN0qITqjOqb9ykWQAA\nAGZ1KIF0n+qN1Weq91bvHM+/sPqN6h6bM9phdVLTnHdrY5EIAAAcRdYbB3drWsHu8dXl1dsW\ntn2+Ord619hvuzm96Sa311efra6rPl7dOj5/XfWw2aYDAABmt95A+onqXk3LfT+ses3CtpdW\nT6lOrZ63GcNtopc3He16crWnKeLePB6Xj+fOqS6pfmWmGQEAgJmtd5GGxzQdNfr1A2y/oOno\n0iM3MtQmO686v3pr9dzq3QfY70HV86unVR+sXrYl0wEAANvGeo8gnVL95Sr7fKy6+6GNc1ic\nW13VFHcHiqOq9zcdRdp7CiEAALBk1htIV1TfsMo+Z1ZXHto4h8Xp1WXVbWvYd09TIJ1+WCcC\nAAC2pfUG0puqBzddY7S/1/5E9Q+rP9zgXJvpiqZoO3aN+z9kvAYAAFgy6w2kn67eUb246bS1\nZ47nf6n6n9VLqvdVL9qsATfBhdUDmuLujIPsd9rY9xHVRVswFwAAsM2sd5GG25sWYHhG9eym\noy01LYRwY1Mg/Wz1d5s14CZ4RVMYPb36tura6prq002n1N25aWW++479X9u0Ih8AALBk1htI\nNd036BfH48ure1c3NAXHdvWM6lXVc5oC76yFbbdXn6xeP/Z5+5ZPBwAAbAuHEkiLbqo+sBmD\nbIH3VE8cn59cnVjtaoqj3XMNBQAAbB/rDaQD3f9of75vnd97K+0ejx1zDwIAAGwf6w2kJ61h\nn89XnziEWQ6306t/Wz2qOnXh+dubThF8R9NiE5du/WgAAMB2sN5V7I4/wOOU6p83xcXO6lmb\nOONmeHn13urJTQszvKt683hcPp47p7qk+pWZZgQAAGa23iNItxzk+T+q3ln9efVr1d2arvGZ\n23nV+dVbq+dW7z7Afg+qnl89rfpg9bItmQ4AANg21nsEaTV/W/1/1V3a9zS2OZ3bdM+mx3Tg\nOKp6f9NRpIurx2/BXAAAwDaz0VXs9ucOTUeUPnYYvvehOL363eq2Ney7pymQnrnajqu4e/Xb\n1R3XuP+Xj48WjQAAgBltZiDtqB5ePaXpiM12WTr7iurM6timBRlW85Dxmo34X9UbmmJxLe5T\nfV1ToAEAADNZbyB9/iDbjuuLR0y20/U7FzatTvem6v+s3neA/U6rXlA9oumGshtxc/Xz69j/\noU03swUAAGa03kD641W2f7r6neqiQ5rm8HhFdUb19Orbqmura5pm3VPdubpXdd+x/2url275\nlAAAwOzWG0jfcVimOPyeUb2q6cjQI6uzFrbdXn2yev3Y5+1bPh0AALAtHI5FGrar91RPHJ+f\nXJ3YtAz5J9s+10sBAAAzWm8gXXuI7/ORpgUctovPjAcAAMD/tt5A+v2m0+zu0XT9zvXj8dXj\nuR1NMXTditd9akNTAgAAbIH1BtKF1Q9Xf1j9WPXBhW1fV/1i032HntS0EAIAAMAR45h17v+c\npvB5bPvGU
U33Pnpc001irQIHAAAccdZ7BOnB1Vua7vOzPzc3LQX+qA3MtJnOr158iK+982YO\nAgAAbH/rDaS/q+69yj73ro49tHE23Vuq+1c/0nQT28/n1D8AAOAA1htI76y+p+kUu9/bz/bv\nrL65aTGH7eBD1bOqNzfF0ts7cu/lBAAAHGbrDaR/13Sj1TdWb6jeWn28ulv1rdV3N12D9PxN\nnHEzvLX6y7mHAAAAtrf1BtI11T+rXlb9y/FYdEXT6nbv3fhom+7y6oS5hwAAALav9QZS1V80\nRdKDqr/fdM3Rp6qrqz+tdm/adJvrSXMPAAAAbG+HEkh7fbhpmfDrqss2ZxwAAID5rPc+SFX3\naboG6TNNp9K9czz/wuo3qntszmgAAABba72BdLfq4urxTdf0vG1h2+erc6t3jf0AAACOKOsN\npJ+o7lU9uXpY9ZqFbS+tnlKdWj1vM4YDAADYSusNpMc0HTX69QNsv6D6g6alwAEAAI4o6w2k\nU1r9fkIfq+5+aOMAAADMZ72r2F1RfcMq+5xZXXlo4wAAm23Pnj1Vd6juPPMoa3F79bm5hwCW\n13oD6U3VS5quMfqP+9n+E9U/rH5qg3MBAJvkiiuuqHrieBwJHl5dMvcQwHJabyD9dPWo6sXV\n9zfdILbql5qOHP2j6n3VizZrQABgY2677ba+6Zu+qfPOO2/uUVb15Cc/uVtuueUr5p4DWF7r\nDaTbmxZgeEb17Ooh4/nzqhubji79bPV3mzUgALBxJ598cg984APnHmNVxxxzKLdoBNg86wmk\nL6ue1HRz2F8cjy+v7l3dUH1606cDAADYQuv5zzRfqH6ueubCczdVH0gcAQAAR4H1Hse+oOka\npLschlkAAABmtd5rkJ5VHdt0s9iXVH9efaLavZ99v7Cx0QAAALbWegPp2vHxrtVvrrLvjvWP\nAwAAMJ/1BtJbDssUAAAA28BqgfTy6vLq18bXP3B4xwEAAJjPaos0nN9036OVfqD6lc0fBwAA\nYD7rPcVur7Orp1Q/tHmjAPxvr6q+Zu4h1uCUuQcAADbXoQYSwOF07qMe9agvu+c97zn3HAf1\n7ne/uyuvvHLuMQCATSSQgG3pu77ruzr77LPnHuOgXvnKVwokADjKrPdGsQAAAEctgQQAADAI\nJAAAgGEt1yA9vHr9iufOHB9XPr/onEOaCAAAYCZrCaT7jMf+fO9BXieQAACAI8pqgfSPt2QK\nAACAbWC1QPqzLZkCAABgG7BIAwAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQa8vxsAABEQ\nSURBVAAAYBBIAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAA\nAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAAgEEgAQAADAIJAABgEEgAAACDQAIAABgEEgAA\nwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAA\nMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAAgEEgAQAA\nDAIJAABgEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAAYBBIAAAA\ng0ACAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQSAADA\nIJAAAAAGgQQAADAIJAAAgEEgAQAADAIJAABgEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAw\nCCQAAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAM\nAgkAAGAQSAAAAINAAgAAGHbOPcBMTqpOrHZXnxgfAQCAJbdMR5BOry6orq8+W11Xfby6dXz+\nuuphs00HAAD
MblmOIL28ema1oymQ3lXdOLZ9ZXXP6pzxeHX1QzPMCAAAzGwZAum86vzqrdVz\nq3cfYL8HVc+vnlZ9sHrZlkwHAABsG8twit251VXVYzpwHFW9v+kI0sXV47dgLgAAYJtZhkA6\nvbqsum0N++5pCqTTD+tEAADAtrQMgXRFdWZ17Br3f8h4DQAAsGSWIZAurB5Qvak64yD7nTb2\nfUR10RbMBQAAbDPLsEjDK5rC6OnVt1XXVtdUn246pe7O1b2q+479X1u9dMunBAAAZrcMgVT1\njOpV1XOqR1ZnLWy7vfpk9fqxz9u3fDoAAGBbWJZAqnpP9cTx+cnVidWupjjavcnvde/qD6vj\n1rj/8ePjjk2eAwAAWIdlCqRFnxmPY6r7NYXMX7a2le7W4vrqP/bF8FnN/ZqObu3ZpPcHAAAO\nwTIE0ourD1e/uvDczurZTTeG/fLx3C3VBU2h8tkNvueu6tfWsf9Dx/sCAAAzWoZV7J5Xfd+K\n515W/UxTyLyh6dqjd1c/XP1JdcetHBAAANgeluEI0koPqs6v/rT69urGhW1PrV5T/fvqBVs/\nGgAAMKdlOIK00kObFkP4sfaNo5pOw7u0aTlwAABgySxjIN1zfHzfAba/r+nGsgAAwJJZxkC6\neny8zwG2n1p9ZGtGAQAAtpNlCaTTmhZr+O7qqupT4+uV/kn16OryrRsNAADYLpZhkYZrm06r\ne/GK57+3+uXqbePrn6qeVX2u+smtGg4AANg+liGQ7l3dqfrapiNJ9x+P09r3xrCPrf6maUnw\na7d4RgAAYBtYhkCqurlp8YUDLcxQ9YTqymr3lkwEW+9h1XfMPcQa3WHuAQCA5bQsgbQWH5h7\nADjMnnbXu971Kfe73/3mnmNVl1122dwjAABLSiDBEnnoQx/aS17ykrnHWNUZZ5wx9wgAwJJa\nllXsAAAAViWQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAA\nAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAA\ngEEgAQAADAIJAABgEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAA\nYBBIAAAAg0ACAAAYBBIAAMAgkAAAAIadcw8AALDXbbfdVnV+9Z0zj7IWH6teNPcQwOYSSADA\ntrFr167OOuusbz311FPnHuWgbrjhhi655JK/TSDBUUcgAQDbyjnnnNPZZ5899xgHdfHFF3fJ\nJZfMPQZwGLgGCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkA\nAGAQSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAAgEEgAQAADAIJAABgEEgAAACDQAIA\nABgEEgAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAA\nAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQSAADAIJAAAAAGgQQAADAIJAAA\ngEEgAQAADAIJAABgEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAwCCQAA\nYBBIAAAAg0ACAAAYds49AADAkebTn/501R2r35p5lLX67fEAViGQAADW6eMf/3g7d+489slP\nfvK/mHuW1Vx88cVdffXVX0ggwZoIJACAQ3Dcccf14z/+43OPsaobb7yxq6++eu4x4IjhGiQA\nAIBBIAEAAAwCCQAAYBBIAAAAg0ACAAAYBBIAAMAgkAAAAAaBBAAAMLhRLGzMHatHV8fOPcga\n/L25BwAA2O4EEmzMI3bs2PHGE088ce45VnXTTTfNPQIAwLYnkGBjjj3++OO79NJL555jVY9+\n9KPnHgEAYNtzD
RIAAMAgkAAAAAaBBAAAMAgkAACAQSABAAAMAgkAAGAQSAAAAINAAgAAGAQS\nAADAIJAAAAAGgQQAADDsnHsAOIBTqpPmHmINTp17AAA4mNtvv73qxOprZh5lLW6vrpl7CJab\nQGK7+kB117mHAIAj3fvf//6qJ4zHkeCbq7fNPQTLSyCxXZ3wwhe+sAc/+MFzz3FQF154YW98\n4xvnHgMADuj222/vkY98ZM9+9rPnHmVVj33sY7vllltOmHsOlptAYts65ZRTuuc97zn3GAd1\n0klHwlmAACy7E044Ydv/M7XqmGNcHs/8/C0EAAAYBBIAAMAgkAAAAAaBBAAAMAgkAACAwSp2\nAABsC7fddlvV+dV3zjzKak6oHlD9+dyDrNGbqt+fe4gjxbIG0klNd5TeXX1ifAQAYEa7du3q\nrLPO+tZTTz117lEO6gMf+EBXXXVVj3vc4/7x3LOs5vLLL++aa665QwJpzZYpkE6v/m31qGrx\nf3W3VzdU76h+qbp060cDAKDqnHPO6eyzz557jIN65Stf2Yc//OFe8IIXzD3Kqp73vOd1zTXX\nzD3GEWVZAunl1TOrHdX11buqG8e2r6zuWZ0zHq+ufmiGGbfCD4/HkeBOcw8AAMDyWYZAOq/p\nXNa3Vs+t3n2A/R5UPb96WvXB6mVbMt3WesgDH/jAf/Qt3/Itc8+xqp//+Z+fewQAAJbQMgTS\nudVV1WOq2w6y3/ubjiDdvXp8R2cgdf/737+nPvWpc4+xKoEEAMAcdsw9wBb4bPW71VPWuP//\n1XQ63skbeM/7Np3Gt9YA3dm0aMQdql0beN/VvPq44477wTvdafufvfa5z32uE044oZ07t3fD\n33LLLd16662deOKJc4+yqptuuqljjz02v//N4/d/ePj9bz6//83n9394+P1vvptvvrldu3a9\npuksKdZgGQLp0qbrjE5vWpBhNf+jKVTO2sB7HlOd3doDaUf1VdWFG3jPtbhb06mER4KvqT7a\nwY/6bQc7q3tXfz33IGvwlePjp2edYm38/jef3//m8/s/PPz+N5/f/+Y7kn7/NZ0pdf3cQ7B9\nnFftqd5cnXGQ/U5rCpQ9TavdAQAAHJVe2RQ+e5r+q8TF1UXV71Vvb6r/vdv/c8txZA0AAFhi\nX1+9rvpUX4yhPU2HcD8+tn3TbNMBAACzW9YjJSc3LYqwq/pktXvecQAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACW2I65BwC2zDur\nb5x7CABgy11WPWTuIY4UO+ceANgyf119qnrh3IPAKi6unltdMvcgcBBnVT9VPXzuQWAVL6g+\nP/cQRxKBBMvj1urG6s/mHgRWsbv6UP6usr2d2vR31d9Ttrsb5x7gSHPM3AMAAABsFwIJAABg\nEEgAAACDQAIAABgEEgAAwCCQAAAABoEEAAAwCCQAAIBBIAEAAAw75x4A2DK3zj0ArNGt+fvK\n9ufvKUcKf08BDuDO4wHb3d/LGQ5sf8c0/V2F7c4//wEAAAAAAAAAAAAAAADg/2/vXkIku+o4\njn9nEpP4irqI0WQhaIhIRIYgCBpUfIG6FURCfCBZBF0kxBcufCzMRnGj4kJEg2QUgoiKikiQ\nkfgCEVGjEVRQRAUz4qgYo8m0i1Ntd8pq7QhTpyr9+UBxuf9zF7/F5V997j19CgAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACATXXe7AAAAMA595jqNdXp6s+TswBsjAur91Xf\nqM5Uv6hOVk+bGQoWbqjurP60ON4wNw6spI+yrT5R7VSvnB0EYFM8rvGFvlPdVX2s+mp1tvpb\ndWJeNOijjXvz7urW6meL8w/NDAVL9FG21asa960JEsA+tzQa44eX6q+oHqh+sPZEMJxo3Jtf\nqc5f1M5v7w/PZ07
KBcv0UbbR5Y1ldX/JBAngQX7aWHN84YqxrzWa5hPXmgiGk43771lL9asX\n9VvXnghW00fZNseqO6pfNpaGmiAdwvHZAYC1OVudqu5bMfaPxfEJ64sD//bS6jfVD5fq369+\nV71s7YlgNX2UbXNz9cLqtdmYAeDQLqnurX7f3vImWJfHN55o3nnA+HcW449dWyJ46PRRNtGJ\nxmT+lsX52/MG6VC8QYKj7crqW9VF1Tuq++fG4QjanficPmB8t37xGrLA/0MfZRM9srqt+kn1\nnrlRto+nHPDw8ajq+qXaz6svrbj20dXbqrc21ie/ufrkuQwHB/jn4rjzP647e66DwEOkj7LJ\n3l89tXp2e8s/AY6cS9vbwnP3c/uK615e/Wox/sXq6esKCCscbzxxP3XA+LcX41Y8sEn0UTbZ\nixv35o1LdUvsAFZ4b6M5/rh6/uQssOu3jR/bXOXXjQ0cYFPoo2y6m/vPB6arPm+cFRBgU7yu\n0RA/XV0wOQvst7vN95VL9asW9ZNrTwSr6aNsg5c0fnx7+bO76c2XF+fPmxUQYBMcq+5uPIm/\naHIWWPaCxpf2p/bVjlWfWdSvmREKluijbDtL7A7JJg1wNDylsUb+D9Xn/st111X3rCUR7DnV\n+Of211eXNZ5yXtNYvvTxDt4CHNZJHwWAh5EXdbj1yJfPCsiRd6yxI9g3qzOL41umJoIH00fZ\ndt4gAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAcZVdV91VfX6o/ovpRdbp60rpDAQAAzPLuaqd6w77aOxe1a6ck\nAgAAmOSC9t4WXVJdUd1bfWFmKAAAgFmeUz1Q3VbdUf2xevLURAAAABN9sLGsbqe6bnIWAACA\nqa5oTI7+Wl08OQsAAMBUn2/saLdTfWRyFgAAgGmubUyMbqpur85Wz52aCAAAYIJLq3uq71Xn\nVZdVZ6q7GjvcAQAAHBmfre6vrt5Xe1PjjdK7piQCAACY4NWNidAHlurHq+9Wf6+ese5QAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATPMvscLAZj2n\n4gIAAAAASUVORK5CYII=", + "text/plain": [ + "Plot with title “Histogram of x”" + ] + }, + "metadata": { + "image/png": { + "height": 420, + "width": 420 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "x <- rnorm(1000)\n", + "mean(x)\n", + "hist(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available commands:\n", + " perfmonitor_fast_setup -- quick setup: enable ipympl plots, start monitor, enable reports\n", + " perfmonitor_help -- show this comprehensive help\n", + " perfmonitor_resources -- show available hardware resources\n", + " 
show_cell_history -- show interactive table of cell execution history\n", + " perfmonitor_start [interval] -- start monitoring (default: 1 second)\n", + " perfmonitor_stop -- stop monitoring\n", + " perfmonitor_perfreport [--cell RANGE] [--level LEVEL] -- show report\n", + " perfmonitor_plot -- interactive plot with widgets for data exploration\n", + " perfmonitor_enable_perfreports [--level LEVEL] [--interval INTERVAL] [--text] -- enable auto-reports\n", + " perfmonitor_disable_perfreports -- disable auto-reports\n", + " perfmonitor_export_perfdata [--file FILE] [--level LEVEL] [--name NAME] -- export CSV; without --file pushes DataFrame (default 'perfdata_df')\n", + " perfmonitor_export_cell_history [--file FILE] [--name NAME] -- export history to JSON/CSV; without --file pushes DataFrame (default 'cell_history_df')\n", + " export_session [target|target.zip] -- export full session\n", + " import_session -- import full session for offline analysis\n", + " start_write_script [output_path] -- record subsequent cells to a Python script\n", + " end_write_script -- stop recording and save the script\n", + "\n", + "Monitoring Levels:\n", + " process -- current Python process only (default, most focused)\n", + " user -- all processes belonging to current user\n", + " system -- system-wide metrics across all processes\n", + "\n", + "Cell Range Formats:\n", + " 5 -- single cell (cell #5)\n", + " 2:8 -- range of cells (cells #2 through #8)\n", + " :5 -- from start to cell #5\n", + " 3: -- from cell #3 to end\n", + "\n", + "Metric Categories:\n", + " cpu, gpu, mem, io (default: all available)\n", + " cpu_all, gpu_all for detailed per-core/per-GPU metrics\n" + ] + } + ], + "source": [ + "%perfmonitor_help" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[JUmPER]: Enabled ipympl interactive plots\n", + "[JUmPER]: Performance monitoring started (PID: 37695, Interval: 
1.0s)\n", + "[JUmPER]: Performance monitoring already running\n", + "[JUmPER]: Performance reports enabled for each cell (level: process, interval: 1.0, format: html)\n", + "[JUmPER]: Fast setup complete! Ready for interactive analysis.\n" + ] + } + ], + "source": [ + "%perfmonitor_fast_setup " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed iterations: 626 \n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " JUmPER Performance Report\n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
JUmPER Performance Report
\n", + "
\n", + "
\n", + " \n", + " Duration\n", + " 10.07s\n", + " \n", + " \n", + " \n", + " Cells\n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetricAVGMINMAXTotal/Limit
CPU Util (Across 12 CPUs)8.418.329.16-
Memory (GB)0.270.270.2715.58
GPU Util (Across 1 GPUs)0.000.000.00-
GPU Memory (GB)0.000.000.0012.00
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "burn_cpu <- function(seconds = 10) {\n", + " # This function keeps the CPU busy with heavy numeric work\n", + " # for approximately the given number of seconds.\n", + " \n", + " start_time <- Sys.time()\n", + " iterations <- 0\n", + " \n", + " repeat {\n", + " # Generate random matrices\n", + " a <- matrix(runif(300 * 300), 300, 300)\n", + " b <- matrix(runif(300 * 300), 300, 300)\n", + " \n", + " # Perform CPU-heavy matrix multiplication\n", + " c <- a %*% b\n", + " \n", + " # Do some extra math to avoid optimization shortcuts\n", + " d <- sum(sqrt(c))\n", + " \n", + " iterations <- iterations + 1\n", + " \n", + " # Check elapsed time\n", + " if (as.numeric(difftime(Sys.time(), start_time, units = \"secs\")) >= seconds) {\n", + " break\n", + " }\n", + " }\n", + " \n", + " cat(\"Completed iterations:\", iterations, \"\\n\")\n", + "}\n", + "\n", + "burn_cpu(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d626957388b74133a1ae275b3017c1ee", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(HTML(value='Plot Configuration:'), Checkbox(value=False, description='Sho…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": "%perfmonitor_plot" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Jumper Wrapper Kernel", + "language": "python", + "name": "jumper_wrapper" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}