Added Jupyter streaming example

Jolanrensen · Jolanrensen · commit 51085088620f · 2022-05-09T16:13:56.000+02:00
diff --git a/examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming/JupyterStreamingExample.ipynb b/examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming/JupyterStreamingExample.ipynb
@@ -0,0 +1,191 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "By default the latest version of the API and the latest supported Spark version is chosen. To specify your own: %use spark-streaming(spark=3.2, v=1.1.0)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "To start a spark streaming session, simply use `withSparkStreaming { }` inside a cell. To use Spark normally, use `withSpark { }` in a cell, or use `%use spark` to start a Spark session for the whole notebook.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%use spark-streaming"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Let's define some data class to work with."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "outputs": [],
+   "source": [
+    "data class TestRow(\n",
+    "    val word: String,\n",
+    ")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "To run this on your local machine, you need to first run a Netcat server: `$ nc -lk 9999`.\n",
+    "\n",
+    "This example will collect the data from this stream for 10 seconds and 1 second intervals, splitting and counting the input per word."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+---+--------+\n",
+      "|key|count(1)|\n",
+      "+---+--------+\n",
+      "+---+--------+\n",
+      "\n",
+      "+-----+--------+\n",
+      "|  key|count(1)|\n",
+      "+-----+--------+\n",
+      "|hello|       8|\n",
+      "|Hello|       6|\n",
+      "|world|       3|\n",
+      "|     |       2|\n",
+      "| test|       4|\n",
+      "+-----+--------+\n",
+      "\n",
+      "+-----+--------+\n",
+      "|  key|count(1)|\n",
+      "+-----+--------+\n",
+      "|hello|       3|\n",
+      "+-----+--------+\n",
+      "\n",
+      "+---+--------+\n",
+      "|key|count(1)|\n",
+      "+---+--------+\n",
+      "+---+--------+\n",
+      "\n",
+      "+---+--------+\n",
+      "|key|count(1)|\n",
+      "+---+--------+\n",
+      "+---+--------+\n",
+      "\n",
+      "+---+--------+\n",
+      "|key|count(1)|\n",
+      "+---+--------+\n",
+      "+---+--------+\n",
+      "\n",
+      "+---+--------+\n",
+      "|key|count(1)|\n",
+      "+---+--------+\n",
+      "+---+--------+\n",
+      "\n",
+      "+---+--------+\n",
+      "|key|count(1)|\n",
+      "+---+--------+\n",
+      "+---+--------+\n",
+      "\n",
+      "+-----+--------+\n",
+      "|  key|count(1)|\n",
+      "+-----+--------+\n",
+      "|hello|       1|\n",
+      "|world|       2|\n",
+      "+-----+--------+\n",
+      "\n",
+      "+---+--------+\n",
+      "|key|count(1)|\n",
+      "+---+--------+\n",
+      "+---+--------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "withSparkStreaming(batchDuration = Durations.seconds(1), timeout = 10_000) { // this: KSparkStreamingSession\n",
+    "\n",
+    "    val lines: JavaReceiverInputDStream<String> = ssc.socketTextStream(\"localhost\", 9999)\n",
+    "    val words: JavaDStream<String> = lines.flatMap { it.split(\" \").iterator() }\n",
+    "\n",
+    "    words.foreachRDD { rdd: JavaRDD<String>, _: Time ->\n",
+    "        withSpark(rdd) { // this: KSparkSession\n",
+    "            val dataframe: Dataset<TestRow> = rdd.map { TestRow(it) }.toDS()\n",
+    "            dataframe\n",
+    "                .groupByKey { it.word }\n",
+    "                .count()\n",
+    "                .show()\n",
+    "        }\n",
+    "    }\n",
+    "}"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Kotlin",
+   "language": "kotlin",
+   "name": "kotlin"
+  },
+  "language_info": {
+   "name": "kotlin",
+   "version": "1.7.0-dev-1825",
+   "mimetype": "text/x-kotlin",
+   "file_extension": ".kt",
+   "pygments_lexer": "kotlin",
+   "codemirror_mode": "text/x-kotlin",
+   "nbconvert_exporter": ""
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}